Context Navigation

dfa.c@ 3658

Last change on this file since 3658 was 3658, checked in by bird, 9 months ago
grep/dfa.c: Workaround for Visual C++ 2022 (amd64) optimizer bug. (same as sed)
Property svn:eol-style set to `native`
File size: 134.7 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3	Foundation, Inc.
4
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation; either version 3, or (at your option)
8	any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	You should have received a copy of the GNU General Public License
16	along with this program; if not, write to the Free Software
17	Foundation, Inc.,
18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20	/* Written June, 1988 by Mike Haertel
21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23	#include <config.h>
24
25	#include "dfa.h"
26
27	#include "flexmember.h"
28	#include "idx.h"
29	#include "verify.h"
30
31	#include <assert.h>
32	#include <ctype.h>
33	#include <stdint.h>
34	#include <stdio.h>
35	#include <stdlib.h>
36	#include <limits.h>
37	#include <string.h>
38
39	/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40	understandably cannot deduce that the input comes from a
41	well-formed regular expression. There's little point to the
42	runtime overhead of 'assert' instead of 'assume_nonnull' when the
43	MMU will check anyway. */
44	#define assume_nonnull(x) assume ((x) != NULL)
45
46	static bool
47	streq (char const a, char const b)
48	{
49	return strcmp (a, b) == 0;
50	}
51
52	static bool
53	isasciidigit (char c)
54	{
55	return '0' <= c && c <= '9';
56	}
57
58	#include "gettext.h"
59	#define _(str) gettext (str)
60
61	#include <wchar.h>
62
63	#include "xalloc.h"
64	#include "localeinfo.h"
65
66	#ifndef FALLTHROUGH
67	# if 201710L < __STDC_VERSION__
68	# define FALLTHROUGH [[__fallthrough__]]
69	# elif (__GNUC__ >= 7) \|\| (__clang_major__ >= 10)
70	# define FALLTHROUGH __attribute__ ((__fallthrough__))
71	# else
72	# define FALLTHROUGH ((void) 0)
73	# endif
74	#endif
75
76	#ifndef MIN
77	# define MIN(a,b) ((a) < (b) ? (a) : (b))
78	#endif
79
80	/* HPUX defines these as macros in sys/param.h. */
81	#ifdef setbit
82	# undef setbit
83	#endif
84	#ifdef clrbit
85	# undef clrbit
86	#endif
87
88	/* For code that does not use Gnulibâs isblank module. */
89	#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90	# define isblank dfa_isblank
91	static int
92	isblank (int c)
93	{
94	return c == ' ' \|\| c == '\t';
95	}
96	#endif
97
98	/* First integer value that is greater than any character code. */
99	enum { NOTCHAR = 1 << CHAR_BIT };
100
101	#ifdef UINT_LEAST64_MAX
102
103	/* Number of bits used in a charclass word. */
104	enum { CHARCLASS_WORD_BITS = 64 };
105
106	/* This represents part of a character class. It must be unsigned and
107	at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
108	typedef uint_least64_t charclass_word;
109
110	/* Part of a charclass initializer that represents 64 bits' worth of a
111	charclass, where LO and HI are the low and high-order 32 bits of
112	the 64-bit quantity. */
113	# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115	#else
116	/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
117	enum { CHARCLASS_WORD_BITS = 32 };
118	typedef unsigned long charclass_word;
119	# define CHARCLASS_PAIR(lo, hi) lo, hi
120	#endif
121
122	/* An initializer for a charclass whose 32-bit words are A through H. */
123	#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124	{{ \
125	CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126	CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127	}}
128
129	/* The maximum useful value of a charclass_word; all used bits are 1. */
130	static charclass_word const CHARCLASS_WORD_MASK
131	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133	/* Number of words required to hold a bit for every character. */
134	enum
135	{
136	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137	};
138
139	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
140	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142	/* Convert a possibly-signed character to an unsigned character. This is
143	a bit safer than casting to unsigned char, since it catches some type
144	errors that the cast doesn't. */
145	static unsigned char
146	to_uchar (char ch)
147	{
148	return ch;
149	}
150
151	/* Contexts tell us whether a character is a newline or a word constituent.
152	Word-constituent characters are those that satisfy iswalnum, plus '_'.
153	Each character has a single CTX_* value; bitmasks of CTX_* values denote
154	a particular character class.
155
156	A state also stores a context value, which is a bitmask of CTX_* values.
157	A state's context represents a set of characters that the state's
158	predecessors must match. For example, a state whose context does not
159	include CTX_LETTER will never have transitions where the previous
160	character is a word constituent. A state whose context is CTX_ANY
161	might have transitions from any character. */
162
163	enum
164	{
165	CTX_NONE = 1,
166	CTX_LETTER = 2,
167	CTX_NEWLINE = 4,
168	CTX_ANY = 7
169	};
170
171	/* Sometimes characters can only be matched depending on the surrounding
172	context. Such context decisions depend on what the previous character
173	was, and the value of the current (lookahead) character. Context
174	dependent constraints are encoded as 9-bit integers. Each bit that
175	is set indicates that the constraint succeeds in the corresponding
176	context.
177
178	bit 6-8 - valid contexts when next character is CTX_NEWLINE
179	bit 3-5 - valid contexts when next character is CTX_LETTER
180	bit 0-2 - valid contexts when next character is CTX_NONE
181
182	succeeds_in_context determines whether a given constraint
183	succeeds in a particular context. Prev is a bitmask of possible
184	context values for the previous character, curr is the (single-bit)
185	context value for the lookahead character. */
186	static int
187	newline_constraint (int constraint)
188	{
189	return (constraint >> 6) & 7;
190	}
191	static int
192	letter_constraint (int constraint)
193	{
194	return (constraint >> 3) & 7;
195	}
196	static int
197	other_constraint (int constraint)
198	{
199	return constraint & 7;
200	}
201
202	static bool
203	succeeds_in_context (int constraint, int prev, int curr)
204	{
205	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208	& prev);
209	}
210
211	/* The following describe what a constraint depends on. */
212	static bool
213	prev_newline_dependent (int constraint)
214	{
215	return ((constraint ^ constraint >> 2) & 0111) != 0;
216	}
217	static bool
218	prev_letter_dependent (int constraint)
219	{
220	return ((constraint ^ constraint >> 1) & 0111) != 0;
221	}
222
223	/* Tokens that match the empty string subject to some constraint actually
224	work by applying that constraint to determine what may follow them,
225	taking into account what has gone before. The following values are
226	the constraints corresponding to the special tokens previously defined. */
227	enum
228	{
229	NO_CONSTRAINT = 0777,
230	BEGLINE_CONSTRAINT = 0444,
231	ENDLINE_CONSTRAINT = 0700,
232	BEGWORD_CONSTRAINT = 0050,
233	ENDWORD_CONSTRAINT = 0202,
234	LIMWORD_CONSTRAINT = 0252,
235	NOTLIMWORD_CONSTRAINT = 0525
236	};
237
238	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
239	are operators and others are terminal symbols. Most (but not all) of these
240	codes are returned by the lexical analyzer. */
241
242	typedef ptrdiff_t token;
243	static token const TOKEN_MAX = PTRDIFF_MAX;
244
245	/* States are indexed by state_num values. These are normally
246	nonnegative but -1 is used as a special value. */
247	typedef ptrdiff_t state_num;
248
249	/* Predefined token values. */
250	enum
251	{
252	END = -1, /* END is a terminal symbol that matches the
253	end of input; any value of END or less in
254	the parse tree is such a symbol. Accepting
255	states of the DFA are those that would have
256	a transition on END. This is -1, not some
257	more-negative value, to tweak the speed of
258	comparisons to END. */
259
260	/* Ordinary character values are terminal symbols that match themselves. */
261
262	/* CSET must come last in the following list of special tokens. Otherwise,
263	the list order matters only for performance. Related special tokens
264	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
265	\|\| CSET <= t) can be done with a single machine-level comparison. */
266
267	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
268	the empty string. */
269
270	QMARK, /* QMARK is an operator of one argument that
271	matches zero or one occurrences of its
272	argument. */
273
274	STAR, /* STAR is an operator of one argument that
275	matches the Kleene closure (zero or more
276	occurrences) of its argument. */
277
278	PLUS, /* PLUS is an operator of one argument that
279	matches the positive closure (one or more
280	occurrences) of its argument. */
281
282	REPMN, /* REPMN is a lexical token corresponding
283	to the {m,n} construct. REPMN never
284	appears in the compiled token vector. */
285
286	CAT, /* CAT is an operator of two arguments that
287	matches the concatenation of its
288	arguments. CAT is never returned by the
289	lexical analyzer. */
290
291	OR, /* OR is an operator of two arguments that
292	matches either of its arguments. */
293
294	LPAREN, /* LPAREN never appears in the parse tree,
295	it is only a lexeme. */
296
297	RPAREN, /* RPAREN never appears in the parse tree. */
298
299	#if defined(KMK_GREP) && defined(KBUILD_OS_WINDOWS)
300	# define WCHAR DFA_WCHAR
301	#endif
302	WCHAR, /* Only returned by lex. wctok contains
303	the wide character representation. */
304
305	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
306	a valid multibyte (or single byte) character.
307	It is used only if MB_CUR_MAX > 1. */
308
309	BEG, /* BEG is an initial symbol that matches the
310	beginning of input. */
311
312	BEGLINE, /* BEGLINE is a terminal symbol that matches
313	the empty string at the beginning of a
314	line. */
315
316	ENDLINE, /* ENDLINE is a terminal symbol that matches
317	the empty string at the end of a line. */
318
319	BEGWORD, /* BEGWORD is a terminal symbol that matches
320	the empty string at the beginning of a
321	word. */
322
323	ENDWORD, /* ENDWORD is a terminal symbol that matches
324	the empty string at the end of a word. */
325
326	LIMWORD, /* LIMWORD is a terminal symbol that matches
327	the empty string at the beginning or the
328	end of a word. */
329
330	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
331	matches the empty string not at
332	the beginning or end of a word. */
333
334	BACKREF, /* BACKREF is generated by \<digit>
335	or by any other construct that
336	is not completely handled. If the scanner
337	detects a transition on backref, it returns
338	a kind of "semi-success" indicating that
339	the match will have to be verified with
340	a backtracking matcher. */
341
342	MBCSET, /* MBCSET is similar to CSET, but for
343	multibyte characters. */
344
345	CSET /* CSET and (and any value greater) is a
346	terminal symbol that matches any of a
347	class of characters. */
348	};
349
350
351	/* States of the recognizer correspond to sets of positions in the parse
352	tree, together with the constraints under which they may be matched.
353	So a position is encoded as an index into the parse tree together with
354	a constraint. */
355	typedef struct
356	{
357	idx_t index; /* Index into the parse array. */
358	unsigned int constraint; /* Constraint for matching this position. */
359	} position;
360
361	/* Sets of positions are stored as arrays. */
362	typedef struct
363	{
364	position elems; / Elements of this position set. */
365	idx_t nelem; /* Number of elements in this set. */
366	idx_t alloc; /* Number of elements allocated in ELEMS. */
367	} position_set;
368
369	/* A state of the dfa consists of a set of positions, some flags,
370	and the token value of the lowest-numbered position of the state that
371	contains an END token. */
372	typedef struct
373	{
374	size_t hash; /* Hash of the positions of this state. */
375	position_set elems; /* Positions this state could match. */
376	unsigned char context; /* Context from previous state. */
377	unsigned short constraint; /* Constraint for this state to accept. */
378	position_set mbps; /* Positions which can match multibyte
379	characters or the follows, e.g., period.
380	Used only if MB_CUR_MAX > 1. */
381	state_num mb_trindex; /* Index of this state in MB_TRANS, or
382	negative if the state does not have
383	ANYCHAR. */
384	} dfa_state;
385
386	/* Maximum for any transition table count. This should be at least 3,
387	for the initial state setup. */
388	enum { MAX_TRCOUNT = 1024 };
389
390	/* A bracket operator.
391	e.g., [a-c], [[:alpha:]], etc. */
392	struct mb_char_classes
393	{
394	ptrdiff_t cset;
395	bool invert;
396	wchar_t chars; / Normal characters. */
397	idx_t nchars;
398	idx_t nchars_alloc;
399	};
400
401	struct regex_syntax
402	{
403	/* Syntax bits controlling the behavior of the lexical analyzer. */
404	reg_syntax_t syntax_bits;
405	bool syntax_bits_set;
406
407	/* Flag for case-folding letters into sets. */
408	bool case_fold;
409
410	/* True if ^ and $ match only the start and end of data, and do not match
411	end-of-line within data. */
412	bool anchor;
413
414	/* End-of-line byte in data. */
415	unsigned char eolbyte;
416
417	/* Cache of char-context values. */
418	char sbit[NOTCHAR];
419
420	/* If never_trail[B], the byte B cannot be a non-initial byte in a
421	multibyte character. */
422	bool never_trail[NOTCHAR];
423
424	/* Set of characters considered letters. */
425	charclass letters;
426
427	/* Set of characters that are newline. */
428	charclass newline;
429	};
430
431	/* Lexical analyzer. All the dross that deals with the obnoxious
432	GNU Regex syntax bits is located here. The poor, suffering
433	reader is referred to the GNU Regex documentation for the
434	meaning of the @#%!@#%^!@ syntax bits. */
435	struct lexer_state
436	{
437	char const ptr; / Pointer to next input character. */
438	idx_t left; /* Number of characters remaining. */
439	token lasttok; /* Previous token returned; initially END. */
440	idx_t parens; /* Count of outstanding left parens. */
441	int minrep, maxrep; /* Repeat counts for {m,n}. */
442
443	/* Wide character representation of the current multibyte character,
444	or WEOF if there was an encoding error. Used only if
445	MB_CUR_MAX > 1. */
446	wint_t wctok;
447
448	/* The most recently analyzed multibyte bracket expression. */
449	struct mb_char_classes brack;
450
451	/* We're separated from beginning or (, \| only by zero-width characters. */
452	bool laststart;
453	};
454
455	/* Recursive descent parser for regular expressions. */
456
457	struct parser_state
458	{
459	token tok; /* Lookahead token. */
460	idx_t depth; /* Current depth of a hypothetical stack
461	holding deferred productions. This is
462	used to determine the depth that will be
463	required of the real stack later on in
464	dfaanalyze. */
465	};
466
467	/* A compiled regular expression. */
468	struct dfa
469	{
470	/* Fields filled by the scanner. */
471	charclass charclasses; / Array of character sets for CSET tokens. */
472	idx_t cindex; /* Index for adding new charclasses. */
473	idx_t calloc; /* Number of charclasses allocated. */
474	ptrdiff_t canychar; /* Index of anychar class, or -1. */
475
476	/* Scanner state */
477	struct lexer_state lex;
478
479	/* Parser state */
480	struct parser_state parse;
481
482	/* Fields filled by the parser. */
483	token tokens; / Postfix parse array. */
484	idx_t tindex; /* Index for adding new tokens. */
485	idx_t talloc; /* Number of tokens currently allocated. */
486	idx_t depth; /* Depth required of an evaluation stack
487	used for depth-first traversal of the
488	parse tree. */
489	idx_t nleaves; /* Number of non-EMPTY leaves
490	in the parse tree. */
491	idx_t nregexps; /* Count of parallel regexps being built
492	with dfaparse. */
493	bool fast; /* The DFA is fast. */
494	bool epsilon; /* Does a token match only the empty string? */
495	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
496	mbstate_t mbs; /* Multibyte conversion state. */
497
498	/* The following are valid only if MB_CUR_MAX > 1. */
499
500	/* The value of multibyte_prop[i] is defined by following rule.
501	if tokens[i] < NOTCHAR
502	bit 0 : tokens[i] is the first byte of a character, including
503	single-byte characters.
504	bit 1 : tokens[i] is the last byte of a character, including
505	single-byte characters.
506
507	e.g.
508	tokens
509	= 'single_byte_a', 'multi_byte_A', single_byte_b'
510	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
511	multibyte_prop
512	= 3 , 1 , 0 , 2 , 3
513	*/
514	char *multibyte_prop;
515
516	/* Fields filled by the superset. */
517	struct dfa superset; / Hint of the dfa. */
518
519	/* Fields filled by the state builder. */
520	dfa_state states; / States of the dfa. */
521	state_num sindex; /* Index for adding new states. */
522	idx_t salloc; /* Number of states currently allocated. */
523
524	/* Fields filled by the parse tree->NFA conversion. */
525	position_set follows; / Array of follow sets, indexed by position
526	index. The follow of a position is the set
527	of positions containing characters that
528	could conceivably follow a character
529	matching the given position in a string
530	matching the regexp. Allocated to the
531	maximum possible position index. */
532	bool searchflag; /* We are supposed to build a searching
533	as opposed to an exact matcher. A searching
534	matcher finds the first and shortest string
535	matching a regexp anywhere in the buffer,
536	whereas an exact matcher finds the longest
537	string matching, but anchored to the
538	beginning of the buffer. */
539
540	/* Fields filled by dfaanalyze. */
541	int constraints; / Array of union of accepting constraints
542	in the follow of a position. */
543	int separates; / Array of contexts on follow of a
544	position. */
545
546	/* Fields filled by dfaexec. */
547	state_num tralloc; /* Number of transition tables that have
548	slots so far, not counting trans[-1] and
549	trans[-2]. */
550	int trcount; /* Number of transition tables that have
551	been built, other than for initial
552	states. */
553	int min_trcount; /* Number of initial states. Equivalently,
554	the minimum state number for which trcount
555	counts transitions. */
556	state_num *trans; / Transition tables for states that can
557	never accept. If the transitions for a
558	state have not yet been computed, or the
559	state could possibly accept, its entry in
560	this table is NULL. This points to two
561	past the start of the allocated array,
562	and trans[-1] and trans[-2] are always
563	NULL. */
564	state_num *fails; / Transition tables after failing to accept
565	on a state that potentially could do so.
566	If trans[i] is non-null, fails[i] must
567	be null. */
568	char success; / Table of acceptance conditions used in
569	dfaexec and computed in build_state. */
570	state_num newlines; / Transitions on newlines. The entry for a
571	newline in any transition table is always
572	-1 so we can count lines without wasting
573	too many cycles. The transition for a
574	newline is stored separately and handled
575	as a special case. Newline is also used
576	as a sentinel at the end of the buffer. */
577	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
578	context in multibyte locales, in which we
579	do not distinguish between their contexts,
580	as not supported word. */
581	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
582	state_num *mb_trans; / Transition tables for states with
583	ANYCHAR. */
584	state_num mb_trcount; /* Number of transition tables for states with
585	ANYCHAR that have actually been built. */
586
587	/* Syntax configuration. This is near the end so that dfacopysyntax
588	can memset up to here. */
589	struct regex_syntax syntax;
590
591	/* Information derived from the locale. This is at the end so that
592	a quick memset need not clear it specially. */
593
594	/* dfaexec implementation. */
595	char (dfaexec) (struct dfa , char const , char *,
596	bool, ptrdiff_t , bool );
597
598	/* Other cached information derived from the locale. */
599	struct localeinfo localeinfo;
600	};
601
602	/* User access to dfa internals. */
603
604	/* S could possibly be an accepting state of R. */
605	static bool
606	accepting (state_num s, struct dfa const *r)
607	{
608	return r->states[s].constraint != 0;
609	}
610
611	/* STATE accepts in the specified context. */
612	static bool
613	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
614	{
615	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
616	}
617
618	static void regexp (struct dfa *dfa);
619
620	/* Store into *PWC the result of converting the leading bytes of the
621	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
622	and updating the conversion state in *D. On conversion error,
623	convert just a single byte, to WEOF. Return the number of bytes
624	converted.
625
626	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
627
628	* PWC points to wint_t, not to wchar_t.
629	* The last arg is a dfa *D instead of merely a multibyte conversion
630	state D->mbs.
631	* N is idx_t not size_t, and must be at least 1.
632	* S[N - 1] must be a sentinel byte.
633	* Shift encodings are not supported.
634	* The return value is always in the range 1..N.
635	* D->mbs is always valid afterwards.
636	* PWC is always set to something. /
637	static int
638	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
639	{
640	unsigned char uc = s[0];
641	wint_t wc = d->localeinfo.sbctowc[uc];
642
643	if (wc == WEOF)
644	{
645	wchar_t wch;
646	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
647	if (0 < nbytes && nbytes < (size_t) -2)
648	{
649	*pwc = wch;
650	return nbytes;
651	}
652	memset (&d->mbs, 0, sizeof d->mbs);
653	}
654
655	*pwc = wc;
656	return 1;
657	}
658
659	#ifdef DEBUG
660
661	static void
662	prtok (token t)
663	{
664	if (t <= END)
665	fprintf (stderr, "END");
666	else if (0 <= t && t < NOTCHAR)
667	{
668	unsigned int ch = t;
669	fprintf (stderr, "0x%02x", ch);
670	}
671	else
672	{
673	char const *s;
674	switch (t)
675	{
676	case BEG:
677	s = "BEG";
678	break;
679	case EMPTY:
680	s = "EMPTY";
681	break;
682	case BACKREF:
683	s = "BACKREF";
684	break;
685	case BEGLINE:
686	s = "BEGLINE";
687	break;
688	case ENDLINE:
689	s = "ENDLINE";
690	break;
691	case BEGWORD:
692	s = "BEGWORD";
693	break;
694	case ENDWORD:
695	s = "ENDWORD";
696	break;
697	case LIMWORD:
698	s = "LIMWORD";
699	break;
700	case NOTLIMWORD:
701	s = "NOTLIMWORD";
702	break;
703	case QMARK:
704	s = "QMARK";
705	break;
706	case STAR:
707	s = "STAR";
708	break;
709	case PLUS:
710	s = "PLUS";
711	break;
712	case CAT:
713	s = "CAT";
714	break;
715	case OR:
716	s = "OR";
717	break;
718	case LPAREN:
719	s = "LPAREN";
720	break;
721	case RPAREN:
722	s = "RPAREN";
723	break;
724	case ANYCHAR:
725	s = "ANYCHAR";
726	break;
727	case MBCSET:
728	s = "MBCSET";
729	break;
730	default:
731	s = "CSET";
732	break;
733	}
734	fprintf (stderr, "%s", s);
735	}
736	}
737	#endif /* DEBUG */
738
739	/* Stuff pertaining to charclasses. */
740
741	static bool
742	tstbit (unsigned int b, charclass const *c)
743	{
744	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
745	}
746
747	static void
748	setbit (unsigned int b, charclass *c)
749	{
750	charclass_word one = 1;
751	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
752	}
753
754	static void
755	clrbit (unsigned int b, charclass *c)
756	{
757	charclass_word one = 1;
758	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
759	}
760
761	static void
762	zeroset (charclass *s)
763	{
764	memset (s, 0, sizeof *s);
765	}
766
767	static void
768	fillset (charclass *s)
769	{
770	for (int i = 0; i < CHARCLASS_WORDS; i++)
771	s->w[i] = CHARCLASS_WORD_MASK;
772	}
773
774	static void
775	notset (charclass *s)
776	{
777	for (int i = 0; i < CHARCLASS_WORDS; ++i)
778	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
779	}
780
781	static bool
782	equal (charclass const s1, charclass const s2)
783	{
784	charclass_word w = 0;
785	for (int i = 0; i < CHARCLASS_WORDS; i++)
786	w \|= s1->w[i] ^ s2->w[i];
787	return w == 0;
788	}
789
790	static bool
791	emptyset (charclass const *s)
792	{
793	charclass_word w = 0;
794	for (int i = 0; i < CHARCLASS_WORDS; i++)
795	w \|= s->w[i];
796	return w == 0;
797	}
798
799	/* Ensure that the array addressed by PA holds at least I + 1 items.
800	Either return PA, or reallocate the array and return its new address.
801	Although PA may be null, the returned value is never null.
802
803	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
804	is updated on reallocation. If PA is null, *NITEMS must be zero.
805	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
806	ITEM_SIZE is the size of one item; it must be positive.
807	Avoid O(N*2) behavior on arrays growing linearly. /
808	static void *
809	maybe_realloc (void pa, idx_t i, idx_t nitems,
810	ptrdiff_t nitems_max, idx_t item_size)
811	{
812	if (i < *nitems)
813	return pa;
814	return xpalloc (pa, nitems, 1, nitems_max, item_size);
815	}
816
817	/* In DFA D, find the index of charclass S, or allocate a new one. */
818	static idx_t
819	charclass_index (struct dfa d, charclass const s)
820	{
821	idx_t i;
822
823	for (i = 0; i < d->cindex; ++i)
824	if (equal (s, &d->charclasses[i]))
825	return i;
826	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
827	TOKEN_MAX - CSET, sizeof *d->charclasses);
828	++d->cindex;
829	d->charclasses[i] = *s;
830	return i;
831	}
832
833	static bool
834	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
835	{
836	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
837	}
838
839	static int
840	char_context (struct dfa const *dfa, unsigned char c)
841	{
842	if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
843	return CTX_NEWLINE;
844	if (unibyte_word_constituent (dfa, c))
845	return CTX_LETTER;
846	return CTX_NONE;
847	}
848
849	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
850	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
851	this may happen when folding case in weird Turkish locales where
852	dotless i/dotted I are not included in the chosen character set.
853	Return whether a bit was set in the charclass. */
854	static bool
855	setbit_wc (wint_t wc, charclass *c)
856	{
857	int b = wctob (wc);
858	if (b < 0)
859	return false;
860
861	setbit (b, c);
862	return true;
863	}
864
865	/* Set a bit for B and its case variants in the charclass C.
866	MB_CUR_MAX must be 1. */
867	static void
868	setbit_case_fold_c (int b, charclass *c)
869	{
870	int ub = toupper (b);
871	for (int i = 0; i < NOTCHAR; i++)
872	if (toupper (i) == ub)
873	setbit (i, c);
874	}
875
876	/* Fetch the next lexical input character from the pattern. There
877	must at least one byte of pattern input. Set DFA->lex.wctok to the
878	value of the character or to WEOF depending on whether the input is
879	a valid multibyte character (possibly of length 1). Then return
880	the next input byte value, except return EOF if the input is a
881	multibyte character of length greater than 1. */
882	static int
883	fetch_wc (struct dfa *dfa)
884	{
885	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
886	dfa);
887	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
888	dfa->lex.ptr += nbytes;
889	dfa->lex.left -= nbytes;
890	return c;
891	}
892
893	/* If there is no more input, report an error about unbalanced brackets.
894	Otherwise, behave as with fetch_wc (DFA). */
895	static int
896	bracket_fetch_wc (struct dfa *dfa)
897	{
898	if (! dfa->lex.left)
899	dfaerror (_("unbalanced ["));
900	return fetch_wc (dfa);
901	}
902
903	typedef int predicate (int);
904
905	/* The following list maps the names of the Posix named character classes
906	to predicate functions that determine whether a given character is in
907	the class. The leading [ has already been eaten by the lexical
908	analyzer. */
909	struct dfa_ctype
910	{
911	const char *name;
912	predicate *func;
913	bool single_byte_only;
914	};
915
916	static const struct dfa_ctype prednames[] = {
917	{"alpha", isalpha, false},
918	{"upper", isupper, false},
919	{"lower", islower, false},
920	{"digit", isdigit, true},
921	{"xdigit", isxdigit, false},
922	{"space", isspace, false},
923	{"punct", ispunct, false},
924	{"alnum", isalnum, false},
925	{"print", isprint, false},
926	{"graph", isgraph, false},
927	{"cntrl", iscntrl, false},
928	{"blank", isblank, false},
929	{NULL, NULL, false}
930	};
931
932	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
933	find_pred (const char *str)
934	{
935	for (int i = 0; prednames[i].name; i++)
936	if (streq (str, prednames[i].name))
937	return &prednames[i];
938	return NULL;
939	}
940
941	/* Parse a bracket expression, which possibly includes multibyte
942	characters. */
943	static token
944	parse_bracket_exp (struct dfa *dfa)
945	{
946	/* This is a bracket expression that dfaexec is known to
947	process correctly. */
948	bool known_bracket_exp = true;
949
950	/* Used to warn about [:space:].
951	Bit 0 = first character is a colon.
952	Bit 1 = last character is a colon.
953	Bit 2 = includes any other character but a colon.
954	Bit 3 = includes ranges, char/equiv classes or collation elements. */
955	int colon_warning_state;
956
957	dfa->lex.brack.nchars = 0;
958	charclass ccl;
959	zeroset (&ccl);
960	int c = bracket_fetch_wc (dfa);
961	bool invert = c == '^';
962	if (invert)
963	{
964	c = bracket_fetch_wc (dfa);
965	known_bracket_exp = dfa->localeinfo.simple;
966	}
967	wint_t wc = dfa->lex.wctok;
968	int c1;
969	wint_t wc1;
970	colon_warning_state = (c == ':');
971	do
972	{
973	c1 = NOTCHAR; /* Mark c1 as not initialized. */
974	colon_warning_state &= ~2;
975
976	/* Note that if we're looking at some other [:...:] construct,
977	we just treat it as a bunch of ordinary characters. We can do
978	this because we assume regex has checked for syntax errors before
979	dfa is ever called. */
980	if (c == '[')
981	{
982	c1 = bracket_fetch_wc (dfa);
983	wc1 = dfa->lex.wctok;
984
985	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
986	\|\| c1 == '.' \|\| c1 == '=')
987	{
988	enum { MAX_BRACKET_STRING_LEN = 32 };
989	char str[MAX_BRACKET_STRING_LEN + 1];
990	int len = 0;
991	for (;;)
992	{
993	c = bracket_fetch_wc (dfa);
994	if (dfa->lex.left == 0
995	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
996	break;
997	if (len < MAX_BRACKET_STRING_LEN)
998	str[len++] = c;
999	else
1000	/* This is in any case an invalid class name. */
1001	str[0] = '\0';
1002	}
1003	str[len] = '\0';
1004
1005	/* Fetch bracket. */
1006	c = bracket_fetch_wc (dfa);
1007	wc = dfa->lex.wctok;
1008	if (c1 == ':')
1009	/* Build character class. POSIX allows character
1010	classes to match multicharacter collating elements,
1011	but the regex code does not support that, so do not
1012	worry about that possibility. */
1013	{
1014	char const *class
1015	= (dfa->syntax.case_fold && (streq (str, "upper")
1016	\|\| streq (str, "lower"))
1017	? "alpha" : str);
1018	const struct dfa_ctype *pred = find_pred (class);
1019	if (!pred)
1020	dfaerror (_("invalid character class"));
1021
1022	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1023	known_bracket_exp = false;
1024	else
1025	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1026	if (pred->func (c2))
1027	setbit (c2, &ccl);
1028	}
1029	else
1030	known_bracket_exp = false;
1031
1032	colon_warning_state \|= 8;
1033
1034	/* Fetch new lookahead character. */
1035	c1 = bracket_fetch_wc (dfa);
1036	wc1 = dfa->lex.wctok;
1037	continue;
1038	}
1039
1040	/* We treat '[' as a normal character here. c/c1/wc/wc1
1041	are already set up. */
1042	}
1043
1044	if (c == '\\'
1045	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1046	{
1047	c = bracket_fetch_wc (dfa);
1048	wc = dfa->lex.wctok;
1049	}
1050
1051	if (c1 == NOTCHAR)
1052	{
1053	c1 = bracket_fetch_wc (dfa);
1054	wc1 = dfa->lex.wctok;
1055	}
1056
1057	if (c1 == '-')
1058	/* build range characters. */
1059	{
1060	int c2 = bracket_fetch_wc (dfa);
1061	wint_t wc2 = dfa->lex.wctok;
1062
1063	/* A bracket expression like [a-[.aa.]] matches an unknown set.
1064	Treat it like [-a[.aa.]] while parsing it, and
1065	remember that the set is unknown. */
1066	if (c2 == '[' && dfa->lex.ptr[0] == '.')
1067	{
1068	known_bracket_exp = false;
1069	c2 = ']';
1070	}
1071
1072	if (c2 == ']')
1073	{
1074	/* In the case [x-], the - is an ordinary hyphen,
1075	which is left in c1, the lookahead character. */
1076	dfa->lex.ptr--;
1077	dfa->lex.left++;
1078	}
1079	else
1080	{
1081	if (c2 == '\\' && (dfa->syntax.syntax_bits
1082	& RE_BACKSLASH_ESCAPE_IN_LISTS))
1083	{
1084	c2 = bracket_fetch_wc (dfa);
1085	wc2 = dfa->lex.wctok;
1086	}
1087
1088	colon_warning_state \|= 8;
1089	c1 = bracket_fetch_wc (dfa);
1090	wc1 = dfa->lex.wctok;
1091
1092	/* Treat [x-y] as a range if x != y. */
1093	if (wc != wc2 \|\| wc == WEOF)
1094	{
1095	if (dfa->localeinfo.simple
1096	\|\| (isasciidigit (c) & isasciidigit (c2)))
1097	{
1098	for (int ci = c; ci <= c2; ci++)
1099	if (dfa->syntax.case_fold && isalpha (ci))
1100	setbit_case_fold_c (ci, &ccl);
1101	else
1102	setbit (ci, &ccl);
1103	}
1104	else
1105	known_bracket_exp = false;
1106
1107	continue;
1108	}
1109	}
1110	}
1111
1112	colon_warning_state \|= (c == ':') ? 2 : 4;
1113
1114	if (!dfa->localeinfo.multibyte)
1115	{
1116	if (dfa->syntax.case_fold && isalpha (c))
1117	setbit_case_fold_c (c, &ccl);
1118	else
1119	setbit (c, &ccl);
1120	continue;
1121	}
1122
1123	if (wc == WEOF)
1124	known_bracket_exp = false;
1125	else
1126	{
1127	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1128	int n = (dfa->syntax.case_fold
1129	? case_folded_counterparts (wc, folded + 1) + 1
1130	: 1);
1131	folded[0] = wc;
1132	for (int i = 0; i < n; i++)
1133	if (!setbit_wc (folded[i], &ccl))
1134	{
1135	dfa->lex.brack.chars
1136	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1137	&dfa->lex.brack.nchars_alloc, -1,
1138	sizeof *dfa->lex.brack.chars);
1139	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1140	}
1141	}
1142	}
1143	while ((wc = wc1, (c = c1) != ']'));
1144
1145	if (colon_warning_state == 7)
1146	dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1147
1148	if (! known_bracket_exp)
1149	return BACKREF;
1150
1151	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
1152	{
1153	dfa->lex.brack.invert = invert;
1154	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1155	return MBCSET;
1156	}
1157
1158	if (invert)
1159	{
1160	notset (&ccl);
1161	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1162	clrbit ('\n', &ccl);
1163	}
1164
1165	return CSET + charclass_index (dfa, &ccl);
1166	}
1167
1168	struct lexptr
1169	{
1170	char const *ptr;
1171	idx_t left;
1172	};
1173
1174	static void
1175	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
1176	{
1177	ls->ptr = dfa->lex.ptr;
1178	ls->left = dfa->lex.left;
1179	dfa->lex.ptr = s;
1180	dfa->lex.left = strlen (s);
1181	}
1182
1183	static void
1184	pop_lex_state (struct dfa dfa, struct lexptr const ls)
1185	{
1186	dfa->lex.ptr = ls->ptr;
1187	dfa->lex.left = ls->left;
1188	}
1189
1190	static token
1191	lex (struct dfa *dfa)
1192	{
1193	bool backslash = false;
1194
1195	/* Basic plan: We fetch a character. If it's a backslash,
1196	we set the backslash flag and go through the loop again.
1197	On the plus side, this avoids having a duplicate of the
1198	main switch inside the backslash case. On the minus side,
1199	it means that just about every case begins with
1200	"if (backslash) ...". */
1201	for (int i = 0; i < 2; ++i)
1202	{
1203	if (! dfa->lex.left)
1204	return dfa->lex.lasttok = END;
1205	int c = fetch_wc (dfa);
1206
1207	switch (c)
1208	{
1209	case '\\':
1210	if (backslash)
1211	goto normal_char;
1212	if (dfa->lex.left == 0)
1213	dfaerror (_("unfinished \\ escape"));
1214	backslash = true;
1215	break;
1216
1217	case '^':
1218	if (backslash)
1219	goto normal_char;
1220	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1221	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
1222	\|\| dfa->lex.lasttok == OR)
1223	return dfa->lex.lasttok = BEGLINE;
1224	goto normal_char;
1225
1226	case '$':
1227	if (backslash)
1228	goto normal_char;
1229	/* kmk: cl v19.29.30139/amd64 messes this function up when optimizing
1230	for speed, workaround is to optimize it for size instead. The
1231	symptom is that the following SED expression fail to match:
1232	s/^[0-9a-fA-F]\{1,\} $00[0-9a-fA-F]$ ABS notype External \| $[^.]\{1,\}$\.$.*$$/ 1=\1 2=\2 3=\3/
1233
1234	Seems the exact problem is that it gets the indexing here wrong:
1235	dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) & (dfa->lex.ptr[0] == '\\')]
1236	It forgets to do the ` dfa->lex.ptr[0] == '\\' ` part and instead
1237	ANDs with a register initialized to zero. Rewriting the
1238	expressions using the tinary operator works around the problem,
1239	although the resulting code is a lot bulkier.
1240	*/
1241	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242	\|\| dfa->lex.left == 0
1243	#ifdef _MSC_VER /* see above */
1244	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1245	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == ')'
1246	: dfa->lex.left > 0 && dfa->lex.ptr[0] == ')')
1247	#else
1248	\|\| ((dfa->lex.left
1249	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1250	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1251	& (dfa->lex.ptr[0] == '\\')]
1252	== ')'))
1253	#endif
1254	#ifdef _MSC_VER /* see above */
1255	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1256	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == '\|'
1257	: dfa->lex.left > 0 && dfa->lex.ptr[0] == '\|')
1258	#else
1259	\|\| ((dfa->lex.left
1260	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1261	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1262	& (dfa->lex.ptr[0] == '\\')]
1263	== '\|'))
1264	#endif
1265	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1266	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1267	return dfa->lex.lasttok = ENDLINE;
1268	goto normal_char;
1269
1270	case '1':
1271	case '2':
1272	case '3':
1273	case '4':
1274	case '5':
1275	case '6':
1276	case '7':
1277	case '8':
1278	case '9':
1279	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1280	{
1281	dfa->lex.laststart = false;
1282	return dfa->lex.lasttok = BACKREF;
1283	}
1284	goto normal_char;
1285
1286	case '`':
1287	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1288	{
1289	/* FIXME: should be beginning of string */
1290	return dfa->lex.lasttok = BEGLINE;
1291	}
1292	goto normal_char;
1293
1294	case '\'':
1295	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1296	{
1297	/* FIXME: should be end of string */
1298	return dfa->lex.lasttok = ENDLINE;
1299	}
1300	goto normal_char;
1301
1302	case '<':
1303	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1304	return dfa->lex.lasttok = BEGWORD;
1305	goto normal_char;
1306
1307	case '>':
1308	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1309	return dfa->lex.lasttok = ENDWORD;
1310	goto normal_char;
1311
1312	case 'b':
1313	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1314	return dfa->lex.lasttok = LIMWORD;
1315	goto normal_char;
1316
1317	case 'B':
1318	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1319	return dfa->lex.lasttok = NOTLIMWORD;
1320	goto normal_char;
1321
1322	case '?':
1323	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1324	goto normal_char;
1325	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1326	goto normal_char;
1327	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1328	&& dfa->lex.laststart)
1329	goto normal_char;
1330	return dfa->lex.lasttok = QMARK;
1331
1332	case '*':
1333	if (backslash)
1334	goto normal_char;
1335	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1336	&& dfa->lex.laststart)
1337	goto normal_char;
1338	return dfa->lex.lasttok = STAR;
1339
1340	case '+':
1341	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1342	goto normal_char;
1343	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1344	goto normal_char;
1345	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1346	&& dfa->lex.laststart)
1347	goto normal_char;
1348	return dfa->lex.lasttok = PLUS;
1349
1350	case '{':
1351	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1352	goto normal_char;
1353	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1354	goto normal_char;
1355	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1356	&& dfa->lex.laststart)
1357	goto normal_char;
1358
1359	/* Cases:
1360	{M} - exact count
1361	{M,} - minimum count, maximum is infinity
1362	{,N} - 0 through N
1363	{,} - 0 to infinity (same as '*')
1364	{M,N} - M through N */
1365	{
1366	char const *p = dfa->lex.ptr;
1367	char const *lim = p + dfa->lex.left;
1368	dfa->lex.minrep = dfa->lex.maxrep = -1;
1369	for (; p != lim && isasciidigit (*p); p++)
1370	dfa->lex.minrep = (dfa->lex.minrep < 0
1371	? *p - '0'
1372	: MIN (RE_DUP_MAX + 1,
1373	dfa->lex.minrep * 10 + *p - '0'));
1374	if (p != lim)
1375	{
1376	if (*p != ',')
1377	dfa->lex.maxrep = dfa->lex.minrep;
1378	else
1379	{
1380	if (dfa->lex.minrep < 0)
1381	dfa->lex.minrep = 0;
1382	while (++p != lim && isasciidigit (*p))
1383	dfa->lex.maxrep
1384	= (dfa->lex.maxrep < 0
1385	? *p - '0'
1386	: MIN (RE_DUP_MAX + 1,
1387	dfa->lex.maxrep * 10 + *p - '0'));
1388	}
1389	}
1390	if (! ((! backslash \|\| (p != lim && *p++ == '\\'))
1391	&& p != lim && *p++ == '}'
1392	&& 0 <= dfa->lex.minrep
1393	&& (dfa->lex.maxrep < 0
1394	\|\| dfa->lex.minrep <= dfa->lex.maxrep)))
1395	{
1396	if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1397	goto normal_char;
1398	dfaerror (_("invalid content of \\{\\}"));
1399	}
1400	if (RE_DUP_MAX < dfa->lex.maxrep)
1401	dfaerror (_("regular expression too big"));
1402	dfa->lex.ptr = p;
1403	dfa->lex.left = lim - p;
1404	}
1405	dfa->lex.laststart = false;
1406	return dfa->lex.lasttok = REPMN;
1407
1408	case '\|':
1409	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1410	goto normal_char;
1411	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1412	goto normal_char;
1413	dfa->lex.laststart = true;
1414	return dfa->lex.lasttok = OR;
1415
1416	case '\n':
1417	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1418	\|\| backslash \|\| !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1419	goto normal_char;
1420	dfa->lex.laststart = true;
1421	return dfa->lex.lasttok = OR;
1422
1423	case '(':
1424	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1425	goto normal_char;
1426	dfa->lex.parens++;
1427	dfa->lex.laststart = true;
1428	return dfa->lex.lasttok = LPAREN;
1429
1430	case ')':
1431	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1432	goto normal_char;
1433	if (dfa->lex.parens == 0
1434	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1435	goto normal_char;
1436	dfa->lex.parens--;
1437	dfa->lex.laststart = false;
1438	return dfa->lex.lasttok = RPAREN;
1439
1440	case '.':
1441	if (backslash)
1442	goto normal_char;
1443	if (dfa->canychar < 0)
1444	{
1445	charclass ccl;
1446	fillset (&ccl);
1447	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1448	clrbit ('\n', &ccl);
1449	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1450	clrbit ('\0', &ccl);
1451	if (dfa->localeinfo.multibyte)
1452	for (int c2 = 0; c2 < NOTCHAR; c2++)
1453	if (dfa->localeinfo.sbctowc[c2] == WEOF)
1454	clrbit (c2, &ccl);
1455	dfa->canychar = charclass_index (dfa, &ccl);
1456	}
1457	dfa->lex.laststart = false;
1458	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1459	? ANYCHAR
1460	: CSET + dfa->canychar);
1461
1462	case 's':
1463	case 'S':
1464	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1465	goto normal_char;
1466	if (!dfa->localeinfo.multibyte)
1467	{
1468	charclass ccl;
1469	zeroset (&ccl);
1470	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1471	if (isspace (c2))
1472	setbit (c2, &ccl);
1473	if (c == 'S')
1474	notset (&ccl);
1475	dfa->lex.laststart = false;
1476	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1477	}
1478
1479	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1480	add_utf8_anychar, makes sense. */
1481
1482	/* \s and \S are documented to be equivalent to [[:space:]] and
1483	[^[:space:]] respectively, so tell the lexer to process those
1484	strings, each minus its "already processed" '['. */
1485	{
1486	struct lexptr ls;
1487	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1488	dfa->lex.lasttok = parse_bracket_exp (dfa);
1489	pop_lex_state (dfa, &ls);
1490	}
1491
1492	dfa->lex.laststart = false;
1493	return dfa->lex.lasttok;
1494
1495	case 'w':
1496	case 'W':
1497	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1498	goto normal_char;
1499
1500	if (!dfa->localeinfo.multibyte)
1501	{
1502	charclass ccl;
1503	zeroset (&ccl);
1504	for (int c2 = 0; c2 < NOTCHAR; ++c2)
1505	if (dfa->syntax.sbit[c2] == CTX_LETTER)
1506	setbit (c2, &ccl);
1507	if (c == 'W')
1508	notset (&ccl);
1509	dfa->lex.laststart = false;
1510	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1511	}
1512
1513	/* FIXME: see if optimizing this, as is done with ANYCHAR and
1514	add_utf8_anychar, makes sense. */
1515
1516	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
1517	[^_[:alnum:]] respectively, so tell the lexer to process those
1518	strings, each minus its "already processed" '['. */
1519	{
1520	struct lexptr ls;
1521	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1522	dfa->lex.lasttok = parse_bracket_exp (dfa);
1523	pop_lex_state (dfa, &ls);
1524	}
1525
1526	dfa->lex.laststart = false;
1527	return dfa->lex.lasttok;
1528
1529	case '[':
1530	if (backslash)
1531	goto normal_char;
1532	dfa->lex.laststart = false;
1533	return dfa->lex.lasttok = parse_bracket_exp (dfa);
1534
1535	default:
1536	normal_char:
1537	dfa->lex.laststart = false;
1538	/* For multibyte character sets, folding is done in atom. Always
1539	return WCHAR. */
1540	if (dfa->localeinfo.multibyte)
1541	return dfa->lex.lasttok = WCHAR;
1542
1543	if (dfa->syntax.case_fold && isalpha (c))
1544	{
1545	charclass ccl;
1546	zeroset (&ccl);
1547	setbit_case_fold_c (c, &ccl);
1548	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1549	}
1550
1551	return dfa->lex.lasttok = c;
1552	}
1553	}
1554
1555	/* The above loop should consume at most a backslash
1556	and some other character. */
1557	abort ();
1558	return END; /* keeps pedantic compilers happy. */
1559	}
1560
1561	static void
1562	addtok_mb (struct dfa *dfa, token t, char mbprop)
1563	{
1564	if (dfa->talloc == dfa->tindex)
1565	{
1566	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1567	sizeof *dfa->tokens);
1568	if (dfa->localeinfo.multibyte)
1569	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1570	sizeof *dfa->multibyte_prop);
1571	}
1572	if (dfa->localeinfo.multibyte)
1573	dfa->multibyte_prop[dfa->tindex] = mbprop;
1574	dfa->tokens[dfa->tindex++] = t;
1575
1576	switch (t)
1577	{
1578	case QMARK:
1579	case STAR:
1580	case PLUS:
1581	break;
1582
1583	case CAT:
1584	case OR:
1585	dfa->parse.depth--;
1586	break;
1587
1588	case EMPTY:
1589	dfa->epsilon = true;
1590	goto increment_depth;
1591
1592	case BACKREF:
1593	dfa->fast = false;
1594	goto increment_nleaves;
1595
1596	case BEGLINE:
1597	case ENDLINE:
1598	case BEGWORD:
1599	case ENDWORD:
1600	case LIMWORD:
1601	case NOTLIMWORD:
1602	dfa->epsilon = true;
1603	FALLTHROUGH;
1604	default:
1605	increment_nleaves:
1606	dfa->nleaves++;
1607	increment_depth:
1608	dfa->parse.depth++;
1609	if (dfa->depth < dfa->parse.depth)
1610	dfa->depth = dfa->parse.depth;
1611	break;
1612	}
1613	}
1614
1615	static void addtok_wc (struct dfa *dfa, wint_t wc);
1616
1617	/* Add the given token to the parse tree, maintaining the depth count and
1618	updating the maximum depth if necessary. */
1619	static void
1620	addtok (struct dfa *dfa, token t)
1621	{
1622	if (dfa->localeinfo.multibyte && t == MBCSET)
1623	{
1624	bool need_or = false;
1625
1626	/* Extract wide characters into alternations for better performance.
1627	This does not require UTF-8. */
1628	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1629	{
1630	addtok_wc (dfa, dfa->lex.brack.chars[i]);
1631	if (need_or)
1632	addtok (dfa, OR);
1633	need_or = true;
1634	}
1635	dfa->lex.brack.nchars = 0;
1636
1637	/* Wide characters have been handled above, so it is possible
1638	that the set is empty now. Do nothing in that case. */
1639	if (dfa->lex.brack.cset != -1)
1640	{
1641	addtok (dfa, CSET + dfa->lex.brack.cset);
1642	if (need_or)
1643	addtok (dfa, OR);
1644	}
1645	}
1646	else
1647	{
1648	addtok_mb (dfa, t, 3);
1649	}
1650	}
1651
1652	/* We treat a multibyte character as a single atom, so that DFA
1653	can treat a multibyte character as a single expression.
1654
1655	e.g., we construct the following tree from "<mb1><mb2>".
1656	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1657	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1658	static void
1659	addtok_wc (struct dfa *dfa, wint_t wc)
1660	{
1661	unsigned char buf[MB_LEN_MAX];
1662	mbstate_t s = { 0 };
1663	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1664	int buflen;
1665
1666	if (stored_bytes != (size_t) -1)
1667	buflen = stored_bytes;
1668	else
1669	{
1670	/* This is merely stop-gap. buf[0] is undefined, yet skipping
1671	the addtok_mb call altogether can corrupt the heap. */
1672	buflen = 1;
1673	buf[0] = 0;
1674	}
1675
1676	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1677	for (int i = 1; i < buflen; i++)
1678	{
1679	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1680	addtok (dfa, CAT);
1681	}
1682	}
1683
1684	static void
1685	add_utf8_anychar (struct dfa *dfa)
1686	{
1687	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1688	UTF-8 byte sequence has been defined as follows:
1689
1690	([\x00-\x7f]
1691	\|[\xc2-\xdf][\x80-\xbf]
1692	\|[\xe0][\xa0-\xbf][\x80-\xbf]
1693	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1694	\|[\xed][\x80-\x9f][\x80-\xbf]
1695	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1696	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1697	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1698
1699	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
1700	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1701	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1702	H = [\x80-\x9f], I = [\xf0],
1703	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1704
1705	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
1706
1707	/* Mnemonics for classes containing two or more bytes. */
1708	enum { A, B, C, E, F, H, J, K, M };
1709
1710	/* Mnemonics for single-byte tokens. */
1711	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1712
1713	static charclass const utf8_classes[] = {
1714	/* A. 00-7f: 1-byte sequence. */
1715	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1716
1717	/* B. c2-df: 1st byte of a 2-byte sequence. */
1718	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1719
1720	/* C. 80-bf: non-leading bytes. */
1721	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1722
1723	/* D. e0 (just a token). */
1724
1725	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
1726	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1727
1728	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1729	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1730
1731	/* G. ed (just a token). */
1732
1733	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
1734	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1735
1736	/* I. f0 (just a token). */
1737
1738	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1739	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1740
1741	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
1742	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1743
1744	/* L. f4 (just a token). */
1745
1746	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1747	CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1748	};
1749
1750	/* Define the character classes that are needed below. */
1751	if (dfa->utf8_anychar_classes[0] == 0)
1752	{
1753	charclass c = utf8_classes[0];
1754	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1755	clrbit ('\n', &c);
1756	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1757	clrbit ('\0', &c);
1758	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1759
1760	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1761	dfa->utf8_anychar_classes[i]
1762	= CSET + charclass_index (dfa, &utf8_classes[i]);
1763	}
1764
1765	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
1766	The token buffer is in reverse Polish order, so we get
1767	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1768	C CAT OR C CAT OR C CAT OR". */
1769	addtok (dfa, dfa->utf8_anychar_classes[A]);
1770	addtok (dfa, dfa->utf8_anychar_classes[B]);
1771	addtok (dfa, D_token);
1772	addtok (dfa, dfa->utf8_anychar_classes[E]);
1773	addtok (dfa, CAT);
1774	addtok (dfa, OR);
1775	addtok (dfa, G_token);
1776	addtok (dfa, dfa->utf8_anychar_classes[H]);
1777	addtok (dfa, CAT);
1778	addtok (dfa, OR);
1779	addtok (dfa, dfa->utf8_anychar_classes[F]);
1780	addtok (dfa, I_token);
1781	addtok (dfa, dfa->utf8_anychar_classes[J]);
1782	addtok (dfa, CAT);
1783	addtok (dfa, OR);
1784	addtok (dfa, L_token);
1785	addtok (dfa, dfa->utf8_anychar_classes[M]);
1786	addtok (dfa, CAT);
1787	addtok (dfa, OR);
1788	addtok (dfa, dfa->utf8_anychar_classes[K]);
1789	for (int i = 0; i < 3; i++)
1790	{
1791	addtok (dfa, dfa->utf8_anychar_classes[C]);
1792	addtok (dfa, CAT);
1793	addtok (dfa, OR);
1794	}
1795	}
1796
1797	/* The grammar understood by the parser is as follows.
1798
1799	regexp:
1800	regexp OR branch
1801	branch
1802
1803	branch:
1804	branch closure
1805	closure
1806
1807	closure:
1808	closure QMARK
1809	closure STAR
1810	closure PLUS
1811	closure REPMN
1812	atom
1813
1814	atom:
1815	<normal character>
1816	<multibyte character>
1817	ANYCHAR
1818	MBCSET
1819	CSET
1820	BACKREF
1821	BEGLINE
1822	ENDLINE
1823	BEGWORD
1824	ENDWORD
1825	LIMWORD
1826	NOTLIMWORD
1827	LPAREN regexp RPAREN
1828	<empty>
1829
1830	The parser builds a parse tree in postfix form in an array of tokens. */
1831
1832	static void
1833	atom (struct dfa *dfa)
1834	{
1835	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1836	\|\| dfa->parse.tok >= CSET
1837	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
1838	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
1839	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
1840	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
1841	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
1842	{
1843	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1844	{
1845	/* For UTF-8 expand the period to a series of CSETs that define a
1846	valid UTF-8 character. This avoids using the slow multibyte
1847	path. I'm pretty sure it would be both profitable and correct to
1848	do it for any encoding; however, the optimization must be done
1849	manually as it is done above in add_utf8_anychar. So, let's
1850	start with UTF-8: it is the most used, and the structure of the
1851	encoding makes the correctness more obvious. */
1852	add_utf8_anychar (dfa);
1853	}
1854	else
1855	addtok (dfa, dfa->parse.tok);
1856	dfa->parse.tok = lex (dfa);
1857	}
1858	else if (dfa->parse.tok == WCHAR)
1859	{
1860	if (dfa->lex.wctok == WEOF)
1861	addtok (dfa, BACKREF);
1862	else
1863	{
1864	addtok_wc (dfa, dfa->lex.wctok);
1865
1866	if (dfa->syntax.case_fold)
1867	{
1868	wchar_t folded[CASE_FOLDED_BUFSIZE];
1869	int n = case_folded_counterparts (dfa->lex.wctok, folded);
1870	for (int i = 0; i < n; i++)
1871	{
1872	addtok_wc (dfa, folded[i]);
1873	addtok (dfa, OR);
1874	}
1875	}
1876	}
1877
1878	dfa->parse.tok = lex (dfa);
1879	}
1880	else if (dfa->parse.tok == LPAREN)
1881	{
1882	dfa->parse.tok = lex (dfa);
1883	regexp (dfa);
1884	if (dfa->parse.tok != RPAREN)
1885	dfaerror (_("unbalanced ("));
1886	dfa->parse.tok = lex (dfa);
1887	}
1888	else
1889	addtok (dfa, EMPTY);
1890	}
1891
1892	/* Return the number of tokens in the given subexpression. */
1893	static idx_t _GL_ATTRIBUTE_PURE
1894	nsubtoks (struct dfa const *dfa, idx_t tindex)
1895	{
1896	switch (dfa->tokens[tindex - 1])
1897	{
1898	default:
1899	return 1;
1900	case QMARK:
1901	case STAR:
1902	case PLUS:
1903	return 1 + nsubtoks (dfa, tindex - 1);
1904	case CAT:
1905	case OR:
1906	{
1907	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1908	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1909	}
1910	}
1911	}
1912
1913	/* Copy the given subexpression to the top of the tree. */
1914	static void
1915	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1916	{
1917	if (dfa->localeinfo.multibyte)
1918	for (idx_t i = 0; i < ntokens; i++)
1919	addtok_mb (dfa, dfa->tokens[tindex + i],
1920	dfa->multibyte_prop[tindex + i]);
1921	else
1922	for (idx_t i = 0; i < ntokens; i++)
1923	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1924	}
1925
1926	static void
1927	closure (struct dfa *dfa)
1928	{
1929	atom (dfa);
1930	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
1931	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
1932	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
1933	{
1934	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1935	idx_t tindex = dfa->tindex - ntokens;
1936	if (dfa->lex.maxrep < 0)
1937	addtok (dfa, PLUS);
1938	if (dfa->lex.minrep == 0)
1939	addtok (dfa, QMARK);
1940	int i;
1941	for (i = 1; i < dfa->lex.minrep; i++)
1942	{
1943	copytoks (dfa, tindex, ntokens);
1944	addtok (dfa, CAT);
1945	}
1946	for (; i < dfa->lex.maxrep; i++)
1947	{
1948	copytoks (dfa, tindex, ntokens);
1949	addtok (dfa, QMARK);
1950	addtok (dfa, CAT);
1951	}
1952	dfa->parse.tok = lex (dfa);
1953	}
1954	else if (dfa->parse.tok == REPMN)
1955	{
1956	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1957	dfa->parse.tok = lex (dfa);
1958	closure (dfa);
1959	}
1960	else
1961	{
1962	addtok (dfa, dfa->parse.tok);
1963	dfa->parse.tok = lex (dfa);
1964	}
1965	}
1966
1967	static void
1968	branch (struct dfa* dfa)
1969	{
1970	closure (dfa);
1971	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1972	&& dfa->parse.tok >= 0)
1973	{
1974	closure (dfa);
1975	addtok (dfa, CAT);
1976	}
1977	}
1978
1979	static void
1980	regexp (struct dfa *dfa)
1981	{
1982	branch (dfa);
1983	while (dfa->parse.tok == OR)
1984	{
1985	dfa->parse.tok = lex (dfa);
1986	branch (dfa);
1987	addtok (dfa, OR);
1988	}
1989	}
1990
1991	/* Parse a string S of length LEN into D. S can include NUL characters.
1992	This is the main entry point for the parser. */
1993	void
1994	dfaparse (char const s, idx_t len, struct dfa d)
1995	{
1996	d->lex.ptr = s;
1997	d->lex.left = len;
1998	d->lex.lasttok = END;
1999	d->lex.laststart = true;
2000
2001	if (!d->syntax.syntax_bits_set)
2002	dfaerror (_("no syntax specified"));
2003
2004	if (!d->nregexps)
2005	addtok (d, BEG);
2006
2007	d->parse.tok = lex (d);
2008	d->parse.depth = d->depth;
2009
2010	regexp (d);
2011
2012	if (d->parse.tok != END)
2013	dfaerror (_("unbalanced )"));
2014
2015	addtok (d, END - d->nregexps);
2016	addtok (d, CAT);
2017
2018	if (d->nregexps)
2019	addtok (d, OR);
2020
2021	++d->nregexps;
2022	}
2023
2024	/* Some primitives for operating on sets of positions. */
2025
2026	/* Copy one set to another. */
2027	static void
2028	copy (position_set const src, position_set dst)
2029	{
2030	if (dst->alloc < src->nelem)
2031	{
2032	free (dst->elems);
2033	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2034	sizeof *dst->elems);
2035	}
2036	dst->nelem = src->nelem;
2037	if (src->nelem != 0)
2038	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2039	}
2040
2041	static void
2042	alloc_position_set (position_set *s, idx_t size)
2043	{
2044	s->elems = xnmalloc (size, sizeof *s->elems);
2045	s->alloc = size;
2046	s->nelem = 0;
2047	}
2048
2049	/* Insert position P in set S. S is maintained in sorted order on
2050	decreasing index. If there is already an entry in S with P.index
2051	then merge (logically-OR) P's constraints into the one in S.
2052	S->elems must point to an array large enough to hold the resulting set. */
2053	static void
2054	insert (position p, position_set *s)
2055	{
2056	idx_t count = s->nelem;
2057	idx_t lo = 0, hi = count;
2058	while (lo < hi)
2059	{
2060	idx_t mid = (lo + hi) >> 1;
2061	if (s->elems[mid].index < p.index)
2062	lo = mid + 1;
2063	else if (s->elems[mid].index == p.index)
2064	{
2065	s->elems[mid].constraint \|= p.constraint;
2066	return;
2067	}
2068	else
2069	hi = mid;
2070	}
2071
2072	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2073	for (idx_t i = count; i > lo; i--)
2074	s->elems[i] = s->elems[i - 1];
2075	s->elems[lo] = p;
2076	++s->nelem;
2077	}
2078
2079	static void
2080	append (position p, position_set *s)
2081	{
2082	idx_t count = s->nelem;
2083	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2084	s->elems[s->nelem++] = p;
2085	}
2086
2087	/* Merge S1 and S2 (with the additional constraint C2) into M. The
2088	result is as if the positions of S1, and of S2 with the additional
2089	constraint C2, were inserted into an initially empty set. */
2090	static void
2091	merge_constrained (position_set const s1, position_set const s2,
2092	unsigned int c2, position_set *m)
2093	{
2094	idx_t i = 0, j = 0;
2095
2096	if (m->alloc - s1->nelem < s2->nelem)
2097	{
2098	free (m->elems);
2099	m->alloc = s1->nelem;
2100	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2101	}
2102	m->nelem = 0;
2103	while (i < s1->nelem \|\| j < s2->nelem)
2104	if (! (j < s2->nelem)
2105	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2106	{
2107	unsigned int c = ((i < s1->nelem && j < s2->nelem
2108	&& s1->elems[i].index == s2->elems[j].index)
2109	? s2->elems[j++].constraint & c2
2110	: 0);
2111	m->elems[m->nelem].index = s1->elems[i].index;
2112	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
2113	}
2114	else
2115	{
2116	if (s2->elems[j].constraint & c2)
2117	{
2118	m->elems[m->nelem].index = s2->elems[j].index;
2119	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2120	}
2121	j++;
2122	}
2123	}
2124
2125	/* Merge two sets of positions into a third. The result is exactly as if
2126	the positions of both sets were inserted into an initially empty set. */
2127	static void
2128	merge (position_set const s1, position_set const s2, position_set *m)
2129	{
2130	merge_constrained (s1, s2, -1, m);
2131	}
2132
2133	/* Merge into DST all the elements of SRC, possibly destroying
2134	the contents of the temporary M. */
2135	static void
2136	merge2 (position_set dst, position_set const src, position_set *m)
2137	{
2138	if (src->nelem < 4)
2139	{
2140	for (idx_t i = 0; i < src->nelem; i++)
2141	insert (src->elems[i], dst);
2142	}
2143	else
2144	{
2145	merge (src, dst, m);
2146	copy (m, dst);
2147	}
2148	}
2149
2150	/* Delete a position from a set. Return the nonzero constraint of the
2151	deleted position, or zero if there was no such position. */
2152	static unsigned int
2153	delete (idx_t del, position_set *s)
2154	{
2155	idx_t count = s->nelem;
2156	idx_t lo = 0, hi = count;
2157	while (lo < hi)
2158	{
2159	idx_t mid = (lo + hi) >> 1;
2160	if (s->elems[mid].index < del)
2161	lo = mid + 1;
2162	else if (s->elems[mid].index == del)
2163	{
2164	unsigned int c = s->elems[mid].constraint;
2165	idx_t i;
2166	for (i = mid; i + 1 < count; i++)
2167	s->elems[i] = s->elems[i + 1];
2168	s->nelem = i;
2169	return c;
2170	}
2171	else
2172	hi = mid;
2173	}
2174	return 0;
2175	}
2176
2177	/* Replace a position with the followed set. */
2178	static void
2179	replace (position_set dst, idx_t del, position_set add,
2180	unsigned int constraint, position_set *tmp)
2181	{
2182	unsigned int c = delete (del, dst) & constraint;
2183
2184	if (c)
2185	{
2186	copy (dst, tmp);
2187	merge_constrained (tmp, add, c, dst);
2188	}
2189	}
2190
2191	/* Find the index of the state corresponding to the given position set with
2192	the given preceding context, or create a new state if there is no such
2193	state. Context tells whether we got here on a newline or letter. */
2194	static state_num
2195	state_index (struct dfa d, position_set const s, int context)
2196	{
2197	size_t hash = 0;
2198	int constraint = 0;
2199	state_num i;
2200
2201	for (i = 0; i < s->nelem; ++i)
2202	{
2203	idx_t ind = s->elems[i].index;
2204	hash ^= ind + s->elems[i].constraint;
2205	}
2206
2207	/* Try to find a state that exactly matches the proposed one. */
2208	for (i = 0; i < d->sindex; ++i)
2209	{
2210	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
2211	\|\| context != d->states[i].context)
2212	continue;
2213	state_num j;
2214	for (j = 0; j < s->nelem; ++j)
2215	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2216	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
2217	break;
2218	if (j == s->nelem)
2219	return i;
2220	}
2221
2222	#ifdef DEBUG
2223	fprintf (stderr, "new state %td\n nextpos:", i);
2224	for (state_num j = 0; j < s->nelem; j++)
2225	{
2226	fprintf (stderr, " %td:", s->elems[j].index);
2227	prtok (d->tokens[s->elems[j].index]);
2228	}
2229	fprintf (stderr, "\n context:");
2230	if (context ^ CTX_ANY)
2231	{
2232	if (context & CTX_NONE)
2233	fprintf (stderr, " CTX_NONE");
2234	if (context & CTX_LETTER)
2235	fprintf (stderr, " CTX_LETTER");
2236	if (context & CTX_NEWLINE)
2237	fprintf (stderr, " CTX_NEWLINE");
2238	}
2239	else
2240	fprintf (stderr, " CTX_ANY");
2241	fprintf (stderr, "\n");
2242	#endif
2243
2244	for (state_num j = 0; j < s->nelem; j++)
2245	{
2246	int c = d->constraints[s->elems[j].index];
2247
2248	if (c != 0)
2249	{
2250	if (succeeds_in_context (c, context, CTX_ANY))
2251	constraint \|= c;
2252	}
2253	else if (d->tokens[s->elems[j].index] == BACKREF)
2254	constraint = NO_CONSTRAINT;
2255	}
2256
2257
2258	/* Create a new state. */
2259	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2260	sizeof *d->states);
2261	d->states[i].hash = hash;
2262	alloc_position_set (&d->states[i].elems, s->nelem);
2263	copy (s, &d->states[i].elems);
2264	d->states[i].context = context;
2265	d->states[i].constraint = constraint;
2266	d->states[i].mbps.nelem = 0;
2267	d->states[i].mbps.elems = NULL;
2268	d->states[i].mb_trindex = -1;
2269
2270	++d->sindex;
2271
2272	return i;
2273	}
2274
2275	/* Find the epsilon closure of D's set of positions. If any position of the set
2276	contains a symbol that matches the empty string in some context, replace
2277	that position with the elements of its follow labeled with an appropriate
2278	constraint. Repeat exhaustively until no funny positions are left.
2279	S->elems must be large enough to hold the result. BACKWARD is D's
2280	backward set; use and update it too. */
2281	static void
2282	epsclosure (struct dfa const d, position_set backward)
2283	{
2284	position_set tmp;
2285	alloc_position_set (&tmp, d->nleaves);
2286	for (idx_t i = 0; i < d->tindex; i++)
2287	if (0 < d->follows[i].nelem)
2288	{
2289	unsigned int constraint;
2290	switch (d->tokens[i])
2291	{
2292	default:
2293	continue;
2294
2295	case BEGLINE:
2296	constraint = BEGLINE_CONSTRAINT;
2297	break;
2298	case ENDLINE:
2299	constraint = ENDLINE_CONSTRAINT;
2300	break;
2301	case BEGWORD:
2302	constraint = BEGWORD_CONSTRAINT;
2303	break;
2304	case ENDWORD:
2305	constraint = ENDWORD_CONSTRAINT;
2306	break;
2307	case LIMWORD:
2308	constraint = LIMWORD_CONSTRAINT;
2309	break;
2310	case NOTLIMWORD:
2311	constraint = NOTLIMWORD_CONSTRAINT;
2312	break;
2313	case EMPTY:
2314	constraint = NO_CONSTRAINT;
2315	break;
2316	}
2317
2318	delete (i, &d->follows[i]);
2319
2320	for (idx_t j = 0; j < backward[i].nelem; j++)
2321	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2322	constraint, &tmp);
2323	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2324	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2325	NO_CONSTRAINT, &tmp);
2326	}
2327	free (tmp.elems);
2328	}
2329
2330	/* Returns the set of contexts for which there is at least one
2331	character included in C. */
2332
2333	static int
2334	charclass_context (struct dfa const dfa, charclass const c)
2335	{
2336	int context = 0;
2337
2338	for (int j = 0; j < CHARCLASS_WORDS; j++)
2339	{
2340	if (c->w[j] & dfa->syntax.newline.w[j])
2341	context \|= CTX_NEWLINE;
2342	if (c->w[j] & dfa->syntax.letters.w[j])
2343	context \|= CTX_LETTER;
2344	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
2345	context \|= CTX_NONE;
2346	}
2347
2348	return context;
2349	}
2350
2351	/* Returns the contexts on which the position set S depends. Each context
2352	in the set of returned contexts (let's call it SC) may have a different
2353	follow set than other contexts in SC, and also different from the
2354	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2355	in the complement set will have the same follow set. */
2356
2357	static int _GL_ATTRIBUTE_PURE
2358	state_separate_contexts (struct dfa d, position_set const s)
2359	{
2360	int separate_contexts = 0;
2361
2362	for (idx_t j = 0; j < s->nelem; j++)
2363	separate_contexts \|= d->separates[s->elems[j].index];
2364
2365	return separate_contexts;
2366	}
2367
2368	enum
2369	{
2370	/* Single token is repeated. It is distinguished from non-repeated. */
2371	OPT_REPEAT = (1 << 0),
2372
2373	/* Multiple tokens are repeated. This flag is on at head of tokens. The
2374	node is not merged. */
2375	OPT_LPAREN = (1 << 1),
2376
2377	/* Multiple branches are joined. The node is not merged. */
2378	OPT_RPAREN = (1 << 2),
2379
2380	/* The node is walked. If the node is found in walking again, OPT_RPAREN
2381	flag is turned on. */
2382	OPT_WALKED = (1 << 3),
2383
2384	/* The node is queued. The node is not queued again. */
2385	OPT_QUEUED = (1 << 4)
2386	};
2387
2388	static void
2389	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
2390	position_set *merged)
2391	{
2392	position_set *follows = d->follows;
2393	idx_t nelem = 0;
2394
2395	for (idx_t i = 0; i < follows[tindex].nelem; i++)
2396	{
2397	idx_t sindex = follows[tindex].elems[i].index;
2398
2399	/* Skip the node as pruned in future. */
2400	unsigned int iconstraint = follows[tindex].elems[i].constraint;
2401	if (iconstraint == 0)
2402	continue;
2403
2404	if (d->tokens[follows[tindex].elems[i].index] <= END)
2405	{
2406	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
2407	continue;
2408	}
2409
2410	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
2411	{
2412	idx_t j;
2413
2414	for (j = 0; j < nelem; j++)
2415	{
2416	idx_t dindex = follows[tindex].elems[j].index;
2417
2418	if (dindex == tindex)
2419	continue;
2420
2421	if (follows[tindex].elems[j].constraint != iconstraint)
2422	continue;
2423
2424	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
2425	continue;
2426
2427	if (d->tokens[sindex] != d->tokens[dindex])
2428	continue;
2429
2430	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2431	continue;
2432
2433	if (flags[sindex] & OPT_REPEAT)
2434	delete (sindex, &follows[sindex]);
2435
2436	merge2 (&follows[dindex], &follows[sindex], merged);
2437
2438	break;
2439	}
2440
2441	if (j < nelem)
2442	continue;
2443	}
2444
2445	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2446	flags[sindex] \|= OPT_QUEUED;
2447	}
2448
2449	follows[tindex].nelem = nelem;
2450	}
2451
2452	static int
2453	compare (const void a, const void b)
2454	{
2455	position const p = a, q = b;
2456	return (p->index > q->index) - (p->index < q->index);
2457	}
2458
2459	static void
2460	reorder_tokens (struct dfa *d)
2461	{
2462	idx_t nleaves = 0;
2463	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
2464	map[0] = nleaves++;
2465	for (idx_t i = 1; i < d->tindex; i++)
2466	map[i] = -1;
2467
2468	token tokens = xnmalloc (d->nleaves, sizeof tokens);
2469	position_set follows = xnmalloc (d->nleaves, sizeof follows);
2470	int constraints = xnmalloc (d->nleaves, sizeof constraints);
2471	char *multibyte_prop = (d->localeinfo.multibyte
2472	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2473	: NULL);
2474
2475	for (idx_t i = 0; i < d->tindex; i++)
2476	{
2477	if (map[i] < 0)
2478	{
2479	free (d->follows[i].elems);
2480	d->follows[i].elems = NULL;
2481	d->follows[i].nelem = 0;
2482	continue;
2483	}
2484
2485	tokens[map[i]] = d->tokens[i];
2486	follows[map[i]] = d->follows[i];
2487	constraints[map[i]] = d->constraints[i];
2488
2489	if (multibyte_prop != NULL)
2490	multibyte_prop[map[i]] = d->multibyte_prop[i];
2491
2492	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2493	{
2494	if (map[d->follows[i].elems[j].index] == -1)
2495	map[d->follows[i].elems[j].index] = nleaves++;
2496
2497	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2498	}
2499
2500	qsort (d->follows[i].elems, d->follows[i].nelem,
2501	sizeof *d->follows[i].elems, compare);
2502	}
2503
2504	for (idx_t i = 0; i < nleaves; i++)
2505	{
2506	d->tokens[i] = tokens[i];
2507	d->follows[i] = follows[i];
2508	d->constraints[i] = constraints[i];
2509
2510	if (multibyte_prop != NULL)
2511	d->multibyte_prop[i] = multibyte_prop[i];
2512	}
2513
2514	d->tindex = d->nleaves = nleaves;
2515
2516	free (tokens);
2517	free (follows);
2518	free (constraints);
2519	free (multibyte_prop);
2520	free (map);
2521	}
2522
2523	static void
2524	dfaoptimize (struct dfa *d)
2525	{
2526	char *flags = xizalloc (d->tindex);
2527
2528	for (idx_t i = 0; i < d->tindex; i++)
2529	{
2530	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2531	{
2532	if (d->follows[i].elems[j].index == i)
2533	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
2534	else if (d->follows[i].elems[j].index < i)
2535	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
2536	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2537	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
2538	else
2539	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
2540	}
2541	}
2542
2543	flags[0] \|= OPT_QUEUED;
2544
2545	position_set merged0;
2546	position_set *merged = &merged0;
2547	alloc_position_set (merged, d->nleaves);
2548
2549	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2550
2551	for (idx_t i = 0; i < d->tindex; i++)
2552	if (flags[i] & OPT_QUEUED)
2553	merge_nfa_state (d, i, flags, merged);
2554
2555	reorder_tokens (d);
2556
2557	free (merged->elems);
2558	free (flags);
2559	}
2560
2561	/* Perform bottom-up analysis on the parse tree, computing various functions.
2562	Note that at this point, we're pretending constructs like \< are real
2563	characters rather than constraints on what can follow them.
2564
2565	Nullable: A node is nullable if it is at the root of a regexp that can
2566	match the empty string.
2567	* EMPTY leaves are nullable.
2568	* No other leaf is nullable.
2569	* A QMARK or STAR node is nullable.
2570	* A PLUS node is nullable if its argument is nullable.
2571	* A CAT node is nullable if both its arguments are nullable.
2572	* An OR node is nullable if either argument is nullable.
2573
2574	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2575	that could correspond to the first character of a string matching the
2576	regexp rooted at the given node.
2577	* EMPTY leaves have empty firstpos.
2578	* The firstpos of a nonempty leaf is that leaf itself.
2579	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2580	argument.
2581	* The firstpos of a CAT node is the firstpos of the left argument, union
2582	the firstpos of the right if the left argument is nullable.
2583	* The firstpos of an OR node is the union of firstpos of each argument.
2584
2585	Lastpos: The lastpos of a node is the set of positions that could
2586	correspond to the last character of a string matching the regexp at
2587	the given node.
2588	* EMPTY leaves have empty lastpos.
2589	* The lastpos of a nonempty leaf is that leaf itself.
2590	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2591	argument.
2592	* The lastpos of a CAT node is the lastpos of its right argument, union
2593	the lastpos of the left if the right argument is nullable.
2594	* The lastpos of an OR node is the union of the lastpos of each argument.
2595
2596	Follow: The follow of a position is the set of positions that could
2597	correspond to the character following a character matching the node in
2598	a string matching the regexp. At this point we consider special symbols
2599	that match the empty string in some context to be just normal characters.
2600	Later, if we find that a special symbol is in a follow set, we will
2601	replace it with the elements of its follow, labeled with an appropriate
2602	constraint.
2603	* Every node in the firstpos of the argument of a STAR or PLUS node is in
2604	the follow of every node in the lastpos.
2605	* Every node in the firstpos of the second argument of a CAT node is in
2606	the follow of every node in the lastpos of the first argument.
2607
2608	Because of the postfix representation of the parse tree, the depth-first
2609	analysis is conveniently done by a linear scan with the aid of a stack.
2610	Sets are stored as arrays of the elements, obeying a stack-like allocation
2611	scheme; the number of elements in each set deeper in the stack can be
2612	used to determine the address of a particular set's array. */
2613	static void
2614	dfaanalyze (struct dfa *d, bool searchflag)
2615	{
2616	/* Array allocated to hold position sets. */
2617	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
2618	/* Firstpos and lastpos elements. */
2619	position *firstpos = posalloc;
2620	position *lastpos = firstpos + d->nleaves;
2621	position pos;
2622	position_set tmp;
2623
2624	/* Stack for element counts and nullable flags. */
2625	struct
2626	{
2627	/* Whether the entry is nullable. */
2628	bool nullable;
2629
2630	/* Counts of firstpos and lastpos sets. */
2631	idx_t nfirstpos;
2632	idx_t nlastpos;
2633	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
2634
2635	position_set merged; /* Result of merging sets. */
2636
2637	addtok (d, CAT);
2638	idx_t tindex = d->tindex;
2639
2640	#ifdef DEBUG
2641	fprintf (stderr, "dfaanalyze:\n");
2642	for (idx_t i = 0; i < tindex; i++)
2643	{
2644	fprintf (stderr, " %td:", i);
2645	prtok (d->tokens[i]);
2646	}
2647	putc ('\n', stderr);
2648	#endif
2649
2650	d->searchflag = searchflag;
2651	alloc_position_set (&merged, d->nleaves);
2652	d->follows = xicalloc (tindex, sizeof *d->follows);
2653	position_set *backward
2654	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2655
2656	for (idx_t i = 0; i < tindex; i++)
2657	{
2658	switch (d->tokens[i])
2659	{
2660	case EMPTY:
2661	/* The empty set is nullable. */
2662	stk->nullable = true;
2663
2664	/* The firstpos and lastpos of the empty leaf are both empty. */
2665	stk->nfirstpos = stk->nlastpos = 0;
2666	stk++;
2667	break;
2668
2669	case STAR:
2670	case PLUS:
2671	/* Every element in the lastpos of the argument is in the backward
2672	set of every element in the firstpos. */
2673	if (d->epsilon)
2674	{
2675	tmp.elems = lastpos - stk[-1].nlastpos;
2676	tmp.nelem = stk[-1].nlastpos;
2677	for (position *p = firstpos - stk[-1].nfirstpos;
2678	p < firstpos; p++)
2679	merge2 (&backward[p->index], &tmp, &merged);
2680	}
2681
2682	/* Every element in the firstpos of the argument is in the follow
2683	of every element in the lastpos. */
2684	{
2685	tmp.elems = firstpos - stk[-1].nfirstpos;
2686	tmp.nelem = stk[-1].nfirstpos;
2687	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2688	merge2 (&d->follows[p->index], &tmp, &merged);
2689	}
2690	FALLTHROUGH;
2691	case QMARK:
2692	/* A QMARK or STAR node is automatically nullable. */
2693	if (d->tokens[i] != PLUS)
2694	stk[-1].nullable = true;
2695	break;
2696
2697	case CAT:
2698	/* Every element in the lastpos of the first argument is in
2699	the backward set of every element in the firstpos of the
2700	second argument. */
2701	if (backward)
2702	{
2703	tmp.nelem = stk[-2].nlastpos;
2704	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2705	for (position *p = firstpos - stk[-1].nfirstpos;
2706	p < firstpos; p++)
2707	merge2 (&backward[p->index], &tmp, &merged);
2708	}
2709
2710	/* Every element in the firstpos of the second argument is in the
2711	follow of every element in the lastpos of the first argument. */
2712	{
2713	tmp.nelem = stk[-1].nfirstpos;
2714	tmp.elems = firstpos - stk[-1].nfirstpos;
2715	for (position *plim = lastpos - stk[-1].nlastpos,
2716	*p = plim - stk[-2].nlastpos;
2717	p < plim; p++)
2718	merge2 (&d->follows[p->index], &tmp, &merged);
2719	}
2720
2721	/* The firstpos of a CAT node is the firstpos of the first argument,
2722	union that of the second argument if the first is nullable. */
2723	if (stk[-2].nullable)
2724	stk[-2].nfirstpos += stk[-1].nfirstpos;
2725	else
2726	firstpos -= stk[-1].nfirstpos;
2727
2728	/* The lastpos of a CAT node is the lastpos of the second argument,
2729	union that of the first argument if the second is nullable. */
2730	if (stk[-1].nullable)
2731	stk[-2].nlastpos += stk[-1].nlastpos;
2732	else
2733	{
2734	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2735	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2736	p[j] = p[j + stk[-2].nlastpos];
2737	lastpos -= stk[-2].nlastpos;
2738	stk[-2].nlastpos = stk[-1].nlastpos;
2739	}
2740
2741	/* A CAT node is nullable if both arguments are nullable. */
2742	stk[-2].nullable &= stk[-1].nullable;
2743	stk--;
2744	break;
2745
2746	case OR:
2747	/* The firstpos is the union of the firstpos of each argument. */
2748	stk[-2].nfirstpos += stk[-1].nfirstpos;
2749
2750	/* The lastpos is the union of the lastpos of each argument. */
2751	stk[-2].nlastpos += stk[-1].nlastpos;
2752
2753	/* An OR node is nullable if either argument is nullable. */
2754	stk[-2].nullable \|= stk[-1].nullable;
2755	stk--;
2756	break;
2757
2758	default:
2759	/* Anything else is a nonempty position. (Note that special
2760	constructs like \< are treated as nonempty strings here;
2761	an "epsilon closure" effectively makes them nullable later.
2762	Backreferences have to get a real position so we can detect
2763	transitions on them later. But they are nullable. */
2764	stk->nullable = d->tokens[i] == BACKREF;
2765
2766	/* This position is in its own firstpos and lastpos. */
2767	stk->nfirstpos = stk->nlastpos = 1;
2768	stk++;
2769
2770	firstpos->index = lastpos->index = i;
2771	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2772	firstpos++, lastpos++;
2773
2774	break;
2775	}
2776	#ifdef DEBUG
2777	/* ... balance the above nonsyntactic #ifdef goo... */
2778	fprintf (stderr, "node %td:", i);
2779	prtok (d->tokens[i]);
2780	putc ('\n', stderr);
2781	fprintf (stderr,
2782	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2783	fprintf (stderr, " firstpos:");
2784	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2785	{
2786	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2787	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2788	}
2789	fprintf (stderr, "\n lastpos:");
2790	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2791	{
2792	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2793	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2794	}
2795	putc ('\n', stderr);
2796	#endif
2797	}
2798
2799	if (backward)
2800	{
2801	/* For each follow set that is the follow set of a real position,
2802	replace it with its epsilon closure. */
2803	epsclosure (d, backward);
2804
2805	for (idx_t i = 0; i < tindex; i++)
2806	free (backward[i].elems);
2807	free (backward);
2808	}
2809
2810	dfaoptimize (d);
2811
2812	#ifdef DEBUG
2813	for (idx_t i = 0; i < tindex; i++)
2814	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
2815	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
2816	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
2817	{
2818	fprintf (stderr, "follows(%td:", i);
2819	prtok (d->tokens[i]);
2820	fprintf (stderr, "):");
2821	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2822	{
2823	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2824	prtok (d->tokens[d->follows[i].elems[j].index]);
2825	}
2826	putc ('\n', stderr);
2827	}
2828	#endif
2829
2830	pos.index = 0;
2831	pos.constraint = NO_CONSTRAINT;
2832
2833	alloc_position_set (&tmp, 1);
2834
2835	append (pos, &tmp);
2836
2837	d->separates = xicalloc (tindex, sizeof *d->separates);
2838
2839	for (idx_t i = 0; i < tindex; i++)
2840	{
2841	if (prev_newline_dependent (d->constraints[i]))
2842	d->separates[i] \|= CTX_NEWLINE;
2843	if (prev_letter_dependent (d->constraints[i]))
2844	d->separates[i] \|= CTX_LETTER;
2845
2846	for (idx_t j = 0; j < d->follows[i].nelem; j++)
2847	{
2848	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2849	d->separates[i] \|= CTX_NEWLINE;
2850	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2851	d->separates[i] \|= CTX_LETTER;
2852	}
2853	}
2854
2855	/* Context wanted by some position. */
2856	int separate_contexts = state_separate_contexts (d, &tmp);
2857
2858	/* Build the initial state. */
2859	if (separate_contexts & CTX_NEWLINE)
2860	state_index (d, &tmp, CTX_NEWLINE);
2861	d->initstate_notbol = d->min_trcount
2862	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2863	if (separate_contexts & CTX_LETTER)
2864	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2865	d->min_trcount++;
2866	d->trcount = 0;
2867
2868	free (posalloc);
2869	free (stkalloc);
2870	free (merged.elems);
2871	free (tmp.elems);
2872	}
2873
2874	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2875	static void
2876	realloc_trans_if_necessary (struct dfa *d)
2877	{
2878	state_num oldalloc = d->tralloc;
2879	if (oldalloc < d->sindex)
2880	{
2881	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2882	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2883	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2884	-1, sizeof *realtrans);
2885	realtrans[0] = realtrans[1] = NULL;
2886	d->trans = realtrans + 2;
2887	idx_t newalloc = d->tralloc = newalloc1 - 2;
2888	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2889	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2890	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2891	if (d->localeinfo.multibyte)
2892	{
2893	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2894	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2895	if (oldalloc == 0)
2896	realtrans[0] = realtrans[1] = NULL;
2897	d->mb_trans = realtrans + 2;
2898	}
2899	for (; oldalloc < newalloc; oldalloc++)
2900	{
2901	d->trans[oldalloc] = NULL;
2902	d->fails[oldalloc] = NULL;
2903	if (d->localeinfo.multibyte)
2904	d->mb_trans[oldalloc] = NULL;
2905	}
2906	}
2907	}
2908
2909	/*
2910	Calculate the transition table for a new state derived from state s
2911	for a compiled dfa d after input character uc, and return the new
2912	state number.
2913
2914	Do not worry about all possible input characters; calculate just the group
2915	of positions that match uc. Label it with the set of characters that
2916	every position in the group matches (taking into account, if necessary,
2917	preceding context information of s). Then find the union
2918	of these positions' follows, i.e., the set of positions of the
2919	new state. For each character in the group's label, set the transition
2920	on this character to be to a state corresponding to the set's positions,
2921	and its associated backward context information, if necessary.
2922
2923	When building a searching matcher, include the positions of state
2924	0 in every state.
2925
2926	The group is constructed by building an equivalence-class
2927	partition of the positions of s.
2928
2929	For each position, find the set of characters C that it matches. Eliminate
2930	any characters from C that fail on grounds of backward context.
2931
2932	Check whether the group's label L has nonempty
2933	intersection with C. If L - C is nonempty, create a new group labeled
2934	L - C and having the same positions as the current group, and set L to
2935	the intersection of L and C. Insert the position in the group, set
2936	C = C - L, and resume scanning.
2937
2938	If after comparing with every group there are characters remaining in C,
2939	create a new group labeled with the characters of C and insert this
2940	position in that group. */
2941
2942	static state_num
2943	build_state (state_num s, struct dfa *d, unsigned char uc)
2944	{
2945	position_set follows; /* Union of the follows for each
2946	position of the current state. */
2947	position_set group; /* Positions that match the input char. */
2948	position_set tmp; /* Temporary space for merging sets. */
2949	state_num state; /* New state. */
2950	state_num state_newline; /* New state on a newline transition. */
2951	state_num state_letter; /* New state on a letter transition. */
2952
2953	#ifdef DEBUG
2954	fprintf (stderr, "build state %td\n", s);
2955	#endif
2956
2957	/* A pointer to the new transition table, and the table itself. */
2958	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2959	state_num trans = ptrans;
2960
2961	if (!trans)
2962	{
2963	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
2964	transition tables that can exist at once, other than for
2965	initial states. Often-used transition tables are quickly
2966	rebuilt, whereas rarely-used ones are cleared away. */
2967	if (MAX_TRCOUNT <= d->trcount)
2968	{
2969	for (state_num i = d->min_trcount; i < d->tralloc; i++)
2970	{
2971	free (d->trans[i]);
2972	free (d->fails[i]);
2973	d->trans[i] = d->fails[i] = NULL;
2974	}
2975	d->trcount = 0;
2976	}
2977
2978	d->trcount++;
2979	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
2980
2981	/* Fill transition table with a default value which means that the
2982	transited state has not been calculated yet. */
2983	for (int i = 0; i < NOTCHAR; i++)
2984	trans[i] = -2;
2985	}
2986
2987	/* Set up the success bits for this state. */
2988	d->success[s] = 0;
2989	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2990	d->success[s] \|= CTX_NEWLINE;
2991	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2992	d->success[s] \|= CTX_LETTER;
2993	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2994	d->success[s] \|= CTX_NONE;
2995
2996	alloc_position_set (&follows, d->nleaves);
2997
2998	/* Find the union of the follows of the positions of the group.
2999	This is a hideously inefficient loop. Fix it someday. */
3000	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3001	for (idx_t k = 0;
3002	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3003	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3004	&follows);
3005
3006	/* Positions that match the input char. */
3007	alloc_position_set (&group, d->nleaves);
3008
3009	/* The group's label. */
3010	charclass label;
3011	fillset (&label);
3012
3013	for (idx_t i = 0; i < follows.nelem; i++)
3014	{
3015	charclass matches; /* Set of matching characters. */
3016	position pos = follows.elems[i];
3017	bool matched = false;
3018	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3019	{
3020	zeroset (&matches);
3021	setbit (d->tokens[pos.index], &matches);
3022	if (d->tokens[pos.index] == uc)
3023	matched = true;
3024	}
3025	else if (d->tokens[pos.index] >= CSET)
3026	{
3027	matches = d->charclasses[d->tokens[pos.index] - CSET];
3028	if (tstbit (uc, &matches))
3029	matched = true;
3030	}
3031	else if (d->tokens[pos.index] == ANYCHAR)
3032	{
3033	matches = d->charclasses[d->canychar];
3034	if (tstbit (uc, &matches))
3035	matched = true;
3036
3037	/* ANYCHAR must match with a single character, so we must put
3038	it to D->states[s].mbps which contains the positions which
3039	can match with a single character not a byte. If all
3040	positions which has ANYCHAR does not depend on context of
3041	next character, we put the follows instead of it to
3042	D->states[s].mbps to optimize. */
3043	if (succeeds_in_context (pos.constraint, d->states[s].context,
3044	CTX_NONE))
3045	{
3046	if (d->states[s].mbps.nelem == 0)
3047	alloc_position_set (&d->states[s].mbps, 1);
3048	insert (pos, &d->states[s].mbps);
3049	}
3050	}
3051	else
3052	continue;
3053
3054	/* Some characters may need to be eliminated from matches because
3055	they fail in the current context. */
3056	if (pos.constraint != NO_CONSTRAINT)
3057	{
3058	if (!succeeds_in_context (pos.constraint,
3059	d->states[s].context, CTX_NEWLINE))
3060	for (int j = 0; j < CHARCLASS_WORDS; j++)
3061	matches.w[j] &= ~d->syntax.newline.w[j];
3062	if (!succeeds_in_context (pos.constraint,
3063	d->states[s].context, CTX_LETTER))
3064	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3065	matches.w[j] &= ~d->syntax.letters.w[j];
3066	if (!succeeds_in_context (pos.constraint,
3067	d->states[s].context, CTX_NONE))
3068	for (int j = 0; j < CHARCLASS_WORDS; ++j)
3069	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
3070
3071	/* If there are no characters left, there's no point in going on. */
3072	if (emptyset (&matches))
3073	continue;
3074
3075	/* If we have reset the bit that made us declare "matched", reset
3076	that indicator, too. This is required to avoid an infinite loop
3077	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
3078	if (!tstbit (uc, &matches))
3079	matched = false;
3080	}
3081
3082	#ifdef DEBUG
3083	fprintf (stderr, " nextpos %td:", pos.index);
3084	prtok (d->tokens[pos.index]);
3085	fprintf (stderr, " of");
3086	for (unsigned j = 0; j < NOTCHAR; j++)
3087	if (tstbit (j, &matches))
3088	fprintf (stderr, " 0x%02x", j);
3089	fprintf (stderr, "\n");
3090	#endif
3091
3092	if (matched)
3093	{
3094	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3095	label.w[k] &= matches.w[k];
3096	append (pos, &group);
3097	}
3098	else
3099	{
3100	for (int k = 0; k < CHARCLASS_WORDS; ++k)
3101	label.w[k] &= ~matches.w[k];
3102	}
3103	}
3104
3105	alloc_position_set (&tmp, d->nleaves);
3106
3107	if (group.nelem > 0)
3108	{
3109	/* If we are building a searching matcher, throw in the positions
3110	of state 0 as well, if possible. */
3111	if (d->searchflag)
3112	{
3113	/* If a token in follows.elems is not 1st byte of a multibyte
3114	character, or the states of follows must accept the bytes
3115	which are not 1st byte of the multibyte character.
3116	Then, if a state of follows encounters a byte, it must not be
3117	a 1st byte of a multibyte character nor a single byte character.
3118	In this case, do not add state[0].follows to next state, because
3119	state[0] must accept 1st-byte.
3120
3121	For example, suppose <sb a> is a certain single byte character,
3122	<mb A> is a certain multibyte character, and the codepoint of
3123	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
3124	state[0] accepts <sb a>, state[i] transits to state[i+1] by
3125	accepting the 1st byte of <mb A>, and state[i+1] accepts the
3126	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3127	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3128	not add state[0]. */
3129
3130	bool mergeit = !d->localeinfo.multibyte;
3131	if (!mergeit)
3132	{
3133	mergeit = true;
3134	for (idx_t j = 0; mergeit && j < group.nelem; j++)
3135	mergeit &= d->multibyte_prop[group.elems[j].index];
3136	}
3137	if (mergeit)
3138	merge2 (&group, &d->states[0].elems, &tmp);
3139	}
3140
3141	/* Find out if the new state will want any context information,
3142	by calculating possible contexts that the group can match,
3143	and separate contexts that the new state wants to know. */
3144	int possible_contexts = charclass_context (d, &label);
3145	int separate_contexts = state_separate_contexts (d, &group);
3146
3147	/* Find the state(s) corresponding to the union of the follows. */
3148	if (possible_contexts & ~separate_contexts)
3149	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3150	else
3151	state = -1;
3152	if (separate_contexts & possible_contexts & CTX_NEWLINE)
3153	state_newline = state_index (d, &group, CTX_NEWLINE);
3154	else
3155	state_newline = state;
3156	if (separate_contexts & possible_contexts & CTX_LETTER)
3157	state_letter = state_index (d, &group, CTX_LETTER);
3158	else
3159	state_letter = state;
3160
3161	/* Reallocate now, to reallocate any newline transition properly. */
3162	realloc_trans_if_necessary (d);
3163	}
3164
3165	/* If we are a searching matcher, the default transition is to a state
3166	containing the positions of state 0, otherwise the default transition
3167	is to fail miserably. */
3168	else if (d->searchflag)
3169	{
3170	state_newline = 0;
3171	state_letter = d->min_trcount - 1;
3172	state = d->initstate_notbol;
3173	}
3174	else
3175	{
3176	state_newline = -1;
3177	state_letter = -1;
3178	state = -1;
3179	}
3180
3181	/* Set the transitions for each character in the label. */
3182	for (int i = 0; i < NOTCHAR; i++)
3183	if (tstbit (i, &label))
3184	switch (d->syntax.sbit[i])
3185	{
3186	case CTX_NEWLINE:
3187	trans[i] = state_newline;
3188	break;
3189	case CTX_LETTER:
3190	trans[i] = state_letter;
3191	break;
3192	default:
3193	trans[i] = state;
3194	break;
3195	}
3196
3197	#ifdef DEBUG
3198	fprintf (stderr, "trans table %td", s);
3199	for (int i = 0; i < NOTCHAR; ++i)
3200	{
3201	if (!(i & 0xf))
3202	fprintf (stderr, "\n");
3203	fprintf (stderr, " %2td", trans[i]);
3204	}
3205	fprintf (stderr, "\n");
3206	#endif
3207
3208	free (group.elems);
3209	free (follows.elems);
3210	free (tmp.elems);
3211
3212	/* Keep the newline transition in a special place so we can use it as
3213	a sentinel. */
3214	if (tstbit (d->syntax.eolbyte, &label))
3215	{
3216	d->newlines[s] = trans[d->syntax.eolbyte];
3217	trans[d->syntax.eolbyte] = -1;
3218	}
3219
3220	return trans[uc];
3221	}
3222
3223	/* Multibyte character handling sub-routines for dfaexec. */
3224
3225	/* Consume a single byte and transit state from 's' to '*next_state'.
3226	This function is almost same as the state transition routin in dfaexec.
3227	But state transition is done just once, otherwise matching succeed or
3228	reach the end of the buffer. */
3229	static state_num
3230	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
3231	{
3232	state_num *t;
3233
3234	if (d->trans[s])
3235	t = d->trans[s];
3236	else if (d->fails[s])
3237	t = d->fails[s];
3238	else
3239	{
3240	build_state (s, d, **pp);
3241	if (d->trans[s])
3242	t = d->trans[s];
3243	else
3244	{
3245	t = d->fails[s];
3246	assert (t);
3247	}
3248	}
3249
3250	if (t[**pp] == -2)
3251	build_state (s, d, **pp);
3252
3253	return t[(pp)++];
3254	}
3255
3256	/* Transit state from s, then return new state and update the pointer of
3257	the buffer. This function is for a period operator which can match a
3258	multi-byte character. */
3259	static state_num
3260	transit_state (struct dfa d, state_num s, unsigned char const *pp,
3261	unsigned char const *end)
3262	{
3263	wint_t wc;
3264
3265	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
3266
3267	/* This state has some operators which can match a multibyte character. */
3268	d->mb_follows.nelem = 0;
3269
3270	/* Calculate the state which can be reached from the state 's' by
3271	consuming 'mbclen' single bytes from the buffer. */
3272	state_num s1 = s;
3273	int mbci;
3274	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
3275	s = transit_state_singlebyte (d, s, pp);
3276	*pp += mbclen - mbci;
3277
3278	if (wc == WEOF)
3279	{
3280	/* It is an invalid character, so ANYCHAR is not accepted. */
3281	return s;
3282	}
3283
3284	/* If all positions which have ANYCHAR do not depend on the context
3285	of the next character, calculate the next state with
3286	pre-calculated follows and cache the result. */
3287	if (d->states[s1].mb_trindex < 0)
3288	{
3289	if (MAX_TRCOUNT <= d->mb_trcount)
3290	{
3291	state_num s3;
3292	for (s3 = -1; s3 < d->tralloc; s3++)
3293	{
3294	free (d->mb_trans[s3]);
3295	d->mb_trans[s3] = NULL;
3296	}
3297
3298	for (state_num i = 0; i < d->sindex; i++)
3299	d->states[i].mb_trindex = -1;
3300	d->mb_trcount = 0;
3301	}
3302	d->states[s1].mb_trindex = d->mb_trcount++;
3303	}
3304
3305	if (! d->mb_trans[s])
3306	{
3307	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3308	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3309	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3310	for (int i = 0; i < MAX_TRCOUNT; i++)
3311	d->mb_trans[s][i] = -1;
3312	}
3313	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3314	return d->mb_trans[s][d->states[s1].mb_trindex];
3315
3316	if (s == -1)
3317	copy (&d->states[s1].mbps, &d->mb_follows);
3318	else
3319	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3320
3321	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3322	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3323	realloc_trans_if_necessary (d);
3324
3325	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3326
3327	return s2;
3328	}
3329
3330	/* The initial state may encounter a byte which is not a single byte character
3331	nor the first byte of a multibyte character. But it is incorrect for the
3332	initial state to accept such a byte. For example, in Shift JIS the regular
3333	expression "\\" accepts the codepoint 0x5c, but should not accept the second
3334	byte of the codepoint 0x815c. Then the initial state must skip the bytes
3335	that are not a single byte character nor the first byte of a multibyte
3336	character.
3337
3338	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3339	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3340	the result is greater than P, set *WCP to the final wide character
3341	processed, or to WEOF if no wide character is processed. Otherwise,
3342	if WCP is non-NULL, *WCP may or may not be updated.
3343
3344	Both P and MBP must be no larger than END. */
3345	static unsigned char const *
3346	skip_remains_mb (struct dfa d, unsigned char const p,
3347	unsigned char const mbp, char const end)
3348	{
3349	if (d->syntax.never_trail[*p])
3350	return p;
3351	while (mbp < p)
3352	{
3353	wint_t wc;
3354	mbp += mbs_to_wchar (&wc, (char const *) mbp,
3355	end - (char const *) mbp, d);
3356	}
3357	return mbp;
3358	}
3359
3360	/* Search through a buffer looking for a match to the struct dfa *D.
3361	Find the first occurrence of a string matching the regexp in the
3362	buffer, and the shortest possible version thereof. Return a pointer to
3363	the first character after the match, or NULL if none is found. BEGIN
3364	points to the beginning of the buffer, and END points to the first byte
3365	after its end. Note however that we store a sentinel byte (usually
3366	newline) in *END, so the actual buffer must be one byte longer.
3367	When ALLOW_NL, newlines may appear in the matching string.
3368	If COUNT is non-NULL, increment *COUNT once for each newline processed.
3369	If MULTIBYTE, the input consists of multibyte characters and/or
3370	encoding-error bytes. Otherwise, it consists of single-byte characters.
3371	Here is the list of features that make this DFA matcher punt:
3372	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3373	- [^...] in non-simple locale
3374	- [[=foo=]] or [[.foo.]]
3375	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3376	- back-reference: (.)\1
3377	- word-delimiter in multibyte locale: \<, \>, \b, \B
3378	See struct localeinfo.simple for the definition of "simple locale". */
3379
3380	static inline char *
3381	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
3382	ptrdiff_t *count, bool multibyte)
3383	{
3384	if (MAX_TRCOUNT <= d->sindex)
3385	{
3386	for (state_num s = d->min_trcount; s < d->sindex; s++)
3387	{
3388	free (d->states[s].elems.elems);
3389	free (d->states[s].mbps.elems);
3390	}
3391	d->sindex = d->min_trcount;
3392
3393	if (d->trans)
3394	{
3395	for (state_num s = 0; s < d->tralloc; s++)
3396	{
3397	free (d->trans[s]);
3398	free (d->fails[s]);
3399	d->trans[s] = d->fails[s] = NULL;
3400	}
3401	d->trcount = 0;
3402	}
3403
3404	if (d->localeinfo.multibyte && d->mb_trans)
3405	{
3406	for (state_num s = -1; s < d->tralloc; s++)
3407	{
3408	free (d->mb_trans[s]);
3409	d->mb_trans[s] = NULL;
3410	}
3411	for (state_num s = 0; s < d->min_trcount; s++)
3412	d->states[s].mb_trindex = -1;
3413	d->mb_trcount = 0;
3414	}
3415	}
3416
3417	if (!d->tralloc)
3418	realloc_trans_if_necessary (d);
3419
3420	/* Current state. */
3421	state_num s = 0, s1 = 0;
3422
3423	/* Current input character. */
3424	unsigned char const p = (unsigned char const ) begin;
3425	unsigned char const *mbp = p;
3426
3427	/* Copy of d->trans so it can be optimized into a register. */
3428	state_num **trans = d->trans;
3429	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3430	unsigned char saved_end = (unsigned char ) end;
3431	*end = eol;
3432
3433	if (multibyte)
3434	{
3435	memset (&d->mbs, 0, sizeof d->mbs);
3436	if (d->mb_follows.alloc == 0)
3437	alloc_position_set (&d->mb_follows, d->nleaves);
3438	}
3439
3440	idx_t nlcount = 0;
3441	for (;;)
3442	{
3443	state_num *t;
3444	while ((t = trans[s]) != NULL)
3445	{
3446	if (s < d->min_trcount)
3447	{
3448	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
3449	{
3450	while (t[*p] == s)
3451	p++;
3452	}
3453	if (multibyte)
3454	p = mbp = skip_remains_mb (d, p, mbp, end);
3455	}
3456
3457	if (multibyte)
3458	{
3459	s1 = s;
3460
3461	if (d->states[s].mbps.nelem == 0
3462	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3463	{
3464	/* If an input character does not match ANYCHAR, do it
3465	like a single-byte character. */
3466	s = t[*p++];
3467	}
3468	else
3469	{
3470	s = transit_state (d, s, &p, (unsigned char *) end);
3471	mbp = p;
3472	trans = d->trans;
3473	}
3474	}
3475	else
3476	{
3477	s1 = t[*p++];
3478	t = trans[s1];
3479	if (! t)
3480	{
3481	state_num tmp = s;
3482	s = s1;
3483	s1 = tmp; /* swap */
3484	break;
3485	}
3486	if (s < d->min_trcount)
3487	{
3488	while (t[*p] == s1)
3489	p++;
3490	}
3491	s = t[*p++];
3492	}
3493	}
3494
3495	if (s < 0)
3496	{
3497	if (s == -2)
3498	{
3499	s = build_state (s1, d, p[-1]);
3500	trans = d->trans;
3501	}
3502	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3503	{
3504	/* The previous character was a newline. Count it, and skip
3505	checking of multibyte character boundary until here. */
3506	nlcount++;
3507	mbp = p;
3508
3509	s = (allow_nl ? d->newlines[s1]
3510	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3511	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3512	: d->initstate_notbol);
3513	}
3514	else
3515	{
3516	p = NULL;
3517	goto done;
3518	}
3519	}
3520	else if (d->fails[s])
3521	{
3522	if ((d->success[s] & d->syntax.sbit[*p])
3523	\|\| ((char *) p == end
3524	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3525	d)))
3526	goto done;
3527
3528	if (multibyte && s < d->min_trcount)
3529	p = mbp = skip_remains_mb (d, p, mbp, end);
3530
3531	s1 = s;
3532	if (!multibyte \|\| d->states[s].mbps.nelem == 0
3533	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
3534	{
3535	/* If a input character does not match ANYCHAR, do it
3536	like a single-byte character. */
3537	s = d->fails[s][*p++];
3538	}
3539	else
3540	{
3541	s = transit_state (d, s, &p, (unsigned char *) end);
3542	mbp = p;
3543	trans = d->trans;
3544	}
3545	}
3546	else
3547	{
3548	build_state (s, d, p[0]);
3549	trans = d->trans;
3550	}
3551	}
3552
3553	done:
3554	if (count)
3555	*count += nlcount;
3556	*end = saved_end;
3557	return (char *) p;
3558	}
3559
3560	/* Specialized versions of dfaexec for multibyte and single-byte cases.
3561	This is for performance, as dfaexec_main is an inline function. */
3562
3563	static char *
3564	dfaexec_mb (struct dfa d, char const begin, char *end,
3565	bool allow_nl, ptrdiff_t count, bool backref)
3566	{
3567	return dfaexec_main (d, begin, end, allow_nl, count, true);
3568	}
3569
3570	static char *
3571	dfaexec_sb (struct dfa d, char const begin, char *end,
3572	bool allow_nl, ptrdiff_t count, bool backref)
3573	{
3574	return dfaexec_main (d, begin, end, allow_nl, count, false);
3575	}
3576
3577	/* Always set *BACKREF and return BEGIN. Use this wrapper for
3578	any regexp that uses a construct not supported by this code. */
3579	static char *
3580	dfaexec_noop (struct dfa d, char const begin, char *end,
3581	bool allow_nl, ptrdiff_t count, bool backref)
3582	{
3583	*backref = true;
3584	return (char *) begin;
3585	}
3586
3587	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3588	but faster and set *BACKREF if the DFA code does not support this
3589	regexp usage. */
3590
3591	char *
3592	dfaexec (struct dfa d, char const begin, char *end,
3593	bool allow_nl, ptrdiff_t count, bool backref)
3594	{
3595	return d->dfaexec (d, begin, end, allow_nl, count, backref);
3596	}
3597
3598	struct dfa *
3599	dfasuperset (struct dfa const *d)
3600	{
3601	return d->superset;
3602	}
3603
3604	bool
3605	dfaisfast (struct dfa const *d)
3606	{
3607	return d->fast;
3608	}
3609
3610	static void
3611	free_mbdata (struct dfa *d)
3612	{
3613	free (d->multibyte_prop);
3614	free (d->lex.brack.chars);
3615	free (d->mb_follows.elems);
3616
3617	if (d->mb_trans)
3618	{
3619	state_num s;
3620	for (s = -1; s < d->tralloc; s++)
3621	free (d->mb_trans[s]);
3622	free (d->mb_trans - 2);
3623	}
3624	}
3625
3626	/* Return true if every construct in D is supported by this DFA matcher. */
3627	bool
3628	dfasupported (struct dfa const *d)
3629	{
3630	for (idx_t i = 0; i < d->tindex; i++)
3631	{
3632	switch (d->tokens[i])
3633	{
3634	case BEGWORD:
3635	case ENDWORD:
3636	case LIMWORD:
3637	case NOTLIMWORD:
3638	if (!d->localeinfo.multibyte)
3639	continue;
3640	FALLTHROUGH;
3641	case BACKREF:
3642	case MBCSET:
3643	return false;
3644	}
3645	}
3646	return true;
3647	}
3648
3649	/* Disable use of the superset DFA if it is not likely to help
3650	performance. */
3651	static void
3652	maybe_disable_superset_dfa (struct dfa *d)
3653	{
3654	if (!d->localeinfo.using_utf8)
3655	return;
3656
3657	bool have_backref = false;
3658	for (idx_t i = 0; i < d->tindex; i++)
3659	{
3660	switch (d->tokens[i])
3661	{
3662	case ANYCHAR:
3663	/* Lowered. */
3664	abort ();
3665	case BACKREF:
3666	have_backref = true;
3667	break;
3668	case MBCSET:
3669	/* Requires multi-byte algorithm. */
3670	return;
3671	default:
3672	break;
3673	}
3674	}
3675
3676	if (!have_backref && d->superset)
3677	{
3678	/* The superset DFA is not likely to be much faster, so remove it. */
3679	dfafree (d->superset);
3680	free (d->superset);
3681	d->superset = NULL;
3682	}
3683
3684	free_mbdata (d);
3685	d->localeinfo.multibyte = false;
3686	d->dfaexec = dfaexec_sb;
3687	d->fast = true;
3688	}
3689
3690	static void
3691	dfassbuild (struct dfa *d)
3692	{
3693	struct dfa *sup = dfaalloc ();
3694
3695	sup = d;
3696	sup->localeinfo.multibyte = false;
3697	sup->dfaexec = dfaexec_sb;
3698	sup->multibyte_prop = NULL;
3699	sup->superset = NULL;
3700	sup->states = NULL;
3701	sup->sindex = 0;
3702	sup->constraints = NULL;
3703	sup->separates = NULL;
3704	sup->follows = NULL;
3705	sup->tralloc = 0;
3706	sup->trans = NULL;
3707	sup->fails = NULL;
3708	sup->success = NULL;
3709	sup->newlines = NULL;
3710
3711	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3712	if (d->cindex)
3713	{
3714	memcpy (sup->charclasses, d->charclasses,
3715	d->cindex * sizeof *sup->charclasses);
3716	}
3717
3718	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3719	sup->talloc = d->tindex * 2;
3720
3721	bool have_achar = false;
3722	bool have_nchar = false;
3723	idx_t j;
3724	for (idx_t i = j = 0; i < d->tindex; i++)
3725	{
3726	switch (d->tokens[i])
3727	{
3728	case ANYCHAR:
3729	case MBCSET:
3730	case BACKREF:
3731	{
3732	charclass ccl;
3733	fillset (&ccl);
3734	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3735	sup->tokens[j++] = STAR;
3736	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
3737	\|\| d->tokens[i + 1] == PLUS)
3738	i++;
3739	have_achar = true;
3740	}
3741	break;
3742	case BEGWORD:
3743	case ENDWORD:
3744	case LIMWORD:
3745	case NOTLIMWORD:
3746	if (d->localeinfo.multibyte)
3747	{
3748	/* These constraints aren't supported in a multibyte locale.
3749	Ignore them in the superset DFA. */
3750	sup->tokens[j++] = EMPTY;
3751	break;
3752	}
3753	FALLTHROUGH;
3754	default:
3755	sup->tokens[j++] = d->tokens[i];
3756	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3757	\|\| d->tokens[i] >= CSET)
3758	have_nchar = true;
3759	break;
3760	}
3761	}
3762	sup->tindex = j;
3763
3764	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
3765	d->superset = sup;
3766	else
3767	{
3768	dfafree (sup);
3769	free (sup);
3770	}
3771	}
3772
3773	/* Parse a string S of length LEN into D (but skip this step if S is null).
3774	Then analyze D and build a matcher for it.
3775	SEARCHFLAG says whether to build a searching or an exact matcher. */
3776	void
3777	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
3778	{
3779	if (s != NULL)
3780	dfaparse (s, len, d);
3781
3782	dfassbuild (d);
3783
3784	if (dfasupported (d))
3785	{
3786	maybe_disable_superset_dfa (d);
3787	dfaanalyze (d, searchflag);
3788	}
3789	else
3790	{
3791	d->dfaexec = dfaexec_noop;
3792	}
3793
3794	if (d->superset)
3795	{
3796	d->fast = true;
3797	dfaanalyze (d->superset, searchflag);
3798	}
3799	}
3800
3801	/* Free the storage held by the components of a dfa. */
3802	void
3803	dfafree (struct dfa *d)
3804	{
3805	free (d->charclasses);
3806	free (d->tokens);
3807
3808	if (d->localeinfo.multibyte)
3809	free_mbdata (d);
3810
3811	free (d->constraints);
3812	free (d->separates);
3813
3814	for (idx_t i = 0; i < d->sindex; i++)
3815	{
3816	free (d->states[i].elems.elems);
3817	free (d->states[i].mbps.elems);
3818	}
3819	free (d->states);
3820
3821	if (d->follows)
3822	{
3823	for (idx_t i = 0; i < d->tindex; i++)
3824	free (d->follows[i].elems);
3825	free (d->follows);
3826	}
3827
3828	if (d->trans)
3829	{
3830	for (idx_t i = 0; i < d->tralloc; i++)
3831	{
3832	free (d->trans[i]);
3833	free (d->fails[i]);
3834	}
3835
3836	free (d->trans - 2);
3837	free (d->fails);
3838	free (d->newlines);
3839	free (d->success);
3840	}
3841
3842	if (d->superset)
3843	{
3844	dfafree (d->superset);
3845	free (d->superset);
3846	}
3847	}
3848
3849	/* Having found the postfix representation of the regular expression,
3850	try to find a long sequence of characters that must appear in any line
3851	containing the r.e.
3852	Finding a "longest" sequence is beyond the scope here;
3853	we take an easy way out and hope for the best.
3854	(Take "(ab\|a)b"--please.)
3855
3856	We do a bottom-up calculation of sequences of characters that must appear
3857	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3858	representation:
3859	sequences that must appear at the left of the match ("left")
3860	sequences that must appear at the right of the match ("right")
3861	lists of sequences that must appear somewhere in the match ("in")
3862	sequences that must constitute the match ("is")
3863
3864	When we get to the root of the tree, we use one of the longest of its
3865	calculated "in" sequences as our answer.
3866
3867	The sequences calculated for the various types of node (in pseudo ANSI c)
3868	are shown below. "p" is the operand of unary operators (and the left-hand
3869	operand of binary operators); "q" is the right-hand operand of binary
3870	operators.
3871
3872	"ZERO" means "a zero-length sequence" below.
3873
3874	Type left right is in
3875	---- ---- ----- -- --
3876	char c # c # c # c # c
3877
3878	ANYCHAR ZERO ZERO ZERO ZERO
3879
3880	MBCSET ZERO ZERO ZERO ZERO
3881
3882	CSET ZERO ZERO ZERO ZERO
3883
3884	STAR ZERO ZERO ZERO ZERO
3885
3886	QMARK ZERO ZERO ZERO ZERO
3887
3888	PLUS p->left p->right ZERO p->in
3889
3890	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3891	p->left : q->right : q->is!=ZERO) ? q->in plus
3892	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3893	ZERO
3894
3895	OR longest common longest common (do p->is and substrings common
3896	leading trailing to q->is have same p->in and
3897	(sub)sequence (sub)sequence q->in length and content) ?
3898	of p->left of p->right
3899	and q->left and q->right p->is : NULL
3900
3901	If there's anything else we recognize in the tree, all four sequences get set
3902	to zero-length sequences. If there's something we don't recognize in the
3903	tree, we just return a zero-length sequence.
3904
3905	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3906	'aaa')?
3907
3908	And ... is it here or someplace that we might ponder "optimizations" such as
3909	egrep 'psi\|epsilon' -> egrep 'psi'
3910	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3911	(Yes, we now find "epsi" as a "string
3912	that must occur", but we might also
3913	simplify the entire r.e. being sought)
3914	grep '[c]' -> grep 'c'
3915	grep '(ab\|a)b' -> grep 'ab'
3916	grep 'ab*' -> grep 'a'
3917	grep 'a*b' -> grep 'b'
3918
3919	There are several issues:
3920
3921	Is optimization easy (enough)?
3922
3923	Does optimization actually accomplish anything,
3924	or is the automaton you get from "psi\|epsilon" (for example)
3925	the same as the one you get from "psi" (for example)?
3926
3927	Are optimizable r.e.'s likely to be used in real-life situations
3928	(something like 'ab*' is probably unlikely; something like is
3929	'psi\|epsilon' is likelier)? */
3930
3931	static char *
3932	icatalloc (char old, char const new)
3933	{
3934	idx_t newsize = strlen (new);
3935	if (newsize == 0)
3936	return old;
3937	idx_t oldsize = strlen (old);
3938	char *result = xirealloc (old, oldsize + newsize + 1);
3939	memcpy (result + oldsize, new, newsize + 1);
3940	return result;
3941	}
3942
3943	static void
3944	freelist (char **cpp)
3945	{
3946	while (*cpp)
3947	free (*cpp++);
3948	}
3949
3950	static char **
3951	enlistnew (char *cpp, char new)
3952	{
3953	/* Is there already something in the list that's new (or longer)? */
3954	idx_t i;
3955	for (i = 0; cpp[i] != NULL; i++)
3956	if (strstr (cpp[i], new) != NULL)
3957	{
3958	free (new);
3959	return cpp;
3960	}
3961	/* Eliminate any obsoleted strings. */
3962	for (idx_t j = 0; cpp[j] != NULL; )
3963	if (strstr (new, cpp[j]) == NULL)
3964	++j;
3965	else
3966	{
3967	free (cpp[j]);
3968	if (--i == j)
3969	break;
3970	cpp[j] = cpp[i];
3971	cpp[i] = NULL;
3972	}
3973	/* Add the new string. */
3974	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3975	cpp[i] = new;
3976	cpp[i + 1] = NULL;
3977	return cpp;
3978	}
3979
3980	static char **
3981	enlist (char *cpp, char const str, idx_t len)
3982	{
3983	return enlistnew (cpp, ximemdup0 (str, len));
3984	}
3985
3986	/* Given pointers to two strings, return a pointer to an allocated
3987	list of their distinct common substrings. */
3988	static char **
3989	comsubs (char left, char const right)
3990	{
3991	char *cpp = xzalloc (sizeof cpp);
3992
3993	for (char lcp = left; lcp != '\0'; lcp++)
3994	{
3995	idx_t len = 0;
3996	char rcp = strchr (right, lcp);
3997	while (rcp != NULL)
3998	{
3999	idx_t i;
4000	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4001	continue;
4002	if (i > len)
4003	len = i;
4004	rcp = strchr (rcp + 1, *lcp);
4005	}
4006	if (len != 0)
4007	cpp = enlist (cpp, lcp, len);
4008	}
4009	return cpp;
4010	}
4011
4012	static char **
4013	addlists (char old, char new)
4014	{
4015	for (; *new; new++)
4016	old = enlistnew (old, xstrdup (*new));
4017	return old;
4018	}
4019
4020	/* Given two lists of substrings, return a new list giving substrings
4021	common to both. */
4022	static char **
4023	inboth (char left, char right)
4024	{
4025	char *both = xzalloc (sizeof both);
4026
4027	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4028	{
4029	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4030	{
4031	char **temp = comsubs (left[lnum], right[rnum]);
4032	both = addlists (both, temp);
4033	freelist (temp);
4034	free (temp);
4035	}
4036	}
4037	return both;
4038	}
4039
4040	typedef struct must must;
4041
4042	struct must
4043	{
4044	char **in;
4045	char *left;
4046	char *right;
4047	char *is;
4048	bool begline;
4049	bool endline;
4050	must *prev;
4051	};
4052
4053	static must *
4054	allocmust (must *mp, idx_t size)
4055	{
4056	must new_mp = xmalloc (sizeof new_mp);
4057	new_mp->in = xzalloc (sizeof *new_mp->in);
4058	new_mp->left = xizalloc (size);
4059	new_mp->right = xizalloc (size);
4060	new_mp->is = xizalloc (size);
4061	new_mp->begline = false;
4062	new_mp->endline = false;
4063	new_mp->prev = mp;
4064	return new_mp;
4065	}
4066
4067	static void
4068	resetmust (must *mp)
4069	{
4070	freelist (mp->in);
4071	mp->in[0] = NULL;
4072	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4073	mp->begline = false;
4074	mp->endline = false;
4075	}
4076
4077	static void
4078	freemust (must *mp)
4079	{
4080	freelist (mp->in);
4081	free (mp->in);
4082	free (mp->left);
4083	free (mp->right);
4084	free (mp->is);
4085	free (mp);
4086	}
4087
4088	struct dfamust *
4089	dfamust (struct dfa const *d)
4090	{
4091	must *mp = NULL;
4092	char const *result = "";
4093	bool exact = false;
4094	bool begline = false;
4095	bool endline = false;
4096	bool need_begline = false;
4097	bool need_endline = false;
4098	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4099
4100	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4101	{
4102	token t = d->tokens[ri];
4103	switch (t)
4104	{
4105	case BEGLINE:
4106	mp = allocmust (mp, 2);
4107	mp->begline = true;
4108	need_begline = true;
4109	break;
4110	case ENDLINE:
4111	mp = allocmust (mp, 2);
4112	mp->endline = true;
4113	need_endline = true;
4114	break;
4115	case LPAREN:
4116	case RPAREN:
4117	assert (!"neither LPAREN nor RPAREN may appear here");
4118
4119	case EMPTY:
4120	case BEGWORD:
4121	case ENDWORD:
4122	case LIMWORD:
4123	case NOTLIMWORD:
4124	case BACKREF:
4125	case ANYCHAR:
4126	case MBCSET:
4127	mp = allocmust (mp, 2);
4128	break;
4129
4130	case STAR:
4131	case QMARK:
4132	assume_nonnull (mp);
4133	resetmust (mp);
4134	break;
4135
4136	case OR:
4137	{
4138	char **new;
4139	must *rmp = mp;
4140	assume_nonnull (rmp);
4141	must *lmp = mp = mp->prev;
4142	assume_nonnull (lmp);
4143	idx_t j, ln, rn, n;
4144
4145	/* Guaranteed to be. Unlikely, but ... */
4146	if (streq (lmp->is, rmp->is))
4147	{
4148	lmp->begline &= rmp->begline;
4149	lmp->endline &= rmp->endline;
4150	}
4151	else
4152	{
4153	lmp->is[0] = '\0';
4154	lmp->begline = false;
4155	lmp->endline = false;
4156	}
4157	/* Left side--easy */
4158	idx_t i = 0;
4159	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4160	++i;
4161	lmp->left[i] = '\0';
4162	/* Right side */
4163	ln = strlen (lmp->right);
4164	rn = strlen (rmp->right);
4165	n = ln;
4166	if (n > rn)
4167	n = rn;
4168	for (i = 0; i < n; ++i)
4169	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4170	break;
4171	for (j = 0; j < i; ++j)
4172	lmp->right[j] = lmp->right[(ln - i) + j];
4173	lmp->right[j] = '\0';
4174	new = inboth (lmp->in, rmp->in);
4175	freelist (lmp->in);
4176	free (lmp->in);
4177	lmp->in = new;
4178	freemust (rmp);
4179	}
4180	break;
4181
4182	case PLUS:
4183	assume_nonnull (mp);
4184	mp->is[0] = '\0';
4185	break;
4186
4187	case END:
4188	assume_nonnull (mp);
4189	assert (!mp->prev);
4190	for (idx_t i = 0; mp->in[i] != NULL; i++)
4191	if (strlen (mp->in[i]) > strlen (result))
4192	result = mp->in[i];
4193	if (streq (result, mp->is))
4194	{
4195	if ((!need_begline \|\| mp->begline) && (!need_endline
4196	\|\| mp->endline))
4197	exact = true;
4198	begline = mp->begline;
4199	endline = mp->endline;
4200	}
4201	goto done;
4202
4203	case CAT:
4204	{
4205	must *rmp = mp;
4206	assume_nonnull (rmp);
4207	must *lmp = mp = mp->prev;
4208	assume_nonnull (lmp);
4209
4210	/* In. Everything in left, plus everything in
4211	right, plus concatenation of
4212	left's right and right's left. */
4213	lmp->in = addlists (lmp->in, rmp->in);
4214	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4215	{
4216	idx_t lrlen = strlen (lmp->right);
4217	idx_t rllen = strlen (rmp->left);
4218	char *tp = ximalloc (lrlen + rllen + 1);
4219	memcpy (tp + lrlen, rmp->left, rllen + 1);
4220	memcpy (tp, lmp->right, lrlen);
4221	lmp->in = enlistnew (lmp->in, tp);
4222	}
4223	/* Left-hand */
4224	if (lmp->is[0] != '\0')
4225	lmp->left = icatalloc (lmp->left, rmp->left);
4226	/* Right-hand */
4227	if (rmp->is[0] == '\0')
4228	lmp->right[0] = '\0';
4229	lmp->right = icatalloc (lmp->right, rmp->right);
4230	/* Guaranteed to be */
4231	if ((lmp->is[0] != '\0' \|\| lmp->begline)
4232	&& (rmp->is[0] != '\0' \|\| rmp->endline))
4233	{
4234	lmp->is = icatalloc (lmp->is, rmp->is);
4235	lmp->endline = rmp->endline;
4236	}
4237	else
4238	{
4239	lmp->is[0] = '\0';
4240	lmp->begline = false;
4241	lmp->endline = false;
4242	}
4243	freemust (rmp);
4244	}
4245	break;
4246
4247	case '\0':
4248	/* Not on my shift. */
4249	goto done;
4250
4251	default:
4252	if (CSET <= t)
4253	{
4254	/* If T is a singleton, or if case-folding in a unibyte
4255	locale and T's members all case-fold to the same char,
4256	convert T to one of its members. Otherwise, do
4257	nothing further with T. */
4258	charclass *ccl = &d->charclasses[t - CSET];
4259	int j;
4260	for (j = 0; j < NOTCHAR; j++)
4261	if (tstbit (j, ccl))
4262	break;
4263	if (! (j < NOTCHAR))
4264	{
4265	mp = allocmust (mp, 2);
4266	break;
4267	}
4268	t = j;
4269	while (++j < NOTCHAR)
4270	if (tstbit (j, ccl)
4271	&& ! (case_fold_unibyte
4272	&& toupper (j) == toupper (t)))
4273	break;
4274	if (j < NOTCHAR)
4275	{
4276	mp = allocmust (mp, 2);
4277	break;
4278	}
4279	}
4280
4281	idx_t rj = ri + 2;
4282	if (d->tokens[ri + 1] == CAT)
4283	{
4284	for (; rj < d->tindex - 1; rj += 2)
4285	{
4286	if ((rj != ri && (d->tokens[rj] <= 0
4287	\|\| NOTCHAR <= d->tokens[rj]))
4288	\|\| d->tokens[rj + 1] != CAT)
4289	break;
4290	}
4291	}
4292	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4293	mp->is[0] = mp->left[0] = mp->right[0]
4294	= case_fold_unibyte ? toupper (t) : t;
4295
4296	idx_t i;
4297	for (i = 1; ri + 2 < rj; i++)
4298	{
4299	ri += 2;
4300	t = d->tokens[ri];
4301	mp->is[i] = mp->left[i] = mp->right[i]
4302	= case_fold_unibyte ? toupper (t) : t;
4303	}
4304	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4305	mp->in = enlist (mp->in, mp->is, i);
4306	break;
4307	}
4308	}
4309	done:;
4310
4311	struct dfamust *dm = NULL;
4312	if (*result)
4313	{
4314	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4315	dm->exact = exact;
4316	dm->begline = begline;
4317	dm->endline = endline;
4318	strcpy (dm->must, result);
4319	}
4320
4321	while (mp)
4322	{
4323	must *prev = mp->prev;
4324	freemust (mp);
4325	mp = prev;
4326	}
4327
4328	return dm;
4329	}
4330
4331	void
4332	dfamustfree (struct dfamust *dm)
4333	{
4334	free (dm);
4335	}
4336
4337	struct dfa *
4338	dfaalloc (void)
4339	{
4340	return xmalloc (sizeof (struct dfa));
4341	}
4342
4343	/* Initialize DFA. */
4344	void
4345	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
4346	reg_syntax_t bits, int dfaopts)
4347	{
4348	memset (dfa, 0, offsetof (struct dfa, dfaexec));
4349	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4350	dfa->localeinfo = *linfo;
4351
4352	dfa->fast = !dfa->localeinfo.multibyte;
4353
4354	dfa->canychar = -1;
4355	dfa->syntax.syntax_bits_set = true;
4356	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4357	dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4358	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4359	dfa->syntax.syntax_bits = bits;
4360
4361	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4362	{
4363	unsigned char uc = i;
4364
4365	dfa->syntax.sbit[uc] = char_context (dfa, uc);
4366	switch (dfa->syntax.sbit[uc])
4367	{
4368	case CTX_LETTER:
4369	setbit (uc, &dfa->syntax.letters);
4370	break;
4371	case CTX_NEWLINE:
4372	setbit (uc, &dfa->syntax.newline);
4373	break;
4374	}
4375
4376	/* POSIX requires that the five bytes in "\n\r./" (including the
4377	terminating NUL) cannot occur inside a multibyte character. */
4378	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4379	? (uc & 0xc0) != 0x80
4380	: strchr ("\n\r./", uc) != NULL);
4381	}
4382	}
4383
4384	/* Initialize TO by copying FROM's syntax settings. */
4385	void
4386	dfacopysyntax (struct dfa to, struct dfa const from)
4387	{
4388	memset (to, 0, offsetof (struct dfa, syntax));
4389	to->canychar = -1;
4390	to->fast = from->fast;
4391	to->syntax = from->syntax;
4392	to->dfaexec = from->dfaexec;
4393	to->localeinfo = from->localeinfo;
4394	}
4395
4396	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/grep/lib/dfa.c@ 3658

Download in other formats: