Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

dfa.c

Last change on this file was 3657, checked in by bird, 9 months ago
sed/dfa.c: Workaround for Visual C++ 2022 (amd64) optimizer bug.
File size: 136.3 KB

Rev	Line
[3611]	1	/* dfa.c - deterministic extended regexp routines for GNU
	2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2022 Free Software
	3	Foundation, Inc.
	4
	5	This program is free software; you can redistribute it and/or modify
	6	it under the terms of the GNU General Public License as published by
	7	the Free Software Foundation, either version 3, or (at your option)
	8	any later version.
	9
	10	This program is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	GNU General Public License for more details.
	14
	15	You should have received a copy of the GNU General Public License
	16	along with this program; if not, write to the Free Software
	17	Foundation, Inc.,
	18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
	19
	20	/* Written June, 1988 by Mike Haertel
	21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
	22
	23	#include <config.h>
	24
	25	#include "dfa.h"
	26
	27	#include "flexmember.h"
	28	#include "idx.h"
	29	#include "verify.h"
	30
	31	#include <assert.h>
	32	#include <ctype.h>
	33	#include <stdint.h>
	34	#include <stdio.h>
	35	#include <stdlib.h>
	36	#include <limits.h>
	37	#include <string.h>
	38
	39	/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
	40	understandably cannot deduce that the input comes from a
	41	well-formed regular expression. There's little point to the
	42	runtime overhead of 'assert' instead of 'assume_nonnull' when the
	43	MMU will check anyway. */
	44	#define assume_nonnull(x) assume ((x) != NULL)
	45
	46	static bool
	47	str_eq (char const a, char const b)
	48	{
	49	return strcmp (a, b) == 0;
	50	}
	51
	52	static bool
	53	c_isdigit (char c)
	54	{
	55	return '0' <= c && c <= '9';
	56	}
	57
	58	#include "gettext.h"
	59	#define _(str) gettext (str)
	60
	61	#include <wchar.h>
	62	#include <wctype.h>
	63
	64	#include "xalloc.h"
	65	#include "localeinfo.h"
	66
	67	#ifndef FALLTHROUGH
	68	# if 201710L < __STDC_VERSION__
	69	# define FALLTHROUGH [[__fallthrough__]]
	70	# elif (__GNUC__ >= 7) \|\| (__clang_major__ >= 10)
	71	# define FALLTHROUGH __attribute__ ((__fallthrough__))
	72	# else
	73	# define FALLTHROUGH ((void) 0)
	74	# endif
	75	#endif
	76
	77	#ifndef MIN
	78	# define MIN(a,b) ((a) < (b) ? (a) : (b))
	79	#endif
	80
	81	/* HPUX defines these as macros in sys/param.h. */
	82	#ifdef setbit
	83	# undef setbit
	84	#endif
	85	#ifdef clrbit
	86	# undef clrbit
	87	#endif
	88
	89	/* For code that does not use Gnulibâs isblank module. */
	90	#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
	91	# define isblank dfa_isblank
	92	static int
	93	isblank (int c)
	94	{
	95	return c == ' ' \|\| c == '\t';
	96	}
	97	#endif
	98
	99	/* First integer value that is greater than any character code. */
	100	enum { NOTCHAR = 1 << CHAR_BIT };
	101
	102	#ifdef UINT_LEAST64_MAX
	103
	104	/* Number of bits used in a charclass word. */
	105	enum { CHARCLASS_WORD_BITS = 64 };
	106
	107	/* This represents part of a character class. It must be unsigned and
	108	at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
	109	typedef uint_least64_t charclass_word;
	110
	111	/* Part of a charclass initializer that represents 64 bits' worth of a
	112	charclass, where LO and HI are the low and high-order 32 bits of
	113	the 64-bit quantity. */
	114	# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
	115
	116	#else
	117	/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
	118	enum { CHARCLASS_WORD_BITS = 32 };
	119	typedef unsigned long charclass_word;
	120	# define CHARCLASS_PAIR(lo, hi) lo, hi
	121	#endif
	122
	123	/* An initializer for a charclass whose 32-bit words are A through H. */
	124	#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
	125	{{ \
	126	CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
	127	CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
	128	}}
	129
	130	/* The maximum useful value of a charclass_word; all used bits are 1. */
	131	static charclass_word const CHARCLASS_WORD_MASK
	132	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
	133
	134	/* Number of words required to hold a bit for every character. */
	135	enum
	136	{
	137	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
	138	};
	139
	140	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
	141	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
	142
	143	/* Convert a possibly-signed character to an unsigned character. This is
	144	a bit safer than casting to unsigned char, since it catches some type
	145	errors that the cast doesn't. */
	146	static unsigned char
	147	to_uchar (char ch)
	148	{
	149	return ch;
	150	}
	151
	152	/* Contexts tell us whether a character is a newline or a word constituent.
	153	Word-constituent characters are those that satisfy iswalnum, plus '_'.
	154	Each character has a single CTX_* value; bitmasks of CTX_* values denote
	155	a particular character class.
	156
	157	A state also stores a context value, which is a bitmask of CTX_* values.
	158	A state's context represents a set of characters that the state's
	159	predecessors must match. For example, a state whose context does not
	160	include CTX_LETTER will never have transitions where the previous
	161	character is a word constituent. A state whose context is CTX_ANY
	162	might have transitions from any character. */
	163
	164	enum
	165	{
	166	CTX_NONE = 1,
	167	CTX_LETTER = 2,
	168	CTX_NEWLINE = 4,
	169	CTX_ANY = 7
	170	};
	171
	172	/* Sometimes characters can only be matched depending on the surrounding
	173	context. Such context decisions depend on what the previous character
	174	was, and the value of the current (lookahead) character. Context
	175	dependent constraints are encoded as 9-bit integers. Each bit that
	176	is set indicates that the constraint succeeds in the corresponding
	177	context.
	178
	179	bit 6-8 - valid contexts when next character is CTX_NEWLINE
	180	bit 3-5 - valid contexts when next character is CTX_LETTER
	181	bit 0-2 - valid contexts when next character is CTX_NONE
	182
	183	succeeds_in_context determines whether a given constraint
	184	succeeds in a particular context. Prev is a bitmask of possible
	185	context values for the previous character, curr is the (single-bit)
	186	context value for the lookahead character. */
	187	static int
	188	newline_constraint (int constraint)
	189	{
	190	return (constraint >> 6) & 7;
	191	}
	192	static int
	193	letter_constraint (int constraint)
	194	{
	195	return (constraint >> 3) & 7;
	196	}
	197	static int
	198	other_constraint (int constraint)
	199	{
	200	return constraint & 7;
	201	}
	202
	203	static bool
	204	succeeds_in_context (int constraint, int prev, int curr)
	205	{
	206	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
	207	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
	208	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
	209	& prev);
	210	}
	211
	212	/* The following describe what a constraint depends on. */
	213	static bool
	214	prev_newline_dependent (int constraint)
	215	{
	216	return ((constraint ^ constraint >> 2) & 0111) != 0;
	217	}
	218	static bool
	219	prev_letter_dependent (int constraint)
	220	{
	221	return ((constraint ^ constraint >> 1) & 0111) != 0;
	222	}
	223
	224	/* Tokens that match the empty string subject to some constraint actually
	225	work by applying that constraint to determine what may follow them,
	226	taking into account what has gone before. The following values are
	227	the constraints corresponding to the special tokens previously defined. */
	228	enum
	229	{
	230	NO_CONSTRAINT = 0777,
	231	BEGLINE_CONSTRAINT = 0444,
	232	ENDLINE_CONSTRAINT = 0700,
	233	BEGWORD_CONSTRAINT = 0050,
	234	ENDWORD_CONSTRAINT = 0202,
	235	LIMWORD_CONSTRAINT = 0252,
	236	NOTLIMWORD_CONSTRAINT = 0525
	237	};
	238
	239	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
	240	are operators and others are terminal symbols. Most (but not all) of these
	241	codes are returned by the lexical analyzer. */
	242
	243	typedef ptrdiff_t token;
	244	static token const TOKEN_MAX = PTRDIFF_MAX;
	245
	246	/* States are indexed by state_num values. These are normally
	247	nonnegative but -1 is used as a special value. */
	248	typedef ptrdiff_t state_num;
	249
	250	/* Predefined token values. */
	251	enum
	252	{
	253	END = -1, /* END is a terminal symbol that matches the
	254	end of input; any value of END or less in
	255	the parse tree is such a symbol. Accepting
	256	states of the DFA are those that would have
	257	a transition on END. This is -1, not some
	258	more-negative value, to tweak the speed of
	259	comparisons to END. */
	260
	261	/* Ordinary character values are terminal symbols that match themselves. */
	262
	263	/* CSET must come last in the following list of special tokens. Otherwise,
	264	the list order matters only for performance. Related special tokens
	265	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
	266	\|\| CSET <= t) can be done with a single machine-level comparison. */
	267
	268	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
	269	the empty string. */
	270
	271	QMARK, /* QMARK is an operator of one argument that
	272	matches zero or one occurrences of its
	273	argument. */
	274
	275	STAR, /* STAR is an operator of one argument that
	276	matches the Kleene closure (zero or more
	277	occurrences) of its argument. */
	278
	279	PLUS, /* PLUS is an operator of one argument that
	280	matches the positive closure (one or more
	281	occurrences) of its argument. */
	282
	283	REPMN, /* REPMN is a lexical token corresponding
	284	to the {m,n} construct. REPMN never
	285	appears in the compiled token vector. */
	286
	287	CAT, /* CAT is an operator of two arguments that
	288	matches the concatenation of its
	289	arguments. CAT is never returned by the
	290	lexical analyzer. */
	291
	292	OR, /* OR is an operator of two arguments that
	293	matches either of its arguments. */
	294
	295	LPAREN, /* LPAREN never appears in the parse tree,
	296	it is only a lexeme. */
	297
	298	RPAREN, /* RPAREN never appears in the parse tree. */
	299
	300	WCHAR, /* Only returned by lex. wctok contains
	301	the wide character representation. */
	302
	303	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
	304	a valid multibyte (or single byte) character.
	305	It is used only if MB_CUR_MAX > 1. */
	306
	307	BEG, /* BEG is an initial symbol that matches the
	308	beginning of input. */
	309
	310	BEGLINE, /* BEGLINE is a terminal symbol that matches
	311	the empty string at the beginning of a
	312	line. */
	313
	314	ENDLINE, /* ENDLINE is a terminal symbol that matches
	315	the empty string at the end of a line. */
	316
	317	BEGWORD, /* BEGWORD is a terminal symbol that matches
	318	the empty string at the beginning of a
	319	word. */
	320
	321	ENDWORD, /* ENDWORD is a terminal symbol that matches
	322	the empty string at the end of a word. */
	323
	324	LIMWORD, /* LIMWORD is a terminal symbol that matches
	325	the empty string at the beginning or the
	326	end of a word. */
	327
	328	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
	329	matches the empty string not at
	330	the beginning or end of a word. */
	331
	332	BACKREF, /* BACKREF is generated by \<digit>
	333	or by any other construct that
	334	is not completely handled. If the scanner
	335	detects a transition on backref, it returns
	336	a kind of "semi-success" indicating that
	337	the match will have to be verified with
	338	a backtracking matcher. */
	339
	340	MBCSET, /* MBCSET is similar to CSET, but for
	341	multibyte characters. */
	342
	343	CSET /* CSET and (and any value greater) is a
	344	terminal symbol that matches any of a
	345	class of characters. */
	346	};
	347
	348
	349	/* States of the recognizer correspond to sets of positions in the parse
	350	tree, together with the constraints under which they may be matched.
	351	So a position is encoded as an index into the parse tree together with
	352	a constraint. */
	353	typedef struct
	354	{
	355	idx_t index; /* Index into the parse array. */
	356	unsigned int constraint; /* Constraint for matching this position. */
	357	} position;
	358
	359	/* Sets of positions are stored as arrays. */
	360	typedef struct
	361	{
	362	position elems; / Elements of this position set. */
	363	idx_t nelem; /* Number of elements in this set. */
	364	idx_t alloc; /* Number of elements allocated in ELEMS. */
	365	} position_set;
	366
	367	/* A state of the dfa consists of a set of positions, some flags,
	368	and the token value of the lowest-numbered position of the state that
	369	contains an END token. */
	370	typedef struct
	371	{
	372	size_t hash; /* Hash of the positions of this state. */
	373	position_set elems; /* Positions this state could match. */
	374	unsigned char context; /* Context from previous state. */
	375	unsigned short constraint; /* Constraint for this state to accept. */
	376	position_set mbps; /* Positions which can match multibyte
	377	characters or the follows, e.g., period.
	378	Used only if MB_CUR_MAX > 1. */
	379	state_num mb_trindex; /* Index of this state in MB_TRANS, or
	380	negative if the state does not have
	381	ANYCHAR. */
	382	} dfa_state;
	383
	384	/* Maximum for any transition table count. This should be at least 3,
	385	for the initial state setup. */
	386	enum { MAX_TRCOUNT = 1024 };
	387
	388	/* A bracket operator.
	389	e.g., [a-c], [[:alpha:]], etc. */
	390	struct mb_char_classes
	391	{
	392	ptrdiff_t cset;
	393	bool invert;
	394	wchar_t chars; / Normal characters. */
	395	idx_t nchars;
	396	idx_t nchars_alloc;
	397	};
	398
	399	struct regex_syntax
	400	{
	401	/* Syntax bits controlling the behavior of the lexical analyzer. */
	402	reg_syntax_t syntax_bits;
	403	int dfaopts;
	404	bool syntax_bits_set;
	405
	406	/* Flag for case-folding letters into sets. */
	407	bool case_fold;
	408
	409	/* End-of-line byte in data. */
	410	unsigned char eolbyte;
	411
	412	/* Cache of char-context values. */
	413	char sbit[NOTCHAR];
	414
	415	/* If never_trail[B], the byte B cannot be a non-initial byte in a
	416	multibyte character. */
	417	bool never_trail[NOTCHAR];
	418
	419	/* Set of characters considered letters. */
	420	charclass letters;
	421
	422	/* Set of characters that are newline. */
	423	charclass newline;
	424	};
	425
	426	/* Lexical analyzer. All the dross that deals with the obnoxious
	427	GNU Regex syntax bits is located here. The poor, suffering
	428	reader is referred to the GNU Regex documentation for the
	429	meaning of the @#%!@#%^!@ syntax bits. */
	430	struct lexer_state
	431	{
	432	char const ptr; / Pointer to next input character. */
	433	idx_t left; /* Number of characters remaining. */
	434	token lasttok; /* Previous token returned; initially END. */
	435	idx_t parens; /* Count of outstanding left parens. */
	436	int minrep, maxrep; /* Repeat counts for {m,n}. */
	437
	438	/* Wide character representation of the current multibyte character,
	439	or WEOF if there was an encoding error. Used only if
	440	MB_CUR_MAX > 1. */
	441	wint_t wctok;
	442
	443	/* The most recently analyzed multibyte bracket expression. */
	444	struct mb_char_classes brack;
	445
	446	/* We're separated from beginning or (, \| only by zero-width characters. */
	447	bool laststart;
	448	};
	449
	450	/* Recursive descent parser for regular expressions. */
	451
	452	struct parser_state
	453	{
	454	token tok; /* Lookahead token. */
	455	idx_t depth; /* Current depth of a hypothetical stack
	456	holding deferred productions. This is
	457	used to determine the depth that will be
	458	required of the real stack later on in
	459	dfaanalyze. */
	460	};
	461
	462	/* A compiled regular expression. */
	463	struct dfa
	464	{
	465	/* Fields filled by the scanner. */
	466	charclass charclasses; / Array of character sets for CSET tokens. */
	467	idx_t cindex; /* Index for adding new charclasses. */
	468	idx_t calloc; /* Number of charclasses allocated. */
	469	ptrdiff_t canychar; /* Index of anychar class, or -1. */
	470
	471	/* Scanner state */
	472	struct lexer_state lex;
	473
	474	/* Parser state */
	475	struct parser_state parse;
	476
	477	/* Fields filled by the parser. */
	478	token tokens; / Postfix parse array. */
	479	idx_t tindex; /* Index for adding new tokens. */
	480	idx_t talloc; /* Number of tokens currently allocated. */
	481	idx_t depth; /* Depth required of an evaluation stack
	482	used for depth-first traversal of the
	483	parse tree. */
	484	idx_t nleaves; /* Number of non-EMPTY leaves
	485	in the parse tree. */
	486	idx_t nregexps; /* Count of parallel regexps being built
	487	with dfaparse. */
	488	bool fast; /* The DFA is fast. */
	489	bool epsilon; /* Does a token match only the empty string? */
	490	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
	491	mbstate_t mbs; /* Multibyte conversion state. */
	492
	493	/* The following are valid only if MB_CUR_MAX > 1. */
	494
	495	/* The value of multibyte_prop[i] is defined by following rule.
	496	if tokens[i] < NOTCHAR
	497	bit 0 : tokens[i] is the first byte of a character, including
	498	single-byte characters.
	499	bit 1 : tokens[i] is the last byte of a character, including
	500	single-byte characters.
	501
	502	e.g.
	503	tokens
	504	= 'single_byte_a', 'multi_byte_A', single_byte_b'
	505	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
	506	multibyte_prop
	507	= 3 , 1 , 0 , 2 , 3
	508	*/
	509	char *multibyte_prop;
	510
	511	/* Fields filled by the superset. */
	512	struct dfa superset; / Hint of the dfa. */
	513
	514	/* Fields filled by the state builder. */
	515	dfa_state states; / States of the dfa. */
	516	state_num sindex; /* Index for adding new states. */
	517	idx_t salloc; /* Number of states currently allocated. */
	518
	519	/* Fields filled by the parse tree->NFA conversion. */
	520	position_set follows; / Array of follow sets, indexed by position
	521	index. The follow of a position is the set
	522	of positions containing characters that
	523	could conceivably follow a character
	524	matching the given position in a string
	525	matching the regexp. Allocated to the
	526	maximum possible position index. */
	527	bool searchflag; /* We are supposed to build a searching
	528	as opposed to an exact matcher. A searching
	529	matcher finds the first and shortest string
	530	matching a regexp anywhere in the buffer,
	531	whereas an exact matcher finds the longest
	532	string matching, but anchored to the
	533	beginning of the buffer. */
	534
	535	/* Fields filled by dfaanalyze. */
	536	int constraints; / Array of union of accepting constraints
	537	in the follow of a position. */
	538	int separates; / Array of contexts on follow of a
	539	position. */
	540
	541	/* Fields filled by dfaexec. */
	542	state_num tralloc; /* Number of transition tables that have
	543	slots so far, not counting trans[-1] and
	544	trans[-2]. */
	545	int trcount; /* Number of transition tables that have
	546	been built, other than for initial
	547	states. */
	548	int min_trcount; /* Number of initial states. Equivalently,
	549	the minimum state number for which trcount
	550	counts transitions. */
	551	state_num *trans; / Transition tables for states that can
	552	never accept. If the transitions for a
	553	state have not yet been computed, or the
	554	state could possibly accept, its entry in
	555	this table is NULL. This points to two
	556	past the start of the allocated array,
	557	and trans[-1] and trans[-2] are always
	558	NULL. */
	559	state_num *fails; / Transition tables after failing to accept
	560	on a state that potentially could do so.
	561	If trans[i] is non-null, fails[i] must
	562	be null. */
	563	char success; / Table of acceptance conditions used in
	564	dfaexec and computed in build_state. */
	565	state_num newlines; / Transitions on newlines. The entry for a
	566	newline in any transition table is always
	567	-1 so we can count lines without wasting
	568	too many cycles. The transition for a
	569	newline is stored separately and handled
	570	as a special case. Newline is also used
	571	as a sentinel at the end of the buffer. */
	572	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
	573	context in multibyte locales, in which we
	574	do not distinguish between their contexts,
	575	as not supported word. */
	576	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
	577	state_num *mb_trans; / Transition tables for states with
	578	ANYCHAR. */
	579	state_num mb_trcount; /* Number of transition tables for states with
	580	ANYCHAR that have actually been built. */
	581
	582	/* Syntax configuration. This is near the end so that dfacopysyntax
	583	can memset up to here. */
	584	struct regex_syntax syntax;
	585
	586	/* Information derived from the locale. This is at the end so that
	587	a quick memset need not clear it specially. */
	588
	589	/* dfaexec implementation. */
	590	char (dfaexec) (struct dfa , char const , char *,
	591	bool, idx_t , bool );
	592
	593	/* Other cached information derived from the locale. */
	594	struct localeinfo localeinfo;
	595	};
	596
	597	/* User access to dfa internals. */
	598
	599	/* S could possibly be an accepting state of R. */
	600	static bool
	601	accepting (state_num s, struct dfa const *r)
	602	{
	603	return r->states[s].constraint != 0;
	604	}
	605
	606	/* STATE accepts in the specified context. */
	607	static bool
	608	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
	609	{
	610	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
	611	}
	612
	613	static void regexp (struct dfa *dfa);
	614
	615	/* Store into *PWC the result of converting the leading bytes of the
	616	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
	617	and updating the conversion state in *D. On conversion error,
	618	convert just a single byte, to WEOF. Return the number of bytes
	619	converted.
	620
	621	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
	622
	623	* PWC points to wint_t, not to wchar_t.
	624	* The last arg is a dfa *D instead of merely a multibyte conversion
	625	state D->mbs.
	626	* N is idx_t not size_t, and must be at least 1.
	627	* S[N - 1] must be a sentinel byte.
	628	* Shift encodings are not supported.
	629	* The return value is always in the range 1..N.
	630	* D->mbs is always valid afterwards.
	631	* PWC is always set to something. /
	632	static int
	633	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
	634	{
	635	unsigned char uc = s[0];
	636	wint_t wc = d->localeinfo.sbctowc[uc];
	637
	638	if (wc == WEOF)
	639	{
	640	wchar_t wch;
	641	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
	642	if (0 < nbytes && nbytes < (size_t) -2)
	643	{
	644	*pwc = wch;
	645	return nbytes;
	646	}
	647	memset (&d->mbs, 0, sizeof d->mbs);
	648	}
	649
	650	*pwc = wc;
	651	return 1;
	652	}
	653
	654	#ifdef DEBUG
	655
	656	static void
	657	prtok (token t)
	658	{
	659	if (t <= END)
	660	fprintf (stderr, "END");
	661	else if (0 <= t && t < NOTCHAR)
	662	{
	663	unsigned int ch = t;
	664	fprintf (stderr, "0x%02x", ch);
	665	}
	666	else
	667	{
	668	char const *s;
	669	switch (t)
	670	{
	671	case BEG:
	672	s = "BEG";
	673	break;
	674	case EMPTY:
	675	s = "EMPTY";
	676	break;
	677	case BACKREF:
	678	s = "BACKREF";
	679	break;
	680	case BEGLINE:
	681	s = "BEGLINE";
	682	break;
	683	case ENDLINE:
	684	s = "ENDLINE";
	685	break;
	686	case BEGWORD:
	687	s = "BEGWORD";
	688	break;
	689	case ENDWORD:
	690	s = "ENDWORD";
	691	break;
	692	case LIMWORD:
	693	s = "LIMWORD";
	694	break;
	695	case NOTLIMWORD:
	696	s = "NOTLIMWORD";
	697	break;
	698	case QMARK:
	699	s = "QMARK";
	700	break;
	701	case STAR:
	702	s = "STAR";
	703	break;
	704	case PLUS:
	705	s = "PLUS";
	706	break;
	707	case CAT:
	708	s = "CAT";
	709	break;
	710	case OR:
	711	s = "OR";
	712	break;
	713	case LPAREN:
	714	s = "LPAREN";
	715	break;
	716	case RPAREN:
	717	s = "RPAREN";
	718	break;
	719	case ANYCHAR:
	720	s = "ANYCHAR";
	721	break;
	722	case MBCSET:
	723	s = "MBCSET";
	724	break;
	725	default:
	726	s = "CSET";
	727	break;
	728	}
	729	fprintf (stderr, "%s", s);
	730	}
	731	}
	732	#endif /* DEBUG */
	733
	734	/* Stuff pertaining to charclasses. */
	735
	736	static bool
	737	tstbit (unsigned int b, charclass const *c)
	738	{
	739	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
	740	}
	741
	742	static void
	743	setbit (unsigned int b, charclass *c)
	744	{
	745	charclass_word one = 1;
	746	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
	747	}
	748
	749	static void
	750	clrbit (unsigned int b, charclass *c)
	751	{
	752	charclass_word one = 1;
	753	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
	754	}
	755
	756	static void
	757	zeroset (charclass *s)
	758	{
	759	memset (s, 0, sizeof *s);
	760	}
	761
	762	static void
	763	fillset (charclass *s)
	764	{
	765	for (int i = 0; i < CHARCLASS_WORDS; i++)
	766	s->w[i] = CHARCLASS_WORD_MASK;
	767	}
	768
	769	static void
	770	notset (charclass *s)
	771	{
	772	for (int i = 0; i < CHARCLASS_WORDS; ++i)
	773	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
	774	}
	775
	776	static bool
	777	equal (charclass const s1, charclass const s2)
	778	{
	779	charclass_word w = 0;
	780	for (int i = 0; i < CHARCLASS_WORDS; i++)
	781	w \|= s1->w[i] ^ s2->w[i];
	782	return w == 0;
	783	}
	784
	785	static bool
	786	emptyset (charclass const *s)
	787	{
	788	charclass_word w = 0;
	789	for (int i = 0; i < CHARCLASS_WORDS; i++)
	790	w \|= s->w[i];
	791	return w == 0;
	792	}
	793
	794	/* Ensure that the array addressed by PA holds at least I + 1 items.
	795	Either return PA, or reallocate the array and return its new address.
	796	Although PA may be null, the returned value is never null.
	797
	798	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
	799	is updated on reallocation. If PA is null, *NITEMS must be zero.
	800	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
	801	ITEM_SIZE is the size of one item; it must be positive.
	802	Avoid O(N*2) behavior on arrays growing linearly. /
	803	static void *
	804	maybe_realloc (void pa, idx_t i, idx_t nitems,
	805	ptrdiff_t nitems_max, idx_t item_size)
	806	{
	807	if (i < *nitems)
	808	return pa;
	809	return xpalloc (pa, nitems, 1, nitems_max, item_size);
	810	}
	811
	812	/* In DFA D, find the index of charclass S, or allocate a new one. */
	813	static idx_t
	814	charclass_index (struct dfa d, charclass const s)
	815	{
	816	idx_t i;
	817
	818	for (i = 0; i < d->cindex; ++i)
	819	if (equal (s, &d->charclasses[i]))
	820	return i;
	821	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
	822	TOKEN_MAX - CSET, sizeof *d->charclasses);
	823	++d->cindex;
	824	d->charclasses[i] = *s;
	825	return i;
	826	}
	827
	828	static bool
	829	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
	830	{
	831	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
	832	}
	833
	834	static int
	835	char_context (struct dfa const *dfa, unsigned char c)
	836	{
	837	if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
	838	return CTX_NEWLINE;
	839	if (unibyte_word_constituent (dfa, c))
	840	return CTX_LETTER;
	841	return CTX_NONE;
	842	}
	843
	844	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
	845	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
	846	this may happen when folding case in weird Turkish locales where
	847	dotless i/dotted I are not included in the chosen character set.
	848	Return whether a bit was set in the charclass. */
	849	static bool
	850	setbit_wc (wint_t wc, charclass *c)
	851	{
	852	int b = wctob (wc);
	853	if (b < 0)
	854	return false;
	855
	856	setbit (b, c);
	857	return true;
	858	}
	859
	860	/* Set a bit for B and its case variants in the charclass C.
	861	MB_CUR_MAX must be 1. */
	862	static void
	863	setbit_case_fold_c (int b, charclass *c)
	864	{
	865	int ub = toupper (b);
	866	for (int i = 0; i < NOTCHAR; i++)
	867	if (toupper (i) == ub)
	868	setbit (i, c);
	869	}
	870
	871	/* Fetch the next lexical input character from the pattern. There
	872	must at least one byte of pattern input. Set DFA->lex.wctok to the
	873	value of the character or to WEOF depending on whether the input is
	874	a valid multibyte character (possibly of length 1). Then return
	875	the next input byte value, except return EOF if the input is a
	876	multibyte character of length greater than 1. */
	877	static int
	878	fetch_wc (struct dfa *dfa)
	879	{
	880	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
	881	dfa);
	882	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
	883	dfa->lex.ptr += nbytes;
	884	dfa->lex.left -= nbytes;
	885	return c;
	886	}
	887
	888	/* If there is no more input, report an error about unbalanced brackets.
	889	Otherwise, behave as with fetch_wc (DFA). */
	890	static int
	891	bracket_fetch_wc (struct dfa *dfa)
	892	{
	893	if (! dfa->lex.left)
	894	dfaerror (_("unbalanced ["));
	895	return fetch_wc (dfa);
	896	}
	897
	898	typedef int predicate (int);
	899
	900	/* The following list maps the names of the Posix named character classes
	901	to predicate functions that determine whether a given character is in
	902	the class. The leading [ has already been eaten by the lexical
	903	analyzer. */
	904	struct dfa_ctype
	905	{
	906	const char *name;
	907	predicate *func;
	908	bool single_byte_only;
	909	};
	910
	911	static const struct dfa_ctype prednames[] = {
	912	{"alpha", isalpha, false},
	913	{"upper", isupper, false},
	914	{"lower", islower, false},
	915	{"digit", isdigit, true},
	916	{"xdigit", isxdigit, false},
	917	{"space", isspace, false},
	918	{"punct", ispunct, false},
	919	{"alnum", isalnum, false},
	920	{"print", isprint, false},
	921	{"graph", isgraph, false},
	922	{"cntrl", iscntrl, false},
	923	{"blank", isblank, false},
	924	{NULL, NULL, false}
	925	};
	926
	927	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
	928	find_pred (const char *str)
	929	{
	930	for (int i = 0; prednames[i].name; i++)
	931	if (str_eq (str, prednames[i].name))
	932	return &prednames[i];
	933	return NULL;
	934	}
	935
	936	/* Parse a bracket expression, which possibly includes multibyte
	937	characters. */
	938	static token
	939	parse_bracket_exp (struct dfa *dfa)
	940	{
	941	/* This is a bracket expression that dfaexec is known to
	942	process correctly. */
	943	bool known_bracket_exp = true;
	944
	945	/* Used to warn about [:space:].
	946	Bit 0 = first character is a colon.
	947	Bit 1 = last character is a colon.
	948	Bit 2 = includes any other character but a colon.
	949	Bit 3 = includes ranges, char/equiv classes or collation elements. */
	950	int colon_warning_state;
	951
	952	dfa->lex.brack.nchars = 0;
	953	charclass ccl;
	954	zeroset (&ccl);
	955	int c = bracket_fetch_wc (dfa);
	956	bool invert = c == '^';
	957	if (invert)
	958	{
	959	c = bracket_fetch_wc (dfa);
	960	known_bracket_exp = dfa->localeinfo.simple;
	961	}
	962	wint_t wc = dfa->lex.wctok;
	963	int c1;
	964	wint_t wc1;
	965	colon_warning_state = (c == ':');
	966	do
	967	{
	968	c1 = NOTCHAR; /* Mark c1 as not initialized. */
	969	colon_warning_state &= ~2;
	970
	971	/* Note that if we're looking at some other [:...:] construct,
	972	we just treat it as a bunch of ordinary characters. We can do
	973	this because we assume regex has checked for syntax errors before
	974	dfa is ever called. */
	975	if (c == '[')
	976	{
	977	c1 = bracket_fetch_wc (dfa);
	978	wc1 = dfa->lex.wctok;
	979
	980	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
	981	\|\| c1 == '.' \|\| c1 == '=')
	982	{
	983	enum { MAX_BRACKET_STRING_LEN = 32 };
	984	char str[MAX_BRACKET_STRING_LEN + 1];
	985	int len = 0;
	986	for (;;)
	987	{
	988	c = bracket_fetch_wc (dfa);
	989	if (dfa->lex.left == 0
	990	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
	991	break;
	992	if (len < MAX_BRACKET_STRING_LEN)
	993	str[len++] = c;
	994	else
	995	/* This is in any case an invalid class name. */
	996	str[0] = '\0';
	997	}
	998	str[len] = '\0';
	999
	1000	/* Fetch bracket. */
	1001	c = bracket_fetch_wc (dfa);
	1002	wc = dfa->lex.wctok;
	1003	if (c1 == ':')
	1004	/* Build character class. POSIX allows character
	1005	classes to match multicharacter collating elements,
	1006	but the regex code does not support that, so do not
	1007	worry about that possibility. */
	1008	{
	1009	char const *class
	1010	= (dfa->syntax.case_fold && (str_eq (str, "upper")
	1011	\|\| str_eq (str, "lower"))
	1012	? "alpha" : str);
	1013	const struct dfa_ctype *pred = find_pred (class);
	1014	if (!pred)
	1015	dfaerror (_("invalid character class"));
	1016
	1017	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
	1018	known_bracket_exp = false;
	1019	else
	1020	for (int c2 = 0; c2 < NOTCHAR; ++c2)
	1021	if (pred->func (c2))
	1022	setbit (c2, &ccl);
	1023	}
	1024	else
	1025	known_bracket_exp = false;
	1026
	1027	colon_warning_state \|= 8;
	1028
	1029	/* Fetch new lookahead character. */
	1030	c1 = bracket_fetch_wc (dfa);
	1031	wc1 = dfa->lex.wctok;
	1032	continue;
	1033	}
	1034
	1035	/* We treat '[' as a normal character here. c/c1/wc/wc1
	1036	are already set up. */
	1037	}
	1038
	1039	if (c == '\\'
	1040	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
	1041	{
	1042	c = bracket_fetch_wc (dfa);
	1043	wc = dfa->lex.wctok;
	1044	}
	1045
	1046	if (c1 == NOTCHAR)
	1047	{
	1048	c1 = bracket_fetch_wc (dfa);
	1049	wc1 = dfa->lex.wctok;
	1050	}
	1051
	1052	if (c1 == '-')
	1053	/* build range characters. */
	1054	{
	1055	int c2 = bracket_fetch_wc (dfa);
	1056	wint_t wc2 = dfa->lex.wctok;
	1057
	1058	/* A bracket expression like [a-[.aa.]] matches an unknown set.
	1059	Treat it like [-a[.aa.]] while parsing it, and
	1060	remember that the set is unknown. */
	1061	if (c2 == '[' && dfa->lex.ptr[0] == '.')
	1062	{
	1063	known_bracket_exp = false;
	1064	c2 = ']';
	1065	}
	1066
	1067	if (c2 == ']')
	1068	{
	1069	/* In the case [x-], the - is an ordinary hyphen,
	1070	which is left in c1, the lookahead character. */
	1071	dfa->lex.ptr--;
	1072	dfa->lex.left++;
	1073	}
	1074	else
	1075	{
	1076	if (c2 == '\\' && (dfa->syntax.syntax_bits
	1077	& RE_BACKSLASH_ESCAPE_IN_LISTS))
	1078	{
	1079	c2 = bracket_fetch_wc (dfa);
	1080	wc2 = dfa->lex.wctok;
	1081	}
	1082
	1083	colon_warning_state \|= 8;
	1084	c1 = bracket_fetch_wc (dfa);
	1085	wc1 = dfa->lex.wctok;
	1086
	1087	/* Treat [x-y] as a range if x != y. */
	1088	if (wc != wc2 \|\| wc == WEOF)
	1089	{
	1090	if (dfa->localeinfo.simple
	1091	\|\| (c_isdigit (c) & c_isdigit (c2)))
	1092	{
	1093	for (int ci = c; ci <= c2; ci++)
	1094	if (dfa->syntax.case_fold && isalpha (ci))
	1095	setbit_case_fold_c (ci, &ccl);
	1096	else
	1097	setbit (ci, &ccl);
	1098	}
	1099	else
	1100	known_bracket_exp = false;
	1101
	1102	continue;
	1103	}
	1104	}
	1105	}
	1106
	1107	colon_warning_state \|= (c == ':') ? 2 : 4;
	1108
	1109	if (!dfa->localeinfo.multibyte)
	1110	{
	1111	if (dfa->syntax.case_fold && isalpha (c))
	1112	setbit_case_fold_c (c, &ccl);
	1113	else
	1114	setbit (c, &ccl);
	1115	continue;
	1116	}
	1117
	1118	if (wc == WEOF)
	1119	known_bracket_exp = false;
	1120	else
	1121	{
	1122	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
	1123	int n = (dfa->syntax.case_fold
	1124	? case_folded_counterparts (wc, folded + 1) + 1
	1125	: 1);
	1126	folded[0] = wc;
	1127	for (int i = 0; i < n; i++)
	1128	if (!setbit_wc (folded[i], &ccl))
	1129	{
	1130	dfa->lex.brack.chars
	1131	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
	1132	&dfa->lex.brack.nchars_alloc, -1,
	1133	sizeof *dfa->lex.brack.chars);
	1134	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
	1135	}
	1136	}
	1137	}
	1138	while ((wc = wc1, (c = c1) != ']'));
	1139
	1140	if (colon_warning_state == 7)
	1141	((dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR
	1142	? dfaerror : dfawarn)
	1143	(_("character class syntax is [[:space:]], not [:space:]")));
	1144
	1145	if (! known_bracket_exp)
	1146	return BACKREF;
	1147
	1148	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
	1149	{
	1150	dfa->lex.brack.invert = invert;
	1151	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
	1152	return MBCSET;
	1153	}
	1154
	1155	if (invert)
	1156	{
	1157	notset (&ccl);
	1158	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
	1159	clrbit ('\n', &ccl);
	1160	}
	1161
	1162	return CSET + charclass_index (dfa, &ccl);
	1163	}
	1164
	1165	struct lexptr
	1166	{
	1167	char const *ptr;
	1168	idx_t left;
	1169	};
	1170
	1171	static void
	1172	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
	1173	{
	1174	ls->ptr = dfa->lex.ptr;
	1175	ls->left = dfa->lex.left;
	1176	dfa->lex.ptr = s;
	1177	dfa->lex.left = strlen (s);
	1178	}
	1179
	1180	static void
	1181	pop_lex_state (struct dfa dfa, struct lexptr const ls)
	1182	{
	1183	dfa->lex.ptr = ls->ptr;
	1184	dfa->lex.left = ls->left;
	1185	}
	1186
	1187	static token
	1188	lex (struct dfa *dfa)
	1189	{
	1190	bool backslash = false;
	1191
	1192	/* Basic plan: We fetch a character. If it's a backslash,
	1193	we set the backslash flag and go through the loop again.
	1194	On the plus side, this avoids having a duplicate of the
	1195	main switch inside the backslash case. On the minus side,
	1196	it means that just about every case tests the backslash flag. */
	1197	for (int i = 0; i < 2; ++i)
	1198	{
	1199	if (! dfa->lex.left)
	1200	return dfa->lex.lasttok = END;
	1201	int c = fetch_wc (dfa);
	1202
	1203	switch (c)
	1204	{
	1205	case '\\':
	1206	if (backslash)
	1207	goto normal_char;
	1208	if (dfa->lex.left == 0)
	1209	dfaerror (_("unfinished \\ escape"));
	1210	backslash = true;
	1211	break;
	1212
	1213	case '^':
	1214	if (backslash)
	1215	goto normal_char;
	1216	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
	1217	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
	1218	\|\| dfa->lex.lasttok == OR)
	1219	return dfa->lex.lasttok = BEGLINE;
	1220	goto normal_char;
	1221
	1222	case '$':
	1223	if (backslash)
	1224	goto normal_char;
[3657]	1225	/* kmk: cl v19.29.30139/amd64 messes this function up when optimizing
	1226	for speed, workaround is to optimize it for size instead. The
	1227	symptom is that the following SED expression fail to match:
	1228	s/^[0-9a-fA-F]\{1,\} $00[0-9a-fA-F]$ ABS notype External \| $[^.]\{1,\}$\.$.*$$/ 1=\1 2=\2 3=\3/
	1229
	1230	Seems the exact problem is that it gets the indexing here wrong:
	1231	dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) & (dfa->lex.ptr[0] == '\\')]
	1232	It forgets to do the ` dfa->lex.ptr[0] == '\\' ` part and instead
	1233	ANDs with a register initialized to zero. Rewriting the
	1234	expressions using the tinary operator works around the problem,
	1235	although the resulting code is a lot bulkier.
	1236	*/
[3611]	1237	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
	1238	\|\| dfa->lex.left == 0
[3657]	1239	#ifdef _MSC_VER /* see above */
	1240	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
	1241	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == ')'
	1242	: dfa->lex.left > 0 && dfa->lex.ptr[0] == ')')
	1243	#else
[3611]	1244	\|\| ((dfa->lex.left
	1245	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
	1246	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
	1247	& (dfa->lex.ptr[0] == '\\')]
	1248	== ')'))
[3657]	1249	#endif
	1250	#ifdef _MSC_VER /* see above */
	1251	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
	1252	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == '\|'
	1253	: dfa->lex.left > 0 && dfa->lex.ptr[0] == '\|')
	1254	#else
[3611]	1255	\|\| ((dfa->lex.left
	1256	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
	1257	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
	1258	& (dfa->lex.ptr[0] == '\\')]
	1259	== '\|'))
[3657]	1260	#endif
[3611]	1261	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
	1262	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
	1263	return dfa->lex.lasttok = ENDLINE;
	1264	goto normal_char;
	1265
	1266	case '1':
	1267	case '2':
	1268	case '3':
	1269	case '4':
	1270	case '5':
	1271	case '6':
	1272	case '7':
	1273	case '8':
	1274	case '9':
	1275	if (!backslash)
	1276	goto normal_char;
	1277	if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
	1278	goto stray_backslash;
	1279
	1280	dfa->lex.laststart = false;
	1281	return dfa->lex.lasttok = BACKREF;
	1282
	1283	case '`':
	1284	if (!backslash)
	1285	goto normal_char;
	1286	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1287	goto stray_backslash;
	1288
	1289	/* FIXME: should be beginning of string */
	1290	return dfa->lex.lasttok = BEGLINE;
	1291
	1292	case '\'':
	1293	if (!backslash)
	1294	goto normal_char;
	1295	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1296	goto stray_backslash;
	1297
	1298	/* FIXME: should be end of string */
	1299	return dfa->lex.lasttok = ENDLINE;
	1300
	1301	case '<':
	1302	if (!backslash)
	1303	goto normal_char;
	1304	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1305	goto stray_backslash;
	1306
	1307	return dfa->lex.lasttok = BEGWORD;
	1308
	1309	case '>':
	1310	if (!backslash)
	1311	goto normal_char;
	1312	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1313	goto stray_backslash;
	1314
	1315	return dfa->lex.lasttok = ENDWORD;
	1316
	1317	case 'b':
	1318	if (!backslash)
	1319	goto normal_char;
	1320	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1321	goto stray_backslash;
	1322
	1323	return dfa->lex.lasttok = LIMWORD;
	1324
	1325	case 'B':
	1326	if (!backslash)
	1327	goto normal_char;
	1328	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1329	goto stray_backslash;
	1330
	1331	return dfa->lex.lasttok = NOTLIMWORD;
	1332
	1333	case '?':
	1334	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
	1335	goto default_case;
	1336	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
	1337	goto normal_char;
	1338	if (dfa->lex.laststart)
	1339	{
	1340	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
	1341	goto default_case;
	1342	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
	1343	dfawarn (_("? at start of expression"));
	1344	}
	1345	return dfa->lex.lasttok = QMARK;
	1346
	1347	case '*':
	1348	if (backslash)
	1349	goto normal_char;
	1350	if (dfa->lex.laststart)
	1351	{
	1352	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
	1353	goto default_case;
	1354	if (dfa->syntax.dfaopts & DFA_STAR_WARN)
	1355	dfawarn (_("* at start of expression"));
	1356	}
	1357	return dfa->lex.lasttok = STAR;
	1358
	1359	case '+':
	1360	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
	1361	goto default_case;
	1362	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
	1363	goto normal_char;
	1364	if (dfa->lex.laststart)
	1365	{
	1366	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
	1367	goto default_case;
	1368	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
	1369	dfawarn (_("+ at start of expression"));
	1370	}
	1371	return dfa->lex.lasttok = PLUS;
	1372
	1373	case '{':
	1374	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
	1375	goto default_case;
	1376	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
	1377	goto normal_char;
	1378
	1379	/* Cases:
	1380	{M} - exact count
	1381	{M,} - minimum count, maximum is infinity
	1382	{,N} - 0 through N
	1383	{,} - 0 to infinity (same as '*')
	1384	{M,N} - M through N */
	1385	{
	1386	char const *p = dfa->lex.ptr;
	1387	char const *lim = p + dfa->lex.left;
	1388	dfa->lex.minrep = dfa->lex.maxrep = -1;
	1389	for (; p != lim && c_isdigit (*p); p++)
	1390	dfa->lex.minrep = (dfa->lex.minrep < 0
	1391	? *p - '0'
	1392	: MIN (RE_DUP_MAX + 1,
	1393	dfa->lex.minrep * 10 + *p - '0'));
	1394	if (p != lim)
	1395	{
	1396	if (*p != ',')
	1397	dfa->lex.maxrep = dfa->lex.minrep;
	1398	else
	1399	{
	1400	if (dfa->lex.minrep < 0)
	1401	dfa->lex.minrep = 0;
	1402	while (++p != lim && c_isdigit (*p))
	1403	dfa->lex.maxrep
	1404	= (dfa->lex.maxrep < 0
	1405	? *p - '0'
	1406	: MIN (RE_DUP_MAX + 1,
	1407	dfa->lex.maxrep * 10 + *p - '0'));
	1408	}
	1409	}
	1410	bool invalid_content
	1411	= ! ((! backslash \|\| (p != lim && *p++ == '\\'))
	1412	&& p != lim && *p++ == '}'
	1413	&& 0 <= dfa->lex.minrep
	1414	&& (dfa->lex.maxrep < 0
	1415	\|\| dfa->lex.minrep <= dfa->lex.maxrep));
	1416	if (invalid_content
	1417	&& (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
	1418	goto normal_char;
	1419	if (dfa->lex.laststart)
	1420	{
	1421	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
	1422	goto default_case;
	1423	if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
	1424	dfawarn (_("{...} at start of expression"));
	1425	}
	1426	if (invalid_content)
	1427	dfaerror (_("invalid content of \\{\\}"));
	1428	if (RE_DUP_MAX < dfa->lex.maxrep)
	1429	dfaerror (_("regular expression too big"));
	1430	dfa->lex.ptr = p;
	1431	dfa->lex.left = lim - p;
	1432	}
	1433	dfa->lex.laststart = false;
	1434	return dfa->lex.lasttok = REPMN;
	1435
	1436	case '\|':
	1437	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
	1438	goto default_case;
	1439	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
	1440	goto normal_char;
	1441	dfa->lex.laststart = true;
	1442	return dfa->lex.lasttok = OR;
	1443
	1444	case '\n':
	1445	if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
	1446	goto default_case;
	1447	if (backslash)
	1448	goto normal_char;
	1449	dfa->lex.laststart = true;
	1450	return dfa->lex.lasttok = OR;
	1451
	1452	case '(':
	1453	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
	1454	goto normal_char;
	1455	dfa->lex.parens++;
	1456	dfa->lex.laststart = true;
	1457	return dfa->lex.lasttok = LPAREN;
	1458
	1459	case ')':
	1460	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
	1461	goto normal_char;
	1462	if (dfa->lex.parens == 0
	1463	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
	1464	goto normal_char;
	1465	dfa->lex.parens--;
	1466	dfa->lex.laststart = false;
	1467	return dfa->lex.lasttok = RPAREN;
	1468
	1469	case '.':
	1470	if (backslash)
	1471	goto normal_char;
	1472	if (dfa->canychar < 0)
	1473	{
	1474	charclass ccl;
	1475	fillset (&ccl);
	1476	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
	1477	clrbit ('\n', &ccl);
	1478	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
	1479	clrbit ('\0', &ccl);
	1480	if (dfa->localeinfo.multibyte)
	1481	for (int c2 = 0; c2 < NOTCHAR; c2++)
	1482	if (dfa->localeinfo.sbctowc[c2] == WEOF)
	1483	clrbit (c2, &ccl);
	1484	dfa->canychar = charclass_index (dfa, &ccl);
	1485	}
	1486	dfa->lex.laststart = false;
	1487	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
	1488	? ANYCHAR
	1489	: CSET + dfa->canychar);
	1490
	1491	case 's':
	1492	case 'S':
	1493	if (!backslash)
	1494	goto normal_char;
	1495	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1496	goto stray_backslash;
	1497
	1498	if (!dfa->localeinfo.multibyte)
	1499	{
	1500	charclass ccl;
	1501	zeroset (&ccl);
	1502	for (int c2 = 0; c2 < NOTCHAR; ++c2)
	1503	if (isspace (c2))
	1504	setbit (c2, &ccl);
	1505	if (c == 'S')
	1506	notset (&ccl);
	1507	dfa->lex.laststart = false;
	1508	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
	1509	}
	1510
	1511	/* FIXME: see if optimizing this, as is done with ANYCHAR and
	1512	add_utf8_anychar, makes sense. */
	1513
	1514	/* \s and \S are documented to be equivalent to [[:space:]] and
	1515	[^[:space:]] respectively, so tell the lexer to process those
	1516	strings, each minus its "already processed" '['. */
	1517	{
	1518	struct lexptr ls;
	1519	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
	1520	dfa->lex.lasttok = parse_bracket_exp (dfa);
	1521	pop_lex_state (dfa, &ls);
	1522	}
	1523
	1524	dfa->lex.laststart = false;
	1525	return dfa->lex.lasttok;
	1526
	1527	case 'w':
	1528	case 'W':
	1529	if (!backslash)
	1530	goto normal_char;
	1531	if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
	1532	goto stray_backslash;
	1533
	1534	if (!dfa->localeinfo.multibyte)
	1535	{
	1536	charclass ccl;
	1537	zeroset (&ccl);
	1538	for (int c2 = 0; c2 < NOTCHAR; ++c2)
	1539	if (dfa->syntax.sbit[c2] == CTX_LETTER)
	1540	setbit (c2, &ccl);
	1541	if (c == 'W')
	1542	notset (&ccl);
	1543	dfa->lex.laststart = false;
	1544	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
	1545	}
	1546
	1547	/* FIXME: see if optimizing this, as is done with ANYCHAR and
	1548	add_utf8_anychar, makes sense. */
	1549
	1550	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
	1551	[^_[:alnum:]] respectively, so tell the lexer to process those
	1552	strings, each minus its "already processed" '['. */
	1553	{
	1554	struct lexptr ls;
	1555	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
	1556	dfa->lex.lasttok = parse_bracket_exp (dfa);
	1557	pop_lex_state (dfa, &ls);
	1558	}
	1559
	1560	dfa->lex.laststart = false;
	1561	return dfa->lex.lasttok;
	1562
	1563	case '[':
	1564	if (backslash)
	1565	goto normal_char;
	1566	dfa->lex.laststart = false;
	1567	return dfa->lex.lasttok = parse_bracket_exp (dfa);
	1568
	1569	default:
	1570	default_case:
	1571	if (!backslash)
	1572	goto normal_char;
	1573	stray_backslash:
	1574	if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
	1575	{
	1576	char const *msg;
	1577	char msgbuf[100];
	1578	if (!iswprint (dfa->lex.wctok))
	1579	msg = _("stray \\ before unprintable character");
	1580	else if (iswspace (dfa->lex.wctok))
	1581	msg = _("stray \\ before white space");
	1582	else
	1583	{
	1584	int n = snprintf (msgbuf, sizeof msgbuf,
	1585	_("stray \\ before %lc"), dfa->lex.wctok);
	1586	msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
	1587	}
	1588	dfawarn (msg);
	1589	}
	1590	FALLTHROUGH;
	1591	case ']': case '}':
	1592	normal_char:
	1593	dfa->lex.laststart = false;
	1594	/* For multibyte character sets, folding is done in atom. Always
	1595	return WCHAR. */
	1596	if (dfa->localeinfo.multibyte)
	1597	return dfa->lex.lasttok = WCHAR;
	1598
	1599	if (dfa->syntax.case_fold && isalpha (c))
	1600	{
	1601	charclass ccl;
	1602	zeroset (&ccl);
	1603	setbit_case_fold_c (c, &ccl);
	1604	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
	1605	}
	1606
	1607	return dfa->lex.lasttok = c;
	1608	}
	1609	}
	1610
	1611	/* The above loop should consume at most a backslash
	1612	and some other character. */
	1613	abort ();
	1614	return END; /* keeps pedantic compilers happy. */
	1615	}
	1616
	1617	static void
	1618	addtok_mb (struct dfa *dfa, token t, char mbprop)
	1619	{
	1620	if (dfa->talloc == dfa->tindex)
	1621	{
	1622	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
	1623	sizeof *dfa->tokens);
	1624	if (dfa->localeinfo.multibyte)
	1625	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
	1626	sizeof *dfa->multibyte_prop);
	1627	}
	1628	if (dfa->localeinfo.multibyte)
	1629	dfa->multibyte_prop[dfa->tindex] = mbprop;
	1630	dfa->tokens[dfa->tindex++] = t;
	1631
	1632	switch (t)
	1633	{
	1634	case QMARK:
	1635	case STAR:
	1636	case PLUS:
	1637	break;
	1638
	1639	case CAT:
	1640	case OR:
	1641	dfa->parse.depth--;
	1642	break;
	1643
	1644	case EMPTY:
	1645	dfa->epsilon = true;
	1646	goto increment_depth;
	1647
	1648	case BACKREF:
	1649	dfa->fast = false;
	1650	goto increment_nleaves;
	1651
	1652	case BEGLINE:
	1653	case ENDLINE:
	1654	case BEGWORD:
	1655	case ENDWORD:
	1656	case LIMWORD:
	1657	case NOTLIMWORD:
	1658	dfa->epsilon = true;
	1659	FALLTHROUGH;
	1660	default:
	1661	increment_nleaves:
	1662	dfa->nleaves++;
	1663	increment_depth:
	1664	dfa->parse.depth++;
	1665	if (dfa->depth < dfa->parse.depth)
	1666	dfa->depth = dfa->parse.depth;
	1667	break;
	1668	}
	1669	}
	1670
	1671	static void addtok_wc (struct dfa *dfa, wint_t wc);
	1672
	1673	/* Add the given token to the parse tree, maintaining the depth count and
	1674	updating the maximum depth if necessary. */
	1675	static void
	1676	addtok (struct dfa *dfa, token t)
	1677	{
	1678	if (dfa->localeinfo.multibyte && t == MBCSET)
	1679	{
	1680	bool need_or = false;
	1681
	1682	/* Extract wide characters into alternations for better performance.
	1683	This does not require UTF-8. */
	1684	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
	1685	{
	1686	addtok_wc (dfa, dfa->lex.brack.chars[i]);
	1687	if (need_or)
	1688	addtok (dfa, OR);
	1689	need_or = true;
	1690	}
	1691	dfa->lex.brack.nchars = 0;
	1692
	1693	/* Wide characters have been handled above, so it is possible
	1694	that the set is empty now. Do nothing in that case. */
	1695	if (dfa->lex.brack.cset != -1)
	1696	{
	1697	addtok (dfa, CSET + dfa->lex.brack.cset);
	1698	if (need_or)
	1699	addtok (dfa, OR);
	1700	}
	1701	}
	1702	else
	1703	{
	1704	addtok_mb (dfa, t, 3);
	1705	}
	1706	}
	1707
	1708	/* We treat a multibyte character as a single atom, so that DFA
	1709	can treat a multibyte character as a single expression.
	1710
	1711	e.g., we construct the following tree from "<mb1><mb2>".
	1712	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
	1713	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
	1714	static void
	1715	addtok_wc (struct dfa *dfa, wint_t wc)
	1716	{
	1717	unsigned char buf[MB_LEN_MAX];
	1718	mbstate_t s = { 0 };
	1719	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
	1720	int buflen;
	1721
	1722	if (stored_bytes != (size_t) -1)
	1723	buflen = stored_bytes;
	1724	else
	1725	{
	1726	/* This is merely stop-gap. buf[0] is undefined, yet skipping
	1727	the addtok_mb call altogether can corrupt the heap. */
	1728	buflen = 1;
	1729	buf[0] = 0;
	1730	}
	1731
	1732	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
	1733	for (int i = 1; i < buflen; i++)
	1734	{
	1735	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
	1736	addtok (dfa, CAT);
	1737	}
	1738	}
	1739
	1740	static void
	1741	add_utf8_anychar (struct dfa *dfa)
	1742	{
	1743	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
	1744	UTF-8 byte sequence has been defined as follows:
	1745
	1746	([\x00-\x7f]
	1747	\|[\xc2-\xdf][\x80-\xbf]
	1748	\|[\xe0][\xa0-\xbf][\x80-\xbf]
	1749	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
	1750	\|[\xed][\x80-\x9f][\x80-\xbf]
	1751	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
	1752	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
	1753	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
	1754
	1755	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
	1756	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
	1757	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
	1758	H = [\x80-\x9f], I = [\xf0],
	1759	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
	1760
	1761	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
	1762
	1763	/* Mnemonics for classes containing two or more bytes. */
	1764	enum { A, B, C, E, F, H, J, K, M };
	1765
	1766	/* Mnemonics for single-byte tokens. */
	1767	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
	1768
	1769	static charclass const utf8_classes[] = {
	1770	/* A. 00-7f: 1-byte sequence. */
	1771	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
	1772
	1773	/* B. c2-df: 1st byte of a 2-byte sequence. */
	1774	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
	1775
	1776	/* C. 80-bf: non-leading bytes. */
	1777	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
	1778
	1779	/* D. e0 (just a token). */
	1780
	1781	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
	1782	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
	1783
	1784	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
	1785	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
	1786
	1787	/* G. ed (just a token). */
	1788
	1789	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
	1790	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
	1791
	1792	/* I. f0 (just a token). */
	1793
	1794	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
	1795	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
	1796
	1797	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
	1798	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
	1799
	1800	/* L. f4 (just a token). */
	1801
	1802	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
	1803	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
	1804	};
	1805
	1806	/* Define the character classes that are needed below. */
	1807	if (dfa->utf8_anychar_classes[0] == 0)
	1808	{
	1809	charclass c = utf8_classes[0];
	1810	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
	1811	clrbit ('\n', &c);
	1812	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
	1813	clrbit ('\0', &c);
	1814	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
	1815
	1816	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
	1817	dfa->utf8_anychar_classes[i]
	1818	= CSET + charclass_index (dfa, &utf8_classes[i]);
	1819	}
	1820
	1821	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
	1822	The token buffer is in reverse Polish order, so we get
	1823	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
	1824	C CAT OR C CAT OR C CAT OR". */
	1825	addtok (dfa, dfa->utf8_anychar_classes[A]);
	1826	addtok (dfa, dfa->utf8_anychar_classes[B]);
	1827	addtok (dfa, D_token);
	1828	addtok (dfa, dfa->utf8_anychar_classes[E]);
	1829	addtok (dfa, CAT);
	1830	addtok (dfa, OR);
	1831	addtok (dfa, G_token);
	1832	addtok (dfa, dfa->utf8_anychar_classes[H]);
	1833	addtok (dfa, CAT);
	1834	addtok (dfa, OR);
	1835	addtok (dfa, dfa->utf8_anychar_classes[F]);
	1836	addtok (dfa, I_token);
	1837	addtok (dfa, dfa->utf8_anychar_classes[J]);
	1838	addtok (dfa, CAT);
	1839	addtok (dfa, OR);
	1840	addtok (dfa, L_token);
	1841	addtok (dfa, dfa->utf8_anychar_classes[M]);
	1842	addtok (dfa, CAT);
	1843	addtok (dfa, OR);
	1844	addtok (dfa, dfa->utf8_anychar_classes[K]);
	1845	for (int i = 0; i < 3; i++)
	1846	{
	1847	addtok (dfa, dfa->utf8_anychar_classes[C]);
	1848	addtok (dfa, CAT);
	1849	addtok (dfa, OR);
	1850	}
	1851	}
	1852
	1853	/* The grammar understood by the parser is as follows.
	1854
	1855	regexp:
	1856	regexp OR branch
	1857	branch
	1858
	1859	branch:
	1860	branch closure
	1861	closure
	1862
	1863	closure:
	1864	closure QMARK
	1865	closure STAR
	1866	closure PLUS
	1867	closure REPMN
	1868	atom
	1869
	1870	atom:
	1871	<normal character>
	1872	<multibyte character>
	1873	ANYCHAR
	1874	MBCSET
	1875	CSET
	1876	BACKREF
	1877	BEGLINE
	1878	ENDLINE
	1879	BEGWORD
	1880	ENDWORD
	1881	LIMWORD
	1882	NOTLIMWORD
	1883	LPAREN regexp RPAREN
	1884	<empty>
	1885
	1886	The parser builds a parse tree in postfix form in an array of tokens. */
	1887
	1888	static void
	1889	atom (struct dfa *dfa)
	1890	{
	1891	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
	1892	\|\| dfa->parse.tok >= CSET
	1893	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
	1894	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
	1895	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
	1896	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
	1897	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
	1898	{
	1899	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
	1900	{
	1901	/* For UTF-8 expand the period to a series of CSETs that define a
	1902	valid UTF-8 character. This avoids using the slow multibyte
	1903	path. I'm pretty sure it would be both profitable and correct to
	1904	do it for any encoding; however, the optimization must be done
	1905	manually as it is done above in add_utf8_anychar. So, let's
	1906	start with UTF-8: it is the most used, and the structure of the
	1907	encoding makes the correctness more obvious. */
	1908	add_utf8_anychar (dfa);
	1909	}
	1910	else
	1911	addtok (dfa, dfa->parse.tok);
	1912	dfa->parse.tok = lex (dfa);
	1913	}
	1914	else if (dfa->parse.tok == WCHAR)
	1915	{
	1916	if (dfa->lex.wctok == WEOF)
	1917	addtok (dfa, BACKREF);
	1918	else
	1919	{
	1920	addtok_wc (dfa, dfa->lex.wctok);
	1921
	1922	if (dfa->syntax.case_fold)
	1923	{
	1924	wchar_t folded[CASE_FOLDED_BUFSIZE];
	1925	int n = case_folded_counterparts (dfa->lex.wctok, folded);
	1926	for (int i = 0; i < n; i++)
	1927	{
	1928	addtok_wc (dfa, folded[i]);
	1929	addtok (dfa, OR);
	1930	}
	1931	}
	1932	}
	1933
	1934	dfa->parse.tok = lex (dfa);
	1935	}
	1936	else if (dfa->parse.tok == LPAREN)
	1937	{
	1938	dfa->parse.tok = lex (dfa);
	1939	regexp (dfa);
	1940	if (dfa->parse.tok != RPAREN)
	1941	dfaerror (_("unbalanced ("));
	1942	dfa->parse.tok = lex (dfa);
	1943	}
	1944	else
	1945	addtok (dfa, EMPTY);
	1946	}
	1947
	1948	/* Return the number of tokens in the given subexpression. */
	1949	static idx_t _GL_ATTRIBUTE_PURE
	1950	nsubtoks (struct dfa const *dfa, idx_t tindex)
	1951	{
	1952	switch (dfa->tokens[tindex - 1])
	1953	{
	1954	default:
	1955	return 1;
	1956	case QMARK:
	1957	case STAR:
	1958	case PLUS:
	1959	return 1 + nsubtoks (dfa, tindex - 1);
	1960	case CAT:
	1961	case OR:
	1962	{
	1963	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
	1964	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
	1965	}
	1966	}
	1967	}
	1968
	1969	/* Copy the given subexpression to the top of the tree. */
	1970	static void
	1971	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
	1972	{
	1973	if (dfa->localeinfo.multibyte)
	1974	for (idx_t i = 0; i < ntokens; i++)
	1975	addtok_mb (dfa, dfa->tokens[tindex + i],
	1976	dfa->multibyte_prop[tindex + i]);
	1977	else
	1978	for (idx_t i = 0; i < ntokens; i++)
	1979	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
	1980	}
	1981
	1982	static void
	1983	closure (struct dfa *dfa)
	1984	{
	1985	atom (dfa);
	1986	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
	1987	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
	1988	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
	1989	{
	1990	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
	1991	idx_t tindex = dfa->tindex - ntokens;
	1992	if (dfa->lex.maxrep < 0)
	1993	addtok (dfa, PLUS);
	1994	if (dfa->lex.minrep == 0)
	1995	addtok (dfa, QMARK);
	1996	int i;
	1997	for (i = 1; i < dfa->lex.minrep; i++)
	1998	{
	1999	copytoks (dfa, tindex, ntokens);
	2000	addtok (dfa, CAT);
	2001	}
	2002	for (; i < dfa->lex.maxrep; i++)
	2003	{
	2004	copytoks (dfa, tindex, ntokens);
	2005	addtok (dfa, QMARK);
	2006	addtok (dfa, CAT);
	2007	}
	2008	dfa->parse.tok = lex (dfa);
	2009	}
	2010	else if (dfa->parse.tok == REPMN)
	2011	{
	2012	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
	2013	dfa->parse.tok = lex (dfa);
	2014	closure (dfa);
	2015	}
	2016	else
	2017	{
	2018	addtok (dfa, dfa->parse.tok);
	2019	dfa->parse.tok = lex (dfa);
	2020	}
	2021	}
	2022
	2023	static void
	2024	branch (struct dfa* dfa)
	2025	{
	2026	closure (dfa);
	2027	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
	2028	&& dfa->parse.tok >= 0)
	2029	{
	2030	closure (dfa);
	2031	addtok (dfa, CAT);
	2032	}
	2033	}
	2034
	2035	static void
	2036	regexp (struct dfa *dfa)
	2037	{
	2038	branch (dfa);
	2039	while (dfa->parse.tok == OR)
	2040	{
	2041	dfa->parse.tok = lex (dfa);
	2042	branch (dfa);
	2043	addtok (dfa, OR);
	2044	}
	2045	}
	2046
	2047	/* Parse a string S of length LEN into D. S can include NUL characters.
	2048	This is the main entry point for the parser. */
	2049	void
	2050	dfaparse (char const s, idx_t len, struct dfa d)
	2051	{
	2052	d->lex.ptr = s;
	2053	d->lex.left = len;
	2054	d->lex.lasttok = END;
	2055	d->lex.laststart = true;
	2056
	2057	if (!d->syntax.syntax_bits_set)
	2058	dfaerror (_("no syntax specified"));
	2059
	2060	if (!d->nregexps)
	2061	addtok (d, BEG);
	2062
	2063	d->parse.tok = lex (d);
	2064	d->parse.depth = d->depth;
	2065
	2066	regexp (d);
	2067
	2068	if (d->parse.tok != END)
	2069	dfaerror (_("unbalanced )"));
	2070
	2071	addtok (d, END - d->nregexps);
	2072	addtok (d, CAT);
	2073
	2074	if (d->nregexps)
	2075	addtok (d, OR);
	2076
	2077	++d->nregexps;
	2078	}
	2079
	2080	/* Some primitives for operating on sets of positions. */
	2081
	2082	/* Copy one set to another. */
	2083	static void
	2084	copy (position_set const src, position_set dst)
	2085	{
	2086	if (dst->alloc < src->nelem)
	2087	{
	2088	free (dst->elems);
	2089	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
	2090	sizeof *dst->elems);
	2091	}
	2092	dst->nelem = src->nelem;
	2093	if (src->nelem != 0)
	2094	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
	2095	}
	2096
	2097	static void
	2098	alloc_position_set (position_set *s, idx_t size)
	2099	{
	2100	s->elems = xnmalloc (size, sizeof *s->elems);
	2101	s->alloc = size;
	2102	s->nelem = 0;
	2103	}
	2104
	2105	/* Insert position P in set S. S is maintained in sorted order on
	2106	decreasing index. If there is already an entry in S with P.index
	2107	then merge (logically-OR) P's constraints into the one in S.
	2108	S->elems must point to an array large enough to hold the resulting set. */
	2109	static void
	2110	insert (position p, position_set *s)
	2111	{
	2112	idx_t count = s->nelem;
	2113	idx_t lo = 0, hi = count;
	2114	while (lo < hi)
	2115	{
	2116	idx_t mid = (lo + hi) >> 1;
	2117	if (s->elems[mid].index < p.index)
	2118	lo = mid + 1;
	2119	else if (s->elems[mid].index == p.index)
	2120	{
	2121	s->elems[mid].constraint \|= p.constraint;
	2122	return;
	2123	}
	2124	else
	2125	hi = mid;
	2126	}
	2127
	2128	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
	2129	for (idx_t i = count; i > lo; i--)
	2130	s->elems[i] = s->elems[i - 1];
	2131	s->elems[lo] = p;
	2132	++s->nelem;
	2133	}
	2134
	2135	static void
	2136	append (position p, position_set *s)
	2137	{
	2138	idx_t count = s->nelem;
	2139	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
	2140	s->elems[s->nelem++] = p;
	2141	}
	2142
	2143	/* Merge S1 and S2 (with the additional constraint C2) into M. The
	2144	result is as if the positions of S1, and of S2 with the additional
	2145	constraint C2, were inserted into an initially empty set. */
	2146	static void
	2147	merge_constrained (position_set const s1, position_set const s2,
	2148	unsigned int c2, position_set *m)
	2149	{
	2150	idx_t i = 0, j = 0;
	2151
	2152	if (m->alloc - s1->nelem < s2->nelem)
	2153	{
	2154	free (m->elems);
	2155	m->alloc = s1->nelem;
	2156	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
	2157	}
	2158	m->nelem = 0;
	2159	while (i < s1->nelem \|\| j < s2->nelem)
	2160	if (! (j < s2->nelem)
	2161	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
	2162	{
	2163	unsigned int c = ((i < s1->nelem && j < s2->nelem
	2164	&& s1->elems[i].index == s2->elems[j].index)
	2165	? s2->elems[j++].constraint & c2
	2166	: 0);
	2167	m->elems[m->nelem].index = s1->elems[i].index;
	2168	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
	2169	}
	2170	else
	2171	{
	2172	if (s2->elems[j].constraint & c2)
	2173	{
	2174	m->elems[m->nelem].index = s2->elems[j].index;
	2175	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
	2176	}
	2177	j++;
	2178	}
	2179	}
	2180
	2181	/* Merge two sets of positions into a third. The result is exactly as if
	2182	the positions of both sets were inserted into an initially empty set. */
	2183	static void
	2184	merge (position_set const s1, position_set const s2, position_set *m)
	2185	{
	2186	merge_constrained (s1, s2, -1, m);
	2187	}
	2188
	2189	/* Merge into DST all the elements of SRC, possibly destroying
	2190	the contents of the temporary M. */
	2191	static void
	2192	merge2 (position_set dst, position_set const src, position_set *m)
	2193	{
	2194	if (src->nelem < 4)
	2195	{
	2196	for (idx_t i = 0; i < src->nelem; i++)
	2197	insert (src->elems[i], dst);
	2198	}
	2199	else
	2200	{
	2201	merge (src, dst, m);
	2202	copy (m, dst);
	2203	}
	2204	}
	2205
	2206	/* Delete a position from a set. Return the nonzero constraint of the
	2207	deleted position, or zero if there was no such position. */
	2208	static unsigned int
	2209	delete (idx_t del, position_set *s)
	2210	{
	2211	idx_t count = s->nelem;
	2212	idx_t lo = 0, hi = count;
	2213	while (lo < hi)
	2214	{
	2215	idx_t mid = (lo + hi) >> 1;
	2216	if (s->elems[mid].index < del)
	2217	lo = mid + 1;
	2218	else if (s->elems[mid].index == del)
	2219	{
	2220	unsigned int c = s->elems[mid].constraint;
	2221	idx_t i;
	2222	for (i = mid; i + 1 < count; i++)
	2223	s->elems[i] = s->elems[i + 1];
	2224	s->nelem = i;
	2225	return c;
	2226	}
	2227	else
	2228	hi = mid;
	2229	}
	2230	return 0;
	2231	}
	2232
	2233	/* Replace a position with the followed set. */
	2234	static void
	2235	replace (position_set dst, idx_t del, position_set add,
	2236	unsigned int constraint, position_set *tmp)
	2237	{
	2238	unsigned int c = delete (del, dst) & constraint;
	2239
	2240	if (c)
	2241	{
	2242	copy (dst, tmp);
	2243	merge_constrained (tmp, add, c, dst);
	2244	}
	2245	}
	2246
	2247	/* Find the index of the state corresponding to the given position set with
	2248	the given preceding context, or create a new state if there is no such
	2249	state. Context tells whether we got here on a newline or letter. */
	2250	static state_num
	2251	state_index (struct dfa d, position_set const s, int context)
	2252	{
	2253	size_t hash = 0;
	2254	int constraint = 0;
	2255	state_num i;
	2256
	2257	for (i = 0; i < s->nelem; ++i)
	2258	{
	2259	idx_t ind = s->elems[i].index;
	2260	hash ^= ind + s->elems[i].constraint;
	2261	}
	2262
	2263	/* Try to find a state that exactly matches the proposed one. */
	2264	for (i = 0; i < d->sindex; ++i)
	2265	{
	2266	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
	2267	\|\| context != d->states[i].context)
	2268	continue;
	2269	state_num j;
	2270	for (j = 0; j < s->nelem; ++j)
	2271	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
	2272	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
	2273	break;
	2274	if (j == s->nelem)
	2275	return i;
	2276	}
	2277
	2278	#ifdef DEBUG
	2279	fprintf (stderr, "new state %td\n nextpos:", i);
	2280	for (state_num j = 0; j < s->nelem; j++)
	2281	{
	2282	fprintf (stderr, " %td:", s->elems[j].index);
	2283	prtok (d->tokens[s->elems[j].index]);
	2284	}
	2285	fprintf (stderr, "\n context:");
	2286	if (context ^ CTX_ANY)
	2287	{
	2288	if (context & CTX_NONE)
	2289	fprintf (stderr, " CTX_NONE");
	2290	if (context & CTX_LETTER)
	2291	fprintf (stderr, " CTX_LETTER");
	2292	if (context & CTX_NEWLINE)
	2293	fprintf (stderr, " CTX_NEWLINE");
	2294	}
	2295	else
	2296	fprintf (stderr, " CTX_ANY");
	2297	fprintf (stderr, "\n");
	2298	#endif
	2299
	2300	for (state_num j = 0; j < s->nelem; j++)
	2301	{
	2302	int c = d->constraints[s->elems[j].index];
	2303
	2304	if (c != 0)
	2305	{
	2306	if (succeeds_in_context (c, context, CTX_ANY))
	2307	constraint \|= c;
	2308	}
	2309	else if (d->tokens[s->elems[j].index] == BACKREF)
	2310	constraint = NO_CONSTRAINT;
	2311	}
	2312
	2313
	2314	/* Create a new state. */
	2315	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
	2316	sizeof *d->states);
	2317	d->states[i].hash = hash;
	2318	alloc_position_set (&d->states[i].elems, s->nelem);
	2319	copy (s, &d->states[i].elems);
	2320	d->states[i].context = context;
	2321	d->states[i].constraint = constraint;
	2322	d->states[i].mbps.nelem = 0;
	2323	d->states[i].mbps.elems = NULL;
	2324	d->states[i].mb_trindex = -1;
	2325
	2326	++d->sindex;
	2327
	2328	return i;
	2329	}
	2330
	2331	/* Find the epsilon closure of D's set of positions. If any position of the set
	2332	contains a symbol that matches the empty string in some context, replace
	2333	that position with the elements of its follow labeled with an appropriate
	2334	constraint. Repeat exhaustively until no funny positions are left.
	2335	S->elems must be large enough to hold the result. BACKWARD is D's
	2336	backward set; use and update it too. */
	2337	static void
	2338	epsclosure (struct dfa const d, position_set backward)
	2339	{
	2340	position_set tmp;
	2341	alloc_position_set (&tmp, d->nleaves);
	2342	for (idx_t i = 0; i < d->tindex; i++)
	2343	if (0 < d->follows[i].nelem)
	2344	{
	2345	unsigned int constraint;
	2346	switch (d->tokens[i])
	2347	{
	2348	default:
	2349	continue;
	2350
	2351	case BEGLINE:
	2352	constraint = BEGLINE_CONSTRAINT;
	2353	break;
	2354	case ENDLINE:
	2355	constraint = ENDLINE_CONSTRAINT;
	2356	break;
	2357	case BEGWORD:
	2358	constraint = BEGWORD_CONSTRAINT;
	2359	break;
	2360	case ENDWORD:
	2361	constraint = ENDWORD_CONSTRAINT;
	2362	break;
	2363	case LIMWORD:
	2364	constraint = LIMWORD_CONSTRAINT;
	2365	break;
	2366	case NOTLIMWORD:
	2367	constraint = NOTLIMWORD_CONSTRAINT;
	2368	break;
	2369	case EMPTY:
	2370	constraint = NO_CONSTRAINT;
	2371	break;
	2372	}
	2373
	2374	delete (i, &d->follows[i]);
	2375
	2376	for (idx_t j = 0; j < backward[i].nelem; j++)
	2377	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
	2378	constraint, &tmp);
	2379	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2380	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
	2381	NO_CONSTRAINT, &tmp);
	2382	}
	2383	free (tmp.elems);
	2384	}
	2385
	2386	/* Returns the set of contexts for which there is at least one
	2387	character included in C. */
	2388
	2389	static int
	2390	charclass_context (struct dfa const dfa, charclass const c)
	2391	{
	2392	int context = 0;
	2393
	2394	for (int j = 0; j < CHARCLASS_WORDS; j++)
	2395	{
	2396	if (c->w[j] & dfa->syntax.newline.w[j])
	2397	context \|= CTX_NEWLINE;
	2398	if (c->w[j] & dfa->syntax.letters.w[j])
	2399	context \|= CTX_LETTER;
	2400	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
	2401	context \|= CTX_NONE;
	2402	}
	2403
	2404	return context;
	2405	}
	2406
	2407	/* Returns the contexts on which the position set S depends. Each context
	2408	in the set of returned contexts (let's call it SC) may have a different
	2409	follow set than other contexts in SC, and also different from the
	2410	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
	2411	in the complement set will have the same follow set. */
	2412
	2413	static int _GL_ATTRIBUTE_PURE
	2414	state_separate_contexts (struct dfa d, position_set const s)
	2415	{
	2416	int separate_contexts = 0;
	2417
	2418	for (idx_t j = 0; j < s->nelem; j++)
	2419	separate_contexts \|= d->separates[s->elems[j].index];
	2420
	2421	return separate_contexts;
	2422	}
	2423
	2424	enum
	2425	{
	2426	/* Single token is repeated. It is distinguished from non-repeated. */
	2427	OPT_REPEAT = (1 << 0),
	2428
	2429	/* Multiple tokens are repeated. This flag is on at head of tokens. The
	2430	node is not merged. */
	2431	OPT_LPAREN = (1 << 1),
	2432
	2433	/* Multiple branches are joined. The node is not merged. */
	2434	OPT_RPAREN = (1 << 2),
	2435
	2436	/* The node is walked. If the node is found in walking again, OPT_RPAREN
	2437	flag is turned on. */
	2438	OPT_WALKED = (1 << 3),
	2439
	2440	/* The node is queued. The node is not queued again. */
	2441	OPT_QUEUED = (1 << 4)
	2442	};
	2443
	2444	static void
	2445	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
	2446	position_set *merged)
	2447	{
	2448	position_set *follows = d->follows;
	2449	idx_t nelem = 0;
	2450
	2451	for (idx_t i = 0; i < follows[tindex].nelem; i++)
	2452	{
	2453	idx_t sindex = follows[tindex].elems[i].index;
	2454
	2455	/* Skip the node as pruned in future. */
	2456	unsigned int iconstraint = follows[tindex].elems[i].constraint;
	2457	if (iconstraint == 0)
	2458	continue;
	2459
	2460	if (d->tokens[follows[tindex].elems[i].index] <= END)
	2461	{
	2462	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
	2463	continue;
	2464	}
	2465
	2466	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
	2467	{
	2468	idx_t j;
	2469
	2470	for (j = 0; j < nelem; j++)
	2471	{
	2472	idx_t dindex = follows[tindex].elems[j].index;
	2473
	2474	if (dindex == tindex)
	2475	continue;
	2476
	2477	if (follows[tindex].elems[j].constraint != iconstraint)
	2478	continue;
	2479
	2480	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
	2481	continue;
	2482
	2483	if (d->tokens[sindex] != d->tokens[dindex])
	2484	continue;
	2485
	2486	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
	2487	continue;
	2488
	2489	if (flags[sindex] & OPT_REPEAT)
	2490	delete (sindex, &follows[sindex]);
	2491
	2492	merge2 (&follows[dindex], &follows[sindex], merged);
	2493
	2494	break;
	2495	}
	2496
	2497	if (j < nelem)
	2498	continue;
	2499	}
	2500
	2501	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
	2502	flags[sindex] \|= OPT_QUEUED;
	2503	}
	2504
	2505	follows[tindex].nelem = nelem;
	2506	}
	2507
	2508	static int
	2509	compare (const void a, const void b)
	2510	{
	2511	position const p = a, q = b;
	2512	return (p->index > q->index) - (p->index < q->index);
	2513	}
	2514
	2515	static void
	2516	reorder_tokens (struct dfa *d)
	2517	{
	2518	idx_t nleaves = 0;
	2519	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
	2520	map[0] = nleaves++;
	2521	for (idx_t i = 1; i < d->tindex; i++)
	2522	map[i] = -1;
	2523
	2524	token tokens = xnmalloc (d->nleaves, sizeof tokens);
	2525	position_set follows = xnmalloc (d->nleaves, sizeof follows);
	2526	int constraints = xnmalloc (d->nleaves, sizeof constraints);
	2527	char *multibyte_prop = (d->localeinfo.multibyte
	2528	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
	2529	: NULL);
	2530
	2531	for (idx_t i = 0; i < d->tindex; i++)
	2532	{
	2533	if (map[i] < 0)
	2534	{
	2535	free (d->follows[i].elems);
	2536	d->follows[i].elems = NULL;
	2537	d->follows[i].nelem = 0;
	2538	continue;
	2539	}
	2540
	2541	tokens[map[i]] = d->tokens[i];
	2542	follows[map[i]] = d->follows[i];
	2543	constraints[map[i]] = d->constraints[i];
	2544
	2545	if (multibyte_prop != NULL)
	2546	multibyte_prop[map[i]] = d->multibyte_prop[i];
	2547
	2548	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2549	{
	2550	if (map[d->follows[i].elems[j].index] == -1)
	2551	map[d->follows[i].elems[j].index] = nleaves++;
	2552
	2553	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
	2554	}
	2555
	2556	qsort (d->follows[i].elems, d->follows[i].nelem,
	2557	sizeof *d->follows[i].elems, compare);
	2558	}
	2559
	2560	for (idx_t i = 0; i < nleaves; i++)
	2561	{
	2562	d->tokens[i] = tokens[i];
	2563	d->follows[i] = follows[i];
	2564	d->constraints[i] = constraints[i];
	2565
	2566	if (multibyte_prop != NULL)
	2567	d->multibyte_prop[i] = multibyte_prop[i];
	2568	}
	2569
	2570	d->tindex = d->nleaves = nleaves;
	2571
	2572	free (tokens);
	2573	free (follows);
	2574	free (constraints);
	2575	free (multibyte_prop);
	2576	free (map);
	2577	}
	2578
	2579	static void
	2580	dfaoptimize (struct dfa *d)
	2581	{
	2582	char *flags = xizalloc (d->tindex);
	2583
	2584	for (idx_t i = 0; i < d->tindex; i++)
	2585	{
	2586	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2587	{
	2588	if (d->follows[i].elems[j].index == i)
	2589	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
	2590	else if (d->follows[i].elems[j].index < i)
	2591	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
	2592	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
	2593	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
	2594	else
	2595	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
	2596	}
	2597	}
	2598
	2599	flags[0] \|= OPT_QUEUED;
	2600
	2601	position_set merged0;
	2602	position_set *merged = &merged0;
	2603	alloc_position_set (merged, d->nleaves);
	2604
	2605	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
	2606
	2607	for (idx_t i = 0; i < d->tindex; i++)
	2608	if (flags[i] & OPT_QUEUED)
	2609	merge_nfa_state (d, i, flags, merged);
	2610
	2611	reorder_tokens (d);
	2612
	2613	free (merged->elems);
	2614	free (flags);
	2615	}
	2616
	2617	/* Perform bottom-up analysis on the parse tree, computing various functions.
	2618	Note that at this point, we're pretending constructs like \< are real
	2619	characters rather than constraints on what can follow them.
	2620
	2621	Nullable: A node is nullable if it is at the root of a regexp that can
	2622	match the empty string.
	2623	* EMPTY leaves are nullable.
	2624	* No other leaf is nullable.
	2625	* A QMARK or STAR node is nullable.
	2626	* A PLUS node is nullable if its argument is nullable.
	2627	* A CAT node is nullable if both its arguments are nullable.
	2628	* An OR node is nullable if either argument is nullable.
	2629
	2630	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
	2631	that could correspond to the first character of a string matching the
	2632	regexp rooted at the given node.
	2633	* EMPTY leaves have empty firstpos.
	2634	* The firstpos of a nonempty leaf is that leaf itself.
	2635	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
	2636	argument.
	2637	* The firstpos of a CAT node is the firstpos of the left argument, union
	2638	the firstpos of the right if the left argument is nullable.
	2639	* The firstpos of an OR node is the union of firstpos of each argument.
	2640
	2641	Lastpos: The lastpos of a node is the set of positions that could
	2642	correspond to the last character of a string matching the regexp at
	2643	the given node.
	2644	* EMPTY leaves have empty lastpos.
	2645	* The lastpos of a nonempty leaf is that leaf itself.
	2646	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
	2647	argument.
	2648	* The lastpos of a CAT node is the lastpos of its right argument, union
	2649	the lastpos of the left if the right argument is nullable.
	2650	* The lastpos of an OR node is the union of the lastpos of each argument.
	2651
	2652	Follow: The follow of a position is the set of positions that could
	2653	correspond to the character following a character matching the node in
	2654	a string matching the regexp. At this point we consider special symbols
	2655	that match the empty string in some context to be just normal characters.
	2656	Later, if we find that a special symbol is in a follow set, we will
	2657	replace it with the elements of its follow, labeled with an appropriate
	2658	constraint.
	2659	* Every node in the firstpos of the argument of a STAR or PLUS node is in
	2660	the follow of every node in the lastpos.
	2661	* Every node in the firstpos of the second argument of a CAT node is in
	2662	the follow of every node in the lastpos of the first argument.
	2663
	2664	Because of the postfix representation of the parse tree, the depth-first
	2665	analysis is conveniently done by a linear scan with the aid of a stack.
	2666	Sets are stored as arrays of the elements, obeying a stack-like allocation
	2667	scheme; the number of elements in each set deeper in the stack can be
	2668	used to determine the address of a particular set's array. */
	2669	static void
	2670	dfaanalyze (struct dfa *d, bool searchflag)
	2671	{
	2672	/* Array allocated to hold position sets. */
	2673	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
	2674	/* Firstpos and lastpos elements. */
	2675	position *firstpos = posalloc;
	2676	position *lastpos = firstpos + d->nleaves;
	2677	position pos;
	2678	position_set tmp;
	2679
	2680	/* Stack for element counts and nullable flags. */
	2681	struct
	2682	{
	2683	/* Whether the entry is nullable. */
	2684	bool nullable;
	2685
	2686	/* Counts of firstpos and lastpos sets. */
	2687	idx_t nfirstpos;
	2688	idx_t nlastpos;
	2689	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
	2690
	2691	position_set merged; /* Result of merging sets. */
	2692
	2693	addtok (d, CAT);
	2694	idx_t tindex = d->tindex;
	2695
	2696	#ifdef DEBUG
	2697	fprintf (stderr, "dfaanalyze:\n");
	2698	for (idx_t i = 0; i < tindex; i++)
	2699	{
	2700	fprintf (stderr, " %td:", i);
	2701	prtok (d->tokens[i]);
	2702	}
	2703	putc ('\n', stderr);
	2704	#endif
	2705
	2706	d->searchflag = searchflag;
	2707	alloc_position_set (&merged, d->nleaves);
	2708	d->follows = xicalloc (tindex, sizeof *d->follows);
	2709	position_set *backward
	2710	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
	2711
	2712	for (idx_t i = 0; i < tindex; i++)
	2713	{
	2714	switch (d->tokens[i])
	2715	{
	2716	case EMPTY:
	2717	/* The empty set is nullable. */
	2718	stk->nullable = true;
	2719
	2720	/* The firstpos and lastpos of the empty leaf are both empty. */
	2721	stk->nfirstpos = stk->nlastpos = 0;
	2722	stk++;
	2723	break;
	2724
	2725	case STAR:
	2726	case PLUS:
	2727	/* Every element in the lastpos of the argument is in the backward
	2728	set of every element in the firstpos. */
	2729	if (d->epsilon)
	2730	{
	2731	tmp.elems = lastpos - stk[-1].nlastpos;
	2732	tmp.nelem = stk[-1].nlastpos;
	2733	for (position *p = firstpos - stk[-1].nfirstpos;
	2734	p < firstpos; p++)
	2735	merge2 (&backward[p->index], &tmp, &merged);
	2736	}
	2737
	2738	/* Every element in the firstpos of the argument is in the follow
	2739	of every element in the lastpos. */
	2740	{
	2741	tmp.elems = firstpos - stk[-1].nfirstpos;
	2742	tmp.nelem = stk[-1].nfirstpos;
	2743	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
	2744	merge2 (&d->follows[p->index], &tmp, &merged);
	2745	}
	2746	FALLTHROUGH;
	2747	case QMARK:
	2748	/* A QMARK or STAR node is automatically nullable. */
	2749	if (d->tokens[i] != PLUS)
	2750	stk[-1].nullable = true;
	2751	break;
	2752
	2753	case CAT:
	2754	/* Every element in the lastpos of the first argument is in
	2755	the backward set of every element in the firstpos of the
	2756	second argument. */
	2757	if (backward)
	2758	{
	2759	tmp.nelem = stk[-2].nlastpos;
	2760	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
	2761	for (position *p = firstpos - stk[-1].nfirstpos;
	2762	p < firstpos; p++)
	2763	merge2 (&backward[p->index], &tmp, &merged);
	2764	}
	2765
	2766	/* Every element in the firstpos of the second argument is in the
	2767	follow of every element in the lastpos of the first argument. */
	2768	{
	2769	tmp.nelem = stk[-1].nfirstpos;
	2770	tmp.elems = firstpos - stk[-1].nfirstpos;
	2771	for (position *plim = lastpos - stk[-1].nlastpos,
	2772	*p = plim - stk[-2].nlastpos;
	2773	p < plim; p++)
	2774	merge2 (&d->follows[p->index], &tmp, &merged);
	2775	}
	2776
	2777	/* The firstpos of a CAT node is the firstpos of the first argument,
	2778	union that of the second argument if the first is nullable. */
	2779	if (stk[-2].nullable)
	2780	stk[-2].nfirstpos += stk[-1].nfirstpos;
	2781	else
	2782	firstpos -= stk[-1].nfirstpos;
	2783
	2784	/* The lastpos of a CAT node is the lastpos of the second argument,
	2785	union that of the first argument if the second is nullable. */
	2786	if (stk[-1].nullable)
	2787	stk[-2].nlastpos += stk[-1].nlastpos;
	2788	else
	2789	{
	2790	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
	2791	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
	2792	p[j] = p[j + stk[-2].nlastpos];
	2793	lastpos -= stk[-2].nlastpos;
	2794	stk[-2].nlastpos = stk[-1].nlastpos;
	2795	}
	2796
	2797	/* A CAT node is nullable if both arguments are nullable. */
	2798	stk[-2].nullable &= stk[-1].nullable;
	2799	stk--;
	2800	break;
	2801
	2802	case OR:
	2803	/* The firstpos is the union of the firstpos of each argument. */
	2804	stk[-2].nfirstpos += stk[-1].nfirstpos;
	2805
	2806	/* The lastpos is the union of the lastpos of each argument. */
	2807	stk[-2].nlastpos += stk[-1].nlastpos;
	2808
	2809	/* An OR node is nullable if either argument is nullable. */
	2810	stk[-2].nullable \|= stk[-1].nullable;
	2811	stk--;
	2812	break;
	2813
	2814	default:
	2815	/* Anything else is a nonempty position. (Note that special
	2816	constructs like \< are treated as nonempty strings here;
	2817	an "epsilon closure" effectively makes them nullable later.
	2818	Backreferences have to get a real position so we can detect
	2819	transitions on them later. But they are nullable. */
	2820	stk->nullable = d->tokens[i] == BACKREF;
	2821
	2822	/* This position is in its own firstpos and lastpos. */
	2823	stk->nfirstpos = stk->nlastpos = 1;
	2824	stk++;
	2825
	2826	firstpos->index = lastpos->index = i;
	2827	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
	2828	firstpos++, lastpos++;
	2829
	2830	break;
	2831	}
	2832	#ifdef DEBUG
	2833	/* ... balance the above nonsyntactic #ifdef goo... */
	2834	fprintf (stderr, "node %td:", i);
	2835	prtok (d->tokens[i]);
	2836	putc ('\n', stderr);
	2837	fprintf (stderr,
	2838	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
	2839	fprintf (stderr, " firstpos:");
	2840	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
	2841	{
	2842	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
	2843	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
	2844	}
	2845	fprintf (stderr, "\n lastpos:");
	2846	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
	2847	{
	2848	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
	2849	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
	2850	}
	2851	putc ('\n', stderr);
	2852	#endif
	2853	}
	2854
	2855	if (backward)
	2856	{
	2857	/* For each follow set that is the follow set of a real position,
	2858	replace it with its epsilon closure. */
	2859	epsclosure (d, backward);
	2860
	2861	for (idx_t i = 0; i < tindex; i++)
	2862	free (backward[i].elems);
	2863	free (backward);
	2864	}
	2865
	2866	dfaoptimize (d);
	2867
	2868	#ifdef DEBUG
	2869	for (idx_t i = 0; i < tindex; i++)
	2870	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
	2871	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
	2872	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
	2873	{
	2874	fprintf (stderr, "follows(%td:", i);
	2875	prtok (d->tokens[i]);
	2876	fprintf (stderr, "):");
	2877	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2878	{
	2879	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
	2880	prtok (d->tokens[d->follows[i].elems[j].index]);
	2881	}
	2882	putc ('\n', stderr);
	2883	}
	2884	#endif
	2885
	2886	pos.index = 0;
	2887	pos.constraint = NO_CONSTRAINT;
	2888
	2889	alloc_position_set (&tmp, 1);
	2890
	2891	append (pos, &tmp);
	2892
	2893	d->separates = xicalloc (tindex, sizeof *d->separates);
	2894
	2895	for (idx_t i = 0; i < tindex; i++)
	2896	{
	2897	if (prev_newline_dependent (d->constraints[i]))
	2898	d->separates[i] \|= CTX_NEWLINE;
	2899	if (prev_letter_dependent (d->constraints[i]))
	2900	d->separates[i] \|= CTX_LETTER;
	2901
	2902	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2903	{
	2904	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
	2905	d->separates[i] \|= CTX_NEWLINE;
	2906	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
	2907	d->separates[i] \|= CTX_LETTER;
	2908	}
	2909	}
	2910
	2911	/* Context wanted by some position. */
	2912	int separate_contexts = state_separate_contexts (d, &tmp);
	2913
	2914	/* Build the initial state. */
	2915	if (separate_contexts & CTX_NEWLINE)
	2916	state_index (d, &tmp, CTX_NEWLINE);
	2917	d->initstate_notbol = d->min_trcount
	2918	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
	2919	if (separate_contexts & CTX_LETTER)
	2920	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
	2921	d->min_trcount++;
	2922	d->trcount = 0;
	2923
	2924	free (posalloc);
	2925	free (stkalloc);
	2926	free (merged.elems);
	2927	free (tmp.elems);
	2928	}
	2929
	2930	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
	2931	static void
	2932	realloc_trans_if_necessary (struct dfa *d)
	2933	{
	2934	state_num oldalloc = d->tralloc;
	2935	if (oldalloc < d->sindex)
	2936	{
	2937	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
	2938	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
	2939	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
	2940	-1, sizeof *realtrans);
	2941	realtrans[0] = realtrans[1] = NULL;
	2942	d->trans = realtrans + 2;
	2943	idx_t newalloc = d->tralloc = newalloc1 - 2;
	2944	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
	2945	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
	2946	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
	2947	if (d->localeinfo.multibyte)
	2948	{
	2949	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
	2950	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
	2951	if (oldalloc == 0)
	2952	realtrans[0] = realtrans[1] = NULL;
	2953	d->mb_trans = realtrans + 2;
	2954	}
	2955	for (; oldalloc < newalloc; oldalloc++)
	2956	{
	2957	d->trans[oldalloc] = NULL;
	2958	d->fails[oldalloc] = NULL;
	2959	if (d->localeinfo.multibyte)
	2960	d->mb_trans[oldalloc] = NULL;
	2961	}
	2962	}
	2963	}
	2964
	2965	/*
	2966	Calculate the transition table for a new state derived from state s
	2967	for a compiled dfa d after input character uc, and return the new
	2968	state number.
	2969
	2970	Do not worry about all possible input characters; calculate just the group
	2971	of positions that match uc. Label it with the set of characters that
	2972	every position in the group matches (taking into account, if necessary,
	2973	preceding context information of s). Then find the union
	2974	of these positions' follows, i.e., the set of positions of the
	2975	new state. For each character in the group's label, set the transition
	2976	on this character to be to a state corresponding to the set's positions,
	2977	and its associated backward context information, if necessary.
	2978
	2979	When building a searching matcher, include the positions of state
	2980	0 in every state.
	2981
	2982	The group is constructed by building an equivalence-class
	2983	partition of the positions of s.
	2984
	2985	For each position, find the set of characters C that it matches. Eliminate
	2986	any characters from C that fail on grounds of backward context.
	2987
	2988	Check whether the group's label L has nonempty
	2989	intersection with C. If L - C is nonempty, create a new group labeled
	2990	L - C and having the same positions as the current group, and set L to
	2991	the intersection of L and C. Insert the position in the group, set
	2992	C = C - L, and resume scanning.
	2993
	2994	If after comparing with every group there are characters remaining in C,
	2995	create a new group labeled with the characters of C and insert this
	2996	position in that group. */
	2997
	2998	static state_num
	2999	build_state (state_num s, struct dfa *d, unsigned char uc)
	3000	{
	3001	position_set follows; /* Union of the follows for each
	3002	position of the current state. */
	3003	position_set group; /* Positions that match the input char. */
	3004	position_set tmp; /* Temporary space for merging sets. */
	3005	state_num state; /* New state. */
	3006	state_num state_newline; /* New state on a newline transition. */
	3007	state_num state_letter; /* New state on a letter transition. */
	3008
	3009	#ifdef DEBUG
	3010	fprintf (stderr, "build state %td\n", s);
	3011	#endif
	3012
	3013	/* A pointer to the new transition table, and the table itself. */
	3014	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
	3015	state_num trans = ptrans;
	3016
	3017	if (!trans)
	3018	{
	3019	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
	3020	transition tables that can exist at once, other than for
	3021	initial states. Often-used transition tables are quickly
	3022	rebuilt, whereas rarely-used ones are cleared away. */
	3023	if (MAX_TRCOUNT <= d->trcount)
	3024	{
	3025	for (state_num i = d->min_trcount; i < d->tralloc; i++)
	3026	{
	3027	free (d->trans[i]);
	3028	free (d->fails[i]);
	3029	d->trans[i] = d->fails[i] = NULL;
	3030	}
	3031	d->trcount = 0;
	3032	}
	3033
	3034	d->trcount++;
	3035	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
	3036
	3037	/* Fill transition table with a default value which means that the
	3038	transited state has not been calculated yet. */
	3039	for (int i = 0; i < NOTCHAR; i++)
	3040	trans[i] = -2;
	3041	}
	3042
	3043	/* Set up the success bits for this state. */
	3044	d->success[s] = 0;
	3045	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
	3046	d->success[s] \|= CTX_NEWLINE;
	3047	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
	3048	d->success[s] \|= CTX_LETTER;
	3049	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
	3050	d->success[s] \|= CTX_NONE;
	3051
	3052	alloc_position_set (&follows, d->nleaves);
	3053
	3054	/* Find the union of the follows of the positions of the group.
	3055	This is a hideously inefficient loop. Fix it someday. */
	3056	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
	3057	for (idx_t k = 0;
	3058	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
	3059	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
	3060	&follows);
	3061
	3062	/* Positions that match the input char. */
	3063	alloc_position_set (&group, d->nleaves);
	3064
	3065	/* The group's label. */
	3066	charclass label;
	3067	fillset (&label);
	3068
	3069	for (idx_t i = 0; i < follows.nelem; i++)
	3070	{
	3071	charclass matches; /* Set of matching characters. */
	3072	position pos = follows.elems[i];
	3073	bool matched = false;
	3074	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
	3075	{
	3076	zeroset (&matches);
	3077	setbit (d->tokens[pos.index], &matches);
	3078	if (d->tokens[pos.index] == uc)
	3079	matched = true;
	3080	}
	3081	else if (d->tokens[pos.index] >= CSET)
	3082	{
	3083	matches = d->charclasses[d->tokens[pos.index] - CSET];
	3084	if (tstbit (uc, &matches))
	3085	matched = true;
	3086	}
	3087	else if (d->tokens[pos.index] == ANYCHAR)
	3088	{
	3089	matches = d->charclasses[d->canychar];
	3090	if (tstbit (uc, &matches))
	3091	matched = true;
	3092
	3093	/* ANYCHAR must match with a single character, so we must put
	3094	it to D->states[s].mbps which contains the positions which
	3095	can match with a single character not a byte. If all
	3096	positions which has ANYCHAR does not depend on context of
	3097	next character, we put the follows instead of it to
	3098	D->states[s].mbps to optimize. */
	3099	if (succeeds_in_context (pos.constraint, d->states[s].context,
	3100	CTX_NONE))
	3101	{
	3102	if (d->states[s].mbps.nelem == 0)
	3103	alloc_position_set (&d->states[s].mbps, 1);
	3104	insert (pos, &d->states[s].mbps);
	3105	}
	3106	}
	3107	else
	3108	continue;
	3109
	3110	/* Some characters may need to be eliminated from matches because
	3111	they fail in the current context. */
	3112	if (pos.constraint != NO_CONSTRAINT)
	3113	{
	3114	if (!succeeds_in_context (pos.constraint,
	3115	d->states[s].context, CTX_NEWLINE))
	3116	for (int j = 0; j < CHARCLASS_WORDS; j++)
	3117	matches.w[j] &= ~d->syntax.newline.w[j];
	3118	if (!succeeds_in_context (pos.constraint,
	3119	d->states[s].context, CTX_LETTER))
	3120	for (int j = 0; j < CHARCLASS_WORDS; ++j)
	3121	matches.w[j] &= ~d->syntax.letters.w[j];
	3122	if (!succeeds_in_context (pos.constraint,
	3123	d->states[s].context, CTX_NONE))
	3124	for (int j = 0; j < CHARCLASS_WORDS; ++j)
	3125	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
	3126
	3127	/* If there are no characters left, there's no point in going on. */
	3128	if (emptyset (&matches))
	3129	continue;
	3130
	3131	/* If we have reset the bit that made us declare "matched", reset
	3132	that indicator, too. This is required to avoid an infinite loop
	3133	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
	3134	if (!tstbit (uc, &matches))
	3135	matched = false;
	3136	}
	3137
	3138	#ifdef DEBUG
	3139	fprintf (stderr, " nextpos %td:", pos.index);
	3140	prtok (d->tokens[pos.index]);
	3141	fprintf (stderr, " of");
	3142	for (unsigned j = 0; j < NOTCHAR; j++)
	3143	if (tstbit (j, &matches))
	3144	fprintf (stderr, " 0x%02x", j);
	3145	fprintf (stderr, "\n");
	3146	#endif
	3147
	3148	if (matched)
	3149	{
	3150	for (int k = 0; k < CHARCLASS_WORDS; ++k)
	3151	label.w[k] &= matches.w[k];
	3152	append (pos, &group);
	3153	}
	3154	else
	3155	{
	3156	for (int k = 0; k < CHARCLASS_WORDS; ++k)
	3157	label.w[k] &= ~matches.w[k];
	3158	}
	3159	}
	3160
	3161	alloc_position_set (&tmp, d->nleaves);
	3162
	3163	if (group.nelem > 0)
	3164	{
	3165	/* If we are building a searching matcher, throw in the positions
	3166	of state 0 as well, if possible. */
	3167	if (d->searchflag)
	3168	{
	3169	/* If a token in follows.elems is not 1st byte of a multibyte
	3170	character, or the states of follows must accept the bytes
	3171	which are not 1st byte of the multibyte character.
	3172	Then, if a state of follows encounters a byte, it must not be
	3173	a 1st byte of a multibyte character nor a single byte character.
	3174	In this case, do not add state[0].follows to next state, because
	3175	state[0] must accept 1st-byte.
	3176
	3177	For example, suppose <sb a> is a certain single byte character,
	3178	<mb A> is a certain multibyte character, and the codepoint of
	3179	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
	3180	state[0] accepts <sb a>, state[i] transits to state[i+1] by
	3181	accepting the 1st byte of <mb A>, and state[i+1] accepts the
	3182	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
	3183	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
	3184	not add state[0]. */
	3185
	3186	bool mergeit = !d->localeinfo.multibyte;
	3187	if (!mergeit)
	3188	{
	3189	mergeit = true;
	3190	for (idx_t j = 0; mergeit && j < group.nelem; j++)
	3191	mergeit &= d->multibyte_prop[group.elems[j].index];
	3192	}
	3193	if (mergeit)
	3194	merge2 (&group, &d->states[0].elems, &tmp);
	3195	}
	3196
	3197	/* Find out if the new state will want any context information,
	3198	by calculating possible contexts that the group can match,
	3199	and separate contexts that the new state wants to know. */
	3200	int possible_contexts = charclass_context (d, &label);
	3201	int separate_contexts = state_separate_contexts (d, &group);
	3202
	3203	/* Find the state(s) corresponding to the union of the follows. */
	3204	if (possible_contexts & ~separate_contexts)
	3205	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
	3206	else
	3207	state = -1;
	3208	if (separate_contexts & possible_contexts & CTX_NEWLINE)
	3209	state_newline = state_index (d, &group, CTX_NEWLINE);
	3210	else
	3211	state_newline = state;
	3212	if (separate_contexts & possible_contexts & CTX_LETTER)
	3213	state_letter = state_index (d, &group, CTX_LETTER);
	3214	else
	3215	state_letter = state;
	3216
	3217	/* Reallocate now, to reallocate any newline transition properly. */
	3218	realloc_trans_if_necessary (d);
	3219	}
	3220
	3221	/* If we are a searching matcher, the default transition is to a state
	3222	containing the positions of state 0, otherwise the default transition
	3223	is to fail miserably. */
	3224	else if (d->searchflag)
	3225	{
	3226	state_newline = 0;
	3227	state_letter = d->min_trcount - 1;
	3228	state = d->initstate_notbol;
	3229	}
	3230	else
	3231	{
	3232	state_newline = -1;
	3233	state_letter = -1;
	3234	state = -1;
	3235	}
	3236
	3237	/* Set the transitions for each character in the label. */
	3238	for (int i = 0; i < NOTCHAR; i++)
	3239	if (tstbit (i, &label))
	3240	switch (d->syntax.sbit[i])
	3241	{
	3242	case CTX_NEWLINE:
	3243	trans[i] = state_newline;
	3244	break;
	3245	case CTX_LETTER:
	3246	trans[i] = state_letter;
	3247	break;
	3248	default:
	3249	trans[i] = state;
	3250	break;
	3251	}
	3252
	3253	#ifdef DEBUG
	3254	fprintf (stderr, "trans table %td", s);
	3255	for (int i = 0; i < NOTCHAR; ++i)
	3256	{
	3257	if (!(i & 0xf))
	3258	fprintf (stderr, "\n");
	3259	fprintf (stderr, " %2td", trans[i]);
	3260	}
	3261	fprintf (stderr, "\n");
	3262	#endif
	3263
	3264	free (group.elems);
	3265	free (follows.elems);
	3266	free (tmp.elems);
	3267
	3268	/* Keep the newline transition in a special place so we can use it as
	3269	a sentinel. */
	3270	if (tstbit (d->syntax.eolbyte, &label))
	3271	{
	3272	d->newlines[s] = trans[d->syntax.eolbyte];
	3273	trans[d->syntax.eolbyte] = -1;
	3274	}
	3275
	3276	return trans[uc];
	3277	}
	3278
	3279	/* Multibyte character handling sub-routines for dfaexec. */
	3280
	3281	/* Consume a single byte and transit state from 's' to '*next_state'.
	3282	This function is almost same as the state transition routin in dfaexec.
	3283	But state transition is done just once, otherwise matching succeed or
	3284	reach the end of the buffer. */
	3285	static state_num
	3286	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
	3287	{
	3288	state_num *t;
	3289
	3290	if (d->trans[s])
	3291	t = d->trans[s];
	3292	else if (d->fails[s])
	3293	t = d->fails[s];
	3294	else
	3295	{
	3296	build_state (s, d, **pp);
	3297	if (d->trans[s])
	3298	t = d->trans[s];
	3299	else
	3300	{
	3301	t = d->fails[s];
	3302	assert (t);
	3303	}
	3304	}
	3305
	3306	if (t[**pp] == -2)
	3307	build_state (s, d, **pp);
	3308
	3309	return t[(pp)++];
	3310	}
	3311
	3312	/* Transit state from s, then return new state and update the pointer of
	3313	the buffer. This function is for a period operator which can match a
	3314	multi-byte character. */
	3315	static state_num
	3316	transit_state (struct dfa d, state_num s, unsigned char const *pp,
	3317	unsigned char const *end)
	3318	{
	3319	wint_t wc;
	3320
	3321	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
	3322
	3323	/* This state has some operators which can match a multibyte character. */
	3324	d->mb_follows.nelem = 0;
	3325
	3326	/* Calculate the state which can be reached from the state 's' by
	3327	consuming 'mbclen' single bytes from the buffer. */
	3328	state_num s1 = s;
	3329	int mbci;
	3330	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
	3331	s = transit_state_singlebyte (d, s, pp);
	3332	*pp += mbclen - mbci;
	3333
	3334	if (wc == WEOF)
	3335	{
	3336	/* It is an invalid character, so ANYCHAR is not accepted. */
	3337	return s;
	3338	}
	3339
	3340	/* If all positions which have ANYCHAR do not depend on the context
	3341	of the next character, calculate the next state with
	3342	pre-calculated follows and cache the result. */
	3343	if (d->states[s1].mb_trindex < 0)
	3344	{
	3345	if (MAX_TRCOUNT <= d->mb_trcount)
	3346	{
	3347	state_num s3;
	3348	for (s3 = -1; s3 < d->tralloc; s3++)
	3349	{
	3350	free (d->mb_trans[s3]);
	3351	d->mb_trans[s3] = NULL;
	3352	}
	3353
	3354	for (state_num i = 0; i < d->sindex; i++)
	3355	d->states[i].mb_trindex = -1;
	3356	d->mb_trcount = 0;
	3357	}
	3358	d->states[s1].mb_trindex = d->mb_trcount++;
	3359	}
	3360
	3361	if (! d->mb_trans[s])
	3362	{
	3363	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
	3364	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
	3365	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
	3366	for (int i = 0; i < MAX_TRCOUNT; i++)
	3367	d->mb_trans[s][i] = -1;
	3368	}
	3369	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
	3370	return d->mb_trans[s][d->states[s1].mb_trindex];
	3371
	3372	if (s == -1)
	3373	copy (&d->states[s1].mbps, &d->mb_follows);
	3374	else
	3375	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
	3376
	3377	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
	3378	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
	3379	realloc_trans_if_necessary (d);
	3380
	3381	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
	3382
	3383	return s2;
	3384	}
	3385
	3386	/* The initial state may encounter a byte which is not a single byte character
	3387	nor the first byte of a multibyte character. But it is incorrect for the
	3388	initial state to accept such a byte. For example, in Shift JIS the regular
	3389	expression "\\" accepts the codepoint 0x5c, but should not accept the second
	3390	byte of the codepoint 0x815c. Then the initial state must skip the bytes
	3391	that are not a single byte character nor the first byte of a multibyte
	3392	character.
	3393
	3394	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
	3395	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
	3396	the result is greater than P, set *WCP to the final wide character
	3397	processed, or to WEOF if no wide character is processed. Otherwise,
	3398	if WCP is non-NULL, *WCP may or may not be updated.
	3399
	3400	Both P and MBP must be no larger than END. */
	3401	static unsigned char const *
	3402	skip_remains_mb (struct dfa d, unsigned char const p,
	3403	unsigned char const mbp, char const end)
	3404	{
	3405	if (d->syntax.never_trail[*p])
	3406	return p;
	3407	while (mbp < p)
	3408	{
	3409	wint_t wc;
	3410	mbp += mbs_to_wchar (&wc, (char const *) mbp,
	3411	end - (char const *) mbp, d);
	3412	}
	3413	return mbp;
	3414	}
	3415
	3416	/* Search through a buffer looking for a match to the struct dfa *D.
	3417	Find the first occurrence of a string matching the regexp in the
	3418	buffer, and the shortest possible version thereof. Return a pointer to
	3419	the first character after the match, or NULL if none is found. BEGIN
	3420	points to the beginning of the buffer, and END points to the first byte
	3421	after its end. Note however that we store a sentinel byte (usually
	3422	newline) in *END, so the actual buffer must be one byte longer.
	3423	When ALLOW_NL, newlines may appear in the matching string.
	3424	If COUNT is non-NULL, increment *COUNT once for each newline processed.
	3425	If MULTIBYTE, the input consists of multibyte characters and/or
	3426	encoding-error bytes. Otherwise, it consists of single-byte characters.
	3427	Here is the list of features that make this DFA matcher punt:
	3428	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
	3429	- [^...] in non-simple locale
	3430	- [[=foo=]] or [[.foo.]]
	3431	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
	3432	- back-reference: (.)\1
	3433	- word-delimiter in multibyte locale: \<, \>, \b, \B
	3434	See struct localeinfo.simple for the definition of "simple locale". */
	3435
	3436	static inline char *
	3437	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
	3438	idx_t *count, bool multibyte)
	3439	{
	3440	if (MAX_TRCOUNT <= d->sindex)
	3441	{
	3442	for (state_num s = d->min_trcount; s < d->sindex; s++)
	3443	{
	3444	free (d->states[s].elems.elems);
	3445	free (d->states[s].mbps.elems);
	3446	}
	3447	d->sindex = d->min_trcount;
	3448
	3449	if (d->trans)
	3450	{
	3451	for (state_num s = 0; s < d->tralloc; s++)
	3452	{
	3453	free (d->trans[s]);
	3454	free (d->fails[s]);
	3455	d->trans[s] = d->fails[s] = NULL;
	3456	}
	3457	d->trcount = 0;
	3458	}
	3459
	3460	if (d->localeinfo.multibyte && d->mb_trans)
	3461	{
	3462	for (state_num s = -1; s < d->tralloc; s++)
	3463	{
	3464	free (d->mb_trans[s]);
	3465	d->mb_trans[s] = NULL;
	3466	}
	3467	for (state_num s = 0; s < d->min_trcount; s++)
	3468	d->states[s].mb_trindex = -1;
	3469	d->mb_trcount = 0;
	3470	}
	3471	}
	3472
	3473	if (!d->tralloc)
	3474	realloc_trans_if_necessary (d);
	3475
	3476	/* Current state. */
	3477	state_num s = 0, s1 = 0;
	3478
	3479	/* Current input character. */
	3480	unsigned char const p = (unsigned char const ) begin;
	3481	unsigned char const *mbp = p;
	3482
	3483	/* Copy of d->trans so it can be optimized into a register. */
	3484	state_num **trans = d->trans;
	3485	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
	3486	unsigned char saved_end = (unsigned char ) end;
	3487	*end = eol;
	3488
	3489	if (multibyte)
	3490	{
	3491	memset (&d->mbs, 0, sizeof d->mbs);
	3492	if (d->mb_follows.alloc == 0)
	3493	alloc_position_set (&d->mb_follows, d->nleaves);
	3494	}
	3495
	3496	idx_t nlcount = 0;
	3497	for (;;)
	3498	{
	3499	state_num *t;
	3500	while ((t = trans[s]) != NULL)
	3501	{
	3502	if (s < d->min_trcount)
	3503	{
	3504	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
	3505	{
	3506	while (t[*p] == s)
	3507	p++;
	3508	}
	3509	if (multibyte)
	3510	p = mbp = skip_remains_mb (d, p, mbp, end);
	3511	}
	3512
	3513	if (multibyte)
	3514	{
	3515	s1 = s;
	3516
	3517	if (d->states[s].mbps.nelem == 0
	3518	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
	3519	{
	3520	/* If an input character does not match ANYCHAR, do it
	3521	like a single-byte character. */
	3522	s = t[*p++];
	3523	}
	3524	else
	3525	{
	3526	s = transit_state (d, s, &p, (unsigned char *) end);
	3527	mbp = p;
	3528	trans = d->trans;
	3529	}
	3530	}
	3531	else
	3532	{
	3533	s1 = t[*p++];
	3534	t = trans[s1];
	3535	if (! t)
	3536	{
	3537	state_num tmp = s;
	3538	s = s1;
	3539	s1 = tmp; /* swap */
	3540	break;
	3541	}
	3542	if (s < d->min_trcount)
	3543	{
	3544	while (t[*p] == s1)
	3545	p++;
	3546	}
	3547	s = t[*p++];
	3548	}
	3549	}
	3550
	3551	if (s < 0)
	3552	{
	3553	if (s == -2)
	3554	{
	3555	s = build_state (s1, d, p[-1]);
	3556	trans = d->trans;
	3557	}
	3558	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
	3559	{
	3560	/* The previous character was a newline. Count it, and skip
	3561	checking of multibyte character boundary until here. */
	3562	nlcount++;
	3563	mbp = p;
	3564
	3565	s = (allow_nl ? d->newlines[s1]
	3566	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
	3567	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
	3568	: d->initstate_notbol);
	3569	}
	3570	else
	3571	{
	3572	p = NULL;
	3573	goto done;
	3574	}
	3575	}
	3576	else if (d->fails[s])
	3577	{
	3578	if ((d->success[s] & d->syntax.sbit[*p])
	3579	\|\| ((char *) p == end
	3580	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
	3581	d)))
	3582	goto done;
	3583
	3584	if (multibyte && s < d->min_trcount)
	3585	p = mbp = skip_remains_mb (d, p, mbp, end);
	3586
	3587	s1 = s;
	3588	if (!multibyte \|\| d->states[s].mbps.nelem == 0
	3589	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
	3590	{
	3591	/* If a input character does not match ANYCHAR, do it
	3592	like a single-byte character. */
	3593	s = d->fails[s][*p++];
	3594	}
	3595	else
	3596	{
	3597	s = transit_state (d, s, &p, (unsigned char *) end);
	3598	mbp = p;
	3599	trans = d->trans;
	3600	}
	3601	}
	3602	else
	3603	{
	3604	build_state (s, d, p[0]);
	3605	trans = d->trans;
	3606	}
	3607	}
	3608
	3609	done:
	3610	if (count)
	3611	*count += nlcount;
	3612	*end = saved_end;
	3613	return (char *) p;
	3614	}
	3615
	3616	/* Specialized versions of dfaexec for multibyte and single-byte cases.
	3617	This is for performance, as dfaexec_main is an inline function. */
	3618
	3619	static char *
	3620	dfaexec_mb (struct dfa d, char const begin, char *end,
	3621	bool allow_nl, idx_t count, bool backref)
	3622	{
	3623	return dfaexec_main (d, begin, end, allow_nl, count, true);
	3624	}
	3625
	3626	static char *
	3627	dfaexec_sb (struct dfa d, char const begin, char *end,
	3628	bool allow_nl, idx_t count, bool backref)
	3629	{
	3630	return dfaexec_main (d, begin, end, allow_nl, count, false);
	3631	}
	3632
	3633	/* Always set *BACKREF and return BEGIN. Use this wrapper for
	3634	any regexp that uses a construct not supported by this code. */
	3635	static char *
	3636	dfaexec_noop (struct dfa d, char const begin, char *end,
	3637	bool allow_nl, idx_t count, bool backref)
	3638	{
	3639	*backref = true;
	3640	return (char *) begin;
	3641	}
	3642
	3643	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
	3644	but faster and set *BACKREF if the DFA code does not support this
	3645	regexp usage. */
	3646
	3647	char *
	3648	dfaexec (struct dfa d, char const begin, char *end,
	3649	bool allow_nl, idx_t count, bool backref)
	3650	{
	3651	return d->dfaexec (d, begin, end, allow_nl, count, backref);
	3652	}
	3653
	3654	struct dfa *
	3655	dfasuperset (struct dfa const *d)
	3656	{
	3657	return d->superset;
	3658	}
	3659
	3660	bool
	3661	dfaisfast (struct dfa const *d)
	3662	{
	3663	return d->fast;
	3664	}
	3665
	3666	static void
	3667	free_mbdata (struct dfa *d)
	3668	{
	3669	free (d->multibyte_prop);
	3670	free (d->lex.brack.chars);
	3671	free (d->mb_follows.elems);
	3672
	3673	if (d->mb_trans)
	3674	{
	3675	state_num s;
	3676	for (s = -1; s < d->tralloc; s++)
	3677	free (d->mb_trans[s]);
	3678	free (d->mb_trans - 2);
	3679	}
	3680	}
	3681
	3682	/* Return true if every construct in D is supported by this DFA matcher. */
	3683	bool
	3684	dfasupported (struct dfa const *d)
	3685	{
	3686	for (idx_t i = 0; i < d->tindex; i++)
	3687	{
	3688	switch (d->tokens[i])
	3689	{
	3690	case BEGWORD:
	3691	case ENDWORD:
	3692	case LIMWORD:
	3693	case NOTLIMWORD:
	3694	if (!d->localeinfo.multibyte)
	3695	continue;
	3696	FALLTHROUGH;
	3697	case BACKREF:
	3698	case MBCSET:
	3699	return false;
	3700	}
	3701	}
	3702	return true;
	3703	}
	3704
	3705	/* Disable use of the superset DFA if it is not likely to help
	3706	performance. */
	3707	static void
	3708	maybe_disable_superset_dfa (struct dfa *d)
	3709	{
	3710	if (!d->localeinfo.using_utf8)
	3711	return;
	3712
	3713	bool have_backref = false;
	3714	for (idx_t i = 0; i < d->tindex; i++)
	3715	{
	3716	switch (d->tokens[i])
	3717	{
	3718	case ANYCHAR:
	3719	/* Lowered. */
	3720	abort ();
	3721	case BACKREF:
	3722	have_backref = true;
	3723	break;
	3724	case MBCSET:
	3725	/* Requires multi-byte algorithm. */
	3726	return;
	3727	default:
	3728	break;
	3729	}
	3730	}
	3731
	3732	if (!have_backref && d->superset)
	3733	{
	3734	/* The superset DFA is not likely to be much faster, so remove it. */
	3735	dfafree (d->superset);
	3736	free (d->superset);
	3737	d->superset = NULL;
	3738	}
	3739
	3740	free_mbdata (d);
	3741	d->localeinfo.multibyte = false;
	3742	d->dfaexec = dfaexec_sb;
	3743	d->fast = true;
	3744	}
	3745
	3746	static void
	3747	dfassbuild (struct dfa *d)
	3748	{
	3749	struct dfa *sup = dfaalloc ();
	3750
	3751	sup = d;
	3752	sup->localeinfo.multibyte = false;
	3753	sup->dfaexec = dfaexec_sb;
	3754	sup->multibyte_prop = NULL;
	3755	sup->superset = NULL;
	3756	sup->states = NULL;
	3757	sup->sindex = 0;
	3758	sup->constraints = NULL;
	3759	sup->separates = NULL;
	3760	sup->follows = NULL;
	3761	sup->tralloc = 0;
	3762	sup->trans = NULL;
	3763	sup->fails = NULL;
	3764	sup->success = NULL;
	3765	sup->newlines = NULL;
	3766
	3767	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
	3768	if (d->cindex)
	3769	{
	3770	memcpy (sup->charclasses, d->charclasses,
	3771	d->cindex * sizeof *sup->charclasses);
	3772	}
	3773
	3774	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
	3775	sup->talloc = d->tindex * 2;
	3776
	3777	bool have_achar = false;
	3778	bool have_nchar = false;
	3779	idx_t j;
	3780	for (idx_t i = j = 0; i < d->tindex; i++)
	3781	{
	3782	switch (d->tokens[i])
	3783	{
	3784	case ANYCHAR:
	3785	case MBCSET:
	3786	case BACKREF:
	3787	{
	3788	charclass ccl;
	3789	fillset (&ccl);
	3790	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
	3791	sup->tokens[j++] = STAR;
	3792	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
	3793	\|\| d->tokens[i + 1] == PLUS)
	3794	i++;
	3795	have_achar = true;
	3796	}
	3797	break;
	3798	case BEGWORD:
	3799	case ENDWORD:
	3800	case LIMWORD:
	3801	case NOTLIMWORD:
	3802	if (d->localeinfo.multibyte)
	3803	{
	3804	/* These constraints aren't supported in a multibyte locale.
	3805	Ignore them in the superset DFA. */
	3806	sup->tokens[j++] = EMPTY;
	3807	break;
	3808	}
	3809	FALLTHROUGH;
	3810	default:
	3811	sup->tokens[j++] = d->tokens[i];
	3812	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
	3813	\|\| d->tokens[i] >= CSET)
	3814	have_nchar = true;
	3815	break;
	3816	}
	3817	}
	3818	sup->tindex = j;
	3819
	3820	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
	3821	d->superset = sup;
	3822	else
	3823	{
	3824	dfafree (sup);
	3825	free (sup);
	3826	}
	3827	}
	3828
	3829	/* Parse a string S of length LEN into D (but skip this step if S is null).
	3830	Then analyze D and build a matcher for it.
	3831	SEARCHFLAG says whether to build a searching or an exact matcher. */
	3832	void
	3833	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
	3834	{
	3835	if (s != NULL)
	3836	dfaparse (s, len, d);
	3837
	3838	dfassbuild (d);
	3839
	3840	if (dfasupported (d))
	3841	{
	3842	maybe_disable_superset_dfa (d);
	3843	dfaanalyze (d, searchflag);
	3844	}
	3845	else
	3846	{
	3847	d->dfaexec = dfaexec_noop;
	3848	}
	3849
	3850	if (d->superset)
	3851	{
	3852	d->fast = true;
	3853	dfaanalyze (d->superset, searchflag);
	3854	}
	3855	}
	3856
	3857	/* Free the storage held by the components of a dfa. */
	3858	void
	3859	dfafree (struct dfa *d)
	3860	{
	3861	free (d->charclasses);
	3862	free (d->tokens);
	3863
	3864	if (d->localeinfo.multibyte)
	3865	free_mbdata (d);
	3866
	3867	free (d->constraints);
	3868	free (d->separates);
	3869
	3870	for (idx_t i = 0; i < d->sindex; i++)
	3871	{
	3872	free (d->states[i].elems.elems);
	3873	free (d->states[i].mbps.elems);
	3874	}
	3875	free (d->states);
	3876
	3877	if (d->follows)
	3878	{
	3879	for (idx_t i = 0; i < d->tindex; i++)
	3880	free (d->follows[i].elems);
	3881	free (d->follows);
	3882	}
	3883
	3884	if (d->trans)
	3885	{
	3886	for (idx_t i = 0; i < d->tralloc; i++)
	3887	{
	3888	free (d->trans[i]);
	3889	free (d->fails[i]);
	3890	}
	3891
	3892	free (d->trans - 2);
	3893	free (d->fails);
	3894	free (d->newlines);
	3895	free (d->success);
	3896	}
	3897
	3898	if (d->superset)
	3899	{
	3900	dfafree (d->superset);
	3901	free (d->superset);
	3902	}
	3903	}
	3904
	3905	/* Having found the postfix representation of the regular expression,
	3906	try to find a long sequence of characters that must appear in any line
	3907	containing the r.e.
	3908	Finding a "longest" sequence is beyond the scope here;
	3909	we take an easy way out and hope for the best.
	3910	(Take "(ab\|a)b"--please.)
	3911
	3912	We do a bottom-up calculation of sequences of characters that must appear
	3913	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
	3914	representation:
	3915	sequences that must appear at the left of the match ("left")
	3916	sequences that must appear at the right of the match ("right")
	3917	lists of sequences that must appear somewhere in the match ("in")
	3918	sequences that must constitute the match ("is")
	3919
	3920	When we get to the root of the tree, we use one of the longest of its
	3921	calculated "in" sequences as our answer.
	3922
	3923	The sequences calculated for the various types of node (in pseudo ANSI c)
	3924	are shown below. "p" is the operand of unary operators (and the left-hand
	3925	operand of binary operators); "q" is the right-hand operand of binary
	3926	operators.
	3927
	3928	"ZERO" means "a zero-length sequence" below.
	3929
	3930	Type left right is in
	3931	---- ---- ----- -- --
	3932	char c # c # c # c # c
	3933
	3934	ANYCHAR ZERO ZERO ZERO ZERO
	3935
	3936	MBCSET ZERO ZERO ZERO ZERO
	3937
	3938	CSET ZERO ZERO ZERO ZERO
	3939
	3940	STAR ZERO ZERO ZERO ZERO
	3941
	3942	QMARK ZERO ZERO ZERO ZERO
	3943
	3944	PLUS p->left p->right ZERO p->in
	3945
	3946	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
	3947	p->left : q->right : q->is!=ZERO) ? q->in plus
	3948	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
	3949	ZERO
	3950
	3951	OR longest common longest common (do p->is and substrings common
	3952	leading trailing to q->is have same p->in and
	3953	(sub)sequence (sub)sequence q->in length and content) ?
	3954	of p->left of p->right
	3955	and q->left and q->right p->is : NULL
	3956
	3957	If there's anything else we recognize in the tree, all four sequences get set
	3958	to zero-length sequences. If there's something we don't recognize in the
	3959	tree, we just return a zero-length sequence.
	3960
	3961	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
	3962	'aaa')?
	3963
	3964	And ... is it here or someplace that we might ponder "optimizations" such as
	3965	egrep 'psi\|epsilon' -> egrep 'psi'
	3966	egrep 'pepsi\|epsilon' -> egrep 'epsi'
	3967	(Yes, we now find "epsi" as a "string
	3968	that must occur", but we might also
	3969	simplify the entire r.e. being sought)
	3970	grep '[c]' -> grep 'c'
	3971	grep '(ab\|a)b' -> grep 'ab'
	3972	grep 'ab*' -> grep 'a'
	3973	grep 'a*b' -> grep 'b'
	3974
	3975	There are several issues:
	3976
	3977	Is optimization easy (enough)?
	3978
	3979	Does optimization actually accomplish anything,
	3980	or is the automaton you get from "psi\|epsilon" (for example)
	3981	the same as the one you get from "psi" (for example)?
	3982
	3983	Are optimizable r.e.'s likely to be used in real-life situations
	3984	(something like 'ab*' is probably unlikely; something like is
	3985	'psi\|epsilon' is likelier)? */
	3986
	3987	static char *
	3988	icatalloc (char old, char const new)
	3989	{
	3990	idx_t newsize = strlen (new);
	3991	if (newsize == 0)
	3992	return old;
	3993	idx_t oldsize = strlen (old);
	3994	char *result = xirealloc (old, oldsize + newsize + 1);
	3995	memcpy (result + oldsize, new, newsize + 1);
	3996	return result;
	3997	}
	3998
	3999	static void
	4000	freelist (char **cpp)
	4001	{
	4002	while (*cpp)
	4003	free (*cpp++);
	4004	}
	4005
	4006	static char **
	4007	enlistnew (char *cpp, char new)
	4008	{
	4009	/* Is there already something in the list that's new (or longer)? */
	4010	idx_t i;
	4011	for (i = 0; cpp[i] != NULL; i++)
	4012	if (strstr (cpp[i], new) != NULL)
	4013	{
	4014	free (new);
	4015	return cpp;
	4016	}
	4017	/* Eliminate any obsoleted strings. */
	4018	for (idx_t j = 0; cpp[j] != NULL; )
	4019	if (strstr (new, cpp[j]) == NULL)
	4020	++j;
	4021	else
	4022	{
	4023	free (cpp[j]);
	4024	if (--i == j)
	4025	break;
	4026	cpp[j] = cpp[i];
	4027	cpp[i] = NULL;
	4028	}
	4029	/* Add the new string. */
	4030	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
	4031	cpp[i] = new;
	4032	cpp[i + 1] = NULL;
	4033	return cpp;
	4034	}
	4035
	4036	static char **
	4037	enlist (char *cpp, char const str, idx_t len)
	4038	{
	4039	return enlistnew (cpp, ximemdup0 (str, len));
	4040	}
	4041
	4042	/* Given pointers to two strings, return a pointer to an allocated
	4043	list of their distinct common substrings. */
	4044	static char **
	4045	comsubs (char left, char const right)
	4046	{
	4047	char *cpp = xzalloc (sizeof cpp);
	4048
	4049	for (char lcp = left; lcp != '\0'; lcp++)
	4050	{
	4051	idx_t len = 0;
	4052	char rcp = strchr (right, lcp);
	4053	while (rcp != NULL)
	4054	{
	4055	idx_t i;
	4056	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
	4057	continue;
	4058	if (i > len)
	4059	len = i;
	4060	rcp = strchr (rcp + 1, *lcp);
	4061	}
	4062	if (len != 0)
	4063	cpp = enlist (cpp, lcp, len);
	4064	}
	4065	return cpp;
	4066	}
	4067
	4068	static char **
	4069	addlists (char old, char new)
	4070	{
	4071	for (; *new; new++)
	4072	old = enlistnew (old, xstrdup (*new));
	4073	return old;
	4074	}
	4075
	4076	/* Given two lists of substrings, return a new list giving substrings
	4077	common to both. */
	4078	static char **
	4079	inboth (char left, char right)
	4080	{
	4081	char *both = xzalloc (sizeof both);
	4082
	4083	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
	4084	{
	4085	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
	4086	{
	4087	char **temp = comsubs (left[lnum], right[rnum]);
	4088	both = addlists (both, temp);
	4089	freelist (temp);
	4090	free (temp);
	4091	}
	4092	}
	4093	return both;
	4094	}
	4095
	4096	typedef struct must must;
	4097
	4098	struct must
	4099	{
	4100	char **in;
	4101	char *left;
	4102	char *right;
	4103	char *is;
	4104	bool begline;
	4105	bool endline;
	4106	must *prev;
	4107	};
	4108
	4109	static must *
	4110	allocmust (must *mp, idx_t size)
	4111	{
	4112	must new_mp = xmalloc (sizeof new_mp);
	4113	new_mp->in = xzalloc (sizeof *new_mp->in);
	4114	new_mp->left = xizalloc (size);
	4115	new_mp->right = xizalloc (size);
	4116	new_mp->is = xizalloc (size);
	4117	new_mp->begline = false;
	4118	new_mp->endline = false;
	4119	new_mp->prev = mp;
	4120	return new_mp;
	4121	}
	4122
	4123	static void
	4124	resetmust (must *mp)
	4125	{
	4126	freelist (mp->in);
	4127	mp->in[0] = NULL;
	4128	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
	4129	mp->begline = false;
	4130	mp->endline = false;
	4131	}
	4132
	4133	static void
	4134	freemust (must *mp)
	4135	{
	4136	freelist (mp->in);
	4137	free (mp->in);
	4138	free (mp->left);
	4139	free (mp->right);
	4140	free (mp->is);
	4141	free (mp);
	4142	}
	4143
	4144	struct dfamust *
	4145	dfamust (struct dfa const *d)
	4146	{
	4147	must *mp = NULL;
	4148	char const *result = "";
	4149	bool exact = false;
	4150	bool begline = false;
	4151	bool endline = false;
	4152	bool need_begline = false;
	4153	bool need_endline = false;
	4154	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
	4155
	4156	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
	4157	{
	4158	token t = d->tokens[ri];
	4159	switch (t)
	4160	{
	4161	case BEGLINE:
	4162	mp = allocmust (mp, 2);
	4163	mp->begline = true;
	4164	need_begline = true;
	4165	break;
	4166	case ENDLINE:
	4167	mp = allocmust (mp, 2);
	4168	mp->endline = true;
	4169	need_endline = true;
	4170	break;
	4171	case LPAREN:
	4172	case RPAREN:
	4173	assert (!"neither LPAREN nor RPAREN may appear here");
	4174
	4175	case EMPTY:
	4176	case BEGWORD:
	4177	case ENDWORD:
	4178	case LIMWORD:
	4179	case NOTLIMWORD:
	4180	case BACKREF:
	4181	case ANYCHAR:
	4182	case MBCSET:
	4183	mp = allocmust (mp, 2);
	4184	break;
	4185
	4186	case STAR:
	4187	case QMARK:
	4188	assume_nonnull (mp);
	4189	resetmust (mp);
	4190	break;
	4191
	4192	case OR:
	4193	{
	4194	char **new;
	4195	must *rmp = mp;
	4196	assume_nonnull (rmp);
	4197	must *lmp = mp = mp->prev;
	4198	assume_nonnull (lmp);
	4199	idx_t j, ln, rn, n;
	4200
	4201	/* Guaranteed to be. Unlikely, but ... */
	4202	if (str_eq (lmp->is, rmp->is))
	4203	{
	4204	lmp->begline &= rmp->begline;
	4205	lmp->endline &= rmp->endline;
	4206	}
	4207	else
	4208	{
	4209	lmp->is[0] = '\0';
	4210	lmp->begline = false;
	4211	lmp->endline = false;
	4212	}
	4213	/* Left side--easy */
	4214	idx_t i = 0;
	4215	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
	4216	++i;
	4217	lmp->left[i] = '\0';
	4218	/* Right side */
	4219	ln = strlen (lmp->right);
	4220	rn = strlen (rmp->right);
	4221	n = ln;
	4222	if (n > rn)
	4223	n = rn;
	4224	for (i = 0; i < n; ++i)
	4225	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
	4226	break;
	4227	for (j = 0; j < i; ++j)
	4228	lmp->right[j] = lmp->right[(ln - i) + j];
	4229	lmp->right[j] = '\0';
	4230	new = inboth (lmp->in, rmp->in);
	4231	freelist (lmp->in);
	4232	free (lmp->in);
	4233	lmp->in = new;
	4234	freemust (rmp);
	4235	}
	4236	break;
	4237
	4238	case PLUS:
	4239	assume_nonnull (mp);
	4240	mp->is[0] = '\0';
	4241	break;
	4242
	4243	case END:
	4244	assume_nonnull (mp);
	4245	assert (!mp->prev);
	4246	for (idx_t i = 0; mp->in[i] != NULL; i++)
	4247	if (strlen (mp->in[i]) > strlen (result))
	4248	result = mp->in[i];
	4249	if (str_eq (result, mp->is))
	4250	{
	4251	if ((!need_begline \|\| mp->begline) && (!need_endline
	4252	\|\| mp->endline))
	4253	exact = true;
	4254	begline = mp->begline;
	4255	endline = mp->endline;
	4256	}
	4257	goto done;
	4258
	4259	case CAT:
	4260	{
	4261	must *rmp = mp;
	4262	assume_nonnull (rmp);
	4263	must *lmp = mp = mp->prev;
	4264	assume_nonnull (lmp);
	4265
	4266	/* In. Everything in left, plus everything in
	4267	right, plus concatenation of
	4268	left's right and right's left. */
	4269	lmp->in = addlists (lmp->in, rmp->in);
	4270	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
	4271	{
	4272	idx_t lrlen = strlen (lmp->right);
	4273	idx_t rllen = strlen (rmp->left);
	4274	char *tp = ximalloc (lrlen + rllen + 1);
	4275	memcpy (tp + lrlen, rmp->left, rllen + 1);
	4276	memcpy (tp, lmp->right, lrlen);
	4277	lmp->in = enlistnew (lmp->in, tp);
	4278	}
	4279	/* Left-hand */
	4280	if (lmp->is[0] != '\0')
	4281	lmp->left = icatalloc (lmp->left, rmp->left);
	4282	/* Right-hand */
	4283	if (rmp->is[0] == '\0')
	4284	lmp->right[0] = '\0';
	4285	lmp->right = icatalloc (lmp->right, rmp->right);
	4286	/* Guaranteed to be */
	4287	if ((lmp->is[0] != '\0' \|\| lmp->begline)
	4288	&& (rmp->is[0] != '\0' \|\| rmp->endline))
	4289	{
	4290	lmp->is = icatalloc (lmp->is, rmp->is);
	4291	lmp->endline = rmp->endline;
	4292	}
	4293	else
	4294	{
	4295	lmp->is[0] = '\0';
	4296	lmp->begline = false;
	4297	lmp->endline = false;
	4298	}
	4299	freemust (rmp);
	4300	}
	4301	break;
	4302
	4303	case '\0':
	4304	/* Not on my shift. */
	4305	goto done;
	4306
	4307	default:
	4308	if (CSET <= t)
	4309	{
	4310	/* If T is a singleton, or if case-folding in a unibyte
	4311	locale and T's members all case-fold to the same char,
	4312	convert T to one of its members. Otherwise, do
	4313	nothing further with T. */
	4314	charclass *ccl = &d->charclasses[t - CSET];
	4315	int j;
	4316	for (j = 0; j < NOTCHAR; j++)
	4317	if (tstbit (j, ccl))
	4318	break;
	4319	if (! (j < NOTCHAR))
	4320	{
	4321	mp = allocmust (mp, 2);
	4322	break;
	4323	}
	4324	t = j;
	4325	while (++j < NOTCHAR)
	4326	if (tstbit (j, ccl)
	4327	&& ! (case_fold_unibyte
	4328	&& toupper (j) == toupper (t)))
	4329	break;
	4330	if (j < NOTCHAR)
	4331	{
	4332	mp = allocmust (mp, 2);
	4333	break;
	4334	}
	4335	}
	4336
	4337	idx_t rj = ri + 2;
	4338	if (d->tokens[ri + 1] == CAT)
	4339	{
	4340	for (; rj < d->tindex - 1; rj += 2)
	4341	{
	4342	if ((rj != ri && (d->tokens[rj] <= 0
	4343	\|\| NOTCHAR <= d->tokens[rj]))
	4344	\|\| d->tokens[rj + 1] != CAT)
	4345	break;
	4346	}
	4347	}
	4348	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
	4349	mp->is[0] = mp->left[0] = mp->right[0]
	4350	= case_fold_unibyte ? toupper (t) : t;
	4351
	4352	idx_t i;
	4353	for (i = 1; ri + 2 < rj; i++)
	4354	{
	4355	ri += 2;
	4356	t = d->tokens[ri];
	4357	mp->is[i] = mp->left[i] = mp->right[i]
	4358	= case_fold_unibyte ? toupper (t) : t;
	4359	}
	4360	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
	4361	mp->in = enlist (mp->in, mp->is, i);
	4362	break;
	4363	}
	4364	}
	4365	done:;
	4366
	4367	struct dfamust *dm = NULL;
	4368	if (*result)
	4369	{
	4370	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
	4371	dm->exact = exact;
	4372	dm->begline = begline;
	4373	dm->endline = endline;
	4374	strcpy (dm->must, result);
	4375	}
	4376
	4377	while (mp)
	4378	{
	4379	must *prev = mp->prev;
	4380	freemust (mp);
	4381	mp = prev;
	4382	}
	4383
	4384	return dm;
	4385	}
	4386
	4387	void
	4388	dfamustfree (struct dfamust *dm)
	4389	{
	4390	free (dm);
	4391	}
	4392
	4393	struct dfa *
	4394	dfaalloc (void)
	4395	{
	4396	return xmalloc (sizeof (struct dfa));
	4397	}
	4398
	4399	/* Initialize DFA. */
	4400	void
	4401	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
	4402	reg_syntax_t bits, int dfaopts)
	4403	{
	4404	memset (dfa, 0, offsetof (struct dfa, dfaexec));
	4405	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
	4406	dfa->localeinfo = *linfo;
	4407
	4408	dfa->fast = !dfa->localeinfo.multibyte;
	4409
	4410	dfa->canychar = -1;
	4411	dfa->syntax.syntax_bits_set = true;
	4412	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
	4413	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
	4414	dfa->syntax.syntax_bits = bits;
	4415	dfa->syntax.dfaopts = dfaopts;
	4416
	4417	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
	4418	{
	4419	unsigned char uc = i;
	4420
	4421	dfa->syntax.sbit[uc] = char_context (dfa, uc);
	4422	switch (dfa->syntax.sbit[uc])
	4423	{
	4424	case CTX_LETTER:
	4425	setbit (uc, &dfa->syntax.letters);
	4426	break;
	4427	case CTX_NEWLINE:
	4428	setbit (uc, &dfa->syntax.newline);
	4429	break;
	4430	}
	4431
	4432	/* POSIX requires that the five bytes in "\n\r./" (including the
	4433	terminating NUL) cannot occur inside a multibyte character. */
	4434	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
	4435	? (uc & 0xc0) != 0x80
	4436	: strchr ("\n\r./", uc) != NULL);
	4437	}
	4438	}
	4439
	4440	/* Initialize TO by copying FROM's syntax settings. */
	4441	void
	4442	dfacopysyntax (struct dfa to, struct dfa const from)
	4443	{
	4444	memset (to, 0, offsetof (struct dfa, syntax));
	4445	to->canychar = -1;
	4446	to->fast = from->fast;
	4447	to->syntax = from->syntax;
	4448	to->dfaexec = from->dfaexec;
	4449	to->localeinfo = from->localeinfo;
	4450	}
	4451
	4452	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/sed/lib/dfa.c

Download in other formats: