Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

dfa.c

Last change on this file was 3658, checked in by bird, 9 months ago
grep/dfa.c: Workaround for Visual C++ 2022 (amd64) optimizer bug. (same as sed)
Property svn:eol-style set to `native`
File size: 134.7 KB

Rev	Line
[3529]	1	/* dfa.c - deterministic extended regexp routines for GNU
	2	Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
	3	Foundation, Inc.
	4
	5	This program is free software; you can redistribute it and/or modify
	6	it under the terms of the GNU General Public License as published by
	7	the Free Software Foundation; either version 3, or (at your option)
	8	any later version.
	9
	10	This program is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	GNU General Public License for more details.
	14
	15	You should have received a copy of the GNU General Public License
	16	along with this program; if not, write to the Free Software
	17	Foundation, Inc.,
	18	51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
	19
	20	/* Written June, 1988 by Mike Haertel
	21	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
	22
	23	#include <config.h>
	24
	25	#include "dfa.h"
	26
	27	#include "flexmember.h"
	28	#include "idx.h"
	29	#include "verify.h"
	30
	31	#include <assert.h>
	32	#include <ctype.h>
	33	#include <stdint.h>
	34	#include <stdio.h>
	35	#include <stdlib.h>
	36	#include <limits.h>
	37	#include <string.h>
	38
	39	/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
	40	understandably cannot deduce that the input comes from a
	41	well-formed regular expression. There's little point to the
	42	runtime overhead of 'assert' instead of 'assume_nonnull' when the
	43	MMU will check anyway. */
	44	#define assume_nonnull(x) assume ((x) != NULL)
	45
	46	static bool
	47	streq (char const a, char const b)
	48	{
	49	return strcmp (a, b) == 0;
	50	}
	51
	52	static bool
	53	isasciidigit (char c)
	54	{
	55	return '0' <= c && c <= '9';
	56	}
	57
	58	#include "gettext.h"
	59	#define _(str) gettext (str)
	60
	61	#include <wchar.h>
	62
	63	#include "xalloc.h"
	64	#include "localeinfo.h"
	65
	66	#ifndef FALLTHROUGH
	67	# if 201710L < __STDC_VERSION__
	68	# define FALLTHROUGH [[__fallthrough__]]
	69	# elif (__GNUC__ >= 7) \|\| (__clang_major__ >= 10)
	70	# define FALLTHROUGH __attribute__ ((__fallthrough__))
	71	# else
	72	# define FALLTHROUGH ((void) 0)
	73	# endif
	74	#endif
	75
	76	#ifndef MIN
	77	# define MIN(a,b) ((a) < (b) ? (a) : (b))
	78	#endif
	79
	80	/* HPUX defines these as macros in sys/param.h. */
	81	#ifdef setbit
	82	# undef setbit
	83	#endif
	84	#ifdef clrbit
	85	# undef clrbit
	86	#endif
	87
	88	/* For code that does not use Gnulibâs isblank module. */
	89	#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
	90	# define isblank dfa_isblank
	91	static int
	92	isblank (int c)
	93	{
	94	return c == ' ' \|\| c == '\t';
	95	}
	96	#endif
	97
	98	/* First integer value that is greater than any character code. */
	99	enum { NOTCHAR = 1 << CHAR_BIT };
	100
	101	#ifdef UINT_LEAST64_MAX
	102
	103	/* Number of bits used in a charclass word. */
	104	enum { CHARCLASS_WORD_BITS = 64 };
	105
	106	/* This represents part of a character class. It must be unsigned and
	107	at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
	108	typedef uint_least64_t charclass_word;
	109
	110	/* Part of a charclass initializer that represents 64 bits' worth of a
	111	charclass, where LO and HI are the low and high-order 32 bits of
	112	the 64-bit quantity. */
	113	# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
	114
	115	#else
	116	/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
	117	enum { CHARCLASS_WORD_BITS = 32 };
	118	typedef unsigned long charclass_word;
	119	# define CHARCLASS_PAIR(lo, hi) lo, hi
	120	#endif
	121
	122	/* An initializer for a charclass whose 32-bit words are A through H. */
	123	#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
	124	{{ \
	125	CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
	126	CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
	127	}}
	128
	129	/* The maximum useful value of a charclass_word; all used bits are 1. */
	130	static charclass_word const CHARCLASS_WORD_MASK
	131	= ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
	132
	133	/* Number of words required to hold a bit for every character. */
	134	enum
	135	{
	136	CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
	137	};
	138
	139	/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
	140	typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
	141
	142	/* Convert a possibly-signed character to an unsigned character. This is
	143	a bit safer than casting to unsigned char, since it catches some type
	144	errors that the cast doesn't. */
	145	static unsigned char
	146	to_uchar (char ch)
	147	{
	148	return ch;
	149	}
	150
	151	/* Contexts tell us whether a character is a newline or a word constituent.
	152	Word-constituent characters are those that satisfy iswalnum, plus '_'.
	153	Each character has a single CTX_* value; bitmasks of CTX_* values denote
	154	a particular character class.
	155
	156	A state also stores a context value, which is a bitmask of CTX_* values.
	157	A state's context represents a set of characters that the state's
	158	predecessors must match. For example, a state whose context does not
	159	include CTX_LETTER will never have transitions where the previous
	160	character is a word constituent. A state whose context is CTX_ANY
	161	might have transitions from any character. */
	162
	163	enum
	164	{
	165	CTX_NONE = 1,
	166	CTX_LETTER = 2,
	167	CTX_NEWLINE = 4,
	168	CTX_ANY = 7
	169	};
	170
	171	/* Sometimes characters can only be matched depending on the surrounding
	172	context. Such context decisions depend on what the previous character
	173	was, and the value of the current (lookahead) character. Context
	174	dependent constraints are encoded as 9-bit integers. Each bit that
	175	is set indicates that the constraint succeeds in the corresponding
	176	context.
	177
	178	bit 6-8 - valid contexts when next character is CTX_NEWLINE
	179	bit 3-5 - valid contexts when next character is CTX_LETTER
	180	bit 0-2 - valid contexts when next character is CTX_NONE
	181
	182	succeeds_in_context determines whether a given constraint
	183	succeeds in a particular context. Prev is a bitmask of possible
	184	context values for the previous character, curr is the (single-bit)
	185	context value for the lookahead character. */
	186	static int
	187	newline_constraint (int constraint)
	188	{
	189	return (constraint >> 6) & 7;
	190	}
	191	static int
	192	letter_constraint (int constraint)
	193	{
	194	return (constraint >> 3) & 7;
	195	}
	196	static int
	197	other_constraint (int constraint)
	198	{
	199	return constraint & 7;
	200	}
	201
	202	static bool
	203	succeeds_in_context (int constraint, int prev, int curr)
	204	{
	205	return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
	206	\| (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
	207	\| (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
	208	& prev);
	209	}
	210
	211	/* The following describe what a constraint depends on. */
	212	static bool
	213	prev_newline_dependent (int constraint)
	214	{
	215	return ((constraint ^ constraint >> 2) & 0111) != 0;
	216	}
	217	static bool
	218	prev_letter_dependent (int constraint)
	219	{
	220	return ((constraint ^ constraint >> 1) & 0111) != 0;
	221	}
	222
	223	/* Tokens that match the empty string subject to some constraint actually
	224	work by applying that constraint to determine what may follow them,
	225	taking into account what has gone before. The following values are
	226	the constraints corresponding to the special tokens previously defined. */
	227	enum
	228	{
	229	NO_CONSTRAINT = 0777,
	230	BEGLINE_CONSTRAINT = 0444,
	231	ENDLINE_CONSTRAINT = 0700,
	232	BEGWORD_CONSTRAINT = 0050,
	233	ENDWORD_CONSTRAINT = 0202,
	234	LIMWORD_CONSTRAINT = 0252,
	235	NOTLIMWORD_CONSTRAINT = 0525
	236	};
	237
	238	/* The regexp is parsed into an array of tokens in postfix form. Some tokens
	239	are operators and others are terminal symbols. Most (but not all) of these
	240	codes are returned by the lexical analyzer. */
	241
	242	typedef ptrdiff_t token;
	243	static token const TOKEN_MAX = PTRDIFF_MAX;
	244
	245	/* States are indexed by state_num values. These are normally
	246	nonnegative but -1 is used as a special value. */
	247	typedef ptrdiff_t state_num;
	248
	249	/* Predefined token values. */
	250	enum
	251	{
	252	END = -1, /* END is a terminal symbol that matches the
	253	end of input; any value of END or less in
	254	the parse tree is such a symbol. Accepting
	255	states of the DFA are those that would have
	256	a transition on END. This is -1, not some
	257	more-negative value, to tweak the speed of
	258	comparisons to END. */
	259
	260	/* Ordinary character values are terminal symbols that match themselves. */
	261
	262	/* CSET must come last in the following list of special tokens. Otherwise,
	263	the list order matters only for performance. Related special tokens
	264	should have nearby values so that code like (t == ANYCHAR \|\| t == MBCSET
	265	\|\| CSET <= t) can be done with a single machine-level comparison. */
	266
	267	EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
	268	the empty string. */
	269
	270	QMARK, /* QMARK is an operator of one argument that
	271	matches zero or one occurrences of its
	272	argument. */
	273
	274	STAR, /* STAR is an operator of one argument that
	275	matches the Kleene closure (zero or more
	276	occurrences) of its argument. */
	277
	278	PLUS, /* PLUS is an operator of one argument that
	279	matches the positive closure (one or more
	280	occurrences) of its argument. */
	281
	282	REPMN, /* REPMN is a lexical token corresponding
	283	to the {m,n} construct. REPMN never
	284	appears in the compiled token vector. */
	285
	286	CAT, /* CAT is an operator of two arguments that
	287	matches the concatenation of its
	288	arguments. CAT is never returned by the
	289	lexical analyzer. */
	290
	291	OR, /* OR is an operator of two arguments that
	292	matches either of its arguments. */
	293
	294	LPAREN, /* LPAREN never appears in the parse tree,
	295	it is only a lexeme. */
	296
	297	RPAREN, /* RPAREN never appears in the parse tree. */
	298
[3532]	299	#if defined(KMK_GREP) && defined(KBUILD_OS_WINDOWS)
	300	# define WCHAR DFA_WCHAR
	301	#endif
[3529]	302	WCHAR, /* Only returned by lex. wctok contains
	303	the wide character representation. */
	304
	305	ANYCHAR, /* ANYCHAR is a terminal symbol that matches
	306	a valid multibyte (or single byte) character.
	307	It is used only if MB_CUR_MAX > 1. */
	308
	309	BEG, /* BEG is an initial symbol that matches the
	310	beginning of input. */
	311
	312	BEGLINE, /* BEGLINE is a terminal symbol that matches
	313	the empty string at the beginning of a
	314	line. */
	315
	316	ENDLINE, /* ENDLINE is a terminal symbol that matches
	317	the empty string at the end of a line. */
	318
	319	BEGWORD, /* BEGWORD is a terminal symbol that matches
	320	the empty string at the beginning of a
	321	word. */
	322
	323	ENDWORD, /* ENDWORD is a terminal symbol that matches
	324	the empty string at the end of a word. */
	325
	326	LIMWORD, /* LIMWORD is a terminal symbol that matches
	327	the empty string at the beginning or the
	328	end of a word. */
	329
	330	NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
	331	matches the empty string not at
	332	the beginning or end of a word. */
	333
	334	BACKREF, /* BACKREF is generated by \<digit>
	335	or by any other construct that
	336	is not completely handled. If the scanner
	337	detects a transition on backref, it returns
	338	a kind of "semi-success" indicating that
	339	the match will have to be verified with
	340	a backtracking matcher. */
	341
	342	MBCSET, /* MBCSET is similar to CSET, but for
	343	multibyte characters. */
	344
	345	CSET /* CSET and (and any value greater) is a
	346	terminal symbol that matches any of a
	347	class of characters. */
	348	};
	349
	350
	351	/* States of the recognizer correspond to sets of positions in the parse
	352	tree, together with the constraints under which they may be matched.
	353	So a position is encoded as an index into the parse tree together with
	354	a constraint. */
	355	typedef struct
	356	{
	357	idx_t index; /* Index into the parse array. */
	358	unsigned int constraint; /* Constraint for matching this position. */
	359	} position;
	360
	361	/* Sets of positions are stored as arrays. */
	362	typedef struct
	363	{
	364	position elems; / Elements of this position set. */
	365	idx_t nelem; /* Number of elements in this set. */
	366	idx_t alloc; /* Number of elements allocated in ELEMS. */
	367	} position_set;
	368
	369	/* A state of the dfa consists of a set of positions, some flags,
	370	and the token value of the lowest-numbered position of the state that
	371	contains an END token. */
	372	typedef struct
	373	{
	374	size_t hash; /* Hash of the positions of this state. */
	375	position_set elems; /* Positions this state could match. */
	376	unsigned char context; /* Context from previous state. */
	377	unsigned short constraint; /* Constraint for this state to accept. */
	378	position_set mbps; /* Positions which can match multibyte
	379	characters or the follows, e.g., period.
	380	Used only if MB_CUR_MAX > 1. */
	381	state_num mb_trindex; /* Index of this state in MB_TRANS, or
	382	negative if the state does not have
	383	ANYCHAR. */
	384	} dfa_state;
	385
	386	/* Maximum for any transition table count. This should be at least 3,
	387	for the initial state setup. */
	388	enum { MAX_TRCOUNT = 1024 };
	389
	390	/* A bracket operator.
	391	e.g., [a-c], [[:alpha:]], etc. */
	392	struct mb_char_classes
	393	{
	394	ptrdiff_t cset;
	395	bool invert;
	396	wchar_t chars; / Normal characters. */
	397	idx_t nchars;
	398	idx_t nchars_alloc;
	399	};
	400
	401	struct regex_syntax
	402	{
	403	/* Syntax bits controlling the behavior of the lexical analyzer. */
	404	reg_syntax_t syntax_bits;
	405	bool syntax_bits_set;
	406
	407	/* Flag for case-folding letters into sets. */
	408	bool case_fold;
	409
	410	/* True if ^ and $ match only the start and end of data, and do not match
	411	end-of-line within data. */
	412	bool anchor;
	413
	414	/* End-of-line byte in data. */
	415	unsigned char eolbyte;
	416
	417	/* Cache of char-context values. */
	418	char sbit[NOTCHAR];
	419
	420	/* If never_trail[B], the byte B cannot be a non-initial byte in a
	421	multibyte character. */
	422	bool never_trail[NOTCHAR];
	423
	424	/* Set of characters considered letters. */
	425	charclass letters;
	426
	427	/* Set of characters that are newline. */
	428	charclass newline;
	429	};
	430
	431	/* Lexical analyzer. All the dross that deals with the obnoxious
	432	GNU Regex syntax bits is located here. The poor, suffering
	433	reader is referred to the GNU Regex documentation for the
	434	meaning of the @#%!@#%^!@ syntax bits. */
	435	struct lexer_state
	436	{
	437	char const ptr; / Pointer to next input character. */
	438	idx_t left; /* Number of characters remaining. */
	439	token lasttok; /* Previous token returned; initially END. */
	440	idx_t parens; /* Count of outstanding left parens. */
	441	int minrep, maxrep; /* Repeat counts for {m,n}. */
	442
	443	/* Wide character representation of the current multibyte character,
	444	or WEOF if there was an encoding error. Used only if
	445	MB_CUR_MAX > 1. */
	446	wint_t wctok;
	447
	448	/* The most recently analyzed multibyte bracket expression. */
	449	struct mb_char_classes brack;
	450
	451	/* We're separated from beginning or (, \| only by zero-width characters. */
	452	bool laststart;
	453	};
	454
	455	/* Recursive descent parser for regular expressions. */
	456
	457	struct parser_state
	458	{
	459	token tok; /* Lookahead token. */
	460	idx_t depth; /* Current depth of a hypothetical stack
	461	holding deferred productions. This is
	462	used to determine the depth that will be
	463	required of the real stack later on in
	464	dfaanalyze. */
	465	};
	466
	467	/* A compiled regular expression. */
	468	struct dfa
	469	{
	470	/* Fields filled by the scanner. */
	471	charclass charclasses; / Array of character sets for CSET tokens. */
	472	idx_t cindex; /* Index for adding new charclasses. */
	473	idx_t calloc; /* Number of charclasses allocated. */
	474	ptrdiff_t canychar; /* Index of anychar class, or -1. */
	475
	476	/* Scanner state */
	477	struct lexer_state lex;
	478
	479	/* Parser state */
	480	struct parser_state parse;
	481
	482	/* Fields filled by the parser. */
	483	token tokens; / Postfix parse array. */
	484	idx_t tindex; /* Index for adding new tokens. */
	485	idx_t talloc; /* Number of tokens currently allocated. */
	486	idx_t depth; /* Depth required of an evaluation stack
	487	used for depth-first traversal of the
	488	parse tree. */
	489	idx_t nleaves; /* Number of non-EMPTY leaves
	490	in the parse tree. */
	491	idx_t nregexps; /* Count of parallel regexps being built
	492	with dfaparse. */
	493	bool fast; /* The DFA is fast. */
	494	bool epsilon; /* Does a token match only the empty string? */
	495	token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
	496	mbstate_t mbs; /* Multibyte conversion state. */
	497
	498	/* The following are valid only if MB_CUR_MAX > 1. */
	499
	500	/* The value of multibyte_prop[i] is defined by following rule.
	501	if tokens[i] < NOTCHAR
	502	bit 0 : tokens[i] is the first byte of a character, including
	503	single-byte characters.
	504	bit 1 : tokens[i] is the last byte of a character, including
	505	single-byte characters.
	506
	507	e.g.
	508	tokens
	509	= 'single_byte_a', 'multi_byte_A', single_byte_b'
	510	= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
	511	multibyte_prop
	512	= 3 , 1 , 0 , 2 , 3
	513	*/
	514	char *multibyte_prop;
	515
	516	/* Fields filled by the superset. */
	517	struct dfa superset; / Hint of the dfa. */
	518
	519	/* Fields filled by the state builder. */
	520	dfa_state states; / States of the dfa. */
	521	state_num sindex; /* Index for adding new states. */
	522	idx_t salloc; /* Number of states currently allocated. */
	523
	524	/* Fields filled by the parse tree->NFA conversion. */
	525	position_set follows; / Array of follow sets, indexed by position
	526	index. The follow of a position is the set
	527	of positions containing characters that
	528	could conceivably follow a character
	529	matching the given position in a string
	530	matching the regexp. Allocated to the
	531	maximum possible position index. */
	532	bool searchflag; /* We are supposed to build a searching
	533	as opposed to an exact matcher. A searching
	534	matcher finds the first and shortest string
	535	matching a regexp anywhere in the buffer,
	536	whereas an exact matcher finds the longest
	537	string matching, but anchored to the
	538	beginning of the buffer. */
	539
	540	/* Fields filled by dfaanalyze. */
	541	int constraints; / Array of union of accepting constraints
	542	in the follow of a position. */
	543	int separates; / Array of contexts on follow of a
	544	position. */
	545
	546	/* Fields filled by dfaexec. */
	547	state_num tralloc; /* Number of transition tables that have
	548	slots so far, not counting trans[-1] and
	549	trans[-2]. */
	550	int trcount; /* Number of transition tables that have
	551	been built, other than for initial
	552	states. */
	553	int min_trcount; /* Number of initial states. Equivalently,
	554	the minimum state number for which trcount
	555	counts transitions. */
	556	state_num *trans; / Transition tables for states that can
	557	never accept. If the transitions for a
	558	state have not yet been computed, or the
	559	state could possibly accept, its entry in
	560	this table is NULL. This points to two
	561	past the start of the allocated array,
	562	and trans[-1] and trans[-2] are always
	563	NULL. */
	564	state_num *fails; / Transition tables after failing to accept
	565	on a state that potentially could do so.
	566	If trans[i] is non-null, fails[i] must
	567	be null. */
	568	char success; / Table of acceptance conditions used in
	569	dfaexec and computed in build_state. */
	570	state_num newlines; / Transitions on newlines. The entry for a
	571	newline in any transition table is always
	572	-1 so we can count lines without wasting
	573	too many cycles. The transition for a
	574	newline is stored separately and handled
	575	as a special case. Newline is also used
	576	as a sentinel at the end of the buffer. */
	577	state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
	578	context in multibyte locales, in which we
	579	do not distinguish between their contexts,
	580	as not supported word. */
	581	position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
	582	state_num *mb_trans; / Transition tables for states with
	583	ANYCHAR. */
	584	state_num mb_trcount; /* Number of transition tables for states with
	585	ANYCHAR that have actually been built. */
	586
	587	/* Syntax configuration. This is near the end so that dfacopysyntax
	588	can memset up to here. */
	589	struct regex_syntax syntax;
	590
	591	/* Information derived from the locale. This is at the end so that
	592	a quick memset need not clear it specially. */
	593
	594	/* dfaexec implementation. */
	595	char (dfaexec) (struct dfa , char const , char *,
	596	bool, ptrdiff_t , bool );
	597
	598	/* Other cached information derived from the locale. */
	599	struct localeinfo localeinfo;
	600	};
	601
	602	/* User access to dfa internals. */
	603
	604	/* S could possibly be an accepting state of R. */
	605	static bool
	606	accepting (state_num s, struct dfa const *r)
	607	{
	608	return r->states[s].constraint != 0;
	609	}
	610
	611	/* STATE accepts in the specified context. */
	612	static bool
	613	accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
	614	{
	615	return succeeds_in_context (dfa->states[state].constraint, prev, curr);
	616	}
	617
	618	static void regexp (struct dfa *dfa);
	619
	620	/* Store into *PWC the result of converting the leading bytes of the
	621	multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
	622	and updating the conversion state in *D. On conversion error,
	623	convert just a single byte, to WEOF. Return the number of bytes
	624	converted.
	625
	626	This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
	627
	628	* PWC points to wint_t, not to wchar_t.
	629	* The last arg is a dfa *D instead of merely a multibyte conversion
	630	state D->mbs.
	631	* N is idx_t not size_t, and must be at least 1.
	632	* S[N - 1] must be a sentinel byte.
	633	* Shift encodings are not supported.
	634	* The return value is always in the range 1..N.
	635	* D->mbs is always valid afterwards.
	636	* PWC is always set to something. /
	637	static int
	638	mbs_to_wchar (wint_t pwc, char const s, idx_t n, struct dfa *d)
	639	{
	640	unsigned char uc = s[0];
	641	wint_t wc = d->localeinfo.sbctowc[uc];
	642
	643	if (wc == WEOF)
	644	{
	645	wchar_t wch;
	646	size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
	647	if (0 < nbytes && nbytes < (size_t) -2)
	648	{
	649	*pwc = wch;
	650	return nbytes;
	651	}
	652	memset (&d->mbs, 0, sizeof d->mbs);
	653	}
	654
	655	*pwc = wc;
	656	return 1;
	657	}
	658
	659	#ifdef DEBUG
	660
	661	static void
	662	prtok (token t)
	663	{
	664	if (t <= END)
	665	fprintf (stderr, "END");
	666	else if (0 <= t && t < NOTCHAR)
	667	{
	668	unsigned int ch = t;
	669	fprintf (stderr, "0x%02x", ch);
	670	}
	671	else
	672	{
	673	char const *s;
	674	switch (t)
	675	{
	676	case BEG:
	677	s = "BEG";
	678	break;
	679	case EMPTY:
	680	s = "EMPTY";
	681	break;
	682	case BACKREF:
	683	s = "BACKREF";
	684	break;
	685	case BEGLINE:
	686	s = "BEGLINE";
	687	break;
	688	case ENDLINE:
	689	s = "ENDLINE";
	690	break;
	691	case BEGWORD:
	692	s = "BEGWORD";
	693	break;
	694	case ENDWORD:
	695	s = "ENDWORD";
	696	break;
	697	case LIMWORD:
	698	s = "LIMWORD";
	699	break;
	700	case NOTLIMWORD:
	701	s = "NOTLIMWORD";
	702	break;
	703	case QMARK:
	704	s = "QMARK";
	705	break;
	706	case STAR:
	707	s = "STAR";
	708	break;
	709	case PLUS:
	710	s = "PLUS";
	711	break;
	712	case CAT:
	713	s = "CAT";
	714	break;
	715	case OR:
	716	s = "OR";
	717	break;
	718	case LPAREN:
	719	s = "LPAREN";
	720	break;
	721	case RPAREN:
	722	s = "RPAREN";
	723	break;
	724	case ANYCHAR:
	725	s = "ANYCHAR";
	726	break;
	727	case MBCSET:
	728	s = "MBCSET";
	729	break;
	730	default:
	731	s = "CSET";
	732	break;
	733	}
	734	fprintf (stderr, "%s", s);
	735	}
	736	}
	737	#endif /* DEBUG */
	738
	739	/* Stuff pertaining to charclasses. */
	740
	741	static bool
	742	tstbit (unsigned int b, charclass const *c)
	743	{
	744	return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
	745	}
	746
	747	static void
	748	setbit (unsigned int b, charclass *c)
	749	{
	750	charclass_word one = 1;
	751	c->w[b / CHARCLASS_WORD_BITS] \|= one << b % CHARCLASS_WORD_BITS;
	752	}
	753
	754	static void
	755	clrbit (unsigned int b, charclass *c)
	756	{
	757	charclass_word one = 1;
	758	c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
	759	}
	760
	761	static void
	762	zeroset (charclass *s)
	763	{
	764	memset (s, 0, sizeof *s);
	765	}
	766
	767	static void
	768	fillset (charclass *s)
	769	{
	770	for (int i = 0; i < CHARCLASS_WORDS; i++)
	771	s->w[i] = CHARCLASS_WORD_MASK;
	772	}
	773
	774	static void
	775	notset (charclass *s)
	776	{
	777	for (int i = 0; i < CHARCLASS_WORDS; ++i)
	778	s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
	779	}
	780
	781	static bool
	782	equal (charclass const s1, charclass const s2)
	783	{
	784	charclass_word w = 0;
	785	for (int i = 0; i < CHARCLASS_WORDS; i++)
	786	w \|= s1->w[i] ^ s2->w[i];
	787	return w == 0;
	788	}
	789
	790	static bool
	791	emptyset (charclass const *s)
	792	{
	793	charclass_word w = 0;
	794	for (int i = 0; i < CHARCLASS_WORDS; i++)
	795	w \|= s->w[i];
	796	return w == 0;
	797	}
	798
	799	/* Ensure that the array addressed by PA holds at least I + 1 items.
	800	Either return PA, or reallocate the array and return its new address.
	801	Although PA may be null, the returned value is never null.
	802
	803	The array holds NITEMS items, where 0 <= I <= NITEMS; *NITEMS
	804	is updated on reallocation. If PA is null, *NITEMS must be zero.
	805	Do not allocate more than NITEMS_MAX items total; -1 means no limit.
	806	ITEM_SIZE is the size of one item; it must be positive.
	807	Avoid O(N*2) behavior on arrays growing linearly. /
	808	static void *
	809	maybe_realloc (void pa, idx_t i, idx_t nitems,
	810	ptrdiff_t nitems_max, idx_t item_size)
	811	{
	812	if (i < *nitems)
	813	return pa;
	814	return xpalloc (pa, nitems, 1, nitems_max, item_size);
	815	}
	816
	817	/* In DFA D, find the index of charclass S, or allocate a new one. */
	818	static idx_t
	819	charclass_index (struct dfa d, charclass const s)
	820	{
	821	idx_t i;
	822
	823	for (i = 0; i < d->cindex; ++i)
	824	if (equal (s, &d->charclasses[i]))
	825	return i;
	826	d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
	827	TOKEN_MAX - CSET, sizeof *d->charclasses);
	828	++d->cindex;
	829	d->charclasses[i] = *s;
	830	return i;
	831	}
	832
	833	static bool
	834	unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
	835	{
	836	return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) \|\| (c) == '_');
	837	}
	838
	839	static int
	840	char_context (struct dfa const *dfa, unsigned char c)
	841	{
	842	if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
	843	return CTX_NEWLINE;
	844	if (unibyte_word_constituent (dfa, c))
	845	return CTX_LETTER;
	846	return CTX_NONE;
	847	}
	848
	849	/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
	850	is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
	851	this may happen when folding case in weird Turkish locales where
	852	dotless i/dotted I are not included in the chosen character set.
	853	Return whether a bit was set in the charclass. */
	854	static bool
	855	setbit_wc (wint_t wc, charclass *c)
	856	{
	857	int b = wctob (wc);
	858	if (b < 0)
	859	return false;
	860
	861	setbit (b, c);
	862	return true;
	863	}
	864
	865	/* Set a bit for B and its case variants in the charclass C.
	866	MB_CUR_MAX must be 1. */
	867	static void
	868	setbit_case_fold_c (int b, charclass *c)
	869	{
	870	int ub = toupper (b);
	871	for (int i = 0; i < NOTCHAR; i++)
	872	if (toupper (i) == ub)
	873	setbit (i, c);
	874	}
	875
	876	/* Fetch the next lexical input character from the pattern. There
	877	must at least one byte of pattern input. Set DFA->lex.wctok to the
	878	value of the character or to WEOF depending on whether the input is
	879	a valid multibyte character (possibly of length 1). Then return
	880	the next input byte value, except return EOF if the input is a
	881	multibyte character of length greater than 1. */
	882	static int
	883	fetch_wc (struct dfa *dfa)
	884	{
	885	int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
	886	dfa);
	887	int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
	888	dfa->lex.ptr += nbytes;
	889	dfa->lex.left -= nbytes;
	890	return c;
	891	}
	892
	893	/* If there is no more input, report an error about unbalanced brackets.
	894	Otherwise, behave as with fetch_wc (DFA). */
	895	static int
	896	bracket_fetch_wc (struct dfa *dfa)
	897	{
	898	if (! dfa->lex.left)
	899	dfaerror (_("unbalanced ["));
	900	return fetch_wc (dfa);
	901	}
	902
	903	typedef int predicate (int);
	904
	905	/* The following list maps the names of the Posix named character classes
	906	to predicate functions that determine whether a given character is in
	907	the class. The leading [ has already been eaten by the lexical
	908	analyzer. */
	909	struct dfa_ctype
	910	{
	911	const char *name;
	912	predicate *func;
	913	bool single_byte_only;
	914	};
	915
	916	static const struct dfa_ctype prednames[] = {
	917	{"alpha", isalpha, false},
	918	{"upper", isupper, false},
	919	{"lower", islower, false},
	920	{"digit", isdigit, true},
	921	{"xdigit", isxdigit, false},
	922	{"space", isspace, false},
	923	{"punct", ispunct, false},
	924	{"alnum", isalnum, false},
	925	{"print", isprint, false},
	926	{"graph", isgraph, false},
	927	{"cntrl", iscntrl, false},
	928	{"blank", isblank, false},
	929	{NULL, NULL, false}
	930	};
	931
	932	static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
	933	find_pred (const char *str)
	934	{
	935	for (int i = 0; prednames[i].name; i++)
	936	if (streq (str, prednames[i].name))
	937	return &prednames[i];
	938	return NULL;
	939	}
	940
	941	/* Parse a bracket expression, which possibly includes multibyte
	942	characters. */
		/* The caller has already consumed the opening '['.  Single-byte
		members are collected in a local charclass; wide characters that
		setbit_wc does not record there are appended to dfa->lex.brack.chars.
		Returns BACKREF when the set could not be determined reliably,
		MBCSET in a multibyte locale when the set is inverted or has
		wide-character members, and otherwise CSET plus the index that
		charclass_index assigns to the collected charclass. */
	943	static token
	944	parse_bracket_exp (struct dfa *dfa)
	945	{
	946	/* This is a bracket expression that dfaexec is known to
	947	process correctly. */
	948	bool known_bracket_exp = true;
	949
	950	/* Used to warn about [:space:].
	951	Bit 0 = first character is a colon.
	952	Bit 1 = last character is a colon.
	953	Bit 2 = includes any other character but a colon.
	954	Bit 3 = includes ranges, char/equiv classes or collation elements. */
	955	int colon_warning_state;
	956
	957	dfa->lex.brack.nchars = 0;
	958	charclass ccl;
	959	zeroset (&ccl);
	960	int c = bracket_fetch_wc (dfa);
	961	bool invert = c == '^';
	962	if (invert)
	963	{
	964	c = bracket_fetch_wc (dfa);
		/* An inverted set is trusted only when localeinfo.simple is set. */
	965	known_bracket_exp = dfa->localeinfo.simple;
	966	}
	967	wint_t wc = dfa->lex.wctok;
	968	int c1;
	969	wint_t wc1;
	970	colon_warning_state = (c == ':');
		/* Each iteration handles the current character C/WC, with C1/WC1
		as one character of lookahead. */
	971	do
	972	{
	973	c1 = NOTCHAR; /* Mark c1 as not initialized. */
	974	colon_warning_state &= ~2;
	975
	976	/* Note that if we're looking at some other [:...:] construct,
	977	we just treat it as a bunch of ordinary characters. We can do
	978	this because we assume regex has checked for syntax errors before
	979	dfa is ever called. */
	980	if (c == '[')
	981	{
	982	c1 = bracket_fetch_wc (dfa);
	983	wc1 = dfa->lex.wctok;
	984
	985	if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
	986	\|\| c1 == '.' \|\| c1 == '=')
	987	{
	988	enum { MAX_BRACKET_STRING_LEN = 32 };
	989	char str[MAX_BRACKET_STRING_LEN + 1];
	990	int len = 0;
		/* Collect the name up to the closing ":]", ".]" or "=]". */
	991	for (;;)
	992	{
	993	c = bracket_fetch_wc (dfa);
	994	if (dfa->lex.left == 0
	995	\|\| (c == c1 && dfa->lex.ptr[0] == ']'))
	996	break;
	997	if (len < MAX_BRACKET_STRING_LEN)
	998	str[len++] = c;
	999	else
	1000	/* This is in any case an invalid class name. */
	1001	str[0] = '\0';
	1002	}
	1003	str[len] = '\0';
	1004
	1005	/* Fetch bracket. */
	1006	c = bracket_fetch_wc (dfa);
	1007	wc = dfa->lex.wctok;
	1008	if (c1 == ':')
	1009	/* Build character class. POSIX allows character
	1010	classes to match multicharacter collating elements,
	1011	but the regex code does not support that, so do not
	1012	worry about that possibility. */
	1013	{
		/* When folding case, "upper" and "lower" are both looked up as
		"alpha". */
	1014	char const *class
	1015	= (dfa->syntax.case_fold && (streq (str, "upper")
	1016	\|\| streq (str, "lower"))
	1017	? "alpha" : str);
	1018	const struct dfa_ctype *pred = find_pred (class);
	1019	if (!pred)
	1020	dfaerror (_("invalid character class"));
	1021
		/* In a multibyte locale only classes flagged single_byte_only
		are expanded byte by byte; any other class makes the set
		unknown. */
	1022	if (dfa->localeinfo.multibyte && !pred->single_byte_only)
	1023	known_bracket_exp = false;
	1024	else
	1025	for (int c2 = 0; c2 < NOTCHAR; ++c2)
	1026	if (pred->func (c2))
	1027	setbit (c2, &ccl);
	1028	}
	1029	else
		/* Collating symbols and equivalence classes are not expanded. */
	1030	known_bracket_exp = false;
	1031
	1032	colon_warning_state \|= 8;
	1033
	1034	/* Fetch new lookahead character. */
	1035	c1 = bracket_fetch_wc (dfa);
	1036	wc1 = dfa->lex.wctok;
	1037	continue;
	1038	}
	1039
	1040	/* We treat '[' as a normal character here. c/c1/wc/wc1
	1041	are already set up. */
	1042	}
	1043
	1044	if (c == '\\'
	1045	&& (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
	1046	{
	1047	c = bracket_fetch_wc (dfa);
	1048	wc = dfa->lex.wctok;
	1049	}
	1050
	1051	if (c1 == NOTCHAR)
	1052	{
	1053	c1 = bracket_fetch_wc (dfa);
	1054	wc1 = dfa->lex.wctok;
	1055	}
	1056
	1057	if (c1 == '-')
	1058	/* build range characters. */
	1059	{
	1060	int c2 = bracket_fetch_wc (dfa);
	1061	wint_t wc2 = dfa->lex.wctok;
	1062
	1063	/* A bracket expression like [a-[.aa.]] matches an unknown set.
	1064	Treat it like [-a[.aa.]] while parsing it, and
	1065	remember that the set is unknown. */
	1066	if (c2 == '[' && dfa->lex.ptr[0] == '.')
	1067	{
	1068	known_bracket_exp = false;
	1069	c2 = ']';
	1070	}
	1071
	1072	if (c2 == ']')
	1073	{
	1074	/* In the case [x-], the - is an ordinary hyphen,
	1075	which is left in c1, the lookahead character. */
	1076	dfa->lex.ptr--;
	1077	dfa->lex.left++;
	1078	}
	1079	else
	1080	{
	1081	if (c2 == '\\' && (dfa->syntax.syntax_bits
	1082	& RE_BACKSLASH_ESCAPE_IN_LISTS))
	1083	{
	1084	c2 = bracket_fetch_wc (dfa);
	1085	wc2 = dfa->lex.wctok;
	1086	}
	1087
	1088	colon_warning_state \|= 8;
	1089	c1 = bracket_fetch_wc (dfa);
	1090	wc1 = dfa->lex.wctok;
	1091
	1092	/* Treat [x-y] as a range if x != y. */
	1093	if (wc != wc2 \|\| wc == WEOF)
	1094	{
		/* Ranges are expanded bytewise only when localeinfo.simple is
		set or both endpoints are ASCII digits; otherwise the set is
		unknown. */
	1095	if (dfa->localeinfo.simple
	1096	\|\| (isasciidigit (c) & isasciidigit (c2)))
	1097	{
	1098	for (int ci = c; ci <= c2; ci++)
	1099	if (dfa->syntax.case_fold && isalpha (ci))
	1100	setbit_case_fold_c (ci, &ccl);
	1101	else
	1102	setbit (ci, &ccl);
	1103	}
	1104	else
	1105	known_bracket_exp = false;
	1106
	1107	continue;
	1108	}
	1109	}
	1110	}
	1111
	1112	colon_warning_state \|= (c == ':') ? 2 : 4;
	1113
	1114	if (!dfa->localeinfo.multibyte)
	1115	{
	1116	if (dfa->syntax.case_fold && isalpha (c))
	1117	setbit_case_fold_c (c, &ccl);
	1118	else
	1119	setbit (c, &ccl);
	1120	continue;
	1121	}
	1122
	1123	if (wc == WEOF)
	1124	known_bracket_exp = false;
	1125	else
	1126	{
		/* folded[0] holds WC itself; its case counterparts, if any,
		follow it. */
	1127	wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
	1128	int n = (dfa->syntax.case_fold
	1129	? case_folded_counterparts (wc, folded + 1) + 1
	1130	: 1);
	1131	folded[0] = wc;
	1132	for (int i = 0; i < n; i++)
	1133	if (!setbit_wc (folded[i], &ccl))
	1134	{
	1135	dfa->lex.brack.chars
	1136	= maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
	1137	&dfa->lex.brack.nchars_alloc, -1,
	1138	sizeof *dfa->lex.brack.chars);
	1139	dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
	1140	}
	1141	}
	1142	}
		/* The lookahead becomes the current character; stop at ']'. */
	1143	while ((wc = wc1, (c = c1) != ']'));
	1144
		/* 7 = began and ended with ':', had other characters in between,
		and contained no range or class (bit 3 clear). */
	1145	if (colon_warning_state == 7)
	1146	dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
	1147
	1148	if (! known_bracket_exp)
	1149	return BACKREF;
	1150
	1151	if (dfa->localeinfo.multibyte && (invert \|\| dfa->lex.brack.nchars != 0))
	1152	{
	1153	dfa->lex.brack.invert = invert;
	1154	dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
	1155	return MBCSET;
	1156	}
	1157
	1158	if (invert)
	1159	{
	1160	notset (&ccl);
	1161	if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
	1162	clrbit ('\n', &ccl);
	1163	}
	1164
	1165	return CSET + charclass_index (dfa, &ccl);
	1166	}
	1167
		/* Saved lexer input position, filled in by push_lex_state and
		restored by pop_lex_state. */
	1168	struct lexptr
	1169	{
	1170	char const *ptr; /* Saved value of dfa->lex.ptr. */
	1171	idx_t left; /* Saved value of dfa->lex.left. */
	1172	};
	1173
	1174	static void
	1175	push_lex_state (struct dfa dfa, struct lexptr ls, char const *s)
	1176	{
	1177	ls->ptr = dfa->lex.ptr;
	1178	ls->left = dfa->lex.left;
	1179	dfa->lex.ptr = s;
	1180	dfa->lex.left = strlen (s);
	1181	}
	1182
	1183	static void
	1184	pop_lex_state (struct dfa dfa, struct lexptr const ls)
	1185	{
	1186	dfa->lex.ptr = ls->ptr;
	1187	dfa->lex.left = ls->left;
	1188	}
	1189
		/* Scan the next token from the pattern at dfa->lex.ptr (with
		dfa->lex.left bytes remaining), store it in dfa->lex.lasttok and
		return it.  Returns END when no input is left.  Interval bounds
		of a REPMN token are left in dfa->lex.minrep and dfa->lex.maxrep.
		dfa->lex.laststart is set to true after OR and LPAREN and to false
		after most other tokens; several operator cases consult it to
		decide whether the operator is an ordinary character. */
	1190	static token
	1191	lex (struct dfa *dfa)
	1192	{
	1193	bool backslash = false;
	1194
	1195	/* Basic plan: We fetch a character. If it's a backslash,
	1196	we set the backslash flag and go through the loop again.
	1197	On the plus side, this avoids having a duplicate of the
	1198	main switch inside the backslash case. On the minus side,
	1199	it means that just about every case begins with
	1200	"if (backslash) ...". */
	1201	for (int i = 0; i < 2; ++i)
	1202	{
	1203	if (! dfa->lex.left)
	1204	return dfa->lex.lasttok = END;
	1205	int c = fetch_wc (dfa);
	1206
	1207	switch (c)
	1208	{
	1209	case '\\':
	1210	if (backslash)
	1211	goto normal_char;
	1212	if (dfa->lex.left == 0)
	1213	dfaerror (_("unfinished \\ escape"));
	1214	backslash = true;
	1215	break;
	1216
	1217	case '^':
	1218	if (backslash)
	1219	goto normal_char;
		/* '^' is an anchor with context-independent anchors, or at the
		start of the pattern, of a group, or of an alternative. */
	1220	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
	1221	\|\| dfa->lex.lasttok == END \|\| dfa->lex.lasttok == LPAREN
	1222	\|\| dfa->lex.lasttok == OR)
	1223	return dfa->lex.lasttok = BEGLINE;
	1224	goto normal_char;
	1225
	1226	case '$':
	1227	if (backslash)
	1228	goto normal_char;
[3658]	1229	/* kmk: cl v19.29.30139/amd64 messes this function up when optimizing
	1230	for speed, workaround is to optimize it for size instead. The
	1231	symptom is that the following SED expression fails to match:
	1232	s/^[0-9a-fA-F]\{1,\} $00[0-9a-fA-F]$ ABS notype External \| $[^.]\{1,\}$\.$.*$$/ 1=\1 2=\2 3=\3/
	1233
	1234	Seems the exact problem is that it gets the indexing here wrong:
	1235	dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) & (dfa->lex.ptr[0] == '\\')]
	1236	It forgets to do the ` dfa->lex.ptr[0] == '\\' ` part and instead
	1237	ANDs with a register initialized to zero. Rewriting the
	1238	expressions using the ternary operator works around the problem,
	1239	although the resulting code is a lot bulkier.
	1240	*/
		/* '$' is an anchor with context-independent anchors, at the end
		of the pattern, or when the next token closes a group or starts
		an alternative (allowing for a backslash before ')' or '|' when
		the syntax requires one). */
[3529]	1241	if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
	1242	\|\| dfa->lex.left == 0
[3658]	1243	#ifdef _MSC_VER /* see above */
	1244	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
	1245	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == ')'
	1246	: dfa->lex.left > 0 && dfa->lex.ptr[0] == ')')
	1247	#else
[3529]	1248	\|\| ((dfa->lex.left
	1249	> !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
	1250	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
	1251	& (dfa->lex.ptr[0] == '\\')]
	1252	== ')'))
[3658]	1253	#endif
	1254	#ifdef _MSC_VER /* see above */
	1255	\|\| (!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
	1256	? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == '\|'
	1257	: dfa->lex.left > 0 && dfa->lex.ptr[0] == '\|')
	1258	#else
[3529]	1259	\|\| ((dfa->lex.left
	1260	> !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
	1261	&& (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
	1262	& (dfa->lex.ptr[0] == '\\')]
	1263	== '\|'))
[3658]	1264	#endif
[3529]	1265	\|\| ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
	1266	&& dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
	1267	return dfa->lex.lasttok = ENDLINE;
	1268	goto normal_char;
	1269
	1270	case '1':
	1271	case '2':
	1272	case '3':
	1273	case '4':
	1274	case '5':
	1275	case '6':
	1276	case '7':
	1277	case '8':
	1278	case '9':
	1279	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
	1280	{
	1281	dfa->lex.laststart = false;
	1282	return dfa->lex.lasttok = BACKREF;
	1283	}
	1284	goto normal_char;
	1285
	1286	case '`':
	1287	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1288	{
	1289	/* FIXME: should be beginning of string */
	1290	return dfa->lex.lasttok = BEGLINE;
	1291	}
	1292	goto normal_char;
	1293
	1294	case '\'':
	1295	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1296	{
	1297	/* FIXME: should be end of string */
	1298	return dfa->lex.lasttok = ENDLINE;
	1299	}
	1300	goto normal_char;
	1301
	1302	case '<':
	1303	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1304	return dfa->lex.lasttok = BEGWORD;
	1305	goto normal_char;
	1306
	1307	case '>':
	1308	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1309	return dfa->lex.lasttok = ENDWORD;
	1310	goto normal_char;
	1311
	1312	case 'b':
	1313	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1314	return dfa->lex.lasttok = LIMWORD;
	1315	goto normal_char;
	1316
	1317	case 'B':
	1318	if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1319	return dfa->lex.lasttok = NOTLIMWORD;
	1320	goto normal_char;
	1321
	1322	case '?':
	1323	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
	1324	goto normal_char;
		/* The operator form needs a backslash exactly when RE_BK_PLUS_QM
		is set. */
	1325	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
	1326	goto normal_char;
	1327	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
	1328	&& dfa->lex.laststart)
	1329	goto normal_char;
	1330	return dfa->lex.lasttok = QMARK;
	1331
	1332	case '*':
	1333	if (backslash)
	1334	goto normal_char;
	1335	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
	1336	&& dfa->lex.laststart)
	1337	goto normal_char;
	1338	return dfa->lex.lasttok = STAR;
	1339
	1340	case '+':
	1341	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
	1342	goto normal_char;
	1343	if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
	1344	goto normal_char;
	1345	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
	1346	&& dfa->lex.laststart)
	1347	goto normal_char;
	1348	return dfa->lex.lasttok = PLUS;
	1349
	1350	case '{':
	1351	if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
	1352	goto normal_char;
	1353	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
	1354	goto normal_char;
	1355	if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
	1356	&& dfa->lex.laststart)
	1357	goto normal_char;
	1358
	1359	/* Cases:
	1360	{M} - exact count
	1361	{M,} - minimum count, maximum is infinity
	1362	{,N} - 0 through N
	1363	{,} - 0 to infinity (same as '*')
	1364	{M,N} - M through N */
	1365	{
	1366	char const *p = dfa->lex.ptr;
	1367	char const *lim = p + dfa->lex.left;
		/* -1 means "not given"; parsed values are capped at
		RE_DUP_MAX + 1. */
	1368	dfa->lex.minrep = dfa->lex.maxrep = -1;
	1369	for (; p != lim && isasciidigit (*p); p++)
	1370	dfa->lex.minrep = (dfa->lex.minrep < 0
	1371	? *p - '0'
	1372	: MIN (RE_DUP_MAX + 1,
	1373	dfa->lex.minrep * 10 + *p - '0'));
	1374	if (p != lim)
	1375	{
	1376	if (*p != ',')
	1377	dfa->lex.maxrep = dfa->lex.minrep;
	1378	else
	1379	{
	1380	if (dfa->lex.minrep < 0)
	1381	dfa->lex.minrep = 0;
	1382	while (++p != lim && isasciidigit (*p))
	1383	dfa->lex.maxrep
	1384	= (dfa->lex.maxrep < 0
	1385	? *p - '0'
	1386	: MIN (RE_DUP_MAX + 1,
	1387	dfa->lex.maxrep * 10 + *p - '0'));
	1388	}
	1389	}
		/* Require the closing brace (preceded by a backslash if the
		opening one had one), a minimum, and minimum <= maximum. */
	1390	if (! ((! backslash \|\| (p != lim && *p++ == '\\'))
	1391	&& p != lim && *p++ == '}'
	1392	&& 0 <= dfa->lex.minrep
	1393	&& (dfa->lex.maxrep < 0
	1394	\|\| dfa->lex.minrep <= dfa->lex.maxrep)))
	1395	{
	1396	if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
	1397	goto normal_char;
	1398	dfaerror (_("invalid content of \\{\\}"));
	1399	}
	1400	if (RE_DUP_MAX < dfa->lex.maxrep)
	1401	dfaerror (_("regular expression too big"));
	1402	dfa->lex.ptr = p;
	1403	dfa->lex.left = lim - p;
	1404	}
	1405	dfa->lex.laststart = false;
	1406	return dfa->lex.lasttok = REPMN;
	1407
	1408	case '\|':
	1409	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
	1410	goto normal_char;
	1411	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
	1412	goto normal_char;
	1413	dfa->lex.laststart = true;
	1414	return dfa->lex.lasttok = OR;
	1415
	1416	case '\n':
	1417	if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
	1418	\|\| backslash \|\| !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
	1419	goto normal_char;
	1420	dfa->lex.laststart = true;
	1421	return dfa->lex.lasttok = OR;
	1422
	1423	case '(':
	1424	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
	1425	goto normal_char;
	1426	dfa->lex.parens++;
	1427	dfa->lex.laststart = true;
	1428	return dfa->lex.lasttok = LPAREN;
	1429
	1430	case ')':
	1431	if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
	1432	goto normal_char;
	1433	if (dfa->lex.parens == 0
	1434	&& dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
	1435	goto normal_char;
	1436	dfa->lex.parens--;
	1437	dfa->lex.laststart = false;
	1438	return dfa->lex.lasttok = RPAREN;
	1439
	1440	case '.':
	1441	if (backslash)
	1442	goto normal_char;
		/* Build the any-character class once; its index is cached in
		dfa->canychar. */
	1443	if (dfa->canychar < 0)
	1444	{
	1445	charclass ccl;
	1446	fillset (&ccl);
	1447	if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
	1448	clrbit ('\n', &ccl);
	1449	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
	1450	clrbit ('\0', &ccl);
	1451	if (dfa->localeinfo.multibyte)
	1452	for (int c2 = 0; c2 < NOTCHAR; c2++)
	1453	if (dfa->localeinfo.sbctowc[c2] == WEOF)
	1454	clrbit (c2, &ccl);
	1455	dfa->canychar = charclass_index (dfa, &ccl);
	1456	}
	1457	dfa->lex.laststart = false;
	1458	return dfa->lex.lasttok = (dfa->localeinfo.multibyte
	1459	? ANYCHAR
	1460	: CSET + dfa->canychar);
	1461
	1462	case 's':
	1463	case 'S':
	1464	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1465	goto normal_char;
	1466	if (!dfa->localeinfo.multibyte)
	1467	{
	1468	charclass ccl;
	1469	zeroset (&ccl);
	1470	for (int c2 = 0; c2 < NOTCHAR; ++c2)
	1471	if (isspace (c2))
	1472	setbit (c2, &ccl);
	1473	if (c == 'S')
	1474	notset (&ccl);
	1475	dfa->lex.laststart = false;
	1476	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
	1477	}
	1478
	1479	/* FIXME: see if optimizing this, as is done with ANYCHAR and
	1480	add_utf8_anychar, makes sense. */
	1481
	1482	/* \s and \S are documented to be equivalent to [[:space:]] and
	1483	[^[:space:]] respectively, so tell the lexer to process those
	1484	strings, each minus its "already processed" '['. */
	1485	{
	1486	struct lexptr ls;
		/* Indexing by (c == 's') skips the leading '^' for \s and keeps
		it for \S. */
	1487	push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
	1488	dfa->lex.lasttok = parse_bracket_exp (dfa);
	1489	pop_lex_state (dfa, &ls);
	1490	}
	1491
	1492	dfa->lex.laststart = false;
	1493	return dfa->lex.lasttok;
	1494
	1495	case 'w':
	1496	case 'W':
	1497	if (!backslash \|\| (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
	1498	goto normal_char;
	1499
	1500	if (!dfa->localeinfo.multibyte)
	1501	{
	1502	charclass ccl;
	1503	zeroset (&ccl);
	1504	for (int c2 = 0; c2 < NOTCHAR; ++c2)
	1505	if (dfa->syntax.sbit[c2] == CTX_LETTER)
	1506	setbit (c2, &ccl);
	1507	if (c == 'W')
	1508	notset (&ccl);
	1509	dfa->lex.laststart = false;
	1510	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
	1511	}
	1512
	1513	/* FIXME: see if optimizing this, as is done with ANYCHAR and
	1514	add_utf8_anychar, makes sense. */
	1515
	1516	/* \w and \W are documented to be equivalent to [_[:alnum:]] and
	1517	[^_[:alnum:]] respectively, so tell the lexer to process those
	1518	strings, each minus its "already processed" '['. */
	1519	{
	1520	struct lexptr ls;
		/* Indexing by (c == 'w') skips the leading '^' for \w and keeps
		it for \W. */
	1521	push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
	1522	dfa->lex.lasttok = parse_bracket_exp (dfa);
	1523	pop_lex_state (dfa, &ls);
	1524	}
	1525
	1526	dfa->lex.laststart = false;
	1527	return dfa->lex.lasttok;
	1528
	1529	case '[':
	1530	if (backslash)
	1531	goto normal_char;
	1532	dfa->lex.laststart = false;
	1533	return dfa->lex.lasttok = parse_bracket_exp (dfa);
	1534
	1535	default:
	1536	normal_char:
	1537	dfa->lex.laststart = false;
	1538	/* For multibyte character sets, folding is done in atom. Always
	1539	return WCHAR. */
	1540	if (dfa->localeinfo.multibyte)
	1541	return dfa->lex.lasttok = WCHAR;
	1542
	1543	if (dfa->syntax.case_fold && isalpha (c))
	1544	{
	1545	charclass ccl;
	1546	zeroset (&ccl);
	1547	setbit_case_fold_c (c, &ccl);
	1548	return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
	1549	}
	1550
	1551	return dfa->lex.lasttok = c;
	1552	}
	1553	}
	1554
	1555	/* The above loop should consume at most a backslash
	1556	and some other character. */
	1557	abort ();
	1558	return END; /* keeps pedantic compilers happy. */
	1559	}
	1560
	1561	static void
	1562	addtok_mb (struct dfa *dfa, token t, char mbprop)
	1563	{
	1564	if (dfa->talloc == dfa->tindex)
	1565	{
	1566	dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
	1567	sizeof *dfa->tokens);
	1568	if (dfa->localeinfo.multibyte)
	1569	dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
	1570	sizeof *dfa->multibyte_prop);
	1571	}
	1572	if (dfa->localeinfo.multibyte)
	1573	dfa->multibyte_prop[dfa->tindex] = mbprop;
	1574	dfa->tokens[dfa->tindex++] = t;
	1575
	1576	switch (t)
	1577	{
	1578	case QMARK:
	1579	case STAR:
	1580	case PLUS:
	1581	break;
	1582
	1583	case CAT:
	1584	case OR:
	1585	dfa->parse.depth--;
	1586	break;
	1587
	1588	case EMPTY:
	1589	dfa->epsilon = true;
	1590	goto increment_depth;
	1591
	1592	case BACKREF:
	1593	dfa->fast = false;
	1594	goto increment_nleaves;
	1595
	1596	case BEGLINE:
	1597	case ENDLINE:
	1598	case BEGWORD:
	1599	case ENDWORD:
	1600	case LIMWORD:
	1601	case NOTLIMWORD:
	1602	dfa->epsilon = true;
	1603	FALLTHROUGH;
	1604	default:
	1605	increment_nleaves:
	1606	dfa->nleaves++;
	1607	increment_depth:
	1608	dfa->parse.depth++;
	1609	if (dfa->depth < dfa->parse.depth)
	1610	dfa->depth = dfa->parse.depth;
	1611	break;
	1612	}
	1613	}
	1614
	1615	static void addtok_wc (struct dfa *dfa, wint_t wc);
	1616
	1617	/* Add the given token to the parse tree, maintaining the depth count and
	1618	updating the maximum depth if necessary. */
	1619	static void
	1620	addtok (struct dfa *dfa, token t)
	1621	{
	1622	if (dfa->localeinfo.multibyte && t == MBCSET)
	1623	{
	1624	bool need_or = false;
	1625
	1626	/* Extract wide characters into alternations for better performance.
	1627	This does not require UTF-8. */
	1628	for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
	1629	{
	1630	addtok_wc (dfa, dfa->lex.brack.chars[i]);
	1631	if (need_or)
	1632	addtok (dfa, OR);
	1633	need_or = true;
	1634	}
	1635	dfa->lex.brack.nchars = 0;
	1636
	1637	/* Wide characters have been handled above, so it is possible
	1638	that the set is empty now. Do nothing in that case. */
	1639	if (dfa->lex.brack.cset != -1)
	1640	{
	1641	addtok (dfa, CSET + dfa->lex.brack.cset);
	1642	if (need_or)
	1643	addtok (dfa, OR);
	1644	}
	1645	}
	1646	else
	1647	{
	1648	addtok_mb (dfa, t, 3);
	1649	}
	1650	}
	1651
	1652	/* We treat a multibyte character as a single atom, so that DFA
	1653	can treat a multibyte character as a single expression.
	1654
	1655	e.g., we construct the following tree from "<mb1><mb2>".
	1656	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
	1657	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
	1658	static void
	1659	addtok_wc (struct dfa *dfa, wint_t wc)
	1660	{
	1661	unsigned char buf[MB_LEN_MAX];
	1662	mbstate_t s = { 0 };
	1663	size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
	1664	int buflen;
	1665
	1666	if (stored_bytes != (size_t) -1)
	1667	buflen = stored_bytes;
	1668	else
	1669	{
	1670	/* This is merely stop-gap. buf[0] is undefined, yet skipping
	1671	the addtok_mb call altogether can corrupt the heap. */
	1672	buflen = 1;
	1673	buf[0] = 0;
	1674	}
	1675
	1676	addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
	1677	for (int i = 1; i < buflen; i++)
	1678	{
	1679	addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
	1680	addtok (dfa, CAT);
	1681	}
	1682	}
	1683
	1684	static void
	1685	add_utf8_anychar (struct dfa *dfa)
	1686	{
	1687	/* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
	1688	UTF-8 byte sequence has been defined as follows:
	1689
	1690	([\x00-\x7f]
	1691	\|[\xc2-\xdf][\x80-\xbf]
	1692	\|[\xe0][\xa0-\xbf][\x80-\xbf]
	1693	\|[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
	1694	\|[\xed][\x80-\x9f][\x80-\xbf]
	1695	\|[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
	1696	\|[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
	1697	\|[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
	1698
	1699	which I'll write more concisely "A\|BC\|DEC\|FCC\|GHC\|IJCC\|KCCC\|LMCC",
	1700	where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
	1701	D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
	1702	H = [\x80-\x9f], I = [\xf0],
	1703	J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
	1704
	1705	This can be refactored to "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C". */
	1706
	1707	/* Mnemonics for classes containing two or more bytes. */
	1708	enum { A, B, C, E, F, H, J, K, M };
	1709
	1710	/* Mnemonics for single-byte tokens. */
	1711	enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
	1712
	1713	static charclass const utf8_classes[] = {
	1714	/* A. 00-7f: 1-byte sequence. */
	1715	CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
	1716
	1717	/* B. c2-df: 1st byte of a 2-byte sequence. */
	1718	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
	1719
	1720	/* C. 80-bf: non-leading bytes. */
	1721	CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
	1722
	1723	/* D. e0 (just a token). */
	1724
	1725	/* E. a0-bf: 2nd byte of a "DEC" sequence. */
	1726	CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
	1727
	1728	/* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
	1729	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
	1730
	1731	/* G. ed (just a token). */
	1732
	1733	/* H. 80-9f: 2nd byte of a "GHC" sequence. */
	1734	CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
	1735
	1736	/* I. f0 (just a token). */
	1737
	1738	/* J. 90-bf: 2nd byte of an "IJCC" sequence. */
	1739	CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
	1740
	1741	/* K. f1-f3: 1st byte of a "KCCC" sequence. */
	1742	CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
	1743
	1744	/* L. f4 (just a token). */
	1745
	1746	/* M. 80-8f: 2nd byte of a "LMCC" sequence. */
	1747	CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
	1748	};
	1749
	1750	/* Define the character classes that are needed below. */
	1751	if (dfa->utf8_anychar_classes[0] == 0)
	1752	{
	1753	charclass c = utf8_classes[0];
	1754	if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
	1755	clrbit ('\n', &c);
	1756	if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
	1757	clrbit ('\0', &c);
	1758	dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
	1759
	1760	for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
	1761	dfa->utf8_anychar_classes[i]
	1762	= CSET + charclass_index (dfa, &utf8_classes[i]);
	1763	}
	1764
	1765	/* Implement the "A\|(B\|DE\|GH\|(F\|IJ\|LM\|KC)C)C" pattern mentioned above.
	1766	The token buffer is in reverse Polish order, so we get
	1767	"A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
	1768	C CAT OR C CAT OR C CAT OR". */
	1769	addtok (dfa, dfa->utf8_anychar_classes[A]);
	1770	addtok (dfa, dfa->utf8_anychar_classes[B]);
	1771	addtok (dfa, D_token);
	1772	addtok (dfa, dfa->utf8_anychar_classes[E]);
	1773	addtok (dfa, CAT);
	1774	addtok (dfa, OR);
	1775	addtok (dfa, G_token);
	1776	addtok (dfa, dfa->utf8_anychar_classes[H]);
	1777	addtok (dfa, CAT);
	1778	addtok (dfa, OR);
	1779	addtok (dfa, dfa->utf8_anychar_classes[F]);
	1780	addtok (dfa, I_token);
	1781	addtok (dfa, dfa->utf8_anychar_classes[J]);
	1782	addtok (dfa, CAT);
	1783	addtok (dfa, OR);
	1784	addtok (dfa, L_token);
	1785	addtok (dfa, dfa->utf8_anychar_classes[M]);
	1786	addtok (dfa, CAT);
	1787	addtok (dfa, OR);
	1788	addtok (dfa, dfa->utf8_anychar_classes[K]);
	1789	for (int i = 0; i < 3; i++)
	1790	{
	1791	addtok (dfa, dfa->utf8_anychar_classes[C]);
	1792	addtok (dfa, CAT);
	1793	addtok (dfa, OR);
	1794	}
	1795	}
	1796
	1797	/* The grammar understood by the parser is as follows.
	1798
	1799	regexp:
	1800	regexp OR branch
	1801	branch
	1802
	1803	branch:
	1804	branch closure
	1805	closure
	1806
	1807	closure:
	1808	closure QMARK
	1809	closure STAR
	1810	closure PLUS
	1811	closure REPMN
	1812	atom
	1813
	1814	atom:
	1815	<normal character>
	1816	<multibyte character>
	1817	ANYCHAR
	1818	MBCSET
	1819	CSET
	1820	BACKREF
	1821	BEGLINE
	1822	ENDLINE
	1823	BEGWORD
	1824	ENDWORD
	1825	LIMWORD
	1826	NOTLIMWORD
	1827	LPAREN regexp RPAREN
	1828	<empty>
	1829
	1830	The parser builds a parse tree in postfix form in an array of tokens. */
	1831
	1832	static void
	1833	atom (struct dfa *dfa)
	1834	{
	1835	if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
	1836	\|\| dfa->parse.tok >= CSET
	1837	\|\| dfa->parse.tok == BEG \|\| dfa->parse.tok == BACKREF
	1838	\|\| dfa->parse.tok == BEGLINE \|\| dfa->parse.tok == ENDLINE
	1839	\|\| dfa->parse.tok == BEGWORD \|\| dfa->parse.tok == ENDWORD
	1840	\|\| dfa->parse.tok == LIMWORD \|\| dfa->parse.tok == NOTLIMWORD
	1841	\|\| dfa->parse.tok == ANYCHAR \|\| dfa->parse.tok == MBCSET)
	1842	{
	1843	if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
	1844	{
	1845	/* For UTF-8 expand the period to a series of CSETs that define a
	1846	valid UTF-8 character. This avoids using the slow multibyte
	1847	path. I'm pretty sure it would be both profitable and correct to
	1848	do it for any encoding; however, the optimization must be done
	1849	manually as it is done above in add_utf8_anychar. So, let's
	1850	start with UTF-8: it is the most used, and the structure of the
	1851	encoding makes the correctness more obvious. */
	1852	add_utf8_anychar (dfa);
	1853	}
	1854	else
	1855	addtok (dfa, dfa->parse.tok);
	1856	dfa->parse.tok = lex (dfa);
	1857	}
	1858	else if (dfa->parse.tok == WCHAR)
	1859	{
	1860	if (dfa->lex.wctok == WEOF)
	1861	addtok (dfa, BACKREF);
	1862	else
	1863	{
	1864	addtok_wc (dfa, dfa->lex.wctok);
	1865
	1866	if (dfa->syntax.case_fold)
	1867	{
	1868	wchar_t folded[CASE_FOLDED_BUFSIZE];
	1869	int n = case_folded_counterparts (dfa->lex.wctok, folded);
	1870	for (int i = 0; i < n; i++)
	1871	{
	1872	addtok_wc (dfa, folded[i]);
	1873	addtok (dfa, OR);
	1874	}
	1875	}
	1876	}
	1877
	1878	dfa->parse.tok = lex (dfa);
	1879	}
	1880	else if (dfa->parse.tok == LPAREN)
	1881	{
	1882	dfa->parse.tok = lex (dfa);
	1883	regexp (dfa);
	1884	if (dfa->parse.tok != RPAREN)
	1885	dfaerror (_("unbalanced ("));
	1886	dfa->parse.tok = lex (dfa);
	1887	}
	1888	else
	1889	addtok (dfa, EMPTY);
	1890	}
	1891
	1892	/* Return the number of tokens in the given subexpression. */
	1893	static idx_t _GL_ATTRIBUTE_PURE
	1894	nsubtoks (struct dfa const *dfa, idx_t tindex)
	1895	{
	1896	switch (dfa->tokens[tindex - 1])
	1897	{
	1898	default:
	1899	return 1;
	1900	case QMARK:
	1901	case STAR:
	1902	case PLUS:
	1903	return 1 + nsubtoks (dfa, tindex - 1);
	1904	case CAT:
	1905	case OR:
	1906	{
	1907	idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
	1908	return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
	1909	}
	1910	}
	1911	}
	1912
	1913	/* Copy the given subexpression to the top of the tree. */
	1914	static void
	1915	copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
	1916	{
	1917	if (dfa->localeinfo.multibyte)
	1918	for (idx_t i = 0; i < ntokens; i++)
	1919	addtok_mb (dfa, dfa->tokens[tindex + i],
	1920	dfa->multibyte_prop[tindex + i]);
	1921	else
	1922	for (idx_t i = 0; i < ntokens; i++)
	1923	addtok_mb (dfa, dfa->tokens[tindex + i], 3);
	1924	}
	1925
	1926	static void
	1927	closure (struct dfa *dfa)
	1928	{
	1929	atom (dfa);
	1930	while (dfa->parse.tok == QMARK \|\| dfa->parse.tok == STAR
	1931	\|\| dfa->parse.tok == PLUS \|\| dfa->parse.tok == REPMN)
	1932	if (dfa->parse.tok == REPMN && (dfa->lex.minrep \|\| dfa->lex.maxrep))
	1933	{
	1934	idx_t ntokens = nsubtoks (dfa, dfa->tindex);
	1935	idx_t tindex = dfa->tindex - ntokens;
	1936	if (dfa->lex.maxrep < 0)
	1937	addtok (dfa, PLUS);
	1938	if (dfa->lex.minrep == 0)
	1939	addtok (dfa, QMARK);
	1940	int i;
	1941	for (i = 1; i < dfa->lex.minrep; i++)
	1942	{
	1943	copytoks (dfa, tindex, ntokens);
	1944	addtok (dfa, CAT);
	1945	}
	1946	for (; i < dfa->lex.maxrep; i++)
	1947	{
	1948	copytoks (dfa, tindex, ntokens);
	1949	addtok (dfa, QMARK);
	1950	addtok (dfa, CAT);
	1951	}
	1952	dfa->parse.tok = lex (dfa);
	1953	}
	1954	else if (dfa->parse.tok == REPMN)
	1955	{
	1956	dfa->tindex -= nsubtoks (dfa, dfa->tindex);
	1957	dfa->parse.tok = lex (dfa);
	1958	closure (dfa);
	1959	}
	1960	else
	1961	{
	1962	addtok (dfa, dfa->parse.tok);
	1963	dfa->parse.tok = lex (dfa);
	1964	}
	1965	}
	1966
	1967	static void
	1968	branch (struct dfa* dfa)
	1969	{
	1970	closure (dfa);
	1971	while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
	1972	&& dfa->parse.tok >= 0)
	1973	{
	1974	closure (dfa);
	1975	addtok (dfa, CAT);
	1976	}
	1977	}
	1978
	1979	static void
	1980	regexp (struct dfa *dfa)
	1981	{
	1982	branch (dfa);
	1983	while (dfa->parse.tok == OR)
	1984	{
	1985	dfa->parse.tok = lex (dfa);
	1986	branch (dfa);
	1987	addtok (dfa, OR);
	1988	}
	1989	}
	1990
	1991	/* Parse a string S of length LEN into D. S can include NUL characters.
	1992	This is the main entry point for the parser. */
	1993	void
	1994	dfaparse (char const s, idx_t len, struct dfa d)
	1995	{
	1996	d->lex.ptr = s;
	1997	d->lex.left = len;
	1998	d->lex.lasttok = END;
	1999	d->lex.laststart = true;
	2000
	2001	if (!d->syntax.syntax_bits_set)
	2002	dfaerror (_("no syntax specified"));
	2003
	2004	if (!d->nregexps)
	2005	addtok (d, BEG);
	2006
	2007	d->parse.tok = lex (d);
	2008	d->parse.depth = d->depth;
	2009
	2010	regexp (d);
	2011
	2012	if (d->parse.tok != END)
	2013	dfaerror (_("unbalanced )"));
	2014
	2015	addtok (d, END - d->nregexps);
	2016	addtok (d, CAT);
	2017
	2018	if (d->nregexps)
	2019	addtok (d, OR);
	2020
	2021	++d->nregexps;
	2022	}
	2023
	2024	/* Some primitives for operating on sets of positions. */
	2025
	2026	/* Copy one set to another. */
	2027	static void
	2028	copy (position_set const src, position_set dst)
	2029	{
	2030	if (dst->alloc < src->nelem)
	2031	{
	2032	free (dst->elems);
	2033	dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
	2034	sizeof *dst->elems);
	2035	}
	2036	dst->nelem = src->nelem;
	2037	if (src->nelem != 0)
	2038	memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
	2039	}
	2040
	2041	static void
	2042	alloc_position_set (position_set *s, idx_t size)
	2043	{
	2044	s->elems = xnmalloc (size, sizeof *s->elems);
	2045	s->alloc = size;
	2046	s->nelem = 0;
	2047	}
	2048
	2049	/* Insert position P in set S. S is maintained in sorted order on
	2050	decreasing index. If there is already an entry in S with P.index
	2051	then merge (logically-OR) P's constraints into the one in S.
	2052	S->elems must point to an array large enough to hold the resulting set. */
	2053	static void
	2054	insert (position p, position_set *s)
	2055	{
	2056	idx_t count = s->nelem;
	2057	idx_t lo = 0, hi = count;
	2058	while (lo < hi)
	2059	{
	2060	idx_t mid = (lo + hi) >> 1;
	2061	if (s->elems[mid].index < p.index)
	2062	lo = mid + 1;
	2063	else if (s->elems[mid].index == p.index)
	2064	{
	2065	s->elems[mid].constraint \|= p.constraint;
	2066	return;
	2067	}
	2068	else
	2069	hi = mid;
	2070	}
	2071
	2072	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
	2073	for (idx_t i = count; i > lo; i--)
	2074	s->elems[i] = s->elems[i - 1];
	2075	s->elems[lo] = p;
	2076	++s->nelem;
	2077	}
	2078
	2079	static void
	2080	append (position p, position_set *s)
	2081	{
	2082	idx_t count = s->nelem;
	2083	s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
	2084	s->elems[s->nelem++] = p;
	2085	}
	2086
	2087	/* Merge S1 and S2 (with the additional constraint C2) into M. The
	2088	result is as if the positions of S1, and of S2 with the additional
	2089	constraint C2, were inserted into an initially empty set. */
	2090	static void
	2091	merge_constrained (position_set const s1, position_set const s2,
	2092	unsigned int c2, position_set *m)
	2093	{
	2094	idx_t i = 0, j = 0;
	2095
	2096	if (m->alloc - s1->nelem < s2->nelem)
	2097	{
	2098	free (m->elems);
	2099	m->alloc = s1->nelem;
	2100	m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
	2101	}
	2102	m->nelem = 0;
	2103	while (i < s1->nelem \|\| j < s2->nelem)
	2104	if (! (j < s2->nelem)
	2105	\|\| (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
	2106	{
	2107	unsigned int c = ((i < s1->nelem && j < s2->nelem
	2108	&& s1->elems[i].index == s2->elems[j].index)
	2109	? s2->elems[j++].constraint & c2
	2110	: 0);
	2111	m->elems[m->nelem].index = s1->elems[i].index;
	2112	m->elems[m->nelem++].constraint = s1->elems[i++].constraint \| c;
	2113	}
	2114	else
	2115	{
	2116	if (s2->elems[j].constraint & c2)
	2117	{
	2118	m->elems[m->nelem].index = s2->elems[j].index;
	2119	m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
	2120	}
	2121	j++;
	2122	}
	2123	}
	2124
	2125	/* Merge two sets of positions into a third. The result is exactly as if
	2126	the positions of both sets were inserted into an initially empty set. */
	2127	static void
	2128	merge (position_set const s1, position_set const s2, position_set *m)
	2129	{
	2130	merge_constrained (s1, s2, -1, m);
	2131	}
	2132
	2133	/* Merge into DST all the elements of SRC, possibly destroying
	2134	the contents of the temporary M. */
	2135	static void
	2136	merge2 (position_set dst, position_set const src, position_set *m)
	2137	{
	2138	if (src->nelem < 4)
	2139	{
	2140	for (idx_t i = 0; i < src->nelem; i++)
	2141	insert (src->elems[i], dst);
	2142	}
	2143	else
	2144	{
	2145	merge (src, dst, m);
	2146	copy (m, dst);
	2147	}
	2148	}
	2149
	2150	/* Delete a position from a set. Return the nonzero constraint of the
	2151	deleted position, or zero if there was no such position. */
	2152	static unsigned int
	2153	delete (idx_t del, position_set *s)
	2154	{
	2155	idx_t count = s->nelem;
	2156	idx_t lo = 0, hi = count;
	2157	while (lo < hi)
	2158	{
	2159	idx_t mid = (lo + hi) >> 1;
	2160	if (s->elems[mid].index < del)
	2161	lo = mid + 1;
	2162	else if (s->elems[mid].index == del)
	2163	{
	2164	unsigned int c = s->elems[mid].constraint;
	2165	idx_t i;
	2166	for (i = mid; i + 1 < count; i++)
	2167	s->elems[i] = s->elems[i + 1];
	2168	s->nelem = i;
	2169	return c;
	2170	}
	2171	else
	2172	hi = mid;
	2173	}
	2174	return 0;
	2175	}
	2176
	2177	/* Replace a position with the followed set. */
	2178	static void
	2179	replace (position_set dst, idx_t del, position_set add,
	2180	unsigned int constraint, position_set *tmp)
	2181	{
	2182	unsigned int c = delete (del, dst) & constraint;
	2183
	2184	if (c)
	2185	{
	2186	copy (dst, tmp);
	2187	merge_constrained (tmp, add, c, dst);
	2188	}
	2189	}
	2190
	2191	/* Find the index of the state corresponding to the given position set with
	2192	the given preceding context, or create a new state if there is no such
	2193	state. Context tells whether we got here on a newline or letter. */
	2194	static state_num
	2195	state_index (struct dfa d, position_set const s, int context)
	2196	{
	2197	size_t hash = 0;
	2198	int constraint = 0;
	2199	state_num i;
	2200
	2201	for (i = 0; i < s->nelem; ++i)
	2202	{
	2203	idx_t ind = s->elems[i].index;
	2204	hash ^= ind + s->elems[i].constraint;
	2205	}
	2206
	2207	/* Try to find a state that exactly matches the proposed one. */
	2208	for (i = 0; i < d->sindex; ++i)
	2209	{
	2210	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
	2211	\|\| context != d->states[i].context)
	2212	continue;
	2213	state_num j;
	2214	for (j = 0; j < s->nelem; ++j)
	2215	if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
	2216	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
	2217	break;
	2218	if (j == s->nelem)
	2219	return i;
	2220	}
	2221
	2222	#ifdef DEBUG
	2223	fprintf (stderr, "new state %td\n nextpos:", i);
	2224	for (state_num j = 0; j < s->nelem; j++)
	2225	{
	2226	fprintf (stderr, " %td:", s->elems[j].index);
	2227	prtok (d->tokens[s->elems[j].index]);
	2228	}
	2229	fprintf (stderr, "\n context:");
	2230	if (context ^ CTX_ANY)
	2231	{
	2232	if (context & CTX_NONE)
	2233	fprintf (stderr, " CTX_NONE");
	2234	if (context & CTX_LETTER)
	2235	fprintf (stderr, " CTX_LETTER");
	2236	if (context & CTX_NEWLINE)
	2237	fprintf (stderr, " CTX_NEWLINE");
	2238	}
	2239	else
	2240	fprintf (stderr, " CTX_ANY");
	2241	fprintf (stderr, "\n");
	2242	#endif
	2243
	2244	for (state_num j = 0; j < s->nelem; j++)
	2245	{
	2246	int c = d->constraints[s->elems[j].index];
	2247
	2248	if (c != 0)
	2249	{
	2250	if (succeeds_in_context (c, context, CTX_ANY))
	2251	constraint \|= c;
	2252	}
	2253	else if (d->tokens[s->elems[j].index] == BACKREF)
	2254	constraint = NO_CONSTRAINT;
	2255	}
	2256
	2257
	2258	/* Create a new state. */
	2259	d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
	2260	sizeof *d->states);
	2261	d->states[i].hash = hash;
	2262	alloc_position_set (&d->states[i].elems, s->nelem);
	2263	copy (s, &d->states[i].elems);
	2264	d->states[i].context = context;
	2265	d->states[i].constraint = constraint;
	2266	d->states[i].mbps.nelem = 0;
	2267	d->states[i].mbps.elems = NULL;
	2268	d->states[i].mb_trindex = -1;
	2269
	2270	++d->sindex;
	2271
	2272	return i;
	2273	}
	2274
	2275	/* Find the epsilon closure of D's set of positions. If any position of the set
	2276	contains a symbol that matches the empty string in some context, replace
	2277	that position with the elements of its follow labeled with an appropriate
	2278	constraint. Repeat exhaustively until no funny positions are left.
	2279	S->elems must be large enough to hold the result. BACKWARD is D's
	2280	backward set; use and update it too. */
	2281	static void
	2282	epsclosure (struct dfa const d, position_set backward)
	2283	{
	2284	position_set tmp;
	2285	alloc_position_set (&tmp, d->nleaves);
	2286	for (idx_t i = 0; i < d->tindex; i++)
	2287	if (0 < d->follows[i].nelem)
	2288	{
	2289	unsigned int constraint;
	2290	switch (d->tokens[i])
	2291	{
	2292	default:
	2293	continue;
	2294
	2295	case BEGLINE:
	2296	constraint = BEGLINE_CONSTRAINT;
	2297	break;
	2298	case ENDLINE:
	2299	constraint = ENDLINE_CONSTRAINT;
	2300	break;
	2301	case BEGWORD:
	2302	constraint = BEGWORD_CONSTRAINT;
	2303	break;
	2304	case ENDWORD:
	2305	constraint = ENDWORD_CONSTRAINT;
	2306	break;
	2307	case LIMWORD:
	2308	constraint = LIMWORD_CONSTRAINT;
	2309	break;
	2310	case NOTLIMWORD:
	2311	constraint = NOTLIMWORD_CONSTRAINT;
	2312	break;
	2313	case EMPTY:
	2314	constraint = NO_CONSTRAINT;
	2315	break;
	2316	}
	2317
	2318	delete (i, &d->follows[i]);
	2319
	2320	for (idx_t j = 0; j < backward[i].nelem; j++)
	2321	replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
	2322	constraint, &tmp);
	2323	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2324	replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
	2325	NO_CONSTRAINT, &tmp);
	2326	}
	2327	free (tmp.elems);
	2328	}
	2329
	2330	/* Returns the set of contexts for which there is at least one
	2331	character included in C. */
	2332
	2333	static int
	2334	charclass_context (struct dfa const dfa, charclass const c)
	2335	{
	2336	int context = 0;
	2337
	2338	for (int j = 0; j < CHARCLASS_WORDS; j++)
	2339	{
	2340	if (c->w[j] & dfa->syntax.newline.w[j])
	2341	context \|= CTX_NEWLINE;
	2342	if (c->w[j] & dfa->syntax.letters.w[j])
	2343	context \|= CTX_LETTER;
	2344	if (c->w[j] & ~(dfa->syntax.letters.w[j] \| dfa->syntax.newline.w[j]))
	2345	context \|= CTX_NONE;
	2346	}
	2347
	2348	return context;
	2349	}
	2350
	2351	/* Returns the contexts on which the position set S depends. Each context
	2352	in the set of returned contexts (let's call it SC) may have a different
	2353	follow set than other contexts in SC, and also different from the
	2354	follow set of the complement set (sc ^ CTX_ANY). However, all contexts
	2355	in the complement set will have the same follow set. */
	2356
	2357	static int _GL_ATTRIBUTE_PURE
	2358	state_separate_contexts (struct dfa d, position_set const s)
	2359	{
	2360	int separate_contexts = 0;
	2361
	2362	for (idx_t j = 0; j < s->nelem; j++)
	2363	separate_contexts \|= d->separates[s->elems[j].index];
	2364
	2365	return separate_contexts;
	2366	}
	2367
	2368	enum
	2369	{
	2370	/* Single token is repeated. It is distinguished from non-repeated. */
	2371	OPT_REPEAT = (1 << 0),
	2372
	2373	/* Multiple tokens are repeated. This flag is on at head of tokens. The
	2374	node is not merged. */
	2375	OPT_LPAREN = (1 << 1),
	2376
	2377	/* Multiple branches are joined. The node is not merged. */
	2378	OPT_RPAREN = (1 << 2),
	2379
	2380	/* The node is walked. If the node is found in walking again, OPT_RPAREN
	2381	flag is turned on. */
	2382	OPT_WALKED = (1 << 3),
	2383
	2384	/* The node is queued. The node is not queued again. */
	2385	OPT_QUEUED = (1 << 4)
	2386	};
	2387
	2388	static void
	2389	merge_nfa_state (struct dfa d, idx_t tindex, char flags,
	2390	position_set *merged)
	2391	{
	2392	position_set *follows = d->follows;
	2393	idx_t nelem = 0;
	2394
	2395	for (idx_t i = 0; i < follows[tindex].nelem; i++)
	2396	{
	2397	idx_t sindex = follows[tindex].elems[i].index;
	2398
	2399	/* Skip the node as pruned in future. */
	2400	unsigned int iconstraint = follows[tindex].elems[i].constraint;
	2401	if (iconstraint == 0)
	2402	continue;
	2403
	2404	if (d->tokens[follows[tindex].elems[i].index] <= END)
	2405	{
	2406	d->constraints[tindex] \|= follows[tindex].elems[i].constraint;
	2407	continue;
	2408	}
	2409
	2410	if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN \| OPT_RPAREN)))
	2411	{
	2412	idx_t j;
	2413
	2414	for (j = 0; j < nelem; j++)
	2415	{
	2416	idx_t dindex = follows[tindex].elems[j].index;
	2417
	2418	if (dindex == tindex)
	2419	continue;
	2420
	2421	if (follows[tindex].elems[j].constraint != iconstraint)
	2422	continue;
	2423
	2424	if (flags[dindex] & (OPT_LPAREN \| OPT_RPAREN))
	2425	continue;
	2426
	2427	if (d->tokens[sindex] != d->tokens[dindex])
	2428	continue;
	2429
	2430	if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
	2431	continue;
	2432
	2433	if (flags[sindex] & OPT_REPEAT)
	2434	delete (sindex, &follows[sindex]);
	2435
	2436	merge2 (&follows[dindex], &follows[sindex], merged);
	2437
	2438	break;
	2439	}
	2440
	2441	if (j < nelem)
	2442	continue;
	2443	}
	2444
	2445	follows[tindex].elems[nelem++] = follows[tindex].elems[i];
	2446	flags[sindex] \|= OPT_QUEUED;
	2447	}
	2448
	2449	follows[tindex].nelem = nelem;
	2450	}
	2451
	2452	static int
	2453	compare (const void a, const void b)
	2454	{
	2455	position const p = a, q = b;
	2456	return (p->index > q->index) - (p->index < q->index);
	2457	}
	2458
	2459	static void
	2460	reorder_tokens (struct dfa *d)
	2461	{
	2462	idx_t nleaves = 0;
	2463	ptrdiff_t map = xnmalloc (d->tindex, sizeof map);
	2464	map[0] = nleaves++;
	2465	for (idx_t i = 1; i < d->tindex; i++)
	2466	map[i] = -1;
	2467
	2468	token tokens = xnmalloc (d->nleaves, sizeof tokens);
	2469	position_set follows = xnmalloc (d->nleaves, sizeof follows);
	2470	int constraints = xnmalloc (d->nleaves, sizeof constraints);
	2471	char *multibyte_prop = (d->localeinfo.multibyte
	2472	? xnmalloc (d->nleaves, sizeof *multibyte_prop)
	2473	: NULL);
	2474
	2475	for (idx_t i = 0; i < d->tindex; i++)
	2476	{
	2477	if (map[i] < 0)
	2478	{
	2479	free (d->follows[i].elems);
	2480	d->follows[i].elems = NULL;
	2481	d->follows[i].nelem = 0;
	2482	continue;
	2483	}
	2484
	2485	tokens[map[i]] = d->tokens[i];
	2486	follows[map[i]] = d->follows[i];
	2487	constraints[map[i]] = d->constraints[i];
	2488
	2489	if (multibyte_prop != NULL)
	2490	multibyte_prop[map[i]] = d->multibyte_prop[i];
	2491
	2492	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2493	{
	2494	if (map[d->follows[i].elems[j].index] == -1)
	2495	map[d->follows[i].elems[j].index] = nleaves++;
	2496
	2497	d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
	2498	}
	2499
	2500	qsort (d->follows[i].elems, d->follows[i].nelem,
	2501	sizeof *d->follows[i].elems, compare);
	2502	}
	2503
	2504	for (idx_t i = 0; i < nleaves; i++)
	2505	{
	2506	d->tokens[i] = tokens[i];
	2507	d->follows[i] = follows[i];
	2508	d->constraints[i] = constraints[i];
	2509
	2510	if (multibyte_prop != NULL)
	2511	d->multibyte_prop[i] = multibyte_prop[i];
	2512	}
	2513
	2514	d->tindex = d->nleaves = nleaves;
	2515
	2516	free (tokens);
	2517	free (follows);
	2518	free (constraints);
	2519	free (multibyte_prop);
	2520	free (map);
	2521	}
	2522
	2523	static void
	2524	dfaoptimize (struct dfa *d)
	2525	{
	2526	char *flags = xizalloc (d->tindex);
	2527
	2528	for (idx_t i = 0; i < d->tindex; i++)
	2529	{
	2530	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2531	{
	2532	if (d->follows[i].elems[j].index == i)
	2533	flags[d->follows[i].elems[j].index] \|= OPT_REPEAT;
	2534	else if (d->follows[i].elems[j].index < i)
	2535	flags[d->follows[i].elems[j].index] \|= OPT_LPAREN;
	2536	else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
	2537	flags[d->follows[i].elems[j].index] \|= OPT_RPAREN;
	2538	else
	2539	flags[d->follows[i].elems[j].index] \|= OPT_WALKED;
	2540	}
	2541	}
	2542
	2543	flags[0] \|= OPT_QUEUED;
	2544
	2545	position_set merged0;
	2546	position_set *merged = &merged0;
	2547	alloc_position_set (merged, d->nleaves);
	2548
	2549	d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
	2550
	2551	for (idx_t i = 0; i < d->tindex; i++)
	2552	if (flags[i] & OPT_QUEUED)
	2553	merge_nfa_state (d, i, flags, merged);
	2554
	2555	reorder_tokens (d);
	2556
	2557	free (merged->elems);
	2558	free (flags);
	2559	}
	2560
	2561	/* Perform bottom-up analysis on the parse tree, computing various functions.
	2562	Note that at this point, we're pretending constructs like \< are real
	2563	characters rather than constraints on what can follow them.
	2564
	2565	Nullable: A node is nullable if it is at the root of a regexp that can
	2566	match the empty string.
	2567	* EMPTY leaves are nullable.
	2568	* No other leaf is nullable.
	2569	* A QMARK or STAR node is nullable.
	2570	* A PLUS node is nullable if its argument is nullable.
	2571	* A CAT node is nullable if both its arguments are nullable.
	2572	* An OR node is nullable if either argument is nullable.
	2573
	2574	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
	2575	that could correspond to the first character of a string matching the
	2576	regexp rooted at the given node.
	2577	* EMPTY leaves have empty firstpos.
	2578	* The firstpos of a nonempty leaf is that leaf itself.
	2579	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
	2580	argument.
	2581	* The firstpos of a CAT node is the firstpos of the left argument, union
	2582	the firstpos of the right if the left argument is nullable.
	2583	* The firstpos of an OR node is the union of firstpos of each argument.
	2584
	2585	Lastpos: The lastpos of a node is the set of positions that could
	2586	correspond to the last character of a string matching the regexp at
	2587	the given node.
	2588	* EMPTY leaves have empty lastpos.
	2589	* The lastpos of a nonempty leaf is that leaf itself.
	2590	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
	2591	argument.
	2592	* The lastpos of a CAT node is the lastpos of its right argument, union
	2593	the lastpos of the left if the right argument is nullable.
	2594	* The lastpos of an OR node is the union of the lastpos of each argument.
	2595
	2596	Follow: The follow of a position is the set of positions that could
	2597	correspond to the character following a character matching the node in
	2598	a string matching the regexp. At this point we consider special symbols
	2599	that match the empty string in some context to be just normal characters.
	2600	Later, if we find that a special symbol is in a follow set, we will
	2601	replace it with the elements of its follow, labeled with an appropriate
	2602	constraint.
	2603	* Every node in the firstpos of the argument of a STAR or PLUS node is in
	2604	the follow of every node in the lastpos.
	2605	* Every node in the firstpos of the second argument of a CAT node is in
	2606	the follow of every node in the lastpos of the first argument.
	2607
	2608	Because of the postfix representation of the parse tree, the depth-first
	2609	analysis is conveniently done by a linear scan with the aid of a stack.
	2610	Sets are stored as arrays of the elements, obeying a stack-like allocation
	2611	scheme; the number of elements in each set deeper in the stack can be
	2612	used to determine the address of a particular set's array. */
	2613	static void
	2614	dfaanalyze (struct dfa *d, bool searchflag)
	2615	{
	2616	/* Array allocated to hold position sets. */
	2617	position posalloc = xnmalloc (d->nleaves, 2 sizeof *posalloc);
	2618	/* Firstpos and lastpos elements. */
	2619	position *firstpos = posalloc;
	2620	position *lastpos = firstpos + d->nleaves;
	2621	position pos;
	2622	position_set tmp;
	2623
	2624	/* Stack for element counts and nullable flags. */
	2625	struct
	2626	{
	2627	/* Whether the entry is nullable. */
	2628	bool nullable;
	2629
	2630	/* Counts of firstpos and lastpos sets. */
	2631	idx_t nfirstpos;
	2632	idx_t nlastpos;
	2633	} stkalloc = xnmalloc (d->depth, sizeof stkalloc), *stk = stkalloc;
	2634
	2635	position_set merged; /* Result of merging sets. */
	2636
	2637	addtok (d, CAT);
	2638	idx_t tindex = d->tindex;
	2639
	2640	#ifdef DEBUG
	2641	fprintf (stderr, "dfaanalyze:\n");
	2642	for (idx_t i = 0; i < tindex; i++)
	2643	{
	2644	fprintf (stderr, " %td:", i);
	2645	prtok (d->tokens[i]);
	2646	}
	2647	putc ('\n', stderr);
	2648	#endif
	2649
	2650	d->searchflag = searchflag;
	2651	alloc_position_set (&merged, d->nleaves);
	2652	d->follows = xicalloc (tindex, sizeof *d->follows);
	2653	position_set *backward
	2654	= d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
	2655
	2656	for (idx_t i = 0; i < tindex; i++)
	2657	{
	2658	switch (d->tokens[i])
	2659	{
	2660	case EMPTY:
	2661	/* The empty set is nullable. */
	2662	stk->nullable = true;
	2663
	2664	/* The firstpos and lastpos of the empty leaf are both empty. */
	2665	stk->nfirstpos = stk->nlastpos = 0;
	2666	stk++;
	2667	break;
	2668
	2669	case STAR:
	2670	case PLUS:
	2671	/* Every element in the lastpos of the argument is in the backward
	2672	set of every element in the firstpos. */
	2673	if (d->epsilon)
	2674	{
	2675	tmp.elems = lastpos - stk[-1].nlastpos;
	2676	tmp.nelem = stk[-1].nlastpos;
	2677	for (position *p = firstpos - stk[-1].nfirstpos;
	2678	p < firstpos; p++)
	2679	merge2 (&backward[p->index], &tmp, &merged);
	2680	}
	2681
	2682	/* Every element in the firstpos of the argument is in the follow
	2683	of every element in the lastpos. */
	2684	{
	2685	tmp.elems = firstpos - stk[-1].nfirstpos;
	2686	tmp.nelem = stk[-1].nfirstpos;
	2687	for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
	2688	merge2 (&d->follows[p->index], &tmp, &merged);
	2689	}
	2690	FALLTHROUGH;
	2691	case QMARK:
	2692	/* A QMARK or STAR node is automatically nullable. */
	2693	if (d->tokens[i] != PLUS)
	2694	stk[-1].nullable = true;
	2695	break;
	2696
	2697	case CAT:
	2698	/* Every element in the lastpos of the first argument is in
	2699	the backward set of every element in the firstpos of the
	2700	second argument. */
	2701	if (backward)
	2702	{
	2703	tmp.nelem = stk[-2].nlastpos;
	2704	tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
	2705	for (position *p = firstpos - stk[-1].nfirstpos;
	2706	p < firstpos; p++)
	2707	merge2 (&backward[p->index], &tmp, &merged);
	2708	}
	2709
	2710	/* Every element in the firstpos of the second argument is in the
	2711	follow of every element in the lastpos of the first argument. */
	2712	{
	2713	tmp.nelem = stk[-1].nfirstpos;
	2714	tmp.elems = firstpos - stk[-1].nfirstpos;
	2715	for (position *plim = lastpos - stk[-1].nlastpos,
	2716	*p = plim - stk[-2].nlastpos;
	2717	p < plim; p++)
	2718	merge2 (&d->follows[p->index], &tmp, &merged);
	2719	}
	2720
	2721	/* The firstpos of a CAT node is the firstpos of the first argument,
	2722	union that of the second argument if the first is nullable. */
	2723	if (stk[-2].nullable)
	2724	stk[-2].nfirstpos += stk[-1].nfirstpos;
	2725	else
	2726	firstpos -= stk[-1].nfirstpos;
	2727
	2728	/* The lastpos of a CAT node is the lastpos of the second argument,
	2729	union that of the first argument if the second is nullable. */
	2730	if (stk[-1].nullable)
	2731	stk[-2].nlastpos += stk[-1].nlastpos;
	2732	else
	2733	{
	2734	position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
	2735	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
	2736	p[j] = p[j + stk[-2].nlastpos];
	2737	lastpos -= stk[-2].nlastpos;
	2738	stk[-2].nlastpos = stk[-1].nlastpos;
	2739	}
	2740
	2741	/* A CAT node is nullable if both arguments are nullable. */
	2742	stk[-2].nullable &= stk[-1].nullable;
	2743	stk--;
	2744	break;
	2745
	2746	case OR:
	2747	/* The firstpos is the union of the firstpos of each argument. */
	2748	stk[-2].nfirstpos += stk[-1].nfirstpos;
	2749
	2750	/* The lastpos is the union of the lastpos of each argument. */
	2751	stk[-2].nlastpos += stk[-1].nlastpos;
	2752
	2753	/* An OR node is nullable if either argument is nullable. */
	2754	stk[-2].nullable \|= stk[-1].nullable;
	2755	stk--;
	2756	break;
	2757
	2758	default:
	2759	/* Anything else is a nonempty position. (Note that special
	2760	constructs like \< are treated as nonempty strings here;
	2761	an "epsilon closure" effectively makes them nullable later.
	2762	Backreferences have to get a real position so we can detect
	2763	transitions on them later. But they are nullable. */
	2764	stk->nullable = d->tokens[i] == BACKREF;
	2765
	2766	/* This position is in its own firstpos and lastpos. */
	2767	stk->nfirstpos = stk->nlastpos = 1;
	2768	stk++;
	2769
	2770	firstpos->index = lastpos->index = i;
	2771	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
	2772	firstpos++, lastpos++;
	2773
	2774	break;
	2775	}
	2776	#ifdef DEBUG
	2777	/* ... balance the above nonsyntactic #ifdef goo... */
	2778	fprintf (stderr, "node %td:", i);
	2779	prtok (d->tokens[i]);
	2780	putc ('\n', stderr);
	2781	fprintf (stderr,
	2782	stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
	2783	fprintf (stderr, " firstpos:");
	2784	for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
	2785	{
	2786	fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
	2787	prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
	2788	}
	2789	fprintf (stderr, "\n lastpos:");
	2790	for (idx_t j = 0; j < stk[-1].nlastpos; j++)
	2791	{
	2792	fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
	2793	prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
	2794	}
	2795	putc ('\n', stderr);
	2796	#endif
	2797	}
	2798
	2799	if (backward)
	2800	{
	2801	/* For each follow set that is the follow set of a real position,
	2802	replace it with its epsilon closure. */
	2803	epsclosure (d, backward);
	2804
	2805	for (idx_t i = 0; i < tindex; i++)
	2806	free (backward[i].elems);
	2807	free (backward);
	2808	}
	2809
	2810	dfaoptimize (d);
	2811
	2812	#ifdef DEBUG
	2813	for (idx_t i = 0; i < tindex; i++)
	2814	if (d->tokens[i] == BEG \|\| d->tokens[i] < NOTCHAR
	2815	\|\| d->tokens[i] == BACKREF \|\| d->tokens[i] == ANYCHAR
	2816	\|\| d->tokens[i] == MBCSET \|\| d->tokens[i] >= CSET)
	2817	{
	2818	fprintf (stderr, "follows(%td:", i);
	2819	prtok (d->tokens[i]);
	2820	fprintf (stderr, "):");
	2821	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2822	{
	2823	fprintf (stderr, " %td:", d->follows[i].elems[j].index);
	2824	prtok (d->tokens[d->follows[i].elems[j].index]);
	2825	}
	2826	putc ('\n', stderr);
	2827	}
	2828	#endif
	2829
	2830	pos.index = 0;
	2831	pos.constraint = NO_CONSTRAINT;
	2832
	2833	alloc_position_set (&tmp, 1);
	2834
	2835	append (pos, &tmp);
	2836
	2837	d->separates = xicalloc (tindex, sizeof *d->separates);
	2838
	2839	for (idx_t i = 0; i < tindex; i++)
	2840	{
	2841	if (prev_newline_dependent (d->constraints[i]))
	2842	d->separates[i] \|= CTX_NEWLINE;
	2843	if (prev_letter_dependent (d->constraints[i]))
	2844	d->separates[i] \|= CTX_LETTER;
	2845
	2846	for (idx_t j = 0; j < d->follows[i].nelem; j++)
	2847	{
	2848	if (prev_newline_dependent (d->follows[i].elems[j].constraint))
	2849	d->separates[i] \|= CTX_NEWLINE;
	2850	if (prev_letter_dependent (d->follows[i].elems[j].constraint))
	2851	d->separates[i] \|= CTX_LETTER;
	2852	}
	2853	}
	2854
	2855	/* Context wanted by some position. */
	2856	int separate_contexts = state_separate_contexts (d, &tmp);
	2857
	2858	/* Build the initial state. */
	2859	if (separate_contexts & CTX_NEWLINE)
	2860	state_index (d, &tmp, CTX_NEWLINE);
	2861	d->initstate_notbol = d->min_trcount
	2862	= state_index (d, &tmp, separate_contexts ^ CTX_ANY);
	2863	if (separate_contexts & CTX_LETTER)
	2864	d->min_trcount = state_index (d, &tmp, CTX_LETTER);
	2865	d->min_trcount++;
	2866	d->trcount = 0;
	2867
	2868	free (posalloc);
	2869	free (stkalloc);
	2870	free (merged.elems);
	2871	free (tmp.elems);
	2872	}
	2873
	2874	/* Make sure D's state arrays are large enough to hold NEW_STATE. */
	2875	static void
	2876	realloc_trans_if_necessary (struct dfa *d)
	2877	{
	2878	state_num oldalloc = d->tralloc;
	2879	if (oldalloc < d->sindex)
	2880	{
	2881	state_num **realtrans = d->trans ? d->trans - 2 : NULL;
	2882	idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
	2883	realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
	2884	-1, sizeof *realtrans);
	2885	realtrans[0] = realtrans[1] = NULL;
	2886	d->trans = realtrans + 2;
	2887	idx_t newalloc = d->tralloc = newalloc1 - 2;
	2888	d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
	2889	d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
	2890	d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
	2891	if (d->localeinfo.multibyte)
	2892	{
	2893	realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
	2894	realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
	2895	if (oldalloc == 0)
	2896	realtrans[0] = realtrans[1] = NULL;
	2897	d->mb_trans = realtrans + 2;
	2898	}
	2899	for (; oldalloc < newalloc; oldalloc++)
	2900	{
	2901	d->trans[oldalloc] = NULL;
	2902	d->fails[oldalloc] = NULL;
	2903	if (d->localeinfo.multibyte)
	2904	d->mb_trans[oldalloc] = NULL;
	2905	}
	2906	}
	2907	}
	2908
	2909	/*
	2910	Calculate the transition table for a new state derived from state s
	2911	for a compiled dfa d after input character uc, and return the new
	2912	state number.
	2913
	2914	Do not worry about all possible input characters; calculate just the group
	2915	of positions that match uc. Label it with the set of characters that
	2916	every position in the group matches (taking into account, if necessary,
	2917	preceding context information of s). Then find the union
	2918	of these positions' follows, i.e., the set of positions of the
	2919	new state. For each character in the group's label, set the transition
	2920	on this character to be to a state corresponding to the set's positions,
	2921	and its associated backward context information, if necessary.
	2922
	2923	When building a searching matcher, include the positions of state
	2924	0 in every state.
	2925
	2926	The group is constructed by building an equivalence-class
	2927	partition of the positions of s.
	2928
	2929	For each position, find the set of characters C that it matches. Eliminate
	2930	any characters from C that fail on grounds of backward context.
	2931
	2932	Check whether the group's label L has nonempty
	2933	intersection with C. If L - C is nonempty, create a new group labeled
	2934	L - C and having the same positions as the current group, and set L to
	2935	the intersection of L and C. Insert the position in the group, set
	2936	C = C - L, and resume scanning.
	2937
	2938	If after comparing with every group there are characters remaining in C,
	2939	create a new group labeled with the characters of C and insert this
	2940	position in that group. */
	2941
	2942	static state_num
	2943	build_state (state_num s, struct dfa *d, unsigned char uc)
	2944	{
	2945	position_set follows; /* Union of the follows for each
	2946	position of the current state. */
	2947	position_set group; /* Positions that match the input char. */
	2948	position_set tmp; /* Temporary space for merging sets. */
	2949	state_num state; /* New state. */
	2950	state_num state_newline; /* New state on a newline transition. */
	2951	state_num state_letter; /* New state on a letter transition. */
	2952
	2953	#ifdef DEBUG
	2954	fprintf (stderr, "build state %td\n", s);
	2955	#endif
	2956
	2957	/* A pointer to the new transition table, and the table itself. */
	2958	state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
	2959	state_num trans = ptrans;
	2960
	2961	if (!trans)
	2962	{
	2963	/* MAX_TRCOUNT is an arbitrary upper limit on the number of
	2964	transition tables that can exist at once, other than for
	2965	initial states. Often-used transition tables are quickly
	2966	rebuilt, whereas rarely-used ones are cleared away. */
	2967	if (MAX_TRCOUNT <= d->trcount)
	2968	{
	2969	for (state_num i = d->min_trcount; i < d->tralloc; i++)
	2970	{
	2971	free (d->trans[i]);
	2972	free (d->fails[i]);
	2973	d->trans[i] = d->fails[i] = NULL;
	2974	}
	2975	d->trcount = 0;
	2976	}
	2977
	2978	d->trcount++;
	2979	ptrans = trans = xmalloc (NOTCHAR sizeof *trans);
	2980
	2981	/* Fill transition table with a default value which means that the
	2982	transited state has not been calculated yet. */
	2983	for (int i = 0; i < NOTCHAR; i++)
	2984	trans[i] = -2;
	2985	}
	2986
	2987	/* Set up the success bits for this state. */
	2988	d->success[s] = 0;
	2989	if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
	2990	d->success[s] \|= CTX_NEWLINE;
	2991	if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
	2992	d->success[s] \|= CTX_LETTER;
	2993	if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
	2994	d->success[s] \|= CTX_NONE;
	2995
	2996	alloc_position_set (&follows, d->nleaves);
	2997
	2998	/* Find the union of the follows of the positions of the group.
	2999	This is a hideously inefficient loop. Fix it someday. */
	3000	for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
	3001	for (idx_t k = 0;
	3002	k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
	3003	insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
	3004	&follows);
	3005
	3006	/* Positions that match the input char. */
	3007	alloc_position_set (&group, d->nleaves);
	3008
	3009	/* The group's label. */
	3010	charclass label;
	3011	fillset (&label);
	3012
	3013	for (idx_t i = 0; i < follows.nelem; i++)
	3014	{
	3015	charclass matches; /* Set of matching characters. */
	3016	position pos = follows.elems[i];
	3017	bool matched = false;
	3018	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
	3019	{
	3020	zeroset (&matches);
	3021	setbit (d->tokens[pos.index], &matches);
	3022	if (d->tokens[pos.index] == uc)
	3023	matched = true;
	3024	}
	3025	else if (d->tokens[pos.index] >= CSET)
	3026	{
	3027	matches = d->charclasses[d->tokens[pos.index] - CSET];
	3028	if (tstbit (uc, &matches))
	3029	matched = true;
	3030	}
	3031	else if (d->tokens[pos.index] == ANYCHAR)
	3032	{
	3033	matches = d->charclasses[d->canychar];
	3034	if (tstbit (uc, &matches))
	3035	matched = true;
	3036
	3037	/* ANYCHAR must match with a single character, so we must put
	3038	it to D->states[s].mbps which contains the positions which
	3039	can match with a single character not a byte. If all
	3040	positions which has ANYCHAR does not depend on context of
	3041	next character, we put the follows instead of it to
	3042	D->states[s].mbps to optimize. */
	3043	if (succeeds_in_context (pos.constraint, d->states[s].context,
	3044	CTX_NONE))
	3045	{
	3046	if (d->states[s].mbps.nelem == 0)
	3047	alloc_position_set (&d->states[s].mbps, 1);
	3048	insert (pos, &d->states[s].mbps);
	3049	}
	3050	}
	3051	else
	3052	continue;
	3053
	3054	/* Some characters may need to be eliminated from matches because
	3055	they fail in the current context. */
	3056	if (pos.constraint != NO_CONSTRAINT)
	3057	{
	3058	if (!succeeds_in_context (pos.constraint,
	3059	d->states[s].context, CTX_NEWLINE))
	3060	for (int j = 0; j < CHARCLASS_WORDS; j++)
	3061	matches.w[j] &= ~d->syntax.newline.w[j];
	3062	if (!succeeds_in_context (pos.constraint,
	3063	d->states[s].context, CTX_LETTER))
	3064	for (int j = 0; j < CHARCLASS_WORDS; ++j)
	3065	matches.w[j] &= ~d->syntax.letters.w[j];
	3066	if (!succeeds_in_context (pos.constraint,
	3067	d->states[s].context, CTX_NONE))
	3068	for (int j = 0; j < CHARCLASS_WORDS; ++j)
	3069	matches.w[j] &= d->syntax.letters.w[j] \| d->syntax.newline.w[j];
	3070
	3071	/* If there are no characters left, there's no point in going on. */
	3072	if (emptyset (&matches))
	3073	continue;
	3074
	3075	/* If we have reset the bit that made us declare "matched", reset
	3076	that indicator, too. This is required to avoid an infinite loop
	3077	with this command: echo cx \| LC_ALL=C grep -E 'c\b[x ]' */
	3078	if (!tstbit (uc, &matches))
	3079	matched = false;
	3080	}
	3081
	3082	#ifdef DEBUG
	3083	fprintf (stderr, " nextpos %td:", pos.index);
	3084	prtok (d->tokens[pos.index]);
	3085	fprintf (stderr, " of");
	3086	for (unsigned j = 0; j < NOTCHAR; j++)
	3087	if (tstbit (j, &matches))
	3088	fprintf (stderr, " 0x%02x", j);
	3089	fprintf (stderr, "\n");
	3090	#endif
	3091
	3092	if (matched)
	3093	{
	3094	for (int k = 0; k < CHARCLASS_WORDS; ++k)
	3095	label.w[k] &= matches.w[k];
	3096	append (pos, &group);
	3097	}
	3098	else
	3099	{
	3100	for (int k = 0; k < CHARCLASS_WORDS; ++k)
	3101	label.w[k] &= ~matches.w[k];
	3102	}
	3103	}
	3104
	3105	alloc_position_set (&tmp, d->nleaves);
	3106
	3107	if (group.nelem > 0)
	3108	{
	3109	/* If we are building a searching matcher, throw in the positions
	3110	of state 0 as well, if possible. */
	3111	if (d->searchflag)
	3112	{
	3113	/* If a token in follows.elems is not 1st byte of a multibyte
	3114	character, or the states of follows must accept the bytes
	3115	which are not 1st byte of the multibyte character.
	3116	Then, if a state of follows encounters a byte, it must not be
	3117	a 1st byte of a multibyte character nor a single byte character.
	3118	In this case, do not add state[0].follows to next state, because
	3119	state[0] must accept 1st-byte.
	3120
	3121	For example, suppose <sb a> is a certain single byte character,
	3122	<mb A> is a certain multibyte character, and the codepoint of
	3123	<sb a> equals the 2nd byte of the codepoint of <mb A>. When
	3124	state[0] accepts <sb a>, state[i] transits to state[i+1] by
	3125	accepting the 1st byte of <mb A>, and state[i+1] accepts the
	3126	2nd byte of <mb A>, if state[i+1] encounters the codepoint of
	3127	<sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
	3128	not add state[0]. */
	3129
	3130	bool mergeit = !d->localeinfo.multibyte;
	3131	if (!mergeit)
	3132	{
	3133	mergeit = true;
	3134	for (idx_t j = 0; mergeit && j < group.nelem; j++)
	3135	mergeit &= d->multibyte_prop[group.elems[j].index];
	3136	}
	3137	if (mergeit)
	3138	merge2 (&group, &d->states[0].elems, &tmp);
	3139	}
	3140
	3141	/* Find out if the new state will want any context information,
	3142	by calculating possible contexts that the group can match,
	3143	and separate contexts that the new state wants to know. */
	3144	int possible_contexts = charclass_context (d, &label);
	3145	int separate_contexts = state_separate_contexts (d, &group);
	3146
	3147	/* Find the state(s) corresponding to the union of the follows. */
	3148	if (possible_contexts & ~separate_contexts)
	3149	state = state_index (d, &group, separate_contexts ^ CTX_ANY);
	3150	else
	3151	state = -1;
	3152	if (separate_contexts & possible_contexts & CTX_NEWLINE)
	3153	state_newline = state_index (d, &group, CTX_NEWLINE);
	3154	else
	3155	state_newline = state;
	3156	if (separate_contexts & possible_contexts & CTX_LETTER)
	3157	state_letter = state_index (d, &group, CTX_LETTER);
	3158	else
	3159	state_letter = state;
	3160
	3161	/* Reallocate now, to reallocate any newline transition properly. */
	3162	realloc_trans_if_necessary (d);
	3163	}
	3164
	3165	/* If we are a searching matcher, the default transition is to a state
	3166	containing the positions of state 0, otherwise the default transition
	3167	is to fail miserably. */
	3168	else if (d->searchflag)
	3169	{
	3170	state_newline = 0;
	3171	state_letter = d->min_trcount - 1;
	3172	state = d->initstate_notbol;
	3173	}
	3174	else
	3175	{
	3176	state_newline = -1;
	3177	state_letter = -1;
	3178	state = -1;
	3179	}
	3180
	3181	/* Set the transitions for each character in the label. */
	3182	for (int i = 0; i < NOTCHAR; i++)
	3183	if (tstbit (i, &label))
	3184	switch (d->syntax.sbit[i])
	3185	{
	3186	case CTX_NEWLINE:
	3187	trans[i] = state_newline;
	3188	break;
	3189	case CTX_LETTER:
	3190	trans[i] = state_letter;
	3191	break;
	3192	default:
	3193	trans[i] = state;
	3194	break;
	3195	}
	3196
	3197	#ifdef DEBUG
	3198	fprintf (stderr, "trans table %td", s);
	3199	for (int i = 0; i < NOTCHAR; ++i)
	3200	{
	3201	if (!(i & 0xf))
	3202	fprintf (stderr, "\n");
	3203	fprintf (stderr, " %2td", trans[i]);
	3204	}
	3205	fprintf (stderr, "\n");
	3206	#endif
	3207
	3208	free (group.elems);
	3209	free (follows.elems);
	3210	free (tmp.elems);
	3211
	3212	/* Keep the newline transition in a special place so we can use it as
	3213	a sentinel. */
	3214	if (tstbit (d->syntax.eolbyte, &label))
	3215	{
	3216	d->newlines[s] = trans[d->syntax.eolbyte];
	3217	trans[d->syntax.eolbyte] = -1;
	3218	}
	3219
	3220	return trans[uc];
	3221	}
	3222
	3223	/* Multibyte character handling sub-routines for dfaexec. */
	3224
	3225	/* Consume a single byte and transit state from 's' to '*next_state'.
	3226	This function is almost same as the state transition routin in dfaexec.
	3227	But state transition is done just once, otherwise matching succeed or
	3228	reach the end of the buffer. */
	3229	static state_num
	3230	transit_state_singlebyte (struct dfa d, state_num s, unsigned char const *pp)
	3231	{
	3232	state_num *t;
	3233
	3234	if (d->trans[s])
	3235	t = d->trans[s];
	3236	else if (d->fails[s])
	3237	t = d->fails[s];
	3238	else
	3239	{
	3240	build_state (s, d, **pp);
	3241	if (d->trans[s])
	3242	t = d->trans[s];
	3243	else
	3244	{
	3245	t = d->fails[s];
	3246	assert (t);
	3247	}
	3248	}
	3249
	3250	if (t[**pp] == -2)
	3251	build_state (s, d, **pp);
	3252
	3253	return t[(pp)++];
	3254	}
	3255
	3256	/* Transit state from s, then return new state and update the pointer of
	3257	the buffer. This function is for a period operator which can match a
	3258	multi-byte character. */
	3259	static state_num
	3260	transit_state (struct dfa d, state_num s, unsigned char const *pp,
	3261	unsigned char const *end)
	3262	{
	3263	wint_t wc;
	3264
	3265	int mbclen = mbs_to_wchar (&wc, (char const ) pp, end - *pp, d);
	3266
	3267	/* This state has some operators which can match a multibyte character. */
	3268	d->mb_follows.nelem = 0;
	3269
	3270	/* Calculate the state which can be reached from the state 's' by
	3271	consuming 'mbclen' single bytes from the buffer. */
	3272	state_num s1 = s;
	3273	int mbci;
	3274	for (mbci = 0; mbci < mbclen && (mbci == 0 \|\| d->min_trcount <= s); mbci++)
	3275	s = transit_state_singlebyte (d, s, pp);
	3276	*pp += mbclen - mbci;
	3277
	3278	if (wc == WEOF)
	3279	{
	3280	/* It is an invalid character, so ANYCHAR is not accepted. */
	3281	return s;
	3282	}
	3283
	3284	/* If all positions which have ANYCHAR do not depend on the context
	3285	of the next character, calculate the next state with
	3286	pre-calculated follows and cache the result. */
	3287	if (d->states[s1].mb_trindex < 0)
	3288	{
	3289	if (MAX_TRCOUNT <= d->mb_trcount)
	3290	{
	3291	state_num s3;
	3292	for (s3 = -1; s3 < d->tralloc; s3++)
	3293	{
	3294	free (d->mb_trans[s3]);
	3295	d->mb_trans[s3] = NULL;
	3296	}
	3297
	3298	for (state_num i = 0; i < d->sindex; i++)
	3299	d->states[i].mb_trindex = -1;
	3300	d->mb_trcount = 0;
	3301	}
	3302	d->states[s1].mb_trindex = d->mb_trcount++;
	3303	}
	3304
	3305	if (! d->mb_trans[s])
	3306	{
	3307	enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
	3308	enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
	3309	d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
	3310	for (int i = 0; i < MAX_TRCOUNT; i++)
	3311	d->mb_trans[s][i] = -1;
	3312	}
	3313	else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
	3314	return d->mb_trans[s][d->states[s1].mb_trindex];
	3315
	3316	if (s == -1)
	3317	copy (&d->states[s1].mbps, &d->mb_follows);
	3318	else
	3319	merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
	3320
	3321	int separate_contexts = state_separate_contexts (d, &d->mb_follows);
	3322	state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
	3323	realloc_trans_if_necessary (d);
	3324
	3325	d->mb_trans[s][d->states[s1].mb_trindex] = s2;
	3326
	3327	return s2;
	3328	}
	3329
	3330	/* The initial state may encounter a byte which is not a single byte character
	3331	nor the first byte of a multibyte character. But it is incorrect for the
	3332	initial state to accept such a byte. For example, in Shift JIS the regular
	3333	expression "\\" accepts the codepoint 0x5c, but should not accept the second
	3334	byte of the codepoint 0x815c. Then the initial state must skip the bytes
	3335	that are not a single byte character nor the first byte of a multibyte
	3336	character.
	3337
	3338	Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
	3339	or exceeds P, and return the advanced MBP. If WCP is non-NULL and
	3340	the result is greater than P, set *WCP to the final wide character
	3341	processed, or to WEOF if no wide character is processed. Otherwise,
	3342	if WCP is non-NULL, *WCP may or may not be updated.
	3343
	3344	Both P and MBP must be no larger than END. */
	3345	static unsigned char const *
	3346	skip_remains_mb (struct dfa d, unsigned char const p,
	3347	unsigned char const mbp, char const end)
	3348	{
	3349	if (d->syntax.never_trail[*p])
	3350	return p;
	3351	while (mbp < p)
	3352	{
	3353	wint_t wc;
	3354	mbp += mbs_to_wchar (&wc, (char const *) mbp,
	3355	end - (char const *) mbp, d);
	3356	}
	3357	return mbp;
	3358	}
	3359
	3360	/* Search through a buffer looking for a match to the struct dfa *D.
	3361	Find the first occurrence of a string matching the regexp in the
	3362	buffer, and the shortest possible version thereof. Return a pointer to
	3363	the first character after the match, or NULL if none is found. BEGIN
	3364	points to the beginning of the buffer, and END points to the first byte
	3365	after its end. Note however that we store a sentinel byte (usually
	3366	newline) in *END, so the actual buffer must be one byte longer.
	3367	When ALLOW_NL, newlines may appear in the matching string.
	3368	If COUNT is non-NULL, increment *COUNT once for each newline processed.
	3369	If MULTIBYTE, the input consists of multibyte characters and/or
	3370	encoding-error bytes. Otherwise, it consists of single-byte characters.
	3371	Here is the list of features that make this DFA matcher punt:
	3372	- [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
	3373	- [^...] in non-simple locale
	3374	- [[=foo=]] or [[.foo.]]
	3375	- [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
	3376	- back-reference: (.)\1
	3377	- word-delimiter in multibyte locale: \<, \>, \b, \B
	3378	See struct localeinfo.simple for the definition of "simple locale". */
	3379
	3380	static inline char *
	3381	dfaexec_main (struct dfa d, char const begin, char *end, bool allow_nl,
	3382	ptrdiff_t *count, bool multibyte)
	3383	{
	3384	if (MAX_TRCOUNT <= d->sindex)
	3385	{
	3386	for (state_num s = d->min_trcount; s < d->sindex; s++)
	3387	{
	3388	free (d->states[s].elems.elems);
	3389	free (d->states[s].mbps.elems);
	3390	}
	3391	d->sindex = d->min_trcount;
	3392
	3393	if (d->trans)
	3394	{
	3395	for (state_num s = 0; s < d->tralloc; s++)
	3396	{
	3397	free (d->trans[s]);
	3398	free (d->fails[s]);
	3399	d->trans[s] = d->fails[s] = NULL;
	3400	}
	3401	d->trcount = 0;
	3402	}
	3403
	3404	if (d->localeinfo.multibyte && d->mb_trans)
	3405	{
	3406	for (state_num s = -1; s < d->tralloc; s++)
	3407	{
	3408	free (d->mb_trans[s]);
	3409	d->mb_trans[s] = NULL;
	3410	}
	3411	for (state_num s = 0; s < d->min_trcount; s++)
	3412	d->states[s].mb_trindex = -1;
	3413	d->mb_trcount = 0;
	3414	}
	3415	}
	3416
	3417	if (!d->tralloc)
	3418	realloc_trans_if_necessary (d);
	3419
	3420	/* Current state. */
	3421	state_num s = 0, s1 = 0;
	3422
	3423	/* Current input character. */
	3424	unsigned char const p = (unsigned char const ) begin;
	3425	unsigned char const *mbp = p;
	3426
	3427	/* Copy of d->trans so it can be optimized into a register. */
	3428	state_num **trans = d->trans;
	3429	unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
	3430	unsigned char saved_end = (unsigned char ) end;
	3431	*end = eol;
	3432
	3433	if (multibyte)
	3434	{
	3435	memset (&d->mbs, 0, sizeof d->mbs);
	3436	if (d->mb_follows.alloc == 0)
	3437	alloc_position_set (&d->mb_follows, d->nleaves);
	3438	}
	3439
	3440	idx_t nlcount = 0;
	3441	for (;;)
	3442	{
	3443	state_num *t;
	3444	while ((t = trans[s]) != NULL)
	3445	{
	3446	if (s < d->min_trcount)
	3447	{
	3448	if (!multibyte \|\| d->states[s].mbps.nelem == 0)
	3449	{
	3450	while (t[*p] == s)
	3451	p++;
	3452	}
	3453	if (multibyte)
	3454	p = mbp = skip_remains_mb (d, p, mbp, end);
	3455	}
	3456
	3457	if (multibyte)
	3458	{
	3459	s1 = s;
	3460
	3461	if (d->states[s].mbps.nelem == 0
	3462	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
	3463	{
	3464	/* If an input character does not match ANYCHAR, do it
	3465	like a single-byte character. */
	3466	s = t[*p++];
	3467	}
	3468	else
	3469	{
	3470	s = transit_state (d, s, &p, (unsigned char *) end);
	3471	mbp = p;
	3472	trans = d->trans;
	3473	}
	3474	}
	3475	else
	3476	{
	3477	s1 = t[*p++];
	3478	t = trans[s1];
	3479	if (! t)
	3480	{
	3481	state_num tmp = s;
	3482	s = s1;
	3483	s1 = tmp; /* swap */
	3484	break;
	3485	}
	3486	if (s < d->min_trcount)
	3487	{
	3488	while (t[*p] == s1)
	3489	p++;
	3490	}
	3491	s = t[*p++];
	3492	}
	3493	}
	3494
	3495	if (s < 0)
	3496	{
	3497	if (s == -2)
	3498	{
	3499	s = build_state (s1, d, p[-1]);
	3500	trans = d->trans;
	3501	}
	3502	else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
	3503	{
	3504	/* The previous character was a newline. Count it, and skip
	3505	checking of multibyte character boundary until here. */
	3506	nlcount++;
	3507	mbp = p;
	3508
	3509	s = (allow_nl ? d->newlines[s1]
	3510	: d->syntax.sbit[eol] == CTX_NEWLINE ? 0
	3511	: d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
	3512	: d->initstate_notbol);
	3513	}
	3514	else
	3515	{
	3516	p = NULL;
	3517	goto done;
	3518	}
	3519	}
	3520	else if (d->fails[s])
	3521	{
	3522	if ((d->success[s] & d->syntax.sbit[*p])
	3523	\|\| ((char *) p == end
	3524	&& accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
	3525	d)))
	3526	goto done;
	3527
	3528	if (multibyte && s < d->min_trcount)
	3529	p = mbp = skip_remains_mb (d, p, mbp, end);
	3530
	3531	s1 = s;
	3532	if (!multibyte \|\| d->states[s].mbps.nelem == 0
	3533	\|\| d->localeinfo.sbctowc[p] != WEOF \|\| (char ) p >= end)
	3534	{
	3535	/* If a input character does not match ANYCHAR, do it
	3536	like a single-byte character. */
	3537	s = d->fails[s][*p++];
	3538	}
	3539	else
	3540	{
	3541	s = transit_state (d, s, &p, (unsigned char *) end);
	3542	mbp = p;
	3543	trans = d->trans;
	3544	}
	3545	}
	3546	else
	3547	{
	3548	build_state (s, d, p[0]);
	3549	trans = d->trans;
	3550	}
	3551	}
	3552
	3553	done:
	3554	if (count)
	3555	*count += nlcount;
	3556	*end = saved_end;
	3557	return (char *) p;
	3558	}
	3559
	3560	/* Specialized versions of dfaexec for multibyte and single-byte cases.
	3561	This is for performance, as dfaexec_main is an inline function. */
	3562
	3563	static char *
	3564	dfaexec_mb (struct dfa d, char const begin, char *end,
	3565	bool allow_nl, ptrdiff_t count, bool backref)
	3566	{
	3567	return dfaexec_main (d, begin, end, allow_nl, count, true);
	3568	}
	3569
	3570	static char *
	3571	dfaexec_sb (struct dfa d, char const begin, char *end,
	3572	bool allow_nl, ptrdiff_t count, bool backref)
	3573	{
	3574	return dfaexec_main (d, begin, end, allow_nl, count, false);
	3575	}
	3576
	3577	/* Always set *BACKREF and return BEGIN. Use this wrapper for
	3578	any regexp that uses a construct not supported by this code. */
	3579	static char *
	3580	dfaexec_noop (struct dfa d, char const begin, char *end,
	3581	bool allow_nl, ptrdiff_t count, bool backref)
	3582	{
	3583	*backref = true;
	3584	return (char *) begin;
	3585	}
	3586
	3587	/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
	3588	but faster and set *BACKREF if the DFA code does not support this
	3589	regexp usage. */
	3590
	3591	char *
	3592	dfaexec (struct dfa d, char const begin, char *end,
	3593	bool allow_nl, ptrdiff_t count, bool backref)
	3594	{
	3595	return d->dfaexec (d, begin, end, allow_nl, count, backref);
	3596	}
	3597
	3598	struct dfa *
	3599	dfasuperset (struct dfa const *d)
	3600	{
	3601	return d->superset;
	3602	}
	3603
	3604	bool
	3605	dfaisfast (struct dfa const *d)
	3606	{
	3607	return d->fast;
	3608	}
	3609
	3610	static void
	3611	free_mbdata (struct dfa *d)
	3612	{
	3613	free (d->multibyte_prop);
	3614	free (d->lex.brack.chars);
	3615	free (d->mb_follows.elems);
	3616
	3617	if (d->mb_trans)
	3618	{
	3619	state_num s;
	3620	for (s = -1; s < d->tralloc; s++)
	3621	free (d->mb_trans[s]);
	3622	free (d->mb_trans - 2);
	3623	}
	3624	}
	3625
	3626	/* Return true if every construct in D is supported by this DFA matcher. */
	3627	bool
	3628	dfasupported (struct dfa const *d)
	3629	{
	3630	for (idx_t i = 0; i < d->tindex; i++)
	3631	{
	3632	switch (d->tokens[i])
	3633	{
	3634	case BEGWORD:
	3635	case ENDWORD:
	3636	case LIMWORD:
	3637	case NOTLIMWORD:
	3638	if (!d->localeinfo.multibyte)
	3639	continue;
	3640	FALLTHROUGH;
	3641	case BACKREF:
	3642	case MBCSET:
	3643	return false;
	3644	}
	3645	}
	3646	return true;
	3647	}
	3648
	3649	/* Disable use of the superset DFA if it is not likely to help
	3650	performance. */
	3651	static void
	3652	maybe_disable_superset_dfa (struct dfa *d)
	3653	{
	3654	if (!d->localeinfo.using_utf8)
	3655	return;
	3656
	3657	bool have_backref = false;
	3658	for (idx_t i = 0; i < d->tindex; i++)
	3659	{
	3660	switch (d->tokens[i])
	3661	{
	3662	case ANYCHAR:
	3663	/* Lowered. */
	3664	abort ();
	3665	case BACKREF:
	3666	have_backref = true;
	3667	break;
	3668	case MBCSET:
	3669	/* Requires multi-byte algorithm. */
	3670	return;
	3671	default:
	3672	break;
	3673	}
	3674	}
	3675
	3676	if (!have_backref && d->superset)
	3677	{
	3678	/* The superset DFA is not likely to be much faster, so remove it. */
	3679	dfafree (d->superset);
	3680	free (d->superset);
	3681	d->superset = NULL;
	3682	}
	3683
	3684	free_mbdata (d);
	3685	d->localeinfo.multibyte = false;
	3686	d->dfaexec = dfaexec_sb;
	3687	d->fast = true;
	3688	}
	3689
	3690	static void
	3691	dfassbuild (struct dfa *d)
	3692	{
	3693	struct dfa *sup = dfaalloc ();
	3694
	3695	sup = d;
	3696	sup->localeinfo.multibyte = false;
	3697	sup->dfaexec = dfaexec_sb;
	3698	sup->multibyte_prop = NULL;
	3699	sup->superset = NULL;
	3700	sup->states = NULL;
	3701	sup->sindex = 0;
	3702	sup->constraints = NULL;
	3703	sup->separates = NULL;
	3704	sup->follows = NULL;
	3705	sup->tralloc = 0;
	3706	sup->trans = NULL;
	3707	sup->fails = NULL;
	3708	sup->success = NULL;
	3709	sup->newlines = NULL;
	3710
	3711	sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
	3712	if (d->cindex)
	3713	{
	3714	memcpy (sup->charclasses, d->charclasses,
	3715	d->cindex * sizeof *sup->charclasses);
	3716	}
	3717
	3718	sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
	3719	sup->talloc = d->tindex * 2;
	3720
	3721	bool have_achar = false;
	3722	bool have_nchar = false;
	3723	idx_t j;
	3724	for (idx_t i = j = 0; i < d->tindex; i++)
	3725	{
	3726	switch (d->tokens[i])
	3727	{
	3728	case ANYCHAR:
	3729	case MBCSET:
	3730	case BACKREF:
	3731	{
	3732	charclass ccl;
	3733	fillset (&ccl);
	3734	sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
	3735	sup->tokens[j++] = STAR;
	3736	if (d->tokens[i + 1] == QMARK \|\| d->tokens[i + 1] == STAR
	3737	\|\| d->tokens[i + 1] == PLUS)
	3738	i++;
	3739	have_achar = true;
	3740	}
	3741	break;
	3742	case BEGWORD:
	3743	case ENDWORD:
	3744	case LIMWORD:
	3745	case NOTLIMWORD:
	3746	if (d->localeinfo.multibyte)
	3747	{
	3748	/* These constraints aren't supported in a multibyte locale.
	3749	Ignore them in the superset DFA. */
	3750	sup->tokens[j++] = EMPTY;
	3751	break;
	3752	}
	3753	FALLTHROUGH;
	3754	default:
	3755	sup->tokens[j++] = d->tokens[i];
	3756	if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
	3757	\|\| d->tokens[i] >= CSET)
	3758	have_nchar = true;
	3759	break;
	3760	}
	3761	}
	3762	sup->tindex = j;
	3763
	3764	if (have_nchar && (have_achar \|\| d->localeinfo.multibyte))
	3765	d->superset = sup;
	3766	else
	3767	{
	3768	dfafree (sup);
	3769	free (sup);
	3770	}
	3771	}
	3772
	3773	/* Parse a string S of length LEN into D (but skip this step if S is null).
	3774	Then analyze D and build a matcher for it.
	3775	SEARCHFLAG says whether to build a searching or an exact matcher. */
	3776	void
	3777	dfacomp (char const s, idx_t len, struct dfa d, bool searchflag)
	3778	{
	3779	if (s != NULL)
	3780	dfaparse (s, len, d);
	3781
	3782	dfassbuild (d);
	3783
	3784	if (dfasupported (d))
	3785	{
	3786	maybe_disable_superset_dfa (d);
	3787	dfaanalyze (d, searchflag);
	3788	}
	3789	else
	3790	{
	3791	d->dfaexec = dfaexec_noop;
	3792	}
	3793
	3794	if (d->superset)
	3795	{
	3796	d->fast = true;
	3797	dfaanalyze (d->superset, searchflag);
	3798	}
	3799	}
	3800
	3801	/* Free the storage held by the components of a dfa. */
	3802	void
	3803	dfafree (struct dfa *d)
	3804	{
	3805	free (d->charclasses);
	3806	free (d->tokens);
	3807
	3808	if (d->localeinfo.multibyte)
	3809	free_mbdata (d);
	3810
	3811	free (d->constraints);
	3812	free (d->separates);
	3813
	3814	for (idx_t i = 0; i < d->sindex; i++)
	3815	{
	3816	free (d->states[i].elems.elems);
	3817	free (d->states[i].mbps.elems);
	3818	}
	3819	free (d->states);
	3820
	3821	if (d->follows)
	3822	{
	3823	for (idx_t i = 0; i < d->tindex; i++)
	3824	free (d->follows[i].elems);
	3825	free (d->follows);
	3826	}
	3827
	3828	if (d->trans)
	3829	{
	3830	for (idx_t i = 0; i < d->tralloc; i++)
	3831	{
	3832	free (d->trans[i]);
	3833	free (d->fails[i]);
	3834	}
	3835
	3836	free (d->trans - 2);
	3837	free (d->fails);
	3838	free (d->newlines);
	3839	free (d->success);
	3840	}
	3841
	3842	if (d->superset)
	3843	{
	3844	dfafree (d->superset);
	3845	free (d->superset);
	3846	}
	3847	}
	3848
	3849	/* Having found the postfix representation of the regular expression,
	3850	try to find a long sequence of characters that must appear in any line
	3851	containing the r.e.
	3852	Finding a "longest" sequence is beyond the scope here;
	3853	we take an easy way out and hope for the best.
   (Take "(ab|a)b"--please.)
	3855
	3856	We do a bottom-up calculation of sequences of characters that must appear
	3857	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
	3858	representation:
	3859	sequences that must appear at the left of the match ("left")
	3860	sequences that must appear at the right of the match ("right")
	3861	lists of sequences that must appear somewhere in the match ("in")
	3862	sequences that must constitute the match ("is")
	3863
	3864	When we get to the root of the tree, we use one of the longest of its
	3865	calculated "in" sequences as our answer.
	3866
	3867	The sequences calculated for the various types of node (in pseudo ANSI c)
	3868	are shown below. "p" is the operand of unary operators (and the left-hand
	3869	operand of binary operators); "q" is the right-hand operand of binary
	3870	operators.
	3871
	3872	"ZERO" means "a zero-length sequence" below.
	3873
        Type    left            right           is              in
        ----    ----            -----           --              --
        char c  # c             # c             # c             # c

        ANYCHAR ZERO            ZERO            ZERO            ZERO

        MBCSET  ZERO            ZERO            ZERO            ZERO

        CSET    ZERO            ZERO            ZERO            ZERO

        STAR    ZERO            ZERO            ZERO            ZERO

        QMARK   ZERO            ZERO            ZERO            ZERO

        PLUS    p->left         p->right        ZERO            p->in

        CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
                p->left :       q->right :      q->is!=ZERO) ?  q->in plus
                p->is##q->left  p->right##q->is p->is##q->is :  p->right##q->left
                                                ZERO

        OR      longest common  longest common  (do p->is and   substrings
                leading         trailing        q->is have same common to
                (sub)sequence   (sub)sequence   length and      p->in and
                of p->left      of p->right     content) ?      q->in
                and q->left     and q->right    p->is : NULL
	3900
	3901	If there's anything else we recognize in the tree, all four sequences get set
	3902	to zero-length sequences. If there's something we don't recognize in the
	3903	tree, we just return a zero-length sequence.
	3904
	3905	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
	3906	'aaa')?
	3907
   And ... is it here or someplace that we might ponder "optimizations" such as
        egrep 'psi|epsilon'     ->      egrep 'psi'
        egrep 'pepsi|epsilon'   ->      egrep 'epsi'
                                        (Yes, we now find "epsi" as a "string
                                        that must occur", but we might also
                                        simplify the entire r.e. being sought)
        grep '[c]'              ->      grep 'c'
        grep '(ab|a)b'          ->      grep 'ab'
        grep 'ab*'              ->      grep 'a'
        grep 'a*b'              ->      grep 'b'
	3918
	3919	There are several issues:
	3920
	3921	Is optimization easy (enough)?
	3922
   Does optimization actually accomplish anything,
   or is the automaton you get from "psi|epsilon" (for example)
   the same as the one you get from "psi" (for example)?

   Are optimizable r.e.'s likely to be used in real-life situations
   (something like 'ab*' is probably unlikely; something like
   'psi|epsilon' is likelier)?  */
	3930
	3931	static char *
	3932	icatalloc (char old, char const new)
	3933	{
	3934	idx_t newsize = strlen (new);
	3935	if (newsize == 0)
	3936	return old;
	3937	idx_t oldsize = strlen (old);
	3938	char *result = xirealloc (old, oldsize + newsize + 1);
	3939	memcpy (result + oldsize, new, newsize + 1);
	3940	return result;
	3941	}
	3942
	3943	static void
	3944	freelist (char **cpp)
	3945	{
	3946	while (*cpp)
	3947	free (*cpp++);
	3948	}
	3949
	3950	static char **
	3951	enlistnew (char *cpp, char new)
	3952	{
	3953	/* Is there already something in the list that's new (or longer)? */
	3954	idx_t i;
	3955	for (i = 0; cpp[i] != NULL; i++)
	3956	if (strstr (cpp[i], new) != NULL)
	3957	{
	3958	free (new);
	3959	return cpp;
	3960	}
	3961	/* Eliminate any obsoleted strings. */
	3962	for (idx_t j = 0; cpp[j] != NULL; )
	3963	if (strstr (new, cpp[j]) == NULL)
	3964	++j;
	3965	else
	3966	{
	3967	free (cpp[j]);
	3968	if (--i == j)
	3969	break;
	3970	cpp[j] = cpp[i];
	3971	cpp[i] = NULL;
	3972	}
	3973	/* Add the new string. */
	3974	cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
	3975	cpp[i] = new;
	3976	cpp[i + 1] = NULL;
	3977	return cpp;
	3978	}
	3979
	3980	static char **
	3981	enlist (char *cpp, char const str, idx_t len)
	3982	{
	3983	return enlistnew (cpp, ximemdup0 (str, len));
	3984	}
	3985
	3986	/* Given pointers to two strings, return a pointer to an allocated
	3987	list of their distinct common substrings. */
	3988	static char **
	3989	comsubs (char left, char const right)
	3990	{
	3991	char *cpp = xzalloc (sizeof cpp);
	3992
	3993	for (char lcp = left; lcp != '\0'; lcp++)
	3994	{
	3995	idx_t len = 0;
	3996	char rcp = strchr (right, lcp);
	3997	while (rcp != NULL)
	3998	{
	3999	idx_t i;
	4000	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
	4001	continue;
	4002	if (i > len)
	4003	len = i;
	4004	rcp = strchr (rcp + 1, *lcp);
	4005	}
	4006	if (len != 0)
	4007	cpp = enlist (cpp, lcp, len);
	4008	}
	4009	return cpp;
	4010	}
	4011
	4012	static char **
	4013	addlists (char old, char new)
	4014	{
	4015	for (; *new; new++)
	4016	old = enlistnew (old, xstrdup (*new));
	4017	return old;
	4018	}
	4019
	4020	/* Given two lists of substrings, return a new list giving substrings
	4021	common to both. */
	4022	static char **
	4023	inboth (char left, char right)
	4024	{
	4025	char *both = xzalloc (sizeof both);
	4026
	4027	for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
	4028	{
	4029	for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
	4030	{
	4031	char **temp = comsubs (left[lnum], right[rnum]);
	4032	both = addlists (both, temp);
	4033	freelist (temp);
	4034	free (temp);
	4035	}
	4036	}
	4037	return both;
	4038	}
	4039
	4040	typedef struct must must;
	4041
	4042	struct must
	4043	{
	4044	char **in;
	4045	char *left;
	4046	char *right;
	4047	char *is;
	4048	bool begline;
	4049	bool endline;
	4050	must *prev;
	4051	};
	4052
	4053	static must *
	4054	allocmust (must *mp, idx_t size)
	4055	{
	4056	must new_mp = xmalloc (sizeof new_mp);
	4057	new_mp->in = xzalloc (sizeof *new_mp->in);
	4058	new_mp->left = xizalloc (size);
	4059	new_mp->right = xizalloc (size);
	4060	new_mp->is = xizalloc (size);
	4061	new_mp->begline = false;
	4062	new_mp->endline = false;
	4063	new_mp->prev = mp;
	4064	return new_mp;
	4065	}
	4066
	4067	static void
	4068	resetmust (must *mp)
	4069	{
	4070	freelist (mp->in);
	4071	mp->in[0] = NULL;
	4072	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
	4073	mp->begline = false;
	4074	mp->endline = false;
	4075	}
	4076
	4077	static void
	4078	freemust (must *mp)
	4079	{
	4080	freelist (mp->in);
	4081	free (mp->in);
	4082	free (mp->left);
	4083	free (mp->right);
	4084	free (mp->is);
	4085	free (mp);
	4086	}
	4087
	4088	struct dfamust *
	4089	dfamust (struct dfa const *d)
	4090	{
	4091	must *mp = NULL;
	4092	char const *result = "";
	4093	bool exact = false;
	4094	bool begline = false;
	4095	bool endline = false;
	4096	bool need_begline = false;
	4097	bool need_endline = false;
	4098	bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
	4099
	4100	for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
	4101	{
	4102	token t = d->tokens[ri];
	4103	switch (t)
	4104	{
	4105	case BEGLINE:
	4106	mp = allocmust (mp, 2);
	4107	mp->begline = true;
	4108	need_begline = true;
	4109	break;
	4110	case ENDLINE:
	4111	mp = allocmust (mp, 2);
	4112	mp->endline = true;
	4113	need_endline = true;
	4114	break;
	4115	case LPAREN:
	4116	case RPAREN:
	4117	assert (!"neither LPAREN nor RPAREN may appear here");
	4118
	4119	case EMPTY:
	4120	case BEGWORD:
	4121	case ENDWORD:
	4122	case LIMWORD:
	4123	case NOTLIMWORD:
	4124	case BACKREF:
	4125	case ANYCHAR:
	4126	case MBCSET:
	4127	mp = allocmust (mp, 2);
	4128	break;
	4129
	4130	case STAR:
	4131	case QMARK:
	4132	assume_nonnull (mp);
	4133	resetmust (mp);
	4134	break;
	4135
	4136	case OR:
	4137	{
	4138	char **new;
	4139	must *rmp = mp;
	4140	assume_nonnull (rmp);
	4141	must *lmp = mp = mp->prev;
	4142	assume_nonnull (lmp);
	4143	idx_t j, ln, rn, n;
	4144
	4145	/* Guaranteed to be. Unlikely, but ... */
	4146	if (streq (lmp->is, rmp->is))
	4147	{
	4148	lmp->begline &= rmp->begline;
	4149	lmp->endline &= rmp->endline;
	4150	}
	4151	else
	4152	{
	4153	lmp->is[0] = '\0';
	4154	lmp->begline = false;
	4155	lmp->endline = false;
	4156	}
	4157	/* Left side--easy */
	4158	idx_t i = 0;
	4159	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
	4160	++i;
	4161	lmp->left[i] = '\0';
	4162	/* Right side */
	4163	ln = strlen (lmp->right);
	4164	rn = strlen (rmp->right);
	4165	n = ln;
	4166	if (n > rn)
	4167	n = rn;
	4168	for (i = 0; i < n; ++i)
	4169	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
	4170	break;
	4171	for (j = 0; j < i; ++j)
	4172	lmp->right[j] = lmp->right[(ln - i) + j];
	4173	lmp->right[j] = '\0';
	4174	new = inboth (lmp->in, rmp->in);
	4175	freelist (lmp->in);
	4176	free (lmp->in);
	4177	lmp->in = new;
	4178	freemust (rmp);
	4179	}
	4180	break;
	4181
	4182	case PLUS:
	4183	assume_nonnull (mp);
	4184	mp->is[0] = '\0';
	4185	break;
	4186
	4187	case END:
	4188	assume_nonnull (mp);
	4189	assert (!mp->prev);
	4190	for (idx_t i = 0; mp->in[i] != NULL; i++)
	4191	if (strlen (mp->in[i]) > strlen (result))
	4192	result = mp->in[i];
	4193	if (streq (result, mp->is))
	4194	{
	4195	if ((!need_begline \|\| mp->begline) && (!need_endline
	4196	\|\| mp->endline))
	4197	exact = true;
	4198	begline = mp->begline;
	4199	endline = mp->endline;
	4200	}
	4201	goto done;
	4202
	4203	case CAT:
	4204	{
	4205	must *rmp = mp;
	4206	assume_nonnull (rmp);
	4207	must *lmp = mp = mp->prev;
	4208	assume_nonnull (lmp);
	4209
	4210	/* In. Everything in left, plus everything in
	4211	right, plus concatenation of
	4212	left's right and right's left. */
	4213	lmp->in = addlists (lmp->in, rmp->in);
	4214	if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
	4215	{
	4216	idx_t lrlen = strlen (lmp->right);
	4217	idx_t rllen = strlen (rmp->left);
	4218	char *tp = ximalloc (lrlen + rllen + 1);
	4219	memcpy (tp + lrlen, rmp->left, rllen + 1);
	4220	memcpy (tp, lmp->right, lrlen);
	4221	lmp->in = enlistnew (lmp->in, tp);
	4222	}
	4223	/* Left-hand */
	4224	if (lmp->is[0] != '\0')
	4225	lmp->left = icatalloc (lmp->left, rmp->left);
	4226	/* Right-hand */
	4227	if (rmp->is[0] == '\0')
	4228	lmp->right[0] = '\0';
	4229	lmp->right = icatalloc (lmp->right, rmp->right);
	4230	/* Guaranteed to be */
	4231	if ((lmp->is[0] != '\0' \|\| lmp->begline)
	4232	&& (rmp->is[0] != '\0' \|\| rmp->endline))
	4233	{
	4234	lmp->is = icatalloc (lmp->is, rmp->is);
	4235	lmp->endline = rmp->endline;
	4236	}
	4237	else
	4238	{
	4239	lmp->is[0] = '\0';
	4240	lmp->begline = false;
	4241	lmp->endline = false;
	4242	}
	4243	freemust (rmp);
	4244	}
	4245	break;
	4246
	4247	case '\0':
	4248	/* Not on my shift. */
	4249	goto done;
	4250
	4251	default:
	4252	if (CSET <= t)
	4253	{
	4254	/* If T is a singleton, or if case-folding in a unibyte
	4255	locale and T's members all case-fold to the same char,
	4256	convert T to one of its members. Otherwise, do
	4257	nothing further with T. */
	4258	charclass *ccl = &d->charclasses[t - CSET];
	4259	int j;
	4260	for (j = 0; j < NOTCHAR; j++)
	4261	if (tstbit (j, ccl))
	4262	break;
	4263	if (! (j < NOTCHAR))
	4264	{
	4265	mp = allocmust (mp, 2);
	4266	break;
	4267	}
	4268	t = j;
	4269	while (++j < NOTCHAR)
	4270	if (tstbit (j, ccl)
	4271	&& ! (case_fold_unibyte
	4272	&& toupper (j) == toupper (t)))
	4273	break;
	4274	if (j < NOTCHAR)
	4275	{
	4276	mp = allocmust (mp, 2);
	4277	break;
	4278	}
	4279	}
	4280
	4281	idx_t rj = ri + 2;
	4282	if (d->tokens[ri + 1] == CAT)
	4283	{
	4284	for (; rj < d->tindex - 1; rj += 2)
	4285	{
	4286	if ((rj != ri && (d->tokens[rj] <= 0
	4287	\|\| NOTCHAR <= d->tokens[rj]))
	4288	\|\| d->tokens[rj + 1] != CAT)
	4289	break;
	4290	}
	4291	}
	4292	mp = allocmust (mp, ((rj - ri) >> 1) + 1);
	4293	mp->is[0] = mp->left[0] = mp->right[0]
	4294	= case_fold_unibyte ? toupper (t) : t;
	4295
	4296	idx_t i;
	4297	for (i = 1; ri + 2 < rj; i++)
	4298	{
	4299	ri += 2;
	4300	t = d->tokens[ri];
	4301	mp->is[i] = mp->left[i] = mp->right[i]
	4302	= case_fold_unibyte ? toupper (t) : t;
	4303	}
	4304	mp->is[i] = mp->left[i] = mp->right[i] = '\0';
	4305	mp->in = enlist (mp->in, mp->is, i);
	4306	break;
	4307	}
	4308	}
	4309	done:;
	4310
	4311	struct dfamust *dm = NULL;
	4312	if (*result)
	4313	{
	4314	dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
	4315	dm->exact = exact;
	4316	dm->begline = begline;
	4317	dm->endline = endline;
	4318	strcpy (dm->must, result);
	4319	}
	4320
	4321	while (mp)
	4322	{
	4323	must *prev = mp->prev;
	4324	freemust (mp);
	4325	mp = prev;
	4326	}
	4327
	4328	return dm;
	4329	}
	4330
	/* Dispose of DM, a value previously returned by dfamust.  DM is
	   passed straight to free, so a null pointer is accepted and does
	   nothing.  */
	void
	dfamustfree (struct dfamust *dm)
	{
	  free (dm);
	}
	4336
	4337	struct dfa *
	4338	dfaalloc (void)
	4339	{
	4340	return xmalloc (sizeof (struct dfa));
	4341	}
	4342
	4343	/* Initialize DFA. */
	4344	void
	4345	dfasyntax (struct dfa dfa, struct localeinfo const linfo,
	4346	reg_syntax_t bits, int dfaopts)
	4347	{
	4348	memset (dfa, 0, offsetof (struct dfa, dfaexec));
	4349	dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
	4350	dfa->localeinfo = *linfo;
	4351
	4352	dfa->fast = !dfa->localeinfo.multibyte;
	4353
	4354	dfa->canychar = -1;
	4355	dfa->syntax.syntax_bits_set = true;
	4356	dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
	4357	dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
	4358	dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
	4359	dfa->syntax.syntax_bits = bits;
	4360
	4361	for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
	4362	{
	4363	unsigned char uc = i;
	4364
	4365	dfa->syntax.sbit[uc] = char_context (dfa, uc);
	4366	switch (dfa->syntax.sbit[uc])
	4367	{
	4368	case CTX_LETTER:
	4369	setbit (uc, &dfa->syntax.letters);
	4370	break;
	4371	case CTX_NEWLINE:
	4372	setbit (uc, &dfa->syntax.newline);
	4373	break;
	4374	}
	4375
	4376	/* POSIX requires that the five bytes in "\n\r./" (including the
	4377	terminating NUL) cannot occur inside a multibyte character. */
	4378	dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
	4379	? (uc & 0xc0) != 0x80
	4380	: strchr ("\n\r./", uc) != NULL);
	4381	}
	4382	}
	4383
	4384	/* Initialize TO by copying FROM's syntax settings. */
	4385	void
	4386	dfacopysyntax (struct dfa to, struct dfa const from)
	4387	{
	4388	memset (to, 0, offsetof (struct dfa, syntax));
	4389	to->canychar = -1;
	4390	to->fast = from->fast;
	4391	to->syntax = from->syntax;
	4392	to->dfaexec = from->dfaexec;
	4393	to->localeinfo = from->localeinfo;
	4394	}
	4395
	4396	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/grep/lib/dfa.c

Download in other formats: