Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

regcomp.c

Last change on this file was 3548, checked in by bird, 3 years ago
grep: Use get_crt_codepage(). Don't default to the UTF-8 manifest for older VCC versions as the CRT won't do the right thing.
Property svn:eol-style set to `native`
File size: 111.9 KB

Rev	Line
[3529]	1	/* Extended regular expression matching and search library.
	2	Copyright (C) 2002-2021 Free Software Foundation, Inc.
	3	This file is part of the GNU C Library.
	4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
	17	License along with the GNU C Library; if not, see
	18	<https://www.gnu.org/licenses/>. */
	19
	20	#ifdef _LIBC
	21	# include <locale/weight.h>
	22	#endif
	23
	24	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
	25	size_t length, reg_syntax_t syntax);
	26	static void re_compile_fastmap_iter (regex_t *bufp,
	27	const re_dfastate_t *init_state,
	28	char *fastmap);
	29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
	30	#ifdef RE_ENABLE_I18N
	31	static void free_charset (re_charset_t *cset);
	32	#endif /* RE_ENABLE_I18N */
	33	static void free_workarea_compile (regex_t *preg);
	34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
	35	#ifdef RE_ENABLE_I18N
	36	static void optimize_utf8 (re_dfa_t *dfa);
	37	#endif
	38	static reg_errcode_t analyze (regex_t *preg);
	39	static reg_errcode_t preorder (bin_tree_t *root,
	40	reg_errcode_t (fn (void , bin_tree_t )),
	41	void *extra);
	42	static reg_errcode_t postorder (bin_tree_t *root,
	43	reg_errcode_t (fn (void , bin_tree_t )),
	44	void *extra);
	45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
	46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
	47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
	48	bin_tree_t *node);
	49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
	50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
	51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
	52	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
	53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
	54	unsigned int constraint);
	55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
	56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
	57	Idx node, bool root);
	58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
	59	static Idx fetch_number (re_string_t input, re_token_t token,
	60	reg_syntax_t syntax);
	61	static int peek_token (re_token_t token, re_string_t input,
	62	reg_syntax_t syntax);
	63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
	64	reg_syntax_t syntax, reg_errcode_t *err);
	65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
	66	re_token_t *token, reg_syntax_t syntax,
	67	Idx nest, reg_errcode_t *err);
	68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
	69	re_token_t *token, reg_syntax_t syntax,
	70	Idx nest, reg_errcode_t *err);
	71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
	72	re_token_t *token, reg_syntax_t syntax,
	73	Idx nest, reg_errcode_t *err);
	74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
	75	re_token_t *token, reg_syntax_t syntax,
	76	Idx nest, reg_errcode_t *err);
	77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
	78	re_dfa_t dfa, re_token_t token,
	79	reg_syntax_t syntax, reg_errcode_t *err);
	80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
	81	re_token_t *token, reg_syntax_t syntax,
	82	reg_errcode_t *err);
	83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
	84	re_string_t *regexp,
	85	re_token_t *token, int token_len,
	86	re_dfa_t *dfa,
	87	reg_syntax_t syntax,
	88	bool accept_hyphen);
	89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
	90	re_string_t *regexp,
	91	re_token_t *token);
	92	#ifdef RE_ENABLE_I18N
	93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
	94	re_charset_t *mbcset,
	95	Idx *equiv_class_alloc,
	96	const unsigned char *name);
	97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
	98	bitset_t sbcset,
	99	re_charset_t *mbcset,
	100	Idx *char_class_alloc,
	101	const char *class_name,
	102	reg_syntax_t syntax);
	103	#else /* not RE_ENABLE_I18N */
	104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
	105	const unsigned char *name);
	106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
	107	bitset_t sbcset,
	108	const char *class_name,
	109	reg_syntax_t syntax);
	110	#endif /* not RE_ENABLE_I18N */
	111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
	112	RE_TRANSLATE_TYPE trans,
	113	const char *class_name,
	114	const char *extra,
	115	bool non_match, reg_errcode_t *err);
	116	static bin_tree_t create_tree (re_dfa_t dfa,
	117	bin_tree_t left, bin_tree_t right,
	118	re_token_type_t type);
	119	static bin_tree_t create_token_tree (re_dfa_t dfa,
	120	bin_tree_t left, bin_tree_t right,
	121	const re_token_t *token);
	122	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
	123	static void free_token (re_token_t *node);
	124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
	125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
	126
	127
	128	/* This table gives an error message for each of the error codes listed
	129	in regex.h. Obviously the order here has to be same as there.
	130	POSIX doesn't require that we do anything for REG_NOERROR,
	131	but why not be nice? */
	132
	133	static const char __re_error_msgid[] =
	134	{
	135	#define REG_NOERROR_IDX 0
	136	gettext_noop ("Success") /* REG_NOERROR */
	137	"\0"
	138	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
	139	gettext_noop ("No match") /* REG_NOMATCH */
	140	"\0"
	141	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
	142	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
	143	"\0"
	144	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
	145	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
	146	"\0"
	147	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
	148	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
	149	"\0"
	150	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
	151	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
	152	"\0"
	153	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
	154	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
	155	"\0"
	156	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
	157	gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
	158	"\0"
	159	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
	160	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
	161	"\0"
	162	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
	163	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
	164	"\0"
	165	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
	166	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
	167	"\0"
	168	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
	169	gettext_noop ("Invalid range end") /* REG_ERANGE */
	170	"\0"
	171	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
	172	gettext_noop ("Memory exhausted") /* REG_ESPACE */
	173	"\0"
	174	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
	175	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
	176	"\0"
	177	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
	178	gettext_noop ("Premature end of regular expression") /* REG_EEND */
	179	"\0"
	180	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
	181	gettext_noop ("Regular expression too big") /* REG_ESIZE */
	182	"\0"
	183	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
	184	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
	185	};
	186
	187	static const size_t __re_error_msgid_idx[] =
	188	{
	189	REG_NOERROR_IDX,
	190	REG_NOMATCH_IDX,
	191	REG_BADPAT_IDX,
	192	REG_ECOLLATE_IDX,
	193	REG_ECTYPE_IDX,
	194	REG_EESCAPE_IDX,
	195	REG_ESUBREG_IDX,
	196	REG_EBRACK_IDX,
	197	REG_EPAREN_IDX,
	198	REG_EBRACE_IDX,
	199	REG_BADBR_IDX,
	200	REG_ERANGE_IDX,
	201	REG_ESPACE_IDX,
	202	REG_BADRPT_IDX,
	203	REG_EEND_IDX,
	204	REG_ESIZE_IDX,
	205	REG_ERPAREN_IDX
	206	};
	207
	208
	209	/* Entry points for GNU code. */
	210
	211	/* re_compile_pattern is the GNU regular expression compiler: it
	212	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
	213	Returns 0 if the pattern was valid, otherwise an error string.
	214
	215	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
	216	are set in BUFP on entry. */
	217
	218	const char *
	219	re_compile_pattern (const char *pattern, size_t length,
	220	struct re_pattern_buffer *bufp)
	221	{
	222	reg_errcode_t ret;
	223
	224	/* And GNU code determines whether or not to get register information
	225	by passing null for the REGS argument to re_match, etc., not by
	226	setting no_sub, unless RE_NO_SUB is set. */
	227	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
	228
	229	/* Match anchors at newline. */
	230	bufp->newline_anchor = 1;
	231
	232	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
	233
	234	if (!ret)
	235	return NULL;
	236	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
	237	}
	238	weak_alias (__re_compile_pattern, re_compile_pattern)
	239
	240	/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
	241	also be assigned to arbitrarily: each pattern buffer stores its own
	242	syntax, so it can be changed between regex compilations. */
	243	/* This has no initializer because initialized variables in Emacs
	244	become read-only after dumping. */
	245	reg_syntax_t re_syntax_options;
	246
	247
	248	/* Specify the precise syntax of regexps for compilation. This provides
	249	for compatibility for various utilities which historically have
	250	different, incompatible syntaxes.
	251
	252	The argument SYNTAX is a bit mask comprised of the various bits
	253	defined in regex.h. We return the old syntax. */
	254
	255	reg_syntax_t
	256	re_set_syntax (reg_syntax_t syntax)
	257	{
	258	reg_syntax_t ret = re_syntax_options;
	259
	260	re_syntax_options = syntax;
	261	return ret;
	262	}
	263	weak_alias (__re_set_syntax, re_set_syntax)
	264
	265	int
	266	re_compile_fastmap (struct re_pattern_buffer *bufp)
	267	{
	268	re_dfa_t *dfa = bufp->buffer;
	269	char *fastmap = bufp->fastmap;
	270
	271	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
	272	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
	273	if (dfa->init_state != dfa->init_state_word)
	274	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
	275	if (dfa->init_state != dfa->init_state_nl)
	276	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
	277	if (dfa->init_state != dfa->init_state_begbuf)
	278	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
	279	bufp->fastmap_accurate = 1;
	280	return 0;
	281	}
	282	weak_alias (__re_compile_fastmap, re_compile_fastmap)
	283
	/* Mark byte value CH in FASTMAP.  When ICASE is true, also mark the
	   byte that tolower maps CH to.  CH must be a valid index into
	   FASTMAP.  */
	static inline void
	__attribute__ ((always_inline))
	re_set_fastmap (char *fastmap, bool icase, int ch)
	{
	  fastmap[ch] = 1;
	  if (icase)
	    fastmap[tolower (ch)] = 1;
	}
	292
	293	/* Helper function for re_compile_fastmap.
	294	Compile fastmap for the initial_state INIT_STATE. */
	295
	296	static void
	297	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
	298	char *fastmap)
	299	{
	300	re_dfa_t *dfa = bufp->buffer;
	301	Idx node_cnt;
	302	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
	303	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
	304	{
	305	Idx node = init_state->nodes.elems[node_cnt];
	306	re_token_type_t type = dfa->nodes[node].type;
	307
	308	if (type == CHARACTER)
	309	{
	310	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
	311	#ifdef RE_ENABLE_I18N
	312	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	313	{
	314	unsigned char buf[MB_LEN_MAX];
	315	unsigned char *p;
	316	wchar_t wc;
	317	mbstate_t state;
	318
	319	p = buf;
	320	*p++ = dfa->nodes[node].opr.c;
	321	while (++node < dfa->nodes_len
	322	&& dfa->nodes[node].type == CHARACTER
	323	&& dfa->nodes[node].mb_partial)
	324	*p++ = dfa->nodes[node].opr.c;
	325	memset (&state, '\0', sizeof (state));
	326	if (__mbrtowc (&wc, (const char *) buf, p - buf,
	327	&state) == p - buf
	328	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
	329	!= (size_t) -1))
	330	re_set_fastmap (fastmap, false, buf[0]);
	331	}
	332	#endif
	333	}
	334	else if (type == SIMPLE_BRACKET)
	335	{
	336	int i, ch;
	337	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
	338	{
	339	int j;
	340	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
	341	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	342	if (w & ((bitset_word_t) 1 << j))
	343	re_set_fastmap (fastmap, icase, ch);
	344	}
	345	}
	346	#ifdef RE_ENABLE_I18N
	347	else if (type == COMPLEX_BRACKET)
	348	{
	349	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
	350	Idx i;
	351
	352	# ifdef _LIBC
	353	/* See if we have to try all bytes which start multiple collation
	354	elements.
	355	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
	356	collation element, and don't catch 'b' since 'b' is
	357	the only collation element which starts from 'b' (and
	358	it is caught by SIMPLE_BRACKET). */
	359	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
	360	&& (cset->ncoll_syms \|\| cset->nranges))
	361	{
	362	const int32_t table = (const int32_t )
	363	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	364	for (i = 0; i < SBC_MAX; ++i)
	365	if (table[i] < 0)
	366	re_set_fastmap (fastmap, icase, i);
	367	}
	368	# endif /* _LIBC */
	369
	370	/* See if we have to start the match at all multibyte characters,
	371	i.e. where we would not find an invalid sequence. This only
	372	applies to multibyte character sets; for single byte character
	373	sets, the SIMPLE_BRACKET again suffices. */
	374	if (dfa->mb_cur_max > 1
	375	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
	376	# ifdef _LIBC
	377	\|\| cset->nequiv_classes
	378	# endif /* _LIBC */
	379	))
	380	{
	381	unsigned char c = 0;
	382	do
	383	{
	384	mbstate_t mbs;
	385	memset (&mbs, 0, sizeof (mbs));
	386	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
	387	re_set_fastmap (fastmap, false, (int) c);
	388	}
	389	while (++c != 0);
	390	}
	391
	392	else
	393	{
	394	/* ... Else catch all bytes which can start the mbchars. */
	395	for (i = 0; i < cset->nmbchars; ++i)
	396	{
	397	char buf[256];
	398	mbstate_t state;
	399	memset (&state, '\0', sizeof (state));
	400	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
	401	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
	402	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	403	{
	404	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
	405	!= (size_t) -1)
	406	re_set_fastmap (fastmap, false, (unsigned char ) buf);
	407	}
	408	}
	409	}
	410	}
	411	#endif /* RE_ENABLE_I18N */
	412	else if (type == OP_PERIOD
	413	#ifdef RE_ENABLE_I18N
	414	\|\| type == OP_UTF8_PERIOD
	415	#endif /* RE_ENABLE_I18N */
	416	\|\| type == END_OF_RE)
	417	{
	418	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
	419	if (type == END_OF_RE)
	420	bufp->can_be_null = 1;
	421	return;
	422	}
	423	}
	424	}
	425
	426
	427	/* Entry point for POSIX code. */
	428	/* regcomp takes a regular expression as a string and compiles it.
	429
	430	PREG is a regex_t *. We do not expect any fields to be initialized,
	431	since POSIX says we shouldn't. Thus, we set
	432
	433	'buffer' to the compiled pattern;
	434	'used' to the length of the compiled pattern;
	435	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
	436	REG_EXTENDED bit in CFLAGS is set; otherwise, to
	437	RE_SYNTAX_POSIX_BASIC;
	438	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
	439	'fastmap' to an allocated space for the fastmap;
	440	'fastmap_accurate' to zero;
	441	're_nsub' to the number of subexpressions in PATTERN.
	442
	443	PATTERN is the address of the pattern string.
	444
	445	CFLAGS is a series of bits which affect compilation.
	446
	447	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
	448	use POSIX basic syntax.
	449
	450	If REG_NEWLINE is set, then . and [^...] don't match newline.
	451	Also, regexec will try a match beginning after every newline.
	452
	453	If REG_ICASE is set, then we considers upper- and lowercase
	454	versions of letters to be equivalent when matching.
	455
	456	If REG_NOSUB is set, then when PREG is passed to regexec, that
	457	routine will report only success or failure, and nothing about the
	458	registers.
	459
	460	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
	461	the return codes and their meanings.) */
	462
	463	int
	464	regcomp (regex_t __restrict preg, const char __restrict pattern, int cflags)
	465	{
	466	reg_errcode_t ret;
	467	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
	468	: RE_SYNTAX_POSIX_BASIC);
	469
	470	preg->buffer = NULL;
	471	preg->allocated = 0;
	472	preg->used = 0;
	473
	474	/* Try to allocate space for the fastmap. */
	475	preg->fastmap = re_malloc (char, SBC_MAX);
	476	if (__glibc_unlikely (preg->fastmap == NULL))
	477	return REG_ESPACE;
	478
	479	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
	480
	481	/* If REG_NEWLINE is set, newlines are treated differently. */
	482	if (cflags & REG_NEWLINE)
	483	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
	484	syntax &= ~RE_DOT_NEWLINE;
	485	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
	486	/* It also changes the matching behavior. */
	487	preg->newline_anchor = 1;
	488	}
	489	else
	490	preg->newline_anchor = 0;
	491	preg->no_sub = !!(cflags & REG_NOSUB);
	492	preg->translate = NULL;
	493
	494	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
	495
	496	/* POSIX doesn't distinguish between an unmatched open-group and an
	497	unmatched close-group: both are REG_EPAREN. */
	498	if (ret == REG_ERPAREN)
	499	ret = REG_EPAREN;
	500
	501	/* We have already checked preg->fastmap != NULL. */
	502	if (__glibc_likely (ret == REG_NOERROR))
	503	/* Compute the fastmap now, since regexec cannot modify the pattern
	504	buffer. This function never fails in this implementation. */
	505	(void) re_compile_fastmap (preg);
	506	else
	507	{
	508	/* Some error occurred while compiling the expression. */
	509	re_free (preg->fastmap);
	510	preg->fastmap = NULL;
	511	}
	512
	513	return (int) ret;
	514	}
	515	libc_hidden_def (__regcomp)
	516	weak_alias (__regcomp, regcomp)
	517
	518	/* Returns a message corresponding to an error code, ERRCODE, returned
	519	from either regcomp or regexec. We don't use PREG here. */
	520
	521	size_t
	522	regerror (int errcode, const regex_t __restrict preg, char __restrict errbuf,
	523	size_t errbuf_size)
	524	{
	525	const char *msg;
	526	size_t msg_size;
	527	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
	528
	529	if (__glibc_unlikely (errcode < 0 \|\| errcode >= nerrcodes))
	530	/* Only error codes returned by the rest of the code should be passed
	531	to this routine. If we are given anything else, or if other regex
	532	code generates an invalid error code, then the program has a bug.
	533	Dump core so we can fix it. */
	534	abort ();
	535
	536	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
	537
	538	msg_size = strlen (msg) + 1; /* Includes the null. */
	539
	540	if (__glibc_likely (errbuf_size != 0))
	541	{
	542	size_t cpy_size = msg_size;
	543	if (__glibc_unlikely (msg_size > errbuf_size))
	544	{
	545	cpy_size = errbuf_size - 1;
	546	errbuf[cpy_size] = '\0';
	547	}
	548	memcpy (errbuf, msg, cpy_size);
	549	}
	550
	551	return msg_size;
	552	}
	553	weak_alias (__regerror, regerror)
	554
	555
	#ifdef RE_ENABLE_I18N
	/* This static array is used for the map to single-byte characters when
	   UTF-8 is used.  Otherwise we would allocate memory just to initialize
	   it the same all the time.  UTF-8 is the preferred encoding so this is
	   a worthwhile optimization.  */
	static const bitset_t utf8_sb_map =
	{
	  /* Set the first 128 bits.  */
	# if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
	  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
	# else
	#  if 4 * BITSET_WORD_BITS < ASCII_CHARS
	#   error "bitset_word_t is narrower than 32 bits"
	#  elif 3 * BITSET_WORD_BITS < ASCII_CHARS
	  BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
	#  elif 2 * BITSET_WORD_BITS < ASCII_CHARS
	  BITSET_WORD_MAX, BITSET_WORD_MAX,
	#  elif 1 * BITSET_WORD_BITS < ASCII_CHARS
	  BITSET_WORD_MAX,
	#  endif
	  (BITSET_WORD_MAX
	   >> (SBC_MAX % BITSET_WORD_BITS == 0
	       ? 0
	       : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
	# endif
	};
	#endif
	583
	584
	585	static void
	586	free_dfa_content (re_dfa_t *dfa)
	587	{
	588	Idx i, j;
	589
	590	if (dfa->nodes)
	591	for (i = 0; i < dfa->nodes_len; ++i)
	592	free_token (dfa->nodes + i);
	593	re_free (dfa->nexts);
	594	for (i = 0; i < dfa->nodes_len; ++i)
	595	{
	596	if (dfa->eclosures != NULL)
	597	re_node_set_free (dfa->eclosures + i);
	598	if (dfa->inveclosures != NULL)
	599	re_node_set_free (dfa->inveclosures + i);
	600	if (dfa->edests != NULL)
	601	re_node_set_free (dfa->edests + i);
	602	}
	603	re_free (dfa->edests);
	604	re_free (dfa->eclosures);
	605	re_free (dfa->inveclosures);
	606	re_free (dfa->nodes);
	607
	608	if (dfa->state_table)
	609	for (i = 0; i <= dfa->state_hash_mask; ++i)
	610	{
	611	struct re_state_table_entry *entry = dfa->state_table + i;
	612	for (j = 0; j < entry->num; ++j)
	613	{
	614	re_dfastate_t *state = entry->array[j];
	615	free_state (state);
	616	}
	617	re_free (entry->array);
	618	}
	619	re_free (dfa->state_table);
	620	#ifdef RE_ENABLE_I18N
	621	if (dfa->sb_char != utf8_sb_map)
	622	re_free (dfa->sb_char);
	623	#endif
	624	re_free (dfa->subexp_map);
	625	#ifdef DEBUG
	626	re_free (dfa->re_str);
	627	#endif
	628
	629	re_free (dfa);
	630	}
	631
	632
	633	/* Free dynamically allocated space used by PREG. */
	634
	635	void
	636	regfree (regex_t *preg)
	637	{
	638	re_dfa_t *dfa = preg->buffer;
	639	if (__glibc_likely (dfa != NULL))
	640	{
	641	lock_fini (dfa->lock);
	642	free_dfa_content (dfa);
	643	}
	644	preg->buffer = NULL;
	645	preg->allocated = 0;
	646
	647	re_free (preg->fastmap);
	648	preg->fastmap = NULL;
	649
	650	re_free (preg->translate);
	651	preg->translate = NULL;
	652	}
	653	libc_hidden_def (__regfree)
	654	weak_alias (__regfree, regfree)
	655
	656
	/* Entry points compatible with 4.2 BSD regex library.  We don't define
	   them unless specifically requested.  */

	#if defined _REGEX_RE_COMP || defined _LIBC

	/* BSD has one and only one pattern buffer.  */
	static struct re_pattern_buffer re_comp_buf;

	/* Compile S into the single static pattern buffer using the current
	   re_syntax_options.  A null S keeps the previously compiled pattern.
	   Returns NULL on success, otherwise an error message string.  */
	char *
	# ifdef _LIBC
	/* Make these definitions weak in libc, so POSIX programs can redefine
	   these names if they don't use our functions, and still use
	   regcomp/regexec above without link errors.  */
	weak_function
	# endif
	re_comp (const char *s)
	{
	  reg_errcode_t ret;
	  char *fastmap;

	  if (!s)
	    {
	      if (!re_comp_buf.buffer)
	        return gettext ("No previous regular expression");
	      return 0;
	    }

	  if (re_comp_buf.buffer)
	    {
	      /* Detach the fastmap so that __regfree does not release it; it is
	         reused for the new pattern.  */
	      fastmap = re_comp_buf.fastmap;
	      re_comp_buf.fastmap = NULL;
	      __regfree (&re_comp_buf);
	      memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
	      re_comp_buf.fastmap = fastmap;
	    }

	  if (re_comp_buf.fastmap == NULL)
	    {
	      re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
	      if (re_comp_buf.fastmap == NULL)
	        return (char *) gettext (__re_error_msgid
	                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
	    }

	  /* Since 're_exec' always passes NULL for the 'regs' argument, we
	     don't need to initialize the pattern buffer fields which affect it.  */

	  /* Match anchors at newlines.  */
	  re_comp_buf.newline_anchor = 1;

	  ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);

	  if (!ret)
	    return NULL;

	  /* Yes, we're discarding 'const' here if !HAVE_LIBINTL.  */
	  return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
	}

	#ifdef _LIBC
	/* Release the static BSD pattern buffer.  */
	libc_freeres_fn (free_mem)
	{
	  __regfree (&re_comp_buf);
	}
	#endif

	#endif /* _REGEX_RE_COMP */
	724
	725
	726	/* Internal entry point.
	727	Compile the regular expression PATTERN, whose length is LENGTH.
	728	SYNTAX indicate regular expression's syntax. */
	729
	730	static reg_errcode_t
	731	re_compile_internal (regex_t preg, const char pattern, size_t length,
	732	reg_syntax_t syntax)
	733	{
	734	reg_errcode_t err = REG_NOERROR;
	735	re_dfa_t *dfa;
	736	re_string_t regexp;
	737
	738	/* Initialize the pattern buffer. */
	739	preg->fastmap_accurate = 0;
	740	preg->syntax = syntax;
	741	preg->not_bol = preg->not_eol = 0;
	742	preg->used = 0;
	743	preg->re_nsub = 0;
	744	preg->can_be_null = 0;
	745	preg->regs_allocated = REGS_UNALLOCATED;
	746
	747	/* Initialize the dfa. */
	748	dfa = preg->buffer;
	749	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
	750	{
	751	/* If zero allocated, but buffer is non-null, try to realloc
	752	enough space. This loses if buffer's address is bogus, but
	753	that is the user's responsibility. If ->buffer is NULL this
	754	is a simple allocation. */
	755	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
	756	if (dfa == NULL)
	757	return REG_ESPACE;
	758	preg->allocated = sizeof (re_dfa_t);
	759	preg->buffer = dfa;
	760	}
	761	preg->used = sizeof (re_dfa_t);
	762
	763	err = init_dfa (dfa, length);
	764	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
	765	err = REG_ESPACE;
	766	if (__glibc_unlikely (err != REG_NOERROR))
	767	{
	768	free_dfa_content (dfa);
	769	preg->buffer = NULL;
	770	preg->allocated = 0;
	771	return err;
	772	}
	773	#ifdef DEBUG
	774	/* Note: length+1 will not overflow since it is checked in init_dfa. */
	775	dfa->re_str = re_malloc (char, length + 1);
	776	strncpy (dfa->re_str, pattern, length + 1);
	777	#endif
	778
	779	err = re_string_construct (&regexp, pattern, length, preg->translate,
	780	(syntax & RE_ICASE) != 0, dfa);
	781	if (__glibc_unlikely (err != REG_NOERROR))
	782	{
	783	re_compile_internal_free_return:
	784	free_workarea_compile (preg);
	785	re_string_destruct (&regexp);
	786	lock_fini (dfa->lock);
	787	free_dfa_content (dfa);
	788	preg->buffer = NULL;
	789	preg->allocated = 0;
	790	return err;
	791	}
	792
	793	/* Parse the regular expression, and build a structure tree. */
	794	preg->re_nsub = 0;
	795	dfa->str_tree = parse (&regexp, preg, syntax, &err);
	796	if (__glibc_unlikely (dfa->str_tree == NULL))
	797	goto re_compile_internal_free_return;
	798
	799	/* Analyze the tree and create the nfa. */
	800	err = analyze (preg);
	801	if (__glibc_unlikely (err != REG_NOERROR))
	802	goto re_compile_internal_free_return;
	803
	804	#ifdef RE_ENABLE_I18N
	805	/* If possible, do searching in single byte encoding to speed things up. */
	806	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
	807	optimize_utf8 (dfa);
	808	#endif
	809
	810	/* Then create the initial state of the dfa. */
	811	err = create_initial_state (dfa);
	812
	813	/* Release work areas. */
	814	free_workarea_compile (preg);
	815	re_string_destruct (&regexp);
	816
	817	if (__glibc_unlikely (err != REG_NOERROR))
	818	{
	819	lock_fini (dfa->lock);
	820	free_dfa_content (dfa);
	821	preg->buffer = NULL;
	822	preg->allocated = 0;
	823	}
	824
	825	return err;
	826	}
	827
	828	/* Initialize DFA. We use the length of the regular expression PAT_LEN
	829	as the initial length of some arrays. */
	830
	831	static reg_errcode_t
	832	init_dfa (re_dfa_t *dfa, size_t pat_len)
	833	{
	834	__re_size_t table_size;
	835	#ifndef _LIBC
	836	const char *codeset_name;
	837	#endif
	838	#ifdef RE_ENABLE_I18N
	839	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
	840	#else
	841	size_t max_i18n_object_size = 0;
	842	#endif
	843	size_t max_object_size =
	844	MAX (sizeof (struct re_state_table_entry),
	845	MAX (sizeof (re_token_t),
	846	MAX (sizeof (re_node_set),
	847	MAX (sizeof (regmatch_t),
	848	max_i18n_object_size))));
	849
	850	memset (dfa, '\0', sizeof (re_dfa_t));
	851
	852	/* Force allocation of str_tree_storage the first time. */
	853	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	854
	855	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
	856	calculation below, and for similar doubling calculations
	857	elsewhere. And it's <= rather than <, because some of the
	858	doubling calculations add 1 afterwards. */
	859	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
	860	<= pat_len))
	861	return REG_ESPACE;
	862
	863	dfa->nodes_alloc = pat_len + 1;
	864	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
	865
	866	/* table_size = 2 ^ ceil(log pat_len) */
	867	for (table_size = 1; ; table_size <<= 1)
	868	if (table_size > pat_len)
	869	break;
	870
	871	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
	872	dfa->state_hash_mask = table_size - 1;
	873
	874	dfa->mb_cur_max = MB_CUR_MAX;
	875	#ifdef _LIBC
	876	if (dfa->mb_cur_max == 6
[3532]	877	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
	878	dfa->is_utf8 = 1;
[3548]	879	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
[3532]	880	!= 0);
[3529]	881	#else
	882	# ifdef _MSC_VER
	883	(void)codeset_name;
	884	if (get_crt_codepage() == CP_UTF8)
	885	# else
[3532]	886	codeset_name = nl_langinfo (CODESET);
[3529]	887	if ((codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
	888	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
	889	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
	890	&& strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
	891	# endif
	892	dfa->is_utf8 = 1;
	893
	894	/* We check exhaustively in the loop below if this charset is a
	895	superset of ASCII. */
	896	dfa->map_notascii = 0;
	897	#endif
	898
	899	#ifdef RE_ENABLE_I18N
	900	if (dfa->mb_cur_max > 1)
	901	{
	902	if (dfa->is_utf8)
	903	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
	904	else
	905	{
	906	int i, j, ch;
	907
	908	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	909	if (__glibc_unlikely (dfa->sb_char == NULL))
	910	return REG_ESPACE;
	911
	912	/* Set the bits corresponding to single byte chars. */
	913	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
	914	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	915	{
	916	wint_t wch = __btowc (ch);
	917	if (wch != WEOF)
	918	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
	919	# ifndef _LIBC
	920	if (isascii (ch) && wch != ch)
	921	dfa->map_notascii = 1;
	922	# endif
	923	}
	924	}
	925	}
	926	#endif
	927
	928	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
	929	return REG_ESPACE;
	930	return REG_NOERROR;
	931	}
	932
	933	/* Initialize WORD_CHAR table, which indicate which character is
	934	"word". In this case "word" means that it is the word construction
	935	character used by some operators like "\<", "\>", etc. */
	936
	937	static void
	938	init_word_char (re_dfa_t *dfa)
	939	{
	940	int i = 0;
	941	int j;
	942	int ch = 0;
	943	dfa->word_ops_used = 1;
	944	if (__glibc_likely (dfa->map_notascii == 0))
	945	{
	946	/* Avoid uint32_t and uint64_t as some non-GCC platforms lack
	947	them, an issue when this code is used in Gnulib. */
	948	bitset_word_t bits0 = 0x00000000;
	949	bitset_word_t bits1 = 0x03ff0000;
	950	bitset_word_t bits2 = 0x87fffffe;
	951	bitset_word_t bits3 = 0x07fffffe;
	952	if (BITSET_WORD_BITS == 64)
	953	{
	954	/* Pacify gcc -Woverflow on 32-bit platformns. */
	955	dfa->word_char[0] = bits1 << 31 << 1 \| bits0;
	956	dfa->word_char[1] = bits3 << 31 << 1 \| bits2;
	957	i = 2;
	958	}
	959	else if (BITSET_WORD_BITS == 32)
	960	{
	961	dfa->word_char[0] = bits0;
	962	dfa->word_char[1] = bits1;
	963	dfa->word_char[2] = bits2;
	964	dfa->word_char[3] = bits3;
	965	i = 4;
	966	}
	967	else
	968	goto general_case;
	969	ch = 128;
	970
	971	if (__glibc_likely (dfa->is_utf8))
	972	{
	973	memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
	974	return;
	975	}
	976	}
	977
	978	general_case:
	979	for (; i < BITSET_WORDS; ++i)
	980	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	981	if (isalnum (ch) \|\| ch == '_')
	982	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
	983	}
	984
	985	/* Free the work area which are only used while compiling. */
	986
	987	static void
	988	free_workarea_compile (regex_t *preg)
	989	{
	990	re_dfa_t *dfa = preg->buffer;
	991	bin_tree_storage_t storage, next;
	992	for (storage = dfa->str_tree_storage; storage; storage = next)
	993	{
	994	next = storage->next;
	995	re_free (storage);
	996	}
	997	dfa->str_tree_storage = NULL;
	998	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	999	dfa->str_tree = NULL;
	1000	re_free (dfa->org_indices);
	1001	dfa->org_indices = NULL;
	1002	}
	1003
	1004	/* Create initial states for all contexts. */
	1005
	1006	static reg_errcode_t
	1007	create_initial_state (re_dfa_t *dfa)
	1008	{
	1009	Idx first, i;
	1010	reg_errcode_t err;
	1011	re_node_set init_nodes;
	1012
	1013	/* Initial states have the epsilon closure of the node which is
	1014	the first node of the regular expression. */
	1015	first = dfa->str_tree->first->node_idx;
	1016	dfa->init_node = first;
	1017	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
	1018	if (__glibc_unlikely (err != REG_NOERROR))
	1019	return err;
	1020
	1021	/* The back-references which are in initial states can epsilon transit,
	1022	since in this case all of the subexpressions can be null.
	1023	Then we add epsilon closures of the nodes which are the next nodes of
	1024	the back-references. */
	1025	if (dfa->nbackref > 0)
	1026	for (i = 0; i < init_nodes.nelem; ++i)
	1027	{
	1028	Idx node_idx = init_nodes.elems[i];
	1029	re_token_type_t type = dfa->nodes[node_idx].type;
	1030
	1031	Idx clexp_idx;
	1032	if (type != OP_BACK_REF)
	1033	continue;
	1034	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
	1035	{
	1036	re_token_t *clexp_node;
	1037	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
	1038	if (clexp_node->type == OP_CLOSE_SUBEXP
	1039	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
	1040	break;
	1041	}
	1042	if (clexp_idx == init_nodes.nelem)
	1043	continue;
	1044
	1045	if (type == OP_BACK_REF)
	1046	{
	1047	Idx dest_idx = dfa->edests[node_idx].elems[0];
	1048	if (!re_node_set_contains (&init_nodes, dest_idx))
	1049	{
	1050	reg_errcode_t merge_err
	1051	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
	1052	if (merge_err != REG_NOERROR)
	1053	return merge_err;
	1054	i = 0;
	1055	}
	1056	}
	1057	}
	1058
	1059	/* It must be the first time to invoke acquire_state. */
	1060	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
	1061	/* We don't check ERR here, since the initial state must not be NULL. */
	1062	if (__glibc_unlikely (dfa->init_state == NULL))
	1063	return err;
	1064	if (dfa->init_state->has_constraint)
	1065	{
	1066	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
	1067	CONTEXT_WORD);
	1068	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
	1069	CONTEXT_NEWLINE);
	1070	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
	1071	&init_nodes,
	1072	CONTEXT_NEWLINE
	1073	\| CONTEXT_BEGBUF);
	1074	if (__glibc_unlikely (dfa->init_state_word == NULL
	1075	\|\| dfa->init_state_nl == NULL
	1076	\|\| dfa->init_state_begbuf == NULL))
	1077	return err;
	1078	}
	1079	else
	1080	dfa->init_state_word = dfa->init_state_nl
	1081	= dfa->init_state_begbuf = dfa->init_state;
	1082
	1083	re_node_set_free (&init_nodes);
	1084	return REG_NOERROR;
	1085	}
	1086
	1087
	#ifdef RE_ENABLE_I18N
	/* Decide whether matching can be done byte-by-byte instead of in UTF-8
	   mode, which is faster.  If every node allows it, set DFA->mb_cur_max
	   to 1, clear DFA->is_utf8 and adjust the nodes that need it; otherwise
	   leave DFA untouched.  */

	static void
	optimize_utf8 (re_dfa_t *dfa)
	{
	  Idx n;
	  bool saw_non_ascii = false;
	  bool saw_period = false;

	  /* First pass: bail out on anything that rules the optimization out.  */
	  for (n = 0; n < dfa->nodes_len; ++n)
	    {
	      const re_token_t *tok = dfa->nodes + n;

	      switch (tok->type)
	        {
	        case CHARACTER:
	          if (tok->opr.c >= ASCII_CHARS)
	            saw_non_ascii = true;
	          break;

	        case ANCHOR:
	          /* Only line and buffer anchors are acceptable; word anchors
	             etc. cannot be handled.  It's okay to test opr.ctx_type
	             since constraints (for all DFA nodes) are created by ORing
	             one or more opr.ctx_type values.  */
	          if (tok->opr.ctx_type != LINE_FIRST
	              && tok->opr.ctx_type != LINE_LAST
	              && tok->opr.ctx_type != BUF_FIRST
	              && tok->opr.ctx_type != BUF_LAST)
	            return;
	          break;

	        case OP_PERIOD:
	          saw_period = true;
	          break;

	        case OP_BACK_REF:
	        case OP_ALT:
	        case END_OF_RE:
	        case OP_DUP_ASTERISK:
	        case OP_OPEN_SUBEXP:
	        case OP_CLOSE_SUBEXP:
	          break;

	        case COMPLEX_BRACKET:
	          return;

	        case SIMPLE_BRACKET:
	          /* Just double check: give up if the set holds any member at or
	             above ASCII_CHARS.  */
	          {
	            int w = ASCII_CHARS / BITSET_WORD_BITS;
	            int skip = (ASCII_CHARS % BITSET_WORD_BITS == 0
	                        ? 0
	                        : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);

	            /* Only the first inspected word is shifted.  */
	            for (; w < BITSET_WORDS; ++w, skip = 0)
	              if (tok->opr.sbcset[w] >> skip != 0)
	                return;
	          }
	          break;

	        default:
	          abort ();
	        }
	    }

	  /* Second pass: rewrite the nodes that behave differently once the
	     search runs in single-byte mode.  */
	  if (saw_non_ascii || saw_period)
	    for (n = 0; n < dfa->nodes_len; ++n)
	      {
	        re_token_t *tok = dfa->nodes + n;

	        if (tok->type == CHARACTER && tok->opr.c >= ASCII_CHARS)
	          tok->mb_partial = 0;
	        else if (tok->type == OP_PERIOD)
	          tok->type = OP_UTF8_PERIOD;
	      }

	  /* The search can be in single byte locale.  */
	  dfa->mb_cur_max = 1;
	  dfa->is_utf8 = 0;
	  dfa->has_mb_node = dfa->nbackref > 0 || saw_period;
	}
	#endif
	1169
	1170
	1171	/* Analyze the structure tree, and calculate "first", "next", "edest",
	1172	"eclosure", and "inveclosure". */
	1173
	1174	static reg_errcode_t
	1175	analyze (regex_t *preg)
	1176	{
	1177	re_dfa_t *dfa = preg->buffer;
	1178	reg_errcode_t ret;
	1179
	1180	/* Allocate arrays. */
	1181	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
	1182	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
	1183	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
	1184	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
	1185	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
	1186	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
	1187	return REG_ESPACE;
	1188
	1189	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
	1190	if (dfa->subexp_map != NULL)
	1191	{
	1192	Idx i;
	1193	for (i = 0; i < preg->re_nsub; i++)
	1194	dfa->subexp_map[i] = i;
	1195	preorder (dfa->str_tree, optimize_subexps, dfa);
	1196	for (i = 0; i < preg->re_nsub; i++)
	1197	if (dfa->subexp_map[i] != i)
	1198	break;
	1199	if (i == preg->re_nsub)
	1200	{
	1201	re_free (dfa->subexp_map);
	1202	dfa->subexp_map = NULL;
	1203	}
	1204	}
	1205
	1206	ret = postorder (dfa->str_tree, lower_subexps, preg);
	1207	if (__glibc_unlikely (ret != REG_NOERROR))
	1208	return ret;
	1209	ret = postorder (dfa->str_tree, calc_first, dfa);
	1210	if (__glibc_unlikely (ret != REG_NOERROR))
	1211	return ret;
	1212	preorder (dfa->str_tree, calc_next, dfa);
	1213	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
	1214	if (__glibc_unlikely (ret != REG_NOERROR))
	1215	return ret;
	1216	ret = calc_eclosure (dfa);
	1217	if (__glibc_unlikely (ret != REG_NOERROR))
	1218	return ret;
	1219
	1220	/* We only need this during the prune_impossible_nodes pass in regexec.c;
	1221	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
	1222	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
	1223	\|\| dfa->nbackref)
	1224	{
	1225	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
	1226	if (__glibc_unlikely (dfa->inveclosures == NULL))
	1227	return REG_ESPACE;
	1228	ret = calc_inveclosure (dfa);
	1229	}
	1230
	1231	return ret;
	1232	}
	1233
	1234	/* Our parse trees are very unbalanced, so we cannot use a stack to
	1235	implement parse tree visits. Instead, we use parent pointers and
	1236	some hairy code in these two functions. */
	1237	static reg_errcode_t
	1238	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
	1239	void *extra)
	1240	{
	1241	bin_tree_t node, prev;
	1242
	1243	for (node = root; ; )
	1244	{
	1245	/* Descend down the tree, preferably to the left (or to the right
	1246	if that's the only child). */
	1247	while (node->left \|\| node->right)
	1248	if (node->left)
	1249	node = node->left;
	1250	else
	1251	node = node->right;
	1252
	1253	do
	1254	{
	1255	reg_errcode_t err = fn (extra, node);
	1256	if (__glibc_unlikely (err != REG_NOERROR))
	1257	return err;
	1258	if (node->parent == NULL)
	1259	return REG_NOERROR;
	1260	prev = node;
	1261	node = node->parent;
	1262	}
	1263	/* Go up while we have a node that is reached from the right. */
	1264	while (node->right == prev \|\| node->right == NULL);
	1265	node = node->right;
	1266	}
	1267	}
	1268
	1269	static reg_errcode_t
	1270	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
	1271	void *extra)
	1272	{
	1273	bin_tree_t *node;
	1274
	1275	for (node = root; ; )
	1276	{
	1277	reg_errcode_t err = fn (extra, node);
	1278	if (__glibc_unlikely (err != REG_NOERROR))
	1279	return err;
	1280
	1281	/* Go to the left node, or up and to the right. */
	1282	if (node->left)
	1283	node = node->left;
	1284	else
	1285	{
	1286	bin_tree_t *prev = NULL;
	1287	while (node->right == prev \|\| node->right == NULL)
	1288	{
	1289	prev = node;
	1290	node = node->parent;
	1291	if (!node)
	1292	return REG_NOERROR;
	1293	}
	1294	node = node->right;
	1295	}
	1296	}
	1297	}
	1298
	1299	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
	1300	re_search_internal to map the inner one's opr.idx to this one's. Adjust
	1301	backreferences as well. Requires a preorder visit. */
	1302	static reg_errcode_t
	1303	optimize_subexps (void extra, bin_tree_t node)
	1304	{
	1305	re_dfa_t dfa = (re_dfa_t ) extra;
	1306
	1307	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
	1308	{
	1309	int idx = node->token.opr.idx;
	1310	node->token.opr.idx = dfa->subexp_map[idx];
	1311	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
	1312	}
	1313
	1314	else if (node->token.type == SUBEXP
	1315	&& node->left && node->left->token.type == SUBEXP)
	1316	{
	1317	Idx other_idx = node->left->token.opr.idx;
	1318
	1319	node->left = node->left->left;
	1320	if (node->left)
	1321	node->left->parent = node;
	1322
	1323	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
	1324	if (other_idx < BITSET_WORD_BITS)
	1325	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
	1326	}
	1327
	1328	return REG_NOERROR;
	1329	}
	1330
	1331	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
	1332	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
	1333	static reg_errcode_t
	1334	lower_subexps (void extra, bin_tree_t node)
	1335	{
	1336	regex_t preg = (regex_t ) extra;
	1337	reg_errcode_t err = REG_NOERROR;
	1338
	1339	if (node->left && node->left->token.type == SUBEXP)
	1340	{
	1341	node->left = lower_subexp (&err, preg, node->left);
	1342	if (node->left)
	1343	node->left->parent = node;
	1344	}
	1345	if (node->right && node->right->token.type == SUBEXP)
	1346	{
	1347	node->right = lower_subexp (&err, preg, node->right);
	1348	if (node->right)
	1349	node->right->parent = node;
	1350	}
	1351
	1352	return err;
	1353	}
	1354
	1355	static bin_tree_t *
	1356	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
	1357	{
	1358	re_dfa_t *dfa = preg->buffer;
	1359	bin_tree_t *body = node->left;
	1360	bin_tree_t op, cls, tree1, tree;
	1361
	1362	if (preg->no_sub
	1363	/* We do not optimize empty subexpressions, because otherwise we may
	1364	have bad CONCAT nodes with NULL children. This is obviously not
	1365	very common, so we do not lose much. An example that triggers
	1366	this case is the sed "script" //x. */
	1367	&& node->left != NULL
	1368	&& (node->token.opr.idx >= BITSET_WORD_BITS
	1369	\|\| !(dfa->used_bkref_map
	1370	& ((bitset_word_t) 1 << node->token.opr.idx))))
	1371	return node->left;
	1372
	1373	/* Convert the SUBEXP node to the concatenation of an
	1374	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
	1375	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
	1376	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
	1377	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
	1378	tree = create_tree (dfa, op, tree1, CONCAT);
	1379	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
	1380	\|\| op == NULL \|\| cls == NULL))
	1381	{
	1382	*err = REG_ESPACE;
	1383	return NULL;
	1384	}
	1385
	1386	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
	1387	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
	1388	return tree;
	1389	}
	1390
	1391	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
	1392	nodes. Requires a postorder visit. */
	1393	static reg_errcode_t
	1394	calc_first (void extra, bin_tree_t node)
	1395	{
	1396	re_dfa_t dfa = (re_dfa_t ) extra;
	1397	if (node->token.type == CONCAT)
	1398	{
	1399	node->first = node->left->first;
	1400	node->node_idx = node->left->node_idx;
	1401	}
	1402	else
	1403	{
	1404	node->first = node;
	1405	node->node_idx = re_dfa_add_node (dfa, node->token);
	1406	if (__glibc_unlikely (node->node_idx == -1))
	1407	return REG_ESPACE;
	1408	if (node->token.type == ANCHOR)
	1409	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
	1410	}
	1411	return REG_NOERROR;
	1412	}
	1413
	1414	/* Pass 2: compute NEXT on the tree. Preorder visit. */
	1415	static reg_errcode_t
	1416	calc_next (void extra, bin_tree_t node)
	1417	{
	1418	switch (node->token.type)
	1419	{
	1420	case OP_DUP_ASTERISK:
	1421	node->left->next = node;
	1422	break;
	1423	case CONCAT:
	1424	node->left->next = node->right->first;
	1425	node->right->next = node->next;
	1426	break;
	1427	default:
	1428	if (node->left)
	1429	node->left->next = node->next;
	1430	if (node->right)
	1431	node->right->next = node->next;
	1432	break;
	1433	}
	1434	return REG_NOERROR;
	1435	}
	1436
	1437	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
	1438	static reg_errcode_t
	1439	link_nfa_nodes (void extra, bin_tree_t node)
	1440	{
	1441	re_dfa_t dfa = (re_dfa_t ) extra;
	1442	Idx idx = node->node_idx;
	1443	reg_errcode_t err = REG_NOERROR;
	1444
	1445	switch (node->token.type)
	1446	{
	1447	case CONCAT:
	1448	break;
	1449
	1450	case END_OF_RE:
	1451	DEBUG_ASSERT (node->next == NULL);
	1452	break;
	1453
	1454	case OP_DUP_ASTERISK:
	1455	case OP_ALT:
	1456	{
	1457	Idx left, right;
	1458	dfa->has_plural_match = 1;
	1459	if (node->left != NULL)
	1460	left = node->left->first->node_idx;
	1461	else
	1462	left = node->next->node_idx;
	1463	if (node->right != NULL)
	1464	right = node->right->first->node_idx;
	1465	else
	1466	right = node->next->node_idx;
	1467	DEBUG_ASSERT (left > -1);
	1468	DEBUG_ASSERT (right > -1);
	1469	err = re_node_set_init_2 (dfa->edests + idx, left, right);
	1470	}
	1471	break;
	1472
	1473	case ANCHOR:
	1474	case OP_OPEN_SUBEXP:
	1475	case OP_CLOSE_SUBEXP:
	1476	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
	1477	break;
	1478
	1479	case OP_BACK_REF:
	1480	dfa->nexts[idx] = node->next->node_idx;
	1481	if (node->token.type == OP_BACK_REF)
	1482	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
	1483	break;
	1484
	1485	default:
	1486	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
	1487	dfa->nexts[idx] = node->next->node_idx;
	1488	break;
	1489	}
	1490
	1491	return err;
	1492	}
	1493
	1494	/* Duplicate the epsilon closure of the node ROOT_NODE.
	1495	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
	1496	to their own constraint. */
	1497
	1498	static reg_errcode_t
	1499	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
	1500	Idx root_node, unsigned int init_constraint)
	1501	{
	1502	Idx org_node, clone_node;
	1503	bool ok;
	1504	unsigned int constraint = init_constraint;
	1505	for (org_node = top_org_node, clone_node = top_clone_node;;)
	1506	{
	1507	Idx org_dest, clone_dest;
	1508	if (dfa->nodes[org_node].type == OP_BACK_REF)
	1509	{
	1510	/* If the back reference epsilon-transit, its destination must
	1511	also have the constraint. Then duplicate the epsilon closure
	1512	of the destination of the back reference, and store it in
	1513	edests of the back reference. */
	1514	org_dest = dfa->nexts[org_node];
	1515	re_node_set_empty (dfa->edests + clone_node);
	1516	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1517	if (__glibc_unlikely (clone_dest == -1))
	1518	return REG_ESPACE;
	1519	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1520	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1521	if (__glibc_unlikely (! ok))
	1522	return REG_ESPACE;
	1523	}
	1524	else if (dfa->edests[org_node].nelem == 0)
	1525	{
	1526	/* In case of the node can't epsilon-transit, don't duplicate the
	1527	destination and store the original destination as the
	1528	destination of the node. */
	1529	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1530	break;
	1531	}
	1532	else if (dfa->edests[org_node].nelem == 1)
	1533	{
	1534	/* In case of the node can epsilon-transit, and it has only one
	1535	destination. */
	1536	org_dest = dfa->edests[org_node].elems[0];
	1537	re_node_set_empty (dfa->edests + clone_node);
	1538	/* If the node is root_node itself, it means the epsilon closure
	1539	has a loop. Then tie it to the destination of the root_node. */
	1540	if (org_node == root_node && clone_node != org_node)
	1541	{
	1542	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
	1543	if (__glibc_unlikely (! ok))
	1544	return REG_ESPACE;
	1545	break;
	1546	}
	1547	/* In case the node has another constraint, append it. */
	1548	constraint \|= dfa->nodes[org_node].constraint;
	1549	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1550	if (__glibc_unlikely (clone_dest == -1))
	1551	return REG_ESPACE;
	1552	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1553	if (__glibc_unlikely (! ok))
	1554	return REG_ESPACE;
	1555	}
	1556	else /* dfa->edests[org_node].nelem == 2 */
	1557	{
	1558	/* In case of the node can epsilon-transit, and it has two
	1559	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
	1560	org_dest = dfa->edests[org_node].elems[0];
	1561	re_node_set_empty (dfa->edests + clone_node);
	1562	/* Search for a duplicated node which satisfies the constraint. */
	1563	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
	1564	if (clone_dest == -1)
	1565	{
	1566	/* There is no such duplicated node, create a new one. */
	1567	reg_errcode_t err;
	1568	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1569	if (__glibc_unlikely (clone_dest == -1))
	1570	return REG_ESPACE;
	1571	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1572	if (__glibc_unlikely (! ok))
	1573	return REG_ESPACE;
	1574	err = duplicate_node_closure (dfa, org_dest, clone_dest,
	1575	root_node, constraint);
	1576	if (__glibc_unlikely (err != REG_NOERROR))
	1577	return err;
	1578	}
	1579	else
	1580	{
	1581	/* There is a duplicated node which satisfies the constraint,
	1582	use it to avoid infinite loop. */
	1583	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1584	if (__glibc_unlikely (! ok))
	1585	return REG_ESPACE;
	1586	}
	1587
	1588	org_dest = dfa->edests[org_node].elems[1];
	1589	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1590	if (__glibc_unlikely (clone_dest == -1))
	1591	return REG_ESPACE;
	1592	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1593	if (__glibc_unlikely (! ok))
	1594	return REG_ESPACE;
	1595	}
	1596	org_node = org_dest;
	1597	clone_node = clone_dest;
	1598	}
	1599	return REG_NOERROR;
	1600	}
	1601
	1602	/* Search for a node which is duplicated from the node ORG_NODE, and
	1603	satisfies the constraint CONSTRAINT. */
	1604
	1605	static Idx
	1606	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
	1607	unsigned int constraint)
	1608	{
	1609	Idx idx;
	1610	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
	1611	{
	1612	if (org_node == dfa->org_indices[idx]
	1613	&& constraint == dfa->nodes[idx].constraint)
	1614	return idx; /* Found. */
	1615	}
	1616	return -1; /* Not found. */
	1617	}
	1618
	1619	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
	1620	Return the index of the new node, or -1 if insufficient storage is
	1621	available. */
	1622
	1623	static Idx
	1624	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
	1625	{
	1626	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
	1627	if (__glibc_likely (dup_idx != -1))
	1628	{
	1629	dfa->nodes[dup_idx].constraint = constraint;
	1630	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
	1631	dfa->nodes[dup_idx].duplicated = 1;
	1632
	1633	/* Store the index of the original node. */
	1634	dfa->org_indices[dup_idx] = org_idx;
	1635	}
	1636	return dup_idx;
	1637	}
	1638
	1639	static reg_errcode_t
	1640	calc_inveclosure (re_dfa_t *dfa)
	1641	{
	1642	Idx src, idx;
	1643	bool ok;
	1644	for (idx = 0; idx < dfa->nodes_len; ++idx)
	1645	re_node_set_init_empty (dfa->inveclosures + idx);
	1646
	1647	for (src = 0; src < dfa->nodes_len; ++src)
	1648	{
	1649	Idx *elems = dfa->eclosures[src].elems;
	1650	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
	1651	{
	1652	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
	1653	if (__glibc_unlikely (! ok))
	1654	return REG_ESPACE;
	1655	}
	1656	}
	1657
	1658	return REG_NOERROR;
	1659	}
	1660
	1661	/* Calculate "eclosure" for all the node in DFA. */
	1662
	1663	static reg_errcode_t
	1664	calc_eclosure (re_dfa_t *dfa)
	1665	{
	1666	Idx node_idx;
	1667	bool incomplete;
	1668	DEBUG_ASSERT (dfa->nodes_len > 0);
	1669	incomplete = false;
	1670	/* For each nodes, calculate epsilon closure. */
	1671	for (node_idx = 0; ; ++node_idx)
	1672	{
	1673	reg_errcode_t err;
	1674	re_node_set eclosure_elem;
	1675	if (node_idx == dfa->nodes_len)
	1676	{
	1677	if (!incomplete)
	1678	break;
	1679	incomplete = false;
	1680	node_idx = 0;
	1681	}
	1682
	1683	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
	1684
	1685	/* If we have already calculated, skip it. */
	1686	if (dfa->eclosures[node_idx].nelem != 0)
	1687	continue;
	1688	/* Calculate epsilon closure of 'node_idx'. */
	1689	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
	1690	if (__glibc_unlikely (err != REG_NOERROR))
	1691	return err;
	1692
	1693	if (dfa->eclosures[node_idx].nelem == 0)
	1694	{
	1695	incomplete = true;
	1696	re_node_set_free (&eclosure_elem);
	1697	}
	1698	}
	1699	return REG_NOERROR;
	1700	}
	1701
	1702	/* Calculate epsilon closure of NODE. */
	1703
	1704	static reg_errcode_t
	1705	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
	1706	{
	1707	reg_errcode_t err;
	1708	Idx i;
	1709	re_node_set eclosure;
	1710	bool incomplete = false;
	1711	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
	1712	if (__glibc_unlikely (err != REG_NOERROR))
	1713	return err;
	1714
	1715	/* An epsilon closure includes itself. */
	1716	eclosure.elems[eclosure.nelem++] = node;
	1717
	1718	/* This indicates that we are calculating this node now.
	1719	We reference this value to avoid infinite loop. */
	1720	dfa->eclosures[node].nelem = -1;
	1721
	1722	/* If the current node has constraints, duplicate all nodes
	1723	since they must inherit the constraints. */
	1724	if (dfa->nodes[node].constraint
	1725	&& dfa->edests[node].nelem
	1726	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
	1727	{
	1728	err = duplicate_node_closure (dfa, node, node, node,
	1729	dfa->nodes[node].constraint);
	1730	if (__glibc_unlikely (err != REG_NOERROR))
	1731	return err;
	1732	}
	1733
	1734	/* Expand each epsilon destination nodes. */
	1735	if (IS_EPSILON_NODE(dfa->nodes[node].type))
	1736	for (i = 0; i < dfa->edests[node].nelem; ++i)
	1737	{
	1738	re_node_set eclosure_elem;
	1739	Idx edest = dfa->edests[node].elems[i];
	1740	/* If calculating the epsilon closure of 'edest' is in progress,
	1741	return intermediate result. */
	1742	if (dfa->eclosures[edest].nelem == -1)
	1743	{
	1744	incomplete = true;
	1745	continue;
	1746	}
	1747	/* If we haven't calculated the epsilon closure of 'edest' yet,
	1748	calculate now. Otherwise use calculated epsilon closure. */
	1749	if (dfa->eclosures[edest].nelem == 0)
	1750	{
	1751	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
	1752	if (__glibc_unlikely (err != REG_NOERROR))
	1753	return err;
	1754	}
	1755	else
	1756	eclosure_elem = dfa->eclosures[edest];
	1757	/* Merge the epsilon closure of 'edest'. */
	1758	err = re_node_set_merge (&eclosure, &eclosure_elem);
	1759	if (__glibc_unlikely (err != REG_NOERROR))
	1760	return err;
	1761	/* If the epsilon closure of 'edest' is incomplete,
	1762	the epsilon closure of this node is also incomplete. */
	1763	if (dfa->eclosures[edest].nelem == 0)
	1764	{
	1765	incomplete = true;
	1766	re_node_set_free (&eclosure_elem);
	1767	}
	1768	}
	1769
	1770	if (incomplete && !root)
	1771	dfa->eclosures[node].nelem = 0;
	1772	else
	1773	dfa->eclosures[node] = eclosure;
	1774	*new_set = eclosure;
	1775	return REG_NOERROR;
	1776	}
	1777
	1778
	1779	/* Functions for token which are used in the parser. */
	1780
	1781	/* Fetch a token from INPUT.
	1782	We must not use this function inside bracket expressions. */
	1783
	1784	static void
	1785	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
	1786	{
	1787	re_string_skip_bytes (input, peek_token (result, input, syntax));
	1788	}
	1789
	1790	/* Peek a token from INPUT, and return the length of the token.
	1791	We must not use this function inside bracket expressions. */
	1792
	1793	static int
	1794	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
	1795	{
	1796	unsigned char c;
	1797
	1798	if (re_string_eoi (input))
	1799	{
	1800	token->type = END_OF_RE;
	1801	return 0;
	1802	}
	1803
	1804	c = re_string_peek_byte (input, 0);
	1805	token->opr.c = c;
	1806
	1807	token->word_char = 0;
	1808	#ifdef RE_ENABLE_I18N
	1809	token->mb_partial = 0;
	1810	if (input->mb_cur_max > 1
	1811	&& !re_string_first_byte (input, re_string_cur_idx (input)))
	1812	{
	1813	token->type = CHARACTER;
	1814	token->mb_partial = 1;
	1815	return 1;
	1816	}
	1817	#endif
	1818	if (c == '\\')
	1819	{
	1820	unsigned char c2;
	1821	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
	1822	{
	1823	token->type = BACK_SLASH;
	1824	return 1;
	1825	}
	1826
	1827	c2 = re_string_peek_byte_case (input, 1);
	1828	token->opr.c = c2;
	1829	token->type = CHARACTER;
	1830	#ifdef RE_ENABLE_I18N
	1831	if (input->mb_cur_max > 1)
	1832	{
	1833	wint_t wc = re_string_wchar_at (input,
	1834	re_string_cur_idx (input) + 1);
	1835	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1836	}
	1837	else
	1838	#endif
	1839	token->word_char = IS_WORD_CHAR (c2) != 0;
	1840
	1841	switch (c2)
	1842	{
	1843	case '\|':
	1844	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
	1845	token->type = OP_ALT;
	1846	break;
	1847	case '1': case '2': case '3': case '4': case '5':
	1848	case '6': case '7': case '8': case '9':
	1849	if (!(syntax & RE_NO_BK_REFS))
	1850	{
	1851	token->type = OP_BACK_REF;
	1852	token->opr.idx = c2 - '1';
	1853	}
	1854	break;
	1855	case '<':
	1856	if (!(syntax & RE_NO_GNU_OPS))
	1857	{
	1858	token->type = ANCHOR;
	1859	token->opr.ctx_type = WORD_FIRST;
	1860	}
	1861	break;
	1862	case '>':
	1863	if (!(syntax & RE_NO_GNU_OPS))
	1864	{
	1865	token->type = ANCHOR;
	1866	token->opr.ctx_type = WORD_LAST;
	1867	}
	1868	break;
	1869	case 'b':
	1870	if (!(syntax & RE_NO_GNU_OPS))
	1871	{
	1872	token->type = ANCHOR;
	1873	token->opr.ctx_type = WORD_DELIM;
	1874	}
	1875	break;
	1876	case 'B':
	1877	if (!(syntax & RE_NO_GNU_OPS))
	1878	{
	1879	token->type = ANCHOR;
	1880	token->opr.ctx_type = NOT_WORD_DELIM;
	1881	}
	1882	break;
	1883	case 'w':
	1884	if (!(syntax & RE_NO_GNU_OPS))
	1885	token->type = OP_WORD;
	1886	break;
	1887	case 'W':
	1888	if (!(syntax & RE_NO_GNU_OPS))
	1889	token->type = OP_NOTWORD;
	1890	break;
	1891	case 's':
	1892	if (!(syntax & RE_NO_GNU_OPS))
	1893	token->type = OP_SPACE;
	1894	break;
	1895	case 'S':
	1896	if (!(syntax & RE_NO_GNU_OPS))
	1897	token->type = OP_NOTSPACE;
	1898	break;
	1899	case '`':
	1900	if (!(syntax & RE_NO_GNU_OPS))
	1901	{
	1902	token->type = ANCHOR;
	1903	token->opr.ctx_type = BUF_FIRST;
	1904	}
	1905	break;
	1906	case '\'':
	1907	if (!(syntax & RE_NO_GNU_OPS))
	1908	{
	1909	token->type = ANCHOR;
	1910	token->opr.ctx_type = BUF_LAST;
	1911	}
	1912	break;
	1913	case '(':
	1914	if (!(syntax & RE_NO_BK_PARENS))
	1915	token->type = OP_OPEN_SUBEXP;
	1916	break;
	1917	case ')':
	1918	if (!(syntax & RE_NO_BK_PARENS))
	1919	token->type = OP_CLOSE_SUBEXP;
	1920	break;
	1921	case '+':
	1922	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1923	token->type = OP_DUP_PLUS;
	1924	break;
	1925	case '?':
	1926	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1927	token->type = OP_DUP_QUESTION;
	1928	break;
	1929	case '{':
	1930	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1931	token->type = OP_OPEN_DUP_NUM;
	1932	break;
	1933	case '}':
	1934	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1935	token->type = OP_CLOSE_DUP_NUM;
	1936	break;
	1937	default:
	1938	break;
	1939	}
	1940	return 2;
	1941	}
	1942
	1943	token->type = CHARACTER;
	1944	#ifdef RE_ENABLE_I18N
	1945	if (input->mb_cur_max > 1)
	1946	{
	1947	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
	1948	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1949	}
	1950	else
	1951	#endif
	1952	token->word_char = IS_WORD_CHAR (token->opr.c);
	1953
	1954	switch (c)
	1955	{
	1956	case '\n':
	1957	if (syntax & RE_NEWLINE_ALT)
	1958	token->type = OP_ALT;
	1959	break;
	1960	case '\|':
	1961	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
	1962	token->type = OP_ALT;
	1963	break;
	1964	case '*':
	1965	token->type = OP_DUP_ASTERISK;
	1966	break;
	1967	case '+':
	1968	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1969	token->type = OP_DUP_PLUS;
	1970	break;
	1971	case '?':
	1972	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1973	token->type = OP_DUP_QUESTION;
	1974	break;
	1975	case '{':
	1976	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1977	token->type = OP_OPEN_DUP_NUM;
	1978	break;
	1979	case '}':
	1980	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1981	token->type = OP_CLOSE_DUP_NUM;
	1982	break;
	1983	case '(':
	1984	if (syntax & RE_NO_BK_PARENS)
	1985	token->type = OP_OPEN_SUBEXP;
	1986	break;
	1987	case ')':
	1988	if (syntax & RE_NO_BK_PARENS)
	1989	token->type = OP_CLOSE_SUBEXP;
	1990	break;
	1991	case '[':
	1992	token->type = OP_OPEN_BRACKET;
	1993	break;
	1994	case '.':
	1995	token->type = OP_PERIOD;
	1996	break;
	1997	case '^':
	1998	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
	1999	&& re_string_cur_idx (input) != 0)
	2000	{
	2001	char prev = re_string_peek_byte (input, -1);
	2002	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
	2003	break;
	2004	}
	2005	token->type = ANCHOR;
	2006	token->opr.ctx_type = LINE_FIRST;
	2007	break;
	2008	case '$':
	2009	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
	2010	&& re_string_cur_idx (input) + 1 != re_string_length (input))
	2011	{
	2012	re_token_t next;
	2013	re_string_skip_bytes (input, 1);
	2014	peek_token (&next, input, syntax);
	2015	re_string_skip_bytes (input, -1);
	2016	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
	2017	break;
	2018	}
	2019	token->type = ANCHOR;
	2020	token->opr.ctx_type = LINE_LAST;
	2021	break;
	2022	default:
	2023	break;
	2024	}
	2025	return 1;
	2026	}
	2027
	2028	/* Peek a token from INPUT, and return the length of the token.
	2029	We must not use this function out of bracket expressions. */
	2030
	2031	static int
	2032	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
	2033	{
	2034	unsigned char c;
	2035	if (re_string_eoi (input))
	2036	{
	2037	token->type = END_OF_RE;
	2038	return 0;
	2039	}
	2040	c = re_string_peek_byte (input, 0);
	2041	token->opr.c = c;
	2042
	2043	#ifdef RE_ENABLE_I18N
	2044	if (input->mb_cur_max > 1
	2045	&& !re_string_first_byte (input, re_string_cur_idx (input)))
	2046	{
	2047	token->type = CHARACTER;
	2048	return 1;
	2049	}
	2050	#endif /* RE_ENABLE_I18N */
	2051
	2052	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
	2053	&& re_string_cur_idx (input) + 1 < re_string_length (input))
	2054	{
	2055	/* In this case, '\' escape a character. */
	2056	unsigned char c2;
	2057	re_string_skip_bytes (input, 1);
	2058	c2 = re_string_peek_byte (input, 0);
	2059	token->opr.c = c2;
	2060	token->type = CHARACTER;
	2061	return 1;
	2062	}
	2063	if (c == '[') /* '[' is a special char in a bracket exps. */
	2064	{
	2065	unsigned char c2;
	2066	int token_len;
	2067	if (re_string_cur_idx (input) + 1 < re_string_length (input))
	2068	c2 = re_string_peek_byte (input, 1);
	2069	else
	2070	c2 = 0;
	2071	token->opr.c = c2;
	2072	token_len = 2;
	2073	switch (c2)
	2074	{
	2075	case '.':
	2076	token->type = OP_OPEN_COLL_ELEM;
	2077	break;
	2078
	2079	case '=':
	2080	token->type = OP_OPEN_EQUIV_CLASS;
	2081	break;
	2082
	2083	case ':':
	2084	if (syntax & RE_CHAR_CLASSES)
	2085	{
	2086	token->type = OP_OPEN_CHAR_CLASS;
	2087	break;
	2088	}
	2089	FALLTHROUGH;
	2090	default:
	2091	token->type = CHARACTER;
	2092	token->opr.c = c;
	2093	token_len = 1;
	2094	break;
	2095	}
	2096	return token_len;
	2097	}
	2098	switch (c)
	2099	{
	2100	case '-':
	2101	token->type = OP_CHARSET_RANGE;
	2102	break;
	2103	case ']':
	2104	token->type = OP_CLOSE_BRACKET;
	2105	break;
	2106	case '^':
	2107	token->type = OP_NON_MATCH_LIST;
	2108	break;
	2109	default:
	2110	token->type = CHARACTER;
	2111	}
	2112	return 1;
	2113	}
	2114
	2115
	2116	/* Functions for parser. */
	2117
	2118	/* Entry point of the parser.
	2119	Parse the regular expression REGEXP and return the structure tree.
	2120	If an error occurs, ERR is set by error code, and return NULL.
	2121	This function build the following tree, from regular expression <reg_exp>:
	2122	CAT
	2123	/ \
	2124	/ \
	2125	<reg_exp> EOR
	2126
	2127	CAT means concatenation.
	2128	EOR means end of regular expression. */
	2129
	2130	static bin_tree_t *
	2131	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
	2132	reg_errcode_t *err)
	2133	{
	2134	re_dfa_t *dfa = preg->buffer;
	2135	bin_tree_t tree, eor, *root;
	2136	re_token_t current_token;
	2137	dfa->syntax = syntax;
	2138	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2139	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
	2140	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2141	return NULL;
	2142	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
	2143	if (tree != NULL)
	2144	root = create_tree (dfa, tree, eor, CONCAT);
	2145	else
	2146	root = eor;
	2147	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
	2148	{
	2149	*err = REG_ESPACE;
	2150	return NULL;
	2151	}
	2152	return root;
	2153	}
	2154
	2155	/* This function build the following tree, from regular expression
	2156	<branch1>\|<branch2>:
	2157	ALT
	2158	/ \
	2159	/ \
	2160	<branch1> <branch2>
	2161
	2162	ALT means alternative, which represents the operator '\|'. */
	2163
	2164	static bin_tree_t *
	2165	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
	2166	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2167	{
	2168	re_dfa_t *dfa = preg->buffer;
	2169	bin_tree_t tree, branch = NULL;
	2170	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
	2171	tree = parse_branch (regexp, preg, token, syntax, nest, err);
	2172	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2173	return NULL;
	2174
	2175	while (token->type == OP_ALT)
	2176	{
	2177	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2178	if (token->type != OP_ALT && token->type != END_OF_RE
	2179	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2180	{
	2181	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
	2182	dfa->completed_bkref_map = initial_bkref_map;
	2183	branch = parse_branch (regexp, preg, token, syntax, nest, err);
	2184	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
	2185	{
	2186	if (tree != NULL)
	2187	postorder (tree, free_tree, NULL);
	2188	return NULL;
	2189	}
	2190	dfa->completed_bkref_map \|= accumulated_bkref_map;
	2191	}
	2192	else
	2193	branch = NULL;
	2194	tree = create_tree (dfa, tree, branch, OP_ALT);
	2195	if (__glibc_unlikely (tree == NULL))
	2196	{
	2197	*err = REG_ESPACE;
	2198	return NULL;
	2199	}
	2200	}
	2201	return tree;
	2202	}
	2203
	2204	/* This function build the following tree, from regular expression
	2205	<exp1><exp2>:
	2206	CAT
	2207	/ \
	2208	/ \
	2209	<exp1> <exp2>
	2210
	2211	CAT means concatenation. */
	2212
	2213	static bin_tree_t *
	2214	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
	2215	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2216	{
	2217	bin_tree_t tree, expr;
	2218	re_dfa_t *dfa = preg->buffer;
	2219	tree = parse_expression (regexp, preg, token, syntax, nest, err);
	2220	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2221	return NULL;
	2222
	2223	while (token->type != OP_ALT && token->type != END_OF_RE
	2224	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2225	{
	2226	expr = parse_expression (regexp, preg, token, syntax, nest, err);
	2227	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
	2228	{
	2229	if (tree != NULL)
	2230	postorder (tree, free_tree, NULL);
	2231	return NULL;
	2232	}
	2233	if (tree != NULL && expr != NULL)
	2234	{
	2235	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
	2236	if (newtree == NULL)
	2237	{
	2238	postorder (expr, free_tree, NULL);
	2239	postorder (tree, free_tree, NULL);
	2240	*err = REG_ESPACE;
	2241	return NULL;
	2242	}
	2243	tree = newtree;
	2244	}
	2245	else if (tree == NULL)
	2246	tree = expr;
	2247	/* Otherwise expr == NULL, we don't need to create new tree. */
	2248	}
	2249	return tree;
	2250	}
	2251
	2252	/* This function build the following tree, from regular expression a*:
	2253	*
	2254	\|
	2255	a
	2256	*/
	2257
	2258	static bin_tree_t *
	2259	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
	2260	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2261	{
	2262	re_dfa_t *dfa = preg->buffer;
	2263	bin_tree_t *tree;
	2264	switch (token->type)
	2265	{
	2266	case CHARACTER:
	2267	tree = create_token_tree (dfa, NULL, NULL, token);
	2268	if (__glibc_unlikely (tree == NULL))
	2269	{
	2270	*err = REG_ESPACE;
	2271	return NULL;
	2272	}
	2273	#ifdef RE_ENABLE_I18N
	2274	if (dfa->mb_cur_max > 1)
	2275	{
	2276	while (!re_string_eoi (regexp)
	2277	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
	2278	{
	2279	bin_tree_t *mbc_remain;
	2280	fetch_token (token, regexp, syntax);
	2281	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
	2282	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
	2283	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
	2284	{
	2285	*err = REG_ESPACE;
	2286	return NULL;
	2287	}
	2288	}
	2289	}
	2290	#endif
	2291	break;
	2292
	2293	case OP_OPEN_SUBEXP:
	2294	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
	2295	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2296	return NULL;
	2297	break;
	2298
	2299	case OP_OPEN_BRACKET:
	2300	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
	2301	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2302	return NULL;
	2303	break;
	2304
	2305	case OP_BACK_REF:
	2306	if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
	2307	{
	2308	*err = REG_ESUBREG;
	2309	return NULL;
	2310	}
	2311	dfa->used_bkref_map \|= 1 << token->opr.idx;
	2312	tree = create_token_tree (dfa, NULL, NULL, token);
	2313	if (__glibc_unlikely (tree == NULL))
	2314	{
	2315	*err = REG_ESPACE;
	2316	return NULL;
	2317	}
	2318	++dfa->nbackref;
	2319	dfa->has_mb_node = 1;
	2320	break;
	2321
	2322	case OP_OPEN_DUP_NUM:
	2323	if (syntax & RE_CONTEXT_INVALID_DUP)
	2324	{
	2325	*err = REG_BADRPT;
	2326	return NULL;
	2327	}
	2328	FALLTHROUGH;
	2329	case OP_DUP_ASTERISK:
	2330	case OP_DUP_PLUS:
	2331	case OP_DUP_QUESTION:
	2332	if (syntax & RE_CONTEXT_INVALID_OPS)
	2333	{
	2334	*err = REG_BADRPT;
	2335	return NULL;
	2336	}
	2337	else if (syntax & RE_CONTEXT_INDEP_OPS)
	2338	{
	2339	fetch_token (token, regexp, syntax);
	2340	return parse_expression (regexp, preg, token, syntax, nest, err);
	2341	}
	2342	FALLTHROUGH;
	2343	case OP_CLOSE_SUBEXP:
	2344	if ((token->type == OP_CLOSE_SUBEXP)
	2345	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
	2346	{
	2347	*err = REG_ERPAREN;
	2348	return NULL;
	2349	}
	2350	FALLTHROUGH;
	2351	case OP_CLOSE_DUP_NUM:
	2352	/* We treat it as a normal character. */
	2353
	2354	/* Then we can these characters as normal characters. */
	2355	token->type = CHARACTER;
	2356	/* mb_partial and word_char bits should be initialized already
	2357	by peek_token. */
	2358	tree = create_token_tree (dfa, NULL, NULL, token);
	2359	if (__glibc_unlikely (tree == NULL))
	2360	{
	2361	*err = REG_ESPACE;
	2362	return NULL;
	2363	}
	2364	break;
	2365
	2366	case ANCHOR:
	2367	if ((token->opr.ctx_type
	2368	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
	2369	&& dfa->word_ops_used == 0)
	2370	init_word_char (dfa);
	2371	if (token->opr.ctx_type == WORD_DELIM
	2372	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
	2373	{
	2374	bin_tree_t tree_first, tree_last;
	2375	if (token->opr.ctx_type == WORD_DELIM)
	2376	{
	2377	token->opr.ctx_type = WORD_FIRST;
	2378	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2379	token->opr.ctx_type = WORD_LAST;
	2380	}
	2381	else
	2382	{
	2383	token->opr.ctx_type = INSIDE_WORD;
	2384	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2385	token->opr.ctx_type = INSIDE_NOTWORD;
	2386	}
	2387	tree_last = create_token_tree (dfa, NULL, NULL, token);
	2388	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
	2389	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
	2390	\|\| tree == NULL))
	2391	{
	2392	*err = REG_ESPACE;
	2393	return NULL;
	2394	}
	2395	}
	2396	else
	2397	{
	2398	tree = create_token_tree (dfa, NULL, NULL, token);
	2399	if (__glibc_unlikely (tree == NULL))
	2400	{
	2401	*err = REG_ESPACE;
	2402	return NULL;
	2403	}
	2404	}
	2405	/* We must return here, since ANCHORs can't be followed
	2406	by repetition operators.
	2407	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
	2408	it must not be "<ANCHOR(^)><REPEAT()>". /
	2409	fetch_token (token, regexp, syntax);
	2410	return tree;
	2411
	2412	case OP_PERIOD:
	2413	tree = create_token_tree (dfa, NULL, NULL, token);
	2414	if (__glibc_unlikely (tree == NULL))
	2415	{
	2416	*err = REG_ESPACE;
	2417	return NULL;
	2418	}
	2419	if (dfa->mb_cur_max > 1)
	2420	dfa->has_mb_node = 1;
	2421	break;
	2422
	2423	case OP_WORD:
	2424	case OP_NOTWORD:
	2425	tree = build_charclass_op (dfa, regexp->trans,
	2426	"alnum",
	2427	"_",
	2428	token->type == OP_NOTWORD, err);
	2429	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2430	return NULL;
	2431	break;
	2432
	2433	case OP_SPACE:
	2434	case OP_NOTSPACE:
	2435	tree = build_charclass_op (dfa, regexp->trans,
	2436	"space",
	2437	"",
	2438	token->type == OP_NOTSPACE, err);
	2439	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2440	return NULL;
	2441	break;
	2442
	2443	case OP_ALT:
	2444	case END_OF_RE:
	2445	return NULL;
	2446
	2447	case BACK_SLASH:
	2448	*err = REG_EESCAPE;
	2449	return NULL;
	2450
	2451	default:
	2452	/* Must not happen? */
	2453	DEBUG_ASSERT (false);
	2454	return NULL;
	2455	}
	2456	fetch_token (token, regexp, syntax);
	2457
	2458	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
	2459	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
	2460	{
	2461	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
	2462	syntax, err);
	2463	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
	2464	{
	2465	if (tree != NULL)
	2466	postorder (tree, free_tree, NULL);
	2467	return NULL;
	2468	}
	2469	tree = dup_tree;
	2470	/* In BRE consecutive duplications are not allowed. */
	2471	if ((syntax & RE_CONTEXT_INVALID_DUP)
	2472	&& (token->type == OP_DUP_ASTERISK
	2473	\|\| token->type == OP_OPEN_DUP_NUM))
	2474	{
	2475	if (tree != NULL)
	2476	postorder (tree, free_tree, NULL);
	2477	*err = REG_BADRPT;
	2478	return NULL;
	2479	}
	2480	}
	2481
	2482	return tree;
	2483	}
	2484
	2485	/* This function build the following tree, from regular expression
	2486	(<reg_exp>):
	2487	SUBEXP
	2488	\|
	2489	<reg_exp>
	2490	*/
	2491
	2492	static bin_tree_t *
	2493	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
	2494	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2495	{
	2496	re_dfa_t *dfa = preg->buffer;
	2497	bin_tree_t *tree;
	2498	size_t cur_nsub;
	2499	cur_nsub = preg->re_nsub++;
	2500
	2501	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2502
	2503	/* The subexpression may be a null string. */
	2504	if (token->type == OP_CLOSE_SUBEXP)
	2505	tree = NULL;
	2506	else
	2507	{
	2508	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
	2509	if (__glibc_unlikely (*err == REG_NOERROR
	2510	&& token->type != OP_CLOSE_SUBEXP))
	2511	{
	2512	if (tree != NULL)
	2513	postorder (tree, free_tree, NULL);
	2514	*err = REG_EPAREN;
	2515	}
	2516	if (__glibc_unlikely (*err != REG_NOERROR))
	2517	return NULL;
	2518	}
	2519
	2520	if (cur_nsub <= '9' - '1')
	2521	dfa->completed_bkref_map \|= 1 << cur_nsub;
	2522
	2523	tree = create_tree (dfa, tree, NULL, SUBEXP);
	2524	if (__glibc_unlikely (tree == NULL))
	2525	{
	2526	*err = REG_ESPACE;
	2527	return NULL;
	2528	}
	2529	tree->token.opr.idx = cur_nsub;
	2530	return tree;
	2531	}
	2532
	2533	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
	2534
	2535	static bin_tree_t *
	2536	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
	2537	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
	2538	{
	2539	bin_tree_t tree = NULL, old_tree = NULL;
	2540	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
	2541	re_token_t start_token = *token;
	2542
	2543	if (token->type == OP_OPEN_DUP_NUM)
	2544	{
	2545	end = 0;
	2546	start = fetch_number (regexp, token, syntax);
	2547	if (start == -1)
	2548	{
	2549	if (token->type == CHARACTER && token->opr.c == ',')
	2550	start = 0; /* We treat "{,m}" as "{0,m}". */
	2551	else
	2552	{
	2553	err = REG_BADBR; / <re>{} is invalid. */
	2554	return NULL;
	2555	}
	2556	}
	2557	if (__glibc_likely (start != -2))
	2558	{
	2559	/* We treat "{n}" as "{n,n}". */
	2560	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
	2561	: ((token->type == CHARACTER && token->opr.c == ',')
	2562	? fetch_number (regexp, token, syntax) : -2));
	2563	}
	2564	if (__glibc_unlikely (start == -2 \|\| end == -2))
	2565	{
	2566	/* Invalid sequence. */
	2567	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
	2568	{
	2569	if (token->type == END_OF_RE)
	2570	*err = REG_EBRACE;
	2571	else
	2572	*err = REG_BADBR;
	2573
	2574	return NULL;
	2575	}
	2576
	2577	/* If the syntax bit is set, rollback. */
	2578	re_string_set_index (regexp, start_idx);
	2579	*token = start_token;
	2580	token->type = CHARACTER;
	2581	/* mb_partial and word_char bits should be already initialized by
	2582	peek_token. */
	2583	return elem;
	2584	}
	2585
	2586	if (__glibc_unlikely ((end != -1 && start > end)
	2587	\|\| token->type != OP_CLOSE_DUP_NUM))
	2588	{
	2589	/* First number greater than second. */
	2590	*err = REG_BADBR;
	2591	return NULL;
	2592	}
	2593
	2594	if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
	2595	{
	2596	*err = REG_ESIZE;
	2597	return NULL;
	2598	}
	2599	}
	2600	else
	2601	{
	2602	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
	2603	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
	2604	}
	2605
	2606	fetch_token (token, regexp, syntax);
	2607
	2608	if (__glibc_unlikely (elem == NULL))
	2609	return NULL;
	2610	if (__glibc_unlikely (start == 0 && end == 0))
	2611	{
	2612	postorder (elem, free_tree, NULL);
	2613	return NULL;
	2614	}
	2615
	2616	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
	2617	if (__glibc_unlikely (start > 0))
	2618	{
	2619	tree = elem;
	2620	for (i = 2; i <= start; ++i)
	2621	{
	2622	elem = duplicate_tree (elem, dfa);
	2623	tree = create_tree (dfa, tree, elem, CONCAT);
	2624	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
	2625	goto parse_dup_op_espace;
	2626	}
	2627
	2628	if (start == end)
	2629	return tree;
	2630
	2631	/* Duplicate ELEM before it is marked optional. */
	2632	elem = duplicate_tree (elem, dfa);
	2633	if (__glibc_unlikely (elem == NULL))
	2634	goto parse_dup_op_espace;
	2635	old_tree = tree;
	2636	}
	2637	else
	2638	old_tree = NULL;
	2639
	2640	if (elem->token.type == SUBEXP)
	2641	{
	2642	uintptr_t subidx = elem->token.opr.idx;
	2643	postorder (elem, mark_opt_subexp, (void *) subidx);
	2644	}
	2645
	2646	tree = create_tree (dfa, elem, NULL,
	2647	(end == -1 ? OP_DUP_ASTERISK : OP_ALT));
	2648	if (__glibc_unlikely (tree == NULL))
	2649	goto parse_dup_op_espace;
	2650
	2651	/* This loop is actually executed only when end != -1,
	2652	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
	2653	already created the start+1-th copy. */
	2654	if (TYPE_SIGNED (Idx) \|\| end != -1)
	2655	for (i = start + 2; i <= end; ++i)
	2656	{
	2657	elem = duplicate_tree (elem, dfa);
	2658	tree = create_tree (dfa, tree, elem, CONCAT);
	2659	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
	2660	goto parse_dup_op_espace;
	2661
	2662	tree = create_tree (dfa, tree, NULL, OP_ALT);
	2663	if (__glibc_unlikely (tree == NULL))
	2664	goto parse_dup_op_espace;
	2665	}
	2666
	2667	if (old_tree)
	2668	tree = create_tree (dfa, old_tree, tree, CONCAT);
	2669
	2670	return tree;
	2671
	2672	parse_dup_op_espace:
	2673	*err = REG_ESPACE;
	2674	return NULL;
	2675	}
	2676
	2677	/* Size of the buffer for the names of collating symbols, equivalence
	2678	classes and character classes. The value is not derived from any
		standard limit; it is believed to be large enough. */
	2679	#define BRACKET_NAME_BUF_SIZE 32
	2680
	2681	#ifndef _LIBC
	2682
	2683	# ifdef RE_ENABLE_I18N
	2684	/* Convert the byte B to the corresponding wide character. In a
	2685	unibyte locale, treat B as itself. In a multibyte locale, return
	2686	WEOF if B is an encoding error. */
	2687	static wint_t
	2688	parse_byte (unsigned char b, re_charset_t *mbcset)
	2689	{
	2690	return mbcset == NULL ? b : __btowc (b);
	2691	}
	2692	# endif
	2693
	2694	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
	2695	Build the range expression which starts from START_ELEM, and ends
	2696	at END_ELEM. The result are written to MBCSET and SBCSET.
	2697	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2698	mbcset->range_ends, is a pointer argument since we may
	2699	update it. */
	2700
	2701	static reg_errcode_t
	2702	# ifdef RE_ENABLE_I18N
	2703	build_range_exp (const reg_syntax_t syntax,
	2704	bitset_t sbcset,
	2705	re_charset_t *mbcset,
	2706	Idx *range_alloc,
	2707	const bracket_elem_t *start_elem,
	2708	const bracket_elem_t *end_elem)
	2709	# else /* not RE_ENABLE_I18N */
	2710	build_range_exp (const reg_syntax_t syntax,
	2711	bitset_t sbcset,
	2712	const bracket_elem_t *start_elem,
	2713	const bracket_elem_t *end_elem)
	2714	# endif /* not RE_ENABLE_I18N */
	2715	{
	2716	unsigned int start_ch, end_ch;
	2717	/* Equivalence Classes and Character Classes can't be a range start/end. */
	2718	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
	2719	\|\| start_elem->type == CHAR_CLASS
	2720	\|\| end_elem->type == EQUIV_CLASS
	2721	\|\| end_elem->type == CHAR_CLASS))
	2722	return REG_ERANGE;
	2723
	2724	/* We can handle no multi character collating elements without libc
	2725	support. */
	2726	if (__glibc_unlikely ((start_elem->type == COLL_SYM
	2727	&& strlen ((char *) start_elem->opr.name) > 1)
	2728	\|\| (end_elem->type == COLL_SYM
	2729	&& strlen ((char *) end_elem->opr.name) > 1)))
	2730	return REG_ECOLLATE;
	2731
	2732	# ifdef RE_ENABLE_I18N
	2733	{
	2734	wchar_t wc;
	2735	wint_t start_wc;
	2736	wint_t end_wc;
	2737
	2738	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
	2739	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
	2740	: 0));
	2741	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
	2742	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2743	: 0));
	2744	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
	2745	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
	2746	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
	2747	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
	2748	if (start_wc == WEOF \|\| end_wc == WEOF)
	2749	return REG_ECOLLATE;
	2750	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
	2751	&& start_wc > end_wc))
	2752	return REG_ERANGE;
	2753
	2754	/* Got valid collation sequence values, add them as a new entry.
	2755	However, for !_LIBC we have no collation elements: if the
	2756	character set is single byte, the single byte character set
	2757	that we build below suffices. parse_bracket_exp passes
	2758	no MBCSET if dfa->mb_cur_max == 1. */
	2759	if (mbcset)
	2760	{
	2761	/* Check the space of the arrays. */
	2762	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
	2763	{
	2764	/* There is not enough space, need realloc. */
	2765	wchar_t new_array_start, new_array_end;
	2766	Idx new_nranges;
	2767
	2768	/* +1 in case of mbcset->nranges is 0. */
	2769	new_nranges = 2 * mbcset->nranges + 1;
	2770	/* Use realloc since mbcset->range_starts and mbcset->range_ends
	2771	are NULL if range_alloc == 0. /
	2772	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
	2773	new_nranges);
	2774	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
	2775	new_nranges);
	2776
	2777	if (__glibc_unlikely (new_array_start == NULL
	2778	\|\| new_array_end == NULL))
	2779	{
	2780	re_free (new_array_start);
	2781	re_free (new_array_end);
	2782	return REG_ESPACE;
	2783	}
	2784
	2785	mbcset->range_starts = new_array_start;
	2786	mbcset->range_ends = new_array_end;
	2787	*range_alloc = new_nranges;
	2788	}
	2789
	2790	mbcset->range_starts[mbcset->nranges] = start_wc;
	2791	mbcset->range_ends[mbcset->nranges++] = end_wc;
	2792	}
	2793
	2794	/* Build the table for single byte characters. */
	2795	for (wc = 0; wc < SBC_MAX; ++wc)
	2796	{
	2797	if (start_wc <= wc && wc <= end_wc)
	2798	bitset_set (sbcset, wc);
	2799	}
	2800	}
	2801	# else /* not RE_ENABLE_I18N */
	2802	{
	2803	unsigned int ch;
	2804	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
	2805	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
	2806	: 0));
	2807	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
	2808	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2809	: 0));
	2810	if (start_ch > end_ch)
	2811	return REG_ERANGE;
	2812	/* Build the table for single byte characters. */
	2813	for (ch = 0; ch < SBC_MAX; ++ch)
	2814	if (start_ch <= ch && ch <= end_ch)
	2815	bitset_set (sbcset, ch);
	2816	}
	2817	# endif /* not RE_ENABLE_I18N */
	2818	return REG_NOERROR;
	2819	}
	2820	#endif /* not _LIBC */
	2821
	2822	#ifndef _LIBC
	2823	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
	2824	Build the collating element which is represented by NAME.
	2825	The result are written to MBCSET and SBCSET.
	2826	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	2827	pointer argument since we may update it. */
	2828
	2829	static reg_errcode_t
	2830	# ifdef RE_ENABLE_I18N
	2831	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
	2832	Idx coll_sym_alloc, const unsigned char name)
	2833	# else /* not RE_ENABLE_I18N */
	2834	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
	2835	# endif /* not RE_ENABLE_I18N */
	2836	{
	2837	size_t name_len = strlen ((const char *) name);
	2838	if (__glibc_unlikely (name_len != 1))
	2839	return REG_ECOLLATE;
	2840	else
	2841	{
	2842	bitset_set (sbcset, name[0]);
	2843	return REG_NOERROR;
	2844	}
	2845	}
	2846	#endif /* not _LIBC */
	2847
	2848	/* This function parse bracket expression like "[abc]", "[a-c]",
	2849	"[[.a-a.]]" etc. */
	2850
	2851	static bin_tree_t *
	2852	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
	2853	reg_syntax_t syntax, reg_errcode_t *err)
	2854	{
	2855	#ifdef _LIBC
	2856	const unsigned char *collseqmb;
	2857	const char *collseqwc;
	2858	uint32_t nrules;
	2859	int32_t table_size;
	2860	const int32_t *symb_table;
	2861	const unsigned char *extra;
	2862
	2863	/* Local function for parse_bracket_exp used in _LIBC environment.
	2864	Seek the collating symbol entry corresponding to NAME.
	2865	Return the index of the symbol in the SYMB_TABLE,
	2866	or -1 if not found. */
	2867
	2868	auto inline int32_t
	2869	__attribute__ ((always_inline))
	2870	seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
	2871	{
	2872	int32_t elem;
	2873
	2874	for (elem = 0; elem < table_size; elem++)
	2875	if (symb_table[2 * elem] != 0)
	2876	{
	2877	int32_t idx = symb_table[2 * elem + 1];
	2878	/* Skip the name of collating element name. */
	2879	idx += 1 + extra[idx];
	2880	if (/* Compare the length of the name. */
	2881	name_len == extra[idx]
	2882	/* Compare the name. */
	2883	&& memcmp (name, &extra[idx + 1], name_len) == 0)
	2884	/* Yep, this is the entry. */
	2885	return elem;
	2886	}
	2887	return -1;
	2888	}
	2889
	2890	/* Local function for parse_bracket_exp used in _LIBC environment.
	2891	Look up the collation sequence value of BR_ELEM.
	2892	Return the value if succeeded, UINT_MAX otherwise. */
	2893
	2894	auto inline unsigned int
	2895	__attribute__ ((always_inline))
	2896	lookup_collation_sequence_value (bracket_elem_t *br_elem)
	2897	{
	2898	if (br_elem->type == SB_CHAR)
	2899	{
	2900	/*
	2901	if (MB_CUR_MAX == 1)
	2902	*/
	2903	if (nrules == 0)
	2904	return collseqmb[br_elem->opr.ch];
	2905	else
	2906	{
	2907	wint_t wc = __btowc (br_elem->opr.ch);
	2908	return __collseq_table_lookup (collseqwc, wc);
	2909	}
	2910	}
	2911	else if (br_elem->type == MB_CHAR)
	2912	{
	2913	if (nrules != 0)
	2914	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
	2915	}
	2916	else if (br_elem->type == COLL_SYM)
	2917	{
	2918	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
	2919	if (nrules != 0)
	2920	{
	2921	int32_t elem, idx;
	2922	elem = seek_collating_symbol_entry (br_elem->opr.name,
	2923	sym_name_len);
	2924	if (elem != -1)
	2925	{
	2926	/* We found the entry. */
	2927	idx = symb_table[2 * elem + 1];
	2928	/* Skip the name of collating element name. */
	2929	idx += 1 + extra[idx];
	2930	/* Skip the byte sequence of the collating element. */
	2931	idx += 1 + extra[idx];
	2932	/* Adjust for the alignment. */
	2933	idx = (idx + 3) & ~3;
	2934	/* Skip the multibyte collation sequence value. */
	2935	idx += sizeof (unsigned int);
	2936	/* Skip the wide char sequence of the collating element. */
	2937	idx += sizeof (unsigned int) *
	2938	(1 + (unsigned int ) (extra + idx));
	2939	/* Return the collation sequence value. */
	2940	return (unsigned int ) (extra + idx);
	2941	}
	2942	else if (sym_name_len == 1)
	2943	{
	2944	/* No valid character. Match it as a single byte
	2945	character. */
	2946	return collseqmb[br_elem->opr.name[0]];
	2947	}
	2948	}
	2949	else if (sym_name_len == 1)
	2950	return collseqmb[br_elem->opr.name[0]];
	2951	}
	2952	return UINT_MAX;
	2953	}
	2954
	2955	/* Local function for parse_bracket_exp used in _LIBC environment.
	2956	Build the range expression which starts from START_ELEM, and ends
	2957	at END_ELEM. The result are written to MBCSET and SBCSET.
	2958	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2959	mbcset->range_ends, is a pointer argument since we may
	2960	update it. */
	2961
	2962	auto inline reg_errcode_t
	2963	__attribute__ ((always_inline))
	2964	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int range_alloc,
	2965	bracket_elem_t start_elem, bracket_elem_t end_elem)
	2966	{
	2967	unsigned int ch;
	2968	uint32_t start_collseq;
	2969	uint32_t end_collseq;
	2970
	2971	/* Equivalence Classes and Character Classes can't be a range
	2972	start/end. */
	2973	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
	2974	\|\| start_elem->type == CHAR_CLASS
	2975	\|\| end_elem->type == EQUIV_CLASS
	2976	\|\| end_elem->type == CHAR_CLASS))
	2977	return REG_ERANGE;
	2978
	2979	/* FIXME: Implement rational ranges here, too. */
	2980	start_collseq = lookup_collation_sequence_value (start_elem);
	2981	end_collseq = lookup_collation_sequence_value (end_elem);
	2982	/* Check start/end collation sequence values. */
	2983	if (__glibc_unlikely (start_collseq == UINT_MAX
	2984	\|\| end_collseq == UINT_MAX))
	2985	return REG_ECOLLATE;
	2986	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
	2987	&& start_collseq > end_collseq))
	2988	return REG_ERANGE;
	2989
	2990	/* Got valid collation sequence values, add them as a new entry.
	2991	However, if we have no collation elements, and the character set
	2992	is single byte, the single byte character set that we
	2993	build below suffices. */
	2994	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
	2995	{
	2996	/* Check the space of the arrays. */
	2997	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
	2998	{
	2999	/* There is not enough space, need realloc. */
	3000	uint32_t *new_array_start;
	3001	uint32_t *new_array_end;
	3002	Idx new_nranges;
	3003
	3004	/* +1 in case of mbcset->nranges is 0. */
	3005	new_nranges = 2 * mbcset->nranges + 1;
	3006	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
	3007	new_nranges);
	3008	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
	3009	new_nranges);
	3010
	3011	if (__glibc_unlikely (new_array_start == NULL
	3012	\|\| new_array_end == NULL))
	3013	return REG_ESPACE;
	3014
	3015	mbcset->range_starts = new_array_start;
	3016	mbcset->range_ends = new_array_end;
	3017	*range_alloc = new_nranges;
	3018	}
	3019
	3020	mbcset->range_starts[mbcset->nranges] = start_collseq;
	3021	mbcset->range_ends[mbcset->nranges++] = end_collseq;
	3022	}
	3023
	3024	/* Build the table for single byte characters. */
	3025	for (ch = 0; ch < SBC_MAX; ch++)
	3026	{
	3027	uint32_t ch_collseq;
	3028	/*
	3029	if (MB_CUR_MAX == 1)
	3030	*/
	3031	if (nrules == 0)
	3032	ch_collseq = collseqmb[ch];
	3033	else
	3034	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
	3035	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
	3036	bitset_set (sbcset, ch);
	3037	}
	3038	return REG_NOERROR;
	3039	}
	3040
	3041	/* Local function for parse_bracket_exp used in _LIBC environment.
	3042	Build the collating element which is represented by NAME.
	3043	The result are written to MBCSET and SBCSET.
	3044	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	3045	pointer argument since we may update it. */
	3046
	3047	auto inline reg_errcode_t
	3048	__attribute__ ((always_inline))
	3049	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
	3050	Idx coll_sym_alloc, const unsigned char name)
	3051	{
	3052	int32_t elem, idx;
	3053	size_t name_len = strlen ((const char *) name);
	3054	if (nrules != 0)
	3055	{
	3056	elem = seek_collating_symbol_entry (name, name_len);
	3057	if (elem != -1)
	3058	{
	3059	/* We found the entry. */
	3060	idx = symb_table[2 * elem + 1];
	3061	/* Skip the name of collating element name. */
	3062	idx += 1 + extra[idx];
	3063	}
	3064	else if (name_len == 1)
	3065	{
	3066	/* No valid character, treat it as a normal
	3067	character. */
	3068	bitset_set (sbcset, name[0]);
	3069	return REG_NOERROR;
	3070	}
	3071	else
	3072	return REG_ECOLLATE;
	3073
	3074	/* Got valid collation sequence, add it as a new entry. */
	3075	/* Check the space of the arrays. */
	3076	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
	3077	{
	3078	/* Not enough, realloc it. */
	3079	/* +1 in case of mbcset->ncoll_syms is 0. */
	3080	Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
	3081	/* Use realloc since mbcset->coll_syms is NULL
	3082	if alloc == 0. /
	3083	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
	3084	new_coll_sym_alloc);
	3085	if (__glibc_unlikely (new_coll_syms == NULL))
	3086	return REG_ESPACE;
	3087	mbcset->coll_syms = new_coll_syms;
	3088	*coll_sym_alloc = new_coll_sym_alloc;
	3089	}
	3090	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
	3091	return REG_NOERROR;
	3092	}
	3093	else
	3094	{
	3095	if (__glibc_unlikely (name_len != 1))
	3096	return REG_ECOLLATE;
	3097	else
	3098	{
	3099	bitset_set (sbcset, name[0]);
	3100	return REG_NOERROR;
	3101	}
	3102	}
	3103	}
	3104	#endif
	3105
	3106	re_token_t br_token;
	3107	re_bitset_ptr_t sbcset;
	3108	#ifdef RE_ENABLE_I18N
	3109	re_charset_t *mbcset;
	3110	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
	3111	Idx equiv_class_alloc = 0, char_class_alloc = 0;
	3112	#endif /* not RE_ENABLE_I18N */
	3113	bool non_match = false;
	3114	bin_tree_t *work_tree;
	3115	int token_len;
	3116	bool first_round = true;
	3117	#ifdef _LIBC
	3118	collseqmb = (const unsigned char *)
	3119	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
	3120	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3121	if (nrules)
	3122	{
	3123	/*
	3124	if (MB_CUR_MAX > 1)
	3125	*/
	3126	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
	3127	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
	3128	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3129	_NL_COLLATE_SYMB_TABLEMB);
	3130	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3131	_NL_COLLATE_SYMB_EXTRAMB);
	3132	}
	3133	#endif
	3134	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	3135	#ifdef RE_ENABLE_I18N
	3136	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
	3137	#endif /* RE_ENABLE_I18N */
	3138	#ifdef RE_ENABLE_I18N
	3139	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
	3140	#else
	3141	if (__glibc_unlikely (sbcset == NULL))
	3142	#endif /* RE_ENABLE_I18N */
	3143	{
	3144	re_free (sbcset);
	3145	#ifdef RE_ENABLE_I18N
	3146	re_free (mbcset);
	3147	#endif
	3148	*err = REG_ESPACE;
	3149	return NULL;
	3150	}
	3151
	3152	token_len = peek_token_bracket (token, regexp, syntax);
	3153	if (__glibc_unlikely (token->type == END_OF_RE))
	3154	{
	3155	*err = REG_BADPAT;
	3156	goto parse_bracket_exp_free_return;
	3157	}
	3158	if (token->type == OP_NON_MATCH_LIST)
	3159	{
	3160	#ifdef RE_ENABLE_I18N
	3161	mbcset->non_match = 1;
	3162	#endif /* not RE_ENABLE_I18N */
	3163	non_match = true;
	3164	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
	3165	bitset_set (sbcset, '\n');
	3166	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3167	token_len = peek_token_bracket (token, regexp, syntax);
	3168	if (__glibc_unlikely (token->type == END_OF_RE))
	3169	{
	3170	*err = REG_BADPAT;
	3171	goto parse_bracket_exp_free_return;
	3172	}
	3173	}
	3174
	3175	/* We treat the first ']' as a normal character. */
	3176	if (token->type == OP_CLOSE_BRACKET)
	3177	token->type = CHARACTER;
	3178
	3179	while (1)
	3180	{
	3181	bracket_elem_t start_elem, end_elem;
	3182	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
	3183	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
	3184	reg_errcode_t ret;
	3185	int token_len2 = 0;
	3186	bool is_range_exp = false;
	3187	re_token_t token2;
	3188
	3189	start_elem.opr.name = start_name_buf;
	3190	start_elem.type = COLL_SYM;
	3191	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
	3192	syntax, first_round);
	3193	if (__glibc_unlikely (ret != REG_NOERROR))
	3194	{
	3195	*err = ret;
	3196	goto parse_bracket_exp_free_return;
	3197	}
	3198	first_round = false;
	3199
	3200	/* Get information about the next token. We need it in any case. */
	3201	token_len = peek_token_bracket (token, regexp, syntax);
	3202
	3203	/* Do not check for ranges if we know they are not allowed. */
	3204	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
	3205	{
	3206	if (__glibc_unlikely (token->type == END_OF_RE))
	3207	{
	3208	*err = REG_EBRACK;
	3209	goto parse_bracket_exp_free_return;
	3210	}
	3211	if (token->type == OP_CHARSET_RANGE)
	3212	{
	3213	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
	3214	token_len2 = peek_token_bracket (&token2, regexp, syntax);
	3215	if (__glibc_unlikely (token2.type == END_OF_RE))
	3216	{
	3217	*err = REG_EBRACK;
	3218	goto parse_bracket_exp_free_return;
	3219	}
	3220	if (token2.type == OP_CLOSE_BRACKET)
	3221	{
	3222	/* We treat the last '-' as a normal character. */
	3223	re_string_skip_bytes (regexp, -token_len);
	3224	token->type = CHARACTER;
	3225	}
	3226	else
	3227	is_range_exp = true;
	3228	}
	3229	}
	3230
	3231	if (is_range_exp == true)
	3232	{
	3233	end_elem.opr.name = end_name_buf;
	3234	end_elem.type = COLL_SYM;
	3235	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
	3236	dfa, syntax, true);
	3237	if (__glibc_unlikely (ret != REG_NOERROR))
	3238	{
	3239	*err = ret;
	3240	goto parse_bracket_exp_free_return;
	3241	}
	3242
	3243	token_len = peek_token_bracket (token, regexp, syntax);
	3244
	3245	#ifdef _LIBC
	3246	*err = build_range_exp (sbcset, mbcset, &range_alloc,
	3247	&start_elem, &end_elem);
	3248	#else
	3249	# ifdef RE_ENABLE_I18N
	3250	*err = build_range_exp (syntax, sbcset,
	3251	dfa->mb_cur_max > 1 ? mbcset : NULL,
	3252	&range_alloc, &start_elem, &end_elem);
	3253	# else
	3254	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
	3255	# endif
	3256	#endif /* RE_ENABLE_I18N */
	3257	if (__glibc_unlikely (*err != REG_NOERROR))
	3258	goto parse_bracket_exp_free_return;
	3259	}
	3260	else
	3261	{
	3262	switch (start_elem.type)
	3263	{
	3264	case SB_CHAR:
	3265	bitset_set (sbcset, start_elem.opr.ch);
	3266	break;
	3267	#ifdef RE_ENABLE_I18N
	3268	case MB_CHAR:
	3269	/* Check whether the array has enough space. */
	3270	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
	3271	{
	3272	wchar_t *new_mbchars;
	3273	/* Not enough, realloc it. */
	3274	/* +1 in case of mbcset->nmbchars is 0. */
	3275	mbchar_alloc = 2 * mbcset->nmbchars + 1;
	3276	/* Use realloc since array is NULL if alloc == 0. */
	3277	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
	3278	mbchar_alloc);
	3279	if (__glibc_unlikely (new_mbchars == NULL))
	3280	goto parse_bracket_exp_espace;
	3281	mbcset->mbchars = new_mbchars;
	3282	}
	3283	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
	3284	break;
	3285	#endif /* RE_ENABLE_I18N */
	3286	case EQUIV_CLASS:
	3287	*err = build_equiv_class (sbcset,
	3288	#ifdef RE_ENABLE_I18N
	3289	mbcset, &equiv_class_alloc,
	3290	#endif /* RE_ENABLE_I18N */
	3291	start_elem.opr.name);
	3292	if (__glibc_unlikely (*err != REG_NOERROR))
	3293	goto parse_bracket_exp_free_return;
	3294	break;
	3295	case COLL_SYM:
	3296	*err = build_collating_symbol (sbcset,
	3297	#ifdef RE_ENABLE_I18N
	3298	mbcset, &coll_sym_alloc,
	3299	#endif /* RE_ENABLE_I18N */
	3300	start_elem.opr.name);
	3301	if (__glibc_unlikely (*err != REG_NOERROR))
	3302	goto parse_bracket_exp_free_return;
	3303	break;
	3304	case CHAR_CLASS:
	3305	*err = build_charclass (regexp->trans, sbcset,
	3306	#ifdef RE_ENABLE_I18N
	3307	mbcset, &char_class_alloc,
	3308	#endif /* RE_ENABLE_I18N */
	3309	(const char *) start_elem.opr.name,
	3310	syntax);
	3311	if (__glibc_unlikely (*err != REG_NOERROR))
	3312	goto parse_bracket_exp_free_return;
	3313	break;
	3314	default:
	3315	DEBUG_ASSERT (false);
	3316	break;
	3317	}
	3318	}
	3319	if (__glibc_unlikely (token->type == END_OF_RE))
	3320	{
	3321	*err = REG_EBRACK;
	3322	goto parse_bracket_exp_free_return;
	3323	}
	3324	if (token->type == OP_CLOSE_BRACKET)
	3325	break;
	3326	}
	3327
	3328	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3329
	3330	/* If it is non-matching list. */
	3331	if (non_match)
	3332	bitset_not (sbcset);
	3333
	3334	#ifdef RE_ENABLE_I18N
	3335	/* Ensure only single byte characters are set. */
	3336	if (dfa->mb_cur_max > 1)
	3337	bitset_mask (sbcset, dfa->sb_char);
	3338
	3339	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
	3340	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
	3341	\|\| mbcset->non_match)))
	3342	{
	3343	bin_tree_t *mbc_tree;
	3344	int sbc_idx;
	3345	/* Build a tree for complex bracket. */
	3346	dfa->has_mb_node = 1;
	3347	br_token.type = COMPLEX_BRACKET;
	3348	br_token.opr.mbcset = mbcset;
	3349	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3350	if (__glibc_unlikely (mbc_tree == NULL))
	3351	goto parse_bracket_exp_espace;
	3352	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
	3353	if (sbcset[sbc_idx])
	3354	break;
	3355	/* If there are no bits set in sbcset, there is no point
	3356	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
	3357	if (sbc_idx < BITSET_WORDS)
	3358	{
	3359	/* Build a tree for simple bracket. */
	3360	br_token.type = SIMPLE_BRACKET;
	3361	br_token.opr.sbcset = sbcset;
	3362	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3363	if (__glibc_unlikely (work_tree == NULL))
	3364	goto parse_bracket_exp_espace;
	3365
	3366	/* Then join them by ALT node. */
	3367	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
	3368	if (__glibc_unlikely (work_tree == NULL))
	3369	goto parse_bracket_exp_espace;
	3370	}
	3371	else
	3372	{
	3373	re_free (sbcset);
	3374	work_tree = mbc_tree;
	3375	}
	3376	}
	3377	else
	3378	#endif /* not RE_ENABLE_I18N */
	3379	{
	3380	#ifdef RE_ENABLE_I18N
	3381	free_charset (mbcset);
	3382	#endif
	3383	/* Build a tree for simple bracket. */
	3384	br_token.type = SIMPLE_BRACKET;
	3385	br_token.opr.sbcset = sbcset;
	3386	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3387	if (__glibc_unlikely (work_tree == NULL))
	3388	goto parse_bracket_exp_espace;
	3389	}
	3390	return work_tree;
	3391
	3392	parse_bracket_exp_espace:
	3393	*err = REG_ESPACE;
	3394	parse_bracket_exp_free_return:
	3395	re_free (sbcset);
	3396	#ifdef RE_ENABLE_I18N
	3397	free_charset (mbcset);
	3398	#endif /* RE_ENABLE_I18N */
	3399	return NULL;
	3400	}
	3401
	3402	/* Parse an element in the bracket expression. */
	3403
	3404	static reg_errcode_t
	3405	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
	3406	re_token_t token, int token_len, re_dfa_t dfa,
	3407	reg_syntax_t syntax, bool accept_hyphen)
	3408	{
	3409	#ifdef RE_ENABLE_I18N
	3410	int cur_char_size;
	3411	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
	3412	if (cur_char_size > 1)
	3413	{
	3414	elem->type = MB_CHAR;
	3415	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
	3416	re_string_skip_bytes (regexp, cur_char_size);
	3417	return REG_NOERROR;
	3418	}
	3419	#endif /* RE_ENABLE_I18N */
	3420	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3421	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
	3422	\|\| token->type == OP_OPEN_EQUIV_CLASS)
	3423	return parse_bracket_symbol (elem, regexp, token);
	3424	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
	3425	{
	3426	/* A '-' must only appear as anything but a range indicator before
	3427	the closing bracket. Everything else is an error. */
	3428	re_token_t token2;
	3429	(void) peek_token_bracket (&token2, regexp, syntax);
	3430	if (token2.type != OP_CLOSE_BRACKET)
	3431	/* The actual error value is not standardized since this whole
	3432	case is undefined. But ERANGE makes good sense. */
	3433	return REG_ERANGE;
	3434	}
	3435	elem->type = SB_CHAR;
	3436	elem->opr.ch = token->opr.c;
	3437	return REG_NOERROR;
	3438	}
	3439
	3440	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
	3441	such as [:<character_class>:], [.<collating_element>.], and
	3442	[=<equivalent_class>=]. */
	3443
	3444	static reg_errcode_t
	3445	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
	3446	re_token_t *token)
	3447	{
	3448	unsigned char ch, delim = token->opr.c;
	3449	int i = 0;
	3450	if (re_string_eoi(regexp))
	3451	return REG_EBRACK;
	3452	for (;; ++i)
	3453	{
	3454	if (i >= BRACKET_NAME_BUF_SIZE)
	3455	return REG_EBRACK;
	3456	if (token->type == OP_OPEN_CHAR_CLASS)
	3457	ch = re_string_fetch_byte_case (regexp);
	3458	else
	3459	ch = re_string_fetch_byte (regexp);
	3460	if (re_string_eoi(regexp))
	3461	return REG_EBRACK;
	3462	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
	3463	break;
	3464	elem->opr.name[i] = ch;
	3465	}
	3466	re_string_skip_bytes (regexp, 1);
	3467	elem->opr.name[i] = '\0';
	3468	switch (token->type)
	3469	{
	3470	case OP_OPEN_COLL_ELEM:
	3471	elem->type = COLL_SYM;
	3472	break;
	3473	case OP_OPEN_EQUIV_CLASS:
	3474	elem->type = EQUIV_CLASS;
	3475	break;
	3476	case OP_OPEN_CHAR_CLASS:
	3477	elem->type = CHAR_CLASS;
	3478	break;
	3479	default:
	3480	break;
	3481	}
	3482	return REG_NOERROR;
	3483	}
	3484
	3485	/* Helper function for parse_bracket_exp.
	3486	Build the equivalence class which is represented by NAME.
	3487	The result are written to MBCSET and SBCSET.
	3488	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
	3489	is a pointer argument since we may update it. */
	3490
	3491	static reg_errcode_t
	3492	#ifdef RE_ENABLE_I18N
	3493	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
	3494	Idx equiv_class_alloc, const unsigned char name)
	3495	#else /* not RE_ENABLE_I18N */
	3496	build_equiv_class (bitset_t sbcset, const unsigned char *name)
	3497	#endif /* not RE_ENABLE_I18N */
	3498	{
	3499	#ifdef _LIBC
	3500	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3501	if (nrules != 0)
	3502	{
	3503	const int32_t table, indirect;
	3504	const unsigned char weights, extra, *cp;
	3505	unsigned char char_buf[2];
	3506	int32_t idx1, idx2;
	3507	unsigned int ch;
	3508	size_t len;
	3509	/* Calculate the index for equivalence class. */
	3510	cp = name;
	3511	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	3512	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3513	_NL_COLLATE_WEIGHTMB);
	3514	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3515	_NL_COLLATE_EXTRAMB);
	3516	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3517	_NL_COLLATE_INDIRECTMB);
	3518	idx1 = findidx (table, indirect, extra, &cp, -1);
	3519	if (__glibc_unlikely (idx1 == 0 \|\| *cp != '\0'))
	3520	/* This isn't a valid character. */
	3521	return REG_ECOLLATE;
	3522
	3523	/* Build single byte matching table for this equivalence class. */
	3524	len = weights[idx1 & 0xffffff];
	3525	for (ch = 0; ch < SBC_MAX; ++ch)
	3526	{
	3527	char_buf[0] = ch;
	3528	cp = char_buf;
	3529	idx2 = findidx (table, indirect, extra, &cp, 1);
	3530	/*
	3531	idx2 = table[ch];
	3532	*/
	3533	if (idx2 == 0)
	3534	/* This isn't a valid character. */
	3535	continue;
	3536	/* Compare only if the length matches and the collation rule
	3537	index is the same. */
	3538	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
	3539	&& memcmp (weights + (idx1 & 0xffffff) + 1,
	3540	weights + (idx2 & 0xffffff) + 1, len) == 0)
	3541	bitset_set (sbcset, ch);
	3542	}
	3543	/* Check whether the array has enough space. */
	3544	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
	3545	{
	3546	/* Not enough, realloc it. */
	3547	/* +1 in case of mbcset->nequiv_classes is 0. */
	3548	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
	3549	/* Use realloc since the array is NULL if alloc == 0. /
	3550	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
	3551	int32_t,
	3552	new_equiv_class_alloc);
	3553	if (__glibc_unlikely (new_equiv_classes == NULL))
	3554	return REG_ESPACE;
	3555	mbcset->equiv_classes = new_equiv_classes;
	3556	*equiv_class_alloc = new_equiv_class_alloc;
	3557	}
	3558	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
	3559	}
	3560	else
	3561	#endif /* _LIBC */
	3562	{
	3563	if (__glibc_unlikely (strlen ((const char *) name) != 1))
	3564	return REG_ECOLLATE;
	3565	bitset_set (sbcset, *name);
	3566	}
	3567	return REG_NOERROR;
	3568	}
	3569
	3570	/* Helper function for parse_bracket_exp.
	3571	Build the character class which is represented by NAME.
	3572	The result are written to MBCSET and SBCSET.
	3573	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
	3574	is a pointer argument since we may update it. */
	3575
	3576	static reg_errcode_t
	3577	#ifdef RE_ENABLE_I18N
	3578	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
	3579	re_charset_t mbcset, Idx char_class_alloc,
	3580	const char *class_name, reg_syntax_t syntax)
	3581	#else /* not RE_ENABLE_I18N */
	3582	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
	3583	const char *class_name, reg_syntax_t syntax)
	3584	#endif /* not RE_ENABLE_I18N */
	3585	{
	3586	int i;
	3587	const char *name = class_name;
	3588
	3589	/* In case of REG_ICASE "upper" and "lower" match the both of
	3590	upper and lower cases. */
	3591	if ((syntax & RE_ICASE)
	3592	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
	3593	name = "alpha";
	3594
	3595	#ifdef RE_ENABLE_I18N
	3596	/* Check the space of the arrays. */
	3597	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
	3598	{
	3599	/* Not enough, realloc it. */
	3600	/* +1 in case of mbcset->nchar_classes is 0. */
	3601	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
	3602	/* Use realloc since array is NULL if alloc == 0. /
	3603	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
	3604	new_char_class_alloc);
	3605	if (__glibc_unlikely (new_char_classes == NULL))
	3606	return REG_ESPACE;
	3607	mbcset->char_classes = new_char_classes;
	3608	*char_class_alloc = new_char_class_alloc;
	3609	}
	3610	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
	3611	#endif /* RE_ENABLE_I18N */
	3612
	3613	#define BUILD_CHARCLASS_LOOP(ctype_func) \
	3614	do { \
	3615	if (__glibc_unlikely (trans != NULL)) \
	3616	{ \
	3617	for (i = 0; i < SBC_MAX; ++i) \
	3618	if (ctype_func (i)) \
	3619	bitset_set (sbcset, trans[i]); \
	3620	} \
	3621	else \
	3622	{ \
	3623	for (i = 0; i < SBC_MAX; ++i) \
	3624	if (ctype_func (i)) \
	3625	bitset_set (sbcset, i); \
	3626	} \
	3627	} while (0)
	3628
	3629	if (strcmp (name, "alnum") == 0)
	3630	BUILD_CHARCLASS_LOOP (isalnum);
	3631	else if (strcmp (name, "cntrl") == 0)
	3632	BUILD_CHARCLASS_LOOP (iscntrl);
	3633	else if (strcmp (name, "lower") == 0)
	3634	BUILD_CHARCLASS_LOOP (islower);
	3635	else if (strcmp (name, "space") == 0)
	3636	BUILD_CHARCLASS_LOOP (isspace);
	3637	else if (strcmp (name, "alpha") == 0)
	3638	BUILD_CHARCLASS_LOOP (isalpha);
	3639	else if (strcmp (name, "digit") == 0)
	3640	BUILD_CHARCLASS_LOOP (isdigit);
	3641	else if (strcmp (name, "print") == 0)
	3642	BUILD_CHARCLASS_LOOP (isprint);
	3643	else if (strcmp (name, "upper") == 0)
	3644	BUILD_CHARCLASS_LOOP (isupper);
	3645	else if (strcmp (name, "blank") == 0)
	3646	BUILD_CHARCLASS_LOOP (isblank);
	3647	else if (strcmp (name, "graph") == 0)
	3648	BUILD_CHARCLASS_LOOP (isgraph);
	3649	else if (strcmp (name, "punct") == 0)
	3650	BUILD_CHARCLASS_LOOP (ispunct);
	3651	else if (strcmp (name, "xdigit") == 0)
	3652	BUILD_CHARCLASS_LOOP (isxdigit);
	3653	else
	3654	return REG_ECTYPE;
	3655
	3656	return REG_NOERROR;
	3657	}
	3658
	3659	static bin_tree_t *
	3660	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
	3661	const char *class_name,
	3662	const char *extra, bool non_match,
	3663	reg_errcode_t *err)
	3664	{
	3665	re_bitset_ptr_t sbcset;
	3666	#ifdef RE_ENABLE_I18N
	3667	re_charset_t *mbcset;
	3668	Idx alloc = 0;
	3669	#endif /* not RE_ENABLE_I18N */
	3670	reg_errcode_t ret;
	3671	bin_tree_t *tree;
	3672
	3673	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	3674	if (__glibc_unlikely (sbcset == NULL))
	3675	{
	3676	*err = REG_ESPACE;
	3677	return NULL;
	3678	}
	3679	#ifdef RE_ENABLE_I18N
	3680	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
	3681	if (__glibc_unlikely (mbcset == NULL))
	3682	{
	3683	re_free (sbcset);
	3684	*err = REG_ESPACE;
	3685	return NULL;
	3686	}
	3687	mbcset->non_match = non_match;
	3688	#endif /* RE_ENABLE_I18N */
	3689
	3690	/* We don't care the syntax in this case. */
	3691	ret = build_charclass (trans, sbcset,
	3692	#ifdef RE_ENABLE_I18N
	3693	mbcset, &alloc,
	3694	#endif /* RE_ENABLE_I18N */
	3695	class_name, 0);
	3696
	3697	if (__glibc_unlikely (ret != REG_NOERROR))
	3698	{
	3699	re_free (sbcset);
	3700	#ifdef RE_ENABLE_I18N
	3701	free_charset (mbcset);
	3702	#endif /* RE_ENABLE_I18N */
	3703	*err = ret;
	3704	return NULL;
	3705	}
	3706	/* \w match '_' also. */
	3707	for (; *extra; extra++)
	3708	bitset_set (sbcset, *extra);
	3709
	3710	/* If it is non-matching list. */
	3711	if (non_match)
	3712	bitset_not (sbcset);
	3713
	3714	#ifdef RE_ENABLE_I18N
	3715	/* Ensure only single byte characters are set. */
	3716	if (dfa->mb_cur_max > 1)
	3717	bitset_mask (sbcset, dfa->sb_char);
	3718	#endif
	3719
	3720	/* Build a tree for simple bracket. */
	3721	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
	3722	tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3723	if (__glibc_unlikely (tree == NULL))
	3724	goto build_word_op_espace;
	3725
	3726	#ifdef RE_ENABLE_I18N
	3727	if (dfa->mb_cur_max > 1)
	3728	{
	3729	bin_tree_t *mbc_tree;
	3730	/* Build a tree for complex bracket. */
	3731	br_token.type = COMPLEX_BRACKET;
	3732	br_token.opr.mbcset = mbcset;
	3733	dfa->has_mb_node = 1;
	3734	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3735	if (__glibc_unlikely (mbc_tree == NULL))
	3736	goto build_word_op_espace;
	3737	/* Then join them by ALT node. */
	3738	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
	3739	if (__glibc_likely (mbc_tree != NULL))
	3740	return tree;
	3741	}
	3742	else
	3743	{
	3744	free_charset (mbcset);
	3745	return tree;
	3746	}
	3747	#else /* not RE_ENABLE_I18N */
	3748	return tree;
	3749	#endif /* not RE_ENABLE_I18N */
	3750
	3751	build_word_op_espace:
	3752	re_free (sbcset);
	3753	#ifdef RE_ENABLE_I18N
	3754	free_charset (mbcset);
	3755	#endif /* RE_ENABLE_I18N */
	3756	*err = REG_ESPACE;
	3757	return NULL;
	3758	}
	3759
	3760	/* This is intended for the expressions like "a{1,3}".
	3761	Fetch a number from 'input', and return the number.
	3762	Return -1 if the number field is empty like "{,1}".
	3763	Return RE_DUP_MAX + 1 if the number field is too large.
	3764	Return -2 if an error occurred. */
	3765
	3766	static Idx
	3767	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
	3768	{
	3769	Idx num = -1;
	3770	unsigned char c;
	3771	while (1)
	3772	{
	3773	fetch_token (token, input, syntax);
	3774	c = token->opr.c;
	3775	if (__glibc_unlikely (token->type == END_OF_RE))
	3776	return -2;
	3777	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
	3778	break;
	3779	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
	3780	? -2
	3781	: num == -1
	3782	? c - '0'
	3783	: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
	3784	}
	3785	return num;
	3786	}
	3787
	3788
	#ifdef RE_ENABLE_I18N
	/* Release the multibyte character set CSET: first every array it
	   owns, then the structure itself.  CSET must not be NULL.  */
	static void
	free_charset (re_charset_t *cset)
	{
	  re_free (cset->char_classes);
	  re_free (cset->range_ends);
	  re_free (cset->range_starts);
	# ifdef _LIBC
	  /* These two arrays are only released in a _LIBC build.  */
	  re_free (cset->equiv_classes);
	  re_free (cset->coll_syms);
	# endif
	  re_free (cset->mbchars);
	  /* The container goes last, after all of its members.  */
	  re_free (cset);
	}
	#endif /* RE_ENABLE_I18N */
	3804
	3805
	3806	/* Functions for binary tree operation. */
	3807
	3808	/* Create a tree node. */
	3809
	3810	static bin_tree_t *
	3811	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
	3812	re_token_type_t type)
	3813	{
	3814	re_token_t t = { .type = type };
	3815	return create_token_tree (dfa, left, right, &t);
	3816	}
	3817
	3818	static bin_tree_t *
	3819	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
	3820	const re_token_t *token)
	3821	{
	3822	bin_tree_t *tree;
	3823	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
	3824	{
	3825	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
	3826
	3827	if (storage == NULL)
	3828	return NULL;
	3829	storage->next = dfa->str_tree_storage;
	3830	dfa->str_tree_storage = storage;
	3831	dfa->str_tree_storage_idx = 0;
	3832	}
	3833	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
	3834
	3835	tree->parent = NULL;
	3836	tree->left = left;
	3837	tree->right = right;
	3838	tree->token = *token;
	3839	tree->token.duplicated = 0;
	3840	tree->token.opt_subexp = 0;
	3841	tree->first = NULL;
	3842	tree->next = NULL;
	3843	tree->node_idx = -1;
	3844
	3845	if (left != NULL)
	3846	left->parent = tree;
	3847	if (right != NULL)
	3848	right->parent = tree;
	3849	return tree;
	3850	}
	3851
	3852	/* Mark the tree SRC as an optional subexpression.
	3853	To be called from preorder or postorder. */
	3854
	3855	static reg_errcode_t
	3856	mark_opt_subexp (void extra, bin_tree_t node)
	3857	{
	3858	Idx idx = (uintptr_t) extra;
	3859	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
	3860	node->token.opt_subexp = 1;
	3861
	3862	return REG_NOERROR;
	3863	}
	3864
	3865	/* Free the allocated memory inside NODE. */
	3866
	3867	static void
	3868	free_token (re_token_t *node)
	3869	{
	3870	#ifdef RE_ENABLE_I18N
	3871	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
	3872	free_charset (node->opr.mbcset);
	3873	else
	3874	#endif /* RE_ENABLE_I18N */
	3875	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
	3876	re_free (node->opr.sbcset);
	3877	}
	3878
	3879	/* Worker function for tree walking. Free the allocated memory inside NODE
	3880	and its children. */
	3881
	3882	static reg_errcode_t
	3883	free_tree (void extra, bin_tree_t node)
	3884	{
	3885	free_token (&node->token);
	3886	return REG_NOERROR;
	3887	}
	3888
	3889
	3890	/* Duplicate the node SRC, and return new node. This is a preorder
	3891	visit similar to the one implemented by the generic visitor, but
	3892	we need more infrastructure to maintain two parallel trees --- so,
	3893	it's easier to duplicate. */
	3894
	3895	static bin_tree_t *
	3896	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
	3897	{
	3898	const bin_tree_t *node;
	3899	bin_tree_t *dup_root;
	3900	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
	3901
	3902	for (node = root; ; )
	3903	{
	3904	/* Create a new tree and link it back to the current parent. */
	3905	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
	3906	if (*p_new == NULL)
	3907	return NULL;
	3908	(*p_new)->parent = dup_node;
	3909	(*p_new)->token.duplicated = 1;
	3910	dup_node = *p_new;
	3911
	3912	/* Go to the left node, or up and to the right. */
	3913	if (node->left)
	3914	{
	3915	node = node->left;
	3916	p_new = &dup_node->left;
	3917	}
	3918	else
	3919	{
	3920	const bin_tree_t *prev = NULL;
	3921	while (node->right == prev \|\| node->right == NULL)
	3922	{
	3923	prev = node;
	3924	node = node->parent;
	3925	dup_node = dup_node->parent;
	3926	if (!node)
	3927	return dup_root;
	3928	}
	3929	node = node->right;
	3930	p_new = &dup_node->right;
	3931	}
	3932	}
	3933	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/grep/lib/regcomp.c

Download in other formats: