Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

regcomp.c

Last change on this file was 3613, checked in by bird, 10 months ago
src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge ^{/vendor/sed/4.1.5}/vendor/sed/current .)
File size: 109.5 KB

Rev	Line
[599]	1	/* Extended regular expression matching and search library.
[3613]	2	Copyright (C) 2002-2022 Free Software Foundation, Inc.
[599]	3	This file is part of the GNU C Library.
	4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
[3613]	17	License along with the GNU C Library; if not, see
	18	<https://www.gnu.org/licenses/>. */
[599]	19
[3613]	20	#ifdef _LIBC
	21	# include <locale/weight.h>
	22	#endif
	23
[599]	24	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
	25	size_t length, reg_syntax_t syntax);
	26	static void re_compile_fastmap_iter (regex_t *bufp,
	27	const re_dfastate_t *init_state,
	28	char *fastmap);
	29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
	30	static void free_charset (re_charset_t *cset);
	31	static void free_workarea_compile (regex_t *preg);
	32	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
	33	static void optimize_utf8 (re_dfa_t *dfa);
	34	static reg_errcode_t analyze (regex_t *preg);
	35	static reg_errcode_t preorder (bin_tree_t *root,
	36	reg_errcode_t (fn (void , bin_tree_t )),
	37	void *extra);
	38	static reg_errcode_t postorder (bin_tree_t *root,
	39	reg_errcode_t (fn (void , bin_tree_t )),
	40	void *extra);
	41	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
	42	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
	43	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
	44	bin_tree_t *node);
	45	static reg_errcode_t calc_first (void extra, bin_tree_t node);
	46	static reg_errcode_t calc_next (void extra, bin_tree_t node);
	47	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
[3613]	48	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
	49	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
[599]	50	unsigned int constraint);
	51	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
	52	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
[3613]	53	Idx node, bool root);
[599]	54	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
[3613]	55	static Idx fetch_number (re_string_t input, re_token_t token,
[599]	56	reg_syntax_t syntax);
	57	static int peek_token (re_token_t token, re_string_t input,
[3613]	58	reg_syntax_t syntax);
[599]	59	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
	60	reg_syntax_t syntax, reg_errcode_t *err);
	61	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
	62	re_token_t *token, reg_syntax_t syntax,
[3613]	63	Idx nest, reg_errcode_t *err);
[599]	64	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
	65	re_token_t *token, reg_syntax_t syntax,
[3613]	66	Idx nest, reg_errcode_t *err);
[599]	67	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
	68	re_token_t *token, reg_syntax_t syntax,
[3613]	69	Idx nest, reg_errcode_t *err);
[599]	70	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
	71	re_token_t *token, reg_syntax_t syntax,
[3613]	72	Idx nest, reg_errcode_t *err);
[599]	73	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
	74	re_dfa_t dfa, re_token_t token,
	75	reg_syntax_t syntax, reg_errcode_t *err);
	76	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
	77	re_token_t *token, reg_syntax_t syntax,
	78	reg_errcode_t *err);
	79	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
	80	re_string_t *regexp,
	81	re_token_t *token, int token_len,
	82	re_dfa_t *dfa,
	83	reg_syntax_t syntax,
[3613]	84	bool accept_hyphen);
[599]	85	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
	86	re_string_t *regexp,
	87	re_token_t *token);
	88	static reg_errcode_t build_equiv_class (bitset_t sbcset,
	89	re_charset_t *mbcset,
[3613]	90	Idx *equiv_class_alloc,
[599]	91	const unsigned char *name);
	92	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
	93	bitset_t sbcset,
	94	re_charset_t *mbcset,
[3613]	95	Idx *char_class_alloc,
	96	const char *class_name,
[599]	97	reg_syntax_t syntax);
	98	static bin_tree_t build_charclass_op (re_dfa_t dfa,
	99	RE_TRANSLATE_TYPE trans,
[3613]	100	const char *class_name,
	101	const char *extra,
	102	bool non_match, reg_errcode_t *err);
[599]	103	static bin_tree_t create_tree (re_dfa_t dfa,
	104	bin_tree_t left, bin_tree_t right,
	105	re_token_type_t type);
	106	static bin_tree_t create_token_tree (re_dfa_t dfa,
	107	bin_tree_t left, bin_tree_t right,
	108	const re_token_t *token);
	109	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
	110	static void free_token (re_token_t *node);
	111	static reg_errcode_t free_tree (void extra, bin_tree_t node);
	112	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
	113
	114
	115	/* This table gives an error message for each of the error codes listed
	116	in regex.h. Obviously the order here has to be same as there.
	117	POSIX doesn't require that we do anything for REG_NOERROR,
	118	but why not be nice? */
[3613]	119
[599]	120	static const char __re_error_msgid[] =
	121	{
	122	#define REG_NOERROR_IDX 0
	123	gettext_noop ("Success") /* REG_NOERROR */
	124	"\0"
	125	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
	126	gettext_noop ("No match") /* REG_NOMATCH */
	127	"\0"
	128	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
	129	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
	130	"\0"
	131	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
	132	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
	133	"\0"
	134	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
	135	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
	136	"\0"
	137	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
	138	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
	139	"\0"
	140	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
	141	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
	142	"\0"
[3613]	143	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
[599]	144	gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
[3613]	145	"\0"
[599]	146	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
	147	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
	148	"\0"
	149	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
	150	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
	151	"\0"
	152	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
	153	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
	154	"\0"
	155	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
	156	gettext_noop ("Invalid range end") /* REG_ERANGE */
	157	"\0"
	158	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
	159	gettext_noop ("Memory exhausted") /* REG_ESPACE */
	160	"\0"
	161	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
	162	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
	163	"\0"
	164	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
	165	gettext_noop ("Premature end of regular expression") /* REG_EEND */
	166	"\0"
	167	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
	168	gettext_noop ("Regular expression too big") /* REG_ESIZE */
	169	"\0"
	170	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
	171	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
	172	};
[3613]	173
[599]	174	static const size_t __re_error_msgid_idx[] =
	175	{
	176	REG_NOERROR_IDX,
	177	REG_NOMATCH_IDX,
	178	REG_BADPAT_IDX,
	179	REG_ECOLLATE_IDX,
	180	REG_ECTYPE_IDX,
	181	REG_EESCAPE_IDX,
	182	REG_ESUBREG_IDX,
	183	REG_EBRACK_IDX,
	184	REG_EPAREN_IDX,
	185	REG_EBRACE_IDX,
	186	REG_BADBR_IDX,
	187	REG_ERANGE_IDX,
	188	REG_ESPACE_IDX,
	189	REG_BADRPT_IDX,
	190	REG_EEND_IDX,
	191	REG_ESIZE_IDX,
	192	REG_ERPAREN_IDX
	193	};
	194
	195
	196	/* Entry points for GNU code. */
	197
	198	/* re_compile_pattern is the GNU regular expression compiler: it
	199	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
[3613]	200	Returns 0 if the pattern was valid, otherwise an error string.
[599]	201
	202	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
	203	are set in BUFP on entry. */
[3613]	204
	205	const char *
[599]	206	re_compile_pattern (const char *pattern, size_t length,
	207	struct re_pattern_buffer *bufp)
	208	{
	209	reg_errcode_t ret;
	210
	211	/* And GNU code determines whether or not to get register information
	212	by passing null for the REGS argument to re_match, etc., not by
	213	setting no_sub, unless RE_NO_SUB is set. */
	214	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
	215
	216	/* Match anchors at newline. */
	217	bufp->newline_anchor = 1;
	218
	219	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
	220
	221	if (!ret)
	222	return NULL;
	223	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
	224	}
[3613]	225	weak_alias (__re_compile_pattern, re_compile_pattern)
[599]	226
	227	/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
	228	also be assigned to arbitrarily: each pattern buffer stores its own
	229	syntax, so it can be changed between regex compilations. */
	230	/* This has no initializer because initialized variables in Emacs
	231	become read-only after dumping. */
	232	reg_syntax_t re_syntax_options;
	233
	234
	235	/* Specify the precise syntax of regexps for compilation. This provides
	236	for compatibility for various utilities which historically have
	237	different, incompatible syntaxes.
	238
	239	The argument SYNTAX is a bit mask comprised of the various bits
	240	defined in regex.h. We return the old syntax. */
[3613]	241
[599]	242	reg_syntax_t
	243	re_set_syntax (reg_syntax_t syntax)
	244	{
	245	reg_syntax_t ret = re_syntax_options;
	246
	247	re_syntax_options = syntax;
	248	return ret;
	249	}
	250	weak_alias (__re_set_syntax, re_set_syntax)
[3613]	251
[599]	252	int
[3613]	253	re_compile_fastmap (struct re_pattern_buffer *bufp)
[599]	254	{
	255	re_dfa_t *dfa = bufp->buffer;
	256	char *fastmap = bufp->fastmap;
	257
	258	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
	259	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
	260	if (dfa->init_state != dfa->init_state_word)
	261	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
	262	if (dfa->init_state != dfa->init_state_nl)
	263	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
	264	if (dfa->init_state != dfa->init_state_begbuf)
	265	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
	266	bufp->fastmap_accurate = 1;
	267	return 0;
	268	}
[3613]	269	weak_alias (__re_compile_fastmap, re_compile_fastmap)
	270
[599]	271	static __always_inline void
	272	re_set_fastmap (char *fastmap, bool icase, int ch)
	273	{
	274	fastmap[ch] = 1;
	275	if (icase)
	276	fastmap[tolower (ch)] = 1;
	277	}
	278
	279	/* Helper function for re_compile_fastmap.
	280	Compile fastmap for the initial_state INIT_STATE. */
	281
	282	static void
	283	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
[3613]	284	char *fastmap)
	285	{
	286	re_dfa_t *dfa = bufp->buffer;
[599]	287	Idx node_cnt;
	288	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
[3613]	289	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
[599]	290	{
	291	Idx node = init_state->nodes.elems[node_cnt];
	292	re_token_type_t type = dfa->nodes[node].type;
	293
	294	if (type == CHARACTER)
	295	{
	296	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
[3613]	297	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	298	{
[599]	299	unsigned char buf[MB_LEN_MAX];
	300	unsigned char *p;
	301	wchar_t wc;
	302	mbstate_t state;
	303
	304	p = buf;
	305	*p++ = dfa->nodes[node].opr.c;
	306	while (++node < dfa->nodes_len
	307	&& dfa->nodes[node].type == CHARACTER
	308	&& dfa->nodes[node].mb_partial)
[3613]	309	*p++ = dfa->nodes[node].opr.c;
	310	memset (&state, '\0', sizeof (state));
	311	if (__mbrtowc (&wc, (const char *) buf, p - buf,
[599]	312	&state) == p - buf
[3613]	313	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
[599]	314	!= (size_t) -1))
	315	re_set_fastmap (fastmap, false, buf[0]);
	316	}
	317	}
	318	else if (type == SIMPLE_BRACKET)
	319	{
	320	int i, ch;
	321	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
	322	{
	323	int j;
	324	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
	325	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	326	if (w & ((bitset_word_t) 1 << j))
	327	re_set_fastmap (fastmap, icase, ch);
	328	}
	329	}
	330	else if (type == COMPLEX_BRACKET)
[3613]	331	{
	332	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
	333	Idx i;
	334
	335	#ifdef _LIBC
	336	/* See if we have to try all bytes which start multiple collation
	337	elements.
	338	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
	339	collation element, and don't catch 'b' since 'b' is
	340	the only collation element which starts from 'b' (and
	341	it is caught by SIMPLE_BRACKET). */
[599]	342	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
	343	&& (cset->ncoll_syms \|\| cset->nranges))
	344	{
	345	const int32_t table = (const int32_t )
	346	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	347	for (i = 0; i < SBC_MAX; ++i)
	348	if (table[i] < 0)
[3613]	349	re_set_fastmap (fastmap, icase, i);
	350	}
	351	#endif /* _LIBC */
	352
	353	/* See if we have to start the match at all multibyte characters,
	354	i.e. where we would not find an invalid sequence. This only
	355	applies to multibyte character sets; for single byte character
	356	sets, the SIMPLE_BRACKET again suffices. */
	357	if (dfa->mb_cur_max > 1
	358	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
	359	#ifdef _LIBC
	360	\|\| cset->nequiv_classes
	361	#endif /* _LIBC */
	362	))
	363	{
	364	unsigned char c = 0;
	365	do
	366	{
	367	mbstate_t mbs;
	368	memset (&mbs, 0, sizeof (mbs));
	369	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
	370	re_set_fastmap (fastmap, false, (int) c);
[599]	371	}
[3613]	372	while (++c != 0);
	373	}
[599]	374
[3613]	375	else
	376	{
[599]	377	/* ... Else catch all bytes which can start the mbchars. */
[3613]	378	for (i = 0; i < cset->nmbchars; ++i)
	379	{
	380	char buf[256];
	381	mbstate_t state;
	382	memset (&state, '\0', sizeof (state));
	383	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
	384	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
	385	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	386	{
	387	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
	388	!= (size_t) -1)
[599]	389	re_set_fastmap (fastmap, false, (unsigned char ) buf);
	390	}
	391	}
[3613]	392	}
[599]	393	}
	394	else if (type == OP_PERIOD \|\| type == OP_UTF8_PERIOD \|\| type == END_OF_RE)
	395	{
	396	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
	397	if (type == END_OF_RE)
	398	bufp->can_be_null = 1;
	399	return;
	400	}
	401	}
	402	}
	403
	404
	405	/* Entry point for POSIX code. */
	406	/* regcomp takes a regular expression as a string and compiles it.
	407
[3613]	408	PREG is a regex_t *. We do not expect any fields to be initialized,
	409	since POSIX says we shouldn't. Thus, we set
	410
[599]	411	'buffer' to the compiled pattern;
	412	'used' to the length of the compiled pattern;
[3613]	413	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
	414	REG_EXTENDED bit in CFLAGS is set; otherwise, to
	415	RE_SYNTAX_POSIX_BASIC;
	416	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
[599]	417	'fastmap' to an allocated space for the fastmap;
	418	'fastmap_accurate' to zero;
	419	're_nsub' to the number of subexpressions in PATTERN.
	420
	421	PATTERN is the address of the pattern string.
	422
	423	CFLAGS is a series of bits which affect compilation.
	424
	425	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
	426	use POSIX basic syntax.
	427
	428	If REG_NEWLINE is set, then . and [^...] don't match newline.
	429	Also, regexec will try a match beginning after every newline.
	430
	431	If REG_ICASE is set, then we considers upper- and lowercase
	432	versions of letters to be equivalent when matching.
	433
	434	If REG_NOSUB is set, then when PREG is passed to regexec, that
	435	routine will report only success or failure, and nothing about the
	436	registers.
	437
	438	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
[3613]	439	the return codes and their meanings.) */
[599]	440
	441	int
	442	regcomp (regex_t __restrict preg, const char __restrict pattern, int cflags)
	443	{
	444	reg_errcode_t ret;
	445	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
	446	: RE_SYNTAX_POSIX_BASIC);
	447
	448	preg->buffer = NULL;
	449	preg->allocated = 0;
	450	preg->used = 0;
[3613]	451
[599]	452	/* Try to allocate space for the fastmap. */
	453	preg->fastmap = re_malloc (char, SBC_MAX);
	454	if (__glibc_unlikely (preg->fastmap == NULL))
	455	return REG_ESPACE;
	456
	457	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
	458
	459	/* If REG_NEWLINE is set, newlines are treated differently. */
	460	if (cflags & REG_NEWLINE)
	461	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
	462	syntax &= ~RE_DOT_NEWLINE;
	463	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
	464	/* It also changes the matching behavior. */
	465	preg->newline_anchor = 1;
	466	}
	467	else
	468	preg->newline_anchor = 0;
	469	preg->no_sub = !!(cflags & REG_NOSUB);
	470	preg->translate = NULL;
	471
	472	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
	473
	474	/* POSIX doesn't distinguish between an unmatched open-group and an
	475	unmatched close-group: both are REG_EPAREN. */
	476	if (ret == REG_ERPAREN)
[3613]	477	ret = REG_EPAREN;
[599]	478
	479	/* We have already checked preg->fastmap != NULL. */
	480	if (__glibc_likely (ret == REG_NOERROR))
	481	/* Compute the fastmap now, since regexec cannot modify the pattern
	482	buffer. This function never fails in this implementation. */
	483	(void) re_compile_fastmap (preg);
	484	else
	485	{
	486	/* Some error occurred while compiling the expression. */
	487	re_free (preg->fastmap);
	488	preg->fastmap = NULL;
	489	}
[3613]	490
[599]	491	return (int) ret;
	492	}
	493	libc_hidden_def (__regcomp)
	494	weak_alias (__regcomp, regcomp)
	495
	496	/* Returns a message corresponding to an error code, ERRCODE, returned
[3613]	497	from either regcomp or regexec. We don't use PREG here. */
	498
[599]	499	size_t
	500	regerror (int errcode, const regex_t __restrict preg, char __restrict errbuf,
	501	size_t errbuf_size)
[3613]	502	{
[599]	503	const char *msg;
[3613]	504	size_t msg_size;
[599]	505	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
	506
	507	if (__glibc_unlikely (errcode < 0 \|\| errcode >= nerrcodes))
	508	/* Only error codes returned by the rest of the code should be passed
	509	to this routine. If we are given anything else, or if other regex
	510	code generates an invalid error code, then the program has a bug.
	511	Dump core so we can fix it. */
	512	abort ();
	513
	514	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
[3613]	515
[599]	516	msg_size = strlen (msg) + 1; /* Includes the null. */
[3613]	517
	518	if (__glibc_likely (errbuf_size != 0))
[599]	519	{
[3613]	520	size_t cpy_size = msg_size;
	521	if (__glibc_unlikely (msg_size > errbuf_size))
[599]	522	{
[3613]	523	cpy_size = errbuf_size - 1;
[599]	524	errbuf[cpy_size] = '\0';
	525	}
	526	memcpy (errbuf, msg, cpy_size);
	527	}
	528
	529	return msg_size;
	530	}
	531	weak_alias (__regerror, regerror)
	532
	533
	534	/* This static array is used for the map to single-byte characters when
	535	UTF-8 is used. Otherwise we would allocate memory just to initialize
	536	it the same all the time. UTF-8 is the preferred encoding so this is
[606]	537	a worthwhile optimization. */
[3613]	538	static const bitset_t utf8_sb_map =
	539	{
[606]	540	/* Set the first 128 bits. */
[3613]	541	#if (defined __GNUC__ \|\| __clang_major__ >= 4) && !defined __STRICT_ANSI__
	542	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
	543	#else
	544	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
	545	# error "bitset_word_t is narrower than 32 bits"
	546	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
	547	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
	548	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
	549	BITSET_WORD_MAX, BITSET_WORD_MAX,
	550	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
	551	BITSET_WORD_MAX,
	552	# endif
	553	(BITSET_WORD_MAX
[606]	554	>> (SBC_MAX % BITSET_WORD_BITS == 0
[599]	555	? 0
	556	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
	557	#endif
	558	};
	559
	560
[3613]	561	static void
[599]	562	free_dfa_content (re_dfa_t *dfa)
	563	{
	564	Idx i, j;
	565
	566	if (dfa->nodes)
	567	for (i = 0; i < dfa->nodes_len; ++i)
	568	free_token (dfa->nodes + i);
	569	re_free (dfa->nexts);
	570	for (i = 0; i < dfa->nodes_len; ++i)
	571	{
	572	if (dfa->eclosures != NULL)
	573	re_node_set_free (dfa->eclosures + i);
	574	if (dfa->inveclosures != NULL)
	575	re_node_set_free (dfa->inveclosures + i);
	576	if (dfa->edests != NULL)
	577	re_node_set_free (dfa->edests + i);
	578	}
	579	re_free (dfa->edests);
	580	re_free (dfa->eclosures);
	581	re_free (dfa->inveclosures);
	582	re_free (dfa->nodes);
	583
	584	if (dfa->state_table)
	585	for (i = 0; i <= dfa->state_hash_mask; ++i)
	586	{
	587	struct re_state_table_entry *entry = dfa->state_table + i;
	588	for (j = 0; j < entry->num; ++j)
	589	{
[3613]	590	re_dfastate_t *state = entry->array[j];
[599]	591	free_state (state);
	592	}
	593	re_free (entry->array);
	594	}
	595	re_free (dfa->state_table);
	596	if (dfa->sb_char != utf8_sb_map)
	597	re_free (dfa->sb_char);
	598	re_free (dfa->subexp_map);
	599	#ifdef DEBUG
	600	re_free (dfa->re_str);
	601	#endif
	602
	603	re_free (dfa);
	604	}
	605
	606
[3613]	607	/* Free dynamically allocated space used by PREG. */
[599]	608
[3613]	609	void
	610	regfree (regex_t *preg)
	611	{
	612	re_dfa_t *dfa = preg->buffer;
	613	if (__glibc_likely (dfa != NULL))
	614	{
[599]	615	lock_fini (dfa->lock);
	616	free_dfa_content (dfa);
	617	}
	618	preg->buffer = NULL;
	619	preg->allocated = 0;
	620
	621	re_free (preg->fastmap);
	622	preg->fastmap = NULL;
	623
[3613]	624	re_free (preg->translate);
[599]	625	preg->translate = NULL;
	626	}
	627	libc_hidden_def (__regfree)
	628	weak_alias (__regfree, regfree)
	629
	630
	631	/* Entry points compatible with 4.2 BSD regex library. We don't define
	632	them unless specifically requested. */
	633
	634	#if defined _REGEX_RE_COMP \|\| defined _LIBC
	635
	636	/* BSD has one and only one pattern buffer. */
	637	static struct re_pattern_buffer re_comp_buf;
	638
	639	char *
	640	# ifdef _LIBC
	641	/* Make these definitions weak in libc, so POSIX programs can redefine
[3613]	642	these names if they don't use our functions, and still use
[599]	643	regcomp/regexec above without link errors. */
	644	weak_function
	645	# endif
	646	re_comp (const char *s)
	647	{
	648	reg_errcode_t ret;
	649	char *fastmap;
	650
	651	if (!s)
	652	{
	653	if (!re_comp_buf.buffer)
	654	return gettext ("No previous regular expression");
	655	return 0;
	656	}
	657
	658	if (re_comp_buf.buffer)
	659	{
	660	fastmap = re_comp_buf.fastmap;
	661	re_comp_buf.fastmap = NULL;
	662	__regfree (&re_comp_buf);
	663	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
	664	re_comp_buf.fastmap = fastmap;
[3613]	665	}
[599]	666
	667	if (re_comp_buf.fastmap == NULL)
	668	{
	669	re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
	670	if (re_comp_buf.fastmap == NULL)
[3613]	671	return (char *) gettext (__re_error_msgid
[599]	672	+ __re_error_msgid_idx[(int) REG_ESPACE]);
	673	}
	674
	675	/* Since 're_exec' always passes NULL for the 'regs' argument, we
	676	don't need to initialize the pattern buffer fields which affect it. */
	677
	678	/* Match anchors at newlines. */
	679	re_comp_buf.newline_anchor = 1;
	680
	681	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
[3613]	682
[599]	683	if (!ret)
	684	return NULL;
	685
	686	/* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
	687	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
	688	}
	689
	690	#ifdef _LIBC
	691	libc_freeres_fn (free_mem)
	692	{
	693	__regfree (&re_comp_buf);
	694	}
	695	#endif
	696
	697	#endif /* _REGEX_RE_COMP */
	698
	699
	700	/* Internal entry point.
	701	Compile the regular expression PATTERN, whose length is LENGTH.
	702	SYNTAX indicate regular expression's syntax. */
	703
	704	static reg_errcode_t
	705	re_compile_internal (regex_t preg, const char pattern, size_t length,
	706	reg_syntax_t syntax)
	707	{
	708	reg_errcode_t err = REG_NOERROR;
	709	re_dfa_t *dfa;
	710	re_string_t regexp;
	711
	712	/* Initialize the pattern buffer. */
	713	preg->fastmap_accurate = 0;
	714	preg->syntax = syntax;
	715	preg->not_bol = preg->not_eol = 0;
	716	preg->used = 0;
[3613]	717	preg->re_nsub = 0;
	718	preg->can_be_null = 0;
[599]	719	preg->regs_allocated = REGS_UNALLOCATED;
	720
	721	/* Initialize the dfa. */
	722	dfa = preg->buffer;
	723	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
	724	{
	725	/* If zero allocated, but buffer is non-null, try to realloc
	726	enough space. This loses if buffer's address is bogus, but
	727	that is the user's responsibility. If ->buffer is NULL this
[3613]	728	is a simple allocation. */
[599]	729	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
	730	if (dfa == NULL)
	731	return REG_ESPACE;
	732	preg->allocated = sizeof (re_dfa_t);
[3613]	733	preg->buffer = dfa;
	734	}
	735	preg->used = sizeof (re_dfa_t);
[599]	736
	737	err = init_dfa (dfa, length);
	738	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
	739	err = REG_ESPACE;
	740	if (__glibc_unlikely (err != REG_NOERROR))
	741	{
	742	free_dfa_content (dfa);
	743	preg->buffer = NULL;
	744	preg->allocated = 0;
	745	return err;
	746	}
	747	#ifdef DEBUG
	748	/* Note: length+1 will not overflow since it is checked in init_dfa. */
[3613]	749	dfa->re_str = re_malloc (char, length + 1);
	750	strncpy (dfa->re_str, pattern, length + 1);
[599]	751	#endif
	752
	753	err = re_string_construct (&regexp, pattern, length, preg->translate,
	754	(syntax & RE_ICASE) != 0, dfa);
[3613]	755	if (__glibc_unlikely (err != REG_NOERROR))
[599]	756	{
	757	re_compile_internal_free_return:
	758	free_workarea_compile (preg);
	759	re_string_destruct (&regexp);
	760	lock_fini (dfa->lock);
	761	free_dfa_content (dfa);
	762	preg->buffer = NULL;
	763	preg->allocated = 0;
	764	return err;
[3613]	765	}
[599]	766
	767	/* Parse the regular expression, and build a structure tree. */
	768	preg->re_nsub = 0;
	769	dfa->str_tree = parse (&regexp, preg, syntax, &err);
[3613]	770	if (__glibc_unlikely (dfa->str_tree == NULL))
[599]	771	goto re_compile_internal_free_return;
	772
	773	/* Analyze the tree and create the nfa. */
	774	err = analyze (preg);
	775	if (__glibc_unlikely (err != REG_NOERROR))
	776	goto re_compile_internal_free_return;
	777
	778	/* If possible, do searching in single byte encoding to speed things up. */
	779	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
	780	optimize_utf8 (dfa);
	781
	782	/* Then create the initial state of the dfa. */
	783	err = create_initial_state (dfa);
[3613]	784
[599]	785	/* Release work areas. */
[3613]	786	free_workarea_compile (preg);
[599]	787	re_string_destruct (&regexp);
	788
	789	if (__glibc_unlikely (err != REG_NOERROR))
	790	{
	791	lock_fini (dfa->lock);
	792	free_dfa_content (dfa);
	793	preg->buffer = NULL;
	794	preg->allocated = 0;
	795	}
	796
	797	return err;
	798	}
	799
	800	/* Initialize DFA. We use the length of the regular expression PAT_LEN
[3613]	801	as the initial length of some arrays. */
[599]	802
[3613]	803	static reg_errcode_t
[599]	804	init_dfa (re_dfa_t *dfa, size_t pat_len)
[3613]	805	{
	806	__re_size_t table_size;
	807	#ifndef _LIBC
	808	const char *codeset_name;
	809	#endif
	810	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
	811	size_t max_object_size =
[599]	812	MAX (sizeof (struct re_state_table_entry),
	813	MAX (sizeof (re_token_t),
	814	MAX (sizeof (re_node_set),
	815	MAX (sizeof (regmatch_t),
	816	max_i18n_object_size))));
	817
[3613]	818	memset (dfa, '\0', sizeof (re_dfa_t));
	819
	820	/* Force allocation of str_tree_storage the first time. */
	821	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	822
	823	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
[599]	824	calculation below, and for similar doubling calculations
	825	elsewhere. And it's <= rather than <, because some of the
	826	doubling calculations add 1 afterwards. */
	827	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
	828	<= pat_len))
	829	return REG_ESPACE;
	830
	831	dfa->nodes_alloc = pat_len + 1;
	832	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
	833
	834	/* table_size = 2 ^ ceil(log pat_len) */
	835	for (table_size = 1; ; table_size <<= 1)
	836	if (table_size > pat_len)
	837	break;
	838
	839	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
	840	dfa->state_hash_mask = table_size - 1;
	841
	842	dfa->mb_cur_max = MB_CUR_MAX;
	843	#ifdef _LIBC
	844	if (dfa->mb_cur_max == 6
[3550]	845	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
	846	dfa->is_utf8 = 1;
	847	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
	848	!= 0);
[599]	849	#else
[3613]	850	# ifdef KBUILD_OS_WINDOWS
	851	(void)codeset_name;
	852	if (get_crt_codepage() == MY_CP_UTF8)
	853	# else
[3550]	854	codeset_name = nl_langinfo (CODESET);
[599]	855	if ((codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
	856	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
	857	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
	858	&& strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
	859	# endif
	860	dfa->is_utf8 = 1;
	861
	862	/* We check exhaustively in the loop below if this charset is a
	863	superset of ASCII. */
	864	dfa->map_notascii = 0;
	865	#endif
	866
	867	if (dfa->mb_cur_max > 1)
	868	{
	869	if (dfa->is_utf8)
	870	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
[3613]	871	else
[599]	872	{
	873	int i, j, ch;
	874
	875	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	876	if (__glibc_unlikely (dfa->sb_char == NULL))
	877	return REG_ESPACE;
	878
	879	/* Set the bits corresponding to single byte chars. */
	880	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
[3613]	881	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
[599]	882	{
	883	wint_t wch = __btowc (ch);
[3613]	884	if (wch != WEOF)
[599]	885	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
	886	#ifndef _LIBC
	887	if (isascii (ch) && wch != ch)
	888	dfa->map_notascii = 1;
[3613]	889	#endif
[599]	890	}
	891	}
	892	}
	893
	894	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
	895	return REG_ESPACE;
	896	return REG_NOERROR;
	897	}
	898
	899	/* Initialize WORD_CHAR table, which indicate which character is
	900	"word". In this case "word" means that it is the word construction
[3613]	901	character used by some operators like "\<", "\>", etc. */
	902
	903	static void
[599]	904	init_word_char (re_dfa_t *dfa)
[3613]	905	{
	906	int i = 0;
	907	int j;
	908	int ch = 0;
	909	dfa->word_ops_used = 1;
	910	if (__glibc_likely (dfa->map_notascii == 0))
	911	{
	912	bitset_word_t bits0 = 0x00000000;
	913	bitset_word_t bits1 = 0x03ff0000;
	914	bitset_word_t bits2 = 0x87fffffe;
	915	bitset_word_t bits3 = 0x07fffffe;
	916	if (BITSET_WORD_BITS == 64)
	917	{
	918	/* Pacify gcc -Woverflow on 32-bit platformns. */
	919	dfa->word_char[0] = bits1 << 31 << 1 \| bits0;
	920	dfa->word_char[1] = bits3 << 31 << 1 \| bits2;
	921	i = 2;
	922	}
	923	else if (BITSET_WORD_BITS == 32)
	924	{
	925	dfa->word_char[0] = bits0;
	926	dfa->word_char[1] = bits1;
	927	dfa->word_char[2] = bits2;
	928	dfa->word_char[3] = bits3;
	929	i = 4;
	930	}
	931	else
	932	goto general_case;
	933	ch = 128;
	934
	935	if (__glibc_likely (dfa->is_utf8))
	936	{
	937	memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
	938	return;
[599]	939	}
	940	}
	941
	942	general_case:
	943	for (; i < BITSET_WORDS; ++i)
	944	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	945	if (isalnum (ch) \|\| ch == '_')
	946	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
	947	}
	948
[3613]	949	/* Free the work area which are only used while compiling. */
[599]	950
	951	static void
	952	free_workarea_compile (regex_t *preg)
	953	{
	954	re_dfa_t *dfa = preg->buffer;
	955	bin_tree_storage_t storage, next;
	956	for (storage = dfa->str_tree_storage; storage; storage = next)
	957	{
	958	next = storage->next;
	959	re_free (storage);
	960	}
	961	dfa->str_tree_storage = NULL;
	962	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	963	dfa->str_tree = NULL;
	964	re_free (dfa->org_indices);
	965	dfa->org_indices = NULL;
	966	}
	967
[3613]	968	/* Create initial states for all contexts. */
[599]	969
	970	static reg_errcode_t
	971	create_initial_state (re_dfa_t *dfa)
	972	{
	973	Idx first, i;
	974	reg_errcode_t err;
	975	re_node_set init_nodes;
	976
[3613]	977	/* Initial states have the epsilon closure of the node which is
[599]	978	the first node of the regular expression. */
	979	first = dfa->str_tree->first->node_idx;
	980	dfa->init_node = first;
	981	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
	982	if (__glibc_unlikely (err != REG_NOERROR))
	983	return err;
	984
	985	/* The back-references which are in initial states can epsilon transit,
	986	since in this case all of the subexpressions can be null.
[3613]	987	Then we add epsilon closures of the nodes which are the next nodes of
[599]	988	the back-references. */
	989	if (dfa->nbackref > 0)
[3613]	990	for (i = 0; i < init_nodes.nelem; ++i)
[599]	991	{
	992	Idx node_idx = init_nodes.elems[i];
	993	re_token_type_t type = dfa->nodes[node_idx].type;
	994
	995	Idx clexp_idx;
	996	if (type != OP_BACK_REF)
	997	continue;
	998	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
	999	{
	1000	re_token_t *clexp_node;
	1001	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
	1002	if (clexp_node->type == OP_CLOSE_SUBEXP
	1003	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
	1004	break;
	1005	}
[3613]	1006	if (clexp_idx == init_nodes.nelem)
[599]	1007	continue;
	1008
[3613]	1009	if (type == OP_BACK_REF)
	1010	{
	1011	Idx dest_idx = dfa->edests[node_idx].elems[0];
	1012	if (!re_node_set_contains (&init_nodes, dest_idx))
[599]	1013	{
	1014	reg_errcode_t merge_err
	1015	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
	1016	if (merge_err != REG_NOERROR)
	1017	return merge_err;
	1018	i = 0;
	1019	}
	1020	}
[3613]	1021	}
[599]	1022
	1023	/* It must be the first time to invoke acquire_state. */
	1024	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
	1025	/* We don't check ERR here, since the initial state must not be NULL. */
	1026	if (__glibc_unlikely (dfa->init_state == NULL))
	1027	return err;
	1028	if (dfa->init_state->has_constraint)
	1029	{
	1030	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
	1031	CONTEXT_WORD);
	1032	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
[3613]	1033	CONTEXT_NEWLINE);
	1034	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
	1035	&init_nodes,
[599]	1036	CONTEXT_NEWLINE
	1037	\| CONTEXT_BEGBUF);
	1038	if (__glibc_unlikely (dfa->init_state_word == NULL
	1039	\|\| dfa->init_state_nl == NULL
	1040	\|\| dfa->init_state_begbuf == NULL))
	1041	return err;
	1042	}
	1043	else
	1044	dfa->init_state_word = dfa->init_state_nl
	1045	= dfa->init_state_begbuf = dfa->init_state;
	1046
	1047	re_node_set_free (&init_nodes);
	1048	return REG_NOERROR;
	1049	}
	1050
	1051
	1052	/* If it is possible to do searching in single byte encoding instead of UTF-8
[3613]	1053	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
	1054	DFA nodes where needed. */
	1055
	1056	static void
[599]	1057	optimize_utf8 (re_dfa_t *dfa)
	1058	{
	1059	Idx node;
	1060	int i;
	1061	bool mb_chars = false;
[3613]	1062	bool has_period = false;
	1063
[599]	1064	for (node = 0; node < dfa->nodes_len; ++node)
	1065	switch (dfa->nodes[node].type)
[3613]	1066	{
[599]	1067	case CHARACTER:
	1068	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
	1069	mb_chars = true;
	1070	break;
	1071	case ANCHOR:
	1072	switch (dfa->nodes[node].opr.ctx_type)
	1073	{
[3613]	1074	case LINE_FIRST:
	1075	case LINE_LAST:
	1076	case BUF_FIRST:
[599]	1077	case BUF_LAST:
	1078	break;
	1079	default:
	1080	/* Word anchors etc. cannot be handled. It's okay to test
[3613]	1081	opr.ctx_type since constraints (for all DFA nodes) are
	1082	created by ORing one or more opr.ctx_type values. */
[599]	1083	return;
	1084	}
	1085	break;
	1086	case OP_PERIOD:
	1087	has_period = true;
	1088	break;
	1089	case OP_BACK_REF:
	1090	case OP_ALT:
	1091	case END_OF_RE:
	1092	case OP_DUP_ASTERISK:
[3613]	1093	case OP_OPEN_SUBEXP:
	1094	case OP_CLOSE_SUBEXP:
	1095	break;
	1096	case COMPLEX_BRACKET:
	1097	return;
	1098	case SIMPLE_BRACKET:
	1099	/* Just double check. */
	1100	{
	1101	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
	1102	? 0
	1103	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
	1104	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
[599]	1105	{
	1106	if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
	1107	return;
	1108	rshift = 0;
	1109	}
	1110	}
	1111	break;
	1112	default:
	1113	abort ();
[3613]	1114	}
[599]	1115
	1116	if (mb_chars \|\| has_period)
	1117	for (node = 0; node < dfa->nodes_len; ++node)
	1118	{
	1119	if (dfa->nodes[node].type == CHARACTER
	1120	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
	1121	dfa->nodes[node].mb_partial = 0;
	1122	else if (dfa->nodes[node].type == OP_PERIOD)
	1123	dfa->nodes[node].type = OP_UTF8_PERIOD;
	1124	}
	1125
	1126	/* The search can be in single byte locale. */
	1127	dfa->mb_cur_max = 1;
	1128	dfa->is_utf8 = 0;
	1129	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
	1130	}
	1131
[3613]	1132
[599]	1133	/* Analyze the structure tree, and calculate "first", "next", "edest",
	1134	"eclosure", and "inveclosure". */
	1135
[3613]	1136	static reg_errcode_t
	1137	analyze (regex_t *preg)
[599]	1138	{
	1139	re_dfa_t *dfa = preg->buffer;
[3613]	1140	reg_errcode_t ret;
	1141
[599]	1142	/* Allocate arrays. */
	1143	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
[3613]	1144	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
[599]	1145	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
	1146	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
[3613]	1147	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
[599]	1148	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
	1149	return REG_ESPACE;
	1150
	1151	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
	1152	if (dfa->subexp_map != NULL)
	1153	{
	1154	Idx i;
	1155	for (i = 0; i < preg->re_nsub; i++)
[3613]	1156	dfa->subexp_map[i] = i;
[599]	1157	preorder (dfa->str_tree, optimize_subexps, dfa);
	1158	for (i = 0; i < preg->re_nsub; i++)
	1159	if (dfa->subexp_map[i] != i)
	1160	break;
	1161	if (i == preg->re_nsub)
[3613]	1162	{
[599]	1163	re_free (dfa->subexp_map);
	1164	dfa->subexp_map = NULL;
[3613]	1165	}
[599]	1166	}
	1167
	1168	ret = postorder (dfa->str_tree, lower_subexps, preg);
[3613]	1169	if (__glibc_unlikely (ret != REG_NOERROR))
[599]	1170	return ret;
	1171	ret = postorder (dfa->str_tree, calc_first, dfa);
[3613]	1172	if (__glibc_unlikely (ret != REG_NOERROR))
[599]	1173	return ret;
	1174	preorder (dfa->str_tree, calc_next, dfa);
	1175	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
	1176	if (__glibc_unlikely (ret != REG_NOERROR))
	1177	return ret;
	1178	ret = calc_eclosure (dfa);
	1179	if (__glibc_unlikely (ret != REG_NOERROR))
	1180	return ret;
[3613]	1181
	1182	/* We only need this during the prune_impossible_nodes pass in regexec.c;
[599]	1183	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
	1184	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
	1185	\|\| dfa->nbackref)
	1186	{
	1187	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
	1188	if (__glibc_unlikely (dfa->inveclosures == NULL))
	1189	return REG_ESPACE;
	1190	ret = calc_inveclosure (dfa);
	1191	}
	1192
	1193	return ret;
	1194	}
	1195
	1196	/* Our parse trees are very unbalanced, so we cannot use a stack to
	1197	implement parse tree visits. Instead, we use parent pointers and
	1198	some hairy code in these two functions. */
	1199	static reg_errcode_t
	1200	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
	1201	void *extra)
	1202	{
	1203	bin_tree_t node, prev;
[3613]	1204
	1205	for (node = root; ; )
	1206	{
[599]	1207	/* Descend down the tree, preferably to the left (or to the right
	1208	if that's the only child). */
	1209	while (node->left \|\| node->right)
	1210	if (node->left)
[3613]	1211	node = node->left;
[599]	1212	else
[3613]	1213	node = node->right;
[599]	1214
	1215	do
	1216	{
	1217	reg_errcode_t err = fn (extra, node);
	1218	if (__glibc_unlikely (err != REG_NOERROR))
	1219	return err;
	1220	if (node->parent == NULL)
	1221	return REG_NOERROR;
	1222	prev = node;
	1223	node = node->parent;
	1224	}
	1225	/* Go up while we have a node that is reached from the right. */
	1226	while (node->right == prev \|\| node->right == NULL);
	1227	node = node->right;
	1228	}
	1229	}
	1230
	1231	static reg_errcode_t
	1232	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
[3613]	1233	void *extra)
[599]	1234	{
	1235	bin_tree_t *node;
	1236
	1237	for (node = root; ; )
	1238	{
	1239	reg_errcode_t err = fn (extra, node);
	1240	if (__glibc_unlikely (err != REG_NOERROR))
	1241	return err;
	1242
	1243	/* Go to the left node, or up and to the right. */
	1244	if (node->left)
	1245	node = node->left;
	1246	else
[3613]	1247	{
[599]	1248	bin_tree_t *prev = NULL;
	1249	while (node->right == prev \|\| node->right == NULL)
	1250	{
	1251	prev = node;
	1252	node = node->parent;
	1253	if (!node)
	1254	return REG_NOERROR;
	1255	}
	1256	node = node->right;
	1257	}
	1258	}
	1259	}
	1260
	1261	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
	1262	re_search_internal to map the inner one's opr.idx to this one's. Adjust
	1263	backreferences as well. Requires a preorder visit. */
	1264	static reg_errcode_t
	1265	optimize_subexps (void extra, bin_tree_t node)
	1266	{
	1267	re_dfa_t dfa = (re_dfa_t ) extra;
	1268
	1269	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
[3613]	1270	{
[599]	1271	int idx = node->token.opr.idx;
[3613]	1272	node->token.opr.idx = dfa->subexp_map[idx];
[599]	1273	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
	1274	}
	1275
[3613]	1276	else if (node->token.type == SUBEXP
[599]	1277	&& node->left && node->left->token.type == SUBEXP)
	1278	{
	1279	Idx other_idx = node->left->token.opr.idx;
[3613]	1280
[599]	1281	node->left = node->left->left;
	1282	if (node->left)
	1283	node->left->parent = node;
	1284
	1285	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
	1286	if (other_idx < BITSET_WORD_BITS)
	1287	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
	1288	}
	1289
	1290	return REG_NOERROR;
	1291	}
	1292
	1293	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
	1294	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
	1295	static reg_errcode_t
	1296	lower_subexps (void extra, bin_tree_t node)
	1297	{
	1298	regex_t preg = (regex_t ) extra;
	1299	reg_errcode_t err = REG_NOERROR;
	1300
	1301	if (node->left && node->left->token.type == SUBEXP)
	1302	{
	1303	node->left = lower_subexp (&err, preg, node->left);
	1304	if (node->left)
	1305	node->left->parent = node;
	1306	}
	1307	if (node->right && node->right->token.type == SUBEXP)
	1308	{
	1309	node->right = lower_subexp (&err, preg, node->right);
	1310	if (node->right)
	1311	node->right->parent = node;
	1312	}
[3613]	1313
[599]	1314	return err;
	1315	}
	1316
	1317	static bin_tree_t *
	1318	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
	1319	{
	1320	re_dfa_t *dfa = preg->buffer;
	1321	bin_tree_t *body = node->left;
	1322	bin_tree_t op, cls, tree1, tree;
	1323
	1324	if (preg->no_sub
	1325	/* We do not optimize empty subexpressions, because otherwise we may
	1326	have bad CONCAT nodes with NULL children. This is obviously not
	1327	very common, so we do not lose much. An example that triggers
	1328	this case is the sed "script" //x. */
	1329	&& node->left != NULL
	1330	&& (node->token.opr.idx >= BITSET_WORD_BITS
	1331	\|\| !(dfa->used_bkref_map
	1332	& ((bitset_word_t) 1 << node->token.opr.idx))))
	1333	return node->left;
[3613]	1334
	1335	/* Convert the SUBEXP node to the concatenation of an
[599]	1336	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
	1337	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
	1338	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
	1339	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
	1340	tree = create_tree (dfa, op, tree1, CONCAT);
	1341	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
	1342	\|\| op == NULL \|\| cls == NULL))
	1343	{
	1344	*err = REG_ESPACE;
	1345	return NULL;
	1346	}
	1347
	1348	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
	1349	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
	1350	return tree;
	1351	}
	1352
	1353	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
	1354	nodes. Requires a postorder visit. */
	1355	static reg_errcode_t
	1356	calc_first (void extra, bin_tree_t node)
	1357	{
	1358	re_dfa_t dfa = (re_dfa_t ) extra;
	1359	if (node->token.type == CONCAT)
	1360	{
[3613]	1361	node->first = node->left->first;
	1362	node->node_idx = node->left->node_idx;
	1363	}
	1364	else
[599]	1365	{
	1366	node->first = node;
	1367	node->node_idx = re_dfa_add_node (dfa, node->token);
	1368	if (__glibc_unlikely (node->node_idx == -1))
	1369	return REG_ESPACE;
	1370	if (node->token.type == ANCHOR)
	1371	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
	1372	}
	1373	return REG_NOERROR;
	1374	}
	1375
	1376	/* Pass 2: compute NEXT on the tree. Preorder visit. */
	1377	static reg_errcode_t
	1378	calc_next (void extra, bin_tree_t node)
	1379	{
	1380	switch (node->token.type)
	1381	{
	1382	case OP_DUP_ASTERISK:
	1383	node->left->next = node;
	1384	break;
	1385	case CONCAT:
[3613]	1386	node->left->next = node->right->first;
[599]	1387	node->right->next = node->next;
	1388	break;
	1389	default:
	1390	if (node->left)
	1391	node->left->next = node->next;
	1392	if (node->right)
	1393	node->right->next = node->next;
	1394	break;
	1395	}
	1396	return REG_NOERROR;
[3613]	1397	}
[599]	1398
	1399	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
	1400	static reg_errcode_t
	1401	link_nfa_nodes (void extra, bin_tree_t node)
	1402	{
	1403	re_dfa_t dfa = (re_dfa_t ) extra;
	1404	Idx idx = node->node_idx;
	1405	reg_errcode_t err = REG_NOERROR;
[3613]	1406
[599]	1407	switch (node->token.type)
	1408	{
	1409	case CONCAT:
	1410	break;
	1411
[3613]	1412	case END_OF_RE:
[599]	1413	DEBUG_ASSERT (node->next == NULL);
	1414	break;
	1415
	1416	case OP_DUP_ASTERISK:
	1417	case OP_ALT:
	1418	{
	1419	Idx left, right;
	1420	dfa->has_plural_match = 1;
	1421	if (node->left != NULL)
[3613]	1422	left = node->left->first->node_idx;
	1423	else
[599]	1424	left = node->next->node_idx;
	1425	if (node->right != NULL)
	1426	right = node->right->first->node_idx;
	1427	else
	1428	right = node->next->node_idx;
	1429	DEBUG_ASSERT (left > -1);
	1430	DEBUG_ASSERT (right > -1);
	1431	err = re_node_set_init_2 (dfa->edests + idx, left, right);
	1432	}
	1433	break;
	1434
	1435	case ANCHOR:
	1436	case OP_OPEN_SUBEXP:
[3613]	1437	case OP_CLOSE_SUBEXP:
[599]	1438	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
	1439	break;
	1440
[3613]	1441	case OP_BACK_REF:
[599]	1442	dfa->nexts[idx] = node->next->node_idx;
	1443	if (node->token.type == OP_BACK_REF)
	1444	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
	1445	break;
	1446
	1447	default:
	1448	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
	1449	dfa->nexts[idx] = node->next->node_idx;
	1450	break;
	1451	}
	1452
	1453	return err;
[3613]	1454	}
	1455
[599]	1456	/* Duplicate the epsilon closure of the node ROOT_NODE.
[3613]	1457	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
	1458	to their own constraint. */
[599]	1459
	1460	static reg_errcode_t
	1461	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
[3613]	1462	Idx root_node, unsigned int init_constraint)
[599]	1463	{
	1464	Idx org_node, clone_node;
	1465	bool ok;
	1466	unsigned int constraint = init_constraint;
	1467	for (org_node = top_org_node, clone_node = top_clone_node;;)
	1468	{
	1469	Idx org_dest, clone_dest;
	1470	if (dfa->nodes[org_node].type == OP_BACK_REF)
	1471	{
[3613]	1472	/* If the back reference epsilon-transit, its destination must
[599]	1473	also have the constraint. Then duplicate the epsilon closure
	1474	of the destination of the back reference, and store it in
[3613]	1475	edests of the back reference. */
	1476	org_dest = dfa->nexts[org_node];
[599]	1477	re_node_set_empty (dfa->edests + clone_node);
	1478	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1479	if (__glibc_unlikely (clone_dest == -1))
	1480	return REG_ESPACE;
	1481	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1482	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1483	if (__glibc_unlikely (! ok))
	1484	return REG_ESPACE;
	1485	}
	1486	else if (dfa->edests[org_node].nelem == 0)
	1487	{
	1488	/* In case of the node can't epsilon-transit, don't duplicate the
	1489	destination and store the original destination as the
	1490	destination of the node. */
	1491	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1492	break;
[3613]	1493	}
	1494	else if (dfa->edests[org_node].nelem == 1)
	1495	{
[599]	1496	/* In case of the node can epsilon-transit, and it has only one
[3613]	1497	destination. */
	1498	org_dest = dfa->edests[org_node].elems[0];
	1499	re_node_set_empty (dfa->edests + clone_node);
	1500	/* If the node is root_node itself, it means the epsilon closure
[599]	1501	has a loop. Then tie it to the destination of the root_node. */
[3613]	1502	if (org_node == root_node && clone_node != org_node)
	1503	{
[599]	1504	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
[3613]	1505	if (__glibc_unlikely (! ok))
[599]	1506	return REG_ESPACE;
[3613]	1507	break;
	1508	}
[599]	1509	/* In case the node has another constraint, append it. */
	1510	constraint \|= dfa->nodes[org_node].constraint;
	1511	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1512	if (__glibc_unlikely (clone_dest == -1))
	1513	return REG_ESPACE;
	1514	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1515	if (__glibc_unlikely (! ok))
	1516	return REG_ESPACE;
	1517	}
	1518	else /* dfa->edests[org_node].nelem == 2 */
	1519	{
	1520	/* In case of the node can epsilon-transit, and it has two
[3613]	1521	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
[599]	1522	org_dest = dfa->edests[org_node].elems[0];
	1523	re_node_set_empty (dfa->edests + clone_node);
[3613]	1524	/* Search for a duplicated node which satisfies the constraint. */
[599]	1525	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
[3613]	1526	if (clone_dest == -1)
	1527	{
[599]	1528	/* There is no such duplicated node, create a new one. */
	1529	reg_errcode_t err;
	1530	clone_dest = duplicate_node (dfa, org_dest, constraint);
[3613]	1531	if (__glibc_unlikely (clone_dest == -1))
[599]	1532	return REG_ESPACE;
	1533	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1534	if (__glibc_unlikely (! ok))
	1535	return REG_ESPACE;
[3613]	1536	err = duplicate_node_closure (dfa, org_dest, clone_dest,
[599]	1537	root_node, constraint);
[3613]	1538	if (__glibc_unlikely (err != REG_NOERROR))
	1539	return err;
[599]	1540	}
	1541	else
	1542	{
	1543	/* There is a duplicated node which satisfies the constraint,
	1544	use it to avoid infinite loop. */
[3613]	1545	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
[599]	1546	if (__glibc_unlikely (! ok))
[3613]	1547	return REG_ESPACE;
	1548	}
[599]	1549
	1550	org_dest = dfa->edests[org_node].elems[1];
	1551	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1552	if (__glibc_unlikely (clone_dest == -1))
	1553	return REG_ESPACE;
	1554	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1555	if (__glibc_unlikely (! ok))
	1556	return REG_ESPACE;
	1557	}
	1558	org_node = org_dest;
	1559	clone_node = clone_dest;
[3613]	1560	}
	1561	return REG_NOERROR;
[599]	1562	}
	1563
[3613]	1564	/* Search for a node which is duplicated from the node ORG_NODE, and
[599]	1565	satisfies the constraint CONSTRAINT. */
	1566
	1567	static Idx
	1568	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
	1569	unsigned int constraint)
	1570	{
	1571	Idx idx;
	1572	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
	1573	{
	1574	if (org_node == dfa->org_indices[idx]
	1575	&& constraint == dfa->nodes[idx].constraint)
	1576	return idx; /* Found. */
	1577	}
[3613]	1578	return -1; /* Not found. */
	1579	}
[599]	1580
[3613]	1581	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
	1582	Return the index of the new node, or -1 if insufficient storage is
[599]	1583	available. */
	1584
[3613]	1585	static Idx
[599]	1586	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
	1587	{
	1588	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
	1589	if (__glibc_likely (dup_idx != -1))
	1590	{
	1591	dfa->nodes[dup_idx].constraint = constraint;
	1592	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
	1593	dfa->nodes[dup_idx].duplicated = 1;
	1594
	1595	/* Store the index of the original node. */
	1596	dfa->org_indices[dup_idx] = org_idx;
[3613]	1597	}
	1598	return dup_idx;
[599]	1599	}
	1600
	1601	static reg_errcode_t
	1602	calc_inveclosure (re_dfa_t *dfa)
	1603	{
[3613]	1604	Idx src, idx;
[599]	1605	bool ok;
	1606	for (idx = 0; idx < dfa->nodes_len; ++idx)
[3613]	1607	re_node_set_init_empty (dfa->inveclosures + idx);
	1608
[599]	1609	for (src = 0; src < dfa->nodes_len; ++src)
	1610	{
	1611	Idx *elems = dfa->eclosures[src].elems;
	1612	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
	1613	{
	1614	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
	1615	if (__glibc_unlikely (! ok))
	1616	return REG_ESPACE;
	1617	}
	1618	}
	1619
	1620	return REG_NOERROR;
[3613]	1621	}
	1622
	1623	/* Calculate "eclosure" for all the node in DFA. */
	1624
[599]	1625	static reg_errcode_t
	1626	calc_eclosure (re_dfa_t *dfa)
	1627	{
	1628	Idx node_idx;
	1629	bool incomplete;
	1630	DEBUG_ASSERT (dfa->nodes_len > 0);
	1631	incomplete = false;
	1632	/* For each nodes, calculate epsilon closure. */
	1633	for (node_idx = 0; ; ++node_idx)
[3613]	1634	{
[599]	1635	reg_errcode_t err;
	1636	re_node_set eclosure_elem;
	1637	if (node_idx == dfa->nodes_len)
[3613]	1638	{
[599]	1639	if (!incomplete)
	1640	break;
	1641	incomplete = false;
	1642	node_idx = 0;
[3613]	1643	}
	1644
	1645	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
[599]	1646
	1647	/* If we have already calculated, skip it. */
	1648	if (dfa->eclosures[node_idx].nelem != 0)
	1649	continue;
[3613]	1650	/* Calculate epsilon closure of 'node_idx'. */
[599]	1651	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
	1652	if (__glibc_unlikely (err != REG_NOERROR))
	1653	return err;
	1654
	1655	if (dfa->eclosures[node_idx].nelem == 0)
	1656	{
	1657	incomplete = true;
	1658	re_node_set_free (&eclosure_elem);
	1659	}
[3613]	1660	}
[599]	1661	return REG_NOERROR;
	1662	}
[3613]	1663
[599]	1664	/* Calculate epsilon closure of NODE. */
[3613]	1665
[599]	1666	static reg_errcode_t
[3613]	1667	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
[599]	1668	{
	1669	reg_errcode_t err;
[3613]	1670	Idx i;
	1671	re_node_set eclosure;
	1672	bool incomplete = false;
[599]	1673	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
	1674	if (__glibc_unlikely (err != REG_NOERROR))
	1675	return err;
	1676
[3613]	1677	/* An epsilon closure includes itself. */
	1678	eclosure.elems[eclosure.nelem++] = node;
	1679
[599]	1680	/* This indicates that we are calculating this node now.
	1681	We reference this value to avoid infinite loop. */
	1682	dfa->eclosures[node].nelem = -1;
[3613]	1683
	1684	/* If the current node has constraints, duplicate all nodes
	1685	since they must inherit the constraints. */
[599]	1686	if (dfa->nodes[node].constraint
	1687	&& dfa->edests[node].nelem
	1688	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
	1689	{
	1690	err = duplicate_node_closure (dfa, node, node, node,
	1691	dfa->nodes[node].constraint);
	1692	if (__glibc_unlikely (err != REG_NOERROR))
	1693	return err;
[3613]	1694	}
	1695
[599]	1696	/* Expand each epsilon destination nodes. */
	1697	if (IS_EPSILON_NODE(dfa->nodes[node].type))
	1698	for (i = 0; i < dfa->edests[node].nelem; ++i)
[3613]	1699	{
[599]	1700	re_node_set eclosure_elem;
	1701	Idx edest = dfa->edests[node].elems[i];
[3613]	1702	/* If calculating the epsilon closure of 'edest' is in progress,
[599]	1703	return intermediate result. */
	1704	if (dfa->eclosures[edest].nelem == -1)
	1705	{
[3613]	1706	incomplete = true;
	1707	continue;
[599]	1708	}
	1709	/* If we haven't calculated the epsilon closure of 'edest' yet,
	1710	calculate now. Otherwise use calculated epsilon closure. */
	1711	if (dfa->eclosures[edest].nelem == 0)
[3613]	1712	{
	1713	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
	1714	if (__glibc_unlikely (err != REG_NOERROR))
	1715	return err;
	1716	}
[599]	1717	else
	1718	eclosure_elem = dfa->eclosures[edest];
	1719	/* Merge the epsilon closure of 'edest'. */
[3613]	1720	err = re_node_set_merge (&eclosure, &eclosure_elem);
[599]	1721	if (__glibc_unlikely (err != REG_NOERROR))
	1722	return err;
	1723	/* If the epsilon closure of 'edest' is incomplete,
	1724	the epsilon closure of this node is also incomplete. */
	1725	if (dfa->eclosures[edest].nelem == 0)
	1726	{
	1727	incomplete = true;
	1728	re_node_set_free (&eclosure_elem);
	1729	}
	1730	}
	1731
	1732	if (incomplete && !root)
	1733	dfa->eclosures[node].nelem = 0;
	1734	else
	1735	dfa->eclosures[node] = eclosure;
	1736	*new_set = eclosure;
	1737	return REG_NOERROR;
	1738	}
	1739
	1740
	1741	/* Functions for token which are used in the parser. */
	1742
	1743	/* Fetch a token from INPUT.
	1744	We must not use this function inside bracket expressions. */
	1745
	1746	static void
	1747	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
	1748	{
	1749	re_string_skip_bytes (input, peek_token (result, input, syntax));
	1750	}
	1751
	1752	/* Peek a token from INPUT, and return the length of the token.
	1753	We must not use this function inside bracket expressions. */
	1754
	1755	static int
	1756	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
	1757	{
	1758	unsigned char c;
	1759
	1760	if (re_string_eoi (input))
	1761	{
	1762	token->type = END_OF_RE;
[3613]	1763	return 0;
	1764	}
[599]	1765
	1766	c = re_string_peek_byte (input, 0);
	1767	token->opr.c = c;
	1768
	1769	token->word_char = 0;
	1770	token->mb_partial = 0;
	1771	if (input->mb_cur_max > 1
	1772	&& !re_string_first_byte (input, re_string_cur_idx (input)))
	1773	{
	1774	token->type = CHARACTER;
	1775	token->mb_partial = 1;
	1776	return 1;
	1777	}
	1778	if (c == '\\')
	1779	{
	1780	unsigned char c2;
	1781	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
	1782	{
	1783	token->type = BACK_SLASH;
	1784	return 1;
	1785	}
	1786
	1787	c2 = re_string_peek_byte_case (input, 1);
	1788	token->opr.c = c2;
	1789	token->type = CHARACTER;
	1790	if (input->mb_cur_max > 1)
	1791	{
	1792	wint_t wc = re_string_wchar_at (input,
	1793	re_string_cur_idx (input) + 1);
	1794	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1795	}
	1796	else
	1797	token->word_char = IS_WORD_CHAR (c2) != 0;
	1798
	1799	switch (c2)
	1800	{
	1801	case '\|':
	1802	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
	1803	token->type = OP_ALT;
	1804	break;
	1805	case '1': case '2': case '3': case '4': case '5':
	1806	case '6': case '7': case '8': case '9':
	1807	if (!(syntax & RE_NO_BK_REFS))
	1808	{
	1809	token->type = OP_BACK_REF;
	1810	token->opr.idx = c2 - '1';
	1811	}
	1812	break;
	1813	case '<':
	1814	if (!(syntax & RE_NO_GNU_OPS))
	1815	{
	1816	token->type = ANCHOR;
	1817	token->opr.ctx_type = WORD_FIRST;
	1818	}
	1819	break;
	1820	case '>':
	1821	if (!(syntax & RE_NO_GNU_OPS))
	1822	{
	1823	token->type = ANCHOR;
	1824	token->opr.ctx_type = WORD_LAST;
	1825	}
	1826	break;
	1827	case 'b':
	1828	if (!(syntax & RE_NO_GNU_OPS))
	1829	{
	1830	token->type = ANCHOR;
	1831	token->opr.ctx_type = WORD_DELIM;
	1832	}
	1833	break;
	1834	case 'B':
	1835	if (!(syntax & RE_NO_GNU_OPS))
	1836	{
	1837	token->type = ANCHOR;
	1838	token->opr.ctx_type = NOT_WORD_DELIM;
	1839	}
	1840	break;
	1841	case 'w':
	1842	if (!(syntax & RE_NO_GNU_OPS))
	1843	token->type = OP_WORD;
	1844	break;
	1845	case 'W':
	1846	if (!(syntax & RE_NO_GNU_OPS))
	1847	token->type = OP_NOTWORD;
	1848	break;
	1849	case 's':
	1850	if (!(syntax & RE_NO_GNU_OPS))
	1851	token->type = OP_SPACE;
	1852	break;
	1853	case 'S':
	1854	if (!(syntax & RE_NO_GNU_OPS))
	1855	token->type = OP_NOTSPACE;
	1856	break;
	1857	case '`':
	1858	if (!(syntax & RE_NO_GNU_OPS))
	1859	{
	1860	token->type = ANCHOR;
	1861	token->opr.ctx_type = BUF_FIRST;
	1862	}
	1863	break;
	1864	case '\'':
	1865	if (!(syntax & RE_NO_GNU_OPS))
	1866	{
	1867	token->type = ANCHOR;
	1868	token->opr.ctx_type = BUF_LAST;
	1869	}
	1870	break;
	1871	case '(':
	1872	if (!(syntax & RE_NO_BK_PARENS))
	1873	token->type = OP_OPEN_SUBEXP;
	1874	break;
	1875	case ')':
	1876	if (!(syntax & RE_NO_BK_PARENS))
	1877	token->type = OP_CLOSE_SUBEXP;
	1878	break;
	1879	case '+':
	1880	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1881	token->type = OP_DUP_PLUS;
	1882	break;
	1883	case '?':
	1884	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1885	token->type = OP_DUP_QUESTION;
	1886	break;
	1887	case '{':
	1888	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1889	token->type = OP_OPEN_DUP_NUM;
	1890	break;
	1891	case '}':
	1892	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1893	token->type = OP_CLOSE_DUP_NUM;
	1894	break;
	1895	default:
	1896	break;
	1897	}
	1898	return 2;
	1899	}
	1900
	1901	token->type = CHARACTER;
	1902	if (input->mb_cur_max > 1)
	1903	{
	1904	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
	1905	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1906	}
	1907	else
	1908	token->word_char = IS_WORD_CHAR (token->opr.c);
	1909
	1910	switch (c)
	1911	{
	1912	case '\n':
	1913	if (syntax & RE_NEWLINE_ALT)
	1914	token->type = OP_ALT;
	1915	break;
	1916	case '\|':
	1917	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
	1918	token->type = OP_ALT;
	1919	break;
	1920	case '*':
	1921	token->type = OP_DUP_ASTERISK;
	1922	break;
	1923	case '+':
	1924	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1925	token->type = OP_DUP_PLUS;
	1926	break;
	1927	case '?':
	1928	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1929	token->type = OP_DUP_QUESTION;
	1930	break;
	1931	case '{':
	1932	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1933	token->type = OP_OPEN_DUP_NUM;
	1934	break;
	1935	case '}':
	1936	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1937	token->type = OP_CLOSE_DUP_NUM;
	1938	break;
	1939	case '(':
	1940	if (syntax & RE_NO_BK_PARENS)
	1941	token->type = OP_OPEN_SUBEXP;
	1942	break;
	1943	case ')':
	1944	if (syntax & RE_NO_BK_PARENS)
	1945	token->type = OP_CLOSE_SUBEXP;
[3613]	1946	break;
	1947	case '[':
[599]	1948	token->type = OP_OPEN_BRACKET;
	1949	break;
	1950	case '.':
	1951	token->type = OP_PERIOD;
	1952	break;
	1953	case '^':
	1954	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
	1955	&& re_string_cur_idx (input) != 0)
	1956	{
[3613]	1957	char prev = re_string_peek_byte (input, -1);
	1958	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
[599]	1959	break;
	1960	}
	1961	token->type = ANCHOR;
	1962	token->opr.ctx_type = LINE_FIRST;
	1963	break;
	1964	case '$':
	1965	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
	1966	&& re_string_cur_idx (input) + 1 != re_string_length (input))
	1967	{
	1968	re_token_t next;
	1969	re_string_skip_bytes (input, 1);
	1970	peek_token (&next, input, syntax);
	1971	re_string_skip_bytes (input, -1);
	1972	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
	1973	break;
	1974	}
	1975	token->type = ANCHOR;
	1976	token->opr.ctx_type = LINE_LAST;
	1977	break;
	1978	default:
	1979	break;
	1980	}
	1981	return 1;
	1982	}
	1983
	1984	/* Peek a token from INPUT, and return the length of the token.
	1985	We must not use this function out of bracket expressions. */
	1986
	1987	static int
	1988	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
	1989	{
	1990	unsigned char c;
[3613]	1991	if (re_string_eoi (input))
	1992	{
[599]	1993	token->type = END_OF_RE;
	1994	return 0;
	1995	}
	1996	c = re_string_peek_byte (input, 0);
	1997	token->opr.c = c;
	1998
	1999	if (input->mb_cur_max > 1
	2000	&& !re_string_first_byte (input, re_string_cur_idx (input)))
	2001	{
	2002	token->type = CHARACTER;
	2003	return 1;
	2004	}
	2005
	2006	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
	2007	&& re_string_cur_idx (input) + 1 < re_string_length (input))
	2008	{
	2009	/* In this case, '\' escape a character. */
	2010	unsigned char c2;
	2011	re_string_skip_bytes (input, 1);
	2012	c2 = re_string_peek_byte (input, 0);
	2013	token->opr.c = c2;
	2014	token->type = CHARACTER;
	2015	return 1;
	2016	}
	2017	if (c == '[') /* '[' is a special char in a bracket exps. */
	2018	{
	2019	unsigned char c2;
	2020	int token_len;
	2021	if (re_string_cur_idx (input) + 1 < re_string_length (input))
	2022	c2 = re_string_peek_byte (input, 1);
	2023	else
[3613]	2024	c2 = 0;
[599]	2025	token->opr.c = c2;
	2026	token_len = 2;
	2027	switch (c2)
[3613]	2028	{
[599]	2029	case '.':
	2030	token->type = OP_OPEN_COLL_ELEM;
	2031	break;
	2032
	2033	case '=':
	2034	token->type = OP_OPEN_EQUIV_CLASS;
[3613]	2035	break;
[599]	2036
	2037	case ':':
	2038	if (syntax & RE_CHAR_CLASSES)
	2039	{
	2040	token->type = OP_OPEN_CHAR_CLASS;
	2041	break;
	2042	}
	2043	FALLTHROUGH;
	2044	default:
	2045	token->type = CHARACTER;
	2046	token->opr.c = c;
	2047	token_len = 1;
	2048	break;
	2049	}
	2050	return token_len;
	2051	}
[3613]	2052	switch (c)
	2053	{
	2054	case ']':
	2055	token->type = OP_CLOSE_BRACKET;
	2056	break;
	2057	case '^':
	2058	token->type = OP_NON_MATCH_LIST;
	2059	break;
	2060	case '-':
	2061	/* In V7 Unix grep and Unix awk and mawk, [...---...]
	2062	(3 adjacent minus signs) stands for a single minus sign.
	2063	Support that without breaking anything else. */
	2064	if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
[599]	2065	&& re_string_peek_byte (input, 1) == '-'
	2066	&& re_string_peek_byte (input, 2) == '-'))
	2067	{
	2068	token->type = OP_CHARSET_RANGE;
	2069	break;
	2070	}
	2071	re_string_skip_bytes (input, 2);
	2072	FALLTHROUGH;
	2073	default:
	2074	token->type = CHARACTER;
[3613]	2075	}
[599]	2076	return 1;
	2077	}
	2078
	2079
	2080	/* Functions for parser. */
	2081
	2082	/* Entry point of the parser.
	2083	Parse the regular expression REGEXP and return the structure tree.
	2084	If an error occurs, ERR is set by error code, and return NULL.
	2085	This function build the following tree, from regular expression <reg_exp>:
	2086	CAT
	2087	/ \
	2088	/ \
[3613]	2089	<reg_exp> EOR
[599]	2090
	2091	CAT means concatenation.
	2092	EOR means end of regular expression. */
	2093
	2094	static bin_tree_t *
[3613]	2095	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
[599]	2096	reg_errcode_t *err)
	2097	{
	2098	re_dfa_t *dfa = preg->buffer;
	2099	bin_tree_t tree, eor, *root;
	2100	re_token_t current_token;
	2101	dfa->syntax = syntax;
[3613]	2102	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
[599]	2103	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
	2104	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2105	return NULL;
	2106	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
	2107	if (tree != NULL)
	2108	root = create_tree (dfa, tree, eor, CONCAT);
	2109	else
	2110	root = eor;
	2111	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
	2112	{
	2113	*err = REG_ESPACE;
	2114	return NULL;
	2115	}
	2116	return root;
[3613]	2117	}
[599]	2118
	2119	/* This function build the following tree, from regular expression
	2120	<branch1>\|<branch2>:
[3613]	2121	ALT
[599]	2122	/ \
[3613]	2123	/ \
[599]	2124	<branch1> <branch2>
[3613]	2125
[599]	2126	ALT means alternative, which represents the operator '\|'. */
[3613]	2127
[599]	2128	static bin_tree_t *
	2129	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
	2130	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2131	{
	2132	re_dfa_t *dfa = preg->buffer;
	2133	bin_tree_t tree, branch = NULL;
	2134	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
	2135	tree = parse_branch (regexp, preg, token, syntax, nest, err);
[3613]	2136	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2137	return NULL;
[599]	2138
[3613]	2139	while (token->type == OP_ALT)
	2140	{
	2141	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2142	if (token->type != OP_ALT && token->type != END_OF_RE
	2143	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2144	{
	2145	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
[599]	2146	dfa->completed_bkref_map = initial_bkref_map;
	2147	branch = parse_branch (regexp, preg, token, syntax, nest, err);
	2148	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
	2149	{
[3613]	2150	if (tree != NULL)
[599]	2151	postorder (tree, free_tree, NULL);
	2152	return NULL;
	2153	}
	2154	dfa->completed_bkref_map \|= accumulated_bkref_map;
	2155	}
	2156	else
	2157	branch = NULL;
	2158	tree = create_tree (dfa, tree, branch, OP_ALT);
	2159	if (__glibc_unlikely (tree == NULL))
	2160	{
	2161	*err = REG_ESPACE;
	2162	return NULL;
	2163	}
	2164	}
	2165	return tree;
	2166	}
	2167
	2168	/* This function build the following tree, from regular expression
	2169	<exp1><exp2>:
[3613]	2170	CAT
[599]	2171	/ \
[3613]	2172	/ \
	2173	<exp1> <exp2>
[599]	2174
[3613]	2175	CAT means concatenation. */
[599]	2176
	2177	static bin_tree_t *
	2178	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
	2179	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2180	{
[3613]	2181	bin_tree_t tree, expr;
	2182	re_dfa_t *dfa = preg->buffer;
[599]	2183	tree = parse_expression (regexp, preg, token, syntax, nest, err);
[3613]	2184	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2185	return NULL;
[599]	2186
	2187	while (token->type != OP_ALT && token->type != END_OF_RE
[3613]	2188	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
[599]	2189	{
[3613]	2190	expr = parse_expression (regexp, preg, token, syntax, nest, err);
	2191	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
[599]	2192	{
[3613]	2193	if (tree != NULL)
	2194	postorder (tree, free_tree, NULL);
[599]	2195	return NULL;
	2196	}
	2197	if (tree != NULL && expr != NULL)
[3613]	2198	{
[599]	2199	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
	2200	if (newtree == NULL)
[3613]	2201	{
	2202	postorder (expr, free_tree, NULL);
[599]	2203	postorder (tree, free_tree, NULL);
	2204	*err = REG_ESPACE;
	2205	return NULL;
	2206	}
	2207	tree = newtree;
	2208	}
	2209	else if (tree == NULL)
	2210	tree = expr;
	2211	/* Otherwise expr == NULL, we don't need to create new tree. */
	2212	}
	2213	return tree;
	2214	}
[3613]	2215
[599]	2216	/* This function build the following tree, from regular expression a*:
[3613]	2217	*
[599]	2218	\|
	2219	a
	2220	*/
	2221
	2222	static bin_tree_t *
[3613]	2223	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
[599]	2224	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2225	{
	2226	re_dfa_t *dfa = preg->buffer;
	2227	bin_tree_t *tree;
	2228	switch (token->type)
	2229	{
	2230	case CHARACTER:
	2231	tree = create_token_tree (dfa, NULL, NULL, token);
	2232	if (__glibc_unlikely (tree == NULL))
	2233	{
	2234	*err = REG_ESPACE;
	2235	return NULL;
	2236	}
[3613]	2237	if (dfa->mb_cur_max > 1)
[599]	2238	{
	2239	while (!re_string_eoi (regexp)
	2240	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
	2241	{
	2242	bin_tree_t *mbc_remain;
	2243	fetch_token (token, regexp, syntax);
	2244	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
[3613]	2245	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
[599]	2246	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
	2247	{
[3613]	2248	*err = REG_ESPACE;
[599]	2249	return NULL;
	2250	}
[3613]	2251	}
[599]	2252	}
	2253	break;
[3613]	2254
[599]	2255	case OP_OPEN_SUBEXP:
	2256	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
[3613]	2257	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
[599]	2258	return NULL;
[3613]	2259	break;
[599]	2260
	2261	case OP_OPEN_BRACKET:
	2262	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
	2263	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2264	return NULL;
	2265	break;
[3613]	2266
[599]	2267	case OP_BACK_REF:
	2268	if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
	2269	{
	2270	*err = REG_ESUBREG;
	2271	return NULL;
	2272	}
	2273	dfa->used_bkref_map \|= 1 << token->opr.idx;
[3613]	2274	tree = create_token_tree (dfa, NULL, NULL, token);
[599]	2275	if (__glibc_unlikely (tree == NULL))
	2276	{
	2277	*err = REG_ESPACE;
	2278	return NULL;
	2279	}
	2280	++dfa->nbackref;
[3613]	2281	dfa->has_mb_node = 1;
[599]	2282	break;
	2283
	2284	case OP_OPEN_DUP_NUM:
	2285	if (syntax & RE_CONTEXT_INVALID_DUP)
	2286	{
	2287	*err = REG_BADRPT;
	2288	return NULL;
	2289	}
	2290	FALLTHROUGH;
	2291	case OP_DUP_ASTERISK:
	2292	case OP_DUP_PLUS:
	2293	case OP_DUP_QUESTION:
	2294	if (syntax & RE_CONTEXT_INVALID_OPS)
[3613]	2295	{
[599]	2296	*err = REG_BADRPT;
[3613]	2297	return NULL;
	2298	}
[599]	2299	else if (syntax & RE_CONTEXT_INDEP_OPS)
	2300	{
	2301	fetch_token (token, regexp, syntax);
	2302	return parse_expression (regexp, preg, token, syntax, nest, err);
[3613]	2303	}
[599]	2304	FALLTHROUGH;
	2305	case OP_CLOSE_SUBEXP:
	2306	if ((token->type == OP_CLOSE_SUBEXP)
	2307	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
	2308	{
	2309	*err = REG_ERPAREN;
	2310	return NULL;
	2311	}
[3613]	2312	FALLTHROUGH;
[599]	2313	case OP_CLOSE_DUP_NUM:
	2314	/* We treat it as a normal character. */
	2315
	2316	/* Then we can these characters as normal characters. */
	2317	token->type = CHARACTER;
[3613]	2318	/* mb_partial and word_char bits should be initialized already
[599]	2319	by peek_token. */
	2320	tree = create_token_tree (dfa, NULL, NULL, token);
	2321	if (__glibc_unlikely (tree == NULL))
	2322	{
	2323	*err = REG_ESPACE;
	2324	return NULL;
[3613]	2325	}
[599]	2326	break;
	2327
	2328	case ANCHOR:
	2329	if ((token->opr.ctx_type
	2330	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
	2331	&& dfa->word_ops_used == 0)
	2332	init_word_char (dfa);
[3613]	2333	if (token->opr.ctx_type == WORD_DELIM
	2334	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
	2335	{
[599]	2336	bin_tree_t tree_first, tree_last;
	2337	if (token->opr.ctx_type == WORD_DELIM)
	2338	{
[3613]	2339	token->opr.ctx_type = WORD_FIRST;
[599]	2340	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2341	token->opr.ctx_type = WORD_LAST;
[3613]	2342	}
	2343	else
[599]	2344	{
	2345	token->opr.ctx_type = INSIDE_WORD;
	2346	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2347	token->opr.ctx_type = INSIDE_NOTWORD;
	2348	}
	2349	tree_last = create_token_tree (dfa, NULL, NULL, token);
	2350	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
	2351	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
[3613]	2352	\|\| tree == NULL))
[599]	2353	{
	2354	*err = REG_ESPACE;
	2355	return NULL;
	2356	}
	2357	}
	2358	else
	2359	{
	2360	tree = create_token_tree (dfa, NULL, NULL, token);
	2361	if (__glibc_unlikely (tree == NULL))
	2362	{
	2363	*err = REG_ESPACE;
[3613]	2364	return NULL;
[599]	2365	}
	2366	}
[3613]	2367	/* We must return here, since ANCHORs can't be followed
[599]	2368	by repetition operators.
	2369	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
	2370	it must not be "<ANCHOR(^)><REPEAT()>". /
	2371	fetch_token (token, regexp, syntax);
	2372	return tree;
	2373
	2374	case OP_PERIOD:
[3613]	2375	tree = create_token_tree (dfa, NULL, NULL, token);
[599]	2376	if (__glibc_unlikely (tree == NULL))
	2377	{
	2378	*err = REG_ESPACE;
[3613]	2379	return NULL;
	2380	}
[599]	2381	if (dfa->mb_cur_max > 1)
[3613]	2382	dfa->has_mb_node = 1;
[599]	2383	break;
	2384
[3613]	2385	case OP_WORD:
[599]	2386	case OP_NOTWORD:
	2387	tree = build_charclass_op (dfa, regexp->trans,
	2388	"alnum",
[3613]	2389	"_",
	2390	token->type == OP_NOTWORD, err);
[599]	2391	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
[3613]	2392	return NULL;
[599]	2393	break;
	2394
[3613]	2395	case OP_SPACE:
[599]	2396	case OP_NOTSPACE:
	2397	tree = build_charclass_op (dfa, regexp->trans,
	2398	"space",
[3613]	2399	"",
[599]	2400	token->type == OP_NOTSPACE, err);
	2401	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
	2402	return NULL;
[3613]	2403	break;
[599]	2404
	2405	case OP_ALT:
[3613]	2406	case END_OF_RE:
[599]	2407	return NULL;
	2408
	2409	case BACK_SLASH:
	2410	*err = REG_EESCAPE;
	2411	return NULL;
	2412
	2413	default:
[3613]	2414	/* Must not happen? */
	2415	DEBUG_ASSERT (false);
	2416	return NULL;
	2417	}
	2418	fetch_token (token, regexp, syntax);
	2419
	2420	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
	2421	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
	2422	{
[599]	2423	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
	2424	syntax, err);
	2425	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
	2426	{
	2427	if (tree != NULL)
[3613]	2428	postorder (tree, free_tree, NULL);
	2429	return NULL;
[599]	2430	}
	2431	tree = dup_tree;
	2432	/* In BRE consecutive duplications are not allowed. */
	2433	if ((syntax & RE_CONTEXT_INVALID_DUP)
	2434	&& (token->type == OP_DUP_ASTERISK
	2435	\|\| token->type == OP_OPEN_DUP_NUM))
	2436	{
	2437	if (tree != NULL)
	2438	postorder (tree, free_tree, NULL);
	2439	*err = REG_BADRPT;
	2440	return NULL;
	2441	}
	2442	}
	2443
	2444	return tree;
	2445	}
	2446
[3613]	2447	/* This function build the following tree, from regular expression
[599]	2448	(<reg_exp>):
[3613]	2449	SUBEXP
[599]	2450	\|
	2451	<reg_exp>
	2452	*/
	2453
	2454	static bin_tree_t *
	2455	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
	2456	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2457	{
	2458	re_dfa_t *dfa = preg->buffer;
	2459	bin_tree_t *tree;
	2460	size_t cur_nsub;
	2461	cur_nsub = preg->re_nsub++;
[3613]	2462
	2463	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2464
	2465	/* The subexpression may be a null string. */
	2466	if (token->type == OP_CLOSE_SUBEXP)
	2467	tree = NULL;
	2468	else
	2469	{
[599]	2470	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
	2471	if (__glibc_unlikely (*err == REG_NOERROR
	2472	&& token->type != OP_CLOSE_SUBEXP))
	2473	{
	2474	if (tree != NULL)
	2475	postorder (tree, free_tree, NULL);
	2476	*err = REG_EPAREN;
[3613]	2477	}
[599]	2478	if (__glibc_unlikely (*err != REG_NOERROR))
	2479	return NULL;
	2480	}
	2481
	2482	if (cur_nsub <= '9' - '1')
	2483	dfa->completed_bkref_map \|= 1 << cur_nsub;
	2484
	2485	tree = create_tree (dfa, tree, NULL, SUBEXP);
	2486	if (__glibc_unlikely (tree == NULL))
	2487	{
	2488	*err = REG_ESPACE;
	2489	return NULL;
	2490	}
	2491	tree->token.opr.idx = cur_nsub;
	2492	return tree;
[3613]	2493	}
[599]	2494
	2495	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
	2496
	2497	static bin_tree_t *
	2498	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
	2499	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
	2500	{
	2501	bin_tree_t tree = NULL, old_tree = NULL;
	2502	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
	2503	re_token_t start_token = *token;
	2504
	2505	if (token->type == OP_OPEN_DUP_NUM)
	2506	{
	2507	end = 0;
	2508	start = fetch_number (regexp, token, syntax);
	2509	if (start == -1)
[3613]	2510	{
[599]	2511	if (token->type == CHARACTER && token->opr.c == ',')
	2512	start = 0; /* We treat "{,m}" as "{0,m}". */
	2513	else
	2514	{
	2515	err = REG_BADBR; / <re>{} is invalid. */
	2516	return NULL;
[3613]	2517	}
[599]	2518	}
	2519	if (__glibc_likely (start != -2))
[3613]	2520	{
[599]	2521	/* We treat "{n}" as "{n,n}". */
	2522	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
	2523	: ((token->type == CHARACTER && token->opr.c == ',')
	2524	? fetch_number (regexp, token, syntax) : -2));
	2525	}
	2526	if (__glibc_unlikely (start == -2 \|\| end == -2))
	2527	{
	2528	/* Invalid sequence. */
	2529	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
	2530	{
	2531	if (token->type == END_OF_RE)
	2532	*err = REG_EBRACE;
	2533	else
	2534	*err = REG_BADBR;
	2535
	2536	return NULL;
	2537	}
	2538
[3613]	2539	/* If the syntax bit is set, rollback. */
	2540	re_string_set_index (regexp, start_idx);
[599]	2541	*token = start_token;
	2542	token->type = CHARACTER;
	2543	/* mb_partial and word_char bits should be already initialized by
	2544	peek_token. */
	2545	return elem;
[3613]	2546	}
	2547
	2548	if (__glibc_unlikely ((end != -1 && start > end)
	2549	\|\| token->type != OP_CLOSE_DUP_NUM))
	2550	{
	2551	/* First number greater than second. */
[599]	2552	*err = REG_BADBR;
	2553	return NULL;
	2554	}
	2555
	2556	if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
	2557	{
	2558	*err = REG_ESIZE;
	2559	return NULL;
	2560	}
[3613]	2561	}
[599]	2562	else
[3613]	2563	{
[599]	2564	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
	2565	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
	2566	}
	2567
	2568	fetch_token (token, regexp, syntax);
	2569
[3613]	2570	if (__glibc_unlikely (elem == NULL))
[599]	2571	return NULL;
	2572	if (__glibc_unlikely (start == 0 && end == 0))
	2573	{
	2574	postorder (elem, free_tree, NULL);
	2575	return NULL;
	2576	}
[3613]	2577
[599]	2578	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
	2579	if (__glibc_unlikely (start > 0))
	2580	{
	2581	tree = elem;
	2582	for (i = 2; i <= start; ++i)
	2583	{
	2584	elem = duplicate_tree (elem, dfa);
	2585	tree = create_tree (dfa, tree, elem, CONCAT);
[3613]	2586	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
	2587	goto parse_dup_op_espace;
[599]	2588	}
	2589
	2590	if (start == end)
	2591	return tree;
	2592
	2593	/* Duplicate ELEM before it is marked optional. */
[3613]	2594	elem = duplicate_tree (elem, dfa);
	2595	if (__glibc_unlikely (elem == NULL))
	2596	goto parse_dup_op_espace;
	2597	old_tree = tree;
[599]	2598	}
[3613]	2599	else
	2600	old_tree = NULL;
	2601
[599]	2602	if (elem->token.type == SUBEXP)
	2603	{
	2604	uintptr_t subidx = elem->token.opr.idx;
	2605	postorder (elem, mark_opt_subexp, (void *) subidx);
	2606	}
[3613]	2607
	2608	tree = create_tree (dfa, elem, NULL,
	2609	(end == -1 ? OP_DUP_ASTERISK : OP_ALT));
	2610	if (__glibc_unlikely (tree == NULL))
	2611	goto parse_dup_op_espace;
	2612
	2613	/* This loop is actually executed only when end != -1,
[599]	2614	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
[3613]	2615	already created the start+1-th copy. */
	2616	if (TYPE_SIGNED (Idx) \|\| end != -1)
	2617	for (i = start + 2; i <= end; ++i)
	2618	{
[599]	2619	elem = duplicate_tree (elem, dfa);
	2620	tree = create_tree (dfa, tree, elem, CONCAT);
	2621	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
	2622	goto parse_dup_op_espace;
	2623
	2624	tree = create_tree (dfa, tree, NULL, OP_ALT);
	2625	if (__glibc_unlikely (tree == NULL))
	2626	goto parse_dup_op_espace;
	2627	}
	2628
	2629	if (old_tree)
	2630	tree = create_tree (dfa, old_tree, tree, CONCAT);
	2631
	2632	return tree;
	2633
	2634	parse_dup_op_espace:
	2635	*err = REG_ESPACE;
[3613]	2636	return NULL;
	2637	}
	2638
	2639	/* Size of the names for collating symbol/equivalence_class/character_class.
	2640	I'm not sure, but maybe enough. */
	2641	#define BRACKET_NAME_BUF_SIZE 32
	2642
	2643	#ifndef _LIBC
	2644
	2645	/* Convert the byte B to the corresponding wide character. In a
	2646	unibyte locale, treat B as itself. In a multibyte locale, return
	2647	WEOF if B is an encoding error. */
	2648	static wint_t
	2649	parse_byte (unsigned char b, re_dfa_t const *dfa)
	2650	{
	2651	return dfa->mb_cur_max > 1 ? __btowc (b) : b;
[599]	2652	}
[3613]	2653
	2654	/* Local function for parse_bracket_exp used in _LIBC environment.
	2655	Build the range expression which starts from START_ELEM, and ends
	2656	at END_ELEM. The result are written to MBCSET and SBCSET.
	2657	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2658	mbcset->range_ends, is a pointer argument since we may
[599]	2659	update it. */
	2660
[3613]	2661	static reg_errcode_t
	2662	build_range_exp (bitset_t sbcset, re_charset_t mbcset, Idx range_alloc,
	2663	bracket_elem_t start_elem, bracket_elem_t end_elem,
	2664	re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
[599]	2665	const unsigned char collseqmb, const char collseqwc,
	2666	int_fast32_t table_size, const void *symb_table,
	2667	const unsigned char *extra)
	2668	{
[3613]	2669	/* Equivalence Classes and Character Classes can't be a range start/end. */
	2670	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
	2671	\|\| start_elem->type == CHAR_CLASS
	2672	\|\| end_elem->type == EQUIV_CLASS
[599]	2673	\|\| end_elem->type == CHAR_CLASS))
	2674	return REG_ERANGE;
[3613]	2675
[599]	2676	/* We can handle no multi character collating elements without libc
	2677	support. */
[3613]	2678	if (__glibc_unlikely ((start_elem->type == COLL_SYM
[599]	2679	&& strlen ((char *) start_elem->opr.name) > 1)
	2680	\|\| (end_elem->type == COLL_SYM
	2681	&& strlen ((char *) end_elem->opr.name) > 1)))
[3613]	2682	return REG_ECOLLATE;
[599]	2683
[3613]	2684	unsigned int
[599]	2685	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
[3613]	2686	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
[599]	2687	: 0)),
[3613]	2688	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
	2689	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2690	: 0));
	2691	wint_t
	2692	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
[599]	2693	? parse_byte (start_ch, dfa) : start_elem->opr.wch),
[3613]	2694	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
	2695	? parse_byte (end_ch, dfa) : end_elem->opr.wch);
	2696
	2697	if (start_wc == WEOF \|\| end_wc == WEOF)
	2698	return REG_ECOLLATE;
	2699	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
	2700	&& start_wc > end_wc))
	2701	return REG_ERANGE;
	2702
	2703	/* Got valid collation sequence values, add them as a new entry.
	2704	However, for !_LIBC we have no collation elements: if the
	2705	character set is single byte, the single byte character set
	2706	that we build below suffices. parse_bracket_exp passes
[599]	2707	no MBCSET if dfa->mb_cur_max == 1. */
[3613]	2708	if (dfa->mb_cur_max > 1)
	2709	{
	2710	/* Check the space of the arrays. */
	2711	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
	2712	{
	2713	/* There is not enough space, need realloc. */
	2714	wchar_t new_array_start, new_array_end;
	2715	Idx new_nranges;
[599]	2716
[3613]	2717	/* +1 in case of mbcset->nranges is 0. */
	2718	new_nranges = 2 * mbcset->nranges + 1;
	2719	/* Use realloc since mbcset->range_starts and mbcset->range_ends
	2720	are NULL if range_alloc == 0. /
	2721	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
	2722	new_nranges);
	2723	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
[599]	2724	new_nranges);
[3613]	2725
	2726	if (__glibc_unlikely (new_array_start == NULL
	2727	\|\| new_array_end == NULL))
	2728	{
[599]	2729	re_free (new_array_start);
[3613]	2730	re_free (new_array_end);
	2731	return REG_ESPACE;
	2732	}
	2733
	2734	mbcset->range_starts = new_array_start;
	2735	mbcset->range_ends = new_array_end;
	2736	*range_alloc = new_nranges;
	2737	}
	2738
	2739	mbcset->range_starts[mbcset->nranges] = start_wc;
	2740	mbcset->range_ends[mbcset->nranges++] = end_wc;
[599]	2741	}
	2742
	2743	/* Build the table for single byte characters. */
	2744	for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
	2745	{
[3613]	2746	if (start_wc <= wc && wc <= end_wc)
[599]	2747	bitset_set (sbcset, wc);
	2748	}
	2749
	2750	return REG_NOERROR;
	2751	}
	2752	#endif /* not _LIBC */
	2753
[3613]	2754	#ifndef _LIBC
	2755	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
	2756	Build the collating element which is represented by NAME.
[599]	2757	The result are written to MBCSET and SBCSET.
	2758	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
[3613]	2759	pointer argument since we may update it. */
[599]	2760
	2761	static reg_errcode_t
	2762	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
	2763	Idx coll_sym_alloc, const unsigned char name,
	2764	uint_fast32_t nrules, int_fast32_t table_size,
	2765	const void symb_table, const unsigned char extra)
	2766	{
	2767	size_t name_len = strlen ((const char *) name);
	2768	if (__glibc_unlikely (name_len != 1))
[3613]	2769	return REG_ECOLLATE;
	2770	else
	2771	{
	2772	bitset_set (sbcset, name[0]);
	2773	return REG_NOERROR;
[599]	2774	}
[3613]	2775	}
	2776	#endif /* not _LIBC */
	2777
	2778	#ifdef _LIBC
	2779	/* Local function for parse_bracket_exp used in _LIBC environment.
[599]	2780	Seek the collating symbol entry corresponding to NAME.
[3613]	2781	Return the index of the symbol in the SYMB_TABLE,
[599]	2782	or -1 if not found. */
[3613]	2783
	2784	static __always_inline int32_t
	2785	seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
	2786	const int32_t *symb_table,
	2787	int_fast32_t table_size,
	2788	const unsigned char *extra)
	2789	{
	2790	int_fast32_t elem;
	2791
	2792	for (elem = 0; elem < table_size; elem++)
	2793	if (symb_table[2 * elem] != 0)
	2794	{
	2795	int32_t idx = symb_table[2 * elem + 1];
	2796	/* Skip the name of collating element name. */
	2797	idx += 1 + extra[idx];
[599]	2798	if (/* Compare the length of the name. */
[3613]	2799	name_len == extra[idx]
	2800	/* Compare the name. */
	2801	&& memcmp (name, &extra[idx + 1], name_len) == 0)
	2802	/* Yep, this is the entry. */
	2803	return elem;
	2804	}
	2805	return -1;
	2806	}
	2807
	2808	/* Local function for parse_bracket_exp used in _LIBC environment.
	2809	Look up the collation sequence value of BR_ELEM.
	2810	Return the value if succeeded, UINT_MAX otherwise. */
	2811
[599]	2812	static __always_inline unsigned int
[3613]	2813	lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
	2814	const unsigned char *collseqmb,
	2815	const char *collseqwc,
	2816	int_fast32_t table_size,
[599]	2817	const int32_t *symb_table,
[3613]	2818	const unsigned char *extra)
	2819	{
[599]	2820	if (br_elem->type == SB_CHAR)
	2821	{
[3613]	2822	/* if (MB_CUR_MAX == 1) */
[599]	2823	if (nrules == 0)
[3613]	2824	return collseqmb[br_elem->opr.ch];
	2825	else
	2826	{
	2827	wint_t wc = __btowc (br_elem->opr.ch);
	2828	return __collseq_table_lookup (collseqwc, wc);
	2829	}
	2830	}
[599]	2831	else if (br_elem->type == MB_CHAR)
[3613]	2832	{
	2833	if (nrules != 0)
	2834	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
	2835	}
	2836	else if (br_elem->type == COLL_SYM)
	2837	{
[599]	2838	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
[3613]	2839	if (nrules != 0)
	2840	{
	2841	int32_t elem, idx;
	2842	elem = seek_collating_symbol_entry (br_elem->opr.name,
	2843	sym_name_len,
	2844	symb_table, table_size,
	2845	extra);
	2846	if (elem != -1)
	2847	{
	2848	/* We found the entry. */
	2849	idx = symb_table[2 * elem + 1];
	2850	/* Skip the name of collating element name. */
	2851	idx += 1 + extra[idx];
	2852	/* Skip the byte sequence of the collating element. */
	2853	idx += 1 + extra[idx];
[599]	2854	/* Adjust for the alignment. */
[3613]	2855	idx = (idx + 3) & ~3;
[599]	2856	/* Skip the multibyte collation sequence value. */
[3613]	2857	idx += sizeof (unsigned int);
	2858	/* Skip the wide char sequence of the collating element. */
	2859	idx += sizeof (unsigned int) *
[599]	2860	(1 + (unsigned int ) (extra + idx));
	2861	/* Return the collation sequence value. */
[3613]	2862	return (unsigned int ) (extra + idx);
	2863	}
[599]	2864	else if (sym_name_len == 1)
[3613]	2865	{
	2866	/* No valid character. Match it as a single byte
[599]	2867	character. */
[3613]	2868	return collseqmb[br_elem->opr.name[0]];
	2869	}
	2870	}
	2871	else if (sym_name_len == 1)
	2872	return collseqmb[br_elem->opr.name[0]];
	2873	}
[599]	2874	return UINT_MAX;
[3613]	2875	}
	2876
	2877	/* Local function for parse_bracket_exp used in _LIBC environment.
	2878	Build the range expression which starts from START_ELEM, and ends
	2879	at END_ELEM. The result are written to MBCSET and SBCSET.
	2880	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2881	mbcset->range_ends, is a pointer argument since we may
	2882	update it. */
	2883
	2884	static __always_inline reg_errcode_t
	2885	build_range_exp (bitset_t sbcset, re_charset_t mbcset, Idx range_alloc,
[599]	2886	bracket_elem_t start_elem, bracket_elem_t end_elem,
[3613]	2887	re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
	2888	const unsigned char collseqmb, const char collseqwc,
	2889	int_fast32_t table_size, const int32_t *symb_table,
	2890	const unsigned char *extra)
	2891	{
	2892	unsigned int ch;
	2893	uint32_t start_collseq;
[599]	2894	uint32_t end_collseq;
[3613]	2895
	2896	/* Equivalence Classes and Character Classes can't be a range
	2897	start/end. */
	2898	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
	2899	\|\| start_elem->type == CHAR_CLASS
	2900	\|\| end_elem->type == EQUIV_CLASS
	2901	\|\| end_elem->type == CHAR_CLASS))
	2902	return REG_ERANGE;
	2903
	2904	/* FIXME: Implement rational ranges here, too. */
	2905	start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
	2906	table_size, symb_table, extra);
[599]	2907	end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
[3613]	2908	table_size, symb_table, extra);
	2909	/* Check start/end collation sequence values. */
	2910	if (__glibc_unlikely (start_collseq == UINT_MAX
	2911	\|\| end_collseq == UINT_MAX))
	2912	return REG_ECOLLATE;
	2913	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
	2914	&& start_collseq > end_collseq))
	2915	return REG_ERANGE;
[599]	2916
[3613]	2917	/* Got valid collation sequence values, add them as a new entry.
	2918	However, if we have no collation elements, and the character set
	2919	is single byte, the single byte character set that we
	2920	build below suffices. */
[599]	2921	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
[3613]	2922	{
	2923	/* Check the space of the arrays. */
	2924	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
	2925	{
	2926	/* There is not enough space, need realloc. */
	2927	uint32_t *new_array_start;
[599]	2928	uint32_t *new_array_end;
[3613]	2929	int new_nranges;
	2930
	2931	/* +1 in case of mbcset->nranges is 0. */
[599]	2932	new_nranges = 2 * mbcset->nranges + 1;
[3613]	2933	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
	2934	new_nranges);
	2935	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
[599]	2936	new_nranges);
	2937
[3613]	2938	if (__glibc_unlikely (new_array_start == NULL
	2939	\|\| new_array_end == NULL))
[599]	2940	return REG_ESPACE;
	2941
[3613]	2942	mbcset->range_starts = new_array_start;
	2943	mbcset->range_ends = new_array_end;
	2944	*range_alloc = new_nranges;
	2945	}
	2946
	2947	mbcset->range_starts[mbcset->nranges] = start_collseq;
	2948	mbcset->range_ends[mbcset->nranges++] = end_collseq;
	2949	}
	2950
	2951	/* Build the table for single byte characters. */
	2952	for (ch = 0; ch < SBC_MAX; ch++)
	2953	{
	2954	uint32_t ch_collseq;
	2955	/* if (MB_CUR_MAX == 1) */
[599]	2956	if (nrules == 0)
[3613]	2957	ch_collseq = collseqmb[ch];
	2958	else
	2959	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
	2960	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
	2961	bitset_set (sbcset, ch);
	2962	}
	2963	return REG_NOERROR;
	2964	}
	2965
	2966	/* Local function for parse_bracket_exp used in _LIBC environment.
	2967	Build the collating element which is represented by NAME.
	2968	The result are written to MBCSET and SBCSET.
	2969	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	2970	pointer argument since we may update it. */
	2971
[599]	2972	static __always_inline reg_errcode_t
[3613]	2973	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
	2974	Idx coll_sym_alloc, const unsigned char name,
	2975	uint_fast32_t nrules, int_fast32_t table_size,
[599]	2976	const int32_t symb_table, const unsigned char extra)
[3613]	2977	{
	2978	int32_t elem, idx;
	2979	size_t name_len = strlen ((const char *) name);
	2980	if (nrules != 0)
	2981	{
	2982	elem = seek_collating_symbol_entry (name, name_len, symb_table,
	2983	table_size, extra);
	2984	if (elem != -1)
	2985	{
	2986	/* We found the entry. */
[599]	2987	idx = symb_table[2 * elem + 1];
	2988	/* Skip the name of collating element name. */
	2989	idx += 1 + extra[idx];
[3613]	2990	}
	2991	else if (name_len == 1)
	2992	{
	2993	/* No valid character, treat it as a normal
	2994	character. */
[599]	2995	bitset_set (sbcset, name[0]);
[3613]	2996	return REG_NOERROR;
	2997	}
	2998	else
	2999	return REG_ECOLLATE;
	3000
	3001	/* Got valid collation sequence, add it as a new entry. */
	3002	/* Check the space of the arrays. */
	3003	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
	3004	{
	3005	/* Not enough, realloc it. */
	3006	/* +1 in case of mbcset->ncoll_syms is 0. */
[599]	3007	int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
[3613]	3008	/* Use realloc since mbcset->coll_syms is NULL
	3009	if alloc == 0. /
[599]	3010	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
[3613]	3011	new_coll_sym_alloc);
	3012	if (__glibc_unlikely (new_coll_syms == NULL))
	3013	return REG_ESPACE;
	3014	mbcset->coll_syms = new_coll_syms;
	3015	*coll_sym_alloc = new_coll_sym_alloc;
	3016	}
	3017	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
	3018	return REG_NOERROR;
	3019	}
	3020	else
	3021	{
	3022	if (__glibc_unlikely (name_len != 1))
[599]	3023	return REG_ECOLLATE;
[3613]	3024	else
	3025	{
	3026	bitset_set (sbcset, name[0]);
	3027	return REG_NOERROR;
	3028	}
	3029	}
	3030	}
	3031	#endif /* _LIBC */
	3032
	3033	/* This function parse bracket expression like "[abc]", "[a-c]",
	3034	"[[.a-a.]]" etc. */
	3035
	3036	static bin_tree_t *
	3037	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
[599]	3038	reg_syntax_t syntax, reg_errcode_t *err)
	3039	{
	3040	const unsigned char *collseqmb = NULL;
[3613]	3041	const char *collseqwc = NULL;
	3042	uint_fast32_t nrules = 0;
	3043	int_fast32_t table_size = 0;
[599]	3044	const void *symb_table = NULL;
	3045	const unsigned char *extra = NULL;
[3613]	3046
[599]	3047	re_token_t br_token;
	3048	re_bitset_ptr_t sbcset;
	3049	re_charset_t *mbcset;
	3050	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
	3051	Idx equiv_class_alloc = 0, char_class_alloc = 0;
	3052	bool non_match = false;
	3053	bin_tree_t *work_tree;
	3054	int token_len;
	3055	bool first_round = true;
	3056	#ifdef _LIBC
	3057	collseqmb = (const unsigned char *)
[3613]	3058	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
[599]	3059	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3060	if (nrules)
	3061	{
	3062	/*
	3063	if (MB_CUR_MAX > 1)
	3064	*/
[3613]	3065	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
[599]	3066	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
[3613]	3067	symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
	3068	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
[599]	3069	_NL_COLLATE_SYMB_EXTRAMB);
	3070	}
	3071	#endif
	3072	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	3073	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
[3613]	3074	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
[599]	3075	{
	3076	re_free (sbcset);
	3077	re_free (mbcset);
	3078	*err = REG_ESPACE;
	3079	return NULL;
	3080	}
	3081
[3613]	3082	token_len = peek_token_bracket (token, regexp, syntax);
[599]	3083	if (__glibc_unlikely (token->type == END_OF_RE))
[3613]	3084	{
[599]	3085	*err = REG_BADPAT;
	3086	goto parse_bracket_exp_free_return;
[3613]	3087	}
[599]	3088	if (token->type == OP_NON_MATCH_LIST)
	3089	{
	3090	mbcset->non_match = 1;
	3091	non_match = true;
	3092	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
	3093	bitset_set (sbcset, '\n');
	3094	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3095	token_len = peek_token_bracket (token, regexp, syntax);
	3096	if (__glibc_unlikely (token->type == END_OF_RE))
	3097	{
	3098	*err = REG_BADPAT;
	3099	goto parse_bracket_exp_free_return;
	3100	}
	3101	}
	3102
	3103	/* We treat the first ']' as a normal character. */
[3613]	3104	if (token->type == OP_CLOSE_BRACKET)
	3105	token->type = CHARACTER;
[599]	3106
	3107	while (1)
	3108	{
[3613]	3109	bracket_elem_t start_elem, end_elem;
[599]	3110	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
	3111	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
[3613]	3112	reg_errcode_t ret;
[599]	3113	int token_len2 = 0;
	3114	bool is_range_exp = false;
	3115	re_token_t token2;
	3116
[3613]	3117	start_elem.opr.name = start_name_buf;
[599]	3118	start_elem.type = COLL_SYM;
	3119	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
	3120	syntax, first_round);
	3121	if (__glibc_unlikely (ret != REG_NOERROR))
	3122	{
	3123	*err = ret;
	3124	goto parse_bracket_exp_free_return;
[3613]	3125	}
[599]	3126	first_round = false;
	3127
	3128	/* Get information about the next token. We need it in any case. */
	3129	token_len = peek_token_bracket (token, regexp, syntax);
	3130
	3131	/* Do not check for ranges if we know they are not allowed. */
	3132	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
	3133	{
[3613]	3134	if (__glibc_unlikely (token->type == END_OF_RE))
[599]	3135	{
	3136	*err = REG_EBRACK;
	3137	goto parse_bracket_exp_free_return;
	3138	}
	3139	if (token->type == OP_CHARSET_RANGE)
	3140	{
	3141	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
	3142	token_len2 = peek_token_bracket (&token2, regexp, syntax);
	3143	if (__glibc_unlikely (token2.type == END_OF_RE))
	3144	{
	3145	*err = REG_EBRACK;
[3613]	3146	goto parse_bracket_exp_free_return;
[599]	3147	}
	3148	if (token2.type == OP_CLOSE_BRACKET)
	3149	{
[3613]	3150	/* We treat the last '-' as a normal character. */
[599]	3151	re_string_skip_bytes (regexp, -token_len);
	3152	token->type = CHARACTER;
[3613]	3153	}
[599]	3154	else
[3613]	3155	is_range_exp = true;
	3156	}
[599]	3157	}
	3158
	3159	if (is_range_exp == true)
	3160	{
	3161	end_elem.opr.name = end_name_buf;
	3162	end_elem.type = COLL_SYM;
	3163	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
	3164	dfa, syntax, true);
[3613]	3165	if (__glibc_unlikely (ret != REG_NOERROR))
	3166	{
	3167	*err = ret;
	3168	goto parse_bracket_exp_free_return;
[599]	3169	}
	3170
	3171	token_len = peek_token_bracket (token, regexp, syntax);
	3172
	3173	*err = build_range_exp (sbcset, mbcset, &range_alloc,
	3174	&start_elem, &end_elem,
	3175	dfa, syntax, nrules, collseqmb, collseqwc,
	3176	table_size, symb_table, extra);
	3177	if (__glibc_unlikely (*err != REG_NOERROR))
	3178	goto parse_bracket_exp_free_return;
	3179	}
[3613]	3180	else
[599]	3181	{
	3182	switch (start_elem.type)
	3183	{
	3184	case SB_CHAR:
	3185	bitset_set (sbcset, start_elem.opr.ch);
	3186	break;
	3187	case MB_CHAR:
	3188	/* Check whether the array has enough space. */
[3613]	3189	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
[599]	3190	{
	3191	wchar_t *new_mbchars;
	3192	/* Not enough, realloc it. */
	3193	/* +1 in case of mbcset->nmbchars is 0. */
	3194	mbchar_alloc = 2 * mbcset->nmbchars + 1;
	3195	/* Use realloc since array is NULL if alloc == 0. /
	3196	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
	3197	mbchar_alloc);
	3198	if (__glibc_unlikely (new_mbchars == NULL))
[3613]	3199	goto parse_bracket_exp_espace;
[599]	3200	mbcset->mbchars = new_mbchars;
	3201	}
	3202	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
	3203	break;
	3204	case EQUIV_CLASS:
[3613]	3205	*err = build_equiv_class (sbcset,
	3206	mbcset, &equiv_class_alloc,
	3207	start_elem.opr.name);
[599]	3208	if (__glibc_unlikely (*err != REG_NOERROR))
	3209	goto parse_bracket_exp_free_return;
	3210	break;
	3211	case COLL_SYM:
	3212	*err = build_collating_symbol (sbcset,
[3613]	3213	mbcset, &coll_sym_alloc,
	3214	start_elem.opr.name,
	3215	nrules, table_size, symb_table, extra);
[599]	3216	if (__glibc_unlikely (*err != REG_NOERROR))
	3217	goto parse_bracket_exp_free_return;
	3218	break;
[3613]	3219	case CHAR_CLASS:
[599]	3220	*err = build_charclass (regexp->trans, sbcset,
	3221	mbcset, &char_class_alloc,
	3222	(const char *) start_elem.opr.name,
[3613]	3223	syntax);
[599]	3224	if (__glibc_unlikely (*err != REG_NOERROR))
	3225	goto parse_bracket_exp_free_return;
	3226	break;
	3227	default:
	3228	DEBUG_ASSERT (false);
	3229	break;
	3230	}
	3231	}
	3232	if (__glibc_unlikely (token->type == END_OF_RE))
	3233	{
	3234	*err = REG_EBRACK;
	3235	goto parse_bracket_exp_free_return;
	3236	}
	3237	if (token->type == OP_CLOSE_BRACKET)
	3238	break;
	3239	}
	3240
	3241	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3242
	3243	/* If it is non-matching list. */
	3244	if (non_match)
	3245	bitset_not (sbcset);
	3246
	3247	/* Ensure only single byte characters are set. */
	3248	if (dfa->mb_cur_max > 1)
	3249	bitset_mask (sbcset, dfa->sb_char);
	3250
	3251	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
	3252	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
[3613]	3253	\|\| mbcset->non_match)))
[599]	3254	{
	3255	bin_tree_t *mbc_tree;
	3256	int sbc_idx;
	3257	/* Build a tree for complex bracket. */
	3258	dfa->has_mb_node = 1;
	3259	br_token.type = COMPLEX_BRACKET;
	3260	br_token.opr.mbcset = mbcset;
	3261	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
[3613]	3262	if (__glibc_unlikely (mbc_tree == NULL))
	3263	goto parse_bracket_exp_espace;
	3264	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
	3265	if (sbcset[sbc_idx])
	3266	break;
	3267	/* If there are no bits set in sbcset, there is no point
[599]	3268	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
[3613]	3269	if (sbc_idx < BITSET_WORDS)
	3270	{
	3271	/* Build a tree for simple bracket. */
	3272	br_token.type = SIMPLE_BRACKET;
[599]	3273	br_token.opr.sbcset = sbcset;
	3274	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3275	if (__glibc_unlikely (work_tree == NULL))
	3276	goto parse_bracket_exp_espace;
	3277
	3278	/* Then join them by ALT node. */
	3279	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
	3280	if (__glibc_unlikely (work_tree == NULL))
	3281	goto parse_bracket_exp_espace;
	3282	}
	3283	else
	3284	{
	3285	re_free (sbcset);
	3286	work_tree = mbc_tree;
[3613]	3287	}
	3288	}
[599]	3289	else
	3290	{
	3291	free_charset (mbcset);
	3292	/* Build a tree for simple bracket. */
	3293	br_token.type = SIMPLE_BRACKET;
	3294	br_token.opr.sbcset = sbcset;
	3295	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3296	if (__glibc_unlikely (work_tree == NULL))
	3297	goto parse_bracket_exp_espace;
	3298	}
	3299	return work_tree;
	3300
	3301	parse_bracket_exp_espace:
	3302	*err = REG_ESPACE;
	3303	parse_bracket_exp_free_return:
	3304	re_free (sbcset);
[3613]	3305	free_charset (mbcset);
[599]	3306	return NULL;
	3307	}
	3308
	3309	/* Parse an element in the bracket expression. */
	3310
	3311	static reg_errcode_t
	3312	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
	3313	re_token_t token, int token_len, re_dfa_t dfa,
	3314	reg_syntax_t syntax, bool accept_hyphen)
	3315	{
	3316	int cur_char_size;
	3317	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
	3318	if (cur_char_size > 1)
	3319	{
[3613]	3320	elem->type = MB_CHAR;
[599]	3321	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
	3322	re_string_skip_bytes (regexp, cur_char_size);
	3323	return REG_NOERROR;
	3324	}
	3325	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3326	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
	3327	\|\| token->type == OP_OPEN_EQUIV_CLASS)
	3328	return parse_bracket_symbol (elem, regexp, token);
	3329	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
	3330	{
	3331	/* A '-' must only appear as anything but a range indicator before
	3332	the closing bracket. Everything else is an error. */
	3333	re_token_t token2;
	3334	(void) peek_token_bracket (&token2, regexp, syntax);
	3335	if (token2.type != OP_CLOSE_BRACKET)
	3336	/* The actual error value is not standardized since this whole
	3337	case is undefined. But ERANGE makes good sense. */
	3338	return REG_ERANGE;
	3339	}
	3340	elem->type = SB_CHAR;
	3341	elem->opr.ch = token->opr.c;
	3342	return REG_NOERROR;
	3343	}
	3344
	3345	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
	3346	such as [:<character_class>:], [.<collating_element>.], and
	3347	[=<equivalent_class>=]. */
	3348
	3349	static reg_errcode_t
	3350	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
	3351	re_token_t *token)
	3352	{
	3353	unsigned char ch, delim = token->opr.c;
	3354	int i = 0;
	3355	if (re_string_eoi(regexp))
	3356	return REG_EBRACK;
	3357	for (;; ++i)
	3358	{
	3359	if (i >= BRACKET_NAME_BUF_SIZE)
	3360	return REG_EBRACK;
	3361	if (token->type == OP_OPEN_CHAR_CLASS)
	3362	ch = re_string_fetch_byte_case (regexp);
	3363	else
	3364	ch = re_string_fetch_byte (regexp);
	3365	if (re_string_eoi(regexp))
	3366	return REG_EBRACK;
	3367	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
	3368	break;
	3369	elem->opr.name[i] = ch;
	3370	}
	3371	re_string_skip_bytes (regexp, 1);
	3372	elem->opr.name[i] = '\0';
	3373	switch (token->type)
	3374	{
	3375	case OP_OPEN_COLL_ELEM:
	3376	elem->type = COLL_SYM;
	3377	break;
	3378	case OP_OPEN_EQUIV_CLASS:
	3379	elem->type = EQUIV_CLASS;
	3380	break;
	3381	case OP_OPEN_CHAR_CLASS:
	3382	elem->type = CHAR_CLASS;
	3383	break;
	3384	default:
[3613]	3385	break;
[599]	3386	}
	3387	return REG_NOERROR;
	3388	}
[3613]	3389
[599]	3390	/* Helper function for parse_bracket_exp.
	3391	Build the equivalence class which is represented by NAME.
	3392	The result are written to MBCSET and SBCSET.
	3393	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
	3394	is a pointer argument since we may update it. */
	3395
	3396	static reg_errcode_t
	3397	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
	3398	Idx equiv_class_alloc, const unsigned char name)
	3399	{
	3400	#ifdef _LIBC
	3401	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3402	if (nrules != 0)
	3403	{
	3404	const int32_t table, indirect;
	3405	const unsigned char weights, extra, *cp;
	3406	unsigned char char_buf[2];
	3407	int32_t idx1, idx2;
	3408	unsigned int ch;
	3409	size_t len;
[3613]	3410	/* Calculate the index for equivalence class. */
	3411	cp = name;
[599]	3412	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	3413	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3414	_NL_COLLATE_WEIGHTMB);
[3613]	3415	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3416	_NL_COLLATE_EXTRAMB);
[599]	3417	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3418	_NL_COLLATE_INDIRECTMB);
	3419	idx1 = findidx (table, indirect, extra, &cp, -1);
	3420	if (__glibc_unlikely (idx1 == 0 \|\| *cp != '\0'))
[3613]	3421	/* This isn't a valid character. */
[599]	3422	return REG_ECOLLATE;
	3423
	3424	/* Build single byte matching table for this equivalence class. */
	3425	len = weights[idx1 & 0xffffff];
	3426	for (ch = 0; ch < SBC_MAX; ++ch)
	3427	{
[3613]	3428	char_buf[0] = ch;
	3429	cp = char_buf;
	3430	idx2 = findidx (table, indirect, extra, &cp, 1);
	3431	/*
	3432	idx2 = table[ch];
	3433	*/
[599]	3434	if (idx2 == 0)
	3435	/* This isn't a valid character. */
[3613]	3436	continue;
[599]	3437	/* Compare only if the length matches and the collation rule
	3438	index is the same. */
	3439	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
[3613]	3440	&& memcmp (weights + (idx1 & 0xffffff) + 1,
[599]	3441	weights + (idx2 & 0xffffff) + 1, len) == 0)
	3442	bitset_set (sbcset, ch);
	3443	}
	3444	/* Check whether the array has enough space. */
[3613]	3445	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
[599]	3446	{
	3447	/* Not enough, realloc it. */
	3448	/* +1 in case of mbcset->nequiv_classes is 0. */
	3449	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
	3450	/* Use realloc since the array is NULL if alloc == 0. /
	3451	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
	3452	int32_t,
	3453	new_equiv_class_alloc);
	3454	if (__glibc_unlikely (new_equiv_classes == NULL))
[3613]	3455	return REG_ESPACE;
[599]	3456	mbcset->equiv_classes = new_equiv_classes;
	3457	*equiv_class_alloc = new_equiv_class_alloc;
	3458	}
	3459	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
	3460	}
	3461	else
	3462	#endif /* _LIBC */
	3463	{
	3464	if (__glibc_unlikely (strlen ((const char *) name) != 1))
	3465	return REG_ECOLLATE;
[3613]	3466	bitset_set (sbcset, *name);
[599]	3467	}
	3468	return REG_NOERROR;
	3469	}
[3613]	3470
	3471	/* Helper function for parse_bracket_exp.
[599]	3472	Build the character class which is represented by NAME.
	3473	The result are written to MBCSET and SBCSET.
[3613]	3474	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
[599]	3475	is a pointer argument since we may update it. */
	3476
	3477	static reg_errcode_t
	3478	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
	3479	re_charset_t mbcset, Idx char_class_alloc,
	3480	const char *class_name, reg_syntax_t syntax)
	3481	{
	3482	int i;
[3613]	3483	const char *name = class_name;
[599]	3484
	3485	/* In case of REG_ICASE "upper" and "lower" match the both of
	3486	upper and lower cases. */
[3613]	3487	if ((syntax & RE_ICASE)
[599]	3488	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
	3489	name = "alpha";
	3490
[3613]	3491	/* Check the space of the arrays. */
[599]	3492	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
	3493	{
	3494	/* Not enough, realloc it. */
	3495	/* +1 in case of mbcset->nchar_classes is 0. */
	3496	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
	3497	/* Use realloc since array is NULL if alloc == 0. /
	3498	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
	3499	new_char_class_alloc);
[3613]	3500	if (__glibc_unlikely (new_char_classes == NULL))
[599]	3501	return REG_ESPACE;
	3502	mbcset->char_classes = new_char_classes;
[3613]	3503	*char_class_alloc = new_char_class_alloc;
[599]	3504	}
	3505	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
	3506
	3507	#define BUILD_CHARCLASS_LOOP(ctype_func) \
	3508	do { \
[3613]	3509	if (__glibc_unlikely (trans != NULL)) \
[599]	3510	{ \
	3511	for (i = 0; i < SBC_MAX; ++i) \
	3512	if (ctype_func (i)) \
	3513	bitset_set (sbcset, trans[i]); \
	3514	} \
	3515	else \
	3516	{ \
	3517	for (i = 0; i < SBC_MAX; ++i) \
	3518	if (ctype_func (i)) \
	3519	bitset_set (sbcset, i); \
	3520	} \
	3521	} while (0)
	3522
	3523	if (strcmp (name, "alnum") == 0)
	3524	BUILD_CHARCLASS_LOOP (isalnum);
	3525	else if (strcmp (name, "cntrl") == 0)
	3526	BUILD_CHARCLASS_LOOP (iscntrl);
	3527	else if (strcmp (name, "lower") == 0)
	3528	BUILD_CHARCLASS_LOOP (islower);
	3529	else if (strcmp (name, "space") == 0)
	3530	BUILD_CHARCLASS_LOOP (isspace);
	3531	else if (strcmp (name, "alpha") == 0)
	3532	BUILD_CHARCLASS_LOOP (isalpha);
	3533	else if (strcmp (name, "digit") == 0)
	3534	BUILD_CHARCLASS_LOOP (isdigit);
	3535	else if (strcmp (name, "print") == 0)
	3536	BUILD_CHARCLASS_LOOP (isprint);
	3537	else if (strcmp (name, "upper") == 0)
	3538	BUILD_CHARCLASS_LOOP (isupper);
	3539	else if (strcmp (name, "blank") == 0)
	3540	BUILD_CHARCLASS_LOOP (isblank);
	3541	else if (strcmp (name, "graph") == 0)
	3542	BUILD_CHARCLASS_LOOP (isgraph);
	3543	else if (strcmp (name, "punct") == 0)
	3544	BUILD_CHARCLASS_LOOP (ispunct);
	3545	else if (strcmp (name, "xdigit") == 0)
[3613]	3546	BUILD_CHARCLASS_LOOP (isxdigit);
	3547	else
[599]	3548	return REG_ECTYPE;
	3549
	3550	return REG_NOERROR;
	3551	}
[3613]	3552
[599]	3553	static bin_tree_t *
	3554	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
	3555	const char *class_name,
	3556	const char *extra, bool non_match,
[3613]	3557	reg_errcode_t *err)
[599]	3558	{
	3559	re_bitset_ptr_t sbcset;
	3560	re_charset_t *mbcset;
	3561	Idx alloc = 0;
[3613]	3562	reg_errcode_t ret;
	3563	bin_tree_t *tree;
[599]	3564
[3613]	3565	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	3566	if (__glibc_unlikely (sbcset == NULL))
	3567	{
[599]	3568	*err = REG_ESPACE;
[3613]	3569	return NULL;
[599]	3570	}
	3571	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
[3613]	3572	if (__glibc_unlikely (mbcset == NULL))
[599]	3573	{
[3613]	3574	re_free (sbcset);
[599]	3575	*err = REG_ESPACE;
	3576	return NULL;
	3577	}
	3578	mbcset->non_match = non_match;
	3579
	3580	/* We don't care the syntax in this case. */
	3581	ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
	3582
	3583	if (__glibc_unlikely (ret != REG_NOERROR))
	3584	{
	3585	re_free (sbcset);
	3586	free_charset (mbcset);
	3587	*err = ret;
	3588	return NULL;
	3589	}
	3590	/* \w match '_' also. */
	3591	for (; *extra; extra++)
	3592	bitset_set (sbcset, *extra);
	3593
[3613]	3594	/* If it is non-matching list. */
[599]	3595	if (non_match)
[3613]	3596	bitset_not (sbcset);
[599]	3597
	3598	/* Ensure only single byte characters are set. */
	3599	if (dfa->mb_cur_max > 1)
	3600	bitset_mask (sbcset, dfa->sb_char);
	3601
	3602	/* Build a tree for simple bracket. */
	3603	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
	3604	tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3605	if (__glibc_unlikely (tree == NULL))
	3606	goto build_word_op_espace;
[3613]	3607
[599]	3608	if (dfa->mb_cur_max > 1)
	3609	{
	3610	bin_tree_t *mbc_tree;
[3613]	3611	/* Build a tree for complex bracket. */
[599]	3612	br_token.type = COMPLEX_BRACKET;
	3613	br_token.opr.mbcset = mbcset;
	3614	dfa->has_mb_node = 1;
	3615	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3616	if (__glibc_unlikely (mbc_tree == NULL))
	3617	goto build_word_op_espace;
	3618	/* Then join them by ALT node. */
	3619	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
	3620	if (__glibc_likely (mbc_tree != NULL))
	3621	return tree;
	3622	}
	3623	else
	3624	{
	3625	free_charset (mbcset);
	3626	return tree;
	3627	}
[3613]	3628
	3629	build_word_op_espace:
	3630	re_free (sbcset);
	3631	free_charset (mbcset);
[599]	3632	*err = REG_ESPACE;
[3613]	3633	return NULL;
[599]	3634	}
	3635
[3613]	3636	/* This is intended for the expressions like "a{1,3}".
[599]	3637	Fetch a number from 'input', and return the number.
	3638	Return -1 if the number field is empty like "{,1}".
	3639	Return RE_DUP_MAX + 1 if the number field is too large.
	3640	Return -2 if an error occurred. */
	3641
[3613]	3642	static Idx
[599]	3643	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
	3644	{
	3645	Idx num = -1;
	3646	unsigned char c;
[3613]	3647	while (1)
	3648	{
	3649	fetch_token (token, input, syntax);
	3650	c = token->opr.c;
[599]	3651	if (__glibc_unlikely (token->type == END_OF_RE))
	3652	return -2;
	3653	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
	3654	break;
	3655	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
	3656	? -2
	3657	: num == -1
	3658	? c - '0'
[3613]	3659	: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
[599]	3660	}
	3661	return num;
[3613]	3662	}
[599]	3663
	3664
	3665	static void
	3666	free_charset (re_charset_t *cset)
	3667	{
	3668	re_free (cset->mbchars);
	3669	#ifdef _LIBC
	3670	re_free (cset->coll_syms);
	3671	re_free (cset->equiv_classes);
	3672	#endif
	3673	re_free (cset->range_starts);
	3674	re_free (cset->range_ends);
	3675	re_free (cset->char_classes);
	3676	re_free (cset);
[3613]	3677	}
[599]	3678
	3679
	3680	/* Functions for binary tree operation. */
	3681
	3682	/* Create a tree node. */
	3683
	3684	static bin_tree_t *
	3685	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
[3613]	3686	re_token_type_t type)
[599]	3687	{
	3688	re_token_t t = { .type = type };
	3689	return create_token_tree (dfa, left, right, &t);
	3690	}
	3691
	3692	static bin_tree_t *
	3693	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
	3694	const re_token_t *token)
	3695	{
	3696	bin_tree_t *tree;
	3697	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
	3698	{
	3699	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
	3700
	3701	if (storage == NULL)
	3702	return NULL;
	3703	storage->next = dfa->str_tree_storage;
	3704	dfa->str_tree_storage = storage;
	3705	dfa->str_tree_storage_idx = 0;
	3706	}
	3707	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
	3708
	3709	tree->parent = NULL;
	3710	tree->left = left;
	3711	tree->right = right;
	3712	tree->token = *token;
	3713	tree->token.duplicated = 0;
	3714	tree->token.opt_subexp = 0;
	3715	tree->first = NULL;
	3716	tree->next = NULL;
	3717	tree->node_idx = -1;
	3718
	3719	if (left != NULL)
	3720	left->parent = tree;
[3613]	3721	if (right != NULL)
[599]	3722	right->parent = tree;
	3723	return tree;
	3724	}
	3725
	3726	/* Mark the tree SRC as an optional subexpression.
	3727	To be called from preorder or postorder. */
	3728
	3729	static reg_errcode_t
	3730	mark_opt_subexp (void extra, bin_tree_t node)
	3731	{
	3732	Idx idx = (uintptr_t) extra;
	3733	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
	3734	node->token.opt_subexp = 1;
[3613]	3735
	3736	return REG_NOERROR;
[599]	3737	}
	3738
	3739	/* Free the allocated memory inside NODE. */
	3740
	3741	static void
	3742	free_token (re_token_t *node)
	3743	{
	3744	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
	3745	free_charset (node->opr.mbcset);
	3746	else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
	3747	re_free (node->opr.sbcset);
	3748	}
	3749
	3750	/* Worker function for tree walking. Free the allocated memory inside NODE
	3751	and its children. */
	3752
	3753	static reg_errcode_t
	3754	free_tree (void extra, bin_tree_t node)
	3755	{
	3756	free_token (&node->token);
	3757	return REG_NOERROR;
	3758	}
	3759
	3760
	3761	/* Duplicate the node SRC, and return new node. This is a preorder
	3762	visit similar to the one implemented by the generic visitor, but
	3763	we need more infrastructure to maintain two parallel trees --- so,
	3764	it's easier to duplicate. */
	3765
	3766	static bin_tree_t *
	3767	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
	3768	{
	3769	const bin_tree_t *node;
	3770	bin_tree_t *dup_root;
	3771	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
	3772
	3773	for (node = root; ; )
	3774	{
	3775	/* Create a new tree and link it back to the current parent. */
	3776	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
	3777	if (*p_new == NULL)
	3778	return NULL;
	3779	(*p_new)->parent = dup_node;
	3780	(*p_new)->token.duplicated = 1;
	3781	dup_node = *p_new;
	3782
	3783	/* Go to the left node, or up and to the right. */
	3784	if (node->left)
	3785	{
	3786	node = node->left;
[3613]	3787	p_new = &dup_node->left;
[599]	3788	}
	3789	else
	3790	{
	3791	const bin_tree_t *prev = NULL;
	3792	while (node->right == prev \|\| node->right == NULL)
	3793	{
	3794	prev = node;
	3795	node = node->parent;
	3796	dup_node = dup_node->parent;
	3797	if (!node)
	3798	return dup_root;
	3799	}
	3800	node = node->right;
	3801	p_new = &dup_node->right;
	3802	}
	3803	}
	3804	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/sed/lib/regcomp.c

Download in other formats: