Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

regcomp.c@ 3531

Last change on this file since 3531 was 3529, checked in by bird, 4 years ago
Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.
Property svn:eol-style set to `native`
File size: 111.8 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2021 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<https://www.gnu.org/licenses/>. */
19
20	#ifdef _LIBC
21	# include <locale/weight.h>
22	#endif
23
24	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
25	size_t length, reg_syntax_t syntax);
26	static void re_compile_fastmap_iter (regex_t *bufp,
27	const re_dfastate_t *init_state,
28	char *fastmap);
29	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30	#ifdef RE_ENABLE_I18N
31	static void free_charset (re_charset_t *cset);
32	#endif /* RE_ENABLE_I18N */
33	static void free_workarea_compile (regex_t *preg);
34	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35	#ifdef RE_ENABLE_I18N
36	static void optimize_utf8 (re_dfa_t *dfa);
37	#endif
38	static reg_errcode_t analyze (regex_t *preg);
39	static reg_errcode_t preorder (bin_tree_t *root,
40	reg_errcode_t (fn (void , bin_tree_t )),
41	void *extra);
42	static reg_errcode_t postorder (bin_tree_t *root,
43	reg_errcode_t (fn (void , bin_tree_t )),
44	void *extra);
45	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
46	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
47	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
48	bin_tree_t *node);
49	static reg_errcode_t calc_first (void extra, bin_tree_t node);
50	static reg_errcode_t calc_next (void extra, bin_tree_t node);
51	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
52	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54	unsigned int constraint);
55	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
57	Idx node, bool root);
58	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59	static Idx fetch_number (re_string_t input, re_token_t token,
60	reg_syntax_t syntax);
61	static int peek_token (re_token_t token, re_string_t input,
62	reg_syntax_t syntax);
63	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
64	reg_syntax_t syntax, reg_errcode_t *err);
65	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
66	re_token_t *token, reg_syntax_t syntax,
67	Idx nest, reg_errcode_t *err);
68	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
69	re_token_t *token, reg_syntax_t syntax,
70	Idx nest, reg_errcode_t *err);
71	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
72	re_token_t *token, reg_syntax_t syntax,
73	Idx nest, reg_errcode_t *err);
74	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
75	re_token_t *token, reg_syntax_t syntax,
76	Idx nest, reg_errcode_t *err);
77	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
78	re_dfa_t dfa, re_token_t token,
79	reg_syntax_t syntax, reg_errcode_t *err);
80	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
81	re_token_t *token, reg_syntax_t syntax,
82	reg_errcode_t *err);
83	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84	re_string_t *regexp,
85	re_token_t *token, int token_len,
86	re_dfa_t *dfa,
87	reg_syntax_t syntax,
88	bool accept_hyphen);
89	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90	re_string_t *regexp,
91	re_token_t *token);
92	#ifdef RE_ENABLE_I18N
93	static reg_errcode_t build_equiv_class (bitset_t sbcset,
94	re_charset_t *mbcset,
95	Idx *equiv_class_alloc,
96	const unsigned char *name);
97	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98	bitset_t sbcset,
99	re_charset_t *mbcset,
100	Idx *char_class_alloc,
101	const char *class_name,
102	reg_syntax_t syntax);
103	#else /* not RE_ENABLE_I18N */
104	static reg_errcode_t build_equiv_class (bitset_t sbcset,
105	const unsigned char *name);
106	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107	bitset_t sbcset,
108	const char *class_name,
109	reg_syntax_t syntax);
110	#endif /* not RE_ENABLE_I18N */
111	static bin_tree_t build_charclass_op (re_dfa_t dfa,
112	RE_TRANSLATE_TYPE trans,
113	const char *class_name,
114	const char *extra,
115	bool non_match, reg_errcode_t *err);
116	static bin_tree_t create_tree (re_dfa_t dfa,
117	bin_tree_t left, bin_tree_t right,
118	re_token_type_t type);
119	static bin_tree_t create_token_tree (re_dfa_t dfa,
120	bin_tree_t left, bin_tree_t right,
121	const re_token_t *token);
122	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
123	static void free_token (re_token_t *node);
124	static reg_errcode_t free_tree (void extra, bin_tree_t node);
125	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
126
127
128	/* This table gives an error message for each of the error codes listed
129	in regex.h. Obviously the order here has to be same as there.
130	POSIX doesn't require that we do anything for REG_NOERROR,
131	but why not be nice? */
132
133	static const char __re_error_msgid[] =
134	{
135	#define REG_NOERROR_IDX 0
136	gettext_noop ("Success") /* REG_NOERROR */
137	"\0"
138	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139	gettext_noop ("No match") /* REG_NOMATCH */
140	"\0"
141	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143	"\0"
144	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146	"\0"
147	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149	"\0"
150	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152	"\0"
153	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155	"\0"
156	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157	gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158	"\0"
159	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
160	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161	"\0"
162	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164	"\0"
165	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167	"\0"
168	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169	gettext_noop ("Invalid range end") /* REG_ERANGE */
170	"\0"
171	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172	gettext_noop ("Memory exhausted") /* REG_ESPACE */
173	"\0"
174	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176	"\0"
177	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178	gettext_noop ("Premature end of regular expression") /* REG_EEND */
179	"\0"
180	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181	gettext_noop ("Regular expression too big") /* REG_ESIZE */
182	"\0"
183	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185	};
186
187	static const size_t __re_error_msgid_idx[] =
188	{
189	REG_NOERROR_IDX,
190	REG_NOMATCH_IDX,
191	REG_BADPAT_IDX,
192	REG_ECOLLATE_IDX,
193	REG_ECTYPE_IDX,
194	REG_EESCAPE_IDX,
195	REG_ESUBREG_IDX,
196	REG_EBRACK_IDX,
197	REG_EPAREN_IDX,
198	REG_EBRACE_IDX,
199	REG_BADBR_IDX,
200	REG_ERANGE_IDX,
201	REG_ESPACE_IDX,
202	REG_BADRPT_IDX,
203	REG_EEND_IDX,
204	REG_ESIZE_IDX,
205	REG_ERPAREN_IDX
206	};
207
208
209	/* Entry points for GNU code. */
210
211	/* re_compile_pattern is the GNU regular expression compiler: it
212	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213	Returns 0 if the pattern was valid, otherwise an error string.
214
215	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216	are set in BUFP on entry. */
217
218	const char *
219	re_compile_pattern (const char *pattern, size_t length,
220	struct re_pattern_buffer *bufp)
221	{
222	reg_errcode_t ret;
223
224	/* And GNU code determines whether or not to get register information
225	by passing null for the REGS argument to re_match, etc., not by
226	setting no_sub, unless RE_NO_SUB is set. */
227	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
228
229	/* Match anchors at newline. */
230	bufp->newline_anchor = 1;
231
232	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
233
234	if (!ret)
235	return NULL;
236	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
237	}
238	weak_alias (__re_compile_pattern, re_compile_pattern)
239
240	/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241	also be assigned to arbitrarily: each pattern buffer stores its own
242	syntax, so it can be changed between regex compilations. */
243	/* This has no initializer because initialized variables in Emacs
244	become read-only after dumping. */
245	reg_syntax_t re_syntax_options;
246
247
248	/* Specify the precise syntax of regexps for compilation. This provides
249	for compatibility for various utilities which historically have
250	different, incompatible syntaxes.
251
252	The argument SYNTAX is a bit mask comprised of the various bits
253	defined in regex.h. We return the old syntax. */
254
255	reg_syntax_t
256	re_set_syntax (reg_syntax_t syntax)
257	{
258	reg_syntax_t ret = re_syntax_options;
259
260	re_syntax_options = syntax;
261	return ret;
262	}
263	weak_alias (__re_set_syntax, re_set_syntax)
264
265	int
266	re_compile_fastmap (struct re_pattern_buffer *bufp)
267	{
268	re_dfa_t *dfa = bufp->buffer;
269	char *fastmap = bufp->fastmap;
270
271	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
272	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
273	if (dfa->init_state != dfa->init_state_word)
274	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
275	if (dfa->init_state != dfa->init_state_nl)
276	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
277	if (dfa->init_state != dfa->init_state_begbuf)
278	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
279	bufp->fastmap_accurate = 1;
280	return 0;
281	}
282	weak_alias (__re_compile_fastmap, re_compile_fastmap)
283
284	static inline void
285	__attribute__ ((always_inline))
286	re_set_fastmap (char *fastmap, bool icase, int ch)
287	{
288	fastmap[ch] = 1;
289	if (icase)
290	fastmap[tolower (ch)] = 1;
291	}
292
293	/* Helper function for re_compile_fastmap.
294	Compile fastmap for the initial_state INIT_STATE. */
295
296	static void
297	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
298	char *fastmap)
299	{
300	re_dfa_t *dfa = bufp->buffer;
301	Idx node_cnt;
302	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
303	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304	{
305	Idx node = init_state->nodes.elems[node_cnt];
306	re_token_type_t type = dfa->nodes[node].type;
307
308	if (type == CHARACTER)
309	{
310	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311	#ifdef RE_ENABLE_I18N
312	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
313	{
314	unsigned char buf[MB_LEN_MAX];
315	unsigned char *p;
316	wchar_t wc;
317	mbstate_t state;
318
319	p = buf;
320	*p++ = dfa->nodes[node].opr.c;
321	while (++node < dfa->nodes_len
322	&& dfa->nodes[node].type == CHARACTER
323	&& dfa->nodes[node].mb_partial)
324	*p++ = dfa->nodes[node].opr.c;
325	memset (&state, '\0', sizeof (state));
326	if (__mbrtowc (&wc, (const char *) buf, p - buf,
327	&state) == p - buf
328	&& (__wcrtomb ((char *) buf, __towlower (wc), &state)
329	!= (size_t) -1))
330	re_set_fastmap (fastmap, false, buf[0]);
331	}
332	#endif
333	}
334	else if (type == SIMPLE_BRACKET)
335	{
336	int i, ch;
337	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338	{
339	int j;
340	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
341	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342	if (w & ((bitset_word_t) 1 << j))
343	re_set_fastmap (fastmap, icase, ch);
344	}
345	}
346	#ifdef RE_ENABLE_I18N
347	else if (type == COMPLEX_BRACKET)
348	{
349	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
350	Idx i;
351
352	# ifdef _LIBC
353	/* See if we have to try all bytes which start multiple collation
354	elements.
355	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
356	collation element, and don't catch 'b' since 'b' is
357	the only collation element which starts from 'b' (and
358	it is caught by SIMPLE_BRACKET). */
359	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
360	&& (cset->ncoll_syms \|\| cset->nranges))
361	{
362	const int32_t table = (const int32_t )
363	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
364	for (i = 0; i < SBC_MAX; ++i)
365	if (table[i] < 0)
366	re_set_fastmap (fastmap, icase, i);
367	}
368	# endif /* _LIBC */
369
370	/* See if we have to start the match at all multibyte characters,
371	i.e. where we would not find an invalid sequence. This only
372	applies to multibyte character sets; for single byte character
373	sets, the SIMPLE_BRACKET again suffices. */
374	if (dfa->mb_cur_max > 1
375	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
376	# ifdef _LIBC
377	\|\| cset->nequiv_classes
378	# endif /* _LIBC */
379	))
380	{
381	unsigned char c = 0;
382	do
383	{
384	mbstate_t mbs;
385	memset (&mbs, 0, sizeof (mbs));
386	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
387	re_set_fastmap (fastmap, false, (int) c);
388	}
389	while (++c != 0);
390	}
391
392	else
393	{
394	/* ... Else catch all bytes which can start the mbchars. */
395	for (i = 0; i < cset->nmbchars; ++i)
396	{
397	char buf[256];
398	mbstate_t state;
399	memset (&state, '\0', sizeof (state));
400	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
401	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
402	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
403	{
404	if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
405	!= (size_t) -1)
406	re_set_fastmap (fastmap, false, (unsigned char ) buf);
407	}
408	}
409	}
410	}
411	#endif /* RE_ENABLE_I18N */
412	else if (type == OP_PERIOD
413	#ifdef RE_ENABLE_I18N
414	\|\| type == OP_UTF8_PERIOD
415	#endif /* RE_ENABLE_I18N */
416	\|\| type == END_OF_RE)
417	{
418	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419	if (type == END_OF_RE)
420	bufp->can_be_null = 1;
421	return;
422	}
423	}
424	}
425
426
427	/* Entry point for POSIX code. */
428	/* regcomp takes a regular expression as a string and compiles it.
429
430	PREG is a regex_t *. We do not expect any fields to be initialized,
431	since POSIX says we shouldn't. Thus, we set
432
433	'buffer' to the compiled pattern;
434	'used' to the length of the compiled pattern;
435	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436	REG_EXTENDED bit in CFLAGS is set; otherwise, to
437	RE_SYNTAX_POSIX_BASIC;
438	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
439	'fastmap' to an allocated space for the fastmap;
440	'fastmap_accurate' to zero;
441	're_nsub' to the number of subexpressions in PATTERN.
442
443	PATTERN is the address of the pattern string.
444
445	CFLAGS is a series of bits which affect compilation.
446
447	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448	use POSIX basic syntax.
449
450	If REG_NEWLINE is set, then . and [^...] don't match newline.
451	Also, regexec will try a match beginning after every newline.
452
453	If REG_ICASE is set, then we considers upper- and lowercase
454	versions of letters to be equivalent when matching.
455
456	If REG_NOSUB is set, then when PREG is passed to regexec, that
457	routine will report only success or failure, and nothing about the
458	registers.
459
460	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461	the return codes and their meanings.) */
462
463	int
464	regcomp (regex_t __restrict preg, const char __restrict pattern, int cflags)
465	{
466	reg_errcode_t ret;
467	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
468	: RE_SYNTAX_POSIX_BASIC);
469
470	preg->buffer = NULL;
471	preg->allocated = 0;
472	preg->used = 0;
473
474	/* Try to allocate space for the fastmap. */
475	preg->fastmap = re_malloc (char, SBC_MAX);
476	if (__glibc_unlikely (preg->fastmap == NULL))
477	return REG_ESPACE;
478
479	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
480
481	/* If REG_NEWLINE is set, newlines are treated differently. */
482	if (cflags & REG_NEWLINE)
483	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
484	syntax &= ~RE_DOT_NEWLINE;
485	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
486	/* It also changes the matching behavior. */
487	preg->newline_anchor = 1;
488	}
489	else
490	preg->newline_anchor = 0;
491	preg->no_sub = !!(cflags & REG_NOSUB);
492	preg->translate = NULL;
493
494	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
495
496	/* POSIX doesn't distinguish between an unmatched open-group and an
497	unmatched close-group: both are REG_EPAREN. */
498	if (ret == REG_ERPAREN)
499	ret = REG_EPAREN;
500
501	/* We have already checked preg->fastmap != NULL. */
502	if (__glibc_likely (ret == REG_NOERROR))
503	/* Compute the fastmap now, since regexec cannot modify the pattern
504	buffer. This function never fails in this implementation. */
505	(void) re_compile_fastmap (preg);
506	else
507	{
508	/* Some error occurred while compiling the expression. */
509	re_free (preg->fastmap);
510	preg->fastmap = NULL;
511	}
512
513	return (int) ret;
514	}
515	libc_hidden_def (__regcomp)
516	weak_alias (__regcomp, regcomp)
517
518	/* Returns a message corresponding to an error code, ERRCODE, returned
519	from either regcomp or regexec. We don't use PREG here. */
520
521	size_t
522	regerror (int errcode, const regex_t __restrict preg, char __restrict errbuf,
523	size_t errbuf_size)
524	{
525	const char *msg;
526	size_t msg_size;
527	int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
528
529	if (__glibc_unlikely (errcode < 0 \|\| errcode >= nerrcodes))
530	/* Only error codes returned by the rest of the code should be passed
531	to this routine. If we are given anything else, or if other regex
532	code generates an invalid error code, then the program has a bug.
533	Dump core so we can fix it. */
534	abort ();
535
536	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
537
538	msg_size = strlen (msg) + 1; /* Includes the null. */
539
540	if (__glibc_likely (errbuf_size != 0))
541	{
542	size_t cpy_size = msg_size;
543	if (__glibc_unlikely (msg_size > errbuf_size))
544	{
545	cpy_size = errbuf_size - 1;
546	errbuf[cpy_size] = '\0';
547	}
548	memcpy (errbuf, msg, cpy_size);
549	}
550
551	return msg_size;
552	}
553	weak_alias (__regerror, regerror)
554
555
556	#ifdef RE_ENABLE_I18N
557	/* This static array is used for the map to single-byte characters when
558	UTF-8 is used. Otherwise we would allocate memory just to initialize
559	it the same all the time. UTF-8 is the preferred encoding so this is
560	a worthwhile optimization. */
561	static const bitset_t utf8_sb_map =
562	{
563	/* Set the first 128 bits. */
564	# if (defined __GNUC__ \|\| __clang_major__ >= 4) && !defined __STRICT_ANSI__
565	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
566	# else
567	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
568	# error "bitset_word_t is narrower than 32 bits"
569	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
570	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
571	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
572	BITSET_WORD_MAX, BITSET_WORD_MAX,
573	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
574	BITSET_WORD_MAX,
575	# endif
576	(BITSET_WORD_MAX
577	>> (SBC_MAX % BITSET_WORD_BITS == 0
578	? 0
579	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
580	# endif
581	};
582	#endif
583
584
585	static void
586	free_dfa_content (re_dfa_t *dfa)
587	{
588	Idx i, j;
589
590	if (dfa->nodes)
591	for (i = 0; i < dfa->nodes_len; ++i)
592	free_token (dfa->nodes + i);
593	re_free (dfa->nexts);
594	for (i = 0; i < dfa->nodes_len; ++i)
595	{
596	if (dfa->eclosures != NULL)
597	re_node_set_free (dfa->eclosures + i);
598	if (dfa->inveclosures != NULL)
599	re_node_set_free (dfa->inveclosures + i);
600	if (dfa->edests != NULL)
601	re_node_set_free (dfa->edests + i);
602	}
603	re_free (dfa->edests);
604	re_free (dfa->eclosures);
605	re_free (dfa->inveclosures);
606	re_free (dfa->nodes);
607
608	if (dfa->state_table)
609	for (i = 0; i <= dfa->state_hash_mask; ++i)
610	{
611	struct re_state_table_entry *entry = dfa->state_table + i;
612	for (j = 0; j < entry->num; ++j)
613	{
614	re_dfastate_t *state = entry->array[j];
615	free_state (state);
616	}
617	re_free (entry->array);
618	}
619	re_free (dfa->state_table);
620	#ifdef RE_ENABLE_I18N
621	if (dfa->sb_char != utf8_sb_map)
622	re_free (dfa->sb_char);
623	#endif
624	re_free (dfa->subexp_map);
625	#ifdef DEBUG
626	re_free (dfa->re_str);
627	#endif
628
629	re_free (dfa);
630	}
631
632
633	/* Free dynamically allocated space used by PREG. */
634
635	void
636	regfree (regex_t *preg)
637	{
638	re_dfa_t *dfa = preg->buffer;
639	if (__glibc_likely (dfa != NULL))
640	{
641	lock_fini (dfa->lock);
642	free_dfa_content (dfa);
643	}
644	preg->buffer = NULL;
645	preg->allocated = 0;
646
647	re_free (preg->fastmap);
648	preg->fastmap = NULL;
649
650	re_free (preg->translate);
651	preg->translate = NULL;
652	}
653	libc_hidden_def (__regfree)
654	weak_alias (__regfree, regfree)
655
656
657	/* Entry points compatible with 4.2 BSD regex library. We don't define
658	them unless specifically requested. */
659
660	#if defined _REGEX_RE_COMP \|\| defined _LIBC
661
662	/* BSD has one and only one pattern buffer. */
663	static struct re_pattern_buffer re_comp_buf;
664
665	char *
666	# ifdef _LIBC
667	/* Make these definitions weak in libc, so POSIX programs can redefine
668	these names if they don't use our functions, and still use
669	regcomp/regexec above without link errors. */
670	weak_function
671	# endif
672	re_comp (const char *s)
673	{
674	reg_errcode_t ret;
675	char *fastmap;
676
677	if (!s)
678	{
679	if (!re_comp_buf.buffer)
680	return gettext ("No previous regular expression");
681	return 0;
682	}
683
684	if (re_comp_buf.buffer)
685	{
686	fastmap = re_comp_buf.fastmap;
687	re_comp_buf.fastmap = NULL;
688	__regfree (&re_comp_buf);
689	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690	re_comp_buf.fastmap = fastmap;
691	}
692
693	if (re_comp_buf.fastmap == NULL)
694	{
695	re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
696	if (re_comp_buf.fastmap == NULL)
697	return (char *) gettext (__re_error_msgid
698	+ __re_error_msgid_idx[(int) REG_ESPACE]);
699	}
700
701	/* Since 're_exec' always passes NULL for the 'regs' argument, we
702	don't need to initialize the pattern buffer fields which affect it. */
703
704	/* Match anchors at newlines. */
705	re_comp_buf.newline_anchor = 1;
706
707	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709	if (!ret)
710	return NULL;
711
712	/* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
713	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714	}
715
716	#ifdef _LIBC
717	libc_freeres_fn (free_mem)
718	{
719	__regfree (&re_comp_buf);
720	}
721	#endif
722
723	#endif /* _REGEX_RE_COMP */
724
725
726	/* Internal entry point.
727	Compile the regular expression PATTERN, whose length is LENGTH.
728	SYNTAX indicate regular expression's syntax. */
729
730	static reg_errcode_t
731	re_compile_internal (regex_t preg, const char pattern, size_t length,
732	reg_syntax_t syntax)
733	{
734	reg_errcode_t err = REG_NOERROR;
735	re_dfa_t *dfa;
736	re_string_t regexp;
737
738	/* Initialize the pattern buffer. */
739	preg->fastmap_accurate = 0;
740	preg->syntax = syntax;
741	preg->not_bol = preg->not_eol = 0;
742	preg->used = 0;
743	preg->re_nsub = 0;
744	preg->can_be_null = 0;
745	preg->regs_allocated = REGS_UNALLOCATED;
746
747	/* Initialize the dfa. */
748	dfa = preg->buffer;
749	if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
750	{
751	/* If zero allocated, but buffer is non-null, try to realloc
752	enough space. This loses if buffer's address is bogus, but
753	that is the user's responsibility. If ->buffer is NULL this
754	is a simple allocation. */
755	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756	if (dfa == NULL)
757	return REG_ESPACE;
758	preg->allocated = sizeof (re_dfa_t);
759	preg->buffer = dfa;
760	}
761	preg->used = sizeof (re_dfa_t);
762
763	err = init_dfa (dfa, length);
764	if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
765	err = REG_ESPACE;
766	if (__glibc_unlikely (err != REG_NOERROR))
767	{
768	free_dfa_content (dfa);
769	preg->buffer = NULL;
770	preg->allocated = 0;
771	return err;
772	}
773	#ifdef DEBUG
774	/* Note: length+1 will not overflow since it is checked in init_dfa. */
775	dfa->re_str = re_malloc (char, length + 1);
776	strncpy (dfa->re_str, pattern, length + 1);
777	#endif
778
779	err = re_string_construct (&regexp, pattern, length, preg->translate,
780	(syntax & RE_ICASE) != 0, dfa);
781	if (__glibc_unlikely (err != REG_NOERROR))
782	{
783	re_compile_internal_free_return:
784	free_workarea_compile (preg);
785	re_string_destruct (&regexp);
786	lock_fini (dfa->lock);
787	free_dfa_content (dfa);
788	preg->buffer = NULL;
789	preg->allocated = 0;
790	return err;
791	}
792
793	/* Parse the regular expression, and build a structure tree. */
794	preg->re_nsub = 0;
795	dfa->str_tree = parse (&regexp, preg, syntax, &err);
796	if (__glibc_unlikely (dfa->str_tree == NULL))
797	goto re_compile_internal_free_return;
798
799	/* Analyze the tree and create the nfa. */
800	err = analyze (preg);
801	if (__glibc_unlikely (err != REG_NOERROR))
802	goto re_compile_internal_free_return;
803
804	#ifdef RE_ENABLE_I18N
805	/* If possible, do searching in single byte encoding to speed things up. */
806	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
807	optimize_utf8 (dfa);
808	#endif
809
810	/* Then create the initial state of the dfa. */
811	err = create_initial_state (dfa);
812
813	/* Release work areas. */
814	free_workarea_compile (preg);
815	re_string_destruct (&regexp);
816
817	if (__glibc_unlikely (err != REG_NOERROR))
818	{
819	lock_fini (dfa->lock);
820	free_dfa_content (dfa);
821	preg->buffer = NULL;
822	preg->allocated = 0;
823	}
824
825	return err;
826	}
827
828	/* Initialize DFA. We use the length of the regular expression PAT_LEN
829	as the initial length of some arrays. */
830
831	static reg_errcode_t
832	init_dfa (re_dfa_t *dfa, size_t pat_len)
833	{
834	__re_size_t table_size;
835	#ifndef _LIBC
836	const char *codeset_name;
837	#endif
838	#ifdef RE_ENABLE_I18N
839	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
840	#else
841	size_t max_i18n_object_size = 0;
842	#endif
843	size_t max_object_size =
844	MAX (sizeof (struct re_state_table_entry),
845	MAX (sizeof (re_token_t),
846	MAX (sizeof (re_node_set),
847	MAX (sizeof (regmatch_t),
848	max_i18n_object_size))));
849
850	memset (dfa, '\0', sizeof (re_dfa_t));
851
852	/* Force allocation of str_tree_storage the first time. */
853	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
856	calculation below, and for similar doubling calculations
857	elsewhere. And it's <= rather than <, because some of the
858	doubling calculations add 1 afterwards. */
859	if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
860	<= pat_len))
861	return REG_ESPACE;
862
863	dfa->nodes_alloc = pat_len + 1;
864	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
865
866	/* table_size = 2 ^ ceil(log pat_len) */
867	for (table_size = 1; ; table_size <<= 1)
868	if (table_size > pat_len)
869	break;
870
871	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872	dfa->state_hash_mask = table_size - 1;
873
874	dfa->mb_cur_max = MB_CUR_MAX;
875	#ifdef _LIBC
876	if (dfa->mb_cur_max == 6
877	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878	dfa->is_utf8 = 1;
879	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880	!= 0);
881	#else
882	codeset_name = nl_langinfo (CODESET);
883	if ((codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
884	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
885	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
886	&& strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
887	dfa->is_utf8 = 1;
888
889	/* We check exhaustively in the loop below if this charset is a
890	superset of ASCII. */
891	dfa->map_notascii = 0;
892	#endif
893
894	#ifdef RE_ENABLE_I18N
895	if (dfa->mb_cur_max > 1)
896	{
897	if (dfa->is_utf8)
898	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
899	else
900	{
901	int i, j, ch;
902
903	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
904	if (__glibc_unlikely (dfa->sb_char == NULL))
905	return REG_ESPACE;
906
907	/* Set the bits corresponding to single byte chars. */
908	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
909	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
910	{
911	wint_t wch = __btowc (ch);
912	if (wch != WEOF)
913	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
914	# ifndef _LIBC
915	if (isascii (ch) && wch != ch)
916	dfa->map_notascii = 1;
917	# endif
918	}
919	}
920	}
921	#endif
922
923	if (__glibc_unlikely (dfa->nodes == NULL \|\| dfa->state_table == NULL))
924	return REG_ESPACE;
925	return REG_NOERROR;
926	}
927
928	/* Initialize WORD_CHAR table, which indicate which character is
929	"word". In this case "word" means that it is the word construction
930	character used by some operators like "\<", "\>", etc. */
931
932	static void
933	init_word_char (re_dfa_t *dfa)
934	{
935	int i = 0;
936	int j;
937	int ch = 0;
938	dfa->word_ops_used = 1;
939	if (__glibc_likely (dfa->map_notascii == 0))
940	{
941	/* Avoid uint32_t and uint64_t as some non-GCC platforms lack
942	them, an issue when this code is used in Gnulib. */
943	bitset_word_t bits0 = 0x00000000;
944	bitset_word_t bits1 = 0x03ff0000;
945	bitset_word_t bits2 = 0x87fffffe;
946	bitset_word_t bits3 = 0x07fffffe;
947	if (BITSET_WORD_BITS == 64)
948	{
949	/* Pacify gcc -Woverflow on 32-bit platformns. */
950	dfa->word_char[0] = bits1 << 31 << 1 \| bits0;
951	dfa->word_char[1] = bits3 << 31 << 1 \| bits2;
952	i = 2;
953	}
954	else if (BITSET_WORD_BITS == 32)
955	{
956	dfa->word_char[0] = bits0;
957	dfa->word_char[1] = bits1;
958	dfa->word_char[2] = bits2;
959	dfa->word_char[3] = bits3;
960	i = 4;
961	}
962	else
963	goto general_case;
964	ch = 128;
965
966	if (__glibc_likely (dfa->is_utf8))
967	{
968	memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
969	return;
970	}
971	}
972
973	general_case:
974	for (; i < BITSET_WORDS; ++i)
975	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
976	if (isalnum (ch) \|\| ch == '_')
977	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
978	}
979
980	/* Free the work area which are only used while compiling. */
981
982	static void
983	free_workarea_compile (regex_t *preg)
984	{
985	re_dfa_t *dfa = preg->buffer;
986	bin_tree_storage_t storage, next;
987	for (storage = dfa->str_tree_storage; storage; storage = next)
988	{
989	next = storage->next;
990	re_free (storage);
991	}
992	dfa->str_tree_storage = NULL;
993	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
994	dfa->str_tree = NULL;
995	re_free (dfa->org_indices);
996	dfa->org_indices = NULL;
997	}
998
999	/* Create initial states for all contexts. */
1000
1001	static reg_errcode_t
1002	create_initial_state (re_dfa_t *dfa)
1003	{
1004	Idx first, i;
1005	reg_errcode_t err;
1006	re_node_set init_nodes;
1007
1008	/* Initial states have the epsilon closure of the node which is
1009	the first node of the regular expression. */
1010	first = dfa->str_tree->first->node_idx;
1011	dfa->init_node = first;
1012	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013	if (__glibc_unlikely (err != REG_NOERROR))
1014	return err;
1015
1016	/* The back-references which are in initial states can epsilon transit,
1017	since in this case all of the subexpressions can be null.
1018	Then we add epsilon closures of the nodes which are the next nodes of
1019	the back-references. */
1020	if (dfa->nbackref > 0)
1021	for (i = 0; i < init_nodes.nelem; ++i)
1022	{
1023	Idx node_idx = init_nodes.elems[i];
1024	re_token_type_t type = dfa->nodes[node_idx].type;
1025
1026	Idx clexp_idx;
1027	if (type != OP_BACK_REF)
1028	continue;
1029	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1030	{
1031	re_token_t *clexp_node;
1032	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033	if (clexp_node->type == OP_CLOSE_SUBEXP
1034	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035	break;
1036	}
1037	if (clexp_idx == init_nodes.nelem)
1038	continue;
1039
1040	if (type == OP_BACK_REF)
1041	{
1042	Idx dest_idx = dfa->edests[node_idx].elems[0];
1043	if (!re_node_set_contains (&init_nodes, dest_idx))
1044	{
1045	reg_errcode_t merge_err
1046	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1047	if (merge_err != REG_NOERROR)
1048	return merge_err;
1049	i = 0;
1050	}
1051	}
1052	}
1053
1054	/* It must be the first time to invoke acquire_state. */
1055	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1056	/* We don't check ERR here, since the initial state must not be NULL. */
1057	if (__glibc_unlikely (dfa->init_state == NULL))
1058	return err;
1059	if (dfa->init_state->has_constraint)
1060	{
1061	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1062	CONTEXT_WORD);
1063	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1064	CONTEXT_NEWLINE);
1065	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1066	&init_nodes,
1067	CONTEXT_NEWLINE
1068	\| CONTEXT_BEGBUF);
1069	if (__glibc_unlikely (dfa->init_state_word == NULL
1070	\|\| dfa->init_state_nl == NULL
1071	\|\| dfa->init_state_begbuf == NULL))
1072	return err;
1073	}
1074	else
1075	dfa->init_state_word = dfa->init_state_nl
1076	= dfa->init_state_begbuf = dfa->init_state;
1077
1078	re_node_set_free (&init_nodes);
1079	return REG_NOERROR;
1080	}
1081
1082
1083	#ifdef RE_ENABLE_I18N
1084	/* If it is possible to do searching in single byte encoding instead of UTF-8
1085	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086	DFA nodes where needed. */
1087
1088	static void
1089	optimize_utf8 (re_dfa_t *dfa)
1090	{
1091	Idx node;
1092	int i;
1093	bool mb_chars = false;
1094	bool has_period = false;
1095
1096	for (node = 0; node < dfa->nodes_len; ++node)
1097	switch (dfa->nodes[node].type)
1098	{
1099	case CHARACTER:
1100	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1101	mb_chars = true;
1102	break;
1103	case ANCHOR:
1104	switch (dfa->nodes[node].opr.ctx_type)
1105	{
1106	case LINE_FIRST:
1107	case LINE_LAST:
1108	case BUF_FIRST:
1109	case BUF_LAST:
1110	break;
1111	default:
1112	/* Word anchors etc. cannot be handled. It's okay to test
1113	opr.ctx_type since constraints (for all DFA nodes) are
1114	created by ORing one or more opr.ctx_type values. */
1115	return;
1116	}
1117	break;
1118	case OP_PERIOD:
1119	has_period = true;
1120	break;
1121	case OP_BACK_REF:
1122	case OP_ALT:
1123	case END_OF_RE:
1124	case OP_DUP_ASTERISK:
1125	case OP_OPEN_SUBEXP:
1126	case OP_CLOSE_SUBEXP:
1127	break;
1128	case COMPLEX_BRACKET:
1129	return;
1130	case SIMPLE_BRACKET:
1131	/* Just double check. */
1132	{
1133	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1134	? 0
1135	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1136	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1137	{
1138	if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1139	return;
1140	rshift = 0;
1141	}
1142	}
1143	break;
1144	default:
1145	abort ();
1146	}
1147
1148	if (mb_chars \|\| has_period)
1149	for (node = 0; node < dfa->nodes_len; ++node)
1150	{
1151	if (dfa->nodes[node].type == CHARACTER
1152	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1153	dfa->nodes[node].mb_partial = 0;
1154	else if (dfa->nodes[node].type == OP_PERIOD)
1155	dfa->nodes[node].type = OP_UTF8_PERIOD;
1156	}
1157
1158	/* The search can be in single byte locale. */
1159	dfa->mb_cur_max = 1;
1160	dfa->is_utf8 = 0;
1161	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
1162	}
1163	#endif
1164
1165
1166	/* Analyze the structure tree, and calculate "first", "next", "edest",
1167	"eclosure", and "inveclosure". */
1168
1169	static reg_errcode_t
1170	analyze (regex_t *preg)
1171	{
1172	re_dfa_t *dfa = preg->buffer;
1173	reg_errcode_t ret;
1174
1175	/* Allocate arrays. */
1176	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1177	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1178	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1179	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1180	if (__glibc_unlikely (dfa->nexts == NULL \|\| dfa->org_indices == NULL
1181	\|\| dfa->edests == NULL \|\| dfa->eclosures == NULL))
1182	return REG_ESPACE;
1183
1184	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1185	if (dfa->subexp_map != NULL)
1186	{
1187	Idx i;
1188	for (i = 0; i < preg->re_nsub; i++)
1189	dfa->subexp_map[i] = i;
1190	preorder (dfa->str_tree, optimize_subexps, dfa);
1191	for (i = 0; i < preg->re_nsub; i++)
1192	if (dfa->subexp_map[i] != i)
1193	break;
1194	if (i == preg->re_nsub)
1195	{
1196	re_free (dfa->subexp_map);
1197	dfa->subexp_map = NULL;
1198	}
1199	}
1200
1201	ret = postorder (dfa->str_tree, lower_subexps, preg);
1202	if (__glibc_unlikely (ret != REG_NOERROR))
1203	return ret;
1204	ret = postorder (dfa->str_tree, calc_first, dfa);
1205	if (__glibc_unlikely (ret != REG_NOERROR))
1206	return ret;
1207	preorder (dfa->str_tree, calc_next, dfa);
1208	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1209	if (__glibc_unlikely (ret != REG_NOERROR))
1210	return ret;
1211	ret = calc_eclosure (dfa);
1212	if (__glibc_unlikely (ret != REG_NOERROR))
1213	return ret;
1214
1215	/* We only need this during the prune_impossible_nodes pass in regexec.c;
1216	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1217	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1218	\|\| dfa->nbackref)
1219	{
1220	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1221	if (__glibc_unlikely (dfa->inveclosures == NULL))
1222	return REG_ESPACE;
1223	ret = calc_inveclosure (dfa);
1224	}
1225
1226	return ret;
1227	}
1228
1229	/* Our parse trees are very unbalanced, so we cannot use a stack to
1230	implement parse tree visits. Instead, we use parent pointers and
1231	some hairy code in these two functions. */
1232	static reg_errcode_t
1233	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1234	void *extra)
1235	{
1236	bin_tree_t node, prev;
1237
1238	for (node = root; ; )
1239	{
1240	/* Descend down the tree, preferably to the left (or to the right
1241	if that's the only child). */
1242	while (node->left \|\| node->right)
1243	if (node->left)
1244	node = node->left;
1245	else
1246	node = node->right;
1247
1248	do
1249	{
1250	reg_errcode_t err = fn (extra, node);
1251	if (__glibc_unlikely (err != REG_NOERROR))
1252	return err;
1253	if (node->parent == NULL)
1254	return REG_NOERROR;
1255	prev = node;
1256	node = node->parent;
1257	}
1258	/* Go up while we have a node that is reached from the right. */
1259	while (node->right == prev \|\| node->right == NULL);
1260	node = node->right;
1261	}
1262	}
1263
1264	static reg_errcode_t
1265	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1266	void *extra)
1267	{
1268	bin_tree_t *node;
1269
1270	for (node = root; ; )
1271	{
1272	reg_errcode_t err = fn (extra, node);
1273	if (__glibc_unlikely (err != REG_NOERROR))
1274	return err;
1275
1276	/* Go to the left node, or up and to the right. */
1277	if (node->left)
1278	node = node->left;
1279	else
1280	{
1281	bin_tree_t *prev = NULL;
1282	while (node->right == prev \|\| node->right == NULL)
1283	{
1284	prev = node;
1285	node = node->parent;
1286	if (!node)
1287	return REG_NOERROR;
1288	}
1289	node = node->right;
1290	}
1291	}
1292	}
1293
1294	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1296	backreferences as well. Requires a preorder visit. */
1297	static reg_errcode_t
1298	optimize_subexps (void extra, bin_tree_t node)
1299	{
1300	re_dfa_t dfa = (re_dfa_t ) extra;
1301
1302	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1303	{
1304	int idx = node->token.opr.idx;
1305	node->token.opr.idx = dfa->subexp_map[idx];
1306	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
1307	}
1308
1309	else if (node->token.type == SUBEXP
1310	&& node->left && node->left->token.type == SUBEXP)
1311	{
1312	Idx other_idx = node->left->token.opr.idx;
1313
1314	node->left = node->left->left;
1315	if (node->left)
1316	node->left->parent = node;
1317
1318	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1319	if (other_idx < BITSET_WORD_BITS)
1320	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1321	}
1322
1323	return REG_NOERROR;
1324	}
1325
1326	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1328	static reg_errcode_t
1329	lower_subexps (void extra, bin_tree_t node)
1330	{
1331	regex_t preg = (regex_t ) extra;
1332	reg_errcode_t err = REG_NOERROR;
1333
1334	if (node->left && node->left->token.type == SUBEXP)
1335	{
1336	node->left = lower_subexp (&err, preg, node->left);
1337	if (node->left)
1338	node->left->parent = node;
1339	}
1340	if (node->right && node->right->token.type == SUBEXP)
1341	{
1342	node->right = lower_subexp (&err, preg, node->right);
1343	if (node->right)
1344	node->right->parent = node;
1345	}
1346
1347	return err;
1348	}
1349
1350	static bin_tree_t *
1351	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1352	{
1353	re_dfa_t *dfa = preg->buffer;
1354	bin_tree_t *body = node->left;
1355	bin_tree_t op, cls, tree1, tree;
1356
1357	if (preg->no_sub
1358	/* We do not optimize empty subexpressions, because otherwise we may
1359	have bad CONCAT nodes with NULL children. This is obviously not
1360	very common, so we do not lose much. An example that triggers
1361	this case is the sed "script" //x. */
1362	&& node->left != NULL
1363	&& (node->token.opr.idx >= BITSET_WORD_BITS
1364	\|\| !(dfa->used_bkref_map
1365	& ((bitset_word_t) 1 << node->token.opr.idx))))
1366	return node->left;
1367
1368	/* Convert the SUBEXP node to the concatenation of an
1369	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1370	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1371	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1372	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1373	tree = create_tree (dfa, op, tree1, CONCAT);
1374	if (__glibc_unlikely (tree == NULL \|\| tree1 == NULL
1375	\|\| op == NULL \|\| cls == NULL))
1376	{
1377	*err = REG_ESPACE;
1378	return NULL;
1379	}
1380
1381	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1382	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1383	return tree;
1384	}
1385
1386	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1387	nodes. Requires a postorder visit. */
1388	static reg_errcode_t
1389	calc_first (void extra, bin_tree_t node)
1390	{
1391	re_dfa_t dfa = (re_dfa_t ) extra;
1392	if (node->token.type == CONCAT)
1393	{
1394	node->first = node->left->first;
1395	node->node_idx = node->left->node_idx;
1396	}
1397	else
1398	{
1399	node->first = node;
1400	node->node_idx = re_dfa_add_node (dfa, node->token);
1401	if (__glibc_unlikely (node->node_idx == -1))
1402	return REG_ESPACE;
1403	if (node->token.type == ANCHOR)
1404	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1405	}
1406	return REG_NOERROR;
1407	}
1408
1409	/* Pass 2: compute NEXT on the tree. Preorder visit. */
1410	static reg_errcode_t
1411	calc_next (void extra, bin_tree_t node)
1412	{
1413	switch (node->token.type)
1414	{
1415	case OP_DUP_ASTERISK:
1416	node->left->next = node;
1417	break;
1418	case CONCAT:
1419	node->left->next = node->right->first;
1420	node->right->next = node->next;
1421	break;
1422	default:
1423	if (node->left)
1424	node->left->next = node->next;
1425	if (node->right)
1426	node->right->next = node->next;
1427	break;
1428	}
1429	return REG_NOERROR;
1430	}
1431
1432	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1433	static reg_errcode_t
1434	link_nfa_nodes (void extra, bin_tree_t node)
1435	{
1436	re_dfa_t dfa = (re_dfa_t ) extra;
1437	Idx idx = node->node_idx;
1438	reg_errcode_t err = REG_NOERROR;
1439
1440	switch (node->token.type)
1441	{
1442	case CONCAT:
1443	break;
1444
1445	case END_OF_RE:
1446	DEBUG_ASSERT (node->next == NULL);
1447	break;
1448
1449	case OP_DUP_ASTERISK:
1450	case OP_ALT:
1451	{
1452	Idx left, right;
1453	dfa->has_plural_match = 1;
1454	if (node->left != NULL)
1455	left = node->left->first->node_idx;
1456	else
1457	left = node->next->node_idx;
1458	if (node->right != NULL)
1459	right = node->right->first->node_idx;
1460	else
1461	right = node->next->node_idx;
1462	DEBUG_ASSERT (left > -1);
1463	DEBUG_ASSERT (right > -1);
1464	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1465	}
1466	break;
1467
1468	case ANCHOR:
1469	case OP_OPEN_SUBEXP:
1470	case OP_CLOSE_SUBEXP:
1471	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1472	break;
1473
1474	case OP_BACK_REF:
1475	dfa->nexts[idx] = node->next->node_idx;
1476	if (node->token.type == OP_BACK_REF)
1477	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1478	break;
1479
1480	default:
1481	DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1482	dfa->nexts[idx] = node->next->node_idx;
1483	break;
1484	}
1485
1486	return err;
1487	}
1488
1489	/* Duplicate the epsilon closure of the node ROOT_NODE.
1490	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1491	to their own constraint. */
1492
1493	static reg_errcode_t
1494	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1495	Idx root_node, unsigned int init_constraint)
1496	{
1497	Idx org_node, clone_node;
1498	bool ok;
1499	unsigned int constraint = init_constraint;
1500	for (org_node = top_org_node, clone_node = top_clone_node;;)
1501	{
1502	Idx org_dest, clone_dest;
1503	if (dfa->nodes[org_node].type == OP_BACK_REF)
1504	{
1505	/* If the back reference epsilon-transit, its destination must
1506	also have the constraint. Then duplicate the epsilon closure
1507	of the destination of the back reference, and store it in
1508	edests of the back reference. */
1509	org_dest = dfa->nexts[org_node];
1510	re_node_set_empty (dfa->edests + clone_node);
1511	clone_dest = duplicate_node (dfa, org_dest, constraint);
1512	if (__glibc_unlikely (clone_dest == -1))
1513	return REG_ESPACE;
1514	dfa->nexts[clone_node] = dfa->nexts[org_node];
1515	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516	if (__glibc_unlikely (! ok))
1517	return REG_ESPACE;
1518	}
1519	else if (dfa->edests[org_node].nelem == 0)
1520	{
1521	/* In case of the node can't epsilon-transit, don't duplicate the
1522	destination and store the original destination as the
1523	destination of the node. */
1524	dfa->nexts[clone_node] = dfa->nexts[org_node];
1525	break;
1526	}
1527	else if (dfa->edests[org_node].nelem == 1)
1528	{
1529	/* In case of the node can epsilon-transit, and it has only one
1530	destination. */
1531	org_dest = dfa->edests[org_node].elems[0];
1532	re_node_set_empty (dfa->edests + clone_node);
1533	/* If the node is root_node itself, it means the epsilon closure
1534	has a loop. Then tie it to the destination of the root_node. */
1535	if (org_node == root_node && clone_node != org_node)
1536	{
1537	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1538	if (__glibc_unlikely (! ok))
1539	return REG_ESPACE;
1540	break;
1541	}
1542	/* In case the node has another constraint, append it. */
1543	constraint \|= dfa->nodes[org_node].constraint;
1544	clone_dest = duplicate_node (dfa, org_dest, constraint);
1545	if (__glibc_unlikely (clone_dest == -1))
1546	return REG_ESPACE;
1547	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548	if (__glibc_unlikely (! ok))
1549	return REG_ESPACE;
1550	}
1551	else /* dfa->edests[org_node].nelem == 2 */
1552	{
1553	/* In case of the node can epsilon-transit, and it has two
1554	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1555	org_dest = dfa->edests[org_node].elems[0];
1556	re_node_set_empty (dfa->edests + clone_node);
1557	/* Search for a duplicated node which satisfies the constraint. */
1558	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1559	if (clone_dest == -1)
1560	{
1561	/* There is no such duplicated node, create a new one. */
1562	reg_errcode_t err;
1563	clone_dest = duplicate_node (dfa, org_dest, constraint);
1564	if (__glibc_unlikely (clone_dest == -1))
1565	return REG_ESPACE;
1566	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567	if (__glibc_unlikely (! ok))
1568	return REG_ESPACE;
1569	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1570	root_node, constraint);
1571	if (__glibc_unlikely (err != REG_NOERROR))
1572	return err;
1573	}
1574	else
1575	{
1576	/* There is a duplicated node which satisfies the constraint,
1577	use it to avoid infinite loop. */
1578	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579	if (__glibc_unlikely (! ok))
1580	return REG_ESPACE;
1581	}
1582
1583	org_dest = dfa->edests[org_node].elems[1];
1584	clone_dest = duplicate_node (dfa, org_dest, constraint);
1585	if (__glibc_unlikely (clone_dest == -1))
1586	return REG_ESPACE;
1587	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1588	if (__glibc_unlikely (! ok))
1589	return REG_ESPACE;
1590	}
1591	org_node = org_dest;
1592	clone_node = clone_dest;
1593	}
1594	return REG_NOERROR;
1595	}
1596
1597	/* Search for a node which is duplicated from the node ORG_NODE, and
1598	satisfies the constraint CONSTRAINT. */
1599
1600	static Idx
1601	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1602	unsigned int constraint)
1603	{
1604	Idx idx;
1605	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1606	{
1607	if (org_node == dfa->org_indices[idx]
1608	&& constraint == dfa->nodes[idx].constraint)
1609	return idx; /* Found. */
1610	}
1611	return -1; /* Not found. */
1612	}
1613
1614	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615	Return the index of the new node, or -1 if insufficient storage is
1616	available. */
1617
1618	static Idx
1619	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1620	{
1621	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1622	if (__glibc_likely (dup_idx != -1))
1623	{
1624	dfa->nodes[dup_idx].constraint = constraint;
1625	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
1626	dfa->nodes[dup_idx].duplicated = 1;
1627
1628	/* Store the index of the original node. */
1629	dfa->org_indices[dup_idx] = org_idx;
1630	}
1631	return dup_idx;
1632	}
1633
1634	static reg_errcode_t
1635	calc_inveclosure (re_dfa_t *dfa)
1636	{
1637	Idx src, idx;
1638	bool ok;
1639	for (idx = 0; idx < dfa->nodes_len; ++idx)
1640	re_node_set_init_empty (dfa->inveclosures + idx);
1641
1642	for (src = 0; src < dfa->nodes_len; ++src)
1643	{
1644	Idx *elems = dfa->eclosures[src].elems;
1645	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1646	{
1647	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1648	if (__glibc_unlikely (! ok))
1649	return REG_ESPACE;
1650	}
1651	}
1652
1653	return REG_NOERROR;
1654	}
1655
1656	/* Calculate "eclosure" for all the node in DFA. */
1657
1658	static reg_errcode_t
1659	calc_eclosure (re_dfa_t *dfa)
1660	{
1661	Idx node_idx;
1662	bool incomplete;
1663	DEBUG_ASSERT (dfa->nodes_len > 0);
1664	incomplete = false;
1665	/* For each nodes, calculate epsilon closure. */
1666	for (node_idx = 0; ; ++node_idx)
1667	{
1668	reg_errcode_t err;
1669	re_node_set eclosure_elem;
1670	if (node_idx == dfa->nodes_len)
1671	{
1672	if (!incomplete)
1673	break;
1674	incomplete = false;
1675	node_idx = 0;
1676	}
1677
1678	DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1679
1680	/* If we have already calculated, skip it. */
1681	if (dfa->eclosures[node_idx].nelem != 0)
1682	continue;
1683	/* Calculate epsilon closure of 'node_idx'. */
1684	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1685	if (__glibc_unlikely (err != REG_NOERROR))
1686	return err;
1687
1688	if (dfa->eclosures[node_idx].nelem == 0)
1689	{
1690	incomplete = true;
1691	re_node_set_free (&eclosure_elem);
1692	}
1693	}
1694	return REG_NOERROR;
1695	}
1696
1697	/* Calculate epsilon closure of NODE. */
1698
1699	static reg_errcode_t
1700	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1701	{
1702	reg_errcode_t err;
1703	Idx i;
1704	re_node_set eclosure;
1705	bool incomplete = false;
1706	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1707	if (__glibc_unlikely (err != REG_NOERROR))
1708	return err;
1709
1710	/* An epsilon closure includes itself. */
1711	eclosure.elems[eclosure.nelem++] = node;
1712
1713	/* This indicates that we are calculating this node now.
1714	We reference this value to avoid infinite loop. */
1715	dfa->eclosures[node].nelem = -1;
1716
1717	/* If the current node has constraints, duplicate all nodes
1718	since they must inherit the constraints. */
1719	if (dfa->nodes[node].constraint
1720	&& dfa->edests[node].nelem
1721	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1722	{
1723	err = duplicate_node_closure (dfa, node, node, node,
1724	dfa->nodes[node].constraint);
1725	if (__glibc_unlikely (err != REG_NOERROR))
1726	return err;
1727	}
1728
1729	/* Expand each epsilon destination nodes. */
1730	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1731	for (i = 0; i < dfa->edests[node].nelem; ++i)
1732	{
1733	re_node_set eclosure_elem;
1734	Idx edest = dfa->edests[node].elems[i];
1735	/* If calculating the epsilon closure of 'edest' is in progress,
1736	return intermediate result. */
1737	if (dfa->eclosures[edest].nelem == -1)
1738	{
1739	incomplete = true;
1740	continue;
1741	}
1742	/* If we haven't calculated the epsilon closure of 'edest' yet,
1743	calculate now. Otherwise use calculated epsilon closure. */
1744	if (dfa->eclosures[edest].nelem == 0)
1745	{
1746	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1747	if (__glibc_unlikely (err != REG_NOERROR))
1748	return err;
1749	}
1750	else
1751	eclosure_elem = dfa->eclosures[edest];
1752	/* Merge the epsilon closure of 'edest'. */
1753	err = re_node_set_merge (&eclosure, &eclosure_elem);
1754	if (__glibc_unlikely (err != REG_NOERROR))
1755	return err;
1756	/* If the epsilon closure of 'edest' is incomplete,
1757	the epsilon closure of this node is also incomplete. */
1758	if (dfa->eclosures[edest].nelem == 0)
1759	{
1760	incomplete = true;
1761	re_node_set_free (&eclosure_elem);
1762	}
1763	}
1764
1765	if (incomplete && !root)
1766	dfa->eclosures[node].nelem = 0;
1767	else
1768	dfa->eclosures[node] = eclosure;
1769	*new_set = eclosure;
1770	return REG_NOERROR;
1771	}
1772
1773
1774	/* Functions for token which are used in the parser. */
1775
1776	/* Fetch a token from INPUT.
1777	We must not use this function inside bracket expressions. */
1778
1779	static void
1780	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1781	{
1782	re_string_skip_bytes (input, peek_token (result, input, syntax));
1783	}
1784
1785	/* Peek a token from INPUT, and return the length of the token.
1786	We must not use this function inside bracket expressions. */
1787
1788	static int
1789	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1790	{
1791	unsigned char c;
1792
1793	if (re_string_eoi (input))
1794	{
1795	token->type = END_OF_RE;
1796	return 0;
1797	}
1798
1799	c = re_string_peek_byte (input, 0);
1800	token->opr.c = c;
1801
1802	token->word_char = 0;
1803	#ifdef RE_ENABLE_I18N
1804	token->mb_partial = 0;
1805	if (input->mb_cur_max > 1
1806	&& !re_string_first_byte (input, re_string_cur_idx (input)))
1807	{
1808	token->type = CHARACTER;
1809	token->mb_partial = 1;
1810	return 1;
1811	}
1812	#endif
1813	if (c == '\\')
1814	{
1815	unsigned char c2;
1816	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1817	{
1818	token->type = BACK_SLASH;
1819	return 1;
1820	}
1821
1822	c2 = re_string_peek_byte_case (input, 1);
1823	token->opr.c = c2;
1824	token->type = CHARACTER;
1825	#ifdef RE_ENABLE_I18N
1826	if (input->mb_cur_max > 1)
1827	{
1828	wint_t wc = re_string_wchar_at (input,
1829	re_string_cur_idx (input) + 1);
1830	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1831	}
1832	else
1833	#endif
1834	token->word_char = IS_WORD_CHAR (c2) != 0;
1835
1836	switch (c2)
1837	{
1838	case '\|':
1839	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1840	token->type = OP_ALT;
1841	break;
1842	case '1': case '2': case '3': case '4': case '5':
1843	case '6': case '7': case '8': case '9':
1844	if (!(syntax & RE_NO_BK_REFS))
1845	{
1846	token->type = OP_BACK_REF;
1847	token->opr.idx = c2 - '1';
1848	}
1849	break;
1850	case '<':
1851	if (!(syntax & RE_NO_GNU_OPS))
1852	{
1853	token->type = ANCHOR;
1854	token->opr.ctx_type = WORD_FIRST;
1855	}
1856	break;
1857	case '>':
1858	if (!(syntax & RE_NO_GNU_OPS))
1859	{
1860	token->type = ANCHOR;
1861	token->opr.ctx_type = WORD_LAST;
1862	}
1863	break;
1864	case 'b':
1865	if (!(syntax & RE_NO_GNU_OPS))
1866	{
1867	token->type = ANCHOR;
1868	token->opr.ctx_type = WORD_DELIM;
1869	}
1870	break;
1871	case 'B':
1872	if (!(syntax & RE_NO_GNU_OPS))
1873	{
1874	token->type = ANCHOR;
1875	token->opr.ctx_type = NOT_WORD_DELIM;
1876	}
1877	break;
1878	case 'w':
1879	if (!(syntax & RE_NO_GNU_OPS))
1880	token->type = OP_WORD;
1881	break;
1882	case 'W':
1883	if (!(syntax & RE_NO_GNU_OPS))
1884	token->type = OP_NOTWORD;
1885	break;
1886	case 's':
1887	if (!(syntax & RE_NO_GNU_OPS))
1888	token->type = OP_SPACE;
1889	break;
1890	case 'S':
1891	if (!(syntax & RE_NO_GNU_OPS))
1892	token->type = OP_NOTSPACE;
1893	break;
1894	case '`':
1895	if (!(syntax & RE_NO_GNU_OPS))
1896	{
1897	token->type = ANCHOR;
1898	token->opr.ctx_type = BUF_FIRST;
1899	}
1900	break;
1901	case '\'':
1902	if (!(syntax & RE_NO_GNU_OPS))
1903	{
1904	token->type = ANCHOR;
1905	token->opr.ctx_type = BUF_LAST;
1906	}
1907	break;
1908	case '(':
1909	if (!(syntax & RE_NO_BK_PARENS))
1910	token->type = OP_OPEN_SUBEXP;
1911	break;
1912	case ')':
1913	if (!(syntax & RE_NO_BK_PARENS))
1914	token->type = OP_CLOSE_SUBEXP;
1915	break;
1916	case '+':
1917	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918	token->type = OP_DUP_PLUS;
1919	break;
1920	case '?':
1921	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1922	token->type = OP_DUP_QUESTION;
1923	break;
1924	case '{':
1925	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926	token->type = OP_OPEN_DUP_NUM;
1927	break;
1928	case '}':
1929	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1930	token->type = OP_CLOSE_DUP_NUM;
1931	break;
1932	default:
1933	break;
1934	}
1935	return 2;
1936	}
1937
1938	token->type = CHARACTER;
1939	#ifdef RE_ENABLE_I18N
1940	if (input->mb_cur_max > 1)
1941	{
1942	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1943	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1944	}
1945	else
1946	#endif
1947	token->word_char = IS_WORD_CHAR (token->opr.c);
1948
1949	switch (c)
1950	{
1951	case '\n':
1952	if (syntax & RE_NEWLINE_ALT)
1953	token->type = OP_ALT;
1954	break;
1955	case '\|':
1956	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1957	token->type = OP_ALT;
1958	break;
1959	case '*':
1960	token->type = OP_DUP_ASTERISK;
1961	break;
1962	case '+':
1963	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964	token->type = OP_DUP_PLUS;
1965	break;
1966	case '?':
1967	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1968	token->type = OP_DUP_QUESTION;
1969	break;
1970	case '{':
1971	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972	token->type = OP_OPEN_DUP_NUM;
1973	break;
1974	case '}':
1975	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1976	token->type = OP_CLOSE_DUP_NUM;
1977	break;
1978	case '(':
1979	if (syntax & RE_NO_BK_PARENS)
1980	token->type = OP_OPEN_SUBEXP;
1981	break;
1982	case ')':
1983	if (syntax & RE_NO_BK_PARENS)
1984	token->type = OP_CLOSE_SUBEXP;
1985	break;
1986	case '[':
1987	token->type = OP_OPEN_BRACKET;
1988	break;
1989	case '.':
1990	token->type = OP_PERIOD;
1991	break;
1992	case '^':
1993	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE))
1994	&& re_string_cur_idx (input) != 0)
1995	{
1996	char prev = re_string_peek_byte (input, -1);
1997	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
1998	break;
1999	}
2000	token->type = ANCHOR;
2001	token->opr.ctx_type = LINE_FIRST;
2002	break;
2003	case '$':
2004	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2005	&& re_string_cur_idx (input) + 1 != re_string_length (input))
2006	{
2007	re_token_t next;
2008	re_string_skip_bytes (input, 1);
2009	peek_token (&next, input, syntax);
2010	re_string_skip_bytes (input, -1);
2011	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2012	break;
2013	}
2014	token->type = ANCHOR;
2015	token->opr.ctx_type = LINE_LAST;
2016	break;
2017	default:
2018	break;
2019	}
2020	return 1;
2021	}
2022
2023	/* Peek a token from INPUT, and return the length of the token.
2024	We must not use this function out of bracket expressions. */
2025
2026	static int
2027	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2028	{
2029	unsigned char c;
2030	if (re_string_eoi (input))
2031	{
2032	token->type = END_OF_RE;
2033	return 0;
2034	}
2035	c = re_string_peek_byte (input, 0);
2036	token->opr.c = c;
2037
2038	#ifdef RE_ENABLE_I18N
2039	if (input->mb_cur_max > 1
2040	&& !re_string_first_byte (input, re_string_cur_idx (input)))
2041	{
2042	token->type = CHARACTER;
2043	return 1;
2044	}
2045	#endif /* RE_ENABLE_I18N */
2046
2047	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2048	&& re_string_cur_idx (input) + 1 < re_string_length (input))
2049	{
2050	/* In this case, '\' escape a character. */
2051	unsigned char c2;
2052	re_string_skip_bytes (input, 1);
2053	c2 = re_string_peek_byte (input, 0);
2054	token->opr.c = c2;
2055	token->type = CHARACTER;
2056	return 1;
2057	}
2058	if (c == '[') /* '[' is a special char in a bracket exps. */
2059	{
2060	unsigned char c2;
2061	int token_len;
2062	if (re_string_cur_idx (input) + 1 < re_string_length (input))
2063	c2 = re_string_peek_byte (input, 1);
2064	else
2065	c2 = 0;
2066	token->opr.c = c2;
2067	token_len = 2;
2068	switch (c2)
2069	{
2070	case '.':
2071	token->type = OP_OPEN_COLL_ELEM;
2072	break;
2073
2074	case '=':
2075	token->type = OP_OPEN_EQUIV_CLASS;
2076	break;
2077
2078	case ':':
2079	if (syntax & RE_CHAR_CLASSES)
2080	{
2081	token->type = OP_OPEN_CHAR_CLASS;
2082	break;
2083	}
2084	FALLTHROUGH;
2085	default:
2086	token->type = CHARACTER;
2087	token->opr.c = c;
2088	token_len = 1;
2089	break;
2090	}
2091	return token_len;
2092	}
2093	switch (c)
2094	{
2095	case '-':
2096	token->type = OP_CHARSET_RANGE;
2097	break;
2098	case ']':
2099	token->type = OP_CLOSE_BRACKET;
2100	break;
2101	case '^':
2102	token->type = OP_NON_MATCH_LIST;
2103	break;
2104	default:
2105	token->type = CHARACTER;
2106	}
2107	return 1;
2108	}
2109
2110
2111	/* Functions for parser. */
2112
2113	/* Entry point of the parser.
2114	Parse the regular expression REGEXP and return the structure tree.
2115	If an error occurs, ERR is set by error code, and return NULL.
2116	This function build the following tree, from regular expression <reg_exp>:
2117	CAT
2118	/ \
2119	/ \
2120	<reg_exp> EOR
2121
2122	CAT means concatenation.
2123	EOR means end of regular expression. */
2124
2125	static bin_tree_t *
2126	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2127	reg_errcode_t *err)
2128	{
2129	re_dfa_t *dfa = preg->buffer;
2130	bin_tree_t tree, eor, *root;
2131	re_token_t current_token;
2132	dfa->syntax = syntax;
2133	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2134	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2135	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2136	return NULL;
2137	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2138	if (tree != NULL)
2139	root = create_tree (dfa, tree, eor, CONCAT);
2140	else
2141	root = eor;
2142	if (__glibc_unlikely (eor == NULL \|\| root == NULL))
2143	{
2144	*err = REG_ESPACE;
2145	return NULL;
2146	}
2147	return root;
2148	}
2149
2150	/* This function build the following tree, from regular expression
2151	<branch1>\|<branch2>:
2152	ALT
2153	/ \
2154	/ \
2155	<branch1> <branch2>
2156
2157	ALT means alternative, which represents the operator '\|'. */
2158
2159	static bin_tree_t *
2160	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2161	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2162	{
2163	re_dfa_t *dfa = preg->buffer;
2164	bin_tree_t tree, branch = NULL;
2165	bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2166	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2167	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2168	return NULL;
2169
2170	while (token->type == OP_ALT)
2171	{
2172	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2173	if (token->type != OP_ALT && token->type != END_OF_RE
2174	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2175	{
2176	bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2177	dfa->completed_bkref_map = initial_bkref_map;
2178	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2179	if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2180	{
2181	if (tree != NULL)
2182	postorder (tree, free_tree, NULL);
2183	return NULL;
2184	}
2185	dfa->completed_bkref_map \|= accumulated_bkref_map;
2186	}
2187	else
2188	branch = NULL;
2189	tree = create_tree (dfa, tree, branch, OP_ALT);
2190	if (__glibc_unlikely (tree == NULL))
2191	{
2192	*err = REG_ESPACE;
2193	return NULL;
2194	}
2195	}
2196	return tree;
2197	}
2198
2199	/* This function build the following tree, from regular expression
2200	<exp1><exp2>:
2201	CAT
2202	/ \
2203	/ \
2204	<exp1> <exp2>
2205
2206	CAT means concatenation. */
2207
2208	static bin_tree_t *
2209	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2210	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2211	{
2212	bin_tree_t tree, expr;
2213	re_dfa_t *dfa = preg->buffer;
2214	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2215	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2216	return NULL;
2217
2218	while (token->type != OP_ALT && token->type != END_OF_RE
2219	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2220	{
2221	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2222	if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2223	{
2224	if (tree != NULL)
2225	postorder (tree, free_tree, NULL);
2226	return NULL;
2227	}
2228	if (tree != NULL && expr != NULL)
2229	{
2230	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2231	if (newtree == NULL)
2232	{
2233	postorder (expr, free_tree, NULL);
2234	postorder (tree, free_tree, NULL);
2235	*err = REG_ESPACE;
2236	return NULL;
2237	}
2238	tree = newtree;
2239	}
2240	else if (tree == NULL)
2241	tree = expr;
2242	/* Otherwise expr == NULL, we don't need to create new tree. */
2243	}
2244	return tree;
2245	}
2246
2247	/* This function build the following tree, from regular expression a*:
2248	*
2249	\|
2250	a
2251	*/
2252
2253	static bin_tree_t *
2254	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2255	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2256	{
2257	re_dfa_t *dfa = preg->buffer;
2258	bin_tree_t *tree;
2259	switch (token->type)
2260	{
2261	case CHARACTER:
2262	tree = create_token_tree (dfa, NULL, NULL, token);
2263	if (__glibc_unlikely (tree == NULL))
2264	{
2265	*err = REG_ESPACE;
2266	return NULL;
2267	}
2268	#ifdef RE_ENABLE_I18N
2269	if (dfa->mb_cur_max > 1)
2270	{
2271	while (!re_string_eoi (regexp)
2272	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2273	{
2274	bin_tree_t *mbc_remain;
2275	fetch_token (token, regexp, syntax);
2276	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2277	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2278	if (__glibc_unlikely (mbc_remain == NULL \|\| tree == NULL))
2279	{
2280	*err = REG_ESPACE;
2281	return NULL;
2282	}
2283	}
2284	}
2285	#endif
2286	break;
2287
2288	case OP_OPEN_SUBEXP:
2289	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2291	return NULL;
2292	break;
2293
2294	case OP_OPEN_BRACKET:
2295	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2296	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2297	return NULL;
2298	break;
2299
2300	case OP_BACK_REF:
2301	if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2302	{
2303	*err = REG_ESUBREG;
2304	return NULL;
2305	}
2306	dfa->used_bkref_map \|= 1 << token->opr.idx;
2307	tree = create_token_tree (dfa, NULL, NULL, token);
2308	if (__glibc_unlikely (tree == NULL))
2309	{
2310	*err = REG_ESPACE;
2311	return NULL;
2312	}
2313	++dfa->nbackref;
2314	dfa->has_mb_node = 1;
2315	break;
2316
2317	case OP_OPEN_DUP_NUM:
2318	if (syntax & RE_CONTEXT_INVALID_DUP)
2319	{
2320	*err = REG_BADRPT;
2321	return NULL;
2322	}
2323	FALLTHROUGH;
2324	case OP_DUP_ASTERISK:
2325	case OP_DUP_PLUS:
2326	case OP_DUP_QUESTION:
2327	if (syntax & RE_CONTEXT_INVALID_OPS)
2328	{
2329	*err = REG_BADRPT;
2330	return NULL;
2331	}
2332	else if (syntax & RE_CONTEXT_INDEP_OPS)
2333	{
2334	fetch_token (token, regexp, syntax);
2335	return parse_expression (regexp, preg, token, syntax, nest, err);
2336	}
2337	FALLTHROUGH;
2338	case OP_CLOSE_SUBEXP:
2339	if ((token->type == OP_CLOSE_SUBEXP)
2340	&& !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2341	{
2342	*err = REG_ERPAREN;
2343	return NULL;
2344	}
2345	FALLTHROUGH;
2346	case OP_CLOSE_DUP_NUM:
2347	/* We treat it as a normal character. */
2348
2349	/* Then we can these characters as normal characters. */
2350	token->type = CHARACTER;
2351	/* mb_partial and word_char bits should be initialized already
2352	by peek_token. */
2353	tree = create_token_tree (dfa, NULL, NULL, token);
2354	if (__glibc_unlikely (tree == NULL))
2355	{
2356	*err = REG_ESPACE;
2357	return NULL;
2358	}
2359	break;
2360
2361	case ANCHOR:
2362	if ((token->opr.ctx_type
2363	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2364	&& dfa->word_ops_used == 0)
2365	init_word_char (dfa);
2366	if (token->opr.ctx_type == WORD_DELIM
2367	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2368	{
2369	bin_tree_t tree_first, tree_last;
2370	if (token->opr.ctx_type == WORD_DELIM)
2371	{
2372	token->opr.ctx_type = WORD_FIRST;
2373	tree_first = create_token_tree (dfa, NULL, NULL, token);
2374	token->opr.ctx_type = WORD_LAST;
2375	}
2376	else
2377	{
2378	token->opr.ctx_type = INSIDE_WORD;
2379	tree_first = create_token_tree (dfa, NULL, NULL, token);
2380	token->opr.ctx_type = INSIDE_NOTWORD;
2381	}
2382	tree_last = create_token_tree (dfa, NULL, NULL, token);
2383	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384	if (__glibc_unlikely (tree_first == NULL \|\| tree_last == NULL
2385	\|\| tree == NULL))
2386	{
2387	*err = REG_ESPACE;
2388	return NULL;
2389	}
2390	}
2391	else
2392	{
2393	tree = create_token_tree (dfa, NULL, NULL, token);
2394	if (__glibc_unlikely (tree == NULL))
2395	{
2396	*err = REG_ESPACE;
2397	return NULL;
2398	}
2399	}
2400	/* We must return here, since ANCHORs can't be followed
2401	by repetition operators.
2402	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2403	it must not be "<ANCHOR(^)><REPEAT()>". /
2404	fetch_token (token, regexp, syntax);
2405	return tree;
2406
2407	case OP_PERIOD:
2408	tree = create_token_tree (dfa, NULL, NULL, token);
2409	if (__glibc_unlikely (tree == NULL))
2410	{
2411	*err = REG_ESPACE;
2412	return NULL;
2413	}
2414	if (dfa->mb_cur_max > 1)
2415	dfa->has_mb_node = 1;
2416	break;
2417
2418	case OP_WORD:
2419	case OP_NOTWORD:
2420	tree = build_charclass_op (dfa, regexp->trans,
2421	"alnum",
2422	"_",
2423	token->type == OP_NOTWORD, err);
2424	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2425	return NULL;
2426	break;
2427
2428	case OP_SPACE:
2429	case OP_NOTSPACE:
2430	tree = build_charclass_op (dfa, regexp->trans,
2431	"space",
2432	"",
2433	token->type == OP_NOTSPACE, err);
2434	if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2435	return NULL;
2436	break;
2437
2438	case OP_ALT:
2439	case END_OF_RE:
2440	return NULL;
2441
2442	case BACK_SLASH:
2443	*err = REG_EESCAPE;
2444	return NULL;
2445
2446	default:
2447	/* Must not happen? */
2448	DEBUG_ASSERT (false);
2449	return NULL;
2450	}
2451	fetch_token (token, regexp, syntax);
2452
2453	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2454	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2455	{
2456	bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2457	syntax, err);
2458	if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2459	{
2460	if (tree != NULL)
2461	postorder (tree, free_tree, NULL);
2462	return NULL;
2463	}
2464	tree = dup_tree;
2465	/* In BRE consecutive duplications are not allowed. */
2466	if ((syntax & RE_CONTEXT_INVALID_DUP)
2467	&& (token->type == OP_DUP_ASTERISK
2468	\|\| token->type == OP_OPEN_DUP_NUM))
2469	{
2470	if (tree != NULL)
2471	postorder (tree, free_tree, NULL);
2472	*err = REG_BADRPT;
2473	return NULL;
2474	}
2475	}
2476
2477	return tree;
2478	}
2479
2480	/* This function build the following tree, from regular expression
2481	(<reg_exp>):
2482	SUBEXP
2483	\|
2484	<reg_exp>
2485	*/
2486
2487	static bin_tree_t *
2488	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2489	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2490	{
2491	re_dfa_t *dfa = preg->buffer;
2492	bin_tree_t *tree;
2493	size_t cur_nsub;
2494	cur_nsub = preg->re_nsub++;
2495
2496	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2497
2498	/* The subexpression may be a null string. */
2499	if (token->type == OP_CLOSE_SUBEXP)
2500	tree = NULL;
2501	else
2502	{
2503	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2504	if (__glibc_unlikely (*err == REG_NOERROR
2505	&& token->type != OP_CLOSE_SUBEXP))
2506	{
2507	if (tree != NULL)
2508	postorder (tree, free_tree, NULL);
2509	*err = REG_EPAREN;
2510	}
2511	if (__glibc_unlikely (*err != REG_NOERROR))
2512	return NULL;
2513	}
2514
2515	if (cur_nsub <= '9' - '1')
2516	dfa->completed_bkref_map \|= 1 << cur_nsub;
2517
2518	tree = create_tree (dfa, tree, NULL, SUBEXP);
2519	if (__glibc_unlikely (tree == NULL))
2520	{
2521	*err = REG_ESPACE;
2522	return NULL;
2523	}
2524	tree->token.opr.idx = cur_nsub;
2525	return tree;
2526	}
2527
2528	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
2529
2530	static bin_tree_t *
2531	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2532	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2533	{
2534	bin_tree_t tree = NULL, old_tree = NULL;
2535	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2536	re_token_t start_token = *token;
2537
2538	if (token->type == OP_OPEN_DUP_NUM)
2539	{
2540	end = 0;
2541	start = fetch_number (regexp, token, syntax);
2542	if (start == -1)
2543	{
2544	if (token->type == CHARACTER && token->opr.c == ',')
2545	start = 0; /* We treat "{,m}" as "{0,m}". */
2546	else
2547	{
2548	err = REG_BADBR; / <re>{} is invalid. */
2549	return NULL;
2550	}
2551	}
2552	if (__glibc_likely (start != -2))
2553	{
2554	/* We treat "{n}" as "{n,n}". */
2555	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2556	: ((token->type == CHARACTER && token->opr.c == ',')
2557	? fetch_number (regexp, token, syntax) : -2));
2558	}
2559	if (__glibc_unlikely (start == -2 \|\| end == -2))
2560	{
2561	/* Invalid sequence. */
2562	if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2563	{
2564	if (token->type == END_OF_RE)
2565	*err = REG_EBRACE;
2566	else
2567	*err = REG_BADBR;
2568
2569	return NULL;
2570	}
2571
2572	/* If the syntax bit is set, rollback. */
2573	re_string_set_index (regexp, start_idx);
2574	*token = start_token;
2575	token->type = CHARACTER;
2576	/* mb_partial and word_char bits should be already initialized by
2577	peek_token. */
2578	return elem;
2579	}
2580
2581	if (__glibc_unlikely ((end != -1 && start > end)
2582	\|\| token->type != OP_CLOSE_DUP_NUM))
2583	{
2584	/* First number greater than second. */
2585	*err = REG_BADBR;
2586	return NULL;
2587	}
2588
2589	if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2590	{
2591	*err = REG_ESIZE;
2592	return NULL;
2593	}
2594	}
2595	else
2596	{
2597	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2598	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2599	}
2600
2601	fetch_token (token, regexp, syntax);
2602
2603	if (__glibc_unlikely (elem == NULL))
2604	return NULL;
2605	if (__glibc_unlikely (start == 0 && end == 0))
2606	{
2607	postorder (elem, free_tree, NULL);
2608	return NULL;
2609	}
2610
2611	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2612	if (__glibc_unlikely (start > 0))
2613	{
2614	tree = elem;
2615	for (i = 2; i <= start; ++i)
2616	{
2617	elem = duplicate_tree (elem, dfa);
2618	tree = create_tree (dfa, tree, elem, CONCAT);
2619	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2620	goto parse_dup_op_espace;
2621	}
2622
2623	if (start == end)
2624	return tree;
2625
2626	/* Duplicate ELEM before it is marked optional. */
2627	elem = duplicate_tree (elem, dfa);
2628	if (__glibc_unlikely (elem == NULL))
2629	goto parse_dup_op_espace;
2630	old_tree = tree;
2631	}
2632	else
2633	old_tree = NULL;
2634
2635	if (elem->token.type == SUBEXP)
2636	{
2637	uintptr_t subidx = elem->token.opr.idx;
2638	postorder (elem, mark_opt_subexp, (void *) subidx);
2639	}
2640
2641	tree = create_tree (dfa, elem, NULL,
2642	(end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2643	if (__glibc_unlikely (tree == NULL))
2644	goto parse_dup_op_espace;
2645
2646	/* This loop is actually executed only when end != -1,
2647	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2648	already created the start+1-th copy. */
2649	if (TYPE_SIGNED (Idx) \|\| end != -1)
2650	for (i = start + 2; i <= end; ++i)
2651	{
2652	elem = duplicate_tree (elem, dfa);
2653	tree = create_tree (dfa, tree, elem, CONCAT);
2654	if (__glibc_unlikely (elem == NULL \|\| tree == NULL))
2655	goto parse_dup_op_espace;
2656
2657	tree = create_tree (dfa, tree, NULL, OP_ALT);
2658	if (__glibc_unlikely (tree == NULL))
2659	goto parse_dup_op_espace;
2660	}
2661
2662	if (old_tree)
2663	tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665	return tree;
2666
2667	parse_dup_op_espace:
2668	*err = REG_ESPACE;
2669	return NULL;
2670	}
2671
2672	/* Size of the names for collating symbol/equivalence_class/character_class.
2673	I'm not sure, but maybe enough. */
2674	#define BRACKET_NAME_BUF_SIZE 32
2675
2676	#ifndef _LIBC
2677
2678	# ifdef RE_ENABLE_I18N
2679	/* Convert the byte B to the corresponding wide character. In a
2680	unibyte locale, treat B as itself. In a multibyte locale, return
2681	WEOF if B is an encoding error. */
2682	static wint_t
2683	parse_byte (unsigned char b, re_charset_t *mbcset)
2684	{
2685	return mbcset == NULL ? b : __btowc (b);
2686	}
2687	# endif
2688
2689	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2690	Build the range expression which starts from START_ELEM, and ends
2691	at END_ELEM. The result are written to MBCSET and SBCSET.
2692	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2693	mbcset->range_ends, is a pointer argument since we may
2694	update it. */
2695
2696	static reg_errcode_t
2697	# ifdef RE_ENABLE_I18N
2698	build_range_exp (const reg_syntax_t syntax,
2699	bitset_t sbcset,
2700	re_charset_t *mbcset,
2701	Idx *range_alloc,
2702	const bracket_elem_t *start_elem,
2703	const bracket_elem_t *end_elem)
2704	# else /* not RE_ENABLE_I18N */
2705	build_range_exp (const reg_syntax_t syntax,
2706	bitset_t sbcset,
2707	const bracket_elem_t *start_elem,
2708	const bracket_elem_t *end_elem)
2709	# endif /* not RE_ENABLE_I18N */
2710	{
2711	unsigned int start_ch, end_ch;
2712	/* Equivalence Classes and Character Classes can't be a range start/end. */
2713	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2714	\|\| start_elem->type == CHAR_CLASS
2715	\|\| end_elem->type == EQUIV_CLASS
2716	\|\| end_elem->type == CHAR_CLASS))
2717	return REG_ERANGE;
2718
2719	/* We can handle no multi character collating elements without libc
2720	support. */
2721	if (__glibc_unlikely ((start_elem->type == COLL_SYM
2722	&& strlen ((char *) start_elem->opr.name) > 1)
2723	\|\| (end_elem->type == COLL_SYM
2724	&& strlen ((char *) end_elem->opr.name) > 1)))
2725	return REG_ECOLLATE;
2726
2727	# ifdef RE_ENABLE_I18N
2728	{
2729	wchar_t wc;
2730	wint_t start_wc;
2731	wint_t end_wc;
2732
2733	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2734	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2735	: 0));
2736	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2737	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2738	: 0));
2739	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2740	? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2741	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2742	? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2743	if (start_wc == WEOF \|\| end_wc == WEOF)
2744	return REG_ECOLLATE;
2745	else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2746	&& start_wc > end_wc))
2747	return REG_ERANGE;
2748
2749	/* Got valid collation sequence values, add them as a new entry.
2750	However, for !_LIBC we have no collation elements: if the
2751	character set is single byte, the single byte character set
2752	that we build below suffices. parse_bracket_exp passes
2753	no MBCSET if dfa->mb_cur_max == 1. */
2754	if (mbcset)
2755	{
2756	/* Check the space of the arrays. */
2757	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2758	{
2759	/* There is not enough space, need realloc. */
2760	wchar_t new_array_start, new_array_end;
2761	Idx new_nranges;
2762
2763	/* +1 in case of mbcset->nranges is 0. */
2764	new_nranges = 2 * mbcset->nranges + 1;
2765	/* Use realloc since mbcset->range_starts and mbcset->range_ends
2766	are NULL if range_alloc == 0. /
2767	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2768	new_nranges);
2769	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2770	new_nranges);
2771
2772	if (__glibc_unlikely (new_array_start == NULL
2773	\|\| new_array_end == NULL))
2774	{
2775	re_free (new_array_start);
2776	re_free (new_array_end);
2777	return REG_ESPACE;
2778	}
2779
2780	mbcset->range_starts = new_array_start;
2781	mbcset->range_ends = new_array_end;
2782	*range_alloc = new_nranges;
2783	}
2784
2785	mbcset->range_starts[mbcset->nranges] = start_wc;
2786	mbcset->range_ends[mbcset->nranges++] = end_wc;
2787	}
2788
2789	/* Build the table for single byte characters. */
2790	for (wc = 0; wc < SBC_MAX; ++wc)
2791	{
2792	if (start_wc <= wc && wc <= end_wc)
2793	bitset_set (sbcset, wc);
2794	}
2795	}
2796	# else /* not RE_ENABLE_I18N */
2797	{
2798	unsigned int ch;
2799	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2800	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2801	: 0));
2802	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2803	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2804	: 0));
2805	if (start_ch > end_ch)
2806	return REG_ERANGE;
2807	/* Build the table for single byte characters. */
2808	for (ch = 0; ch < SBC_MAX; ++ch)
2809	if (start_ch <= ch && ch <= end_ch)
2810	bitset_set (sbcset, ch);
2811	}
2812	# endif /* not RE_ENABLE_I18N */
2813	return REG_NOERROR;
2814	}
2815	#endif /* not _LIBC */
2816
2817	#ifndef _LIBC
2818	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2819	Build the collating element which is represented by NAME.
2820	The result are written to MBCSET and SBCSET.
2821	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2822	pointer argument since we may update it. */
2823
2824	static reg_errcode_t
2825	# ifdef RE_ENABLE_I18N
2826	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2827	Idx coll_sym_alloc, const unsigned char name)
2828	# else /* not RE_ENABLE_I18N */
2829	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2830	# endif /* not RE_ENABLE_I18N */
2831	{
2832	size_t name_len = strlen ((const char *) name);
2833	if (__glibc_unlikely (name_len != 1))
2834	return REG_ECOLLATE;
2835	else
2836	{
2837	bitset_set (sbcset, name[0]);
2838	return REG_NOERROR;
2839	}
2840	}
2841	#endif /* not _LIBC */
2842
2843	/* This function parse bracket expression like "[abc]", "[a-c]",
2844	"[[.a-a.]]" etc. */
2845
2846	static bin_tree_t *
2847	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
2848	reg_syntax_t syntax, reg_errcode_t *err)
2849	{
2850	#ifdef _LIBC
2851	const unsigned char *collseqmb;
2852	const char *collseqwc;
2853	uint32_t nrules;
2854	int32_t table_size;
2855	const int32_t *symb_table;
2856	const unsigned char *extra;
2857
2858	/* Local function for parse_bracket_exp used in _LIBC environment.
2859	Seek the collating symbol entry corresponding to NAME.
2860	Return the index of the symbol in the SYMB_TABLE,
2861	or -1 if not found. */
2862
2863	auto inline int32_t
2864	__attribute__ ((always_inline))
2865	seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2866	{
2867	int32_t elem;
2868
2869	for (elem = 0; elem < table_size; elem++)
2870	if (symb_table[2 * elem] != 0)
2871	{
2872	int32_t idx = symb_table[2 * elem + 1];
2873	/* Skip the name of collating element name. */
2874	idx += 1 + extra[idx];
2875	if (/* Compare the length of the name. */
2876	name_len == extra[idx]
2877	/* Compare the name. */
2878	&& memcmp (name, &extra[idx + 1], name_len) == 0)
2879	/* Yep, this is the entry. */
2880	return elem;
2881	}
2882	return -1;
2883	}
2884
2885	/* Local function for parse_bracket_exp used in _LIBC environment.
2886	Look up the collation sequence value of BR_ELEM.
2887	Return the value if succeeded, UINT_MAX otherwise. */
2888
2889	auto inline unsigned int
2890	__attribute__ ((always_inline))
2891	lookup_collation_sequence_value (bracket_elem_t *br_elem)
2892	{
2893	if (br_elem->type == SB_CHAR)
2894	{
2895	/*
2896	if (MB_CUR_MAX == 1)
2897	*/
2898	if (nrules == 0)
2899	return collseqmb[br_elem->opr.ch];
2900	else
2901	{
2902	wint_t wc = __btowc (br_elem->opr.ch);
2903	return __collseq_table_lookup (collseqwc, wc);
2904	}
2905	}
2906	else if (br_elem->type == MB_CHAR)
2907	{
2908	if (nrules != 0)
2909	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2910	}
2911	else if (br_elem->type == COLL_SYM)
2912	{
2913	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2914	if (nrules != 0)
2915	{
2916	int32_t elem, idx;
2917	elem = seek_collating_symbol_entry (br_elem->opr.name,
2918	sym_name_len);
2919	if (elem != -1)
2920	{
2921	/* We found the entry. */
2922	idx = symb_table[2 * elem + 1];
2923	/* Skip the name of collating element name. */
2924	idx += 1 + extra[idx];
2925	/* Skip the byte sequence of the collating element. */
2926	idx += 1 + extra[idx];
2927	/* Adjust for the alignment. */
2928	idx = (idx + 3) & ~3;
2929	/* Skip the multibyte collation sequence value. */
2930	idx += sizeof (unsigned int);
2931	/* Skip the wide char sequence of the collating element. */
2932	idx += sizeof (unsigned int) *
2933	(1 + (unsigned int ) (extra + idx));
2934	/* Return the collation sequence value. */
2935	return (unsigned int ) (extra + idx);
2936	}
2937	else if (sym_name_len == 1)
2938	{
2939	/* No valid character. Match it as a single byte
2940	character. */
2941	return collseqmb[br_elem->opr.name[0]];
2942	}
2943	}
2944	else if (sym_name_len == 1)
2945	return collseqmb[br_elem->opr.name[0]];
2946	}
2947	return UINT_MAX;
2948	}
2949
2950	/* Local function for parse_bracket_exp used in _LIBC environment.
2951	Build the range expression which starts from START_ELEM, and ends
2952	at END_ELEM. The result are written to MBCSET and SBCSET.
2953	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2954	mbcset->range_ends, is a pointer argument since we may
2955	update it. */
2956
2957	auto inline reg_errcode_t
2958	__attribute__ ((always_inline))
2959	build_range_exp (bitset_t sbcset, re_charset_t mbcset, int range_alloc,
2960	bracket_elem_t start_elem, bracket_elem_t end_elem)
2961	{
2962	unsigned int ch;
2963	uint32_t start_collseq;
2964	uint32_t end_collseq;
2965
2966	/* Equivalence Classes and Character Classes can't be a range
2967	start/end. */
2968	if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2969	\|\| start_elem->type == CHAR_CLASS
2970	\|\| end_elem->type == EQUIV_CLASS
2971	\|\| end_elem->type == CHAR_CLASS))
2972	return REG_ERANGE;
2973
2974	/* FIXME: Implement rational ranges here, too. */
2975	start_collseq = lookup_collation_sequence_value (start_elem);
2976	end_collseq = lookup_collation_sequence_value (end_elem);
2977	/* Check start/end collation sequence values. */
2978	if (__glibc_unlikely (start_collseq == UINT_MAX
2979	\|\| end_collseq == UINT_MAX))
2980	return REG_ECOLLATE;
2981	if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2982	&& start_collseq > end_collseq))
2983	return REG_ERANGE;
2984
2985	/* Got valid collation sequence values, add them as a new entry.
2986	However, if we have no collation elements, and the character set
2987	is single byte, the single byte character set that we
2988	build below suffices. */
2989	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
2990	{
2991	/* Check the space of the arrays. */
2992	if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2993	{
2994	/* There is not enough space, need realloc. */
2995	uint32_t *new_array_start;
2996	uint32_t *new_array_end;
2997	Idx new_nranges;
2998
2999	/* +1 in case of mbcset->nranges is 0. */
3000	new_nranges = 2 * mbcset->nranges + 1;
3001	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3002	new_nranges);
3003	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3004	new_nranges);
3005
3006	if (__glibc_unlikely (new_array_start == NULL
3007	\|\| new_array_end == NULL))
3008	return REG_ESPACE;
3009
3010	mbcset->range_starts = new_array_start;
3011	mbcset->range_ends = new_array_end;
3012	*range_alloc = new_nranges;
3013	}
3014
3015	mbcset->range_starts[mbcset->nranges] = start_collseq;
3016	mbcset->range_ends[mbcset->nranges++] = end_collseq;
3017	}
3018
3019	/* Build the table for single byte characters. */
3020	for (ch = 0; ch < SBC_MAX; ch++)
3021	{
3022	uint32_t ch_collseq;
3023	/*
3024	if (MB_CUR_MAX == 1)
3025	*/
3026	if (nrules == 0)
3027	ch_collseq = collseqmb[ch];
3028	else
3029	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3030	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3031	bitset_set (sbcset, ch);
3032	}
3033	return REG_NOERROR;
3034	}
3035
3036	/* Local function for parse_bracket_exp used in _LIBC environment.
3037	Build the collating element which is represented by NAME.
3038	The result are written to MBCSET and SBCSET.
3039	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3040	pointer argument since we may update it. */
3041
3042	auto inline reg_errcode_t
3043	__attribute__ ((always_inline))
3044	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3045	Idx coll_sym_alloc, const unsigned char name)
3046	{
3047	int32_t elem, idx;
3048	size_t name_len = strlen ((const char *) name);
3049	if (nrules != 0)
3050	{
3051	elem = seek_collating_symbol_entry (name, name_len);
3052	if (elem != -1)
3053	{
3054	/* We found the entry. */
3055	idx = symb_table[2 * elem + 1];
3056	/* Skip the name of collating element name. */
3057	idx += 1 + extra[idx];
3058	}
3059	else if (name_len == 1)
3060	{
3061	/* No valid character, treat it as a normal
3062	character. */
3063	bitset_set (sbcset, name[0]);
3064	return REG_NOERROR;
3065	}
3066	else
3067	return REG_ECOLLATE;
3068
3069	/* Got valid collation sequence, add it as a new entry. */
3070	/* Check the space of the arrays. */
3071	if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3072	{
3073	/* Not enough, realloc it. */
3074	/* +1 in case of mbcset->ncoll_syms is 0. */
3075	Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3076	/* Use realloc since mbcset->coll_syms is NULL
3077	if alloc == 0. /
3078	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3079	new_coll_sym_alloc);
3080	if (__glibc_unlikely (new_coll_syms == NULL))
3081	return REG_ESPACE;
3082	mbcset->coll_syms = new_coll_syms;
3083	*coll_sym_alloc = new_coll_sym_alloc;
3084	}
3085	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3086	return REG_NOERROR;
3087	}
3088	else
3089	{
3090	if (__glibc_unlikely (name_len != 1))
3091	return REG_ECOLLATE;
3092	else
3093	{
3094	bitset_set (sbcset, name[0]);
3095	return REG_NOERROR;
3096	}
3097	}
3098	}
3099	#endif
3100
3101	re_token_t br_token;
3102	re_bitset_ptr_t sbcset;
3103	#ifdef RE_ENABLE_I18N
3104	re_charset_t *mbcset;
3105	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3106	Idx equiv_class_alloc = 0, char_class_alloc = 0;
3107	#endif /* not RE_ENABLE_I18N */
3108	bool non_match = false;
3109	bin_tree_t *work_tree;
3110	int token_len;
3111	bool first_round = true;
3112	#ifdef _LIBC
3113	collseqmb = (const unsigned char *)
3114	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3115	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3116	if (nrules)
3117	{
3118	/*
3119	if (MB_CUR_MAX > 1)
3120	*/
3121	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3122	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3123	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3124	_NL_COLLATE_SYMB_TABLEMB);
3125	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3126	_NL_COLLATE_SYMB_EXTRAMB);
3127	}
3128	#endif
3129	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3130	#ifdef RE_ENABLE_I18N
3131	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3132	#endif /* RE_ENABLE_I18N */
3133	#ifdef RE_ENABLE_I18N
3134	if (__glibc_unlikely (sbcset == NULL \|\| mbcset == NULL))
3135	#else
3136	if (__glibc_unlikely (sbcset == NULL))
3137	#endif /* RE_ENABLE_I18N */
3138	{
3139	re_free (sbcset);
3140	#ifdef RE_ENABLE_I18N
3141	re_free (mbcset);
3142	#endif
3143	*err = REG_ESPACE;
3144	return NULL;
3145	}
3146
3147	token_len = peek_token_bracket (token, regexp, syntax);
3148	if (__glibc_unlikely (token->type == END_OF_RE))
3149	{
3150	*err = REG_BADPAT;
3151	goto parse_bracket_exp_free_return;
3152	}
3153	if (token->type == OP_NON_MATCH_LIST)
3154	{
3155	#ifdef RE_ENABLE_I18N
3156	mbcset->non_match = 1;
3157	#endif /* not RE_ENABLE_I18N */
3158	non_match = true;
3159	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3160	bitset_set (sbcset, '\n');
3161	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3162	token_len = peek_token_bracket (token, regexp, syntax);
3163	if (__glibc_unlikely (token->type == END_OF_RE))
3164	{
3165	*err = REG_BADPAT;
3166	goto parse_bracket_exp_free_return;
3167	}
3168	}
3169
3170	/* We treat the first ']' as a normal character. */
3171	if (token->type == OP_CLOSE_BRACKET)
3172	token->type = CHARACTER;
3173
3174	while (1)
3175	{
3176	bracket_elem_t start_elem, end_elem;
3177	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3178	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3179	reg_errcode_t ret;
3180	int token_len2 = 0;
3181	bool is_range_exp = false;
3182	re_token_t token2;
3183
3184	start_elem.opr.name = start_name_buf;
3185	start_elem.type = COLL_SYM;
3186	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3187	syntax, first_round);
3188	if (__glibc_unlikely (ret != REG_NOERROR))
3189	{
3190	*err = ret;
3191	goto parse_bracket_exp_free_return;
3192	}
3193	first_round = false;
3194
3195	/* Get information about the next token. We need it in any case. */
3196	token_len = peek_token_bracket (token, regexp, syntax);
3197
3198	/* Do not check for ranges if we know they are not allowed. */
3199	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3200	{
3201	if (__glibc_unlikely (token->type == END_OF_RE))
3202	{
3203	*err = REG_EBRACK;
3204	goto parse_bracket_exp_free_return;
3205	}
3206	if (token->type == OP_CHARSET_RANGE)
3207	{
3208	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3209	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3210	if (__glibc_unlikely (token2.type == END_OF_RE))
3211	{
3212	*err = REG_EBRACK;
3213	goto parse_bracket_exp_free_return;
3214	}
3215	if (token2.type == OP_CLOSE_BRACKET)
3216	{
3217	/* We treat the last '-' as a normal character. */
3218	re_string_skip_bytes (regexp, -token_len);
3219	token->type = CHARACTER;
3220	}
3221	else
3222	is_range_exp = true;
3223	}
3224	}
3225
3226	if (is_range_exp == true)
3227	{
3228	end_elem.opr.name = end_name_buf;
3229	end_elem.type = COLL_SYM;
3230	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3231	dfa, syntax, true);
3232	if (__glibc_unlikely (ret != REG_NOERROR))
3233	{
3234	*err = ret;
3235	goto parse_bracket_exp_free_return;
3236	}
3237
3238	token_len = peek_token_bracket (token, regexp, syntax);
3239
3240	#ifdef _LIBC
3241	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3242	&start_elem, &end_elem);
3243	#else
3244	# ifdef RE_ENABLE_I18N
3245	*err = build_range_exp (syntax, sbcset,
3246	dfa->mb_cur_max > 1 ? mbcset : NULL,
3247	&range_alloc, &start_elem, &end_elem);
3248	# else
3249	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3250	# endif
3251	#endif /* RE_ENABLE_I18N */
3252	if (__glibc_unlikely (*err != REG_NOERROR))
3253	goto parse_bracket_exp_free_return;
3254	}
3255	else
3256	{
3257	switch (start_elem.type)
3258	{
3259	case SB_CHAR:
3260	bitset_set (sbcset, start_elem.opr.ch);
3261	break;
3262	#ifdef RE_ENABLE_I18N
3263	case MB_CHAR:
3264	/* Check whether the array has enough space. */
3265	if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3266	{
3267	wchar_t *new_mbchars;
3268	/* Not enough, realloc it. */
3269	/* +1 in case of mbcset->nmbchars is 0. */
3270	mbchar_alloc = 2 * mbcset->nmbchars + 1;
3271	/* Use realloc since array is NULL if alloc == 0. /
3272	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3273	mbchar_alloc);
3274	if (__glibc_unlikely (new_mbchars == NULL))
3275	goto parse_bracket_exp_espace;
3276	mbcset->mbchars = new_mbchars;
3277	}
3278	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3279	break;
3280	#endif /* RE_ENABLE_I18N */
3281	case EQUIV_CLASS:
3282	*err = build_equiv_class (sbcset,
3283	#ifdef RE_ENABLE_I18N
3284	mbcset, &equiv_class_alloc,
3285	#endif /* RE_ENABLE_I18N */
3286	start_elem.opr.name);
3287	if (__glibc_unlikely (*err != REG_NOERROR))
3288	goto parse_bracket_exp_free_return;
3289	break;
3290	case COLL_SYM:
3291	*err = build_collating_symbol (sbcset,
3292	#ifdef RE_ENABLE_I18N
3293	mbcset, &coll_sym_alloc,
3294	#endif /* RE_ENABLE_I18N */
3295	start_elem.opr.name);
3296	if (__glibc_unlikely (*err != REG_NOERROR))
3297	goto parse_bracket_exp_free_return;
3298	break;
3299	case CHAR_CLASS:
3300	*err = build_charclass (regexp->trans, sbcset,
3301	#ifdef RE_ENABLE_I18N
3302	mbcset, &char_class_alloc,
3303	#endif /* RE_ENABLE_I18N */
3304	(const char *) start_elem.opr.name,
3305	syntax);
3306	if (__glibc_unlikely (*err != REG_NOERROR))
3307	goto parse_bracket_exp_free_return;
3308	break;
3309	default:
3310	DEBUG_ASSERT (false);
3311	break;
3312	}
3313	}
3314	if (__glibc_unlikely (token->type == END_OF_RE))
3315	{
3316	*err = REG_EBRACK;
3317	goto parse_bracket_exp_free_return;
3318	}
3319	if (token->type == OP_CLOSE_BRACKET)
3320	break;
3321	}
3322
3323	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3324
3325	/* If it is non-matching list. */
3326	if (non_match)
3327	bitset_not (sbcset);
3328
3329	#ifdef RE_ENABLE_I18N
3330	/* Ensure only single byte characters are set. */
3331	if (dfa->mb_cur_max > 1)
3332	bitset_mask (sbcset, dfa->sb_char);
3333
3334	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3335	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3336	\|\| mbcset->non_match)))
3337	{
3338	bin_tree_t *mbc_tree;
3339	int sbc_idx;
3340	/* Build a tree for complex bracket. */
3341	dfa->has_mb_node = 1;
3342	br_token.type = COMPLEX_BRACKET;
3343	br_token.opr.mbcset = mbcset;
3344	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3345	if (__glibc_unlikely (mbc_tree == NULL))
3346	goto parse_bracket_exp_espace;
3347	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3348	if (sbcset[sbc_idx])
3349	break;
3350	/* If there are no bits set in sbcset, there is no point
3351	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3352	if (sbc_idx < BITSET_WORDS)
3353	{
3354	/* Build a tree for simple bracket. */
3355	br_token.type = SIMPLE_BRACKET;
3356	br_token.opr.sbcset = sbcset;
3357	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3358	if (__glibc_unlikely (work_tree == NULL))
3359	goto parse_bracket_exp_espace;
3360
3361	/* Then join them by ALT node. */
3362	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3363	if (__glibc_unlikely (work_tree == NULL))
3364	goto parse_bracket_exp_espace;
3365	}
3366	else
3367	{
3368	re_free (sbcset);
3369	work_tree = mbc_tree;
3370	}
3371	}
3372	else
3373	#endif /* not RE_ENABLE_I18N */
3374	{
3375	#ifdef RE_ENABLE_I18N
3376	free_charset (mbcset);
3377	#endif
3378	/* Build a tree for simple bracket. */
3379	br_token.type = SIMPLE_BRACKET;
3380	br_token.opr.sbcset = sbcset;
3381	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3382	if (__glibc_unlikely (work_tree == NULL))
3383	goto parse_bracket_exp_espace;
3384	}
3385	return work_tree;
3386
3387	parse_bracket_exp_espace:
3388	*err = REG_ESPACE;
3389	parse_bracket_exp_free_return:
3390	re_free (sbcset);
3391	#ifdef RE_ENABLE_I18N
3392	free_charset (mbcset);
3393	#endif /* RE_ENABLE_I18N */
3394	return NULL;
3395	}
3396
3397	/* Parse an element in the bracket expression. */
3398
3399	static reg_errcode_t
3400	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3401	re_token_t token, int token_len, re_dfa_t dfa,
3402	reg_syntax_t syntax, bool accept_hyphen)
3403	{
3404	#ifdef RE_ENABLE_I18N
3405	int cur_char_size;
3406	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3407	if (cur_char_size > 1)
3408	{
3409	elem->type = MB_CHAR;
3410	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3411	re_string_skip_bytes (regexp, cur_char_size);
3412	return REG_NOERROR;
3413	}
3414	#endif /* RE_ENABLE_I18N */
3415	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3416	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3417	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3418	return parse_bracket_symbol (elem, regexp, token);
3419	if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3420	{
3421	/* A '-' must only appear as anything but a range indicator before
3422	the closing bracket. Everything else is an error. */
3423	re_token_t token2;
3424	(void) peek_token_bracket (&token2, regexp, syntax);
3425	if (token2.type != OP_CLOSE_BRACKET)
3426	/* The actual error value is not standardized since this whole
3427	case is undefined. But ERANGE makes good sense. */
3428	return REG_ERANGE;
3429	}
3430	elem->type = SB_CHAR;
3431	elem->opr.ch = token->opr.c;
3432	return REG_NOERROR;
3433	}
3434
3435	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3436	such as [:<character_class>:], [.<collating_element>.], and
3437	[=<equivalent_class>=]. */
3438
3439	static reg_errcode_t
3440	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3441	re_token_t *token)
3442	{
3443	unsigned char ch, delim = token->opr.c;
3444	int i = 0;
3445	if (re_string_eoi(regexp))
3446	return REG_EBRACK;
3447	for (;; ++i)
3448	{
3449	if (i >= BRACKET_NAME_BUF_SIZE)
3450	return REG_EBRACK;
3451	if (token->type == OP_OPEN_CHAR_CLASS)
3452	ch = re_string_fetch_byte_case (regexp);
3453	else
3454	ch = re_string_fetch_byte (regexp);
3455	if (re_string_eoi(regexp))
3456	return REG_EBRACK;
3457	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3458	break;
3459	elem->opr.name[i] = ch;
3460	}
3461	re_string_skip_bytes (regexp, 1);
3462	elem->opr.name[i] = '\0';
3463	switch (token->type)
3464	{
3465	case OP_OPEN_COLL_ELEM:
3466	elem->type = COLL_SYM;
3467	break;
3468	case OP_OPEN_EQUIV_CLASS:
3469	elem->type = EQUIV_CLASS;
3470	break;
3471	case OP_OPEN_CHAR_CLASS:
3472	elem->type = CHAR_CLASS;
3473	break;
3474	default:
3475	break;
3476	}
3477	return REG_NOERROR;
3478	}
3479
3480	/* Helper function for parse_bracket_exp.
3481	Build the equivalence class which is represented by NAME.
3482	The result are written to MBCSET and SBCSET.
3483	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3484	is a pointer argument since we may update it. */
3485
3486	static reg_errcode_t
3487	#ifdef RE_ENABLE_I18N
3488	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3489	Idx equiv_class_alloc, const unsigned char name)
3490	#else /* not RE_ENABLE_I18N */
3491	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3492	#endif /* not RE_ENABLE_I18N */
3493	{
3494	#ifdef _LIBC
3495	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3496	if (nrules != 0)
3497	{
3498	const int32_t table, indirect;
3499	const unsigned char weights, extra, *cp;
3500	unsigned char char_buf[2];
3501	int32_t idx1, idx2;
3502	unsigned int ch;
3503	size_t len;
3504	/* Calculate the index for equivalence class. */
3505	cp = name;
3506	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3507	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3508	_NL_COLLATE_WEIGHTMB);
3509	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3510	_NL_COLLATE_EXTRAMB);
3511	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3512	_NL_COLLATE_INDIRECTMB);
3513	idx1 = findidx (table, indirect, extra, &cp, -1);
3514	if (__glibc_unlikely (idx1 == 0 \|\| *cp != '\0'))
3515	/* This isn't a valid character. */
3516	return REG_ECOLLATE;
3517
3518	/* Build single byte matching table for this equivalence class. */
3519	len = weights[idx1 & 0xffffff];
3520	for (ch = 0; ch < SBC_MAX; ++ch)
3521	{
3522	char_buf[0] = ch;
3523	cp = char_buf;
3524	idx2 = findidx (table, indirect, extra, &cp, 1);
3525	/*
3526	idx2 = table[ch];
3527	*/
3528	if (idx2 == 0)
3529	/* This isn't a valid character. */
3530	continue;
3531	/* Compare only if the length matches and the collation rule
3532	index is the same. */
3533	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3534	&& memcmp (weights + (idx1 & 0xffffff) + 1,
3535	weights + (idx2 & 0xffffff) + 1, len) == 0)
3536	bitset_set (sbcset, ch);
3537	}
3538	/* Check whether the array has enough space. */
3539	if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3540	{
3541	/* Not enough, realloc it. */
3542	/* +1 in case of mbcset->nequiv_classes is 0. */
3543	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3544	/* Use realloc since the array is NULL if alloc == 0. /
3545	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3546	int32_t,
3547	new_equiv_class_alloc);
3548	if (__glibc_unlikely (new_equiv_classes == NULL))
3549	return REG_ESPACE;
3550	mbcset->equiv_classes = new_equiv_classes;
3551	*equiv_class_alloc = new_equiv_class_alloc;
3552	}
3553	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3554	}
3555	else
3556	#endif /* _LIBC */
3557	{
3558	if (__glibc_unlikely (strlen ((const char *) name) != 1))
3559	return REG_ECOLLATE;
3560	bitset_set (sbcset, *name);
3561	}
3562	return REG_NOERROR;
3563	}
3564
3565	/* Helper function for parse_bracket_exp.
3566	Build the character class which is represented by NAME.
3567	The result are written to MBCSET and SBCSET.
3568	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3569	is a pointer argument since we may update it. */
3570
3571	static reg_errcode_t
3572	#ifdef RE_ENABLE_I18N
3573	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574	re_charset_t mbcset, Idx char_class_alloc,
3575	const char *class_name, reg_syntax_t syntax)
3576	#else /* not RE_ENABLE_I18N */
3577	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578	const char *class_name, reg_syntax_t syntax)
3579	#endif /* not RE_ENABLE_I18N */
3580	{
3581	int i;
3582	const char *name = class_name;
3583
3584	/* In case of REG_ICASE "upper" and "lower" match the both of
3585	upper and lower cases. */
3586	if ((syntax & RE_ICASE)
3587	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
3588	name = "alpha";
3589
3590	#ifdef RE_ENABLE_I18N
3591	/* Check the space of the arrays. */
3592	if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3593	{
3594	/* Not enough, realloc it. */
3595	/* +1 in case of mbcset->nchar_classes is 0. */
3596	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3597	/* Use realloc since array is NULL if alloc == 0. /
3598	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3599	new_char_class_alloc);
3600	if (__glibc_unlikely (new_char_classes == NULL))
3601	return REG_ESPACE;
3602	mbcset->char_classes = new_char_classes;
3603	*char_class_alloc = new_char_class_alloc;
3604	}
3605	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3606	#endif /* RE_ENABLE_I18N */
3607
3608	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3609	do { \
3610	if (__glibc_unlikely (trans != NULL)) \
3611	{ \
3612	for (i = 0; i < SBC_MAX; ++i) \
3613	if (ctype_func (i)) \
3614	bitset_set (sbcset, trans[i]); \
3615	} \
3616	else \
3617	{ \
3618	for (i = 0; i < SBC_MAX; ++i) \
3619	if (ctype_func (i)) \
3620	bitset_set (sbcset, i); \
3621	} \
3622	} while (0)
3623
3624	if (strcmp (name, "alnum") == 0)
3625	BUILD_CHARCLASS_LOOP (isalnum);
3626	else if (strcmp (name, "cntrl") == 0)
3627	BUILD_CHARCLASS_LOOP (iscntrl);
3628	else if (strcmp (name, "lower") == 0)
3629	BUILD_CHARCLASS_LOOP (islower);
3630	else if (strcmp (name, "space") == 0)
3631	BUILD_CHARCLASS_LOOP (isspace);
3632	else if (strcmp (name, "alpha") == 0)
3633	BUILD_CHARCLASS_LOOP (isalpha);
3634	else if (strcmp (name, "digit") == 0)
3635	BUILD_CHARCLASS_LOOP (isdigit);
3636	else if (strcmp (name, "print") == 0)
3637	BUILD_CHARCLASS_LOOP (isprint);
3638	else if (strcmp (name, "upper") == 0)
3639	BUILD_CHARCLASS_LOOP (isupper);
3640	else if (strcmp (name, "blank") == 0)
3641	BUILD_CHARCLASS_LOOP (isblank);
3642	else if (strcmp (name, "graph") == 0)
3643	BUILD_CHARCLASS_LOOP (isgraph);
3644	else if (strcmp (name, "punct") == 0)
3645	BUILD_CHARCLASS_LOOP (ispunct);
3646	else if (strcmp (name, "xdigit") == 0)
3647	BUILD_CHARCLASS_LOOP (isxdigit);
3648	else
3649	return REG_ECTYPE;
3650
3651	return REG_NOERROR;
3652	}
3653
3654	static bin_tree_t *
3655	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3656	const char *class_name,
3657	const char *extra, bool non_match,
3658	reg_errcode_t *err)
3659	{
3660	re_bitset_ptr_t sbcset;
3661	#ifdef RE_ENABLE_I18N
3662	re_charset_t *mbcset;
3663	Idx alloc = 0;
3664	#endif /* not RE_ENABLE_I18N */
3665	reg_errcode_t ret;
3666	bin_tree_t *tree;
3667
3668	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3669	if (__glibc_unlikely (sbcset == NULL))
3670	{
3671	*err = REG_ESPACE;
3672	return NULL;
3673	}
3674	#ifdef RE_ENABLE_I18N
3675	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3676	if (__glibc_unlikely (mbcset == NULL))
3677	{
3678	re_free (sbcset);
3679	*err = REG_ESPACE;
3680	return NULL;
3681	}
3682	mbcset->non_match = non_match;
3683	#endif /* RE_ENABLE_I18N */
3684
3685	/* We don't care the syntax in this case. */
3686	ret = build_charclass (trans, sbcset,
3687	#ifdef RE_ENABLE_I18N
3688	mbcset, &alloc,
3689	#endif /* RE_ENABLE_I18N */
3690	class_name, 0);
3691
3692	if (__glibc_unlikely (ret != REG_NOERROR))
3693	{
3694	re_free (sbcset);
3695	#ifdef RE_ENABLE_I18N
3696	free_charset (mbcset);
3697	#endif /* RE_ENABLE_I18N */
3698	*err = ret;
3699	return NULL;
3700	}
3701	/* \w match '_' also. */
3702	for (; *extra; extra++)
3703	bitset_set (sbcset, *extra);
3704
3705	/* If it is non-matching list. */
3706	if (non_match)
3707	bitset_not (sbcset);
3708
3709	#ifdef RE_ENABLE_I18N
3710	/* Ensure only single byte characters are set. */
3711	if (dfa->mb_cur_max > 1)
3712	bitset_mask (sbcset, dfa->sb_char);
3713	#endif
3714
3715	/* Build a tree for simple bracket. */
3716	re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3717	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3718	if (__glibc_unlikely (tree == NULL))
3719	goto build_word_op_espace;
3720
3721	#ifdef RE_ENABLE_I18N
3722	if (dfa->mb_cur_max > 1)
3723	{
3724	bin_tree_t *mbc_tree;
3725	/* Build a tree for complex bracket. */
3726	br_token.type = COMPLEX_BRACKET;
3727	br_token.opr.mbcset = mbcset;
3728	dfa->has_mb_node = 1;
3729	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3730	if (__glibc_unlikely (mbc_tree == NULL))
3731	goto build_word_op_espace;
3732	/* Then join them by ALT node. */
3733	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3734	if (__glibc_likely (mbc_tree != NULL))
3735	return tree;
3736	}
3737	else
3738	{
3739	free_charset (mbcset);
3740	return tree;
3741	}
3742	#else /* not RE_ENABLE_I18N */
3743	return tree;
3744	#endif /* not RE_ENABLE_I18N */
3745
3746	build_word_op_espace:
3747	re_free (sbcset);
3748	#ifdef RE_ENABLE_I18N
3749	free_charset (mbcset);
3750	#endif /* RE_ENABLE_I18N */
3751	*err = REG_ESPACE;
3752	return NULL;
3753	}
3754
3755	/* This is intended for the expressions like "a{1,3}".
3756	Fetch a number from 'input', and return the number.
3757	Return -1 if the number field is empty like "{,1}".
3758	Return RE_DUP_MAX + 1 if the number field is too large.
3759	Return -2 if an error occurred. */
3760
3761	static Idx
3762	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3763	{
3764	Idx num = -1;
3765	unsigned char c;
3766	while (1)
3767	{
3768	fetch_token (token, input, syntax);
3769	c = token->opr.c;
3770	if (__glibc_unlikely (token->type == END_OF_RE))
3771	return -2;
3772	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
3773	break;
3774	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
3775	? -2
3776	: num == -1
3777	? c - '0'
3778	: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3779	}
3780	return num;
3781	}
3782
3783
3784	#ifdef RE_ENABLE_I18N
3785	static void
3786	free_charset (re_charset_t *cset)
3787	{
3788	re_free (cset->mbchars);
3789	# ifdef _LIBC
3790	re_free (cset->coll_syms);
3791	re_free (cset->equiv_classes);
3792	# endif
3793	re_free (cset->range_starts);
3794	re_free (cset->range_ends);
3795	re_free (cset->char_classes);
3796	re_free (cset);
3797	}
3798	#endif /* RE_ENABLE_I18N */
3799
3800
3801	/* Functions for binary tree operation. */
3802
3803	/* Create a tree node. */
3804
3805	static bin_tree_t *
3806	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3807	re_token_type_t type)
3808	{
3809	re_token_t t = { .type = type };
3810	return create_token_tree (dfa, left, right, &t);
3811	}
3812
3813	static bin_tree_t *
3814	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3815	const re_token_t *token)
3816	{
3817	bin_tree_t *tree;
3818	if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3819	{
3820	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3821
3822	if (storage == NULL)
3823	return NULL;
3824	storage->next = dfa->str_tree_storage;
3825	dfa->str_tree_storage = storage;
3826	dfa->str_tree_storage_idx = 0;
3827	}
3828	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3829
3830	tree->parent = NULL;
3831	tree->left = left;
3832	tree->right = right;
3833	tree->token = *token;
3834	tree->token.duplicated = 0;
3835	tree->token.opt_subexp = 0;
3836	tree->first = NULL;
3837	tree->next = NULL;
3838	tree->node_idx = -1;
3839
3840	if (left != NULL)
3841	left->parent = tree;
3842	if (right != NULL)
3843	right->parent = tree;
3844	return tree;
3845	}
3846
3847	/* Mark the tree SRC as an optional subexpression.
3848	To be called from preorder or postorder. */
3849
3850	static reg_errcode_t
3851	mark_opt_subexp (void extra, bin_tree_t node)
3852	{
3853	Idx idx = (uintptr_t) extra;
3854	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3855	node->token.opt_subexp = 1;
3856
3857	return REG_NOERROR;
3858	}
3859
3860	/* Free the allocated memory inside NODE. */
3861
3862	static void
3863	free_token (re_token_t *node)
3864	{
3865	#ifdef RE_ENABLE_I18N
3866	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3867	free_charset (node->opr.mbcset);
3868	else
3869	#endif /* RE_ENABLE_I18N */
3870	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3871	re_free (node->opr.sbcset);
3872	}
3873
3874	/* Worker function for tree walking. Free the allocated memory inside NODE
3875	and its children. */
3876
3877	static reg_errcode_t
3878	free_tree (void extra, bin_tree_t node)
3879	{
3880	free_token (&node->token);
3881	return REG_NOERROR;
3882	}
3883
3884
3885	/* Duplicate the node SRC, and return new node. This is a preorder
3886	visit similar to the one implemented by the generic visitor, but
3887	we need more infrastructure to maintain two parallel trees --- so,
3888	it's easier to duplicate. */
3889
3890	static bin_tree_t *
3891	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3892	{
3893	const bin_tree_t *node;
3894	bin_tree_t *dup_root;
3895	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3896
3897	for (node = root; ; )
3898	{
3899	/* Create a new tree and link it back to the current parent. */
3900	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3901	if (*p_new == NULL)
3902	return NULL;
3903	(*p_new)->parent = dup_node;
3904	(*p_new)->token.duplicated = 1;
3905	dup_node = *p_new;
3906
3907	/* Go to the left node, or up and to the right. */
3908	if (node->left)
3909	{
3910	node = node->left;
3911	p_new = &dup_node->left;
3912	}
3913	else
3914	{
3915	const bin_tree_t *prev = NULL;
3916	while (node->right == prev \|\| node->right == NULL)
3917	{
3918	prev = node;
3919	node = node->parent;
3920	dup_node = dup_node->parent;
3921	if (!node)
3922	return dup_root;
3923	}
3924	node = node->right;
3925	p_new = &dup_node->right;
3926	}
3927	}
3928	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/grep/lib/regcomp.c@ 3531

Download in other formats: