Context Navigation

regcomp.c

Visit:

Last change on this file was 3342, checked in by bird, 18 years ago
tar 1.16.1
File size: 109.1 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	This program is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2, or (at your option)
9	any later version.
10
11	This program is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License along
17	with this program; if not, write to the Free Software Foundation,
18	Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
21	size_t length, reg_syntax_t syntax);
22	static void re_compile_fastmap_iter (regex_t *bufp,
23	const re_dfastate_t *init_state,
24	char *fastmap);
25	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26	#ifdef RE_ENABLE_I18N
27	static void free_charset (re_charset_t *cset);
28	#endif /* RE_ENABLE_I18N */
29	static void free_workarea_compile (regex_t *preg);
30	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31	#ifdef RE_ENABLE_I18N
32	static void optimize_utf8 (re_dfa_t *dfa);
33	#endif
34	static reg_errcode_t analyze (regex_t *preg);
35	static reg_errcode_t preorder (bin_tree_t *root,
36	reg_errcode_t (fn (void , bin_tree_t )),
37	void *extra);
38	static reg_errcode_t postorder (bin_tree_t *root,
39	reg_errcode_t (fn (void , bin_tree_t )),
40	void *extra);
41	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
42	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
43	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
44	bin_tree_t *node);
45	static reg_errcode_t calc_first (void extra, bin_tree_t node);
46	static reg_errcode_t calc_next (void extra, bin_tree_t node);
47	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
48	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50	unsigned int constraint);
51	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
53	Idx node, bool root);
54	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55	static Idx fetch_number (re_string_t input, re_token_t token,
56	reg_syntax_t syntax);
57	static int peek_token (re_token_t token, re_string_t input,
58	reg_syntax_t syntax) internal_function;
59	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
60	reg_syntax_t syntax, reg_errcode_t *err);
61	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
62	re_token_t *token, reg_syntax_t syntax,
63	Idx nest, reg_errcode_t *err);
64	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
65	re_token_t *token, reg_syntax_t syntax,
66	Idx nest, reg_errcode_t *err);
67	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
68	re_token_t *token, reg_syntax_t syntax,
69	Idx nest, reg_errcode_t *err);
70	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
71	re_token_t *token, reg_syntax_t syntax,
72	Idx nest, reg_errcode_t *err);
73	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
74	re_dfa_t dfa, re_token_t token,
75	reg_syntax_t syntax, reg_errcode_t *err);
76	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
77	re_token_t *token, reg_syntax_t syntax,
78	reg_errcode_t *err);
79	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80	re_string_t *regexp,
81	re_token_t *token, int token_len,
82	re_dfa_t *dfa,
83	reg_syntax_t syntax,
84	bool accept_hyphen);
85	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86	re_string_t *regexp,
87	re_token_t *token);
88	#ifdef RE_ENABLE_I18N
89	static reg_errcode_t build_equiv_class (bitset_t sbcset,
90	re_charset_t *mbcset,
91	Idx *equiv_class_alloc,
92	const unsigned char *name);
93	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94	bitset_t sbcset,
95	re_charset_t *mbcset,
96	Idx *char_class_alloc,
97	const unsigned char *class_name,
98	reg_syntax_t syntax);
99	#else /* not RE_ENABLE_I18N */
100	static reg_errcode_t build_equiv_class (bitset_t sbcset,
101	const unsigned char *name);
102	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103	bitset_t sbcset,
104	const unsigned char *class_name,
105	reg_syntax_t syntax);
106	#endif /* not RE_ENABLE_I18N */
107	static bin_tree_t build_charclass_op (re_dfa_t dfa,
108	RE_TRANSLATE_TYPE trans,
109	const unsigned char *class_name,
110	const unsigned char *extra,
111	bool non_match, reg_errcode_t *err);
112	static bin_tree_t create_tree (re_dfa_t dfa,
113	bin_tree_t left, bin_tree_t right,
114	re_token_type_t type);
115	static bin_tree_t create_token_tree (re_dfa_t dfa,
116	bin_tree_t left, bin_tree_t right,
117	const re_token_t *token);
118	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
119	static void free_token (re_token_t *node);
120	static reg_errcode_t free_tree (void extra, bin_tree_t node);
121	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
122
123
124	/* This table gives an error message for each of the error codes listed
125	in regex.h. Obviously the order here has to be same as there.
126	POSIX doesn't require that we do anything for REG_NOERROR,
127	but why not be nice? */
128
129	static const char __re_error_msgid[] =
130	{
131	#define REG_NOERROR_IDX 0
132	gettext_noop ("Success") /* REG_NOERROR */
133	"\0"
134	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135	gettext_noop ("No match") /* REG_NOMATCH */
136	"\0"
137	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139	"\0"
140	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142	"\0"
143	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145	"\0"
146	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148	"\0"
149	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151	"\0"
152	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154	"\0"
155	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157	"\0"
158	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160	"\0"
161	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163	"\0"
164	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165	gettext_noop ("Invalid range end") /* REG_ERANGE */
166	"\0"
167	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168	gettext_noop ("Memory exhausted") /* REG_ESPACE */
169	"\0"
170	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172	"\0"
173	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174	gettext_noop ("Premature end of regular expression") /* REG_EEND */
175	"\0"
176	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177	gettext_noop ("Regular expression too big") /* REG_ESIZE */
178	"\0"
179	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181	};
182
183	static const size_t __re_error_msgid_idx[] =
184	{
185	REG_NOERROR_IDX,
186	REG_NOMATCH_IDX,
187	REG_BADPAT_IDX,
188	REG_ECOLLATE_IDX,
189	REG_ECTYPE_IDX,
190	REG_EESCAPE_IDX,
191	REG_ESUBREG_IDX,
192	REG_EBRACK_IDX,
193	REG_EPAREN_IDX,
194	REG_EBRACE_IDX,
195	REG_BADBR_IDX,
196	REG_ERANGE_IDX,
197	REG_ESPACE_IDX,
198	REG_BADRPT_IDX,
199	REG_EEND_IDX,
200	REG_ESIZE_IDX,
201	REG_ERPAREN_IDX
202	};
203
204
205	/* Entry points for GNU code. */
206
207	/* re_compile_pattern is the GNU regular expression compiler: it
208	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209	Returns 0 if the pattern was valid, otherwise an error string.
210
211	Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212	are set in BUFP on entry. */
213
214	#ifdef _LIBC
215	const char *
216	re_compile_pattern (pattern, length, bufp)
217	const char *pattern;
218	size_t length;
219	struct re_pattern_buffer *bufp;
220	#else /* size_t might promote */
221	const char *
222	re_compile_pattern (const char *pattern, size_t length,
223	struct re_pattern_buffer *bufp)
224	#endif
225	{
226	reg_errcode_t ret;
227
228	/* And GNU code determines whether or not to get register information
229	by passing null for the REGS argument to re_match, etc., not by
230	setting no_sub, unless RE_NO_SUB is set. */
231	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232
233	/* Match anchors at newline. */
234	bufp->newline_anchor = 1;
235
236	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237
238	if (!ret)
239	return NULL;
240	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241	}
242	#ifdef _LIBC
243	weak_alias (__re_compile_pattern, re_compile_pattern)
244	#endif
245
246	/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
247	also be assigned to arbitrarily: each pattern buffer stores its own
248	syntax, so it can be changed between regex compilations. */
249	/* This has no initializer because initialized variables in Emacs
250	become read-only after dumping. */
251	reg_syntax_t re_syntax_options;
252
253
254	/* Specify the precise syntax of regexps for compilation. This provides
255	for compatibility for various utilities which historically have
256	different, incompatible syntaxes.
257
258	The argument SYNTAX is a bit mask comprised of the various bits
259	defined in regex.h. We return the old syntax. */
260
261	reg_syntax_t
262	re_set_syntax (syntax)
263	reg_syntax_t syntax;
264	{
265	reg_syntax_t ret = re_syntax_options;
266
267	re_syntax_options = syntax;
268	return ret;
269	}
270	#ifdef _LIBC
271	weak_alias (__re_set_syntax, re_set_syntax)
272	#endif
273
274	int
275	re_compile_fastmap (bufp)
276	struct re_pattern_buffer *bufp;
277	{
278	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
279	char *fastmap = bufp->fastmap;
280
281	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
282	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
283	if (dfa->init_state != dfa->init_state_word)
284	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
285	if (dfa->init_state != dfa->init_state_nl)
286	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
287	if (dfa->init_state != dfa->init_state_begbuf)
288	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
289	bufp->fastmap_accurate = 1;
290	return 0;
291	}
292	#ifdef _LIBC
293	weak_alias (__re_compile_fastmap, re_compile_fastmap)
294	#endif
295
296	static inline void
297	__attribute ((always_inline))
298	re_set_fastmap (char *fastmap, bool icase, int ch)
299	{
300	fastmap[ch] = 1;
301	if (icase)
302	fastmap[tolower (ch)] = 1;
303	}
304
305	/* Helper function for re_compile_fastmap.
306	Compile fastmap for the initial_state INIT_STATE. */
307
308	static void
309	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
310	char *fastmap)
311	{
312	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
313	Idx node_cnt;
314	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
315	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316	{
317	Idx node = init_state->nodes.elems[node_cnt];
318	re_token_type_t type = dfa->nodes[node].type;
319
320	if (type == CHARACTER)
321	{
322	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
323	#ifdef RE_ENABLE_I18N
324	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325	{
326	unsigned char buf[MB_LEN_MAX];
327	unsigned char *p;
328	wchar_t wc;
329	mbstate_t state;
330
331	p = buf;
332	*p++ = dfa->nodes[node].opr.c;
333	while (++node < dfa->nodes_len
334	&& dfa->nodes[node].type == CHARACTER
335	&& dfa->nodes[node].mb_partial)
336	*p++ = dfa->nodes[node].opr.c;
337	memset (&state, '\0', sizeof (state));
338	if (mbrtowc (&wc, (const char *) buf, p - buf,
339	&state) == p - buf
340	&& (__wcrtomb ((char *) buf, towlower (wc), &state)
341	!= (size_t) -1))
342	re_set_fastmap (fastmap, false, buf[0]);
343	}
344	#endif
345	}
346	else if (type == SIMPLE_BRACKET)
347	{
348	int i, ch;
349	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350	{
351	int j;
352	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
353	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
354	if (w & ((bitset_word_t) 1 << j))
355	re_set_fastmap (fastmap, icase, ch);
356	}
357	}
358	#ifdef RE_ENABLE_I18N
359	else if (type == COMPLEX_BRACKET)
360	{
361	Idx i;
362	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363	if (cset->non_match \|\| cset->ncoll_syms \|\| cset->nequiv_classes
364	\|\| cset->nranges \|\| cset->nchar_classes)
365	{
366	# ifdef _LIBC
367	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
368	{
369	/* In this case we want to catch the bytes which are
370	the first byte of any collation elements.
371	e.g. In da_DK, we want to catch 'a' since "aa"
372	is a valid collation element, and don't catch
373	'b' since 'b' is the only collation element
374	which starts from 'b'. */
375	const int32_t table = (const int32_t )
376	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377	for (i = 0; i < SBC_MAX; ++i)
378	if (table[i] < 0)
379	re_set_fastmap (fastmap, icase, i);
380	}
381	# else
382	if (dfa->mb_cur_max > 1)
383	for (i = 0; i < SBC_MAX; ++i)
384	if (__btowc (i) == WEOF)
385	re_set_fastmap (fastmap, icase, i);
386	# endif /* not _LIBC */
387	}
388	for (i = 0; i < cset->nmbchars; ++i)
389	{
390	char buf[256];
391	mbstate_t state;
392	memset (&state, '\0', sizeof (state));
393	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
394	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
395	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
396	{
397	if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
398	!= (size_t) -1)
399	re_set_fastmap (fastmap, false, (unsigned char ) buf);
400	}
401	}
402	}
403	#endif /* RE_ENABLE_I18N */
404	else if (type == OP_PERIOD
405	#ifdef RE_ENABLE_I18N
406	\|\| type == OP_UTF8_PERIOD
407	#endif /* RE_ENABLE_I18N */
408	\|\| type == END_OF_RE)
409	{
410	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
411	if (type == END_OF_RE)
412	bufp->can_be_null = 1;
413	return;
414	}
415	}
416	}
417
418
419	/* Entry point for POSIX code. */
420	/* regcomp takes a regular expression as a string and compiles it.
421
422	PREG is a regex_t *. We do not expect any fields to be initialized,
423	since POSIX says we shouldn't. Thus, we set
424
425	`buffer' to the compiled pattern;
426	`used' to the length of the compiled pattern;
427	`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
428	REG_EXTENDED bit in CFLAGS is set; otherwise, to
429	RE_SYNTAX_POSIX_BASIC;
430	`newline_anchor' to REG_NEWLINE being set in CFLAGS;
431	`fastmap' to an allocated space for the fastmap;
432	`fastmap_accurate' to zero;
433	`re_nsub' to the number of subexpressions in PATTERN.
434
435	PATTERN is the address of the pattern string.
436
437	CFLAGS is a series of bits which affect compilation.
438
439	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
440	use POSIX basic syntax.
441
442	If REG_NEWLINE is set, then . and [^...] don't match newline.
443	Also, regexec will try a match beginning after every newline.
444
445	If REG_ICASE is set, then we considers upper- and lowercase
446	versions of letters to be equivalent when matching.
447
448	If REG_NOSUB is set, then when PREG is passed to regexec, that
449	routine will report only success or failure, and nothing about the
450	registers.
451
452	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
453	the return codes and their meanings.) */
454
455	int
456	regcomp (preg, pattern, cflags)
457	regex_t *__restrict preg;
458	const char *__restrict pattern;
459	int cflags;
460	{
461	reg_errcode_t ret;
462	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
463	: RE_SYNTAX_POSIX_BASIC);
464
465	preg->buffer = NULL;
466	preg->allocated = 0;
467	preg->used = 0;
468
469	/* Try to allocate space for the fastmap. */
470	preg->fastmap = re_malloc (char, SBC_MAX);
471	if (BE (preg->fastmap == NULL, 0))
472	return REG_ESPACE;
473
474	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
475
476	/* If REG_NEWLINE is set, newlines are treated differently. */
477	if (cflags & REG_NEWLINE)
478	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
479	syntax &= ~RE_DOT_NEWLINE;
480	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
481	/* It also changes the matching behavior. */
482	preg->newline_anchor = 1;
483	}
484	else
485	preg->newline_anchor = 0;
486	preg->no_sub = !!(cflags & REG_NOSUB);
487	preg->translate = NULL;
488
489	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
490
491	/* POSIX doesn't distinguish between an unmatched open-group and an
492	unmatched close-group: both are REG_EPAREN. */
493	if (ret == REG_ERPAREN)
494	ret = REG_EPAREN;
495
496	/* We have already checked preg->fastmap != NULL. */
497	if (BE (ret == REG_NOERROR, 1))
498	/* Compute the fastmap now, since regexec cannot modify the pattern
499	buffer. This function never fails in this implementation. */
500	(void) re_compile_fastmap (preg);
501	else
502	{
503	/* Some error occurred while compiling the expression. */
504	re_free (preg->fastmap);
505	preg->fastmap = NULL;
506	}
507
508	return (int) ret;
509	}
510	#ifdef _LIBC
511	weak_alias (__regcomp, regcomp)
512	#endif
513
514	/* Returns a message corresponding to an error code, ERRCODE, returned
515	from either regcomp or regexec. We don't use PREG here. */
516
517	#ifdef _LIBC
518	size_t
519	regerror (errcode, preg, errbuf, errbuf_size)
520	int errcode;
521	const regex_t *__restrict preg;
522	char *__restrict errbuf;
523	size_t errbuf_size;
524	#else /* size_t might promote */
525	size_t
526	regerror (int errcode, const regex_t *__restrict preg,
527	char *__restrict errbuf, size_t errbuf_size)
528	#endif
529	{
530	const char *msg;
531	size_t msg_size;
532
533	if (BE (errcode < 0
534	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
535	/ sizeof (__re_error_msgid_idx[0])), 0))
536	/* Only error codes returned by the rest of the code should be passed
537	to this routine. If we are given anything else, or if other regex
538	code generates an invalid error code, then the program has a bug.
539	Dump core so we can fix it. */
540	abort ();
541
542	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543
544	msg_size = strlen (msg) + 1; /* Includes the null. */
545
546	if (BE (errbuf_size != 0, 1))
547	{
548	if (BE (msg_size > errbuf_size, 0))
549	{
550	#if defined HAVE_MEMPCPY \|\| defined _LIBC
551	((char ) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
552	#else
553	memcpy (errbuf, msg, errbuf_size - 1);
554	errbuf[errbuf_size - 1] = 0;
555	#endif
556	}
557	else
558	memcpy (errbuf, msg, msg_size);
559	}
560
561	return msg_size;
562	}
563	#ifdef _LIBC
564	weak_alias (__regerror, regerror)
565	#endif
566
567
568	#ifdef RE_ENABLE_I18N
569	/* This static array is used for the map to single-byte characters when
570	UTF-8 is used. Otherwise we would allocate memory just to initialize
571	it the same all the time. UTF-8 is the preferred encoding so this is
572	a worthwhile optimization. */
573	static const bitset_t utf8_sb_map =
574	{
575	/* Set the first 128 bits. */
576	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
577	# error "bitset_word_t is narrower than 32 bits"
578	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
579	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
580	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
581	BITSET_WORD_MAX, BITSET_WORD_MAX,
582	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
583	BITSET_WORD_MAX,
584	# endif
585	(BITSET_WORD_MAX
586	>> (SBC_MAX % BITSET_WORD_BITS == 0
587	? 0
588	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
589	};
590	#endif
591
592
593	static void
594	free_dfa_content (re_dfa_t *dfa)
595	{
596	Idx i, j;
597
598	if (dfa->nodes)
599	for (i = 0; i < dfa->nodes_len; ++i)
600	free_token (dfa->nodes + i);
601	re_free (dfa->nexts);
602	for (i = 0; i < dfa->nodes_len; ++i)
603	{
604	if (dfa->eclosures != NULL)
605	re_node_set_free (dfa->eclosures + i);
606	if (dfa->inveclosures != NULL)
607	re_node_set_free (dfa->inveclosures + i);
608	if (dfa->edests != NULL)
609	re_node_set_free (dfa->edests + i);
610	}
611	re_free (dfa->edests);
612	re_free (dfa->eclosures);
613	re_free (dfa->inveclosures);
614	re_free (dfa->nodes);
615
616	if (dfa->state_table)
617	for (i = 0; i <= dfa->state_hash_mask; ++i)
618	{
619	struct re_state_table_entry *entry = dfa->state_table + i;
620	for (j = 0; j < entry->num; ++j)
621	{
622	re_dfastate_t *state = entry->array[j];
623	free_state (state);
624	}
625	re_free (entry->array);
626	}
627	re_free (dfa->state_table);
628	#ifdef RE_ENABLE_I18N
629	if (dfa->sb_char != utf8_sb_map)
630	re_free (dfa->sb_char);
631	#endif
632	re_free (dfa->subexp_map);
633	#ifdef DEBUG
634	re_free (dfa->re_str);
635	#endif
636
637	re_free (dfa);
638	}
639
640
641	/* Free dynamically allocated space used by PREG. */
642
643	void
644	regfree (preg)
645	regex_t *preg;
646	{
647	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
648	if (BE (dfa != NULL, 1))
649	free_dfa_content (dfa);
650	preg->buffer = NULL;
651	preg->allocated = 0;
652
653	re_free (preg->fastmap);
654	preg->fastmap = NULL;
655
656	re_free (preg->translate);
657	preg->translate = NULL;
658	}
659	#ifdef _LIBC
660	weak_alias (__regfree, regfree)
661	#endif
662
663
664	/* Entry points compatible with 4.2 BSD regex library. We don't define
665	them unless specifically requested. */
666
667	#if defined _REGEX_RE_COMP \|\| defined _LIBC
668
669	/* BSD has one and only one pattern buffer. */
670	static struct re_pattern_buffer re_comp_buf;
671
672	char *
673	# ifdef _LIBC
674	/* Make these definitions weak in libc, so POSIX programs can redefine
675	these names if they don't use our functions, and still use
676	regcomp/regexec above without link errors. */
677	weak_function
678	# endif
679	re_comp (s)
680	const char *s;
681	{
682	reg_errcode_t ret;
683	char *fastmap;
684
685	if (!s)
686	{
687	if (!re_comp_buf.buffer)
688	return gettext ("No previous regular expression");
689	return 0;
690	}
691
692	if (re_comp_buf.buffer)
693	{
694	fastmap = re_comp_buf.fastmap;
695	re_comp_buf.fastmap = NULL;
696	__regfree (&re_comp_buf);
697	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
698	re_comp_buf.fastmap = fastmap;
699	}
700
701	if (re_comp_buf.fastmap == NULL)
702	{
703	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
704	if (re_comp_buf.fastmap == NULL)
705	return (char *) gettext (__re_error_msgid
706	+ __re_error_msgid_idx[(int) REG_ESPACE]);
707	}
708
709	/* Since `re_exec' always passes NULL for the `regs' argument, we
710	don't need to initialize the pattern buffer fields which affect it. */
711
712	/* Match anchors at newlines. */
713	re_comp_buf.newline_anchor = 1;
714
715	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
716
717	if (!ret)
718	return NULL;
719
720	/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
721	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
722	}
723
724	#ifdef _LIBC
725	libc_freeres_fn (free_mem)
726	{
727	__regfree (&re_comp_buf);
728	}
729	#endif
730
731	#endif /* _REGEX_RE_COMP */
732
733
734	/* Internal entry point.
735	Compile the regular expression PATTERN, whose length is LENGTH.
736	SYNTAX indicate regular expression's syntax. */
737
738	static reg_errcode_t
739	re_compile_internal (regex_t preg, const char pattern, size_t length,
740	reg_syntax_t syntax)
741	{
742	reg_errcode_t err = REG_NOERROR;
743	re_dfa_t *dfa;
744	re_string_t regexp;
745
746	/* Initialize the pattern buffer. */
747	preg->fastmap_accurate = 0;
748	preg->syntax = syntax;
749	preg->not_bol = preg->not_eol = 0;
750	preg->used = 0;
751	preg->re_nsub = 0;
752	preg->can_be_null = 0;
753	preg->regs_allocated = REGS_UNALLOCATED;
754
755	/* Initialize the dfa. */
756	dfa = (re_dfa_t *) preg->buffer;
757	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
758	{
759	/* If zero allocated, but buffer is non-null, try to realloc
760	enough space. This loses if buffer's address is bogus, but
761	that is the user's responsibility. If ->buffer is NULL this
762	is a simple allocation. */
763	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764	if (dfa == NULL)
765	return REG_ESPACE;
766	preg->allocated = sizeof (re_dfa_t);
767	preg->buffer = (unsigned char *) dfa;
768	}
769	preg->used = sizeof (re_dfa_t);
770
771	err = init_dfa (dfa, length);
772	if (BE (err != REG_NOERROR, 0))
773	{
774	free_dfa_content (dfa);
775	preg->buffer = NULL;
776	preg->allocated = 0;
777	return err;
778	}
779	#ifdef DEBUG
780	/* Note: length+1 will not overflow since it is checked in init_dfa. */
781	dfa->re_str = re_malloc (char, length + 1);
782	strncpy (dfa->re_str, pattern, length + 1);
783	#endif
784
785	__libc_lock_init (dfa->lock);
786
787	err = re_string_construct (&regexp, pattern, length, preg->translate,
788	syntax & RE_ICASE, dfa);
789	if (BE (err != REG_NOERROR, 0))
790	{
791	re_compile_internal_free_return:
792	free_workarea_compile (preg);
793	re_string_destruct (&regexp);
794	free_dfa_content (dfa);
795	preg->buffer = NULL;
796	preg->allocated = 0;
797	return err;
798	}
799
800	/* Parse the regular expression, and build a structure tree. */
801	preg->re_nsub = 0;
802	dfa->str_tree = parse (&regexp, preg, syntax, &err);
803	if (BE (dfa->str_tree == NULL, 0))
804	goto re_compile_internal_free_return;
805
806	/* Analyze the tree and create the nfa. */
807	err = analyze (preg);
808	if (BE (err != REG_NOERROR, 0))
809	goto re_compile_internal_free_return;
810
811	#ifdef RE_ENABLE_I18N
812	/* If possible, do searching in single byte encoding to speed things up. */
813	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
814	optimize_utf8 (dfa);
815	#endif
816
817	/* Then create the initial state of the dfa. */
818	err = create_initial_state (dfa);
819
820	/* Release work areas. */
821	free_workarea_compile (preg);
822	re_string_destruct (&regexp);
823
824	if (BE (err != REG_NOERROR, 0))
825	{
826	free_dfa_content (dfa);
827	preg->buffer = NULL;
828	preg->allocated = 0;
829	}
830
831	return err;
832	}
833
834	/* Initialize DFA. We use the length of the regular expression PAT_LEN
835	as the initial length of some arrays. */
836
837	static reg_errcode_t
838	init_dfa (re_dfa_t *dfa, size_t pat_len)
839	{
840	__re_size_t table_size;
841	#ifndef _LIBC
842	char *codeset_name;
843	#endif
844	#ifdef RE_ENABLE_I18N
845	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
846	#else
847	size_t max_i18n_object_size = 0;
848	#endif
849	size_t max_object_size =
850	MAX (sizeof (struct re_state_table_entry),
851	MAX (sizeof (re_token_t),
852	MAX (sizeof (re_node_set),
853	MAX (sizeof (regmatch_t),
854	max_i18n_object_size))));
855
856	memset (dfa, '\0', sizeof (re_dfa_t));
857
858	/* Force allocation of str_tree_storage the first time. */
859	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
860
861	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
862	calculation below, and for similar doubling calculations
863	elsewhere. And it's <= rather than <, because some of the
864	doubling calculations add 1 afterwards. */
865	if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
866	return REG_ESPACE;
867
868	dfa->nodes_alloc = pat_len + 1;
869	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
870
871	/* table_size = 2 ^ ceil(log pat_len) */
872	for (table_size = 1; ; table_size <<= 1)
873	if (table_size > pat_len)
874	break;
875
876	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
877	dfa->state_hash_mask = table_size - 1;
878
879	dfa->mb_cur_max = MB_CUR_MAX;
880	#ifdef _LIBC
881	if (dfa->mb_cur_max == 6
882	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
883	dfa->is_utf8 = 1;
884	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
885	!= 0);
886	#else
887	# ifdef HAVE_LANGINFO_CODESET
888	codeset_name = nl_langinfo (CODESET);
889	# else
890	codeset_name = getenv ("LC_ALL");
891	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
892	codeset_name = getenv ("LC_CTYPE");
893	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
894	codeset_name = getenv ("LANG");
895	if (codeset_name == NULL)
896	codeset_name = "";
897	else if (strchr (codeset_name, '.') != NULL)
898	codeset_name = strchr (codeset_name, '.') + 1;
899	# endif
900
901	if (strcasecmp (codeset_name, "UTF-8") == 0
902	\|\| strcasecmp (codeset_name, "UTF8") == 0)
903	dfa->is_utf8 = 1;
904
905	/* We check exhaustively in the loop below if this charset is a
906	superset of ASCII. */
907	dfa->map_notascii = 0;
908	#endif
909
910	#ifdef RE_ENABLE_I18N
911	if (dfa->mb_cur_max > 1)
912	{
913	if (dfa->is_utf8)
914	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
915	else
916	{
917	int i, j, ch;
918
919	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
920	if (BE (dfa->sb_char == NULL, 0))
921	return REG_ESPACE;
922
923	/* Set the bits corresponding to single byte chars. */
924	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
925	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926	{
927	wint_t wch = __btowc (ch);
928	if (wch != WEOF)
929	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
930	# ifndef _LIBC
931	if (isascii (ch) && wch != ch)
932	dfa->map_notascii = 1;
933	# endif
934	}
935	}
936	}
937	#endif
938
939	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
940	return REG_ESPACE;
941	return REG_NOERROR;
942	}
943
944	/* Initialize WORD_CHAR table, which indicate which character is
945	"word". In this case "word" means that it is the word construction
946	character used by some operators like "\<", "\>", etc. */
947
948	static void
949	internal_function
950	init_word_char (re_dfa_t *dfa)
951	{
952	int i, j, ch;
953	dfa->word_ops_used = 1;
954	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
955	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
956	if (isalnum (ch) \|\| ch == '_')
957	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
958	}
959
960	/* Free the work area which are only used while compiling. */
961
962	static void
963	free_workarea_compile (regex_t *preg)
964	{
965	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
966	bin_tree_storage_t storage, next;
967	for (storage = dfa->str_tree_storage; storage; storage = next)
968	{
969	next = storage->next;
970	re_free (storage);
971	}
972	dfa->str_tree_storage = NULL;
973	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
974	dfa->str_tree = NULL;
975	re_free (dfa->org_indices);
976	dfa->org_indices = NULL;
977	}
978
979	/* Create initial states for all contexts. */
980
981	static reg_errcode_t
982	create_initial_state (re_dfa_t *dfa)
983	{
984	Idx first, i;
985	reg_errcode_t err;
986	re_node_set init_nodes;
987
988	/* Initial states have the epsilon closure of the node which is
989	the first node of the regular expression. */
990	first = dfa->str_tree->first->node_idx;
991	dfa->init_node = first;
992	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
993	if (BE (err != REG_NOERROR, 0))
994	return err;
995
996	/* The back-references which are in initial states can epsilon transit,
997	since in this case all of the subexpressions can be null.
998	Then we add epsilon closures of the nodes which are the next nodes of
999	the back-references. */
1000	if (dfa->nbackref > 0)
1001	for (i = 0; i < init_nodes.nelem; ++i)
1002	{
1003	Idx node_idx = init_nodes.elems[i];
1004	re_token_type_t type = dfa->nodes[node_idx].type;
1005
1006	Idx clexp_idx;
1007	if (type != OP_BACK_REF)
1008	continue;
1009	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010	{
1011	re_token_t *clexp_node;
1012	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1013	if (clexp_node->type == OP_CLOSE_SUBEXP
1014	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1015	break;
1016	}
1017	if (clexp_idx == init_nodes.nelem)
1018	continue;
1019
1020	if (type == OP_BACK_REF)
1021	{
1022	Idx dest_idx = dfa->edests[node_idx].elems[0];
1023	if (!re_node_set_contains (&init_nodes, dest_idx))
1024	{
1025	re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026	i = 0;
1027	}
1028	}
1029	}
1030
1031	/* It must be the first time to invoke acquire_state. */
1032	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1033	/* We don't check ERR here, since the initial state must not be NULL. */
1034	if (BE (dfa->init_state == NULL, 0))
1035	return err;
1036	if (dfa->init_state->has_constraint)
1037	{
1038	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1039	CONTEXT_WORD);
1040	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1041	CONTEXT_NEWLINE);
1042	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1043	&init_nodes,
1044	CONTEXT_NEWLINE
1045	\| CONTEXT_BEGBUF);
1046	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
1047	\|\| dfa->init_state_begbuf == NULL, 0))
1048	return err;
1049	}
1050	else
1051	dfa->init_state_word = dfa->init_state_nl
1052	= dfa->init_state_begbuf = dfa->init_state;
1053
1054	re_node_set_free (&init_nodes);
1055	return REG_NOERROR;
1056	}
1057
1058
1059	#ifdef RE_ENABLE_I18N
1060	/* If it is possible to do searching in single byte encoding instead of UTF-8
1061	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1062	DFA nodes where needed. */
1063
1064	static void
1065	optimize_utf8 (re_dfa_t *dfa)
1066	{
1067	Idx node;
1068	int i;
1069	bool mb_chars = false;
1070	bool has_period = false;
1071
1072	for (node = 0; node < dfa->nodes_len; ++node)
1073	switch (dfa->nodes[node].type)
1074	{
1075	case CHARACTER:
1076	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1077	mb_chars = true;
1078	break;
1079	case ANCHOR:
1080	switch (dfa->nodes[node].opr.idx)
1081	{
1082	case LINE_FIRST:
1083	case LINE_LAST:
1084	case BUF_FIRST:
1085	case BUF_LAST:
1086	break;
1087	default:
1088	/* Word anchors etc. cannot be handled. */
1089	return;
1090	}
1091	break;
1092	case OP_PERIOD:
1093	has_period = true;
1094	break;
1095	case OP_BACK_REF:
1096	case OP_ALT:
1097	case END_OF_RE:
1098	case OP_DUP_ASTERISK:
1099	case OP_OPEN_SUBEXP:
1100	case OP_CLOSE_SUBEXP:
1101	break;
1102	case COMPLEX_BRACKET:
1103	return;
1104	case SIMPLE_BRACKET:
1105	/* Just double check. */
1106	{
1107	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1108	? 0
1109	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1110	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1111	{
1112	if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1113	return;
1114	rshift = 0;
1115	}
1116	}
1117	break;
1118	default:
1119	abort ();
1120	}
1121
1122	if (mb_chars \|\| has_period)
1123	for (node = 0; node < dfa->nodes_len; ++node)
1124	{
1125	if (dfa->nodes[node].type == CHARACTER
1126	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
1127	dfa->nodes[node].mb_partial = 0;
1128	else if (dfa->nodes[node].type == OP_PERIOD)
1129	dfa->nodes[node].type = OP_UTF8_PERIOD;
1130	}
1131
1132	/* The search can be in single byte locale. */
1133	dfa->mb_cur_max = 1;
1134	dfa->is_utf8 = 0;
1135	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
1136	}
1137	#endif
1138
1139
1140	/* Analyze the structure tree, and calculate "first", "next", "edest",
1141	"eclosure", and "inveclosure". */
1142
1143	static reg_errcode_t
1144	analyze (regex_t *preg)
1145	{
1146	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
1147	reg_errcode_t ret;
1148
1149	/* Allocate arrays. */
1150	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1151	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1152	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1153	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1154	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
1155	\|\| dfa->eclosures == NULL, 0))
1156	return REG_ESPACE;
1157
1158	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1159	if (dfa->subexp_map != NULL)
1160	{
1161	Idx i;
1162	for (i = 0; i < preg->re_nsub; i++)
1163	dfa->subexp_map[i] = i;
1164	preorder (dfa->str_tree, optimize_subexps, dfa);
1165	for (i = 0; i < preg->re_nsub; i++)
1166	if (dfa->subexp_map[i] != i)
1167	break;
1168	if (i == preg->re_nsub)
1169	{
1170	free (dfa->subexp_map);
1171	dfa->subexp_map = NULL;
1172	}
1173	}
1174
1175	ret = postorder (dfa->str_tree, lower_subexps, preg);
1176	if (BE (ret != REG_NOERROR, 0))
1177	return ret;
1178	ret = postorder (dfa->str_tree, calc_first, dfa);
1179	if (BE (ret != REG_NOERROR, 0))
1180	return ret;
1181	preorder (dfa->str_tree, calc_next, dfa);
1182	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1183	if (BE (ret != REG_NOERROR, 0))
1184	return ret;
1185	ret = calc_eclosure (dfa);
1186	if (BE (ret != REG_NOERROR, 0))
1187	return ret;
1188
1189	/* We only need this during the prune_impossible_nodes pass in regexec.c;
1190	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1191	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1192	\|\| dfa->nbackref)
1193	{
1194	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1195	if (BE (dfa->inveclosures == NULL, 0))
1196	return REG_ESPACE;
1197	ret = calc_inveclosure (dfa);
1198	}
1199
1200	return ret;
1201	}
1202
1203	/* Our parse trees are very unbalanced, so we cannot use a stack to
1204	implement parse tree visits. Instead, we use parent pointers and
1205	some hairy code in these two functions. */
1206	static reg_errcode_t
1207	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1208	void *extra)
1209	{
1210	bin_tree_t node, prev;
1211
1212	for (node = root; ; )
1213	{
1214	/* Descend down the tree, preferably to the left (or to the right
1215	if that's the only child). */
1216	while (node->left \|\| node->right)
1217	if (node->left)
1218	node = node->left;
1219	else
1220	node = node->right;
1221
1222	do
1223	{
1224	reg_errcode_t err = fn (extra, node);
1225	if (BE (err != REG_NOERROR, 0))
1226	return err;
1227	if (node->parent == NULL)
1228	return REG_NOERROR;
1229	prev = node;
1230	node = node->parent;
1231	}
1232	/* Go up while we have a node that is reached from the right. */
1233	while (node->right == prev \|\| node->right == NULL);
1234	node = node->right;
1235	}
1236	}
1237
1238	static reg_errcode_t
1239	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
1240	void *extra)
1241	{
1242	bin_tree_t *node;
1243
1244	for (node = root; ; )
1245	{
1246	reg_errcode_t err = fn (extra, node);
1247	if (BE (err != REG_NOERROR, 0))
1248	return err;
1249
1250	/* Go to the left node, or up and to the right. */
1251	if (node->left)
1252	node = node->left;
1253	else
1254	{
1255	bin_tree_t *prev = NULL;
1256	while (node->right == prev \|\| node->right == NULL)
1257	{
1258	prev = node;
1259	node = node->parent;
1260	if (!node)
1261	return REG_NOERROR;
1262	}
1263	node = node->right;
1264	}
1265	}
1266	}
1267
1268	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1269	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1270	backreferences as well. Requires a preorder visit. */
1271	static reg_errcode_t
1272	optimize_subexps (void extra, bin_tree_t node)
1273	{
1274	re_dfa_t dfa = (re_dfa_t ) extra;
1275
1276	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1277	{
1278	int idx = node->token.opr.idx;
1279	node->token.opr.idx = dfa->subexp_map[idx];
1280	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
1281	}
1282
1283	else if (node->token.type == SUBEXP
1284	&& node->left && node->left->token.type == SUBEXP)
1285	{
1286	Idx other_idx = node->left->token.opr.idx;
1287
1288	node->left = node->left->left;
1289	if (node->left)
1290	node->left->parent = node;
1291
1292	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1293	if (other_idx < BITSET_WORD_BITS)
1294	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1295	}
1296
1297	return REG_NOERROR;
1298	}
1299
1300	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1301	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1302	static reg_errcode_t
1303	lower_subexps (void extra, bin_tree_t node)
1304	{
1305	regex_t preg = (regex_t ) extra;
1306	reg_errcode_t err = REG_NOERROR;
1307
1308	if (node->left && node->left->token.type == SUBEXP)
1309	{
1310	node->left = lower_subexp (&err, preg, node->left);
1311	if (node->left)
1312	node->left->parent = node;
1313	}
1314	if (node->right && node->right->token.type == SUBEXP)
1315	{
1316	node->right = lower_subexp (&err, preg, node->right);
1317	if (node->right)
1318	node->right->parent = node;
1319	}
1320
1321	return err;
1322	}
1323
1324	static bin_tree_t *
1325	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
1326	{
1327	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
1328	bin_tree_t *body = node->left;
1329	bin_tree_t op, cls, tree1, tree;
1330
1331	if (preg->no_sub
1332	/* We do not optimize empty subexpressions, because otherwise we may
1333	have bad CONCAT nodes with NULL children. This is obviously not
1334	very common, so we do not lose much. An example that triggers
1335	this case is the sed "script" //x. */
1336	&& node->left != NULL
1337	&& (node->token.opr.idx >= BITSET_WORD_BITS
1338	\|\| !(dfa->used_bkref_map
1339	& ((bitset_word_t) 1 << node->token.opr.idx))))
1340	return node->left;
1341
1342	/* Convert the SUBEXP node to the concatenation of an
1343	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1344	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1345	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1346	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1347	tree = create_tree (dfa, op, tree1, CONCAT);
1348	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
1349	{
1350	*err = REG_ESPACE;
1351	return NULL;
1352	}
1353
1354	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1355	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1356	return tree;
1357	}
1358
1359	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1360	nodes. Requires a postorder visit. */
1361	static reg_errcode_t
1362	calc_first (void extra, bin_tree_t node)
1363	{
1364	re_dfa_t dfa = (re_dfa_t ) extra;
1365	if (node->token.type == CONCAT)
1366	{
1367	node->first = node->left->first;
1368	node->node_idx = node->left->node_idx;
1369	}
1370	else
1371	{
1372	node->first = node;
1373	node->node_idx = re_dfa_add_node (dfa, node->token);
1374	if (BE (node->node_idx == REG_MISSING, 0))
1375	return REG_ESPACE;
1376	}
1377	return REG_NOERROR;
1378	}
1379
1380	/* Pass 2: compute NEXT on the tree. Preorder visit. */
1381	static reg_errcode_t
1382	calc_next (void extra, bin_tree_t node)
1383	{
1384	switch (node->token.type)
1385	{
1386	case OP_DUP_ASTERISK:
1387	node->left->next = node;
1388	break;
1389	case CONCAT:
1390	node->left->next = node->right->first;
1391	node->right->next = node->next;
1392	break;
1393	default:
1394	if (node->left)
1395	node->left->next = node->next;
1396	if (node->right)
1397	node->right->next = node->next;
1398	break;
1399	}
1400	return REG_NOERROR;
1401	}
1402
1403	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404	static reg_errcode_t
1405	link_nfa_nodes (void extra, bin_tree_t node)
1406	{
1407	re_dfa_t dfa = (re_dfa_t ) extra;
1408	Idx idx = node->node_idx;
1409	reg_errcode_t err = REG_NOERROR;
1410
1411	switch (node->token.type)
1412	{
1413	case CONCAT:
1414	break;
1415
1416	case END_OF_RE:
1417	assert (node->next == NULL);
1418	break;
1419
1420	case OP_DUP_ASTERISK:
1421	case OP_ALT:
1422	{
1423	Idx left, right;
1424	dfa->has_plural_match = 1;
1425	if (node->left != NULL)
1426	left = node->left->first->node_idx;
1427	else
1428	left = node->next->node_idx;
1429	if (node->right != NULL)
1430	right = node->right->first->node_idx;
1431	else
1432	right = node->next->node_idx;
1433	assert (REG_VALID_INDEX (left));
1434	assert (REG_VALID_INDEX (right));
1435	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436	}
1437	break;
1438
1439	case ANCHOR:
1440	case OP_OPEN_SUBEXP:
1441	case OP_CLOSE_SUBEXP:
1442	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443	break;
1444
1445	case OP_BACK_REF:
1446	dfa->nexts[idx] = node->next->node_idx;
1447	if (node->token.type == OP_BACK_REF)
1448	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1449	break;
1450
1451	default:
1452	assert (!IS_EPSILON_NODE (node->token.type));
1453	dfa->nexts[idx] = node->next->node_idx;
1454	break;
1455	}
1456
1457	return err;
1458	}
1459
1460	/* Duplicate the epsilon closure of the node ROOT_NODE.
1461	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462	to their own constraint. */
1463
1464	static reg_errcode_t
1465	internal_function
1466	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1467	Idx root_node, unsigned int init_constraint)
1468	{
1469	Idx org_node, clone_node;
1470	bool ok;
1471	unsigned int constraint = init_constraint;
1472	for (org_node = top_org_node, clone_node = top_clone_node;;)
1473	{
1474	Idx org_dest, clone_dest;
1475	if (dfa->nodes[org_node].type == OP_BACK_REF)
1476	{
1477	/* If the back reference epsilon-transit, its destination must
1478	also have the constraint. Then duplicate the epsilon closure
1479	of the destination of the back reference, and store it in
1480	edests of the back reference. */
1481	org_dest = dfa->nexts[org_node];
1482	re_node_set_empty (dfa->edests + clone_node);
1483	clone_dest = duplicate_node (dfa, org_dest, constraint);
1484	if (BE (clone_dest == REG_MISSING, 0))
1485	return REG_ESPACE;
1486	dfa->nexts[clone_node] = dfa->nexts[org_node];
1487	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488	if (BE (! ok, 0))
1489	return REG_ESPACE;
1490	}
1491	else if (dfa->edests[org_node].nelem == 0)
1492	{
1493	/* In case of the node can't epsilon-transit, don't duplicate the
1494	destination and store the original destination as the
1495	destination of the node. */
1496	dfa->nexts[clone_node] = dfa->nexts[org_node];
1497	break;
1498	}
1499	else if (dfa->edests[org_node].nelem == 1)
1500	{
1501	/* In case of the node can epsilon-transit, and it has only one
1502	destination. */
1503	org_dest = dfa->edests[org_node].elems[0];
1504	re_node_set_empty (dfa->edests + clone_node);
1505	if (dfa->nodes[org_node].type == ANCHOR)
1506	{
1507	/* In case of the node has another constraint, append it. */
1508	if (org_node == root_node && clone_node != org_node)
1509	{
1510	/* ...but if the node is root_node itself, it means the
1511	epsilon closure have a loop, then tie it to the
1512	destination of the root_node. */
1513	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1514	if (BE (! ok, 0))
1515	return REG_ESPACE;
1516	break;
1517	}
1518	constraint \|= dfa->nodes[org_node].opr.ctx_type;
1519	}
1520	clone_dest = duplicate_node (dfa, org_dest, constraint);
1521	if (BE (clone_dest == REG_MISSING, 0))
1522	return REG_ESPACE;
1523	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524	if (BE (! ok, 0))
1525	return REG_ESPACE;
1526	}
1527	else /* dfa->edests[org_node].nelem == 2 */
1528	{
1529	/* In case of the node can epsilon-transit, and it has two
1530	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1531	org_dest = dfa->edests[org_node].elems[0];
1532	re_node_set_empty (dfa->edests + clone_node);
1533	/* Search for a duplicated node which satisfies the constraint. */
1534	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535	if (clone_dest == REG_MISSING)
1536	{
1537	/* There are no such a duplicated node, create a new one. */
1538	reg_errcode_t err;
1539	clone_dest = duplicate_node (dfa, org_dest, constraint);
1540	if (BE (clone_dest == REG_MISSING, 0))
1541	return REG_ESPACE;
1542	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543	if (BE (! ok, 0))
1544	return REG_ESPACE;
1545	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1546	root_node, constraint);
1547	if (BE (err != REG_NOERROR, 0))
1548	return err;
1549	}
1550	else
1551	{
1552	/* There are a duplicated node which satisfy the constraint,
1553	use it to avoid infinite loop. */
1554	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555	if (BE (! ok, 0))
1556	return REG_ESPACE;
1557	}
1558
1559	org_dest = dfa->edests[org_node].elems[1];
1560	clone_dest = duplicate_node (dfa, org_dest, constraint);
1561	if (BE (clone_dest == REG_MISSING, 0))
1562	return REG_ESPACE;
1563	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564	if (BE (! ok, 0))
1565	return REG_ESPACE;
1566	}
1567	org_node = org_dest;
1568	clone_node = clone_dest;
1569	}
1570	return REG_NOERROR;
1571	}
1572
1573	/* Search for a node which is duplicated from the node ORG_NODE, and
1574	satisfies the constraint CONSTRAINT. */
1575
1576	static Idx
1577	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1578	unsigned int constraint)
1579	{
1580	Idx idx;
1581	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1582	{
1583	if (org_node == dfa->org_indices[idx]
1584	&& constraint == dfa->nodes[idx].constraint)
1585	return idx; /* Found. */
1586	}
1587	return REG_MISSING; /* Not found. */
1588	}
1589
1590	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591	Return the index of the new node, or REG_MISSING if insufficient storage is
1592	available. */
1593
1594	static Idx
1595	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1596	{
1597	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1598	if (BE (dup_idx != REG_MISSING, 1))
1599	{
1600	dfa->nodes[dup_idx].constraint = constraint;
1601	if (dfa->nodes[org_idx].type == ANCHOR)
1602	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].opr.ctx_type;
1603	dfa->nodes[dup_idx].duplicated = 1;
1604
1605	/* Store the index of the original node. */
1606	dfa->org_indices[dup_idx] = org_idx;
1607	}
1608	return dup_idx;
1609	}
1610
1611	static reg_errcode_t
1612	calc_inveclosure (re_dfa_t *dfa)
1613	{
1614	Idx src, idx;
1615	bool ok;
1616	for (idx = 0; idx < dfa->nodes_len; ++idx)
1617	re_node_set_init_empty (dfa->inveclosures + idx);
1618
1619	for (src = 0; src < dfa->nodes_len; ++src)
1620	{
1621	Idx *elems = dfa->eclosures[src].elems;
1622	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623	{
1624	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625	if (BE (! ok, 0))
1626	return REG_ESPACE;
1627	}
1628	}
1629
1630	return REG_NOERROR;
1631	}
1632
1633	/* Calculate "eclosure" for all the node in DFA. */
1634
1635	static reg_errcode_t
1636	calc_eclosure (re_dfa_t *dfa)
1637	{
1638	Idx node_idx;
1639	bool incomplete;
1640	#ifdef DEBUG
1641	assert (dfa->nodes_len > 0);
1642	#endif
1643	incomplete = false;
1644	/* For each nodes, calculate epsilon closure. */
1645	for (node_idx = 0; ; ++node_idx)
1646	{
1647	reg_errcode_t err;
1648	re_node_set eclosure_elem;
1649	if (node_idx == dfa->nodes_len)
1650	{
1651	if (!incomplete)
1652	break;
1653	incomplete = false;
1654	node_idx = 0;
1655	}
1656
1657	#ifdef DEBUG
1658	assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659	#endif
1660
1661	/* If we have already calculated, skip it. */
1662	if (dfa->eclosures[node_idx].nelem != 0)
1663	continue;
1664	/* Calculate epsilon closure of `node_idx'. */
1665	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666	if (BE (err != REG_NOERROR, 0))
1667	return err;
1668
1669	if (dfa->eclosures[node_idx].nelem == 0)
1670	{
1671	incomplete = true;
1672	re_node_set_free (&eclosure_elem);
1673	}
1674	}
1675	return REG_NOERROR;
1676	}
1677
1678	/* Calculate epsilon closure of NODE. */
1679
1680	static reg_errcode_t
1681	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
1682	{
1683	reg_errcode_t err;
1684	unsigned int constraint;
1685	Idx i;
1686	bool incomplete;
1687	bool ok;
1688	re_node_set eclosure;
1689	incomplete = false;
1690	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691	if (BE (err != REG_NOERROR, 0))
1692	return err;
1693
1694	/* This indicates that we are calculating this node now.
1695	We reference this value to avoid infinite loop. */
1696	dfa->eclosures[node].nelem = REG_MISSING;
1697
1698	constraint = ((dfa->nodes[node].type == ANCHOR)
1699	? dfa->nodes[node].opr.ctx_type : 0);
1700	/* If the current node has constraints, duplicate all nodes.
1701	Since they must inherit the constraints. */
1702	if (constraint
1703	&& dfa->edests[node].nelem
1704	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1705	{
1706	err = duplicate_node_closure (dfa, node, node, node, constraint);
1707	if (BE (err != REG_NOERROR, 0))
1708	return err;
1709	}
1710
1711	/* Expand each epsilon destination nodes. */
1712	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713	for (i = 0; i < dfa->edests[node].nelem; ++i)
1714	{
1715	re_node_set eclosure_elem;
1716	Idx edest = dfa->edests[node].elems[i];
1717	/* If calculating the epsilon closure of `edest' is in progress,
1718	return intermediate result. */
1719	if (dfa->eclosures[edest].nelem == REG_MISSING)
1720	{
1721	incomplete = true;
1722	continue;
1723	}
1724	/* If we haven't calculated the epsilon closure of `edest' yet,
1725	calculate now. Otherwise use calculated epsilon closure. */
1726	if (dfa->eclosures[edest].nelem == 0)
1727	{
1728	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1729	if (BE (err != REG_NOERROR, 0))
1730	return err;
1731	}
1732	else
1733	eclosure_elem = dfa->eclosures[edest];
1734	/* Merge the epsilon closure of `edest'. */
1735	re_node_set_merge (&eclosure, &eclosure_elem);
1736	/* If the epsilon closure of `edest' is incomplete,
1737	the epsilon closure of this node is also incomplete. */
1738	if (dfa->eclosures[edest].nelem == 0)
1739	{
1740	incomplete = true;
1741	re_node_set_free (&eclosure_elem);
1742	}
1743	}
1744
1745	/* Epsilon closures include itself. */
1746	ok = re_node_set_insert (&eclosure, node);
1747	if (BE (! ok, 0))
1748	return REG_ESPACE;
1749	if (incomplete && !root)
1750	dfa->eclosures[node].nelem = 0;
1751	else
1752	dfa->eclosures[node] = eclosure;
1753	*new_set = eclosure;
1754	return REG_NOERROR;
1755	}
1756
1757
1758	/* Functions for token which are used in the parser. */
1759
1760	/* Fetch a token from INPUT.
1761	We must not use this function inside bracket expressions. */
1762
1763	static void
1764	internal_function
1765	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
1766	{
1767	re_string_skip_bytes (input, peek_token (result, input, syntax));
1768	}
1769
1770	/* Peek a token from INPUT, and return the length of the token.
1771	We must not use this function inside bracket expressions. */
1772
1773	static int
1774	internal_function
1775	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
1776	{
1777	unsigned char c;
1778
1779	if (re_string_eoi (input))
1780	{
1781	token->type = END_OF_RE;
1782	return 0;
1783	}
1784
1785	c = re_string_peek_byte (input, 0);
1786	token->opr.c = c;
1787
1788	token->word_char = 0;
1789	#ifdef RE_ENABLE_I18N
1790	token->mb_partial = 0;
1791	if (input->mb_cur_max > 1 &&
1792	!re_string_first_byte (input, re_string_cur_idx (input)))
1793	{
1794	token->type = CHARACTER;
1795	token->mb_partial = 1;
1796	return 1;
1797	}
1798	#endif
1799	if (c == '\\')
1800	{
1801	unsigned char c2;
1802	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803	{
1804	token->type = BACK_SLASH;
1805	return 1;
1806	}
1807
1808	c2 = re_string_peek_byte_case (input, 1);
1809	token->opr.c = c2;
1810	token->type = CHARACTER;
1811	#ifdef RE_ENABLE_I18N
1812	if (input->mb_cur_max > 1)
1813	{
1814	wint_t wc = re_string_wchar_at (input,
1815	re_string_cur_idx (input) + 1);
1816	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817	}
1818	else
1819	#endif
1820	token->word_char = IS_WORD_CHAR (c2) != 0;
1821
1822	switch (c2)
1823	{
1824	case '\|':
1825	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1826	token->type = OP_ALT;
1827	break;
1828	case '1': case '2': case '3': case '4': case '5':
1829	case '6': case '7': case '8': case '9':
1830	if (!(syntax & RE_NO_BK_REFS))
1831	{
1832	token->type = OP_BACK_REF;
1833	token->opr.idx = c2 - '1';
1834	}
1835	break;
1836	case '<':
1837	if (!(syntax & RE_NO_GNU_OPS))
1838	{
1839	token->type = ANCHOR;
1840	token->opr.ctx_type = WORD_FIRST;
1841	}
1842	break;
1843	case '>':
1844	if (!(syntax & RE_NO_GNU_OPS))
1845	{
1846	token->type = ANCHOR;
1847	token->opr.ctx_type = WORD_LAST;
1848	}
1849	break;
1850	case 'b':
1851	if (!(syntax & RE_NO_GNU_OPS))
1852	{
1853	token->type = ANCHOR;
1854	token->opr.ctx_type = WORD_DELIM;
1855	}
1856	break;
1857	case 'B':
1858	if (!(syntax & RE_NO_GNU_OPS))
1859	{
1860	token->type = ANCHOR;
1861	token->opr.ctx_type = NOT_WORD_DELIM;
1862	}
1863	break;
1864	case 'w':
1865	if (!(syntax & RE_NO_GNU_OPS))
1866	token->type = OP_WORD;
1867	break;
1868	case 'W':
1869	if (!(syntax & RE_NO_GNU_OPS))
1870	token->type = OP_NOTWORD;
1871	break;
1872	case 's':
1873	if (!(syntax & RE_NO_GNU_OPS))
1874	token->type = OP_SPACE;
1875	break;
1876	case 'S':
1877	if (!(syntax & RE_NO_GNU_OPS))
1878	token->type = OP_NOTSPACE;
1879	break;
1880	case '`':
1881	if (!(syntax & RE_NO_GNU_OPS))
1882	{
1883	token->type = ANCHOR;
1884	token->opr.ctx_type = BUF_FIRST;
1885	}
1886	break;
1887	case '\'':
1888	if (!(syntax & RE_NO_GNU_OPS))
1889	{
1890	token->type = ANCHOR;
1891	token->opr.ctx_type = BUF_LAST;
1892	}
1893	break;
1894	case '(':
1895	if (!(syntax & RE_NO_BK_PARENS))
1896	token->type = OP_OPEN_SUBEXP;
1897	break;
1898	case ')':
1899	if (!(syntax & RE_NO_BK_PARENS))
1900	token->type = OP_CLOSE_SUBEXP;
1901	break;
1902	case '+':
1903	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904	token->type = OP_DUP_PLUS;
1905	break;
1906	case '?':
1907	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908	token->type = OP_DUP_QUESTION;
1909	break;
1910	case '{':
1911	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912	token->type = OP_OPEN_DUP_NUM;
1913	break;
1914	case '}':
1915	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916	token->type = OP_CLOSE_DUP_NUM;
1917	break;
1918	default:
1919	break;
1920	}
1921	return 2;
1922	}
1923
1924	token->type = CHARACTER;
1925	#ifdef RE_ENABLE_I18N
1926	if (input->mb_cur_max > 1)
1927	{
1928	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1929	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930	}
1931	else
1932	#endif
1933	token->word_char = IS_WORD_CHAR (token->opr.c);
1934
1935	switch (c)
1936	{
1937	case '\n':
1938	if (syntax & RE_NEWLINE_ALT)
1939	token->type = OP_ALT;
1940	break;
1941	case '\|':
1942	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1943	token->type = OP_ALT;
1944	break;
1945	case '*':
1946	token->type = OP_DUP_ASTERISK;
1947	break;
1948	case '+':
1949	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950	token->type = OP_DUP_PLUS;
1951	break;
1952	case '?':
1953	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954	token->type = OP_DUP_QUESTION;
1955	break;
1956	case '{':
1957	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958	token->type = OP_OPEN_DUP_NUM;
1959	break;
1960	case '}':
1961	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962	token->type = OP_CLOSE_DUP_NUM;
1963	break;
1964	case '(':
1965	if (syntax & RE_NO_BK_PARENS)
1966	token->type = OP_OPEN_SUBEXP;
1967	break;
1968	case ')':
1969	if (syntax & RE_NO_BK_PARENS)
1970	token->type = OP_CLOSE_SUBEXP;
1971	break;
1972	case '[':
1973	token->type = OP_OPEN_BRACKET;
1974	break;
1975	case '.':
1976	token->type = OP_PERIOD;
1977	break;
1978	case '^':
1979	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
1980	re_string_cur_idx (input) != 0)
1981	{
1982	char prev = re_string_peek_byte (input, -1);
1983	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
1984	break;
1985	}
1986	token->type = ANCHOR;
1987	token->opr.ctx_type = LINE_FIRST;
1988	break;
1989	case '$':
1990	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1991	re_string_cur_idx (input) + 1 != re_string_length (input))
1992	{
1993	re_token_t next;
1994	re_string_skip_bytes (input, 1);
1995	peek_token (&next, input, syntax);
1996	re_string_skip_bytes (input, -1);
1997	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998	break;
1999	}
2000	token->type = ANCHOR;
2001	token->opr.ctx_type = LINE_LAST;
2002	break;
2003	default:
2004	break;
2005	}
2006	return 1;
2007	}
2008
2009	/* Peek a token from INPUT, and return the length of the token.
2010	We must not use this function out of bracket expressions. */
2011
2012	static int
2013	internal_function
2014	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
2015	{
2016	unsigned char c;
2017	if (re_string_eoi (input))
2018	{
2019	token->type = END_OF_RE;
2020	return 0;
2021	}
2022	c = re_string_peek_byte (input, 0);
2023	token->opr.c = c;
2024
2025	#ifdef RE_ENABLE_I18N
2026	if (input->mb_cur_max > 1 &&
2027	!re_string_first_byte (input, re_string_cur_idx (input)))
2028	{
2029	token->type = CHARACTER;
2030	return 1;
2031	}
2032	#endif /* RE_ENABLE_I18N */
2033
2034	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2035	&& re_string_cur_idx (input) + 1 < re_string_length (input))
2036	{
2037	/* In this case, '\' escape a character. */
2038	unsigned char c2;
2039	re_string_skip_bytes (input, 1);
2040	c2 = re_string_peek_byte (input, 0);
2041	token->opr.c = c2;
2042	token->type = CHARACTER;
2043	return 1;
2044	}
2045	if (c == '[') /* '[' is a special char in a bracket exps. */
2046	{
2047	unsigned char c2;
2048	int token_len;
2049	if (re_string_cur_idx (input) + 1 < re_string_length (input))
2050	c2 = re_string_peek_byte (input, 1);
2051	else
2052	c2 = 0;
2053	token->opr.c = c2;
2054	token_len = 2;
2055	switch (c2)
2056	{
2057	case '.':
2058	token->type = OP_OPEN_COLL_ELEM;
2059	break;
2060	case '=':
2061	token->type = OP_OPEN_EQUIV_CLASS;
2062	break;
2063	case ':':
2064	if (syntax & RE_CHAR_CLASSES)
2065	{
2066	token->type = OP_OPEN_CHAR_CLASS;
2067	break;
2068	}
2069	/* else fall through. */
2070	default:
2071	token->type = CHARACTER;
2072	token->opr.c = c;
2073	token_len = 1;
2074	break;
2075	}
2076	return token_len;
2077	}
2078	switch (c)
2079	{
2080	case '-':
2081	token->type = OP_CHARSET_RANGE;
2082	break;
2083	case ']':
2084	token->type = OP_CLOSE_BRACKET;
2085	break;
2086	case '^':
2087	token->type = OP_NON_MATCH_LIST;
2088	break;
2089	default:
2090	token->type = CHARACTER;
2091	}
2092	return 1;
2093	}
2094
2095
2096	/* Functions for parser. */
2097
2098	/* Entry point of the parser.
2099	Parse the regular expression REGEXP and return the structure tree.
2100	If an error is occured, ERR is set by error code, and return NULL.
2101	This function build the following tree, from regular expression <reg_exp>:
2102	CAT
2103	/ \
2104	/ \
2105	<reg_exp> EOR
2106
2107	CAT means concatenation.
2108	EOR means end of regular expression. */
2109
2110	static bin_tree_t *
2111	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
2112	reg_errcode_t *err)
2113	{
2114	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2115	bin_tree_t tree, eor, *root;
2116	re_token_t current_token;
2117	dfa->syntax = syntax;
2118	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2119	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2120	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121	return NULL;
2122	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2123	if (tree != NULL)
2124	root = create_tree (dfa, tree, eor, CONCAT);
2125	else
2126	root = eor;
2127	if (BE (eor == NULL \|\| root == NULL, 0))
2128	{
2129	*err = REG_ESPACE;
2130	return NULL;
2131	}
2132	return root;
2133	}
2134
2135	/* This function build the following tree, from regular expression
2136	<branch1>\|<branch2>:
2137	ALT
2138	/ \
2139	/ \
2140	<branch1> <branch2>
2141
2142	ALT means alternative, which represents the operator `\|'. */
2143
2144	static bin_tree_t *
2145	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2146	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2147	{
2148	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2149	bin_tree_t tree, branch = NULL;
2150	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2151	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152	return NULL;
2153
2154	while (token->type == OP_ALT)
2155	{
2156	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2157	if (token->type != OP_ALT && token->type != END_OF_RE
2158	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2159	{
2160	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2161	if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162	return NULL;
2163	}
2164	else
2165	branch = NULL;
2166	tree = create_tree (dfa, tree, branch, OP_ALT);
2167	if (BE (tree == NULL, 0))
2168	{
2169	*err = REG_ESPACE;
2170	return NULL;
2171	}
2172	}
2173	return tree;
2174	}
2175
2176	/* This function build the following tree, from regular expression
2177	<exp1><exp2>:
2178	CAT
2179	/ \
2180	/ \
2181	<exp1> <exp2>
2182
2183	CAT means concatenation. */
2184
2185	static bin_tree_t *
2186	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
2187	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2188	{
2189	bin_tree_t tree, expr;
2190	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2191	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193	return NULL;
2194
2195	while (token->type != OP_ALT && token->type != END_OF_RE
2196	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2197	{
2198	expr = parse_expression (regexp, preg, token, syntax, nest, err);
2199	if (BE (*err != REG_NOERROR && expr == NULL, 0))
2200	{
2201	return NULL;
2202	}
2203	if (tree != NULL && expr != NULL)
2204	{
2205	tree = create_tree (dfa, tree, expr, CONCAT);
2206	if (tree == NULL)
2207	{
2208	*err = REG_ESPACE;
2209	return NULL;
2210	}
2211	}
2212	else if (tree == NULL)
2213	tree = expr;
2214	/* Otherwise expr == NULL, we don't need to create new tree. */
2215	}
2216	return tree;
2217	}
2218
2219	/* This function build the following tree, from regular expression a*:
2220	*
2221	\|
2222	a
2223	*/
2224
2225	static bin_tree_t *
2226	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
2227	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228	{
2229	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2230	bin_tree_t *tree;
2231	switch (token->type)
2232	{
2233	case CHARACTER:
2234	tree = create_token_tree (dfa, NULL, NULL, token);
2235	if (BE (tree == NULL, 0))
2236	{
2237	*err = REG_ESPACE;
2238	return NULL;
2239	}
2240	#ifdef RE_ENABLE_I18N
2241	if (dfa->mb_cur_max > 1)
2242	{
2243	while (!re_string_eoi (regexp)
2244	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2245	{
2246	bin_tree_t *mbc_remain;
2247	fetch_token (token, regexp, syntax);
2248	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2249	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2250	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
2251	{
2252	*err = REG_ESPACE;
2253	return NULL;
2254	}
2255	}
2256	}
2257	#endif
2258	break;
2259	case OP_OPEN_SUBEXP:
2260	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2261	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262	return NULL;
2263	break;
2264	case OP_OPEN_BRACKET:
2265	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2266	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267	return NULL;
2268	break;
2269	case OP_BACK_REF:
2270	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271	{
2272	*err = REG_ESUBREG;
2273	return NULL;
2274	}
2275	dfa->used_bkref_map \|= 1 << token->opr.idx;
2276	tree = create_token_tree (dfa, NULL, NULL, token);
2277	if (BE (tree == NULL, 0))
2278	{
2279	*err = REG_ESPACE;
2280	return NULL;
2281	}
2282	++dfa->nbackref;
2283	dfa->has_mb_node = 1;
2284	break;
2285	case OP_OPEN_DUP_NUM:
2286	if (syntax & RE_CONTEXT_INVALID_DUP)
2287	{
2288	*err = REG_BADRPT;
2289	return NULL;
2290	}
2291	/* FALLTHROUGH */
2292	case OP_DUP_ASTERISK:
2293	case OP_DUP_PLUS:
2294	case OP_DUP_QUESTION:
2295	if (syntax & RE_CONTEXT_INVALID_OPS)
2296	{
2297	*err = REG_BADRPT;
2298	return NULL;
2299	}
2300	else if (syntax & RE_CONTEXT_INDEP_OPS)
2301	{
2302	fetch_token (token, regexp, syntax);
2303	return parse_expression (regexp, preg, token, syntax, nest, err);
2304	}
2305	/* else fall through */
2306	case OP_CLOSE_SUBEXP:
2307	if ((token->type == OP_CLOSE_SUBEXP) &&
2308	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309	{
2310	*err = REG_ERPAREN;
2311	return NULL;
2312	}
2313	/* else fall through */
2314	case OP_CLOSE_DUP_NUM:
2315	/* We treat it as a normal character. */
2316
2317	/* Then we can these characters as normal characters. */
2318	token->type = CHARACTER;
2319	/* mb_partial and word_char bits should be initialized already
2320	by peek_token. */
2321	tree = create_token_tree (dfa, NULL, NULL, token);
2322	if (BE (tree == NULL, 0))
2323	{
2324	*err = REG_ESPACE;
2325	return NULL;
2326	}
2327	break;
2328	case ANCHOR:
2329	if ((token->opr.ctx_type
2330	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2331	&& dfa->word_ops_used == 0)
2332	init_word_char (dfa);
2333	if (token->opr.ctx_type == WORD_DELIM
2334	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2335	{
2336	bin_tree_t tree_first, tree_last;
2337	if (token->opr.ctx_type == WORD_DELIM)
2338	{
2339	token->opr.ctx_type = WORD_FIRST;
2340	tree_first = create_token_tree (dfa, NULL, NULL, token);
2341	token->opr.ctx_type = WORD_LAST;
2342	}
2343	else
2344	{
2345	token->opr.ctx_type = INSIDE_WORD;
2346	tree_first = create_token_tree (dfa, NULL, NULL, token);
2347	token->opr.ctx_type = INSIDE_NOTWORD;
2348	}
2349	tree_last = create_token_tree (dfa, NULL, NULL, token);
2350	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
2352	{
2353	*err = REG_ESPACE;
2354	return NULL;
2355	}
2356	}
2357	else
2358	{
2359	tree = create_token_tree (dfa, NULL, NULL, token);
2360	if (BE (tree == NULL, 0))
2361	{
2362	*err = REG_ESPACE;
2363	return NULL;
2364	}
2365	}
2366	/* We must return here, since ANCHORs can't be followed
2367	by repetition operators.
2368	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2369	it must not be "<ANCHOR(^)><REPEAT()>". /
2370	fetch_token (token, regexp, syntax);
2371	return tree;
2372	case OP_PERIOD:
2373	tree = create_token_tree (dfa, NULL, NULL, token);
2374	if (BE (tree == NULL, 0))
2375	{
2376	*err = REG_ESPACE;
2377	return NULL;
2378	}
2379	if (dfa->mb_cur_max > 1)
2380	dfa->has_mb_node = 1;
2381	break;
2382	case OP_WORD:
2383	case OP_NOTWORD:
2384	tree = build_charclass_op (dfa, regexp->trans,
2385	(const unsigned char *) "alnum",
2386	(const unsigned char *) "_",
2387	token->type == OP_NOTWORD, err);
2388	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389	return NULL;
2390	break;
2391	case OP_SPACE:
2392	case OP_NOTSPACE:
2393	tree = build_charclass_op (dfa, regexp->trans,
2394	(const unsigned char *) "space",
2395	(const unsigned char *) "",
2396	token->type == OP_NOTSPACE, err);
2397	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398	return NULL;
2399	break;
2400	case OP_ALT:
2401	case END_OF_RE:
2402	return NULL;
2403	case BACK_SLASH:
2404	*err = REG_EESCAPE;
2405	return NULL;
2406	default:
2407	/* Must not happen? */
2408	#ifdef DEBUG
2409	assert (0);
2410	#endif
2411	return NULL;
2412	}
2413	fetch_token (token, regexp, syntax);
2414
2415	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2416	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2417	{
2418	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2419	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420	return NULL;
2421	/* In BRE consecutive duplications are not allowed. */
2422	if ((syntax & RE_CONTEXT_INVALID_DUP)
2423	&& (token->type == OP_DUP_ASTERISK
2424	\|\| token->type == OP_OPEN_DUP_NUM))
2425	{
2426	*err = REG_BADRPT;
2427	return NULL;
2428	}
2429	}
2430
2431	return tree;
2432	}
2433
2434	/* This function build the following tree, from regular expression
2435	(<reg_exp>):
2436	SUBEXP
2437	\|
2438	<reg_exp>
2439	*/
2440
2441	static bin_tree_t *
2442	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
2443	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2444	{
2445	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2446	bin_tree_t *tree;
2447	size_t cur_nsub;
2448	cur_nsub = preg->re_nsub++;
2449
2450	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2451
2452	/* The subexpression may be a null string. */
2453	if (token->type == OP_CLOSE_SUBEXP)
2454	tree = NULL;
2455	else
2456	{
2457	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2459	*err = REG_EPAREN;
2460	if (BE (*err != REG_NOERROR, 0))
2461	return NULL;
2462	}
2463
2464	if (cur_nsub <= '9' - '1')
2465	dfa->completed_bkref_map \|= 1 << cur_nsub;
2466
2467	tree = create_tree (dfa, tree, NULL, SUBEXP);
2468	if (BE (tree == NULL, 0))
2469	{
2470	*err = REG_ESPACE;
2471	return NULL;
2472	}
2473	tree->token.opr.idx = cur_nsub;
2474	return tree;
2475	}
2476
2477	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
2478
2479	static bin_tree_t *
2480	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
2481	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
2482	{
2483	bin_tree_t tree = NULL, old_tree = NULL;
2484	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2485	re_token_t start_token = *token;
2486
2487	if (token->type == OP_OPEN_DUP_NUM)
2488	{
2489	end = 0;
2490	start = fetch_number (regexp, token, syntax);
2491	if (start == REG_MISSING)
2492	{
2493	if (token->type == CHARACTER && token->opr.c == ',')
2494	start = 0; /* We treat "{,m}" as "{0,m}". */
2495	else
2496	{
2497	err = REG_BADBR; / <re>{} is invalid. */
2498	return NULL;
2499	}
2500	}
2501	if (BE (start != REG_ERROR, 1))
2502	{
2503	/* We treat "{n}" as "{n,n}". */
2504	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2505	: ((token->type == CHARACTER && token->opr.c == ',')
2506	? fetch_number (regexp, token, syntax) : REG_ERROR));
2507	}
2508	if (BE (start == REG_ERROR \|\| end == REG_ERROR, 0))
2509	{
2510	/* Invalid sequence. */
2511	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2512	{
2513	if (token->type == END_OF_RE)
2514	*err = REG_EBRACE;
2515	else
2516	*err = REG_BADBR;
2517
2518	return NULL;
2519	}
2520
2521	/* If the syntax bit is set, rollback. */
2522	re_string_set_index (regexp, start_idx);
2523	*token = start_token;
2524	token->type = CHARACTER;
2525	/* mb_partial and word_char bits should be already initialized by
2526	peek_token. */
2527	return elem;
2528	}
2529
2530	if (BE (end != REG_MISSING && start > end, 0))
2531	{
2532	/* First number greater than second. */
2533	*err = REG_BADBR;
2534	return NULL;
2535	}
2536	}
2537	else
2538	{
2539	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2540	end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2541	}
2542
2543	fetch_token (token, regexp, syntax);
2544
2545	if (BE (elem == NULL, 0))
2546	return NULL;
2547	if (BE (start == 0 && end == 0, 0))
2548	{
2549	postorder (elem, free_tree, NULL);
2550	return NULL;
2551	}
2552
2553	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2554	if (BE (start > 0, 0))
2555	{
2556	tree = elem;
2557	for (i = 2; i <= start; ++i)
2558	{
2559	elem = duplicate_tree (elem, dfa);
2560	tree = create_tree (dfa, tree, elem, CONCAT);
2561	if (BE (elem == NULL \|\| tree == NULL, 0))
2562	goto parse_dup_op_espace;
2563	}
2564
2565	if (start == end)
2566	return tree;
2567
2568	/* Duplicate ELEM before it is marked optional. */
2569	elem = duplicate_tree (elem, dfa);
2570	old_tree = tree;
2571	}
2572	else
2573	old_tree = NULL;
2574
2575	if (elem->token.type == SUBEXP)
2576	postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2577
2578	tree = create_tree (dfa, elem, NULL,
2579	(end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2580	if (BE (tree == NULL, 0))
2581	goto parse_dup_op_espace;
2582
2583	/* This loop is actually executed only when end != REG_MISSING,
2584	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2585	already created the start+1-th copy. */
2586	if ((Idx) -1 < 0 \|\| end != REG_MISSING)
2587	for (i = start + 2; i <= end; ++i)
2588	{
2589	elem = duplicate_tree (elem, dfa);
2590	tree = create_tree (dfa, tree, elem, CONCAT);
2591	if (BE (elem == NULL \|\| tree == NULL, 0))
2592	goto parse_dup_op_espace;
2593
2594	tree = create_tree (dfa, tree, NULL, OP_ALT);
2595	if (BE (tree == NULL, 0))
2596	goto parse_dup_op_espace;
2597	}
2598
2599	if (old_tree)
2600	tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602	return tree;
2603
2604	parse_dup_op_espace:
2605	*err = REG_ESPACE;
2606	return NULL;
2607	}
2608
2609	/* Size of the names for collating symbol/equivalence_class/character_class.
2610	I'm not sure, but maybe enough. */
2611	#define BRACKET_NAME_BUF_SIZE 32
2612
2613	#ifndef _LIBC
2614	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615	Build the range expression which starts from START_ELEM, and ends
2616	at END_ELEM. The result are written to MBCSET and SBCSET.
2617	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618	mbcset->range_ends, is a pointer argument sinse we may
2619	update it. */
2620
2621	static reg_errcode_t
2622	internal_function
2623	# ifdef RE_ENABLE_I18N
2624	build_range_exp (bitset_t sbcset, re_charset_t mbcset, Idx range_alloc,
2625	bracket_elem_t start_elem, bracket_elem_t end_elem)
2626	# else /* not RE_ENABLE_I18N */
2627	build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628	bracket_elem_t *end_elem)
2629	# endif /* not RE_ENABLE_I18N */
2630	{
2631	unsigned int start_ch, end_ch;
2632	/* Equivalence Classes and Character Classes can't be a range start/end. */
2633	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2634	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2635	0))
2636	return REG_ERANGE;
2637
2638	/* We can handle no multi character collating elements without libc
2639	support. */
2640	if (BE ((start_elem->type == COLL_SYM
2641	&& strlen ((char *) start_elem->opr.name) > 1)
2642	\|\| (end_elem->type == COLL_SYM
2643	&& strlen ((char *) end_elem->opr.name) > 1), 0))
2644	return REG_ECOLLATE;
2645
2646	# ifdef RE_ENABLE_I18N
2647	{
2648	wchar_t wc;
2649	wint_t start_wc;
2650	wint_t end_wc;
2651	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652
2653	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655	: 0));
2656	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658	: 0));
2659	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2660	? __btowc (start_ch) : start_elem->opr.wch);
2661	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2662	? __btowc (end_ch) : end_elem->opr.wch);
2663	if (start_wc == WEOF \|\| end_wc == WEOF)
2664	return REG_ECOLLATE;
2665	cmp_buf[0] = start_wc;
2666	cmp_buf[4] = end_wc;
2667	if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668	return REG_ERANGE;
2669
2670	/* Got valid collation sequence values, add them as a new entry.
2671	However, for !_LIBC we have no collation elements: if the
2672	character set is single byte, the single byte character set
2673	that we build below suffices. parse_bracket_exp passes
2674	no MBCSET if dfa->mb_cur_max == 1. */
2675	if (mbcset)
2676	{
2677	/* Check the space of the arrays. */
2678	if (BE (*range_alloc == mbcset->nranges, 0))
2679	{
2680	/* There is not enough space, need realloc. */
2681	wchar_t new_array_start, new_array_end;
2682	Idx new_nranges;
2683
2684	/* +1 in case of mbcset->nranges is 0. */
2685	new_nranges = 2 * mbcset->nranges + 1;
2686	/* Use realloc since mbcset->range_starts and mbcset->range_ends
2687	are NULL if range_alloc == 0. /
2688	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689	new_nranges);
2690	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691	new_nranges);
2692
2693	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2694	return REG_ESPACE;
2695
2696	mbcset->range_starts = new_array_start;
2697	mbcset->range_ends = new_array_end;
2698	*range_alloc = new_nranges;
2699	}
2700
2701	mbcset->range_starts[mbcset->nranges] = start_wc;
2702	mbcset->range_ends[mbcset->nranges++] = end_wc;
2703	}
2704
2705	/* Build the table for single byte characters. */
2706	for (wc = 0; wc < SBC_MAX; ++wc)
2707	{
2708	cmp_buf[2] = wc;
2709	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711	bitset_set (sbcset, wc);
2712	}
2713	}
2714	# else /* not RE_ENABLE_I18N */
2715	{
2716	unsigned int ch;
2717	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719	: 0));
2720	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722	: 0));
2723	if (start_ch > end_ch)
2724	return REG_ERANGE;
2725	/* Build the table for single byte characters. */
2726	for (ch = 0; ch < SBC_MAX; ++ch)
2727	if (start_ch <= ch && ch <= end_ch)
2728	bitset_set (sbcset, ch);
2729	}
2730	# endif /* not RE_ENABLE_I18N */
2731	return REG_NOERROR;
2732	}
2733	#endif /* not _LIBC */
2734
2735	#ifndef _LIBC
2736	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737	Build the collating element which is represented by NAME.
2738	The result are written to MBCSET and SBCSET.
2739	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740	pointer argument since we may update it. */
2741
2742	static reg_errcode_t
2743	internal_function
2744	build_collating_symbol (bitset_t sbcset,
2745	# ifdef RE_ENABLE_I18N
2746	re_charset_t mbcset, Idx coll_sym_alloc,
2747	# endif
2748	const unsigned char *name)
2749	{
2750	size_t name_len = strlen ((const char *) name);
2751	if (BE (name_len != 1, 0))
2752	return REG_ECOLLATE;
2753	else
2754	{
2755	bitset_set (sbcset, name[0]);
2756	return REG_NOERROR;
2757	}
2758	}
2759	#endif /* not _LIBC */
2760
2761	/* This function parse bracket expression like "[abc]", "[a-c]",
2762	"[[.a-a.]]" etc. */
2763
2764	static bin_tree_t *
2765	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
2766	reg_syntax_t syntax, reg_errcode_t *err)
2767	{
2768	#ifdef _LIBC
2769	const unsigned char *collseqmb;
2770	const char *collseqwc;
2771	uint32_t nrules;
2772	int32_t table_size;
2773	const int32_t *symb_table;
2774	const unsigned char *extra;
2775
2776	/* Local function for parse_bracket_exp used in _LIBC environement.
2777	Seek the collating symbol entry correspondings to NAME.
2778	Return the index of the symbol in the SYMB_TABLE. */
2779
2780	auto inline int32_t
2781	__attribute ((always_inline))
2782	seek_collating_symbol_entry (name, name_len)
2783	const unsigned char *name;
2784	size_t name_len;
2785	{
2786	int32_t hash = elem_hash ((const char *) name, name_len);
2787	int32_t elem = hash % table_size;
2788	if (symb_table[2 * elem] != 0)
2789	{
2790	int32_t second = hash % (table_size - 2) + 1;
2791
2792	do
2793	{
2794	/* First compare the hashing value. */
2795	if (symb_table[2 * elem] == hash
2796	/* Compare the length of the name. */
2797	&& name_len == extra[symb_table[2 * elem + 1]]
2798	/* Compare the name. */
2799	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2800	name_len) == 0)
2801	{
2802	/* Yep, this is the entry. */
2803	break;
2804	}
2805
2806	/* Next entry. */
2807	elem += second;
2808	}
2809	while (symb_table[2 * elem] != 0);
2810	}
2811	return elem;
2812	}
2813
2814	/* Local function for parse_bracket_exp used in _LIBC environement.
2815	Look up the collation sequence value of BR_ELEM.
2816	Return the value if succeeded, UINT_MAX otherwise. */
2817
2818	auto inline unsigned int
2819	__attribute ((always_inline))
2820	lookup_collation_sequence_value (br_elem)
2821	bracket_elem_t *br_elem;
2822	{
2823	if (br_elem->type == SB_CHAR)
2824	{
2825	/*
2826	if (MB_CUR_MAX == 1)
2827	*/
2828	if (nrules == 0)
2829	return collseqmb[br_elem->opr.ch];
2830	else
2831	{
2832	wint_t wc = __btowc (br_elem->opr.ch);
2833	return __collseq_table_lookup (collseqwc, wc);
2834	}
2835	}
2836	else if (br_elem->type == MB_CHAR)
2837	{
2838	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2839	}
2840	else if (br_elem->type == COLL_SYM)
2841	{
2842	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2843	if (nrules != 0)
2844	{
2845	int32_t elem, idx;
2846	elem = seek_collating_symbol_entry (br_elem->opr.name,
2847	sym_name_len);
2848	if (symb_table[2 * elem] != 0)
2849	{
2850	/* We found the entry. */
2851	idx = symb_table[2 * elem + 1];
2852	/* Skip the name of collating element name. */
2853	idx += 1 + extra[idx];
2854	/* Skip the byte sequence of the collating element. */
2855	idx += 1 + extra[idx];
2856	/* Adjust for the alignment. */
2857	idx = (idx + 3) & ~3;
2858	/* Skip the multibyte collation sequence value. */
2859	idx += sizeof (unsigned int);
2860	/* Skip the wide char sequence of the collating element. */
2861	idx += sizeof (unsigned int) *
2862	(1 + (unsigned int ) (extra + idx));
2863	/* Return the collation sequence value. */
2864	return (unsigned int ) (extra + idx);
2865	}
2866	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2867	{
2868	/* No valid character. Match it as a single byte
2869	character. */
2870	return collseqmb[br_elem->opr.name[0]];
2871	}
2872	}
2873	else if (sym_name_len == 1)
2874	return collseqmb[br_elem->opr.name[0]];
2875	}
2876	return UINT_MAX;
2877	}
2878
2879	/* Local function for parse_bracket_exp used in _LIBC environement.
2880	Build the range expression which starts from START_ELEM, and ends
2881	at END_ELEM. The result are written to MBCSET and SBCSET.
2882	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2883	mbcset->range_ends, is a pointer argument sinse we may
2884	update it. */
2885
2886	auto inline reg_errcode_t
2887	__attribute ((always_inline))
2888	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2889	re_charset_t *mbcset;
2890	Idx *range_alloc;
2891	bitset_t sbcset;
2892	bracket_elem_t start_elem, end_elem;
2893	{
2894	unsigned int ch;
2895	uint32_t start_collseq;
2896	uint32_t end_collseq;
2897
2898	/* Equivalence Classes and Character Classes can't be a range
2899	start/end. */
2900	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2901	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2902	0))
2903	return REG_ERANGE;
2904
2905	start_collseq = lookup_collation_sequence_value (start_elem);
2906	end_collseq = lookup_collation_sequence_value (end_elem);
2907	/* Check start/end collation sequence values. */
2908	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
2909	return REG_ECOLLATE;
2910	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2911	return REG_ERANGE;
2912
2913	/* Got valid collation sequence values, add them as a new entry.
2914	However, if we have no collation elements, and the character set
2915	is single byte, the single byte character set that we
2916	build below suffices. */
2917	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
2918	{
2919	/* Check the space of the arrays. */
2920	if (BE (*range_alloc == mbcset->nranges, 0))
2921	{
2922	/* There is not enough space, need realloc. */
2923	uint32_t *new_array_start;
2924	uint32_t *new_array_end;
2925	Idx new_nranges;
2926
2927	/* +1 in case of mbcset->nranges is 0. */
2928	new_nranges = 2 * mbcset->nranges + 1;
2929	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2930	new_nranges);
2931	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2932	new_nranges);
2933
2934	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2935	return REG_ESPACE;
2936
2937	mbcset->range_starts = new_array_start;
2938	mbcset->range_ends = new_array_end;
2939	*range_alloc = new_nranges;
2940	}
2941
2942	mbcset->range_starts[mbcset->nranges] = start_collseq;
2943	mbcset->range_ends[mbcset->nranges++] = end_collseq;
2944	}
2945
2946	/* Build the table for single byte characters. */
2947	for (ch = 0; ch < SBC_MAX; ch++)
2948	{
2949	uint32_t ch_collseq;
2950	/*
2951	if (MB_CUR_MAX == 1)
2952	*/
2953	if (nrules == 0)
2954	ch_collseq = collseqmb[ch];
2955	else
2956	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2957	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2958	bitset_set (sbcset, ch);
2959	}
2960	return REG_NOERROR;
2961	}
2962
2963	/* Local function for parse_bracket_exp used in _LIBC environement.
2964	Build the collating element which is represented by NAME.
2965	The result are written to MBCSET and SBCSET.
2966	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2967	pointer argument sinse we may update it. */
2968
2969	auto inline reg_errcode_t
2970	__attribute ((always_inline))
2971	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2972	re_charset_t *mbcset;
2973	Idx *coll_sym_alloc;
2974	bitset_t sbcset;
2975	const unsigned char *name;
2976	{
2977	int32_t elem, idx;
2978	size_t name_len = strlen ((const char *) name);
2979	if (nrules != 0)
2980	{
2981	elem = seek_collating_symbol_entry (name, name_len);
2982	if (symb_table[2 * elem] != 0)
2983	{
2984	/* We found the entry. */
2985	idx = symb_table[2 * elem + 1];
2986	/* Skip the name of collating element name. */
2987	idx += 1 + extra[idx];
2988	}
2989	else if (symb_table[2 * elem] == 0 && name_len == 1)
2990	{
2991	/* No valid character, treat it as a normal
2992	character. */
2993	bitset_set (sbcset, name[0]);
2994	return REG_NOERROR;
2995	}
2996	else
2997	return REG_ECOLLATE;
2998
2999	/* Got valid collation sequence, add it as a new entry. */
3000	/* Check the space of the arrays. */
3001	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3002	{
3003	/* Not enough, realloc it. */
3004	/* +1 in case of mbcset->ncoll_syms is 0. */
3005	Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3006	/* Use realloc since mbcset->coll_syms is NULL
3007	if alloc == 0. /
3008	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3009	new_coll_sym_alloc);
3010	if (BE (new_coll_syms == NULL, 0))
3011	return REG_ESPACE;
3012	mbcset->coll_syms = new_coll_syms;
3013	*coll_sym_alloc = new_coll_sym_alloc;
3014	}
3015	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3016	return REG_NOERROR;
3017	}
3018	else
3019	{
3020	if (BE (name_len != 1, 0))
3021	return REG_ECOLLATE;
3022	else
3023	{
3024	bitset_set (sbcset, name[0]);
3025	return REG_NOERROR;
3026	}
3027	}
3028	}
3029	#endif
3030
3031	re_token_t br_token;
3032	re_bitset_ptr_t sbcset;
3033	#ifdef RE_ENABLE_I18N
3034	re_charset_t *mbcset;
3035	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3036	Idx equiv_class_alloc = 0, char_class_alloc = 0;
3037	#endif /* not RE_ENABLE_I18N */
3038	bool non_match = false;
3039	bin_tree_t *work_tree;
3040	int token_len;
3041	bool first_round = true;
3042	#ifdef _LIBC
3043	collseqmb = (const unsigned char *)
3044	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046	if (nrules)
3047	{
3048	/*
3049	if (MB_CUR_MAX > 1)
3050	*/
3051	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3054	_NL_COLLATE_SYMB_TABLEMB);
3055	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3056	_NL_COLLATE_SYMB_EXTRAMB);
3057	}
3058	#endif
3059	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3060	#ifdef RE_ENABLE_I18N
3061	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3062	#endif /* RE_ENABLE_I18N */
3063	#ifdef RE_ENABLE_I18N
3064	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3065	#else
3066	if (BE (sbcset == NULL, 0))
3067	#endif /* RE_ENABLE_I18N */
3068	{
3069	*err = REG_ESPACE;
3070	return NULL;
3071	}
3072
3073	token_len = peek_token_bracket (token, regexp, syntax);
3074	if (BE (token->type == END_OF_RE, 0))
3075	{
3076	*err = REG_BADPAT;
3077	goto parse_bracket_exp_free_return;
3078	}
3079	if (token->type == OP_NON_MATCH_LIST)
3080	{
3081	#ifdef RE_ENABLE_I18N
3082	mbcset->non_match = 1;
3083	#endif /* not RE_ENABLE_I18N */
3084	non_match = true;
3085	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3086	bitset_set (sbcset, '\0');
3087	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3088	token_len = peek_token_bracket (token, regexp, syntax);
3089	if (BE (token->type == END_OF_RE, 0))
3090	{
3091	*err = REG_BADPAT;
3092	goto parse_bracket_exp_free_return;
3093	}
3094	}
3095
3096	/* We treat the first ']' as a normal character. */
3097	if (token->type == OP_CLOSE_BRACKET)
3098	token->type = CHARACTER;
3099
3100	while (1)
3101	{
3102	bracket_elem_t start_elem, end_elem;
3103	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3104	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3105	reg_errcode_t ret;
3106	int token_len2 = 0;
3107	bool is_range_exp = false;
3108	re_token_t token2;
3109
3110	start_elem.opr.name = start_name_buf;
3111	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3112	syntax, first_round);
3113	if (BE (ret != REG_NOERROR, 0))
3114	{
3115	*err = ret;
3116	goto parse_bracket_exp_free_return;
3117	}
3118	first_round = false;
3119
3120	/* Get information about the next token. We need it in any case. */
3121	token_len = peek_token_bracket (token, regexp, syntax);
3122
3123	/* Do not check for ranges if we know they are not allowed. */
3124	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3125	{
3126	if (BE (token->type == END_OF_RE, 0))
3127	{
3128	*err = REG_EBRACK;
3129	goto parse_bracket_exp_free_return;
3130	}
3131	if (token->type == OP_CHARSET_RANGE)
3132	{
3133	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3134	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3135	if (BE (token2.type == END_OF_RE, 0))
3136	{
3137	*err = REG_EBRACK;
3138	goto parse_bracket_exp_free_return;
3139	}
3140	if (token2.type == OP_CLOSE_BRACKET)
3141	{
3142	/* We treat the last '-' as a normal character. */
3143	re_string_skip_bytes (regexp, -token_len);
3144	token->type = CHARACTER;
3145	}
3146	else
3147	is_range_exp = true;
3148	}
3149	}
3150
3151	if (is_range_exp == true)
3152	{
3153	end_elem.opr.name = end_name_buf;
3154	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3155	dfa, syntax, true);
3156	if (BE (ret != REG_NOERROR, 0))
3157	{
3158	*err = ret;
3159	goto parse_bracket_exp_free_return;
3160	}
3161
3162	token_len = peek_token_bracket (token, regexp, syntax);
3163
3164	#ifdef _LIBC
3165	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3166	&start_elem, &end_elem);
3167	#else
3168	# ifdef RE_ENABLE_I18N
3169	*err = build_range_exp (sbcset,
3170	dfa->mb_cur_max > 1 ? mbcset : NULL,
3171	&range_alloc, &start_elem, &end_elem);
3172	# else
3173	*err = build_range_exp (sbcset, &start_elem, &end_elem);
3174	# endif
3175	#endif /* RE_ENABLE_I18N */
3176	if (BE (*err != REG_NOERROR, 0))
3177	goto parse_bracket_exp_free_return;
3178	}
3179	else
3180	{
3181	switch (start_elem.type)
3182	{
3183	case SB_CHAR:
3184	bitset_set (sbcset, start_elem.opr.ch);
3185	break;
3186	#ifdef RE_ENABLE_I18N
3187	case MB_CHAR:
3188	/* Check whether the array has enough space. */
3189	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3190	{
3191	wchar_t *new_mbchars;
3192	/* Not enough, realloc it. */
3193	/* +1 in case of mbcset->nmbchars is 0. */
3194	mbchar_alloc = 2 * mbcset->nmbchars + 1;
3195	/* Use realloc since array is NULL if alloc == 0. /
3196	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3197	mbchar_alloc);
3198	if (BE (new_mbchars == NULL, 0))
3199	goto parse_bracket_exp_espace;
3200	mbcset->mbchars = new_mbchars;
3201	}
3202	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3203	break;
3204	#endif /* RE_ENABLE_I18N */
3205	case EQUIV_CLASS:
3206	*err = build_equiv_class (sbcset,
3207	#ifdef RE_ENABLE_I18N
3208	mbcset, &equiv_class_alloc,
3209	#endif /* RE_ENABLE_I18N */
3210	start_elem.opr.name);
3211	if (BE (*err != REG_NOERROR, 0))
3212	goto parse_bracket_exp_free_return;
3213	break;
3214	case COLL_SYM:
3215	*err = build_collating_symbol (sbcset,
3216	#ifdef RE_ENABLE_I18N
3217	mbcset, &coll_sym_alloc,
3218	#endif /* RE_ENABLE_I18N */
3219	start_elem.opr.name);
3220	if (BE (*err != REG_NOERROR, 0))
3221	goto parse_bracket_exp_free_return;
3222	break;
3223	case CHAR_CLASS:
3224	*err = build_charclass (regexp->trans, sbcset,
3225	#ifdef RE_ENABLE_I18N
3226	mbcset, &char_class_alloc,
3227	#endif /* RE_ENABLE_I18N */
3228	start_elem.opr.name, syntax);
3229	if (BE (*err != REG_NOERROR, 0))
3230	goto parse_bracket_exp_free_return;
3231	break;
3232	default:
3233	assert (0);
3234	break;
3235	}
3236	}
3237	if (BE (token->type == END_OF_RE, 0))
3238	{
3239	*err = REG_EBRACK;
3240	goto parse_bracket_exp_free_return;
3241	}
3242	if (token->type == OP_CLOSE_BRACKET)
3243	break;
3244	}
3245
3246	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3247
3248	/* If it is non-matching list. */
3249	if (non_match)
3250	bitset_not (sbcset);
3251
3252	#ifdef RE_ENABLE_I18N
3253	/* Ensure only single byte characters are set. */
3254	if (dfa->mb_cur_max > 1)
3255	bitset_mask (sbcset, dfa->sb_char);
3256
3257	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3258	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3259	\|\| mbcset->non_match)))
3260	{
3261	bin_tree_t *mbc_tree;
3262	int sbc_idx;
3263	/* Build a tree for complex bracket. */
3264	dfa->has_mb_node = 1;
3265	br_token.type = COMPLEX_BRACKET;
3266	br_token.opr.mbcset = mbcset;
3267	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268	if (BE (mbc_tree == NULL, 0))
3269	goto parse_bracket_exp_espace;
3270	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3271	if (sbcset[sbc_idx])
3272	break;
3273	/* If there are no bits set in sbcset, there is no point
3274	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3275	if (sbc_idx < BITSET_WORDS)
3276	{
3277	/* Build a tree for simple bracket. */
3278	br_token.type = SIMPLE_BRACKET;
3279	br_token.opr.sbcset = sbcset;
3280	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3281	if (BE (work_tree == NULL, 0))
3282	goto parse_bracket_exp_espace;
3283
3284	/* Then join them by ALT node. */
3285	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3286	if (BE (work_tree == NULL, 0))
3287	goto parse_bracket_exp_espace;
3288	}
3289	else
3290	{
3291	re_free (sbcset);
3292	work_tree = mbc_tree;
3293	}
3294	}
3295	else
3296	#endif /* not RE_ENABLE_I18N */
3297	{
3298	#ifdef RE_ENABLE_I18N
3299	free_charset (mbcset);
3300	#endif
3301	/* Build a tree for simple bracket. */
3302	br_token.type = SIMPLE_BRACKET;
3303	br_token.opr.sbcset = sbcset;
3304	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3305	if (BE (work_tree == NULL, 0))
3306	goto parse_bracket_exp_espace;
3307	}
3308	return work_tree;
3309
3310	parse_bracket_exp_espace:
3311	*err = REG_ESPACE;
3312	parse_bracket_exp_free_return:
3313	re_free (sbcset);
3314	#ifdef RE_ENABLE_I18N
3315	free_charset (mbcset);
3316	#endif /* RE_ENABLE_I18N */
3317	return NULL;
3318	}
3319
3320	/* Parse an element in the bracket expression. */
3321
3322	static reg_errcode_t
3323	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
3324	re_token_t token, int token_len, re_dfa_t dfa,
3325	reg_syntax_t syntax, bool accept_hyphen)
3326	{
3327	#ifdef RE_ENABLE_I18N
3328	int cur_char_size;
3329	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3330	if (cur_char_size > 1)
3331	{
3332	elem->type = MB_CHAR;
3333	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3334	re_string_skip_bytes (regexp, cur_char_size);
3335	return REG_NOERROR;
3336	}
3337	#endif /* RE_ENABLE_I18N */
3338	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3339	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3340	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3341	return parse_bracket_symbol (elem, regexp, token);
3342	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3343	{
3344	/* A '-' must only appear as anything but a range indicator before
3345	the closing bracket. Everything else is an error. */
3346	re_token_t token2;
3347	(void) peek_token_bracket (&token2, regexp, syntax);
3348	if (token2.type != OP_CLOSE_BRACKET)
3349	/* The actual error value is not standardized since this whole
3350	case is undefined. But ERANGE makes good sense. */
3351	return REG_ERANGE;
3352	}
3353	elem->type = SB_CHAR;
3354	elem->opr.ch = token->opr.c;
3355	return REG_NOERROR;
3356	}
3357
3358	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3359	such as [:<character_class>:], [.<collating_element>.], and
3360	[=<equivalent_class>=]. */
3361
3362	static reg_errcode_t
3363	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
3364	re_token_t *token)
3365	{
3366	unsigned char ch, delim = token->opr.c;
3367	int i = 0;
3368	if (re_string_eoi(regexp))
3369	return REG_EBRACK;
3370	for (;; ++i)
3371	{
3372	if (i >= BRACKET_NAME_BUF_SIZE)
3373	return REG_EBRACK;
3374	if (token->type == OP_OPEN_CHAR_CLASS)
3375	ch = re_string_fetch_byte_case (regexp);
3376	else
3377	ch = re_string_fetch_byte (regexp);
3378	if (re_string_eoi(regexp))
3379	return REG_EBRACK;
3380	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3381	break;
3382	elem->opr.name[i] = ch;
3383	}
3384	re_string_skip_bytes (regexp, 1);
3385	elem->opr.name[i] = '\0';
3386	switch (token->type)
3387	{
3388	case OP_OPEN_COLL_ELEM:
3389	elem->type = COLL_SYM;
3390	break;
3391	case OP_OPEN_EQUIV_CLASS:
3392	elem->type = EQUIV_CLASS;
3393	break;
3394	case OP_OPEN_CHAR_CLASS:
3395	elem->type = CHAR_CLASS;
3396	break;
3397	default:
3398	break;
3399	}
3400	return REG_NOERROR;
3401	}
3402
3403	/* Helper function for parse_bracket_exp.
3404	Build the equivalence class which is represented by NAME.
3405	The result are written to MBCSET and SBCSET.
3406	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3407	is a pointer argument sinse we may update it. */
3408
3409	static reg_errcode_t
3410	#ifdef RE_ENABLE_I18N
3411	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3412	Idx equiv_class_alloc, const unsigned char name)
3413	#else /* not RE_ENABLE_I18N */
3414	build_equiv_class (bitset_t sbcset, const unsigned char *name)
3415	#endif /* not RE_ENABLE_I18N */
3416	{
3417	#ifdef _LIBC
3418	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3419	if (nrules != 0)
3420	{
3421	const int32_t table, indirect;
3422	const unsigned char weights, extra, *cp;
3423	unsigned char char_buf[2];
3424	int32_t idx1, idx2;
3425	unsigned int ch;
3426	size_t len;
3427	/* This #include defines a local function! */
3428	# include <locale/weight.h>
3429	/* Calculate the index for equivalence class. */
3430	cp = name;
3431	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3432	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3433	_NL_COLLATE_WEIGHTMB);
3434	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435	_NL_COLLATE_EXTRAMB);
3436	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3437	_NL_COLLATE_INDIRECTMB);
3438	idx1 = findidx (&cp);
3439	if (BE (idx1 == 0 \|\| cp < name + strlen ((const char *) name), 0))
3440	/* This isn't a valid character. */
3441	return REG_ECOLLATE;
3442
3443	/* Build single byte matcing table for this equivalence class. */
3444	char_buf[1] = (unsigned char) '\0';
3445	len = weights[idx1];
3446	for (ch = 0; ch < SBC_MAX; ++ch)
3447	{
3448	char_buf[0] = ch;
3449	cp = char_buf;
3450	idx2 = findidx (&cp);
3451	/*
3452	idx2 = table[ch];
3453	*/
3454	if (idx2 == 0)
3455	/* This isn't a valid character. */
3456	continue;
3457	if (len == weights[idx2])
3458	{
3459	int cnt = 0;
3460	while (cnt <= len &&
3461	weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3462	++cnt;
3463
3464	if (cnt > len)
3465	bitset_set (sbcset, ch);
3466	}
3467	}
3468	/* Check whether the array has enough space. */
3469	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3470	{
3471	/* Not enough, realloc it. */
3472	/* +1 in case of mbcset->nequiv_classes is 0. */
3473	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3474	/* Use realloc since the array is NULL if alloc == 0. /
3475	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3476	int32_t,
3477	new_equiv_class_alloc);
3478	if (BE (new_equiv_classes == NULL, 0))
3479	return REG_ESPACE;
3480	mbcset->equiv_classes = new_equiv_classes;
3481	*equiv_class_alloc = new_equiv_class_alloc;
3482	}
3483	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3484	}
3485	else
3486	#endif /* _LIBC */
3487	{
3488	if (BE (strlen ((const char *) name) != 1, 0))
3489	return REG_ECOLLATE;
3490	bitset_set (sbcset, *name);
3491	}
3492	return REG_NOERROR;
3493	}
3494
3495	/* Helper function for parse_bracket_exp.
3496	Build the character class which is represented by NAME.
3497	The result are written to MBCSET and SBCSET.
3498	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3499	is a pointer argument sinse we may update it. */
3500
3501	static reg_errcode_t
3502	#ifdef RE_ENABLE_I18N
3503	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3504	re_charset_t mbcset, Idx char_class_alloc,
3505	const unsigned char *class_name, reg_syntax_t syntax)
3506	#else /* not RE_ENABLE_I18N */
3507	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3508	const unsigned char *class_name, reg_syntax_t syntax)
3509	#endif /* not RE_ENABLE_I18N */
3510	{
3511	int i;
3512	const char name = (const char ) class_name;
3513
3514	/* In case of REG_ICASE "upper" and "lower" match the both of
3515	upper and lower cases. */
3516	if ((syntax & RE_ICASE)
3517	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
3518	name = "alpha";
3519
3520	#ifdef RE_ENABLE_I18N
3521	/* Check the space of the arrays. */
3522	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3523	{
3524	/* Not enough, realloc it. */
3525	/* +1 in case of mbcset->nchar_classes is 0. */
3526	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3527	/* Use realloc since array is NULL if alloc == 0. /
3528	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3529	new_char_class_alloc);
3530	if (BE (new_char_classes == NULL, 0))
3531	return REG_ESPACE;
3532	mbcset->char_classes = new_char_classes;
3533	*char_class_alloc = new_char_class_alloc;
3534	}
3535	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3536	#endif /* RE_ENABLE_I18N */
3537
3538	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3539	do { \
3540	if (BE (trans != NULL, 0)) \
3541	{ \
3542	for (i = 0; i < SBC_MAX; ++i) \
3543	if (ctype_func (i)) \
3544	bitset_set (sbcset, trans[i]); \
3545	} \
3546	else \
3547	{ \
3548	for (i = 0; i < SBC_MAX; ++i) \
3549	if (ctype_func (i)) \
3550	bitset_set (sbcset, i); \
3551	} \
3552	} while (0)
3553
3554	if (strcmp (name, "alnum") == 0)
3555	BUILD_CHARCLASS_LOOP (isalnum);
3556	else if (strcmp (name, "cntrl") == 0)
3557	BUILD_CHARCLASS_LOOP (iscntrl);
3558	else if (strcmp (name, "lower") == 0)
3559	BUILD_CHARCLASS_LOOP (islower);
3560	else if (strcmp (name, "space") == 0)
3561	BUILD_CHARCLASS_LOOP (isspace);
3562	else if (strcmp (name, "alpha") == 0)
3563	BUILD_CHARCLASS_LOOP (isalpha);
3564	else if (strcmp (name, "digit") == 0)
3565	BUILD_CHARCLASS_LOOP (isdigit);
3566	else if (strcmp (name, "print") == 0)
3567	BUILD_CHARCLASS_LOOP (isprint);
3568	else if (strcmp (name, "upper") == 0)
3569	BUILD_CHARCLASS_LOOP (isupper);
3570	else if (strcmp (name, "blank") == 0)
3571	BUILD_CHARCLASS_LOOP (isblank);
3572	else if (strcmp (name, "graph") == 0)
3573	BUILD_CHARCLASS_LOOP (isgraph);
3574	else if (strcmp (name, "punct") == 0)
3575	BUILD_CHARCLASS_LOOP (ispunct);
3576	else if (strcmp (name, "xdigit") == 0)
3577	BUILD_CHARCLASS_LOOP (isxdigit);
3578	else
3579	return REG_ECTYPE;
3580
3581	return REG_NOERROR;
3582	}
3583
3584	static bin_tree_t *
3585	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3586	const unsigned char *class_name,
3587	const unsigned char *extra, bool non_match,
3588	reg_errcode_t *err)
3589	{
3590	re_bitset_ptr_t sbcset;
3591	#ifdef RE_ENABLE_I18N
3592	re_charset_t *mbcset;
3593	Idx alloc = 0;
3594	#endif /* not RE_ENABLE_I18N */
3595	reg_errcode_t ret;
3596	re_token_t br_token;
3597	bin_tree_t *tree;
3598
3599	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3600	#ifdef RE_ENABLE_I18N
3601	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3602	#endif /* RE_ENABLE_I18N */
3603
3604	#ifdef RE_ENABLE_I18N
3605	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3606	#else /* not RE_ENABLE_I18N */
3607	if (BE (sbcset == NULL, 0))
3608	#endif /* not RE_ENABLE_I18N */
3609	{
3610	*err = REG_ESPACE;
3611	return NULL;
3612	}
3613
3614	if (non_match)
3615	{
3616	#ifdef RE_ENABLE_I18N
3617	/*
3618	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3619	bitset_set(cset->sbcset, '\0');
3620	*/
3621	mbcset->non_match = 1;
3622	#endif /* not RE_ENABLE_I18N */
3623	}
3624
3625	/* We don't care the syntax in this case. */
3626	ret = build_charclass (trans, sbcset,
3627	#ifdef RE_ENABLE_I18N
3628	mbcset, &alloc,
3629	#endif /* RE_ENABLE_I18N */
3630	class_name, 0);
3631
3632	if (BE (ret != REG_NOERROR, 0))
3633	{
3634	re_free (sbcset);
3635	#ifdef RE_ENABLE_I18N
3636	free_charset (mbcset);
3637	#endif /* RE_ENABLE_I18N */
3638	*err = ret;
3639	return NULL;
3640	}
3641	/* \w match '_' also. */
3642	for (; *extra; extra++)
3643	bitset_set (sbcset, *extra);
3644
3645	/* If it is non-matching list. */
3646	if (non_match)
3647	bitset_not (sbcset);
3648
3649	#ifdef RE_ENABLE_I18N
3650	/* Ensure only single byte characters are set. */
3651	if (dfa->mb_cur_max > 1)
3652	bitset_mask (sbcset, dfa->sb_char);
3653	#endif
3654
3655	/* Build a tree for simple bracket. */
3656	br_token.type = SIMPLE_BRACKET;
3657	br_token.opr.sbcset = sbcset;
3658	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659	if (BE (tree == NULL, 0))
3660	goto build_word_op_espace;
3661
3662	#ifdef RE_ENABLE_I18N
3663	if (dfa->mb_cur_max > 1)
3664	{
3665	bin_tree_t *mbc_tree;
3666	/* Build a tree for complex bracket. */
3667	br_token.type = COMPLEX_BRACKET;
3668	br_token.opr.mbcset = mbcset;
3669	dfa->has_mb_node = 1;
3670	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671	if (BE (mbc_tree == NULL, 0))
3672	goto build_word_op_espace;
3673	/* Then join them by ALT node. */
3674	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3675	if (BE (mbc_tree != NULL, 1))
3676	return tree;
3677	}
3678	else
3679	{
3680	free_charset (mbcset);
3681	return tree;
3682	}
3683	#else /* not RE_ENABLE_I18N */
3684	return tree;
3685	#endif /* not RE_ENABLE_I18N */
3686
3687	build_word_op_espace:
3688	re_free (sbcset);
3689	#ifdef RE_ENABLE_I18N
3690	free_charset (mbcset);
3691	#endif /* RE_ENABLE_I18N */
3692	*err = REG_ESPACE;
3693	return NULL;
3694	}
3695
3696	/* This is intended for the expressions like "a{1,3}".
3697	Fetch a number from `input', and return the number.
3698	Return REG_MISSING if the number field is empty like "{,1}".
3699	Return REG_ERROR if an error occurred. */
3700
3701	static Idx
3702	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
3703	{
3704	Idx num = REG_MISSING;
3705	unsigned char c;
3706	while (1)
3707	{
3708	fetch_token (token, input, syntax);
3709	c = token->opr.c;
3710	if (BE (token->type == END_OF_RE, 0))
3711	return REG_ERROR;
3712	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
3713	break;
3714	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c
3715	\|\| num == REG_ERROR)
3716	? REG_ERROR
3717	: ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3718	num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3719	}
3720	return num;
3721	}
3722
3723
3724	#ifdef RE_ENABLE_I18N
3725	static void
3726	free_charset (re_charset_t *cset)
3727	{
3728	re_free (cset->mbchars);
3729	# ifdef _LIBC
3730	re_free (cset->coll_syms);
3731	re_free (cset->equiv_classes);
3732	re_free (cset->range_starts);
3733	re_free (cset->range_ends);
3734	# endif
3735	re_free (cset->char_classes);
3736	re_free (cset);
3737	}
3738	#endif /* RE_ENABLE_I18N */
3739
3740
3741	/* Functions for binary tree operation. */
3742
3743	/* Create a tree node. */
3744
3745	static bin_tree_t *
3746	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3747	re_token_type_t type)
3748	{
3749	re_token_t t;
3750	t.type = type;
3751	return create_token_tree (dfa, left, right, &t);
3752	}
3753
3754	static bin_tree_t *
3755	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
3756	const re_token_t *token)
3757	{
3758	bin_tree_t *tree;
3759	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3760	{
3761	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3762
3763	if (storage == NULL)
3764	return NULL;
3765	storage->next = dfa->str_tree_storage;
3766	dfa->str_tree_storage = storage;
3767	dfa->str_tree_storage_idx = 0;
3768	}
3769	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3770
3771	tree->parent = NULL;
3772	tree->left = left;
3773	tree->right = right;
3774	tree->token = *token;
3775	tree->token.duplicated = 0;
3776	tree->token.opt_subexp = 0;
3777	tree->first = NULL;
3778	tree->next = NULL;
3779	tree->node_idx = REG_MISSING;
3780
3781	if (left != NULL)
3782	left->parent = tree;
3783	if (right != NULL)
3784	right->parent = tree;
3785	return tree;
3786	}
3787
3788	/* Mark the tree SRC as an optional subexpression.
3789	To be called from preorder or postorder. */
3790
3791	static reg_errcode_t
3792	mark_opt_subexp (void extra, bin_tree_t node)
3793	{
3794	Idx idx = (Idx) (long) extra;
3795	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3796	node->token.opt_subexp = 1;
3797
3798	return REG_NOERROR;
3799	}
3800
3801	/* Free the allocated memory inside NODE. */
3802
3803	static void
3804	free_token (re_token_t *node)
3805	{
3806	#ifdef RE_ENABLE_I18N
3807	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3808	free_charset (node->opr.mbcset);
3809	else
3810	#endif /* RE_ENABLE_I18N */
3811	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3812	re_free (node->opr.sbcset);
3813	}
3814
3815	/* Worker function for tree walking. Free the allocated memory inside NODE
3816	and its children. */
3817
3818	static reg_errcode_t
3819	free_tree (void extra, bin_tree_t node)
3820	{
3821	free_token (&node->token);
3822	return REG_NOERROR;
3823	}
3824
3825
3826	/* Duplicate the node SRC, and return new node. This is a preorder
3827	visit similar to the one implemented by the generic visitor, but
3828	we need more infrastructure to maintain two parallel trees --- so,
3829	it's easier to duplicate. */
3830
3831	static bin_tree_t *
3832	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
3833	{
3834	const bin_tree_t *node;
3835	bin_tree_t *dup_root;
3836	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3837
3838	for (node = root; ; )
3839	{
3840	/* Create a new tree and link it back to the current parent. */
3841	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3842	if (*p_new == NULL)
3843	return NULL;
3844	(*p_new)->parent = dup_node;
3845	(*p_new)->token.duplicated = 1;
3846	dup_node = *p_new;
3847
3848	/* Go to the left node, or up and to the right. */
3849	if (node->left)
3850	{
3851	node = node->left;
3852	p_new = &dup_node->left;
3853	}
3854	else
3855	{
3856	const bin_tree_t *prev = NULL;
3857	while (node->right == prev \|\| node->right == NULL)
3858	{
3859	prev = node;
3860	node = node->parent;
3861	dup_node = dup_node->parent;
3862	if (!node)
3863	return dup_root;
3864	}
3865	node = node->right;
3866	p_new = &dup_node->right;
3867	}
3868	}
3869	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/app-arch/tar/lib/regcomp.c

Download in other formats: