Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: vendor/glibc/current/posix/regcomp.c

Visit:

Last change on this file was 2237, checked in by (none), 20 years ago
This commit was manufactured by cvs2svn to create branch 'GNU'.
Property cvs2svn:cvs-rev set to `1.1` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 109.8 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, write to the Free
18	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19	02111-1307 USA. */
20
21	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
22	int length, reg_syntax_t syntax);
23	static void re_compile_fastmap_iter (regex_t *bufp,
24	const re_dfastate_t *init_state,
25	char *fastmap);
26	static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27	static void init_word_char (re_dfa_t *dfa);
28	#ifdef RE_ENABLE_I18N
29	static void free_charset (re_charset_t *cset);
30	#endif /* RE_ENABLE_I18N */
31	static void free_workarea_compile (regex_t *preg);
32	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33	#ifdef RE_ENABLE_I18N
34	static void optimize_utf8 (re_dfa_t *dfa);
35	#endif
36	static reg_errcode_t analyze (regex_t *preg);
37	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38	static reg_errcode_t preorder (bin_tree_t *root,
39	reg_errcode_t (fn (void , bin_tree_t )),
40	void *extra);
41	static reg_errcode_t postorder (bin_tree_t *root,
42	reg_errcode_t (fn (void , bin_tree_t )),
43	void *extra);
44	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
45	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
46	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
47	bin_tree_t *node);
48	static reg_errcode_t calc_first (void extra, bin_tree_t node);
49	static reg_errcode_t calc_next (void extra, bin_tree_t node);
50	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
51	static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52	int top_clone_node, int root_node,
53	unsigned int constraint);
54	static reg_errcode_t duplicate_node (int new_idx, re_dfa_t dfa, int org_idx,
55	unsigned int constraint);
56	static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57	unsigned int constraint);
58	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
60	int node, int root);
61	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62	static int fetch_number (re_string_t input, re_token_t token,
63	reg_syntax_t syntax);
64	static void fetch_token (re_token_t result, re_string_t input,
65	reg_syntax_t syntax);
66	static int peek_token (re_token_t token, re_string_t input,
67	reg_syntax_t syntax);
68	static int peek_token_bracket (re_token_t token, re_string_t input,
69	reg_syntax_t syntax);
70	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
71	reg_syntax_t syntax, reg_errcode_t *err);
72	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
73	re_token_t *token, reg_syntax_t syntax,
74	int nest, reg_errcode_t *err);
75	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
76	re_token_t *token, reg_syntax_t syntax,
77	int nest, reg_errcode_t *err);
78	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
79	re_token_t *token, reg_syntax_t syntax,
80	int nest, reg_errcode_t *err);
81	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
82	re_token_t *token, reg_syntax_t syntax,
83	int nest, reg_errcode_t *err);
84	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
85	re_dfa_t dfa, re_token_t token,
86	reg_syntax_t syntax, reg_errcode_t *err);
87	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
88	re_token_t *token, reg_syntax_t syntax,
89	reg_errcode_t *err);
90	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91	re_string_t *regexp,
92	re_token_t *token, int token_len,
93	re_dfa_t *dfa,
94	reg_syntax_t syntax,
95	int accept_hyphen);
96	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97	re_string_t *regexp,
98	re_token_t *token);
99	#ifndef _LIBC
100	# ifdef RE_ENABLE_I18N
101	static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102	re_charset_t mbcset, int range_alloc,
103	bracket_elem_t *start_elem,
104	bracket_elem_t *end_elem);
105	static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106	re_charset_t *mbcset,
107	int *coll_sym_alloc,
108	const unsigned char *name);
109	# else /* not RE_ENABLE_I18N */
110	static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111	bracket_elem_t *start_elem,
112	bracket_elem_t *end_elem);
113	static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114	const unsigned char *name);
115	# endif /* not RE_ENABLE_I18N */
116	#endif /* not _LIBC */
117	#ifdef RE_ENABLE_I18N
118	static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119	re_charset_t *mbcset,
120	int *equiv_class_alloc,
121	const unsigned char *name);
122	static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123	re_bitset_ptr_t sbcset,
124	re_charset_t *mbcset,
125	int *char_class_alloc,
126	const unsigned char *class_name,
127	reg_syntax_t syntax);
128	#else /* not RE_ENABLE_I18N */
129	static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130	const unsigned char *name);
131	static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132	re_bitset_ptr_t sbcset,
133	const unsigned char *class_name,
134	reg_syntax_t syntax);
135	#endif /* not RE_ENABLE_I18N */
136	static bin_tree_t build_charclass_op (re_dfa_t dfa,
137	unsigned RE_TRANSLATE_TYPE trans,
138	const unsigned char *class_name,
139	const unsigned char *extra,
140	int non_match, reg_errcode_t *err);
141	static bin_tree_t create_tree (re_dfa_t dfa,
142	bin_tree_t left, bin_tree_t right,
143	re_token_type_t type);
144	static bin_tree_t create_token_tree (re_dfa_t dfa,
145	bin_tree_t left, bin_tree_t right,
146	const re_token_t *token);
147	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
148	static void free_token (re_token_t *node);
149	static reg_errcode_t free_tree (void extra, bin_tree_t node);
150	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
151
152
153	/* This table gives an error message for each of the error codes listed
154	in regex.h. Obviously the order here has to be same as there.
155	POSIX doesn't require that we do anything for REG_NOERROR,
156	but why not be nice? */
157
158	const char __re_error_msgid[] attribute_hidden =
159	{
160	#define REG_NOERROR_IDX 0
161	gettext_noop ("Success") /* REG_NOERROR */
162	"\0"
163	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
164	gettext_noop ("No match") /* REG_NOMATCH */
165	"\0"
166	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
167	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168	"\0"
169	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
170	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171	"\0"
172	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
173	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174	"\0"
175	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
176	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177	"\0"
178	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
179	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180	"\0"
181	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
182	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183	"\0"
184	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
185	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186	"\0"
187	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
188	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189	"\0"
190	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
191	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192	"\0"
193	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
194	gettext_noop ("Invalid range end") /* REG_ERANGE */
195	"\0"
196	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
197	gettext_noop ("Memory exhausted") /* REG_ESPACE */
198	"\0"
199	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
200	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201	"\0"
202	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
203	gettext_noop ("Premature end of regular expression") /* REG_EEND */
204	"\0"
205	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
206	gettext_noop ("Regular expression too big") /* REG_ESIZE */
207	"\0"
208	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
209	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
210	};
211
212	const size_t __re_error_msgid_idx[] attribute_hidden =
213	{
214	REG_NOERROR_IDX,
215	REG_NOMATCH_IDX,
216	REG_BADPAT_IDX,
217	REG_ECOLLATE_IDX,
218	REG_ECTYPE_IDX,
219	REG_EESCAPE_IDX,
220	REG_ESUBREG_IDX,
221	REG_EBRACK_IDX,
222	REG_EPAREN_IDX,
223	REG_EBRACE_IDX,
224	REG_BADBR_IDX,
225	REG_ERANGE_IDX,
226	REG_ESPACE_IDX,
227	REG_BADRPT_IDX,
228	REG_EEND_IDX,
229	REG_ESIZE_IDX,
230	REG_ERPAREN_IDX
231	};
232
233
234	/* Entry points for GNU code. */
235
236	/* re_compile_pattern is the GNU regular expression compiler: it
237	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
238	Returns 0 if the pattern was valid, otherwise an error string.
239
240	Assumes the `allocated' (and perhaps `buffer') and `translate' fields
241	are set in BUFP on entry. */
242
243	const char *
244	re_compile_pattern (pattern, length, bufp)
245	const char *pattern;
246	size_t length;
247	struct re_pattern_buffer *bufp;
248	{
249	reg_errcode_t ret;
250
251	/* And GNU code determines whether or not to get register information
252	by passing null for the REGS argument to re_match, etc., not by
253	setting no_sub, unless RE_NO_SUB is set. */
254	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
255
256	/* Match anchors at newline. */
257	bufp->newline_anchor = 1;
258
259	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
260
261	if (!ret)
262	return NULL;
263	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
264	}
265	#ifdef _LIBC
266	weak_alias (__re_compile_pattern, re_compile_pattern)
267	#endif
268
269	/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
270	also be assigned to arbitrarily: each pattern buffer stores its own
271	syntax, so it can be changed between regex compilations. */
272	/* This has no initializer because initialized variables in Emacs
273	become read-only after dumping. */
274	reg_syntax_t re_syntax_options;
275
276
277	/* Specify the precise syntax of regexps for compilation. This provides
278	for compatibility for various utilities which historically have
279	different, incompatible syntaxes.
280
281	The argument SYNTAX is a bit mask comprised of the various bits
282	defined in regex.h. We return the old syntax. */
283
284	reg_syntax_t
285	re_set_syntax (syntax)
286	reg_syntax_t syntax;
287	{
288	reg_syntax_t ret = re_syntax_options;
289
290	re_syntax_options = syntax;
291	return ret;
292	}
293	#ifdef _LIBC
294	weak_alias (__re_set_syntax, re_set_syntax)
295	#endif
296
297	int
298	re_compile_fastmap (bufp)
299	struct re_pattern_buffer *bufp;
300	{
301	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
302	char *fastmap = bufp->fastmap;
303
304	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
305	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
306	if (dfa->init_state != dfa->init_state_word)
307	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
308	if (dfa->init_state != dfa->init_state_nl)
309	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
310	if (dfa->init_state != dfa->init_state_begbuf)
311	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
312	bufp->fastmap_accurate = 1;
313	return 0;
314	}
315	#ifdef _LIBC
316	weak_alias (__re_compile_fastmap, re_compile_fastmap)
317	#endif
318
319	static inline void
320	__attribute ((always_inline))
321	re_set_fastmap (char *fastmap, int icase, int ch)
322	{
323	fastmap[ch] = 1;
324	if (icase)
325	fastmap[tolower (ch)] = 1;
326	}
327
328	/* Helper function for re_compile_fastmap.
329	Compile fastmap for the initial_state INIT_STATE. */
330
331	static void
332	re_compile_fastmap_iter (bufp, init_state, fastmap)
333	regex_t *bufp;
334	const re_dfastate_t *init_state;
335	char *fastmap;
336	{
337	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
338	int node_cnt;
339	int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
340	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
341	{
342	int node = init_state->nodes.elems[node_cnt];
343	re_token_type_t type = dfa->nodes[node].type;
344
345	if (type == CHARACTER)
346	{
347	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
348	#ifdef RE_ENABLE_I18N
349	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
350	{
351	unsigned char buf = alloca (dfa->mb_cur_max), p;
352	wchar_t wc;
353	mbstate_t state;
354
355	p = buf;
356	*p++ = dfa->nodes[node].opr.c;
357	while (++node < dfa->nodes_len
358	&& dfa->nodes[node].type == CHARACTER
359	&& dfa->nodes[node].mb_partial)
360	*p++ = dfa->nodes[node].opr.c;
361	memset (&state, 0, sizeof (state));
362	if (mbrtowc (&wc, (const char *) buf, p - buf,
363	&state) == p - buf
364	&& (__wcrtomb ((char *) buf, towlower (wc), &state)
365	!= (size_t) -1))
366	re_set_fastmap (fastmap, 0, buf[0]);
367	}
368	#endif
369	}
370	else if (type == SIMPLE_BRACKET)
371	{
372	int i, j, ch;
373	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
374	for (j = 0; j < UINT_BITS; ++j, ++ch)
375	if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
376	re_set_fastmap (fastmap, icase, ch);
377	}
378	#ifdef RE_ENABLE_I18N
379	else if (type == COMPLEX_BRACKET)
380	{
381	int i;
382	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
383	if (cset->non_match \|\| cset->ncoll_syms \|\| cset->nequiv_classes
384	\|\| cset->nranges \|\| cset->nchar_classes)
385	{
386	# ifdef _LIBC
387	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
388	{
389	/* In this case we want to catch the bytes which are
390	the first byte of any collation elements.
391	e.g. In da_DK, we want to catch 'a' since "aa"
392	is a valid collation element, and don't catch
393	'b' since 'b' is the only collation element
394	which starts from 'b'. */
395	int j, ch;
396	const int32_t table = (const int32_t )
397	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
398	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
399	for (j = 0; j < UINT_BITS; ++j, ++ch)
400	if (table[ch] < 0)
401	re_set_fastmap (fastmap, icase, ch);
402	}
403	# else
404	if (dfa->mb_cur_max > 1)
405	for (i = 0; i < SBC_MAX; ++i)
406	if (__btowc (i) == WEOF)
407	re_set_fastmap (fastmap, icase, i);
408	# endif /* not _LIBC */
409	}
410	for (i = 0; i < cset->nmbchars; ++i)
411	{
412	char buf[256];
413	mbstate_t state;
414	memset (&state, '\0', sizeof (state));
415	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
416	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
417	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
418	{
419	if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
420	!= (size_t) -1)
421	re_set_fastmap (fastmap, 0, (unsigned char ) buf);
422	}
423	}
424	}
425	#endif /* RE_ENABLE_I18N */
426	else if (type == OP_PERIOD
427	#ifdef RE_ENABLE_I18N
428	\|\| type == OP_UTF8_PERIOD
429	#endif /* RE_ENABLE_I18N */
430	\|\| type == END_OF_RE)
431	{
432	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
433	if (type == END_OF_RE)
434	bufp->can_be_null = 1;
435	return;
436	}
437	}
438	}
439
440
441	/* Entry point for POSIX code. */
442	/* regcomp takes a regular expression as a string and compiles it.
443
444	PREG is a regex_t *. We do not expect any fields to be initialized,
445	since POSIX says we shouldn't. Thus, we set
446
447	`buffer' to the compiled pattern;
448	`used' to the length of the compiled pattern;
449	`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
450	REG_EXTENDED bit in CFLAGS is set; otherwise, to
451	RE_SYNTAX_POSIX_BASIC;
452	`newline_anchor' to REG_NEWLINE being set in CFLAGS;
453	`fastmap' to an allocated space for the fastmap;
454	`fastmap_accurate' to zero;
455	`re_nsub' to the number of subexpressions in PATTERN.
456
457	PATTERN is the address of the pattern string.
458
459	CFLAGS is a series of bits which affect compilation.
460
461	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
462	use POSIX basic syntax.
463
464	If REG_NEWLINE is set, then . and [^...] don't match newline.
465	Also, regexec will try a match beginning after every newline.
466
467	If REG_ICASE is set, then we considers upper- and lowercase
468	versions of letters to be equivalent when matching.
469
470	If REG_NOSUB is set, then when PREG is passed to regexec, that
471	routine will report only success or failure, and nothing about the
472	registers.
473
474	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
475	the return codes and their meanings.) */
476
477	int
478	regcomp (preg, pattern, cflags)
479	regex_t *__restrict preg;
480	const char *__restrict pattern;
481	int cflags;
482	{
483	reg_errcode_t ret;
484	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
485	: RE_SYNTAX_POSIX_BASIC);
486
487	preg->buffer = NULL;
488	preg->allocated = 0;
489	preg->used = 0;
490
491	/* Try to allocate space for the fastmap. */
492	preg->fastmap = re_malloc (char, SBC_MAX);
493	if (BE (preg->fastmap == NULL, 0))
494	return REG_ESPACE;
495
496	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
497
498	/* If REG_NEWLINE is set, newlines are treated differently. */
499	if (cflags & REG_NEWLINE)
500	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
501	syntax &= ~RE_DOT_NEWLINE;
502	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
503	/* It also changes the matching behavior. */
504	preg->newline_anchor = 1;
505	}
506	else
507	preg->newline_anchor = 0;
508	preg->no_sub = !!(cflags & REG_NOSUB);
509	preg->translate = NULL;
510
511	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
512
513	/* POSIX doesn't distinguish between an unmatched open-group and an
514	unmatched close-group: both are REG_EPAREN. */
515	if (ret == REG_ERPAREN)
516	ret = REG_EPAREN;
517
518	/* We have already checked preg->fastmap != NULL. */
519	if (BE (ret == REG_NOERROR, 1))
520	/* Compute the fastmap now, since regexec cannot modify the pattern
521	buffer. This function never fails in this implementation. */
522	(void) re_compile_fastmap (preg);
523	else
524	{
525	/* Some error occurred while compiling the expression. */
526	re_free (preg->fastmap);
527	preg->fastmap = NULL;
528	}
529
530	return (int) ret;
531	}
532	#ifdef _LIBC
533	weak_alias (__regcomp, regcomp)
534	#endif
535
536	/* Returns a message corresponding to an error code, ERRCODE, returned
537	from either regcomp or regexec. We don't use PREG here. */
538
539	size_t
540	regerror (errcode, preg, errbuf, errbuf_size)
541	int errcode;
542	const regex_t *preg;
543	char *errbuf;
544	size_t errbuf_size;
545	{
546	const char *msg;
547	size_t msg_size;
548
549	if (BE (errcode < 0
550	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
551	/ sizeof (__re_error_msgid_idx[0])), 0))
552	/* Only error codes returned by the rest of the code should be passed
553	to this routine. If we are given anything else, or if other regex
554	code generates an invalid error code, then the program has a bug.
555	Dump core so we can fix it. */
556	abort ();
557
558	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
559
560	msg_size = strlen (msg) + 1; /* Includes the null. */
561
562	if (BE (errbuf_size != 0, 1))
563	{
564	if (BE (msg_size > errbuf_size, 0))
565	{
566	#if defined HAVE_MEMPCPY \|\| defined _LIBC
567	((char ) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
568	#else
569	memcpy (errbuf, msg, errbuf_size - 1);
570	errbuf[errbuf_size - 1] = 0;
571	#endif
572	}
573	else
574	memcpy (errbuf, msg, msg_size);
575	}
576
577	return msg_size;
578	}
579	#ifdef _LIBC
580	weak_alias (__regerror, regerror)
581	#endif
582
583
584	#ifdef RE_ENABLE_I18N
585	/* This static array is used for the map to single-byte characters when
586	UTF-8 is used. Otherwise we would allocate memory just to initialize
587	it the same all the time. UTF-8 is the preferred encoding so this is
588	a worthwhile optimization. */
589	static const bitset utf8_sb_map =
590	{
591	/* Set the first 128 bits. */
592	# if UINT_MAX == 0xffffffff
593	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
594	# else
595	# error "Add case for new unsigned int size"
596	# endif
597	};
598	#endif
599
600
601	static void
602	free_dfa_content (re_dfa_t *dfa)
603	{
604	int i, j;
605
606	if (dfa->nodes)
607	for (i = 0; i < dfa->nodes_len; ++i)
608	free_token (dfa->nodes + i);
609	re_free (dfa->nexts);
610	for (i = 0; i < dfa->nodes_len; ++i)
611	{
612	if (dfa->eclosures != NULL)
613	re_node_set_free (dfa->eclosures + i);
614	if (dfa->inveclosures != NULL)
615	re_node_set_free (dfa->inveclosures + i);
616	if (dfa->edests != NULL)
617	re_node_set_free (dfa->edests + i);
618	}
619	re_free (dfa->edests);
620	re_free (dfa->eclosures);
621	re_free (dfa->inveclosures);
622	re_free (dfa->nodes);
623
624	if (dfa->state_table)
625	for (i = 0; i <= dfa->state_hash_mask; ++i)
626	{
627	struct re_state_table_entry *entry = dfa->state_table + i;
628	for (j = 0; j < entry->num; ++j)
629	{
630	re_dfastate_t *state = entry->array[j];
631	free_state (state);
632	}
633	re_free (entry->array);
634	}
635	re_free (dfa->state_table);
636	#ifdef RE_ENABLE_I18N
637	if (dfa->sb_char != utf8_sb_map)
638	re_free (dfa->sb_char);
639	#endif
640	re_free (dfa->subexp_map);
641	#ifdef DEBUG
642	re_free (dfa->re_str);
643	#endif
644
645	re_free (dfa);
646	}
647
648
649	/* Free dynamically allocated space used by PREG. */
650
651	void
652	regfree (preg)
653	regex_t *preg;
654	{
655	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
656	if (BE (dfa != NULL, 1))
657	free_dfa_content (dfa);
658	preg->buffer = NULL;
659	preg->allocated = 0;
660
661	re_free (preg->fastmap);
662	preg->fastmap = NULL;
663
664	re_free (preg->translate);
665	preg->translate = NULL;
666	}
667	#ifdef _LIBC
668	weak_alias (__regfree, regfree)
669	#endif
670
671
672	/* Entry points compatible with 4.2 BSD regex library. We don't define
673	them unless specifically requested. */
674
675	#if defined _REGEX_RE_COMP \|\| defined _LIBC
676
677	/* BSD has one and only one pattern buffer. */
678	static struct re_pattern_buffer re_comp_buf;
679
680	char *
681	# ifdef _LIBC
682	/* Make these definitions weak in libc, so POSIX programs can redefine
683	these names if they don't use our functions, and still use
684	regcomp/regexec above without link errors. */
685	weak_function
686	# endif
687	re_comp (s)
688	const char *s;
689	{
690	reg_errcode_t ret;
691	char *fastmap;
692
693	if (!s)
694	{
695	if (!re_comp_buf.buffer)
696	return gettext ("No previous regular expression");
697	return 0;
698	}
699
700	if (re_comp_buf.buffer)
701	{
702	fastmap = re_comp_buf.fastmap;
703	re_comp_buf.fastmap = NULL;
704	__regfree (&re_comp_buf);
705	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
706	re_comp_buf.fastmap = fastmap;
707	}
708
709	if (re_comp_buf.fastmap == NULL)
710	{
711	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
712	if (re_comp_buf.fastmap == NULL)
713	return (char *) gettext (__re_error_msgid
714	+ __re_error_msgid_idx[(int) REG_ESPACE]);
715	}
716
717	/* Since `re_exec' always passes NULL for the `regs' argument, we
718	don't need to initialize the pattern buffer fields which affect it. */
719
720	/* Match anchors at newlines. */
721	re_comp_buf.newline_anchor = 1;
722
723	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
724
725	if (!ret)
726	return NULL;
727
728	/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
729	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
730	}
731
732	#ifdef _LIBC
733	libc_freeres_fn (free_mem)
734	{
735	__regfree (&re_comp_buf);
736	}
737	#endif
738
739	#endif /* _REGEX_RE_COMP */
740
741
742	/* Internal entry point.
743	Compile the regular expression PATTERN, whose length is LENGTH.
744	SYNTAX indicate regular expression's syntax. */
745
746	static reg_errcode_t
747	re_compile_internal (preg, pattern, length, syntax)
748	regex_t *preg;
749	const char * pattern;
750	int length;
751	reg_syntax_t syntax;
752	{
753	reg_errcode_t err = REG_NOERROR;
754	re_dfa_t *dfa;
755	re_string_t regexp;
756
757	/* Initialize the pattern buffer. */
758	preg->fastmap_accurate = 0;
759	preg->syntax = syntax;
760	preg->not_bol = preg->not_eol = 0;
761	preg->used = 0;
762	preg->re_nsub = 0;
763	preg->can_be_null = 0;
764	preg->regs_allocated = REGS_UNALLOCATED;
765
766	/* Initialize the dfa. */
767	dfa = (re_dfa_t *) preg->buffer;
768	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
769	{
770	/* If zero allocated, but buffer is non-null, try to realloc
771	enough space. This loses if buffer's address is bogus, but
772	that is the user's responsibility. If ->buffer is NULL this
773	is a simple allocation. */
774	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
775	if (dfa == NULL)
776	return REG_ESPACE;
777	preg->allocated = sizeof (re_dfa_t);
778	preg->buffer = (unsigned char *) dfa;
779	}
780	preg->used = sizeof (re_dfa_t);
781
782	__libc_lock_init (dfa->lock);
783
784	err = init_dfa (dfa, length);
785	if (BE (err != REG_NOERROR, 0))
786	{
787	free_dfa_content (dfa);
788	preg->buffer = NULL;
789	preg->allocated = 0;
790	return err;
791	}
792	#ifdef DEBUG
793	dfa->re_str = re_malloc (char, length + 1);
794	strncpy (dfa->re_str, pattern, length + 1);
795	#endif
796
797	err = re_string_construct (&regexp, pattern, length, preg->translate,
798	syntax & RE_ICASE, dfa);
799	if (BE (err != REG_NOERROR, 0))
800	{
801	re_compile_internal_free_return:
802	free_workarea_compile (preg);
803	re_string_destruct (&regexp);
804	free_dfa_content (dfa);
805	preg->buffer = NULL;
806	preg->allocated = 0;
807	return err;
808	}
809
810	/* Parse the regular expression, and build a structure tree. */
811	preg->re_nsub = 0;
812	dfa->str_tree = parse (&regexp, preg, syntax, &err);
813	if (BE (dfa->str_tree == NULL, 0))
814	goto re_compile_internal_free_return;
815
816	/* Analyze the tree and create the nfa. */
817	err = analyze (preg);
818	if (BE (err != REG_NOERROR, 0))
819	goto re_compile_internal_free_return;
820
821	#ifdef RE_ENABLE_I18N
822	/* If possible, do searching in single byte encoding to speed things up. */
823	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
824	optimize_utf8 (dfa);
825	#endif
826
827	/* Then create the initial state of the dfa. */
828	err = create_initial_state (dfa);
829
830	/* Release work areas. */
831	free_workarea_compile (preg);
832	re_string_destruct (&regexp);
833
834	if (BE (err != REG_NOERROR, 0))
835	{
836	free_dfa_content (dfa);
837	preg->buffer = NULL;
838	preg->allocated = 0;
839	}
840
841	return err;
842	}
843
844	/* Initialize DFA. We use the length of the regular expression PAT_LEN
845	as the initial length of some arrays. */
846
847	static reg_errcode_t
848	init_dfa (dfa, pat_len)
849	re_dfa_t *dfa;
850	int pat_len;
851	{
852	int table_size;
853	#ifndef _LIBC
854	char *codeset_name;
855	#endif
856
857	memset (dfa, '\0', sizeof (re_dfa_t));
858
859	/* Force allocation of str_tree_storage the first time. */
860	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
861
862	dfa->nodes_alloc = pat_len + 1;
863	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
864
865	dfa->states_alloc = pat_len + 1;
866
867	/* table_size = 2 ^ ceil(log pat_len) */
868	for (table_size = 1; table_size > 0; table_size <<= 1)
869	if (table_size > pat_len)
870	break;
871
872	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
873	dfa->state_hash_mask = table_size - 1;
874
875	dfa->mb_cur_max = MB_CUR_MAX;
876	#ifdef _LIBC
877	if (dfa->mb_cur_max == 6
878	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
879	dfa->is_utf8 = 1;
880	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
881	!= 0);
882	#else
883	# ifdef HAVE_LANGINFO_CODESET
884	codeset_name = nl_langinfo (CODESET);
885	# else
886	codeset_name = getenv ("LC_ALL");
887	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
888	codeset_name = getenv ("LC_CTYPE");
889	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
890	codeset_name = getenv ("LANG");
891	if (codeset_name == NULL)
892	codeset_name = "";
893	else if (strchr (codeset_name, '.') != NULL)
894	codeset_name = strchr (codeset_name, '.') + 1;
895	# endif
896
897	if (strcasecmp (codeset_name, "UTF-8") == 0
898	\|\| strcasecmp (codeset_name, "UTF8") == 0)
899	dfa->is_utf8 = 1;
900
901	/* We check exhaustively in the loop below if this charset is a
902	superset of ASCII. */
903	dfa->map_notascii = 0;
904	#endif
905
906	#ifdef RE_ENABLE_I18N
907	if (dfa->mb_cur_max > 1)
908	{
909	if (dfa->is_utf8)
910	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
911	else
912	{
913	int i, j, ch;
914
915	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
916	if (BE (dfa->sb_char == NULL, 0))
917	return REG_ESPACE;
918
919	/* Clear all bits by, then set those corresponding to single
920	byte chars. */
921	bitset_empty (dfa->sb_char);
922
923	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
924	for (j = 0; j < UINT_BITS; ++j, ++ch)
925	{
926	wchar_t wch = __btowc (ch);
927	if (wch != WEOF)
928	dfa->sb_char[i] \|= 1 << j;
929	# ifndef _LIBC
930	if (isascii (ch) && wch != (wchar_t) ch)
931	dfa->map_notascii = 1;
932	# endif
933	}
934	}
935	}
936	#endif
937
938	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
939	return REG_ESPACE;
940	return REG_NOERROR;
941	}
942
943	/* Initialize WORD_CHAR table, which indicate which character is
944	"word". In this case "word" means that it is the word construction
945	character used by some operators like "\<", "\>", etc. */
946
947	static void
948	init_word_char (dfa)
949	re_dfa_t *dfa;
950	{
951	int i, j, ch;
952	dfa->word_ops_used = 1;
953	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
954	for (j = 0; j < UINT_BITS; ++j, ++ch)
955	if (isalnum (ch) \|\| ch == '_')
956	dfa->word_char[i] \|= 1 << j;
957	}
958
959	/* Free the work area which are only used while compiling. */
960
961	static void
962	free_workarea_compile (preg)
963	regex_t *preg;
964	{
965	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
966	bin_tree_storage_t storage, next;
967	for (storage = dfa->str_tree_storage; storage; storage = next)
968	{
969	next = storage->next;
970	re_free (storage);
971	}
972	dfa->str_tree_storage = NULL;
973	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
974	dfa->str_tree = NULL;
975	re_free (dfa->org_indices);
976	dfa->org_indices = NULL;
977	}
978
979	/* Create initial states for all contexts. */
980
981	static reg_errcode_t
982	create_initial_state (dfa)
983	re_dfa_t *dfa;
984	{
985	int first, i;
986	reg_errcode_t err;
987	re_node_set init_nodes;
988
989	/* Initial states have the epsilon closure of the node which is
990	the first node of the regular expression. */
991	first = dfa->str_tree->first->node_idx;
992	dfa->init_node = first;
993	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
994	if (BE (err != REG_NOERROR, 0))
995	return err;
996
997	/* The back-references which are in initial states can epsilon transit,
998	since in this case all of the subexpressions can be null.
999	Then we add epsilon closures of the nodes which are the next nodes of
1000	the back-references. */
1001	if (dfa->nbackref > 0)
1002	for (i = 0; i < init_nodes.nelem; ++i)
1003	{
1004	int node_idx = init_nodes.elems[i];
1005	re_token_type_t type = dfa->nodes[node_idx].type;
1006
1007	int clexp_idx;
1008	if (type != OP_BACK_REF)
1009	continue;
1010	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011	{
1012	re_token_t *clexp_node;
1013	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014	if (clexp_node->type == OP_CLOSE_SUBEXP
1015	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016	break;
1017	}
1018	if (clexp_idx == init_nodes.nelem)
1019	continue;
1020
1021	if (type == OP_BACK_REF)
1022	{
1023	int dest_idx = dfa->edests[node_idx].elems[0];
1024	if (!re_node_set_contains (&init_nodes, dest_idx))
1025	{
1026	re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027	i = 0;
1028	}
1029	}
1030	}
1031
1032	/* It must be the first time to invoke acquire_state. */
1033	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1034	/* We don't check ERR here, since the initial state must not be NULL. */
1035	if (BE (dfa->init_state == NULL, 0))
1036	return err;
1037	if (dfa->init_state->has_constraint)
1038	{
1039	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1040	CONTEXT_WORD);
1041	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1042	CONTEXT_NEWLINE);
1043	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1044	&init_nodes,
1045	CONTEXT_NEWLINE
1046	\| CONTEXT_BEGBUF);
1047	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
1048	\|\| dfa->init_state_begbuf == NULL, 0))
1049	return err;
1050	}
1051	else
1052	dfa->init_state_word = dfa->init_state_nl
1053	= dfa->init_state_begbuf = dfa->init_state;
1054
1055	re_node_set_free (&init_nodes);
1056	return REG_NOERROR;
1057	}
1058
1059
1060	#ifdef RE_ENABLE_I18N
1061	/* If it is possible to do searching in single byte encoding instead of UTF-8
1062	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063	DFA nodes where needed. */
1064
1065	static void
1066	optimize_utf8 (dfa)
1067	re_dfa_t *dfa;
1068	{
1069	int node, i, mb_chars = 0, has_period = 0;
1070
1071	for (node = 0; node < dfa->nodes_len; ++node)
1072	switch (dfa->nodes[node].type)
1073	{
1074	case CHARACTER:
1075	if (dfa->nodes[node].opr.c >= 0x80)
1076	mb_chars = 1;
1077	break;
1078	case ANCHOR:
1079	switch (dfa->nodes[node].opr.idx)
1080	{
1081	case LINE_FIRST:
1082	case LINE_LAST:
1083	case BUF_FIRST:
1084	case BUF_LAST:
1085	break;
1086	default:
1087	/* Word anchors etc. cannot be handled. */
1088	return;
1089	}
1090	break;
1091	case OP_PERIOD:
1092	has_period = 1;
1093	break;
1094	case OP_BACK_REF:
1095	case OP_ALT:
1096	case END_OF_RE:
1097	case OP_DUP_ASTERISK:
1098	case OP_OPEN_SUBEXP:
1099	case OP_CLOSE_SUBEXP:
1100	break;
1101	case COMPLEX_BRACKET:
1102	return;
1103	case SIMPLE_BRACKET:
1104	/* Just double check. */
1105	for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1106	if (dfa->nodes[node].opr.sbcset[i])
1107	return;
1108	break;
1109	default:
1110	abort ();
1111	}
1112
1113	if (mb_chars \|\| has_period)
1114	for (node = 0; node < dfa->nodes_len; ++node)
1115	{
1116	if (dfa->nodes[node].type == CHARACTER
1117	&& dfa->nodes[node].opr.c >= 0x80)
1118	dfa->nodes[node].mb_partial = 0;
1119	else if (dfa->nodes[node].type == OP_PERIOD)
1120	dfa->nodes[node].type = OP_UTF8_PERIOD;
1121	}
1122
1123	/* The search can be in single byte locale. */
1124	dfa->mb_cur_max = 1;
1125	dfa->is_utf8 = 0;
1126	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
1127	}
1128	#endif
1129
1130
1131	/* Analyze the structure tree, and calculate "first", "next", "edest",
1132	"eclosure", and "inveclosure". */
1133
1134	static reg_errcode_t
1135	analyze (preg)
1136	regex_t *preg;
1137	{
1138	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
1139	reg_errcode_t ret;
1140
1141	/* Allocate arrays. */
1142	dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1143	dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1144	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1145	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1146	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
1147	\|\| dfa->eclosures == NULL, 0))
1148	return REG_ESPACE;
1149
1150	dfa->subexp_map = re_malloc (int, preg->re_nsub);
1151	if (dfa->subexp_map != NULL)
1152	{
1153	int i;
1154	for (i = 0; i < preg->re_nsub; i++)
1155	dfa->subexp_map[i] = i;
1156	preorder (dfa->str_tree, optimize_subexps, dfa);
1157	for (i = 0; i < preg->re_nsub; i++)
1158	if (dfa->subexp_map[i] != i)
1159	break;
1160	if (i == preg->re_nsub)
1161	{
1162	free (dfa->subexp_map);
1163	dfa->subexp_map = NULL;
1164	}
1165	}
1166
1167	ret = postorder (dfa->str_tree, lower_subexps, preg);
1168	if (BE (ret != REG_NOERROR, 0))
1169	return ret;
1170	ret = postorder (dfa->str_tree, calc_first, dfa);
1171	if (BE (ret != REG_NOERROR, 0))
1172	return ret;
1173	preorder (dfa->str_tree, calc_next, dfa);
1174	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1175	if (BE (ret != REG_NOERROR, 0))
1176	return ret;
1177	ret = calc_eclosure (dfa);
1178	if (BE (ret != REG_NOERROR, 0))
1179	return ret;
1180
1181	/* We only need this during the prune_impossible_nodes pass in regexec.c;
1182	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1183	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1184	\|\| dfa->nbackref)
1185	{
1186	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1187	if (BE (dfa->inveclosures == NULL, 0))
1188	return REG_ESPACE;
1189	ret = calc_inveclosure (dfa);
1190	}
1191
1192	return ret;
1193	}
1194
1195	/* Our parse trees are very unbalanced, so we cannot use a stack to
1196	implement parse tree visits. Instead, we use parent pointers and
1197	some hairy code in these two functions. */
1198	static reg_errcode_t
1199	postorder (root, fn, extra)
1200	bin_tree_t *root;
1201	reg_errcode_t (fn (void , bin_tree_t ));
1202	void *extra;
1203	{
1204	bin_tree_t node, prev;
1205
1206	for (node = root; ; )
1207	{
1208	/* Descend down the tree, preferably to the left (or to the right
1209	if that's the only child). */
1210	while (node->left \|\| node->right)
1211	if (node->left)
1212	node = node->left;
1213	else
1214	node = node->right;
1215
1216	do
1217	{
1218	reg_errcode_t err = fn (extra, node);
1219	if (BE (err != REG_NOERROR, 0))
1220	return err;
1221	if (node->parent == NULL)
1222	return REG_NOERROR;
1223	prev = node;
1224	node = node->parent;
1225	}
1226	/* Go up while we have a node that is reached from the right. */
1227	while (node->right == prev \|\| node->right == NULL);
1228	node = node->right;
1229	}
1230	}
1231
1232	static reg_errcode_t
1233	preorder (root, fn, extra)
1234	bin_tree_t *root;
1235	reg_errcode_t (fn (void , bin_tree_t ));
1236	void *extra;
1237	{
1238	bin_tree_t *node;
1239
1240	for (node = root; ; )
1241	{
1242	reg_errcode_t err = fn (extra, node);
1243	if (BE (err != REG_NOERROR, 0))
1244	return err;
1245
1246	/* Go to the left node, or up and to the right. */
1247	if (node->left)
1248	node = node->left;
1249	else
1250	{
1251	bin_tree_t *prev = NULL;
1252	while (node->right == prev \|\| node->right == NULL)
1253	{
1254	prev = node;
1255	node = node->parent;
1256	if (!node)
1257	return REG_NOERROR;
1258	}
1259	node = node->right;
1260	}
1261	}
1262	}
1263
1264	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1266	backreferences as well. Requires a preorder visit. */
1267	static reg_errcode_t
1268	optimize_subexps (extra, node)
1269	void *extra;
1270	bin_tree_t *node;
1271	{
1272	re_dfa_t dfa = (re_dfa_t ) extra;
1273
1274	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1275	{
1276	int idx = node->token.opr.idx;
1277	node->token.opr.idx = dfa->subexp_map[idx];
1278	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
1279	}
1280
1281	else if (node->token.type == SUBEXP
1282	&& node->left && node->left->token.type == SUBEXP)
1283	{
1284	int other_idx = node->left->token.opr.idx;
1285
1286	node->left = node->left->left;
1287	if (node->left)
1288	node->left->parent = node;
1289
1290	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1291	if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1292	dfa->used_bkref_map &= ~(1 << other_idx);
1293	}
1294
1295	return REG_NOERROR;
1296	}
1297
1298	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1299	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1300	static reg_errcode_t
1301	lower_subexps (extra, node)
1302	void *extra;
1303	bin_tree_t *node;
1304	{
1305	regex_t preg = (regex_t ) extra;
1306	reg_errcode_t err = REG_NOERROR;
1307
1308	if (node->left && node->left->token.type == SUBEXP)
1309	{
1310	node->left = lower_subexp (&err, preg, node->left);
1311	if (node->left)
1312	node->left->parent = node;
1313	}
1314	if (node->right && node->right->token.type == SUBEXP)
1315	{
1316	node->right = lower_subexp (&err, preg, node->right);
1317	if (node->right)
1318	node->right->parent = node;
1319	}
1320
1321	return err;
1322	}
1323
1324	static bin_tree_t *
1325	lower_subexp (err, preg, node)
1326	reg_errcode_t *err;
1327	regex_t *preg;
1328	bin_tree_t *node;
1329	{
1330	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
1331	bin_tree_t *body = node->left;
1332	bin_tree_t op, cls, tree1, tree;
1333
1334	if (preg->no_sub
1335	/* We do not optimize empty subexpressions, because otherwise we may
1336	have bad CONCAT nodes with NULL children. This is obviously not
1337	very common, so we do not lose much. An example that triggers
1338	this case is the sed "script" /\(\)/x. */
1339	&& node->left != NULL
1340	&& (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1341	\|\| !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1342	return node->left;
1343
1344	/* Convert the SUBEXP node to the concatenation of an
1345	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349	tree = create_tree (dfa, op, tree1, CONCAT);
1350	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
1351	{
1352	*err = REG_ESPACE;
1353	return NULL;
1354	}
1355
1356	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358	return tree;
1359	}
1360
1361	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362	nodes. Requires a postorder visit. */
1363	static reg_errcode_t
1364	calc_first (extra, node)
1365	void *extra;
1366	bin_tree_t *node;
1367	{
1368	re_dfa_t dfa = (re_dfa_t ) extra;
1369	if (node->token.type == CONCAT)
1370	{
1371	node->first = node->left->first;
1372	node->node_idx = node->left->node_idx;
1373	}
1374	else
1375	{
1376	node->first = node;
1377	node->node_idx = re_dfa_add_node (dfa, node->token);
1378	if (BE (node->node_idx == -1, 0))
1379	return REG_ESPACE;
1380	}
1381	return REG_NOERROR;
1382	}
1383
1384	/* Pass 2: compute NEXT on the tree. Preorder visit. */
1385	static reg_errcode_t
1386	calc_next (extra, node)
1387	void *extra;
1388	bin_tree_t *node;
1389	{
1390	switch (node->token.type)
1391	{
1392	case OP_DUP_ASTERISK:
1393	node->left->next = node;
1394	break;
1395	case CONCAT:
1396	node->left->next = node->right->first;
1397	node->right->next = node->next;
1398	break;
1399	default:
1400	if (node->left)
1401	node->left->next = node->next;
1402	if (node->right)
1403	node->right->next = node->next;
1404	break;
1405	}
1406	return REG_NOERROR;
1407	}
1408
1409	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1410	static reg_errcode_t
1411	link_nfa_nodes (extra, node)
1412	void *extra;
1413	bin_tree_t *node;
1414	{
1415	re_dfa_t dfa = (re_dfa_t ) extra;
1416	int idx = node->node_idx;
1417	reg_errcode_t err = REG_NOERROR;
1418
1419	switch (node->token.type)
1420	{
1421	case CONCAT:
1422	break;
1423
1424	case END_OF_RE:
1425	assert (node->next == NULL);
1426	break;
1427
1428	case OP_DUP_ASTERISK:
1429	case OP_ALT:
1430	{
1431	int left, right;
1432	dfa->has_plural_match = 1;
1433	if (node->left != NULL)
1434	left = node->left->first->node_idx;
1435	else
1436	left = node->next->node_idx;
1437	if (node->right != NULL)
1438	right = node->right->first->node_idx;
1439	else
1440	right = node->next->node_idx;
1441	assert (left > -1);
1442	assert (right > -1);
1443	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1444	}
1445	break;
1446
1447	case ANCHOR:
1448	case OP_OPEN_SUBEXP:
1449	case OP_CLOSE_SUBEXP:
1450	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1451	break;
1452
1453	case OP_BACK_REF:
1454	dfa->nexts[idx] = node->next->node_idx;
1455	if (node->token.type == OP_BACK_REF)
1456	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1457	break;
1458
1459	default:
1460	assert (!IS_EPSILON_NODE (node->token.type));
1461	dfa->nexts[idx] = node->next->node_idx;
1462	break;
1463	}
1464
1465	return err;
1466	}
1467
1468	/* Duplicate the epsilon closure of the node ROOT_NODE.
1469	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1470	to their own constraint. */
1471
1472	static reg_errcode_t
1473	duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1474	init_constraint)
1475	re_dfa_t *dfa;
1476	int top_org_node, top_clone_node, root_node;
1477	unsigned int init_constraint;
1478	{
1479	reg_errcode_t err;
1480	int org_node, clone_node, ret;
1481	unsigned int constraint = init_constraint;
1482	for (org_node = top_org_node, clone_node = top_clone_node;;)
1483	{
1484	int org_dest, clone_dest;
1485	if (dfa->nodes[org_node].type == OP_BACK_REF)
1486	{
1487	/* If the back reference epsilon-transit, its destination must
1488	also have the constraint. Then duplicate the epsilon closure
1489	of the destination of the back reference, and store it in
1490	edests of the back reference. */
1491	org_dest = dfa->nexts[org_node];
1492	re_node_set_empty (dfa->edests + clone_node);
1493	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1494	if (BE (err != REG_NOERROR, 0))
1495	return err;
1496	dfa->nexts[clone_node] = dfa->nexts[org_node];
1497	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498	if (BE (ret < 0, 0))
1499	return REG_ESPACE;
1500	}
1501	else if (dfa->edests[org_node].nelem == 0)
1502	{
1503	/* In case of the node can't epsilon-transit, don't duplicate the
1504	destination and store the original destination as the
1505	destination of the node. */
1506	dfa->nexts[clone_node] = dfa->nexts[org_node];
1507	break;
1508	}
1509	else if (dfa->edests[org_node].nelem == 1)
1510	{
1511	/* In case of the node can epsilon-transit, and it has only one
1512	destination. */
1513	org_dest = dfa->edests[org_node].elems[0];
1514	re_node_set_empty (dfa->edests + clone_node);
1515	if (dfa->nodes[org_node].type == ANCHOR)
1516	{
1517	/* In case of the node has another constraint, append it. */
1518	if (org_node == root_node && clone_node != org_node)
1519	{
1520	/* ...but if the node is root_node itself, it means the
1521	epsilon closure have a loop, then tie it to the
1522	destination of the root_node. */
1523	ret = re_node_set_insert (dfa->edests + clone_node,
1524	org_dest);
1525	if (BE (ret < 0, 0))
1526	return REG_ESPACE;
1527	break;
1528	}
1529	constraint \|= dfa->nodes[org_node].opr.ctx_type;
1530	}
1531	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1532	if (BE (err != REG_NOERROR, 0))
1533	return err;
1534	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535	if (BE (ret < 0, 0))
1536	return REG_ESPACE;
1537	}
1538	else /* dfa->edests[org_node].nelem == 2 */
1539	{
1540	/* In case of the node can epsilon-transit, and it has two
1541	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1542	org_dest = dfa->edests[org_node].elems[0];
1543	re_node_set_empty (dfa->edests + clone_node);
1544	/* Search for a duplicated node which satisfies the constraint. */
1545	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546	if (clone_dest == -1)
1547	{
1548	/* There are no such a duplicated node, create a new one. */
1549	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1550	if (BE (err != REG_NOERROR, 0))
1551	return err;
1552	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553	if (BE (ret < 0, 0))
1554	return REG_ESPACE;
1555	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1556	root_node, constraint);
1557	if (BE (err != REG_NOERROR, 0))
1558	return err;
1559	}
1560	else
1561	{
1562	/* There are a duplicated node which satisfy the constraint,
1563	use it to avoid infinite loop. */
1564	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565	if (BE (ret < 0, 0))
1566	return REG_ESPACE;
1567	}
1568
1569	org_dest = dfa->edests[org_node].elems[1];
1570	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1571	if (BE (err != REG_NOERROR, 0))
1572	return err;
1573	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1574	if (BE (ret < 0, 0))
1575	return REG_ESPACE;
1576	}
1577	org_node = org_dest;
1578	clone_node = clone_dest;
1579	}
1580	return REG_NOERROR;
1581	}
1582
1583	/* Search for a node which is duplicated from the node ORG_NODE, and
1584	satisfies the constraint CONSTRAINT. */
1585
1586	static int
1587	search_duplicated_node (dfa, org_node, constraint)
1588	re_dfa_t *dfa;
1589	int org_node;
1590	unsigned int constraint;
1591	{
1592	int idx;
1593	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594	{
1595	if (org_node == dfa->org_indices[idx]
1596	&& constraint == dfa->nodes[idx].constraint)
1597	return idx; /* Found. */
1598	}
1599	return -1; /* Not found. */
1600	}
1601
1602	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603	The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1604	otherwise return the error code. */
1605
1606	static reg_errcode_t
1607	duplicate_node (new_idx, dfa, org_idx, constraint)
1608	re_dfa_t *dfa;
1609	int *new_idx, org_idx;
1610	unsigned int constraint;
1611	{
1612	int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1613	if (BE (dup_idx == -1, 0))
1614	return REG_ESPACE;
1615	dfa->nodes[dup_idx].constraint = constraint;
1616	if (dfa->nodes[org_idx].type == ANCHOR)
1617	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].opr.ctx_type;
1618	dfa->nodes[dup_idx].duplicated = 1;
1619
1620	/* Store the index of the original node. */
1621	dfa->org_indices[dup_idx] = org_idx;
1622	*new_idx = dup_idx;
1623	return REG_NOERROR;
1624	}
1625
1626	static reg_errcode_t
1627	calc_inveclosure (dfa)
1628	re_dfa_t *dfa;
1629	{
1630	int src, idx, ret;
1631	for (idx = 0; idx < dfa->nodes_len; ++idx)
1632	re_node_set_init_empty (dfa->inveclosures + idx);
1633
1634	for (src = 0; src < dfa->nodes_len; ++src)
1635	{
1636	int *elems = dfa->eclosures[src].elems;
1637	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1638	{
1639	ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1640	if (BE (ret == -1, 0))
1641	return REG_ESPACE;
1642	}
1643	}
1644
1645	return REG_NOERROR;
1646	}
1647
1648	/* Calculate "eclosure" for all the node in DFA. */
1649
1650	static reg_errcode_t
1651	calc_eclosure (dfa)
1652	re_dfa_t *dfa;
1653	{
1654	int node_idx, incomplete;
1655	#ifdef DEBUG
1656	assert (dfa->nodes_len > 0);
1657	#endif
1658	incomplete = 0;
1659	/* For each nodes, calculate epsilon closure. */
1660	for (node_idx = 0; ; ++node_idx)
1661	{
1662	reg_errcode_t err;
1663	re_node_set eclosure_elem;
1664	if (node_idx == dfa->nodes_len)
1665	{
1666	if (!incomplete)
1667	break;
1668	incomplete = 0;
1669	node_idx = 0;
1670	}
1671
1672	#ifdef DEBUG
1673	assert (dfa->eclosures[node_idx].nelem != -1);
1674	#endif
1675
1676	/* If we have already calculated, skip it. */
1677	if (dfa->eclosures[node_idx].nelem != 0)
1678	continue;
1679	/* Calculate epsilon closure of `node_idx'. */
1680	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1681	if (BE (err != REG_NOERROR, 0))
1682	return err;
1683
1684	if (dfa->eclosures[node_idx].nelem == 0)
1685	{
1686	incomplete = 1;
1687	re_node_set_free (&eclosure_elem);
1688	}
1689	}
1690	return REG_NOERROR;
1691	}
1692
1693	/* Calculate epsilon closure of NODE. */
1694
1695	static reg_errcode_t
1696	calc_eclosure_iter (new_set, dfa, node, root)
1697	re_node_set *new_set;
1698	re_dfa_t *dfa;
1699	int node, root;
1700	{
1701	reg_errcode_t err;
1702	unsigned int constraint;
1703	int i, incomplete;
1704	re_node_set eclosure;
1705	incomplete = 0;
1706	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1707	if (BE (err != REG_NOERROR, 0))
1708	return err;
1709
1710	/* This indicates that we are calculating this node now.
1711	We reference this value to avoid infinite loop. */
1712	dfa->eclosures[node].nelem = -1;
1713
1714	constraint = ((dfa->nodes[node].type == ANCHOR)
1715	? dfa->nodes[node].opr.ctx_type : 0);
1716	/* If the current node has constraints, duplicate all nodes.
1717	Since they must inherit the constraints. */
1718	if (constraint
1719	&& dfa->edests[node].nelem
1720	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1721	{
1722	int org_node, cur_node;
1723	org_node = cur_node = node;
1724	err = duplicate_node_closure (dfa, node, node, node, constraint);
1725	if (BE (err != REG_NOERROR, 0))
1726	return err;
1727	}
1728
1729	/* Expand each epsilon destination nodes. */
1730	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1731	for (i = 0; i < dfa->edests[node].nelem; ++i)
1732	{
1733	re_node_set eclosure_elem;
1734	int edest = dfa->edests[node].elems[i];
1735	/* If calculating the epsilon closure of `edest' is in progress,
1736	return intermediate result. */
1737	if (dfa->eclosures[edest].nelem == -1)
1738	{
1739	incomplete = 1;
1740	continue;
1741	}
1742	/* If we haven't calculated the epsilon closure of `edest' yet,
1743	calculate now. Otherwise use calculated epsilon closure. */
1744	if (dfa->eclosures[edest].nelem == 0)
1745	{
1746	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1747	if (BE (err != REG_NOERROR, 0))
1748	return err;
1749	}
1750	else
1751	eclosure_elem = dfa->eclosures[edest];
1752	/* Merge the epsilon closure of `edest'. */
1753	re_node_set_merge (&eclosure, &eclosure_elem);
1754	/* If the epsilon closure of `edest' is incomplete,
1755	the epsilon closure of this node is also incomplete. */
1756	if (dfa->eclosures[edest].nelem == 0)
1757	{
1758	incomplete = 1;
1759	re_node_set_free (&eclosure_elem);
1760	}
1761	}
1762
1763	/* Epsilon closures include itself. */
1764	re_node_set_insert (&eclosure, node);
1765	if (incomplete && !root)
1766	dfa->eclosures[node].nelem = 0;
1767	else
1768	dfa->eclosures[node] = eclosure;
1769	*new_set = eclosure;
1770	return REG_NOERROR;
1771	}
1772
1773
1774	/* Functions for token which are used in the parser. */
1775
1776	/* Fetch a token from INPUT.
1777	We must not use this function inside bracket expressions. */
1778
1779	static void
1780	fetch_token (result, input, syntax)
1781	re_token_t *result;
1782	re_string_t *input;
1783	reg_syntax_t syntax;
1784	{
1785	re_string_skip_bytes (input, peek_token (result, input, syntax));
1786	}
1787
1788	/* Peek a token from INPUT, and return the length of the token.
1789	We must not use this function inside bracket expressions. */
1790
1791	static int
1792	peek_token (token, input, syntax)
1793	re_token_t *token;
1794	re_string_t *input;
1795	reg_syntax_t syntax;
1796	{
1797	unsigned char c;
1798
1799	if (re_string_eoi (input))
1800	{
1801	token->type = END_OF_RE;
1802	return 0;
1803	}
1804
1805	c = re_string_peek_byte (input, 0);
1806	token->opr.c = c;
1807
1808	token->word_char = 0;
1809	#ifdef RE_ENABLE_I18N
1810	token->mb_partial = 0;
1811	if (input->mb_cur_max > 1 &&
1812	!re_string_first_byte (input, re_string_cur_idx (input)))
1813	{
1814	token->type = CHARACTER;
1815	token->mb_partial = 1;
1816	return 1;
1817	}
1818	#endif
1819	if (c == '\\')
1820	{
1821	unsigned char c2;
1822	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1823	{
1824	token->type = BACK_SLASH;
1825	return 1;
1826	}
1827
1828	c2 = re_string_peek_byte_case (input, 1);
1829	token->opr.c = c2;
1830	token->type = CHARACTER;
1831	#ifdef RE_ENABLE_I18N
1832	if (input->mb_cur_max > 1)
1833	{
1834	wint_t wc = re_string_wchar_at (input,
1835	re_string_cur_idx (input) + 1);
1836	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1837	}
1838	else
1839	#endif
1840	token->word_char = IS_WORD_CHAR (c2) != 0;
1841
1842	switch (c2)
1843	{
1844	case '\|':
1845	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1846	token->type = OP_ALT;
1847	break;
1848	case '1': case '2': case '3': case '4': case '5':
1849	case '6': case '7': case '8': case '9':
1850	if (!(syntax & RE_NO_BK_REFS))
1851	{
1852	token->type = OP_BACK_REF;
1853	token->opr.idx = c2 - '1';
1854	}
1855	break;
1856	case '<':
1857	if (!(syntax & RE_NO_GNU_OPS))
1858	{
1859	token->type = ANCHOR;
1860	token->opr.ctx_type = WORD_FIRST;
1861	}
1862	break;
1863	case '>':
1864	if (!(syntax & RE_NO_GNU_OPS))
1865	{
1866	token->type = ANCHOR;
1867	token->opr.ctx_type = WORD_LAST;
1868	}
1869	break;
1870	case 'b':
1871	if (!(syntax & RE_NO_GNU_OPS))
1872	{
1873	token->type = ANCHOR;
1874	token->opr.ctx_type = WORD_DELIM;
1875	}
1876	break;
1877	case 'B':
1878	if (!(syntax & RE_NO_GNU_OPS))
1879	{
1880	token->type = ANCHOR;
1881	token->opr.ctx_type = NOT_WORD_DELIM;
1882	}
1883	break;
1884	case 'w':
1885	if (!(syntax & RE_NO_GNU_OPS))
1886	token->type = OP_WORD;
1887	break;
1888	case 'W':
1889	if (!(syntax & RE_NO_GNU_OPS))
1890	token->type = OP_NOTWORD;
1891	break;
1892	case 's':
1893	if (!(syntax & RE_NO_GNU_OPS))
1894	token->type = OP_SPACE;
1895	break;
1896	case 'S':
1897	if (!(syntax & RE_NO_GNU_OPS))
1898	token->type = OP_NOTSPACE;
1899	break;
1900	case '`':
1901	if (!(syntax & RE_NO_GNU_OPS))
1902	{
1903	token->type = ANCHOR;
1904	token->opr.ctx_type = BUF_FIRST;
1905	}
1906	break;
1907	case '\'':
1908	if (!(syntax & RE_NO_GNU_OPS))
1909	{
1910	token->type = ANCHOR;
1911	token->opr.ctx_type = BUF_LAST;
1912	}
1913	break;
1914	case '(':
1915	if (!(syntax & RE_NO_BK_PARENS))
1916	token->type = OP_OPEN_SUBEXP;
1917	break;
1918	case ')':
1919	if (!(syntax & RE_NO_BK_PARENS))
1920	token->type = OP_CLOSE_SUBEXP;
1921	break;
1922	case '+':
1923	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1924	token->type = OP_DUP_PLUS;
1925	break;
1926	case '?':
1927	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1928	token->type = OP_DUP_QUESTION;
1929	break;
1930	case '{':
1931	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1932	token->type = OP_OPEN_DUP_NUM;
1933	break;
1934	case '}':
1935	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1936	token->type = OP_CLOSE_DUP_NUM;
1937	break;
1938	default:
1939	break;
1940	}
1941	return 2;
1942	}
1943
1944	token->type = CHARACTER;
1945	#ifdef RE_ENABLE_I18N
1946	if (input->mb_cur_max > 1)
1947	{
1948	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1949	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1950	}
1951	else
1952	#endif
1953	token->word_char = IS_WORD_CHAR (token->opr.c);
1954
1955	switch (c)
1956	{
1957	case '\n':
1958	if (syntax & RE_NEWLINE_ALT)
1959	token->type = OP_ALT;
1960	break;
1961	case '\|':
1962	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1963	token->type = OP_ALT;
1964	break;
1965	case '*':
1966	token->type = OP_DUP_ASTERISK;
1967	break;
1968	case '+':
1969	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1970	token->type = OP_DUP_PLUS;
1971	break;
1972	case '?':
1973	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1974	token->type = OP_DUP_QUESTION;
1975	break;
1976	case '{':
1977	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1978	token->type = OP_OPEN_DUP_NUM;
1979	break;
1980	case '}':
1981	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1982	token->type = OP_CLOSE_DUP_NUM;
1983	break;
1984	case '(':
1985	if (syntax & RE_NO_BK_PARENS)
1986	token->type = OP_OPEN_SUBEXP;
1987	break;
1988	case ')':
1989	if (syntax & RE_NO_BK_PARENS)
1990	token->type = OP_CLOSE_SUBEXP;
1991	break;
1992	case '[':
1993	token->type = OP_OPEN_BRACKET;
1994	break;
1995	case '.':
1996	token->type = OP_PERIOD;
1997	break;
1998	case '^':
1999	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
2000	re_string_cur_idx (input) != 0)
2001	{
2002	char prev = re_string_peek_byte (input, -1);
2003	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
2004	break;
2005	}
2006	token->type = ANCHOR;
2007	token->opr.ctx_type = LINE_FIRST;
2008	break;
2009	case '$':
2010	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2011	re_string_cur_idx (input) + 1 != re_string_length (input))
2012	{
2013	re_token_t next;
2014	re_string_skip_bytes (input, 1);
2015	peek_token (&next, input, syntax);
2016	re_string_skip_bytes (input, -1);
2017	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2018	break;
2019	}
2020	token->type = ANCHOR;
2021	token->opr.ctx_type = LINE_LAST;
2022	break;
2023	default:
2024	break;
2025	}
2026	return 1;
2027	}
2028
2029	/* Peek a token from INPUT, and return the length of the token.
2030	We must not use this function out of bracket expressions. */
2031
2032	static int
2033	peek_token_bracket (token, input, syntax)
2034	re_token_t *token;
2035	re_string_t *input;
2036	reg_syntax_t syntax;
2037	{
2038	unsigned char c;
2039	if (re_string_eoi (input))
2040	{
2041	token->type = END_OF_RE;
2042	return 0;
2043	}
2044	c = re_string_peek_byte (input, 0);
2045	token->opr.c = c;
2046
2047	#ifdef RE_ENABLE_I18N
2048	if (input->mb_cur_max > 1 &&
2049	!re_string_first_byte (input, re_string_cur_idx (input)))
2050	{
2051	token->type = CHARACTER;
2052	return 1;
2053	}
2054	#endif /* RE_ENABLE_I18N */
2055
2056	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2057	&& re_string_cur_idx (input) + 1 < re_string_length (input))
2058	{
2059	/* In this case, '\' escape a character. */
2060	unsigned char c2;
2061	re_string_skip_bytes (input, 1);
2062	c2 = re_string_peek_byte (input, 0);
2063	token->opr.c = c2;
2064	token->type = CHARACTER;
2065	return 1;
2066	}
2067	if (c == '[') /* '[' is a special char in a bracket exps. */
2068	{
2069	unsigned char c2;
2070	int token_len;
2071	if (re_string_cur_idx (input) + 1 < re_string_length (input))
2072	c2 = re_string_peek_byte (input, 1);
2073	else
2074	c2 = 0;
2075	token->opr.c = c2;
2076	token_len = 2;
2077	switch (c2)
2078	{
2079	case '.':
2080	token->type = OP_OPEN_COLL_ELEM;
2081	break;
2082	case '=':
2083	token->type = OP_OPEN_EQUIV_CLASS;
2084	break;
2085	case ':':
2086	if (syntax & RE_CHAR_CLASSES)
2087	{
2088	token->type = OP_OPEN_CHAR_CLASS;
2089	break;
2090	}
2091	/* else fall through. */
2092	default:
2093	token->type = CHARACTER;
2094	token->opr.c = c;
2095	token_len = 1;
2096	break;
2097	}
2098	return token_len;
2099	}
2100	switch (c)
2101	{
2102	case '-':
2103	token->type = OP_CHARSET_RANGE;
2104	break;
2105	case ']':
2106	token->type = OP_CLOSE_BRACKET;
2107	break;
2108	case '^':
2109	token->type = OP_NON_MATCH_LIST;
2110	break;
2111	default:
2112	token->type = CHARACTER;
2113	}
2114	return 1;
2115	}
2116
2117
2118	/* Functions for parser. */
2119
2120	/* Entry point of the parser.
2121	Parse the regular expression REGEXP and return the structure tree.
2122	If an error is occured, ERR is set by error code, and return NULL.
2123	This function build the following tree, from regular expression <reg_exp>:
2124	CAT
2125	/ \
2126	/ \
2127	<reg_exp> EOR
2128
2129	CAT means concatenation.
2130	EOR means end of regular expression. */
2131
2132	static bin_tree_t *
2133	parse (regexp, preg, syntax, err)
2134	re_string_t *regexp;
2135	regex_t *preg;
2136	reg_syntax_t syntax;
2137	reg_errcode_t *err;
2138	{
2139	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2140	bin_tree_t tree, eor, *root;
2141	re_token_t current_token;
2142	dfa->syntax = syntax;
2143	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2144	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2145	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2146	return NULL;
2147	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2148	if (tree != NULL)
2149	root = create_tree (dfa, tree, eor, CONCAT);
2150	else
2151	root = eor;
2152	if (BE (eor == NULL \|\| root == NULL, 0))
2153	{
2154	*err = REG_ESPACE;
2155	return NULL;
2156	}
2157	return root;
2158	}
2159
2160	/* This function build the following tree, from regular expression
2161	<branch1>\|<branch2>:
2162	ALT
2163	/ \
2164	/ \
2165	<branch1> <branch2>
2166
2167	ALT means alternative, which represents the operator `\|'. */
2168
2169	static bin_tree_t *
2170	parse_reg_exp (regexp, preg, token, syntax, nest, err)
2171	re_string_t *regexp;
2172	regex_t *preg;
2173	re_token_t *token;
2174	reg_syntax_t syntax;
2175	int nest;
2176	reg_errcode_t *err;
2177	{
2178	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2179	bin_tree_t tree, branch = NULL;
2180	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2181	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182	return NULL;
2183
2184	while (token->type == OP_ALT)
2185	{
2186	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2187	if (token->type != OP_ALT && token->type != END_OF_RE
2188	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2189	{
2190	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2191	if (BE (*err != REG_NOERROR && branch == NULL, 0))
2192	return NULL;
2193	}
2194	else
2195	branch = NULL;
2196	tree = create_tree (dfa, tree, branch, OP_ALT);
2197	if (BE (tree == NULL, 0))
2198	{
2199	*err = REG_ESPACE;
2200	return NULL;
2201	}
2202	}
2203	return tree;
2204	}
2205
2206	/* This function build the following tree, from regular expression
2207	<exp1><exp2>:
2208	CAT
2209	/ \
2210	/ \
2211	<exp1> <exp2>
2212
2213	CAT means concatenation. */
2214
2215	static bin_tree_t *
2216	parse_branch (regexp, preg, token, syntax, nest, err)
2217	re_string_t *regexp;
2218	regex_t *preg;
2219	re_token_t *token;
2220	reg_syntax_t syntax;
2221	int nest;
2222	reg_errcode_t *err;
2223	{
2224	bin_tree_t tree, exp;
2225	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2226	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2227	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2228	return NULL;
2229
2230	while (token->type != OP_ALT && token->type != END_OF_RE
2231	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2232	{
2233	exp = parse_expression (regexp, preg, token, syntax, nest, err);
2234	if (BE (*err != REG_NOERROR && exp == NULL, 0))
2235	{
2236	return NULL;
2237	}
2238	if (tree != NULL && exp != NULL)
2239	{
2240	tree = create_tree (dfa, tree, exp, CONCAT);
2241	if (tree == NULL)
2242	{
2243	*err = REG_ESPACE;
2244	return NULL;
2245	}
2246	}
2247	else if (tree == NULL)
2248	tree = exp;
2249	/* Otherwise exp == NULL, we don't need to create new tree. */
2250	}
2251	return tree;
2252	}
2253
2254	/* This function build the following tree, from regular expression a*:
2255	*
2256	\|
2257	a
2258	*/
2259
2260	static bin_tree_t *
2261	parse_expression (regexp, preg, token, syntax, nest, err)
2262	re_string_t *regexp;
2263	regex_t *preg;
2264	re_token_t *token;
2265	reg_syntax_t syntax;
2266	int nest;
2267	reg_errcode_t *err;
2268	{
2269	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2270	bin_tree_t *tree;
2271	switch (token->type)
2272	{
2273	case CHARACTER:
2274	tree = create_token_tree (dfa, NULL, NULL, token);
2275	if (BE (tree == NULL, 0))
2276	{
2277	*err = REG_ESPACE;
2278	return NULL;
2279	}
2280	#ifdef RE_ENABLE_I18N
2281	if (dfa->mb_cur_max > 1)
2282	{
2283	while (!re_string_eoi (regexp)
2284	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2285	{
2286	bin_tree_t *mbc_remain;
2287	fetch_token (token, regexp, syntax);
2288	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2289	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2290	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
2291	{
2292	*err = REG_ESPACE;
2293	return NULL;
2294	}
2295	}
2296	}
2297	#endif
2298	break;
2299	case OP_OPEN_SUBEXP:
2300	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2301	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2302	return NULL;
2303	break;
2304	case OP_OPEN_BRACKET:
2305	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2306	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307	return NULL;
2308	break;
2309	case OP_BACK_REF:
2310	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311	{
2312	*err = REG_ESUBREG;
2313	return NULL;
2314	}
2315	dfa->used_bkref_map \|= 1 << token->opr.idx;
2316	tree = create_token_tree (dfa, NULL, NULL, token);
2317	if (BE (tree == NULL, 0))
2318	{
2319	*err = REG_ESPACE;
2320	return NULL;
2321	}
2322	++dfa->nbackref;
2323	dfa->has_mb_node = 1;
2324	break;
2325	case OP_OPEN_DUP_NUM:
2326	if (syntax & RE_CONTEXT_INVALID_DUP)
2327	{
2328	*err = REG_BADRPT;
2329	return NULL;
2330	}
2331	/* FALLTHROUGH */
2332	case OP_DUP_ASTERISK:
2333	case OP_DUP_PLUS:
2334	case OP_DUP_QUESTION:
2335	if (syntax & RE_CONTEXT_INVALID_OPS)
2336	{
2337	*err = REG_BADRPT;
2338	return NULL;
2339	}
2340	else if (syntax & RE_CONTEXT_INDEP_OPS)
2341	{
2342	fetch_token (token, regexp, syntax);
2343	return parse_expression (regexp, preg, token, syntax, nest, err);
2344	}
2345	/* else fall through */
2346	case OP_CLOSE_SUBEXP:
2347	if ((token->type == OP_CLOSE_SUBEXP) &&
2348	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2349	{
2350	*err = REG_ERPAREN;
2351	return NULL;
2352	}
2353	/* else fall through */
2354	case OP_CLOSE_DUP_NUM:
2355	/* We treat it as a normal character. */
2356
2357	/* Then we can these characters as normal characters. */
2358	token->type = CHARACTER;
2359	/* mb_partial and word_char bits should be initialized already
2360	by peek_token. */
2361	tree = create_token_tree (dfa, NULL, NULL, token);
2362	if (BE (tree == NULL, 0))
2363	{
2364	*err = REG_ESPACE;
2365	return NULL;
2366	}
2367	break;
2368	case ANCHOR:
2369	if ((token->opr.ctx_type
2370	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2371	&& dfa->word_ops_used == 0)
2372	init_word_char (dfa);
2373	if (token->opr.ctx_type == WORD_DELIM
2374	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2375	{
2376	bin_tree_t tree_first, tree_last;
2377	if (token->opr.ctx_type == WORD_DELIM)
2378	{
2379	token->opr.ctx_type = WORD_FIRST;
2380	tree_first = create_token_tree (dfa, NULL, NULL, token);
2381	token->opr.ctx_type = WORD_LAST;
2382	}
2383	else
2384	{
2385	token->opr.ctx_type = INSIDE_WORD;
2386	tree_first = create_token_tree (dfa, NULL, NULL, token);
2387	token->opr.ctx_type = INSIDE_NOTWORD;
2388	}
2389	tree_last = create_token_tree (dfa, NULL, NULL, token);
2390	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
2392	{
2393	*err = REG_ESPACE;
2394	return NULL;
2395	}
2396	}
2397	else
2398	{
2399	tree = create_token_tree (dfa, NULL, NULL, token);
2400	if (BE (tree == NULL, 0))
2401	{
2402	*err = REG_ESPACE;
2403	return NULL;
2404	}
2405	}
2406	/* We must return here, since ANCHORs can't be followed
2407	by repetition operators.
2408	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2409	it must not be "<ANCHOR(^)><REPEAT()>". /
2410	fetch_token (token, regexp, syntax);
2411	return tree;
2412	case OP_PERIOD:
2413	tree = create_token_tree (dfa, NULL, NULL, token);
2414	if (BE (tree == NULL, 0))
2415	{
2416	*err = REG_ESPACE;
2417	return NULL;
2418	}
2419	if (dfa->mb_cur_max > 1)
2420	dfa->has_mb_node = 1;
2421	break;
2422	case OP_WORD:
2423	case OP_NOTWORD:
2424	tree = build_charclass_op (dfa, regexp->trans,
2425	(const unsigned char *) "alnum",
2426	(const unsigned char *) "_",
2427	token->type == OP_NOTWORD, err);
2428	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2429	return NULL;
2430	break;
2431	case OP_SPACE:
2432	case OP_NOTSPACE:
2433	tree = build_charclass_op (dfa, regexp->trans,
2434	(const unsigned char *) "space",
2435	(const unsigned char *) "",
2436	token->type == OP_NOTSPACE, err);
2437	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438	return NULL;
2439	break;
2440	case OP_ALT:
2441	case END_OF_RE:
2442	return NULL;
2443	case BACK_SLASH:
2444	*err = REG_EESCAPE;
2445	return NULL;
2446	default:
2447	/* Must not happen? */
2448	#ifdef DEBUG
2449	assert (0);
2450	#endif
2451	return NULL;
2452	}
2453	fetch_token (token, regexp, syntax);
2454
2455	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2456	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2457	{
2458	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2459	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2460	return NULL;
2461	/* In BRE consecutive duplications are not allowed. */
2462	if ((syntax & RE_CONTEXT_INVALID_DUP)
2463	&& (token->type == OP_DUP_ASTERISK
2464	\|\| token->type == OP_OPEN_DUP_NUM))
2465	{
2466	*err = REG_BADRPT;
2467	return NULL;
2468	}
2469	}
2470
2471	return tree;
2472	}
2473
2474	/* This function build the following tree, from regular expression
2475	(<reg_exp>):
2476	SUBEXP
2477	\|
2478	<reg_exp>
2479	*/
2480
2481	static bin_tree_t *
2482	parse_sub_exp (regexp, preg, token, syntax, nest, err)
2483	re_string_t *regexp;
2484	regex_t *preg;
2485	re_token_t *token;
2486	reg_syntax_t syntax;
2487	int nest;
2488	reg_errcode_t *err;
2489	{
2490	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2491	bin_tree_t *tree;
2492	size_t cur_nsub;
2493	cur_nsub = preg->re_nsub++;
2494
2495	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2496
2497	/* The subexpression may be a null string. */
2498	if (token->type == OP_CLOSE_SUBEXP)
2499	tree = NULL;
2500	else
2501	{
2502	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2504	*err = REG_EPAREN;
2505	if (BE (*err != REG_NOERROR, 0))
2506	return NULL;
2507	}
2508	dfa->completed_bkref_map \|= 1 << cur_nsub;
2509
2510	tree = create_tree (dfa, tree, NULL, SUBEXP);
2511	if (BE (tree == NULL, 0))
2512	{
2513	*err = REG_ESPACE;
2514	return NULL;
2515	}
2516	tree->token.opr.idx = cur_nsub;
2517	return tree;
2518	}
2519
2520	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
2521
2522	static bin_tree_t *
2523	parse_dup_op (elem, regexp, dfa, token, syntax, err)
2524	bin_tree_t *elem;
2525	re_string_t *regexp;
2526	re_dfa_t *dfa;
2527	re_token_t *token;
2528	reg_syntax_t syntax;
2529	reg_errcode_t *err;
2530	{
2531	bin_tree_t tree = NULL, old_tree = NULL;
2532	int i, start, end, start_idx = re_string_cur_idx (regexp);
2533	re_token_t start_token = *token;
2534
2535	if (token->type == OP_OPEN_DUP_NUM)
2536	{
2537	end = 0;
2538	start = fetch_number (regexp, token, syntax);
2539	if (start == -1)
2540	{
2541	if (token->type == CHARACTER && token->opr.c == ',')
2542	start = 0; /* We treat "{,m}" as "{0,m}". */
2543	else
2544	{
2545	err = REG_BADBR; / <re>{} is invalid. */
2546	return NULL;
2547	}
2548	}
2549	if (BE (start != -2, 1))
2550	{
2551	/* We treat "{n}" as "{n,n}". */
2552	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2553	: ((token->type == CHARACTER && token->opr.c == ',')
2554	? fetch_number (regexp, token, syntax) : -2));
2555	}
2556	if (BE (start == -2 \|\| end == -2, 0))
2557	{
2558	/* Invalid sequence. */
2559	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2560	{
2561	if (token->type == END_OF_RE)
2562	*err = REG_EBRACE;
2563	else
2564	*err = REG_BADBR;
2565
2566	return NULL;
2567	}
2568
2569	/* If the syntax bit is set, rollback. */
2570	re_string_set_index (regexp, start_idx);
2571	*token = start_token;
2572	token->type = CHARACTER;
2573	/* mb_partial and word_char bits should be already initialized by
2574	peek_token. */
2575	return elem;
2576	}
2577
2578	if (BE (end != -1 && start > end, 0))
2579	{
2580	/* First number greater than second. */
2581	*err = REG_BADBR;
2582	return NULL;
2583	}
2584	}
2585	else
2586	{
2587	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2589	}
2590
2591	fetch_token (token, regexp, syntax);
2592
2593	if (BE (elem == NULL, 0))
2594	return NULL;
2595	if (BE (start == 0 && end == 0, 0))
2596	{
2597	postorder (elem, free_tree, NULL);
2598	return NULL;
2599	}
2600
2601	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2602	if (BE (start > 0, 0))
2603	{
2604	tree = elem;
2605	for (i = 2; i <= start; ++i)
2606	{
2607	elem = duplicate_tree (elem, dfa);
2608	tree = create_tree (dfa, tree, elem, CONCAT);
2609	if (BE (elem == NULL \|\| tree == NULL, 0))
2610	goto parse_dup_op_espace;
2611	}
2612
2613	if (start == end)
2614	return tree;
2615
2616	/* Duplicate ELEM before it is marked optional. */
2617	elem = duplicate_tree (elem, dfa);
2618	old_tree = tree;
2619	}
2620	else
2621	old_tree = NULL;
2622
2623	if (elem->token.type == SUBEXP)
2624	postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2625
2626	tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2627	if (BE (tree == NULL, 0))
2628	goto parse_dup_op_espace;
2629
2630	/* This loop is actually executed only when end != -1,
2631	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2632	already created the start+1-th copy. */
2633	for (i = start + 2; i <= end; ++i)
2634	{
2635	elem = duplicate_tree (elem, dfa);
2636	tree = create_tree (dfa, tree, elem, CONCAT);
2637	if (BE (elem == NULL \|\| tree == NULL, 0))
2638	goto parse_dup_op_espace;
2639
2640	tree = create_tree (dfa, tree, NULL, OP_ALT);
2641	if (BE (tree == NULL, 0))
2642	goto parse_dup_op_espace;
2643	}
2644
2645	if (old_tree)
2646	tree = create_tree (dfa, old_tree, tree, CONCAT);
2647
2648	return tree;
2649
2650	parse_dup_op_espace:
2651	*err = REG_ESPACE;
2652	return NULL;
2653	}
2654
2655	/* Size of the names for collating symbol/equivalence_class/character_class.
2656	I'm not sure, but maybe enough. */
2657	#define BRACKET_NAME_BUF_SIZE 32
2658
2659	#ifndef _LIBC
2660	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2661	Build the range expression which starts from START_ELEM, and ends
2662	at END_ELEM. The result are written to MBCSET and SBCSET.
2663	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2664	mbcset->range_ends, is a pointer argument sinse we may
2665	update it. */
2666
2667	static reg_errcode_t
2668	# ifdef RE_ENABLE_I18N
2669	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2670	re_charset_t *mbcset;
2671	int *range_alloc;
2672	# else /* not RE_ENABLE_I18N */
2673	build_range_exp (sbcset, start_elem, end_elem)
2674	# endif /* not RE_ENABLE_I18N */
2675	re_bitset_ptr_t sbcset;
2676	bracket_elem_t start_elem, end_elem;
2677	{
2678	unsigned int start_ch, end_ch;
2679	/* Equivalence Classes and Character Classes can't be a range start/end. */
2680	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2681	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2682	0))
2683	return REG_ERANGE;
2684
2685	/* We can handle no multi character collating elements without libc
2686	support. */
2687	if (BE ((start_elem->type == COLL_SYM
2688	&& strlen ((char *) start_elem->opr.name) > 1)
2689	\|\| (end_elem->type == COLL_SYM
2690	&& strlen ((char *) end_elem->opr.name) > 1), 0))
2691	return REG_ECOLLATE;
2692
2693	# ifdef RE_ENABLE_I18N
2694	{
2695	wchar_t wc, start_wc, end_wc;
2696	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2697
2698	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2699	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2700	: 0));
2701	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2702	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2703	: 0));
2704	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2705	? __btowc (start_ch) : start_elem->opr.wch);
2706	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2707	? __btowc (end_ch) : end_elem->opr.wch);
2708	if (start_wc == WEOF \|\| end_wc == WEOF)
2709	return REG_ECOLLATE;
2710	cmp_buf[0] = start_wc;
2711	cmp_buf[4] = end_wc;
2712	if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2713	return REG_ERANGE;
2714
2715	/* Got valid collation sequence values, add them as a new entry.
2716	However, for !_LIBC we have no collation elements: if the
2717	character set is single byte, the single byte character set
2718	that we build below suffices. parse_bracket_exp passes
2719	no MBCSET if dfa->mb_cur_max == 1. */
2720	if (mbcset)
2721	{
2722	/* Check the space of the arrays. */
2723	if (BE (*range_alloc == mbcset->nranges, 0))
2724	{
2725	/* There is not enough space, need realloc. */
2726	wchar_t new_array_start, new_array_end;
2727	int new_nranges;
2728
2729	/* +1 in case of mbcset->nranges is 0. */
2730	new_nranges = 2 * mbcset->nranges + 1;
2731	/* Use realloc since mbcset->range_starts and mbcset->range_ends
2732	are NULL if range_alloc == 0. /
2733	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2734	new_nranges);
2735	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2736	new_nranges);
2737
2738	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2739	return REG_ESPACE;
2740
2741	mbcset->range_starts = new_array_start;
2742	mbcset->range_ends = new_array_end;
2743	*range_alloc = new_nranges;
2744	}
2745
2746	mbcset->range_starts[mbcset->nranges] = start_wc;
2747	mbcset->range_ends[mbcset->nranges++] = end_wc;
2748	}
2749
2750	/* Build the table for single byte characters. */
2751	for (wc = 0; wc < SBC_MAX; ++wc)
2752	{
2753	cmp_buf[2] = wc;
2754	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2755	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2756	bitset_set (sbcset, wc);
2757	}
2758	}
2759	# else /* not RE_ENABLE_I18N */
2760	{
2761	unsigned int ch;
2762	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2763	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2764	: 0));
2765	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2766	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2767	: 0));
2768	if (start_ch > end_ch)
2769	return REG_ERANGE;
2770	/* Build the table for single byte characters. */
2771	for (ch = 0; ch < SBC_MAX; ++ch)
2772	if (start_ch <= ch && ch <= end_ch)
2773	bitset_set (sbcset, ch);
2774	}
2775	# endif /* not RE_ENABLE_I18N */
2776	return REG_NOERROR;
2777	}
2778	#endif /* not _LIBC */
2779
2780	#ifndef _LIBC
2781	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2782	Build the collating element which is represented by NAME.
2783	The result are written to MBCSET and SBCSET.
2784	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2785	pointer argument since we may update it. */
2786
2787	static reg_errcode_t
2788	# ifdef RE_ENABLE_I18N
2789	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2790	re_charset_t *mbcset;
2791	int *coll_sym_alloc;
2792	# else /* not RE_ENABLE_I18N */
2793	build_collating_symbol (sbcset, name)
2794	# endif /* not RE_ENABLE_I18N */
2795	re_bitset_ptr_t sbcset;
2796	const unsigned char *name;
2797	{
2798	size_t name_len = strlen ((const char *) name);
2799	if (BE (name_len != 1, 0))
2800	return REG_ECOLLATE;
2801	else
2802	{
2803	bitset_set (sbcset, name[0]);
2804	return REG_NOERROR;
2805	}
2806	}
2807	#endif /* not _LIBC */
2808
2809	/* This function parse bracket expression like "[abc]", "[a-c]",
2810	"[[.a-a.]]" etc. */
2811
2812	static bin_tree_t *
2813	parse_bracket_exp (regexp, dfa, token, syntax, err)
2814	re_string_t *regexp;
2815	re_dfa_t *dfa;
2816	re_token_t *token;
2817	reg_syntax_t syntax;
2818	reg_errcode_t *err;
2819	{
2820	#ifdef _LIBC
2821	const unsigned char *collseqmb;
2822	const char *collseqwc;
2823	uint32_t nrules;
2824	int32_t table_size;
2825	const int32_t *symb_table;
2826	const unsigned char *extra;
2827
2828	/* Local function for parse_bracket_exp used in _LIBC environement.
2829	Seek the collating symbol entry correspondings to NAME.
2830	Return the index of the symbol in the SYMB_TABLE. */
2831
2832	auto inline int32_t
2833	__attribute ((always_inline))
2834	seek_collating_symbol_entry (name, name_len)
2835	const unsigned char *name;
2836	size_t name_len;
2837	{
2838	int32_t hash = elem_hash ((const char *) name, name_len);
2839	int32_t elem = hash % table_size;
2840	int32_t second = hash % (table_size - 2);
2841	while (symb_table[2 * elem] != 0)
2842	{
2843	/* First compare the hashing value. */
2844	if (symb_table[2 * elem] == hash
2845	/* Compare the length of the name. */
2846	&& name_len == extra[symb_table[2 * elem + 1]]
2847	/* Compare the name. */
2848	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2849	name_len) == 0)
2850	{
2851	/* Yep, this is the entry. */
2852	break;
2853	}
2854
2855	/* Next entry. */
2856	elem += second;
2857	}
2858	return elem;
2859	}
2860
2861	/* Local function for parse_bracket_exp used in _LIBC environement.
2862	Look up the collation sequence value of BR_ELEM.
2863	Return the value if succeeded, UINT_MAX otherwise. */
2864
2865	auto inline unsigned int
2866	__attribute ((always_inline))
2867	lookup_collation_sequence_value (br_elem)
2868	bracket_elem_t *br_elem;
2869	{
2870	if (br_elem->type == SB_CHAR)
2871	{
2872	/*
2873	if (MB_CUR_MAX == 1)
2874	*/
2875	if (nrules == 0)
2876	return collseqmb[br_elem->opr.ch];
2877	else
2878	{
2879	wint_t wc = __btowc (br_elem->opr.ch);
2880	return __collseq_table_lookup (collseqwc, wc);
2881	}
2882	}
2883	else if (br_elem->type == MB_CHAR)
2884	{
2885	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2886	}
2887	else if (br_elem->type == COLL_SYM)
2888	{
2889	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2890	if (nrules != 0)
2891	{
2892	int32_t elem, idx;
2893	elem = seek_collating_symbol_entry (br_elem->opr.name,
2894	sym_name_len);
2895	if (symb_table[2 * elem] != 0)
2896	{
2897	/* We found the entry. */
2898	idx = symb_table[2 * elem + 1];
2899	/* Skip the name of collating element name. */
2900	idx += 1 + extra[idx];
2901	/* Skip the byte sequence of the collating element. */
2902	idx += 1 + extra[idx];
2903	/* Adjust for the alignment. */
2904	idx = (idx + 3) & ~3;
2905	/* Skip the multibyte collation sequence value. */
2906	idx += sizeof (unsigned int);
2907	/* Skip the wide char sequence of the collating element. */
2908	idx += sizeof (unsigned int) *
2909	(1 + (unsigned int ) (extra + idx));
2910	/* Return the collation sequence value. */
2911	return (unsigned int ) (extra + idx);
2912	}
2913	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2914	{
2915	/* No valid character. Match it as a single byte
2916	character. */
2917	return collseqmb[br_elem->opr.name[0]];
2918	}
2919	}
2920	else if (sym_name_len == 1)
2921	return collseqmb[br_elem->opr.name[0]];
2922	}
2923	return UINT_MAX;
2924	}
2925
2926	/* Local function for parse_bracket_exp used in _LIBC environement.
2927	Build the range expression which starts from START_ELEM, and ends
2928	at END_ELEM. The result are written to MBCSET and SBCSET.
2929	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2930	mbcset->range_ends, is a pointer argument sinse we may
2931	update it. */
2932
2933	auto inline reg_errcode_t
2934	__attribute ((always_inline))
2935	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2936	re_charset_t *mbcset;
2937	int *range_alloc;
2938	re_bitset_ptr_t sbcset;
2939	bracket_elem_t start_elem, end_elem;
2940	{
2941	unsigned int ch;
2942	uint32_t start_collseq;
2943	uint32_t end_collseq;
2944
2945	/* Equivalence Classes and Character Classes can't be a range
2946	start/end. */
2947	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2948	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2949	0))
2950	return REG_ERANGE;
2951
2952	start_collseq = lookup_collation_sequence_value (start_elem);
2953	end_collseq = lookup_collation_sequence_value (end_elem);
2954	/* Check start/end collation sequence values. */
2955	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
2956	return REG_ECOLLATE;
2957	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2958	return REG_ERANGE;
2959
2960	/* Got valid collation sequence values, add them as a new entry.
2961	However, if we have no collation elements, and the character set
2962	is single byte, the single byte character set that we
2963	build below suffices. */
2964	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
2965	{
2966	/* Check the space of the arrays. */
2967	if (BE (*range_alloc == mbcset->nranges, 0))
2968	{
2969	/* There is not enough space, need realloc. */
2970	uint32_t *new_array_start;
2971	uint32_t *new_array_end;
2972	int new_nranges;
2973
2974	/* +1 in case of mbcset->nranges is 0. */
2975	new_nranges = 2 * mbcset->nranges + 1;
2976	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2977	new_nranges);
2978	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2979	new_nranges);
2980
2981	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2982	return REG_ESPACE;
2983
2984	mbcset->range_starts = new_array_start;
2985	mbcset->range_ends = new_array_end;
2986	*range_alloc = new_nranges;
2987	}
2988
2989	mbcset->range_starts[mbcset->nranges] = start_collseq;
2990	mbcset->range_ends[mbcset->nranges++] = end_collseq;
2991	}
2992
2993	/* Build the table for single byte characters. */
2994	for (ch = 0; ch < SBC_MAX; ch++)
2995	{
2996	uint32_t ch_collseq;
2997	/*
2998	if (MB_CUR_MAX == 1)
2999	*/
3000	if (nrules == 0)
3001	ch_collseq = collseqmb[ch];
3002	else
3003	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3004	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3005	bitset_set (sbcset, ch);
3006	}
3007	return REG_NOERROR;
3008	}
3009
3010	/* Local function for parse_bracket_exp used in _LIBC environement.
3011	Build the collating element which is represented by NAME.
3012	The result are written to MBCSET and SBCSET.
3013	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3014	pointer argument sinse we may update it. */
3015
3016	auto inline reg_errcode_t
3017	__attribute ((always_inline))
3018	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3019	re_charset_t *mbcset;
3020	int *coll_sym_alloc;
3021	re_bitset_ptr_t sbcset;
3022	const unsigned char *name;
3023	{
3024	int32_t elem, idx;
3025	size_t name_len = strlen ((const char *) name);
3026	if (nrules != 0)
3027	{
3028	elem = seek_collating_symbol_entry (name, name_len);
3029	if (symb_table[2 * elem] != 0)
3030	{
3031	/* We found the entry. */
3032	idx = symb_table[2 * elem + 1];
3033	/* Skip the name of collating element name. */
3034	idx += 1 + extra[idx];
3035	}
3036	else if (symb_table[2 * elem] == 0 && name_len == 1)
3037	{
3038	/* No valid character, treat it as a normal
3039	character. */
3040	bitset_set (sbcset, name[0]);
3041	return REG_NOERROR;
3042	}
3043	else
3044	return REG_ECOLLATE;
3045
3046	/* Got valid collation sequence, add it as a new entry. */
3047	/* Check the space of the arrays. */
3048	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3049	{
3050	/* Not enough, realloc it. */
3051	/* +1 in case of mbcset->ncoll_syms is 0. */
3052	int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3053	/* Use realloc since mbcset->coll_syms is NULL
3054	if alloc == 0. /
3055	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3056	new_coll_sym_alloc);
3057	if (BE (new_coll_syms == NULL, 0))
3058	return REG_ESPACE;
3059	mbcset->coll_syms = new_coll_syms;
3060	*coll_sym_alloc = new_coll_sym_alloc;
3061	}
3062	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3063	return REG_NOERROR;
3064	}
3065	else
3066	{
3067	if (BE (name_len != 1, 0))
3068	return REG_ECOLLATE;
3069	else
3070	{
3071	bitset_set (sbcset, name[0]);
3072	return REG_NOERROR;
3073	}
3074	}
3075	}
3076	#endif
3077
3078	re_token_t br_token;
3079	re_bitset_ptr_t sbcset;
3080	#ifdef RE_ENABLE_I18N
3081	re_charset_t *mbcset;
3082	int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3083	int equiv_class_alloc = 0, char_class_alloc = 0;
3084	#endif /* not RE_ENABLE_I18N */
3085	int non_match = 0;
3086	bin_tree_t *work_tree;
3087	int token_len;
3088	int first_round = 1;
3089	#ifdef _LIBC
3090	collseqmb = (const unsigned char *)
3091	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3092	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3093	if (nrules)
3094	{
3095	/*
3096	if (MB_CUR_MAX > 1)
3097	*/
3098	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3099	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3100	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3101	_NL_COLLATE_SYMB_TABLEMB);
3102	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3103	_NL_COLLATE_SYMB_EXTRAMB);
3104	}
3105	#endif
3106	sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3107	#ifdef RE_ENABLE_I18N
3108	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3109	#endif /* RE_ENABLE_I18N */
3110	#ifdef RE_ENABLE_I18N
3111	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3112	#else
3113	if (BE (sbcset == NULL, 0))
3114	#endif /* RE_ENABLE_I18N */
3115	{
3116	*err = REG_ESPACE;
3117	return NULL;
3118	}
3119
3120	token_len = peek_token_bracket (token, regexp, syntax);
3121	if (BE (token->type == END_OF_RE, 0))
3122	{
3123	*err = REG_BADPAT;
3124	goto parse_bracket_exp_free_return;
3125	}
3126	if (token->type == OP_NON_MATCH_LIST)
3127	{
3128	#ifdef RE_ENABLE_I18N
3129	mbcset->non_match = 1;
3130	#endif /* not RE_ENABLE_I18N */
3131	non_match = 1;
3132	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3133	bitset_set (sbcset, '\0');
3134	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3135	token_len = peek_token_bracket (token, regexp, syntax);
3136	if (BE (token->type == END_OF_RE, 0))
3137	{
3138	*err = REG_BADPAT;
3139	goto parse_bracket_exp_free_return;
3140	}
3141	}
3142
3143	/* We treat the first ']' as a normal character. */
3144	if (token->type == OP_CLOSE_BRACKET)
3145	token->type = CHARACTER;
3146
3147	while (1)
3148	{
3149	bracket_elem_t start_elem, end_elem;
3150	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3151	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3152	reg_errcode_t ret;
3153	int token_len2 = 0, is_range_exp = 0;
3154	re_token_t token2;
3155
3156	start_elem.opr.name = start_name_buf;
3157	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3158	syntax, first_round);
3159	if (BE (ret != REG_NOERROR, 0))
3160	{
3161	*err = ret;
3162	goto parse_bracket_exp_free_return;
3163	}
3164	first_round = 0;
3165
3166	/* Get information about the next token. We need it in any case. */
3167	token_len = peek_token_bracket (token, regexp, syntax);
3168
3169	/* Do not check for ranges if we know they are not allowed. */
3170	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3171	{
3172	if (BE (token->type == END_OF_RE, 0))
3173	{
3174	*err = REG_EBRACK;
3175	goto parse_bracket_exp_free_return;
3176	}
3177	if (token->type == OP_CHARSET_RANGE)
3178	{
3179	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3180	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3181	if (BE (token2.type == END_OF_RE, 0))
3182	{
3183	*err = REG_EBRACK;
3184	goto parse_bracket_exp_free_return;
3185	}
3186	if (token2.type == OP_CLOSE_BRACKET)
3187	{
3188	/* We treat the last '-' as a normal character. */
3189	re_string_skip_bytes (regexp, -token_len);
3190	token->type = CHARACTER;
3191	}
3192	else
3193	is_range_exp = 1;
3194	}
3195	}
3196
3197	if (is_range_exp == 1)
3198	{
3199	end_elem.opr.name = end_name_buf;
3200	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3201	dfa, syntax, 1);
3202	if (BE (ret != REG_NOERROR, 0))
3203	{
3204	*err = ret;
3205	goto parse_bracket_exp_free_return;
3206	}
3207
3208	token_len = peek_token_bracket (token, regexp, syntax);
3209
3210	#ifdef _LIBC
3211	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3212	&start_elem, &end_elem);
3213	#else
3214	# ifdef RE_ENABLE_I18N
3215	*err = build_range_exp (sbcset,
3216	dfa->mb_cur_max > 1 ? mbcset : NULL,
3217	&range_alloc, &start_elem, &end_elem);
3218	# else
3219	*err = build_range_exp (sbcset, &start_elem, &end_elem);
3220	# endif
3221	#endif /* RE_ENABLE_I18N */
3222	if (BE (*err != REG_NOERROR, 0))
3223	goto parse_bracket_exp_free_return;
3224	}
3225	else
3226	{
3227	switch (start_elem.type)
3228	{
3229	case SB_CHAR:
3230	bitset_set (sbcset, start_elem.opr.ch);
3231	break;
3232	#ifdef RE_ENABLE_I18N
3233	case MB_CHAR:
3234	/* Check whether the array has enough space. */
3235	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3236	{
3237	wchar_t *new_mbchars;
3238	/* Not enough, realloc it. */
3239	/* +1 in case of mbcset->nmbchars is 0. */
3240	mbchar_alloc = 2 * mbcset->nmbchars + 1;
3241	/* Use realloc since array is NULL if alloc == 0. /
3242	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3243	mbchar_alloc);
3244	if (BE (new_mbchars == NULL, 0))
3245	goto parse_bracket_exp_espace;
3246	mbcset->mbchars = new_mbchars;
3247	}
3248	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3249	break;
3250	#endif /* RE_ENABLE_I18N */
3251	case EQUIV_CLASS:
3252	*err = build_equiv_class (sbcset,
3253	#ifdef RE_ENABLE_I18N
3254	mbcset, &equiv_class_alloc,
3255	#endif /* RE_ENABLE_I18N */
3256	start_elem.opr.name);
3257	if (BE (*err != REG_NOERROR, 0))
3258	goto parse_bracket_exp_free_return;
3259	break;
3260	case COLL_SYM:
3261	*err = build_collating_symbol (sbcset,
3262	#ifdef RE_ENABLE_I18N
3263	mbcset, &coll_sym_alloc,
3264	#endif /* RE_ENABLE_I18N */
3265	start_elem.opr.name);
3266	if (BE (*err != REG_NOERROR, 0))
3267	goto parse_bracket_exp_free_return;
3268	break;
3269	case CHAR_CLASS:
3270	*err = build_charclass (regexp->trans, sbcset,
3271	#ifdef RE_ENABLE_I18N
3272	mbcset, &char_class_alloc,
3273	#endif /* RE_ENABLE_I18N */
3274	start_elem.opr.name, syntax);
3275	if (BE (*err != REG_NOERROR, 0))
3276	goto parse_bracket_exp_free_return;
3277	break;
3278	default:
3279	assert (0);
3280	break;
3281	}
3282	}
3283	if (BE (token->type == END_OF_RE, 0))
3284	{
3285	*err = REG_EBRACK;
3286	goto parse_bracket_exp_free_return;
3287	}
3288	if (token->type == OP_CLOSE_BRACKET)
3289	break;
3290	}
3291
3292	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3293
3294	/* If it is non-matching list. */
3295	if (non_match)
3296	bitset_not (sbcset);
3297
3298	#ifdef RE_ENABLE_I18N
3299	/* Ensure only single byte characters are set. */
3300	if (dfa->mb_cur_max > 1)
3301	bitset_mask (sbcset, dfa->sb_char);
3302
3303	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3304	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3305	\|\| mbcset->non_match)))
3306	{
3307	bin_tree_t *mbc_tree;
3308	int sbc_idx;
3309	/* Build a tree for complex bracket. */
3310	dfa->has_mb_node = 1;
3311	br_token.type = COMPLEX_BRACKET;
3312	br_token.opr.mbcset = mbcset;
3313	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314	if (BE (mbc_tree == NULL, 0))
3315	goto parse_bracket_exp_espace;
3316	for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3317	if (sbcset[sbc_idx])
3318	break;
3319	/* If there are no bits set in sbcset, there is no point
3320	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3321	if (sbc_idx < BITSET_UINTS)
3322	{
3323	/* Build a tree for simple bracket. */
3324	br_token.type = SIMPLE_BRACKET;
3325	br_token.opr.sbcset = sbcset;
3326	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327	if (BE (work_tree == NULL, 0))
3328	goto parse_bracket_exp_espace;
3329
3330	/* Then join them by ALT node. */
3331	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3332	if (BE (work_tree == NULL, 0))
3333	goto parse_bracket_exp_espace;
3334	}
3335	else
3336	{
3337	re_free (sbcset);
3338	work_tree = mbc_tree;
3339	}
3340	}
3341	else
3342	#endif /* not RE_ENABLE_I18N */
3343	{
3344	#ifdef RE_ENABLE_I18N
3345	free_charset (mbcset);
3346	#endif
3347	/* Build a tree for simple bracket. */
3348	br_token.type = SIMPLE_BRACKET;
3349	br_token.opr.sbcset = sbcset;
3350	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3351	if (BE (work_tree == NULL, 0))
3352	goto parse_bracket_exp_espace;
3353	}
3354	return work_tree;
3355
3356	parse_bracket_exp_espace:
3357	*err = REG_ESPACE;
3358	parse_bracket_exp_free_return:
3359	re_free (sbcset);
3360	#ifdef RE_ENABLE_I18N
3361	free_charset (mbcset);
3362	#endif /* RE_ENABLE_I18N */
3363	return NULL;
3364	}
3365
3366	/* Parse an element in the bracket expression. */
3367
3368	static reg_errcode_t
3369	parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3370	accept_hyphen)
3371	bracket_elem_t *elem;
3372	re_string_t *regexp;
3373	re_token_t *token;
3374	int token_len;
3375	re_dfa_t *dfa;
3376	reg_syntax_t syntax;
3377	int accept_hyphen;
3378	{
3379	#ifdef RE_ENABLE_I18N
3380	int cur_char_size;
3381	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3382	if (cur_char_size > 1)
3383	{
3384	elem->type = MB_CHAR;
3385	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3386	re_string_skip_bytes (regexp, cur_char_size);
3387	return REG_NOERROR;
3388	}
3389	#endif /* RE_ENABLE_I18N */
3390	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3391	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3392	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3393	return parse_bracket_symbol (elem, regexp, token);
3394	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3395	{
3396	/* A '-' must only appear as anything but a range indicator before
3397	the closing bracket. Everything else is an error. */
3398	re_token_t token2;
3399	(void) peek_token_bracket (&token2, regexp, syntax);
3400	if (token2.type != OP_CLOSE_BRACKET)
3401	/* The actual error value is not standardized since this whole
3402	case is undefined. But ERANGE makes good sense. */
3403	return REG_ERANGE;
3404	}
3405	elem->type = SB_CHAR;
3406	elem->opr.ch = token->opr.c;
3407	return REG_NOERROR;
3408	}
3409
3410	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3411	such as [:<character_class>:], [.<collating_element>.], and
3412	[=<equivalent_class>=]. */
3413
3414	static reg_errcode_t
3415	parse_bracket_symbol (elem, regexp, token)
3416	bracket_elem_t *elem;
3417	re_string_t *regexp;
3418	re_token_t *token;
3419	{
3420	unsigned char ch, delim = token->opr.c;
3421	int i = 0;
3422	if (re_string_eoi(regexp))
3423	return REG_EBRACK;
3424	for (;; ++i)
3425	{
3426	if (i >= BRACKET_NAME_BUF_SIZE)
3427	return REG_EBRACK;
3428	if (token->type == OP_OPEN_CHAR_CLASS)
3429	ch = re_string_fetch_byte_case (regexp);
3430	else
3431	ch = re_string_fetch_byte (regexp);
3432	if (re_string_eoi(regexp))
3433	return REG_EBRACK;
3434	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435	break;
3436	elem->opr.name[i] = ch;
3437	}
3438	re_string_skip_bytes (regexp, 1);
3439	elem->opr.name[i] = '\0';
3440	switch (token->type)
3441	{
3442	case OP_OPEN_COLL_ELEM:
3443	elem->type = COLL_SYM;
3444	break;
3445	case OP_OPEN_EQUIV_CLASS:
3446	elem->type = EQUIV_CLASS;
3447	break;
3448	case OP_OPEN_CHAR_CLASS:
3449	elem->type = CHAR_CLASS;
3450	break;
3451	default:
3452	break;
3453	}
3454	return REG_NOERROR;
3455	}
3456
3457	/* Helper function for parse_bracket_exp.
3458	Build the equivalence class which is represented by NAME.
3459	The result are written to MBCSET and SBCSET.
3460	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461	is a pointer argument sinse we may update it. */
3462
3463	static reg_errcode_t
3464	#ifdef RE_ENABLE_I18N
3465	build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3466	re_charset_t *mbcset;
3467	int *equiv_class_alloc;
3468	#else /* not RE_ENABLE_I18N */
3469	build_equiv_class (sbcset, name)
3470	#endif /* not RE_ENABLE_I18N */
3471	re_bitset_ptr_t sbcset;
3472	const unsigned char *name;
3473	{
3474	#if defined _LIBC
3475	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3476	if (nrules != 0)
3477	{
3478	const int32_t table, indirect;
3479	const unsigned char weights, extra, *cp;
3480	unsigned char char_buf[2];
3481	int32_t idx1, idx2;
3482	unsigned int ch;
3483	size_t len;
3484	/* This #include defines a local function! */
3485	# include <locale/weight.h>
3486	/* Calculate the index for equivalence class. */
3487	cp = name;
3488	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3489	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3490	_NL_COLLATE_WEIGHTMB);
3491	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3492	_NL_COLLATE_EXTRAMB);
3493	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3494	_NL_COLLATE_INDIRECTMB);
3495	idx1 = findidx (&cp);
3496	if (BE (idx1 == 0 \|\| cp < name + strlen ((const char *) name), 0))
3497	/* This isn't a valid character. */
3498	return REG_ECOLLATE;
3499
3500	/* Build single byte matcing table for this equivalence class. */
3501	char_buf[1] = (unsigned char) '\0';
3502	len = weights[idx1];
3503	for (ch = 0; ch < SBC_MAX; ++ch)
3504	{
3505	char_buf[0] = ch;
3506	cp = char_buf;
3507	idx2 = findidx (&cp);
3508	/*
3509	idx2 = table[ch];
3510	*/
3511	if (idx2 == 0)
3512	/* This isn't a valid character. */
3513	continue;
3514	if (len == weights[idx2])
3515	{
3516	int cnt = 0;
3517	while (cnt <= len &&
3518	weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3519	++cnt;
3520
3521	if (cnt > len)
3522	bitset_set (sbcset, ch);
3523	}
3524	}
3525	/* Check whether the array has enough space. */
3526	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527	{
3528	/* Not enough, realloc it. */
3529	/* +1 in case of mbcset->nequiv_classes is 0. */
3530	int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531	/* Use realloc since the array is NULL if alloc == 0. /
3532	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533	int32_t,
3534	new_equiv_class_alloc);
3535	if (BE (new_equiv_classes == NULL, 0))
3536	return REG_ESPACE;
3537	mbcset->equiv_classes = new_equiv_classes;
3538	*equiv_class_alloc = new_equiv_class_alloc;
3539	}
3540	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541	}
3542	else
3543	#endif /* _LIBC */
3544	{
3545	if (BE (strlen ((const char *) name) != 1, 0))
3546	return REG_ECOLLATE;
3547	bitset_set (sbcset, *name);
3548	}
3549	return REG_NOERROR;
3550	}
3551
3552	/* Helper function for parse_bracket_exp.
3553	Build the character class which is represented by NAME.
3554	The result are written to MBCSET and SBCSET.
3555	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556	is a pointer argument sinse we may update it. */
3557
3558	static reg_errcode_t
3559	#ifdef RE_ENABLE_I18N
3560	build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3561	re_charset_t *mbcset;
3562	int *char_class_alloc;
3563	#else /* not RE_ENABLE_I18N */
3564	build_charclass (trans, sbcset, class_name, syntax)
3565	#endif /* not RE_ENABLE_I18N */
3566	unsigned RE_TRANSLATE_TYPE trans;
3567	re_bitset_ptr_t sbcset;
3568	const unsigned char *class_name;
3569	reg_syntax_t syntax;
3570	{
3571	int i;
3572	const char name = (const char ) class_name;
3573
3574	/* In case of REG_ICASE "upper" and "lower" match the both of
3575	upper and lower cases. */
3576	if ((syntax & RE_ICASE)
3577	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
3578	name = "alpha";
3579
3580	#ifdef RE_ENABLE_I18N
3581	/* Check the space of the arrays. */
3582	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3583	{
3584	/* Not enough, realloc it. */
3585	/* +1 in case of mbcset->nchar_classes is 0. */
3586	int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3587	/* Use realloc since array is NULL if alloc == 0. /
3588	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3589	new_char_class_alloc);
3590	if (BE (new_char_classes == NULL, 0))
3591	return REG_ESPACE;
3592	mbcset->char_classes = new_char_classes;
3593	*char_class_alloc = new_char_class_alloc;
3594	}
3595	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3596	#endif /* RE_ENABLE_I18N */
3597
3598	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3599	for (i = 0; i < SBC_MAX; ++i) \
3600	{ \
3601	if (ctype_func (i)) \
3602	{ \
3603	int ch = trans ? trans[i] : i; \
3604	bitset_set (sbcset, ch); \
3605	} \
3606	}
3607
3608	if (strcmp (name, "alnum") == 0)
3609	BUILD_CHARCLASS_LOOP (isalnum)
3610	else if (strcmp (name, "cntrl") == 0)
3611	BUILD_CHARCLASS_LOOP (iscntrl)
3612	else if (strcmp (name, "lower") == 0)
3613	BUILD_CHARCLASS_LOOP (islower)
3614	else if (strcmp (name, "space") == 0)
3615	BUILD_CHARCLASS_LOOP (isspace)
3616	else if (strcmp (name, "alpha") == 0)
3617	BUILD_CHARCLASS_LOOP (isalpha)
3618	else if (strcmp (name, "digit") == 0)
3619	BUILD_CHARCLASS_LOOP (isdigit)
3620	else if (strcmp (name, "print") == 0)
3621	BUILD_CHARCLASS_LOOP (isprint)
3622	else if (strcmp (name, "upper") == 0)
3623	BUILD_CHARCLASS_LOOP (isupper)
3624	else if (strcmp (name, "blank") == 0)
3625	BUILD_CHARCLASS_LOOP (isblank)
3626	else if (strcmp (name, "graph") == 0)
3627	BUILD_CHARCLASS_LOOP (isgraph)
3628	else if (strcmp (name, "punct") == 0)
3629	BUILD_CHARCLASS_LOOP (ispunct)
3630	else if (strcmp (name, "xdigit") == 0)
3631	BUILD_CHARCLASS_LOOP (isxdigit)
3632	else
3633	return REG_ECTYPE;
3634
3635	return REG_NOERROR;
3636	}
3637
3638	static bin_tree_t *
3639	build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3640	re_dfa_t *dfa;
3641	unsigned RE_TRANSLATE_TYPE trans;
3642	const unsigned char *class_name;
3643	const unsigned char *extra;
3644	int non_match;
3645	reg_errcode_t *err;
3646	{
3647	re_bitset_ptr_t sbcset;
3648	#ifdef RE_ENABLE_I18N
3649	re_charset_t *mbcset;
3650	int alloc = 0;
3651	#endif /* not RE_ENABLE_I18N */
3652	reg_errcode_t ret;
3653	re_token_t br_token;
3654	bin_tree_t *tree;
3655
3656	sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3657	#ifdef RE_ENABLE_I18N
3658	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659	#endif /* RE_ENABLE_I18N */
3660
3661	#ifdef RE_ENABLE_I18N
3662	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3663	#else /* not RE_ENABLE_I18N */
3664	if (BE (sbcset == NULL, 0))
3665	#endif /* not RE_ENABLE_I18N */
3666	{
3667	*err = REG_ESPACE;
3668	return NULL;
3669	}
3670
3671	if (non_match)
3672	{
3673	#ifdef RE_ENABLE_I18N
3674	/*
3675	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3676	bitset_set(cset->sbcset, '\0');
3677	*/
3678	mbcset->non_match = 1;
3679	#endif /* not RE_ENABLE_I18N */
3680	}
3681
3682	/* We don't care the syntax in this case. */
3683	ret = build_charclass (trans, sbcset,
3684	#ifdef RE_ENABLE_I18N
3685	mbcset, &alloc,
3686	#endif /* RE_ENABLE_I18N */
3687	class_name, 0);
3688
3689	if (BE (ret != REG_NOERROR, 0))
3690	{
3691	re_free (sbcset);
3692	#ifdef RE_ENABLE_I18N
3693	free_charset (mbcset);
3694	#endif /* RE_ENABLE_I18N */
3695	*err = ret;
3696	return NULL;
3697	}
3698	/* \w match '_' also. */
3699	for (; *extra; extra++)
3700	bitset_set (sbcset, *extra);
3701
3702	/* If it is non-matching list. */
3703	if (non_match)
3704	bitset_not (sbcset);
3705
3706	#ifdef RE_ENABLE_I18N
3707	/* Ensure only single byte characters are set. */
3708	if (dfa->mb_cur_max > 1)
3709	bitset_mask (sbcset, dfa->sb_char);
3710	#endif
3711
3712	/* Build a tree for simple bracket. */
3713	br_token.type = SIMPLE_BRACKET;
3714	br_token.opr.sbcset = sbcset;
3715	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3716	if (BE (tree == NULL, 0))
3717	goto build_word_op_espace;
3718
3719	#ifdef RE_ENABLE_I18N
3720	if (dfa->mb_cur_max > 1)
3721	{
3722	bin_tree_t *mbc_tree;
3723	/* Build a tree for complex bracket. */
3724	br_token.type = COMPLEX_BRACKET;
3725	br_token.opr.mbcset = mbcset;
3726	dfa->has_mb_node = 1;
3727	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728	if (BE (mbc_tree == NULL, 0))
3729	goto build_word_op_espace;
3730	/* Then join them by ALT node. */
3731	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3732	if (BE (mbc_tree != NULL, 1))
3733	return tree;
3734	}
3735	else
3736	{
3737	free_charset (mbcset);
3738	return tree;
3739	}
3740	#else /* not RE_ENABLE_I18N */
3741	return tree;
3742	#endif /* not RE_ENABLE_I18N */
3743
3744	build_word_op_espace:
3745	re_free (sbcset);
3746	#ifdef RE_ENABLE_I18N
3747	free_charset (mbcset);
3748	#endif /* RE_ENABLE_I18N */
3749	*err = REG_ESPACE;
3750	return NULL;
3751	}
3752
3753	/* This is intended for the expressions like "a{1,3}".
3754	Fetch a number from `input', and return the number.
3755	Return -1, if the number field is empty like "{,1}".
3756	Return -2, If an error is occured. */
3757
3758	static int
3759	fetch_number (input, token, syntax)
3760	re_string_t *input;
3761	re_token_t *token;
3762	reg_syntax_t syntax;
3763	{
3764	int num = -1;
3765	unsigned char c;
3766	while (1)
3767	{
3768	fetch_token (token, input, syntax);
3769	c = token->opr.c;
3770	if (BE (token->type == END_OF_RE, 0))
3771	return -2;
3772	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
3773	break;
3774	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
3775	? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3776	num = (num > RE_DUP_MAX) ? -2 : num;
3777	}
3778	return num;
3779	}
3780
3781
3782	#ifdef RE_ENABLE_I18N
3783	static void
3784	free_charset (re_charset_t *cset)
3785	{
3786	re_free (cset->mbchars);
3787	# ifdef _LIBC
3788	re_free (cset->coll_syms);
3789	re_free (cset->equiv_classes);
3790	re_free (cset->range_starts);
3791	re_free (cset->range_ends);
3792	# endif
3793	re_free (cset->char_classes);
3794	re_free (cset);
3795	}
3796	#endif /* RE_ENABLE_I18N */
3797
3798
3799	/* Functions for binary tree operation. */
3800
3801	/* Create a tree node. */
3802
3803	static bin_tree_t *
3804	create_tree (dfa, left, right, type)
3805	re_dfa_t *dfa;
3806	bin_tree_t *left;
3807	bin_tree_t *right;
3808	re_token_type_t type;
3809	{
3810	re_token_t t;
3811	t.type = type;
3812	return create_token_tree (dfa, left, right, &t);
3813	}
3814
3815	static bin_tree_t *
3816	create_token_tree (dfa, left, right, token)
3817	re_dfa_t *dfa;
3818	bin_tree_t *left;
3819	bin_tree_t *right;
3820	const re_token_t *token;
3821	{
3822	bin_tree_t *tree;
3823	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3824	{
3825	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3826
3827	if (storage == NULL)
3828	return NULL;
3829	storage->next = dfa->str_tree_storage;
3830	dfa->str_tree_storage = storage;
3831	dfa->str_tree_storage_idx = 0;
3832	}
3833	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3834
3835	tree->parent = NULL;
3836	tree->left = left;
3837	tree->right = right;
3838	tree->token = *token;
3839	tree->token.duplicated = 0;
3840	tree->token.opt_subexp = 0;
3841	tree->first = NULL;
3842	tree->next = NULL;
3843	tree->node_idx = -1;
3844
3845	if (left != NULL)
3846	left->parent = tree;
3847	if (right != NULL)
3848	right->parent = tree;
3849	return tree;
3850	}
3851
3852	/* Mark the tree SRC as an optional subexpression.
3853	To be called from preorder or postorder. */
3854
3855	static reg_errcode_t
3856	mark_opt_subexp (extra, node)
3857	void *extra;
3858	bin_tree_t *node;
3859	{
3860	int idx = (int) (long) extra;
3861	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862	node->token.opt_subexp = 1;
3863
3864	return REG_NOERROR;
3865	}
3866
3867	/* Free the allocated memory inside NODE. */
3868
3869	static void
3870	free_token (re_token_t *node)
3871	{
3872	#ifdef RE_ENABLE_I18N
3873	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874	free_charset (node->opr.mbcset);
3875	else
3876	#endif /* RE_ENABLE_I18N */
3877	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878	re_free (node->opr.sbcset);
3879	}
3880
3881	/* Worker function for tree walking. Free the allocated memory inside NODE
3882	and its children. */
3883
3884	static reg_errcode_t
3885	free_tree (void extra, bin_tree_t node)
3886	{
3887	free_token (&node->token);
3888	return REG_NOERROR;
3889	}
3890
3891
3892	/* Duplicate the node SRC, and return new node. This is a preorder
3893	visit similar to the one implemented by the generic visitor, but
3894	we need more infrastructure to maintain two parallel trees --- so,
3895	it's easier to duplicate. */
3896
3897	static bin_tree_t *
3898	duplicate_tree (root, dfa)
3899	const bin_tree_t *root;
3900	re_dfa_t *dfa;
3901	{
3902	const bin_tree_t *node;
3903	bin_tree_t *dup_root;
3904	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3905
3906	for (node = root; ; )
3907	{
3908	/* Create a new tree and link it back to the current parent. */
3909	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910	if (*p_new == NULL)
3911	return NULL;
3912	(*p_new)->parent = dup_node;
3913	(*p_new)->token.duplicated = 1;
3914	dup_node = *p_new;
3915
3916	/* Go to the left node, or up and to the right. */
3917	if (node->left)
3918	{
3919	node = node->left;
3920	p_new = &dup_node->left;
3921	}
3922	else
3923	{
3924	const bin_tree_t *prev = NULL;
3925	while (node->right == prev \|\| node->right == NULL)
3926	{
3927	prev = node;
3928	node = node->parent;
3929	dup_node = dup_node->parent;
3930	if (!node)
3931	return dup_root;
3932	}
3933	node = node->right;
3934	p_new = &dup_node->right;
3935	}
3936	}
3937	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: