Context Navigation

regcomp.c

Visit:

Last change on this file was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 110.1 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, write to the Free
18	Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19	02110-1301 USA. */
20
21	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
22	int length, reg_syntax_t syntax);
23	static void re_compile_fastmap_iter (regex_t *bufp,
24	const re_dfastate_t *init_state,
25	char *fastmap);
26	static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27	static void init_word_char (re_dfa_t *dfa);
28	#ifdef RE_ENABLE_I18N
29	static void free_charset (re_charset_t *cset);
30	#endif /* RE_ENABLE_I18N */
31	static void free_workarea_compile (regex_t *preg);
32	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33	#ifdef RE_ENABLE_I18N
34	static void optimize_utf8 (re_dfa_t *dfa);
35	#endif
36	static reg_errcode_t analyze (regex_t *preg);
37	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38	static reg_errcode_t preorder (bin_tree_t *root,
39	reg_errcode_t (fn (void , bin_tree_t )),
40	void *extra);
41	static reg_errcode_t postorder (bin_tree_t *root,
42	reg_errcode_t (fn (void , bin_tree_t )),
43	void *extra);
44	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
45	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
46	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
47	bin_tree_t *node);
48	static reg_errcode_t calc_first (void extra, bin_tree_t node);
49	static reg_errcode_t calc_next (void extra, bin_tree_t node);
50	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
51	static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52	int top_clone_node, int root_node,
53	unsigned int constraint);
54	static reg_errcode_t duplicate_node (int new_idx, re_dfa_t dfa, int org_idx,
55	unsigned int constraint);
56	static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57	unsigned int constraint);
58	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
60	int node, int root);
61	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62	static int fetch_number (re_string_t input, re_token_t token,
63	reg_syntax_t syntax);
64	static void fetch_token (re_token_t result, re_string_t input,
65	reg_syntax_t syntax);
66	static int peek_token (re_token_t token, re_string_t input,
67	reg_syntax_t syntax);
68	static int peek_token_bracket (re_token_t token, re_string_t input,
69	reg_syntax_t syntax);
70	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
71	reg_syntax_t syntax, reg_errcode_t *err);
72	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
73	re_token_t *token, reg_syntax_t syntax,
74	int nest, reg_errcode_t *err);
75	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
76	re_token_t *token, reg_syntax_t syntax,
77	int nest, reg_errcode_t *err);
78	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
79	re_token_t *token, reg_syntax_t syntax,
80	int nest, reg_errcode_t *err);
81	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
82	re_token_t *token, reg_syntax_t syntax,
83	int nest, reg_errcode_t *err);
84	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
85	re_dfa_t dfa, re_token_t token,
86	reg_syntax_t syntax, reg_errcode_t *err);
87	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
88	re_token_t *token, reg_syntax_t syntax,
89	reg_errcode_t *err);
90	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91	re_string_t *regexp,
92	re_token_t *token, int token_len,
93	re_dfa_t *dfa,
94	reg_syntax_t syntax,
95	int accept_hyphen);
96	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97	re_string_t *regexp,
98	re_token_t *token);
99	#ifndef _LIBC
100	# ifdef RE_ENABLE_I18N
101	static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102	re_charset_t mbcset, int range_alloc,
103	bracket_elem_t *start_elem,
104	bracket_elem_t *end_elem);
105	static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106	re_charset_t *mbcset,
107	int *coll_sym_alloc,
108	const unsigned char *name);
109	# else /* not RE_ENABLE_I18N */
110	static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111	bracket_elem_t *start_elem,
112	bracket_elem_t *end_elem);
113	static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114	const unsigned char *name);
115	# endif /* not RE_ENABLE_I18N */
116	#endif /* not _LIBC */
117	#ifdef RE_ENABLE_I18N
118	static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119	re_charset_t *mbcset,
120	int *equiv_class_alloc,
121	const unsigned char *name);
122	static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123	re_bitset_ptr_t sbcset,
124	re_charset_t *mbcset,
125	int *char_class_alloc,
126	const char *class_name,
127	reg_syntax_t syntax);
128	#else /* not RE_ENABLE_I18N */
129	static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130	const unsigned char *name);
131	static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132	re_bitset_ptr_t sbcset,
133	const char *class_name,
134	reg_syntax_t syntax);
135	#endif /* not RE_ENABLE_I18N */
136	static bin_tree_t build_charclass_op (re_dfa_t dfa,
137	unsigned RE_TRANSLATE_TYPE trans,
138	const char *class_name,
139	const char *extra,
140	int non_match, reg_errcode_t *err);
141	static bin_tree_t create_tree (re_dfa_t dfa,
142	bin_tree_t left, bin_tree_t right,
143	re_token_type_t type);
144	static bin_tree_t create_token_tree (re_dfa_t dfa,
145	bin_tree_t left, bin_tree_t right,
146	const re_token_t *token);
147	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
148	static void free_token (re_token_t *node);
149	static reg_errcode_t free_tree (void extra, bin_tree_t node);
150	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
151
152
153	/* This table gives an error message for each of the error codes listed
154	in regex.h. Obviously the order here has to be same as there.
155	POSIX doesn't require that we do anything for REG_NOERROR,
156	but why not be nice? */
157
158	const ERRMSG_TYPE __re_error_msgid[] attribute_hidden =
159	{
160	#define REG_NOERROR_IDX 0
161	gettext_noop ("Success") /* REG_NOERROR */
162	ERRMSG_SEPARATOR
163	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
164	gettext_noop ("No match") /* REG_NOMATCH */
165	ERRMSG_SEPARATOR
166	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
167	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168	ERRMSG_SEPARATOR
169	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
170	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171	ERRMSG_SEPARATOR
172	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
173	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174	ERRMSG_SEPARATOR
175	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
176	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177	ERRMSG_SEPARATOR
178	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
179	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180	ERRMSG_SEPARATOR
181	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
182	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183	ERRMSG_SEPARATOR
184	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
185	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186	ERRMSG_SEPARATOR
187	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
188	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189	ERRMSG_SEPARATOR
190	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
191	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192	ERRMSG_SEPARATOR
193	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
194	gettext_noop ("Invalid range end") /* REG_ERANGE */
195	ERRMSG_SEPARATOR
196	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
197	gettext_noop ("Memory exhausted") /* REG_ESPACE */
198	ERRMSG_SEPARATOR
199	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
200	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201	ERRMSG_SEPARATOR
202	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
203	gettext_noop ("Premature end of regular expression") /* REG_EEND */
204	ERRMSG_SEPARATOR
205	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
206	gettext_noop ("Regular expression too big") /* REG_ESIZE */
207	ERRMSG_SEPARATOR
208	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
209	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
210	ERRMSG_SEPARATOR
211	};
212
213	const size_t __re_error_msgid_idx[] attribute_hidden =
214	{
215	REG_NOERROR_IDX,
216	REG_NOMATCH_IDX,
217	REG_BADPAT_IDX,
218	REG_ECOLLATE_IDX,
219	REG_ECTYPE_IDX,
220	REG_EESCAPE_IDX,
221	REG_ESUBREG_IDX,
222	REG_EBRACK_IDX,
223	REG_EPAREN_IDX,
224	REG_EBRACE_IDX,
225	REG_BADBR_IDX,
226	REG_ERANGE_IDX,
227	REG_ESPACE_IDX,
228	REG_BADRPT_IDX,
229	REG_EEND_IDX,
230	REG_ESIZE_IDX,
231	REG_ERPAREN_IDX
232	};
233
234
235	/* Entry points for GNU code. */
236
237	/* re_compile_pattern is the GNU regular expression compiler: it
238	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
239	Returns 0 if the pattern was valid, otherwise an error string.
240
241	Assumes the `allocated' (and perhaps `buffer') and `translate' fields
242	are set in BUFP on entry. */
243
244	const char *
245	re_compile_pattern (pattern, length, bufp)
246	const char *pattern;
247	size_t length;
248	struct re_pattern_buffer *bufp;
249	{
250	reg_errcode_t ret;
251
252	/* And GNU code determines whether or not to get register information
253	by passing null for the REGS argument to re_match, etc., not by
254	setting no_sub, unless RE_NO_SUB is set. */
255	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
256
257	/* Match anchors at newline. */
258	bufp->newline_anchor = 1;
259
260	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
261
262	if (!ret)
263	return NULL;
264	return gettext (RE_ERRMSG(ret));
265	}
266	#ifdef _LIBC
267	weak_alias (__re_compile_pattern, re_compile_pattern)
268	#endif
269
270	/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
271	also be assigned to arbitrarily: each pattern buffer stores its own
272	syntax, so it can be changed between regex compilations. */
273	/* This has no initializer because initialized variables in Emacs
274	become read-only after dumping. */
275	reg_syntax_t re_syntax_options;
276
277
278	/* Specify the precise syntax of regexps for compilation. This provides
279	for compatibility for various utilities which historically have
280	different, incompatible syntaxes.
281
282	The argument SYNTAX is a bit mask comprised of the various bits
283	defined in regex.h. We return the old syntax. */
284
285	reg_syntax_t
286	re_set_syntax (syntax)
287	reg_syntax_t syntax;
288	{
289	reg_syntax_t ret = re_syntax_options;
290
291	re_syntax_options = syntax;
292	return ret;
293	}
294	#ifdef _LIBC
295	weak_alias (__re_set_syntax, re_set_syntax)
296	#endif
297
298	int
299	re_compile_fastmap (bufp)
300	struct re_pattern_buffer *bufp;
301	{
302	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
303	char *fastmap = bufp->fastmap;
304
305	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
306	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
307	if (dfa->init_state != dfa->init_state_word)
308	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
309	if (dfa->init_state != dfa->init_state_nl)
310	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
311	if (dfa->init_state != dfa->init_state_begbuf)
312	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
313	bufp->fastmap_accurate = 1;
314	return 0;
315	}
316	#ifdef _LIBC
317	weak_alias (__re_compile_fastmap, re_compile_fastmap)
318	#endif
319
320	static inline void
321	__attribute ((always_inline))
322	re_set_fastmap (char *fastmap, int icase, int ch)
323	{
324	fastmap[ch] = 1;
325	if (icase)
326	fastmap[tolower (ch)] = 1;
327	}
328
329	/* Helper function for re_compile_fastmap.
330	Compile fastmap for the initial_state INIT_STATE. */
331
332	static void
333	re_compile_fastmap_iter (bufp, init_state, fastmap)
334	regex_t *bufp;
335	const re_dfastate_t *init_state;
336	char *fastmap;
337	{
338	re_dfa_t dfa = (re_dfa_t ) bufp->buffer;
339	int node_cnt;
340	int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
341	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
342	{
343	int node = init_state->nodes.elems[node_cnt];
344	re_token_type_t type = dfa->nodes[node].type;
345
346	if (type == CHARACTER)
347	{
348	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
349	#ifdef RE_ENABLE_I18N
350	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
351	{
352	unsigned char buf = re_malloc (unsigned char, dfa->mb_cur_max), p;
353	wchar_t wc;
354	mbstate_t state;
355
356	p = buf;
357	*p++ = dfa->nodes[node].opr.c;
358	while (++node < dfa->nodes_len
359	&& dfa->nodes[node].type == CHARACTER
360	&& dfa->nodes[node].mb_partial)
361	*p++ = dfa->nodes[node].opr.c;
362	memset (&state, 0, sizeof (state));
363	if (mbrtowc (&wc, (const char *) buf, p - buf,
364	&state) == p - buf
365	&& __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
366	re_set_fastmap (fastmap, 0, buf[0]);
367	re_free (buf);
368	}
369	#endif
370	}
371	else if (type == SIMPLE_BRACKET)
372	{
373	int i, j, ch;
374	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
375	for (j = 0; j < UINT_BITS; ++j, ++ch)
376	if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
377	re_set_fastmap (fastmap, icase, ch);
378	}
379	#ifdef RE_ENABLE_I18N
380	else if (type == COMPLEX_BRACKET)
381	{
382	int i;
383	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
384	if (cset->non_match \|\| cset->ncoll_syms \|\| cset->nequiv_classes
385	\|\| cset->nranges \|\| cset->nchar_classes)
386	{
387	# ifdef _LIBC
388	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
389	{
390	/* In this case we want to catch the bytes which are
391	the first byte of any collation elements.
392	e.g. In da_DK, we want to catch 'a' since "aa"
393	is a valid collation element, and don't catch
394	'b' since 'b' is the only collation element
395	which starts from 'b'. */
396	int j, ch;
397	const int32_t table = (const int32_t )
398	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
399	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
400	for (j = 0; j < UINT_BITS; ++j, ++ch)
401	if (table[ch] < 0)
402	re_set_fastmap (fastmap, icase, ch);
403	}
404	# else
405	if (dfa->mb_cur_max > 1)
406	for (i = 0; i < SBC_MAX; ++i)
407	if (__btowc (i) == WEOF)
408	re_set_fastmap (fastmap, icase, i);
409	# endif /* not _LIBC */
410	}
411	for (i = 0; i < cset->nmbchars; ++i)
412	{
413	char buf[256];
414	mbstate_t state;
415	memset (&state, '\0', sizeof (state));
416	__wcrtomb (buf, cset->mbchars[i], &state);
417	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
418	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
419	{
420	__wcrtomb (buf, towlower (cset->mbchars[i]), &state);
421	re_set_fastmap (fastmap, 0, (unsigned char ) buf);
422	}
423	}
424	}
425	#endif /* RE_ENABLE_I18N */
426	else if (type == OP_PERIOD
427	#ifdef RE_ENABLE_I18N
428	\|\| type == OP_UTF8_PERIOD
429	#endif /* RE_ENABLE_I18N */
430	\|\| type == END_OF_RE)
431	{
432	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
433	if (type == END_OF_RE)
434	bufp->can_be_null = 1;
435	return;
436	}
437	}
438	}
439
440
441	/* Entry point for POSIX code. */
442	/* regcomp takes a regular expression as a string and compiles it.
443
444	PREG is a regex_t *. We do not expect any fields to be initialized,
445	since POSIX says we shouldn't. Thus, we set
446
447	`buffer' to the compiled pattern;
448	`used' to the length of the compiled pattern;
449	`syntax' to RE_SYNTAX_POSIX_EXTENDED if the
450	REG_EXTENDED bit in CFLAGS is set; otherwise, to
451	RE_SYNTAX_POSIX_BASIC;
452	`newline_anchor' to REG_NEWLINE being set in CFLAGS;
453	`fastmap' to an allocated space for the fastmap;
454	`fastmap_accurate' to zero;
455	`re_nsub' to the number of subexpressions in PATTERN.
456
457	PATTERN is the address of the pattern string.
458
459	CFLAGS is a series of bits which affect compilation.
460
461	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
462	use POSIX basic syntax.
463
464	If REG_NEWLINE is set, then . and [^...] don't match newline.
465	Also, regexec will try a match beginning after every newline.
466
467	If REG_ICASE is set, then we considers upper- and lowercase
468	versions of letters to be equivalent when matching.
469
470	If REG_NOSUB is set, then when PREG is passed to regexec, that
471	routine will report only success or failure, and nothing about the
472	registers.
473
474	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
475	the return codes and their meanings.) */
476
477	int
478	regcomp (preg, pattern, cflags)
479	regex_t *__restrict preg;
480	const char *__restrict pattern;
481	int cflags;
482	{
483	reg_errcode_t ret;
484	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
485	: RE_SYNTAX_POSIX_BASIC);
486
487	preg->buffer = NULL;
488	preg->allocated = 0;
489	preg->used = 0;
490
491	/* Try to allocate space for the fastmap. */
492	preg->fastmap = re_malloc (char, SBC_MAX);
493	if (BE (preg->fastmap == NULL, 0))
494	return REG_ESPACE;
495
496	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
497
498	/* If REG_NEWLINE is set, newlines are treated differently. */
499	if (cflags & REG_NEWLINE)
500	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
501	syntax &= ~RE_DOT_NEWLINE;
502	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
503	/* It also changes the matching behavior. */
504	preg->newline_anchor = 1;
505	}
506	else
507	preg->newline_anchor = 0;
508	preg->no_sub = !!(cflags & REG_NOSUB);
509	preg->translate = NULL;
510
511	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
512
513	/* POSIX doesn't distinguish between an unmatched open-group and an
514	unmatched close-group: both are REG_EPAREN. */
515	if (ret == REG_ERPAREN)
516	ret = REG_EPAREN;
517
518	/* We have already checked preg->fastmap != NULL. */
519	if (BE (ret == REG_NOERROR, 1))
520	/* Compute the fastmap now, since regexec cannot modify the pattern
521	buffer. This function never fails in this implementation. */
522	(void) re_compile_fastmap (preg);
523	else
524	{
525	/* Some error occurred while compiling the expression. */
526	re_free (preg->fastmap);
527	preg->fastmap = NULL;
528	}
529
530	return (int) ret;
531	}
532	#ifdef _LIBC
533	weak_alias (__regcomp, regcomp)
534	#endif
535
536	/* Returns a message corresponding to an error code, ERRCODE, returned
537	from either regcomp or regexec. We don't use PREG here. */
538
539	size_t
540	regerror (errcode, preg, errbuf, errbuf_size)
541	int errcode;
542	const regex_t *preg;
543	char *errbuf;
544	size_t errbuf_size;
545	{
546	const char *msg;
547	size_t msg_size;
548
549	if (BE (errcode < 0
550	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
551	/ sizeof (__re_error_msgid_idx[0])), 0))
552	/* Only error codes returned by the rest of the code should be passed
553	to this routine. If we are given anything else, or if other regex
554	code generates an invalid error code, then the program has a bug.
555	Dump core so we can fix it. */
556	abort ();
557
558	msg = gettext (RE_ERRMSG(errcode));
559
560	msg_size = strlen (msg) + 1; /* Includes the null. */
561
562	if (BE (errbuf_size != 0, 1))
563	{
564	if (BE (msg_size > errbuf_size, 0))
565	{
566	memcpy (errbuf, msg, errbuf_size - 1);
567	errbuf[errbuf_size - 1] = 0;
568	}
569	else
570	memcpy (errbuf, msg, msg_size);
571	}
572
573	return msg_size;
574	}
575	#ifdef _LIBC
576	weak_alias (__regerror, regerror)
577	#endif
578
579
580	#ifdef RE_ENABLE_I18N
581	/* This static array is used for the map to single-byte characters when
582	UTF-8 is used. Otherwise we would allocate memory just to initialize
583	it the same all the time. UTF-8 is the preferred encoding so this is
584	a worthwhile optimization. */
585	static const bitset utf8_sb_map =
586	{
587	/* Set the first 128 bits. */
588	# if UINT_MAX == 0xffffffff
589	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
590	# else
591	# error "Add case for new unsigned int size"
592	# endif
593	};
594	#endif
595
596
597	static void
598	free_dfa_content (re_dfa_t *dfa)
599	{
600	int i, j;
601
602	if (dfa->nodes)
603	for (i = 0; i < dfa->nodes_len; ++i)
604	free_token (dfa->nodes + i);
605	re_free (dfa->nexts);
606	for (i = 0; i < dfa->nodes_len; ++i)
607	{
608	if (dfa->eclosures != NULL)
609	re_node_set_free (dfa->eclosures + i);
610	if (dfa->inveclosures != NULL)
611	re_node_set_free (dfa->inveclosures + i);
612	if (dfa->edests != NULL)
613	re_node_set_free (dfa->edests + i);
614	}
615	re_free (dfa->edests);
616	re_free (dfa->eclosures);
617	re_free (dfa->inveclosures);
618	re_free (dfa->nodes);
619
620	if (dfa->state_table)
621	for (i = 0; i <= dfa->state_hash_mask; ++i)
622	{
623	struct re_state_table_entry *entry = dfa->state_table + i;
624	for (j = 0; j < entry->num; ++j)
625	{
626	re_dfastate_t *state = entry->array[j];
627	free_state (state);
628	}
629	re_free (entry->array);
630	}
631	re_free (dfa->state_table);
632	#ifdef RE_ENABLE_I18N
633	if (dfa->sb_char != utf8_sb_map)
634	re_free (dfa->sb_char);
635	#endif
636	re_free (dfa->subexp_map);
637	#ifdef DEBUG
638	re_free (dfa->re_str);
639	#endif
640
641	re_free (dfa);
642	}
643
644
645	/* Free dynamically allocated space used by PREG. */
646
647	void
648	regfree (preg)
649	regex_t *preg;
650	{
651	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
652	if (BE (dfa != NULL, 1))
653	free_dfa_content (dfa);
654	preg->buffer = NULL;
655	preg->allocated = 0;
656
657	re_free (preg->fastmap);
658	preg->fastmap = NULL;
659
660	re_free (preg->translate);
661	preg->translate = NULL;
662	}
663	#ifdef _LIBC
664	weak_alias (__regfree, regfree)
665	#endif
666
667
668	/* Entry points compatible with 4.2 BSD regex library. We don't define
669	them unless specifically requested. */
670
671	#if defined _REGEX_RE_COMP \|\| defined _LIBC
672
673	/* BSD has one and only one pattern buffer. */
674	static struct re_pattern_buffer re_comp_buf;
675
676	char *
677	# ifdef _LIBC
678	/* Make these definitions weak in libc, so POSIX programs can redefine
679	these names if they don't use our functions, and still use
680	regcomp/regexec above without link errors. */
681	weak_function
682	# endif
683	re_comp (s)
684	const char *s;
685	{
686	reg_errcode_t ret;
687	char *fastmap;
688
689	if (!s)
690	{
691	if (!re_comp_buf.buffer)
692	return gettext ("No previous regular expression");
693	return 0;
694	}
695
696	if (re_comp_buf.buffer)
697	{
698	fastmap = re_comp_buf.fastmap;
699	re_comp_buf.fastmap = NULL;
700	__regfree (&re_comp_buf);
701	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
702	re_comp_buf.fastmap = fastmap;
703	}
704
705	if (re_comp_buf.fastmap == NULL)
706	{
707	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
708	if (re_comp_buf.fastmap == NULL)
709	return (char *) gettext (RE_ERRMSG(REG_ESPACE_IDX));
710	}
711
712	/* Since `re_exec' always passes NULL for the `regs' argument, we
713	don't need to initialize the pattern buffer fields which affect it. */
714
715	/* Match anchors at newlines. */
716	re_comp_buf.newline_anchor = 1;
717
718	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
719
720	if (!ret)
721	return NULL;
722
723	/* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
724	return (char *) gettext (RE_ERRMSG(ret));
725	}
726
727	#ifdef _LIBC
728	libc_freeres_fn (free_mem)
729	{
730	__regfree (&re_comp_buf);
731	}
732	#endif
733
734	#endif /* _REGEX_RE_COMP */
735
736
737	/* Internal entry point.
738	Compile the regular expression PATTERN, whose length is LENGTH.
739	SYNTAX indicate regular expression's syntax. */
740
741	static reg_errcode_t
742	re_compile_internal (preg, pattern, length, syntax)
743	regex_t *preg;
744	const char * pattern;
745	int length;
746	reg_syntax_t syntax;
747	{
748	reg_errcode_t err = REG_NOERROR;
749	re_dfa_t *dfa;
750	re_string_t regexp;
751
752	/* Initialize the pattern buffer. */
753	preg->fastmap_accurate = 0;
754	preg->syntax = syntax;
755	preg->not_bol = preg->not_eol = 0;
756	preg->used = 0;
757	preg->re_nsub = 0;
758	preg->can_be_null = 0;
759	preg->regs_allocated = REGS_UNALLOCATED;
760
761	/* Initialize the dfa. */
762	dfa = (re_dfa_t *) preg->buffer;
763	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
764	{
765	/* If zero allocated, but buffer is non-null, try to realloc
766	enough space. This loses if buffer's address is bogus, but
767	that is the user's responsibility. If ->buffer is NULL this
768	is a simple allocation. */
769	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
770	if (dfa == NULL)
771	return REG_ESPACE;
772	preg->allocated = sizeof (re_dfa_t);
773	preg->buffer = (unsigned char *) dfa;
774	}
775	preg->used = sizeof (re_dfa_t);
776
777	err = init_dfa (dfa, length);
778	if (BE (err != REG_NOERROR, 0))
779	{
780	free_dfa_content (dfa);
781	preg->buffer = NULL;
782	preg->allocated = 0;
783	return err;
784	}
785	#ifdef DEBUG
786	dfa->re_str = re_malloc (char, length + 1);
787	strncpy (dfa->re_str, pattern, length + 1);
788	#endif
789
790	err = re_string_construct (&regexp, pattern, length, preg->translate,
791	syntax & RE_ICASE, dfa);
792	if (BE (err != REG_NOERROR, 0))
793	{
794	re_compile_internal_free_return:
795	free_workarea_compile (preg);
796	re_string_destruct (&regexp);
797	free_dfa_content (dfa);
798	preg->buffer = NULL;
799	preg->allocated = 0;
800	return err;
801	}
802
803	/* Parse the regular expression, and build a structure tree. */
804	preg->re_nsub = 0;
805	dfa->str_tree = parse (&regexp, preg, syntax, &err);
806	if (BE (dfa->str_tree == NULL, 0))
807	goto re_compile_internal_free_return;
808
809	/* Analyze the tree and create the nfa. */
810	err = analyze (preg);
811	if (BE (err != REG_NOERROR, 0))
812	goto re_compile_internal_free_return;
813
814	#ifdef RE_ENABLE_I18N
815	/* If possible, do searching in single byte encoding to speed things up. */
816	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
817	optimize_utf8 (dfa);
818	#endif
819
820	/* Then create the initial state of the dfa. */
821	err = create_initial_state (dfa);
822
823	/* Release work areas. */
824	free_workarea_compile (preg);
825	re_string_destruct (&regexp);
826
827	if (BE (err != REG_NOERROR, 0))
828	{
829	free_dfa_content (dfa);
830	preg->buffer = NULL;
831	preg->allocated = 0;
832	}
833
834	return err;
835	}
836
837	/* Initialize DFA. We use the length of the regular expression PAT_LEN
838	as the initial length of some arrays. */
839
840	static reg_errcode_t
841	init_dfa (dfa, pat_len)
842	re_dfa_t *dfa;
843	int pat_len;
844	{
845	int table_size;
846	#ifndef _LIBC
847	char *codeset_name;
848	#endif
849
850	memset (dfa, '\0', sizeof (re_dfa_t));
851
852	/* Force allocation of str_tree_storage the first time. */
853	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855	dfa->nodes_alloc = pat_len + 1;
856	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857
858	dfa->states_alloc = pat_len + 1;
859
860	/* table_size = 2 ^ ceil(log pat_len) */
861	for (table_size = 1; table_size > 0; table_size <<= 1)
862	if (table_size > pat_len)
863	break;
864
865	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
866	dfa->state_hash_mask = table_size - 1;
867
868	dfa->mb_cur_max = MB_CUR_MAX;
869	#ifdef _LIBC
870	if (dfa->mb_cur_max == 6
871	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
872	dfa->is_utf8 = 1;
873	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
874	!= 0);
875	#else
876	# ifdef HAVE_LANGINFO_CODESET
877	codeset_name = nl_langinfo (CODESET);
878	# else
879	codeset_name = getenv ("LC_ALL");
880	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
881	codeset_name = getenv ("LC_CTYPE");
882	if (codeset_name == NULL \|\| codeset_name[0] == '\0')
883	codeset_name = getenv ("LANG");
884	if (codeset_name == NULL)
885	codeset_name = "";
886	else if (strchr (codeset_name, '.') != NULL)
887	codeset_name = strchr (codeset_name, '.') + 1;
888	# endif
889
890	/* strcasecmp isn't a standard interface. brute force check */
891	#if 0
892	if (strcasecmp (codeset_name, "UTF-8") == 0
893	\|\| strcasecmp (codeset_name, "UTF8") == 0)
894	dfa->is_utf8 = 1;
895	#else
896	if ( (codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
897	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
898	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
899	&& (codeset_name[3] == '-'
900	? codeset_name[4] == '8' && codeset_name[5] == '\0'
901	: codeset_name[3] == '8' && codeset_name[4] == '\0'))
902	dfa->is_utf8 = 1;
903	#endif
904
905	/* We check exhaustively in the loop below if this charset is a
906	superset of ASCII. */
907	dfa->map_notascii = 0;
908	#endif
909
910	#ifdef RE_ENABLE_I18N
911	if (dfa->mb_cur_max > 1)
912	{
913	if (dfa->is_utf8)
914	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
915	else
916	{
917	int i, j, ch;
918
919	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
920	if (BE (dfa->sb_char == NULL, 0))
921	return REG_ESPACE;
922
923	/* Clear all bits by, then set those corresponding to single
924	byte chars. */
925	bitset_empty (dfa->sb_char);
926
927	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
928	for (j = 0; j < UINT_BITS; ++j, ++ch)
929	{
930	wchar_t wch = __btowc (ch);
931	if (wch != WEOF)
932	dfa->sb_char[i] \|= 1 << j;
933	# ifndef _LIBC
934	if (isascii (ch) && wch != (wchar_t) ch)
935	dfa->map_notascii = 1;
936	# endif
937	}
938	}
939	}
940	#endif
941
942	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
943	return REG_ESPACE;
944	return REG_NOERROR;
945	}
946
947	/* Initialize WORD_CHAR table, which indicate which character is
948	"word". In this case "word" means that it is the word construction
949	character used by some operators like "\<", "\>", etc. */
950
951	static void
952	init_word_char (dfa)
953	re_dfa_t *dfa;
954	{
955	int i, j, ch;
956	dfa->word_ops_used = 1;
957	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
958	for (j = 0; j < UINT_BITS; ++j, ++ch)
959	if (isalnum (ch) \|\| ch == '_')
960	dfa->word_char[i] \|= 1 << j;
961	}
962
963	/* Free the work area which are only used while compiling. */
964
965	static void
966	free_workarea_compile (preg)
967	regex_t *preg;
968	{
969	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
970	bin_tree_storage_t storage, next;
971	for (storage = dfa->str_tree_storage; storage; storage = next)
972	{
973	next = storage->next;
974	re_free (storage);
975	}
976	dfa->str_tree_storage = NULL;
977	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
978	dfa->str_tree = NULL;
979	re_free (dfa->org_indices);
980	dfa->org_indices = NULL;
981	}
982
983	/* Create initial states for all contexts. */
984
985	static reg_errcode_t
986	create_initial_state (dfa)
987	re_dfa_t *dfa;
988	{
989	int first, i;
990	reg_errcode_t err;
991	re_node_set init_nodes;
992
993	/* Initial states have the epsilon closure of the node which is
994	the first node of the regular expression. */
995	first = dfa->str_tree->first->node_idx;
996	dfa->init_node = first;
997	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
998	if (BE (err != REG_NOERROR, 0))
999	return err;
1000
1001	/* The back-references which are in initial states can epsilon transit,
1002	since in this case all of the subexpressions can be null.
1003	Then we add epsilon closures of the nodes which are the next nodes of
1004	the back-references. */
1005	if (dfa->nbackref > 0)
1006	for (i = 0; i < init_nodes.nelem; ++i)
1007	{
1008	int node_idx = init_nodes.elems[i];
1009	re_token_type_t type = dfa->nodes[node_idx].type;
1010
1011	int clexp_idx;
1012	if (type != OP_BACK_REF)
1013	continue;
1014	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1015	{
1016	re_token_t *clexp_node;
1017	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1018	if (clexp_node->type == OP_CLOSE_SUBEXP
1019	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1020	break;
1021	}
1022	if (clexp_idx == init_nodes.nelem)
1023	continue;
1024
1025	if (type == OP_BACK_REF)
1026	{
1027	int dest_idx = dfa->edests[node_idx].elems[0];
1028	if (!re_node_set_contains (&init_nodes, dest_idx))
1029	{
1030	re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1031	i = 0;
1032	}
1033	}
1034	}
1035
1036	/* It must be the first time to invoke acquire_state. */
1037	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1038	/* We don't check ERR here, since the initial state must not be NULL. */
1039	if (BE (dfa->init_state == NULL, 0))
1040	return err;
1041	if (dfa->init_state->has_constraint)
1042	{
1043	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1044	CONTEXT_WORD);
1045	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1046	CONTEXT_NEWLINE);
1047	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048	&init_nodes,
1049	CONTEXT_NEWLINE
1050	\| CONTEXT_BEGBUF);
1051	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
1052	\|\| dfa->init_state_begbuf == NULL, 0))
1053	return err;
1054	}
1055	else
1056	dfa->init_state_word = dfa->init_state_nl
1057	= dfa->init_state_begbuf = dfa->init_state;
1058
1059	re_node_set_free (&init_nodes);
1060	return REG_NOERROR;
1061	}
1062
1063
1064	#ifdef RE_ENABLE_I18N
1065	/* If it is possible to do searching in single byte encoding instead of UTF-8
1066	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1067	DFA nodes where needed. */
1068
1069	static void
1070	optimize_utf8 (dfa)
1071	re_dfa_t *dfa;
1072	{
1073	int node, i, mb_chars = 0, has_period = 0;
1074
1075	for (node = 0; node < dfa->nodes_len; ++node)
1076	switch (dfa->nodes[node].type)
1077	{
1078	case CHARACTER:
1079	if (dfa->nodes[node].opr.c >= 0x80)
1080	mb_chars = 1;
1081	break;
1082	case ANCHOR:
1083	switch (dfa->nodes[node].opr.idx)
1084	{
1085	case LINE_FIRST:
1086	case LINE_LAST:
1087	case BUF_FIRST:
1088	case BUF_LAST:
1089	break;
1090	default:
1091	/* Word anchors etc. cannot be handled. */
1092	return;
1093	}
1094	break;
1095	case OP_PERIOD:
1096	has_period = 1;
1097	break;
1098	case OP_BACK_REF:
1099	case OP_ALT:
1100	case END_OF_RE:
1101	case OP_DUP_ASTERISK:
1102	case OP_OPEN_SUBEXP:
1103	case OP_CLOSE_SUBEXP:
1104	break;
1105	case COMPLEX_BRACKET:
1106	return;
1107	case SIMPLE_BRACKET:
1108	/* Just double check. */
1109	for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1110	if (dfa->nodes[node].opr.sbcset[i])
1111	return;
1112	break;
1113	default:
1114	abort ();
1115	}
1116
1117	if (mb_chars \|\| has_period)
1118	for (node = 0; node < dfa->nodes_len; ++node)
1119	{
1120	if (dfa->nodes[node].type == CHARACTER
1121	&& dfa->nodes[node].opr.c >= 0x80)
1122	dfa->nodes[node].mb_partial = 0;
1123	else if (dfa->nodes[node].type == OP_PERIOD)
1124	dfa->nodes[node].type = OP_UTF8_PERIOD;
1125	}
1126
1127	/* The search can be in single byte locale. */
1128	dfa->mb_cur_max = 1;
1129	dfa->is_utf8 = 0;
1130	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
1131	}
1132	#endif
1133
1134
1135	/* Analyze the structure tree, and calculate "first", "next", "edest",
1136	"eclosure", and "inveclosure". */
1137
1138	static reg_errcode_t
1139	analyze (preg)
1140	regex_t *preg;
1141	{
1142	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
1143	reg_errcode_t ret;
1144
1145	/* Allocate arrays. */
1146	dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1147	dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1148	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1149	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1150	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
1151	\|\| dfa->eclosures == NULL, 0))
1152	return REG_ESPACE;
1153
1154	dfa->subexp_map = re_malloc (int, preg->re_nsub);
1155	if (dfa->subexp_map != NULL)
1156	{
1157	int i;
1158	for (i = 0; i < preg->re_nsub; i++)
1159	dfa->subexp_map[i] = i;
1160	preorder (dfa->str_tree, optimize_subexps, dfa);
1161	for (i = 0; i < preg->re_nsub; i++)
1162	if (dfa->subexp_map[i] != i)
1163	break;
1164	if (i == preg->re_nsub)
1165	{
1166	free (dfa->subexp_map);
1167	dfa->subexp_map = NULL;
1168	}
1169	}
1170
1171	ret = postorder (dfa->str_tree, lower_subexps, preg);
1172	if (BE (ret != REG_NOERROR, 0))
1173	return ret;
1174	ret = postorder (dfa->str_tree, calc_first, dfa);
1175	if (BE (ret != REG_NOERROR, 0))
1176	return ret;
1177	preorder (dfa->str_tree, calc_next, dfa);
1178	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1179	if (BE (ret != REG_NOERROR, 0))
1180	return ret;
1181	ret = calc_eclosure (dfa);
1182	if (BE (ret != REG_NOERROR, 0))
1183	return ret;
1184
1185	/* We only need this during the prune_impossible_nodes pass in regexec.c;
1186	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1187	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188	\|\| dfa->nbackref)
1189	{
1190	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1191	if (BE (dfa->inveclosures == NULL, 0))
1192	return REG_ESPACE;
1193	ret = calc_inveclosure (dfa);
1194	}
1195
1196	return ret;
1197	}
1198
1199	/* Our parse trees are very unbalanced, so we cannot use a stack to
1200	implement parse tree visits. Instead, we use parent pointers and
1201	some hairy code in these two functions. */
1202	static reg_errcode_t
1203	postorder (root, fn, extra)
1204	bin_tree_t *root;
1205	reg_errcode_t (fn (void , bin_tree_t ));
1206	void *extra;
1207	{
1208	bin_tree_t node, prev;
1209
1210	for (node = root; ; )
1211	{
1212	/* Descend down the tree, preferably to the left (or to the right
1213	if that's the only child). */
1214	while (node->left \|\| node->right)
1215	if (node->left)
1216	node = node->left;
1217	else
1218	node = node->right;
1219
1220	do
1221	{
1222	reg_errcode_t err = fn (extra, node);
1223	if (BE (err != REG_NOERROR, 0))
1224	return err;
1225	if (node->parent == NULL)
1226	return REG_NOERROR;
1227	prev = node;
1228	node = node->parent;
1229	}
1230	/* Go up while we have a node that is reached from the right. */
1231	while (node->right == prev \|\| node->right == NULL);
1232	node = node->right;
1233	}
1234	}
1235
1236	static reg_errcode_t
1237	preorder (root, fn, extra)
1238	bin_tree_t *root;
1239	reg_errcode_t (fn (void , bin_tree_t ));
1240	void *extra;
1241	{
1242	bin_tree_t *node;
1243
1244	for (node = root; ; )
1245	{
1246	reg_errcode_t err = fn (extra, node);
1247	if (BE (err != REG_NOERROR, 0))
1248	return err;
1249
1250	/* Go to the left node, or up and to the right. */
1251	if (node->left)
1252	node = node->left;
1253	else
1254	{
1255	bin_tree_t *prev = NULL;
1256	while (node->right == prev \|\| node->right == NULL)
1257	{
1258	prev = node;
1259	node = node->parent;
1260	if (!node)
1261	return REG_NOERROR;
1262	}
1263	node = node->right;
1264	}
1265	}
1266	}
1267
1268	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1269	re_search_internal to map the inner one's opr.idx to this one's. Adjust
1270	backreferences as well. Requires a preorder visit. */
1271	static reg_errcode_t
1272	optimize_subexps (extra, node)
1273	void *extra;
1274	bin_tree_t *node;
1275	{
1276	re_dfa_t dfa = (re_dfa_t ) extra;
1277
1278	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279	{
1280	int idx = node->token.opr.idx;
1281	node->token.opr.idx = dfa->subexp_map[idx];
1282	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
1283	}
1284
1285	else if (node->token.type == SUBEXP
1286	&& node->left && node->left->token.type == SUBEXP)
1287	{
1288	int other_idx = node->left->token.opr.idx;
1289
1290	node->left = node->left->left;
1291	if (node->left)
1292	node->left->parent = node;
1293
1294	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295	if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1296	dfa->used_bkref_map &= ~(1 << other_idx);
1297	}
1298
1299	return REG_NOERROR;
1300	}
1301
1302	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304	static reg_errcode_t
1305	lower_subexps (extra, node)
1306	void *extra;
1307	bin_tree_t *node;
1308	{
1309	regex_t preg = (regex_t ) extra;
1310	reg_errcode_t err = REG_NOERROR;
1311
1312	if (node->left && node->left->token.type == SUBEXP)
1313	{
1314	node->left = lower_subexp (&err, preg, node->left);
1315	if (node->left)
1316	node->left->parent = node;
1317	}
1318	if (node->right && node->right->token.type == SUBEXP)
1319	{
1320	node->right = lower_subexp (&err, preg, node->right);
1321	if (node->right)
1322	node->right->parent = node;
1323	}
1324
1325	return err;
1326	}
1327
1328	static bin_tree_t *
1329	lower_subexp (err, preg, node)
1330	reg_errcode_t *err;
1331	regex_t *preg;
1332	bin_tree_t *node;
1333	{
1334	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
1335	bin_tree_t *body = node->left;
1336	bin_tree_t op, cls, tree1, tree;
1337
1338	if (preg->no_sub
1339	&& (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1340	\|\| !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1341	return node->left;
1342
1343	/* Convert the SUBEXP node to the concatenation of an
1344	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1345	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348	tree = create_tree (dfa, op, tree1, CONCAT);
1349	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
1350	{
1351	*err = REG_ESPACE;
1352	return NULL;
1353	}
1354
1355	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357	return tree;
1358	}
1359
1360	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361	nodes. Requires a postorder visit. */
1362	static reg_errcode_t
1363	calc_first (extra, node)
1364	void *extra;
1365	bin_tree_t *node;
1366	{
1367	re_dfa_t dfa = (re_dfa_t ) extra;
1368	if (node->token.type == CONCAT)
1369	{
1370	node->first = node->left->first;
1371	node->node_idx = node->left->node_idx;
1372	}
1373	else
1374	{
1375	node->first = node;
1376	node->node_idx = re_dfa_add_node (dfa, node->token);
1377	if (BE (node->node_idx == -1, 0))
1378	return REG_ESPACE;
1379	}
1380	return REG_NOERROR;
1381	}
1382
1383	/* Pass 2: compute NEXT on the tree. Preorder visit. */
1384	static reg_errcode_t
1385	calc_next (extra, node)
1386	void *extra;
1387	bin_tree_t *node;
1388	{
1389	switch (node->token.type)
1390	{
1391	case OP_DUP_ASTERISK:
1392	node->left->next = node;
1393	break;
1394	case CONCAT:
1395	node->left->next = node->right->first;
1396	node->right->next = node->next;
1397	break;
1398	default:
1399	if (node->left)
1400	node->left->next = node->next;
1401	if (node->right)
1402	node->right->next = node->next;
1403	break;
1404	}
1405	return REG_NOERROR;
1406	}
1407
1408	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1409	static reg_errcode_t
1410	link_nfa_nodes (extra, node)
1411	void *extra;
1412	bin_tree_t *node;
1413	{
1414	re_dfa_t dfa = (re_dfa_t ) extra;
1415	int idx = node->node_idx;
1416	reg_errcode_t err = REG_NOERROR;
1417
1418	switch (node->token.type)
1419	{
1420	case CONCAT:
1421	break;
1422
1423	case END_OF_RE:
1424	assert (node->next == NULL);
1425	break;
1426
1427	case OP_DUP_ASTERISK:
1428	case OP_ALT:
1429	{
1430	int left, right;
1431	dfa->has_plural_match = 1;
1432	if (node->left != NULL)
1433	left = node->left->first->node_idx;
1434	else
1435	left = node->next->node_idx;
1436	if (node->right != NULL)
1437	right = node->right->first->node_idx;
1438	else
1439	right = node->next->node_idx;
1440	assert (left > -1);
1441	assert (right > -1);
1442	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1443	}
1444	break;
1445
1446	case ANCHOR:
1447	case OP_OPEN_SUBEXP:
1448	case OP_CLOSE_SUBEXP:
1449	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1450	break;
1451
1452	case OP_BACK_REF:
1453	dfa->nexts[idx] = node->next->node_idx;
1454	if (node->token.type == OP_BACK_REF)
1455	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1456	break;
1457
1458	default:
1459	assert (!IS_EPSILON_NODE (node->token.type));
1460	dfa->nexts[idx] = node->next->node_idx;
1461	break;
1462	}
1463
1464	return err;
1465	}
1466
1467	/* Duplicate the epsilon closure of the node ROOT_NODE.
1468	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1469	to their own constraint. */
1470
1471	static reg_errcode_t
1472	duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1473	init_constraint)
1474	re_dfa_t *dfa;
1475	int top_org_node, top_clone_node, root_node;
1476	unsigned int init_constraint;
1477	{
1478	reg_errcode_t err;
1479	int org_node, clone_node, ret;
1480	unsigned int constraint = init_constraint;
1481	for (org_node = top_org_node, clone_node = top_clone_node;;)
1482	{
1483	int org_dest, clone_dest;
1484	if (dfa->nodes[org_node].type == OP_BACK_REF)
1485	{
1486	/* If the back reference epsilon-transit, its destination must
1487	also have the constraint. Then duplicate the epsilon closure
1488	of the destination of the back reference, and store it in
1489	edests of the back reference. */
1490	org_dest = dfa->nexts[org_node];
1491	re_node_set_empty (dfa->edests + clone_node);
1492	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1493	if (BE (err != REG_NOERROR, 0))
1494	return err;
1495	dfa->nexts[clone_node] = dfa->nexts[org_node];
1496	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497	if (BE (ret < 0, 0))
1498	return REG_ESPACE;
1499	}
1500	else if (dfa->edests[org_node].nelem == 0)
1501	{
1502	/* In case of the node can't epsilon-transit, don't duplicate the
1503	destination and store the original destination as the
1504	destination of the node. */
1505	dfa->nexts[clone_node] = dfa->nexts[org_node];
1506	break;
1507	}
1508	else if (dfa->edests[org_node].nelem == 1)
1509	{
1510	/* In case of the node can epsilon-transit, and it has only one
1511	destination. */
1512	org_dest = dfa->edests[org_node].elems[0];
1513	re_node_set_empty (dfa->edests + clone_node);
1514	if (dfa->nodes[org_node].type == ANCHOR)
1515	{
1516	/* In case of the node has another constraint, append it. */
1517	if (org_node == root_node && clone_node != org_node)
1518	{
1519	/* ...but if the node is root_node itself, it means the
1520	epsilon closure have a loop, then tie it to the
1521	destination of the root_node. */
1522	ret = re_node_set_insert (dfa->edests + clone_node,
1523	org_dest);
1524	if (BE (ret < 0, 0))
1525	return REG_ESPACE;
1526	break;
1527	}
1528	constraint \|= dfa->nodes[org_node].opr.ctx_type;
1529	}
1530	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1531	if (BE (err != REG_NOERROR, 0))
1532	return err;
1533	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534	if (BE (ret < 0, 0))
1535	return REG_ESPACE;
1536	}
1537	else /* dfa->edests[org_node].nelem == 2 */
1538	{
1539	/* In case of the node can epsilon-transit, and it has two
1540	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
1541	org_dest = dfa->edests[org_node].elems[0];
1542	re_node_set_empty (dfa->edests + clone_node);
1543	/* Search for a duplicated node which satisfies the constraint. */
1544	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1545	if (clone_dest == -1)
1546	{
1547	/* There are no such a duplicated node, create a new one. */
1548	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1549	if (BE (err != REG_NOERROR, 0))
1550	return err;
1551	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552	if (BE (ret < 0, 0))
1553	return REG_ESPACE;
1554	err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555	root_node, constraint);
1556	if (BE (err != REG_NOERROR, 0))
1557	return err;
1558	}
1559	else
1560	{
1561	/* There are a duplicated node which satisfy the constraint,
1562	use it to avoid infinite loop. */
1563	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564	if (BE (ret < 0, 0))
1565	return REG_ESPACE;
1566	}
1567
1568	org_dest = dfa->edests[org_node].elems[1];
1569	err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1570	if (BE (err != REG_NOERROR, 0))
1571	return err;
1572	ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573	if (BE (ret < 0, 0))
1574	return REG_ESPACE;
1575	}
1576	org_node = org_dest;
1577	clone_node = clone_dest;
1578	}
1579	return REG_NOERROR;
1580	}
1581
1582	/* Search for a node which is duplicated from the node ORG_NODE, and
1583	satisfies the constraint CONSTRAINT. */
1584
1585	static int
1586	search_duplicated_node (dfa, org_node, constraint)
1587	re_dfa_t *dfa;
1588	int org_node;
1589	unsigned int constraint;
1590	{
1591	int idx;
1592	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1593	{
1594	if (org_node == dfa->org_indices[idx]
1595	&& constraint == dfa->nodes[idx].constraint)
1596	return idx; /* Found. */
1597	}
1598	return -1; /* Not found. */
1599	}
1600
1601	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602	The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1603	otherwise return the error code. */
1604
1605	static reg_errcode_t
1606	duplicate_node (new_idx, dfa, org_idx, constraint)
1607	re_dfa_t *dfa;
1608	int *new_idx, org_idx;
1609	unsigned int constraint;
1610	{
1611	int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1612	if (BE (dup_idx == -1, 0))
1613	return REG_ESPACE;
1614	dfa->nodes[dup_idx].constraint = constraint;
1615	if (dfa->nodes[org_idx].type == ANCHOR)
1616	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].opr.ctx_type;
1617	dfa->nodes[dup_idx].duplicated = 1;
1618
1619	/* Store the index of the original node. */
1620	dfa->org_indices[dup_idx] = org_idx;
1621	*new_idx = dup_idx;
1622	return REG_NOERROR;
1623	}
1624
1625	static reg_errcode_t
1626	calc_inveclosure (dfa)
1627	re_dfa_t *dfa;
1628	{
1629	int src, idx, ret;
1630	for (idx = 0; idx < dfa->nodes_len; ++idx)
1631	re_node_set_init_empty (dfa->inveclosures + idx);
1632
1633	for (src = 0; src < dfa->nodes_len; ++src)
1634	{
1635	int *elems = dfa->eclosures[src].elems;
1636	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1637	{
1638	ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1639	if (BE (ret == -1, 0))
1640	return REG_ESPACE;
1641	}
1642	}
1643
1644	return REG_NOERROR;
1645	}
1646
1647	/* Calculate "eclosure" for all the node in DFA. */
1648
1649	static reg_errcode_t
1650	calc_eclosure (dfa)
1651	re_dfa_t *dfa;
1652	{
1653	int node_idx, incomplete;
1654	#ifdef DEBUG
1655	assert (dfa->nodes_len > 0);
1656	#endif
1657	incomplete = 0;
1658	/* For each nodes, calculate epsilon closure. */
1659	for (node_idx = 0; ; ++node_idx)
1660	{
1661	reg_errcode_t err;
1662	re_node_set eclosure_elem;
1663	if (node_idx == dfa->nodes_len)
1664	{
1665	if (!incomplete)
1666	break;
1667	incomplete = 0;
1668	node_idx = 0;
1669	}
1670
1671	#ifdef DEBUG
1672	assert (dfa->eclosures[node_idx].nelem != -1);
1673	#endif
1674
1675	/* If we have already calculated, skip it. */
1676	if (dfa->eclosures[node_idx].nelem != 0)
1677	continue;
1678	/* Calculate epsilon closure of `node_idx'. */
1679	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1680	if (BE (err != REG_NOERROR, 0))
1681	return err;
1682
1683	if (dfa->eclosures[node_idx].nelem == 0)
1684	{
1685	incomplete = 1;
1686	re_node_set_free (&eclosure_elem);
1687	}
1688	}
1689	return REG_NOERROR;
1690	}
1691
1692	/* Calculate epsilon closure of NODE. */
1693
1694	static reg_errcode_t
1695	calc_eclosure_iter (new_set, dfa, node, root)
1696	re_node_set *new_set;
1697	re_dfa_t *dfa;
1698	int node, root;
1699	{
1700	reg_errcode_t err;
1701	unsigned int constraint;
1702	int i, incomplete;
1703	re_node_set eclosure;
1704	incomplete = 0;
1705	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1706	if (BE (err != REG_NOERROR, 0))
1707	return err;
1708
1709	/* This indicates that we are calculating this node now.
1710	We reference this value to avoid infinite loop. */
1711	dfa->eclosures[node].nelem = -1;
1712
1713	constraint = ((dfa->nodes[node].type == ANCHOR)
1714	? dfa->nodes[node].opr.ctx_type : 0);
1715	/* If the current node has constraints, duplicate all nodes.
1716	Since they must inherit the constraints. */
1717	if (constraint
1718	&& dfa->edests[node].nelem
1719	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1720	{
1721	int org_node, cur_node;
1722	org_node = cur_node = node;
1723	err = duplicate_node_closure (dfa, node, node, node, constraint);
1724	if (BE (err != REG_NOERROR, 0))
1725	return err;
1726	}
1727
1728	/* Expand each epsilon destination nodes. */
1729	if (IS_EPSILON_NODE(dfa->nodes[node].type))
1730	for (i = 0; i < dfa->edests[node].nelem; ++i)
1731	{
1732	re_node_set eclosure_elem;
1733	int edest = dfa->edests[node].elems[i];
1734	/* If calculating the epsilon closure of `edest' is in progress,
1735	return intermediate result. */
1736	if (dfa->eclosures[edest].nelem == -1)
1737	{
1738	incomplete = 1;
1739	continue;
1740	}
1741	/* If we haven't calculated the epsilon closure of `edest' yet,
1742	calculate now. Otherwise use calculated epsilon closure. */
1743	if (dfa->eclosures[edest].nelem == 0)
1744	{
1745	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1746	if (BE (err != REG_NOERROR, 0))
1747	return err;
1748	}
1749	else
1750	eclosure_elem = dfa->eclosures[edest];
1751	/* Merge the epsilon closure of `edest'. */
1752	re_node_set_merge (&eclosure, &eclosure_elem);
1753	/* If the epsilon closure of `edest' is incomplete,
1754	the epsilon closure of this node is also incomplete. */
1755	if (dfa->eclosures[edest].nelem == 0)
1756	{
1757	incomplete = 1;
1758	re_node_set_free (&eclosure_elem);
1759	}
1760	}
1761
1762	/* Epsilon closures include itself. */
1763	re_node_set_insert (&eclosure, node);
1764	if (incomplete && !root)
1765	dfa->eclosures[node].nelem = 0;
1766	else
1767	dfa->eclosures[node] = eclosure;
1768	*new_set = eclosure;
1769	return REG_NOERROR;
1770	}
1771
1772
1773	/* Functions for token which are used in the parser. */
1774
1775	/* Fetch a token from INPUT.
1776	We must not use this function inside bracket expressions. */
1777
1778	static void
1779	fetch_token (result, input, syntax)
1780	re_token_t *result;
1781	re_string_t *input;
1782	reg_syntax_t syntax;
1783	{
1784	re_string_skip_bytes (input, peek_token (result, input, syntax));
1785	}
1786
1787	/* Peek a token from INPUT, and return the length of the token.
1788	We must not use this function inside bracket expressions. */
1789
1790	static int
1791	peek_token (token, input, syntax)
1792	re_token_t *token;
1793	re_string_t *input;
1794	reg_syntax_t syntax;
1795	{
1796	unsigned char c;
1797
1798	if (re_string_eoi (input))
1799	{
1800	token->type = END_OF_RE;
1801	return 0;
1802	}
1803
1804	c = re_string_peek_byte (input, 0);
1805	token->opr.c = c;
1806
1807	token->word_char = 0;
1808	#ifdef RE_ENABLE_I18N
1809	token->mb_partial = 0;
1810	if (input->mb_cur_max > 1 &&
1811	!re_string_first_byte (input, re_string_cur_idx (input)))
1812	{
1813	token->type = CHARACTER;
1814	token->mb_partial = 1;
1815	return 1;
1816	}
1817	#endif
1818	if (c == '\\')
1819	{
1820	unsigned char c2;
1821	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1822	{
1823	token->type = BACK_SLASH;
1824	return 1;
1825	}
1826
1827	c2 = re_string_peek_byte_case (input, 1);
1828	token->opr.c = c2;
1829	token->type = CHARACTER;
1830	#ifdef RE_ENABLE_I18N
1831	if (input->mb_cur_max > 1)
1832	{
1833	wint_t wc = re_string_wchar_at (input,
1834	re_string_cur_idx (input) + 1);
1835	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1836	}
1837	else
1838	#endif
1839	token->word_char = IS_WORD_CHAR (c2) != 0;
1840
1841	switch (c2)
1842	{
1843	case '\|':
1844	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1845	token->type = OP_ALT;
1846	break;
1847	case '1': case '2': case '3': case '4': case '5':
1848	case '6': case '7': case '8': case '9':
1849	if (!(syntax & RE_NO_BK_REFS))
1850	{
1851	token->type = OP_BACK_REF;
1852	token->opr.idx = c2 - '1';
1853	}
1854	break;
1855	case '<':
1856	if (!(syntax & RE_NO_GNU_OPS))
1857	{
1858	token->type = ANCHOR;
1859	token->opr.ctx_type = WORD_FIRST;
1860	}
1861	break;
1862	case '>':
1863	if (!(syntax & RE_NO_GNU_OPS))
1864	{
1865	token->type = ANCHOR;
1866	token->opr.ctx_type = WORD_LAST;
1867	}
1868	break;
1869	case 'b':
1870	if (!(syntax & RE_NO_GNU_OPS))
1871	{
1872	token->type = ANCHOR;
1873	token->opr.ctx_type = WORD_DELIM;
1874	}
1875	break;
1876	case 'B':
1877	if (!(syntax & RE_NO_GNU_OPS))
1878	{
1879	token->type = ANCHOR;
1880	token->opr.ctx_type = NOT_WORD_DELIM;
1881	}
1882	break;
1883	case 'w':
1884	if (!(syntax & RE_NO_GNU_OPS))
1885	token->type = OP_WORD;
1886	break;
1887	case 'W':
1888	if (!(syntax & RE_NO_GNU_OPS))
1889	token->type = OP_NOTWORD;
1890	break;
1891	#ifndef GAWK
1892	case 's':
1893	if (!(syntax & RE_NO_GNU_OPS))
1894	token->type = OP_SPACE;
1895	break;
1896	case 'S':
1897	if (!(syntax & RE_NO_GNU_OPS))
1898	token->type = OP_NOTSPACE;
1899	break;
1900	#endif
1901	case '`':
1902	if (!(syntax & RE_NO_GNU_OPS))
1903	{
1904	token->type = ANCHOR;
1905	token->opr.ctx_type = BUF_FIRST;
1906	}
1907	break;
1908	case '\'':
1909	if (!(syntax & RE_NO_GNU_OPS))
1910	{
1911	token->type = ANCHOR;
1912	token->opr.ctx_type = BUF_LAST;
1913	}
1914	break;
1915	case '(':
1916	if (!(syntax & RE_NO_BK_PARENS))
1917	token->type = OP_OPEN_SUBEXP;
1918	break;
1919	case ')':
1920	if (!(syntax & RE_NO_BK_PARENS))
1921	token->type = OP_CLOSE_SUBEXP;
1922	break;
1923	case '+':
1924	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1925	token->type = OP_DUP_PLUS;
1926	break;
1927	case '?':
1928	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1929	token->type = OP_DUP_QUESTION;
1930	break;
1931	case '{':
1932	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1933	token->type = OP_OPEN_DUP_NUM;
1934	break;
1935	case '}':
1936	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1937	token->type = OP_CLOSE_DUP_NUM;
1938	break;
1939	default:
1940	break;
1941	}
1942	return 2;
1943	}
1944
1945	token->type = CHARACTER;
1946	#ifdef RE_ENABLE_I18N
1947	if (input->mb_cur_max > 1)
1948	{
1949	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1950	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1951	}
1952	else
1953	#endif
1954	token->word_char = IS_WORD_CHAR (token->opr.c);
1955
1956	switch (c)
1957	{
1958	case '\n':
1959	if (syntax & RE_NEWLINE_ALT)
1960	token->type = OP_ALT;
1961	break;
1962	case '\|':
1963	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1964	token->type = OP_ALT;
1965	break;
1966	case '*':
1967	token->type = OP_DUP_ASTERISK;
1968	break;
1969	case '+':
1970	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1971	token->type = OP_DUP_PLUS;
1972	break;
1973	case '?':
1974	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1975	token->type = OP_DUP_QUESTION;
1976	break;
1977	case '{':
1978	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1979	token->type = OP_OPEN_DUP_NUM;
1980	break;
1981	case '}':
1982	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1983	token->type = OP_CLOSE_DUP_NUM;
1984	break;
1985	case '(':
1986	if (syntax & RE_NO_BK_PARENS)
1987	token->type = OP_OPEN_SUBEXP;
1988	break;
1989	case ')':
1990	if (syntax & RE_NO_BK_PARENS)
1991	token->type = OP_CLOSE_SUBEXP;
1992	break;
1993	case '[':
1994	token->type = OP_OPEN_BRACKET;
1995	break;
1996	case '.':
1997	token->type = OP_PERIOD;
1998	break;
1999	case '^':
2000	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
2001	re_string_cur_idx (input) != 0)
2002	{
2003	char prev = re_string_peek_byte (input, -1);
2004	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
2005	break;
2006	}
2007	token->type = ANCHOR;
2008	token->opr.ctx_type = LINE_FIRST;
2009	break;
2010	case '$':
2011	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2012	re_string_cur_idx (input) + 1 != re_string_length (input))
2013	{
2014	re_token_t next;
2015	re_string_skip_bytes (input, 1);
2016	peek_token (&next, input, syntax);
2017	re_string_skip_bytes (input, -1);
2018	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2019	break;
2020	}
2021	token->type = ANCHOR;
2022	token->opr.ctx_type = LINE_LAST;
2023	break;
2024	default:
2025	break;
2026	}
2027	return 1;
2028	}
2029
2030	/* Peek a token from INPUT, and return the length of the token.
2031	We must not use this function out of bracket expressions. */
2032
2033	static int
2034	peek_token_bracket (token, input, syntax)
2035	re_token_t *token;
2036	re_string_t *input;
2037	reg_syntax_t syntax;
2038	{
2039	unsigned char c;
2040	if (re_string_eoi (input))
2041	{
2042	token->type = END_OF_RE;
2043	return 0;
2044	}
2045	c = re_string_peek_byte (input, 0);
2046	token->opr.c = c;
2047
2048	#ifdef RE_ENABLE_I18N
2049	if (input->mb_cur_max > 1 &&
2050	!re_string_first_byte (input, re_string_cur_idx (input)))
2051	{
2052	token->type = CHARACTER;
2053	return 1;
2054	}
2055	#endif /* RE_ENABLE_I18N */
2056
2057	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2058	&& re_string_cur_idx (input) + 1 < re_string_length (input))
2059	{
2060	/* In this case, '\' escape a character. */
2061	unsigned char c2;
2062	re_string_skip_bytes (input, 1);
2063	c2 = re_string_peek_byte (input, 0);
2064	token->opr.c = c2;
2065	token->type = CHARACTER;
2066	return 1;
2067	}
2068	if (c == '[') /* '[' is a special char in a bracket exps. */
2069	{
2070	unsigned char c2;
2071	int token_len;
2072	if (re_string_cur_idx (input) + 1 < re_string_length (input))
2073	c2 = re_string_peek_byte (input, 1);
2074	else
2075	c2 = 0;
2076	token->opr.c = c2;
2077	token_len = 2;
2078	switch (c2)
2079	{
2080	case '.':
2081	token->type = OP_OPEN_COLL_ELEM;
2082	break;
2083	case '=':
2084	token->type = OP_OPEN_EQUIV_CLASS;
2085	break;
2086	case ':':
2087	if (syntax & RE_CHAR_CLASSES)
2088	{
2089	token->type = OP_OPEN_CHAR_CLASS;
2090	break;
2091	}
2092	/* else fall through. */
2093	default:
2094	token->type = CHARACTER;
2095	token->opr.c = c;
2096	token_len = 1;
2097	break;
2098	}
2099	return token_len;
2100	}
2101	switch (c)
2102	{
2103	case '-':
2104	token->type = OP_CHARSET_RANGE;
2105	break;
2106	case ']':
2107	token->type = OP_CLOSE_BRACKET;
2108	break;
2109	case '^':
2110	token->type = OP_NON_MATCH_LIST;
2111	break;
2112	default:
2113	token->type = CHARACTER;
2114	}
2115	return 1;
2116	}
2117
2118
2119	/* Functions for parser. */
2120
2121	/* Entry point of the parser.
2122	Parse the regular expression REGEXP and return the structure tree.
2123	If an error is occured, ERR is set by error code, and return NULL.
2124	This function build the following tree, from regular expression <reg_exp>:
2125	CAT
2126	/ \
2127	/ \
2128	<reg_exp> EOR
2129
2130	CAT means concatenation.
2131	EOR means end of regular expression. */
2132
2133	static bin_tree_t *
2134	parse (regexp, preg, syntax, err)
2135	re_string_t *regexp;
2136	regex_t *preg;
2137	reg_syntax_t syntax;
2138	reg_errcode_t *err;
2139	{
2140	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2141	bin_tree_t tree, eor, *root;
2142	re_token_t current_token;
2143	dfa->syntax = syntax;
2144	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2145	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2146	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147	return NULL;
2148	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2149	if (tree != NULL)
2150	root = create_tree (dfa, tree, eor, CONCAT);
2151	else
2152	root = eor;
2153	if (BE (eor == NULL \|\| root == NULL, 0))
2154	{
2155	*err = REG_ESPACE;
2156	return NULL;
2157	}
2158	return root;
2159	}
2160
2161	/* This function build the following tree, from regular expression
2162	<branch1>\|<branch2>:
2163	ALT
2164	/ \
2165	/ \
2166	<branch1> <branch2>
2167
2168	ALT means alternative, which represents the operator `\|'. */
2169
2170	static bin_tree_t *
2171	parse_reg_exp (regexp, preg, token, syntax, nest, err)
2172	re_string_t *regexp;
2173	regex_t *preg;
2174	re_token_t *token;
2175	reg_syntax_t syntax;
2176	int nest;
2177	reg_errcode_t *err;
2178	{
2179	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2180	bin_tree_t tree, branch = NULL;
2181	tree = parse_branch (regexp, preg, token, syntax, nest, err);
2182	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183	return NULL;
2184
2185	while (token->type == OP_ALT)
2186	{
2187	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2188	if (token->type != OP_ALT && token->type != END_OF_RE
2189	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2190	{
2191	branch = parse_branch (regexp, preg, token, syntax, nest, err);
2192	if (BE (*err != REG_NOERROR && branch == NULL, 0))
2193	return NULL;
2194	}
2195	else
2196	branch = NULL;
2197	tree = create_tree (dfa, tree, branch, OP_ALT);
2198	if (BE (tree == NULL, 0))
2199	{
2200	*err = REG_ESPACE;
2201	return NULL;
2202	}
2203	}
2204	return tree;
2205	}
2206
2207	/* This function build the following tree, from regular expression
2208	<exp1><exp2>:
2209	CAT
2210	/ \
2211	/ \
2212	<exp1> <exp2>
2213
2214	CAT means concatenation. */
2215
2216	static bin_tree_t *
2217	parse_branch (regexp, preg, token, syntax, nest, err)
2218	re_string_t *regexp;
2219	regex_t *preg;
2220	re_token_t *token;
2221	reg_syntax_t syntax;
2222	int nest;
2223	reg_errcode_t *err;
2224	{
2225	bin_tree_t tree, exp;
2226	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2227	tree = parse_expression (regexp, preg, token, syntax, nest, err);
2228	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2229	return NULL;
2230
2231	while (token->type != OP_ALT && token->type != END_OF_RE
2232	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
2233	{
2234	exp = parse_expression (regexp, preg, token, syntax, nest, err);
2235	if (BE (*err != REG_NOERROR && exp == NULL, 0))
2236	{
2237	return NULL;
2238	}
2239	if (tree != NULL && exp != NULL)
2240	{
2241	tree = create_tree (dfa, tree, exp, CONCAT);
2242	if (tree == NULL)
2243	{
2244	*err = REG_ESPACE;
2245	return NULL;
2246	}
2247	}
2248	else if (tree == NULL)
2249	tree = exp;
2250	/* Otherwise exp == NULL, we don't need to create new tree. */
2251	}
2252	return tree;
2253	}
2254
2255	/* This function build the following tree, from regular expression a*:
2256	*
2257	\|
2258	a
2259	*/
2260
2261	static bin_tree_t *
2262	parse_expression (regexp, preg, token, syntax, nest, err)
2263	re_string_t *regexp;
2264	regex_t *preg;
2265	re_token_t *token;
2266	reg_syntax_t syntax;
2267	int nest;
2268	reg_errcode_t *err;
2269	{
2270	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2271	bin_tree_t *tree;
2272	switch (token->type)
2273	{
2274	case CHARACTER:
2275	tree = create_token_tree (dfa, NULL, NULL, token);
2276	if (BE (tree == NULL, 0))
2277	{
2278	*err = REG_ESPACE;
2279	return NULL;
2280	}
2281	#ifdef RE_ENABLE_I18N
2282	if (dfa->mb_cur_max > 1)
2283	{
2284	while (!re_string_eoi (regexp)
2285	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2286	{
2287	bin_tree_t *mbc_remain;
2288	fetch_token (token, regexp, syntax);
2289	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2290	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2291	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
2292	{
2293	*err = REG_ESPACE;
2294	return NULL;
2295	}
2296	}
2297	}
2298	#endif
2299	break;
2300	case OP_OPEN_SUBEXP:
2301	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2302	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2303	return NULL;
2304	break;
2305	case OP_OPEN_BRACKET:
2306	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2307	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2308	return NULL;
2309	break;
2310	case OP_BACK_REF:
2311	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2312	{
2313	*err = REG_ESUBREG;
2314	return NULL;
2315	}
2316	dfa->used_bkref_map \|= 1 << token->opr.idx;
2317	tree = create_token_tree (dfa, NULL, NULL, token);
2318	if (BE (tree == NULL, 0))
2319	{
2320	*err = REG_ESPACE;
2321	return NULL;
2322	}
2323	++dfa->nbackref;
2324	dfa->has_mb_node = 1;
2325	break;
2326	case OP_OPEN_DUP_NUM:
2327	if (syntax & RE_CONTEXT_INVALID_DUP)
2328	{
2329	*err = REG_BADRPT;
2330	return NULL;
2331	}
2332	/* FALLTHROUGH */
2333	case OP_DUP_ASTERISK:
2334	case OP_DUP_PLUS:
2335	case OP_DUP_QUESTION:
2336	if (syntax & RE_CONTEXT_INVALID_OPS)
2337	{
2338	*err = REG_BADRPT;
2339	return NULL;
2340	}
2341	else if (syntax & RE_CONTEXT_INDEP_OPS)
2342	{
2343	fetch_token (token, regexp, syntax);
2344	return parse_expression (regexp, preg, token, syntax, nest, err);
2345	}
2346	/* else fall through */
2347	case OP_CLOSE_SUBEXP:
2348	if ((token->type == OP_CLOSE_SUBEXP) &&
2349	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350	{
2351	*err = REG_ERPAREN;
2352	return NULL;
2353	}
2354	/* else fall through */
2355	case OP_CLOSE_DUP_NUM:
2356	/* We treat it as a normal character. */
2357
2358	/* Then we can these characters as normal characters. */
2359	token->type = CHARACTER;
2360	/* mb_partial and word_char bits should be initialized already
2361	by peek_token. */
2362	tree = create_token_tree (dfa, NULL, NULL, token);
2363	if (BE (tree == NULL, 0))
2364	{
2365	*err = REG_ESPACE;
2366	return NULL;
2367	}
2368	break;
2369	case ANCHOR:
2370	if ((token->opr.ctx_type
2371	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
2372	&& dfa->word_ops_used == 0)
2373	init_word_char (dfa);
2374	if (token->opr.ctx_type == WORD_DELIM
2375	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
2376	{
2377	bin_tree_t tree_first, tree_last;
2378	if (token->opr.ctx_type == WORD_DELIM)
2379	{
2380	token->opr.ctx_type = WORD_FIRST;
2381	tree_first = create_token_tree (dfa, NULL, NULL, token);
2382	token->opr.ctx_type = WORD_LAST;
2383	}
2384	else
2385	{
2386	token->opr.ctx_type = INSIDE_WORD;
2387	tree_first = create_token_tree (dfa, NULL, NULL, token);
2388	token->opr.ctx_type = INSIDE_NOTWORD;
2389	}
2390	tree_last = create_token_tree (dfa, NULL, NULL, token);
2391	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2392	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
2393	{
2394	*err = REG_ESPACE;
2395	return NULL;
2396	}
2397	}
2398	else
2399	{
2400	tree = create_token_tree (dfa, NULL, NULL, token);
2401	if (BE (tree == NULL, 0))
2402	{
2403	*err = REG_ESPACE;
2404	return NULL;
2405	}
2406	}
2407	/* We must return here, since ANCHORs can't be followed
2408	by repetition operators.
2409	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
2410	it must not be "<ANCHOR(^)><REPEAT()>". /
2411	fetch_token (token, regexp, syntax);
2412	return tree;
2413	case OP_PERIOD:
2414	tree = create_token_tree (dfa, NULL, NULL, token);
2415	if (BE (tree == NULL, 0))
2416	{
2417	*err = REG_ESPACE;
2418	return NULL;
2419	}
2420	if (dfa->mb_cur_max > 1)
2421	dfa->has_mb_node = 1;
2422	break;
2423	case OP_WORD:
2424	case OP_NOTWORD:
2425	tree = build_charclass_op (dfa, regexp->trans,
2426	(const char *) "alnum",
2427	(const char *) "_",
2428	token->type == OP_NOTWORD, err);
2429	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2430	return NULL;
2431	break;
2432	case OP_SPACE:
2433	case OP_NOTSPACE:
2434	tree = build_charclass_op (dfa, regexp->trans,
2435	(const char *) "space",
2436	(const char *) "",
2437	token->type == OP_NOTSPACE, err);
2438	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2439	return NULL;
2440	break;
2441	case OP_ALT:
2442	case END_OF_RE:
2443	return NULL;
2444	case BACK_SLASH:
2445	*err = REG_EESCAPE;
2446	return NULL;
2447	default:
2448	/* Must not happen? */
2449	#ifdef DEBUG
2450	assert (0);
2451	#endif
2452	return NULL;
2453	}
2454	fetch_token (token, regexp, syntax);
2455
2456	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
2457	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
2458	{
2459	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2460	if (BE (*err != REG_NOERROR && tree == NULL, 0))
2461	return NULL;
2462	/* In BRE consecutive duplications are not allowed. */
2463	if ((syntax & RE_CONTEXT_INVALID_DUP)
2464	&& (token->type == OP_DUP_ASTERISK
2465	\|\| token->type == OP_OPEN_DUP_NUM))
2466	{
2467	*err = REG_BADRPT;
2468	return NULL;
2469	}
2470	}
2471
2472	return tree;
2473	}
2474
2475	/* This function build the following tree, from regular expression
2476	(<reg_exp>):
2477	SUBEXP
2478	\|
2479	<reg_exp>
2480	*/
2481
2482	static bin_tree_t *
2483	parse_sub_exp (regexp, preg, token, syntax, nest, err)
2484	re_string_t *regexp;
2485	regex_t *preg;
2486	re_token_t *token;
2487	reg_syntax_t syntax;
2488	int nest;
2489	reg_errcode_t *err;
2490	{
2491	re_dfa_t dfa = (re_dfa_t ) preg->buffer;
2492	bin_tree_t *tree;
2493	size_t cur_nsub;
2494	cur_nsub = preg->re_nsub++;
2495
2496	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
2497
2498	/* The subexpression may be a null string. */
2499	if (token->type == OP_CLOSE_SUBEXP)
2500	tree = NULL;
2501	else
2502	{
2503	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2504	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2505	*err = REG_EPAREN;
2506	if (BE (*err != REG_NOERROR, 0))
2507	return NULL;
2508	}
2509	dfa->completed_bkref_map \|= 1 << cur_nsub;
2510
2511	tree = create_tree (dfa, tree, NULL, SUBEXP);
2512	if (BE (tree == NULL, 0))
2513	{
2514	*err = REG_ESPACE;
2515	return NULL;
2516	}
2517	tree->token.opr.idx = cur_nsub;
2518	return tree;
2519	}
2520
2521	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
2522
2523	static bin_tree_t *
2524	parse_dup_op (elem, regexp, dfa, token, syntax, err)
2525	bin_tree_t *elem;
2526	re_string_t *regexp;
2527	re_dfa_t *dfa;
2528	re_token_t *token;
2529	reg_syntax_t syntax;
2530	reg_errcode_t *err;
2531	{
2532	bin_tree_t tree = NULL, old_tree = NULL;
2533	int i, start, end, start_idx = re_string_cur_idx (regexp);
2534	#ifndef RE_TOKEN_INIT_BUG
2535	re_token_t start_token = *token;
2536	#else
2537	re_token_t start_token;
2538
2539	memcpy ((void ) &start_token, (void ) token, sizeof start_token);
2540	#endif
2541
2542	if (token->type == OP_OPEN_DUP_NUM)
2543	{
2544	end = 0;
2545	start = fetch_number (regexp, token, syntax);
2546	if (start == -1)
2547	{
2548	if (token->type == CHARACTER && token->opr.c == ',')
2549	start = 0; /* We treat "{,m}" as "{0,m}". */
2550	else
2551	{
2552	err = REG_BADBR; / <re>{} is invalid. */
2553	return NULL;
2554	}
2555	}
2556	if (BE (start != -2, 1))
2557	{
2558	/* We treat "{n}" as "{n,n}". */
2559	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2560	: ((token->type == CHARACTER && token->opr.c == ',')
2561	? fetch_number (regexp, token, syntax) : -2));
2562	}
2563	if (BE (start == -2 \|\| end == -2, 0))
2564	{
2565	/* Invalid sequence. */
2566	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2567	{
2568	if (token->type == END_OF_RE)
2569	*err = REG_EBRACE;
2570	else
2571	*err = REG_BADBR;
2572
2573	return NULL;
2574	}
2575
2576	/* If the syntax bit is set, rollback. */
2577	re_string_set_index (regexp, start_idx);
2578	*token = start_token;
2579	token->type = CHARACTER;
2580	/* mb_partial and word_char bits should be already initialized by
2581	peek_token. */
2582	return elem;
2583	}
2584
2585	if (BE (end != -1 && start > end, 0))
2586	{
2587	/* First number greater than second. */
2588	*err = REG_BADBR;
2589	return NULL;
2590	}
2591	}
2592	else
2593	{
2594	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2595	end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2596	}
2597
2598	fetch_token (token, regexp, syntax);
2599
2600	if (BE (elem == NULL, 0))
2601	return NULL;
2602	if (BE (start == 0 && end == 0, 0))
2603	{
2604	postorder (elem, free_tree, NULL);
2605	return NULL;
2606	}
2607
2608	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2609	if (BE (start > 0, 0))
2610	{
2611	tree = elem;
2612	for (i = 2; i <= start; ++i)
2613	{
2614	elem = duplicate_tree (elem, dfa);
2615	tree = create_tree (dfa, tree, elem, CONCAT);
2616	if (BE (elem == NULL \|\| tree == NULL, 0))
2617	goto parse_dup_op_espace;
2618	}
2619
2620	if (start == end)
2621	return tree;
2622
2623	/* Duplicate ELEM before it is marked optional. */
2624	elem = duplicate_tree (elem, dfa);
2625	old_tree = tree;
2626	}
2627	else
2628	old_tree = NULL;
2629
2630	if (elem->token.type == SUBEXP)
2631	postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2632
2633	tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2634	if (BE (tree == NULL, 0))
2635	goto parse_dup_op_espace;
2636
2637	/* This loop is actually executed only when end != -1,
2638	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2639	already created the start+1-th copy. */
2640	for (i = start + 2; i <= end; ++i)
2641	{
2642	elem = duplicate_tree (elem, dfa);
2643	tree = create_tree (dfa, tree, elem, CONCAT);
2644	if (BE (elem == NULL \|\| tree == NULL, 0))
2645	goto parse_dup_op_espace;
2646
2647	tree = create_tree (dfa, tree, NULL, OP_ALT);
2648	if (BE (tree == NULL, 0))
2649	goto parse_dup_op_espace;
2650	}
2651
2652	if (old_tree)
2653	tree = create_tree (dfa, old_tree, tree, CONCAT);
2654
2655	return tree;
2656
2657	parse_dup_op_espace:
2658	*err = REG_ESPACE;
2659	return NULL;
2660	}
2661
2662	/* Size of the names for collating symbol/equivalence_class/character_class.
2663	I'm not sure, but maybe enough. */
2664	#define BRACKET_NAME_BUF_SIZE 32
2665
2666	#ifndef _LIBC
2667	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2668	Build the range expression which starts from START_ELEM, and ends
2669	at END_ELEM. The result are written to MBCSET and SBCSET.
2670	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2671	mbcset->range_ends, is a pointer argument sinse we may
2672	update it. */
2673
2674	static reg_errcode_t
2675	# ifdef RE_ENABLE_I18N
2676	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2677	re_charset_t *mbcset;
2678	int *range_alloc;
2679	# else /* not RE_ENABLE_I18N */
2680	build_range_exp (sbcset, start_elem, end_elem)
2681	# endif /* not RE_ENABLE_I18N */
2682	re_bitset_ptr_t sbcset;
2683	bracket_elem_t start_elem, end_elem;
2684	{
2685	unsigned int start_ch, end_ch;
2686	/* Equivalence Classes and Character Classes can't be a range start/end. */
2687	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2688	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2689	0))
2690	return REG_ERANGE;
2691
2692	/* We can handle no multi character collating elements without libc
2693	support. */
2694	if (BE ((start_elem->type == COLL_SYM
2695	&& strlen ((char *) start_elem->opr.name) > 1)
2696	\|\| (end_elem->type == COLL_SYM
2697	&& strlen ((char *) end_elem->opr.name) > 1), 0))
2698	return REG_ECOLLATE;
2699
2700	# ifdef RE_ENABLE_I18N
2701	{
2702	wchar_t wc, start_wc, end_wc;
2703	wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2704
2705	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2706	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2707	: 0));
2708	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2709	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2710	: 0));
2711	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
2712	? __btowc (start_ch) : start_elem->opr.wch);
2713	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
2714	? __btowc (end_ch) : end_elem->opr.wch);
2715	if (start_wc == WEOF \|\| end_wc == WEOF)
2716	return REG_ECOLLATE;
2717	cmp_buf[0] = start_wc;
2718	cmp_buf[4] = end_wc;
2719	if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2720	return REG_ERANGE;
2721
2722	/* Got valid collation sequence values, add them as a new entry.
2723	However, for !_LIBC we have no collation elements: if the
2724	character set is single byte, the single byte character set
2725	that we build below suffices. parse_bracket_exp passes
2726	no MBCSET if dfa->mb_cur_max == 1. */
2727	if (mbcset)
2728	{
2729	/* Check the space of the arrays. */
2730	if (BE (*range_alloc == mbcset->nranges, 0))
2731	{
2732	/* There is not enough space, need realloc. */
2733	wchar_t new_array_start, new_array_end;
2734	int new_nranges;
2735
2736	/* +1 in case of mbcset->nranges is 0. */
2737	new_nranges = 2 * mbcset->nranges + 1;
2738	/* Use realloc since mbcset->range_starts and mbcset->range_ends
2739	are NULL if range_alloc == 0. /
2740	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2741	new_nranges);
2742	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2743	new_nranges);
2744
2745	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2746	return REG_ESPACE;
2747
2748	mbcset->range_starts = new_array_start;
2749	mbcset->range_ends = new_array_end;
2750	*range_alloc = new_nranges;
2751	}
2752
2753	mbcset->range_starts[mbcset->nranges] = start_wc;
2754	mbcset->range_ends[mbcset->nranges++] = end_wc;
2755	}
2756
2757	/* Build the table for single byte characters. */
2758	for (wc = 0; wc < SBC_MAX; ++wc)
2759	{
2760	cmp_buf[2] = wc;
2761	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2762	&& wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2763	bitset_set (sbcset, wc);
2764	}
2765	}
2766	# else /* not RE_ENABLE_I18N */
2767	{
2768	unsigned int ch;
2769	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2770	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2771	: 0));
2772	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2773	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2774	: 0));
2775	if (start_ch > end_ch)
2776	return REG_ERANGE;
2777	/* Build the table for single byte characters. */
2778	for (ch = 0; ch < SBC_MAX; ++ch)
2779	if (start_ch <= ch && ch <= end_ch)
2780	bitset_set (sbcset, ch);
2781	}
2782	# endif /* not RE_ENABLE_I18N */
2783	return REG_NOERROR;
2784	}
2785	#endif /* not _LIBC */
2786
2787	#ifndef _LIBC
2788	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2789	Build the collating element which is represented by NAME.
2790	The result are written to MBCSET and SBCSET.
2791	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2792	pointer argument since we may update it. */
2793
2794	static reg_errcode_t
2795	# ifdef RE_ENABLE_I18N
2796	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2797	re_charset_t *mbcset;
2798	int *coll_sym_alloc;
2799	# else /* not RE_ENABLE_I18N */
2800	build_collating_symbol (sbcset, name)
2801	# endif /* not RE_ENABLE_I18N */
2802	re_bitset_ptr_t sbcset;
2803	const unsigned char *name;
2804	{
2805	size_t name_len = strlen ((const char *) name);
2806	if (BE (name_len != 1, 0))
2807	return REG_ECOLLATE;
2808	else
2809	{
2810	bitset_set (sbcset, name[0]);
2811	return REG_NOERROR;
2812	}
2813	}
2814	#endif /* not _LIBC */
2815
2816	/* This function parse bracket expression like "[abc]", "[a-c]",
2817	"[[.a-a.]]" etc. */
2818
2819	static bin_tree_t *
2820	parse_bracket_exp (regexp, dfa, token, syntax, err)
2821	re_string_t *regexp;
2822	re_dfa_t *dfa;
2823	re_token_t *token;
2824	reg_syntax_t syntax;
2825	reg_errcode_t *err;
2826	{
2827	#ifdef _LIBC
2828	const unsigned char *collseqmb;
2829	const char *collseqwc;
2830	uint32_t nrules;
2831	int32_t table_size;
2832	const int32_t *symb_table;
2833	const unsigned char *extra;
2834
2835	/* Local function for parse_bracket_exp used in _LIBC environement.
2836	Seek the collating symbol entry correspondings to NAME.
2837	Return the index of the symbol in the SYMB_TABLE. */
2838
2839	auto inline int32_t
2840	__attribute ((always_inline))
2841	seek_collating_symbol_entry (name, name_len)
2842	const unsigned char *name;
2843	size_t name_len;
2844	{
2845	int32_t hash = elem_hash ((const char *) name, name_len);
2846	int32_t elem = hash % table_size;
2847	int32_t second = hash % (table_size - 2);
2848	while (symb_table[2 * elem] != 0)
2849	{
2850	/* First compare the hashing value. */
2851	if (symb_table[2 * elem] == hash
2852	/* Compare the length of the name. */
2853	&& name_len == extra[symb_table[2 * elem + 1]]
2854	/* Compare the name. */
2855	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2856	name_len) == 0)
2857	{
2858	/* Yep, this is the entry. */
2859	break;
2860	}
2861
2862	/* Next entry. */
2863	elem += second;
2864	}
2865	return elem;
2866	}
2867
2868	/* Local function for parse_bracket_exp used in _LIBC environement.
2869	Look up the collation sequence value of BR_ELEM.
2870	Return the value if succeeded, UINT_MAX otherwise. */
2871
2872	auto inline unsigned int
2873	__attribute ((always_inline))
2874	lookup_collation_sequence_value (br_elem)
2875	bracket_elem_t *br_elem;
2876	{
2877	if (br_elem->type == SB_CHAR)
2878	{
2879	/*
2880	if (MB_CUR_MAX == 1)
2881	*/
2882	if (nrules == 0)
2883	return collseqmb[br_elem->opr.ch];
2884	else
2885	{
2886	wint_t wc = __btowc (br_elem->opr.ch);
2887	return __collseq_table_lookup (collseqwc, wc);
2888	}
2889	}
2890	else if (br_elem->type == MB_CHAR)
2891	{
2892	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893	}
2894	else if (br_elem->type == COLL_SYM)
2895	{
2896	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897	if (nrules != 0)
2898	{
2899	int32_t elem, idx;
2900	elem = seek_collating_symbol_entry (br_elem->opr.name,
2901	sym_name_len);
2902	if (symb_table[2 * elem] != 0)
2903	{
2904	/* We found the entry. */
2905	idx = symb_table[2 * elem + 1];
2906	/* Skip the name of collating element name. */
2907	idx += 1 + extra[idx];
2908	/* Skip the byte sequence of the collating element. */
2909	idx += 1 + extra[idx];
2910	/* Adjust for the alignment. */
2911	idx = (idx + 3) & ~3;
2912	/* Skip the multibyte collation sequence value. */
2913	idx += sizeof (unsigned int);
2914	/* Skip the wide char sequence of the collating element. */
2915	idx += sizeof (unsigned int) *
2916	(1 + (unsigned int ) (extra + idx));
2917	/* Return the collation sequence value. */
2918	return (unsigned int ) (extra + idx);
2919	}
2920	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2921	{
2922	/* No valid character. Match it as a single byte
2923	character. */
2924	return collseqmb[br_elem->opr.name[0]];
2925	}
2926	}
2927	else if (sym_name_len == 1)
2928	return collseqmb[br_elem->opr.name[0]];
2929	}
2930	return UINT_MAX;
2931	}
2932
2933	/* Local function for parse_bracket_exp used in _LIBC environement.
2934	Build the range expression which starts from START_ELEM, and ends
2935	at END_ELEM. The result are written to MBCSET and SBCSET.
2936	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937	mbcset->range_ends, is a pointer argument sinse we may
2938	update it. */
2939
2940	auto inline reg_errcode_t
2941	__attribute ((always_inline))
2942	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2943	re_charset_t *mbcset;
2944	int *range_alloc;
2945	re_bitset_ptr_t sbcset;
2946	bracket_elem_t start_elem, end_elem;
2947	{
2948	unsigned int ch;
2949	uint32_t start_collseq;
2950	uint32_t end_collseq;
2951
2952	/* Equivalence Classes and Character Classes can't be a range
2953	start/end. */
2954	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
2955	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
2956	0))
2957	return REG_ERANGE;
2958
2959	start_collseq = lookup_collation_sequence_value (start_elem);
2960	end_collseq = lookup_collation_sequence_value (end_elem);
2961	/* Check start/end collation sequence values. */
2962	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
2963	return REG_ECOLLATE;
2964	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2965	return REG_ERANGE;
2966
2967	/* Got valid collation sequence values, add them as a new entry.
2968	However, if we have no collation elements, and the character set
2969	is single byte, the single byte character set that we
2970	build below suffices. */
2971	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
2972	{
2973	/* Check the space of the arrays. */
2974	if (BE (*range_alloc == mbcset->nranges, 0))
2975	{
2976	/* There is not enough space, need realloc. */
2977	uint32_t *new_array_start;
2978	uint32_t *new_array_end;
2979	int new_nranges;
2980
2981	/* +1 in case of mbcset->nranges is 0. */
2982	new_nranges = 2 * mbcset->nranges + 1;
2983	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2984	new_nranges);
2985	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2986	new_nranges);
2987
2988	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
2989	return REG_ESPACE;
2990
2991	mbcset->range_starts = new_array_start;
2992	mbcset->range_ends = new_array_end;
2993	*range_alloc = new_nranges;
2994	}
2995
2996	mbcset->range_starts[mbcset->nranges] = start_collseq;
2997	mbcset->range_ends[mbcset->nranges++] = end_collseq;
2998	}
2999
3000	/* Build the table for single byte characters. */
3001	for (ch = 0; ch < SBC_MAX; ch++)
3002	{
3003	uint32_t ch_collseq;
3004	/*
3005	if (MB_CUR_MAX == 1)
3006	*/
3007	if (nrules == 0)
3008	ch_collseq = collseqmb[ch];
3009	else
3010	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3011	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3012	bitset_set (sbcset, ch);
3013	}
3014	return REG_NOERROR;
3015	}
3016
3017	/* Local function for parse_bracket_exp used in _LIBC environement.
3018	Build the collating element which is represented by NAME.
3019	The result are written to MBCSET and SBCSET.
3020	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3021	pointer argument sinse we may update it. */
3022
3023	auto inline reg_errcode_t
3024	__attribute ((always_inline))
3025	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3026	re_charset_t *mbcset;
3027	int *coll_sym_alloc;
3028	re_bitset_ptr_t sbcset;
3029	const unsigned char *name;
3030	{
3031	int32_t elem, idx;
3032	size_t name_len = strlen ((const char *) name);
3033	if (nrules != 0)
3034	{
3035	elem = seek_collating_symbol_entry (name, name_len);
3036	if (symb_table[2 * elem] != 0)
3037	{
3038	/* We found the entry. */
3039	idx = symb_table[2 * elem + 1];
3040	/* Skip the name of collating element name. */
3041	idx += 1 + extra[idx];
3042	}
3043	else if (symb_table[2 * elem] == 0 && name_len == 1)
3044	{
3045	/* No valid character, treat it as a normal
3046	character. */
3047	bitset_set (sbcset, name[0]);
3048	return REG_NOERROR;
3049	}
3050	else
3051	return REG_ECOLLATE;
3052
3053	/* Got valid collation sequence, add it as a new entry. */
3054	/* Check the space of the arrays. */
3055	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3056	{
3057	/* Not enough, realloc it. */
3058	/* +1 in case of mbcset->ncoll_syms is 0. */
3059	int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3060	/* Use realloc since mbcset->coll_syms is NULL
3061	if alloc == 0. /
3062	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3063	new_coll_sym_alloc);
3064	if (BE (new_coll_syms == NULL, 0))
3065	return REG_ESPACE;
3066	mbcset->coll_syms = new_coll_syms;
3067	*coll_sym_alloc = new_coll_sym_alloc;
3068	}
3069	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3070	return REG_NOERROR;
3071	}
3072	else
3073	{
3074	if (BE (name_len != 1, 0))
3075	return REG_ECOLLATE;
3076	else
3077	{
3078	bitset_set (sbcset, name[0]);
3079	return REG_NOERROR;
3080	}
3081	}
3082	}
3083	#endif
3084
3085	re_token_t br_token;
3086	re_bitset_ptr_t sbcset;
3087	#ifdef RE_ENABLE_I18N
3088	re_charset_t *mbcset;
3089	int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3090	int equiv_class_alloc = 0, char_class_alloc = 0;
3091	#endif /* not RE_ENABLE_I18N */
3092	int non_match = 0;
3093	bin_tree_t *work_tree;
3094	int token_len;
3095	int first_round = 1;
3096	#ifdef _LIBC
3097	collseqmb = (const unsigned char *)
3098	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3099	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3100	if (nrules)
3101	{
3102	/*
3103	if (MB_CUR_MAX > 1)
3104	*/
3105	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3106	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3107	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3108	_NL_COLLATE_SYMB_TABLEMB);
3109	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3110	_NL_COLLATE_SYMB_EXTRAMB);
3111	}
3112	#endif
3113	sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3114	#ifdef RE_ENABLE_I18N
3115	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3116	#endif /* RE_ENABLE_I18N */
3117	#ifdef RE_ENABLE_I18N
3118	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3119	#else
3120	if (BE (sbcset == NULL, 0))
3121	#endif /* RE_ENABLE_I18N */
3122	{
3123	*err = REG_ESPACE;
3124	return NULL;
3125	}
3126
3127	token_len = peek_token_bracket (token, regexp, syntax);
3128	if (BE (token->type == END_OF_RE, 0))
3129	{
3130	*err = REG_BADPAT;
3131	goto parse_bracket_exp_free_return;
3132	}
3133	if (token->type == OP_NON_MATCH_LIST)
3134	{
3135	#ifdef RE_ENABLE_I18N
3136	mbcset->non_match = 1;
3137	#endif /* not RE_ENABLE_I18N */
3138	non_match = 1;
3139	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3140	bitset_set (sbcset, '\0');
3141	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3142	token_len = peek_token_bracket (token, regexp, syntax);
3143	if (BE (token->type == END_OF_RE, 0))
3144	{
3145	*err = REG_BADPAT;
3146	goto parse_bracket_exp_free_return;
3147	}
3148	}
3149
3150	/* We treat the first ']' as a normal character. */
3151	if (token->type == OP_CLOSE_BRACKET)
3152	token->type = CHARACTER;
3153
3154	while (1)
3155	{
3156	bracket_elem_t start_elem, end_elem;
3157	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3158	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3159	reg_errcode_t ret;
3160	int token_len2 = 0, is_range_exp = 0;
3161	re_token_t token2;
3162
3163	start_elem.opr.name = start_name_buf;
3164	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165	syntax, first_round);
3166	if (BE (ret != REG_NOERROR, 0))
3167	{
3168	*err = ret;
3169	goto parse_bracket_exp_free_return;
3170	}
3171	first_round = 0;
3172
3173	/* Get information about the next token. We need it in any case. */
3174	token_len = peek_token_bracket (token, regexp, syntax);
3175
3176	/* Do not check for ranges if we know they are not allowed. */
3177	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178	{
3179	if (BE (token->type == END_OF_RE, 0))
3180	{
3181	*err = REG_EBRACK;
3182	goto parse_bracket_exp_free_return;
3183	}
3184	if (token->type == OP_CHARSET_RANGE)
3185	{
3186	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3187	token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188	if (BE (token2.type == END_OF_RE, 0))
3189	{
3190	*err = REG_EBRACK;
3191	goto parse_bracket_exp_free_return;
3192	}
3193	if (token2.type == OP_CLOSE_BRACKET)
3194	{
3195	/* We treat the last '-' as a normal character. */
3196	re_string_skip_bytes (regexp, -token_len);
3197	token->type = CHARACTER;
3198	}
3199	else
3200	is_range_exp = 1;
3201	}
3202	}
3203
3204	if (is_range_exp == 1)
3205	{
3206	end_elem.opr.name = end_name_buf;
3207	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208	dfa, syntax, 1);
3209	if (BE (ret != REG_NOERROR, 0))
3210	{
3211	*err = ret;
3212	goto parse_bracket_exp_free_return;
3213	}
3214
3215	token_len = peek_token_bracket (token, regexp, syntax);
3216
3217	#ifdef _LIBC
3218	*err = build_range_exp (sbcset, mbcset, &range_alloc,
3219	&start_elem, &end_elem);
3220	#else
3221	# ifdef RE_ENABLE_I18N
3222	*err = build_range_exp (sbcset,
3223	dfa->mb_cur_max > 1 ? mbcset : NULL,
3224	&range_alloc, &start_elem, &end_elem);
3225	# else
3226	*err = build_range_exp (sbcset, &start_elem, &end_elem);
3227	# endif
3228	#endif /* RE_ENABLE_I18N */
3229	if (BE (*err != REG_NOERROR, 0))
3230	goto parse_bracket_exp_free_return;
3231	}
3232	else
3233	{
3234	switch (start_elem.type)
3235	{
3236	case SB_CHAR:
3237	bitset_set (sbcset, start_elem.opr.ch);
3238	break;
3239	#ifdef RE_ENABLE_I18N
3240	case MB_CHAR:
3241	/* Check whether the array has enough space. */
3242	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243	{
3244	wchar_t *new_mbchars;
3245	/* Not enough, realloc it. */
3246	/* +1 in case of mbcset->nmbchars is 0. */
3247	mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248	/* Use realloc since array is NULL if alloc == 0. /
3249	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250	mbchar_alloc);
3251	if (BE (new_mbchars == NULL, 0))
3252	goto parse_bracket_exp_espace;
3253	mbcset->mbchars = new_mbchars;
3254	}
3255	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256	break;
3257	#endif /* RE_ENABLE_I18N */
3258	case EQUIV_CLASS:
3259	*err = build_equiv_class (sbcset,
3260	#ifdef RE_ENABLE_I18N
3261	mbcset, &equiv_class_alloc,
3262	#endif /* RE_ENABLE_I18N */
3263	start_elem.opr.name);
3264	if (BE (*err != REG_NOERROR, 0))
3265	goto parse_bracket_exp_free_return;
3266	break;
3267	case COLL_SYM:
3268	*err = build_collating_symbol (sbcset,
3269	#ifdef RE_ENABLE_I18N
3270	mbcset, &coll_sym_alloc,
3271	#endif /* RE_ENABLE_I18N */
3272	start_elem.opr.name);
3273	if (BE (*err != REG_NOERROR, 0))
3274	goto parse_bracket_exp_free_return;
3275	break;
3276	case CHAR_CLASS:
3277	*err = build_charclass (regexp->trans, sbcset,
3278	#ifdef RE_ENABLE_I18N
3279	mbcset, &char_class_alloc,
3280	#endif /* RE_ENABLE_I18N */
3281	(const char *) start_elem.opr.name, syntax);
3282	if (BE (*err != REG_NOERROR, 0))
3283	goto parse_bracket_exp_free_return;
3284	break;
3285	default:
3286	assert (0);
3287	break;
3288	}
3289	}
3290	if (BE (token->type == END_OF_RE, 0))
3291	{
3292	*err = REG_EBRACK;
3293	goto parse_bracket_exp_free_return;
3294	}
3295	if (token->type == OP_CLOSE_BRACKET)
3296	break;
3297	}
3298
3299	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3300
3301	/* If it is non-matching list. */
3302	if (non_match)
3303	bitset_not (sbcset);
3304
3305	#ifdef RE_ENABLE_I18N
3306	/* Ensure only single byte characters are set. */
3307	if (dfa->mb_cur_max > 1)
3308	bitset_mask (sbcset, dfa->sb_char);
3309
3310	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
3311	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3312	\|\| mbcset->non_match)))
3313	{
3314	bin_tree_t *mbc_tree;
3315	int sbc_idx;
3316	/* Build a tree for complex bracket. */
3317	dfa->has_mb_node = 1;
3318	br_token.type = COMPLEX_BRACKET;
3319	br_token.opr.mbcset = mbcset;
3320	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3321	if (BE (mbc_tree == NULL, 0))
3322	goto parse_bracket_exp_espace;
3323	for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3324	if (sbcset[sbc_idx])
3325	break;
3326	/* If there are no bits set in sbcset, there is no point
3327	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3328	if (sbc_idx < BITSET_UINTS)
3329	{
3330	/* Build a tree for simple bracket. */
3331	br_token.type = SIMPLE_BRACKET;
3332	br_token.opr.sbcset = sbcset;
3333	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3334	if (BE (work_tree == NULL, 0))
3335	goto parse_bracket_exp_espace;
3336
3337	/* Then join them by ALT node. */
3338	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3339	if (BE (work_tree == NULL, 0))
3340	goto parse_bracket_exp_espace;
3341	}
3342	else
3343	{
3344	re_free (sbcset);
3345	work_tree = mbc_tree;
3346	}
3347	}
3348	else
3349	#endif /* not RE_ENABLE_I18N */
3350	{
3351	#ifdef RE_ENABLE_I18N
3352	free_charset (mbcset);
3353	#endif
3354	/* Build a tree for simple bracket. */
3355	br_token.type = SIMPLE_BRACKET;
3356	br_token.opr.sbcset = sbcset;
3357	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3358	if (BE (work_tree == NULL, 0))
3359	goto parse_bracket_exp_espace;
3360	}
3361	return work_tree;
3362
3363	parse_bracket_exp_espace:
3364	*err = REG_ESPACE;
3365	parse_bracket_exp_free_return:
3366	re_free (sbcset);
3367	#ifdef RE_ENABLE_I18N
3368	free_charset (mbcset);
3369	#endif /* RE_ENABLE_I18N */
3370	return NULL;
3371	}
3372
3373	/* Parse an element in the bracket expression. */
3374
3375	static reg_errcode_t
3376	parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3377	accept_hyphen)
3378	bracket_elem_t *elem;
3379	re_string_t *regexp;
3380	re_token_t *token;
3381	int token_len;
3382	re_dfa_t *dfa;
3383	reg_syntax_t syntax;
3384	int accept_hyphen;
3385	{
3386	#ifdef RE_ENABLE_I18N
3387	int cur_char_size;
3388	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3389	if (cur_char_size > 1)
3390	{
3391	elem->type = MB_CHAR;
3392	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3393	re_string_skip_bytes (regexp, cur_char_size);
3394	return REG_NOERROR;
3395	}
3396	#endif /* RE_ENABLE_I18N */
3397	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3398	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
3399	\|\| token->type == OP_OPEN_EQUIV_CLASS)
3400	return parse_bracket_symbol (elem, regexp, token);
3401	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3402	{
3403	/* A '-' must only appear as anything but a range indicator before
3404	the closing bracket. Everything else is an error. */
3405	re_token_t token2;
3406	(void) peek_token_bracket (&token2, regexp, syntax);
3407	if (token2.type != OP_CLOSE_BRACKET)
3408	/* The actual error value is not standardized since this whole
3409	case is undefined. But ERANGE makes good sense. */
3410	return REG_ERANGE;
3411	}
3412	elem->type = SB_CHAR;
3413	elem->opr.ch = token->opr.c;
3414	return REG_NOERROR;
3415	}
3416
3417	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3418	such as [:<character_class>:], [.<collating_element>.], and
3419	[=<equivalent_class>=]. */
3420
3421	static reg_errcode_t
3422	parse_bracket_symbol (elem, regexp, token)
3423	bracket_elem_t *elem;
3424	re_string_t *regexp;
3425	re_token_t *token;
3426	{
3427	unsigned char ch, delim = token->opr.c;
3428	int i = 0;
3429	if (re_string_eoi(regexp))
3430	return REG_EBRACK;
3431	for (;; ++i)
3432	{
3433	if (i >= BRACKET_NAME_BUF_SIZE)
3434	return REG_EBRACK;
3435	if (token->type == OP_OPEN_CHAR_CLASS)
3436	ch = re_string_fetch_byte_case (regexp);
3437	else
3438	ch = re_string_fetch_byte (regexp);
3439	if (re_string_eoi(regexp))
3440	return REG_EBRACK;
3441	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3442	break;
3443	elem->opr.name[i] = ch;
3444	}
3445	re_string_skip_bytes (regexp, 1);
3446	elem->opr.name[i] = '\0';
3447	switch (token->type)
3448	{
3449	case OP_OPEN_COLL_ELEM:
3450	elem->type = COLL_SYM;
3451	break;
3452	case OP_OPEN_EQUIV_CLASS:
3453	elem->type = EQUIV_CLASS;
3454	break;
3455	case OP_OPEN_CHAR_CLASS:
3456	elem->type = CHAR_CLASS;
3457	break;
3458	default:
3459	break;
3460	}
3461	return REG_NOERROR;
3462	}
3463
3464	/* Helper function for parse_bracket_exp.
3465	Build the equivalence class which is represented by NAME.
3466	The result are written to MBCSET and SBCSET.
3467	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3468	is a pointer argument sinse we may update it. */
3469
3470	static reg_errcode_t
3471	#ifdef RE_ENABLE_I18N
3472	build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3473	re_charset_t *mbcset;
3474	int *equiv_class_alloc;
3475	#else /* not RE_ENABLE_I18N */
3476	build_equiv_class (sbcset, name)
3477	#endif /* not RE_ENABLE_I18N */
3478	re_bitset_ptr_t sbcset;
3479	const unsigned char *name;
3480	{
3481	#if defined _LIBC
3482	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3483	if (nrules != 0)
3484	{
3485	const int32_t table, indirect;
3486	const unsigned char weights, extra, *cp;
3487	unsigned char char_buf[2];
3488	int32_t idx1, idx2;
3489	unsigned int ch;
3490	size_t len;
3491	/* This #include defines a local function! */
3492	# include <locale/weight.h>
3493	/* Calculate the index for equivalence class. */
3494	cp = name;
3495	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3496	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3497	_NL_COLLATE_WEIGHTMB);
3498	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3499	_NL_COLLATE_EXTRAMB);
3500	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3501	_NL_COLLATE_INDIRECTMB);
3502	idx1 = findidx (&cp);
3503	if (BE (idx1 == 0 \|\| cp < name + strlen ((const char *) name), 0))
3504	/* This isn't a valid character. */
3505	return REG_ECOLLATE;
3506
3507	/* Build single byte matcing table for this equivalence class. */
3508	char_buf[1] = (unsigned char) '\0';
3509	len = weights[idx1];
3510	for (ch = 0; ch < SBC_MAX; ++ch)
3511	{
3512	char_buf[0] = ch;
3513	cp = char_buf;
3514	idx2 = findidx (&cp);
3515	/*
3516	idx2 = table[ch];
3517	*/
3518	if (idx2 == 0)
3519	/* This isn't a valid character. */
3520	continue;
3521	if (len == weights[idx2])
3522	{
3523	int cnt = 0;
3524	while (cnt <= len &&
3525	weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3526	++cnt;
3527
3528	if (cnt > len)
3529	bitset_set (sbcset, ch);
3530	}
3531	}
3532	/* Check whether the array has enough space. */
3533	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3534	{
3535	/* Not enough, realloc it. */
3536	/* +1 in case of mbcset->nequiv_classes is 0. */
3537	int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3538	/* Use realloc since the array is NULL if alloc == 0. /
3539	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3540	int32_t,
3541	new_equiv_class_alloc);
3542	if (BE (new_equiv_classes == NULL, 0))
3543	return REG_ESPACE;
3544	mbcset->equiv_classes = new_equiv_classes;
3545	*equiv_class_alloc = new_equiv_class_alloc;
3546	}
3547	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3548	}
3549	else
3550	#endif /* _LIBC */
3551	{
3552	if (BE (strlen ((const char *) name) != 1, 0))
3553	return REG_ECOLLATE;
3554	bitset_set (sbcset, *name);
3555	}
3556	return REG_NOERROR;
3557	}
3558
3559	/* Helper function for parse_bracket_exp.
3560	Build the character class which is represented by NAME.
3561	The result are written to MBCSET and SBCSET.
3562	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3563	is a pointer argument sinse we may update it. */
3564
3565	static reg_errcode_t
3566	#ifdef RE_ENABLE_I18N
3567	build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3568	re_charset_t *mbcset;
3569	int *char_class_alloc;
3570	#else /* not RE_ENABLE_I18N */
3571	build_charclass (trans, sbcset, class_name, syntax)
3572	#endif /* not RE_ENABLE_I18N */
3573	unsigned RE_TRANSLATE_TYPE trans;
3574	re_bitset_ptr_t sbcset;
3575	const char *class_name;
3576	reg_syntax_t syntax;
3577	{
3578	int i;
3579
3580	/* In case of REG_ICASE "upper" and "lower" match the both of
3581	upper and lower cases. */
3582	if ((syntax & RE_ICASE)
3583	&& (strcmp (class_name, "upper") == 0 \|\| strcmp (class_name, "lower") == 0))
3584	class_name = "alpha";
3585
3586	#ifdef RE_ENABLE_I18N
3587	/* Check the space of the arrays. */
3588	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3589	{
3590	/* Not enough, realloc it. */
3591	/* +1 in case of mbcset->nchar_classes is 0. */
3592	int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3593	/* Use realloc since array is NULL if alloc == 0. /
3594	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3595	new_char_class_alloc);
3596	if (BE (new_char_classes == NULL, 0))
3597	return REG_ESPACE;
3598	mbcset->char_classes = new_char_classes;
3599	*char_class_alloc = new_char_class_alloc;
3600	}
3601	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3602	#endif /* RE_ENABLE_I18N */
3603
3604	#define BUILD_CHARCLASS_LOOP(ctype_func) \
3605	for (i = 0; i < SBC_MAX; ++i) \
3606	{ \
3607	if (ctype_func (i)) \
3608	{ \
3609	int ch = trans ? trans[i] : i; \
3610	bitset_set (sbcset, ch); \
3611	} \
3612	}
3613
3614	if (strcmp (class_name, "alnum") == 0)
3615	BUILD_CHARCLASS_LOOP (isalnum)
3616	else if (strcmp (class_name, "cntrl") == 0)
3617	BUILD_CHARCLASS_LOOP (iscntrl)
3618	else if (strcmp (class_name, "lower") == 0)
3619	BUILD_CHARCLASS_LOOP (islower)
3620	else if (strcmp (class_name, "space") == 0)
3621	BUILD_CHARCLASS_LOOP (isspace)
3622	else if (strcmp (class_name, "alpha") == 0)
3623	BUILD_CHARCLASS_LOOP (isalpha)
3624	else if (strcmp (class_name, "digit") == 0)
3625	BUILD_CHARCLASS_LOOP (isdigit)
3626	else if (strcmp (class_name, "print") == 0)
3627	BUILD_CHARCLASS_LOOP (isprint)
3628	else if (strcmp (class_name, "upper") == 0)
3629	BUILD_CHARCLASS_LOOP (isupper)
3630	#ifndef GAWK
3631	else if (strcmp (class_name, "blank") == 0)
3632	BUILD_CHARCLASS_LOOP (isblank)
3633	#else
3634	/* see comments above */
3635	else if (strcmp (class_name, "blank") == 0)
3636	BUILD_CHARCLASS_LOOP (is_blank)
3637	#endif
3638	else if (strcmp (class_name, "graph") == 0)
3639	BUILD_CHARCLASS_LOOP (isgraph)
3640	else if (strcmp (class_name, "punct") == 0)
3641	BUILD_CHARCLASS_LOOP (ispunct)
3642	else if (strcmp (class_name, "xdigit") == 0)
3643	BUILD_CHARCLASS_LOOP (isxdigit)
3644	else
3645	return REG_ECTYPE;
3646
3647	return REG_NOERROR;
3648	}
3649
3650	static bin_tree_t *
3651	build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3652	re_dfa_t *dfa;
3653	unsigned RE_TRANSLATE_TYPE trans;
3654	const char *class_name;
3655	const char *extra;
3656	int non_match;
3657	reg_errcode_t *err;
3658	{
3659	re_bitset_ptr_t sbcset;
3660	#ifdef RE_ENABLE_I18N
3661	re_charset_t *mbcset;
3662	int alloc = 0;
3663	#endif /* not RE_ENABLE_I18N */
3664	reg_errcode_t ret;
3665	re_token_t br_token;
3666	bin_tree_t *tree;
3667
3668	sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3669	#ifdef RE_ENABLE_I18N
3670	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3671	#endif /* RE_ENABLE_I18N */
3672
3673	#ifdef RE_ENABLE_I18N
3674	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
3675	#else /* not RE_ENABLE_I18N */
3676	if (BE (sbcset == NULL, 0))
3677	#endif /* not RE_ENABLE_I18N */
3678	{
3679	*err = REG_ESPACE;
3680	return NULL;
3681	}
3682
3683	if (non_match)
3684	{
3685	#ifdef RE_ENABLE_I18N
3686	/*
3687	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3688	bitset_set(cset->sbcset, '\0');
3689	*/
3690	mbcset->non_match = 1;
3691	#endif /* not RE_ENABLE_I18N */
3692	}
3693
3694	/* We don't care the syntax in this case. */
3695	ret = build_charclass (trans, sbcset,
3696	#ifdef RE_ENABLE_I18N
3697	mbcset, &alloc,
3698	#endif /* RE_ENABLE_I18N */
3699	class_name, 0);
3700
3701	if (BE (ret != REG_NOERROR, 0))
3702	{
3703	re_free (sbcset);
3704	#ifdef RE_ENABLE_I18N
3705	free_charset (mbcset);
3706	#endif /* RE_ENABLE_I18N */
3707	*err = ret;
3708	return NULL;
3709	}
3710	/* \w match '_' also. */
3711	for (; *extra; extra++)
3712	bitset_set (sbcset, *extra);
3713
3714	/* If it is non-matching list. */
3715	if (non_match)
3716	bitset_not (sbcset);
3717
3718	#ifdef RE_ENABLE_I18N
3719	/* Ensure only single byte characters are set. */
3720	if (dfa->mb_cur_max > 1)
3721	bitset_mask (sbcset, dfa->sb_char);
3722	#endif
3723
3724	/* Build a tree for simple bracket. */
3725	br_token.type = SIMPLE_BRACKET;
3726	br_token.opr.sbcset = sbcset;
3727	tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728	if (BE (tree == NULL, 0))
3729	goto build_word_op_espace;
3730
3731	#ifdef RE_ENABLE_I18N
3732	if (dfa->mb_cur_max > 1)
3733	{
3734	bin_tree_t *mbc_tree;
3735	/* Build a tree for complex bracket. */
3736	br_token.type = COMPLEX_BRACKET;
3737	br_token.opr.mbcset = mbcset;
3738	dfa->has_mb_node = 1;
3739	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3740	if (BE (mbc_tree == NULL, 0))
3741	goto build_word_op_espace;
3742	/* Then join them by ALT node. */
3743	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3744	if (BE (mbc_tree != NULL, 1))
3745	return tree;
3746	}
3747	else
3748	{
3749	free_charset (mbcset);
3750	return tree;
3751	}
3752	#else /* not RE_ENABLE_I18N */
3753	return tree;
3754	#endif /* not RE_ENABLE_I18N */
3755
3756	build_word_op_espace:
3757	re_free (sbcset);
3758	#ifdef RE_ENABLE_I18N
3759	free_charset (mbcset);
3760	#endif /* RE_ENABLE_I18N */
3761	*err = REG_ESPACE;
3762	return NULL;
3763	}
3764
3765	/* This is intended for the expressions like "a{1,3}".
3766	Fetch a number from `input', and return the number.
3767	Return -1, if the number field is empty like "{,1}".
3768	Return -2, If an error is occured. */
3769
3770	static int
3771	fetch_number (input, token, syntax)
3772	re_string_t *input;
3773	re_token_t *token;
3774	reg_syntax_t syntax;
3775	{
3776	int num = -1;
3777	unsigned char c;
3778	while (1)
3779	{
3780	fetch_token (token, input, syntax);
3781	c = token->opr.c;
3782	if (BE (token->type == END_OF_RE, 0))
3783	return -2;
3784	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
3785	break;
3786	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c \|\| num == -2)
3787	? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3788	num = (num > RE_DUP_MAX) ? -2 : num;
3789	}
3790	return num;
3791	}
3792
3793
3794	#ifdef RE_ENABLE_I18N
3795	static void
3796	free_charset (re_charset_t *cset)
3797	{
3798	re_free (cset->mbchars);
3799	# ifdef _LIBC
3800	re_free (cset->coll_syms);
3801	re_free (cset->equiv_classes);
3802	re_free (cset->range_starts);
3803	re_free (cset->range_ends);
3804	# endif
3805	re_free (cset->char_classes);
3806	re_free (cset);
3807	}
3808	#endif /* RE_ENABLE_I18N */
3809
3810
3811	/* Functions for binary tree operation. */
3812
3813	/* Create a tree node. */
3814
3815	static bin_tree_t *
3816	create_tree (dfa, left, right, type)
3817	re_dfa_t *dfa;
3818	bin_tree_t *left;
3819	bin_tree_t *right;
3820	re_token_type_t type;
3821	{
3822	re_token_t t;
3823	t.type = type;
3824	return create_token_tree (dfa, left, right, &t);
3825	}
3826
3827	static bin_tree_t *
3828	create_token_tree (dfa, left, right, token)
3829	re_dfa_t *dfa;
3830	bin_tree_t *left;
3831	bin_tree_t *right;
3832	const re_token_t *token;
3833	{
3834	bin_tree_t *tree;
3835	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3836	{
3837	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3838
3839	if (storage == NULL)
3840	return NULL;
3841	storage->next = dfa->str_tree_storage;
3842	dfa->str_tree_storage = storage;
3843	dfa->str_tree_storage_idx = 0;
3844	}
3845	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3846
3847	tree->parent = NULL;
3848	tree->left = left;
3849	tree->right = right;
3850	tree->token = *token;
3851	tree->token.duplicated = 0;
3852	tree->token.opt_subexp = 0;
3853	tree->first = NULL;
3854	tree->next = NULL;
3855	tree->node_idx = -1;
3856
3857	if (left != NULL)
3858	left->parent = tree;
3859	if (right != NULL)
3860	right->parent = tree;
3861	return tree;
3862	}
3863
3864	/* Mark the tree SRC as an optional subexpression.
3865	To be called from preorder or postorder. */
3866
3867	static reg_errcode_t
3868	mark_opt_subexp (extra, node)
3869	void *extra;
3870	bin_tree_t *node;
3871	{
3872	int idx = (int) (long) extra;
3873	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3874	node->token.opt_subexp = 1;
3875
3876	return REG_NOERROR;
3877	}
3878
3879	/* Free the allocated memory inside NODE. */
3880
3881	static void
3882	free_token (re_token_t *node)
3883	{
3884	#ifdef RE_ENABLE_I18N
3885	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3886	free_charset (node->opr.mbcset);
3887	else
3888	#endif /* RE_ENABLE_I18N */
3889	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3890	re_free (node->opr.sbcset);
3891	}
3892
3893	/* Worker function for tree walking. Free the allocated memory inside NODE
3894	and its children. */
3895
3896	static reg_errcode_t
3897	free_tree (void extra, bin_tree_t node)
3898	{
3899	free_token (&node->token);
3900	return REG_NOERROR;
3901	}
3902
3903
3904	/* Duplicate the node SRC, and return new node. This is a preorder
3905	visit similar to the one implemented by the generic visitor, but
3906	we need more infrastructure to maintain two parallel trees --- so,
3907	it's easier to duplicate. */
3908
3909	static bin_tree_t *
3910	duplicate_tree (root, dfa)
3911	const bin_tree_t *root;
3912	re_dfa_t *dfa;
3913	{
3914	const bin_tree_t *node;
3915	bin_tree_t *dup_root;
3916	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
3917
3918	for (node = root; ; )
3919	{
3920	/* Create a new tree and link it back to the current parent. */
3921	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3922	if (*p_new == NULL)
3923	return NULL;
3924	(*p_new)->parent = dup_node;
3925	(*p_new)->token.duplicated = 1;
3926	dup_node = *p_new;
3927
3928	/* Go to the left node, or up and to the right. */
3929	if (node->left)
3930	{
3931	node = node->left;
3932	p_new = &dup_node->left;
3933	}
3934	else
3935	{
3936	const bin_tree_t *prev = NULL;
3937	while (node->right == prev \|\| node->right == NULL)
3938	{
3939	prev = node;
3940	node = node->parent;
3941	dup_node = dup_node->parent;
3942	if (!node)
3943	return dup_root;
3944	}
3945	node = node->right;
3946	p_new = &dup_node->right;
3947	}
3948	}
3949	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/regcomp.c

Download in other formats: