source: vendor/gawk/3.1.5/regcomp.c@ 3643

Last change on this file since 3643 was 3076, checked in by bird, 19 years ago

gawk 3.1.5

File size: 110.1 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27static void init_word_char (re_dfa_t *dfa);
28#ifdef RE_ENABLE_I18N
29static void free_charset (re_charset_t *cset);
30#endif /* RE_ENABLE_I18N */
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
36static reg_errcode_t analyze (regex_t *preg);
37static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52 int top_clone_node, int root_node,
53 unsigned int constraint);
54static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55 unsigned int constraint);
56static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57 unsigned int constraint);
58static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60 int node, int root);
61static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62static int fetch_number (re_string_t *input, re_token_t *token,
63 reg_syntax_t syntax);
64static void fetch_token (re_token_t *result, re_string_t *input,
65 reg_syntax_t syntax);
66static int peek_token (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68static int peek_token_bracket (re_token_t *token, re_string_t *input,
69 reg_syntax_t syntax);
70static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71 reg_syntax_t syntax, reg_errcode_t *err);
72static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82 re_token_t *token, reg_syntax_t syntax,
83 int nest, reg_errcode_t *err);
84static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85 re_dfa_t *dfa, re_token_t *token,
86 reg_syntax_t syntax, reg_errcode_t *err);
87static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88 re_token_t *token, reg_syntax_t syntax,
89 reg_errcode_t *err);
90static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91 re_string_t *regexp,
92 re_token_t *token, int token_len,
93 re_dfa_t *dfa,
94 reg_syntax_t syntax,
95 int accept_hyphen);
96static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 re_string_t *regexp,
98 re_token_t *token);
99#ifndef _LIBC
100# ifdef RE_ENABLE_I18N
101static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102 re_charset_t *mbcset, int *range_alloc,
103 bracket_elem_t *start_elem,
104 bracket_elem_t *end_elem);
105static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106 re_charset_t *mbcset,
107 int *coll_sym_alloc,
108 const unsigned char *name);
109# else /* not RE_ENABLE_I18N */
110static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111 bracket_elem_t *start_elem,
112 bracket_elem_t *end_elem);
113static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115# endif /* not RE_ENABLE_I18N */
116#endif /* not _LIBC */
117#ifdef RE_ENABLE_I18N
118static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 re_charset_t *mbcset,
120 int *equiv_class_alloc,
121 const unsigned char *name);
122static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123 re_bitset_ptr_t sbcset,
124 re_charset_t *mbcset,
125 int *char_class_alloc,
126 const char *class_name,
127 reg_syntax_t syntax);
128#else /* not RE_ENABLE_I18N */
129static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130 const unsigned char *name);
131static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132 re_bitset_ptr_t sbcset,
133 const char *class_name,
134 reg_syntax_t syntax);
135#endif /* not RE_ENABLE_I18N */
136static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137 unsigned RE_TRANSLATE_TYPE trans,
138 const char *class_name,
139 const char *extra,
140 int non_match, reg_errcode_t *err);
141static bin_tree_t *create_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 re_token_type_t type);
144static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145 bin_tree_t *left, bin_tree_t *right,
146 const re_token_t *token);
147static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148static void free_token (re_token_t *node);
149static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
151
152
153/* This table gives an error message for each of the error codes listed
154 in regex.h. Obviously the order here has to be same as there.
155 POSIX doesn't require that we do anything for REG_NOERROR,
156 but why not be nice? */
157
158const ERRMSG_TYPE __re_error_msgid[] attribute_hidden =
159 {
160#define REG_NOERROR_IDX 0
161 gettext_noop ("Success") /* REG_NOERROR */
162 ERRMSG_SEPARATOR
163#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
164 gettext_noop ("No match") /* REG_NOMATCH */
165 ERRMSG_SEPARATOR
166#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
167 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168 ERRMSG_SEPARATOR
169#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
170 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171 ERRMSG_SEPARATOR
172#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
173 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174 ERRMSG_SEPARATOR
175#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
176 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177 ERRMSG_SEPARATOR
178#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
179 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180 ERRMSG_SEPARATOR
181#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
182 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183 ERRMSG_SEPARATOR
184#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
185 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186 ERRMSG_SEPARATOR
187#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
188 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189 ERRMSG_SEPARATOR
190#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
191 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192 ERRMSG_SEPARATOR
193#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
194 gettext_noop ("Invalid range end") /* REG_ERANGE */
195 ERRMSG_SEPARATOR
196#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
197 gettext_noop ("Memory exhausted") /* REG_ESPACE */
198 ERRMSG_SEPARATOR
199#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
200 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201 ERRMSG_SEPARATOR
202#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
203 gettext_noop ("Premature end of regular expression") /* REG_EEND */
204 ERRMSG_SEPARATOR
205#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
206 gettext_noop ("Regular expression too big") /* REG_ESIZE */
207 ERRMSG_SEPARATOR
208#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
209 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
210 ERRMSG_SEPARATOR
211 };
212
213const size_t __re_error_msgid_idx[] attribute_hidden =
214 {
215 REG_NOERROR_IDX,
216 REG_NOMATCH_IDX,
217 REG_BADPAT_IDX,
218 REG_ECOLLATE_IDX,
219 REG_ECTYPE_IDX,
220 REG_EESCAPE_IDX,
221 REG_ESUBREG_IDX,
222 REG_EBRACK_IDX,
223 REG_EPAREN_IDX,
224 REG_EBRACE_IDX,
225 REG_BADBR_IDX,
226 REG_ERANGE_IDX,
227 REG_ESPACE_IDX,
228 REG_BADRPT_IDX,
229 REG_EEND_IDX,
230 REG_ESIZE_IDX,
231 REG_ERPAREN_IDX
232 };
233
234
235/* Entry points for GNU code. */
236
237/* re_compile_pattern is the GNU regular expression compiler: it
238 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
239 Returns 0 if the pattern was valid, otherwise an error string.
240
241 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
242 are set in BUFP on entry. */
243
244const char *
245re_compile_pattern (pattern, length, bufp)
246 const char *pattern;
247 size_t length;
248 struct re_pattern_buffer *bufp;
249{
250 reg_errcode_t ret;
251
252 /* And GNU code determines whether or not to get register information
253 by passing null for the REGS argument to re_match, etc., not by
254 setting no_sub, unless RE_NO_SUB is set. */
255 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
256
257 /* Match anchors at newline. */
258 bufp->newline_anchor = 1;
259
260 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
261
262 if (!ret)
263 return NULL;
264 return gettext (RE_ERRMSG(ret));
265}
266#ifdef _LIBC
267weak_alias (__re_compile_pattern, re_compile_pattern)
268#endif
269
270/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
271 also be assigned to arbitrarily: each pattern buffer stores its own
272 syntax, so it can be changed between regex compilations. */
273/* This has no initializer because initialized variables in Emacs
274 become read-only after dumping. */
275reg_syntax_t re_syntax_options;
276
277
278/* Specify the precise syntax of regexps for compilation. This provides
279 for compatibility for various utilities which historically have
280 different, incompatible syntaxes.
281
282 The argument SYNTAX is a bit mask comprised of the various bits
283 defined in regex.h. We return the old syntax. */
284
285reg_syntax_t
286re_set_syntax (syntax)
287 reg_syntax_t syntax;
288{
289 reg_syntax_t ret = re_syntax_options;
290
291 re_syntax_options = syntax;
292 return ret;
293}
294#ifdef _LIBC
295weak_alias (__re_set_syntax, re_set_syntax)
296#endif
297
298int
299re_compile_fastmap (bufp)
300 struct re_pattern_buffer *bufp;
301{
302 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
303 char *fastmap = bufp->fastmap;
304
305 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
306 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
307 if (dfa->init_state != dfa->init_state_word)
308 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
309 if (dfa->init_state != dfa->init_state_nl)
310 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
311 if (dfa->init_state != dfa->init_state_begbuf)
312 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
313 bufp->fastmap_accurate = 1;
314 return 0;
315}
316#ifdef _LIBC
317weak_alias (__re_compile_fastmap, re_compile_fastmap)
318#endif
319
320static inline void
321__attribute ((always_inline))
322re_set_fastmap (char *fastmap, int icase, int ch)
323{
324 fastmap[ch] = 1;
325 if (icase)
326 fastmap[tolower (ch)] = 1;
327}
328
329/* Helper function for re_compile_fastmap.
330 Compile fastmap for the initial_state INIT_STATE. */
331
332static void
333re_compile_fastmap_iter (bufp, init_state, fastmap)
334 regex_t *bufp;
335 const re_dfastate_t *init_state;
336 char *fastmap;
337{
338 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
339 int node_cnt;
340 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
341 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
342 {
343 int node = init_state->nodes.elems[node_cnt];
344 re_token_type_t type = dfa->nodes[node].type;
345
346 if (type == CHARACTER)
347 {
348 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
349#ifdef RE_ENABLE_I18N
350 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
351 {
352 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
353 wchar_t wc;
354 mbstate_t state;
355
356 p = buf;
357 *p++ = dfa->nodes[node].opr.c;
358 while (++node < dfa->nodes_len
359 && dfa->nodes[node].type == CHARACTER
360 && dfa->nodes[node].mb_partial)
361 *p++ = dfa->nodes[node].opr.c;
362 memset (&state, 0, sizeof (state));
363 if (mbrtowc (&wc, (const char *) buf, p - buf,
364 &state) == p - buf
365 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
366 re_set_fastmap (fastmap, 0, buf[0]);
367 re_free (buf);
368 }
369#endif
370 }
371 else if (type == SIMPLE_BRACKET)
372 {
373 int i, j, ch;
374 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
375 for (j = 0; j < UINT_BITS; ++j, ++ch)
376 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
377 re_set_fastmap (fastmap, icase, ch);
378 }
379#ifdef RE_ENABLE_I18N
380 else if (type == COMPLEX_BRACKET)
381 {
382 int i;
383 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
384 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
385 || cset->nranges || cset->nchar_classes)
386 {
387# ifdef _LIBC
388 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
389 {
390 /* In this case we want to catch the bytes which are
391 the first byte of any collation elements.
392 e.g. In da_DK, we want to catch 'a' since "aa"
393 is a valid collation element, and don't catch
394 'b' since 'b' is the only collation element
395 which starts from 'b'. */
396 int j, ch;
397 const int32_t *table = (const int32_t *)
398 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
399 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
400 for (j = 0; j < UINT_BITS; ++j, ++ch)
401 if (table[ch] < 0)
402 re_set_fastmap (fastmap, icase, ch);
403 }
404# else
405 if (dfa->mb_cur_max > 1)
406 for (i = 0; i < SBC_MAX; ++i)
407 if (__btowc (i) == WEOF)
408 re_set_fastmap (fastmap, icase, i);
409# endif /* not _LIBC */
410 }
411 for (i = 0; i < cset->nmbchars; ++i)
412 {
413 char buf[256];
414 mbstate_t state;
415 memset (&state, '\0', sizeof (state));
416 __wcrtomb (buf, cset->mbchars[i], &state);
417 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
418 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
419 {
420 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
421 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
422 }
423 }
424 }
425#endif /* RE_ENABLE_I18N */
426 else if (type == OP_PERIOD
427#ifdef RE_ENABLE_I18N
428 || type == OP_UTF8_PERIOD
429#endif /* RE_ENABLE_I18N */
430 || type == END_OF_RE)
431 {
432 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
433 if (type == END_OF_RE)
434 bufp->can_be_null = 1;
435 return;
436 }
437 }
438}
439
440
441/* Entry point for POSIX code. */
442/* regcomp takes a regular expression as a string and compiles it.
443
444 PREG is a regex_t *. We do not expect any fields to be initialized,
445 since POSIX says we shouldn't. Thus, we set
446
447 `buffer' to the compiled pattern;
448 `used' to the length of the compiled pattern;
449 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
450 REG_EXTENDED bit in CFLAGS is set; otherwise, to
451 RE_SYNTAX_POSIX_BASIC;
452 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
453 `fastmap' to an allocated space for the fastmap;
454 `fastmap_accurate' to zero;
455 `re_nsub' to the number of subexpressions in PATTERN.
456
457 PATTERN is the address of the pattern string.
458
459 CFLAGS is a series of bits which affect compilation.
460
461 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
462 use POSIX basic syntax.
463
464 If REG_NEWLINE is set, then . and [^...] don't match newline.
465 Also, regexec will try a match beginning after every newline.
466
467 If REG_ICASE is set, then we considers upper- and lowercase
468 versions of letters to be equivalent when matching.
469
470 If REG_NOSUB is set, then when PREG is passed to regexec, that
471 routine will report only success or failure, and nothing about the
472 registers.
473
474 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
475 the return codes and their meanings.) */
476
477int
478regcomp (preg, pattern, cflags)
479 regex_t *__restrict preg;
480 const char *__restrict pattern;
481 int cflags;
482{
483 reg_errcode_t ret;
484 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
485 : RE_SYNTAX_POSIX_BASIC);
486
487 preg->buffer = NULL;
488 preg->allocated = 0;
489 preg->used = 0;
490
491 /* Try to allocate space for the fastmap. */
492 preg->fastmap = re_malloc (char, SBC_MAX);
493 if (BE (preg->fastmap == NULL, 0))
494 return REG_ESPACE;
495
496 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
497
498 /* If REG_NEWLINE is set, newlines are treated differently. */
499 if (cflags & REG_NEWLINE)
500 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
501 syntax &= ~RE_DOT_NEWLINE;
502 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
503 /* It also changes the matching behavior. */
504 preg->newline_anchor = 1;
505 }
506 else
507 preg->newline_anchor = 0;
508 preg->no_sub = !!(cflags & REG_NOSUB);
509 preg->translate = NULL;
510
511 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
512
513 /* POSIX doesn't distinguish between an unmatched open-group and an
514 unmatched close-group: both are REG_EPAREN. */
515 if (ret == REG_ERPAREN)
516 ret = REG_EPAREN;
517
518 /* We have already checked preg->fastmap != NULL. */
519 if (BE (ret == REG_NOERROR, 1))
520 /* Compute the fastmap now, since regexec cannot modify the pattern
521 buffer. This function never fails in this implementation. */
522 (void) re_compile_fastmap (preg);
523 else
524 {
525 /* Some error occurred while compiling the expression. */
526 re_free (preg->fastmap);
527 preg->fastmap = NULL;
528 }
529
530 return (int) ret;
531}
532#ifdef _LIBC
533weak_alias (__regcomp, regcomp)
534#endif
535
536/* Returns a message corresponding to an error code, ERRCODE, returned
537 from either regcomp or regexec. We don't use PREG here. */
538
539size_t
540regerror (errcode, preg, errbuf, errbuf_size)
541 int errcode;
542 const regex_t *preg;
543 char *errbuf;
544 size_t errbuf_size;
545{
546 const char *msg;
547 size_t msg_size;
548
549 if (BE (errcode < 0
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
556 abort ();
557
558 msg = gettext (RE_ERRMSG(errcode));
559
560 msg_size = strlen (msg) + 1; /* Includes the null. */
561
562 if (BE (errbuf_size != 0, 1))
563 {
564 if (BE (msg_size > errbuf_size, 0))
565 {
566 memcpy (errbuf, msg, errbuf_size - 1);
567 errbuf[errbuf_size - 1] = 0;
568 }
569 else
570 memcpy (errbuf, msg, msg_size);
571 }
572
573 return msg_size;
574}
575#ifdef _LIBC
576weak_alias (__regerror, regerror)
577#endif
578
579
580#ifdef RE_ENABLE_I18N
581/* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585static const bitset utf8_sb_map =
586{
587 /* Set the first 128 bits. */
588# if UINT_MAX == 0xffffffff
589 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
590# else
591# error "Add case for new unsigned int size"
592# endif
593};
594#endif
595
596
597static void
598free_dfa_content (re_dfa_t *dfa)
599{
600 int i, j;
601
602 if (dfa->nodes)
603 for (i = 0; i < dfa->nodes_len; ++i)
604 free_token (dfa->nodes + i);
605 re_free (dfa->nexts);
606 for (i = 0; i < dfa->nodes_len; ++i)
607 {
608 if (dfa->eclosures != NULL)
609 re_node_set_free (dfa->eclosures + i);
610 if (dfa->inveclosures != NULL)
611 re_node_set_free (dfa->inveclosures + i);
612 if (dfa->edests != NULL)
613 re_node_set_free (dfa->edests + i);
614 }
615 re_free (dfa->edests);
616 re_free (dfa->eclosures);
617 re_free (dfa->inveclosures);
618 re_free (dfa->nodes);
619
620 if (dfa->state_table)
621 for (i = 0; i <= dfa->state_hash_mask; ++i)
622 {
623 struct re_state_table_entry *entry = dfa->state_table + i;
624 for (j = 0; j < entry->num; ++j)
625 {
626 re_dfastate_t *state = entry->array[j];
627 free_state (state);
628 }
629 re_free (entry->array);
630 }
631 re_free (dfa->state_table);
632#ifdef RE_ENABLE_I18N
633 if (dfa->sb_char != utf8_sb_map)
634 re_free (dfa->sb_char);
635#endif
636 re_free (dfa->subexp_map);
637#ifdef DEBUG
638 re_free (dfa->re_str);
639#endif
640
641 re_free (dfa);
642}
643
644
645/* Free dynamically allocated space used by PREG. */
646
647void
648regfree (preg)
649 regex_t *preg;
650{
651 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
652 if (BE (dfa != NULL, 1))
653 free_dfa_content (dfa);
654 preg->buffer = NULL;
655 preg->allocated = 0;
656
657 re_free (preg->fastmap);
658 preg->fastmap = NULL;
659
660 re_free (preg->translate);
661 preg->translate = NULL;
662}
663#ifdef _LIBC
664weak_alias (__regfree, regfree)
665#endif
666
667
668/* Entry points compatible with 4.2 BSD regex library. We don't define
669 them unless specifically requested. */
670
671#if defined _REGEX_RE_COMP || defined _LIBC
672
673/* BSD has one and only one pattern buffer. */
674static struct re_pattern_buffer re_comp_buf;
675
676char *
677# ifdef _LIBC
678/* Make these definitions weak in libc, so POSIX programs can redefine
679 these names if they don't use our functions, and still use
680 regcomp/regexec above without link errors. */
681weak_function
682# endif
683re_comp (s)
684 const char *s;
685{
686 reg_errcode_t ret;
687 char *fastmap;
688
689 if (!s)
690 {
691 if (!re_comp_buf.buffer)
692 return gettext ("No previous regular expression");
693 return 0;
694 }
695
696 if (re_comp_buf.buffer)
697 {
698 fastmap = re_comp_buf.fastmap;
699 re_comp_buf.fastmap = NULL;
700 __regfree (&re_comp_buf);
701 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
702 re_comp_buf.fastmap = fastmap;
703 }
704
705 if (re_comp_buf.fastmap == NULL)
706 {
707 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
708 if (re_comp_buf.fastmap == NULL)
709 return (char *) gettext (RE_ERRMSG(REG_ESPACE_IDX));
710 }
711
712 /* Since `re_exec' always passes NULL for the `regs' argument, we
713 don't need to initialize the pattern buffer fields which affect it. */
714
715 /* Match anchors at newlines. */
716 re_comp_buf.newline_anchor = 1;
717
718 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
719
720 if (!ret)
721 return NULL;
722
723 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
724 return (char *) gettext (RE_ERRMSG(ret));
725}
726
727#ifdef _LIBC
728libc_freeres_fn (free_mem)
729{
730 __regfree (&re_comp_buf);
731}
732#endif
733
734#endif /* _REGEX_RE_COMP */
735
736
737/* Internal entry point.
738 Compile the regular expression PATTERN, whose length is LENGTH.
739 SYNTAX indicate regular expression's syntax. */
740
741static reg_errcode_t
742re_compile_internal (preg, pattern, length, syntax)
743 regex_t *preg;
744 const char * pattern;
745 int length;
746 reg_syntax_t syntax;
747{
748 reg_errcode_t err = REG_NOERROR;
749 re_dfa_t *dfa;
750 re_string_t regexp;
751
752 /* Initialize the pattern buffer. */
753 preg->fastmap_accurate = 0;
754 preg->syntax = syntax;
755 preg->not_bol = preg->not_eol = 0;
756 preg->used = 0;
757 preg->re_nsub = 0;
758 preg->can_be_null = 0;
759 preg->regs_allocated = REGS_UNALLOCATED;
760
761 /* Initialize the dfa. */
762 dfa = (re_dfa_t *) preg->buffer;
763 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
764 {
765 /* If zero allocated, but buffer is non-null, try to realloc
766 enough space. This loses if buffer's address is bogus, but
767 that is the user's responsibility. If ->buffer is NULL this
768 is a simple allocation. */
769 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
770 if (dfa == NULL)
771 return REG_ESPACE;
772 preg->allocated = sizeof (re_dfa_t);
773 preg->buffer = (unsigned char *) dfa;
774 }
775 preg->used = sizeof (re_dfa_t);
776
777 err = init_dfa (dfa, length);
778 if (BE (err != REG_NOERROR, 0))
779 {
780 free_dfa_content (dfa);
781 preg->buffer = NULL;
782 preg->allocated = 0;
783 return err;
784 }
785#ifdef DEBUG
786 dfa->re_str = re_malloc (char, length + 1);
787 strncpy (dfa->re_str, pattern, length + 1);
788#endif
789
790 err = re_string_construct (&regexp, pattern, length, preg->translate,
791 syntax & RE_ICASE, dfa);
792 if (BE (err != REG_NOERROR, 0))
793 {
794 re_compile_internal_free_return:
795 free_workarea_compile (preg);
796 re_string_destruct (&regexp);
797 free_dfa_content (dfa);
798 preg->buffer = NULL;
799 preg->allocated = 0;
800 return err;
801 }
802
803 /* Parse the regular expression, and build a structure tree. */
804 preg->re_nsub = 0;
805 dfa->str_tree = parse (&regexp, preg, syntax, &err);
806 if (BE (dfa->str_tree == NULL, 0))
807 goto re_compile_internal_free_return;
808
809 /* Analyze the tree and create the nfa. */
810 err = analyze (preg);
811 if (BE (err != REG_NOERROR, 0))
812 goto re_compile_internal_free_return;
813
814#ifdef RE_ENABLE_I18N
815 /* If possible, do searching in single byte encoding to speed things up. */
816 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
817 optimize_utf8 (dfa);
818#endif
819
820 /* Then create the initial state of the dfa. */
821 err = create_initial_state (dfa);
822
823 /* Release work areas. */
824 free_workarea_compile (preg);
825 re_string_destruct (&regexp);
826
827 if (BE (err != REG_NOERROR, 0))
828 {
829 free_dfa_content (dfa);
830 preg->buffer = NULL;
831 preg->allocated = 0;
832 }
833
834 return err;
835}
836
837/* Initialize DFA. We use the length of the regular expression PAT_LEN
838 as the initial length of some arrays. */
839
840static reg_errcode_t
841init_dfa (dfa, pat_len)
842 re_dfa_t *dfa;
843 int pat_len;
844{
845 int table_size;
846#ifndef _LIBC
847 char *codeset_name;
848#endif
849
850 memset (dfa, '\0', sizeof (re_dfa_t));
851
852 /* Force allocation of str_tree_storage the first time. */
853 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855 dfa->nodes_alloc = pat_len + 1;
856 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
857
858 dfa->states_alloc = pat_len + 1;
859
860 /* table_size = 2 ^ ceil(log pat_len) */
861 for (table_size = 1; table_size > 0; table_size <<= 1)
862 if (table_size > pat_len)
863 break;
864
865 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
866 dfa->state_hash_mask = table_size - 1;
867
868 dfa->mb_cur_max = MB_CUR_MAX;
869#ifdef _LIBC
870 if (dfa->mb_cur_max == 6
871 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
872 dfa->is_utf8 = 1;
873 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
874 != 0);
875#else
876# ifdef HAVE_LANGINFO_CODESET
877 codeset_name = nl_langinfo (CODESET);
878# else
879 codeset_name = getenv ("LC_ALL");
880 if (codeset_name == NULL || codeset_name[0] == '\0')
881 codeset_name = getenv ("LC_CTYPE");
882 if (codeset_name == NULL || codeset_name[0] == '\0')
883 codeset_name = getenv ("LANG");
884 if (codeset_name == NULL)
885 codeset_name = "";
886 else if (strchr (codeset_name, '.') != NULL)
887 codeset_name = strchr (codeset_name, '.') + 1;
888# endif
889
890 /* strcasecmp isn't a standard interface. brute force check */
891#if 0
892 if (strcasecmp (codeset_name, "UTF-8") == 0
893 || strcasecmp (codeset_name, "UTF8") == 0)
894 dfa->is_utf8 = 1;
895#else
896 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
897 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
898 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
899 && (codeset_name[3] == '-'
900 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
901 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
902 dfa->is_utf8 = 1;
903#endif
904
905 /* We check exhaustively in the loop below if this charset is a
906 superset of ASCII. */
907 dfa->map_notascii = 0;
908#endif
909
910#ifdef RE_ENABLE_I18N
911 if (dfa->mb_cur_max > 1)
912 {
913 if (dfa->is_utf8)
914 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
915 else
916 {
917 int i, j, ch;
918
919 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
920 if (BE (dfa->sb_char == NULL, 0))
921 return REG_ESPACE;
922
923 /* Clear all bits by, then set those corresponding to single
924 byte chars. */
925 bitset_empty (dfa->sb_char);
926
927 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
928 for (j = 0; j < UINT_BITS; ++j, ++ch)
929 {
930 wchar_t wch = __btowc (ch);
931 if (wch != WEOF)
932 dfa->sb_char[i] |= 1 << j;
933# ifndef _LIBC
934 if (isascii (ch) && wch != (wchar_t) ch)
935 dfa->map_notascii = 1;
936# endif
937 }
938 }
939 }
940#endif
941
942 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943 return REG_ESPACE;
944 return REG_NOERROR;
945}
946
947/* Initialize WORD_CHAR table, which indicate which character is
948 "word". In this case "word" means that it is the word construction
949 character used by some operators like "\<", "\>", etc. */
950
951static void
952init_word_char (dfa)
953 re_dfa_t *dfa;
954{
955 int i, j, ch;
956 dfa->word_ops_used = 1;
957 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
958 for (j = 0; j < UINT_BITS; ++j, ++ch)
959 if (isalnum (ch) || ch == '_')
960 dfa->word_char[i] |= 1 << j;
961}
962
963/* Free the work area which are only used while compiling. */
964
965static void
966free_workarea_compile (preg)
967 regex_t *preg;
968{
969 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
970 bin_tree_storage_t *storage, *next;
971 for (storage = dfa->str_tree_storage; storage; storage = next)
972 {
973 next = storage->next;
974 re_free (storage);
975 }
976 dfa->str_tree_storage = NULL;
977 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
978 dfa->str_tree = NULL;
979 re_free (dfa->org_indices);
980 dfa->org_indices = NULL;
981}
982
983/* Create initial states for all contexts. */
984
985static reg_errcode_t
986create_initial_state (dfa)
987 re_dfa_t *dfa;
988{
989 int first, i;
990 reg_errcode_t err;
991 re_node_set init_nodes;
992
993 /* Initial states have the epsilon closure of the node which is
994 the first node of the regular expression. */
995 first = dfa->str_tree->first->node_idx;
996 dfa->init_node = first;
997 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
998 if (BE (err != REG_NOERROR, 0))
999 return err;
1000
1001 /* The back-references which are in initial states can epsilon transit,
1002 since in this case all of the subexpressions can be null.
1003 Then we add epsilon closures of the nodes which are the next nodes of
1004 the back-references. */
1005 if (dfa->nbackref > 0)
1006 for (i = 0; i < init_nodes.nelem; ++i)
1007 {
1008 int node_idx = init_nodes.elems[i];
1009 re_token_type_t type = dfa->nodes[node_idx].type;
1010
1011 int clexp_idx;
1012 if (type != OP_BACK_REF)
1013 continue;
1014 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1015 {
1016 re_token_t *clexp_node;
1017 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1018 if (clexp_node->type == OP_CLOSE_SUBEXP
1019 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1020 break;
1021 }
1022 if (clexp_idx == init_nodes.nelem)
1023 continue;
1024
1025 if (type == OP_BACK_REF)
1026 {
1027 int dest_idx = dfa->edests[node_idx].elems[0];
1028 if (!re_node_set_contains (&init_nodes, dest_idx))
1029 {
1030 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1031 i = 0;
1032 }
1033 }
1034 }
1035
1036 /* It must be the first time to invoke acquire_state. */
1037 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1038 /* We don't check ERR here, since the initial state must not be NULL. */
1039 if (BE (dfa->init_state == NULL, 0))
1040 return err;
1041 if (dfa->init_state->has_constraint)
1042 {
1043 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1044 CONTEXT_WORD);
1045 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1046 CONTEXT_NEWLINE);
1047 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048 &init_nodes,
1049 CONTEXT_NEWLINE
1050 | CONTEXT_BEGBUF);
1051 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1052 || dfa->init_state_begbuf == NULL, 0))
1053 return err;
1054 }
1055 else
1056 dfa->init_state_word = dfa->init_state_nl
1057 = dfa->init_state_begbuf = dfa->init_state;
1058
1059 re_node_set_free (&init_nodes);
1060 return REG_NOERROR;
1061}
1062
1063
1064#ifdef RE_ENABLE_I18N
1065/* If it is possible to do searching in single byte encoding instead of UTF-8
1066 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1067 DFA nodes where needed. */
1068
1069static void
1070optimize_utf8 (dfa)
1071 re_dfa_t *dfa;
1072{
1073 int node, i, mb_chars = 0, has_period = 0;
1074
1075 for (node = 0; node < dfa->nodes_len; ++node)
1076 switch (dfa->nodes[node].type)
1077 {
1078 case CHARACTER:
1079 if (dfa->nodes[node].opr.c >= 0x80)
1080 mb_chars = 1;
1081 break;
1082 case ANCHOR:
1083 switch (dfa->nodes[node].opr.idx)
1084 {
1085 case LINE_FIRST:
1086 case LINE_LAST:
1087 case BUF_FIRST:
1088 case BUF_LAST:
1089 break;
1090 default:
1091 /* Word anchors etc. cannot be handled. */
1092 return;
1093 }
1094 break;
1095 case OP_PERIOD:
1096 has_period = 1;
1097 break;
1098 case OP_BACK_REF:
1099 case OP_ALT:
1100 case END_OF_RE:
1101 case OP_DUP_ASTERISK:
1102 case OP_OPEN_SUBEXP:
1103 case OP_CLOSE_SUBEXP:
1104 break;
1105 case COMPLEX_BRACKET:
1106 return;
1107 case SIMPLE_BRACKET:
1108 /* Just double check. */
1109 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1110 if (dfa->nodes[node].opr.sbcset[i])
1111 return;
1112 break;
1113 default:
1114 abort ();
1115 }
1116
1117 if (mb_chars || has_period)
1118 for (node = 0; node < dfa->nodes_len; ++node)
1119 {
1120 if (dfa->nodes[node].type == CHARACTER
1121 && dfa->nodes[node].opr.c >= 0x80)
1122 dfa->nodes[node].mb_partial = 0;
1123 else if (dfa->nodes[node].type == OP_PERIOD)
1124 dfa->nodes[node].type = OP_UTF8_PERIOD;
1125 }
1126
1127 /* The search can be in single byte locale. */
1128 dfa->mb_cur_max = 1;
1129 dfa->is_utf8 = 0;
1130 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1131}
1132#endif
1133
1134
1135/* Analyze the structure tree, and calculate "first", "next", "edest",
1136 "eclosure", and "inveclosure". */
1137
1138static reg_errcode_t
1139analyze (preg)
1140 regex_t *preg;
1141{
1142 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1143 reg_errcode_t ret;
1144
1145 /* Allocate arrays. */
1146 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1147 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1148 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1149 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1150 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1151 || dfa->eclosures == NULL, 0))
1152 return REG_ESPACE;
1153
1154 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1155 if (dfa->subexp_map != NULL)
1156 {
1157 int i;
1158 for (i = 0; i < preg->re_nsub; i++)
1159 dfa->subexp_map[i] = i;
1160 preorder (dfa->str_tree, optimize_subexps, dfa);
1161 for (i = 0; i < preg->re_nsub; i++)
1162 if (dfa->subexp_map[i] != i)
1163 break;
1164 if (i == preg->re_nsub)
1165 {
1166 free (dfa->subexp_map);
1167 dfa->subexp_map = NULL;
1168 }
1169 }
1170
1171 ret = postorder (dfa->str_tree, lower_subexps, preg);
1172 if (BE (ret != REG_NOERROR, 0))
1173 return ret;
1174 ret = postorder (dfa->str_tree, calc_first, dfa);
1175 if (BE (ret != REG_NOERROR, 0))
1176 return ret;
1177 preorder (dfa->str_tree, calc_next, dfa);
1178 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1179 if (BE (ret != REG_NOERROR, 0))
1180 return ret;
1181 ret = calc_eclosure (dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1183 return ret;
1184
1185 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1186 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1187 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188 || dfa->nbackref)
1189 {
1190 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1191 if (BE (dfa->inveclosures == NULL, 0))
1192 return REG_ESPACE;
1193 ret = calc_inveclosure (dfa);
1194 }
1195
1196 return ret;
1197}
1198
1199/* Our parse trees are very unbalanced, so we cannot use a stack to
1200 implement parse tree visits. Instead, we use parent pointers and
1201 some hairy code in these two functions. */
1202static reg_errcode_t
1203postorder (root, fn, extra)
1204 bin_tree_t *root;
1205 reg_errcode_t (fn (void *, bin_tree_t *));
1206 void *extra;
1207{
1208 bin_tree_t *node, *prev;
1209
1210 for (node = root; ; )
1211 {
1212 /* Descend down the tree, preferably to the left (or to the right
1213 if that's the only child). */
1214 while (node->left || node->right)
1215 if (node->left)
1216 node = node->left;
1217 else
1218 node = node->right;
1219
1220 do
1221 {
1222 reg_errcode_t err = fn (extra, node);
1223 if (BE (err != REG_NOERROR, 0))
1224 return err;
1225 if (node->parent == NULL)
1226 return REG_NOERROR;
1227 prev = node;
1228 node = node->parent;
1229 }
1230 /* Go up while we have a node that is reached from the right. */
1231 while (node->right == prev || node->right == NULL);
1232 node = node->right;
1233 }
1234}
1235
1236static reg_errcode_t
1237preorder (root, fn, extra)
1238 bin_tree_t *root;
1239 reg_errcode_t (fn (void *, bin_tree_t *));
1240 void *extra;
1241{
1242 bin_tree_t *node;
1243
1244 for (node = root; ; )
1245 {
1246 reg_errcode_t err = fn (extra, node);
1247 if (BE (err != REG_NOERROR, 0))
1248 return err;
1249
1250 /* Go to the left node, or up and to the right. */
1251 if (node->left)
1252 node = node->left;
1253 else
1254 {
1255 bin_tree_t *prev = NULL;
1256 while (node->right == prev || node->right == NULL)
1257 {
1258 prev = node;
1259 node = node->parent;
1260 if (!node)
1261 return REG_NOERROR;
1262 }
1263 node = node->right;
1264 }
1265 }
1266}
1267
1268/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1269 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1270 backreferences as well. Requires a preorder visit. */
1271static reg_errcode_t
1272optimize_subexps (extra, node)
1273 void *extra;
1274 bin_tree_t *node;
1275{
1276 re_dfa_t *dfa = (re_dfa_t *) extra;
1277
1278 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279 {
1280 int idx = node->token.opr.idx;
1281 node->token.opr.idx = dfa->subexp_map[idx];
1282 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1283 }
1284
1285 else if (node->token.type == SUBEXP
1286 && node->left && node->left->token.type == SUBEXP)
1287 {
1288 int other_idx = node->left->token.opr.idx;
1289
1290 node->left = node->left->left;
1291 if (node->left)
1292 node->left->parent = node;
1293
1294 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1296 dfa->used_bkref_map &= ~(1 << other_idx);
1297 }
1298
1299 return REG_NOERROR;
1300}
1301
1302/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304static reg_errcode_t
1305lower_subexps (extra, node)
1306 void *extra;
1307 bin_tree_t *node;
1308{
1309 regex_t *preg = (regex_t *) extra;
1310 reg_errcode_t err = REG_NOERROR;
1311
1312 if (node->left && node->left->token.type == SUBEXP)
1313 {
1314 node->left = lower_subexp (&err, preg, node->left);
1315 if (node->left)
1316 node->left->parent = node;
1317 }
1318 if (node->right && node->right->token.type == SUBEXP)
1319 {
1320 node->right = lower_subexp (&err, preg, node->right);
1321 if (node->right)
1322 node->right->parent = node;
1323 }
1324
1325 return err;
1326}
1327
1328static bin_tree_t *
1329lower_subexp (err, preg, node)
1330 reg_errcode_t *err;
1331 regex_t *preg;
1332 bin_tree_t *node;
1333{
1334 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1335 bin_tree_t *body = node->left;
1336 bin_tree_t *op, *cls, *tree1, *tree;
1337
1338 if (preg->no_sub
1339 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1340 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1341 return node->left;
1342
1343 /* Convert the SUBEXP node to the concatenation of an
1344 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1345 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348 tree = create_tree (dfa, op, tree1, CONCAT);
1349 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1350 {
1351 *err = REG_ESPACE;
1352 return NULL;
1353 }
1354
1355 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357 return tree;
1358}
1359
1360/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361 nodes. Requires a postorder visit. */
1362static reg_errcode_t
1363calc_first (extra, node)
1364 void *extra;
1365 bin_tree_t *node;
1366{
1367 re_dfa_t *dfa = (re_dfa_t *) extra;
1368 if (node->token.type == CONCAT)
1369 {
1370 node->first = node->left->first;
1371 node->node_idx = node->left->node_idx;
1372 }
1373 else
1374 {
1375 node->first = node;
1376 node->node_idx = re_dfa_add_node (dfa, node->token);
1377 if (BE (node->node_idx == -1, 0))
1378 return REG_ESPACE;
1379 }
1380 return REG_NOERROR;
1381}
1382
1383/* Pass 2: compute NEXT on the tree. Preorder visit. */
1384static reg_errcode_t
1385calc_next (extra, node)
1386 void *extra;
1387 bin_tree_t *node;
1388{
1389 switch (node->token.type)
1390 {
1391 case OP_DUP_ASTERISK:
1392 node->left->next = node;
1393 break;
1394 case CONCAT:
1395 node->left->next = node->right->first;
1396 node->right->next = node->next;
1397 break;
1398 default:
1399 if (node->left)
1400 node->left->next = node->next;
1401 if (node->right)
1402 node->right->next = node->next;
1403 break;
1404 }
1405 return REG_NOERROR;
1406}
1407
1408/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1409static reg_errcode_t
1410link_nfa_nodes (extra, node)
1411 void *extra;
1412 bin_tree_t *node;
1413{
1414 re_dfa_t *dfa = (re_dfa_t *) extra;
1415 int idx = node->node_idx;
1416 reg_errcode_t err = REG_NOERROR;
1417
1418 switch (node->token.type)
1419 {
1420 case CONCAT:
1421 break;
1422
1423 case END_OF_RE:
1424 assert (node->next == NULL);
1425 break;
1426
1427 case OP_DUP_ASTERISK:
1428 case OP_ALT:
1429 {
1430 int left, right;
1431 dfa->has_plural_match = 1;
1432 if (node->left != NULL)
1433 left = node->left->first->node_idx;
1434 else
1435 left = node->next->node_idx;
1436 if (node->right != NULL)
1437 right = node->right->first->node_idx;
1438 else
1439 right = node->next->node_idx;
1440 assert (left > -1);
1441 assert (right > -1);
1442 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1443 }
1444 break;
1445
1446 case ANCHOR:
1447 case OP_OPEN_SUBEXP:
1448 case OP_CLOSE_SUBEXP:
1449 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1450 break;
1451
1452 case OP_BACK_REF:
1453 dfa->nexts[idx] = node->next->node_idx;
1454 if (node->token.type == OP_BACK_REF)
1455 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1456 break;
1457
1458 default:
1459 assert (!IS_EPSILON_NODE (node->token.type));
1460 dfa->nexts[idx] = node->next->node_idx;
1461 break;
1462 }
1463
1464 return err;
1465}
1466
1467/* Duplicate the epsilon closure of the node ROOT_NODE.
1468 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1469 to their own constraint. */
1470
1471static reg_errcode_t
1472duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1473 init_constraint)
1474 re_dfa_t *dfa;
1475 int top_org_node, top_clone_node, root_node;
1476 unsigned int init_constraint;
1477{
1478 reg_errcode_t err;
1479 int org_node, clone_node, ret;
1480 unsigned int constraint = init_constraint;
1481 for (org_node = top_org_node, clone_node = top_clone_node;;)
1482 {
1483 int org_dest, clone_dest;
1484 if (dfa->nodes[org_node].type == OP_BACK_REF)
1485 {
1486 /* If the back reference epsilon-transit, its destination must
1487 also have the constraint. Then duplicate the epsilon closure
1488 of the destination of the back reference, and store it in
1489 edests of the back reference. */
1490 org_dest = dfa->nexts[org_node];
1491 re_node_set_empty (dfa->edests + clone_node);
1492 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1493 if (BE (err != REG_NOERROR, 0))
1494 return err;
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497 if (BE (ret < 0, 0))
1498 return REG_ESPACE;
1499 }
1500 else if (dfa->edests[org_node].nelem == 0)
1501 {
1502 /* In case of the node can't epsilon-transit, don't duplicate the
1503 destination and store the original destination as the
1504 destination of the node. */
1505 dfa->nexts[clone_node] = dfa->nexts[org_node];
1506 break;
1507 }
1508 else if (dfa->edests[org_node].nelem == 1)
1509 {
1510 /* In case of the node can epsilon-transit, and it has only one
1511 destination. */
1512 org_dest = dfa->edests[org_node].elems[0];
1513 re_node_set_empty (dfa->edests + clone_node);
1514 if (dfa->nodes[org_node].type == ANCHOR)
1515 {
1516 /* In case of the node has another constraint, append it. */
1517 if (org_node == root_node && clone_node != org_node)
1518 {
1519 /* ...but if the node is root_node itself, it means the
1520 epsilon closure have a loop, then tie it to the
1521 destination of the root_node. */
1522 ret = re_node_set_insert (dfa->edests + clone_node,
1523 org_dest);
1524 if (BE (ret < 0, 0))
1525 return REG_ESPACE;
1526 break;
1527 }
1528 constraint |= dfa->nodes[org_node].opr.ctx_type;
1529 }
1530 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1531 if (BE (err != REG_NOERROR, 0))
1532 return err;
1533 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (BE (ret < 0, 0))
1535 return REG_ESPACE;
1536 }
1537 else /* dfa->edests[org_node].nelem == 2 */
1538 {
1539 /* In case of the node can epsilon-transit, and it has two
1540 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1541 org_dest = dfa->edests[org_node].elems[0];
1542 re_node_set_empty (dfa->edests + clone_node);
1543 /* Search for a duplicated node which satisfies the constraint. */
1544 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1545 if (clone_dest == -1)
1546 {
1547 /* There are no such a duplicated node, create a new one. */
1548 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1549 if (BE (err != REG_NOERROR, 0))
1550 return err;
1551 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552 if (BE (ret < 0, 0))
1553 return REG_ESPACE;
1554 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1555 root_node, constraint);
1556 if (BE (err != REG_NOERROR, 0))
1557 return err;
1558 }
1559 else
1560 {
1561 /* There are a duplicated node which satisfy the constraint,
1562 use it to avoid infinite loop. */
1563 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 if (BE (ret < 0, 0))
1565 return REG_ESPACE;
1566 }
1567
1568 org_dest = dfa->edests[org_node].elems[1];
1569 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1570 if (BE (err != REG_NOERROR, 0))
1571 return err;
1572 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573 if (BE (ret < 0, 0))
1574 return REG_ESPACE;
1575 }
1576 org_node = org_dest;
1577 clone_node = clone_dest;
1578 }
1579 return REG_NOERROR;
1580}
1581
1582/* Search for a node which is duplicated from the node ORG_NODE, and
1583 satisfies the constraint CONSTRAINT. */
1584
1585static int
1586search_duplicated_node (dfa, org_node, constraint)
1587 re_dfa_t *dfa;
1588 int org_node;
1589 unsigned int constraint;
1590{
1591 int idx;
1592 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1593 {
1594 if (org_node == dfa->org_indices[idx]
1595 && constraint == dfa->nodes[idx].constraint)
1596 return idx; /* Found. */
1597 }
1598 return -1; /* Not found. */
1599}
1600
1601/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1603 otherwise return the error code. */
1604
1605static reg_errcode_t
1606duplicate_node (new_idx, dfa, org_idx, constraint)
1607 re_dfa_t *dfa;
1608 int *new_idx, org_idx;
1609 unsigned int constraint;
1610{
1611 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1612 if (BE (dup_idx == -1, 0))
1613 return REG_ESPACE;
1614 dfa->nodes[dup_idx].constraint = constraint;
1615 if (dfa->nodes[org_idx].type == ANCHOR)
1616 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1617 dfa->nodes[dup_idx].duplicated = 1;
1618
1619 /* Store the index of the original node. */
1620 dfa->org_indices[dup_idx] = org_idx;
1621 *new_idx = dup_idx;
1622 return REG_NOERROR;
1623}
1624
1625static reg_errcode_t
1626calc_inveclosure (dfa)
1627 re_dfa_t *dfa;
1628{
1629 int src, idx, ret;
1630 for (idx = 0; idx < dfa->nodes_len; ++idx)
1631 re_node_set_init_empty (dfa->inveclosures + idx);
1632
1633 for (src = 0; src < dfa->nodes_len; ++src)
1634 {
1635 int *elems = dfa->eclosures[src].elems;
1636 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1637 {
1638 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1639 if (BE (ret == -1, 0))
1640 return REG_ESPACE;
1641 }
1642 }
1643
1644 return REG_NOERROR;
1645}
1646
1647/* Calculate "eclosure" for all the node in DFA. */
1648
1649static reg_errcode_t
1650calc_eclosure (dfa)
1651 re_dfa_t *dfa;
1652{
1653 int node_idx, incomplete;
1654#ifdef DEBUG
1655 assert (dfa->nodes_len > 0);
1656#endif
1657 incomplete = 0;
1658 /* For each nodes, calculate epsilon closure. */
1659 for (node_idx = 0; ; ++node_idx)
1660 {
1661 reg_errcode_t err;
1662 re_node_set eclosure_elem;
1663 if (node_idx == dfa->nodes_len)
1664 {
1665 if (!incomplete)
1666 break;
1667 incomplete = 0;
1668 node_idx = 0;
1669 }
1670
1671#ifdef DEBUG
1672 assert (dfa->eclosures[node_idx].nelem != -1);
1673#endif
1674
1675 /* If we have already calculated, skip it. */
1676 if (dfa->eclosures[node_idx].nelem != 0)
1677 continue;
1678 /* Calculate epsilon closure of `node_idx'. */
1679 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1680 if (BE (err != REG_NOERROR, 0))
1681 return err;
1682
1683 if (dfa->eclosures[node_idx].nelem == 0)
1684 {
1685 incomplete = 1;
1686 re_node_set_free (&eclosure_elem);
1687 }
1688 }
1689 return REG_NOERROR;
1690}
1691
1692/* Calculate epsilon closure of NODE. */
1693
1694static reg_errcode_t
1695calc_eclosure_iter (new_set, dfa, node, root)
1696 re_node_set *new_set;
1697 re_dfa_t *dfa;
1698 int node, root;
1699{
1700 reg_errcode_t err;
1701 unsigned int constraint;
1702 int i, incomplete;
1703 re_node_set eclosure;
1704 incomplete = 0;
1705 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1706 if (BE (err != REG_NOERROR, 0))
1707 return err;
1708
1709 /* This indicates that we are calculating this node now.
1710 We reference this value to avoid infinite loop. */
1711 dfa->eclosures[node].nelem = -1;
1712
1713 constraint = ((dfa->nodes[node].type == ANCHOR)
1714 ? dfa->nodes[node].opr.ctx_type : 0);
1715 /* If the current node has constraints, duplicate all nodes.
1716 Since they must inherit the constraints. */
1717 if (constraint
1718 && dfa->edests[node].nelem
1719 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1720 {
1721 int org_node, cur_node;
1722 org_node = cur_node = node;
1723 err = duplicate_node_closure (dfa, node, node, node, constraint);
1724 if (BE (err != REG_NOERROR, 0))
1725 return err;
1726 }
1727
1728 /* Expand each epsilon destination nodes. */
1729 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1730 for (i = 0; i < dfa->edests[node].nelem; ++i)
1731 {
1732 re_node_set eclosure_elem;
1733 int edest = dfa->edests[node].elems[i];
1734 /* If calculating the epsilon closure of `edest' is in progress,
1735 return intermediate result. */
1736 if (dfa->eclosures[edest].nelem == -1)
1737 {
1738 incomplete = 1;
1739 continue;
1740 }
1741 /* If we haven't calculated the epsilon closure of `edest' yet,
1742 calculate now. Otherwise use calculated epsilon closure. */
1743 if (dfa->eclosures[edest].nelem == 0)
1744 {
1745 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1746 if (BE (err != REG_NOERROR, 0))
1747 return err;
1748 }
1749 else
1750 eclosure_elem = dfa->eclosures[edest];
1751 /* Merge the epsilon closure of `edest'. */
1752 re_node_set_merge (&eclosure, &eclosure_elem);
1753 /* If the epsilon closure of `edest' is incomplete,
1754 the epsilon closure of this node is also incomplete. */
1755 if (dfa->eclosures[edest].nelem == 0)
1756 {
1757 incomplete = 1;
1758 re_node_set_free (&eclosure_elem);
1759 }
1760 }
1761
1762 /* Epsilon closures include itself. */
1763 re_node_set_insert (&eclosure, node);
1764 if (incomplete && !root)
1765 dfa->eclosures[node].nelem = 0;
1766 else
1767 dfa->eclosures[node] = eclosure;
1768 *new_set = eclosure;
1769 return REG_NOERROR;
1770}
1771
1772
1773/* Functions for token which are used in the parser. */
1774
1775/* Fetch a token from INPUT.
1776 We must not use this function inside bracket expressions. */
1777
1778static void
1779fetch_token (result, input, syntax)
1780 re_token_t *result;
1781 re_string_t *input;
1782 reg_syntax_t syntax;
1783{
1784 re_string_skip_bytes (input, peek_token (result, input, syntax));
1785}
1786
1787/* Peek a token from INPUT, and return the length of the token.
1788 We must not use this function inside bracket expressions. */
1789
1790static int
1791peek_token (token, input, syntax)
1792 re_token_t *token;
1793 re_string_t *input;
1794 reg_syntax_t syntax;
1795{
1796 unsigned char c;
1797
1798 if (re_string_eoi (input))
1799 {
1800 token->type = END_OF_RE;
1801 return 0;
1802 }
1803
1804 c = re_string_peek_byte (input, 0);
1805 token->opr.c = c;
1806
1807 token->word_char = 0;
1808#ifdef RE_ENABLE_I18N
1809 token->mb_partial = 0;
1810 if (input->mb_cur_max > 1 &&
1811 !re_string_first_byte (input, re_string_cur_idx (input)))
1812 {
1813 token->type = CHARACTER;
1814 token->mb_partial = 1;
1815 return 1;
1816 }
1817#endif
1818 if (c == '\\')
1819 {
1820 unsigned char c2;
1821 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1822 {
1823 token->type = BACK_SLASH;
1824 return 1;
1825 }
1826
1827 c2 = re_string_peek_byte_case (input, 1);
1828 token->opr.c = c2;
1829 token->type = CHARACTER;
1830#ifdef RE_ENABLE_I18N
1831 if (input->mb_cur_max > 1)
1832 {
1833 wint_t wc = re_string_wchar_at (input,
1834 re_string_cur_idx (input) + 1);
1835 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1836 }
1837 else
1838#endif
1839 token->word_char = IS_WORD_CHAR (c2) != 0;
1840
1841 switch (c2)
1842 {
1843 case '|':
1844 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1845 token->type = OP_ALT;
1846 break;
1847 case '1': case '2': case '3': case '4': case '5':
1848 case '6': case '7': case '8': case '9':
1849 if (!(syntax & RE_NO_BK_REFS))
1850 {
1851 token->type = OP_BACK_REF;
1852 token->opr.idx = c2 - '1';
1853 }
1854 break;
1855 case '<':
1856 if (!(syntax & RE_NO_GNU_OPS))
1857 {
1858 token->type = ANCHOR;
1859 token->opr.ctx_type = WORD_FIRST;
1860 }
1861 break;
1862 case '>':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 {
1865 token->type = ANCHOR;
1866 token->opr.ctx_type = WORD_LAST;
1867 }
1868 break;
1869 case 'b':
1870 if (!(syntax & RE_NO_GNU_OPS))
1871 {
1872 token->type = ANCHOR;
1873 token->opr.ctx_type = WORD_DELIM;
1874 }
1875 break;
1876 case 'B':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 {
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = NOT_WORD_DELIM;
1881 }
1882 break;
1883 case 'w':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_WORD;
1886 break;
1887 case 'W':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = OP_NOTWORD;
1890 break;
1891#ifndef GAWK
1892 case 's':
1893 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = OP_SPACE;
1895 break;
1896 case 'S':
1897 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = OP_NOTSPACE;
1899 break;
1900#endif
1901 case '`':
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 {
1904 token->type = ANCHOR;
1905 token->opr.ctx_type = BUF_FIRST;
1906 }
1907 break;
1908 case '\'':
1909 if (!(syntax & RE_NO_GNU_OPS))
1910 {
1911 token->type = ANCHOR;
1912 token->opr.ctx_type = BUF_LAST;
1913 }
1914 break;
1915 case '(':
1916 if (!(syntax & RE_NO_BK_PARENS))
1917 token->type = OP_OPEN_SUBEXP;
1918 break;
1919 case ')':
1920 if (!(syntax & RE_NO_BK_PARENS))
1921 token->type = OP_CLOSE_SUBEXP;
1922 break;
1923 case '+':
1924 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1925 token->type = OP_DUP_PLUS;
1926 break;
1927 case '?':
1928 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1929 token->type = OP_DUP_QUESTION;
1930 break;
1931 case '{':
1932 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1933 token->type = OP_OPEN_DUP_NUM;
1934 break;
1935 case '}':
1936 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1937 token->type = OP_CLOSE_DUP_NUM;
1938 break;
1939 default:
1940 break;
1941 }
1942 return 2;
1943 }
1944
1945 token->type = CHARACTER;
1946#ifdef RE_ENABLE_I18N
1947 if (input->mb_cur_max > 1)
1948 {
1949 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1950 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1951 }
1952 else
1953#endif
1954 token->word_char = IS_WORD_CHAR (token->opr.c);
1955
1956 switch (c)
1957 {
1958 case '\n':
1959 if (syntax & RE_NEWLINE_ALT)
1960 token->type = OP_ALT;
1961 break;
1962 case '|':
1963 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1964 token->type = OP_ALT;
1965 break;
1966 case '*':
1967 token->type = OP_DUP_ASTERISK;
1968 break;
1969 case '+':
1970 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1971 token->type = OP_DUP_PLUS;
1972 break;
1973 case '?':
1974 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1975 token->type = OP_DUP_QUESTION;
1976 break;
1977 case '{':
1978 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1979 token->type = OP_OPEN_DUP_NUM;
1980 break;
1981 case '}':
1982 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1983 token->type = OP_CLOSE_DUP_NUM;
1984 break;
1985 case '(':
1986 if (syntax & RE_NO_BK_PARENS)
1987 token->type = OP_OPEN_SUBEXP;
1988 break;
1989 case ')':
1990 if (syntax & RE_NO_BK_PARENS)
1991 token->type = OP_CLOSE_SUBEXP;
1992 break;
1993 case '[':
1994 token->type = OP_OPEN_BRACKET;
1995 break;
1996 case '.':
1997 token->type = OP_PERIOD;
1998 break;
1999 case '^':
2000 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2001 re_string_cur_idx (input) != 0)
2002 {
2003 char prev = re_string_peek_byte (input, -1);
2004 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2005 break;
2006 }
2007 token->type = ANCHOR;
2008 token->opr.ctx_type = LINE_FIRST;
2009 break;
2010 case '$':
2011 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2012 re_string_cur_idx (input) + 1 != re_string_length (input))
2013 {
2014 re_token_t next;
2015 re_string_skip_bytes (input, 1);
2016 peek_token (&next, input, syntax);
2017 re_string_skip_bytes (input, -1);
2018 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2019 break;
2020 }
2021 token->type = ANCHOR;
2022 token->opr.ctx_type = LINE_LAST;
2023 break;
2024 default:
2025 break;
2026 }
2027 return 1;
2028}
2029
2030/* Peek a token from INPUT, and return the length of the token.
2031 We must not use this function out of bracket expressions. */
2032
2033static int
2034peek_token_bracket (token, input, syntax)
2035 re_token_t *token;
2036 re_string_t *input;
2037 reg_syntax_t syntax;
2038{
2039 unsigned char c;
2040 if (re_string_eoi (input))
2041 {
2042 token->type = END_OF_RE;
2043 return 0;
2044 }
2045 c = re_string_peek_byte (input, 0);
2046 token->opr.c = c;
2047
2048#ifdef RE_ENABLE_I18N
2049 if (input->mb_cur_max > 1 &&
2050 !re_string_first_byte (input, re_string_cur_idx (input)))
2051 {
2052 token->type = CHARACTER;
2053 return 1;
2054 }
2055#endif /* RE_ENABLE_I18N */
2056
2057 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2058 && re_string_cur_idx (input) + 1 < re_string_length (input))
2059 {
2060 /* In this case, '\' escape a character. */
2061 unsigned char c2;
2062 re_string_skip_bytes (input, 1);
2063 c2 = re_string_peek_byte (input, 0);
2064 token->opr.c = c2;
2065 token->type = CHARACTER;
2066 return 1;
2067 }
2068 if (c == '[') /* '[' is a special char in a bracket exps. */
2069 {
2070 unsigned char c2;
2071 int token_len;
2072 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2073 c2 = re_string_peek_byte (input, 1);
2074 else
2075 c2 = 0;
2076 token->opr.c = c2;
2077 token_len = 2;
2078 switch (c2)
2079 {
2080 case '.':
2081 token->type = OP_OPEN_COLL_ELEM;
2082 break;
2083 case '=':
2084 token->type = OP_OPEN_EQUIV_CLASS;
2085 break;
2086 case ':':
2087 if (syntax & RE_CHAR_CLASSES)
2088 {
2089 token->type = OP_OPEN_CHAR_CLASS;
2090 break;
2091 }
2092 /* else fall through. */
2093 default:
2094 token->type = CHARACTER;
2095 token->opr.c = c;
2096 token_len = 1;
2097 break;
2098 }
2099 return token_len;
2100 }
2101 switch (c)
2102 {
2103 case '-':
2104 token->type = OP_CHARSET_RANGE;
2105 break;
2106 case ']':
2107 token->type = OP_CLOSE_BRACKET;
2108 break;
2109 case '^':
2110 token->type = OP_NON_MATCH_LIST;
2111 break;
2112 default:
2113 token->type = CHARACTER;
2114 }
2115 return 1;
2116}
2117
2118
2119/* Functions for parser. */
2120
2121/* Entry point of the parser.
2122 Parse the regular expression REGEXP and return the structure tree.
2123 If an error is occured, ERR is set by error code, and return NULL.
2124 This function build the following tree, from regular expression <reg_exp>:
2125 CAT
2126 / \
2127 / \
2128 <reg_exp> EOR
2129
2130 CAT means concatenation.
2131 EOR means end of regular expression. */
2132
2133static bin_tree_t *
2134parse (regexp, preg, syntax, err)
2135 re_string_t *regexp;
2136 regex_t *preg;
2137 reg_syntax_t syntax;
2138 reg_errcode_t *err;
2139{
2140 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2141 bin_tree_t *tree, *eor, *root;
2142 re_token_t current_token;
2143 dfa->syntax = syntax;
2144 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2145 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2146 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147 return NULL;
2148 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2149 if (tree != NULL)
2150 root = create_tree (dfa, tree, eor, CONCAT);
2151 else
2152 root = eor;
2153 if (BE (eor == NULL || root == NULL, 0))
2154 {
2155 *err = REG_ESPACE;
2156 return NULL;
2157 }
2158 return root;
2159}
2160
2161/* This function build the following tree, from regular expression
2162 <branch1>|<branch2>:
2163 ALT
2164 / \
2165 / \
2166 <branch1> <branch2>
2167
2168 ALT means alternative, which represents the operator `|'. */
2169
2170static bin_tree_t *
2171parse_reg_exp (regexp, preg, token, syntax, nest, err)
2172 re_string_t *regexp;
2173 regex_t *preg;
2174 re_token_t *token;
2175 reg_syntax_t syntax;
2176 int nest;
2177 reg_errcode_t *err;
2178{
2179 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2180 bin_tree_t *tree, *branch = NULL;
2181 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2182 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2183 return NULL;
2184
2185 while (token->type == OP_ALT)
2186 {
2187 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2188 if (token->type != OP_ALT && token->type != END_OF_RE
2189 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190 {
2191 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2192 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2193 return NULL;
2194 }
2195 else
2196 branch = NULL;
2197 tree = create_tree (dfa, tree, branch, OP_ALT);
2198 if (BE (tree == NULL, 0))
2199 {
2200 *err = REG_ESPACE;
2201 return NULL;
2202 }
2203 }
2204 return tree;
2205}
2206
2207/* This function build the following tree, from regular expression
2208 <exp1><exp2>:
2209 CAT
2210 / \
2211 / \
2212 <exp1> <exp2>
2213
2214 CAT means concatenation. */
2215
2216static bin_tree_t *
2217parse_branch (regexp, preg, token, syntax, nest, err)
2218 re_string_t *regexp;
2219 regex_t *preg;
2220 re_token_t *token;
2221 reg_syntax_t syntax;
2222 int nest;
2223 reg_errcode_t *err;
2224{
2225 bin_tree_t *tree, *exp;
2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2228 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2229 return NULL;
2230
2231 while (token->type != OP_ALT && token->type != END_OF_RE
2232 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2233 {
2234 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2235 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2236 {
2237 return NULL;
2238 }
2239 if (tree != NULL && exp != NULL)
2240 {
2241 tree = create_tree (dfa, tree, exp, CONCAT);
2242 if (tree == NULL)
2243 {
2244 *err = REG_ESPACE;
2245 return NULL;
2246 }
2247 }
2248 else if (tree == NULL)
2249 tree = exp;
2250 /* Otherwise exp == NULL, we don't need to create new tree. */
2251 }
2252 return tree;
2253}
2254
2255/* This function build the following tree, from regular expression a*:
2256 *
2257 |
2258 a
2259*/
2260
2261static bin_tree_t *
2262parse_expression (regexp, preg, token, syntax, nest, err)
2263 re_string_t *regexp;
2264 regex_t *preg;
2265 re_token_t *token;
2266 reg_syntax_t syntax;
2267 int nest;
2268 reg_errcode_t *err;
2269{
2270 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2271 bin_tree_t *tree;
2272 switch (token->type)
2273 {
2274 case CHARACTER:
2275 tree = create_token_tree (dfa, NULL, NULL, token);
2276 if (BE (tree == NULL, 0))
2277 {
2278 *err = REG_ESPACE;
2279 return NULL;
2280 }
2281#ifdef RE_ENABLE_I18N
2282 if (dfa->mb_cur_max > 1)
2283 {
2284 while (!re_string_eoi (regexp)
2285 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2286 {
2287 bin_tree_t *mbc_remain;
2288 fetch_token (token, regexp, syntax);
2289 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2290 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2291 if (BE (mbc_remain == NULL || tree == NULL, 0))
2292 {
2293 *err = REG_ESPACE;
2294 return NULL;
2295 }
2296 }
2297 }
2298#endif
2299 break;
2300 case OP_OPEN_SUBEXP:
2301 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2302 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2303 return NULL;
2304 break;
2305 case OP_OPEN_BRACKET:
2306 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2307 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2308 return NULL;
2309 break;
2310 case OP_BACK_REF:
2311 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2312 {
2313 *err = REG_ESUBREG;
2314 return NULL;
2315 }
2316 dfa->used_bkref_map |= 1 << token->opr.idx;
2317 tree = create_token_tree (dfa, NULL, NULL, token);
2318 if (BE (tree == NULL, 0))
2319 {
2320 *err = REG_ESPACE;
2321 return NULL;
2322 }
2323 ++dfa->nbackref;
2324 dfa->has_mb_node = 1;
2325 break;
2326 case OP_OPEN_DUP_NUM:
2327 if (syntax & RE_CONTEXT_INVALID_DUP)
2328 {
2329 *err = REG_BADRPT;
2330 return NULL;
2331 }
2332 /* FALLTHROUGH */
2333 case OP_DUP_ASTERISK:
2334 case OP_DUP_PLUS:
2335 case OP_DUP_QUESTION:
2336 if (syntax & RE_CONTEXT_INVALID_OPS)
2337 {
2338 *err = REG_BADRPT;
2339 return NULL;
2340 }
2341 else if (syntax & RE_CONTEXT_INDEP_OPS)
2342 {
2343 fetch_token (token, regexp, syntax);
2344 return parse_expression (regexp, preg, token, syntax, nest, err);
2345 }
2346 /* else fall through */
2347 case OP_CLOSE_SUBEXP:
2348 if ((token->type == OP_CLOSE_SUBEXP) &&
2349 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2350 {
2351 *err = REG_ERPAREN;
2352 return NULL;
2353 }
2354 /* else fall through */
2355 case OP_CLOSE_DUP_NUM:
2356 /* We treat it as a normal character. */
2357
2358 /* Then we can these characters as normal characters. */
2359 token->type = CHARACTER;
2360 /* mb_partial and word_char bits should be initialized already
2361 by peek_token. */
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (BE (tree == NULL, 0))
2364 {
2365 *err = REG_ESPACE;
2366 return NULL;
2367 }
2368 break;
2369 case ANCHOR:
2370 if ((token->opr.ctx_type
2371 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2372 && dfa->word_ops_used == 0)
2373 init_word_char (dfa);
2374 if (token->opr.ctx_type == WORD_DELIM
2375 || token->opr.ctx_type == NOT_WORD_DELIM)
2376 {
2377 bin_tree_t *tree_first, *tree_last;
2378 if (token->opr.ctx_type == WORD_DELIM)
2379 {
2380 token->opr.ctx_type = WORD_FIRST;
2381 tree_first = create_token_tree (dfa, NULL, NULL, token);
2382 token->opr.ctx_type = WORD_LAST;
2383 }
2384 else
2385 {
2386 token->opr.ctx_type = INSIDE_WORD;
2387 tree_first = create_token_tree (dfa, NULL, NULL, token);
2388 token->opr.ctx_type = INSIDE_NOTWORD;
2389 }
2390 tree_last = create_token_tree (dfa, NULL, NULL, token);
2391 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2392 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2393 {
2394 *err = REG_ESPACE;
2395 return NULL;
2396 }
2397 }
2398 else
2399 {
2400 tree = create_token_tree (dfa, NULL, NULL, token);
2401 if (BE (tree == NULL, 0))
2402 {
2403 *err = REG_ESPACE;
2404 return NULL;
2405 }
2406 }
2407 /* We must return here, since ANCHORs can't be followed
2408 by repetition operators.
2409 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2410 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2411 fetch_token (token, regexp, syntax);
2412 return tree;
2413 case OP_PERIOD:
2414 tree = create_token_tree (dfa, NULL, NULL, token);
2415 if (BE (tree == NULL, 0))
2416 {
2417 *err = REG_ESPACE;
2418 return NULL;
2419 }
2420 if (dfa->mb_cur_max > 1)
2421 dfa->has_mb_node = 1;
2422 break;
2423 case OP_WORD:
2424 case OP_NOTWORD:
2425 tree = build_charclass_op (dfa, regexp->trans,
2426 (const char *) "alnum",
2427 (const char *) "_",
2428 token->type == OP_NOTWORD, err);
2429 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2430 return NULL;
2431 break;
2432 case OP_SPACE:
2433 case OP_NOTSPACE:
2434 tree = build_charclass_op (dfa, regexp->trans,
2435 (const char *) "space",
2436 (const char *) "",
2437 token->type == OP_NOTSPACE, err);
2438 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2439 return NULL;
2440 break;
2441 case OP_ALT:
2442 case END_OF_RE:
2443 return NULL;
2444 case BACK_SLASH:
2445 *err = REG_EESCAPE;
2446 return NULL;
2447 default:
2448 /* Must not happen? */
2449#ifdef DEBUG
2450 assert (0);
2451#endif
2452 return NULL;
2453 }
2454 fetch_token (token, regexp, syntax);
2455
2456 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2457 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2458 {
2459 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2460 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2461 return NULL;
2462 /* In BRE consecutive duplications are not allowed. */
2463 if ((syntax & RE_CONTEXT_INVALID_DUP)
2464 && (token->type == OP_DUP_ASTERISK
2465 || token->type == OP_OPEN_DUP_NUM))
2466 {
2467 *err = REG_BADRPT;
2468 return NULL;
2469 }
2470 }
2471
2472 return tree;
2473}
2474
2475/* This function build the following tree, from regular expression
2476 (<reg_exp>):
2477 SUBEXP
2478 |
2479 <reg_exp>
2480*/
2481
2482static bin_tree_t *
2483parse_sub_exp (regexp, preg, token, syntax, nest, err)
2484 re_string_t *regexp;
2485 regex_t *preg;
2486 re_token_t *token;
2487 reg_syntax_t syntax;
2488 int nest;
2489 reg_errcode_t *err;
2490{
2491 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2492 bin_tree_t *tree;
2493 size_t cur_nsub;
2494 cur_nsub = preg->re_nsub++;
2495
2496 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2497
2498 /* The subexpression may be a null string. */
2499 if (token->type == OP_CLOSE_SUBEXP)
2500 tree = NULL;
2501 else
2502 {
2503 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2504 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2505 *err = REG_EPAREN;
2506 if (BE (*err != REG_NOERROR, 0))
2507 return NULL;
2508 }
2509 dfa->completed_bkref_map |= 1 << cur_nsub;
2510
2511 tree = create_tree (dfa, tree, NULL, SUBEXP);
2512 if (BE (tree == NULL, 0))
2513 {
2514 *err = REG_ESPACE;
2515 return NULL;
2516 }
2517 tree->token.opr.idx = cur_nsub;
2518 return tree;
2519}
2520
2521/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2522
2523static bin_tree_t *
2524parse_dup_op (elem, regexp, dfa, token, syntax, err)
2525 bin_tree_t *elem;
2526 re_string_t *regexp;
2527 re_dfa_t *dfa;
2528 re_token_t *token;
2529 reg_syntax_t syntax;
2530 reg_errcode_t *err;
2531{
2532 bin_tree_t *tree = NULL, *old_tree = NULL;
2533 int i, start, end, start_idx = re_string_cur_idx (regexp);
2534#ifndef RE_TOKEN_INIT_BUG
2535 re_token_t start_token = *token;
2536#else
2537 re_token_t start_token;
2538
2539 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2540#endif
2541
2542 if (token->type == OP_OPEN_DUP_NUM)
2543 {
2544 end = 0;
2545 start = fetch_number (regexp, token, syntax);
2546 if (start == -1)
2547 {
2548 if (token->type == CHARACTER && token->opr.c == ',')
2549 start = 0; /* We treat "{,m}" as "{0,m}". */
2550 else
2551 {
2552 *err = REG_BADBR; /* <re>{} is invalid. */
2553 return NULL;
2554 }
2555 }
2556 if (BE (start != -2, 1))
2557 {
2558 /* We treat "{n}" as "{n,n}". */
2559 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2560 : ((token->type == CHARACTER && token->opr.c == ',')
2561 ? fetch_number (regexp, token, syntax) : -2));
2562 }
2563 if (BE (start == -2 || end == -2, 0))
2564 {
2565 /* Invalid sequence. */
2566 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2567 {
2568 if (token->type == END_OF_RE)
2569 *err = REG_EBRACE;
2570 else
2571 *err = REG_BADBR;
2572
2573 return NULL;
2574 }
2575
2576 /* If the syntax bit is set, rollback. */
2577 re_string_set_index (regexp, start_idx);
2578 *token = start_token;
2579 token->type = CHARACTER;
2580 /* mb_partial and word_char bits should be already initialized by
2581 peek_token. */
2582 return elem;
2583 }
2584
2585 if (BE (end != -1 && start > end, 0))
2586 {
2587 /* First number greater than second. */
2588 *err = REG_BADBR;
2589 return NULL;
2590 }
2591 }
2592 else
2593 {
2594 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2595 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2596 }
2597
2598 fetch_token (token, regexp, syntax);
2599
2600 if (BE (elem == NULL, 0))
2601 return NULL;
2602 if (BE (start == 0 && end == 0, 0))
2603 {
2604 postorder (elem, free_tree, NULL);
2605 return NULL;
2606 }
2607
2608 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2609 if (BE (start > 0, 0))
2610 {
2611 tree = elem;
2612 for (i = 2; i <= start; ++i)
2613 {
2614 elem = duplicate_tree (elem, dfa);
2615 tree = create_tree (dfa, tree, elem, CONCAT);
2616 if (BE (elem == NULL || tree == NULL, 0))
2617 goto parse_dup_op_espace;
2618 }
2619
2620 if (start == end)
2621 return tree;
2622
2623 /* Duplicate ELEM before it is marked optional. */
2624 elem = duplicate_tree (elem, dfa);
2625 old_tree = tree;
2626 }
2627 else
2628 old_tree = NULL;
2629
2630 if (elem->token.type == SUBEXP)
2631 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2632
2633 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2634 if (BE (tree == NULL, 0))
2635 goto parse_dup_op_espace;
2636
2637 /* This loop is actually executed only when end != -1,
2638 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2639 already created the start+1-th copy. */
2640 for (i = start + 2; i <= end; ++i)
2641 {
2642 elem = duplicate_tree (elem, dfa);
2643 tree = create_tree (dfa, tree, elem, CONCAT);
2644 if (BE (elem == NULL || tree == NULL, 0))
2645 goto parse_dup_op_espace;
2646
2647 tree = create_tree (dfa, tree, NULL, OP_ALT);
2648 if (BE (tree == NULL, 0))
2649 goto parse_dup_op_espace;
2650 }
2651
2652 if (old_tree)
2653 tree = create_tree (dfa, old_tree, tree, CONCAT);
2654
2655 return tree;
2656
2657 parse_dup_op_espace:
2658 *err = REG_ESPACE;
2659 return NULL;
2660}
2661
2662/* Size of the names for collating symbol/equivalence_class/character_class.
2663 I'm not sure, but maybe enough. */
2664#define BRACKET_NAME_BUF_SIZE 32
2665
2666#ifndef _LIBC
2667 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2668 Build the range expression which starts from START_ELEM, and ends
2669 at END_ELEM. The result are written to MBCSET and SBCSET.
2670 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2671 mbcset->range_ends, is a pointer argument sinse we may
2672 update it. */
2673
2674static reg_errcode_t
2675# ifdef RE_ENABLE_I18N
2676build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2677 re_charset_t *mbcset;
2678 int *range_alloc;
2679# else /* not RE_ENABLE_I18N */
2680build_range_exp (sbcset, start_elem, end_elem)
2681# endif /* not RE_ENABLE_I18N */
2682 re_bitset_ptr_t sbcset;
2683 bracket_elem_t *start_elem, *end_elem;
2684{
2685 unsigned int start_ch, end_ch;
2686 /* Equivalence Classes and Character Classes can't be a range start/end. */
2687 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2688 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2689 0))
2690 return REG_ERANGE;
2691
2692 /* We can handle no multi character collating elements without libc
2693 support. */
2694 if (BE ((start_elem->type == COLL_SYM
2695 && strlen ((char *) start_elem->opr.name) > 1)
2696 || (end_elem->type == COLL_SYM
2697 && strlen ((char *) end_elem->opr.name) > 1), 0))
2698 return REG_ECOLLATE;
2699
2700# ifdef RE_ENABLE_I18N
2701 {
2702 wchar_t wc, start_wc, end_wc;
2703 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2704
2705 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2706 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2707 : 0));
2708 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2709 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2710 : 0));
2711 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2712 ? __btowc (start_ch) : start_elem->opr.wch);
2713 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2714 ? __btowc (end_ch) : end_elem->opr.wch);
2715 if (start_wc == WEOF || end_wc == WEOF)
2716 return REG_ECOLLATE;
2717 cmp_buf[0] = start_wc;
2718 cmp_buf[4] = end_wc;
2719 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2720 return REG_ERANGE;
2721
2722 /* Got valid collation sequence values, add them as a new entry.
2723 However, for !_LIBC we have no collation elements: if the
2724 character set is single byte, the single byte character set
2725 that we build below suffices. parse_bracket_exp passes
2726 no MBCSET if dfa->mb_cur_max == 1. */
2727 if (mbcset)
2728 {
2729 /* Check the space of the arrays. */
2730 if (BE (*range_alloc == mbcset->nranges, 0))
2731 {
2732 /* There is not enough space, need realloc. */
2733 wchar_t *new_array_start, *new_array_end;
2734 int new_nranges;
2735
2736 /* +1 in case of mbcset->nranges is 0. */
2737 new_nranges = 2 * mbcset->nranges + 1;
2738 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2739 are NULL if *range_alloc == 0. */
2740 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2741 new_nranges);
2742 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2743 new_nranges);
2744
2745 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2746 return REG_ESPACE;
2747
2748 mbcset->range_starts = new_array_start;
2749 mbcset->range_ends = new_array_end;
2750 *range_alloc = new_nranges;
2751 }
2752
2753 mbcset->range_starts[mbcset->nranges] = start_wc;
2754 mbcset->range_ends[mbcset->nranges++] = end_wc;
2755 }
2756
2757 /* Build the table for single byte characters. */
2758 for (wc = 0; wc < SBC_MAX; ++wc)
2759 {
2760 cmp_buf[2] = wc;
2761 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2762 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2763 bitset_set (sbcset, wc);
2764 }
2765 }
2766# else /* not RE_ENABLE_I18N */
2767 {
2768 unsigned int ch;
2769 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2770 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2771 : 0));
2772 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2773 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2774 : 0));
2775 if (start_ch > end_ch)
2776 return REG_ERANGE;
2777 /* Build the table for single byte characters. */
2778 for (ch = 0; ch < SBC_MAX; ++ch)
2779 if (start_ch <= ch && ch <= end_ch)
2780 bitset_set (sbcset, ch);
2781 }
2782# endif /* not RE_ENABLE_I18N */
2783 return REG_NOERROR;
2784}
2785#endif /* not _LIBC */
2786
2787#ifndef _LIBC
2788/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2789 Build the collating element which is represented by NAME.
2790 The result are written to MBCSET and SBCSET.
2791 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2792 pointer argument since we may update it. */
2793
2794static reg_errcode_t
2795# ifdef RE_ENABLE_I18N
2796build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2797 re_charset_t *mbcset;
2798 int *coll_sym_alloc;
2799# else /* not RE_ENABLE_I18N */
2800build_collating_symbol (sbcset, name)
2801# endif /* not RE_ENABLE_I18N */
2802 re_bitset_ptr_t sbcset;
2803 const unsigned char *name;
2804{
2805 size_t name_len = strlen ((const char *) name);
2806 if (BE (name_len != 1, 0))
2807 return REG_ECOLLATE;
2808 else
2809 {
2810 bitset_set (sbcset, name[0]);
2811 return REG_NOERROR;
2812 }
2813}
2814#endif /* not _LIBC */
2815
2816/* This function parse bracket expression like "[abc]", "[a-c]",
2817 "[[.a-a.]]" etc. */
2818
2819static bin_tree_t *
2820parse_bracket_exp (regexp, dfa, token, syntax, err)
2821 re_string_t *regexp;
2822 re_dfa_t *dfa;
2823 re_token_t *token;
2824 reg_syntax_t syntax;
2825 reg_errcode_t *err;
2826{
2827#ifdef _LIBC
2828 const unsigned char *collseqmb;
2829 const char *collseqwc;
2830 uint32_t nrules;
2831 int32_t table_size;
2832 const int32_t *symb_table;
2833 const unsigned char *extra;
2834
2835 /* Local function for parse_bracket_exp used in _LIBC environement.
2836 Seek the collating symbol entry correspondings to NAME.
2837 Return the index of the symbol in the SYMB_TABLE. */
2838
2839 auto inline int32_t
2840 __attribute ((always_inline))
2841 seek_collating_symbol_entry (name, name_len)
2842 const unsigned char *name;
2843 size_t name_len;
2844 {
2845 int32_t hash = elem_hash ((const char *) name, name_len);
2846 int32_t elem = hash % table_size;
2847 int32_t second = hash % (table_size - 2);
2848 while (symb_table[2 * elem] != 0)
2849 {
2850 /* First compare the hashing value. */
2851 if (symb_table[2 * elem] == hash
2852 /* Compare the length of the name. */
2853 && name_len == extra[symb_table[2 * elem + 1]]
2854 /* Compare the name. */
2855 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2856 name_len) == 0)
2857 {
2858 /* Yep, this is the entry. */
2859 break;
2860 }
2861
2862 /* Next entry. */
2863 elem += second;
2864 }
2865 return elem;
2866 }
2867
2868 /* Local function for parse_bracket_exp used in _LIBC environement.
2869 Look up the collation sequence value of BR_ELEM.
2870 Return the value if succeeded, UINT_MAX otherwise. */
2871
2872 auto inline unsigned int
2873 __attribute ((always_inline))
2874 lookup_collation_sequence_value (br_elem)
2875 bracket_elem_t *br_elem;
2876 {
2877 if (br_elem->type == SB_CHAR)
2878 {
2879 /*
2880 if (MB_CUR_MAX == 1)
2881 */
2882 if (nrules == 0)
2883 return collseqmb[br_elem->opr.ch];
2884 else
2885 {
2886 wint_t wc = __btowc (br_elem->opr.ch);
2887 return __collseq_table_lookup (collseqwc, wc);
2888 }
2889 }
2890 else if (br_elem->type == MB_CHAR)
2891 {
2892 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893 }
2894 else if (br_elem->type == COLL_SYM)
2895 {
2896 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897 if (nrules != 0)
2898 {
2899 int32_t elem, idx;
2900 elem = seek_collating_symbol_entry (br_elem->opr.name,
2901 sym_name_len);
2902 if (symb_table[2 * elem] != 0)
2903 {
2904 /* We found the entry. */
2905 idx = symb_table[2 * elem + 1];
2906 /* Skip the name of collating element name. */
2907 idx += 1 + extra[idx];
2908 /* Skip the byte sequence of the collating element. */
2909 idx += 1 + extra[idx];
2910 /* Adjust for the alignment. */
2911 idx = (idx + 3) & ~3;
2912 /* Skip the multibyte collation sequence value. */
2913 idx += sizeof (unsigned int);
2914 /* Skip the wide char sequence of the collating element. */
2915 idx += sizeof (unsigned int) *
2916 (1 + *(unsigned int *) (extra + idx));
2917 /* Return the collation sequence value. */
2918 return *(unsigned int *) (extra + idx);
2919 }
2920 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2921 {
2922 /* No valid character. Match it as a single byte
2923 character. */
2924 return collseqmb[br_elem->opr.name[0]];
2925 }
2926 }
2927 else if (sym_name_len == 1)
2928 return collseqmb[br_elem->opr.name[0]];
2929 }
2930 return UINT_MAX;
2931 }
2932
2933 /* Local function for parse_bracket_exp used in _LIBC environement.
2934 Build the range expression which starts from START_ELEM, and ends
2935 at END_ELEM. The result are written to MBCSET and SBCSET.
2936 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937 mbcset->range_ends, is a pointer argument sinse we may
2938 update it. */
2939
2940 auto inline reg_errcode_t
2941 __attribute ((always_inline))
2942 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2943 re_charset_t *mbcset;
2944 int *range_alloc;
2945 re_bitset_ptr_t sbcset;
2946 bracket_elem_t *start_elem, *end_elem;
2947 {
2948 unsigned int ch;
2949 uint32_t start_collseq;
2950 uint32_t end_collseq;
2951
2952 /* Equivalence Classes and Character Classes can't be a range
2953 start/end. */
2954 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2955 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2956 0))
2957 return REG_ERANGE;
2958
2959 start_collseq = lookup_collation_sequence_value (start_elem);
2960 end_collseq = lookup_collation_sequence_value (end_elem);
2961 /* Check start/end collation sequence values. */
2962 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2963 return REG_ECOLLATE;
2964 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2965 return REG_ERANGE;
2966
2967 /* Got valid collation sequence values, add them as a new entry.
2968 However, if we have no collation elements, and the character set
2969 is single byte, the single byte character set that we
2970 build below suffices. */
2971 if (nrules > 0 || dfa->mb_cur_max > 1)
2972 {
2973 /* Check the space of the arrays. */
2974 if (BE (*range_alloc == mbcset->nranges, 0))
2975 {
2976 /* There is not enough space, need realloc. */
2977 uint32_t *new_array_start;
2978 uint32_t *new_array_end;
2979 int new_nranges;
2980
2981 /* +1 in case of mbcset->nranges is 0. */
2982 new_nranges = 2 * mbcset->nranges + 1;
2983 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2984 new_nranges);
2985 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2986 new_nranges);
2987
2988 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2989 return REG_ESPACE;
2990
2991 mbcset->range_starts = new_array_start;
2992 mbcset->range_ends = new_array_end;
2993 *range_alloc = new_nranges;
2994 }
2995
2996 mbcset->range_starts[mbcset->nranges] = start_collseq;
2997 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2998 }
2999
3000 /* Build the table for single byte characters. */
3001 for (ch = 0; ch < SBC_MAX; ch++)
3002 {
3003 uint32_t ch_collseq;
3004 /*
3005 if (MB_CUR_MAX == 1)
3006 */
3007 if (nrules == 0)
3008 ch_collseq = collseqmb[ch];
3009 else
3010 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3011 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3012 bitset_set (sbcset, ch);
3013 }
3014 return REG_NOERROR;
3015 }
3016
3017 /* Local function for parse_bracket_exp used in _LIBC environement.
3018 Build the collating element which is represented by NAME.
3019 The result are written to MBCSET and SBCSET.
3020 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3021 pointer argument sinse we may update it. */
3022
3023 auto inline reg_errcode_t
3024 __attribute ((always_inline))
3025 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3026 re_charset_t *mbcset;
3027 int *coll_sym_alloc;
3028 re_bitset_ptr_t sbcset;
3029 const unsigned char *name;
3030 {
3031 int32_t elem, idx;
3032 size_t name_len = strlen ((const char *) name);
3033 if (nrules != 0)
3034 {
3035 elem = seek_collating_symbol_entry (name, name_len);
3036 if (symb_table[2 * elem] != 0)
3037 {
3038 /* We found the entry. */
3039 idx = symb_table[2 * elem + 1];
3040 /* Skip the name of collating element name. */
3041 idx += 1 + extra[idx];
3042 }
3043 else if (symb_table[2 * elem] == 0 && name_len == 1)
3044 {
3045 /* No valid character, treat it as a normal
3046 character. */
3047 bitset_set (sbcset, name[0]);
3048 return REG_NOERROR;
3049 }
3050 else
3051 return REG_ECOLLATE;
3052
3053 /* Got valid collation sequence, add it as a new entry. */
3054 /* Check the space of the arrays. */
3055 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3056 {
3057 /* Not enough, realloc it. */
3058 /* +1 in case of mbcset->ncoll_syms is 0. */
3059 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3060 /* Use realloc since mbcset->coll_syms is NULL
3061 if *alloc == 0. */
3062 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3063 new_coll_sym_alloc);
3064 if (BE (new_coll_syms == NULL, 0))
3065 return REG_ESPACE;
3066 mbcset->coll_syms = new_coll_syms;
3067 *coll_sym_alloc = new_coll_sym_alloc;
3068 }
3069 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3070 return REG_NOERROR;
3071 }
3072 else
3073 {
3074 if (BE (name_len != 1, 0))
3075 return REG_ECOLLATE;
3076 else
3077 {
3078 bitset_set (sbcset, name[0]);
3079 return REG_NOERROR;
3080 }
3081 }
3082 }
3083#endif
3084
3085 re_token_t br_token;
3086 re_bitset_ptr_t sbcset;
3087#ifdef RE_ENABLE_I18N
3088 re_charset_t *mbcset;
3089 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3090 int equiv_class_alloc = 0, char_class_alloc = 0;
3091#endif /* not RE_ENABLE_I18N */
3092 int non_match = 0;
3093 bin_tree_t *work_tree;
3094 int token_len;
3095 int first_round = 1;
3096#ifdef _LIBC
3097 collseqmb = (const unsigned char *)
3098 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3099 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3100 if (nrules)
3101 {
3102 /*
3103 if (MB_CUR_MAX > 1)
3104 */
3105 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3106 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3107 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3108 _NL_COLLATE_SYMB_TABLEMB);
3109 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3110 _NL_COLLATE_SYMB_EXTRAMB);
3111 }
3112#endif
3113 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3114#ifdef RE_ENABLE_I18N
3115 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3116#endif /* RE_ENABLE_I18N */
3117#ifdef RE_ENABLE_I18N
3118 if (BE (sbcset == NULL || mbcset == NULL, 0))
3119#else
3120 if (BE (sbcset == NULL, 0))
3121#endif /* RE_ENABLE_I18N */
3122 {
3123 *err = REG_ESPACE;
3124 return NULL;
3125 }
3126
3127 token_len = peek_token_bracket (token, regexp, syntax);
3128 if (BE (token->type == END_OF_RE, 0))
3129 {
3130 *err = REG_BADPAT;
3131 goto parse_bracket_exp_free_return;
3132 }
3133 if (token->type == OP_NON_MATCH_LIST)
3134 {
3135#ifdef RE_ENABLE_I18N
3136 mbcset->non_match = 1;
3137#endif /* not RE_ENABLE_I18N */
3138 non_match = 1;
3139 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3140 bitset_set (sbcset, '\0');
3141 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3142 token_len = peek_token_bracket (token, regexp, syntax);
3143 if (BE (token->type == END_OF_RE, 0))
3144 {
3145 *err = REG_BADPAT;
3146 goto parse_bracket_exp_free_return;
3147 }
3148 }
3149
3150 /* We treat the first ']' as a normal character. */
3151 if (token->type == OP_CLOSE_BRACKET)
3152 token->type = CHARACTER;
3153
3154 while (1)
3155 {
3156 bracket_elem_t start_elem, end_elem;
3157 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3158 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3159 reg_errcode_t ret;
3160 int token_len2 = 0, is_range_exp = 0;
3161 re_token_t token2;
3162
3163 start_elem.opr.name = start_name_buf;
3164 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165 syntax, first_round);
3166 if (BE (ret != REG_NOERROR, 0))
3167 {
3168 *err = ret;
3169 goto parse_bracket_exp_free_return;
3170 }
3171 first_round = 0;
3172
3173 /* Get information about the next token. We need it in any case. */
3174 token_len = peek_token_bracket (token, regexp, syntax);
3175
3176 /* Do not check for ranges if we know they are not allowed. */
3177 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178 {
3179 if (BE (token->type == END_OF_RE, 0))
3180 {
3181 *err = REG_EBRACK;
3182 goto parse_bracket_exp_free_return;
3183 }
3184 if (token->type == OP_CHARSET_RANGE)
3185 {
3186 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3187 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188 if (BE (token2.type == END_OF_RE, 0))
3189 {
3190 *err = REG_EBRACK;
3191 goto parse_bracket_exp_free_return;
3192 }
3193 if (token2.type == OP_CLOSE_BRACKET)
3194 {
3195 /* We treat the last '-' as a normal character. */
3196 re_string_skip_bytes (regexp, -token_len);
3197 token->type = CHARACTER;
3198 }
3199 else
3200 is_range_exp = 1;
3201 }
3202 }
3203
3204 if (is_range_exp == 1)
3205 {
3206 end_elem.opr.name = end_name_buf;
3207 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208 dfa, syntax, 1);
3209 if (BE (ret != REG_NOERROR, 0))
3210 {
3211 *err = ret;
3212 goto parse_bracket_exp_free_return;
3213 }
3214
3215 token_len = peek_token_bracket (token, regexp, syntax);
3216
3217#ifdef _LIBC
3218 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3219 &start_elem, &end_elem);
3220#else
3221# ifdef RE_ENABLE_I18N
3222 *err = build_range_exp (sbcset,
3223 dfa->mb_cur_max > 1 ? mbcset : NULL,
3224 &range_alloc, &start_elem, &end_elem);
3225# else
3226 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3227# endif
3228#endif /* RE_ENABLE_I18N */
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3231 }
3232 else
3233 {
3234 switch (start_elem.type)
3235 {
3236 case SB_CHAR:
3237 bitset_set (sbcset, start_elem.opr.ch);
3238 break;
3239#ifdef RE_ENABLE_I18N
3240 case MB_CHAR:
3241 /* Check whether the array has enough space. */
3242 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243 {
3244 wchar_t *new_mbchars;
3245 /* Not enough, realloc it. */
3246 /* +1 in case of mbcset->nmbchars is 0. */
3247 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248 /* Use realloc since array is NULL if *alloc == 0. */
3249 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250 mbchar_alloc);
3251 if (BE (new_mbchars == NULL, 0))
3252 goto parse_bracket_exp_espace;
3253 mbcset->mbchars = new_mbchars;
3254 }
3255 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256 break;
3257#endif /* RE_ENABLE_I18N */
3258 case EQUIV_CLASS:
3259 *err = build_equiv_class (sbcset,
3260#ifdef RE_ENABLE_I18N
3261 mbcset, &equiv_class_alloc,
3262#endif /* RE_ENABLE_I18N */
3263 start_elem.opr.name);
3264 if (BE (*err != REG_NOERROR, 0))
3265 goto parse_bracket_exp_free_return;
3266 break;
3267 case COLL_SYM:
3268 *err = build_collating_symbol (sbcset,
3269#ifdef RE_ENABLE_I18N
3270 mbcset, &coll_sym_alloc,
3271#endif /* RE_ENABLE_I18N */
3272 start_elem.opr.name);
3273 if (BE (*err != REG_NOERROR, 0))
3274 goto parse_bracket_exp_free_return;
3275 break;
3276 case CHAR_CLASS:
3277 *err = build_charclass (regexp->trans, sbcset,
3278#ifdef RE_ENABLE_I18N
3279 mbcset, &char_class_alloc,
3280#endif /* RE_ENABLE_I18N */
3281 (const char *) start_elem.opr.name, syntax);
3282 if (BE (*err != REG_NOERROR, 0))
3283 goto parse_bracket_exp_free_return;
3284 break;
3285 default:
3286 assert (0);
3287 break;
3288 }
3289 }
3290 if (BE (token->type == END_OF_RE, 0))
3291 {
3292 *err = REG_EBRACK;
3293 goto parse_bracket_exp_free_return;
3294 }
3295 if (token->type == OP_CLOSE_BRACKET)
3296 break;
3297 }
3298
3299 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3300
3301 /* If it is non-matching list. */
3302 if (non_match)
3303 bitset_not (sbcset);
3304
3305#ifdef RE_ENABLE_I18N
3306 /* Ensure only single byte characters are set. */
3307 if (dfa->mb_cur_max > 1)
3308 bitset_mask (sbcset, dfa->sb_char);
3309
3310 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3311 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3312 || mbcset->non_match)))
3313 {
3314 bin_tree_t *mbc_tree;
3315 int sbc_idx;
3316 /* Build a tree for complex bracket. */
3317 dfa->has_mb_node = 1;
3318 br_token.type = COMPLEX_BRACKET;
3319 br_token.opr.mbcset = mbcset;
3320 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3321 if (BE (mbc_tree == NULL, 0))
3322 goto parse_bracket_exp_espace;
3323 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3324 if (sbcset[sbc_idx])
3325 break;
3326 /* If there are no bits set in sbcset, there is no point
3327 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3328 if (sbc_idx < BITSET_UINTS)
3329 {
3330 /* Build a tree for simple bracket. */
3331 br_token.type = SIMPLE_BRACKET;
3332 br_token.opr.sbcset = sbcset;
3333 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3334 if (BE (work_tree == NULL, 0))
3335 goto parse_bracket_exp_espace;
3336
3337 /* Then join them by ALT node. */
3338 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3339 if (BE (work_tree == NULL, 0))
3340 goto parse_bracket_exp_espace;
3341 }
3342 else
3343 {
3344 re_free (sbcset);
3345 work_tree = mbc_tree;
3346 }
3347 }
3348 else
3349#endif /* not RE_ENABLE_I18N */
3350 {
3351#ifdef RE_ENABLE_I18N
3352 free_charset (mbcset);
3353#endif
3354 /* Build a tree for simple bracket. */
3355 br_token.type = SIMPLE_BRACKET;
3356 br_token.opr.sbcset = sbcset;
3357 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3358 if (BE (work_tree == NULL, 0))
3359 goto parse_bracket_exp_espace;
3360 }
3361 return work_tree;
3362
3363 parse_bracket_exp_espace:
3364 *err = REG_ESPACE;
3365 parse_bracket_exp_free_return:
3366 re_free (sbcset);
3367#ifdef RE_ENABLE_I18N
3368 free_charset (mbcset);
3369#endif /* RE_ENABLE_I18N */
3370 return NULL;
3371}
3372
3373/* Parse an element in the bracket expression. */
3374
3375static reg_errcode_t
3376parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3377 accept_hyphen)
3378 bracket_elem_t *elem;
3379 re_string_t *regexp;
3380 re_token_t *token;
3381 int token_len;
3382 re_dfa_t *dfa;
3383 reg_syntax_t syntax;
3384 int accept_hyphen;
3385{
3386#ifdef RE_ENABLE_I18N
3387 int cur_char_size;
3388 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3389 if (cur_char_size > 1)
3390 {
3391 elem->type = MB_CHAR;
3392 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3393 re_string_skip_bytes (regexp, cur_char_size);
3394 return REG_NOERROR;
3395 }
3396#endif /* RE_ENABLE_I18N */
3397 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3398 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3399 || token->type == OP_OPEN_EQUIV_CLASS)
3400 return parse_bracket_symbol (elem, regexp, token);
3401 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3402 {
3403 /* A '-' must only appear as anything but a range indicator before
3404 the closing bracket. Everything else is an error. */
3405 re_token_t token2;
3406 (void) peek_token_bracket (&token2, regexp, syntax);
3407 if (token2.type != OP_CLOSE_BRACKET)
3408 /* The actual error value is not standardized since this whole
3409 case is undefined. But ERANGE makes good sense. */
3410 return REG_ERANGE;
3411 }
3412 elem->type = SB_CHAR;
3413 elem->opr.ch = token->opr.c;
3414 return REG_NOERROR;
3415}
3416
3417/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3418 such as [:<character_class>:], [.<collating_element>.], and
3419 [=<equivalent_class>=]. */
3420
3421static reg_errcode_t
3422parse_bracket_symbol (elem, regexp, token)
3423 bracket_elem_t *elem;
3424 re_string_t *regexp;
3425 re_token_t *token;
3426{
3427 unsigned char ch, delim = token->opr.c;
3428 int i = 0;
3429 if (re_string_eoi(regexp))
3430 return REG_EBRACK;
3431 for (;; ++i)
3432 {
3433 if (i >= BRACKET_NAME_BUF_SIZE)
3434 return REG_EBRACK;
3435 if (token->type == OP_OPEN_CHAR_CLASS)
3436 ch = re_string_fetch_byte_case (regexp);
3437 else
3438 ch = re_string_fetch_byte (regexp);
3439 if (re_string_eoi(regexp))
3440 return REG_EBRACK;
3441 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3442 break;
3443 elem->opr.name[i] = ch;
3444 }
3445 re_string_skip_bytes (regexp, 1);
3446 elem->opr.name[i] = '\0';
3447 switch (token->type)
3448 {
3449 case OP_OPEN_COLL_ELEM:
3450 elem->type = COLL_SYM;
3451 break;
3452 case OP_OPEN_EQUIV_CLASS:
3453 elem->type = EQUIV_CLASS;
3454 break;
3455 case OP_OPEN_CHAR_CLASS:
3456 elem->type = CHAR_CLASS;
3457 break;
3458 default:
3459 break;
3460 }
3461 return REG_NOERROR;
3462}
3463
3464 /* Helper function for parse_bracket_exp.
3465 Build the equivalence class which is represented by NAME.
3466 The result are written to MBCSET and SBCSET.
3467 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3468 is a pointer argument sinse we may update it. */
3469
3470static reg_errcode_t
3471#ifdef RE_ENABLE_I18N
3472build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3473 re_charset_t *mbcset;
3474 int *equiv_class_alloc;
3475#else /* not RE_ENABLE_I18N */
3476build_equiv_class (sbcset, name)
3477#endif /* not RE_ENABLE_I18N */
3478 re_bitset_ptr_t sbcset;
3479 const unsigned char *name;
3480{
3481#if defined _LIBC
3482 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3483 if (nrules != 0)
3484 {
3485 const int32_t *table, *indirect;
3486 const unsigned char *weights, *extra, *cp;
3487 unsigned char char_buf[2];
3488 int32_t idx1, idx2;
3489 unsigned int ch;
3490 size_t len;
3491 /* This #include defines a local function! */
3492# include <locale/weight.h>
3493 /* Calculate the index for equivalence class. */
3494 cp = name;
3495 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3496 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3497 _NL_COLLATE_WEIGHTMB);
3498 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3499 _NL_COLLATE_EXTRAMB);
3500 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3501 _NL_COLLATE_INDIRECTMB);
3502 idx1 = findidx (&cp);
3503 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3504 /* This isn't a valid character. */
3505 return REG_ECOLLATE;
3506
3507 /* Build single byte matcing table for this equivalence class. */
3508 char_buf[1] = (unsigned char) '\0';
3509 len = weights[idx1];
3510 for (ch = 0; ch < SBC_MAX; ++ch)
3511 {
3512 char_buf[0] = ch;
3513 cp = char_buf;
3514 idx2 = findidx (&cp);
3515/*
3516 idx2 = table[ch];
3517*/
3518 if (idx2 == 0)
3519 /* This isn't a valid character. */
3520 continue;
3521 if (len == weights[idx2])
3522 {
3523 int cnt = 0;
3524 while (cnt <= len &&
3525 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3526 ++cnt;
3527
3528 if (cnt > len)
3529 bitset_set (sbcset, ch);
3530 }
3531 }
3532 /* Check whether the array has enough space. */
3533 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3534 {
3535 /* Not enough, realloc it. */
3536 /* +1 in case of mbcset->nequiv_classes is 0. */
3537 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3538 /* Use realloc since the array is NULL if *alloc == 0. */
3539 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3540 int32_t,
3541 new_equiv_class_alloc);
3542 if (BE (new_equiv_classes == NULL, 0))
3543 return REG_ESPACE;
3544 mbcset->equiv_classes = new_equiv_classes;
3545 *equiv_class_alloc = new_equiv_class_alloc;
3546 }
3547 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3548 }
3549 else
3550#endif /* _LIBC */
3551 {
3552 if (BE (strlen ((const char *) name) != 1, 0))
3553 return REG_ECOLLATE;
3554 bitset_set (sbcset, *name);
3555 }
3556 return REG_NOERROR;
3557}
3558
3559 /* Helper function for parse_bracket_exp.
3560 Build the character class which is represented by NAME.
3561 The result are written to MBCSET and SBCSET.
3562 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3563 is a pointer argument sinse we may update it. */
3564
3565static reg_errcode_t
3566#ifdef RE_ENABLE_I18N
3567build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3568 re_charset_t *mbcset;
3569 int *char_class_alloc;
3570#else /* not RE_ENABLE_I18N */
3571build_charclass (trans, sbcset, class_name, syntax)
3572#endif /* not RE_ENABLE_I18N */
3573 unsigned RE_TRANSLATE_TYPE trans;
3574 re_bitset_ptr_t sbcset;
3575 const char *class_name;
3576 reg_syntax_t syntax;
3577{
3578 int i;
3579
3580 /* In case of REG_ICASE "upper" and "lower" match the both of
3581 upper and lower cases. */
3582 if ((syntax & RE_ICASE)
3583 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3584 class_name = "alpha";
3585
3586#ifdef RE_ENABLE_I18N
3587 /* Check the space of the arrays. */
3588 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3589 {
3590 /* Not enough, realloc it. */
3591 /* +1 in case of mbcset->nchar_classes is 0. */
3592 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3593 /* Use realloc since array is NULL if *alloc == 0. */
3594 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3595 new_char_class_alloc);
3596 if (BE (new_char_classes == NULL, 0))
3597 return REG_ESPACE;
3598 mbcset->char_classes = new_char_classes;
3599 *char_class_alloc = new_char_class_alloc;
3600 }
3601 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3602#endif /* RE_ENABLE_I18N */
3603
3604#define BUILD_CHARCLASS_LOOP(ctype_func) \
3605 for (i = 0; i < SBC_MAX; ++i) \
3606 { \
3607 if (ctype_func (i)) \
3608 { \
3609 int ch = trans ? trans[i] : i; \
3610 bitset_set (sbcset, ch); \
3611 } \
3612 }
3613
3614 if (strcmp (class_name, "alnum") == 0)
3615 BUILD_CHARCLASS_LOOP (isalnum)
3616 else if (strcmp (class_name, "cntrl") == 0)
3617 BUILD_CHARCLASS_LOOP (iscntrl)
3618 else if (strcmp (class_name, "lower") == 0)
3619 BUILD_CHARCLASS_LOOP (islower)
3620 else if (strcmp (class_name, "space") == 0)
3621 BUILD_CHARCLASS_LOOP (isspace)
3622 else if (strcmp (class_name, "alpha") == 0)
3623 BUILD_CHARCLASS_LOOP (isalpha)
3624 else if (strcmp (class_name, "digit") == 0)
3625 BUILD_CHARCLASS_LOOP (isdigit)
3626 else if (strcmp (class_name, "print") == 0)
3627 BUILD_CHARCLASS_LOOP (isprint)
3628 else if (strcmp (class_name, "upper") == 0)
3629 BUILD_CHARCLASS_LOOP (isupper)
3630#ifndef GAWK
3631 else if (strcmp (class_name, "blank") == 0)
3632 BUILD_CHARCLASS_LOOP (isblank)
3633#else
3634 /* see comments above */
3635 else if (strcmp (class_name, "blank") == 0)
3636 BUILD_CHARCLASS_LOOP (is_blank)
3637#endif
3638 else if (strcmp (class_name, "graph") == 0)
3639 BUILD_CHARCLASS_LOOP (isgraph)
3640 else if (strcmp (class_name, "punct") == 0)
3641 BUILD_CHARCLASS_LOOP (ispunct)
3642 else if (strcmp (class_name, "xdigit") == 0)
3643 BUILD_CHARCLASS_LOOP (isxdigit)
3644 else
3645 return REG_ECTYPE;
3646
3647 return REG_NOERROR;
3648}
3649
3650static bin_tree_t *
3651build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3652 re_dfa_t *dfa;
3653 unsigned RE_TRANSLATE_TYPE trans;
3654 const char *class_name;
3655 const char *extra;
3656 int non_match;
3657 reg_errcode_t *err;
3658{
3659 re_bitset_ptr_t sbcset;
3660#ifdef RE_ENABLE_I18N
3661 re_charset_t *mbcset;
3662 int alloc = 0;
3663#endif /* not RE_ENABLE_I18N */
3664 reg_errcode_t ret;
3665 re_token_t br_token;
3666 bin_tree_t *tree;
3667
3668 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3669#ifdef RE_ENABLE_I18N
3670 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3671#endif /* RE_ENABLE_I18N */
3672
3673#ifdef RE_ENABLE_I18N
3674 if (BE (sbcset == NULL || mbcset == NULL, 0))
3675#else /* not RE_ENABLE_I18N */
3676 if (BE (sbcset == NULL, 0))
3677#endif /* not RE_ENABLE_I18N */
3678 {
3679 *err = REG_ESPACE;
3680 return NULL;
3681 }
3682
3683 if (non_match)
3684 {
3685#ifdef RE_ENABLE_I18N
3686 /*
3687 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3688 bitset_set(cset->sbcset, '\0');
3689 */
3690 mbcset->non_match = 1;
3691#endif /* not RE_ENABLE_I18N */
3692 }
3693
3694 /* We don't care the syntax in this case. */
3695 ret = build_charclass (trans, sbcset,
3696#ifdef RE_ENABLE_I18N
3697 mbcset, &alloc,
3698#endif /* RE_ENABLE_I18N */
3699 class_name, 0);
3700
3701 if (BE (ret != REG_NOERROR, 0))
3702 {
3703 re_free (sbcset);
3704#ifdef RE_ENABLE_I18N
3705 free_charset (mbcset);
3706#endif /* RE_ENABLE_I18N */
3707 *err = ret;
3708 return NULL;
3709 }
3710 /* \w match '_' also. */
3711 for (; *extra; extra++)
3712 bitset_set (sbcset, *extra);
3713
3714 /* If it is non-matching list. */
3715 if (non_match)
3716 bitset_not (sbcset);
3717
3718#ifdef RE_ENABLE_I18N
3719 /* Ensure only single byte characters are set. */
3720 if (dfa->mb_cur_max > 1)
3721 bitset_mask (sbcset, dfa->sb_char);
3722#endif
3723
3724 /* Build a tree for simple bracket. */
3725 br_token.type = SIMPLE_BRACKET;
3726 br_token.opr.sbcset = sbcset;
3727 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728 if (BE (tree == NULL, 0))
3729 goto build_word_op_espace;
3730
3731#ifdef RE_ENABLE_I18N
3732 if (dfa->mb_cur_max > 1)
3733 {
3734 bin_tree_t *mbc_tree;
3735 /* Build a tree for complex bracket. */
3736 br_token.type = COMPLEX_BRACKET;
3737 br_token.opr.mbcset = mbcset;
3738 dfa->has_mb_node = 1;
3739 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3740 if (BE (mbc_tree == NULL, 0))
3741 goto build_word_op_espace;
3742 /* Then join them by ALT node. */
3743 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3744 if (BE (mbc_tree != NULL, 1))
3745 return tree;
3746 }
3747 else
3748 {
3749 free_charset (mbcset);
3750 return tree;
3751 }
3752#else /* not RE_ENABLE_I18N */
3753 return tree;
3754#endif /* not RE_ENABLE_I18N */
3755
3756 build_word_op_espace:
3757 re_free (sbcset);
3758#ifdef RE_ENABLE_I18N
3759 free_charset (mbcset);
3760#endif /* RE_ENABLE_I18N */
3761 *err = REG_ESPACE;
3762 return NULL;
3763}
3764
3765/* This is intended for the expressions like "a{1,3}".
3766 Fetch a number from `input', and return the number.
3767 Return -1, if the number field is empty like "{,1}".
3768 Return -2, If an error is occured. */
3769
3770static int
3771fetch_number (input, token, syntax)
3772 re_string_t *input;
3773 re_token_t *token;
3774 reg_syntax_t syntax;
3775{
3776 int num = -1;
3777 unsigned char c;
3778 while (1)
3779 {
3780 fetch_token (token, input, syntax);
3781 c = token->opr.c;
3782 if (BE (token->type == END_OF_RE, 0))
3783 return -2;
3784 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3785 break;
3786 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3787 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3788 num = (num > RE_DUP_MAX) ? -2 : num;
3789 }
3790 return num;
3791}
3792
3793
3794#ifdef RE_ENABLE_I18N
3795static void
3796free_charset (re_charset_t *cset)
3797{
3798 re_free (cset->mbchars);
3799# ifdef _LIBC
3800 re_free (cset->coll_syms);
3801 re_free (cset->equiv_classes);
3802 re_free (cset->range_starts);
3803 re_free (cset->range_ends);
3804# endif
3805 re_free (cset->char_classes);
3806 re_free (cset);
3807}
3808#endif /* RE_ENABLE_I18N */
3809
3810
3811/* Functions for binary tree operation. */
3812
3813/* Create a tree node. */
3814
3815static bin_tree_t *
3816create_tree (dfa, left, right, type)
3817 re_dfa_t *dfa;
3818 bin_tree_t *left;
3819 bin_tree_t *right;
3820 re_token_type_t type;
3821{
3822 re_token_t t;
3823 t.type = type;
3824 return create_token_tree (dfa, left, right, &t);
3825}
3826
3827static bin_tree_t *
3828create_token_tree (dfa, left, right, token)
3829 re_dfa_t *dfa;
3830 bin_tree_t *left;
3831 bin_tree_t *right;
3832 const re_token_t *token;
3833{
3834 bin_tree_t *tree;
3835 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3836 {
3837 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3838
3839 if (storage == NULL)
3840 return NULL;
3841 storage->next = dfa->str_tree_storage;
3842 dfa->str_tree_storage = storage;
3843 dfa->str_tree_storage_idx = 0;
3844 }
3845 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3846
3847 tree->parent = NULL;
3848 tree->left = left;
3849 tree->right = right;
3850 tree->token = *token;
3851 tree->token.duplicated = 0;
3852 tree->token.opt_subexp = 0;
3853 tree->first = NULL;
3854 tree->next = NULL;
3855 tree->node_idx = -1;
3856
3857 if (left != NULL)
3858 left->parent = tree;
3859 if (right != NULL)
3860 right->parent = tree;
3861 return tree;
3862}
3863
3864/* Mark the tree SRC as an optional subexpression.
3865 To be called from preorder or postorder. */
3866
3867static reg_errcode_t
3868mark_opt_subexp (extra, node)
3869 void *extra;
3870 bin_tree_t *node;
3871{
3872 int idx = (int) (long) extra;
3873 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3874 node->token.opt_subexp = 1;
3875
3876 return REG_NOERROR;
3877}
3878
3879/* Free the allocated memory inside NODE. */
3880
3881static void
3882free_token (re_token_t *node)
3883{
3884#ifdef RE_ENABLE_I18N
3885 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3886 free_charset (node->opr.mbcset);
3887 else
3888#endif /* RE_ENABLE_I18N */
3889 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3890 re_free (node->opr.sbcset);
3891}
3892
3893/* Worker function for tree walking. Free the allocated memory inside NODE
3894 and its children. */
3895
3896static reg_errcode_t
3897free_tree (void *extra, bin_tree_t *node)
3898{
3899 free_token (&node->token);
3900 return REG_NOERROR;
3901}
3902
3903
3904/* Duplicate the node SRC, and return new node. This is a preorder
3905 visit similar to the one implemented by the generic visitor, but
3906 we need more infrastructure to maintain two parallel trees --- so,
3907 it's easier to duplicate. */
3908
3909static bin_tree_t *
3910duplicate_tree (root, dfa)
3911 const bin_tree_t *root;
3912 re_dfa_t *dfa;
3913{
3914 const bin_tree_t *node;
3915 bin_tree_t *dup_root;
3916 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3917
3918 for (node = root; ; )
3919 {
3920 /* Create a new tree and link it back to the current parent. */
3921 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3922 if (*p_new == NULL)
3923 return NULL;
3924 (*p_new)->parent = dup_node;
3925 (*p_new)->token.duplicated = 1;
3926 dup_node = *p_new;
3927
3928 /* Go to the left node, or up and to the right. */
3929 if (node->left)
3930 {
3931 node = node->left;
3932 p_new = &dup_node->left;
3933 }
3934 else
3935 {
3936 const bin_tree_t *prev = NULL;
3937 while (node->right == prev || node->right == NULL)
3938 {
3939 prev = node;
3940 node = node->parent;
3941 dup_node = dup_node->parent;
3942 if (!node)
3943 return dup_root;
3944 }
3945 node = node->right;
3946 p_new = &dup_node->right;
3947 }
3948 }
3949}
Note: See TracBrowser for help on using the repository browser.