source: trunk/essentials/app-arch/tar/lib/regcomp.c

Last change on this file was 3342, checked in by bird, 18 years ago

tar 1.16.1

File size: 109.1 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99#else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123
124/* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
128
129static const char __re_error_msgid[] =
130 {
131#define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
181 };
182
183static const size_t __re_error_msgid_idx[] =
184 {
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
202 };
203
204
205/* Entry points for GNU code. */
206
207/* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
210
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
213
214#ifdef _LIBC
215const char *
216re_compile_pattern (pattern, length, bufp)
217 const char *pattern;
218 size_t length;
219 struct re_pattern_buffer *bufp;
220#else /* size_t might promote */
221const char *
222re_compile_pattern (const char *pattern, size_t length,
223 struct re_pattern_buffer *bufp)
224#endif
225{
226 reg_errcode_t ret;
227
228 /* And GNU code determines whether or not to get register information
229 by passing null for the REGS argument to re_match, etc., not by
230 setting no_sub, unless RE_NO_SUB is set. */
231 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232
233 /* Match anchors at newline. */
234 bufp->newline_anchor = 1;
235
236 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237
238 if (!ret)
239 return NULL;
240 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241}
242#ifdef _LIBC
243weak_alias (__re_compile_pattern, re_compile_pattern)
244#endif
245
246/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
247 also be assigned to arbitrarily: each pattern buffer stores its own
248 syntax, so it can be changed between regex compilations. */
249/* This has no initializer because initialized variables in Emacs
250 become read-only after dumping. */
251reg_syntax_t re_syntax_options;
252
253
254/* Specify the precise syntax of regexps for compilation. This provides
255 for compatibility for various utilities which historically have
256 different, incompatible syntaxes.
257
258 The argument SYNTAX is a bit mask comprised of the various bits
259 defined in regex.h. We return the old syntax. */
260
261reg_syntax_t
262re_set_syntax (syntax)
263 reg_syntax_t syntax;
264{
265 reg_syntax_t ret = re_syntax_options;
266
267 re_syntax_options = syntax;
268 return ret;
269}
270#ifdef _LIBC
271weak_alias (__re_set_syntax, re_set_syntax)
272#endif
273
274int
275re_compile_fastmap (bufp)
276 struct re_pattern_buffer *bufp;
277{
278 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
279 char *fastmap = bufp->fastmap;
280
281 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
282 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
283 if (dfa->init_state != dfa->init_state_word)
284 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
285 if (dfa->init_state != dfa->init_state_nl)
286 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
287 if (dfa->init_state != dfa->init_state_begbuf)
288 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
289 bufp->fastmap_accurate = 1;
290 return 0;
291}
292#ifdef _LIBC
293weak_alias (__re_compile_fastmap, re_compile_fastmap)
294#endif
295
296static inline void
297__attribute ((always_inline))
298re_set_fastmap (char *fastmap, bool icase, int ch)
299{
300 fastmap[ch] = 1;
301 if (icase)
302 fastmap[tolower (ch)] = 1;
303}
304
305/* Helper function for re_compile_fastmap.
306 Compile fastmap for the initial_state INIT_STATE. */
307
308static void
309re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
310 char *fastmap)
311{
312 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
313 Idx node_cnt;
314 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
315 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 {
317 Idx node = init_state->nodes.elems[node_cnt];
318 re_token_type_t type = dfa->nodes[node].type;
319
320 if (type == CHARACTER)
321 {
322 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
323#ifdef RE_ENABLE_I18N
324 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 {
326 unsigned char buf[MB_LEN_MAX];
327 unsigned char *p;
328 wchar_t wc;
329 mbstate_t state;
330
331 p = buf;
332 *p++ = dfa->nodes[node].opr.c;
333 while (++node < dfa->nodes_len
334 && dfa->nodes[node].type == CHARACTER
335 && dfa->nodes[node].mb_partial)
336 *p++ = dfa->nodes[node].opr.c;
337 memset (&state, '\0', sizeof (state));
338 if (mbrtowc (&wc, (const char *) buf, p - buf,
339 &state) == p - buf
340 && (__wcrtomb ((char *) buf, towlower (wc), &state)
341 != (size_t) -1))
342 re_set_fastmap (fastmap, false, buf[0]);
343 }
344#endif
345 }
346 else if (type == SIMPLE_BRACKET)
347 {
348 int i, ch;
349 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 {
351 int j;
352 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
353 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
354 if (w & ((bitset_word_t) 1 << j))
355 re_set_fastmap (fastmap, icase, ch);
356 }
357 }
358#ifdef RE_ENABLE_I18N
359 else if (type == COMPLEX_BRACKET)
360 {
361 Idx i;
362 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
363 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
364 || cset->nranges || cset->nchar_classes)
365 {
366# ifdef _LIBC
367 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
368 {
369 /* In this case we want to catch the bytes which are
370 the first byte of any collation elements.
371 e.g. In da_DK, we want to catch 'a' since "aa"
372 is a valid collation element, and don't catch
373 'b' since 'b' is the only collation element
374 which starts from 'b'. */
375 const int32_t *table = (const int32_t *)
376 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
377 for (i = 0; i < SBC_MAX; ++i)
378 if (table[i] < 0)
379 re_set_fastmap (fastmap, icase, i);
380 }
381# else
382 if (dfa->mb_cur_max > 1)
383 for (i = 0; i < SBC_MAX; ++i)
384 if (__btowc (i) == WEOF)
385 re_set_fastmap (fastmap, icase, i);
386# endif /* not _LIBC */
387 }
388 for (i = 0; i < cset->nmbchars; ++i)
389 {
390 char buf[256];
391 mbstate_t state;
392 memset (&state, '\0', sizeof (state));
393 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
394 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
395 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
396 {
397 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
398 != (size_t) -1)
399 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
400 }
401 }
402 }
403#endif /* RE_ENABLE_I18N */
404 else if (type == OP_PERIOD
405#ifdef RE_ENABLE_I18N
406 || type == OP_UTF8_PERIOD
407#endif /* RE_ENABLE_I18N */
408 || type == END_OF_RE)
409 {
410 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
411 if (type == END_OF_RE)
412 bufp->can_be_null = 1;
413 return;
414 }
415 }
416}
417
418
419/* Entry point for POSIX code. */
420/* regcomp takes a regular expression as a string and compiles it.
421
422 PREG is a regex_t *. We do not expect any fields to be initialized,
423 since POSIX says we shouldn't. Thus, we set
424
425 `buffer' to the compiled pattern;
426 `used' to the length of the compiled pattern;
427 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
428 REG_EXTENDED bit in CFLAGS is set; otherwise, to
429 RE_SYNTAX_POSIX_BASIC;
430 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
431 `fastmap' to an allocated space for the fastmap;
432 `fastmap_accurate' to zero;
433 `re_nsub' to the number of subexpressions in PATTERN.
434
435 PATTERN is the address of the pattern string.
436
437 CFLAGS is a series of bits which affect compilation.
438
439 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
440 use POSIX basic syntax.
441
442 If REG_NEWLINE is set, then . and [^...] don't match newline.
443 Also, regexec will try a match beginning after every newline.
444
445 If REG_ICASE is set, then we considers upper- and lowercase
446 versions of letters to be equivalent when matching.
447
448 If REG_NOSUB is set, then when PREG is passed to regexec, that
449 routine will report only success or failure, and nothing about the
450 registers.
451
452 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
453 the return codes and their meanings.) */
454
455int
456regcomp (preg, pattern, cflags)
457 regex_t *__restrict preg;
458 const char *__restrict pattern;
459 int cflags;
460{
461 reg_errcode_t ret;
462 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
463 : RE_SYNTAX_POSIX_BASIC);
464
465 preg->buffer = NULL;
466 preg->allocated = 0;
467 preg->used = 0;
468
469 /* Try to allocate space for the fastmap. */
470 preg->fastmap = re_malloc (char, SBC_MAX);
471 if (BE (preg->fastmap == NULL, 0))
472 return REG_ESPACE;
473
474 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
475
476 /* If REG_NEWLINE is set, newlines are treated differently. */
477 if (cflags & REG_NEWLINE)
478 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
479 syntax &= ~RE_DOT_NEWLINE;
480 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
481 /* It also changes the matching behavior. */
482 preg->newline_anchor = 1;
483 }
484 else
485 preg->newline_anchor = 0;
486 preg->no_sub = !!(cflags & REG_NOSUB);
487 preg->translate = NULL;
488
489 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
490
491 /* POSIX doesn't distinguish between an unmatched open-group and an
492 unmatched close-group: both are REG_EPAREN. */
493 if (ret == REG_ERPAREN)
494 ret = REG_EPAREN;
495
496 /* We have already checked preg->fastmap != NULL. */
497 if (BE (ret == REG_NOERROR, 1))
498 /* Compute the fastmap now, since regexec cannot modify the pattern
499 buffer. This function never fails in this implementation. */
500 (void) re_compile_fastmap (preg);
501 else
502 {
503 /* Some error occurred while compiling the expression. */
504 re_free (preg->fastmap);
505 preg->fastmap = NULL;
506 }
507
508 return (int) ret;
509}
510#ifdef _LIBC
511weak_alias (__regcomp, regcomp)
512#endif
513
514/* Returns a message corresponding to an error code, ERRCODE, returned
515 from either regcomp or regexec. We don't use PREG here. */
516
517#ifdef _LIBC
518size_t
519regerror (errcode, preg, errbuf, errbuf_size)
520 int errcode;
521 const regex_t *__restrict preg;
522 char *__restrict errbuf;
523 size_t errbuf_size;
524#else /* size_t might promote */
525size_t
526regerror (int errcode, const regex_t *__restrict preg,
527 char *__restrict errbuf, size_t errbuf_size)
528#endif
529{
530 const char *msg;
531 size_t msg_size;
532
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
541
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543
544 msg_size = strlen (msg) + 1; /* Includes the null. */
545
546 if (BE (errbuf_size != 0, 1))
547 {
548 if (BE (msg_size > errbuf_size, 0))
549 {
550#if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
552#else
553 memcpy (errbuf, msg, errbuf_size - 1);
554 errbuf[errbuf_size - 1] = 0;
555#endif
556 }
557 else
558 memcpy (errbuf, msg, msg_size);
559 }
560
561 return msg_size;
562}
563#ifdef _LIBC
564weak_alias (__regerror, regerror)
565#endif
566
567
568#ifdef RE_ENABLE_I18N
569/* This static array is used for the map to single-byte characters when
570 UTF-8 is used. Otherwise we would allocate memory just to initialize
571 it the same all the time. UTF-8 is the preferred encoding so this is
572 a worthwhile optimization. */
573static const bitset_t utf8_sb_map =
574{
575 /* Set the first 128 bits. */
576# if 4 * BITSET_WORD_BITS < ASCII_CHARS
577# error "bitset_word_t is narrower than 32 bits"
578# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
579 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
580# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
581 BITSET_WORD_MAX, BITSET_WORD_MAX,
582# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
583 BITSET_WORD_MAX,
584# endif
585 (BITSET_WORD_MAX
586 >> (SBC_MAX % BITSET_WORD_BITS == 0
587 ? 0
588 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
589};
590#endif
591
592
593static void
594free_dfa_content (re_dfa_t *dfa)
595{
596 Idx i, j;
597
598 if (dfa->nodes)
599 for (i = 0; i < dfa->nodes_len; ++i)
600 free_token (dfa->nodes + i);
601 re_free (dfa->nexts);
602 for (i = 0; i < dfa->nodes_len; ++i)
603 {
604 if (dfa->eclosures != NULL)
605 re_node_set_free (dfa->eclosures + i);
606 if (dfa->inveclosures != NULL)
607 re_node_set_free (dfa->inveclosures + i);
608 if (dfa->edests != NULL)
609 re_node_set_free (dfa->edests + i);
610 }
611 re_free (dfa->edests);
612 re_free (dfa->eclosures);
613 re_free (dfa->inveclosures);
614 re_free (dfa->nodes);
615
616 if (dfa->state_table)
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 {
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
621 {
622 re_dfastate_t *state = entry->array[j];
623 free_state (state);
624 }
625 re_free (entry->array);
626 }
627 re_free (dfa->state_table);
628#ifdef RE_ENABLE_I18N
629 if (dfa->sb_char != utf8_sb_map)
630 re_free (dfa->sb_char);
631#endif
632 re_free (dfa->subexp_map);
633#ifdef DEBUG
634 re_free (dfa->re_str);
635#endif
636
637 re_free (dfa);
638}
639
640
641/* Free dynamically allocated space used by PREG. */
642
643void
644regfree (preg)
645 regex_t *preg;
646{
647 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
648 if (BE (dfa != NULL, 1))
649 free_dfa_content (dfa);
650 preg->buffer = NULL;
651 preg->allocated = 0;
652
653 re_free (preg->fastmap);
654 preg->fastmap = NULL;
655
656 re_free (preg->translate);
657 preg->translate = NULL;
658}
659#ifdef _LIBC
660weak_alias (__regfree, regfree)
661#endif
662
663
664/* Entry points compatible with 4.2 BSD regex library. We don't define
665 them unless specifically requested. */
666
667#if defined _REGEX_RE_COMP || defined _LIBC
668
669/* BSD has one and only one pattern buffer. */
670static struct re_pattern_buffer re_comp_buf;
671
672char *
673# ifdef _LIBC
674/* Make these definitions weak in libc, so POSIX programs can redefine
675 these names if they don't use our functions, and still use
676 regcomp/regexec above without link errors. */
677weak_function
678# endif
679re_comp (s)
680 const char *s;
681{
682 reg_errcode_t ret;
683 char *fastmap;
684
685 if (!s)
686 {
687 if (!re_comp_buf.buffer)
688 return gettext ("No previous regular expression");
689 return 0;
690 }
691
692 if (re_comp_buf.buffer)
693 {
694 fastmap = re_comp_buf.fastmap;
695 re_comp_buf.fastmap = NULL;
696 __regfree (&re_comp_buf);
697 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
698 re_comp_buf.fastmap = fastmap;
699 }
700
701 if (re_comp_buf.fastmap == NULL)
702 {
703 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
704 if (re_comp_buf.fastmap == NULL)
705 return (char *) gettext (__re_error_msgid
706 + __re_error_msgid_idx[(int) REG_ESPACE]);
707 }
708
709 /* Since `re_exec' always passes NULL for the `regs' argument, we
710 don't need to initialize the pattern buffer fields which affect it. */
711
712 /* Match anchors at newlines. */
713 re_comp_buf.newline_anchor = 1;
714
715 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
716
717 if (!ret)
718 return NULL;
719
720 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
721 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
722}
723
724#ifdef _LIBC
725libc_freeres_fn (free_mem)
726{
727 __regfree (&re_comp_buf);
728}
729#endif
730
731#endif /* _REGEX_RE_COMP */
732
733
734/* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicate regular expression's syntax. */
737
738static reg_errcode_t
739re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 reg_syntax_t syntax)
741{
742 reg_errcode_t err = REG_NOERROR;
743 re_dfa_t *dfa;
744 re_string_t regexp;
745
746 /* Initialize the pattern buffer. */
747 preg->fastmap_accurate = 0;
748 preg->syntax = syntax;
749 preg->not_bol = preg->not_eol = 0;
750 preg->used = 0;
751 preg->re_nsub = 0;
752 preg->can_be_null = 0;
753 preg->regs_allocated = REGS_UNALLOCATED;
754
755 /* Initialize the dfa. */
756 dfa = (re_dfa_t *) preg->buffer;
757 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
758 {
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764 if (dfa == NULL)
765 return REG_ESPACE;
766 preg->allocated = sizeof (re_dfa_t);
767 preg->buffer = (unsigned char *) dfa;
768 }
769 preg->used = sizeof (re_dfa_t);
770
771 err = init_dfa (dfa, length);
772 if (BE (err != REG_NOERROR, 0))
773 {
774 free_dfa_content (dfa);
775 preg->buffer = NULL;
776 preg->allocated = 0;
777 return err;
778 }
779#ifdef DEBUG
780 /* Note: length+1 will not overflow since it is checked in init_dfa. */
781 dfa->re_str = re_malloc (char, length + 1);
782 strncpy (dfa->re_str, pattern, length + 1);
783#endif
784
785 __libc_lock_init (dfa->lock);
786
787 err = re_string_construct (&regexp, pattern, length, preg->translate,
788 syntax & RE_ICASE, dfa);
789 if (BE (err != REG_NOERROR, 0))
790 {
791 re_compile_internal_free_return:
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
794 free_dfa_content (dfa);
795 preg->buffer = NULL;
796 preg->allocated = 0;
797 return err;
798 }
799
800 /* Parse the regular expression, and build a structure tree. */
801 preg->re_nsub = 0;
802 dfa->str_tree = parse (&regexp, preg, syntax, &err);
803 if (BE (dfa->str_tree == NULL, 0))
804 goto re_compile_internal_free_return;
805
806 /* Analyze the tree and create the nfa. */
807 err = analyze (preg);
808 if (BE (err != REG_NOERROR, 0))
809 goto re_compile_internal_free_return;
810
811#ifdef RE_ENABLE_I18N
812 /* If possible, do searching in single byte encoding to speed things up. */
813 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
814 optimize_utf8 (dfa);
815#endif
816
817 /* Then create the initial state of the dfa. */
818 err = create_initial_state (dfa);
819
820 /* Release work areas. */
821 free_workarea_compile (preg);
822 re_string_destruct (&regexp);
823
824 if (BE (err != REG_NOERROR, 0))
825 {
826 free_dfa_content (dfa);
827 preg->buffer = NULL;
828 preg->allocated = 0;
829 }
830
831 return err;
832}
833
834/* Initialize DFA. We use the length of the regular expression PAT_LEN
835 as the initial length of some arrays. */
836
837static reg_errcode_t
838init_dfa (re_dfa_t *dfa, size_t pat_len)
839{
840 __re_size_t table_size;
841#ifndef _LIBC
842 char *codeset_name;
843#endif
844#ifdef RE_ENABLE_I18N
845 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
846#else
847 size_t max_i18n_object_size = 0;
848#endif
849 size_t max_object_size =
850 MAX (sizeof (struct re_state_table_entry),
851 MAX (sizeof (re_token_t),
852 MAX (sizeof (re_node_set),
853 MAX (sizeof (regmatch_t),
854 max_i18n_object_size))));
855
856 memset (dfa, '\0', sizeof (re_dfa_t));
857
858 /* Force allocation of str_tree_storage the first time. */
859 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
860
861 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
862 calculation below, and for similar doubling calculations
863 elsewhere. And it's <= rather than <, because some of the
864 doubling calculations add 1 afterwards. */
865 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
866 return REG_ESPACE;
867
868 dfa->nodes_alloc = pat_len + 1;
869 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
870
871 /* table_size = 2 ^ ceil(log pat_len) */
872 for (table_size = 1; ; table_size <<= 1)
873 if (table_size > pat_len)
874 break;
875
876 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
877 dfa->state_hash_mask = table_size - 1;
878
879 dfa->mb_cur_max = MB_CUR_MAX;
880#ifdef _LIBC
881 if (dfa->mb_cur_max == 6
882 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
883 dfa->is_utf8 = 1;
884 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
885 != 0);
886#else
887# ifdef HAVE_LANGINFO_CODESET
888 codeset_name = nl_langinfo (CODESET);
889# else
890 codeset_name = getenv ("LC_ALL");
891 if (codeset_name == NULL || codeset_name[0] == '\0')
892 codeset_name = getenv ("LC_CTYPE");
893 if (codeset_name == NULL || codeset_name[0] == '\0')
894 codeset_name = getenv ("LANG");
895 if (codeset_name == NULL)
896 codeset_name = "";
897 else if (strchr (codeset_name, '.') != NULL)
898 codeset_name = strchr (codeset_name, '.') + 1;
899# endif
900
901 if (strcasecmp (codeset_name, "UTF-8") == 0
902 || strcasecmp (codeset_name, "UTF8") == 0)
903 dfa->is_utf8 = 1;
904
905 /* We check exhaustively in the loop below if this charset is a
906 superset of ASCII. */
907 dfa->map_notascii = 0;
908#endif
909
910#ifdef RE_ENABLE_I18N
911 if (dfa->mb_cur_max > 1)
912 {
913 if (dfa->is_utf8)
914 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
915 else
916 {
917 int i, j, ch;
918
919 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
920 if (BE (dfa->sb_char == NULL, 0))
921 return REG_ESPACE;
922
923 /* Set the bits corresponding to single byte chars. */
924 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
925 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
926 {
927 wint_t wch = __btowc (ch);
928 if (wch != WEOF)
929 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
930# ifndef _LIBC
931 if (isascii (ch) && wch != ch)
932 dfa->map_notascii = 1;
933# endif
934 }
935 }
936 }
937#endif
938
939 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
940 return REG_ESPACE;
941 return REG_NOERROR;
942}
943
944/* Initialize WORD_CHAR table, which indicate which character is
945 "word". In this case "word" means that it is the word construction
946 character used by some operators like "\<", "\>", etc. */
947
948static void
949internal_function
950init_word_char (re_dfa_t *dfa)
951{
952 int i, j, ch;
953 dfa->word_ops_used = 1;
954 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
955 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
956 if (isalnum (ch) || ch == '_')
957 dfa->word_char[i] |= (bitset_word_t) 1 << j;
958}
959
960/* Free the work area which are only used while compiling. */
961
962static void
963free_workarea_compile (regex_t *preg)
964{
965 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
966 bin_tree_storage_t *storage, *next;
967 for (storage = dfa->str_tree_storage; storage; storage = next)
968 {
969 next = storage->next;
970 re_free (storage);
971 }
972 dfa->str_tree_storage = NULL;
973 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
974 dfa->str_tree = NULL;
975 re_free (dfa->org_indices);
976 dfa->org_indices = NULL;
977}
978
979/* Create initial states for all contexts. */
980
981static reg_errcode_t
982create_initial_state (re_dfa_t *dfa)
983{
984 Idx first, i;
985 reg_errcode_t err;
986 re_node_set init_nodes;
987
988 /* Initial states have the epsilon closure of the node which is
989 the first node of the regular expression. */
990 first = dfa->str_tree->first->node_idx;
991 dfa->init_node = first;
992 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
993 if (BE (err != REG_NOERROR, 0))
994 return err;
995
996 /* The back-references which are in initial states can epsilon transit,
997 since in this case all of the subexpressions can be null.
998 Then we add epsilon closures of the nodes which are the next nodes of
999 the back-references. */
1000 if (dfa->nbackref > 0)
1001 for (i = 0; i < init_nodes.nelem; ++i)
1002 {
1003 Idx node_idx = init_nodes.elems[i];
1004 re_token_type_t type = dfa->nodes[node_idx].type;
1005
1006 Idx clexp_idx;
1007 if (type != OP_BACK_REF)
1008 continue;
1009 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1010 {
1011 re_token_t *clexp_node;
1012 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1013 if (clexp_node->type == OP_CLOSE_SUBEXP
1014 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1015 break;
1016 }
1017 if (clexp_idx == init_nodes.nelem)
1018 continue;
1019
1020 if (type == OP_BACK_REF)
1021 {
1022 Idx dest_idx = dfa->edests[node_idx].elems[0];
1023 if (!re_node_set_contains (&init_nodes, dest_idx))
1024 {
1025 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1026 i = 0;
1027 }
1028 }
1029 }
1030
1031 /* It must be the first time to invoke acquire_state. */
1032 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1033 /* We don't check ERR here, since the initial state must not be NULL. */
1034 if (BE (dfa->init_state == NULL, 0))
1035 return err;
1036 if (dfa->init_state->has_constraint)
1037 {
1038 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1039 CONTEXT_WORD);
1040 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1041 CONTEXT_NEWLINE);
1042 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1043 &init_nodes,
1044 CONTEXT_NEWLINE
1045 | CONTEXT_BEGBUF);
1046 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1047 || dfa->init_state_begbuf == NULL, 0))
1048 return err;
1049 }
1050 else
1051 dfa->init_state_word = dfa->init_state_nl
1052 = dfa->init_state_begbuf = dfa->init_state;
1053
1054 re_node_set_free (&init_nodes);
1055 return REG_NOERROR;
1056}
1057
1058
1059#ifdef RE_ENABLE_I18N
1060/* If it is possible to do searching in single byte encoding instead of UTF-8
1061 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1062 DFA nodes where needed. */
1063
1064static void
1065optimize_utf8 (re_dfa_t *dfa)
1066{
1067 Idx node;
1068 int i;
1069 bool mb_chars = false;
1070 bool has_period = false;
1071
1072 for (node = 0; node < dfa->nodes_len; ++node)
1073 switch (dfa->nodes[node].type)
1074 {
1075 case CHARACTER:
1076 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1077 mb_chars = true;
1078 break;
1079 case ANCHOR:
1080 switch (dfa->nodes[node].opr.idx)
1081 {
1082 case LINE_FIRST:
1083 case LINE_LAST:
1084 case BUF_FIRST:
1085 case BUF_LAST:
1086 break;
1087 default:
1088 /* Word anchors etc. cannot be handled. */
1089 return;
1090 }
1091 break;
1092 case OP_PERIOD:
1093 has_period = true;
1094 break;
1095 case OP_BACK_REF:
1096 case OP_ALT:
1097 case END_OF_RE:
1098 case OP_DUP_ASTERISK:
1099 case OP_OPEN_SUBEXP:
1100 case OP_CLOSE_SUBEXP:
1101 break;
1102 case COMPLEX_BRACKET:
1103 return;
1104 case SIMPLE_BRACKET:
1105 /* Just double check. */
1106 {
1107 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1108 ? 0
1109 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1110 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1111 {
1112 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1113 return;
1114 rshift = 0;
1115 }
1116 }
1117 break;
1118 default:
1119 abort ();
1120 }
1121
1122 if (mb_chars || has_period)
1123 for (node = 0; node < dfa->nodes_len; ++node)
1124 {
1125 if (dfa->nodes[node].type == CHARACTER
1126 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1127 dfa->nodes[node].mb_partial = 0;
1128 else if (dfa->nodes[node].type == OP_PERIOD)
1129 dfa->nodes[node].type = OP_UTF8_PERIOD;
1130 }
1131
1132 /* The search can be in single byte locale. */
1133 dfa->mb_cur_max = 1;
1134 dfa->is_utf8 = 0;
1135 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1136}
1137#endif
1138
1139
1140/* Analyze the structure tree, and calculate "first", "next", "edest",
1141 "eclosure", and "inveclosure". */
1142
1143static reg_errcode_t
1144analyze (regex_t *preg)
1145{
1146 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1147 reg_errcode_t ret;
1148
1149 /* Allocate arrays. */
1150 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1151 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1152 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1153 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1154 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1155 || dfa->eclosures == NULL, 0))
1156 return REG_ESPACE;
1157
1158 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1159 if (dfa->subexp_map != NULL)
1160 {
1161 Idx i;
1162 for (i = 0; i < preg->re_nsub; i++)
1163 dfa->subexp_map[i] = i;
1164 preorder (dfa->str_tree, optimize_subexps, dfa);
1165 for (i = 0; i < preg->re_nsub; i++)
1166 if (dfa->subexp_map[i] != i)
1167 break;
1168 if (i == preg->re_nsub)
1169 {
1170 free (dfa->subexp_map);
1171 dfa->subexp_map = NULL;
1172 }
1173 }
1174
1175 ret = postorder (dfa->str_tree, lower_subexps, preg);
1176 if (BE (ret != REG_NOERROR, 0))
1177 return ret;
1178 ret = postorder (dfa->str_tree, calc_first, dfa);
1179 if (BE (ret != REG_NOERROR, 0))
1180 return ret;
1181 preorder (dfa->str_tree, calc_next, dfa);
1182 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1183 if (BE (ret != REG_NOERROR, 0))
1184 return ret;
1185 ret = calc_eclosure (dfa);
1186 if (BE (ret != REG_NOERROR, 0))
1187 return ret;
1188
1189 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1190 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1191 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1192 || dfa->nbackref)
1193 {
1194 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1195 if (BE (dfa->inveclosures == NULL, 0))
1196 return REG_ESPACE;
1197 ret = calc_inveclosure (dfa);
1198 }
1199
1200 return ret;
1201}
1202
1203/* Our parse trees are very unbalanced, so we cannot use a stack to
1204 implement parse tree visits. Instead, we use parent pointers and
1205 some hairy code in these two functions. */
1206static reg_errcode_t
1207postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208 void *extra)
1209{
1210 bin_tree_t *node, *prev;
1211
1212 for (node = root; ; )
1213 {
1214 /* Descend down the tree, preferably to the left (or to the right
1215 if that's the only child). */
1216 while (node->left || node->right)
1217 if (node->left)
1218 node = node->left;
1219 else
1220 node = node->right;
1221
1222 do
1223 {
1224 reg_errcode_t err = fn (extra, node);
1225 if (BE (err != REG_NOERROR, 0))
1226 return err;
1227 if (node->parent == NULL)
1228 return REG_NOERROR;
1229 prev = node;
1230 node = node->parent;
1231 }
1232 /* Go up while we have a node that is reached from the right. */
1233 while (node->right == prev || node->right == NULL);
1234 node = node->right;
1235 }
1236}
1237
1238static reg_errcode_t
1239preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1240 void *extra)
1241{
1242 bin_tree_t *node;
1243
1244 for (node = root; ; )
1245 {
1246 reg_errcode_t err = fn (extra, node);
1247 if (BE (err != REG_NOERROR, 0))
1248 return err;
1249
1250 /* Go to the left node, or up and to the right. */
1251 if (node->left)
1252 node = node->left;
1253 else
1254 {
1255 bin_tree_t *prev = NULL;
1256 while (node->right == prev || node->right == NULL)
1257 {
1258 prev = node;
1259 node = node->parent;
1260 if (!node)
1261 return REG_NOERROR;
1262 }
1263 node = node->right;
1264 }
1265 }
1266}
1267
1268/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1269 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1270 backreferences as well. Requires a preorder visit. */
1271static reg_errcode_t
1272optimize_subexps (void *extra, bin_tree_t *node)
1273{
1274 re_dfa_t *dfa = (re_dfa_t *) extra;
1275
1276 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1277 {
1278 int idx = node->token.opr.idx;
1279 node->token.opr.idx = dfa->subexp_map[idx];
1280 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1281 }
1282
1283 else if (node->token.type == SUBEXP
1284 && node->left && node->left->token.type == SUBEXP)
1285 {
1286 Idx other_idx = node->left->token.opr.idx;
1287
1288 node->left = node->left->left;
1289 if (node->left)
1290 node->left->parent = node;
1291
1292 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1293 if (other_idx < BITSET_WORD_BITS)
1294 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1295 }
1296
1297 return REG_NOERROR;
1298}
1299
1300/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1301 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1302static reg_errcode_t
1303lower_subexps (void *extra, bin_tree_t *node)
1304{
1305 regex_t *preg = (regex_t *) extra;
1306 reg_errcode_t err = REG_NOERROR;
1307
1308 if (node->left && node->left->token.type == SUBEXP)
1309 {
1310 node->left = lower_subexp (&err, preg, node->left);
1311 if (node->left)
1312 node->left->parent = node;
1313 }
1314 if (node->right && node->right->token.type == SUBEXP)
1315 {
1316 node->right = lower_subexp (&err, preg, node->right);
1317 if (node->right)
1318 node->right->parent = node;
1319 }
1320
1321 return err;
1322}
1323
1324static bin_tree_t *
1325lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1326{
1327 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1328 bin_tree_t *body = node->left;
1329 bin_tree_t *op, *cls, *tree1, *tree;
1330
1331 if (preg->no_sub
1332 /* We do not optimize empty subexpressions, because otherwise we may
1333 have bad CONCAT nodes with NULL children. This is obviously not
1334 very common, so we do not lose much. An example that triggers
1335 this case is the sed "script" /\(\)/x. */
1336 && node->left != NULL
1337 && (node->token.opr.idx >= BITSET_WORD_BITS
1338 || !(dfa->used_bkref_map
1339 & ((bitset_word_t) 1 << node->token.opr.idx))))
1340 return node->left;
1341
1342 /* Convert the SUBEXP node to the concatenation of an
1343 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1344 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1345 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1346 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1347 tree = create_tree (dfa, op, tree1, CONCAT);
1348 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1349 {
1350 *err = REG_ESPACE;
1351 return NULL;
1352 }
1353
1354 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1355 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1356 return tree;
1357}
1358
1359/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1360 nodes. Requires a postorder visit. */
1361static reg_errcode_t
1362calc_first (void *extra, bin_tree_t *node)
1363{
1364 re_dfa_t *dfa = (re_dfa_t *) extra;
1365 if (node->token.type == CONCAT)
1366 {
1367 node->first = node->left->first;
1368 node->node_idx = node->left->node_idx;
1369 }
1370 else
1371 {
1372 node->first = node;
1373 node->node_idx = re_dfa_add_node (dfa, node->token);
1374 if (BE (node->node_idx == REG_MISSING, 0))
1375 return REG_ESPACE;
1376 }
1377 return REG_NOERROR;
1378}
1379
1380/* Pass 2: compute NEXT on the tree. Preorder visit. */
1381static reg_errcode_t
1382calc_next (void *extra, bin_tree_t *node)
1383{
1384 switch (node->token.type)
1385 {
1386 case OP_DUP_ASTERISK:
1387 node->left->next = node;
1388 break;
1389 case CONCAT:
1390 node->left->next = node->right->first;
1391 node->right->next = node->next;
1392 break;
1393 default:
1394 if (node->left)
1395 node->left->next = node->next;
1396 if (node->right)
1397 node->right->next = node->next;
1398 break;
1399 }
1400 return REG_NOERROR;
1401}
1402
1403/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404static reg_errcode_t
1405link_nfa_nodes (void *extra, bin_tree_t *node)
1406{
1407 re_dfa_t *dfa = (re_dfa_t *) extra;
1408 Idx idx = node->node_idx;
1409 reg_errcode_t err = REG_NOERROR;
1410
1411 switch (node->token.type)
1412 {
1413 case CONCAT:
1414 break;
1415
1416 case END_OF_RE:
1417 assert (node->next == NULL);
1418 break;
1419
1420 case OP_DUP_ASTERISK:
1421 case OP_ALT:
1422 {
1423 Idx left, right;
1424 dfa->has_plural_match = 1;
1425 if (node->left != NULL)
1426 left = node->left->first->node_idx;
1427 else
1428 left = node->next->node_idx;
1429 if (node->right != NULL)
1430 right = node->right->first->node_idx;
1431 else
1432 right = node->next->node_idx;
1433 assert (REG_VALID_INDEX (left));
1434 assert (REG_VALID_INDEX (right));
1435 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436 }
1437 break;
1438
1439 case ANCHOR:
1440 case OP_OPEN_SUBEXP:
1441 case OP_CLOSE_SUBEXP:
1442 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443 break;
1444
1445 case OP_BACK_REF:
1446 dfa->nexts[idx] = node->next->node_idx;
1447 if (node->token.type == OP_BACK_REF)
1448 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1449 break;
1450
1451 default:
1452 assert (!IS_EPSILON_NODE (node->token.type));
1453 dfa->nexts[idx] = node->next->node_idx;
1454 break;
1455 }
1456
1457 return err;
1458}
1459
1460/* Duplicate the epsilon closure of the node ROOT_NODE.
1461 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462 to their own constraint. */
1463
1464static reg_errcode_t
1465internal_function
1466duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1467 Idx root_node, unsigned int init_constraint)
1468{
1469 Idx org_node, clone_node;
1470 bool ok;
1471 unsigned int constraint = init_constraint;
1472 for (org_node = top_org_node, clone_node = top_clone_node;;)
1473 {
1474 Idx org_dest, clone_dest;
1475 if (dfa->nodes[org_node].type == OP_BACK_REF)
1476 {
1477 /* If the back reference epsilon-transit, its destination must
1478 also have the constraint. Then duplicate the epsilon closure
1479 of the destination of the back reference, and store it in
1480 edests of the back reference. */
1481 org_dest = dfa->nexts[org_node];
1482 re_node_set_empty (dfa->edests + clone_node);
1483 clone_dest = duplicate_node (dfa, org_dest, constraint);
1484 if (BE (clone_dest == REG_MISSING, 0))
1485 return REG_ESPACE;
1486 dfa->nexts[clone_node] = dfa->nexts[org_node];
1487 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488 if (BE (! ok, 0))
1489 return REG_ESPACE;
1490 }
1491 else if (dfa->edests[org_node].nelem == 0)
1492 {
1493 /* In case of the node can't epsilon-transit, don't duplicate the
1494 destination and store the original destination as the
1495 destination of the node. */
1496 dfa->nexts[clone_node] = dfa->nexts[org_node];
1497 break;
1498 }
1499 else if (dfa->edests[org_node].nelem == 1)
1500 {
1501 /* In case of the node can epsilon-transit, and it has only one
1502 destination. */
1503 org_dest = dfa->edests[org_node].elems[0];
1504 re_node_set_empty (dfa->edests + clone_node);
1505 if (dfa->nodes[org_node].type == ANCHOR)
1506 {
1507 /* In case of the node has another constraint, append it. */
1508 if (org_node == root_node && clone_node != org_node)
1509 {
1510 /* ...but if the node is root_node itself, it means the
1511 epsilon closure have a loop, then tie it to the
1512 destination of the root_node. */
1513 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1514 if (BE (! ok, 0))
1515 return REG_ESPACE;
1516 break;
1517 }
1518 constraint |= dfa->nodes[org_node].opr.ctx_type;
1519 }
1520 clone_dest = duplicate_node (dfa, org_dest, constraint);
1521 if (BE (clone_dest == REG_MISSING, 0))
1522 return REG_ESPACE;
1523 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524 if (BE (! ok, 0))
1525 return REG_ESPACE;
1526 }
1527 else /* dfa->edests[org_node].nelem == 2 */
1528 {
1529 /* In case of the node can epsilon-transit, and it has two
1530 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531 org_dest = dfa->edests[org_node].elems[0];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 /* Search for a duplicated node which satisfies the constraint. */
1534 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535 if (clone_dest == REG_MISSING)
1536 {
1537 /* There are no such a duplicated node, create a new one. */
1538 reg_errcode_t err;
1539 clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 if (BE (clone_dest == REG_MISSING, 0))
1541 return REG_ESPACE;
1542 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543 if (BE (! ok, 0))
1544 return REG_ESPACE;
1545 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1546 root_node, constraint);
1547 if (BE (err != REG_NOERROR, 0))
1548 return err;
1549 }
1550 else
1551 {
1552 /* There are a duplicated node which satisfy the constraint,
1553 use it to avoid infinite loop. */
1554 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555 if (BE (! ok, 0))
1556 return REG_ESPACE;
1557 }
1558
1559 org_dest = dfa->edests[org_node].elems[1];
1560 clone_dest = duplicate_node (dfa, org_dest, constraint);
1561 if (BE (clone_dest == REG_MISSING, 0))
1562 return REG_ESPACE;
1563 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1564 if (BE (! ok, 0))
1565 return REG_ESPACE;
1566 }
1567 org_node = org_dest;
1568 clone_node = clone_dest;
1569 }
1570 return REG_NOERROR;
1571}
1572
1573/* Search for a node which is duplicated from the node ORG_NODE, and
1574 satisfies the constraint CONSTRAINT. */
1575
1576static Idx
1577search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1578 unsigned int constraint)
1579{
1580 Idx idx;
1581 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1582 {
1583 if (org_node == dfa->org_indices[idx]
1584 && constraint == dfa->nodes[idx].constraint)
1585 return idx; /* Found. */
1586 }
1587 return REG_MISSING; /* Not found. */
1588}
1589
1590/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1591 Return the index of the new node, or REG_MISSING if insufficient storage is
1592 available. */
1593
1594static Idx
1595duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1596{
1597 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1598 if (BE (dup_idx != REG_MISSING, 1))
1599 {
1600 dfa->nodes[dup_idx].constraint = constraint;
1601 if (dfa->nodes[org_idx].type == ANCHOR)
1602 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1603 dfa->nodes[dup_idx].duplicated = 1;
1604
1605 /* Store the index of the original node. */
1606 dfa->org_indices[dup_idx] = org_idx;
1607 }
1608 return dup_idx;
1609}
1610
1611static reg_errcode_t
1612calc_inveclosure (re_dfa_t *dfa)
1613{
1614 Idx src, idx;
1615 bool ok;
1616 for (idx = 0; idx < dfa->nodes_len; ++idx)
1617 re_node_set_init_empty (dfa->inveclosures + idx);
1618
1619 for (src = 0; src < dfa->nodes_len; ++src)
1620 {
1621 Idx *elems = dfa->eclosures[src].elems;
1622 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623 {
1624 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1625 if (BE (! ok, 0))
1626 return REG_ESPACE;
1627 }
1628 }
1629
1630 return REG_NOERROR;
1631}
1632
1633/* Calculate "eclosure" for all the node in DFA. */
1634
1635static reg_errcode_t
1636calc_eclosure (re_dfa_t *dfa)
1637{
1638 Idx node_idx;
1639 bool incomplete;
1640#ifdef DEBUG
1641 assert (dfa->nodes_len > 0);
1642#endif
1643 incomplete = false;
1644 /* For each nodes, calculate epsilon closure. */
1645 for (node_idx = 0; ; ++node_idx)
1646 {
1647 reg_errcode_t err;
1648 re_node_set eclosure_elem;
1649 if (node_idx == dfa->nodes_len)
1650 {
1651 if (!incomplete)
1652 break;
1653 incomplete = false;
1654 node_idx = 0;
1655 }
1656
1657#ifdef DEBUG
1658 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1659#endif
1660
1661 /* If we have already calculated, skip it. */
1662 if (dfa->eclosures[node_idx].nelem != 0)
1663 continue;
1664 /* Calculate epsilon closure of `node_idx'. */
1665 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1666 if (BE (err != REG_NOERROR, 0))
1667 return err;
1668
1669 if (dfa->eclosures[node_idx].nelem == 0)
1670 {
1671 incomplete = true;
1672 re_node_set_free (&eclosure_elem);
1673 }
1674 }
1675 return REG_NOERROR;
1676}
1677
1678/* Calculate epsilon closure of NODE. */
1679
1680static reg_errcode_t
1681calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1682{
1683 reg_errcode_t err;
1684 unsigned int constraint;
1685 Idx i;
1686 bool incomplete;
1687 bool ok;
1688 re_node_set eclosure;
1689 incomplete = false;
1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1693
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa->eclosures[node].nelem = REG_MISSING;
1697
1698 constraint = ((dfa->nodes[node].type == ANCHOR)
1699 ? dfa->nodes[node].opr.ctx_type : 0);
1700 /* If the current node has constraints, duplicate all nodes.
1701 Since they must inherit the constraints. */
1702 if (constraint
1703 && dfa->edests[node].nelem
1704 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1705 {
1706 err = duplicate_node_closure (dfa, node, node, node, constraint);
1707 if (BE (err != REG_NOERROR, 0))
1708 return err;
1709 }
1710
1711 /* Expand each epsilon destination nodes. */
1712 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713 for (i = 0; i < dfa->edests[node].nelem; ++i)
1714 {
1715 re_node_set eclosure_elem;
1716 Idx edest = dfa->edests[node].elems[i];
1717 /* If calculating the epsilon closure of `edest' is in progress,
1718 return intermediate result. */
1719 if (dfa->eclosures[edest].nelem == REG_MISSING)
1720 {
1721 incomplete = true;
1722 continue;
1723 }
1724 /* If we haven't calculated the epsilon closure of `edest' yet,
1725 calculate now. Otherwise use calculated epsilon closure. */
1726 if (dfa->eclosures[edest].nelem == 0)
1727 {
1728 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1731 }
1732 else
1733 eclosure_elem = dfa->eclosures[edest];
1734 /* Merge the epsilon closure of `edest'. */
1735 re_node_set_merge (&eclosure, &eclosure_elem);
1736 /* If the epsilon closure of `edest' is incomplete,
1737 the epsilon closure of this node is also incomplete. */
1738 if (dfa->eclosures[edest].nelem == 0)
1739 {
1740 incomplete = true;
1741 re_node_set_free (&eclosure_elem);
1742 }
1743 }
1744
1745 /* Epsilon closures include itself. */
1746 ok = re_node_set_insert (&eclosure, node);
1747 if (BE (! ok, 0))
1748 return REG_ESPACE;
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1751 else
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1754 return REG_NOERROR;
1755}
1756
1757
1758/* Functions for token which are used in the parser. */
1759
1760/* Fetch a token from INPUT.
1761 We must not use this function inside bracket expressions. */
1762
1763static void
1764internal_function
1765fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1766{
1767 re_string_skip_bytes (input, peek_token (result, input, syntax));
1768}
1769
1770/* Peek a token from INPUT, and return the length of the token.
1771 We must not use this function inside bracket expressions. */
1772
1773static int
1774internal_function
1775peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1776{
1777 unsigned char c;
1778
1779 if (re_string_eoi (input))
1780 {
1781 token->type = END_OF_RE;
1782 return 0;
1783 }
1784
1785 c = re_string_peek_byte (input, 0);
1786 token->opr.c = c;
1787
1788 token->word_char = 0;
1789#ifdef RE_ENABLE_I18N
1790 token->mb_partial = 0;
1791 if (input->mb_cur_max > 1 &&
1792 !re_string_first_byte (input, re_string_cur_idx (input)))
1793 {
1794 token->type = CHARACTER;
1795 token->mb_partial = 1;
1796 return 1;
1797 }
1798#endif
1799 if (c == '\\')
1800 {
1801 unsigned char c2;
1802 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1803 {
1804 token->type = BACK_SLASH;
1805 return 1;
1806 }
1807
1808 c2 = re_string_peek_byte_case (input, 1);
1809 token->opr.c = c2;
1810 token->type = CHARACTER;
1811#ifdef RE_ENABLE_I18N
1812 if (input->mb_cur_max > 1)
1813 {
1814 wint_t wc = re_string_wchar_at (input,
1815 re_string_cur_idx (input) + 1);
1816 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1817 }
1818 else
1819#endif
1820 token->word_char = IS_WORD_CHAR (c2) != 0;
1821
1822 switch (c2)
1823 {
1824 case '|':
1825 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1826 token->type = OP_ALT;
1827 break;
1828 case '1': case '2': case '3': case '4': case '5':
1829 case '6': case '7': case '8': case '9':
1830 if (!(syntax & RE_NO_BK_REFS))
1831 {
1832 token->type = OP_BACK_REF;
1833 token->opr.idx = c2 - '1';
1834 }
1835 break;
1836 case '<':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 {
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = WORD_FIRST;
1841 }
1842 break;
1843 case '>':
1844 if (!(syntax & RE_NO_GNU_OPS))
1845 {
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_LAST;
1848 }
1849 break;
1850 case 'b':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_DELIM;
1855 }
1856 break;
1857 case 'B':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = NOT_WORD_DELIM;
1862 }
1863 break;
1864 case 'w':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_WORD;
1867 break;
1868 case 'W':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_NOTWORD;
1871 break;
1872 case 's':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_SPACE;
1875 break;
1876 case 'S':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTSPACE;
1879 break;
1880 case '`':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 {
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = BUF_FIRST;
1885 }
1886 break;
1887 case '\'':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 {
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = BUF_LAST;
1892 }
1893 break;
1894 case '(':
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_OPEN_SUBEXP;
1897 break;
1898 case ')':
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_CLOSE_SUBEXP;
1901 break;
1902 case '+':
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_PLUS;
1905 break;
1906 case '?':
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_QUESTION;
1909 break;
1910 case '{':
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_OPEN_DUP_NUM;
1913 break;
1914 case '}':
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_CLOSE_DUP_NUM;
1917 break;
1918 default:
1919 break;
1920 }
1921 return 2;
1922 }
1923
1924 token->type = CHARACTER;
1925#ifdef RE_ENABLE_I18N
1926 if (input->mb_cur_max > 1)
1927 {
1928 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1929 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1930 }
1931 else
1932#endif
1933 token->word_char = IS_WORD_CHAR (token->opr.c);
1934
1935 switch (c)
1936 {
1937 case '\n':
1938 if (syntax & RE_NEWLINE_ALT)
1939 token->type = OP_ALT;
1940 break;
1941 case '|':
1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1943 token->type = OP_ALT;
1944 break;
1945 case '*':
1946 token->type = OP_DUP_ASTERISK;
1947 break;
1948 case '+':
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950 token->type = OP_DUP_PLUS;
1951 break;
1952 case '?':
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954 token->type = OP_DUP_QUESTION;
1955 break;
1956 case '{':
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958 token->type = OP_OPEN_DUP_NUM;
1959 break;
1960 case '}':
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962 token->type = OP_CLOSE_DUP_NUM;
1963 break;
1964 case '(':
1965 if (syntax & RE_NO_BK_PARENS)
1966 token->type = OP_OPEN_SUBEXP;
1967 break;
1968 case ')':
1969 if (syntax & RE_NO_BK_PARENS)
1970 token->type = OP_CLOSE_SUBEXP;
1971 break;
1972 case '[':
1973 token->type = OP_OPEN_BRACKET;
1974 break;
1975 case '.':
1976 token->type = OP_PERIOD;
1977 break;
1978 case '^':
1979 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1980 re_string_cur_idx (input) != 0)
1981 {
1982 char prev = re_string_peek_byte (input, -1);
1983 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1984 break;
1985 }
1986 token->type = ANCHOR;
1987 token->opr.ctx_type = LINE_FIRST;
1988 break;
1989 case '$':
1990 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1991 re_string_cur_idx (input) + 1 != re_string_length (input))
1992 {
1993 re_token_t next;
1994 re_string_skip_bytes (input, 1);
1995 peek_token (&next, input, syntax);
1996 re_string_skip_bytes (input, -1);
1997 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998 break;
1999 }
2000 token->type = ANCHOR;
2001 token->opr.ctx_type = LINE_LAST;
2002 break;
2003 default:
2004 break;
2005 }
2006 return 1;
2007}
2008
2009/* Peek a token from INPUT, and return the length of the token.
2010 We must not use this function out of bracket expressions. */
2011
2012static int
2013internal_function
2014peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2015{
2016 unsigned char c;
2017 if (re_string_eoi (input))
2018 {
2019 token->type = END_OF_RE;
2020 return 0;
2021 }
2022 c = re_string_peek_byte (input, 0);
2023 token->opr.c = c;
2024
2025#ifdef RE_ENABLE_I18N
2026 if (input->mb_cur_max > 1 &&
2027 !re_string_first_byte (input, re_string_cur_idx (input)))
2028 {
2029 token->type = CHARACTER;
2030 return 1;
2031 }
2032#endif /* RE_ENABLE_I18N */
2033
2034 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2035 && re_string_cur_idx (input) + 1 < re_string_length (input))
2036 {
2037 /* In this case, '\' escape a character. */
2038 unsigned char c2;
2039 re_string_skip_bytes (input, 1);
2040 c2 = re_string_peek_byte (input, 0);
2041 token->opr.c = c2;
2042 token->type = CHARACTER;
2043 return 1;
2044 }
2045 if (c == '[') /* '[' is a special char in a bracket exps. */
2046 {
2047 unsigned char c2;
2048 int token_len;
2049 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2050 c2 = re_string_peek_byte (input, 1);
2051 else
2052 c2 = 0;
2053 token->opr.c = c2;
2054 token_len = 2;
2055 switch (c2)
2056 {
2057 case '.':
2058 token->type = OP_OPEN_COLL_ELEM;
2059 break;
2060 case '=':
2061 token->type = OP_OPEN_EQUIV_CLASS;
2062 break;
2063 case ':':
2064 if (syntax & RE_CHAR_CLASSES)
2065 {
2066 token->type = OP_OPEN_CHAR_CLASS;
2067 break;
2068 }
2069 /* else fall through. */
2070 default:
2071 token->type = CHARACTER;
2072 token->opr.c = c;
2073 token_len = 1;
2074 break;
2075 }
2076 return token_len;
2077 }
2078 switch (c)
2079 {
2080 case '-':
2081 token->type = OP_CHARSET_RANGE;
2082 break;
2083 case ']':
2084 token->type = OP_CLOSE_BRACKET;
2085 break;
2086 case '^':
2087 token->type = OP_NON_MATCH_LIST;
2088 break;
2089 default:
2090 token->type = CHARACTER;
2091 }
2092 return 1;
2093}
2094
2095
2096/* Functions for parser. */
2097
2098/* Entry point of the parser.
2099 Parse the regular expression REGEXP and return the structure tree.
2100 If an error is occured, ERR is set by error code, and return NULL.
2101 This function build the following tree, from regular expression <reg_exp>:
2102 CAT
2103 / \
2104 / \
2105 <reg_exp> EOR
2106
2107 CAT means concatenation.
2108 EOR means end of regular expression. */
2109
2110static bin_tree_t *
2111parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2112 reg_errcode_t *err)
2113{
2114 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2115 bin_tree_t *tree, *eor, *root;
2116 re_token_t current_token;
2117 dfa->syntax = syntax;
2118 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2119 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2120 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2121 return NULL;
2122 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2123 if (tree != NULL)
2124 root = create_tree (dfa, tree, eor, CONCAT);
2125 else
2126 root = eor;
2127 if (BE (eor == NULL || root == NULL, 0))
2128 {
2129 *err = REG_ESPACE;
2130 return NULL;
2131 }
2132 return root;
2133}
2134
2135/* This function build the following tree, from regular expression
2136 <branch1>|<branch2>:
2137 ALT
2138 / \
2139 / \
2140 <branch1> <branch2>
2141
2142 ALT means alternative, which represents the operator `|'. */
2143
2144static bin_tree_t *
2145parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2146 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2147{
2148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2149 bin_tree_t *tree, *branch = NULL;
2150 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2151 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2152 return NULL;
2153
2154 while (token->type == OP_ALT)
2155 {
2156 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2157 if (token->type != OP_ALT && token->type != END_OF_RE
2158 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159 {
2160 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2161 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162 return NULL;
2163 }
2164 else
2165 branch = NULL;
2166 tree = create_tree (dfa, tree, branch, OP_ALT);
2167 if (BE (tree == NULL, 0))
2168 {
2169 *err = REG_ESPACE;
2170 return NULL;
2171 }
2172 }
2173 return tree;
2174}
2175
2176/* This function build the following tree, from regular expression
2177 <exp1><exp2>:
2178 CAT
2179 / \
2180 / \
2181 <exp1> <exp2>
2182
2183 CAT means concatenation. */
2184
2185static bin_tree_t *
2186parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2187 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2188{
2189 bin_tree_t *tree, *expr;
2190 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2191 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2192 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2193 return NULL;
2194
2195 while (token->type != OP_ALT && token->type != END_OF_RE
2196 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2197 {
2198 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2199 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2200 {
2201 return NULL;
2202 }
2203 if (tree != NULL && expr != NULL)
2204 {
2205 tree = create_tree (dfa, tree, expr, CONCAT);
2206 if (tree == NULL)
2207 {
2208 *err = REG_ESPACE;
2209 return NULL;
2210 }
2211 }
2212 else if (tree == NULL)
2213 tree = expr;
2214 /* Otherwise expr == NULL, we don't need to create new tree. */
2215 }
2216 return tree;
2217}
2218
2219/* This function build the following tree, from regular expression a*:
2220 *
2221 |
2222 a
2223*/
2224
2225static bin_tree_t *
2226parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228{
2229 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2230 bin_tree_t *tree;
2231 switch (token->type)
2232 {
2233 case CHARACTER:
2234 tree = create_token_tree (dfa, NULL, NULL, token);
2235 if (BE (tree == NULL, 0))
2236 {
2237 *err = REG_ESPACE;
2238 return NULL;
2239 }
2240#ifdef RE_ENABLE_I18N
2241 if (dfa->mb_cur_max > 1)
2242 {
2243 while (!re_string_eoi (regexp)
2244 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2245 {
2246 bin_tree_t *mbc_remain;
2247 fetch_token (token, regexp, syntax);
2248 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2249 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2250 if (BE (mbc_remain == NULL || tree == NULL, 0))
2251 {
2252 *err = REG_ESPACE;
2253 return NULL;
2254 }
2255 }
2256 }
2257#endif
2258 break;
2259 case OP_OPEN_SUBEXP:
2260 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2261 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 return NULL;
2263 break;
2264 case OP_OPEN_BRACKET:
2265 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2266 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 return NULL;
2268 break;
2269 case OP_BACK_REF:
2270 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2271 {
2272 *err = REG_ESUBREG;
2273 return NULL;
2274 }
2275 dfa->used_bkref_map |= 1 << token->opr.idx;
2276 tree = create_token_tree (dfa, NULL, NULL, token);
2277 if (BE (tree == NULL, 0))
2278 {
2279 *err = REG_ESPACE;
2280 return NULL;
2281 }
2282 ++dfa->nbackref;
2283 dfa->has_mb_node = 1;
2284 break;
2285 case OP_OPEN_DUP_NUM:
2286 if (syntax & RE_CONTEXT_INVALID_DUP)
2287 {
2288 *err = REG_BADRPT;
2289 return NULL;
2290 }
2291 /* FALLTHROUGH */
2292 case OP_DUP_ASTERISK:
2293 case OP_DUP_PLUS:
2294 case OP_DUP_QUESTION:
2295 if (syntax & RE_CONTEXT_INVALID_OPS)
2296 {
2297 *err = REG_BADRPT;
2298 return NULL;
2299 }
2300 else if (syntax & RE_CONTEXT_INDEP_OPS)
2301 {
2302 fetch_token (token, regexp, syntax);
2303 return parse_expression (regexp, preg, token, syntax, nest, err);
2304 }
2305 /* else fall through */
2306 case OP_CLOSE_SUBEXP:
2307 if ((token->type == OP_CLOSE_SUBEXP) &&
2308 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2309 {
2310 *err = REG_ERPAREN;
2311 return NULL;
2312 }
2313 /* else fall through */
2314 case OP_CLOSE_DUP_NUM:
2315 /* We treat it as a normal character. */
2316
2317 /* Then we can these characters as normal characters. */
2318 token->type = CHARACTER;
2319 /* mb_partial and word_char bits should be initialized already
2320 by peek_token. */
2321 tree = create_token_tree (dfa, NULL, NULL, token);
2322 if (BE (tree == NULL, 0))
2323 {
2324 *err = REG_ESPACE;
2325 return NULL;
2326 }
2327 break;
2328 case ANCHOR:
2329 if ((token->opr.ctx_type
2330 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2331 && dfa->word_ops_used == 0)
2332 init_word_char (dfa);
2333 if (token->opr.ctx_type == WORD_DELIM
2334 || token->opr.ctx_type == NOT_WORD_DELIM)
2335 {
2336 bin_tree_t *tree_first, *tree_last;
2337 if (token->opr.ctx_type == WORD_DELIM)
2338 {
2339 token->opr.ctx_type = WORD_FIRST;
2340 tree_first = create_token_tree (dfa, NULL, NULL, token);
2341 token->opr.ctx_type = WORD_LAST;
2342 }
2343 else
2344 {
2345 token->opr.ctx_type = INSIDE_WORD;
2346 tree_first = create_token_tree (dfa, NULL, NULL, token);
2347 token->opr.ctx_type = INSIDE_NOTWORD;
2348 }
2349 tree_last = create_token_tree (dfa, NULL, NULL, token);
2350 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2352 {
2353 *err = REG_ESPACE;
2354 return NULL;
2355 }
2356 }
2357 else
2358 {
2359 tree = create_token_tree (dfa, NULL, NULL, token);
2360 if (BE (tree == NULL, 0))
2361 {
2362 *err = REG_ESPACE;
2363 return NULL;
2364 }
2365 }
2366 /* We must return here, since ANCHORs can't be followed
2367 by repetition operators.
2368 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2369 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2370 fetch_token (token, regexp, syntax);
2371 return tree;
2372 case OP_PERIOD:
2373 tree = create_token_tree (dfa, NULL, NULL, token);
2374 if (BE (tree == NULL, 0))
2375 {
2376 *err = REG_ESPACE;
2377 return NULL;
2378 }
2379 if (dfa->mb_cur_max > 1)
2380 dfa->has_mb_node = 1;
2381 break;
2382 case OP_WORD:
2383 case OP_NOTWORD:
2384 tree = build_charclass_op (dfa, regexp->trans,
2385 (const unsigned char *) "alnum",
2386 (const unsigned char *) "_",
2387 token->type == OP_NOTWORD, err);
2388 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 return NULL;
2390 break;
2391 case OP_SPACE:
2392 case OP_NOTSPACE:
2393 tree = build_charclass_op (dfa, regexp->trans,
2394 (const unsigned char *) "space",
2395 (const unsigned char *) "",
2396 token->type == OP_NOTSPACE, err);
2397 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2398 return NULL;
2399 break;
2400 case OP_ALT:
2401 case END_OF_RE:
2402 return NULL;
2403 case BACK_SLASH:
2404 *err = REG_EESCAPE;
2405 return NULL;
2406 default:
2407 /* Must not happen? */
2408#ifdef DEBUG
2409 assert (0);
2410#endif
2411 return NULL;
2412 }
2413 fetch_token (token, regexp, syntax);
2414
2415 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2416 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2417 {
2418 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2419 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420 return NULL;
2421 /* In BRE consecutive duplications are not allowed. */
2422 if ((syntax & RE_CONTEXT_INVALID_DUP)
2423 && (token->type == OP_DUP_ASTERISK
2424 || token->type == OP_OPEN_DUP_NUM))
2425 {
2426 *err = REG_BADRPT;
2427 return NULL;
2428 }
2429 }
2430
2431 return tree;
2432}
2433
2434/* This function build the following tree, from regular expression
2435 (<reg_exp>):
2436 SUBEXP
2437 |
2438 <reg_exp>
2439*/
2440
2441static bin_tree_t *
2442parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2443 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2444{
2445 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446 bin_tree_t *tree;
2447 size_t cur_nsub;
2448 cur_nsub = preg->re_nsub++;
2449
2450 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2451
2452 /* The subexpression may be a null string. */
2453 if (token->type == OP_CLOSE_SUBEXP)
2454 tree = NULL;
2455 else
2456 {
2457 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2459 *err = REG_EPAREN;
2460 if (BE (*err != REG_NOERROR, 0))
2461 return NULL;
2462 }
2463
2464 if (cur_nsub <= '9' - '1')
2465 dfa->completed_bkref_map |= 1 << cur_nsub;
2466
2467 tree = create_tree (dfa, tree, NULL, SUBEXP);
2468 if (BE (tree == NULL, 0))
2469 {
2470 *err = REG_ESPACE;
2471 return NULL;
2472 }
2473 tree->token.opr.idx = cur_nsub;
2474 return tree;
2475}
2476
2477/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2478
2479static bin_tree_t *
2480parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2481 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2482{
2483 bin_tree_t *tree = NULL, *old_tree = NULL;
2484 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2485 re_token_t start_token = *token;
2486
2487 if (token->type == OP_OPEN_DUP_NUM)
2488 {
2489 end = 0;
2490 start = fetch_number (regexp, token, syntax);
2491 if (start == REG_MISSING)
2492 {
2493 if (token->type == CHARACTER && token->opr.c == ',')
2494 start = 0; /* We treat "{,m}" as "{0,m}". */
2495 else
2496 {
2497 *err = REG_BADBR; /* <re>{} is invalid. */
2498 return NULL;
2499 }
2500 }
2501 if (BE (start != REG_ERROR, 1))
2502 {
2503 /* We treat "{n}" as "{n,n}". */
2504 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2505 : ((token->type == CHARACTER && token->opr.c == ',')
2506 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2507 }
2508 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2509 {
2510 /* Invalid sequence. */
2511 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2512 {
2513 if (token->type == END_OF_RE)
2514 *err = REG_EBRACE;
2515 else
2516 *err = REG_BADBR;
2517
2518 return NULL;
2519 }
2520
2521 /* If the syntax bit is set, rollback. */
2522 re_string_set_index (regexp, start_idx);
2523 *token = start_token;
2524 token->type = CHARACTER;
2525 /* mb_partial and word_char bits should be already initialized by
2526 peek_token. */
2527 return elem;
2528 }
2529
2530 if (BE (end != REG_MISSING && start > end, 0))
2531 {
2532 /* First number greater than second. */
2533 *err = REG_BADBR;
2534 return NULL;
2535 }
2536 }
2537 else
2538 {
2539 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2540 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2541 }
2542
2543 fetch_token (token, regexp, syntax);
2544
2545 if (BE (elem == NULL, 0))
2546 return NULL;
2547 if (BE (start == 0 && end == 0, 0))
2548 {
2549 postorder (elem, free_tree, NULL);
2550 return NULL;
2551 }
2552
2553 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2554 if (BE (start > 0, 0))
2555 {
2556 tree = elem;
2557 for (i = 2; i <= start; ++i)
2558 {
2559 elem = duplicate_tree (elem, dfa);
2560 tree = create_tree (dfa, tree, elem, CONCAT);
2561 if (BE (elem == NULL || tree == NULL, 0))
2562 goto parse_dup_op_espace;
2563 }
2564
2565 if (start == end)
2566 return tree;
2567
2568 /* Duplicate ELEM before it is marked optional. */
2569 elem = duplicate_tree (elem, dfa);
2570 old_tree = tree;
2571 }
2572 else
2573 old_tree = NULL;
2574
2575 if (elem->token.type == SUBEXP)
2576 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2577
2578 tree = create_tree (dfa, elem, NULL,
2579 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2580 if (BE (tree == NULL, 0))
2581 goto parse_dup_op_espace;
2582
2583 /* This loop is actually executed only when end != REG_MISSING,
2584 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2585 already created the start+1-th copy. */
2586 if ((Idx) -1 < 0 || end != REG_MISSING)
2587 for (i = start + 2; i <= end; ++i)
2588 {
2589 elem = duplicate_tree (elem, dfa);
2590 tree = create_tree (dfa, tree, elem, CONCAT);
2591 if (BE (elem == NULL || tree == NULL, 0))
2592 goto parse_dup_op_espace;
2593
2594 tree = create_tree (dfa, tree, NULL, OP_ALT);
2595 if (BE (tree == NULL, 0))
2596 goto parse_dup_op_espace;
2597 }
2598
2599 if (old_tree)
2600 tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602 return tree;
2603
2604 parse_dup_op_espace:
2605 *err = REG_ESPACE;
2606 return NULL;
2607}
2608
2609/* Size of the names for collating symbol/equivalence_class/character_class.
2610 I'm not sure, but maybe enough. */
2611#define BRACKET_NAME_BUF_SIZE 32
2612
2613#ifndef _LIBC
2614 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615 Build the range expression which starts from START_ELEM, and ends
2616 at END_ELEM. The result are written to MBCSET and SBCSET.
2617 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618 mbcset->range_ends, is a pointer argument sinse we may
2619 update it. */
2620
2621static reg_errcode_t
2622internal_function
2623# ifdef RE_ENABLE_I18N
2624build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2625 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626# else /* not RE_ENABLE_I18N */
2627build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628 bracket_elem_t *end_elem)
2629# endif /* not RE_ENABLE_I18N */
2630{
2631 unsigned int start_ch, end_ch;
2632 /* Equivalence Classes and Character Classes can't be a range start/end. */
2633 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2635 0))
2636 return REG_ERANGE;
2637
2638 /* We can handle no multi character collating elements without libc
2639 support. */
2640 if (BE ((start_elem->type == COLL_SYM
2641 && strlen ((char *) start_elem->opr.name) > 1)
2642 || (end_elem->type == COLL_SYM
2643 && strlen ((char *) end_elem->opr.name) > 1), 0))
2644 return REG_ECOLLATE;
2645
2646# ifdef RE_ENABLE_I18N
2647 {
2648 wchar_t wc;
2649 wint_t start_wc;
2650 wint_t end_wc;
2651 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2652
2653 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655 : 0));
2656 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658 : 0));
2659 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660 ? __btowc (start_ch) : start_elem->opr.wch);
2661 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662 ? __btowc (end_ch) : end_elem->opr.wch);
2663 if (start_wc == WEOF || end_wc == WEOF)
2664 return REG_ECOLLATE;
2665 cmp_buf[0] = start_wc;
2666 cmp_buf[4] = end_wc;
2667 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668 return REG_ERANGE;
2669
2670 /* Got valid collation sequence values, add them as a new entry.
2671 However, for !_LIBC we have no collation elements: if the
2672 character set is single byte, the single byte character set
2673 that we build below suffices. parse_bracket_exp passes
2674 no MBCSET if dfa->mb_cur_max == 1. */
2675 if (mbcset)
2676 {
2677 /* Check the space of the arrays. */
2678 if (BE (*range_alloc == mbcset->nranges, 0))
2679 {
2680 /* There is not enough space, need realloc. */
2681 wchar_t *new_array_start, *new_array_end;
2682 Idx new_nranges;
2683
2684 /* +1 in case of mbcset->nranges is 0. */
2685 new_nranges = 2 * mbcset->nranges + 1;
2686 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687 are NULL if *range_alloc == 0. */
2688 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689 new_nranges);
2690 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691 new_nranges);
2692
2693 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694 return REG_ESPACE;
2695
2696 mbcset->range_starts = new_array_start;
2697 mbcset->range_ends = new_array_end;
2698 *range_alloc = new_nranges;
2699 }
2700
2701 mbcset->range_starts[mbcset->nranges] = start_wc;
2702 mbcset->range_ends[mbcset->nranges++] = end_wc;
2703 }
2704
2705 /* Build the table for single byte characters. */
2706 for (wc = 0; wc < SBC_MAX; ++wc)
2707 {
2708 cmp_buf[2] = wc;
2709 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711 bitset_set (sbcset, wc);
2712 }
2713 }
2714# else /* not RE_ENABLE_I18N */
2715 {
2716 unsigned int ch;
2717 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719 : 0));
2720 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722 : 0));
2723 if (start_ch > end_ch)
2724 return REG_ERANGE;
2725 /* Build the table for single byte characters. */
2726 for (ch = 0; ch < SBC_MAX; ++ch)
2727 if (start_ch <= ch && ch <= end_ch)
2728 bitset_set (sbcset, ch);
2729 }
2730# endif /* not RE_ENABLE_I18N */
2731 return REG_NOERROR;
2732}
2733#endif /* not _LIBC */
2734
2735#ifndef _LIBC
2736/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737 Build the collating element which is represented by NAME.
2738 The result are written to MBCSET and SBCSET.
2739 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740 pointer argument since we may update it. */
2741
2742static reg_errcode_t
2743internal_function
2744build_collating_symbol (bitset_t sbcset,
2745# ifdef RE_ENABLE_I18N
2746 re_charset_t *mbcset, Idx *coll_sym_alloc,
2747# endif
2748 const unsigned char *name)
2749{
2750 size_t name_len = strlen ((const char *) name);
2751 if (BE (name_len != 1, 0))
2752 return REG_ECOLLATE;
2753 else
2754 {
2755 bitset_set (sbcset, name[0]);
2756 return REG_NOERROR;
2757 }
2758}
2759#endif /* not _LIBC */
2760
2761/* This function parse bracket expression like "[abc]", "[a-c]",
2762 "[[.a-a.]]" etc. */
2763
2764static bin_tree_t *
2765parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2766 reg_syntax_t syntax, reg_errcode_t *err)
2767{
2768#ifdef _LIBC
2769 const unsigned char *collseqmb;
2770 const char *collseqwc;
2771 uint32_t nrules;
2772 int32_t table_size;
2773 const int32_t *symb_table;
2774 const unsigned char *extra;
2775
2776 /* Local function for parse_bracket_exp used in _LIBC environement.
2777 Seek the collating symbol entry correspondings to NAME.
2778 Return the index of the symbol in the SYMB_TABLE. */
2779
2780 auto inline int32_t
2781 __attribute ((always_inline))
2782 seek_collating_symbol_entry (name, name_len)
2783 const unsigned char *name;
2784 size_t name_len;
2785 {
2786 int32_t hash = elem_hash ((const char *) name, name_len);
2787 int32_t elem = hash % table_size;
2788 if (symb_table[2 * elem] != 0)
2789 {
2790 int32_t second = hash % (table_size - 2) + 1;
2791
2792 do
2793 {
2794 /* First compare the hashing value. */
2795 if (symb_table[2 * elem] == hash
2796 /* Compare the length of the name. */
2797 && name_len == extra[symb_table[2 * elem + 1]]
2798 /* Compare the name. */
2799 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2800 name_len) == 0)
2801 {
2802 /* Yep, this is the entry. */
2803 break;
2804 }
2805
2806 /* Next entry. */
2807 elem += second;
2808 }
2809 while (symb_table[2 * elem] != 0);
2810 }
2811 return elem;
2812 }
2813
2814 /* Local function for parse_bracket_exp used in _LIBC environement.
2815 Look up the collation sequence value of BR_ELEM.
2816 Return the value if succeeded, UINT_MAX otherwise. */
2817
2818 auto inline unsigned int
2819 __attribute ((always_inline))
2820 lookup_collation_sequence_value (br_elem)
2821 bracket_elem_t *br_elem;
2822 {
2823 if (br_elem->type == SB_CHAR)
2824 {
2825 /*
2826 if (MB_CUR_MAX == 1)
2827 */
2828 if (nrules == 0)
2829 return collseqmb[br_elem->opr.ch];
2830 else
2831 {
2832 wint_t wc = __btowc (br_elem->opr.ch);
2833 return __collseq_table_lookup (collseqwc, wc);
2834 }
2835 }
2836 else if (br_elem->type == MB_CHAR)
2837 {
2838 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2839 }
2840 else if (br_elem->type == COLL_SYM)
2841 {
2842 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2843 if (nrules != 0)
2844 {
2845 int32_t elem, idx;
2846 elem = seek_collating_symbol_entry (br_elem->opr.name,
2847 sym_name_len);
2848 if (symb_table[2 * elem] != 0)
2849 {
2850 /* We found the entry. */
2851 idx = symb_table[2 * elem + 1];
2852 /* Skip the name of collating element name. */
2853 idx += 1 + extra[idx];
2854 /* Skip the byte sequence of the collating element. */
2855 idx += 1 + extra[idx];
2856 /* Adjust for the alignment. */
2857 idx = (idx + 3) & ~3;
2858 /* Skip the multibyte collation sequence value. */
2859 idx += sizeof (unsigned int);
2860 /* Skip the wide char sequence of the collating element. */
2861 idx += sizeof (unsigned int) *
2862 (1 + *(unsigned int *) (extra + idx));
2863 /* Return the collation sequence value. */
2864 return *(unsigned int *) (extra + idx);
2865 }
2866 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2867 {
2868 /* No valid character. Match it as a single byte
2869 character. */
2870 return collseqmb[br_elem->opr.name[0]];
2871 }
2872 }
2873 else if (sym_name_len == 1)
2874 return collseqmb[br_elem->opr.name[0]];
2875 }
2876 return UINT_MAX;
2877 }
2878
2879 /* Local function for parse_bracket_exp used in _LIBC environement.
2880 Build the range expression which starts from START_ELEM, and ends
2881 at END_ELEM. The result are written to MBCSET and SBCSET.
2882 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2883 mbcset->range_ends, is a pointer argument sinse we may
2884 update it. */
2885
2886 auto inline reg_errcode_t
2887 __attribute ((always_inline))
2888 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2889 re_charset_t *mbcset;
2890 Idx *range_alloc;
2891 bitset_t sbcset;
2892 bracket_elem_t *start_elem, *end_elem;
2893 {
2894 unsigned int ch;
2895 uint32_t start_collseq;
2896 uint32_t end_collseq;
2897
2898 /* Equivalence Classes and Character Classes can't be a range
2899 start/end. */
2900 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2901 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2902 0))
2903 return REG_ERANGE;
2904
2905 start_collseq = lookup_collation_sequence_value (start_elem);
2906 end_collseq = lookup_collation_sequence_value (end_elem);
2907 /* Check start/end collation sequence values. */
2908 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2909 return REG_ECOLLATE;
2910 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2911 return REG_ERANGE;
2912
2913 /* Got valid collation sequence values, add them as a new entry.
2914 However, if we have no collation elements, and the character set
2915 is single byte, the single byte character set that we
2916 build below suffices. */
2917 if (nrules > 0 || dfa->mb_cur_max > 1)
2918 {
2919 /* Check the space of the arrays. */
2920 if (BE (*range_alloc == mbcset->nranges, 0))
2921 {
2922 /* There is not enough space, need realloc. */
2923 uint32_t *new_array_start;
2924 uint32_t *new_array_end;
2925 Idx new_nranges;
2926
2927 /* +1 in case of mbcset->nranges is 0. */
2928 new_nranges = 2 * mbcset->nranges + 1;
2929 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2930 new_nranges);
2931 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2932 new_nranges);
2933
2934 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2935 return REG_ESPACE;
2936
2937 mbcset->range_starts = new_array_start;
2938 mbcset->range_ends = new_array_end;
2939 *range_alloc = new_nranges;
2940 }
2941
2942 mbcset->range_starts[mbcset->nranges] = start_collseq;
2943 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2944 }
2945
2946 /* Build the table for single byte characters. */
2947 for (ch = 0; ch < SBC_MAX; ch++)
2948 {
2949 uint32_t ch_collseq;
2950 /*
2951 if (MB_CUR_MAX == 1)
2952 */
2953 if (nrules == 0)
2954 ch_collseq = collseqmb[ch];
2955 else
2956 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2957 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2958 bitset_set (sbcset, ch);
2959 }
2960 return REG_NOERROR;
2961 }
2962
2963 /* Local function for parse_bracket_exp used in _LIBC environement.
2964 Build the collating element which is represented by NAME.
2965 The result are written to MBCSET and SBCSET.
2966 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2967 pointer argument sinse we may update it. */
2968
2969 auto inline reg_errcode_t
2970 __attribute ((always_inline))
2971 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2972 re_charset_t *mbcset;
2973 Idx *coll_sym_alloc;
2974 bitset_t sbcset;
2975 const unsigned char *name;
2976 {
2977 int32_t elem, idx;
2978 size_t name_len = strlen ((const char *) name);
2979 if (nrules != 0)
2980 {
2981 elem = seek_collating_symbol_entry (name, name_len);
2982 if (symb_table[2 * elem] != 0)
2983 {
2984 /* We found the entry. */
2985 idx = symb_table[2 * elem + 1];
2986 /* Skip the name of collating element name. */
2987 idx += 1 + extra[idx];
2988 }
2989 else if (symb_table[2 * elem] == 0 && name_len == 1)
2990 {
2991 /* No valid character, treat it as a normal
2992 character. */
2993 bitset_set (sbcset, name[0]);
2994 return REG_NOERROR;
2995 }
2996 else
2997 return REG_ECOLLATE;
2998
2999 /* Got valid collation sequence, add it as a new entry. */
3000 /* Check the space of the arrays. */
3001 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3002 {
3003 /* Not enough, realloc it. */
3004 /* +1 in case of mbcset->ncoll_syms is 0. */
3005 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3006 /* Use realloc since mbcset->coll_syms is NULL
3007 if *alloc == 0. */
3008 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3009 new_coll_sym_alloc);
3010 if (BE (new_coll_syms == NULL, 0))
3011 return REG_ESPACE;
3012 mbcset->coll_syms = new_coll_syms;
3013 *coll_sym_alloc = new_coll_sym_alloc;
3014 }
3015 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3016 return REG_NOERROR;
3017 }
3018 else
3019 {
3020 if (BE (name_len != 1, 0))
3021 return REG_ECOLLATE;
3022 else
3023 {
3024 bitset_set (sbcset, name[0]);
3025 return REG_NOERROR;
3026 }
3027 }
3028 }
3029#endif
3030
3031 re_token_t br_token;
3032 re_bitset_ptr_t sbcset;
3033#ifdef RE_ENABLE_I18N
3034 re_charset_t *mbcset;
3035 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3036 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3037#endif /* not RE_ENABLE_I18N */
3038 bool non_match = false;
3039 bin_tree_t *work_tree;
3040 int token_len;
3041 bool first_round = true;
3042#ifdef _LIBC
3043 collseqmb = (const unsigned char *)
3044 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046 if (nrules)
3047 {
3048 /*
3049 if (MB_CUR_MAX > 1)
3050 */
3051 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3054 _NL_COLLATE_SYMB_TABLEMB);
3055 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3056 _NL_COLLATE_SYMB_EXTRAMB);
3057 }
3058#endif
3059 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3060#ifdef RE_ENABLE_I18N
3061 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3062#endif /* RE_ENABLE_I18N */
3063#ifdef RE_ENABLE_I18N
3064 if (BE (sbcset == NULL || mbcset == NULL, 0))
3065#else
3066 if (BE (sbcset == NULL, 0))
3067#endif /* RE_ENABLE_I18N */
3068 {
3069 *err = REG_ESPACE;
3070 return NULL;
3071 }
3072
3073 token_len = peek_token_bracket (token, regexp, syntax);
3074 if (BE (token->type == END_OF_RE, 0))
3075 {
3076 *err = REG_BADPAT;
3077 goto parse_bracket_exp_free_return;
3078 }
3079 if (token->type == OP_NON_MATCH_LIST)
3080 {
3081#ifdef RE_ENABLE_I18N
3082 mbcset->non_match = 1;
3083#endif /* not RE_ENABLE_I18N */
3084 non_match = true;
3085 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3086 bitset_set (sbcset, '\0');
3087 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3088 token_len = peek_token_bracket (token, regexp, syntax);
3089 if (BE (token->type == END_OF_RE, 0))
3090 {
3091 *err = REG_BADPAT;
3092 goto parse_bracket_exp_free_return;
3093 }
3094 }
3095
3096 /* We treat the first ']' as a normal character. */
3097 if (token->type == OP_CLOSE_BRACKET)
3098 token->type = CHARACTER;
3099
3100 while (1)
3101 {
3102 bracket_elem_t start_elem, end_elem;
3103 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3104 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3105 reg_errcode_t ret;
3106 int token_len2 = 0;
3107 bool is_range_exp = false;
3108 re_token_t token2;
3109
3110 start_elem.opr.name = start_name_buf;
3111 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3112 syntax, first_round);
3113 if (BE (ret != REG_NOERROR, 0))
3114 {
3115 *err = ret;
3116 goto parse_bracket_exp_free_return;
3117 }
3118 first_round = false;
3119
3120 /* Get information about the next token. We need it in any case. */
3121 token_len = peek_token_bracket (token, regexp, syntax);
3122
3123 /* Do not check for ranges if we know they are not allowed. */
3124 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3125 {
3126 if (BE (token->type == END_OF_RE, 0))
3127 {
3128 *err = REG_EBRACK;
3129 goto parse_bracket_exp_free_return;
3130 }
3131 if (token->type == OP_CHARSET_RANGE)
3132 {
3133 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3134 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3135 if (BE (token2.type == END_OF_RE, 0))
3136 {
3137 *err = REG_EBRACK;
3138 goto parse_bracket_exp_free_return;
3139 }
3140 if (token2.type == OP_CLOSE_BRACKET)
3141 {
3142 /* We treat the last '-' as a normal character. */
3143 re_string_skip_bytes (regexp, -token_len);
3144 token->type = CHARACTER;
3145 }
3146 else
3147 is_range_exp = true;
3148 }
3149 }
3150
3151 if (is_range_exp == true)
3152 {
3153 end_elem.opr.name = end_name_buf;
3154 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3155 dfa, syntax, true);
3156 if (BE (ret != REG_NOERROR, 0))
3157 {
3158 *err = ret;
3159 goto parse_bracket_exp_free_return;
3160 }
3161
3162 token_len = peek_token_bracket (token, regexp, syntax);
3163
3164#ifdef _LIBC
3165 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3166 &start_elem, &end_elem);
3167#else
3168# ifdef RE_ENABLE_I18N
3169 *err = build_range_exp (sbcset,
3170 dfa->mb_cur_max > 1 ? mbcset : NULL,
3171 &range_alloc, &start_elem, &end_elem);
3172# else
3173 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3174# endif
3175#endif /* RE_ENABLE_I18N */
3176 if (BE (*err != REG_NOERROR, 0))
3177 goto parse_bracket_exp_free_return;
3178 }
3179 else
3180 {
3181 switch (start_elem.type)
3182 {
3183 case SB_CHAR:
3184 bitset_set (sbcset, start_elem.opr.ch);
3185 break;
3186#ifdef RE_ENABLE_I18N
3187 case MB_CHAR:
3188 /* Check whether the array has enough space. */
3189 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3190 {
3191 wchar_t *new_mbchars;
3192 /* Not enough, realloc it. */
3193 /* +1 in case of mbcset->nmbchars is 0. */
3194 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3195 /* Use realloc since array is NULL if *alloc == 0. */
3196 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3197 mbchar_alloc);
3198 if (BE (new_mbchars == NULL, 0))
3199 goto parse_bracket_exp_espace;
3200 mbcset->mbchars = new_mbchars;
3201 }
3202 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3203 break;
3204#endif /* RE_ENABLE_I18N */
3205 case EQUIV_CLASS:
3206 *err = build_equiv_class (sbcset,
3207#ifdef RE_ENABLE_I18N
3208 mbcset, &equiv_class_alloc,
3209#endif /* RE_ENABLE_I18N */
3210 start_elem.opr.name);
3211 if (BE (*err != REG_NOERROR, 0))
3212 goto parse_bracket_exp_free_return;
3213 break;
3214 case COLL_SYM:
3215 *err = build_collating_symbol (sbcset,
3216#ifdef RE_ENABLE_I18N
3217 mbcset, &coll_sym_alloc,
3218#endif /* RE_ENABLE_I18N */
3219 start_elem.opr.name);
3220 if (BE (*err != REG_NOERROR, 0))
3221 goto parse_bracket_exp_free_return;
3222 break;
3223 case CHAR_CLASS:
3224 *err = build_charclass (regexp->trans, sbcset,
3225#ifdef RE_ENABLE_I18N
3226 mbcset, &char_class_alloc,
3227#endif /* RE_ENABLE_I18N */
3228 start_elem.opr.name, syntax);
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3231 break;
3232 default:
3233 assert (0);
3234 break;
3235 }
3236 }
3237 if (BE (token->type == END_OF_RE, 0))
3238 {
3239 *err = REG_EBRACK;
3240 goto parse_bracket_exp_free_return;
3241 }
3242 if (token->type == OP_CLOSE_BRACKET)
3243 break;
3244 }
3245
3246 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3247
3248 /* If it is non-matching list. */
3249 if (non_match)
3250 bitset_not (sbcset);
3251
3252#ifdef RE_ENABLE_I18N
3253 /* Ensure only single byte characters are set. */
3254 if (dfa->mb_cur_max > 1)
3255 bitset_mask (sbcset, dfa->sb_char);
3256
3257 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3258 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3259 || mbcset->non_match)))
3260 {
3261 bin_tree_t *mbc_tree;
3262 int sbc_idx;
3263 /* Build a tree for complex bracket. */
3264 dfa->has_mb_node = 1;
3265 br_token.type = COMPLEX_BRACKET;
3266 br_token.opr.mbcset = mbcset;
3267 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3268 if (BE (mbc_tree == NULL, 0))
3269 goto parse_bracket_exp_espace;
3270 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3271 if (sbcset[sbc_idx])
3272 break;
3273 /* If there are no bits set in sbcset, there is no point
3274 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3275 if (sbc_idx < BITSET_WORDS)
3276 {
3277 /* Build a tree for simple bracket. */
3278 br_token.type = SIMPLE_BRACKET;
3279 br_token.opr.sbcset = sbcset;
3280 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3281 if (BE (work_tree == NULL, 0))
3282 goto parse_bracket_exp_espace;
3283
3284 /* Then join them by ALT node. */
3285 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3286 if (BE (work_tree == NULL, 0))
3287 goto parse_bracket_exp_espace;
3288 }
3289 else
3290 {
3291 re_free (sbcset);
3292 work_tree = mbc_tree;
3293 }
3294 }
3295 else
3296#endif /* not RE_ENABLE_I18N */
3297 {
3298#ifdef RE_ENABLE_I18N
3299 free_charset (mbcset);
3300#endif
3301 /* Build a tree for simple bracket. */
3302 br_token.type = SIMPLE_BRACKET;
3303 br_token.opr.sbcset = sbcset;
3304 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3305 if (BE (work_tree == NULL, 0))
3306 goto parse_bracket_exp_espace;
3307 }
3308 return work_tree;
3309
3310 parse_bracket_exp_espace:
3311 *err = REG_ESPACE;
3312 parse_bracket_exp_free_return:
3313 re_free (sbcset);
3314#ifdef RE_ENABLE_I18N
3315 free_charset (mbcset);
3316#endif /* RE_ENABLE_I18N */
3317 return NULL;
3318}
3319
3320/* Parse an element in the bracket expression. */
3321
3322static reg_errcode_t
3323parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3324 re_token_t *token, int token_len, re_dfa_t *dfa,
3325 reg_syntax_t syntax, bool accept_hyphen)
3326{
3327#ifdef RE_ENABLE_I18N
3328 int cur_char_size;
3329 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3330 if (cur_char_size > 1)
3331 {
3332 elem->type = MB_CHAR;
3333 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3334 re_string_skip_bytes (regexp, cur_char_size);
3335 return REG_NOERROR;
3336 }
3337#endif /* RE_ENABLE_I18N */
3338 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3339 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3340 || token->type == OP_OPEN_EQUIV_CLASS)
3341 return parse_bracket_symbol (elem, regexp, token);
3342 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3343 {
3344 /* A '-' must only appear as anything but a range indicator before
3345 the closing bracket. Everything else is an error. */
3346 re_token_t token2;
3347 (void) peek_token_bracket (&token2, regexp, syntax);
3348 if (token2.type != OP_CLOSE_BRACKET)
3349 /* The actual error value is not standardized since this whole
3350 case is undefined. But ERANGE makes good sense. */
3351 return REG_ERANGE;
3352 }
3353 elem->type = SB_CHAR;
3354 elem->opr.ch = token->opr.c;
3355 return REG_NOERROR;
3356}
3357
3358/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3359 such as [:<character_class>:], [.<collating_element>.], and
3360 [=<equivalent_class>=]. */
3361
3362static reg_errcode_t
3363parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3364 re_token_t *token)
3365{
3366 unsigned char ch, delim = token->opr.c;
3367 int i = 0;
3368 if (re_string_eoi(regexp))
3369 return REG_EBRACK;
3370 for (;; ++i)
3371 {
3372 if (i >= BRACKET_NAME_BUF_SIZE)
3373 return REG_EBRACK;
3374 if (token->type == OP_OPEN_CHAR_CLASS)
3375 ch = re_string_fetch_byte_case (regexp);
3376 else
3377 ch = re_string_fetch_byte (regexp);
3378 if (re_string_eoi(regexp))
3379 return REG_EBRACK;
3380 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3381 break;
3382 elem->opr.name[i] = ch;
3383 }
3384 re_string_skip_bytes (regexp, 1);
3385 elem->opr.name[i] = '\0';
3386 switch (token->type)
3387 {
3388 case OP_OPEN_COLL_ELEM:
3389 elem->type = COLL_SYM;
3390 break;
3391 case OP_OPEN_EQUIV_CLASS:
3392 elem->type = EQUIV_CLASS;
3393 break;
3394 case OP_OPEN_CHAR_CLASS:
3395 elem->type = CHAR_CLASS;
3396 break;
3397 default:
3398 break;
3399 }
3400 return REG_NOERROR;
3401}
3402
3403 /* Helper function for parse_bracket_exp.
3404 Build the equivalence class which is represented by NAME.
3405 The result are written to MBCSET and SBCSET.
3406 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3407 is a pointer argument sinse we may update it. */
3408
3409static reg_errcode_t
3410#ifdef RE_ENABLE_I18N
3411build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3412 Idx *equiv_class_alloc, const unsigned char *name)
3413#else /* not RE_ENABLE_I18N */
3414build_equiv_class (bitset_t sbcset, const unsigned char *name)
3415#endif /* not RE_ENABLE_I18N */
3416{
3417#ifdef _LIBC
3418 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3419 if (nrules != 0)
3420 {
3421 const int32_t *table, *indirect;
3422 const unsigned char *weights, *extra, *cp;
3423 unsigned char char_buf[2];
3424 int32_t idx1, idx2;
3425 unsigned int ch;
3426 size_t len;
3427 /* This #include defines a local function! */
3428# include <locale/weight.h>
3429 /* Calculate the index for equivalence class. */
3430 cp = name;
3431 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3432 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3433 _NL_COLLATE_WEIGHTMB);
3434 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3435 _NL_COLLATE_EXTRAMB);
3436 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3437 _NL_COLLATE_INDIRECTMB);
3438 idx1 = findidx (&cp);
3439 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3440 /* This isn't a valid character. */
3441 return REG_ECOLLATE;
3442
3443 /* Build single byte matcing table for this equivalence class. */
3444 char_buf[1] = (unsigned char) '\0';
3445 len = weights[idx1];
3446 for (ch = 0; ch < SBC_MAX; ++ch)
3447 {
3448 char_buf[0] = ch;
3449 cp = char_buf;
3450 idx2 = findidx (&cp);
3451/*
3452 idx2 = table[ch];
3453*/
3454 if (idx2 == 0)
3455 /* This isn't a valid character. */
3456 continue;
3457 if (len == weights[idx2])
3458 {
3459 int cnt = 0;
3460 while (cnt <= len &&
3461 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3462 ++cnt;
3463
3464 if (cnt > len)
3465 bitset_set (sbcset, ch);
3466 }
3467 }
3468 /* Check whether the array has enough space. */
3469 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3470 {
3471 /* Not enough, realloc it. */
3472 /* +1 in case of mbcset->nequiv_classes is 0. */
3473 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3474 /* Use realloc since the array is NULL if *alloc == 0. */
3475 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3476 int32_t,
3477 new_equiv_class_alloc);
3478 if (BE (new_equiv_classes == NULL, 0))
3479 return REG_ESPACE;
3480 mbcset->equiv_classes = new_equiv_classes;
3481 *equiv_class_alloc = new_equiv_class_alloc;
3482 }
3483 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3484 }
3485 else
3486#endif /* _LIBC */
3487 {
3488 if (BE (strlen ((const char *) name) != 1, 0))
3489 return REG_ECOLLATE;
3490 bitset_set (sbcset, *name);
3491 }
3492 return REG_NOERROR;
3493}
3494
3495 /* Helper function for parse_bracket_exp.
3496 Build the character class which is represented by NAME.
3497 The result are written to MBCSET and SBCSET.
3498 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3499 is a pointer argument sinse we may update it. */
3500
3501static reg_errcode_t
3502#ifdef RE_ENABLE_I18N
3503build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3504 re_charset_t *mbcset, Idx *char_class_alloc,
3505 const unsigned char *class_name, reg_syntax_t syntax)
3506#else /* not RE_ENABLE_I18N */
3507build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3508 const unsigned char *class_name, reg_syntax_t syntax)
3509#endif /* not RE_ENABLE_I18N */
3510{
3511 int i;
3512 const char *name = (const char *) class_name;
3513
3514 /* In case of REG_ICASE "upper" and "lower" match the both of
3515 upper and lower cases. */
3516 if ((syntax & RE_ICASE)
3517 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3518 name = "alpha";
3519
3520#ifdef RE_ENABLE_I18N
3521 /* Check the space of the arrays. */
3522 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3523 {
3524 /* Not enough, realloc it. */
3525 /* +1 in case of mbcset->nchar_classes is 0. */
3526 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3527 /* Use realloc since array is NULL if *alloc == 0. */
3528 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3529 new_char_class_alloc);
3530 if (BE (new_char_classes == NULL, 0))
3531 return REG_ESPACE;
3532 mbcset->char_classes = new_char_classes;
3533 *char_class_alloc = new_char_class_alloc;
3534 }
3535 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3536#endif /* RE_ENABLE_I18N */
3537
3538#define BUILD_CHARCLASS_LOOP(ctype_func) \
3539 do { \
3540 if (BE (trans != NULL, 0)) \
3541 { \
3542 for (i = 0; i < SBC_MAX; ++i) \
3543 if (ctype_func (i)) \
3544 bitset_set (sbcset, trans[i]); \
3545 } \
3546 else \
3547 { \
3548 for (i = 0; i < SBC_MAX; ++i) \
3549 if (ctype_func (i)) \
3550 bitset_set (sbcset, i); \
3551 } \
3552 } while (0)
3553
3554 if (strcmp (name, "alnum") == 0)
3555 BUILD_CHARCLASS_LOOP (isalnum);
3556 else if (strcmp (name, "cntrl") == 0)
3557 BUILD_CHARCLASS_LOOP (iscntrl);
3558 else if (strcmp (name, "lower") == 0)
3559 BUILD_CHARCLASS_LOOP (islower);
3560 else if (strcmp (name, "space") == 0)
3561 BUILD_CHARCLASS_LOOP (isspace);
3562 else if (strcmp (name, "alpha") == 0)
3563 BUILD_CHARCLASS_LOOP (isalpha);
3564 else if (strcmp (name, "digit") == 0)
3565 BUILD_CHARCLASS_LOOP (isdigit);
3566 else if (strcmp (name, "print") == 0)
3567 BUILD_CHARCLASS_LOOP (isprint);
3568 else if (strcmp (name, "upper") == 0)
3569 BUILD_CHARCLASS_LOOP (isupper);
3570 else if (strcmp (name, "blank") == 0)
3571 BUILD_CHARCLASS_LOOP (isblank);
3572 else if (strcmp (name, "graph") == 0)
3573 BUILD_CHARCLASS_LOOP (isgraph);
3574 else if (strcmp (name, "punct") == 0)
3575 BUILD_CHARCLASS_LOOP (ispunct);
3576 else if (strcmp (name, "xdigit") == 0)
3577 BUILD_CHARCLASS_LOOP (isxdigit);
3578 else
3579 return REG_ECTYPE;
3580
3581 return REG_NOERROR;
3582}
3583
3584static bin_tree_t *
3585build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3586 const unsigned char *class_name,
3587 const unsigned char *extra, bool non_match,
3588 reg_errcode_t *err)
3589{
3590 re_bitset_ptr_t sbcset;
3591#ifdef RE_ENABLE_I18N
3592 re_charset_t *mbcset;
3593 Idx alloc = 0;
3594#endif /* not RE_ENABLE_I18N */
3595 reg_errcode_t ret;
3596 re_token_t br_token;
3597 bin_tree_t *tree;
3598
3599 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3600#ifdef RE_ENABLE_I18N
3601 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3602#endif /* RE_ENABLE_I18N */
3603
3604#ifdef RE_ENABLE_I18N
3605 if (BE (sbcset == NULL || mbcset == NULL, 0))
3606#else /* not RE_ENABLE_I18N */
3607 if (BE (sbcset == NULL, 0))
3608#endif /* not RE_ENABLE_I18N */
3609 {
3610 *err = REG_ESPACE;
3611 return NULL;
3612 }
3613
3614 if (non_match)
3615 {
3616#ifdef RE_ENABLE_I18N
3617 /*
3618 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3619 bitset_set(cset->sbcset, '\0');
3620 */
3621 mbcset->non_match = 1;
3622#endif /* not RE_ENABLE_I18N */
3623 }
3624
3625 /* We don't care the syntax in this case. */
3626 ret = build_charclass (trans, sbcset,
3627#ifdef RE_ENABLE_I18N
3628 mbcset, &alloc,
3629#endif /* RE_ENABLE_I18N */
3630 class_name, 0);
3631
3632 if (BE (ret != REG_NOERROR, 0))
3633 {
3634 re_free (sbcset);
3635#ifdef RE_ENABLE_I18N
3636 free_charset (mbcset);
3637#endif /* RE_ENABLE_I18N */
3638 *err = ret;
3639 return NULL;
3640 }
3641 /* \w match '_' also. */
3642 for (; *extra; extra++)
3643 bitset_set (sbcset, *extra);
3644
3645 /* If it is non-matching list. */
3646 if (non_match)
3647 bitset_not (sbcset);
3648
3649#ifdef RE_ENABLE_I18N
3650 /* Ensure only single byte characters are set. */
3651 if (dfa->mb_cur_max > 1)
3652 bitset_mask (sbcset, dfa->sb_char);
3653#endif
3654
3655 /* Build a tree for simple bracket. */
3656 br_token.type = SIMPLE_BRACKET;
3657 br_token.opr.sbcset = sbcset;
3658 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659 if (BE (tree == NULL, 0))
3660 goto build_word_op_espace;
3661
3662#ifdef RE_ENABLE_I18N
3663 if (dfa->mb_cur_max > 1)
3664 {
3665 bin_tree_t *mbc_tree;
3666 /* Build a tree for complex bracket. */
3667 br_token.type = COMPLEX_BRACKET;
3668 br_token.opr.mbcset = mbcset;
3669 dfa->has_mb_node = 1;
3670 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671 if (BE (mbc_tree == NULL, 0))
3672 goto build_word_op_espace;
3673 /* Then join them by ALT node. */
3674 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3675 if (BE (mbc_tree != NULL, 1))
3676 return tree;
3677 }
3678 else
3679 {
3680 free_charset (mbcset);
3681 return tree;
3682 }
3683#else /* not RE_ENABLE_I18N */
3684 return tree;
3685#endif /* not RE_ENABLE_I18N */
3686
3687 build_word_op_espace:
3688 re_free (sbcset);
3689#ifdef RE_ENABLE_I18N
3690 free_charset (mbcset);
3691#endif /* RE_ENABLE_I18N */
3692 *err = REG_ESPACE;
3693 return NULL;
3694}
3695
3696/* This is intended for the expressions like "a{1,3}".
3697 Fetch a number from `input', and return the number.
3698 Return REG_MISSING if the number field is empty like "{,1}".
3699 Return REG_ERROR if an error occurred. */
3700
3701static Idx
3702fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3703{
3704 Idx num = REG_MISSING;
3705 unsigned char c;
3706 while (1)
3707 {
3708 fetch_token (token, input, syntax);
3709 c = token->opr.c;
3710 if (BE (token->type == END_OF_RE, 0))
3711 return REG_ERROR;
3712 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3713 break;
3714 num = ((token->type != CHARACTER || c < '0' || '9' < c
3715 || num == REG_ERROR)
3716 ? REG_ERROR
3717 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3718 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3719 }
3720 return num;
3721}
3722
3723
3724#ifdef RE_ENABLE_I18N
3725static void
3726free_charset (re_charset_t *cset)
3727{
3728 re_free (cset->mbchars);
3729# ifdef _LIBC
3730 re_free (cset->coll_syms);
3731 re_free (cset->equiv_classes);
3732 re_free (cset->range_starts);
3733 re_free (cset->range_ends);
3734# endif
3735 re_free (cset->char_classes);
3736 re_free (cset);
3737}
3738#endif /* RE_ENABLE_I18N */
3739
3740
3741/* Functions for binary tree operation. */
3742
3743/* Create a tree node. */
3744
3745static bin_tree_t *
3746create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3747 re_token_type_t type)
3748{
3749 re_token_t t;
3750 t.type = type;
3751 return create_token_tree (dfa, left, right, &t);
3752}
3753
3754static bin_tree_t *
3755create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3756 const re_token_t *token)
3757{
3758 bin_tree_t *tree;
3759 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3760 {
3761 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3762
3763 if (storage == NULL)
3764 return NULL;
3765 storage->next = dfa->str_tree_storage;
3766 dfa->str_tree_storage = storage;
3767 dfa->str_tree_storage_idx = 0;
3768 }
3769 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3770
3771 tree->parent = NULL;
3772 tree->left = left;
3773 tree->right = right;
3774 tree->token = *token;
3775 tree->token.duplicated = 0;
3776 tree->token.opt_subexp = 0;
3777 tree->first = NULL;
3778 tree->next = NULL;
3779 tree->node_idx = REG_MISSING;
3780
3781 if (left != NULL)
3782 left->parent = tree;
3783 if (right != NULL)
3784 right->parent = tree;
3785 return tree;
3786}
3787
3788/* Mark the tree SRC as an optional subexpression.
3789 To be called from preorder or postorder. */
3790
3791static reg_errcode_t
3792mark_opt_subexp (void *extra, bin_tree_t *node)
3793{
3794 Idx idx = (Idx) (long) extra;
3795 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3796 node->token.opt_subexp = 1;
3797
3798 return REG_NOERROR;
3799}
3800
3801/* Free the allocated memory inside NODE. */
3802
3803static void
3804free_token (re_token_t *node)
3805{
3806#ifdef RE_ENABLE_I18N
3807 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3808 free_charset (node->opr.mbcset);
3809 else
3810#endif /* RE_ENABLE_I18N */
3811 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3812 re_free (node->opr.sbcset);
3813}
3814
3815/* Worker function for tree walking. Free the allocated memory inside NODE
3816 and its children. */
3817
3818static reg_errcode_t
3819free_tree (void *extra, bin_tree_t *node)
3820{
3821 free_token (&node->token);
3822 return REG_NOERROR;
3823}
3824
3825
3826/* Duplicate the node SRC, and return new node. This is a preorder
3827 visit similar to the one implemented by the generic visitor, but
3828 we need more infrastructure to maintain two parallel trees --- so,
3829 it's easier to duplicate. */
3830
3831static bin_tree_t *
3832duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3833{
3834 const bin_tree_t *node;
3835 bin_tree_t *dup_root;
3836 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3837
3838 for (node = root; ; )
3839 {
3840 /* Create a new tree and link it back to the current parent. */
3841 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3842 if (*p_new == NULL)
3843 return NULL;
3844 (*p_new)->parent = dup_node;
3845 (*p_new)->token.duplicated = 1;
3846 dup_node = *p_new;
3847
3848 /* Go to the left node, or up and to the right. */
3849 if (node->left)
3850 {
3851 node = node->left;
3852 p_new = &dup_node->left;
3853 }
3854 else
3855 {
3856 const bin_tree_t *prev = NULL;
3857 while (node->right == prev || node->right == NULL)
3858 {
3859 prev = node;
3860 node = node->parent;
3861 dup_node = dup_node->parent;
3862 if (!node)
3863 return dup_root;
3864 }
3865 node = node->right;
3866 p_new = &dup_node->right;
3867 }
3868 }
3869}
Note: See TracBrowser for help on using the repository browser.