source: vendor/glibc/current/posix/regcomp.c

Last change on this file was 2237, checked in by (none), 20 years ago

This commit was manufactured by cvs2svn to create branch 'GNU'.

  • Property cvs2svn:cvs-rev set to 1.1
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 109.8 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27static void init_word_char (re_dfa_t *dfa);
28#ifdef RE_ENABLE_I18N
29static void free_charset (re_charset_t *cset);
30#endif /* RE_ENABLE_I18N */
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33#ifdef RE_ENABLE_I18N
34static void optimize_utf8 (re_dfa_t *dfa);
35#endif
36static reg_errcode_t analyze (regex_t *preg);
37static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52 int top_clone_node, int root_node,
53 unsigned int constraint);
54static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55 unsigned int constraint);
56static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57 unsigned int constraint);
58static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60 int node, int root);
61static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62static int fetch_number (re_string_t *input, re_token_t *token,
63 reg_syntax_t syntax);
64static void fetch_token (re_token_t *result, re_string_t *input,
65 reg_syntax_t syntax);
66static int peek_token (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68static int peek_token_bracket (re_token_t *token, re_string_t *input,
69 reg_syntax_t syntax);
70static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71 reg_syntax_t syntax, reg_errcode_t *err);
72static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82 re_token_t *token, reg_syntax_t syntax,
83 int nest, reg_errcode_t *err);
84static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85 re_dfa_t *dfa, re_token_t *token,
86 reg_syntax_t syntax, reg_errcode_t *err);
87static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88 re_token_t *token, reg_syntax_t syntax,
89 reg_errcode_t *err);
90static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91 re_string_t *regexp,
92 re_token_t *token, int token_len,
93 re_dfa_t *dfa,
94 reg_syntax_t syntax,
95 int accept_hyphen);
96static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 re_string_t *regexp,
98 re_token_t *token);
99#ifndef _LIBC
100# ifdef RE_ENABLE_I18N
101static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102 re_charset_t *mbcset, int *range_alloc,
103 bracket_elem_t *start_elem,
104 bracket_elem_t *end_elem);
105static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106 re_charset_t *mbcset,
107 int *coll_sym_alloc,
108 const unsigned char *name);
109# else /* not RE_ENABLE_I18N */
110static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111 bracket_elem_t *start_elem,
112 bracket_elem_t *end_elem);
113static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115# endif /* not RE_ENABLE_I18N */
116#endif /* not _LIBC */
117#ifdef RE_ENABLE_I18N
118static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 re_charset_t *mbcset,
120 int *equiv_class_alloc,
121 const unsigned char *name);
122static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123 re_bitset_ptr_t sbcset,
124 re_charset_t *mbcset,
125 int *char_class_alloc,
126 const unsigned char *class_name,
127 reg_syntax_t syntax);
128#else /* not RE_ENABLE_I18N */
129static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130 const unsigned char *name);
131static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132 re_bitset_ptr_t sbcset,
133 const unsigned char *class_name,
134 reg_syntax_t syntax);
135#endif /* not RE_ENABLE_I18N */
136static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137 unsigned RE_TRANSLATE_TYPE trans,
138 const unsigned char *class_name,
139 const unsigned char *extra,
140 int non_match, reg_errcode_t *err);
141static bin_tree_t *create_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 re_token_type_t type);
144static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145 bin_tree_t *left, bin_tree_t *right,
146 const re_token_t *token);
147static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148static void free_token (re_token_t *node);
149static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
151
152
153/* This table gives an error message for each of the error codes listed
154 in regex.h. Obviously the order here has to be same as there.
155 POSIX doesn't require that we do anything for REG_NOERROR,
156 but why not be nice? */
157
158const char __re_error_msgid[] attribute_hidden =
159 {
160#define REG_NOERROR_IDX 0
161 gettext_noop ("Success") /* REG_NOERROR */
162 "\0"
163#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
164 gettext_noop ("No match") /* REG_NOMATCH */
165 "\0"
166#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
167 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
168 "\0"
169#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
170 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
171 "\0"
172#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
173 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
174 "\0"
175#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
176 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
177 "\0"
178#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
179 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
180 "\0"
181#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
182 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
183 "\0"
184#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
185 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
186 "\0"
187#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
188 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
189 "\0"
190#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
191 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
192 "\0"
193#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
194 gettext_noop ("Invalid range end") /* REG_ERANGE */
195 "\0"
196#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
197 gettext_noop ("Memory exhausted") /* REG_ESPACE */
198 "\0"
199#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
200 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
201 "\0"
202#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
203 gettext_noop ("Premature end of regular expression") /* REG_EEND */
204 "\0"
205#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
206 gettext_noop ("Regular expression too big") /* REG_ESIZE */
207 "\0"
208#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
209 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
210 };
211
212const size_t __re_error_msgid_idx[] attribute_hidden =
213 {
214 REG_NOERROR_IDX,
215 REG_NOMATCH_IDX,
216 REG_BADPAT_IDX,
217 REG_ECOLLATE_IDX,
218 REG_ECTYPE_IDX,
219 REG_EESCAPE_IDX,
220 REG_ESUBREG_IDX,
221 REG_EBRACK_IDX,
222 REG_EPAREN_IDX,
223 REG_EBRACE_IDX,
224 REG_BADBR_IDX,
225 REG_ERANGE_IDX,
226 REG_ESPACE_IDX,
227 REG_BADRPT_IDX,
228 REG_EEND_IDX,
229 REG_ESIZE_IDX,
230 REG_ERPAREN_IDX
231 };
232
233
234/* Entry points for GNU code. */
235
236/* re_compile_pattern is the GNU regular expression compiler: it
237 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
238 Returns 0 if the pattern was valid, otherwise an error string.
239
240 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
241 are set in BUFP on entry. */
242
243const char *
244re_compile_pattern (pattern, length, bufp)
245 const char *pattern;
246 size_t length;
247 struct re_pattern_buffer *bufp;
248{
249 reg_errcode_t ret;
250
251 /* And GNU code determines whether or not to get register information
252 by passing null for the REGS argument to re_match, etc., not by
253 setting no_sub, unless RE_NO_SUB is set. */
254 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
255
256 /* Match anchors at newline. */
257 bufp->newline_anchor = 1;
258
259 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
260
261 if (!ret)
262 return NULL;
263 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
264}
265#ifdef _LIBC
266weak_alias (__re_compile_pattern, re_compile_pattern)
267#endif
268
269/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
270 also be assigned to arbitrarily: each pattern buffer stores its own
271 syntax, so it can be changed between regex compilations. */
272/* This has no initializer because initialized variables in Emacs
273 become read-only after dumping. */
274reg_syntax_t re_syntax_options;
275
276
277/* Specify the precise syntax of regexps for compilation. This provides
278 for compatibility for various utilities which historically have
279 different, incompatible syntaxes.
280
281 The argument SYNTAX is a bit mask comprised of the various bits
282 defined in regex.h. We return the old syntax. */
283
284reg_syntax_t
285re_set_syntax (syntax)
286 reg_syntax_t syntax;
287{
288 reg_syntax_t ret = re_syntax_options;
289
290 re_syntax_options = syntax;
291 return ret;
292}
293#ifdef _LIBC
294weak_alias (__re_set_syntax, re_set_syntax)
295#endif
296
297int
298re_compile_fastmap (bufp)
299 struct re_pattern_buffer *bufp;
300{
301 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
302 char *fastmap = bufp->fastmap;
303
304 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
305 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
306 if (dfa->init_state != dfa->init_state_word)
307 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
308 if (dfa->init_state != dfa->init_state_nl)
309 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
310 if (dfa->init_state != dfa->init_state_begbuf)
311 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
312 bufp->fastmap_accurate = 1;
313 return 0;
314}
315#ifdef _LIBC
316weak_alias (__re_compile_fastmap, re_compile_fastmap)
317#endif
318
319static inline void
320__attribute ((always_inline))
321re_set_fastmap (char *fastmap, int icase, int ch)
322{
323 fastmap[ch] = 1;
324 if (icase)
325 fastmap[tolower (ch)] = 1;
326}
327
328/* Helper function for re_compile_fastmap.
329 Compile fastmap for the initial_state INIT_STATE. */
330
331static void
332re_compile_fastmap_iter (bufp, init_state, fastmap)
333 regex_t *bufp;
334 const re_dfastate_t *init_state;
335 char *fastmap;
336{
337 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
338 int node_cnt;
339 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
340 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
341 {
342 int node = init_state->nodes.elems[node_cnt];
343 re_token_type_t type = dfa->nodes[node].type;
344
345 if (type == CHARACTER)
346 {
347 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
348#ifdef RE_ENABLE_I18N
349 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
350 {
351 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
352 wchar_t wc;
353 mbstate_t state;
354
355 p = buf;
356 *p++ = dfa->nodes[node].opr.c;
357 while (++node < dfa->nodes_len
358 && dfa->nodes[node].type == CHARACTER
359 && dfa->nodes[node].mb_partial)
360 *p++ = dfa->nodes[node].opr.c;
361 memset (&state, 0, sizeof (state));
362 if (mbrtowc (&wc, (const char *) buf, p - buf,
363 &state) == p - buf
364 && (__wcrtomb ((char *) buf, towlower (wc), &state)
365 != (size_t) -1))
366 re_set_fastmap (fastmap, 0, buf[0]);
367 }
368#endif
369 }
370 else if (type == SIMPLE_BRACKET)
371 {
372 int i, j, ch;
373 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
374 for (j = 0; j < UINT_BITS; ++j, ++ch)
375 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
376 re_set_fastmap (fastmap, icase, ch);
377 }
378#ifdef RE_ENABLE_I18N
379 else if (type == COMPLEX_BRACKET)
380 {
381 int i;
382 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
383 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
384 || cset->nranges || cset->nchar_classes)
385 {
386# ifdef _LIBC
387 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
388 {
389 /* In this case we want to catch the bytes which are
390 the first byte of any collation elements.
391 e.g. In da_DK, we want to catch 'a' since "aa"
392 is a valid collation element, and don't catch
393 'b' since 'b' is the only collation element
394 which starts from 'b'. */
395 int j, ch;
396 const int32_t *table = (const int32_t *)
397 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
398 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
399 for (j = 0; j < UINT_BITS; ++j, ++ch)
400 if (table[ch] < 0)
401 re_set_fastmap (fastmap, icase, ch);
402 }
403# else
404 if (dfa->mb_cur_max > 1)
405 for (i = 0; i < SBC_MAX; ++i)
406 if (__btowc (i) == WEOF)
407 re_set_fastmap (fastmap, icase, i);
408# endif /* not _LIBC */
409 }
410 for (i = 0; i < cset->nmbchars; ++i)
411 {
412 char buf[256];
413 mbstate_t state;
414 memset (&state, '\0', sizeof (state));
415 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
416 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
417 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
418 {
419 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
420 != (size_t) -1)
421 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
422 }
423 }
424 }
425#endif /* RE_ENABLE_I18N */
426 else if (type == OP_PERIOD
427#ifdef RE_ENABLE_I18N
428 || type == OP_UTF8_PERIOD
429#endif /* RE_ENABLE_I18N */
430 || type == END_OF_RE)
431 {
432 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
433 if (type == END_OF_RE)
434 bufp->can_be_null = 1;
435 return;
436 }
437 }
438}
439
440
441/* Entry point for POSIX code. */
442/* regcomp takes a regular expression as a string and compiles it.
443
444 PREG is a regex_t *. We do not expect any fields to be initialized,
445 since POSIX says we shouldn't. Thus, we set
446
447 `buffer' to the compiled pattern;
448 `used' to the length of the compiled pattern;
449 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
450 REG_EXTENDED bit in CFLAGS is set; otherwise, to
451 RE_SYNTAX_POSIX_BASIC;
452 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
453 `fastmap' to an allocated space for the fastmap;
454 `fastmap_accurate' to zero;
455 `re_nsub' to the number of subexpressions in PATTERN.
456
457 PATTERN is the address of the pattern string.
458
459 CFLAGS is a series of bits which affect compilation.
460
461 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
462 use POSIX basic syntax.
463
464 If REG_NEWLINE is set, then . and [^...] don't match newline.
465 Also, regexec will try a match beginning after every newline.
466
467 If REG_ICASE is set, then we considers upper- and lowercase
468 versions of letters to be equivalent when matching.
469
470 If REG_NOSUB is set, then when PREG is passed to regexec, that
471 routine will report only success or failure, and nothing about the
472 registers.
473
474 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
475 the return codes and their meanings.) */
476
477int
478regcomp (preg, pattern, cflags)
479 regex_t *__restrict preg;
480 const char *__restrict pattern;
481 int cflags;
482{
483 reg_errcode_t ret;
484 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
485 : RE_SYNTAX_POSIX_BASIC);
486
487 preg->buffer = NULL;
488 preg->allocated = 0;
489 preg->used = 0;
490
491 /* Try to allocate space for the fastmap. */
492 preg->fastmap = re_malloc (char, SBC_MAX);
493 if (BE (preg->fastmap == NULL, 0))
494 return REG_ESPACE;
495
496 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
497
498 /* If REG_NEWLINE is set, newlines are treated differently. */
499 if (cflags & REG_NEWLINE)
500 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
501 syntax &= ~RE_DOT_NEWLINE;
502 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
503 /* It also changes the matching behavior. */
504 preg->newline_anchor = 1;
505 }
506 else
507 preg->newline_anchor = 0;
508 preg->no_sub = !!(cflags & REG_NOSUB);
509 preg->translate = NULL;
510
511 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
512
513 /* POSIX doesn't distinguish between an unmatched open-group and an
514 unmatched close-group: both are REG_EPAREN. */
515 if (ret == REG_ERPAREN)
516 ret = REG_EPAREN;
517
518 /* We have already checked preg->fastmap != NULL. */
519 if (BE (ret == REG_NOERROR, 1))
520 /* Compute the fastmap now, since regexec cannot modify the pattern
521 buffer. This function never fails in this implementation. */
522 (void) re_compile_fastmap (preg);
523 else
524 {
525 /* Some error occurred while compiling the expression. */
526 re_free (preg->fastmap);
527 preg->fastmap = NULL;
528 }
529
530 return (int) ret;
531}
532#ifdef _LIBC
533weak_alias (__regcomp, regcomp)
534#endif
535
536/* Returns a message corresponding to an error code, ERRCODE, returned
537 from either regcomp or regexec. We don't use PREG here. */
538
539size_t
540regerror (errcode, preg, errbuf, errbuf_size)
541 int errcode;
542 const regex_t *preg;
543 char *errbuf;
544 size_t errbuf_size;
545{
546 const char *msg;
547 size_t msg_size;
548
549 if (BE (errcode < 0
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
556 abort ();
557
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
559
560 msg_size = strlen (msg) + 1; /* Includes the null. */
561
562 if (BE (errbuf_size != 0, 1))
563 {
564 if (BE (msg_size > errbuf_size, 0))
565 {
566#if defined HAVE_MEMPCPY || defined _LIBC
567 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
568#else
569 memcpy (errbuf, msg, errbuf_size - 1);
570 errbuf[errbuf_size - 1] = 0;
571#endif
572 }
573 else
574 memcpy (errbuf, msg, msg_size);
575 }
576
577 return msg_size;
578}
579#ifdef _LIBC
580weak_alias (__regerror, regerror)
581#endif
582
583
584#ifdef RE_ENABLE_I18N
585/* This static array is used for the map to single-byte characters when
586 UTF-8 is used. Otherwise we would allocate memory just to initialize
587 it the same all the time. UTF-8 is the preferred encoding so this is
588 a worthwhile optimization. */
589static const bitset utf8_sb_map =
590{
591 /* Set the first 128 bits. */
592# if UINT_MAX == 0xffffffff
593 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
594# else
595# error "Add case for new unsigned int size"
596# endif
597};
598#endif
599
600
601static void
602free_dfa_content (re_dfa_t *dfa)
603{
604 int i, j;
605
606 if (dfa->nodes)
607 for (i = 0; i < dfa->nodes_len; ++i)
608 free_token (dfa->nodes + i);
609 re_free (dfa->nexts);
610 for (i = 0; i < dfa->nodes_len; ++i)
611 {
612 if (dfa->eclosures != NULL)
613 re_node_set_free (dfa->eclosures + i);
614 if (dfa->inveclosures != NULL)
615 re_node_set_free (dfa->inveclosures + i);
616 if (dfa->edests != NULL)
617 re_node_set_free (dfa->edests + i);
618 }
619 re_free (dfa->edests);
620 re_free (dfa->eclosures);
621 re_free (dfa->inveclosures);
622 re_free (dfa->nodes);
623
624 if (dfa->state_table)
625 for (i = 0; i <= dfa->state_hash_mask; ++i)
626 {
627 struct re_state_table_entry *entry = dfa->state_table + i;
628 for (j = 0; j < entry->num; ++j)
629 {
630 re_dfastate_t *state = entry->array[j];
631 free_state (state);
632 }
633 re_free (entry->array);
634 }
635 re_free (dfa->state_table);
636#ifdef RE_ENABLE_I18N
637 if (dfa->sb_char != utf8_sb_map)
638 re_free (dfa->sb_char);
639#endif
640 re_free (dfa->subexp_map);
641#ifdef DEBUG
642 re_free (dfa->re_str);
643#endif
644
645 re_free (dfa);
646}
647
648
649/* Free dynamically allocated space used by PREG. */
650
651void
652regfree (preg)
653 regex_t *preg;
654{
655 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
656 if (BE (dfa != NULL, 1))
657 free_dfa_content (dfa);
658 preg->buffer = NULL;
659 preg->allocated = 0;
660
661 re_free (preg->fastmap);
662 preg->fastmap = NULL;
663
664 re_free (preg->translate);
665 preg->translate = NULL;
666}
667#ifdef _LIBC
668weak_alias (__regfree, regfree)
669#endif
670
671
672/* Entry points compatible with 4.2 BSD regex library. We don't define
673 them unless specifically requested. */
674
675#if defined _REGEX_RE_COMP || defined _LIBC
676
677/* BSD has one and only one pattern buffer. */
678static struct re_pattern_buffer re_comp_buf;
679
680char *
681# ifdef _LIBC
682/* Make these definitions weak in libc, so POSIX programs can redefine
683 these names if they don't use our functions, and still use
684 regcomp/regexec above without link errors. */
685weak_function
686# endif
687re_comp (s)
688 const char *s;
689{
690 reg_errcode_t ret;
691 char *fastmap;
692
693 if (!s)
694 {
695 if (!re_comp_buf.buffer)
696 return gettext ("No previous regular expression");
697 return 0;
698 }
699
700 if (re_comp_buf.buffer)
701 {
702 fastmap = re_comp_buf.fastmap;
703 re_comp_buf.fastmap = NULL;
704 __regfree (&re_comp_buf);
705 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
706 re_comp_buf.fastmap = fastmap;
707 }
708
709 if (re_comp_buf.fastmap == NULL)
710 {
711 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
712 if (re_comp_buf.fastmap == NULL)
713 return (char *) gettext (__re_error_msgid
714 + __re_error_msgid_idx[(int) REG_ESPACE]);
715 }
716
717 /* Since `re_exec' always passes NULL for the `regs' argument, we
718 don't need to initialize the pattern buffer fields which affect it. */
719
720 /* Match anchors at newlines. */
721 re_comp_buf.newline_anchor = 1;
722
723 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
724
725 if (!ret)
726 return NULL;
727
728 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
729 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
730}
731
732#ifdef _LIBC
733libc_freeres_fn (free_mem)
734{
735 __regfree (&re_comp_buf);
736}
737#endif
738
739#endif /* _REGEX_RE_COMP */
740
741
742/* Internal entry point.
743 Compile the regular expression PATTERN, whose length is LENGTH.
744 SYNTAX indicate regular expression's syntax. */
745
746static reg_errcode_t
747re_compile_internal (preg, pattern, length, syntax)
748 regex_t *preg;
749 const char * pattern;
750 int length;
751 reg_syntax_t syntax;
752{
753 reg_errcode_t err = REG_NOERROR;
754 re_dfa_t *dfa;
755 re_string_t regexp;
756
757 /* Initialize the pattern buffer. */
758 preg->fastmap_accurate = 0;
759 preg->syntax = syntax;
760 preg->not_bol = preg->not_eol = 0;
761 preg->used = 0;
762 preg->re_nsub = 0;
763 preg->can_be_null = 0;
764 preg->regs_allocated = REGS_UNALLOCATED;
765
766 /* Initialize the dfa. */
767 dfa = (re_dfa_t *) preg->buffer;
768 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
769 {
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
775 if (dfa == NULL)
776 return REG_ESPACE;
777 preg->allocated = sizeof (re_dfa_t);
778 preg->buffer = (unsigned char *) dfa;
779 }
780 preg->used = sizeof (re_dfa_t);
781
782 __libc_lock_init (dfa->lock);
783
784 err = init_dfa (dfa, length);
785 if (BE (err != REG_NOERROR, 0))
786 {
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
790 return err;
791 }
792#ifdef DEBUG
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
795#endif
796
797 err = re_string_construct (&regexp, pattern, length, preg->translate,
798 syntax & RE_ICASE, dfa);
799 if (BE (err != REG_NOERROR, 0))
800 {
801 re_compile_internal_free_return:
802 free_workarea_compile (preg);
803 re_string_destruct (&regexp);
804 free_dfa_content (dfa);
805 preg->buffer = NULL;
806 preg->allocated = 0;
807 return err;
808 }
809
810 /* Parse the regular expression, and build a structure tree. */
811 preg->re_nsub = 0;
812 dfa->str_tree = parse (&regexp, preg, syntax, &err);
813 if (BE (dfa->str_tree == NULL, 0))
814 goto re_compile_internal_free_return;
815
816 /* Analyze the tree and create the nfa. */
817 err = analyze (preg);
818 if (BE (err != REG_NOERROR, 0))
819 goto re_compile_internal_free_return;
820
821#ifdef RE_ENABLE_I18N
822 /* If possible, do searching in single byte encoding to speed things up. */
823 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
824 optimize_utf8 (dfa);
825#endif
826
827 /* Then create the initial state of the dfa. */
828 err = create_initial_state (dfa);
829
830 /* Release work areas. */
831 free_workarea_compile (preg);
832 re_string_destruct (&regexp);
833
834 if (BE (err != REG_NOERROR, 0))
835 {
836 free_dfa_content (dfa);
837 preg->buffer = NULL;
838 preg->allocated = 0;
839 }
840
841 return err;
842}
843
844/* Initialize DFA. We use the length of the regular expression PAT_LEN
845 as the initial length of some arrays. */
846
847static reg_errcode_t
848init_dfa (dfa, pat_len)
849 re_dfa_t *dfa;
850 int pat_len;
851{
852 int table_size;
853#ifndef _LIBC
854 char *codeset_name;
855#endif
856
857 memset (dfa, '\0', sizeof (re_dfa_t));
858
859 /* Force allocation of str_tree_storage the first time. */
860 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
861
862 dfa->nodes_alloc = pat_len + 1;
863 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
864
865 dfa->states_alloc = pat_len + 1;
866
867 /* table_size = 2 ^ ceil(log pat_len) */
868 for (table_size = 1; table_size > 0; table_size <<= 1)
869 if (table_size > pat_len)
870 break;
871
872 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
873 dfa->state_hash_mask = table_size - 1;
874
875 dfa->mb_cur_max = MB_CUR_MAX;
876#ifdef _LIBC
877 if (dfa->mb_cur_max == 6
878 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
879 dfa->is_utf8 = 1;
880 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
881 != 0);
882#else
883# ifdef HAVE_LANGINFO_CODESET
884 codeset_name = nl_langinfo (CODESET);
885# else
886 codeset_name = getenv ("LC_ALL");
887 if (codeset_name == NULL || codeset_name[0] == '\0')
888 codeset_name = getenv ("LC_CTYPE");
889 if (codeset_name == NULL || codeset_name[0] == '\0')
890 codeset_name = getenv ("LANG");
891 if (codeset_name == NULL)
892 codeset_name = "";
893 else if (strchr (codeset_name, '.') != NULL)
894 codeset_name = strchr (codeset_name, '.') + 1;
895# endif
896
897 if (strcasecmp (codeset_name, "UTF-8") == 0
898 || strcasecmp (codeset_name, "UTF8") == 0)
899 dfa->is_utf8 = 1;
900
901 /* We check exhaustively in the loop below if this charset is a
902 superset of ASCII. */
903 dfa->map_notascii = 0;
904#endif
905
906#ifdef RE_ENABLE_I18N
907 if (dfa->mb_cur_max > 1)
908 {
909 if (dfa->is_utf8)
910 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
911 else
912 {
913 int i, j, ch;
914
915 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
916 if (BE (dfa->sb_char == NULL, 0))
917 return REG_ESPACE;
918
919 /* Clear all bits by, then set those corresponding to single
920 byte chars. */
921 bitset_empty (dfa->sb_char);
922
923 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
924 for (j = 0; j < UINT_BITS; ++j, ++ch)
925 {
926 wchar_t wch = __btowc (ch);
927 if (wch != WEOF)
928 dfa->sb_char[i] |= 1 << j;
929# ifndef _LIBC
930 if (isascii (ch) && wch != (wchar_t) ch)
931 dfa->map_notascii = 1;
932# endif
933 }
934 }
935 }
936#endif
937
938 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
939 return REG_ESPACE;
940 return REG_NOERROR;
941}
942
943/* Initialize WORD_CHAR table, which indicate which character is
944 "word". In this case "word" means that it is the word construction
945 character used by some operators like "\<", "\>", etc. */
946
947static void
948init_word_char (dfa)
949 re_dfa_t *dfa;
950{
951 int i, j, ch;
952 dfa->word_ops_used = 1;
953 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
954 for (j = 0; j < UINT_BITS; ++j, ++ch)
955 if (isalnum (ch) || ch == '_')
956 dfa->word_char[i] |= 1 << j;
957}
958
959/* Free the work area which are only used while compiling. */
960
961static void
962free_workarea_compile (preg)
963 regex_t *preg;
964{
965 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
966 bin_tree_storage_t *storage, *next;
967 for (storage = dfa->str_tree_storage; storage; storage = next)
968 {
969 next = storage->next;
970 re_free (storage);
971 }
972 dfa->str_tree_storage = NULL;
973 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
974 dfa->str_tree = NULL;
975 re_free (dfa->org_indices);
976 dfa->org_indices = NULL;
977}
978
979/* Create initial states for all contexts. */
980
981static reg_errcode_t
982create_initial_state (dfa)
983 re_dfa_t *dfa;
984{
985 int first, i;
986 reg_errcode_t err;
987 re_node_set init_nodes;
988
989 /* Initial states have the epsilon closure of the node which is
990 the first node of the regular expression. */
991 first = dfa->str_tree->first->node_idx;
992 dfa->init_node = first;
993 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
994 if (BE (err != REG_NOERROR, 0))
995 return err;
996
997 /* The back-references which are in initial states can epsilon transit,
998 since in this case all of the subexpressions can be null.
999 Then we add epsilon closures of the nodes which are the next nodes of
1000 the back-references. */
1001 if (dfa->nbackref > 0)
1002 for (i = 0; i < init_nodes.nelem; ++i)
1003 {
1004 int node_idx = init_nodes.elems[i];
1005 re_token_type_t type = dfa->nodes[node_idx].type;
1006
1007 int clexp_idx;
1008 if (type != OP_BACK_REF)
1009 continue;
1010 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1011 {
1012 re_token_t *clexp_node;
1013 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014 if (clexp_node->type == OP_CLOSE_SUBEXP
1015 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016 break;
1017 }
1018 if (clexp_idx == init_nodes.nelem)
1019 continue;
1020
1021 if (type == OP_BACK_REF)
1022 {
1023 int dest_idx = dfa->edests[node_idx].elems[0];
1024 if (!re_node_set_contains (&init_nodes, dest_idx))
1025 {
1026 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1027 i = 0;
1028 }
1029 }
1030 }
1031
1032 /* It must be the first time to invoke acquire_state. */
1033 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1034 /* We don't check ERR here, since the initial state must not be NULL. */
1035 if (BE (dfa->init_state == NULL, 0))
1036 return err;
1037 if (dfa->init_state->has_constraint)
1038 {
1039 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1040 CONTEXT_WORD);
1041 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1042 CONTEXT_NEWLINE);
1043 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1044 &init_nodes,
1045 CONTEXT_NEWLINE
1046 | CONTEXT_BEGBUF);
1047 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1048 || dfa->init_state_begbuf == NULL, 0))
1049 return err;
1050 }
1051 else
1052 dfa->init_state_word = dfa->init_state_nl
1053 = dfa->init_state_begbuf = dfa->init_state;
1054
1055 re_node_set_free (&init_nodes);
1056 return REG_NOERROR;
1057}
1058
1059
1060#ifdef RE_ENABLE_I18N
1061/* If it is possible to do searching in single byte encoding instead of UTF-8
1062 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1063 DFA nodes where needed. */
1064
1065static void
1066optimize_utf8 (dfa)
1067 re_dfa_t *dfa;
1068{
1069 int node, i, mb_chars = 0, has_period = 0;
1070
1071 for (node = 0; node < dfa->nodes_len; ++node)
1072 switch (dfa->nodes[node].type)
1073 {
1074 case CHARACTER:
1075 if (dfa->nodes[node].opr.c >= 0x80)
1076 mb_chars = 1;
1077 break;
1078 case ANCHOR:
1079 switch (dfa->nodes[node].opr.idx)
1080 {
1081 case LINE_FIRST:
1082 case LINE_LAST:
1083 case BUF_FIRST:
1084 case BUF_LAST:
1085 break;
1086 default:
1087 /* Word anchors etc. cannot be handled. */
1088 return;
1089 }
1090 break;
1091 case OP_PERIOD:
1092 has_period = 1;
1093 break;
1094 case OP_BACK_REF:
1095 case OP_ALT:
1096 case END_OF_RE:
1097 case OP_DUP_ASTERISK:
1098 case OP_OPEN_SUBEXP:
1099 case OP_CLOSE_SUBEXP:
1100 break;
1101 case COMPLEX_BRACKET:
1102 return;
1103 case SIMPLE_BRACKET:
1104 /* Just double check. */
1105 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1106 if (dfa->nodes[node].opr.sbcset[i])
1107 return;
1108 break;
1109 default:
1110 abort ();
1111 }
1112
1113 if (mb_chars || has_period)
1114 for (node = 0; node < dfa->nodes_len; ++node)
1115 {
1116 if (dfa->nodes[node].type == CHARACTER
1117 && dfa->nodes[node].opr.c >= 0x80)
1118 dfa->nodes[node].mb_partial = 0;
1119 else if (dfa->nodes[node].type == OP_PERIOD)
1120 dfa->nodes[node].type = OP_UTF8_PERIOD;
1121 }
1122
1123 /* The search can be in single byte locale. */
1124 dfa->mb_cur_max = 1;
1125 dfa->is_utf8 = 0;
1126 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1127}
1128#endif
1129
1130
1131/* Analyze the structure tree, and calculate "first", "next", "edest",
1132 "eclosure", and "inveclosure". */
1133
1134static reg_errcode_t
1135analyze (preg)
1136 regex_t *preg;
1137{
1138 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1139 reg_errcode_t ret;
1140
1141 /* Allocate arrays. */
1142 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1143 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1144 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1145 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1146 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1147 || dfa->eclosures == NULL, 0))
1148 return REG_ESPACE;
1149
1150 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1151 if (dfa->subexp_map != NULL)
1152 {
1153 int i;
1154 for (i = 0; i < preg->re_nsub; i++)
1155 dfa->subexp_map[i] = i;
1156 preorder (dfa->str_tree, optimize_subexps, dfa);
1157 for (i = 0; i < preg->re_nsub; i++)
1158 if (dfa->subexp_map[i] != i)
1159 break;
1160 if (i == preg->re_nsub)
1161 {
1162 free (dfa->subexp_map);
1163 dfa->subexp_map = NULL;
1164 }
1165 }
1166
1167 ret = postorder (dfa->str_tree, lower_subexps, preg);
1168 if (BE (ret != REG_NOERROR, 0))
1169 return ret;
1170 ret = postorder (dfa->str_tree, calc_first, dfa);
1171 if (BE (ret != REG_NOERROR, 0))
1172 return ret;
1173 preorder (dfa->str_tree, calc_next, dfa);
1174 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1175 if (BE (ret != REG_NOERROR, 0))
1176 return ret;
1177 ret = calc_eclosure (dfa);
1178 if (BE (ret != REG_NOERROR, 0))
1179 return ret;
1180
1181 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1182 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1183 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1184 || dfa->nbackref)
1185 {
1186 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1187 if (BE (dfa->inveclosures == NULL, 0))
1188 return REG_ESPACE;
1189 ret = calc_inveclosure (dfa);
1190 }
1191
1192 return ret;
1193}
1194
1195/* Our parse trees are very unbalanced, so we cannot use a stack to
1196 implement parse tree visits. Instead, we use parent pointers and
1197 some hairy code in these two functions. */
1198static reg_errcode_t
1199postorder (root, fn, extra)
1200 bin_tree_t *root;
1201 reg_errcode_t (fn (void *, bin_tree_t *));
1202 void *extra;
1203{
1204 bin_tree_t *node, *prev;
1205
1206 for (node = root; ; )
1207 {
1208 /* Descend down the tree, preferably to the left (or to the right
1209 if that's the only child). */
1210 while (node->left || node->right)
1211 if (node->left)
1212 node = node->left;
1213 else
1214 node = node->right;
1215
1216 do
1217 {
1218 reg_errcode_t err = fn (extra, node);
1219 if (BE (err != REG_NOERROR, 0))
1220 return err;
1221 if (node->parent == NULL)
1222 return REG_NOERROR;
1223 prev = node;
1224 node = node->parent;
1225 }
1226 /* Go up while we have a node that is reached from the right. */
1227 while (node->right == prev || node->right == NULL);
1228 node = node->right;
1229 }
1230}
1231
1232static reg_errcode_t
1233preorder (root, fn, extra)
1234 bin_tree_t *root;
1235 reg_errcode_t (fn (void *, bin_tree_t *));
1236 void *extra;
1237{
1238 bin_tree_t *node;
1239
1240 for (node = root; ; )
1241 {
1242 reg_errcode_t err = fn (extra, node);
1243 if (BE (err != REG_NOERROR, 0))
1244 return err;
1245
1246 /* Go to the left node, or up and to the right. */
1247 if (node->left)
1248 node = node->left;
1249 else
1250 {
1251 bin_tree_t *prev = NULL;
1252 while (node->right == prev || node->right == NULL)
1253 {
1254 prev = node;
1255 node = node->parent;
1256 if (!node)
1257 return REG_NOERROR;
1258 }
1259 node = node->right;
1260 }
1261 }
1262}
1263
1264/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1266 backreferences as well. Requires a preorder visit. */
1267static reg_errcode_t
1268optimize_subexps (extra, node)
1269 void *extra;
1270 bin_tree_t *node;
1271{
1272 re_dfa_t *dfa = (re_dfa_t *) extra;
1273
1274 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1275 {
1276 int idx = node->token.opr.idx;
1277 node->token.opr.idx = dfa->subexp_map[idx];
1278 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1279 }
1280
1281 else if (node->token.type == SUBEXP
1282 && node->left && node->left->token.type == SUBEXP)
1283 {
1284 int other_idx = node->left->token.opr.idx;
1285
1286 node->left = node->left->left;
1287 if (node->left)
1288 node->left->parent = node;
1289
1290 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1291 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1292 dfa->used_bkref_map &= ~(1 << other_idx);
1293 }
1294
1295 return REG_NOERROR;
1296}
1297
1298/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1299 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1300static reg_errcode_t
1301lower_subexps (extra, node)
1302 void *extra;
1303 bin_tree_t *node;
1304{
1305 regex_t *preg = (regex_t *) extra;
1306 reg_errcode_t err = REG_NOERROR;
1307
1308 if (node->left && node->left->token.type == SUBEXP)
1309 {
1310 node->left = lower_subexp (&err, preg, node->left);
1311 if (node->left)
1312 node->left->parent = node;
1313 }
1314 if (node->right && node->right->token.type == SUBEXP)
1315 {
1316 node->right = lower_subexp (&err, preg, node->right);
1317 if (node->right)
1318 node->right->parent = node;
1319 }
1320
1321 return err;
1322}
1323
1324static bin_tree_t *
1325lower_subexp (err, preg, node)
1326 reg_errcode_t *err;
1327 regex_t *preg;
1328 bin_tree_t *node;
1329{
1330 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1331 bin_tree_t *body = node->left;
1332 bin_tree_t *op, *cls, *tree1, *tree;
1333
1334 if (preg->no_sub
1335 /* We do not optimize empty subexpressions, because otherwise we may
1336 have bad CONCAT nodes with NULL children. This is obviously not
1337 very common, so we do not lose much. An example that triggers
1338 this case is the sed "script" /\(\)/x. */
1339 && node->left != NULL
1340 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1341 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1342 return node->left;
1343
1344 /* Convert the SUBEXP node to the concatenation of an
1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349 tree = create_tree (dfa, op, tree1, CONCAT);
1350 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351 {
1352 *err = REG_ESPACE;
1353 return NULL;
1354 }
1355
1356 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358 return tree;
1359}
1360
1361/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362 nodes. Requires a postorder visit. */
1363static reg_errcode_t
1364calc_first (extra, node)
1365 void *extra;
1366 bin_tree_t *node;
1367{
1368 re_dfa_t *dfa = (re_dfa_t *) extra;
1369 if (node->token.type == CONCAT)
1370 {
1371 node->first = node->left->first;
1372 node->node_idx = node->left->node_idx;
1373 }
1374 else
1375 {
1376 node->first = node;
1377 node->node_idx = re_dfa_add_node (dfa, node->token);
1378 if (BE (node->node_idx == -1, 0))
1379 return REG_ESPACE;
1380 }
1381 return REG_NOERROR;
1382}
1383
1384/* Pass 2: compute NEXT on the tree. Preorder visit. */
1385static reg_errcode_t
1386calc_next (extra, node)
1387 void *extra;
1388 bin_tree_t *node;
1389{
1390 switch (node->token.type)
1391 {
1392 case OP_DUP_ASTERISK:
1393 node->left->next = node;
1394 break;
1395 case CONCAT:
1396 node->left->next = node->right->first;
1397 node->right->next = node->next;
1398 break;
1399 default:
1400 if (node->left)
1401 node->left->next = node->next;
1402 if (node->right)
1403 node->right->next = node->next;
1404 break;
1405 }
1406 return REG_NOERROR;
1407}
1408
1409/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1410static reg_errcode_t
1411link_nfa_nodes (extra, node)
1412 void *extra;
1413 bin_tree_t *node;
1414{
1415 re_dfa_t *dfa = (re_dfa_t *) extra;
1416 int idx = node->node_idx;
1417 reg_errcode_t err = REG_NOERROR;
1418
1419 switch (node->token.type)
1420 {
1421 case CONCAT:
1422 break;
1423
1424 case END_OF_RE:
1425 assert (node->next == NULL);
1426 break;
1427
1428 case OP_DUP_ASTERISK:
1429 case OP_ALT:
1430 {
1431 int left, right;
1432 dfa->has_plural_match = 1;
1433 if (node->left != NULL)
1434 left = node->left->first->node_idx;
1435 else
1436 left = node->next->node_idx;
1437 if (node->right != NULL)
1438 right = node->right->first->node_idx;
1439 else
1440 right = node->next->node_idx;
1441 assert (left > -1);
1442 assert (right > -1);
1443 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1444 }
1445 break;
1446
1447 case ANCHOR:
1448 case OP_OPEN_SUBEXP:
1449 case OP_CLOSE_SUBEXP:
1450 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1451 break;
1452
1453 case OP_BACK_REF:
1454 dfa->nexts[idx] = node->next->node_idx;
1455 if (node->token.type == OP_BACK_REF)
1456 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1457 break;
1458
1459 default:
1460 assert (!IS_EPSILON_NODE (node->token.type));
1461 dfa->nexts[idx] = node->next->node_idx;
1462 break;
1463 }
1464
1465 return err;
1466}
1467
1468/* Duplicate the epsilon closure of the node ROOT_NODE.
1469 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1470 to their own constraint. */
1471
1472static reg_errcode_t
1473duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1474 init_constraint)
1475 re_dfa_t *dfa;
1476 int top_org_node, top_clone_node, root_node;
1477 unsigned int init_constraint;
1478{
1479 reg_errcode_t err;
1480 int org_node, clone_node, ret;
1481 unsigned int constraint = init_constraint;
1482 for (org_node = top_org_node, clone_node = top_clone_node;;)
1483 {
1484 int org_dest, clone_dest;
1485 if (dfa->nodes[org_node].type == OP_BACK_REF)
1486 {
1487 /* If the back reference epsilon-transit, its destination must
1488 also have the constraint. Then duplicate the epsilon closure
1489 of the destination of the back reference, and store it in
1490 edests of the back reference. */
1491 org_dest = dfa->nexts[org_node];
1492 re_node_set_empty (dfa->edests + clone_node);
1493 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1494 if (BE (err != REG_NOERROR, 0))
1495 return err;
1496 dfa->nexts[clone_node] = dfa->nexts[org_node];
1497 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1498 if (BE (ret < 0, 0))
1499 return REG_ESPACE;
1500 }
1501 else if (dfa->edests[org_node].nelem == 0)
1502 {
1503 /* In case of the node can't epsilon-transit, don't duplicate the
1504 destination and store the original destination as the
1505 destination of the node. */
1506 dfa->nexts[clone_node] = dfa->nexts[org_node];
1507 break;
1508 }
1509 else if (dfa->edests[org_node].nelem == 1)
1510 {
1511 /* In case of the node can epsilon-transit, and it has only one
1512 destination. */
1513 org_dest = dfa->edests[org_node].elems[0];
1514 re_node_set_empty (dfa->edests + clone_node);
1515 if (dfa->nodes[org_node].type == ANCHOR)
1516 {
1517 /* In case of the node has another constraint, append it. */
1518 if (org_node == root_node && clone_node != org_node)
1519 {
1520 /* ...but if the node is root_node itself, it means the
1521 epsilon closure have a loop, then tie it to the
1522 destination of the root_node. */
1523 ret = re_node_set_insert (dfa->edests + clone_node,
1524 org_dest);
1525 if (BE (ret < 0, 0))
1526 return REG_ESPACE;
1527 break;
1528 }
1529 constraint |= dfa->nodes[org_node].opr.ctx_type;
1530 }
1531 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1532 if (BE (err != REG_NOERROR, 0))
1533 return err;
1534 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 if (BE (ret < 0, 0))
1536 return REG_ESPACE;
1537 }
1538 else /* dfa->edests[org_node].nelem == 2 */
1539 {
1540 /* In case of the node can epsilon-transit, and it has two
1541 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1542 org_dest = dfa->edests[org_node].elems[0];
1543 re_node_set_empty (dfa->edests + clone_node);
1544 /* Search for a duplicated node which satisfies the constraint. */
1545 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546 if (clone_dest == -1)
1547 {
1548 /* There are no such a duplicated node, create a new one. */
1549 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1550 if (BE (err != REG_NOERROR, 0))
1551 return err;
1552 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553 if (BE (ret < 0, 0))
1554 return REG_ESPACE;
1555 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1556 root_node, constraint);
1557 if (BE (err != REG_NOERROR, 0))
1558 return err;
1559 }
1560 else
1561 {
1562 /* There are a duplicated node which satisfy the constraint,
1563 use it to avoid infinite loop. */
1564 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565 if (BE (ret < 0, 0))
1566 return REG_ESPACE;
1567 }
1568
1569 org_dest = dfa->edests[org_node].elems[1];
1570 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1571 if (BE (err != REG_NOERROR, 0))
1572 return err;
1573 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1574 if (BE (ret < 0, 0))
1575 return REG_ESPACE;
1576 }
1577 org_node = org_dest;
1578 clone_node = clone_dest;
1579 }
1580 return REG_NOERROR;
1581}
1582
1583/* Search for a node which is duplicated from the node ORG_NODE, and
1584 satisfies the constraint CONSTRAINT. */
1585
1586static int
1587search_duplicated_node (dfa, org_node, constraint)
1588 re_dfa_t *dfa;
1589 int org_node;
1590 unsigned int constraint;
1591{
1592 int idx;
1593 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594 {
1595 if (org_node == dfa->org_indices[idx]
1596 && constraint == dfa->nodes[idx].constraint)
1597 return idx; /* Found. */
1598 }
1599 return -1; /* Not found. */
1600}
1601
1602/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1604 otherwise return the error code. */
1605
1606static reg_errcode_t
1607duplicate_node (new_idx, dfa, org_idx, constraint)
1608 re_dfa_t *dfa;
1609 int *new_idx, org_idx;
1610 unsigned int constraint;
1611{
1612 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1613 if (BE (dup_idx == -1, 0))
1614 return REG_ESPACE;
1615 dfa->nodes[dup_idx].constraint = constraint;
1616 if (dfa->nodes[org_idx].type == ANCHOR)
1617 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1618 dfa->nodes[dup_idx].duplicated = 1;
1619
1620 /* Store the index of the original node. */
1621 dfa->org_indices[dup_idx] = org_idx;
1622 *new_idx = dup_idx;
1623 return REG_NOERROR;
1624}
1625
1626static reg_errcode_t
1627calc_inveclosure (dfa)
1628 re_dfa_t *dfa;
1629{
1630 int src, idx, ret;
1631 for (idx = 0; idx < dfa->nodes_len; ++idx)
1632 re_node_set_init_empty (dfa->inveclosures + idx);
1633
1634 for (src = 0; src < dfa->nodes_len; ++src)
1635 {
1636 int *elems = dfa->eclosures[src].elems;
1637 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1638 {
1639 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1640 if (BE (ret == -1, 0))
1641 return REG_ESPACE;
1642 }
1643 }
1644
1645 return REG_NOERROR;
1646}
1647
1648/* Calculate "eclosure" for all the node in DFA. */
1649
1650static reg_errcode_t
1651calc_eclosure (dfa)
1652 re_dfa_t *dfa;
1653{
1654 int node_idx, incomplete;
1655#ifdef DEBUG
1656 assert (dfa->nodes_len > 0);
1657#endif
1658 incomplete = 0;
1659 /* For each nodes, calculate epsilon closure. */
1660 for (node_idx = 0; ; ++node_idx)
1661 {
1662 reg_errcode_t err;
1663 re_node_set eclosure_elem;
1664 if (node_idx == dfa->nodes_len)
1665 {
1666 if (!incomplete)
1667 break;
1668 incomplete = 0;
1669 node_idx = 0;
1670 }
1671
1672#ifdef DEBUG
1673 assert (dfa->eclosures[node_idx].nelem != -1);
1674#endif
1675
1676 /* If we have already calculated, skip it. */
1677 if (dfa->eclosures[node_idx].nelem != 0)
1678 continue;
1679 /* Calculate epsilon closure of `node_idx'. */
1680 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1681 if (BE (err != REG_NOERROR, 0))
1682 return err;
1683
1684 if (dfa->eclosures[node_idx].nelem == 0)
1685 {
1686 incomplete = 1;
1687 re_node_set_free (&eclosure_elem);
1688 }
1689 }
1690 return REG_NOERROR;
1691}
1692
1693/* Calculate epsilon closure of NODE. */
1694
1695static reg_errcode_t
1696calc_eclosure_iter (new_set, dfa, node, root)
1697 re_node_set *new_set;
1698 re_dfa_t *dfa;
1699 int node, root;
1700{
1701 reg_errcode_t err;
1702 unsigned int constraint;
1703 int i, incomplete;
1704 re_node_set eclosure;
1705 incomplete = 0;
1706 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1707 if (BE (err != REG_NOERROR, 0))
1708 return err;
1709
1710 /* This indicates that we are calculating this node now.
1711 We reference this value to avoid infinite loop. */
1712 dfa->eclosures[node].nelem = -1;
1713
1714 constraint = ((dfa->nodes[node].type == ANCHOR)
1715 ? dfa->nodes[node].opr.ctx_type : 0);
1716 /* If the current node has constraints, duplicate all nodes.
1717 Since they must inherit the constraints. */
1718 if (constraint
1719 && dfa->edests[node].nelem
1720 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1721 {
1722 int org_node, cur_node;
1723 org_node = cur_node = node;
1724 err = duplicate_node_closure (dfa, node, node, node, constraint);
1725 if (BE (err != REG_NOERROR, 0))
1726 return err;
1727 }
1728
1729 /* Expand each epsilon destination nodes. */
1730 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1731 for (i = 0; i < dfa->edests[node].nelem; ++i)
1732 {
1733 re_node_set eclosure_elem;
1734 int edest = dfa->edests[node].elems[i];
1735 /* If calculating the epsilon closure of `edest' is in progress,
1736 return intermediate result. */
1737 if (dfa->eclosures[edest].nelem == -1)
1738 {
1739 incomplete = 1;
1740 continue;
1741 }
1742 /* If we haven't calculated the epsilon closure of `edest' yet,
1743 calculate now. Otherwise use calculated epsilon closure. */
1744 if (dfa->eclosures[edest].nelem == 0)
1745 {
1746 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1747 if (BE (err != REG_NOERROR, 0))
1748 return err;
1749 }
1750 else
1751 eclosure_elem = dfa->eclosures[edest];
1752 /* Merge the epsilon closure of `edest'. */
1753 re_node_set_merge (&eclosure, &eclosure_elem);
1754 /* If the epsilon closure of `edest' is incomplete,
1755 the epsilon closure of this node is also incomplete. */
1756 if (dfa->eclosures[edest].nelem == 0)
1757 {
1758 incomplete = 1;
1759 re_node_set_free (&eclosure_elem);
1760 }
1761 }
1762
1763 /* Epsilon closures include itself. */
1764 re_node_set_insert (&eclosure, node);
1765 if (incomplete && !root)
1766 dfa->eclosures[node].nelem = 0;
1767 else
1768 dfa->eclosures[node] = eclosure;
1769 *new_set = eclosure;
1770 return REG_NOERROR;
1771}
1772
1773
1774/* Functions for token which are used in the parser. */
1775
1776/* Fetch a token from INPUT.
1777 We must not use this function inside bracket expressions. */
1778
1779static void
1780fetch_token (result, input, syntax)
1781 re_token_t *result;
1782 re_string_t *input;
1783 reg_syntax_t syntax;
1784{
1785 re_string_skip_bytes (input, peek_token (result, input, syntax));
1786}
1787
1788/* Peek a token from INPUT, and return the length of the token.
1789 We must not use this function inside bracket expressions. */
1790
1791static int
1792peek_token (token, input, syntax)
1793 re_token_t *token;
1794 re_string_t *input;
1795 reg_syntax_t syntax;
1796{
1797 unsigned char c;
1798
1799 if (re_string_eoi (input))
1800 {
1801 token->type = END_OF_RE;
1802 return 0;
1803 }
1804
1805 c = re_string_peek_byte (input, 0);
1806 token->opr.c = c;
1807
1808 token->word_char = 0;
1809#ifdef RE_ENABLE_I18N
1810 token->mb_partial = 0;
1811 if (input->mb_cur_max > 1 &&
1812 !re_string_first_byte (input, re_string_cur_idx (input)))
1813 {
1814 token->type = CHARACTER;
1815 token->mb_partial = 1;
1816 return 1;
1817 }
1818#endif
1819 if (c == '\\')
1820 {
1821 unsigned char c2;
1822 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1823 {
1824 token->type = BACK_SLASH;
1825 return 1;
1826 }
1827
1828 c2 = re_string_peek_byte_case (input, 1);
1829 token->opr.c = c2;
1830 token->type = CHARACTER;
1831#ifdef RE_ENABLE_I18N
1832 if (input->mb_cur_max > 1)
1833 {
1834 wint_t wc = re_string_wchar_at (input,
1835 re_string_cur_idx (input) + 1);
1836 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1837 }
1838 else
1839#endif
1840 token->word_char = IS_WORD_CHAR (c2) != 0;
1841
1842 switch (c2)
1843 {
1844 case '|':
1845 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1846 token->type = OP_ALT;
1847 break;
1848 case '1': case '2': case '3': case '4': case '5':
1849 case '6': case '7': case '8': case '9':
1850 if (!(syntax & RE_NO_BK_REFS))
1851 {
1852 token->type = OP_BACK_REF;
1853 token->opr.idx = c2 - '1';
1854 }
1855 break;
1856 case '<':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 {
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = WORD_FIRST;
1861 }
1862 break;
1863 case '>':
1864 if (!(syntax & RE_NO_GNU_OPS))
1865 {
1866 token->type = ANCHOR;
1867 token->opr.ctx_type = WORD_LAST;
1868 }
1869 break;
1870 case 'b':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 {
1873 token->type = ANCHOR;
1874 token->opr.ctx_type = WORD_DELIM;
1875 }
1876 break;
1877 case 'B':
1878 if (!(syntax & RE_NO_GNU_OPS))
1879 {
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = NOT_WORD_DELIM;
1882 }
1883 break;
1884 case 'w':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = OP_WORD;
1887 break;
1888 case 'W':
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = OP_NOTWORD;
1891 break;
1892 case 's':
1893 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = OP_SPACE;
1895 break;
1896 case 'S':
1897 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = OP_NOTSPACE;
1899 break;
1900 case '`':
1901 if (!(syntax & RE_NO_GNU_OPS))
1902 {
1903 token->type = ANCHOR;
1904 token->opr.ctx_type = BUF_FIRST;
1905 }
1906 break;
1907 case '\'':
1908 if (!(syntax & RE_NO_GNU_OPS))
1909 {
1910 token->type = ANCHOR;
1911 token->opr.ctx_type = BUF_LAST;
1912 }
1913 break;
1914 case '(':
1915 if (!(syntax & RE_NO_BK_PARENS))
1916 token->type = OP_OPEN_SUBEXP;
1917 break;
1918 case ')':
1919 if (!(syntax & RE_NO_BK_PARENS))
1920 token->type = OP_CLOSE_SUBEXP;
1921 break;
1922 case '+':
1923 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1924 token->type = OP_DUP_PLUS;
1925 break;
1926 case '?':
1927 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1928 token->type = OP_DUP_QUESTION;
1929 break;
1930 case '{':
1931 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1932 token->type = OP_OPEN_DUP_NUM;
1933 break;
1934 case '}':
1935 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1936 token->type = OP_CLOSE_DUP_NUM;
1937 break;
1938 default:
1939 break;
1940 }
1941 return 2;
1942 }
1943
1944 token->type = CHARACTER;
1945#ifdef RE_ENABLE_I18N
1946 if (input->mb_cur_max > 1)
1947 {
1948 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1949 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1950 }
1951 else
1952#endif
1953 token->word_char = IS_WORD_CHAR (token->opr.c);
1954
1955 switch (c)
1956 {
1957 case '\n':
1958 if (syntax & RE_NEWLINE_ALT)
1959 token->type = OP_ALT;
1960 break;
1961 case '|':
1962 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1963 token->type = OP_ALT;
1964 break;
1965 case '*':
1966 token->type = OP_DUP_ASTERISK;
1967 break;
1968 case '+':
1969 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1970 token->type = OP_DUP_PLUS;
1971 break;
1972 case '?':
1973 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1974 token->type = OP_DUP_QUESTION;
1975 break;
1976 case '{':
1977 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1978 token->type = OP_OPEN_DUP_NUM;
1979 break;
1980 case '}':
1981 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1982 token->type = OP_CLOSE_DUP_NUM;
1983 break;
1984 case '(':
1985 if (syntax & RE_NO_BK_PARENS)
1986 token->type = OP_OPEN_SUBEXP;
1987 break;
1988 case ')':
1989 if (syntax & RE_NO_BK_PARENS)
1990 token->type = OP_CLOSE_SUBEXP;
1991 break;
1992 case '[':
1993 token->type = OP_OPEN_BRACKET;
1994 break;
1995 case '.':
1996 token->type = OP_PERIOD;
1997 break;
1998 case '^':
1999 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2000 re_string_cur_idx (input) != 0)
2001 {
2002 char prev = re_string_peek_byte (input, -1);
2003 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2004 break;
2005 }
2006 token->type = ANCHOR;
2007 token->opr.ctx_type = LINE_FIRST;
2008 break;
2009 case '$':
2010 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2011 re_string_cur_idx (input) + 1 != re_string_length (input))
2012 {
2013 re_token_t next;
2014 re_string_skip_bytes (input, 1);
2015 peek_token (&next, input, syntax);
2016 re_string_skip_bytes (input, -1);
2017 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2018 break;
2019 }
2020 token->type = ANCHOR;
2021 token->opr.ctx_type = LINE_LAST;
2022 break;
2023 default:
2024 break;
2025 }
2026 return 1;
2027}
2028
2029/* Peek a token from INPUT, and return the length of the token.
2030 We must not use this function out of bracket expressions. */
2031
2032static int
2033peek_token_bracket (token, input, syntax)
2034 re_token_t *token;
2035 re_string_t *input;
2036 reg_syntax_t syntax;
2037{
2038 unsigned char c;
2039 if (re_string_eoi (input))
2040 {
2041 token->type = END_OF_RE;
2042 return 0;
2043 }
2044 c = re_string_peek_byte (input, 0);
2045 token->opr.c = c;
2046
2047#ifdef RE_ENABLE_I18N
2048 if (input->mb_cur_max > 1 &&
2049 !re_string_first_byte (input, re_string_cur_idx (input)))
2050 {
2051 token->type = CHARACTER;
2052 return 1;
2053 }
2054#endif /* RE_ENABLE_I18N */
2055
2056 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2057 && re_string_cur_idx (input) + 1 < re_string_length (input))
2058 {
2059 /* In this case, '\' escape a character. */
2060 unsigned char c2;
2061 re_string_skip_bytes (input, 1);
2062 c2 = re_string_peek_byte (input, 0);
2063 token->opr.c = c2;
2064 token->type = CHARACTER;
2065 return 1;
2066 }
2067 if (c == '[') /* '[' is a special char in a bracket exps. */
2068 {
2069 unsigned char c2;
2070 int token_len;
2071 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2072 c2 = re_string_peek_byte (input, 1);
2073 else
2074 c2 = 0;
2075 token->opr.c = c2;
2076 token_len = 2;
2077 switch (c2)
2078 {
2079 case '.':
2080 token->type = OP_OPEN_COLL_ELEM;
2081 break;
2082 case '=':
2083 token->type = OP_OPEN_EQUIV_CLASS;
2084 break;
2085 case ':':
2086 if (syntax & RE_CHAR_CLASSES)
2087 {
2088 token->type = OP_OPEN_CHAR_CLASS;
2089 break;
2090 }
2091 /* else fall through. */
2092 default:
2093 token->type = CHARACTER;
2094 token->opr.c = c;
2095 token_len = 1;
2096 break;
2097 }
2098 return token_len;
2099 }
2100 switch (c)
2101 {
2102 case '-':
2103 token->type = OP_CHARSET_RANGE;
2104 break;
2105 case ']':
2106 token->type = OP_CLOSE_BRACKET;
2107 break;
2108 case '^':
2109 token->type = OP_NON_MATCH_LIST;
2110 break;
2111 default:
2112 token->type = CHARACTER;
2113 }
2114 return 1;
2115}
2116
2117
2118/* Functions for parser. */
2119
2120/* Entry point of the parser.
2121 Parse the regular expression REGEXP and return the structure tree.
2122 If an error is occured, ERR is set by error code, and return NULL.
2123 This function build the following tree, from regular expression <reg_exp>:
2124 CAT
2125 / \
2126 / \
2127 <reg_exp> EOR
2128
2129 CAT means concatenation.
2130 EOR means end of regular expression. */
2131
2132static bin_tree_t *
2133parse (regexp, preg, syntax, err)
2134 re_string_t *regexp;
2135 regex_t *preg;
2136 reg_syntax_t syntax;
2137 reg_errcode_t *err;
2138{
2139 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2140 bin_tree_t *tree, *eor, *root;
2141 re_token_t current_token;
2142 dfa->syntax = syntax;
2143 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2144 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2145 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2146 return NULL;
2147 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2148 if (tree != NULL)
2149 root = create_tree (dfa, tree, eor, CONCAT);
2150 else
2151 root = eor;
2152 if (BE (eor == NULL || root == NULL, 0))
2153 {
2154 *err = REG_ESPACE;
2155 return NULL;
2156 }
2157 return root;
2158}
2159
2160/* This function build the following tree, from regular expression
2161 <branch1>|<branch2>:
2162 ALT
2163 / \
2164 / \
2165 <branch1> <branch2>
2166
2167 ALT means alternative, which represents the operator `|'. */
2168
2169static bin_tree_t *
2170parse_reg_exp (regexp, preg, token, syntax, nest, err)
2171 re_string_t *regexp;
2172 regex_t *preg;
2173 re_token_t *token;
2174 reg_syntax_t syntax;
2175 int nest;
2176 reg_errcode_t *err;
2177{
2178 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2179 bin_tree_t *tree, *branch = NULL;
2180 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2181 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182 return NULL;
2183
2184 while (token->type == OP_ALT)
2185 {
2186 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2187 if (token->type != OP_ALT && token->type != END_OF_RE
2188 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2189 {
2190 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2191 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2192 return NULL;
2193 }
2194 else
2195 branch = NULL;
2196 tree = create_tree (dfa, tree, branch, OP_ALT);
2197 if (BE (tree == NULL, 0))
2198 {
2199 *err = REG_ESPACE;
2200 return NULL;
2201 }
2202 }
2203 return tree;
2204}
2205
2206/* This function build the following tree, from regular expression
2207 <exp1><exp2>:
2208 CAT
2209 / \
2210 / \
2211 <exp1> <exp2>
2212
2213 CAT means concatenation. */
2214
2215static bin_tree_t *
2216parse_branch (regexp, preg, token, syntax, nest, err)
2217 re_string_t *regexp;
2218 regex_t *preg;
2219 re_token_t *token;
2220 reg_syntax_t syntax;
2221 int nest;
2222 reg_errcode_t *err;
2223{
2224 bin_tree_t *tree, *exp;
2225 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2226 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2228 return NULL;
2229
2230 while (token->type != OP_ALT && token->type != END_OF_RE
2231 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2232 {
2233 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2234 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2235 {
2236 return NULL;
2237 }
2238 if (tree != NULL && exp != NULL)
2239 {
2240 tree = create_tree (dfa, tree, exp, CONCAT);
2241 if (tree == NULL)
2242 {
2243 *err = REG_ESPACE;
2244 return NULL;
2245 }
2246 }
2247 else if (tree == NULL)
2248 tree = exp;
2249 /* Otherwise exp == NULL, we don't need to create new tree. */
2250 }
2251 return tree;
2252}
2253
2254/* This function build the following tree, from regular expression a*:
2255 *
2256 |
2257 a
2258*/
2259
2260static bin_tree_t *
2261parse_expression (regexp, preg, token, syntax, nest, err)
2262 re_string_t *regexp;
2263 regex_t *preg;
2264 re_token_t *token;
2265 reg_syntax_t syntax;
2266 int nest;
2267 reg_errcode_t *err;
2268{
2269 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2270 bin_tree_t *tree;
2271 switch (token->type)
2272 {
2273 case CHARACTER:
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2276 {
2277 *err = REG_ESPACE;
2278 return NULL;
2279 }
2280#ifdef RE_ENABLE_I18N
2281 if (dfa->mb_cur_max > 1)
2282 {
2283 while (!re_string_eoi (regexp)
2284 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2285 {
2286 bin_tree_t *mbc_remain;
2287 fetch_token (token, regexp, syntax);
2288 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2289 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2290 if (BE (mbc_remain == NULL || tree == NULL, 0))
2291 {
2292 *err = REG_ESPACE;
2293 return NULL;
2294 }
2295 }
2296 }
2297#endif
2298 break;
2299 case OP_OPEN_SUBEXP:
2300 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2301 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2302 return NULL;
2303 break;
2304 case OP_OPEN_BRACKET:
2305 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2306 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307 return NULL;
2308 break;
2309 case OP_BACK_REF:
2310 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311 {
2312 *err = REG_ESUBREG;
2313 return NULL;
2314 }
2315 dfa->used_bkref_map |= 1 << token->opr.idx;
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2318 {
2319 *err = REG_ESPACE;
2320 return NULL;
2321 }
2322 ++dfa->nbackref;
2323 dfa->has_mb_node = 1;
2324 break;
2325 case OP_OPEN_DUP_NUM:
2326 if (syntax & RE_CONTEXT_INVALID_DUP)
2327 {
2328 *err = REG_BADRPT;
2329 return NULL;
2330 }
2331 /* FALLTHROUGH */
2332 case OP_DUP_ASTERISK:
2333 case OP_DUP_PLUS:
2334 case OP_DUP_QUESTION:
2335 if (syntax & RE_CONTEXT_INVALID_OPS)
2336 {
2337 *err = REG_BADRPT;
2338 return NULL;
2339 }
2340 else if (syntax & RE_CONTEXT_INDEP_OPS)
2341 {
2342 fetch_token (token, regexp, syntax);
2343 return parse_expression (regexp, preg, token, syntax, nest, err);
2344 }
2345 /* else fall through */
2346 case OP_CLOSE_SUBEXP:
2347 if ((token->type == OP_CLOSE_SUBEXP) &&
2348 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2349 {
2350 *err = REG_ERPAREN;
2351 return NULL;
2352 }
2353 /* else fall through */
2354 case OP_CLOSE_DUP_NUM:
2355 /* We treat it as a normal character. */
2356
2357 /* Then we can these characters as normal characters. */
2358 token->type = CHARACTER;
2359 /* mb_partial and word_char bits should be initialized already
2360 by peek_token. */
2361 tree = create_token_tree (dfa, NULL, NULL, token);
2362 if (BE (tree == NULL, 0))
2363 {
2364 *err = REG_ESPACE;
2365 return NULL;
2366 }
2367 break;
2368 case ANCHOR:
2369 if ((token->opr.ctx_type
2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
2373 if (token->opr.ctx_type == WORD_DELIM
2374 || token->opr.ctx_type == NOT_WORD_DELIM)
2375 {
2376 bin_tree_t *tree_first, *tree_last;
2377 if (token->opr.ctx_type == WORD_DELIM)
2378 {
2379 token->opr.ctx_type = WORD_FIRST;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = WORD_LAST;
2382 }
2383 else
2384 {
2385 token->opr.ctx_type = INSIDE_WORD;
2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 token->opr.ctx_type = INSIDE_NOTWORD;
2388 }
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 }
2397 else
2398 {
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (BE (tree == NULL, 0))
2401 {
2402 *err = REG_ESPACE;
2403 return NULL;
2404 }
2405 }
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token, regexp, syntax);
2411 return tree;
2412 case OP_PERIOD:
2413 tree = create_token_tree (dfa, NULL, NULL, token);
2414 if (BE (tree == NULL, 0))
2415 {
2416 *err = REG_ESPACE;
2417 return NULL;
2418 }
2419 if (dfa->mb_cur_max > 1)
2420 dfa->has_mb_node = 1;
2421 break;
2422 case OP_WORD:
2423 case OP_NOTWORD:
2424 tree = build_charclass_op (dfa, regexp->trans,
2425 (const unsigned char *) "alnum",
2426 (const unsigned char *) "_",
2427 token->type == OP_NOTWORD, err);
2428 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2429 return NULL;
2430 break;
2431 case OP_SPACE:
2432 case OP_NOTSPACE:
2433 tree = build_charclass_op (dfa, regexp->trans,
2434 (const unsigned char *) "space",
2435 (const unsigned char *) "",
2436 token->type == OP_NOTSPACE, err);
2437 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438 return NULL;
2439 break;
2440 case OP_ALT:
2441 case END_OF_RE:
2442 return NULL;
2443 case BACK_SLASH:
2444 *err = REG_EESCAPE;
2445 return NULL;
2446 default:
2447 /* Must not happen? */
2448#ifdef DEBUG
2449 assert (0);
2450#endif
2451 return NULL;
2452 }
2453 fetch_token (token, regexp, syntax);
2454
2455 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2456 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2457 {
2458 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2459 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2460 return NULL;
2461 /* In BRE consecutive duplications are not allowed. */
2462 if ((syntax & RE_CONTEXT_INVALID_DUP)
2463 && (token->type == OP_DUP_ASTERISK
2464 || token->type == OP_OPEN_DUP_NUM))
2465 {
2466 *err = REG_BADRPT;
2467 return NULL;
2468 }
2469 }
2470
2471 return tree;
2472}
2473
2474/* This function build the following tree, from regular expression
2475 (<reg_exp>):
2476 SUBEXP
2477 |
2478 <reg_exp>
2479*/
2480
2481static bin_tree_t *
2482parse_sub_exp (regexp, preg, token, syntax, nest, err)
2483 re_string_t *regexp;
2484 regex_t *preg;
2485 re_token_t *token;
2486 reg_syntax_t syntax;
2487 int nest;
2488 reg_errcode_t *err;
2489{
2490 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2491 bin_tree_t *tree;
2492 size_t cur_nsub;
2493 cur_nsub = preg->re_nsub++;
2494
2495 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496
2497 /* The subexpression may be a null string. */
2498 if (token->type == OP_CLOSE_SUBEXP)
2499 tree = NULL;
2500 else
2501 {
2502 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2504 *err = REG_EPAREN;
2505 if (BE (*err != REG_NOERROR, 0))
2506 return NULL;
2507 }
2508 dfa->completed_bkref_map |= 1 << cur_nsub;
2509
2510 tree = create_tree (dfa, tree, NULL, SUBEXP);
2511 if (BE (tree == NULL, 0))
2512 {
2513 *err = REG_ESPACE;
2514 return NULL;
2515 }
2516 tree->token.opr.idx = cur_nsub;
2517 return tree;
2518}
2519
2520/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2521
2522static bin_tree_t *
2523parse_dup_op (elem, regexp, dfa, token, syntax, err)
2524 bin_tree_t *elem;
2525 re_string_t *regexp;
2526 re_dfa_t *dfa;
2527 re_token_t *token;
2528 reg_syntax_t syntax;
2529 reg_errcode_t *err;
2530{
2531 bin_tree_t *tree = NULL, *old_tree = NULL;
2532 int i, start, end, start_idx = re_string_cur_idx (regexp);
2533 re_token_t start_token = *token;
2534
2535 if (token->type == OP_OPEN_DUP_NUM)
2536 {
2537 end = 0;
2538 start = fetch_number (regexp, token, syntax);
2539 if (start == -1)
2540 {
2541 if (token->type == CHARACTER && token->opr.c == ',')
2542 start = 0; /* We treat "{,m}" as "{0,m}". */
2543 else
2544 {
2545 *err = REG_BADBR; /* <re>{} is invalid. */
2546 return NULL;
2547 }
2548 }
2549 if (BE (start != -2, 1))
2550 {
2551 /* We treat "{n}" as "{n,n}". */
2552 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2553 : ((token->type == CHARACTER && token->opr.c == ',')
2554 ? fetch_number (regexp, token, syntax) : -2));
2555 }
2556 if (BE (start == -2 || end == -2, 0))
2557 {
2558 /* Invalid sequence. */
2559 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2560 {
2561 if (token->type == END_OF_RE)
2562 *err = REG_EBRACE;
2563 else
2564 *err = REG_BADBR;
2565
2566 return NULL;
2567 }
2568
2569 /* If the syntax bit is set, rollback. */
2570 re_string_set_index (regexp, start_idx);
2571 *token = start_token;
2572 token->type = CHARACTER;
2573 /* mb_partial and word_char bits should be already initialized by
2574 peek_token. */
2575 return elem;
2576 }
2577
2578 if (BE (end != -1 && start > end, 0))
2579 {
2580 /* First number greater than second. */
2581 *err = REG_BADBR;
2582 return NULL;
2583 }
2584 }
2585 else
2586 {
2587 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2589 }
2590
2591 fetch_token (token, regexp, syntax);
2592
2593 if (BE (elem == NULL, 0))
2594 return NULL;
2595 if (BE (start == 0 && end == 0, 0))
2596 {
2597 postorder (elem, free_tree, NULL);
2598 return NULL;
2599 }
2600
2601 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2602 if (BE (start > 0, 0))
2603 {
2604 tree = elem;
2605 for (i = 2; i <= start; ++i)
2606 {
2607 elem = duplicate_tree (elem, dfa);
2608 tree = create_tree (dfa, tree, elem, CONCAT);
2609 if (BE (elem == NULL || tree == NULL, 0))
2610 goto parse_dup_op_espace;
2611 }
2612
2613 if (start == end)
2614 return tree;
2615
2616 /* Duplicate ELEM before it is marked optional. */
2617 elem = duplicate_tree (elem, dfa);
2618 old_tree = tree;
2619 }
2620 else
2621 old_tree = NULL;
2622
2623 if (elem->token.type == SUBEXP)
2624 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2625
2626 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2627 if (BE (tree == NULL, 0))
2628 goto parse_dup_op_espace;
2629
2630 /* This loop is actually executed only when end != -1,
2631 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2632 already created the start+1-th copy. */
2633 for (i = start + 2; i <= end; ++i)
2634 {
2635 elem = duplicate_tree (elem, dfa);
2636 tree = create_tree (dfa, tree, elem, CONCAT);
2637 if (BE (elem == NULL || tree == NULL, 0))
2638 goto parse_dup_op_espace;
2639
2640 tree = create_tree (dfa, tree, NULL, OP_ALT);
2641 if (BE (tree == NULL, 0))
2642 goto parse_dup_op_espace;
2643 }
2644
2645 if (old_tree)
2646 tree = create_tree (dfa, old_tree, tree, CONCAT);
2647
2648 return tree;
2649
2650 parse_dup_op_espace:
2651 *err = REG_ESPACE;
2652 return NULL;
2653}
2654
2655/* Size of the names for collating symbol/equivalence_class/character_class.
2656 I'm not sure, but maybe enough. */
2657#define BRACKET_NAME_BUF_SIZE 32
2658
2659#ifndef _LIBC
2660 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2661 Build the range expression which starts from START_ELEM, and ends
2662 at END_ELEM. The result are written to MBCSET and SBCSET.
2663 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2664 mbcset->range_ends, is a pointer argument sinse we may
2665 update it. */
2666
2667static reg_errcode_t
2668# ifdef RE_ENABLE_I18N
2669build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2670 re_charset_t *mbcset;
2671 int *range_alloc;
2672# else /* not RE_ENABLE_I18N */
2673build_range_exp (sbcset, start_elem, end_elem)
2674# endif /* not RE_ENABLE_I18N */
2675 re_bitset_ptr_t sbcset;
2676 bracket_elem_t *start_elem, *end_elem;
2677{
2678 unsigned int start_ch, end_ch;
2679 /* Equivalence Classes and Character Classes can't be a range start/end. */
2680 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2681 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2682 0))
2683 return REG_ERANGE;
2684
2685 /* We can handle no multi character collating elements without libc
2686 support. */
2687 if (BE ((start_elem->type == COLL_SYM
2688 && strlen ((char *) start_elem->opr.name) > 1)
2689 || (end_elem->type == COLL_SYM
2690 && strlen ((char *) end_elem->opr.name) > 1), 0))
2691 return REG_ECOLLATE;
2692
2693# ifdef RE_ENABLE_I18N
2694 {
2695 wchar_t wc, start_wc, end_wc;
2696 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2697
2698 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2699 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2700 : 0));
2701 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2702 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2703 : 0));
2704 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2705 ? __btowc (start_ch) : start_elem->opr.wch);
2706 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2707 ? __btowc (end_ch) : end_elem->opr.wch);
2708 if (start_wc == WEOF || end_wc == WEOF)
2709 return REG_ECOLLATE;
2710 cmp_buf[0] = start_wc;
2711 cmp_buf[4] = end_wc;
2712 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2713 return REG_ERANGE;
2714
2715 /* Got valid collation sequence values, add them as a new entry.
2716 However, for !_LIBC we have no collation elements: if the
2717 character set is single byte, the single byte character set
2718 that we build below suffices. parse_bracket_exp passes
2719 no MBCSET if dfa->mb_cur_max == 1. */
2720 if (mbcset)
2721 {
2722 /* Check the space of the arrays. */
2723 if (BE (*range_alloc == mbcset->nranges, 0))
2724 {
2725 /* There is not enough space, need realloc. */
2726 wchar_t *new_array_start, *new_array_end;
2727 int new_nranges;
2728
2729 /* +1 in case of mbcset->nranges is 0. */
2730 new_nranges = 2 * mbcset->nranges + 1;
2731 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2732 are NULL if *range_alloc == 0. */
2733 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2734 new_nranges);
2735 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2736 new_nranges);
2737
2738 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2739 return REG_ESPACE;
2740
2741 mbcset->range_starts = new_array_start;
2742 mbcset->range_ends = new_array_end;
2743 *range_alloc = new_nranges;
2744 }
2745
2746 mbcset->range_starts[mbcset->nranges] = start_wc;
2747 mbcset->range_ends[mbcset->nranges++] = end_wc;
2748 }
2749
2750 /* Build the table for single byte characters. */
2751 for (wc = 0; wc < SBC_MAX; ++wc)
2752 {
2753 cmp_buf[2] = wc;
2754 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2755 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2756 bitset_set (sbcset, wc);
2757 }
2758 }
2759# else /* not RE_ENABLE_I18N */
2760 {
2761 unsigned int ch;
2762 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2763 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2764 : 0));
2765 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2766 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2767 : 0));
2768 if (start_ch > end_ch)
2769 return REG_ERANGE;
2770 /* Build the table for single byte characters. */
2771 for (ch = 0; ch < SBC_MAX; ++ch)
2772 if (start_ch <= ch && ch <= end_ch)
2773 bitset_set (sbcset, ch);
2774 }
2775# endif /* not RE_ENABLE_I18N */
2776 return REG_NOERROR;
2777}
2778#endif /* not _LIBC */
2779
2780#ifndef _LIBC
2781/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2782 Build the collating element which is represented by NAME.
2783 The result are written to MBCSET and SBCSET.
2784 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2785 pointer argument since we may update it. */
2786
2787static reg_errcode_t
2788# ifdef RE_ENABLE_I18N
2789build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2790 re_charset_t *mbcset;
2791 int *coll_sym_alloc;
2792# else /* not RE_ENABLE_I18N */
2793build_collating_symbol (sbcset, name)
2794# endif /* not RE_ENABLE_I18N */
2795 re_bitset_ptr_t sbcset;
2796 const unsigned char *name;
2797{
2798 size_t name_len = strlen ((const char *) name);
2799 if (BE (name_len != 1, 0))
2800 return REG_ECOLLATE;
2801 else
2802 {
2803 bitset_set (sbcset, name[0]);
2804 return REG_NOERROR;
2805 }
2806}
2807#endif /* not _LIBC */
2808
2809/* This function parse bracket expression like "[abc]", "[a-c]",
2810 "[[.a-a.]]" etc. */
2811
2812static bin_tree_t *
2813parse_bracket_exp (regexp, dfa, token, syntax, err)
2814 re_string_t *regexp;
2815 re_dfa_t *dfa;
2816 re_token_t *token;
2817 reg_syntax_t syntax;
2818 reg_errcode_t *err;
2819{
2820#ifdef _LIBC
2821 const unsigned char *collseqmb;
2822 const char *collseqwc;
2823 uint32_t nrules;
2824 int32_t table_size;
2825 const int32_t *symb_table;
2826 const unsigned char *extra;
2827
2828 /* Local function for parse_bracket_exp used in _LIBC environement.
2829 Seek the collating symbol entry correspondings to NAME.
2830 Return the index of the symbol in the SYMB_TABLE. */
2831
2832 auto inline int32_t
2833 __attribute ((always_inline))
2834 seek_collating_symbol_entry (name, name_len)
2835 const unsigned char *name;
2836 size_t name_len;
2837 {
2838 int32_t hash = elem_hash ((const char *) name, name_len);
2839 int32_t elem = hash % table_size;
2840 int32_t second = hash % (table_size - 2);
2841 while (symb_table[2 * elem] != 0)
2842 {
2843 /* First compare the hashing value. */
2844 if (symb_table[2 * elem] == hash
2845 /* Compare the length of the name. */
2846 && name_len == extra[symb_table[2 * elem + 1]]
2847 /* Compare the name. */
2848 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2849 name_len) == 0)
2850 {
2851 /* Yep, this is the entry. */
2852 break;
2853 }
2854
2855 /* Next entry. */
2856 elem += second;
2857 }
2858 return elem;
2859 }
2860
2861 /* Local function for parse_bracket_exp used in _LIBC environement.
2862 Look up the collation sequence value of BR_ELEM.
2863 Return the value if succeeded, UINT_MAX otherwise. */
2864
2865 auto inline unsigned int
2866 __attribute ((always_inline))
2867 lookup_collation_sequence_value (br_elem)
2868 bracket_elem_t *br_elem;
2869 {
2870 if (br_elem->type == SB_CHAR)
2871 {
2872 /*
2873 if (MB_CUR_MAX == 1)
2874 */
2875 if (nrules == 0)
2876 return collseqmb[br_elem->opr.ch];
2877 else
2878 {
2879 wint_t wc = __btowc (br_elem->opr.ch);
2880 return __collseq_table_lookup (collseqwc, wc);
2881 }
2882 }
2883 else if (br_elem->type == MB_CHAR)
2884 {
2885 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2886 }
2887 else if (br_elem->type == COLL_SYM)
2888 {
2889 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2890 if (nrules != 0)
2891 {
2892 int32_t elem, idx;
2893 elem = seek_collating_symbol_entry (br_elem->opr.name,
2894 sym_name_len);
2895 if (symb_table[2 * elem] != 0)
2896 {
2897 /* We found the entry. */
2898 idx = symb_table[2 * elem + 1];
2899 /* Skip the name of collating element name. */
2900 idx += 1 + extra[idx];
2901 /* Skip the byte sequence of the collating element. */
2902 idx += 1 + extra[idx];
2903 /* Adjust for the alignment. */
2904 idx = (idx + 3) & ~3;
2905 /* Skip the multibyte collation sequence value. */
2906 idx += sizeof (unsigned int);
2907 /* Skip the wide char sequence of the collating element. */
2908 idx += sizeof (unsigned int) *
2909 (1 + *(unsigned int *) (extra + idx));
2910 /* Return the collation sequence value. */
2911 return *(unsigned int *) (extra + idx);
2912 }
2913 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2914 {
2915 /* No valid character. Match it as a single byte
2916 character. */
2917 return collseqmb[br_elem->opr.name[0]];
2918 }
2919 }
2920 else if (sym_name_len == 1)
2921 return collseqmb[br_elem->opr.name[0]];
2922 }
2923 return UINT_MAX;
2924 }
2925
2926 /* Local function for parse_bracket_exp used in _LIBC environement.
2927 Build the range expression which starts from START_ELEM, and ends
2928 at END_ELEM. The result are written to MBCSET and SBCSET.
2929 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2930 mbcset->range_ends, is a pointer argument sinse we may
2931 update it. */
2932
2933 auto inline reg_errcode_t
2934 __attribute ((always_inline))
2935 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2936 re_charset_t *mbcset;
2937 int *range_alloc;
2938 re_bitset_ptr_t sbcset;
2939 bracket_elem_t *start_elem, *end_elem;
2940 {
2941 unsigned int ch;
2942 uint32_t start_collseq;
2943 uint32_t end_collseq;
2944
2945 /* Equivalence Classes and Character Classes can't be a range
2946 start/end. */
2947 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2948 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2949 0))
2950 return REG_ERANGE;
2951
2952 start_collseq = lookup_collation_sequence_value (start_elem);
2953 end_collseq = lookup_collation_sequence_value (end_elem);
2954 /* Check start/end collation sequence values. */
2955 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2956 return REG_ECOLLATE;
2957 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2958 return REG_ERANGE;
2959
2960 /* Got valid collation sequence values, add them as a new entry.
2961 However, if we have no collation elements, and the character set
2962 is single byte, the single byte character set that we
2963 build below suffices. */
2964 if (nrules > 0 || dfa->mb_cur_max > 1)
2965 {
2966 /* Check the space of the arrays. */
2967 if (BE (*range_alloc == mbcset->nranges, 0))
2968 {
2969 /* There is not enough space, need realloc. */
2970 uint32_t *new_array_start;
2971 uint32_t *new_array_end;
2972 int new_nranges;
2973
2974 /* +1 in case of mbcset->nranges is 0. */
2975 new_nranges = 2 * mbcset->nranges + 1;
2976 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2977 new_nranges);
2978 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2979 new_nranges);
2980
2981 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2982 return REG_ESPACE;
2983
2984 mbcset->range_starts = new_array_start;
2985 mbcset->range_ends = new_array_end;
2986 *range_alloc = new_nranges;
2987 }
2988
2989 mbcset->range_starts[mbcset->nranges] = start_collseq;
2990 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2991 }
2992
2993 /* Build the table for single byte characters. */
2994 for (ch = 0; ch < SBC_MAX; ch++)
2995 {
2996 uint32_t ch_collseq;
2997 /*
2998 if (MB_CUR_MAX == 1)
2999 */
3000 if (nrules == 0)
3001 ch_collseq = collseqmb[ch];
3002 else
3003 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3004 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3005 bitset_set (sbcset, ch);
3006 }
3007 return REG_NOERROR;
3008 }
3009
3010 /* Local function for parse_bracket_exp used in _LIBC environement.
3011 Build the collating element which is represented by NAME.
3012 The result are written to MBCSET and SBCSET.
3013 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3014 pointer argument sinse we may update it. */
3015
3016 auto inline reg_errcode_t
3017 __attribute ((always_inline))
3018 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3019 re_charset_t *mbcset;
3020 int *coll_sym_alloc;
3021 re_bitset_ptr_t sbcset;
3022 const unsigned char *name;
3023 {
3024 int32_t elem, idx;
3025 size_t name_len = strlen ((const char *) name);
3026 if (nrules != 0)
3027 {
3028 elem = seek_collating_symbol_entry (name, name_len);
3029 if (symb_table[2 * elem] != 0)
3030 {
3031 /* We found the entry. */
3032 idx = symb_table[2 * elem + 1];
3033 /* Skip the name of collating element name. */
3034 idx += 1 + extra[idx];
3035 }
3036 else if (symb_table[2 * elem] == 0 && name_len == 1)
3037 {
3038 /* No valid character, treat it as a normal
3039 character. */
3040 bitset_set (sbcset, name[0]);
3041 return REG_NOERROR;
3042 }
3043 else
3044 return REG_ECOLLATE;
3045
3046 /* Got valid collation sequence, add it as a new entry. */
3047 /* Check the space of the arrays. */
3048 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3049 {
3050 /* Not enough, realloc it. */
3051 /* +1 in case of mbcset->ncoll_syms is 0. */
3052 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3053 /* Use realloc since mbcset->coll_syms is NULL
3054 if *alloc == 0. */
3055 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3056 new_coll_sym_alloc);
3057 if (BE (new_coll_syms == NULL, 0))
3058 return REG_ESPACE;
3059 mbcset->coll_syms = new_coll_syms;
3060 *coll_sym_alloc = new_coll_sym_alloc;
3061 }
3062 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3063 return REG_NOERROR;
3064 }
3065 else
3066 {
3067 if (BE (name_len != 1, 0))
3068 return REG_ECOLLATE;
3069 else
3070 {
3071 bitset_set (sbcset, name[0]);
3072 return REG_NOERROR;
3073 }
3074 }
3075 }
3076#endif
3077
3078 re_token_t br_token;
3079 re_bitset_ptr_t sbcset;
3080#ifdef RE_ENABLE_I18N
3081 re_charset_t *mbcset;
3082 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3083 int equiv_class_alloc = 0, char_class_alloc = 0;
3084#endif /* not RE_ENABLE_I18N */
3085 int non_match = 0;
3086 bin_tree_t *work_tree;
3087 int token_len;
3088 int first_round = 1;
3089#ifdef _LIBC
3090 collseqmb = (const unsigned char *)
3091 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3092 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3093 if (nrules)
3094 {
3095 /*
3096 if (MB_CUR_MAX > 1)
3097 */
3098 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3099 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3100 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3101 _NL_COLLATE_SYMB_TABLEMB);
3102 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3103 _NL_COLLATE_SYMB_EXTRAMB);
3104 }
3105#endif
3106 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3107#ifdef RE_ENABLE_I18N
3108 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3109#endif /* RE_ENABLE_I18N */
3110#ifdef RE_ENABLE_I18N
3111 if (BE (sbcset == NULL || mbcset == NULL, 0))
3112#else
3113 if (BE (sbcset == NULL, 0))
3114#endif /* RE_ENABLE_I18N */
3115 {
3116 *err = REG_ESPACE;
3117 return NULL;
3118 }
3119
3120 token_len = peek_token_bracket (token, regexp, syntax);
3121 if (BE (token->type == END_OF_RE, 0))
3122 {
3123 *err = REG_BADPAT;
3124 goto parse_bracket_exp_free_return;
3125 }
3126 if (token->type == OP_NON_MATCH_LIST)
3127 {
3128#ifdef RE_ENABLE_I18N
3129 mbcset->non_match = 1;
3130#endif /* not RE_ENABLE_I18N */
3131 non_match = 1;
3132 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3133 bitset_set (sbcset, '\0');
3134 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3135 token_len = peek_token_bracket (token, regexp, syntax);
3136 if (BE (token->type == END_OF_RE, 0))
3137 {
3138 *err = REG_BADPAT;
3139 goto parse_bracket_exp_free_return;
3140 }
3141 }
3142
3143 /* We treat the first ']' as a normal character. */
3144 if (token->type == OP_CLOSE_BRACKET)
3145 token->type = CHARACTER;
3146
3147 while (1)
3148 {
3149 bracket_elem_t start_elem, end_elem;
3150 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3151 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3152 reg_errcode_t ret;
3153 int token_len2 = 0, is_range_exp = 0;
3154 re_token_t token2;
3155
3156 start_elem.opr.name = start_name_buf;
3157 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3158 syntax, first_round);
3159 if (BE (ret != REG_NOERROR, 0))
3160 {
3161 *err = ret;
3162 goto parse_bracket_exp_free_return;
3163 }
3164 first_round = 0;
3165
3166 /* Get information about the next token. We need it in any case. */
3167 token_len = peek_token_bracket (token, regexp, syntax);
3168
3169 /* Do not check for ranges if we know they are not allowed. */
3170 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3171 {
3172 if (BE (token->type == END_OF_RE, 0))
3173 {
3174 *err = REG_EBRACK;
3175 goto parse_bracket_exp_free_return;
3176 }
3177 if (token->type == OP_CHARSET_RANGE)
3178 {
3179 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3180 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3181 if (BE (token2.type == END_OF_RE, 0))
3182 {
3183 *err = REG_EBRACK;
3184 goto parse_bracket_exp_free_return;
3185 }
3186 if (token2.type == OP_CLOSE_BRACKET)
3187 {
3188 /* We treat the last '-' as a normal character. */
3189 re_string_skip_bytes (regexp, -token_len);
3190 token->type = CHARACTER;
3191 }
3192 else
3193 is_range_exp = 1;
3194 }
3195 }
3196
3197 if (is_range_exp == 1)
3198 {
3199 end_elem.opr.name = end_name_buf;
3200 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3201 dfa, syntax, 1);
3202 if (BE (ret != REG_NOERROR, 0))
3203 {
3204 *err = ret;
3205 goto parse_bracket_exp_free_return;
3206 }
3207
3208 token_len = peek_token_bracket (token, regexp, syntax);
3209
3210#ifdef _LIBC
3211 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3212 &start_elem, &end_elem);
3213#else
3214# ifdef RE_ENABLE_I18N
3215 *err = build_range_exp (sbcset,
3216 dfa->mb_cur_max > 1 ? mbcset : NULL,
3217 &range_alloc, &start_elem, &end_elem);
3218# else
3219 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3220# endif
3221#endif /* RE_ENABLE_I18N */
3222 if (BE (*err != REG_NOERROR, 0))
3223 goto parse_bracket_exp_free_return;
3224 }
3225 else
3226 {
3227 switch (start_elem.type)
3228 {
3229 case SB_CHAR:
3230 bitset_set (sbcset, start_elem.opr.ch);
3231 break;
3232#ifdef RE_ENABLE_I18N
3233 case MB_CHAR:
3234 /* Check whether the array has enough space. */
3235 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3236 {
3237 wchar_t *new_mbchars;
3238 /* Not enough, realloc it. */
3239 /* +1 in case of mbcset->nmbchars is 0. */
3240 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3241 /* Use realloc since array is NULL if *alloc == 0. */
3242 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3243 mbchar_alloc);
3244 if (BE (new_mbchars == NULL, 0))
3245 goto parse_bracket_exp_espace;
3246 mbcset->mbchars = new_mbchars;
3247 }
3248 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3249 break;
3250#endif /* RE_ENABLE_I18N */
3251 case EQUIV_CLASS:
3252 *err = build_equiv_class (sbcset,
3253#ifdef RE_ENABLE_I18N
3254 mbcset, &equiv_class_alloc,
3255#endif /* RE_ENABLE_I18N */
3256 start_elem.opr.name);
3257 if (BE (*err != REG_NOERROR, 0))
3258 goto parse_bracket_exp_free_return;
3259 break;
3260 case COLL_SYM:
3261 *err = build_collating_symbol (sbcset,
3262#ifdef RE_ENABLE_I18N
3263 mbcset, &coll_sym_alloc,
3264#endif /* RE_ENABLE_I18N */
3265 start_elem.opr.name);
3266 if (BE (*err != REG_NOERROR, 0))
3267 goto parse_bracket_exp_free_return;
3268 break;
3269 case CHAR_CLASS:
3270 *err = build_charclass (regexp->trans, sbcset,
3271#ifdef RE_ENABLE_I18N
3272 mbcset, &char_class_alloc,
3273#endif /* RE_ENABLE_I18N */
3274 start_elem.opr.name, syntax);
3275 if (BE (*err != REG_NOERROR, 0))
3276 goto parse_bracket_exp_free_return;
3277 break;
3278 default:
3279 assert (0);
3280 break;
3281 }
3282 }
3283 if (BE (token->type == END_OF_RE, 0))
3284 {
3285 *err = REG_EBRACK;
3286 goto parse_bracket_exp_free_return;
3287 }
3288 if (token->type == OP_CLOSE_BRACKET)
3289 break;
3290 }
3291
3292 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3293
3294 /* If it is non-matching list. */
3295 if (non_match)
3296 bitset_not (sbcset);
3297
3298#ifdef RE_ENABLE_I18N
3299 /* Ensure only single byte characters are set. */
3300 if (dfa->mb_cur_max > 1)
3301 bitset_mask (sbcset, dfa->sb_char);
3302
3303 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3304 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3305 || mbcset->non_match)))
3306 {
3307 bin_tree_t *mbc_tree;
3308 int sbc_idx;
3309 /* Build a tree for complex bracket. */
3310 dfa->has_mb_node = 1;
3311 br_token.type = COMPLEX_BRACKET;
3312 br_token.opr.mbcset = mbcset;
3313 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314 if (BE (mbc_tree == NULL, 0))
3315 goto parse_bracket_exp_espace;
3316 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3317 if (sbcset[sbc_idx])
3318 break;
3319 /* If there are no bits set in sbcset, there is no point
3320 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3321 if (sbc_idx < BITSET_UINTS)
3322 {
3323 /* Build a tree for simple bracket. */
3324 br_token.type = SIMPLE_BRACKET;
3325 br_token.opr.sbcset = sbcset;
3326 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327 if (BE (work_tree == NULL, 0))
3328 goto parse_bracket_exp_espace;
3329
3330 /* Then join them by ALT node. */
3331 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3332 if (BE (work_tree == NULL, 0))
3333 goto parse_bracket_exp_espace;
3334 }
3335 else
3336 {
3337 re_free (sbcset);
3338 work_tree = mbc_tree;
3339 }
3340 }
3341 else
3342#endif /* not RE_ENABLE_I18N */
3343 {
3344#ifdef RE_ENABLE_I18N
3345 free_charset (mbcset);
3346#endif
3347 /* Build a tree for simple bracket. */
3348 br_token.type = SIMPLE_BRACKET;
3349 br_token.opr.sbcset = sbcset;
3350 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3351 if (BE (work_tree == NULL, 0))
3352 goto parse_bracket_exp_espace;
3353 }
3354 return work_tree;
3355
3356 parse_bracket_exp_espace:
3357 *err = REG_ESPACE;
3358 parse_bracket_exp_free_return:
3359 re_free (sbcset);
3360#ifdef RE_ENABLE_I18N
3361 free_charset (mbcset);
3362#endif /* RE_ENABLE_I18N */
3363 return NULL;
3364}
3365
3366/* Parse an element in the bracket expression. */
3367
3368static reg_errcode_t
3369parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3370 accept_hyphen)
3371 bracket_elem_t *elem;
3372 re_string_t *regexp;
3373 re_token_t *token;
3374 int token_len;
3375 re_dfa_t *dfa;
3376 reg_syntax_t syntax;
3377 int accept_hyphen;
3378{
3379#ifdef RE_ENABLE_I18N
3380 int cur_char_size;
3381 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3382 if (cur_char_size > 1)
3383 {
3384 elem->type = MB_CHAR;
3385 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3386 re_string_skip_bytes (regexp, cur_char_size);
3387 return REG_NOERROR;
3388 }
3389#endif /* RE_ENABLE_I18N */
3390 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3391 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3392 || token->type == OP_OPEN_EQUIV_CLASS)
3393 return parse_bracket_symbol (elem, regexp, token);
3394 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3395 {
3396 /* A '-' must only appear as anything but a range indicator before
3397 the closing bracket. Everything else is an error. */
3398 re_token_t token2;
3399 (void) peek_token_bracket (&token2, regexp, syntax);
3400 if (token2.type != OP_CLOSE_BRACKET)
3401 /* The actual error value is not standardized since this whole
3402 case is undefined. But ERANGE makes good sense. */
3403 return REG_ERANGE;
3404 }
3405 elem->type = SB_CHAR;
3406 elem->opr.ch = token->opr.c;
3407 return REG_NOERROR;
3408}
3409
3410/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3411 such as [:<character_class>:], [.<collating_element>.], and
3412 [=<equivalent_class>=]. */
3413
3414static reg_errcode_t
3415parse_bracket_symbol (elem, regexp, token)
3416 bracket_elem_t *elem;
3417 re_string_t *regexp;
3418 re_token_t *token;
3419{
3420 unsigned char ch, delim = token->opr.c;
3421 int i = 0;
3422 if (re_string_eoi(regexp))
3423 return REG_EBRACK;
3424 for (;; ++i)
3425 {
3426 if (i >= BRACKET_NAME_BUF_SIZE)
3427 return REG_EBRACK;
3428 if (token->type == OP_OPEN_CHAR_CLASS)
3429 ch = re_string_fetch_byte_case (regexp);
3430 else
3431 ch = re_string_fetch_byte (regexp);
3432 if (re_string_eoi(regexp))
3433 return REG_EBRACK;
3434 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435 break;
3436 elem->opr.name[i] = ch;
3437 }
3438 re_string_skip_bytes (regexp, 1);
3439 elem->opr.name[i] = '\0';
3440 switch (token->type)
3441 {
3442 case OP_OPEN_COLL_ELEM:
3443 elem->type = COLL_SYM;
3444 break;
3445 case OP_OPEN_EQUIV_CLASS:
3446 elem->type = EQUIV_CLASS;
3447 break;
3448 case OP_OPEN_CHAR_CLASS:
3449 elem->type = CHAR_CLASS;
3450 break;
3451 default:
3452 break;
3453 }
3454 return REG_NOERROR;
3455}
3456
3457 /* Helper function for parse_bracket_exp.
3458 Build the equivalence class which is represented by NAME.
3459 The result are written to MBCSET and SBCSET.
3460 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461 is a pointer argument sinse we may update it. */
3462
3463static reg_errcode_t
3464#ifdef RE_ENABLE_I18N
3465build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3466 re_charset_t *mbcset;
3467 int *equiv_class_alloc;
3468#else /* not RE_ENABLE_I18N */
3469build_equiv_class (sbcset, name)
3470#endif /* not RE_ENABLE_I18N */
3471 re_bitset_ptr_t sbcset;
3472 const unsigned char *name;
3473{
3474#if defined _LIBC
3475 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3476 if (nrules != 0)
3477 {
3478 const int32_t *table, *indirect;
3479 const unsigned char *weights, *extra, *cp;
3480 unsigned char char_buf[2];
3481 int32_t idx1, idx2;
3482 unsigned int ch;
3483 size_t len;
3484 /* This #include defines a local function! */
3485# include <locale/weight.h>
3486 /* Calculate the index for equivalence class. */
3487 cp = name;
3488 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3489 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3490 _NL_COLLATE_WEIGHTMB);
3491 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3492 _NL_COLLATE_EXTRAMB);
3493 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3494 _NL_COLLATE_INDIRECTMB);
3495 idx1 = findidx (&cp);
3496 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3497 /* This isn't a valid character. */
3498 return REG_ECOLLATE;
3499
3500 /* Build single byte matcing table for this equivalence class. */
3501 char_buf[1] = (unsigned char) '\0';
3502 len = weights[idx1];
3503 for (ch = 0; ch < SBC_MAX; ++ch)
3504 {
3505 char_buf[0] = ch;
3506 cp = char_buf;
3507 idx2 = findidx (&cp);
3508/*
3509 idx2 = table[ch];
3510*/
3511 if (idx2 == 0)
3512 /* This isn't a valid character. */
3513 continue;
3514 if (len == weights[idx2])
3515 {
3516 int cnt = 0;
3517 while (cnt <= len &&
3518 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3519 ++cnt;
3520
3521 if (cnt > len)
3522 bitset_set (sbcset, ch);
3523 }
3524 }
3525 /* Check whether the array has enough space. */
3526 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527 {
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nequiv_classes is 0. */
3530 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531 /* Use realloc since the array is NULL if *alloc == 0. */
3532 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533 int32_t,
3534 new_equiv_class_alloc);
3535 if (BE (new_equiv_classes == NULL, 0))
3536 return REG_ESPACE;
3537 mbcset->equiv_classes = new_equiv_classes;
3538 *equiv_class_alloc = new_equiv_class_alloc;
3539 }
3540 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541 }
3542 else
3543#endif /* _LIBC */
3544 {
3545 if (BE (strlen ((const char *) name) != 1, 0))
3546 return REG_ECOLLATE;
3547 bitset_set (sbcset, *name);
3548 }
3549 return REG_NOERROR;
3550}
3551
3552 /* Helper function for parse_bracket_exp.
3553 Build the character class which is represented by NAME.
3554 The result are written to MBCSET and SBCSET.
3555 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556 is a pointer argument sinse we may update it. */
3557
3558static reg_errcode_t
3559#ifdef RE_ENABLE_I18N
3560build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3561 re_charset_t *mbcset;
3562 int *char_class_alloc;
3563#else /* not RE_ENABLE_I18N */
3564build_charclass (trans, sbcset, class_name, syntax)
3565#endif /* not RE_ENABLE_I18N */
3566 unsigned RE_TRANSLATE_TYPE trans;
3567 re_bitset_ptr_t sbcset;
3568 const unsigned char *class_name;
3569 reg_syntax_t syntax;
3570{
3571 int i;
3572 const char *name = (const char *) class_name;
3573
3574 /* In case of REG_ICASE "upper" and "lower" match the both of
3575 upper and lower cases. */
3576 if ((syntax & RE_ICASE)
3577 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3578 name = "alpha";
3579
3580#ifdef RE_ENABLE_I18N
3581 /* Check the space of the arrays. */
3582 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3583 {
3584 /* Not enough, realloc it. */
3585 /* +1 in case of mbcset->nchar_classes is 0. */
3586 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3587 /* Use realloc since array is NULL if *alloc == 0. */
3588 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3589 new_char_class_alloc);
3590 if (BE (new_char_classes == NULL, 0))
3591 return REG_ESPACE;
3592 mbcset->char_classes = new_char_classes;
3593 *char_class_alloc = new_char_class_alloc;
3594 }
3595 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3596#endif /* RE_ENABLE_I18N */
3597
3598#define BUILD_CHARCLASS_LOOP(ctype_func) \
3599 for (i = 0; i < SBC_MAX; ++i) \
3600 { \
3601 if (ctype_func (i)) \
3602 { \
3603 int ch = trans ? trans[i] : i; \
3604 bitset_set (sbcset, ch); \
3605 } \
3606 }
3607
3608 if (strcmp (name, "alnum") == 0)
3609 BUILD_CHARCLASS_LOOP (isalnum)
3610 else if (strcmp (name, "cntrl") == 0)
3611 BUILD_CHARCLASS_LOOP (iscntrl)
3612 else if (strcmp (name, "lower") == 0)
3613 BUILD_CHARCLASS_LOOP (islower)
3614 else if (strcmp (name, "space") == 0)
3615 BUILD_CHARCLASS_LOOP (isspace)
3616 else if (strcmp (name, "alpha") == 0)
3617 BUILD_CHARCLASS_LOOP (isalpha)
3618 else if (strcmp (name, "digit") == 0)
3619 BUILD_CHARCLASS_LOOP (isdigit)
3620 else if (strcmp (name, "print") == 0)
3621 BUILD_CHARCLASS_LOOP (isprint)
3622 else if (strcmp (name, "upper") == 0)
3623 BUILD_CHARCLASS_LOOP (isupper)
3624 else if (strcmp (name, "blank") == 0)
3625 BUILD_CHARCLASS_LOOP (isblank)
3626 else if (strcmp (name, "graph") == 0)
3627 BUILD_CHARCLASS_LOOP (isgraph)
3628 else if (strcmp (name, "punct") == 0)
3629 BUILD_CHARCLASS_LOOP (ispunct)
3630 else if (strcmp (name, "xdigit") == 0)
3631 BUILD_CHARCLASS_LOOP (isxdigit)
3632 else
3633 return REG_ECTYPE;
3634
3635 return REG_NOERROR;
3636}
3637
3638static bin_tree_t *
3639build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3640 re_dfa_t *dfa;
3641 unsigned RE_TRANSLATE_TYPE trans;
3642 const unsigned char *class_name;
3643 const unsigned char *extra;
3644 int non_match;
3645 reg_errcode_t *err;
3646{
3647 re_bitset_ptr_t sbcset;
3648#ifdef RE_ENABLE_I18N
3649 re_charset_t *mbcset;
3650 int alloc = 0;
3651#endif /* not RE_ENABLE_I18N */
3652 reg_errcode_t ret;
3653 re_token_t br_token;
3654 bin_tree_t *tree;
3655
3656 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3657#ifdef RE_ENABLE_I18N
3658 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659#endif /* RE_ENABLE_I18N */
3660
3661#ifdef RE_ENABLE_I18N
3662 if (BE (sbcset == NULL || mbcset == NULL, 0))
3663#else /* not RE_ENABLE_I18N */
3664 if (BE (sbcset == NULL, 0))
3665#endif /* not RE_ENABLE_I18N */
3666 {
3667 *err = REG_ESPACE;
3668 return NULL;
3669 }
3670
3671 if (non_match)
3672 {
3673#ifdef RE_ENABLE_I18N
3674 /*
3675 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3676 bitset_set(cset->sbcset, '\0');
3677 */
3678 mbcset->non_match = 1;
3679#endif /* not RE_ENABLE_I18N */
3680 }
3681
3682 /* We don't care the syntax in this case. */
3683 ret = build_charclass (trans, sbcset,
3684#ifdef RE_ENABLE_I18N
3685 mbcset, &alloc,
3686#endif /* RE_ENABLE_I18N */
3687 class_name, 0);
3688
3689 if (BE (ret != REG_NOERROR, 0))
3690 {
3691 re_free (sbcset);
3692#ifdef RE_ENABLE_I18N
3693 free_charset (mbcset);
3694#endif /* RE_ENABLE_I18N */
3695 *err = ret;
3696 return NULL;
3697 }
3698 /* \w match '_' also. */
3699 for (; *extra; extra++)
3700 bitset_set (sbcset, *extra);
3701
3702 /* If it is non-matching list. */
3703 if (non_match)
3704 bitset_not (sbcset);
3705
3706#ifdef RE_ENABLE_I18N
3707 /* Ensure only single byte characters are set. */
3708 if (dfa->mb_cur_max > 1)
3709 bitset_mask (sbcset, dfa->sb_char);
3710#endif
3711
3712 /* Build a tree for simple bracket. */
3713 br_token.type = SIMPLE_BRACKET;
3714 br_token.opr.sbcset = sbcset;
3715 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3716 if (BE (tree == NULL, 0))
3717 goto build_word_op_espace;
3718
3719#ifdef RE_ENABLE_I18N
3720 if (dfa->mb_cur_max > 1)
3721 {
3722 bin_tree_t *mbc_tree;
3723 /* Build a tree for complex bracket. */
3724 br_token.type = COMPLEX_BRACKET;
3725 br_token.opr.mbcset = mbcset;
3726 dfa->has_mb_node = 1;
3727 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728 if (BE (mbc_tree == NULL, 0))
3729 goto build_word_op_espace;
3730 /* Then join them by ALT node. */
3731 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3732 if (BE (mbc_tree != NULL, 1))
3733 return tree;
3734 }
3735 else
3736 {
3737 free_charset (mbcset);
3738 return tree;
3739 }
3740#else /* not RE_ENABLE_I18N */
3741 return tree;
3742#endif /* not RE_ENABLE_I18N */
3743
3744 build_word_op_espace:
3745 re_free (sbcset);
3746#ifdef RE_ENABLE_I18N
3747 free_charset (mbcset);
3748#endif /* RE_ENABLE_I18N */
3749 *err = REG_ESPACE;
3750 return NULL;
3751}
3752
3753/* This is intended for the expressions like "a{1,3}".
3754 Fetch a number from `input', and return the number.
3755 Return -1, if the number field is empty like "{,1}".
3756 Return -2, If an error is occured. */
3757
3758static int
3759fetch_number (input, token, syntax)
3760 re_string_t *input;
3761 re_token_t *token;
3762 reg_syntax_t syntax;
3763{
3764 int num = -1;
3765 unsigned char c;
3766 while (1)
3767 {
3768 fetch_token (token, input, syntax);
3769 c = token->opr.c;
3770 if (BE (token->type == END_OF_RE, 0))
3771 return -2;
3772 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3773 break;
3774 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3775 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3776 num = (num > RE_DUP_MAX) ? -2 : num;
3777 }
3778 return num;
3779}
3780
3781
3782#ifdef RE_ENABLE_I18N
3783static void
3784free_charset (re_charset_t *cset)
3785{
3786 re_free (cset->mbchars);
3787# ifdef _LIBC
3788 re_free (cset->coll_syms);
3789 re_free (cset->equiv_classes);
3790 re_free (cset->range_starts);
3791 re_free (cset->range_ends);
3792# endif
3793 re_free (cset->char_classes);
3794 re_free (cset);
3795}
3796#endif /* RE_ENABLE_I18N */
3797
3798
3799/* Functions for binary tree operation. */
3800
3801/* Create a tree node. */
3802
3803static bin_tree_t *
3804create_tree (dfa, left, right, type)
3805 re_dfa_t *dfa;
3806 bin_tree_t *left;
3807 bin_tree_t *right;
3808 re_token_type_t type;
3809{
3810 re_token_t t;
3811 t.type = type;
3812 return create_token_tree (dfa, left, right, &t);
3813}
3814
3815static bin_tree_t *
3816create_token_tree (dfa, left, right, token)
3817 re_dfa_t *dfa;
3818 bin_tree_t *left;
3819 bin_tree_t *right;
3820 const re_token_t *token;
3821{
3822 bin_tree_t *tree;
3823 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3824 {
3825 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3826
3827 if (storage == NULL)
3828 return NULL;
3829 storage->next = dfa->str_tree_storage;
3830 dfa->str_tree_storage = storage;
3831 dfa->str_tree_storage_idx = 0;
3832 }
3833 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3834
3835 tree->parent = NULL;
3836 tree->left = left;
3837 tree->right = right;
3838 tree->token = *token;
3839 tree->token.duplicated = 0;
3840 tree->token.opt_subexp = 0;
3841 tree->first = NULL;
3842 tree->next = NULL;
3843 tree->node_idx = -1;
3844
3845 if (left != NULL)
3846 left->parent = tree;
3847 if (right != NULL)
3848 right->parent = tree;
3849 return tree;
3850}
3851
3852/* Mark the tree SRC as an optional subexpression.
3853 To be called from preorder or postorder. */
3854
3855static reg_errcode_t
3856mark_opt_subexp (extra, node)
3857 void *extra;
3858 bin_tree_t *node;
3859{
3860 int idx = (int) (long) extra;
3861 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862 node->token.opt_subexp = 1;
3863
3864 return REG_NOERROR;
3865}
3866
3867/* Free the allocated memory inside NODE. */
3868
3869static void
3870free_token (re_token_t *node)
3871{
3872#ifdef RE_ENABLE_I18N
3873 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874 free_charset (node->opr.mbcset);
3875 else
3876#endif /* RE_ENABLE_I18N */
3877 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878 re_free (node->opr.sbcset);
3879}
3880
3881/* Worker function for tree walking. Free the allocated memory inside NODE
3882 and its children. */
3883
3884static reg_errcode_t
3885free_tree (void *extra, bin_tree_t *node)
3886{
3887 free_token (&node->token);
3888 return REG_NOERROR;
3889}
3890
3891
3892/* Duplicate the node SRC, and return new node. This is a preorder
3893 visit similar to the one implemented by the generic visitor, but
3894 we need more infrastructure to maintain two parallel trees --- so,
3895 it's easier to duplicate. */
3896
3897static bin_tree_t *
3898duplicate_tree (root, dfa)
3899 const bin_tree_t *root;
3900 re_dfa_t *dfa;
3901{
3902 const bin_tree_t *node;
3903 bin_tree_t *dup_root;
3904 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3905
3906 for (node = root; ; )
3907 {
3908 /* Create a new tree and link it back to the current parent. */
3909 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910 if (*p_new == NULL)
3911 return NULL;
3912 (*p_new)->parent = dup_node;
3913 (*p_new)->token.duplicated = 1;
3914 dup_node = *p_new;
3915
3916 /* Go to the left node, or up and to the right. */
3917 if (node->left)
3918 {
3919 node = node->left;
3920 p_new = &dup_node->left;
3921 }
3922 else
3923 {
3924 const bin_tree_t *prev = NULL;
3925 while (node->right == prev || node->right == NULL)
3926 {
3927 prev = node;
3928 node = node->parent;
3929 dup_node = dup_node->parent;
3930 if (!node)
3931 return dup_root;
3932 }
3933 node = node->right;
3934 p_new = &dup_node->right;
3935 }
3936 }
3937}
Note: See TracBrowser for help on using the repository browser.