source: trunk/src/grep/lib/regcomp.c@ 3531

Last change on this file since 3531 was 3529, checked in by bird, 4 years ago

Imported grep 3.7 from grep-3.7.tar.gz (sha256: c22b0cf2d4f6bbe599c902387e8058990e1eee99aef333a203829e5fd3dbb342), applying minimal auto-props.

  • Property svn:eol-style set to native
File size: 111.8 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30#ifdef RE_ENABLE_I18N
31static void free_charset (re_charset_t *cset);
32#endif /* RE_ENABLE_I18N */
33static void free_workarea_compile (regex_t *preg);
34static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35#ifdef RE_ENABLE_I18N
36static void optimize_utf8 (re_dfa_t *dfa);
37#endif
38static reg_errcode_t analyze (regex_t *preg);
39static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92#ifdef RE_ENABLE_I18N
93static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103#else /* not RE_ENABLE_I18N */
104static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110#endif /* not RE_ENABLE_I18N */
111static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123static void free_token (re_token_t *node);
124static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
126
127
128/* This table gives an error message for each of the error codes listed
129 in regex.h. Obviously the order here has to be same as there.
130 POSIX doesn't require that we do anything for REG_NOERROR,
131 but why not be nice? */
132
133static const char __re_error_msgid[] =
134 {
135#define REG_NOERROR_IDX 0
136 gettext_noop ("Success") /* REG_NOERROR */
137 "\0"
138#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
139 gettext_noop ("No match") /* REG_NOMATCH */
140 "\0"
141#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
142 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
143 "\0"
144#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
145 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
146 "\0"
147#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
148 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
149 "\0"
150#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
151 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
152 "\0"
153#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
154 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
155 "\0"
156#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
157 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
158 "\0"
159#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
160 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
161 "\0"
162#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
163 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
164 "\0"
165#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
166 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
167 "\0"
168#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
169 gettext_noop ("Invalid range end") /* REG_ERANGE */
170 "\0"
171#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
172 gettext_noop ("Memory exhausted") /* REG_ESPACE */
173 "\0"
174#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
175 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
176 "\0"
177#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
178 gettext_noop ("Premature end of regular expression") /* REG_EEND */
179 "\0"
180#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
181 gettext_noop ("Regular expression too big") /* REG_ESIZE */
182 "\0"
183#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
184 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
185 };
186
187static const size_t __re_error_msgid_idx[] =
188 {
189 REG_NOERROR_IDX,
190 REG_NOMATCH_IDX,
191 REG_BADPAT_IDX,
192 REG_ECOLLATE_IDX,
193 REG_ECTYPE_IDX,
194 REG_EESCAPE_IDX,
195 REG_ESUBREG_IDX,
196 REG_EBRACK_IDX,
197 REG_EPAREN_IDX,
198 REG_EBRACE_IDX,
199 REG_BADBR_IDX,
200 REG_ERANGE_IDX,
201 REG_ESPACE_IDX,
202 REG_BADRPT_IDX,
203 REG_EEND_IDX,
204 REG_ESIZE_IDX,
205 REG_ERPAREN_IDX
206 };
207
208
209/* Entry points for GNU code. */
210
211/* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
214
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
217
218const char *
219re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
221{
222 reg_errcode_t ret;
223
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
228
229 /* Match anchors at newline. */
230 bufp->newline_anchor = 1;
231
232 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
233
234 if (!ret)
235 return NULL;
236 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
237}
238weak_alias (__re_compile_pattern, re_compile_pattern)
239
240/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243/* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245reg_syntax_t re_syntax_options;
246
247
248/* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
251
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
254
255reg_syntax_t
256re_set_syntax (reg_syntax_t syntax)
257{
258 reg_syntax_t ret = re_syntax_options;
259
260 re_syntax_options = syntax;
261 return ret;
262}
263weak_alias (__re_set_syntax, re_set_syntax)
264
265int
266re_compile_fastmap (struct re_pattern_buffer *bufp)
267{
268 re_dfa_t *dfa = bufp->buffer;
269 char *fastmap = bufp->fastmap;
270
271 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
272 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
273 if (dfa->init_state != dfa->init_state_word)
274 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
275 if (dfa->init_state != dfa->init_state_nl)
276 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
277 if (dfa->init_state != dfa->init_state_begbuf)
278 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
279 bufp->fastmap_accurate = 1;
280 return 0;
281}
282weak_alias (__re_compile_fastmap, re_compile_fastmap)
283
284static inline void
285__attribute__ ((always_inline))
286re_set_fastmap (char *fastmap, bool icase, int ch)
287{
288 fastmap[ch] = 1;
289 if (icase)
290 fastmap[tolower (ch)] = 1;
291}
292
293/* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
295
296static void
297re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298 char *fastmap)
299{
300 re_dfa_t *dfa = bufp->buffer;
301 Idx node_cnt;
302 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304 {
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
307
308 if (type == CHARACTER)
309 {
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311#ifdef RE_ENABLE_I18N
312 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
313 {
314 unsigned char buf[MB_LEN_MAX];
315 unsigned char *p;
316 wchar_t wc;
317 mbstate_t state;
318
319 p = buf;
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, '\0', sizeof (state));
326 if (__mbrtowc (&wc, (const char *) buf, p - buf,
327 &state) == p - buf
328 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
329 != (size_t) -1))
330 re_set_fastmap (fastmap, false, buf[0]);
331 }
332#endif
333 }
334 else if (type == SIMPLE_BRACKET)
335 {
336 int i, ch;
337 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 {
339 int j;
340 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
341 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
342 if (w & ((bitset_word_t) 1 << j))
343 re_set_fastmap (fastmap, icase, ch);
344 }
345 }
346#ifdef RE_ENABLE_I18N
347 else if (type == COMPLEX_BRACKET)
348 {
349 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
350 Idx i;
351
352# ifdef _LIBC
353 /* See if we have to try all bytes which start multiple collation
354 elements.
355 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
356 collation element, and don't catch 'b' since 'b' is
357 the only collation element which starts from 'b' (and
358 it is caught by SIMPLE_BRACKET). */
359 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
360 && (cset->ncoll_syms || cset->nranges))
361 {
362 const int32_t *table = (const int32_t *)
363 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
364 for (i = 0; i < SBC_MAX; ++i)
365 if (table[i] < 0)
366 re_set_fastmap (fastmap, icase, i);
367 }
368# endif /* _LIBC */
369
370 /* See if we have to start the match at all multibyte characters,
371 i.e. where we would not find an invalid sequence. This only
372 applies to multibyte character sets; for single byte character
373 sets, the SIMPLE_BRACKET again suffices. */
374 if (dfa->mb_cur_max > 1
375 && (cset->nchar_classes || cset->non_match || cset->nranges
376# ifdef _LIBC
377 || cset->nequiv_classes
378# endif /* _LIBC */
379 ))
380 {
381 unsigned char c = 0;
382 do
383 {
384 mbstate_t mbs;
385 memset (&mbs, 0, sizeof (mbs));
386 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
387 re_set_fastmap (fastmap, false, (int) c);
388 }
389 while (++c != 0);
390 }
391
392 else
393 {
394 /* ... Else catch all bytes which can start the mbchars. */
395 for (i = 0; i < cset->nmbchars; ++i)
396 {
397 char buf[256];
398 mbstate_t state;
399 memset (&state, '\0', sizeof (state));
400 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
401 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
402 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
403 {
404 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
405 != (size_t) -1)
406 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
407 }
408 }
409 }
410 }
411#endif /* RE_ENABLE_I18N */
412 else if (type == OP_PERIOD
413#ifdef RE_ENABLE_I18N
414 || type == OP_UTF8_PERIOD
415#endif /* RE_ENABLE_I18N */
416 || type == END_OF_RE)
417 {
418 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419 if (type == END_OF_RE)
420 bufp->can_be_null = 1;
421 return;
422 }
423 }
424}
425
426
427/* Entry point for POSIX code. */
428/* regcomp takes a regular expression as a string and compiles it.
429
430 PREG is a regex_t *. We do not expect any fields to be initialized,
431 since POSIX says we shouldn't. Thus, we set
432
433 'buffer' to the compiled pattern;
434 'used' to the length of the compiled pattern;
435 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436 REG_EXTENDED bit in CFLAGS is set; otherwise, to
437 RE_SYNTAX_POSIX_BASIC;
438 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
439 'fastmap' to an allocated space for the fastmap;
440 'fastmap_accurate' to zero;
441 're_nsub' to the number of subexpressions in PATTERN.
442
443 PATTERN is the address of the pattern string.
444
445 CFLAGS is a series of bits which affect compilation.
446
447 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448 use POSIX basic syntax.
449
450 If REG_NEWLINE is set, then . and [^...] don't match newline.
451 Also, regexec will try a match beginning after every newline.
452
453 If REG_ICASE is set, then we considers upper- and lowercase
454 versions of letters to be equivalent when matching.
455
456 If REG_NOSUB is set, then when PREG is passed to regexec, that
457 routine will report only success or failure, and nothing about the
458 registers.
459
460 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461 the return codes and their meanings.) */
462
463int
464regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
465{
466 reg_errcode_t ret;
467 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
468 : RE_SYNTAX_POSIX_BASIC);
469
470 preg->buffer = NULL;
471 preg->allocated = 0;
472 preg->used = 0;
473
474 /* Try to allocate space for the fastmap. */
475 preg->fastmap = re_malloc (char, SBC_MAX);
476 if (__glibc_unlikely (preg->fastmap == NULL))
477 return REG_ESPACE;
478
479 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
480
481 /* If REG_NEWLINE is set, newlines are treated differently. */
482 if (cflags & REG_NEWLINE)
483 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
484 syntax &= ~RE_DOT_NEWLINE;
485 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
486 /* It also changes the matching behavior. */
487 preg->newline_anchor = 1;
488 }
489 else
490 preg->newline_anchor = 0;
491 preg->no_sub = !!(cflags & REG_NOSUB);
492 preg->translate = NULL;
493
494 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
495
496 /* POSIX doesn't distinguish between an unmatched open-group and an
497 unmatched close-group: both are REG_EPAREN. */
498 if (ret == REG_ERPAREN)
499 ret = REG_EPAREN;
500
501 /* We have already checked preg->fastmap != NULL. */
502 if (__glibc_likely (ret == REG_NOERROR))
503 /* Compute the fastmap now, since regexec cannot modify the pattern
504 buffer. This function never fails in this implementation. */
505 (void) re_compile_fastmap (preg);
506 else
507 {
508 /* Some error occurred while compiling the expression. */
509 re_free (preg->fastmap);
510 preg->fastmap = NULL;
511 }
512
513 return (int) ret;
514}
515libc_hidden_def (__regcomp)
516weak_alias (__regcomp, regcomp)
517
518/* Returns a message corresponding to an error code, ERRCODE, returned
519 from either regcomp or regexec. We don't use PREG here. */
520
521size_t
522regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
523 size_t errbuf_size)
524{
525 const char *msg;
526 size_t msg_size;
527 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
528
529 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
530 /* Only error codes returned by the rest of the code should be passed
531 to this routine. If we are given anything else, or if other regex
532 code generates an invalid error code, then the program has a bug.
533 Dump core so we can fix it. */
534 abort ();
535
536 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
537
538 msg_size = strlen (msg) + 1; /* Includes the null. */
539
540 if (__glibc_likely (errbuf_size != 0))
541 {
542 size_t cpy_size = msg_size;
543 if (__glibc_unlikely (msg_size > errbuf_size))
544 {
545 cpy_size = errbuf_size - 1;
546 errbuf[cpy_size] = '\0';
547 }
548 memcpy (errbuf, msg, cpy_size);
549 }
550
551 return msg_size;
552}
553weak_alias (__regerror, regerror)
554
555
556#ifdef RE_ENABLE_I18N
557/* This static array is used for the map to single-byte characters when
558 UTF-8 is used. Otherwise we would allocate memory just to initialize
559 it the same all the time. UTF-8 is the preferred encoding so this is
560 a worthwhile optimization. */
561static const bitset_t utf8_sb_map =
562{
563 /* Set the first 128 bits. */
564# if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
565 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
566# else
567# if 4 * BITSET_WORD_BITS < ASCII_CHARS
568# error "bitset_word_t is narrower than 32 bits"
569# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
570 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
571# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
572 BITSET_WORD_MAX, BITSET_WORD_MAX,
573# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
574 BITSET_WORD_MAX,
575# endif
576 (BITSET_WORD_MAX
577 >> (SBC_MAX % BITSET_WORD_BITS == 0
578 ? 0
579 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
580# endif
581};
582#endif
583
584
585static void
586free_dfa_content (re_dfa_t *dfa)
587{
588 Idx i, j;
589
590 if (dfa->nodes)
591 for (i = 0; i < dfa->nodes_len; ++i)
592 free_token (dfa->nodes + i);
593 re_free (dfa->nexts);
594 for (i = 0; i < dfa->nodes_len; ++i)
595 {
596 if (dfa->eclosures != NULL)
597 re_node_set_free (dfa->eclosures + i);
598 if (dfa->inveclosures != NULL)
599 re_node_set_free (dfa->inveclosures + i);
600 if (dfa->edests != NULL)
601 re_node_set_free (dfa->edests + i);
602 }
603 re_free (dfa->edests);
604 re_free (dfa->eclosures);
605 re_free (dfa->inveclosures);
606 re_free (dfa->nodes);
607
608 if (dfa->state_table)
609 for (i = 0; i <= dfa->state_hash_mask; ++i)
610 {
611 struct re_state_table_entry *entry = dfa->state_table + i;
612 for (j = 0; j < entry->num; ++j)
613 {
614 re_dfastate_t *state = entry->array[j];
615 free_state (state);
616 }
617 re_free (entry->array);
618 }
619 re_free (dfa->state_table);
620#ifdef RE_ENABLE_I18N
621 if (dfa->sb_char != utf8_sb_map)
622 re_free (dfa->sb_char);
623#endif
624 re_free (dfa->subexp_map);
625#ifdef DEBUG
626 re_free (dfa->re_str);
627#endif
628
629 re_free (dfa);
630}
631
632
633/* Free dynamically allocated space used by PREG. */
634
635void
636regfree (regex_t *preg)
637{
638 re_dfa_t *dfa = preg->buffer;
639 if (__glibc_likely (dfa != NULL))
640 {
641 lock_fini (dfa->lock);
642 free_dfa_content (dfa);
643 }
644 preg->buffer = NULL;
645 preg->allocated = 0;
646
647 re_free (preg->fastmap);
648 preg->fastmap = NULL;
649
650 re_free (preg->translate);
651 preg->translate = NULL;
652}
653libc_hidden_def (__regfree)
654weak_alias (__regfree, regfree)
655
656
657/* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
659
660#if defined _REGEX_RE_COMP || defined _LIBC
661
662/* BSD has one and only one pattern buffer. */
663static struct re_pattern_buffer re_comp_buf;
664
665char *
666# ifdef _LIBC
667/* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
670weak_function
671# endif
672re_comp (const char *s)
673{
674 reg_errcode_t ret;
675 char *fastmap;
676
677 if (!s)
678 {
679 if (!re_comp_buf.buffer)
680 return gettext ("No previous regular expression");
681 return 0;
682 }
683
684 if (re_comp_buf.buffer)
685 {
686 fastmap = re_comp_buf.fastmap;
687 re_comp_buf.fastmap = NULL;
688 __regfree (&re_comp_buf);
689 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
690 re_comp_buf.fastmap = fastmap;
691 }
692
693 if (re_comp_buf.fastmap == NULL)
694 {
695 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
696 if (re_comp_buf.fastmap == NULL)
697 return (char *) gettext (__re_error_msgid
698 + __re_error_msgid_idx[(int) REG_ESPACE]);
699 }
700
701 /* Since 're_exec' always passes NULL for the 'regs' argument, we
702 don't need to initialize the pattern buffer fields which affect it. */
703
704 /* Match anchors at newlines. */
705 re_comp_buf.newline_anchor = 1;
706
707 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708
709 if (!ret)
710 return NULL;
711
712 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
713 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
714}
715
716#ifdef _LIBC
717libc_freeres_fn (free_mem)
718{
719 __regfree (&re_comp_buf);
720}
721#endif
722
723#endif /* _REGEX_RE_COMP */
724
725
726/* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
729
730static reg_errcode_t
731re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732 reg_syntax_t syntax)
733{
734 reg_errcode_t err = REG_NOERROR;
735 re_dfa_t *dfa;
736 re_string_t regexp;
737
738 /* Initialize the pattern buffer. */
739 preg->fastmap_accurate = 0;
740 preg->syntax = syntax;
741 preg->not_bol = preg->not_eol = 0;
742 preg->used = 0;
743 preg->re_nsub = 0;
744 preg->can_be_null = 0;
745 preg->regs_allocated = REGS_UNALLOCATED;
746
747 /* Initialize the dfa. */
748 dfa = preg->buffer;
749 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
750 {
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756 if (dfa == NULL)
757 return REG_ESPACE;
758 preg->allocated = sizeof (re_dfa_t);
759 preg->buffer = dfa;
760 }
761 preg->used = sizeof (re_dfa_t);
762
763 err = init_dfa (dfa, length);
764 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
765 err = REG_ESPACE;
766 if (__glibc_unlikely (err != REG_NOERROR))
767 {
768 free_dfa_content (dfa);
769 preg->buffer = NULL;
770 preg->allocated = 0;
771 return err;
772 }
773#ifdef DEBUG
774 /* Note: length+1 will not overflow since it is checked in init_dfa. */
775 dfa->re_str = re_malloc (char, length + 1);
776 strncpy (dfa->re_str, pattern, length + 1);
777#endif
778
779 err = re_string_construct (&regexp, pattern, length, preg->translate,
780 (syntax & RE_ICASE) != 0, dfa);
781 if (__glibc_unlikely (err != REG_NOERROR))
782 {
783 re_compile_internal_free_return:
784 free_workarea_compile (preg);
785 re_string_destruct (&regexp);
786 lock_fini (dfa->lock);
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
790 return err;
791 }
792
793 /* Parse the regular expression, and build a structure tree. */
794 preg->re_nsub = 0;
795 dfa->str_tree = parse (&regexp, preg, syntax, &err);
796 if (__glibc_unlikely (dfa->str_tree == NULL))
797 goto re_compile_internal_free_return;
798
799 /* Analyze the tree and create the nfa. */
800 err = analyze (preg);
801 if (__glibc_unlikely (err != REG_NOERROR))
802 goto re_compile_internal_free_return;
803
804#ifdef RE_ENABLE_I18N
805 /* If possible, do searching in single byte encoding to speed things up. */
806 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
807 optimize_utf8 (dfa);
808#endif
809
810 /* Then create the initial state of the dfa. */
811 err = create_initial_state (dfa);
812
813 /* Release work areas. */
814 free_workarea_compile (preg);
815 re_string_destruct (&regexp);
816
817 if (__glibc_unlikely (err != REG_NOERROR))
818 {
819 lock_fini (dfa->lock);
820 free_dfa_content (dfa);
821 preg->buffer = NULL;
822 preg->allocated = 0;
823 }
824
825 return err;
826}
827
828/* Initialize DFA. We use the length of the regular expression PAT_LEN
829 as the initial length of some arrays. */
830
831static reg_errcode_t
832init_dfa (re_dfa_t *dfa, size_t pat_len)
833{
834 __re_size_t table_size;
835#ifndef _LIBC
836 const char *codeset_name;
837#endif
838#ifdef RE_ENABLE_I18N
839 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
840#else
841 size_t max_i18n_object_size = 0;
842#endif
843 size_t max_object_size =
844 MAX (sizeof (struct re_state_table_entry),
845 MAX (sizeof (re_token_t),
846 MAX (sizeof (re_node_set),
847 MAX (sizeof (regmatch_t),
848 max_i18n_object_size))));
849
850 memset (dfa, '\0', sizeof (re_dfa_t));
851
852 /* Force allocation of str_tree_storage the first time. */
853 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854
855 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
856 calculation below, and for similar doubling calculations
857 elsewhere. And it's <= rather than <, because some of the
858 doubling calculations add 1 afterwards. */
859 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
860 <= pat_len))
861 return REG_ESPACE;
862
863 dfa->nodes_alloc = pat_len + 1;
864 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
865
866 /* table_size = 2 ^ ceil(log pat_len) */
867 for (table_size = 1; ; table_size <<= 1)
868 if (table_size > pat_len)
869 break;
870
871 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872 dfa->state_hash_mask = table_size - 1;
873
874 dfa->mb_cur_max = MB_CUR_MAX;
875#ifdef _LIBC
876 if (dfa->mb_cur_max == 6
877 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878 dfa->is_utf8 = 1;
879 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 != 0);
881#else
882 codeset_name = nl_langinfo (CODESET);
883 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
884 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
885 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
886 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
887 dfa->is_utf8 = 1;
888
889 /* We check exhaustively in the loop below if this charset is a
890 superset of ASCII. */
891 dfa->map_notascii = 0;
892#endif
893
894#ifdef RE_ENABLE_I18N
895 if (dfa->mb_cur_max > 1)
896 {
897 if (dfa->is_utf8)
898 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
899 else
900 {
901 int i, j, ch;
902
903 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
904 if (__glibc_unlikely (dfa->sb_char == NULL))
905 return REG_ESPACE;
906
907 /* Set the bits corresponding to single byte chars. */
908 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
909 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
910 {
911 wint_t wch = __btowc (ch);
912 if (wch != WEOF)
913 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
914# ifndef _LIBC
915 if (isascii (ch) && wch != ch)
916 dfa->map_notascii = 1;
917# endif
918 }
919 }
920 }
921#endif
922
923 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
924 return REG_ESPACE;
925 return REG_NOERROR;
926}
927
928/* Initialize WORD_CHAR table, which indicate which character is
929 "word". In this case "word" means that it is the word construction
930 character used by some operators like "\<", "\>", etc. */
931
932static void
933init_word_char (re_dfa_t *dfa)
934{
935 int i = 0;
936 int j;
937 int ch = 0;
938 dfa->word_ops_used = 1;
939 if (__glibc_likely (dfa->map_notascii == 0))
940 {
941 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
942 them, an issue when this code is used in Gnulib. */
943 bitset_word_t bits0 = 0x00000000;
944 bitset_word_t bits1 = 0x03ff0000;
945 bitset_word_t bits2 = 0x87fffffe;
946 bitset_word_t bits3 = 0x07fffffe;
947 if (BITSET_WORD_BITS == 64)
948 {
949 /* Pacify gcc -Woverflow on 32-bit platformns. */
950 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
951 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
952 i = 2;
953 }
954 else if (BITSET_WORD_BITS == 32)
955 {
956 dfa->word_char[0] = bits0;
957 dfa->word_char[1] = bits1;
958 dfa->word_char[2] = bits2;
959 dfa->word_char[3] = bits3;
960 i = 4;
961 }
962 else
963 goto general_case;
964 ch = 128;
965
966 if (__glibc_likely (dfa->is_utf8))
967 {
968 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
969 return;
970 }
971 }
972
973 general_case:
974 for (; i < BITSET_WORDS; ++i)
975 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
976 if (isalnum (ch) || ch == '_')
977 dfa->word_char[i] |= (bitset_word_t) 1 << j;
978}
979
980/* Free the work area which are only used while compiling. */
981
982static void
983free_workarea_compile (regex_t *preg)
984{
985 re_dfa_t *dfa = preg->buffer;
986 bin_tree_storage_t *storage, *next;
987 for (storage = dfa->str_tree_storage; storage; storage = next)
988 {
989 next = storage->next;
990 re_free (storage);
991 }
992 dfa->str_tree_storage = NULL;
993 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
994 dfa->str_tree = NULL;
995 re_free (dfa->org_indices);
996 dfa->org_indices = NULL;
997}
998
999/* Create initial states for all contexts. */
1000
1001static reg_errcode_t
1002create_initial_state (re_dfa_t *dfa)
1003{
1004 Idx first, i;
1005 reg_errcode_t err;
1006 re_node_set init_nodes;
1007
1008 /* Initial states have the epsilon closure of the node which is
1009 the first node of the regular expression. */
1010 first = dfa->str_tree->first->node_idx;
1011 dfa->init_node = first;
1012 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013 if (__glibc_unlikely (err != REG_NOERROR))
1014 return err;
1015
1016 /* The back-references which are in initial states can epsilon transit,
1017 since in this case all of the subexpressions can be null.
1018 Then we add epsilon closures of the nodes which are the next nodes of
1019 the back-references. */
1020 if (dfa->nbackref > 0)
1021 for (i = 0; i < init_nodes.nelem; ++i)
1022 {
1023 Idx node_idx = init_nodes.elems[i];
1024 re_token_type_t type = dfa->nodes[node_idx].type;
1025
1026 Idx clexp_idx;
1027 if (type != OP_BACK_REF)
1028 continue;
1029 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1030 {
1031 re_token_t *clexp_node;
1032 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033 if (clexp_node->type == OP_CLOSE_SUBEXP
1034 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035 break;
1036 }
1037 if (clexp_idx == init_nodes.nelem)
1038 continue;
1039
1040 if (type == OP_BACK_REF)
1041 {
1042 Idx dest_idx = dfa->edests[node_idx].elems[0];
1043 if (!re_node_set_contains (&init_nodes, dest_idx))
1044 {
1045 reg_errcode_t merge_err
1046 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1047 if (merge_err != REG_NOERROR)
1048 return merge_err;
1049 i = 0;
1050 }
1051 }
1052 }
1053
1054 /* It must be the first time to invoke acquire_state. */
1055 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1056 /* We don't check ERR here, since the initial state must not be NULL. */
1057 if (__glibc_unlikely (dfa->init_state == NULL))
1058 return err;
1059 if (dfa->init_state->has_constraint)
1060 {
1061 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1062 CONTEXT_WORD);
1063 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1064 CONTEXT_NEWLINE);
1065 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1066 &init_nodes,
1067 CONTEXT_NEWLINE
1068 | CONTEXT_BEGBUF);
1069 if (__glibc_unlikely (dfa->init_state_word == NULL
1070 || dfa->init_state_nl == NULL
1071 || dfa->init_state_begbuf == NULL))
1072 return err;
1073 }
1074 else
1075 dfa->init_state_word = dfa->init_state_nl
1076 = dfa->init_state_begbuf = dfa->init_state;
1077
1078 re_node_set_free (&init_nodes);
1079 return REG_NOERROR;
1080}
1081
1082
1083#ifdef RE_ENABLE_I18N
1084/* If it is possible to do searching in single byte encoding instead of UTF-8
1085 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086 DFA nodes where needed. */
1087
1088static void
1089optimize_utf8 (re_dfa_t *dfa)
1090{
1091 Idx node;
1092 int i;
1093 bool mb_chars = false;
1094 bool has_period = false;
1095
1096 for (node = 0; node < dfa->nodes_len; ++node)
1097 switch (dfa->nodes[node].type)
1098 {
1099 case CHARACTER:
1100 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1101 mb_chars = true;
1102 break;
1103 case ANCHOR:
1104 switch (dfa->nodes[node].opr.ctx_type)
1105 {
1106 case LINE_FIRST:
1107 case LINE_LAST:
1108 case BUF_FIRST:
1109 case BUF_LAST:
1110 break;
1111 default:
1112 /* Word anchors etc. cannot be handled. It's okay to test
1113 opr.ctx_type since constraints (for all DFA nodes) are
1114 created by ORing one or more opr.ctx_type values. */
1115 return;
1116 }
1117 break;
1118 case OP_PERIOD:
1119 has_period = true;
1120 break;
1121 case OP_BACK_REF:
1122 case OP_ALT:
1123 case END_OF_RE:
1124 case OP_DUP_ASTERISK:
1125 case OP_OPEN_SUBEXP:
1126 case OP_CLOSE_SUBEXP:
1127 break;
1128 case COMPLEX_BRACKET:
1129 return;
1130 case SIMPLE_BRACKET:
1131 /* Just double check. */
1132 {
1133 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1134 ? 0
1135 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1136 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1137 {
1138 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1139 return;
1140 rshift = 0;
1141 }
1142 }
1143 break;
1144 default:
1145 abort ();
1146 }
1147
1148 if (mb_chars || has_period)
1149 for (node = 0; node < dfa->nodes_len; ++node)
1150 {
1151 if (dfa->nodes[node].type == CHARACTER
1152 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1153 dfa->nodes[node].mb_partial = 0;
1154 else if (dfa->nodes[node].type == OP_PERIOD)
1155 dfa->nodes[node].type = OP_UTF8_PERIOD;
1156 }
1157
1158 /* The search can be in single byte locale. */
1159 dfa->mb_cur_max = 1;
1160 dfa->is_utf8 = 0;
1161 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1162}
1163#endif
1164
1165
1166/* Analyze the structure tree, and calculate "first", "next", "edest",
1167 "eclosure", and "inveclosure". */
1168
1169static reg_errcode_t
1170analyze (regex_t *preg)
1171{
1172 re_dfa_t *dfa = preg->buffer;
1173 reg_errcode_t ret;
1174
1175 /* Allocate arrays. */
1176 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1177 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1178 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1179 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1180 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1181 || dfa->edests == NULL || dfa->eclosures == NULL))
1182 return REG_ESPACE;
1183
1184 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1185 if (dfa->subexp_map != NULL)
1186 {
1187 Idx i;
1188 for (i = 0; i < preg->re_nsub; i++)
1189 dfa->subexp_map[i] = i;
1190 preorder (dfa->str_tree, optimize_subexps, dfa);
1191 for (i = 0; i < preg->re_nsub; i++)
1192 if (dfa->subexp_map[i] != i)
1193 break;
1194 if (i == preg->re_nsub)
1195 {
1196 re_free (dfa->subexp_map);
1197 dfa->subexp_map = NULL;
1198 }
1199 }
1200
1201 ret = postorder (dfa->str_tree, lower_subexps, preg);
1202 if (__glibc_unlikely (ret != REG_NOERROR))
1203 return ret;
1204 ret = postorder (dfa->str_tree, calc_first, dfa);
1205 if (__glibc_unlikely (ret != REG_NOERROR))
1206 return ret;
1207 preorder (dfa->str_tree, calc_next, dfa);
1208 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1209 if (__glibc_unlikely (ret != REG_NOERROR))
1210 return ret;
1211 ret = calc_eclosure (dfa);
1212 if (__glibc_unlikely (ret != REG_NOERROR))
1213 return ret;
1214
1215 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1216 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1217 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1218 || dfa->nbackref)
1219 {
1220 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1221 if (__glibc_unlikely (dfa->inveclosures == NULL))
1222 return REG_ESPACE;
1223 ret = calc_inveclosure (dfa);
1224 }
1225
1226 return ret;
1227}
1228
1229/* Our parse trees are very unbalanced, so we cannot use a stack to
1230 implement parse tree visits. Instead, we use parent pointers and
1231 some hairy code in these two functions. */
1232static reg_errcode_t
1233postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234 void *extra)
1235{
1236 bin_tree_t *node, *prev;
1237
1238 for (node = root; ; )
1239 {
1240 /* Descend down the tree, preferably to the left (or to the right
1241 if that's the only child). */
1242 while (node->left || node->right)
1243 if (node->left)
1244 node = node->left;
1245 else
1246 node = node->right;
1247
1248 do
1249 {
1250 reg_errcode_t err = fn (extra, node);
1251 if (__glibc_unlikely (err != REG_NOERROR))
1252 return err;
1253 if (node->parent == NULL)
1254 return REG_NOERROR;
1255 prev = node;
1256 node = node->parent;
1257 }
1258 /* Go up while we have a node that is reached from the right. */
1259 while (node->right == prev || node->right == NULL);
1260 node = node->right;
1261 }
1262}
1263
1264static reg_errcode_t
1265preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1266 void *extra)
1267{
1268 bin_tree_t *node;
1269
1270 for (node = root; ; )
1271 {
1272 reg_errcode_t err = fn (extra, node);
1273 if (__glibc_unlikely (err != REG_NOERROR))
1274 return err;
1275
1276 /* Go to the left node, or up and to the right. */
1277 if (node->left)
1278 node = node->left;
1279 else
1280 {
1281 bin_tree_t *prev = NULL;
1282 while (node->right == prev || node->right == NULL)
1283 {
1284 prev = node;
1285 node = node->parent;
1286 if (!node)
1287 return REG_NOERROR;
1288 }
1289 node = node->right;
1290 }
1291 }
1292}
1293
1294/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1296 backreferences as well. Requires a preorder visit. */
1297static reg_errcode_t
1298optimize_subexps (void *extra, bin_tree_t *node)
1299{
1300 re_dfa_t *dfa = (re_dfa_t *) extra;
1301
1302 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1303 {
1304 int idx = node->token.opr.idx;
1305 node->token.opr.idx = dfa->subexp_map[idx];
1306 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1307 }
1308
1309 else if (node->token.type == SUBEXP
1310 && node->left && node->left->token.type == SUBEXP)
1311 {
1312 Idx other_idx = node->left->token.opr.idx;
1313
1314 node->left = node->left->left;
1315 if (node->left)
1316 node->left->parent = node;
1317
1318 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1319 if (other_idx < BITSET_WORD_BITS)
1320 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1321 }
1322
1323 return REG_NOERROR;
1324}
1325
1326/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1328static reg_errcode_t
1329lower_subexps (void *extra, bin_tree_t *node)
1330{
1331 regex_t *preg = (regex_t *) extra;
1332 reg_errcode_t err = REG_NOERROR;
1333
1334 if (node->left && node->left->token.type == SUBEXP)
1335 {
1336 node->left = lower_subexp (&err, preg, node->left);
1337 if (node->left)
1338 node->left->parent = node;
1339 }
1340 if (node->right && node->right->token.type == SUBEXP)
1341 {
1342 node->right = lower_subexp (&err, preg, node->right);
1343 if (node->right)
1344 node->right->parent = node;
1345 }
1346
1347 return err;
1348}
1349
1350static bin_tree_t *
1351lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1352{
1353 re_dfa_t *dfa = preg->buffer;
1354 bin_tree_t *body = node->left;
1355 bin_tree_t *op, *cls, *tree1, *tree;
1356
1357 if (preg->no_sub
1358 /* We do not optimize empty subexpressions, because otherwise we may
1359 have bad CONCAT nodes with NULL children. This is obviously not
1360 very common, so we do not lose much. An example that triggers
1361 this case is the sed "script" /\(\)/x. */
1362 && node->left != NULL
1363 && (node->token.opr.idx >= BITSET_WORD_BITS
1364 || !(dfa->used_bkref_map
1365 & ((bitset_word_t) 1 << node->token.opr.idx))))
1366 return node->left;
1367
1368 /* Convert the SUBEXP node to the concatenation of an
1369 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1370 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1371 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1372 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1373 tree = create_tree (dfa, op, tree1, CONCAT);
1374 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1375 || op == NULL || cls == NULL))
1376 {
1377 *err = REG_ESPACE;
1378 return NULL;
1379 }
1380
1381 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1382 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1383 return tree;
1384}
1385
1386/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1387 nodes. Requires a postorder visit. */
1388static reg_errcode_t
1389calc_first (void *extra, bin_tree_t *node)
1390{
1391 re_dfa_t *dfa = (re_dfa_t *) extra;
1392 if (node->token.type == CONCAT)
1393 {
1394 node->first = node->left->first;
1395 node->node_idx = node->left->node_idx;
1396 }
1397 else
1398 {
1399 node->first = node;
1400 node->node_idx = re_dfa_add_node (dfa, node->token);
1401 if (__glibc_unlikely (node->node_idx == -1))
1402 return REG_ESPACE;
1403 if (node->token.type == ANCHOR)
1404 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1405 }
1406 return REG_NOERROR;
1407}
1408
1409/* Pass 2: compute NEXT on the tree. Preorder visit. */
1410static reg_errcode_t
1411calc_next (void *extra, bin_tree_t *node)
1412{
1413 switch (node->token.type)
1414 {
1415 case OP_DUP_ASTERISK:
1416 node->left->next = node;
1417 break;
1418 case CONCAT:
1419 node->left->next = node->right->first;
1420 node->right->next = node->next;
1421 break;
1422 default:
1423 if (node->left)
1424 node->left->next = node->next;
1425 if (node->right)
1426 node->right->next = node->next;
1427 break;
1428 }
1429 return REG_NOERROR;
1430}
1431
1432/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1433static reg_errcode_t
1434link_nfa_nodes (void *extra, bin_tree_t *node)
1435{
1436 re_dfa_t *dfa = (re_dfa_t *) extra;
1437 Idx idx = node->node_idx;
1438 reg_errcode_t err = REG_NOERROR;
1439
1440 switch (node->token.type)
1441 {
1442 case CONCAT:
1443 break;
1444
1445 case END_OF_RE:
1446 DEBUG_ASSERT (node->next == NULL);
1447 break;
1448
1449 case OP_DUP_ASTERISK:
1450 case OP_ALT:
1451 {
1452 Idx left, right;
1453 dfa->has_plural_match = 1;
1454 if (node->left != NULL)
1455 left = node->left->first->node_idx;
1456 else
1457 left = node->next->node_idx;
1458 if (node->right != NULL)
1459 right = node->right->first->node_idx;
1460 else
1461 right = node->next->node_idx;
1462 DEBUG_ASSERT (left > -1);
1463 DEBUG_ASSERT (right > -1);
1464 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1465 }
1466 break;
1467
1468 case ANCHOR:
1469 case OP_OPEN_SUBEXP:
1470 case OP_CLOSE_SUBEXP:
1471 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1472 break;
1473
1474 case OP_BACK_REF:
1475 dfa->nexts[idx] = node->next->node_idx;
1476 if (node->token.type == OP_BACK_REF)
1477 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1478 break;
1479
1480 default:
1481 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1482 dfa->nexts[idx] = node->next->node_idx;
1483 break;
1484 }
1485
1486 return err;
1487}
1488
1489/* Duplicate the epsilon closure of the node ROOT_NODE.
1490 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1491 to their own constraint. */
1492
1493static reg_errcode_t
1494duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1495 Idx root_node, unsigned int init_constraint)
1496{
1497 Idx org_node, clone_node;
1498 bool ok;
1499 unsigned int constraint = init_constraint;
1500 for (org_node = top_org_node, clone_node = top_clone_node;;)
1501 {
1502 Idx org_dest, clone_dest;
1503 if (dfa->nodes[org_node].type == OP_BACK_REF)
1504 {
1505 /* If the back reference epsilon-transit, its destination must
1506 also have the constraint. Then duplicate the epsilon closure
1507 of the destination of the back reference, and store it in
1508 edests of the back reference. */
1509 org_dest = dfa->nexts[org_node];
1510 re_node_set_empty (dfa->edests + clone_node);
1511 clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 if (__glibc_unlikely (clone_dest == -1))
1513 return REG_ESPACE;
1514 dfa->nexts[clone_node] = dfa->nexts[org_node];
1515 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 if (__glibc_unlikely (! ok))
1517 return REG_ESPACE;
1518 }
1519 else if (dfa->edests[org_node].nelem == 0)
1520 {
1521 /* In case of the node can't epsilon-transit, don't duplicate the
1522 destination and store the original destination as the
1523 destination of the node. */
1524 dfa->nexts[clone_node] = dfa->nexts[org_node];
1525 break;
1526 }
1527 else if (dfa->edests[org_node].nelem == 1)
1528 {
1529 /* In case of the node can epsilon-transit, and it has only one
1530 destination. */
1531 org_dest = dfa->edests[org_node].elems[0];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 /* If the node is root_node itself, it means the epsilon closure
1534 has a loop. Then tie it to the destination of the root_node. */
1535 if (org_node == root_node && clone_node != org_node)
1536 {
1537 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1538 if (__glibc_unlikely (! ok))
1539 return REG_ESPACE;
1540 break;
1541 }
1542 /* In case the node has another constraint, append it. */
1543 constraint |= dfa->nodes[org_node].constraint;
1544 clone_dest = duplicate_node (dfa, org_dest, constraint);
1545 if (__glibc_unlikely (clone_dest == -1))
1546 return REG_ESPACE;
1547 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (__glibc_unlikely (! ok))
1549 return REG_ESPACE;
1550 }
1551 else /* dfa->edests[org_node].nelem == 2 */
1552 {
1553 /* In case of the node can epsilon-transit, and it has two
1554 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1555 org_dest = dfa->edests[org_node].elems[0];
1556 re_node_set_empty (dfa->edests + clone_node);
1557 /* Search for a duplicated node which satisfies the constraint. */
1558 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1559 if (clone_dest == -1)
1560 {
1561 /* There is no such duplicated node, create a new one. */
1562 reg_errcode_t err;
1563 clone_dest = duplicate_node (dfa, org_dest, constraint);
1564 if (__glibc_unlikely (clone_dest == -1))
1565 return REG_ESPACE;
1566 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 if (__glibc_unlikely (! ok))
1568 return REG_ESPACE;
1569 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1570 root_node, constraint);
1571 if (__glibc_unlikely (err != REG_NOERROR))
1572 return err;
1573 }
1574 else
1575 {
1576 /* There is a duplicated node which satisfies the constraint,
1577 use it to avoid infinite loop. */
1578 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579 if (__glibc_unlikely (! ok))
1580 return REG_ESPACE;
1581 }
1582
1583 org_dest = dfa->edests[org_node].elems[1];
1584 clone_dest = duplicate_node (dfa, org_dest, constraint);
1585 if (__glibc_unlikely (clone_dest == -1))
1586 return REG_ESPACE;
1587 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1588 if (__glibc_unlikely (! ok))
1589 return REG_ESPACE;
1590 }
1591 org_node = org_dest;
1592 clone_node = clone_dest;
1593 }
1594 return REG_NOERROR;
1595}
1596
1597/* Search for a node which is duplicated from the node ORG_NODE, and
1598 satisfies the constraint CONSTRAINT. */
1599
1600static Idx
1601search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1602 unsigned int constraint)
1603{
1604 Idx idx;
1605 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1606 {
1607 if (org_node == dfa->org_indices[idx]
1608 && constraint == dfa->nodes[idx].constraint)
1609 return idx; /* Found. */
1610 }
1611 return -1; /* Not found. */
1612}
1613
1614/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615 Return the index of the new node, or -1 if insufficient storage is
1616 available. */
1617
1618static Idx
1619duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1620{
1621 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1622 if (__glibc_likely (dup_idx != -1))
1623 {
1624 dfa->nodes[dup_idx].constraint = constraint;
1625 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1626 dfa->nodes[dup_idx].duplicated = 1;
1627
1628 /* Store the index of the original node. */
1629 dfa->org_indices[dup_idx] = org_idx;
1630 }
1631 return dup_idx;
1632}
1633
1634static reg_errcode_t
1635calc_inveclosure (re_dfa_t *dfa)
1636{
1637 Idx src, idx;
1638 bool ok;
1639 for (idx = 0; idx < dfa->nodes_len; ++idx)
1640 re_node_set_init_empty (dfa->inveclosures + idx);
1641
1642 for (src = 0; src < dfa->nodes_len; ++src)
1643 {
1644 Idx *elems = dfa->eclosures[src].elems;
1645 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1646 {
1647 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1648 if (__glibc_unlikely (! ok))
1649 return REG_ESPACE;
1650 }
1651 }
1652
1653 return REG_NOERROR;
1654}
1655
1656/* Calculate "eclosure" for all the node in DFA. */
1657
1658static reg_errcode_t
1659calc_eclosure (re_dfa_t *dfa)
1660{
1661 Idx node_idx;
1662 bool incomplete;
1663 DEBUG_ASSERT (dfa->nodes_len > 0);
1664 incomplete = false;
1665 /* For each nodes, calculate epsilon closure. */
1666 for (node_idx = 0; ; ++node_idx)
1667 {
1668 reg_errcode_t err;
1669 re_node_set eclosure_elem;
1670 if (node_idx == dfa->nodes_len)
1671 {
1672 if (!incomplete)
1673 break;
1674 incomplete = false;
1675 node_idx = 0;
1676 }
1677
1678 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1679
1680 /* If we have already calculated, skip it. */
1681 if (dfa->eclosures[node_idx].nelem != 0)
1682 continue;
1683 /* Calculate epsilon closure of 'node_idx'. */
1684 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1685 if (__glibc_unlikely (err != REG_NOERROR))
1686 return err;
1687
1688 if (dfa->eclosures[node_idx].nelem == 0)
1689 {
1690 incomplete = true;
1691 re_node_set_free (&eclosure_elem);
1692 }
1693 }
1694 return REG_NOERROR;
1695}
1696
1697/* Calculate epsilon closure of NODE. */
1698
1699static reg_errcode_t
1700calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1701{
1702 reg_errcode_t err;
1703 Idx i;
1704 re_node_set eclosure;
1705 bool incomplete = false;
1706 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1707 if (__glibc_unlikely (err != REG_NOERROR))
1708 return err;
1709
1710 /* An epsilon closure includes itself. */
1711 eclosure.elems[eclosure.nelem++] = node;
1712
1713 /* This indicates that we are calculating this node now.
1714 We reference this value to avoid infinite loop. */
1715 dfa->eclosures[node].nelem = -1;
1716
1717 /* If the current node has constraints, duplicate all nodes
1718 since they must inherit the constraints. */
1719 if (dfa->nodes[node].constraint
1720 && dfa->edests[node].nelem
1721 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1722 {
1723 err = duplicate_node_closure (dfa, node, node, node,
1724 dfa->nodes[node].constraint);
1725 if (__glibc_unlikely (err != REG_NOERROR))
1726 return err;
1727 }
1728
1729 /* Expand each epsilon destination nodes. */
1730 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1731 for (i = 0; i < dfa->edests[node].nelem; ++i)
1732 {
1733 re_node_set eclosure_elem;
1734 Idx edest = dfa->edests[node].elems[i];
1735 /* If calculating the epsilon closure of 'edest' is in progress,
1736 return intermediate result. */
1737 if (dfa->eclosures[edest].nelem == -1)
1738 {
1739 incomplete = true;
1740 continue;
1741 }
1742 /* If we haven't calculated the epsilon closure of 'edest' yet,
1743 calculate now. Otherwise use calculated epsilon closure. */
1744 if (dfa->eclosures[edest].nelem == 0)
1745 {
1746 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1747 if (__glibc_unlikely (err != REG_NOERROR))
1748 return err;
1749 }
1750 else
1751 eclosure_elem = dfa->eclosures[edest];
1752 /* Merge the epsilon closure of 'edest'. */
1753 err = re_node_set_merge (&eclosure, &eclosure_elem);
1754 if (__glibc_unlikely (err != REG_NOERROR))
1755 return err;
1756 /* If the epsilon closure of 'edest' is incomplete,
1757 the epsilon closure of this node is also incomplete. */
1758 if (dfa->eclosures[edest].nelem == 0)
1759 {
1760 incomplete = true;
1761 re_node_set_free (&eclosure_elem);
1762 }
1763 }
1764
1765 if (incomplete && !root)
1766 dfa->eclosures[node].nelem = 0;
1767 else
1768 dfa->eclosures[node] = eclosure;
1769 *new_set = eclosure;
1770 return REG_NOERROR;
1771}
1772
1773
1774/* Functions for token which are used in the parser. */
1775
1776/* Fetch a token from INPUT.
1777 We must not use this function inside bracket expressions. */
1778
1779static void
1780fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1781{
1782 re_string_skip_bytes (input, peek_token (result, input, syntax));
1783}
1784
1785/* Peek a token from INPUT, and return the length of the token.
1786 We must not use this function inside bracket expressions. */
1787
1788static int
1789peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1790{
1791 unsigned char c;
1792
1793 if (re_string_eoi (input))
1794 {
1795 token->type = END_OF_RE;
1796 return 0;
1797 }
1798
1799 c = re_string_peek_byte (input, 0);
1800 token->opr.c = c;
1801
1802 token->word_char = 0;
1803#ifdef RE_ENABLE_I18N
1804 token->mb_partial = 0;
1805 if (input->mb_cur_max > 1
1806 && !re_string_first_byte (input, re_string_cur_idx (input)))
1807 {
1808 token->type = CHARACTER;
1809 token->mb_partial = 1;
1810 return 1;
1811 }
1812#endif
1813 if (c == '\\')
1814 {
1815 unsigned char c2;
1816 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1817 {
1818 token->type = BACK_SLASH;
1819 return 1;
1820 }
1821
1822 c2 = re_string_peek_byte_case (input, 1);
1823 token->opr.c = c2;
1824 token->type = CHARACTER;
1825#ifdef RE_ENABLE_I18N
1826 if (input->mb_cur_max > 1)
1827 {
1828 wint_t wc = re_string_wchar_at (input,
1829 re_string_cur_idx (input) + 1);
1830 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1831 }
1832 else
1833#endif
1834 token->word_char = IS_WORD_CHAR (c2) != 0;
1835
1836 switch (c2)
1837 {
1838 case '|':
1839 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1840 token->type = OP_ALT;
1841 break;
1842 case '1': case '2': case '3': case '4': case '5':
1843 case '6': case '7': case '8': case '9':
1844 if (!(syntax & RE_NO_BK_REFS))
1845 {
1846 token->type = OP_BACK_REF;
1847 token->opr.idx = c2 - '1';
1848 }
1849 break;
1850 case '<':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 {
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_FIRST;
1855 }
1856 break;
1857 case '>':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = WORD_LAST;
1862 }
1863 break;
1864 case 'b':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 {
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = WORD_DELIM;
1869 }
1870 break;
1871 case 'B':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 {
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = NOT_WORD_DELIM;
1876 }
1877 break;
1878 case 'w':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_WORD;
1881 break;
1882 case 'W':
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = OP_NOTWORD;
1885 break;
1886 case 's':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = OP_SPACE;
1889 break;
1890 case 'S':
1891 if (!(syntax & RE_NO_GNU_OPS))
1892 token->type = OP_NOTSPACE;
1893 break;
1894 case '`':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 {
1897 token->type = ANCHOR;
1898 token->opr.ctx_type = BUF_FIRST;
1899 }
1900 break;
1901 case '\'':
1902 if (!(syntax & RE_NO_GNU_OPS))
1903 {
1904 token->type = ANCHOR;
1905 token->opr.ctx_type = BUF_LAST;
1906 }
1907 break;
1908 case '(':
1909 if (!(syntax & RE_NO_BK_PARENS))
1910 token->type = OP_OPEN_SUBEXP;
1911 break;
1912 case ')':
1913 if (!(syntax & RE_NO_BK_PARENS))
1914 token->type = OP_CLOSE_SUBEXP;
1915 break;
1916 case '+':
1917 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918 token->type = OP_DUP_PLUS;
1919 break;
1920 case '?':
1921 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1922 token->type = OP_DUP_QUESTION;
1923 break;
1924 case '{':
1925 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926 token->type = OP_OPEN_DUP_NUM;
1927 break;
1928 case '}':
1929 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1930 token->type = OP_CLOSE_DUP_NUM;
1931 break;
1932 default:
1933 break;
1934 }
1935 return 2;
1936 }
1937
1938 token->type = CHARACTER;
1939#ifdef RE_ENABLE_I18N
1940 if (input->mb_cur_max > 1)
1941 {
1942 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1943 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1944 }
1945 else
1946#endif
1947 token->word_char = IS_WORD_CHAR (token->opr.c);
1948
1949 switch (c)
1950 {
1951 case '\n':
1952 if (syntax & RE_NEWLINE_ALT)
1953 token->type = OP_ALT;
1954 break;
1955 case '|':
1956 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1957 token->type = OP_ALT;
1958 break;
1959 case '*':
1960 token->type = OP_DUP_ASTERISK;
1961 break;
1962 case '+':
1963 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964 token->type = OP_DUP_PLUS;
1965 break;
1966 case '?':
1967 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1968 token->type = OP_DUP_QUESTION;
1969 break;
1970 case '{':
1971 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972 token->type = OP_OPEN_DUP_NUM;
1973 break;
1974 case '}':
1975 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1976 token->type = OP_CLOSE_DUP_NUM;
1977 break;
1978 case '(':
1979 if (syntax & RE_NO_BK_PARENS)
1980 token->type = OP_OPEN_SUBEXP;
1981 break;
1982 case ')':
1983 if (syntax & RE_NO_BK_PARENS)
1984 token->type = OP_CLOSE_SUBEXP;
1985 break;
1986 case '[':
1987 token->type = OP_OPEN_BRACKET;
1988 break;
1989 case '.':
1990 token->type = OP_PERIOD;
1991 break;
1992 case '^':
1993 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1994 && re_string_cur_idx (input) != 0)
1995 {
1996 char prev = re_string_peek_byte (input, -1);
1997 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1998 break;
1999 }
2000 token->type = ANCHOR;
2001 token->opr.ctx_type = LINE_FIRST;
2002 break;
2003 case '$':
2004 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2005 && re_string_cur_idx (input) + 1 != re_string_length (input))
2006 {
2007 re_token_t next;
2008 re_string_skip_bytes (input, 1);
2009 peek_token (&next, input, syntax);
2010 re_string_skip_bytes (input, -1);
2011 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2012 break;
2013 }
2014 token->type = ANCHOR;
2015 token->opr.ctx_type = LINE_LAST;
2016 break;
2017 default:
2018 break;
2019 }
2020 return 1;
2021}
2022
2023/* Peek a token from INPUT, and return the length of the token.
2024 We must not use this function out of bracket expressions. */
2025
2026static int
2027peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2028{
2029 unsigned char c;
2030 if (re_string_eoi (input))
2031 {
2032 token->type = END_OF_RE;
2033 return 0;
2034 }
2035 c = re_string_peek_byte (input, 0);
2036 token->opr.c = c;
2037
2038#ifdef RE_ENABLE_I18N
2039 if (input->mb_cur_max > 1
2040 && !re_string_first_byte (input, re_string_cur_idx (input)))
2041 {
2042 token->type = CHARACTER;
2043 return 1;
2044 }
2045#endif /* RE_ENABLE_I18N */
2046
2047 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2048 && re_string_cur_idx (input) + 1 < re_string_length (input))
2049 {
2050 /* In this case, '\' escape a character. */
2051 unsigned char c2;
2052 re_string_skip_bytes (input, 1);
2053 c2 = re_string_peek_byte (input, 0);
2054 token->opr.c = c2;
2055 token->type = CHARACTER;
2056 return 1;
2057 }
2058 if (c == '[') /* '[' is a special char in a bracket exps. */
2059 {
2060 unsigned char c2;
2061 int token_len;
2062 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2063 c2 = re_string_peek_byte (input, 1);
2064 else
2065 c2 = 0;
2066 token->opr.c = c2;
2067 token_len = 2;
2068 switch (c2)
2069 {
2070 case '.':
2071 token->type = OP_OPEN_COLL_ELEM;
2072 break;
2073
2074 case '=':
2075 token->type = OP_OPEN_EQUIV_CLASS;
2076 break;
2077
2078 case ':':
2079 if (syntax & RE_CHAR_CLASSES)
2080 {
2081 token->type = OP_OPEN_CHAR_CLASS;
2082 break;
2083 }
2084 FALLTHROUGH;
2085 default:
2086 token->type = CHARACTER;
2087 token->opr.c = c;
2088 token_len = 1;
2089 break;
2090 }
2091 return token_len;
2092 }
2093 switch (c)
2094 {
2095 case '-':
2096 token->type = OP_CHARSET_RANGE;
2097 break;
2098 case ']':
2099 token->type = OP_CLOSE_BRACKET;
2100 break;
2101 case '^':
2102 token->type = OP_NON_MATCH_LIST;
2103 break;
2104 default:
2105 token->type = CHARACTER;
2106 }
2107 return 1;
2108}
2109
2110
2111/* Functions for parser. */
2112
2113/* Entry point of the parser.
2114 Parse the regular expression REGEXP and return the structure tree.
2115 If an error occurs, ERR is set by error code, and return NULL.
2116 This function build the following tree, from regular expression <reg_exp>:
2117 CAT
2118 / \
2119 / \
2120 <reg_exp> EOR
2121
2122 CAT means concatenation.
2123 EOR means end of regular expression. */
2124
2125static bin_tree_t *
2126parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2127 reg_errcode_t *err)
2128{
2129 re_dfa_t *dfa = preg->buffer;
2130 bin_tree_t *tree, *eor, *root;
2131 re_token_t current_token;
2132 dfa->syntax = syntax;
2133 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2134 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2135 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2136 return NULL;
2137 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2138 if (tree != NULL)
2139 root = create_tree (dfa, tree, eor, CONCAT);
2140 else
2141 root = eor;
2142 if (__glibc_unlikely (eor == NULL || root == NULL))
2143 {
2144 *err = REG_ESPACE;
2145 return NULL;
2146 }
2147 return root;
2148}
2149
2150/* This function build the following tree, from regular expression
2151 <branch1>|<branch2>:
2152 ALT
2153 / \
2154 / \
2155 <branch1> <branch2>
2156
2157 ALT means alternative, which represents the operator '|'. */
2158
2159static bin_tree_t *
2160parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2161 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2162{
2163 re_dfa_t *dfa = preg->buffer;
2164 bin_tree_t *tree, *branch = NULL;
2165 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2166 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2167 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2168 return NULL;
2169
2170 while (token->type == OP_ALT)
2171 {
2172 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2173 if (token->type != OP_ALT && token->type != END_OF_RE
2174 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2175 {
2176 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2177 dfa->completed_bkref_map = initial_bkref_map;
2178 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2179 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2180 {
2181 if (tree != NULL)
2182 postorder (tree, free_tree, NULL);
2183 return NULL;
2184 }
2185 dfa->completed_bkref_map |= accumulated_bkref_map;
2186 }
2187 else
2188 branch = NULL;
2189 tree = create_tree (dfa, tree, branch, OP_ALT);
2190 if (__glibc_unlikely (tree == NULL))
2191 {
2192 *err = REG_ESPACE;
2193 return NULL;
2194 }
2195 }
2196 return tree;
2197}
2198
2199/* This function build the following tree, from regular expression
2200 <exp1><exp2>:
2201 CAT
2202 / \
2203 / \
2204 <exp1> <exp2>
2205
2206 CAT means concatenation. */
2207
2208static bin_tree_t *
2209parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2210 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2211{
2212 bin_tree_t *tree, *expr;
2213 re_dfa_t *dfa = preg->buffer;
2214 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2215 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2216 return NULL;
2217
2218 while (token->type != OP_ALT && token->type != END_OF_RE
2219 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2220 {
2221 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2222 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2223 {
2224 if (tree != NULL)
2225 postorder (tree, free_tree, NULL);
2226 return NULL;
2227 }
2228 if (tree != NULL && expr != NULL)
2229 {
2230 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2231 if (newtree == NULL)
2232 {
2233 postorder (expr, free_tree, NULL);
2234 postorder (tree, free_tree, NULL);
2235 *err = REG_ESPACE;
2236 return NULL;
2237 }
2238 tree = newtree;
2239 }
2240 else if (tree == NULL)
2241 tree = expr;
2242 /* Otherwise expr == NULL, we don't need to create new tree. */
2243 }
2244 return tree;
2245}
2246
2247/* This function build the following tree, from regular expression a*:
2248 *
2249 |
2250 a
2251*/
2252
2253static bin_tree_t *
2254parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2255 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2256{
2257 re_dfa_t *dfa = preg->buffer;
2258 bin_tree_t *tree;
2259 switch (token->type)
2260 {
2261 case CHARACTER:
2262 tree = create_token_tree (dfa, NULL, NULL, token);
2263 if (__glibc_unlikely (tree == NULL))
2264 {
2265 *err = REG_ESPACE;
2266 return NULL;
2267 }
2268#ifdef RE_ENABLE_I18N
2269 if (dfa->mb_cur_max > 1)
2270 {
2271 while (!re_string_eoi (regexp)
2272 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2273 {
2274 bin_tree_t *mbc_remain;
2275 fetch_token (token, regexp, syntax);
2276 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2277 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2278 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2279 {
2280 *err = REG_ESPACE;
2281 return NULL;
2282 }
2283 }
2284 }
2285#endif
2286 break;
2287
2288 case OP_OPEN_SUBEXP:
2289 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2291 return NULL;
2292 break;
2293
2294 case OP_OPEN_BRACKET:
2295 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2296 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2297 return NULL;
2298 break;
2299
2300 case OP_BACK_REF:
2301 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2302 {
2303 *err = REG_ESUBREG;
2304 return NULL;
2305 }
2306 dfa->used_bkref_map |= 1 << token->opr.idx;
2307 tree = create_token_tree (dfa, NULL, NULL, token);
2308 if (__glibc_unlikely (tree == NULL))
2309 {
2310 *err = REG_ESPACE;
2311 return NULL;
2312 }
2313 ++dfa->nbackref;
2314 dfa->has_mb_node = 1;
2315 break;
2316
2317 case OP_OPEN_DUP_NUM:
2318 if (syntax & RE_CONTEXT_INVALID_DUP)
2319 {
2320 *err = REG_BADRPT;
2321 return NULL;
2322 }
2323 FALLTHROUGH;
2324 case OP_DUP_ASTERISK:
2325 case OP_DUP_PLUS:
2326 case OP_DUP_QUESTION:
2327 if (syntax & RE_CONTEXT_INVALID_OPS)
2328 {
2329 *err = REG_BADRPT;
2330 return NULL;
2331 }
2332 else if (syntax & RE_CONTEXT_INDEP_OPS)
2333 {
2334 fetch_token (token, regexp, syntax);
2335 return parse_expression (regexp, preg, token, syntax, nest, err);
2336 }
2337 FALLTHROUGH;
2338 case OP_CLOSE_SUBEXP:
2339 if ((token->type == OP_CLOSE_SUBEXP)
2340 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2341 {
2342 *err = REG_ERPAREN;
2343 return NULL;
2344 }
2345 FALLTHROUGH;
2346 case OP_CLOSE_DUP_NUM:
2347 /* We treat it as a normal character. */
2348
2349 /* Then we can these characters as normal characters. */
2350 token->type = CHARACTER;
2351 /* mb_partial and word_char bits should be initialized already
2352 by peek_token. */
2353 tree = create_token_tree (dfa, NULL, NULL, token);
2354 if (__glibc_unlikely (tree == NULL))
2355 {
2356 *err = REG_ESPACE;
2357 return NULL;
2358 }
2359 break;
2360
2361 case ANCHOR:
2362 if ((token->opr.ctx_type
2363 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 && dfa->word_ops_used == 0)
2365 init_word_char (dfa);
2366 if (token->opr.ctx_type == WORD_DELIM
2367 || token->opr.ctx_type == NOT_WORD_DELIM)
2368 {
2369 bin_tree_t *tree_first, *tree_last;
2370 if (token->opr.ctx_type == WORD_DELIM)
2371 {
2372 token->opr.ctx_type = WORD_FIRST;
2373 tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 token->opr.ctx_type = WORD_LAST;
2375 }
2376 else
2377 {
2378 token->opr.ctx_type = INSIDE_WORD;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = INSIDE_NOTWORD;
2381 }
2382 tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2385 || tree == NULL))
2386 {
2387 *err = REG_ESPACE;
2388 return NULL;
2389 }
2390 }
2391 else
2392 {
2393 tree = create_token_tree (dfa, NULL, NULL, token);
2394 if (__glibc_unlikely (tree == NULL))
2395 {
2396 *err = REG_ESPACE;
2397 return NULL;
2398 }
2399 }
2400 /* We must return here, since ANCHORs can't be followed
2401 by repetition operators.
2402 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2403 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2404 fetch_token (token, regexp, syntax);
2405 return tree;
2406
2407 case OP_PERIOD:
2408 tree = create_token_tree (dfa, NULL, NULL, token);
2409 if (__glibc_unlikely (tree == NULL))
2410 {
2411 *err = REG_ESPACE;
2412 return NULL;
2413 }
2414 if (dfa->mb_cur_max > 1)
2415 dfa->has_mb_node = 1;
2416 break;
2417
2418 case OP_WORD:
2419 case OP_NOTWORD:
2420 tree = build_charclass_op (dfa, regexp->trans,
2421 "alnum",
2422 "_",
2423 token->type == OP_NOTWORD, err);
2424 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2425 return NULL;
2426 break;
2427
2428 case OP_SPACE:
2429 case OP_NOTSPACE:
2430 tree = build_charclass_op (dfa, regexp->trans,
2431 "space",
2432 "",
2433 token->type == OP_NOTSPACE, err);
2434 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2435 return NULL;
2436 break;
2437
2438 case OP_ALT:
2439 case END_OF_RE:
2440 return NULL;
2441
2442 case BACK_SLASH:
2443 *err = REG_EESCAPE;
2444 return NULL;
2445
2446 default:
2447 /* Must not happen? */
2448 DEBUG_ASSERT (false);
2449 return NULL;
2450 }
2451 fetch_token (token, regexp, syntax);
2452
2453 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2454 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2455 {
2456 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2457 syntax, err);
2458 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2459 {
2460 if (tree != NULL)
2461 postorder (tree, free_tree, NULL);
2462 return NULL;
2463 }
2464 tree = dup_tree;
2465 /* In BRE consecutive duplications are not allowed. */
2466 if ((syntax & RE_CONTEXT_INVALID_DUP)
2467 && (token->type == OP_DUP_ASTERISK
2468 || token->type == OP_OPEN_DUP_NUM))
2469 {
2470 if (tree != NULL)
2471 postorder (tree, free_tree, NULL);
2472 *err = REG_BADRPT;
2473 return NULL;
2474 }
2475 }
2476
2477 return tree;
2478}
2479
2480/* This function build the following tree, from regular expression
2481 (<reg_exp>):
2482 SUBEXP
2483 |
2484 <reg_exp>
2485*/
2486
2487static bin_tree_t *
2488parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2489 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2490{
2491 re_dfa_t *dfa = preg->buffer;
2492 bin_tree_t *tree;
2493 size_t cur_nsub;
2494 cur_nsub = preg->re_nsub++;
2495
2496 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2497
2498 /* The subexpression may be a null string. */
2499 if (token->type == OP_CLOSE_SUBEXP)
2500 tree = NULL;
2501 else
2502 {
2503 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2504 if (__glibc_unlikely (*err == REG_NOERROR
2505 && token->type != OP_CLOSE_SUBEXP))
2506 {
2507 if (tree != NULL)
2508 postorder (tree, free_tree, NULL);
2509 *err = REG_EPAREN;
2510 }
2511 if (__glibc_unlikely (*err != REG_NOERROR))
2512 return NULL;
2513 }
2514
2515 if (cur_nsub <= '9' - '1')
2516 dfa->completed_bkref_map |= 1 << cur_nsub;
2517
2518 tree = create_tree (dfa, tree, NULL, SUBEXP);
2519 if (__glibc_unlikely (tree == NULL))
2520 {
2521 *err = REG_ESPACE;
2522 return NULL;
2523 }
2524 tree->token.opr.idx = cur_nsub;
2525 return tree;
2526}
2527
2528/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2529
2530static bin_tree_t *
2531parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2532 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2533{
2534 bin_tree_t *tree = NULL, *old_tree = NULL;
2535 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2536 re_token_t start_token = *token;
2537
2538 if (token->type == OP_OPEN_DUP_NUM)
2539 {
2540 end = 0;
2541 start = fetch_number (regexp, token, syntax);
2542 if (start == -1)
2543 {
2544 if (token->type == CHARACTER && token->opr.c == ',')
2545 start = 0; /* We treat "{,m}" as "{0,m}". */
2546 else
2547 {
2548 *err = REG_BADBR; /* <re>{} is invalid. */
2549 return NULL;
2550 }
2551 }
2552 if (__glibc_likely (start != -2))
2553 {
2554 /* We treat "{n}" as "{n,n}". */
2555 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2556 : ((token->type == CHARACTER && token->opr.c == ',')
2557 ? fetch_number (regexp, token, syntax) : -2));
2558 }
2559 if (__glibc_unlikely (start == -2 || end == -2))
2560 {
2561 /* Invalid sequence. */
2562 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2563 {
2564 if (token->type == END_OF_RE)
2565 *err = REG_EBRACE;
2566 else
2567 *err = REG_BADBR;
2568
2569 return NULL;
2570 }
2571
2572 /* If the syntax bit is set, rollback. */
2573 re_string_set_index (regexp, start_idx);
2574 *token = start_token;
2575 token->type = CHARACTER;
2576 /* mb_partial and word_char bits should be already initialized by
2577 peek_token. */
2578 return elem;
2579 }
2580
2581 if (__glibc_unlikely ((end != -1 && start > end)
2582 || token->type != OP_CLOSE_DUP_NUM))
2583 {
2584 /* First number greater than second. */
2585 *err = REG_BADBR;
2586 return NULL;
2587 }
2588
2589 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2590 {
2591 *err = REG_ESIZE;
2592 return NULL;
2593 }
2594 }
2595 else
2596 {
2597 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2598 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2599 }
2600
2601 fetch_token (token, regexp, syntax);
2602
2603 if (__glibc_unlikely (elem == NULL))
2604 return NULL;
2605 if (__glibc_unlikely (start == 0 && end == 0))
2606 {
2607 postorder (elem, free_tree, NULL);
2608 return NULL;
2609 }
2610
2611 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2612 if (__glibc_unlikely (start > 0))
2613 {
2614 tree = elem;
2615 for (i = 2; i <= start; ++i)
2616 {
2617 elem = duplicate_tree (elem, dfa);
2618 tree = create_tree (dfa, tree, elem, CONCAT);
2619 if (__glibc_unlikely (elem == NULL || tree == NULL))
2620 goto parse_dup_op_espace;
2621 }
2622
2623 if (start == end)
2624 return tree;
2625
2626 /* Duplicate ELEM before it is marked optional. */
2627 elem = duplicate_tree (elem, dfa);
2628 if (__glibc_unlikely (elem == NULL))
2629 goto parse_dup_op_espace;
2630 old_tree = tree;
2631 }
2632 else
2633 old_tree = NULL;
2634
2635 if (elem->token.type == SUBEXP)
2636 {
2637 uintptr_t subidx = elem->token.opr.idx;
2638 postorder (elem, mark_opt_subexp, (void *) subidx);
2639 }
2640
2641 tree = create_tree (dfa, elem, NULL,
2642 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2643 if (__glibc_unlikely (tree == NULL))
2644 goto parse_dup_op_espace;
2645
2646 /* This loop is actually executed only when end != -1,
2647 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2648 already created the start+1-th copy. */
2649 if (TYPE_SIGNED (Idx) || end != -1)
2650 for (i = start + 2; i <= end; ++i)
2651 {
2652 elem = duplicate_tree (elem, dfa);
2653 tree = create_tree (dfa, tree, elem, CONCAT);
2654 if (__glibc_unlikely (elem == NULL || tree == NULL))
2655 goto parse_dup_op_espace;
2656
2657 tree = create_tree (dfa, tree, NULL, OP_ALT);
2658 if (__glibc_unlikely (tree == NULL))
2659 goto parse_dup_op_espace;
2660 }
2661
2662 if (old_tree)
2663 tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665 return tree;
2666
2667 parse_dup_op_espace:
2668 *err = REG_ESPACE;
2669 return NULL;
2670}
2671
2672/* Size of the names for collating symbol/equivalence_class/character_class.
2673 I'm not sure, but maybe enough. */
2674#define BRACKET_NAME_BUF_SIZE 32
2675
2676#ifndef _LIBC
2677
2678# ifdef RE_ENABLE_I18N
2679/* Convert the byte B to the corresponding wide character. In a
2680 unibyte locale, treat B as itself. In a multibyte locale, return
2681 WEOF if B is an encoding error. */
2682static wint_t
2683parse_byte (unsigned char b, re_charset_t *mbcset)
2684{
2685 return mbcset == NULL ? b : __btowc (b);
2686}
2687# endif
2688
2689 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2690 Build the range expression which starts from START_ELEM, and ends
2691 at END_ELEM. The result are written to MBCSET and SBCSET.
2692 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2693 mbcset->range_ends, is a pointer argument since we may
2694 update it. */
2695
2696static reg_errcode_t
2697# ifdef RE_ENABLE_I18N
2698build_range_exp (const reg_syntax_t syntax,
2699 bitset_t sbcset,
2700 re_charset_t *mbcset,
2701 Idx *range_alloc,
2702 const bracket_elem_t *start_elem,
2703 const bracket_elem_t *end_elem)
2704# else /* not RE_ENABLE_I18N */
2705build_range_exp (const reg_syntax_t syntax,
2706 bitset_t sbcset,
2707 const bracket_elem_t *start_elem,
2708 const bracket_elem_t *end_elem)
2709# endif /* not RE_ENABLE_I18N */
2710{
2711 unsigned int start_ch, end_ch;
2712 /* Equivalence Classes and Character Classes can't be a range start/end. */
2713 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2714 || start_elem->type == CHAR_CLASS
2715 || end_elem->type == EQUIV_CLASS
2716 || end_elem->type == CHAR_CLASS))
2717 return REG_ERANGE;
2718
2719 /* We can handle no multi character collating elements without libc
2720 support. */
2721 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2722 && strlen ((char *) start_elem->opr.name) > 1)
2723 || (end_elem->type == COLL_SYM
2724 && strlen ((char *) end_elem->opr.name) > 1)))
2725 return REG_ECOLLATE;
2726
2727# ifdef RE_ENABLE_I18N
2728 {
2729 wchar_t wc;
2730 wint_t start_wc;
2731 wint_t end_wc;
2732
2733 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2734 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2735 : 0));
2736 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2737 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2738 : 0));
2739 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2740 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2741 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2742 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2743 if (start_wc == WEOF || end_wc == WEOF)
2744 return REG_ECOLLATE;
2745 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2746 && start_wc > end_wc))
2747 return REG_ERANGE;
2748
2749 /* Got valid collation sequence values, add them as a new entry.
2750 However, for !_LIBC we have no collation elements: if the
2751 character set is single byte, the single byte character set
2752 that we build below suffices. parse_bracket_exp passes
2753 no MBCSET if dfa->mb_cur_max == 1. */
2754 if (mbcset)
2755 {
2756 /* Check the space of the arrays. */
2757 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2758 {
2759 /* There is not enough space, need realloc. */
2760 wchar_t *new_array_start, *new_array_end;
2761 Idx new_nranges;
2762
2763 /* +1 in case of mbcset->nranges is 0. */
2764 new_nranges = 2 * mbcset->nranges + 1;
2765 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2766 are NULL if *range_alloc == 0. */
2767 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2768 new_nranges);
2769 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2770 new_nranges);
2771
2772 if (__glibc_unlikely (new_array_start == NULL
2773 || new_array_end == NULL))
2774 {
2775 re_free (new_array_start);
2776 re_free (new_array_end);
2777 return REG_ESPACE;
2778 }
2779
2780 mbcset->range_starts = new_array_start;
2781 mbcset->range_ends = new_array_end;
2782 *range_alloc = new_nranges;
2783 }
2784
2785 mbcset->range_starts[mbcset->nranges] = start_wc;
2786 mbcset->range_ends[mbcset->nranges++] = end_wc;
2787 }
2788
2789 /* Build the table for single byte characters. */
2790 for (wc = 0; wc < SBC_MAX; ++wc)
2791 {
2792 if (start_wc <= wc && wc <= end_wc)
2793 bitset_set (sbcset, wc);
2794 }
2795 }
2796# else /* not RE_ENABLE_I18N */
2797 {
2798 unsigned int ch;
2799 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2800 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2801 : 0));
2802 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2803 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2804 : 0));
2805 if (start_ch > end_ch)
2806 return REG_ERANGE;
2807 /* Build the table for single byte characters. */
2808 for (ch = 0; ch < SBC_MAX; ++ch)
2809 if (start_ch <= ch && ch <= end_ch)
2810 bitset_set (sbcset, ch);
2811 }
2812# endif /* not RE_ENABLE_I18N */
2813 return REG_NOERROR;
2814}
2815#endif /* not _LIBC */
2816
2817#ifndef _LIBC
2818/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2819 Build the collating element which is represented by NAME.
2820 The result are written to MBCSET and SBCSET.
2821 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2822 pointer argument since we may update it. */
2823
2824static reg_errcode_t
2825# ifdef RE_ENABLE_I18N
2826build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2827 Idx *coll_sym_alloc, const unsigned char *name)
2828# else /* not RE_ENABLE_I18N */
2829build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2830# endif /* not RE_ENABLE_I18N */
2831{
2832 size_t name_len = strlen ((const char *) name);
2833 if (__glibc_unlikely (name_len != 1))
2834 return REG_ECOLLATE;
2835 else
2836 {
2837 bitset_set (sbcset, name[0]);
2838 return REG_NOERROR;
2839 }
2840}
2841#endif /* not _LIBC */
2842
2843/* This function parse bracket expression like "[abc]", "[a-c]",
2844 "[[.a-a.]]" etc. */
2845
2846static bin_tree_t *
2847parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2848 reg_syntax_t syntax, reg_errcode_t *err)
2849{
2850#ifdef _LIBC
2851 const unsigned char *collseqmb;
2852 const char *collseqwc;
2853 uint32_t nrules;
2854 int32_t table_size;
2855 const int32_t *symb_table;
2856 const unsigned char *extra;
2857
2858 /* Local function for parse_bracket_exp used in _LIBC environment.
2859 Seek the collating symbol entry corresponding to NAME.
2860 Return the index of the symbol in the SYMB_TABLE,
2861 or -1 if not found. */
2862
2863 auto inline int32_t
2864 __attribute__ ((always_inline))
2865 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2866 {
2867 int32_t elem;
2868
2869 for (elem = 0; elem < table_size; elem++)
2870 if (symb_table[2 * elem] != 0)
2871 {
2872 int32_t idx = symb_table[2 * elem + 1];
2873 /* Skip the name of collating element name. */
2874 idx += 1 + extra[idx];
2875 if (/* Compare the length of the name. */
2876 name_len == extra[idx]
2877 /* Compare the name. */
2878 && memcmp (name, &extra[idx + 1], name_len) == 0)
2879 /* Yep, this is the entry. */
2880 return elem;
2881 }
2882 return -1;
2883 }
2884
2885 /* Local function for parse_bracket_exp used in _LIBC environment.
2886 Look up the collation sequence value of BR_ELEM.
2887 Return the value if succeeded, UINT_MAX otherwise. */
2888
2889 auto inline unsigned int
2890 __attribute__ ((always_inline))
2891 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2892 {
2893 if (br_elem->type == SB_CHAR)
2894 {
2895 /*
2896 if (MB_CUR_MAX == 1)
2897 */
2898 if (nrules == 0)
2899 return collseqmb[br_elem->opr.ch];
2900 else
2901 {
2902 wint_t wc = __btowc (br_elem->opr.ch);
2903 return __collseq_table_lookup (collseqwc, wc);
2904 }
2905 }
2906 else if (br_elem->type == MB_CHAR)
2907 {
2908 if (nrules != 0)
2909 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2910 }
2911 else if (br_elem->type == COLL_SYM)
2912 {
2913 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2914 if (nrules != 0)
2915 {
2916 int32_t elem, idx;
2917 elem = seek_collating_symbol_entry (br_elem->opr.name,
2918 sym_name_len);
2919 if (elem != -1)
2920 {
2921 /* We found the entry. */
2922 idx = symb_table[2 * elem + 1];
2923 /* Skip the name of collating element name. */
2924 idx += 1 + extra[idx];
2925 /* Skip the byte sequence of the collating element. */
2926 idx += 1 + extra[idx];
2927 /* Adjust for the alignment. */
2928 idx = (idx + 3) & ~3;
2929 /* Skip the multibyte collation sequence value. */
2930 idx += sizeof (unsigned int);
2931 /* Skip the wide char sequence of the collating element. */
2932 idx += sizeof (unsigned int) *
2933 (1 + *(unsigned int *) (extra + idx));
2934 /* Return the collation sequence value. */
2935 return *(unsigned int *) (extra + idx);
2936 }
2937 else if (sym_name_len == 1)
2938 {
2939 /* No valid character. Match it as a single byte
2940 character. */
2941 return collseqmb[br_elem->opr.name[0]];
2942 }
2943 }
2944 else if (sym_name_len == 1)
2945 return collseqmb[br_elem->opr.name[0]];
2946 }
2947 return UINT_MAX;
2948 }
2949
2950 /* Local function for parse_bracket_exp used in _LIBC environment.
2951 Build the range expression which starts from START_ELEM, and ends
2952 at END_ELEM. The result are written to MBCSET and SBCSET.
2953 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2954 mbcset->range_ends, is a pointer argument since we may
2955 update it. */
2956
2957 auto inline reg_errcode_t
2958 __attribute__ ((always_inline))
2959 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2960 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2961 {
2962 unsigned int ch;
2963 uint32_t start_collseq;
2964 uint32_t end_collseq;
2965
2966 /* Equivalence Classes and Character Classes can't be a range
2967 start/end. */
2968 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2969 || start_elem->type == CHAR_CLASS
2970 || end_elem->type == EQUIV_CLASS
2971 || end_elem->type == CHAR_CLASS))
2972 return REG_ERANGE;
2973
2974 /* FIXME: Implement rational ranges here, too. */
2975 start_collseq = lookup_collation_sequence_value (start_elem);
2976 end_collseq = lookup_collation_sequence_value (end_elem);
2977 /* Check start/end collation sequence values. */
2978 if (__glibc_unlikely (start_collseq == UINT_MAX
2979 || end_collseq == UINT_MAX))
2980 return REG_ECOLLATE;
2981 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2982 && start_collseq > end_collseq))
2983 return REG_ERANGE;
2984
2985 /* Got valid collation sequence values, add them as a new entry.
2986 However, if we have no collation elements, and the character set
2987 is single byte, the single byte character set that we
2988 build below suffices. */
2989 if (nrules > 0 || dfa->mb_cur_max > 1)
2990 {
2991 /* Check the space of the arrays. */
2992 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2993 {
2994 /* There is not enough space, need realloc. */
2995 uint32_t *new_array_start;
2996 uint32_t *new_array_end;
2997 Idx new_nranges;
2998
2999 /* +1 in case of mbcset->nranges is 0. */
3000 new_nranges = 2 * mbcset->nranges + 1;
3001 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3002 new_nranges);
3003 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3004 new_nranges);
3005
3006 if (__glibc_unlikely (new_array_start == NULL
3007 || new_array_end == NULL))
3008 return REG_ESPACE;
3009
3010 mbcset->range_starts = new_array_start;
3011 mbcset->range_ends = new_array_end;
3012 *range_alloc = new_nranges;
3013 }
3014
3015 mbcset->range_starts[mbcset->nranges] = start_collseq;
3016 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3017 }
3018
3019 /* Build the table for single byte characters. */
3020 for (ch = 0; ch < SBC_MAX; ch++)
3021 {
3022 uint32_t ch_collseq;
3023 /*
3024 if (MB_CUR_MAX == 1)
3025 */
3026 if (nrules == 0)
3027 ch_collseq = collseqmb[ch];
3028 else
3029 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3030 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3031 bitset_set (sbcset, ch);
3032 }
3033 return REG_NOERROR;
3034 }
3035
3036 /* Local function for parse_bracket_exp used in _LIBC environment.
3037 Build the collating element which is represented by NAME.
3038 The result are written to MBCSET and SBCSET.
3039 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3040 pointer argument since we may update it. */
3041
3042 auto inline reg_errcode_t
3043 __attribute__ ((always_inline))
3044 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3045 Idx *coll_sym_alloc, const unsigned char *name)
3046 {
3047 int32_t elem, idx;
3048 size_t name_len = strlen ((const char *) name);
3049 if (nrules != 0)
3050 {
3051 elem = seek_collating_symbol_entry (name, name_len);
3052 if (elem != -1)
3053 {
3054 /* We found the entry. */
3055 idx = symb_table[2 * elem + 1];
3056 /* Skip the name of collating element name. */
3057 idx += 1 + extra[idx];
3058 }
3059 else if (name_len == 1)
3060 {
3061 /* No valid character, treat it as a normal
3062 character. */
3063 bitset_set (sbcset, name[0]);
3064 return REG_NOERROR;
3065 }
3066 else
3067 return REG_ECOLLATE;
3068
3069 /* Got valid collation sequence, add it as a new entry. */
3070 /* Check the space of the arrays. */
3071 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3072 {
3073 /* Not enough, realloc it. */
3074 /* +1 in case of mbcset->ncoll_syms is 0. */
3075 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3076 /* Use realloc since mbcset->coll_syms is NULL
3077 if *alloc == 0. */
3078 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3079 new_coll_sym_alloc);
3080 if (__glibc_unlikely (new_coll_syms == NULL))
3081 return REG_ESPACE;
3082 mbcset->coll_syms = new_coll_syms;
3083 *coll_sym_alloc = new_coll_sym_alloc;
3084 }
3085 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3086 return REG_NOERROR;
3087 }
3088 else
3089 {
3090 if (__glibc_unlikely (name_len != 1))
3091 return REG_ECOLLATE;
3092 else
3093 {
3094 bitset_set (sbcset, name[0]);
3095 return REG_NOERROR;
3096 }
3097 }
3098 }
3099#endif
3100
3101 re_token_t br_token;
3102 re_bitset_ptr_t sbcset;
3103#ifdef RE_ENABLE_I18N
3104 re_charset_t *mbcset;
3105 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3106 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3107#endif /* not RE_ENABLE_I18N */
3108 bool non_match = false;
3109 bin_tree_t *work_tree;
3110 int token_len;
3111 bool first_round = true;
3112#ifdef _LIBC
3113 collseqmb = (const unsigned char *)
3114 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3115 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3116 if (nrules)
3117 {
3118 /*
3119 if (MB_CUR_MAX > 1)
3120 */
3121 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3122 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3123 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3124 _NL_COLLATE_SYMB_TABLEMB);
3125 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3126 _NL_COLLATE_SYMB_EXTRAMB);
3127 }
3128#endif
3129 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3130#ifdef RE_ENABLE_I18N
3131 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3132#endif /* RE_ENABLE_I18N */
3133#ifdef RE_ENABLE_I18N
3134 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3135#else
3136 if (__glibc_unlikely (sbcset == NULL))
3137#endif /* RE_ENABLE_I18N */
3138 {
3139 re_free (sbcset);
3140#ifdef RE_ENABLE_I18N
3141 re_free (mbcset);
3142#endif
3143 *err = REG_ESPACE;
3144 return NULL;
3145 }
3146
3147 token_len = peek_token_bracket (token, regexp, syntax);
3148 if (__glibc_unlikely (token->type == END_OF_RE))
3149 {
3150 *err = REG_BADPAT;
3151 goto parse_bracket_exp_free_return;
3152 }
3153 if (token->type == OP_NON_MATCH_LIST)
3154 {
3155#ifdef RE_ENABLE_I18N
3156 mbcset->non_match = 1;
3157#endif /* not RE_ENABLE_I18N */
3158 non_match = true;
3159 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3160 bitset_set (sbcset, '\n');
3161 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3162 token_len = peek_token_bracket (token, regexp, syntax);
3163 if (__glibc_unlikely (token->type == END_OF_RE))
3164 {
3165 *err = REG_BADPAT;
3166 goto parse_bracket_exp_free_return;
3167 }
3168 }
3169
3170 /* We treat the first ']' as a normal character. */
3171 if (token->type == OP_CLOSE_BRACKET)
3172 token->type = CHARACTER;
3173
3174 while (1)
3175 {
3176 bracket_elem_t start_elem, end_elem;
3177 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3178 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3179 reg_errcode_t ret;
3180 int token_len2 = 0;
3181 bool is_range_exp = false;
3182 re_token_t token2;
3183
3184 start_elem.opr.name = start_name_buf;
3185 start_elem.type = COLL_SYM;
3186 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3187 syntax, first_round);
3188 if (__glibc_unlikely (ret != REG_NOERROR))
3189 {
3190 *err = ret;
3191 goto parse_bracket_exp_free_return;
3192 }
3193 first_round = false;
3194
3195 /* Get information about the next token. We need it in any case. */
3196 token_len = peek_token_bracket (token, regexp, syntax);
3197
3198 /* Do not check for ranges if we know they are not allowed. */
3199 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3200 {
3201 if (__glibc_unlikely (token->type == END_OF_RE))
3202 {
3203 *err = REG_EBRACK;
3204 goto parse_bracket_exp_free_return;
3205 }
3206 if (token->type == OP_CHARSET_RANGE)
3207 {
3208 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3209 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3210 if (__glibc_unlikely (token2.type == END_OF_RE))
3211 {
3212 *err = REG_EBRACK;
3213 goto parse_bracket_exp_free_return;
3214 }
3215 if (token2.type == OP_CLOSE_BRACKET)
3216 {
3217 /* We treat the last '-' as a normal character. */
3218 re_string_skip_bytes (regexp, -token_len);
3219 token->type = CHARACTER;
3220 }
3221 else
3222 is_range_exp = true;
3223 }
3224 }
3225
3226 if (is_range_exp == true)
3227 {
3228 end_elem.opr.name = end_name_buf;
3229 end_elem.type = COLL_SYM;
3230 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3231 dfa, syntax, true);
3232 if (__glibc_unlikely (ret != REG_NOERROR))
3233 {
3234 *err = ret;
3235 goto parse_bracket_exp_free_return;
3236 }
3237
3238 token_len = peek_token_bracket (token, regexp, syntax);
3239
3240#ifdef _LIBC
3241 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3242 &start_elem, &end_elem);
3243#else
3244# ifdef RE_ENABLE_I18N
3245 *err = build_range_exp (syntax, sbcset,
3246 dfa->mb_cur_max > 1 ? mbcset : NULL,
3247 &range_alloc, &start_elem, &end_elem);
3248# else
3249 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3250# endif
3251#endif /* RE_ENABLE_I18N */
3252 if (__glibc_unlikely (*err != REG_NOERROR))
3253 goto parse_bracket_exp_free_return;
3254 }
3255 else
3256 {
3257 switch (start_elem.type)
3258 {
3259 case SB_CHAR:
3260 bitset_set (sbcset, start_elem.opr.ch);
3261 break;
3262#ifdef RE_ENABLE_I18N
3263 case MB_CHAR:
3264 /* Check whether the array has enough space. */
3265 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3266 {
3267 wchar_t *new_mbchars;
3268 /* Not enough, realloc it. */
3269 /* +1 in case of mbcset->nmbchars is 0. */
3270 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3271 /* Use realloc since array is NULL if *alloc == 0. */
3272 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3273 mbchar_alloc);
3274 if (__glibc_unlikely (new_mbchars == NULL))
3275 goto parse_bracket_exp_espace;
3276 mbcset->mbchars = new_mbchars;
3277 }
3278 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3279 break;
3280#endif /* RE_ENABLE_I18N */
3281 case EQUIV_CLASS:
3282 *err = build_equiv_class (sbcset,
3283#ifdef RE_ENABLE_I18N
3284 mbcset, &equiv_class_alloc,
3285#endif /* RE_ENABLE_I18N */
3286 start_elem.opr.name);
3287 if (__glibc_unlikely (*err != REG_NOERROR))
3288 goto parse_bracket_exp_free_return;
3289 break;
3290 case COLL_SYM:
3291 *err = build_collating_symbol (sbcset,
3292#ifdef RE_ENABLE_I18N
3293 mbcset, &coll_sym_alloc,
3294#endif /* RE_ENABLE_I18N */
3295 start_elem.opr.name);
3296 if (__glibc_unlikely (*err != REG_NOERROR))
3297 goto parse_bracket_exp_free_return;
3298 break;
3299 case CHAR_CLASS:
3300 *err = build_charclass (regexp->trans, sbcset,
3301#ifdef RE_ENABLE_I18N
3302 mbcset, &char_class_alloc,
3303#endif /* RE_ENABLE_I18N */
3304 (const char *) start_elem.opr.name,
3305 syntax);
3306 if (__glibc_unlikely (*err != REG_NOERROR))
3307 goto parse_bracket_exp_free_return;
3308 break;
3309 default:
3310 DEBUG_ASSERT (false);
3311 break;
3312 }
3313 }
3314 if (__glibc_unlikely (token->type == END_OF_RE))
3315 {
3316 *err = REG_EBRACK;
3317 goto parse_bracket_exp_free_return;
3318 }
3319 if (token->type == OP_CLOSE_BRACKET)
3320 break;
3321 }
3322
3323 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3324
3325 /* If it is non-matching list. */
3326 if (non_match)
3327 bitset_not (sbcset);
3328
3329#ifdef RE_ENABLE_I18N
3330 /* Ensure only single byte characters are set. */
3331 if (dfa->mb_cur_max > 1)
3332 bitset_mask (sbcset, dfa->sb_char);
3333
3334 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3335 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3336 || mbcset->non_match)))
3337 {
3338 bin_tree_t *mbc_tree;
3339 int sbc_idx;
3340 /* Build a tree for complex bracket. */
3341 dfa->has_mb_node = 1;
3342 br_token.type = COMPLEX_BRACKET;
3343 br_token.opr.mbcset = mbcset;
3344 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3345 if (__glibc_unlikely (mbc_tree == NULL))
3346 goto parse_bracket_exp_espace;
3347 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3348 if (sbcset[sbc_idx])
3349 break;
3350 /* If there are no bits set in sbcset, there is no point
3351 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3352 if (sbc_idx < BITSET_WORDS)
3353 {
3354 /* Build a tree for simple bracket. */
3355 br_token.type = SIMPLE_BRACKET;
3356 br_token.opr.sbcset = sbcset;
3357 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3358 if (__glibc_unlikely (work_tree == NULL))
3359 goto parse_bracket_exp_espace;
3360
3361 /* Then join them by ALT node. */
3362 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3363 if (__glibc_unlikely (work_tree == NULL))
3364 goto parse_bracket_exp_espace;
3365 }
3366 else
3367 {
3368 re_free (sbcset);
3369 work_tree = mbc_tree;
3370 }
3371 }
3372 else
3373#endif /* not RE_ENABLE_I18N */
3374 {
3375#ifdef RE_ENABLE_I18N
3376 free_charset (mbcset);
3377#endif
3378 /* Build a tree for simple bracket. */
3379 br_token.type = SIMPLE_BRACKET;
3380 br_token.opr.sbcset = sbcset;
3381 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3382 if (__glibc_unlikely (work_tree == NULL))
3383 goto parse_bracket_exp_espace;
3384 }
3385 return work_tree;
3386
3387 parse_bracket_exp_espace:
3388 *err = REG_ESPACE;
3389 parse_bracket_exp_free_return:
3390 re_free (sbcset);
3391#ifdef RE_ENABLE_I18N
3392 free_charset (mbcset);
3393#endif /* RE_ENABLE_I18N */
3394 return NULL;
3395}
3396
3397/* Parse an element in the bracket expression. */
3398
3399static reg_errcode_t
3400parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3401 re_token_t *token, int token_len, re_dfa_t *dfa,
3402 reg_syntax_t syntax, bool accept_hyphen)
3403{
3404#ifdef RE_ENABLE_I18N
3405 int cur_char_size;
3406 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3407 if (cur_char_size > 1)
3408 {
3409 elem->type = MB_CHAR;
3410 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3411 re_string_skip_bytes (regexp, cur_char_size);
3412 return REG_NOERROR;
3413 }
3414#endif /* RE_ENABLE_I18N */
3415 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3416 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3417 || token->type == OP_OPEN_EQUIV_CLASS)
3418 return parse_bracket_symbol (elem, regexp, token);
3419 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3420 {
3421 /* A '-' must only appear as anything but a range indicator before
3422 the closing bracket. Everything else is an error. */
3423 re_token_t token2;
3424 (void) peek_token_bracket (&token2, regexp, syntax);
3425 if (token2.type != OP_CLOSE_BRACKET)
3426 /* The actual error value is not standardized since this whole
3427 case is undefined. But ERANGE makes good sense. */
3428 return REG_ERANGE;
3429 }
3430 elem->type = SB_CHAR;
3431 elem->opr.ch = token->opr.c;
3432 return REG_NOERROR;
3433}
3434
3435/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3436 such as [:<character_class>:], [.<collating_element>.], and
3437 [=<equivalent_class>=]. */
3438
3439static reg_errcode_t
3440parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3441 re_token_t *token)
3442{
3443 unsigned char ch, delim = token->opr.c;
3444 int i = 0;
3445 if (re_string_eoi(regexp))
3446 return REG_EBRACK;
3447 for (;; ++i)
3448 {
3449 if (i >= BRACKET_NAME_BUF_SIZE)
3450 return REG_EBRACK;
3451 if (token->type == OP_OPEN_CHAR_CLASS)
3452 ch = re_string_fetch_byte_case (regexp);
3453 else
3454 ch = re_string_fetch_byte (regexp);
3455 if (re_string_eoi(regexp))
3456 return REG_EBRACK;
3457 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3458 break;
3459 elem->opr.name[i] = ch;
3460 }
3461 re_string_skip_bytes (regexp, 1);
3462 elem->opr.name[i] = '\0';
3463 switch (token->type)
3464 {
3465 case OP_OPEN_COLL_ELEM:
3466 elem->type = COLL_SYM;
3467 break;
3468 case OP_OPEN_EQUIV_CLASS:
3469 elem->type = EQUIV_CLASS;
3470 break;
3471 case OP_OPEN_CHAR_CLASS:
3472 elem->type = CHAR_CLASS;
3473 break;
3474 default:
3475 break;
3476 }
3477 return REG_NOERROR;
3478}
3479
3480 /* Helper function for parse_bracket_exp.
3481 Build the equivalence class which is represented by NAME.
3482 The result are written to MBCSET and SBCSET.
3483 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3484 is a pointer argument since we may update it. */
3485
3486static reg_errcode_t
3487#ifdef RE_ENABLE_I18N
3488build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3489 Idx *equiv_class_alloc, const unsigned char *name)
3490#else /* not RE_ENABLE_I18N */
3491build_equiv_class (bitset_t sbcset, const unsigned char *name)
3492#endif /* not RE_ENABLE_I18N */
3493{
3494#ifdef _LIBC
3495 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3496 if (nrules != 0)
3497 {
3498 const int32_t *table, *indirect;
3499 const unsigned char *weights, *extra, *cp;
3500 unsigned char char_buf[2];
3501 int32_t idx1, idx2;
3502 unsigned int ch;
3503 size_t len;
3504 /* Calculate the index for equivalence class. */
3505 cp = name;
3506 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3507 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3508 _NL_COLLATE_WEIGHTMB);
3509 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3510 _NL_COLLATE_EXTRAMB);
3511 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3512 _NL_COLLATE_INDIRECTMB);
3513 idx1 = findidx (table, indirect, extra, &cp, -1);
3514 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3515 /* This isn't a valid character. */
3516 return REG_ECOLLATE;
3517
3518 /* Build single byte matching table for this equivalence class. */
3519 len = weights[idx1 & 0xffffff];
3520 for (ch = 0; ch < SBC_MAX; ++ch)
3521 {
3522 char_buf[0] = ch;
3523 cp = char_buf;
3524 idx2 = findidx (table, indirect, extra, &cp, 1);
3525/*
3526 idx2 = table[ch];
3527*/
3528 if (idx2 == 0)
3529 /* This isn't a valid character. */
3530 continue;
3531 /* Compare only if the length matches and the collation rule
3532 index is the same. */
3533 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3534 && memcmp (weights + (idx1 & 0xffffff) + 1,
3535 weights + (idx2 & 0xffffff) + 1, len) == 0)
3536 bitset_set (sbcset, ch);
3537 }
3538 /* Check whether the array has enough space. */
3539 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3540 {
3541 /* Not enough, realloc it. */
3542 /* +1 in case of mbcset->nequiv_classes is 0. */
3543 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3544 /* Use realloc since the array is NULL if *alloc == 0. */
3545 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3546 int32_t,
3547 new_equiv_class_alloc);
3548 if (__glibc_unlikely (new_equiv_classes == NULL))
3549 return REG_ESPACE;
3550 mbcset->equiv_classes = new_equiv_classes;
3551 *equiv_class_alloc = new_equiv_class_alloc;
3552 }
3553 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3554 }
3555 else
3556#endif /* _LIBC */
3557 {
3558 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3559 return REG_ECOLLATE;
3560 bitset_set (sbcset, *name);
3561 }
3562 return REG_NOERROR;
3563}
3564
3565 /* Helper function for parse_bracket_exp.
3566 Build the character class which is represented by NAME.
3567 The result are written to MBCSET and SBCSET.
3568 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3569 is a pointer argument since we may update it. */
3570
3571static reg_errcode_t
3572#ifdef RE_ENABLE_I18N
3573build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574 re_charset_t *mbcset, Idx *char_class_alloc,
3575 const char *class_name, reg_syntax_t syntax)
3576#else /* not RE_ENABLE_I18N */
3577build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578 const char *class_name, reg_syntax_t syntax)
3579#endif /* not RE_ENABLE_I18N */
3580{
3581 int i;
3582 const char *name = class_name;
3583
3584 /* In case of REG_ICASE "upper" and "lower" match the both of
3585 upper and lower cases. */
3586 if ((syntax & RE_ICASE)
3587 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3588 name = "alpha";
3589
3590#ifdef RE_ENABLE_I18N
3591 /* Check the space of the arrays. */
3592 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3593 {
3594 /* Not enough, realloc it. */
3595 /* +1 in case of mbcset->nchar_classes is 0. */
3596 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3597 /* Use realloc since array is NULL if *alloc == 0. */
3598 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3599 new_char_class_alloc);
3600 if (__glibc_unlikely (new_char_classes == NULL))
3601 return REG_ESPACE;
3602 mbcset->char_classes = new_char_classes;
3603 *char_class_alloc = new_char_class_alloc;
3604 }
3605 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3606#endif /* RE_ENABLE_I18N */
3607
3608#define BUILD_CHARCLASS_LOOP(ctype_func) \
3609 do { \
3610 if (__glibc_unlikely (trans != NULL)) \
3611 { \
3612 for (i = 0; i < SBC_MAX; ++i) \
3613 if (ctype_func (i)) \
3614 bitset_set (sbcset, trans[i]); \
3615 } \
3616 else \
3617 { \
3618 for (i = 0; i < SBC_MAX; ++i) \
3619 if (ctype_func (i)) \
3620 bitset_set (sbcset, i); \
3621 } \
3622 } while (0)
3623
3624 if (strcmp (name, "alnum") == 0)
3625 BUILD_CHARCLASS_LOOP (isalnum);
3626 else if (strcmp (name, "cntrl") == 0)
3627 BUILD_CHARCLASS_LOOP (iscntrl);
3628 else if (strcmp (name, "lower") == 0)
3629 BUILD_CHARCLASS_LOOP (islower);
3630 else if (strcmp (name, "space") == 0)
3631 BUILD_CHARCLASS_LOOP (isspace);
3632 else if (strcmp (name, "alpha") == 0)
3633 BUILD_CHARCLASS_LOOP (isalpha);
3634 else if (strcmp (name, "digit") == 0)
3635 BUILD_CHARCLASS_LOOP (isdigit);
3636 else if (strcmp (name, "print") == 0)
3637 BUILD_CHARCLASS_LOOP (isprint);
3638 else if (strcmp (name, "upper") == 0)
3639 BUILD_CHARCLASS_LOOP (isupper);
3640 else if (strcmp (name, "blank") == 0)
3641 BUILD_CHARCLASS_LOOP (isblank);
3642 else if (strcmp (name, "graph") == 0)
3643 BUILD_CHARCLASS_LOOP (isgraph);
3644 else if (strcmp (name, "punct") == 0)
3645 BUILD_CHARCLASS_LOOP (ispunct);
3646 else if (strcmp (name, "xdigit") == 0)
3647 BUILD_CHARCLASS_LOOP (isxdigit);
3648 else
3649 return REG_ECTYPE;
3650
3651 return REG_NOERROR;
3652}
3653
3654static bin_tree_t *
3655build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3656 const char *class_name,
3657 const char *extra, bool non_match,
3658 reg_errcode_t *err)
3659{
3660 re_bitset_ptr_t sbcset;
3661#ifdef RE_ENABLE_I18N
3662 re_charset_t *mbcset;
3663 Idx alloc = 0;
3664#endif /* not RE_ENABLE_I18N */
3665 reg_errcode_t ret;
3666 bin_tree_t *tree;
3667
3668 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3669 if (__glibc_unlikely (sbcset == NULL))
3670 {
3671 *err = REG_ESPACE;
3672 return NULL;
3673 }
3674#ifdef RE_ENABLE_I18N
3675 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3676 if (__glibc_unlikely (mbcset == NULL))
3677 {
3678 re_free (sbcset);
3679 *err = REG_ESPACE;
3680 return NULL;
3681 }
3682 mbcset->non_match = non_match;
3683#endif /* RE_ENABLE_I18N */
3684
3685 /* We don't care the syntax in this case. */
3686 ret = build_charclass (trans, sbcset,
3687#ifdef RE_ENABLE_I18N
3688 mbcset, &alloc,
3689#endif /* RE_ENABLE_I18N */
3690 class_name, 0);
3691
3692 if (__glibc_unlikely (ret != REG_NOERROR))
3693 {
3694 re_free (sbcset);
3695#ifdef RE_ENABLE_I18N
3696 free_charset (mbcset);
3697#endif /* RE_ENABLE_I18N */
3698 *err = ret;
3699 return NULL;
3700 }
3701 /* \w match '_' also. */
3702 for (; *extra; extra++)
3703 bitset_set (sbcset, *extra);
3704
3705 /* If it is non-matching list. */
3706 if (non_match)
3707 bitset_not (sbcset);
3708
3709#ifdef RE_ENABLE_I18N
3710 /* Ensure only single byte characters are set. */
3711 if (dfa->mb_cur_max > 1)
3712 bitset_mask (sbcset, dfa->sb_char);
3713#endif
3714
3715 /* Build a tree for simple bracket. */
3716 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3717 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3718 if (__glibc_unlikely (tree == NULL))
3719 goto build_word_op_espace;
3720
3721#ifdef RE_ENABLE_I18N
3722 if (dfa->mb_cur_max > 1)
3723 {
3724 bin_tree_t *mbc_tree;
3725 /* Build a tree for complex bracket. */
3726 br_token.type = COMPLEX_BRACKET;
3727 br_token.opr.mbcset = mbcset;
3728 dfa->has_mb_node = 1;
3729 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3730 if (__glibc_unlikely (mbc_tree == NULL))
3731 goto build_word_op_espace;
3732 /* Then join them by ALT node. */
3733 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3734 if (__glibc_likely (mbc_tree != NULL))
3735 return tree;
3736 }
3737 else
3738 {
3739 free_charset (mbcset);
3740 return tree;
3741 }
3742#else /* not RE_ENABLE_I18N */
3743 return tree;
3744#endif /* not RE_ENABLE_I18N */
3745
3746 build_word_op_espace:
3747 re_free (sbcset);
3748#ifdef RE_ENABLE_I18N
3749 free_charset (mbcset);
3750#endif /* RE_ENABLE_I18N */
3751 *err = REG_ESPACE;
3752 return NULL;
3753}
3754
3755/* This is intended for the expressions like "a{1,3}".
3756 Fetch a number from 'input', and return the number.
3757 Return -1 if the number field is empty like "{,1}".
3758 Return RE_DUP_MAX + 1 if the number field is too large.
3759 Return -2 if an error occurred. */
3760
3761static Idx
3762fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3763{
3764 Idx num = -1;
3765 unsigned char c;
3766 while (1)
3767 {
3768 fetch_token (token, input, syntax);
3769 c = token->opr.c;
3770 if (__glibc_unlikely (token->type == END_OF_RE))
3771 return -2;
3772 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3773 break;
3774 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3775 ? -2
3776 : num == -1
3777 ? c - '0'
3778 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3779 }
3780 return num;
3781}
3782
3783
3784#ifdef RE_ENABLE_I18N
3785static void
3786free_charset (re_charset_t *cset)
3787{
3788 re_free (cset->mbchars);
3789# ifdef _LIBC
3790 re_free (cset->coll_syms);
3791 re_free (cset->equiv_classes);
3792# endif
3793 re_free (cset->range_starts);
3794 re_free (cset->range_ends);
3795 re_free (cset->char_classes);
3796 re_free (cset);
3797}
3798#endif /* RE_ENABLE_I18N */
3799
3800
3801/* Functions for binary tree operation. */
3802
3803/* Create a tree node. */
3804
3805static bin_tree_t *
3806create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3807 re_token_type_t type)
3808{
3809 re_token_t t = { .type = type };
3810 return create_token_tree (dfa, left, right, &t);
3811}
3812
3813static bin_tree_t *
3814create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3815 const re_token_t *token)
3816{
3817 bin_tree_t *tree;
3818 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3819 {
3820 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3821
3822 if (storage == NULL)
3823 return NULL;
3824 storage->next = dfa->str_tree_storage;
3825 dfa->str_tree_storage = storage;
3826 dfa->str_tree_storage_idx = 0;
3827 }
3828 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3829
3830 tree->parent = NULL;
3831 tree->left = left;
3832 tree->right = right;
3833 tree->token = *token;
3834 tree->token.duplicated = 0;
3835 tree->token.opt_subexp = 0;
3836 tree->first = NULL;
3837 tree->next = NULL;
3838 tree->node_idx = -1;
3839
3840 if (left != NULL)
3841 left->parent = tree;
3842 if (right != NULL)
3843 right->parent = tree;
3844 return tree;
3845}
3846
3847/* Mark the tree SRC as an optional subexpression.
3848 To be called from preorder or postorder. */
3849
3850static reg_errcode_t
3851mark_opt_subexp (void *extra, bin_tree_t *node)
3852{
3853 Idx idx = (uintptr_t) extra;
3854 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3855 node->token.opt_subexp = 1;
3856
3857 return REG_NOERROR;
3858}
3859
3860/* Free the allocated memory inside NODE. */
3861
3862static void
3863free_token (re_token_t *node)
3864{
3865#ifdef RE_ENABLE_I18N
3866 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3867 free_charset (node->opr.mbcset);
3868 else
3869#endif /* RE_ENABLE_I18N */
3870 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3871 re_free (node->opr.sbcset);
3872}
3873
3874/* Worker function for tree walking. Free the allocated memory inside NODE
3875 and its children. */
3876
3877static reg_errcode_t
3878free_tree (void *extra, bin_tree_t *node)
3879{
3880 free_token (&node->token);
3881 return REG_NOERROR;
3882}
3883
3884
3885/* Duplicate the node SRC, and return new node. This is a preorder
3886 visit similar to the one implemented by the generic visitor, but
3887 we need more infrastructure to maintain two parallel trees --- so,
3888 it's easier to duplicate. */
3889
3890static bin_tree_t *
3891duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3892{
3893 const bin_tree_t *node;
3894 bin_tree_t *dup_root;
3895 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3896
3897 for (node = root; ; )
3898 {
3899 /* Create a new tree and link it back to the current parent. */
3900 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3901 if (*p_new == NULL)
3902 return NULL;
3903 (*p_new)->parent = dup_node;
3904 (*p_new)->token.duplicated = 1;
3905 dup_node = *p_new;
3906
3907 /* Go to the left node, or up and to the right. */
3908 if (node->left)
3909 {
3910 node = node->left;
3911 p_new = &dup_node->left;
3912 }
3913 else
3914 {
3915 const bin_tree_t *prev = NULL;
3916 while (node->right == prev || node->right == NULL)
3917 {
3918 prev = node;
3919 node = node->parent;
3920 dup_node = dup_node->parent;
3921 if (!node)
3922 return dup_root;
3923 }
3924 node = node->right;
3925 p_new = &dup_node->right;
3926 }
3927 }
3928}
Note: See TracBrowser for help on using the repository browser.