source: trunk/src/sed/lib/regcomp.c

Last change on this file was 3613, checked in by bird, 10 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

File size: 109.5 KB
RevLine 
[599]1/* Extended regular expression matching and search library.
[3613]2 Copyright (C) 2002-2022 Free Software Foundation, Inc.
[599]3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
[3613]17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
[599]19
[3613]20#ifdef _LIBC
21# include <locale/weight.h>
22#endif
23
[599]24static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30static void free_charset (re_charset_t *cset);
31static void free_workarea_compile (regex_t *preg);
32static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33static void optimize_utf8 (re_dfa_t *dfa);
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
[3613]48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
[599]50 unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
[3613]53 Idx node, bool root);
[599]54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
[3613]55static Idx fetch_number (re_string_t *input, re_token_t *token,
[599]56 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
[3613]58 reg_syntax_t syntax);
[599]59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
[3613]63 Idx nest, reg_errcode_t *err);
[599]64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
[3613]66 Idx nest, reg_errcode_t *err);
[599]67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
[3613]69 Idx nest, reg_errcode_t *err);
[599]70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
[3613]72 Idx nest, reg_errcode_t *err);
[599]73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
[3613]84 bool accept_hyphen);
[599]85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
[3613]90 Idx *equiv_class_alloc,
[599]91 const unsigned char *name);
92static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
[3613]95 Idx *char_class_alloc,
96 const char *class_name,
[599]97 reg_syntax_t syntax);
98static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99 RE_TRANSLATE_TYPE trans,
[3613]100 const char *class_name,
101 const char *extra,
102 bool non_match, reg_errcode_t *err);
[599]103static bin_tree_t *create_tree (re_dfa_t *dfa,
104 bin_tree_t *left, bin_tree_t *right,
105 re_token_type_t type);
106static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107 bin_tree_t *left, bin_tree_t *right,
108 const re_token_t *token);
109static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110static void free_token (re_token_t *node);
111static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
113
114
115/* This table gives an error message for each of the error codes listed
116 in regex.h. Obviously the order here has to be same as there.
117 POSIX doesn't require that we do anything for REG_NOERROR,
118 but why not be nice? */
[3613]119
[599]120static const char __re_error_msgid[] =
121 {
122#define REG_NOERROR_IDX 0
123 gettext_noop ("Success") /* REG_NOERROR */
124 "\0"
125#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
126 gettext_noop ("No match") /* REG_NOMATCH */
127 "\0"
128#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
129 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
130 "\0"
131#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
132 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
133 "\0"
134#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
135 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
136 "\0"
137#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
138 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
139 "\0"
140#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
141 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
142 "\0"
[3613]143#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
[599]144 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
[3613]145 "\0"
[599]146#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
147 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
148 "\0"
149#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
150 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
151 "\0"
152#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
153 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
154 "\0"
155#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
156 gettext_noop ("Invalid range end") /* REG_ERANGE */
157 "\0"
158#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
159 gettext_noop ("Memory exhausted") /* REG_ESPACE */
160 "\0"
161#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
162 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
163 "\0"
164#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
165 gettext_noop ("Premature end of regular expression") /* REG_EEND */
166 "\0"
167#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
168 gettext_noop ("Regular expression too big") /* REG_ESIZE */
169 "\0"
170#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
171 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
172 };
[3613]173
[599]174static const size_t __re_error_msgid_idx[] =
175 {
176 REG_NOERROR_IDX,
177 REG_NOMATCH_IDX,
178 REG_BADPAT_IDX,
179 REG_ECOLLATE_IDX,
180 REG_ECTYPE_IDX,
181 REG_EESCAPE_IDX,
182 REG_ESUBREG_IDX,
183 REG_EBRACK_IDX,
184 REG_EPAREN_IDX,
185 REG_EBRACE_IDX,
186 REG_BADBR_IDX,
187 REG_ERANGE_IDX,
188 REG_ESPACE_IDX,
189 REG_BADRPT_IDX,
190 REG_EEND_IDX,
191 REG_ESIZE_IDX,
192 REG_ERPAREN_IDX
193 };
194
195
196/* Entry points for GNU code. */
197
198/* re_compile_pattern is the GNU regular expression compiler: it
199 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
[3613]200 Returns 0 if the pattern was valid, otherwise an error string.
[599]201
202 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
203 are set in BUFP on entry. */
[3613]204
205const char *
[599]206re_compile_pattern (const char *pattern, size_t length,
207 struct re_pattern_buffer *bufp)
208{
209 reg_errcode_t ret;
210
211 /* And GNU code determines whether or not to get register information
212 by passing null for the REGS argument to re_match, etc., not by
213 setting no_sub, unless RE_NO_SUB is set. */
214 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
215
216 /* Match anchors at newline. */
217 bufp->newline_anchor = 1;
218
219 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
220
221 if (!ret)
222 return NULL;
223 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
224}
[3613]225weak_alias (__re_compile_pattern, re_compile_pattern)
[599]226
227/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
228 also be assigned to arbitrarily: each pattern buffer stores its own
229 syntax, so it can be changed between regex compilations. */
230/* This has no initializer because initialized variables in Emacs
231 become read-only after dumping. */
232reg_syntax_t re_syntax_options;
233
234
235/* Specify the precise syntax of regexps for compilation. This provides
236 for compatibility for various utilities which historically have
237 different, incompatible syntaxes.
238
239 The argument SYNTAX is a bit mask comprised of the various bits
240 defined in regex.h. We return the old syntax. */
[3613]241
[599]242reg_syntax_t
243re_set_syntax (reg_syntax_t syntax)
244{
245 reg_syntax_t ret = re_syntax_options;
246
247 re_syntax_options = syntax;
248 return ret;
249}
250weak_alias (__re_set_syntax, re_set_syntax)
[3613]251
[599]252int
[3613]253re_compile_fastmap (struct re_pattern_buffer *bufp)
[599]254{
255 re_dfa_t *dfa = bufp->buffer;
256 char *fastmap = bufp->fastmap;
257
258 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
259 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
260 if (dfa->init_state != dfa->init_state_word)
261 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
262 if (dfa->init_state != dfa->init_state_nl)
263 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
264 if (dfa->init_state != dfa->init_state_begbuf)
265 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
266 bufp->fastmap_accurate = 1;
267 return 0;
268}
[3613]269weak_alias (__re_compile_fastmap, re_compile_fastmap)
270
[599]271static __always_inline void
272re_set_fastmap (char *fastmap, bool icase, int ch)
273{
274 fastmap[ch] = 1;
275 if (icase)
276 fastmap[tolower (ch)] = 1;
277}
278
279/* Helper function for re_compile_fastmap.
280 Compile fastmap for the initial_state INIT_STATE. */
281
282static void
283re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
[3613]284 char *fastmap)
285{
286 re_dfa_t *dfa = bufp->buffer;
[599]287 Idx node_cnt;
288 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
[3613]289 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
[599]290 {
291 Idx node = init_state->nodes.elems[node_cnt];
292 re_token_type_t type = dfa->nodes[node].type;
293
294 if (type == CHARACTER)
295 {
296 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
[3613]297 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
298 {
[599]299 unsigned char buf[MB_LEN_MAX];
300 unsigned char *p;
301 wchar_t wc;
302 mbstate_t state;
303
304 p = buf;
305 *p++ = dfa->nodes[node].opr.c;
306 while (++node < dfa->nodes_len
307 && dfa->nodes[node].type == CHARACTER
308 && dfa->nodes[node].mb_partial)
[3613]309 *p++ = dfa->nodes[node].opr.c;
310 memset (&state, '\0', sizeof (state));
311 if (__mbrtowc (&wc, (const char *) buf, p - buf,
[599]312 &state) == p - buf
[3613]313 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
[599]314 != (size_t) -1))
315 re_set_fastmap (fastmap, false, buf[0]);
316 }
317 }
318 else if (type == SIMPLE_BRACKET)
319 {
320 int i, ch;
321 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
322 {
323 int j;
324 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
325 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
326 if (w & ((bitset_word_t) 1 << j))
327 re_set_fastmap (fastmap, icase, ch);
328 }
329 }
330 else if (type == COMPLEX_BRACKET)
[3613]331 {
332 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
333 Idx i;
334
335#ifdef _LIBC
336 /* See if we have to try all bytes which start multiple collation
337 elements.
338 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
339 collation element, and don't catch 'b' since 'b' is
340 the only collation element which starts from 'b' (and
341 it is caught by SIMPLE_BRACKET). */
[599]342 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
343 && (cset->ncoll_syms || cset->nranges))
344 {
345 const int32_t *table = (const int32_t *)
346 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
347 for (i = 0; i < SBC_MAX; ++i)
348 if (table[i] < 0)
[3613]349 re_set_fastmap (fastmap, icase, i);
350 }
351#endif /* _LIBC */
352
353 /* See if we have to start the match at all multibyte characters,
354 i.e. where we would not find an invalid sequence. This only
355 applies to multibyte character sets; for single byte character
356 sets, the SIMPLE_BRACKET again suffices. */
357 if (dfa->mb_cur_max > 1
358 && (cset->nchar_classes || cset->non_match || cset->nranges
359#ifdef _LIBC
360 || cset->nequiv_classes
361#endif /* _LIBC */
362 ))
363 {
364 unsigned char c = 0;
365 do
366 {
367 mbstate_t mbs;
368 memset (&mbs, 0, sizeof (mbs));
369 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
370 re_set_fastmap (fastmap, false, (int) c);
[599]371 }
[3613]372 while (++c != 0);
373 }
[599]374
[3613]375 else
376 {
[599]377 /* ... Else catch all bytes which can start the mbchars. */
[3613]378 for (i = 0; i < cset->nmbchars; ++i)
379 {
380 char buf[256];
381 mbstate_t state;
382 memset (&state, '\0', sizeof (state));
383 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
384 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
385 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
386 {
387 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
388 != (size_t) -1)
[599]389 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
390 }
391 }
[3613]392 }
[599]393 }
394 else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
395 {
396 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
397 if (type == END_OF_RE)
398 bufp->can_be_null = 1;
399 return;
400 }
401 }
402}
403
404
405/* Entry point for POSIX code. */
406/* regcomp takes a regular expression as a string and compiles it.
407
[3613]408 PREG is a regex_t *. We do not expect any fields to be initialized,
409 since POSIX says we shouldn't. Thus, we set
410
[599]411 'buffer' to the compiled pattern;
412 'used' to the length of the compiled pattern;
[3613]413 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
414 REG_EXTENDED bit in CFLAGS is set; otherwise, to
415 RE_SYNTAX_POSIX_BASIC;
416 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
[599]417 'fastmap' to an allocated space for the fastmap;
418 'fastmap_accurate' to zero;
419 're_nsub' to the number of subexpressions in PATTERN.
420
421 PATTERN is the address of the pattern string.
422
423 CFLAGS is a series of bits which affect compilation.
424
425 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
426 use POSIX basic syntax.
427
428 If REG_NEWLINE is set, then . and [^...] don't match newline.
429 Also, regexec will try a match beginning after every newline.
430
431 If REG_ICASE is set, then we considers upper- and lowercase
432 versions of letters to be equivalent when matching.
433
434 If REG_NOSUB is set, then when PREG is passed to regexec, that
435 routine will report only success or failure, and nothing about the
436 registers.
437
438 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
[3613]439 the return codes and their meanings.) */
[599]440
441int
442regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
443{
444 reg_errcode_t ret;
445 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
446 : RE_SYNTAX_POSIX_BASIC);
447
448 preg->buffer = NULL;
449 preg->allocated = 0;
450 preg->used = 0;
[3613]451
[599]452 /* Try to allocate space for the fastmap. */
453 preg->fastmap = re_malloc (char, SBC_MAX);
454 if (__glibc_unlikely (preg->fastmap == NULL))
455 return REG_ESPACE;
456
457 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
458
459 /* If REG_NEWLINE is set, newlines are treated differently. */
460 if (cflags & REG_NEWLINE)
461 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
462 syntax &= ~RE_DOT_NEWLINE;
463 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
464 /* It also changes the matching behavior. */
465 preg->newline_anchor = 1;
466 }
467 else
468 preg->newline_anchor = 0;
469 preg->no_sub = !!(cflags & REG_NOSUB);
470 preg->translate = NULL;
471
472 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
473
474 /* POSIX doesn't distinguish between an unmatched open-group and an
475 unmatched close-group: both are REG_EPAREN. */
476 if (ret == REG_ERPAREN)
[3613]477 ret = REG_EPAREN;
[599]478
479 /* We have already checked preg->fastmap != NULL. */
480 if (__glibc_likely (ret == REG_NOERROR))
481 /* Compute the fastmap now, since regexec cannot modify the pattern
482 buffer. This function never fails in this implementation. */
483 (void) re_compile_fastmap (preg);
484 else
485 {
486 /* Some error occurred while compiling the expression. */
487 re_free (preg->fastmap);
488 preg->fastmap = NULL;
489 }
[3613]490
[599]491 return (int) ret;
492}
493libc_hidden_def (__regcomp)
494weak_alias (__regcomp, regcomp)
495
496/* Returns a message corresponding to an error code, ERRCODE, returned
[3613]497 from either regcomp or regexec. We don't use PREG here. */
498
[599]499size_t
500regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
501 size_t errbuf_size)
[3613]502{
[599]503 const char *msg;
[3613]504 size_t msg_size;
[599]505 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
506
507 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
508 /* Only error codes returned by the rest of the code should be passed
509 to this routine. If we are given anything else, or if other regex
510 code generates an invalid error code, then the program has a bug.
511 Dump core so we can fix it. */
512 abort ();
513
514 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
[3613]515
[599]516 msg_size = strlen (msg) + 1; /* Includes the null. */
[3613]517
518 if (__glibc_likely (errbuf_size != 0))
[599]519 {
[3613]520 size_t cpy_size = msg_size;
521 if (__glibc_unlikely (msg_size > errbuf_size))
[599]522 {
[3613]523 cpy_size = errbuf_size - 1;
[599]524 errbuf[cpy_size] = '\0';
525 }
526 memcpy (errbuf, msg, cpy_size);
527 }
528
529 return msg_size;
530}
531weak_alias (__regerror, regerror)
532
533
534/* This static array is used for the map to single-byte characters when
535 UTF-8 is used. Otherwise we would allocate memory just to initialize
536 it the same all the time. UTF-8 is the preferred encoding so this is
[606]537 a worthwhile optimization. */
[3613]538static const bitset_t utf8_sb_map =
539{
[606]540 /* Set the first 128 bits. */
[3613]541#if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
542 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
543#else
544# if 4 * BITSET_WORD_BITS < ASCII_CHARS
545# error "bitset_word_t is narrower than 32 bits"
546# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
547 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
548# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
549 BITSET_WORD_MAX, BITSET_WORD_MAX,
550# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
551 BITSET_WORD_MAX,
552# endif
553 (BITSET_WORD_MAX
[606]554 >> (SBC_MAX % BITSET_WORD_BITS == 0
[599]555 ? 0
556 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
557#endif
558};
559
560
[3613]561static void
[599]562free_dfa_content (re_dfa_t *dfa)
563{
564 Idx i, j;
565
566 if (dfa->nodes)
567 for (i = 0; i < dfa->nodes_len; ++i)
568 free_token (dfa->nodes + i);
569 re_free (dfa->nexts);
570 for (i = 0; i < dfa->nodes_len; ++i)
571 {
572 if (dfa->eclosures != NULL)
573 re_node_set_free (dfa->eclosures + i);
574 if (dfa->inveclosures != NULL)
575 re_node_set_free (dfa->inveclosures + i);
576 if (dfa->edests != NULL)
577 re_node_set_free (dfa->edests + i);
578 }
579 re_free (dfa->edests);
580 re_free (dfa->eclosures);
581 re_free (dfa->inveclosures);
582 re_free (dfa->nodes);
583
584 if (dfa->state_table)
585 for (i = 0; i <= dfa->state_hash_mask; ++i)
586 {
587 struct re_state_table_entry *entry = dfa->state_table + i;
588 for (j = 0; j < entry->num; ++j)
589 {
[3613]590 re_dfastate_t *state = entry->array[j];
[599]591 free_state (state);
592 }
593 re_free (entry->array);
594 }
595 re_free (dfa->state_table);
596 if (dfa->sb_char != utf8_sb_map)
597 re_free (dfa->sb_char);
598 re_free (dfa->subexp_map);
599#ifdef DEBUG
600 re_free (dfa->re_str);
601#endif
602
603 re_free (dfa);
604}
605
606
[3613]607/* Free dynamically allocated space used by PREG. */
[599]608
[3613]609void
610regfree (regex_t *preg)
611{
612 re_dfa_t *dfa = preg->buffer;
613 if (__glibc_likely (dfa != NULL))
614 {
[599]615 lock_fini (dfa->lock);
616 free_dfa_content (dfa);
617 }
618 preg->buffer = NULL;
619 preg->allocated = 0;
620
621 re_free (preg->fastmap);
622 preg->fastmap = NULL;
623
[3613]624 re_free (preg->translate);
[599]625 preg->translate = NULL;
626}
627libc_hidden_def (__regfree)
628weak_alias (__regfree, regfree)
629
630
631/* Entry points compatible with 4.2 BSD regex library. We don't define
632 them unless specifically requested. */
633
634#if defined _REGEX_RE_COMP || defined _LIBC
635
636/* BSD has one and only one pattern buffer. */
637static struct re_pattern_buffer re_comp_buf;
638
639char *
640# ifdef _LIBC
641/* Make these definitions weak in libc, so POSIX programs can redefine
[3613]642 these names if they don't use our functions, and still use
[599]643 regcomp/regexec above without link errors. */
644weak_function
645# endif
646re_comp (const char *s)
647{
648 reg_errcode_t ret;
649 char *fastmap;
650
651 if (!s)
652 {
653 if (!re_comp_buf.buffer)
654 return gettext ("No previous regular expression");
655 return 0;
656 }
657
658 if (re_comp_buf.buffer)
659 {
660 fastmap = re_comp_buf.fastmap;
661 re_comp_buf.fastmap = NULL;
662 __regfree (&re_comp_buf);
663 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
664 re_comp_buf.fastmap = fastmap;
[3613]665 }
[599]666
667 if (re_comp_buf.fastmap == NULL)
668 {
669 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
670 if (re_comp_buf.fastmap == NULL)
[3613]671 return (char *) gettext (__re_error_msgid
[599]672 + __re_error_msgid_idx[(int) REG_ESPACE]);
673 }
674
675 /* Since 're_exec' always passes NULL for the 'regs' argument, we
676 don't need to initialize the pattern buffer fields which affect it. */
677
678 /* Match anchors at newlines. */
679 re_comp_buf.newline_anchor = 1;
680
681 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
[3613]682
[599]683 if (!ret)
684 return NULL;
685
686 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
687 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
688}
689
690#ifdef _LIBC
691libc_freeres_fn (free_mem)
692{
693 __regfree (&re_comp_buf);
694}
695#endif
696
697#endif /* _REGEX_RE_COMP */
698
699
700/* Internal entry point.
701 Compile the regular expression PATTERN, whose length is LENGTH.
702 SYNTAX indicate regular expression's syntax. */
703
704static reg_errcode_t
705re_compile_internal (regex_t *preg, const char * pattern, size_t length,
706 reg_syntax_t syntax)
707{
708 reg_errcode_t err = REG_NOERROR;
709 re_dfa_t *dfa;
710 re_string_t regexp;
711
712 /* Initialize the pattern buffer. */
713 preg->fastmap_accurate = 0;
714 preg->syntax = syntax;
715 preg->not_bol = preg->not_eol = 0;
716 preg->used = 0;
[3613]717 preg->re_nsub = 0;
718 preg->can_be_null = 0;
[599]719 preg->regs_allocated = REGS_UNALLOCATED;
720
721 /* Initialize the dfa. */
722 dfa = preg->buffer;
723 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
724 {
725 /* If zero allocated, but buffer is non-null, try to realloc
726 enough space. This loses if buffer's address is bogus, but
727 that is the user's responsibility. If ->buffer is NULL this
[3613]728 is a simple allocation. */
[599]729 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
730 if (dfa == NULL)
731 return REG_ESPACE;
732 preg->allocated = sizeof (re_dfa_t);
[3613]733 preg->buffer = dfa;
734 }
735 preg->used = sizeof (re_dfa_t);
[599]736
737 err = init_dfa (dfa, length);
738 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
739 err = REG_ESPACE;
740 if (__glibc_unlikely (err != REG_NOERROR))
741 {
742 free_dfa_content (dfa);
743 preg->buffer = NULL;
744 preg->allocated = 0;
745 return err;
746 }
747#ifdef DEBUG
748 /* Note: length+1 will not overflow since it is checked in init_dfa. */
[3613]749 dfa->re_str = re_malloc (char, length + 1);
750 strncpy (dfa->re_str, pattern, length + 1);
[599]751#endif
752
753 err = re_string_construct (&regexp, pattern, length, preg->translate,
754 (syntax & RE_ICASE) != 0, dfa);
[3613]755 if (__glibc_unlikely (err != REG_NOERROR))
[599]756 {
757 re_compile_internal_free_return:
758 free_workarea_compile (preg);
759 re_string_destruct (&regexp);
760 lock_fini (dfa->lock);
761 free_dfa_content (dfa);
762 preg->buffer = NULL;
763 preg->allocated = 0;
764 return err;
[3613]765 }
[599]766
767 /* Parse the regular expression, and build a structure tree. */
768 preg->re_nsub = 0;
769 dfa->str_tree = parse (&regexp, preg, syntax, &err);
[3613]770 if (__glibc_unlikely (dfa->str_tree == NULL))
[599]771 goto re_compile_internal_free_return;
772
773 /* Analyze the tree and create the nfa. */
774 err = analyze (preg);
775 if (__glibc_unlikely (err != REG_NOERROR))
776 goto re_compile_internal_free_return;
777
778 /* If possible, do searching in single byte encoding to speed things up. */
779 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
780 optimize_utf8 (dfa);
781
782 /* Then create the initial state of the dfa. */
783 err = create_initial_state (dfa);
[3613]784
[599]785 /* Release work areas. */
[3613]786 free_workarea_compile (preg);
[599]787 re_string_destruct (&regexp);
788
789 if (__glibc_unlikely (err != REG_NOERROR))
790 {
791 lock_fini (dfa->lock);
792 free_dfa_content (dfa);
793 preg->buffer = NULL;
794 preg->allocated = 0;
795 }
796
797 return err;
798}
799
800/* Initialize DFA. We use the length of the regular expression PAT_LEN
[3613]801 as the initial length of some arrays. */
[599]802
[3613]803static reg_errcode_t
[599]804init_dfa (re_dfa_t *dfa, size_t pat_len)
[3613]805{
806 __re_size_t table_size;
807#ifndef _LIBC
808 const char *codeset_name;
809#endif
810 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
811 size_t max_object_size =
[599]812 MAX (sizeof (struct re_state_table_entry),
813 MAX (sizeof (re_token_t),
814 MAX (sizeof (re_node_set),
815 MAX (sizeof (regmatch_t),
816 max_i18n_object_size))));
817
[3613]818 memset (dfa, '\0', sizeof (re_dfa_t));
819
820 /* Force allocation of str_tree_storage the first time. */
821 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
822
823 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
[599]824 calculation below, and for similar doubling calculations
825 elsewhere. And it's <= rather than <, because some of the
826 doubling calculations add 1 afterwards. */
827 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
828 <= pat_len))
829 return REG_ESPACE;
830
831 dfa->nodes_alloc = pat_len + 1;
832 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
833
834 /* table_size = 2 ^ ceil(log pat_len) */
835 for (table_size = 1; ; table_size <<= 1)
836 if (table_size > pat_len)
837 break;
838
839 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
840 dfa->state_hash_mask = table_size - 1;
841
842 dfa->mb_cur_max = MB_CUR_MAX;
843#ifdef _LIBC
844 if (dfa->mb_cur_max == 6
[3550]845 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
846 dfa->is_utf8 = 1;
847 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
848 != 0);
[599]849#else
[3613]850# ifdef KBUILD_OS_WINDOWS
851 (void)codeset_name;
852 if (get_crt_codepage() == MY_CP_UTF8)
853# else
[3550]854 codeset_name = nl_langinfo (CODESET);
[599]855 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
856 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
857 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
858 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
859# endif
860 dfa->is_utf8 = 1;
861
862 /* We check exhaustively in the loop below if this charset is a
863 superset of ASCII. */
864 dfa->map_notascii = 0;
865#endif
866
867 if (dfa->mb_cur_max > 1)
868 {
869 if (dfa->is_utf8)
870 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
[3613]871 else
[599]872 {
873 int i, j, ch;
874
875 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
876 if (__glibc_unlikely (dfa->sb_char == NULL))
877 return REG_ESPACE;
878
879 /* Set the bits corresponding to single byte chars. */
880 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
[3613]881 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
[599]882 {
883 wint_t wch = __btowc (ch);
[3613]884 if (wch != WEOF)
[599]885 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
886#ifndef _LIBC
887 if (isascii (ch) && wch != ch)
888 dfa->map_notascii = 1;
[3613]889#endif
[599]890 }
891 }
892 }
893
894 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
895 return REG_ESPACE;
896 return REG_NOERROR;
897}
898
899/* Initialize WORD_CHAR table, which indicate which character is
900 "word". In this case "word" means that it is the word construction
[3613]901 character used by some operators like "\<", "\>", etc. */
902
903static void
[599]904init_word_char (re_dfa_t *dfa)
[3613]905{
906 int i = 0;
907 int j;
908 int ch = 0;
909 dfa->word_ops_used = 1;
910 if (__glibc_likely (dfa->map_notascii == 0))
911 {
912 bitset_word_t bits0 = 0x00000000;
913 bitset_word_t bits1 = 0x03ff0000;
914 bitset_word_t bits2 = 0x87fffffe;
915 bitset_word_t bits3 = 0x07fffffe;
916 if (BITSET_WORD_BITS == 64)
917 {
918 /* Pacify gcc -Woverflow on 32-bit platformns. */
919 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
920 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
921 i = 2;
922 }
923 else if (BITSET_WORD_BITS == 32)
924 {
925 dfa->word_char[0] = bits0;
926 dfa->word_char[1] = bits1;
927 dfa->word_char[2] = bits2;
928 dfa->word_char[3] = bits3;
929 i = 4;
930 }
931 else
932 goto general_case;
933 ch = 128;
934
935 if (__glibc_likely (dfa->is_utf8))
936 {
937 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
938 return;
[599]939 }
940 }
941
942 general_case:
943 for (; i < BITSET_WORDS; ++i)
944 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
945 if (isalnum (ch) || ch == '_')
946 dfa->word_char[i] |= (bitset_word_t) 1 << j;
947}
948
[3613]949/* Free the work area which are only used while compiling. */
[599]950
951static void
952free_workarea_compile (regex_t *preg)
953{
954 re_dfa_t *dfa = preg->buffer;
955 bin_tree_storage_t *storage, *next;
956 for (storage = dfa->str_tree_storage; storage; storage = next)
957 {
958 next = storage->next;
959 re_free (storage);
960 }
961 dfa->str_tree_storage = NULL;
962 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
963 dfa->str_tree = NULL;
964 re_free (dfa->org_indices);
965 dfa->org_indices = NULL;
966}
967
[3613]968/* Create initial states for all contexts. */
[599]969
970static reg_errcode_t
971create_initial_state (re_dfa_t *dfa)
972{
973 Idx first, i;
974 reg_errcode_t err;
975 re_node_set init_nodes;
976
[3613]977 /* Initial states have the epsilon closure of the node which is
[599]978 the first node of the regular expression. */
979 first = dfa->str_tree->first->node_idx;
980 dfa->init_node = first;
981 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
982 if (__glibc_unlikely (err != REG_NOERROR))
983 return err;
984
985 /* The back-references which are in initial states can epsilon transit,
986 since in this case all of the subexpressions can be null.
[3613]987 Then we add epsilon closures of the nodes which are the next nodes of
[599]988 the back-references. */
989 if (dfa->nbackref > 0)
[3613]990 for (i = 0; i < init_nodes.nelem; ++i)
[599]991 {
992 Idx node_idx = init_nodes.elems[i];
993 re_token_type_t type = dfa->nodes[node_idx].type;
994
995 Idx clexp_idx;
996 if (type != OP_BACK_REF)
997 continue;
998 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
999 {
1000 re_token_t *clexp_node;
1001 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1002 if (clexp_node->type == OP_CLOSE_SUBEXP
1003 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1004 break;
1005 }
[3613]1006 if (clexp_idx == init_nodes.nelem)
[599]1007 continue;
1008
[3613]1009 if (type == OP_BACK_REF)
1010 {
1011 Idx dest_idx = dfa->edests[node_idx].elems[0];
1012 if (!re_node_set_contains (&init_nodes, dest_idx))
[599]1013 {
1014 reg_errcode_t merge_err
1015 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1016 if (merge_err != REG_NOERROR)
1017 return merge_err;
1018 i = 0;
1019 }
1020 }
[3613]1021 }
[599]1022
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (__glibc_unlikely (dfa->init_state == NULL))
1027 return err;
1028 if (dfa->init_state->has_constraint)
1029 {
1030 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031 CONTEXT_WORD);
1032 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
[3613]1033 CONTEXT_NEWLINE);
1034 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035 &init_nodes,
[599]1036 CONTEXT_NEWLINE
1037 | CONTEXT_BEGBUF);
1038 if (__glibc_unlikely (dfa->init_state_word == NULL
1039 || dfa->init_state_nl == NULL
1040 || dfa->init_state_begbuf == NULL))
1041 return err;
1042 }
1043 else
1044 dfa->init_state_word = dfa->init_state_nl
1045 = dfa->init_state_begbuf = dfa->init_state;
1046
1047 re_node_set_free (&init_nodes);
1048 return REG_NOERROR;
1049}
1050
1051
1052/* If it is possible to do searching in single byte encoding instead of UTF-8
[3613]1053 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1054 DFA nodes where needed. */
1055
1056static void
[599]1057optimize_utf8 (re_dfa_t *dfa)
1058{
1059 Idx node;
1060 int i;
1061 bool mb_chars = false;
[3613]1062 bool has_period = false;
1063
[599]1064 for (node = 0; node < dfa->nodes_len; ++node)
1065 switch (dfa->nodes[node].type)
[3613]1066 {
[599]1067 case CHARACTER:
1068 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1069 mb_chars = true;
1070 break;
1071 case ANCHOR:
1072 switch (dfa->nodes[node].opr.ctx_type)
1073 {
[3613]1074 case LINE_FIRST:
1075 case LINE_LAST:
1076 case BUF_FIRST:
[599]1077 case BUF_LAST:
1078 break;
1079 default:
1080 /* Word anchors etc. cannot be handled. It's okay to test
[3613]1081 opr.ctx_type since constraints (for all DFA nodes) are
1082 created by ORing one or more opr.ctx_type values. */
[599]1083 return;
1084 }
1085 break;
1086 case OP_PERIOD:
1087 has_period = true;
1088 break;
1089 case OP_BACK_REF:
1090 case OP_ALT:
1091 case END_OF_RE:
1092 case OP_DUP_ASTERISK:
[3613]1093 case OP_OPEN_SUBEXP:
1094 case OP_CLOSE_SUBEXP:
1095 break;
1096 case COMPLEX_BRACKET:
1097 return;
1098 case SIMPLE_BRACKET:
1099 /* Just double check. */
1100 {
1101 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1102 ? 0
1103 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1104 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
[599]1105 {
1106 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1107 return;
1108 rshift = 0;
1109 }
1110 }
1111 break;
1112 default:
1113 abort ();
[3613]1114 }
[599]1115
1116 if (mb_chars || has_period)
1117 for (node = 0; node < dfa->nodes_len; ++node)
1118 {
1119 if (dfa->nodes[node].type == CHARACTER
1120 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1121 dfa->nodes[node].mb_partial = 0;
1122 else if (dfa->nodes[node].type == OP_PERIOD)
1123 dfa->nodes[node].type = OP_UTF8_PERIOD;
1124 }
1125
1126 /* The search can be in single byte locale. */
1127 dfa->mb_cur_max = 1;
1128 dfa->is_utf8 = 0;
1129 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1130}
1131
[3613]1132
[599]1133/* Analyze the structure tree, and calculate "first", "next", "edest",
1134 "eclosure", and "inveclosure". */
1135
[3613]1136static reg_errcode_t
1137analyze (regex_t *preg)
[599]1138{
1139 re_dfa_t *dfa = preg->buffer;
[3613]1140 reg_errcode_t ret;
1141
[599]1142 /* Allocate arrays. */
1143 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
[3613]1144 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
[599]1145 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1146 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
[3613]1147 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
[599]1148 || dfa->edests == NULL || dfa->eclosures == NULL))
1149 return REG_ESPACE;
1150
1151 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1152 if (dfa->subexp_map != NULL)
1153 {
1154 Idx i;
1155 for (i = 0; i < preg->re_nsub; i++)
[3613]1156 dfa->subexp_map[i] = i;
[599]1157 preorder (dfa->str_tree, optimize_subexps, dfa);
1158 for (i = 0; i < preg->re_nsub; i++)
1159 if (dfa->subexp_map[i] != i)
1160 break;
1161 if (i == preg->re_nsub)
[3613]1162 {
[599]1163 re_free (dfa->subexp_map);
1164 dfa->subexp_map = NULL;
[3613]1165 }
[599]1166 }
1167
1168 ret = postorder (dfa->str_tree, lower_subexps, preg);
[3613]1169 if (__glibc_unlikely (ret != REG_NOERROR))
[599]1170 return ret;
1171 ret = postorder (dfa->str_tree, calc_first, dfa);
[3613]1172 if (__glibc_unlikely (ret != REG_NOERROR))
[599]1173 return ret;
1174 preorder (dfa->str_tree, calc_next, dfa);
1175 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1176 if (__glibc_unlikely (ret != REG_NOERROR))
1177 return ret;
1178 ret = calc_eclosure (dfa);
1179 if (__glibc_unlikely (ret != REG_NOERROR))
1180 return ret;
[3613]1181
1182 /* We only need this during the prune_impossible_nodes pass in regexec.c;
[599]1183 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1184 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1185 || dfa->nbackref)
1186 {
1187 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1188 if (__glibc_unlikely (dfa->inveclosures == NULL))
1189 return REG_ESPACE;
1190 ret = calc_inveclosure (dfa);
1191 }
1192
1193 return ret;
1194}
1195
1196/* Our parse trees are very unbalanced, so we cannot use a stack to
1197 implement parse tree visits. Instead, we use parent pointers and
1198 some hairy code in these two functions. */
1199static reg_errcode_t
1200postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1201 void *extra)
1202{
1203 bin_tree_t *node, *prev;
[3613]1204
1205 for (node = root; ; )
1206 {
[599]1207 /* Descend down the tree, preferably to the left (or to the right
1208 if that's the only child). */
1209 while (node->left || node->right)
1210 if (node->left)
[3613]1211 node = node->left;
[599]1212 else
[3613]1213 node = node->right;
[599]1214
1215 do
1216 {
1217 reg_errcode_t err = fn (extra, node);
1218 if (__glibc_unlikely (err != REG_NOERROR))
1219 return err;
1220 if (node->parent == NULL)
1221 return REG_NOERROR;
1222 prev = node;
1223 node = node->parent;
1224 }
1225 /* Go up while we have a node that is reached from the right. */
1226 while (node->right == prev || node->right == NULL);
1227 node = node->right;
1228 }
1229}
1230
1231static reg_errcode_t
1232preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
[3613]1233 void *extra)
[599]1234{
1235 bin_tree_t *node;
1236
1237 for (node = root; ; )
1238 {
1239 reg_errcode_t err = fn (extra, node);
1240 if (__glibc_unlikely (err != REG_NOERROR))
1241 return err;
1242
1243 /* Go to the left node, or up and to the right. */
1244 if (node->left)
1245 node = node->left;
1246 else
[3613]1247 {
[599]1248 bin_tree_t *prev = NULL;
1249 while (node->right == prev || node->right == NULL)
1250 {
1251 prev = node;
1252 node = node->parent;
1253 if (!node)
1254 return REG_NOERROR;
1255 }
1256 node = node->right;
1257 }
1258 }
1259}
1260
1261/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1262 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1263 backreferences as well. Requires a preorder visit. */
1264static reg_errcode_t
1265optimize_subexps (void *extra, bin_tree_t *node)
1266{
1267 re_dfa_t *dfa = (re_dfa_t *) extra;
1268
1269 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
[3613]1270 {
[599]1271 int idx = node->token.opr.idx;
[3613]1272 node->token.opr.idx = dfa->subexp_map[idx];
[599]1273 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1274 }
1275
[3613]1276 else if (node->token.type == SUBEXP
[599]1277 && node->left && node->left->token.type == SUBEXP)
1278 {
1279 Idx other_idx = node->left->token.opr.idx;
[3613]1280
[599]1281 node->left = node->left->left;
1282 if (node->left)
1283 node->left->parent = node;
1284
1285 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1286 if (other_idx < BITSET_WORD_BITS)
1287 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1288 }
1289
1290 return REG_NOERROR;
1291}
1292
1293/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1294 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1295static reg_errcode_t
1296lower_subexps (void *extra, bin_tree_t *node)
1297{
1298 regex_t *preg = (regex_t *) extra;
1299 reg_errcode_t err = REG_NOERROR;
1300
1301 if (node->left && node->left->token.type == SUBEXP)
1302 {
1303 node->left = lower_subexp (&err, preg, node->left);
1304 if (node->left)
1305 node->left->parent = node;
1306 }
1307 if (node->right && node->right->token.type == SUBEXP)
1308 {
1309 node->right = lower_subexp (&err, preg, node->right);
1310 if (node->right)
1311 node->right->parent = node;
1312 }
[3613]1313
[599]1314 return err;
1315}
1316
1317static bin_tree_t *
1318lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1319{
1320 re_dfa_t *dfa = preg->buffer;
1321 bin_tree_t *body = node->left;
1322 bin_tree_t *op, *cls, *tree1, *tree;
1323
1324 if (preg->no_sub
1325 /* We do not optimize empty subexpressions, because otherwise we may
1326 have bad CONCAT nodes with NULL children. This is obviously not
1327 very common, so we do not lose much. An example that triggers
1328 this case is the sed "script" /\(\)/x. */
1329 && node->left != NULL
1330 && (node->token.opr.idx >= BITSET_WORD_BITS
1331 || !(dfa->used_bkref_map
1332 & ((bitset_word_t) 1 << node->token.opr.idx))))
1333 return node->left;
[3613]1334
1335 /* Convert the SUBEXP node to the concatenation of an
[599]1336 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1337 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340 tree = create_tree (dfa, op, tree1, CONCAT);
1341 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1342 || op == NULL || cls == NULL))
1343 {
1344 *err = REG_ESPACE;
1345 return NULL;
1346 }
1347
1348 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1350 return tree;
1351}
1352
1353/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354 nodes. Requires a postorder visit. */
1355static reg_errcode_t
1356calc_first (void *extra, bin_tree_t *node)
1357{
1358 re_dfa_t *dfa = (re_dfa_t *) extra;
1359 if (node->token.type == CONCAT)
1360 {
[3613]1361 node->first = node->left->first;
1362 node->node_idx = node->left->node_idx;
1363 }
1364 else
[599]1365 {
1366 node->first = node;
1367 node->node_idx = re_dfa_add_node (dfa, node->token);
1368 if (__glibc_unlikely (node->node_idx == -1))
1369 return REG_ESPACE;
1370 if (node->token.type == ANCHOR)
1371 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1372 }
1373 return REG_NOERROR;
1374}
1375
1376/* Pass 2: compute NEXT on the tree. Preorder visit. */
1377static reg_errcode_t
1378calc_next (void *extra, bin_tree_t *node)
1379{
1380 switch (node->token.type)
1381 {
1382 case OP_DUP_ASTERISK:
1383 node->left->next = node;
1384 break;
1385 case CONCAT:
[3613]1386 node->left->next = node->right->first;
[599]1387 node->right->next = node->next;
1388 break;
1389 default:
1390 if (node->left)
1391 node->left->next = node->next;
1392 if (node->right)
1393 node->right->next = node->next;
1394 break;
1395 }
1396 return REG_NOERROR;
[3613]1397}
[599]1398
1399/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1400static reg_errcode_t
1401link_nfa_nodes (void *extra, bin_tree_t *node)
1402{
1403 re_dfa_t *dfa = (re_dfa_t *) extra;
1404 Idx idx = node->node_idx;
1405 reg_errcode_t err = REG_NOERROR;
[3613]1406
[599]1407 switch (node->token.type)
1408 {
1409 case CONCAT:
1410 break;
1411
[3613]1412 case END_OF_RE:
[599]1413 DEBUG_ASSERT (node->next == NULL);
1414 break;
1415
1416 case OP_DUP_ASTERISK:
1417 case OP_ALT:
1418 {
1419 Idx left, right;
1420 dfa->has_plural_match = 1;
1421 if (node->left != NULL)
[3613]1422 left = node->left->first->node_idx;
1423 else
[599]1424 left = node->next->node_idx;
1425 if (node->right != NULL)
1426 right = node->right->first->node_idx;
1427 else
1428 right = node->next->node_idx;
1429 DEBUG_ASSERT (left > -1);
1430 DEBUG_ASSERT (right > -1);
1431 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1432 }
1433 break;
1434
1435 case ANCHOR:
1436 case OP_OPEN_SUBEXP:
[3613]1437 case OP_CLOSE_SUBEXP:
[599]1438 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1439 break;
1440
[3613]1441 case OP_BACK_REF:
[599]1442 dfa->nexts[idx] = node->next->node_idx;
1443 if (node->token.type == OP_BACK_REF)
1444 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1445 break;
1446
1447 default:
1448 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1449 dfa->nexts[idx] = node->next->node_idx;
1450 break;
1451 }
1452
1453 return err;
[3613]1454}
1455
[599]1456/* Duplicate the epsilon closure of the node ROOT_NODE.
[3613]1457 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1458 to their own constraint. */
[599]1459
1460static reg_errcode_t
1461duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
[3613]1462 Idx root_node, unsigned int init_constraint)
[599]1463{
1464 Idx org_node, clone_node;
1465 bool ok;
1466 unsigned int constraint = init_constraint;
1467 for (org_node = top_org_node, clone_node = top_clone_node;;)
1468 {
1469 Idx org_dest, clone_dest;
1470 if (dfa->nodes[org_node].type == OP_BACK_REF)
1471 {
[3613]1472 /* If the back reference epsilon-transit, its destination must
[599]1473 also have the constraint. Then duplicate the epsilon closure
1474 of the destination of the back reference, and store it in
[3613]1475 edests of the back reference. */
1476 org_dest = dfa->nexts[org_node];
[599]1477 re_node_set_empty (dfa->edests + clone_node);
1478 clone_dest = duplicate_node (dfa, org_dest, constraint);
1479 if (__glibc_unlikely (clone_dest == -1))
1480 return REG_ESPACE;
1481 dfa->nexts[clone_node] = dfa->nexts[org_node];
1482 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1483 if (__glibc_unlikely (! ok))
1484 return REG_ESPACE;
1485 }
1486 else if (dfa->edests[org_node].nelem == 0)
1487 {
1488 /* In case of the node can't epsilon-transit, don't duplicate the
1489 destination and store the original destination as the
1490 destination of the node. */
1491 dfa->nexts[clone_node] = dfa->nexts[org_node];
1492 break;
[3613]1493 }
1494 else if (dfa->edests[org_node].nelem == 1)
1495 {
[599]1496 /* In case of the node can epsilon-transit, and it has only one
[3613]1497 destination. */
1498 org_dest = dfa->edests[org_node].elems[0];
1499 re_node_set_empty (dfa->edests + clone_node);
1500 /* If the node is root_node itself, it means the epsilon closure
[599]1501 has a loop. Then tie it to the destination of the root_node. */
[3613]1502 if (org_node == root_node && clone_node != org_node)
1503 {
[599]1504 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
[3613]1505 if (__glibc_unlikely (! ok))
[599]1506 return REG_ESPACE;
[3613]1507 break;
1508 }
[599]1509 /* In case the node has another constraint, append it. */
1510 constraint |= dfa->nodes[org_node].constraint;
1511 clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 if (__glibc_unlikely (clone_dest == -1))
1513 return REG_ESPACE;
1514 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1515 if (__glibc_unlikely (! ok))
1516 return REG_ESPACE;
1517 }
1518 else /* dfa->edests[org_node].nelem == 2 */
1519 {
1520 /* In case of the node can epsilon-transit, and it has two
[3613]1521 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
[599]1522 org_dest = dfa->edests[org_node].elems[0];
1523 re_node_set_empty (dfa->edests + clone_node);
[3613]1524 /* Search for a duplicated node which satisfies the constraint. */
[599]1525 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
[3613]1526 if (clone_dest == -1)
1527 {
[599]1528 /* There is no such duplicated node, create a new one. */
1529 reg_errcode_t err;
1530 clone_dest = duplicate_node (dfa, org_dest, constraint);
[3613]1531 if (__glibc_unlikely (clone_dest == -1))
[599]1532 return REG_ESPACE;
1533 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (__glibc_unlikely (! ok))
1535 return REG_ESPACE;
[3613]1536 err = duplicate_node_closure (dfa, org_dest, clone_dest,
[599]1537 root_node, constraint);
[3613]1538 if (__glibc_unlikely (err != REG_NOERROR))
1539 return err;
[599]1540 }
1541 else
1542 {
1543 /* There is a duplicated node which satisfies the constraint,
1544 use it to avoid infinite loop. */
[3613]1545 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
[599]1546 if (__glibc_unlikely (! ok))
[3613]1547 return REG_ESPACE;
1548 }
[599]1549
1550 org_dest = dfa->edests[org_node].elems[1];
1551 clone_dest = duplicate_node (dfa, org_dest, constraint);
1552 if (__glibc_unlikely (clone_dest == -1))
1553 return REG_ESPACE;
1554 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555 if (__glibc_unlikely (! ok))
1556 return REG_ESPACE;
1557 }
1558 org_node = org_dest;
1559 clone_node = clone_dest;
[3613]1560 }
1561 return REG_NOERROR;
[599]1562}
1563
[3613]1564/* Search for a node which is duplicated from the node ORG_NODE, and
[599]1565 satisfies the constraint CONSTRAINT. */
1566
1567static Idx
1568search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1569 unsigned int constraint)
1570{
1571 Idx idx;
1572 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1573 {
1574 if (org_node == dfa->org_indices[idx]
1575 && constraint == dfa->nodes[idx].constraint)
1576 return idx; /* Found. */
1577 }
[3613]1578 return -1; /* Not found. */
1579}
[599]1580
[3613]1581/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1582 Return the index of the new node, or -1 if insufficient storage is
[599]1583 available. */
1584
[3613]1585static Idx
[599]1586duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1587{
1588 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1589 if (__glibc_likely (dup_idx != -1))
1590 {
1591 dfa->nodes[dup_idx].constraint = constraint;
1592 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1593 dfa->nodes[dup_idx].duplicated = 1;
1594
1595 /* Store the index of the original node. */
1596 dfa->org_indices[dup_idx] = org_idx;
[3613]1597 }
1598 return dup_idx;
[599]1599}
1600
1601static reg_errcode_t
1602calc_inveclosure (re_dfa_t *dfa)
1603{
[3613]1604 Idx src, idx;
[599]1605 bool ok;
1606 for (idx = 0; idx < dfa->nodes_len; ++idx)
[3613]1607 re_node_set_init_empty (dfa->inveclosures + idx);
1608
[599]1609 for (src = 0; src < dfa->nodes_len; ++src)
1610 {
1611 Idx *elems = dfa->eclosures[src].elems;
1612 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1613 {
1614 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1615 if (__glibc_unlikely (! ok))
1616 return REG_ESPACE;
1617 }
1618 }
1619
1620 return REG_NOERROR;
[3613]1621}
1622
1623/* Calculate "eclosure" for all the node in DFA. */
1624
[599]1625static reg_errcode_t
1626calc_eclosure (re_dfa_t *dfa)
1627{
1628 Idx node_idx;
1629 bool incomplete;
1630 DEBUG_ASSERT (dfa->nodes_len > 0);
1631 incomplete = false;
1632 /* For each nodes, calculate epsilon closure. */
1633 for (node_idx = 0; ; ++node_idx)
[3613]1634 {
[599]1635 reg_errcode_t err;
1636 re_node_set eclosure_elem;
1637 if (node_idx == dfa->nodes_len)
[3613]1638 {
[599]1639 if (!incomplete)
1640 break;
1641 incomplete = false;
1642 node_idx = 0;
[3613]1643 }
1644
1645 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
[599]1646
1647 /* If we have already calculated, skip it. */
1648 if (dfa->eclosures[node_idx].nelem != 0)
1649 continue;
[3613]1650 /* Calculate epsilon closure of 'node_idx'. */
[599]1651 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1652 if (__glibc_unlikely (err != REG_NOERROR))
1653 return err;
1654
1655 if (dfa->eclosures[node_idx].nelem == 0)
1656 {
1657 incomplete = true;
1658 re_node_set_free (&eclosure_elem);
1659 }
[3613]1660 }
[599]1661 return REG_NOERROR;
1662}
[3613]1663
[599]1664/* Calculate epsilon closure of NODE. */
[3613]1665
[599]1666static reg_errcode_t
[3613]1667calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
[599]1668{
1669 reg_errcode_t err;
[3613]1670 Idx i;
1671 re_node_set eclosure;
1672 bool incomplete = false;
[599]1673 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1674 if (__glibc_unlikely (err != REG_NOERROR))
1675 return err;
1676
[3613]1677 /* An epsilon closure includes itself. */
1678 eclosure.elems[eclosure.nelem++] = node;
1679
[599]1680 /* This indicates that we are calculating this node now.
1681 We reference this value to avoid infinite loop. */
1682 dfa->eclosures[node].nelem = -1;
[3613]1683
1684 /* If the current node has constraints, duplicate all nodes
1685 since they must inherit the constraints. */
[599]1686 if (dfa->nodes[node].constraint
1687 && dfa->edests[node].nelem
1688 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1689 {
1690 err = duplicate_node_closure (dfa, node, node, node,
1691 dfa->nodes[node].constraint);
1692 if (__glibc_unlikely (err != REG_NOERROR))
1693 return err;
[3613]1694 }
1695
[599]1696 /* Expand each epsilon destination nodes. */
1697 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1698 for (i = 0; i < dfa->edests[node].nelem; ++i)
[3613]1699 {
[599]1700 re_node_set eclosure_elem;
1701 Idx edest = dfa->edests[node].elems[i];
[3613]1702 /* If calculating the epsilon closure of 'edest' is in progress,
[599]1703 return intermediate result. */
1704 if (dfa->eclosures[edest].nelem == -1)
1705 {
[3613]1706 incomplete = true;
1707 continue;
[599]1708 }
1709 /* If we haven't calculated the epsilon closure of 'edest' yet,
1710 calculate now. Otherwise use calculated epsilon closure. */
1711 if (dfa->eclosures[edest].nelem == 0)
[3613]1712 {
1713 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1714 if (__glibc_unlikely (err != REG_NOERROR))
1715 return err;
1716 }
[599]1717 else
1718 eclosure_elem = dfa->eclosures[edest];
1719 /* Merge the epsilon closure of 'edest'. */
[3613]1720 err = re_node_set_merge (&eclosure, &eclosure_elem);
[599]1721 if (__glibc_unlikely (err != REG_NOERROR))
1722 return err;
1723 /* If the epsilon closure of 'edest' is incomplete,
1724 the epsilon closure of this node is also incomplete. */
1725 if (dfa->eclosures[edest].nelem == 0)
1726 {
1727 incomplete = true;
1728 re_node_set_free (&eclosure_elem);
1729 }
1730 }
1731
1732 if (incomplete && !root)
1733 dfa->eclosures[node].nelem = 0;
1734 else
1735 dfa->eclosures[node] = eclosure;
1736 *new_set = eclosure;
1737 return REG_NOERROR;
1738}
1739
1740
1741/* Functions for token which are used in the parser. */
1742
1743/* Fetch a token from INPUT.
1744 We must not use this function inside bracket expressions. */
1745
1746static void
1747fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1748{
1749 re_string_skip_bytes (input, peek_token (result, input, syntax));
1750}
1751
1752/* Peek a token from INPUT, and return the length of the token.
1753 We must not use this function inside bracket expressions. */
1754
1755static int
1756peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1757{
1758 unsigned char c;
1759
1760 if (re_string_eoi (input))
1761 {
1762 token->type = END_OF_RE;
[3613]1763 return 0;
1764 }
[599]1765
1766 c = re_string_peek_byte (input, 0);
1767 token->opr.c = c;
1768
1769 token->word_char = 0;
1770 token->mb_partial = 0;
1771 if (input->mb_cur_max > 1
1772 && !re_string_first_byte (input, re_string_cur_idx (input)))
1773 {
1774 token->type = CHARACTER;
1775 token->mb_partial = 1;
1776 return 1;
1777 }
1778 if (c == '\\')
1779 {
1780 unsigned char c2;
1781 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1782 {
1783 token->type = BACK_SLASH;
1784 return 1;
1785 }
1786
1787 c2 = re_string_peek_byte_case (input, 1);
1788 token->opr.c = c2;
1789 token->type = CHARACTER;
1790 if (input->mb_cur_max > 1)
1791 {
1792 wint_t wc = re_string_wchar_at (input,
1793 re_string_cur_idx (input) + 1);
1794 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1795 }
1796 else
1797 token->word_char = IS_WORD_CHAR (c2) != 0;
1798
1799 switch (c2)
1800 {
1801 case '|':
1802 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1803 token->type = OP_ALT;
1804 break;
1805 case '1': case '2': case '3': case '4': case '5':
1806 case '6': case '7': case '8': case '9':
1807 if (!(syntax & RE_NO_BK_REFS))
1808 {
1809 token->type = OP_BACK_REF;
1810 token->opr.idx = c2 - '1';
1811 }
1812 break;
1813 case '<':
1814 if (!(syntax & RE_NO_GNU_OPS))
1815 {
1816 token->type = ANCHOR;
1817 token->opr.ctx_type = WORD_FIRST;
1818 }
1819 break;
1820 case '>':
1821 if (!(syntax & RE_NO_GNU_OPS))
1822 {
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = WORD_LAST;
1825 }
1826 break;
1827 case 'b':
1828 if (!(syntax & RE_NO_GNU_OPS))
1829 {
1830 token->type = ANCHOR;
1831 token->opr.ctx_type = WORD_DELIM;
1832 }
1833 break;
1834 case 'B':
1835 if (!(syntax & RE_NO_GNU_OPS))
1836 {
1837 token->type = ANCHOR;
1838 token->opr.ctx_type = NOT_WORD_DELIM;
1839 }
1840 break;
1841 case 'w':
1842 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = OP_WORD;
1844 break;
1845 case 'W':
1846 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = OP_NOTWORD;
1848 break;
1849 case 's':
1850 if (!(syntax & RE_NO_GNU_OPS))
1851 token->type = OP_SPACE;
1852 break;
1853 case 'S':
1854 if (!(syntax & RE_NO_GNU_OPS))
1855 token->type = OP_NOTSPACE;
1856 break;
1857 case '`':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 {
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = BUF_FIRST;
1862 }
1863 break;
1864 case '\'':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 {
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = BUF_LAST;
1869 }
1870 break;
1871 case '(':
1872 if (!(syntax & RE_NO_BK_PARENS))
1873 token->type = OP_OPEN_SUBEXP;
1874 break;
1875 case ')':
1876 if (!(syntax & RE_NO_BK_PARENS))
1877 token->type = OP_CLOSE_SUBEXP;
1878 break;
1879 case '+':
1880 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1881 token->type = OP_DUP_PLUS;
1882 break;
1883 case '?':
1884 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1885 token->type = OP_DUP_QUESTION;
1886 break;
1887 case '{':
1888 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1889 token->type = OP_OPEN_DUP_NUM;
1890 break;
1891 case '}':
1892 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1893 token->type = OP_CLOSE_DUP_NUM;
1894 break;
1895 default:
1896 break;
1897 }
1898 return 2;
1899 }
1900
1901 token->type = CHARACTER;
1902 if (input->mb_cur_max > 1)
1903 {
1904 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1905 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1906 }
1907 else
1908 token->word_char = IS_WORD_CHAR (token->opr.c);
1909
1910 switch (c)
1911 {
1912 case '\n':
1913 if (syntax & RE_NEWLINE_ALT)
1914 token->type = OP_ALT;
1915 break;
1916 case '|':
1917 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1918 token->type = OP_ALT;
1919 break;
1920 case '*':
1921 token->type = OP_DUP_ASTERISK;
1922 break;
1923 case '+':
1924 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1925 token->type = OP_DUP_PLUS;
1926 break;
1927 case '?':
1928 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1929 token->type = OP_DUP_QUESTION;
1930 break;
1931 case '{':
1932 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1933 token->type = OP_OPEN_DUP_NUM;
1934 break;
1935 case '}':
1936 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1937 token->type = OP_CLOSE_DUP_NUM;
1938 break;
1939 case '(':
1940 if (syntax & RE_NO_BK_PARENS)
1941 token->type = OP_OPEN_SUBEXP;
1942 break;
1943 case ')':
1944 if (syntax & RE_NO_BK_PARENS)
1945 token->type = OP_CLOSE_SUBEXP;
[3613]1946 break;
1947 case '[':
[599]1948 token->type = OP_OPEN_BRACKET;
1949 break;
1950 case '.':
1951 token->type = OP_PERIOD;
1952 break;
1953 case '^':
1954 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1955 && re_string_cur_idx (input) != 0)
1956 {
[3613]1957 char prev = re_string_peek_byte (input, -1);
1958 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
[599]1959 break;
1960 }
1961 token->type = ANCHOR;
1962 token->opr.ctx_type = LINE_FIRST;
1963 break;
1964 case '$':
1965 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1966 && re_string_cur_idx (input) + 1 != re_string_length (input))
1967 {
1968 re_token_t next;
1969 re_string_skip_bytes (input, 1);
1970 peek_token (&next, input, syntax);
1971 re_string_skip_bytes (input, -1);
1972 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1973 break;
1974 }
1975 token->type = ANCHOR;
1976 token->opr.ctx_type = LINE_LAST;
1977 break;
1978 default:
1979 break;
1980 }
1981 return 1;
1982}
1983
1984/* Peek a token from INPUT, and return the length of the token.
1985 We must not use this function out of bracket expressions. */
1986
1987static int
1988peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1989{
1990 unsigned char c;
[3613]1991 if (re_string_eoi (input))
1992 {
[599]1993 token->type = END_OF_RE;
1994 return 0;
1995 }
1996 c = re_string_peek_byte (input, 0);
1997 token->opr.c = c;
1998
1999 if (input->mb_cur_max > 1
2000 && !re_string_first_byte (input, re_string_cur_idx (input)))
2001 {
2002 token->type = CHARACTER;
2003 return 1;
2004 }
2005
2006 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2007 && re_string_cur_idx (input) + 1 < re_string_length (input))
2008 {
2009 /* In this case, '\' escape a character. */
2010 unsigned char c2;
2011 re_string_skip_bytes (input, 1);
2012 c2 = re_string_peek_byte (input, 0);
2013 token->opr.c = c2;
2014 token->type = CHARACTER;
2015 return 1;
2016 }
2017 if (c == '[') /* '[' is a special char in a bracket exps. */
2018 {
2019 unsigned char c2;
2020 int token_len;
2021 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2022 c2 = re_string_peek_byte (input, 1);
2023 else
[3613]2024 c2 = 0;
[599]2025 token->opr.c = c2;
2026 token_len = 2;
2027 switch (c2)
[3613]2028 {
[599]2029 case '.':
2030 token->type = OP_OPEN_COLL_ELEM;
2031 break;
2032
2033 case '=':
2034 token->type = OP_OPEN_EQUIV_CLASS;
[3613]2035 break;
[599]2036
2037 case ':':
2038 if (syntax & RE_CHAR_CLASSES)
2039 {
2040 token->type = OP_OPEN_CHAR_CLASS;
2041 break;
2042 }
2043 FALLTHROUGH;
2044 default:
2045 token->type = CHARACTER;
2046 token->opr.c = c;
2047 token_len = 1;
2048 break;
2049 }
2050 return token_len;
2051 }
[3613]2052 switch (c)
2053 {
2054 case ']':
2055 token->type = OP_CLOSE_BRACKET;
2056 break;
2057 case '^':
2058 token->type = OP_NON_MATCH_LIST;
2059 break;
2060 case '-':
2061 /* In V7 Unix grep and Unix awk and mawk, [...---...]
2062 (3 adjacent minus signs) stands for a single minus sign.
2063 Support that without breaking anything else. */
2064 if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
[599]2065 && re_string_peek_byte (input, 1) == '-'
2066 && re_string_peek_byte (input, 2) == '-'))
2067 {
2068 token->type = OP_CHARSET_RANGE;
2069 break;
2070 }
2071 re_string_skip_bytes (input, 2);
2072 FALLTHROUGH;
2073 default:
2074 token->type = CHARACTER;
[3613]2075 }
[599]2076 return 1;
2077}
2078
2079
2080/* Functions for parser. */
2081
2082/* Entry point of the parser.
2083 Parse the regular expression REGEXP and return the structure tree.
2084 If an error occurs, ERR is set by error code, and return NULL.
2085 This function build the following tree, from regular expression <reg_exp>:
2086 CAT
2087 / \
2088 / \
[3613]2089 <reg_exp> EOR
[599]2090
2091 CAT means concatenation.
2092 EOR means end of regular expression. */
2093
2094static bin_tree_t *
[3613]2095parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
[599]2096 reg_errcode_t *err)
2097{
2098 re_dfa_t *dfa = preg->buffer;
2099 bin_tree_t *tree, *eor, *root;
2100 re_token_t current_token;
2101 dfa->syntax = syntax;
[3613]2102 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
[599]2103 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2104 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2105 return NULL;
2106 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2107 if (tree != NULL)
2108 root = create_tree (dfa, tree, eor, CONCAT);
2109 else
2110 root = eor;
2111 if (__glibc_unlikely (eor == NULL || root == NULL))
2112 {
2113 *err = REG_ESPACE;
2114 return NULL;
2115 }
2116 return root;
[3613]2117}
[599]2118
2119/* This function build the following tree, from regular expression
2120 <branch1>|<branch2>:
[3613]2121 ALT
[599]2122 / \
[3613]2123 / \
[599]2124 <branch1> <branch2>
[3613]2125
[599]2126 ALT means alternative, which represents the operator '|'. */
[3613]2127
[599]2128static bin_tree_t *
2129parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2130 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2131{
2132 re_dfa_t *dfa = preg->buffer;
2133 bin_tree_t *tree, *branch = NULL;
2134 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2135 tree = parse_branch (regexp, preg, token, syntax, nest, err);
[3613]2136 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2137 return NULL;
[599]2138
[3613]2139 while (token->type == OP_ALT)
2140 {
2141 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2142 if (token->type != OP_ALT && token->type != END_OF_RE
2143 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2144 {
2145 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
[599]2146 dfa->completed_bkref_map = initial_bkref_map;
2147 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2148 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2149 {
[3613]2150 if (tree != NULL)
[599]2151 postorder (tree, free_tree, NULL);
2152 return NULL;
2153 }
2154 dfa->completed_bkref_map |= accumulated_bkref_map;
2155 }
2156 else
2157 branch = NULL;
2158 tree = create_tree (dfa, tree, branch, OP_ALT);
2159 if (__glibc_unlikely (tree == NULL))
2160 {
2161 *err = REG_ESPACE;
2162 return NULL;
2163 }
2164 }
2165 return tree;
2166}
2167
2168/* This function build the following tree, from regular expression
2169 <exp1><exp2>:
[3613]2170 CAT
[599]2171 / \
[3613]2172 / \
2173 <exp1> <exp2>
[599]2174
[3613]2175 CAT means concatenation. */
[599]2176
2177static bin_tree_t *
2178parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2179 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2180{
[3613]2181 bin_tree_t *tree, *expr;
2182 re_dfa_t *dfa = preg->buffer;
[599]2183 tree = parse_expression (regexp, preg, token, syntax, nest, err);
[3613]2184 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2185 return NULL;
[599]2186
2187 while (token->type != OP_ALT && token->type != END_OF_RE
[3613]2188 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
[599]2189 {
[3613]2190 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2191 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
[599]2192 {
[3613]2193 if (tree != NULL)
2194 postorder (tree, free_tree, NULL);
[599]2195 return NULL;
2196 }
2197 if (tree != NULL && expr != NULL)
[3613]2198 {
[599]2199 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2200 if (newtree == NULL)
[3613]2201 {
2202 postorder (expr, free_tree, NULL);
[599]2203 postorder (tree, free_tree, NULL);
2204 *err = REG_ESPACE;
2205 return NULL;
2206 }
2207 tree = newtree;
2208 }
2209 else if (tree == NULL)
2210 tree = expr;
2211 /* Otherwise expr == NULL, we don't need to create new tree. */
2212 }
2213 return tree;
2214}
[3613]2215
[599]2216/* This function build the following tree, from regular expression a*:
[3613]2217 *
[599]2218 |
2219 a
2220*/
2221
2222static bin_tree_t *
[3613]2223parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
[599]2224 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2225{
2226 re_dfa_t *dfa = preg->buffer;
2227 bin_tree_t *tree;
2228 switch (token->type)
2229 {
2230 case CHARACTER:
2231 tree = create_token_tree (dfa, NULL, NULL, token);
2232 if (__glibc_unlikely (tree == NULL))
2233 {
2234 *err = REG_ESPACE;
2235 return NULL;
2236 }
[3613]2237 if (dfa->mb_cur_max > 1)
[599]2238 {
2239 while (!re_string_eoi (regexp)
2240 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2241 {
2242 bin_tree_t *mbc_remain;
2243 fetch_token (token, regexp, syntax);
2244 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
[3613]2245 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
[599]2246 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2247 {
[3613]2248 *err = REG_ESPACE;
[599]2249 return NULL;
2250 }
[3613]2251 }
[599]2252 }
2253 break;
[3613]2254
[599]2255 case OP_OPEN_SUBEXP:
2256 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
[3613]2257 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
[599]2258 return NULL;
[3613]2259 break;
[599]2260
2261 case OP_OPEN_BRACKET:
2262 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2264 return NULL;
2265 break;
[3613]2266
[599]2267 case OP_BACK_REF:
2268 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2269 {
2270 *err = REG_ESUBREG;
2271 return NULL;
2272 }
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
[3613]2274 tree = create_token_tree (dfa, NULL, NULL, token);
[599]2275 if (__glibc_unlikely (tree == NULL))
2276 {
2277 *err = REG_ESPACE;
2278 return NULL;
2279 }
2280 ++dfa->nbackref;
[3613]2281 dfa->has_mb_node = 1;
[599]2282 break;
2283
2284 case OP_OPEN_DUP_NUM:
2285 if (syntax & RE_CONTEXT_INVALID_DUP)
2286 {
2287 *err = REG_BADRPT;
2288 return NULL;
2289 }
2290 FALLTHROUGH;
2291 case OP_DUP_ASTERISK:
2292 case OP_DUP_PLUS:
2293 case OP_DUP_QUESTION:
2294 if (syntax & RE_CONTEXT_INVALID_OPS)
[3613]2295 {
[599]2296 *err = REG_BADRPT;
[3613]2297 return NULL;
2298 }
[599]2299 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 {
2301 fetch_token (token, regexp, syntax);
2302 return parse_expression (regexp, preg, token, syntax, nest, err);
[3613]2303 }
[599]2304 FALLTHROUGH;
2305 case OP_CLOSE_SUBEXP:
2306 if ((token->type == OP_CLOSE_SUBEXP)
2307 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2308 {
2309 *err = REG_ERPAREN;
2310 return NULL;
2311 }
[3613]2312 FALLTHROUGH;
[599]2313 case OP_CLOSE_DUP_NUM:
2314 /* We treat it as a normal character. */
2315
2316 /* Then we can these characters as normal characters. */
2317 token->type = CHARACTER;
[3613]2318 /* mb_partial and word_char bits should be initialized already
[599]2319 by peek_token. */
2320 tree = create_token_tree (dfa, NULL, NULL, token);
2321 if (__glibc_unlikely (tree == NULL))
2322 {
2323 *err = REG_ESPACE;
2324 return NULL;
[3613]2325 }
[599]2326 break;
2327
2328 case ANCHOR:
2329 if ((token->opr.ctx_type
2330 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2331 && dfa->word_ops_used == 0)
2332 init_word_char (dfa);
[3613]2333 if (token->opr.ctx_type == WORD_DELIM
2334 || token->opr.ctx_type == NOT_WORD_DELIM)
2335 {
[599]2336 bin_tree_t *tree_first, *tree_last;
2337 if (token->opr.ctx_type == WORD_DELIM)
2338 {
[3613]2339 token->opr.ctx_type = WORD_FIRST;
[599]2340 tree_first = create_token_tree (dfa, NULL, NULL, token);
2341 token->opr.ctx_type = WORD_LAST;
[3613]2342 }
2343 else
[599]2344 {
2345 token->opr.ctx_type = INSIDE_WORD;
2346 tree_first = create_token_tree (dfa, NULL, NULL, token);
2347 token->opr.ctx_type = INSIDE_NOTWORD;
2348 }
2349 tree_last = create_token_tree (dfa, NULL, NULL, token);
2350 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2351 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
[3613]2352 || tree == NULL))
[599]2353 {
2354 *err = REG_ESPACE;
2355 return NULL;
2356 }
2357 }
2358 else
2359 {
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (__glibc_unlikely (tree == NULL))
2362 {
2363 *err = REG_ESPACE;
[3613]2364 return NULL;
[599]2365 }
2366 }
[3613]2367 /* We must return here, since ANCHORs can't be followed
[599]2368 by repetition operators.
2369 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2370 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2371 fetch_token (token, regexp, syntax);
2372 return tree;
2373
2374 case OP_PERIOD:
[3613]2375 tree = create_token_tree (dfa, NULL, NULL, token);
[599]2376 if (__glibc_unlikely (tree == NULL))
2377 {
2378 *err = REG_ESPACE;
[3613]2379 return NULL;
2380 }
[599]2381 if (dfa->mb_cur_max > 1)
[3613]2382 dfa->has_mb_node = 1;
[599]2383 break;
2384
[3613]2385 case OP_WORD:
[599]2386 case OP_NOTWORD:
2387 tree = build_charclass_op (dfa, regexp->trans,
2388 "alnum",
[3613]2389 "_",
2390 token->type == OP_NOTWORD, err);
[599]2391 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
[3613]2392 return NULL;
[599]2393 break;
2394
[3613]2395 case OP_SPACE:
[599]2396 case OP_NOTSPACE:
2397 tree = build_charclass_op (dfa, regexp->trans,
2398 "space",
[3613]2399 "",
[599]2400 token->type == OP_NOTSPACE, err);
2401 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2402 return NULL;
[3613]2403 break;
[599]2404
2405 case OP_ALT:
[3613]2406 case END_OF_RE:
[599]2407 return NULL;
2408
2409 case BACK_SLASH:
2410 *err = REG_EESCAPE;
2411 return NULL;
2412
2413 default:
[3613]2414 /* Must not happen? */
2415 DEBUG_ASSERT (false);
2416 return NULL;
2417 }
2418 fetch_token (token, regexp, syntax);
2419
2420 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2422 {
[599]2423 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2424 syntax, err);
2425 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2426 {
2427 if (tree != NULL)
[3613]2428 postorder (tree, free_tree, NULL);
2429 return NULL;
[599]2430 }
2431 tree = dup_tree;
2432 /* In BRE consecutive duplications are not allowed. */
2433 if ((syntax & RE_CONTEXT_INVALID_DUP)
2434 && (token->type == OP_DUP_ASTERISK
2435 || token->type == OP_OPEN_DUP_NUM))
2436 {
2437 if (tree != NULL)
2438 postorder (tree, free_tree, NULL);
2439 *err = REG_BADRPT;
2440 return NULL;
2441 }
2442 }
2443
2444 return tree;
2445}
2446
[3613]2447/* This function build the following tree, from regular expression
[599]2448 (<reg_exp>):
[3613]2449 SUBEXP
[599]2450 |
2451 <reg_exp>
2452*/
2453
2454static bin_tree_t *
2455parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2456 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2457{
2458 re_dfa_t *dfa = preg->buffer;
2459 bin_tree_t *tree;
2460 size_t cur_nsub;
2461 cur_nsub = preg->re_nsub++;
[3613]2462
2463 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2464
2465 /* The subexpression may be a null string. */
2466 if (token->type == OP_CLOSE_SUBEXP)
2467 tree = NULL;
2468 else
2469 {
[599]2470 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2471 if (__glibc_unlikely (*err == REG_NOERROR
2472 && token->type != OP_CLOSE_SUBEXP))
2473 {
2474 if (tree != NULL)
2475 postorder (tree, free_tree, NULL);
2476 *err = REG_EPAREN;
[3613]2477 }
[599]2478 if (__glibc_unlikely (*err != REG_NOERROR))
2479 return NULL;
2480 }
2481
2482 if (cur_nsub <= '9' - '1')
2483 dfa->completed_bkref_map |= 1 << cur_nsub;
2484
2485 tree = create_tree (dfa, tree, NULL, SUBEXP);
2486 if (__glibc_unlikely (tree == NULL))
2487 {
2488 *err = REG_ESPACE;
2489 return NULL;
2490 }
2491 tree->token.opr.idx = cur_nsub;
2492 return tree;
[3613]2493}
[599]2494
2495/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2496
2497static bin_tree_t *
2498parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2499 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2500{
2501 bin_tree_t *tree = NULL, *old_tree = NULL;
2502 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2503 re_token_t start_token = *token;
2504
2505 if (token->type == OP_OPEN_DUP_NUM)
2506 {
2507 end = 0;
2508 start = fetch_number (regexp, token, syntax);
2509 if (start == -1)
[3613]2510 {
[599]2511 if (token->type == CHARACTER && token->opr.c == ',')
2512 start = 0; /* We treat "{,m}" as "{0,m}". */
2513 else
2514 {
2515 *err = REG_BADBR; /* <re>{} is invalid. */
2516 return NULL;
[3613]2517 }
[599]2518 }
2519 if (__glibc_likely (start != -2))
[3613]2520 {
[599]2521 /* We treat "{n}" as "{n,n}". */
2522 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2523 : ((token->type == CHARACTER && token->opr.c == ',')
2524 ? fetch_number (regexp, token, syntax) : -2));
2525 }
2526 if (__glibc_unlikely (start == -2 || end == -2))
2527 {
2528 /* Invalid sequence. */
2529 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2530 {
2531 if (token->type == END_OF_RE)
2532 *err = REG_EBRACE;
2533 else
2534 *err = REG_BADBR;
2535
2536 return NULL;
2537 }
2538
[3613]2539 /* If the syntax bit is set, rollback. */
2540 re_string_set_index (regexp, start_idx);
[599]2541 *token = start_token;
2542 token->type = CHARACTER;
2543 /* mb_partial and word_char bits should be already initialized by
2544 peek_token. */
2545 return elem;
[3613]2546 }
2547
2548 if (__glibc_unlikely ((end != -1 && start > end)
2549 || token->type != OP_CLOSE_DUP_NUM))
2550 {
2551 /* First number greater than second. */
[599]2552 *err = REG_BADBR;
2553 return NULL;
2554 }
2555
2556 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2557 {
2558 *err = REG_ESIZE;
2559 return NULL;
2560 }
[3613]2561 }
[599]2562 else
[3613]2563 {
[599]2564 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2565 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2566 }
2567
2568 fetch_token (token, regexp, syntax);
2569
[3613]2570 if (__glibc_unlikely (elem == NULL))
[599]2571 return NULL;
2572 if (__glibc_unlikely (start == 0 && end == 0))
2573 {
2574 postorder (elem, free_tree, NULL);
2575 return NULL;
2576 }
[3613]2577
[599]2578 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2579 if (__glibc_unlikely (start > 0))
2580 {
2581 tree = elem;
2582 for (i = 2; i <= start; ++i)
2583 {
2584 elem = duplicate_tree (elem, dfa);
2585 tree = create_tree (dfa, tree, elem, CONCAT);
[3613]2586 if (__glibc_unlikely (elem == NULL || tree == NULL))
2587 goto parse_dup_op_espace;
[599]2588 }
2589
2590 if (start == end)
2591 return tree;
2592
2593 /* Duplicate ELEM before it is marked optional. */
[3613]2594 elem = duplicate_tree (elem, dfa);
2595 if (__glibc_unlikely (elem == NULL))
2596 goto parse_dup_op_espace;
2597 old_tree = tree;
[599]2598 }
[3613]2599 else
2600 old_tree = NULL;
2601
[599]2602 if (elem->token.type == SUBEXP)
2603 {
2604 uintptr_t subidx = elem->token.opr.idx;
2605 postorder (elem, mark_opt_subexp, (void *) subidx);
2606 }
[3613]2607
2608 tree = create_tree (dfa, elem, NULL,
2609 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2610 if (__glibc_unlikely (tree == NULL))
2611 goto parse_dup_op_espace;
2612
2613 /* This loop is actually executed only when end != -1,
[599]2614 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
[3613]2615 already created the start+1-th copy. */
2616 if (TYPE_SIGNED (Idx) || end != -1)
2617 for (i = start + 2; i <= end; ++i)
2618 {
[599]2619 elem = duplicate_tree (elem, dfa);
2620 tree = create_tree (dfa, tree, elem, CONCAT);
2621 if (__glibc_unlikely (elem == NULL || tree == NULL))
2622 goto parse_dup_op_espace;
2623
2624 tree = create_tree (dfa, tree, NULL, OP_ALT);
2625 if (__glibc_unlikely (tree == NULL))
2626 goto parse_dup_op_espace;
2627 }
2628
2629 if (old_tree)
2630 tree = create_tree (dfa, old_tree, tree, CONCAT);
2631
2632 return tree;
2633
2634 parse_dup_op_espace:
2635 *err = REG_ESPACE;
[3613]2636 return NULL;
2637}
2638
2639/* Size of the names for collating symbol/equivalence_class/character_class.
2640 I'm not sure, but maybe enough. */
2641#define BRACKET_NAME_BUF_SIZE 32
2642
2643#ifndef _LIBC
2644
2645/* Convert the byte B to the corresponding wide character. In a
2646 unibyte locale, treat B as itself. In a multibyte locale, return
2647 WEOF if B is an encoding error. */
2648static wint_t
2649parse_byte (unsigned char b, re_dfa_t const *dfa)
2650{
2651 return dfa->mb_cur_max > 1 ? __btowc (b) : b;
[599]2652}
[3613]2653
2654/* Local function for parse_bracket_exp used in _LIBC environment.
2655 Build the range expression which starts from START_ELEM, and ends
2656 at END_ELEM. The result are written to MBCSET and SBCSET.
2657 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2658 mbcset->range_ends, is a pointer argument since we may
[599]2659 update it. */
2660
[3613]2661static reg_errcode_t
2662build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2663 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2664 re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
[599]2665 const unsigned char *collseqmb, const char *collseqwc,
2666 int_fast32_t table_size, const void *symb_table,
2667 const unsigned char *extra)
2668{
[3613]2669 /* Equivalence Classes and Character Classes can't be a range start/end. */
2670 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2671 || start_elem->type == CHAR_CLASS
2672 || end_elem->type == EQUIV_CLASS
[599]2673 || end_elem->type == CHAR_CLASS))
2674 return REG_ERANGE;
[3613]2675
[599]2676 /* We can handle no multi character collating elements without libc
2677 support. */
[3613]2678 if (__glibc_unlikely ((start_elem->type == COLL_SYM
[599]2679 && strlen ((char *) start_elem->opr.name) > 1)
2680 || (end_elem->type == COLL_SYM
2681 && strlen ((char *) end_elem->opr.name) > 1)))
[3613]2682 return REG_ECOLLATE;
[599]2683
[3613]2684 unsigned int
[599]2685 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
[3613]2686 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
[599]2687 : 0)),
[3613]2688 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2689 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2690 : 0));
2691 wint_t
2692 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
[599]2693 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
[3613]2694 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2695 ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2696
2697 if (start_wc == WEOF || end_wc == WEOF)
2698 return REG_ECOLLATE;
2699 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2700 && start_wc > end_wc))
2701 return REG_ERANGE;
2702
2703 /* Got valid collation sequence values, add them as a new entry.
2704 However, for !_LIBC we have no collation elements: if the
2705 character set is single byte, the single byte character set
2706 that we build below suffices. parse_bracket_exp passes
[599]2707 no MBCSET if dfa->mb_cur_max == 1. */
[3613]2708 if (dfa->mb_cur_max > 1)
2709 {
2710 /* Check the space of the arrays. */
2711 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2712 {
2713 /* There is not enough space, need realloc. */
2714 wchar_t *new_array_start, *new_array_end;
2715 Idx new_nranges;
[599]2716
[3613]2717 /* +1 in case of mbcset->nranges is 0. */
2718 new_nranges = 2 * mbcset->nranges + 1;
2719 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2720 are NULL if *range_alloc == 0. */
2721 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2722 new_nranges);
2723 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
[599]2724 new_nranges);
[3613]2725
2726 if (__glibc_unlikely (new_array_start == NULL
2727 || new_array_end == NULL))
2728 {
[599]2729 re_free (new_array_start);
[3613]2730 re_free (new_array_end);
2731 return REG_ESPACE;
2732 }
2733
2734 mbcset->range_starts = new_array_start;
2735 mbcset->range_ends = new_array_end;
2736 *range_alloc = new_nranges;
2737 }
2738
2739 mbcset->range_starts[mbcset->nranges] = start_wc;
2740 mbcset->range_ends[mbcset->nranges++] = end_wc;
[599]2741 }
2742
2743 /* Build the table for single byte characters. */
2744 for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2745 {
[3613]2746 if (start_wc <= wc && wc <= end_wc)
[599]2747 bitset_set (sbcset, wc);
2748 }
2749
2750 return REG_NOERROR;
2751}
2752#endif /* not _LIBC */
2753
[3613]2754#ifndef _LIBC
2755/* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2756 Build the collating element which is represented by NAME.
[599]2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
[3613]2759 pointer argument since we may update it. */
[599]2760
2761static reg_errcode_t
2762build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2763 Idx *coll_sym_alloc, const unsigned char *name,
2764 uint_fast32_t nrules, int_fast32_t table_size,
2765 const void *symb_table, const unsigned char *extra)
2766{
2767 size_t name_len = strlen ((const char *) name);
2768 if (__glibc_unlikely (name_len != 1))
[3613]2769 return REG_ECOLLATE;
2770 else
2771 {
2772 bitset_set (sbcset, name[0]);
2773 return REG_NOERROR;
[599]2774 }
[3613]2775}
2776#endif /* not _LIBC */
2777
2778#ifdef _LIBC
2779/* Local function for parse_bracket_exp used in _LIBC environment.
[599]2780 Seek the collating symbol entry corresponding to NAME.
[3613]2781 Return the index of the symbol in the SYMB_TABLE,
[599]2782 or -1 if not found. */
[3613]2783
2784static __always_inline int32_t
2785seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2786 const int32_t *symb_table,
2787 int_fast32_t table_size,
2788 const unsigned char *extra)
2789{
2790 int_fast32_t elem;
2791
2792 for (elem = 0; elem < table_size; elem++)
2793 if (symb_table[2 * elem] != 0)
2794 {
2795 int32_t idx = symb_table[2 * elem + 1];
2796 /* Skip the name of collating element name. */
2797 idx += 1 + extra[idx];
[599]2798 if (/* Compare the length of the name. */
[3613]2799 name_len == extra[idx]
2800 /* Compare the name. */
2801 && memcmp (name, &extra[idx + 1], name_len) == 0)
2802 /* Yep, this is the entry. */
2803 return elem;
2804 }
2805 return -1;
2806}
2807
2808/* Local function for parse_bracket_exp used in _LIBC environment.
2809 Look up the collation sequence value of BR_ELEM.
2810 Return the value if succeeded, UINT_MAX otherwise. */
2811
[599]2812static __always_inline unsigned int
[3613]2813lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2814 const unsigned char *collseqmb,
2815 const char *collseqwc,
2816 int_fast32_t table_size,
[599]2817 const int32_t *symb_table,
[3613]2818 const unsigned char *extra)
2819{
[599]2820 if (br_elem->type == SB_CHAR)
2821 {
[3613]2822 /* if (MB_CUR_MAX == 1) */
[599]2823 if (nrules == 0)
[3613]2824 return collseqmb[br_elem->opr.ch];
2825 else
2826 {
2827 wint_t wc = __btowc (br_elem->opr.ch);
2828 return __collseq_table_lookup (collseqwc, wc);
2829 }
2830 }
[599]2831 else if (br_elem->type == MB_CHAR)
[3613]2832 {
2833 if (nrules != 0)
2834 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2835 }
2836 else if (br_elem->type == COLL_SYM)
2837 {
[599]2838 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
[3613]2839 if (nrules != 0)
2840 {
2841 int32_t elem, idx;
2842 elem = seek_collating_symbol_entry (br_elem->opr.name,
2843 sym_name_len,
2844 symb_table, table_size,
2845 extra);
2846 if (elem != -1)
2847 {
2848 /* We found the entry. */
2849 idx = symb_table[2 * elem + 1];
2850 /* Skip the name of collating element name. */
2851 idx += 1 + extra[idx];
2852 /* Skip the byte sequence of the collating element. */
2853 idx += 1 + extra[idx];
[599]2854 /* Adjust for the alignment. */
[3613]2855 idx = (idx + 3) & ~3;
[599]2856 /* Skip the multibyte collation sequence value. */
[3613]2857 idx += sizeof (unsigned int);
2858 /* Skip the wide char sequence of the collating element. */
2859 idx += sizeof (unsigned int) *
[599]2860 (1 + *(unsigned int *) (extra + idx));
2861 /* Return the collation sequence value. */
[3613]2862 return *(unsigned int *) (extra + idx);
2863 }
[599]2864 else if (sym_name_len == 1)
[3613]2865 {
2866 /* No valid character. Match it as a single byte
[599]2867 character. */
[3613]2868 return collseqmb[br_elem->opr.name[0]];
2869 }
2870 }
2871 else if (sym_name_len == 1)
2872 return collseqmb[br_elem->opr.name[0]];
2873 }
[599]2874 return UINT_MAX;
[3613]2875}
2876
2877/* Local function for parse_bracket_exp used in _LIBC environment.
2878 Build the range expression which starts from START_ELEM, and ends
2879 at END_ELEM. The result are written to MBCSET and SBCSET.
2880 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2881 mbcset->range_ends, is a pointer argument since we may
2882 update it. */
2883
2884static __always_inline reg_errcode_t
2885build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
[599]2886 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
[3613]2887 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2888 const unsigned char *collseqmb, const char *collseqwc,
2889 int_fast32_t table_size, const int32_t *symb_table,
2890 const unsigned char *extra)
2891{
2892 unsigned int ch;
2893 uint32_t start_collseq;
[599]2894 uint32_t end_collseq;
[3613]2895
2896 /* Equivalence Classes and Character Classes can't be a range
2897 start/end. */
2898 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2899 || start_elem->type == CHAR_CLASS
2900 || end_elem->type == EQUIV_CLASS
2901 || end_elem->type == CHAR_CLASS))
2902 return REG_ERANGE;
2903
2904 /* FIXME: Implement rational ranges here, too. */
2905 start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2906 table_size, symb_table, extra);
[599]2907 end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
[3613]2908 table_size, symb_table, extra);
2909 /* Check start/end collation sequence values. */
2910 if (__glibc_unlikely (start_collseq == UINT_MAX
2911 || end_collseq == UINT_MAX))
2912 return REG_ECOLLATE;
2913 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2914 && start_collseq > end_collseq))
2915 return REG_ERANGE;
[599]2916
[3613]2917 /* Got valid collation sequence values, add them as a new entry.
2918 However, if we have no collation elements, and the character set
2919 is single byte, the single byte character set that we
2920 build below suffices. */
[599]2921 if (nrules > 0 || dfa->mb_cur_max > 1)
[3613]2922 {
2923 /* Check the space of the arrays. */
2924 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2925 {
2926 /* There is not enough space, need realloc. */
2927 uint32_t *new_array_start;
[599]2928 uint32_t *new_array_end;
[3613]2929 int new_nranges;
2930
2931 /* +1 in case of mbcset->nranges is 0. */
[599]2932 new_nranges = 2 * mbcset->nranges + 1;
[3613]2933 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2934 new_nranges);
2935 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
[599]2936 new_nranges);
2937
[3613]2938 if (__glibc_unlikely (new_array_start == NULL
2939 || new_array_end == NULL))
[599]2940 return REG_ESPACE;
2941
[3613]2942 mbcset->range_starts = new_array_start;
2943 mbcset->range_ends = new_array_end;
2944 *range_alloc = new_nranges;
2945 }
2946
2947 mbcset->range_starts[mbcset->nranges] = start_collseq;
2948 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2949 }
2950
2951 /* Build the table for single byte characters. */
2952 for (ch = 0; ch < SBC_MAX; ch++)
2953 {
2954 uint32_t ch_collseq;
2955 /* if (MB_CUR_MAX == 1) */
[599]2956 if (nrules == 0)
[3613]2957 ch_collseq = collseqmb[ch];
2958 else
2959 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2960 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2961 bitset_set (sbcset, ch);
2962 }
2963 return REG_NOERROR;
2964}
2965
2966/* Local function for parse_bracket_exp used in _LIBC environment.
2967 Build the collating element which is represented by NAME.
2968 The result are written to MBCSET and SBCSET.
2969 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2970 pointer argument since we may update it. */
2971
[599]2972static __always_inline reg_errcode_t
[3613]2973build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2974 Idx *coll_sym_alloc, const unsigned char *name,
2975 uint_fast32_t nrules, int_fast32_t table_size,
[599]2976 const int32_t *symb_table, const unsigned char *extra)
[3613]2977{
2978 int32_t elem, idx;
2979 size_t name_len = strlen ((const char *) name);
2980 if (nrules != 0)
2981 {
2982 elem = seek_collating_symbol_entry (name, name_len, symb_table,
2983 table_size, extra);
2984 if (elem != -1)
2985 {
2986 /* We found the entry. */
[599]2987 idx = symb_table[2 * elem + 1];
2988 /* Skip the name of collating element name. */
2989 idx += 1 + extra[idx];
[3613]2990 }
2991 else if (name_len == 1)
2992 {
2993 /* No valid character, treat it as a normal
2994 character. */
[599]2995 bitset_set (sbcset, name[0]);
[3613]2996 return REG_NOERROR;
2997 }
2998 else
2999 return REG_ECOLLATE;
3000
3001 /* Got valid collation sequence, add it as a new entry. */
3002 /* Check the space of the arrays. */
3003 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3004 {
3005 /* Not enough, realloc it. */
3006 /* +1 in case of mbcset->ncoll_syms is 0. */
[599]3007 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
[3613]3008 /* Use realloc since mbcset->coll_syms is NULL
3009 if *alloc == 0. */
[599]3010 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
[3613]3011 new_coll_sym_alloc);
3012 if (__glibc_unlikely (new_coll_syms == NULL))
3013 return REG_ESPACE;
3014 mbcset->coll_syms = new_coll_syms;
3015 *coll_sym_alloc = new_coll_sym_alloc;
3016 }
3017 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3018 return REG_NOERROR;
3019 }
3020 else
3021 {
3022 if (__glibc_unlikely (name_len != 1))
[599]3023 return REG_ECOLLATE;
[3613]3024 else
3025 {
3026 bitset_set (sbcset, name[0]);
3027 return REG_NOERROR;
3028 }
3029 }
3030}
3031#endif /* _LIBC */
3032
3033/* This function parse bracket expression like "[abc]", "[a-c]",
3034 "[[.a-a.]]" etc. */
3035
3036static bin_tree_t *
3037parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
[599]3038 reg_syntax_t syntax, reg_errcode_t *err)
3039{
3040 const unsigned char *collseqmb = NULL;
[3613]3041 const char *collseqwc = NULL;
3042 uint_fast32_t nrules = 0;
3043 int_fast32_t table_size = 0;
[599]3044 const void *symb_table = NULL;
3045 const unsigned char *extra = NULL;
[3613]3046
[599]3047 re_token_t br_token;
3048 re_bitset_ptr_t sbcset;
3049 re_charset_t *mbcset;
3050 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3051 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3052 bool non_match = false;
3053 bin_tree_t *work_tree;
3054 int token_len;
3055 bool first_round = true;
3056#ifdef _LIBC
3057 collseqmb = (const unsigned char *)
[3613]3058 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
[599]3059 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3060 if (nrules)
3061 {
3062 /*
3063 if (MB_CUR_MAX > 1)
3064 */
[3613]3065 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
[599]3066 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
[3613]3067 symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3068 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
[599]3069 _NL_COLLATE_SYMB_EXTRAMB);
3070 }
3071#endif
3072 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3073 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
[3613]3074 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
[599]3075 {
3076 re_free (sbcset);
3077 re_free (mbcset);
3078 *err = REG_ESPACE;
3079 return NULL;
3080 }
3081
[3613]3082 token_len = peek_token_bracket (token, regexp, syntax);
[599]3083 if (__glibc_unlikely (token->type == END_OF_RE))
[3613]3084 {
[599]3085 *err = REG_BADPAT;
3086 goto parse_bracket_exp_free_return;
[3613]3087 }
[599]3088 if (token->type == OP_NON_MATCH_LIST)
3089 {
3090 mbcset->non_match = 1;
3091 non_match = true;
3092 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3093 bitset_set (sbcset, '\n');
3094 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3095 token_len = peek_token_bracket (token, regexp, syntax);
3096 if (__glibc_unlikely (token->type == END_OF_RE))
3097 {
3098 *err = REG_BADPAT;
3099 goto parse_bracket_exp_free_return;
3100 }
3101 }
3102
3103 /* We treat the first ']' as a normal character. */
[3613]3104 if (token->type == OP_CLOSE_BRACKET)
3105 token->type = CHARACTER;
[599]3106
3107 while (1)
3108 {
[3613]3109 bracket_elem_t start_elem, end_elem;
[599]3110 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3111 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
[3613]3112 reg_errcode_t ret;
[599]3113 int token_len2 = 0;
3114 bool is_range_exp = false;
3115 re_token_t token2;
3116
[3613]3117 start_elem.opr.name = start_name_buf;
[599]3118 start_elem.type = COLL_SYM;
3119 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3120 syntax, first_round);
3121 if (__glibc_unlikely (ret != REG_NOERROR))
3122 {
3123 *err = ret;
3124 goto parse_bracket_exp_free_return;
[3613]3125 }
[599]3126 first_round = false;
3127
3128 /* Get information about the next token. We need it in any case. */
3129 token_len = peek_token_bracket (token, regexp, syntax);
3130
3131 /* Do not check for ranges if we know they are not allowed. */
3132 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3133 {
[3613]3134 if (__glibc_unlikely (token->type == END_OF_RE))
[599]3135 {
3136 *err = REG_EBRACK;
3137 goto parse_bracket_exp_free_return;
3138 }
3139 if (token->type == OP_CHARSET_RANGE)
3140 {
3141 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3142 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3143 if (__glibc_unlikely (token2.type == END_OF_RE))
3144 {
3145 *err = REG_EBRACK;
[3613]3146 goto parse_bracket_exp_free_return;
[599]3147 }
3148 if (token2.type == OP_CLOSE_BRACKET)
3149 {
[3613]3150 /* We treat the last '-' as a normal character. */
[599]3151 re_string_skip_bytes (regexp, -token_len);
3152 token->type = CHARACTER;
[3613]3153 }
[599]3154 else
[3613]3155 is_range_exp = true;
3156 }
[599]3157 }
3158
3159 if (is_range_exp == true)
3160 {
3161 end_elem.opr.name = end_name_buf;
3162 end_elem.type = COLL_SYM;
3163 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3164 dfa, syntax, true);
[3613]3165 if (__glibc_unlikely (ret != REG_NOERROR))
3166 {
3167 *err = ret;
3168 goto parse_bracket_exp_free_return;
[599]3169 }
3170
3171 token_len = peek_token_bracket (token, regexp, syntax);
3172
3173 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3174 &start_elem, &end_elem,
3175 dfa, syntax, nrules, collseqmb, collseqwc,
3176 table_size, symb_table, extra);
3177 if (__glibc_unlikely (*err != REG_NOERROR))
3178 goto parse_bracket_exp_free_return;
3179 }
[3613]3180 else
[599]3181 {
3182 switch (start_elem.type)
3183 {
3184 case SB_CHAR:
3185 bitset_set (sbcset, start_elem.opr.ch);
3186 break;
3187 case MB_CHAR:
3188 /* Check whether the array has enough space. */
[3613]3189 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
[599]3190 {
3191 wchar_t *new_mbchars;
3192 /* Not enough, realloc it. */
3193 /* +1 in case of mbcset->nmbchars is 0. */
3194 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3195 /* Use realloc since array is NULL if *alloc == 0. */
3196 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3197 mbchar_alloc);
3198 if (__glibc_unlikely (new_mbchars == NULL))
[3613]3199 goto parse_bracket_exp_espace;
[599]3200 mbcset->mbchars = new_mbchars;
3201 }
3202 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3203 break;
3204 case EQUIV_CLASS:
[3613]3205 *err = build_equiv_class (sbcset,
3206 mbcset, &equiv_class_alloc,
3207 start_elem.opr.name);
[599]3208 if (__glibc_unlikely (*err != REG_NOERROR))
3209 goto parse_bracket_exp_free_return;
3210 break;
3211 case COLL_SYM:
3212 *err = build_collating_symbol (sbcset,
[3613]3213 mbcset, &coll_sym_alloc,
3214 start_elem.opr.name,
3215 nrules, table_size, symb_table, extra);
[599]3216 if (__glibc_unlikely (*err != REG_NOERROR))
3217 goto parse_bracket_exp_free_return;
3218 break;
[3613]3219 case CHAR_CLASS:
[599]3220 *err = build_charclass (regexp->trans, sbcset,
3221 mbcset, &char_class_alloc,
3222 (const char *) start_elem.opr.name,
[3613]3223 syntax);
[599]3224 if (__glibc_unlikely (*err != REG_NOERROR))
3225 goto parse_bracket_exp_free_return;
3226 break;
3227 default:
3228 DEBUG_ASSERT (false);
3229 break;
3230 }
3231 }
3232 if (__glibc_unlikely (token->type == END_OF_RE))
3233 {
3234 *err = REG_EBRACK;
3235 goto parse_bracket_exp_free_return;
3236 }
3237 if (token->type == OP_CLOSE_BRACKET)
3238 break;
3239 }
3240
3241 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3242
3243 /* If it is non-matching list. */
3244 if (non_match)
3245 bitset_not (sbcset);
3246
3247 /* Ensure only single byte characters are set. */
3248 if (dfa->mb_cur_max > 1)
3249 bitset_mask (sbcset, dfa->sb_char);
3250
3251 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3252 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
[3613]3253 || mbcset->non_match)))
[599]3254 {
3255 bin_tree_t *mbc_tree;
3256 int sbc_idx;
3257 /* Build a tree for complex bracket. */
3258 dfa->has_mb_node = 1;
3259 br_token.type = COMPLEX_BRACKET;
3260 br_token.opr.mbcset = mbcset;
3261 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
[3613]3262 if (__glibc_unlikely (mbc_tree == NULL))
3263 goto parse_bracket_exp_espace;
3264 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3265 if (sbcset[sbc_idx])
3266 break;
3267 /* If there are no bits set in sbcset, there is no point
[599]3268 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
[3613]3269 if (sbc_idx < BITSET_WORDS)
3270 {
3271 /* Build a tree for simple bracket. */
3272 br_token.type = SIMPLE_BRACKET;
[599]3273 br_token.opr.sbcset = sbcset;
3274 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3275 if (__glibc_unlikely (work_tree == NULL))
3276 goto parse_bracket_exp_espace;
3277
3278 /* Then join them by ALT node. */
3279 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3280 if (__glibc_unlikely (work_tree == NULL))
3281 goto parse_bracket_exp_espace;
3282 }
3283 else
3284 {
3285 re_free (sbcset);
3286 work_tree = mbc_tree;
[3613]3287 }
3288 }
[599]3289 else
3290 {
3291 free_charset (mbcset);
3292 /* Build a tree for simple bracket. */
3293 br_token.type = SIMPLE_BRACKET;
3294 br_token.opr.sbcset = sbcset;
3295 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (__glibc_unlikely (work_tree == NULL))
3297 goto parse_bracket_exp_espace;
3298 }
3299 return work_tree;
3300
3301 parse_bracket_exp_espace:
3302 *err = REG_ESPACE;
3303 parse_bracket_exp_free_return:
3304 re_free (sbcset);
[3613]3305 free_charset (mbcset);
[599]3306 return NULL;
3307}
3308
3309/* Parse an element in the bracket expression. */
3310
3311static reg_errcode_t
3312parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3313 re_token_t *token, int token_len, re_dfa_t *dfa,
3314 reg_syntax_t syntax, bool accept_hyphen)
3315{
3316 int cur_char_size;
3317 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3318 if (cur_char_size > 1)
3319 {
[3613]3320 elem->type = MB_CHAR;
[599]3321 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3322 re_string_skip_bytes (regexp, cur_char_size);
3323 return REG_NOERROR;
3324 }
3325 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3326 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3327 || token->type == OP_OPEN_EQUIV_CLASS)
3328 return parse_bracket_symbol (elem, regexp, token);
3329 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3330 {
3331 /* A '-' must only appear as anything but a range indicator before
3332 the closing bracket. Everything else is an error. */
3333 re_token_t token2;
3334 (void) peek_token_bracket (&token2, regexp, syntax);
3335 if (token2.type != OP_CLOSE_BRACKET)
3336 /* The actual error value is not standardized since this whole
3337 case is undefined. But ERANGE makes good sense. */
3338 return REG_ERANGE;
3339 }
3340 elem->type = SB_CHAR;
3341 elem->opr.ch = token->opr.c;
3342 return REG_NOERROR;
3343}
3344
3345/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3346 such as [:<character_class>:], [.<collating_element>.], and
3347 [=<equivalent_class>=]. */
3348
3349static reg_errcode_t
3350parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3351 re_token_t *token)
3352{
3353 unsigned char ch, delim = token->opr.c;
3354 int i = 0;
3355 if (re_string_eoi(regexp))
3356 return REG_EBRACK;
3357 for (;; ++i)
3358 {
3359 if (i >= BRACKET_NAME_BUF_SIZE)
3360 return REG_EBRACK;
3361 if (token->type == OP_OPEN_CHAR_CLASS)
3362 ch = re_string_fetch_byte_case (regexp);
3363 else
3364 ch = re_string_fetch_byte (regexp);
3365 if (re_string_eoi(regexp))
3366 return REG_EBRACK;
3367 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3368 break;
3369 elem->opr.name[i] = ch;
3370 }
3371 re_string_skip_bytes (regexp, 1);
3372 elem->opr.name[i] = '\0';
3373 switch (token->type)
3374 {
3375 case OP_OPEN_COLL_ELEM:
3376 elem->type = COLL_SYM;
3377 break;
3378 case OP_OPEN_EQUIV_CLASS:
3379 elem->type = EQUIV_CLASS;
3380 break;
3381 case OP_OPEN_CHAR_CLASS:
3382 elem->type = CHAR_CLASS;
3383 break;
3384 default:
[3613]3385 break;
[599]3386 }
3387 return REG_NOERROR;
3388}
[3613]3389
[599]3390 /* Helper function for parse_bracket_exp.
3391 Build the equivalence class which is represented by NAME.
3392 The result are written to MBCSET and SBCSET.
3393 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3394 is a pointer argument since we may update it. */
3395
3396static reg_errcode_t
3397build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3398 Idx *equiv_class_alloc, const unsigned char *name)
3399{
3400#ifdef _LIBC
3401 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3402 if (nrules != 0)
3403 {
3404 const int32_t *table, *indirect;
3405 const unsigned char *weights, *extra, *cp;
3406 unsigned char char_buf[2];
3407 int32_t idx1, idx2;
3408 unsigned int ch;
3409 size_t len;
[3613]3410 /* Calculate the index for equivalence class. */
3411 cp = name;
[599]3412 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3413 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3414 _NL_COLLATE_WEIGHTMB);
[3613]3415 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3416 _NL_COLLATE_EXTRAMB);
[599]3417 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3418 _NL_COLLATE_INDIRECTMB);
3419 idx1 = findidx (table, indirect, extra, &cp, -1);
3420 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
[3613]3421 /* This isn't a valid character. */
[599]3422 return REG_ECOLLATE;
3423
3424 /* Build single byte matching table for this equivalence class. */
3425 len = weights[idx1 & 0xffffff];
3426 for (ch = 0; ch < SBC_MAX; ++ch)
3427 {
[3613]3428 char_buf[0] = ch;
3429 cp = char_buf;
3430 idx2 = findidx (table, indirect, extra, &cp, 1);
3431/*
3432 idx2 = table[ch];
3433*/
[599]3434 if (idx2 == 0)
3435 /* This isn't a valid character. */
[3613]3436 continue;
[599]3437 /* Compare only if the length matches and the collation rule
3438 index is the same. */
3439 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
[3613]3440 && memcmp (weights + (idx1 & 0xffffff) + 1,
[599]3441 weights + (idx2 & 0xffffff) + 1, len) == 0)
3442 bitset_set (sbcset, ch);
3443 }
3444 /* Check whether the array has enough space. */
[3613]3445 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
[599]3446 {
3447 /* Not enough, realloc it. */
3448 /* +1 in case of mbcset->nequiv_classes is 0. */
3449 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3450 /* Use realloc since the array is NULL if *alloc == 0. */
3451 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3452 int32_t,
3453 new_equiv_class_alloc);
3454 if (__glibc_unlikely (new_equiv_classes == NULL))
[3613]3455 return REG_ESPACE;
[599]3456 mbcset->equiv_classes = new_equiv_classes;
3457 *equiv_class_alloc = new_equiv_class_alloc;
3458 }
3459 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3460 }
3461 else
3462#endif /* _LIBC */
3463 {
3464 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3465 return REG_ECOLLATE;
[3613]3466 bitset_set (sbcset, *name);
[599]3467 }
3468 return REG_NOERROR;
3469}
[3613]3470
3471 /* Helper function for parse_bracket_exp.
[599]3472 Build the character class which is represented by NAME.
3473 The result are written to MBCSET and SBCSET.
[3613]3474 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
[599]3475 is a pointer argument since we may update it. */
3476
3477static reg_errcode_t
3478build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3479 re_charset_t *mbcset, Idx *char_class_alloc,
3480 const char *class_name, reg_syntax_t syntax)
3481{
3482 int i;
[3613]3483 const char *name = class_name;
[599]3484
3485 /* In case of REG_ICASE "upper" and "lower" match the both of
3486 upper and lower cases. */
[3613]3487 if ((syntax & RE_ICASE)
[599]3488 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3489 name = "alpha";
3490
[3613]3491 /* Check the space of the arrays. */
[599]3492 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3493 {
3494 /* Not enough, realloc it. */
3495 /* +1 in case of mbcset->nchar_classes is 0. */
3496 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3497 /* Use realloc since array is NULL if *alloc == 0. */
3498 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3499 new_char_class_alloc);
[3613]3500 if (__glibc_unlikely (new_char_classes == NULL))
[599]3501 return REG_ESPACE;
3502 mbcset->char_classes = new_char_classes;
[3613]3503 *char_class_alloc = new_char_class_alloc;
[599]3504 }
3505 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3506
3507#define BUILD_CHARCLASS_LOOP(ctype_func) \
3508 do { \
[3613]3509 if (__glibc_unlikely (trans != NULL)) \
[599]3510 { \
3511 for (i = 0; i < SBC_MAX; ++i) \
3512 if (ctype_func (i)) \
3513 bitset_set (sbcset, trans[i]); \
3514 } \
3515 else \
3516 { \
3517 for (i = 0; i < SBC_MAX; ++i) \
3518 if (ctype_func (i)) \
3519 bitset_set (sbcset, i); \
3520 } \
3521 } while (0)
3522
3523 if (strcmp (name, "alnum") == 0)
3524 BUILD_CHARCLASS_LOOP (isalnum);
3525 else if (strcmp (name, "cntrl") == 0)
3526 BUILD_CHARCLASS_LOOP (iscntrl);
3527 else if (strcmp (name, "lower") == 0)
3528 BUILD_CHARCLASS_LOOP (islower);
3529 else if (strcmp (name, "space") == 0)
3530 BUILD_CHARCLASS_LOOP (isspace);
3531 else if (strcmp (name, "alpha") == 0)
3532 BUILD_CHARCLASS_LOOP (isalpha);
3533 else if (strcmp (name, "digit") == 0)
3534 BUILD_CHARCLASS_LOOP (isdigit);
3535 else if (strcmp (name, "print") == 0)
3536 BUILD_CHARCLASS_LOOP (isprint);
3537 else if (strcmp (name, "upper") == 0)
3538 BUILD_CHARCLASS_LOOP (isupper);
3539 else if (strcmp (name, "blank") == 0)
3540 BUILD_CHARCLASS_LOOP (isblank);
3541 else if (strcmp (name, "graph") == 0)
3542 BUILD_CHARCLASS_LOOP (isgraph);
3543 else if (strcmp (name, "punct") == 0)
3544 BUILD_CHARCLASS_LOOP (ispunct);
3545 else if (strcmp (name, "xdigit") == 0)
[3613]3546 BUILD_CHARCLASS_LOOP (isxdigit);
3547 else
[599]3548 return REG_ECTYPE;
3549
3550 return REG_NOERROR;
3551}
[3613]3552
[599]3553static bin_tree_t *
3554build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3555 const char *class_name,
3556 const char *extra, bool non_match,
[3613]3557 reg_errcode_t *err)
[599]3558{
3559 re_bitset_ptr_t sbcset;
3560 re_charset_t *mbcset;
3561 Idx alloc = 0;
[3613]3562 reg_errcode_t ret;
3563 bin_tree_t *tree;
[599]3564
[3613]3565 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3566 if (__glibc_unlikely (sbcset == NULL))
3567 {
[599]3568 *err = REG_ESPACE;
[3613]3569 return NULL;
[599]3570 }
3571 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
[3613]3572 if (__glibc_unlikely (mbcset == NULL))
[599]3573 {
[3613]3574 re_free (sbcset);
[599]3575 *err = REG_ESPACE;
3576 return NULL;
3577 }
3578 mbcset->non_match = non_match;
3579
3580 /* We don't care the syntax in this case. */
3581 ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3582
3583 if (__glibc_unlikely (ret != REG_NOERROR))
3584 {
3585 re_free (sbcset);
3586 free_charset (mbcset);
3587 *err = ret;
3588 return NULL;
3589 }
3590 /* \w match '_' also. */
3591 for (; *extra; extra++)
3592 bitset_set (sbcset, *extra);
3593
[3613]3594 /* If it is non-matching list. */
[599]3595 if (non_match)
[3613]3596 bitset_not (sbcset);
[599]3597
3598 /* Ensure only single byte characters are set. */
3599 if (dfa->mb_cur_max > 1)
3600 bitset_mask (sbcset, dfa->sb_char);
3601
3602 /* Build a tree for simple bracket. */
3603 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3604 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3605 if (__glibc_unlikely (tree == NULL))
3606 goto build_word_op_espace;
[3613]3607
[599]3608 if (dfa->mb_cur_max > 1)
3609 {
3610 bin_tree_t *mbc_tree;
[3613]3611 /* Build a tree for complex bracket. */
[599]3612 br_token.type = COMPLEX_BRACKET;
3613 br_token.opr.mbcset = mbcset;
3614 dfa->has_mb_node = 1;
3615 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3616 if (__glibc_unlikely (mbc_tree == NULL))
3617 goto build_word_op_espace;
3618 /* Then join them by ALT node. */
3619 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3620 if (__glibc_likely (mbc_tree != NULL))
3621 return tree;
3622 }
3623 else
3624 {
3625 free_charset (mbcset);
3626 return tree;
3627 }
[3613]3628
3629 build_word_op_espace:
3630 re_free (sbcset);
3631 free_charset (mbcset);
[599]3632 *err = REG_ESPACE;
[3613]3633 return NULL;
[599]3634}
3635
[3613]3636/* This is intended for the expressions like "a{1,3}".
[599]3637 Fetch a number from 'input', and return the number.
3638 Return -1 if the number field is empty like "{,1}".
3639 Return RE_DUP_MAX + 1 if the number field is too large.
3640 Return -2 if an error occurred. */
3641
[3613]3642static Idx
[599]3643fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3644{
3645 Idx num = -1;
3646 unsigned char c;
[3613]3647 while (1)
3648 {
3649 fetch_token (token, input, syntax);
3650 c = token->opr.c;
[599]3651 if (__glibc_unlikely (token->type == END_OF_RE))
3652 return -2;
3653 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3654 break;
3655 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3656 ? -2
3657 : num == -1
3658 ? c - '0'
[3613]3659 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
[599]3660 }
3661 return num;
[3613]3662}
[599]3663
3664
3665static void
3666free_charset (re_charset_t *cset)
3667{
3668 re_free (cset->mbchars);
3669#ifdef _LIBC
3670 re_free (cset->coll_syms);
3671 re_free (cset->equiv_classes);
3672#endif
3673 re_free (cset->range_starts);
3674 re_free (cset->range_ends);
3675 re_free (cset->char_classes);
3676 re_free (cset);
[3613]3677}
[599]3678
3679
3680/* Functions for binary tree operation. */
3681
3682/* Create a tree node. */
3683
3684static bin_tree_t *
3685create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
[3613]3686 re_token_type_t type)
[599]3687{
3688 re_token_t t = { .type = type };
3689 return create_token_tree (dfa, left, right, &t);
3690}
3691
3692static bin_tree_t *
3693create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3694 const re_token_t *token)
3695{
3696 bin_tree_t *tree;
3697 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3698 {
3699 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3700
3701 if (storage == NULL)
3702 return NULL;
3703 storage->next = dfa->str_tree_storage;
3704 dfa->str_tree_storage = storage;
3705 dfa->str_tree_storage_idx = 0;
3706 }
3707 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3708
3709 tree->parent = NULL;
3710 tree->left = left;
3711 tree->right = right;
3712 tree->token = *token;
3713 tree->token.duplicated = 0;
3714 tree->token.opt_subexp = 0;
3715 tree->first = NULL;
3716 tree->next = NULL;
3717 tree->node_idx = -1;
3718
3719 if (left != NULL)
3720 left->parent = tree;
[3613]3721 if (right != NULL)
[599]3722 right->parent = tree;
3723 return tree;
3724}
3725
3726/* Mark the tree SRC as an optional subexpression.
3727 To be called from preorder or postorder. */
3728
3729static reg_errcode_t
3730mark_opt_subexp (void *extra, bin_tree_t *node)
3731{
3732 Idx idx = (uintptr_t) extra;
3733 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3734 node->token.opt_subexp = 1;
[3613]3735
3736 return REG_NOERROR;
[599]3737}
3738
3739/* Free the allocated memory inside NODE. */
3740
3741static void
3742free_token (re_token_t *node)
3743{
3744 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3745 free_charset (node->opr.mbcset);
3746 else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3747 re_free (node->opr.sbcset);
3748}
3749
3750/* Worker function for tree walking. Free the allocated memory inside NODE
3751 and its children. */
3752
3753static reg_errcode_t
3754free_tree (void *extra, bin_tree_t *node)
3755{
3756 free_token (&node->token);
3757 return REG_NOERROR;
3758}
3759
3760
3761/* Duplicate the node SRC, and return new node. This is a preorder
3762 visit similar to the one implemented by the generic visitor, but
3763 we need more infrastructure to maintain two parallel trees --- so,
3764 it's easier to duplicate. */
3765
3766static bin_tree_t *
3767duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3768{
3769 const bin_tree_t *node;
3770 bin_tree_t *dup_root;
3771 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3772
3773 for (node = root; ; )
3774 {
3775 /* Create a new tree and link it back to the current parent. */
3776 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3777 if (*p_new == NULL)
3778 return NULL;
3779 (*p_new)->parent = dup_node;
3780 (*p_new)->token.duplicated = 1;
3781 dup_node = *p_new;
3782
3783 /* Go to the left node, or up and to the right. */
3784 if (node->left)
3785 {
3786 node = node->left;
[3613]3787 p_new = &dup_node->left;
[599]3788 }
3789 else
3790 {
3791 const bin_tree_t *prev = NULL;
3792 while (node->right == prev || node->right == NULL)
3793 {
3794 prev = node;
3795 node = node->parent;
3796 dup_node = dup_node->parent;
3797 if (!node)
3798 return dup_root;
3799 }
3800 node = node->right;
3801 p_new = &dup_node->right;
3802 }
3803 }
3804}
Note: See TracBrowser for help on using the repository browser.