source: trunk/src/sed/lib/regcomp.c@ 1810

Last change on this file since 1810 was 606, checked in by bird, 19 years ago

Made sed build using MSC.

File size: 107.6 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif /* RE_ENABLE_I18N */
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124
125/* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
129
130const char __re_error_msgid[] attribute_hidden =
131 {
132#define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
134 "\0"
135#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
137 "\0"
138#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 "\0"
141#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 "\0"
144#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 "\0"
147#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 "\0"
150#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 "\0"
153#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 "\0"
156#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 "\0"
159#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 "\0"
162#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 "\0"
165#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 "\0"
168#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 "\0"
171#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 "\0"
174#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 "\0"
177#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 "\0"
180#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 };
183
184const size_t __re_error_msgid_idx[] attribute_hidden =
185 {
186 REG_NOERROR_IDX,
187 REG_NOMATCH_IDX,
188 REG_BADPAT_IDX,
189 REG_ECOLLATE_IDX,
190 REG_ECTYPE_IDX,
191 REG_EESCAPE_IDX,
192 REG_ESUBREG_IDX,
193 REG_EBRACK_IDX,
194 REG_EPAREN_IDX,
195 REG_EBRACE_IDX,
196 REG_BADBR_IDX,
197 REG_ERANGE_IDX,
198 REG_ESPACE_IDX,
199 REG_BADRPT_IDX,
200 REG_EEND_IDX,
201 REG_ESIZE_IDX,
202 REG_ERPAREN_IDX
203 };
204
205
206/* Entry points for GNU code. */
207
208/* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
211
212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 are set in BUFP on entry. */
214
215const char *
216re_compile_pattern (pattern, length, bufp)
217 const char *pattern;
218 size_t length;
219 struct re_pattern_buffer *bufp;
220{
221 reg_errcode_t ret;
222
223 /* And GNU code determines whether or not to get register information
224 by passing null for the REGS argument to re_match, etc., not by
225 setting no_sub, unless RE_NO_SUB is set. */
226 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227
228 /* Match anchors at newline. */
229 bufp->newline_anchor = 1;
230
231 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232
233 if (!ret)
234 return NULL;
235 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236}
237#ifdef _LIBC
238weak_alias (__re_compile_pattern, re_compile_pattern)
239#endif
240
241/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
242 also be assigned to arbitrarily: each pattern buffer stores its own
243 syntax, so it can be changed between regex compilations. */
244/* This has no initializer because initialized variables in Emacs
245 become read-only after dumping. */
246reg_syntax_t re_syntax_options;
247
248
249/* Specify the precise syntax of regexps for compilation. This provides
250 for compatibility for various utilities which historically have
251 different, incompatible syntaxes.
252
253 The argument SYNTAX is a bit mask comprised of the various bits
254 defined in regex.h. We return the old syntax. */
255
256reg_syntax_t
257re_set_syntax (syntax)
258 reg_syntax_t syntax;
259{
260 reg_syntax_t ret = re_syntax_options;
261
262 re_syntax_options = syntax;
263 return ret;
264}
265#ifdef _LIBC
266weak_alias (__re_set_syntax, re_set_syntax)
267#endif
268
269int
270re_compile_fastmap (bufp)
271 struct re_pattern_buffer *bufp;
272{
273 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
274 char *fastmap = bufp->fastmap;
275
276 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
277 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
278 if (dfa->init_state != dfa->init_state_word)
279 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
280 if (dfa->init_state != dfa->init_state_nl)
281 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
282 if (dfa->init_state != dfa->init_state_begbuf)
283 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
284 bufp->fastmap_accurate = 1;
285 return 0;
286}
287#ifdef _LIBC
288weak_alias (__re_compile_fastmap, re_compile_fastmap)
289#endif
290
291static inline void
292__attribute ((always_inline))
293re_set_fastmap (char *fastmap, int icase, int ch)
294{
295 fastmap[ch] = 1;
296 if (icase)
297 fastmap[tolower (ch)] = 1;
298}
299
300/* Helper function for re_compile_fastmap.
301 Compile fastmap for the initial_state INIT_STATE. */
302
303static void
304re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
305 char *fastmap)
306{
307 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
308 int node_cnt;
309 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
310 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 {
312 int node = init_state->nodes.elems[node_cnt];
313 re_token_type_t type = dfa->nodes[node].type;
314
315 if (type == CHARACTER)
316 {
317 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
318#ifdef RE_ENABLE_I18N
319 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 {
321 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
322 wchar_t wc;
323 mbstate_t state;
324
325 p = buf;
326 *p++ = dfa->nodes[node].opr.c;
327 while (++node < dfa->nodes_len
328 && dfa->nodes[node].type == CHARACTER
329 && dfa->nodes[node].mb_partial)
330 *p++ = dfa->nodes[node].opr.c;
331 memset (&state, '\0', sizeof (state));
332 if (mbrtowc (&wc, (const char *) buf, p - buf,
333 &state) == p - buf
334 && (__wcrtomb ((char *) buf, towlower (wc), &state)
335 != (size_t) -1))
336 re_set_fastmap (fastmap, 0, buf[0]);
337 }
338#endif
339 }
340 else if (type == SIMPLE_BRACKET)
341 {
342 int i, ch;
343 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 {
345 int j;
346 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
347 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
348 if (w & ((bitset_word_t) 1 << j))
349 re_set_fastmap (fastmap, icase, ch);
350 }
351 }
352#ifdef RE_ENABLE_I18N
353 else if (type == COMPLEX_BRACKET)
354 {
355 int i;
356 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
357 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
358 || cset->nranges || cset->nchar_classes)
359 {
360# ifdef _LIBC
361 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
362 {
363 /* In this case we want to catch the bytes which are
364 the first byte of any collation elements.
365 e.g. In da_DK, we want to catch 'a' since "aa"
366 is a valid collation element, and don't catch
367 'b' since 'b' is the only collation element
368 which starts from 'b'. */
369 const int32_t *table = (const int32_t *)
370 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
371 for (i = 0; i < SBC_MAX; ++i)
372 if (table[i] < 0)
373 re_set_fastmap (fastmap, icase, i);
374 }
375# else
376 if (dfa->mb_cur_max > 1)
377 for (i = 0; i < SBC_MAX; ++i)
378 if (__btowc (i) == WEOF)
379 re_set_fastmap (fastmap, icase, i);
380# endif /* not _LIBC */
381 }
382 for (i = 0; i < cset->nmbchars; ++i)
383 {
384 char buf[256];
385 mbstate_t state;
386 memset (&state, '\0', sizeof (state));
387 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
388 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
389 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
390 {
391 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
392 != (size_t) -1)
393 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
394 }
395 }
396 }
397#endif /* RE_ENABLE_I18N */
398 else if (type == OP_PERIOD
399#ifdef RE_ENABLE_I18N
400 || type == OP_UTF8_PERIOD
401#endif /* RE_ENABLE_I18N */
402 || type == END_OF_RE)
403 {
404 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
405 if (type == END_OF_RE)
406 bufp->can_be_null = 1;
407 return;
408 }
409 }
410}
411
412
413/* Entry point for POSIX code. */
414/* regcomp takes a regular expression as a string and compiles it.
415
416 PREG is a regex_t *. We do not expect any fields to be initialized,
417 since POSIX says we shouldn't. Thus, we set
418
419 `buffer' to the compiled pattern;
420 `used' to the length of the compiled pattern;
421 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
422 REG_EXTENDED bit in CFLAGS is set; otherwise, to
423 RE_SYNTAX_POSIX_BASIC;
424 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
425 `fastmap' to an allocated space for the fastmap;
426 `fastmap_accurate' to zero;
427 `re_nsub' to the number of subexpressions in PATTERN.
428
429 PATTERN is the address of the pattern string.
430
431 CFLAGS is a series of bits which affect compilation.
432
433 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
434 use POSIX basic syntax.
435
436 If REG_NEWLINE is set, then . and [^...] don't match newline.
437 Also, regexec will try a match beginning after every newline.
438
439 If REG_ICASE is set, then we considers upper- and lowercase
440 versions of letters to be equivalent when matching.
441
442 If REG_NOSUB is set, then when PREG is passed to regexec, that
443 routine will report only success or failure, and nothing about the
444 registers.
445
446 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
447 the return codes and their meanings.) */
448
449int
450regcomp (preg, pattern, cflags)
451 regex_t *__restrict preg;
452 const char *__restrict pattern;
453 int cflags;
454{
455 reg_errcode_t ret;
456 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
457 : RE_SYNTAX_POSIX_BASIC);
458
459 preg->buffer = NULL;
460 preg->allocated = 0;
461 preg->used = 0;
462
463 /* Try to allocate space for the fastmap. */
464 preg->fastmap = re_malloc (char, SBC_MAX);
465 if (BE (preg->fastmap == NULL, 0))
466 return REG_ESPACE;
467
468 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
469
470 /* If REG_NEWLINE is set, newlines are treated differently. */
471 if (cflags & REG_NEWLINE)
472 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
473 syntax &= ~RE_DOT_NEWLINE;
474 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
475 /* It also changes the matching behavior. */
476 preg->newline_anchor = 1;
477 }
478 else
479 preg->newline_anchor = 0;
480 preg->no_sub = !!(cflags & REG_NOSUB);
481 preg->translate = NULL;
482
483 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
484
485 /* POSIX doesn't distinguish between an unmatched open-group and an
486 unmatched close-group: both are REG_EPAREN. */
487 if (ret == REG_ERPAREN)
488 ret = REG_EPAREN;
489
490 /* We have already checked preg->fastmap != NULL. */
491 if (BE (ret == REG_NOERROR, 1))
492 /* Compute the fastmap now, since regexec cannot modify the pattern
493 buffer. This function never fails in this implementation. */
494 (void) re_compile_fastmap (preg);
495 else
496 {
497 /* Some error occurred while compiling the expression. */
498 re_free (preg->fastmap);
499 preg->fastmap = NULL;
500 }
501
502 return (int) ret;
503}
504#ifdef _LIBC
505weak_alias (__regcomp, regcomp)
506#endif
507
508/* Returns a message corresponding to an error code, ERRCODE, returned
509 from either regcomp or regexec. We don't use PREG here. */
510
511size_t
512regerror (errcode, preg, errbuf, errbuf_size)
513 int errcode;
514 const regex_t *__restrict preg;
515 char *__restrict errbuf;
516 size_t errbuf_size;
517{
518 const char *msg;
519 size_t msg_size;
520
521 if (BE (errcode < 0
522 || errcode >= (int) (sizeof (__re_error_msgid_idx)
523 / sizeof (__re_error_msgid_idx[0])), 0))
524 /* Only error codes returned by the rest of the code should be passed
525 to this routine. If we are given anything else, or if other regex
526 code generates an invalid error code, then the program has a bug.
527 Dump core so we can fix it. */
528 abort ();
529
530 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
531
532 msg_size = strlen (msg) + 1; /* Includes the null. */
533
534 if (BE (errbuf_size != 0, 1))
535 {
536 if (BE (msg_size > errbuf_size, 0))
537 {
538#if defined HAVE_MEMPCPY || defined _LIBC
539 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
540#else
541 memcpy (errbuf, msg, errbuf_size - 1);
542 errbuf[errbuf_size - 1] = 0;
543#endif
544 }
545 else
546 memcpy (errbuf, msg, msg_size);
547 }
548
549 return msg_size;
550}
551#ifdef _LIBC
552weak_alias (__regerror, regerror)
553#endif
554
555
556#ifdef RE_ENABLE_I18N
557/* This static array is used for the map to single-byte characters when
558 UTF-8 is used. Otherwise we would allocate memory just to initialize
559 it the same all the time. UTF-8 is the preferred encoding so this is
560 a worthwhile optimization. */
561static const bitset_t utf8_sb_map =
562{
563 /* Set the first 128 bits. */
564#ifdef _MSC_VER
565 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 0, 0, 0, 0
566#else
567 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
568#endif
569};
570#endif
571
572
573static void
574free_dfa_content (re_dfa_t *dfa)
575{
576 int i, j;
577
578 if (dfa->nodes)
579 for (i = 0; i < dfa->nodes_len; ++i)
580 free_token (dfa->nodes + i);
581 re_free (dfa->nexts);
582 for (i = 0; i < dfa->nodes_len; ++i)
583 {
584 if (dfa->eclosures != NULL)
585 re_node_set_free (dfa->eclosures + i);
586 if (dfa->inveclosures != NULL)
587 re_node_set_free (dfa->inveclosures + i);
588 if (dfa->edests != NULL)
589 re_node_set_free (dfa->edests + i);
590 }
591 re_free (dfa->edests);
592 re_free (dfa->eclosures);
593 re_free (dfa->inveclosures);
594 re_free (dfa->nodes);
595
596 if (dfa->state_table)
597 for (i = 0; i <= dfa->state_hash_mask; ++i)
598 {
599 struct re_state_table_entry *entry = dfa->state_table + i;
600 for (j = 0; j < entry->num; ++j)
601 {
602 re_dfastate_t *state = entry->array[j];
603 free_state (state);
604 }
605 re_free (entry->array);
606 }
607 re_free (dfa->state_table);
608#ifdef RE_ENABLE_I18N
609 if (dfa->sb_char != utf8_sb_map)
610 re_free (dfa->sb_char);
611#endif
612 re_free (dfa->subexp_map);
613#ifdef DEBUG
614 re_free (dfa->re_str);
615#endif
616
617 re_free (dfa);
618}
619
620
621/* Free dynamically allocated space used by PREG. */
622
623void
624regfree (preg)
625 regex_t *preg;
626{
627 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
628 if (BE (dfa != NULL, 1))
629 free_dfa_content (dfa);
630 preg->buffer = NULL;
631 preg->allocated = 0;
632
633 re_free (preg->fastmap);
634 preg->fastmap = NULL;
635
636 re_free (preg->translate);
637 preg->translate = NULL;
638}
639#ifdef _LIBC
640weak_alias (__regfree, regfree)
641#endif
642
643
644/* Entry points compatible with 4.2 BSD regex library. We don't define
645 them unless specifically requested. */
646
647#if defined _REGEX_RE_COMP || defined _LIBC
648
649/* BSD has one and only one pattern buffer. */
650static struct re_pattern_buffer re_comp_buf;
651
652char *
653# ifdef _LIBC
654/* Make these definitions weak in libc, so POSIX programs can redefine
655 these names if they don't use our functions, and still use
656 regcomp/regexec above without link errors. */
657weak_function
658# endif
659re_comp (s)
660 const char *s;
661{
662 reg_errcode_t ret;
663 char *fastmap;
664
665 if (!s)
666 {
667 if (!re_comp_buf.buffer)
668 return gettext ("No previous regular expression");
669 return 0;
670 }
671
672 if (re_comp_buf.buffer)
673 {
674 fastmap = re_comp_buf.fastmap;
675 re_comp_buf.fastmap = NULL;
676 __regfree (&re_comp_buf);
677 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
678 re_comp_buf.fastmap = fastmap;
679 }
680
681 if (re_comp_buf.fastmap == NULL)
682 {
683 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
684 if (re_comp_buf.fastmap == NULL)
685 return (char *) gettext (__re_error_msgid
686 + __re_error_msgid_idx[(int) REG_ESPACE]);
687 }
688
689 /* Since `re_exec' always passes NULL for the `regs' argument, we
690 don't need to initialize the pattern buffer fields which affect it. */
691
692 /* Match anchors at newlines. */
693 re_comp_buf.newline_anchor = 1;
694
695 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
696
697 if (!ret)
698 return NULL;
699
700 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
701 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
702}
703
704#ifdef _LIBC
705libc_freeres_fn (free_mem)
706{
707 __regfree (&re_comp_buf);
708}
709#endif
710
711#endif /* _REGEX_RE_COMP */
712
713
714/* Internal entry point.
715 Compile the regular expression PATTERN, whose length is LENGTH.
716 SYNTAX indicate regular expression's syntax. */
717
718static reg_errcode_t
719re_compile_internal (regex_t *preg, const char * pattern, size_t length,
720 reg_syntax_t syntax)
721{
722 reg_errcode_t err = REG_NOERROR;
723 re_dfa_t *dfa;
724 re_string_t regexp;
725
726 /* Initialize the pattern buffer. */
727 preg->fastmap_accurate = 0;
728 preg->syntax = syntax;
729 preg->not_bol = preg->not_eol = 0;
730 preg->used = 0;
731 preg->re_nsub = 0;
732 preg->can_be_null = 0;
733 preg->regs_allocated = REGS_UNALLOCATED;
734
735 /* Initialize the dfa. */
736 dfa = (re_dfa_t *) preg->buffer;
737 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
738 {
739 /* If zero allocated, but buffer is non-null, try to realloc
740 enough space. This loses if buffer's address is bogus, but
741 that is the user's responsibility. If ->buffer is NULL this
742 is a simple allocation. */
743 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
744 if (dfa == NULL)
745 return REG_ESPACE;
746 preg->allocated = sizeof (re_dfa_t);
747 preg->buffer = (unsigned char *) dfa;
748 }
749 preg->used = sizeof (re_dfa_t);
750
751 err = init_dfa (dfa, length);
752 if (BE (err != REG_NOERROR, 0))
753 {
754 free_dfa_content (dfa);
755 preg->buffer = NULL;
756 preg->allocated = 0;
757 return err;
758 }
759#ifdef DEBUG
760 /* Note: length+1 will not overflow since it is checked in init_dfa. */
761 dfa->re_str = re_malloc (char, length + 1);
762 strncpy (dfa->re_str, pattern, length + 1);
763#endif
764
765 __libc_lock_init (dfa->lock);
766
767 err = re_string_construct (&regexp, pattern, length, preg->translate,
768 syntax & RE_ICASE, dfa);
769 if (BE (err != REG_NOERROR, 0))
770 {
771 re_compile_internal_free_return:
772 free_workarea_compile (preg);
773 re_string_destruct (&regexp);
774 free_dfa_content (dfa);
775 preg->buffer = NULL;
776 preg->allocated = 0;
777 return err;
778 }
779
780 /* Parse the regular expression, and build a structure tree. */
781 preg->re_nsub = 0;
782 dfa->str_tree = parse (&regexp, preg, syntax, &err);
783 if (BE (dfa->str_tree == NULL, 0))
784 goto re_compile_internal_free_return;
785
786 /* Analyze the tree and create the nfa. */
787 err = analyze (preg);
788 if (BE (err != REG_NOERROR, 0))
789 goto re_compile_internal_free_return;
790
791#ifdef RE_ENABLE_I18N
792 /* If possible, do searching in single byte encoding to speed things up. */
793 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
794 optimize_utf8 (dfa);
795#endif
796
797 /* Then create the initial state of the dfa. */
798 err = create_initial_state (dfa);
799
800 /* Release work areas. */
801 free_workarea_compile (preg);
802 re_string_destruct (&regexp);
803
804 if (BE (err != REG_NOERROR, 0))
805 {
806 free_dfa_content (dfa);
807 preg->buffer = NULL;
808 preg->allocated = 0;
809 }
810
811 return err;
812}
813
814/* Initialize DFA. We use the length of the regular expression PAT_LEN
815 as the initial length of some arrays. */
816
817static reg_errcode_t
818init_dfa (re_dfa_t *dfa, size_t pat_len)
819{
820 unsigned int table_size;
821#ifndef _LIBC
822 char *codeset_name;
823#endif
824
825 memset (dfa, '\0', sizeof (re_dfa_t));
826
827 /* Force allocation of str_tree_storage the first time. */
828 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
829
830 /* Avoid overflows. */
831 if (pat_len == SIZE_MAX)
832 return REG_ESPACE;
833
834 dfa->nodes_alloc = pat_len + 1;
835 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
836
837 /* table_size = 2 ^ ceil(log pat_len) */
838 for (table_size = 1; ; table_size <<= 1)
839 if (table_size > pat_len)
840 break;
841
842 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
843 dfa->state_hash_mask = table_size - 1;
844
845 dfa->mb_cur_max = MB_CUR_MAX;
846#ifdef _LIBC
847 if (dfa->mb_cur_max == 6
848 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
849 dfa->is_utf8 = 1;
850 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
851 != 0);
852#else
853# ifdef HAVE_LANGINFO_CODESET
854 codeset_name = nl_langinfo (CODESET);
855# else
856 codeset_name = getenv ("LC_ALL");
857 if (codeset_name == NULL || codeset_name[0] == '\0')
858 codeset_name = getenv ("LC_CTYPE");
859 if (codeset_name == NULL || codeset_name[0] == '\0')
860 codeset_name = getenv ("LANG");
861 if (codeset_name == NULL)
862 codeset_name = "";
863 else if (strchr (codeset_name, '.') != NULL)
864 codeset_name = strchr (codeset_name, '.') + 1;
865# endif
866
867 if (strcasecmp (codeset_name, "UTF-8") == 0
868 || strcasecmp (codeset_name, "UTF8") == 0)
869 dfa->is_utf8 = 1;
870
871 /* We check exhaustively in the loop below if this charset is a
872 superset of ASCII. */
873 dfa->map_notascii = 0;
874#endif
875
876#ifdef RE_ENABLE_I18N
877 if (dfa->mb_cur_max > 1)
878 {
879 if (dfa->is_utf8)
880 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
881 else
882 {
883 int i, j, ch;
884
885 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
886 if (BE (dfa->sb_char == NULL, 0))
887 return REG_ESPACE;
888
889 /* Set the bits corresponding to single byte chars. */
890 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
891 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
892 {
893 wint_t wch = __btowc (ch);
894 if (wch != WEOF)
895 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
896# ifndef _LIBC
897 if (isascii (ch) && wch != ch)
898 dfa->map_notascii = 1;
899# endif
900 }
901 }
902 }
903#endif
904
905 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
906 return REG_ESPACE;
907 return REG_NOERROR;
908}
909
910/* Initialize WORD_CHAR table, which indicate which character is
911 "word". In this case "word" means that it is the word construction
912 character used by some operators like "\<", "\>", etc. */
913
914static void
915internal_function
916init_word_char (re_dfa_t *dfa)
917{
918 int i, j, ch;
919 dfa->word_ops_used = 1;
920 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
921 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
922 if (isalnum (ch) || ch == '_')
923 dfa->word_char[i] |= (bitset_word_t) 1 << j;
924}
925
926/* Free the work area which are only used while compiling. */
927
928static void
929free_workarea_compile (regex_t *preg)
930{
931 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
932 bin_tree_storage_t *storage, *next;
933 for (storage = dfa->str_tree_storage; storage; storage = next)
934 {
935 next = storage->next;
936 re_free (storage);
937 }
938 dfa->str_tree_storage = NULL;
939 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
940 dfa->str_tree = NULL;
941 re_free (dfa->org_indices);
942 dfa->org_indices = NULL;
943}
944
945/* Create initial states for all contexts. */
946
947static reg_errcode_t
948create_initial_state (re_dfa_t *dfa)
949{
950 int first, i;
951 reg_errcode_t err;
952 re_node_set init_nodes;
953
954 /* Initial states have the epsilon closure of the node which is
955 the first node of the regular expression. */
956 first = dfa->str_tree->first->node_idx;
957 dfa->init_node = first;
958 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
959 if (BE (err != REG_NOERROR, 0))
960 return err;
961
962 /* The back-references which are in initial states can epsilon transit,
963 since in this case all of the subexpressions can be null.
964 Then we add epsilon closures of the nodes which are the next nodes of
965 the back-references. */
966 if (dfa->nbackref > 0)
967 for (i = 0; i < init_nodes.nelem; ++i)
968 {
969 int node_idx = init_nodes.elems[i];
970 re_token_type_t type = dfa->nodes[node_idx].type;
971
972 int clexp_idx;
973 if (type != OP_BACK_REF)
974 continue;
975 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
976 {
977 re_token_t *clexp_node;
978 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
979 if (clexp_node->type == OP_CLOSE_SUBEXP
980 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
981 break;
982 }
983 if (clexp_idx == init_nodes.nelem)
984 continue;
985
986 if (type == OP_BACK_REF)
987 {
988 int dest_idx = dfa->edests[node_idx].elems[0];
989 if (!re_node_set_contains (&init_nodes, dest_idx))
990 {
991 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
992 i = 0;
993 }
994 }
995 }
996
997 /* It must be the first time to invoke acquire_state. */
998 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
999 /* We don't check ERR here, since the initial state must not be NULL. */
1000 if (BE (dfa->init_state == NULL, 0))
1001 return err;
1002 if (dfa->init_state->has_constraint)
1003 {
1004 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1005 CONTEXT_WORD);
1006 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1007 CONTEXT_NEWLINE);
1008 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1009 &init_nodes,
1010 CONTEXT_NEWLINE
1011 | CONTEXT_BEGBUF);
1012 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1013 || dfa->init_state_begbuf == NULL, 0))
1014 return err;
1015 }
1016 else
1017 dfa->init_state_word = dfa->init_state_nl
1018 = dfa->init_state_begbuf = dfa->init_state;
1019
1020 re_node_set_free (&init_nodes);
1021 return REG_NOERROR;
1022}
1023
1024
1025#ifdef RE_ENABLE_I18N
1026/* If it is possible to do searching in single byte encoding instead of UTF-8
1027 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1028 DFA nodes where needed. */
1029
1030static void
1031optimize_utf8 (re_dfa_t *dfa)
1032{
1033 int node, i, mb_chars = 0, has_period = 0;
1034
1035 for (node = 0; node < dfa->nodes_len; ++node)
1036 switch (dfa->nodes[node].type)
1037 {
1038 case CHARACTER:
1039 if (dfa->nodes[node].opr.c >= 0x80)
1040 mb_chars = 1;
1041 break;
1042 case ANCHOR:
1043 switch (dfa->nodes[node].opr.idx)
1044 {
1045 case LINE_FIRST:
1046 case LINE_LAST:
1047 case BUF_FIRST:
1048 case BUF_LAST:
1049 break;
1050 default:
1051 /* Word anchors etc. cannot be handled. */
1052 return;
1053 }
1054 break;
1055 case OP_PERIOD:
1056 has_period = 1;
1057 break;
1058 case OP_BACK_REF:
1059 case OP_ALT:
1060 case END_OF_RE:
1061 case OP_DUP_ASTERISK:
1062 case OP_OPEN_SUBEXP:
1063 case OP_CLOSE_SUBEXP:
1064 break;
1065 case COMPLEX_BRACKET:
1066 return;
1067 case SIMPLE_BRACKET:
1068 /* Just double check. The non-ASCII range starts at 0x80. */
1069 assert (0x80 % BITSET_WORD_BITS == 0);
1070 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1071 if (dfa->nodes[node].opr.sbcset[i])
1072 return;
1073 break;
1074 default:
1075 abort ();
1076 }
1077
1078 if (mb_chars || has_period)
1079 for (node = 0; node < dfa->nodes_len; ++node)
1080 {
1081 if (dfa->nodes[node].type == CHARACTER
1082 && dfa->nodes[node].opr.c >= 0x80)
1083 dfa->nodes[node].mb_partial = 0;
1084 else if (dfa->nodes[node].type == OP_PERIOD)
1085 dfa->nodes[node].type = OP_UTF8_PERIOD;
1086 }
1087
1088 /* The search can be in single byte locale. */
1089 dfa->mb_cur_max = 1;
1090 dfa->is_utf8 = 0;
1091 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1092}
1093#endif
1094
1095
1096/* Analyze the structure tree, and calculate "first", "next", "edest",
1097 "eclosure", and "inveclosure". */
1098
1099static reg_errcode_t
1100analyze (regex_t *preg)
1101{
1102 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1103 reg_errcode_t ret;
1104
1105 /* Allocate arrays. */
1106 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1107 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1108 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1109 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1110 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1111 || dfa->eclosures == NULL, 0))
1112 return REG_ESPACE;
1113
1114 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1115 if (dfa->subexp_map != NULL)
1116 {
1117 int i;
1118 for (i = 0; i < preg->re_nsub; i++)
1119 dfa->subexp_map[i] = i;
1120 preorder (dfa->str_tree, optimize_subexps, dfa);
1121 for (i = 0; i < preg->re_nsub; i++)
1122 if (dfa->subexp_map[i] != i)
1123 break;
1124 if (i == preg->re_nsub)
1125 {
1126 free (dfa->subexp_map);
1127 dfa->subexp_map = NULL;
1128 }
1129 }
1130
1131 ret = postorder (dfa->str_tree, lower_subexps, preg);
1132 if (BE (ret != REG_NOERROR, 0))
1133 return ret;
1134 ret = postorder (dfa->str_tree, calc_first, dfa);
1135 if (BE (ret != REG_NOERROR, 0))
1136 return ret;
1137 preorder (dfa->str_tree, calc_next, dfa);
1138 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1139 if (BE (ret != REG_NOERROR, 0))
1140 return ret;
1141 ret = calc_eclosure (dfa);
1142 if (BE (ret != REG_NOERROR, 0))
1143 return ret;
1144
1145 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1146 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1147 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1148 || dfa->nbackref)
1149 {
1150 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1151 if (BE (dfa->inveclosures == NULL, 0))
1152 return REG_ESPACE;
1153 ret = calc_inveclosure (dfa);
1154 }
1155
1156 return ret;
1157}
1158
1159/* Our parse trees are very unbalanced, so we cannot use a stack to
1160 implement parse tree visits. Instead, we use parent pointers and
1161 some hairy code in these two functions. */
1162static reg_errcode_t
1163postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1164 void *extra)
1165{
1166 bin_tree_t *node, *prev;
1167
1168 for (node = root; ; )
1169 {
1170 /* Descend down the tree, preferably to the left (or to the right
1171 if that's the only child). */
1172 while (node->left || node->right)
1173 if (node->left)
1174 node = node->left;
1175 else
1176 node = node->right;
1177
1178 do
1179 {
1180 reg_errcode_t err = fn (extra, node);
1181 if (BE (err != REG_NOERROR, 0))
1182 return err;
1183 if (node->parent == NULL)
1184 return REG_NOERROR;
1185 prev = node;
1186 node = node->parent;
1187 }
1188 /* Go up while we have a node that is reached from the right. */
1189 while (node->right == prev || node->right == NULL);
1190 node = node->right;
1191 }
1192}
1193
1194static reg_errcode_t
1195preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1196 void *extra)
1197{
1198 bin_tree_t *node;
1199
1200 for (node = root; ; )
1201 {
1202 reg_errcode_t err = fn (extra, node);
1203 if (BE (err != REG_NOERROR, 0))
1204 return err;
1205
1206 /* Go to the left node, or up and to the right. */
1207 if (node->left)
1208 node = node->left;
1209 else
1210 {
1211 bin_tree_t *prev = NULL;
1212 while (node->right == prev || node->right == NULL)
1213 {
1214 prev = node;
1215 node = node->parent;
1216 if (!node)
1217 return REG_NOERROR;
1218 }
1219 node = node->right;
1220 }
1221 }
1222}
1223
1224/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1225 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1226 backreferences as well. Requires a preorder visit. */
1227static reg_errcode_t
1228optimize_subexps (void *extra, bin_tree_t *node)
1229{
1230 re_dfa_t *dfa = (re_dfa_t *) extra;
1231
1232 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1233 {
1234 int idx = node->token.opr.idx;
1235 node->token.opr.idx = dfa->subexp_map[idx];
1236 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1237 }
1238
1239 else if (node->token.type == SUBEXP
1240 && node->left && node->left->token.type == SUBEXP)
1241 {
1242 int other_idx = node->left->token.opr.idx;
1243
1244 node->left = node->left->left;
1245 if (node->left)
1246 node->left->parent = node;
1247
1248 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1249 if (other_idx < BITSET_WORD_BITS)
1250 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1251 }
1252
1253 return REG_NOERROR;
1254}
1255
1256/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1257 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1258static reg_errcode_t
1259lower_subexps (void *extra, bin_tree_t *node)
1260{
1261 regex_t *preg = (regex_t *) extra;
1262 reg_errcode_t err = REG_NOERROR;
1263
1264 if (node->left && node->left->token.type == SUBEXP)
1265 {
1266 node->left = lower_subexp (&err, preg, node->left);
1267 if (node->left)
1268 node->left->parent = node;
1269 }
1270 if (node->right && node->right->token.type == SUBEXP)
1271 {
1272 node->right = lower_subexp (&err, preg, node->right);
1273 if (node->right)
1274 node->right->parent = node;
1275 }
1276
1277 return err;
1278}
1279
1280static bin_tree_t *
1281lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1282{
1283 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1284 bin_tree_t *body = node->left;
1285 bin_tree_t *op, *cls, *tree1, *tree;
1286
1287 if (preg->no_sub
1288 /* We do not optimize empty subexpressions, because otherwise we may
1289 have bad CONCAT nodes with NULL children. This is obviously not
1290 very common, so we do not lose much. An example that triggers
1291 this case is the sed "script" /\(\)/x. */
1292 && node->left != NULL
1293 && (node->token.opr.idx >= BITSET_WORD_BITS
1294 || !(dfa->used_bkref_map
1295 & ((bitset_word_t) 1 << node->token.opr.idx))))
1296 return node->left;
1297
1298 /* Convert the SUBEXP node to the concatenation of an
1299 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1300 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1301 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1302 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1303 tree = create_tree (dfa, op, tree1, CONCAT);
1304 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1305 {
1306 *err = REG_ESPACE;
1307 return NULL;
1308 }
1309
1310 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1311 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1312 return tree;
1313}
1314
1315/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1316 nodes. Requires a postorder visit. */
1317static reg_errcode_t
1318calc_first (void *extra, bin_tree_t *node)
1319{
1320 re_dfa_t *dfa = (re_dfa_t *) extra;
1321 if (node->token.type == CONCAT)
1322 {
1323 node->first = node->left->first;
1324 node->node_idx = node->left->node_idx;
1325 }
1326 else
1327 {
1328 node->first = node;
1329 node->node_idx = re_dfa_add_node (dfa, node->token);
1330 if (BE (node->node_idx == -1, 0))
1331 return REG_ESPACE;
1332 }
1333 return REG_NOERROR;
1334}
1335
1336/* Pass 2: compute NEXT on the tree. Preorder visit. */
1337static reg_errcode_t
1338calc_next (void *extra, bin_tree_t *node)
1339{
1340 switch (node->token.type)
1341 {
1342 case OP_DUP_ASTERISK:
1343 node->left->next = node;
1344 break;
1345 case CONCAT:
1346 node->left->next = node->right->first;
1347 node->right->next = node->next;
1348 break;
1349 default:
1350 if (node->left)
1351 node->left->next = node->next;
1352 if (node->right)
1353 node->right->next = node->next;
1354 break;
1355 }
1356 return REG_NOERROR;
1357}
1358
1359/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1360static reg_errcode_t
1361link_nfa_nodes (void *extra, bin_tree_t *node)
1362{
1363 re_dfa_t *dfa = (re_dfa_t *) extra;
1364 int idx = node->node_idx;
1365 reg_errcode_t err = REG_NOERROR;
1366
1367 switch (node->token.type)
1368 {
1369 case CONCAT:
1370 break;
1371
1372 case END_OF_RE:
1373 assert (node->next == NULL);
1374 break;
1375
1376 case OP_DUP_ASTERISK:
1377 case OP_ALT:
1378 {
1379 int left, right;
1380 dfa->has_plural_match = 1;
1381 if (node->left != NULL)
1382 left = node->left->first->node_idx;
1383 else
1384 left = node->next->node_idx;
1385 if (node->right != NULL)
1386 right = node->right->first->node_idx;
1387 else
1388 right = node->next->node_idx;
1389 assert (left > -1);
1390 assert (right > -1);
1391 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1392 }
1393 break;
1394
1395 case ANCHOR:
1396 case OP_OPEN_SUBEXP:
1397 case OP_CLOSE_SUBEXP:
1398 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1399 break;
1400
1401 case OP_BACK_REF:
1402 dfa->nexts[idx] = node->next->node_idx;
1403 if (node->token.type == OP_BACK_REF)
1404 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1405 break;
1406
1407 default:
1408 assert (!IS_EPSILON_NODE (node->token.type));
1409 dfa->nexts[idx] = node->next->node_idx;
1410 break;
1411 }
1412
1413 return err;
1414}
1415
1416/* Duplicate the epsilon closure of the node ROOT_NODE.
1417 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1418 to their own constraint. */
1419
1420static reg_errcode_t
1421internal_function
1422duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1423 int root_node, unsigned int init_constraint)
1424{
1425 int org_node, clone_node, ret;
1426 unsigned int constraint = init_constraint;
1427 for (org_node = top_org_node, clone_node = top_clone_node;;)
1428 {
1429 int org_dest, clone_dest;
1430 if (dfa->nodes[org_node].type == OP_BACK_REF)
1431 {
1432 /* If the back reference epsilon-transit, its destination must
1433 also have the constraint. Then duplicate the epsilon closure
1434 of the destination of the back reference, and store it in
1435 edests of the back reference. */
1436 org_dest = dfa->nexts[org_node];
1437 re_node_set_empty (dfa->edests + clone_node);
1438 clone_dest = duplicate_node (dfa, org_dest, constraint);
1439 if (BE (clone_dest == -1, 0))
1440 return REG_ESPACE;
1441 dfa->nexts[clone_node] = dfa->nexts[org_node];
1442 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1443 if (BE (ret < 0, 0))
1444 return REG_ESPACE;
1445 }
1446 else if (dfa->edests[org_node].nelem == 0)
1447 {
1448 /* In case of the node can't epsilon-transit, don't duplicate the
1449 destination and store the original destination as the
1450 destination of the node. */
1451 dfa->nexts[clone_node] = dfa->nexts[org_node];
1452 break;
1453 }
1454 else if (dfa->edests[org_node].nelem == 1)
1455 {
1456 /* In case of the node can epsilon-transit, and it has only one
1457 destination. */
1458 org_dest = dfa->edests[org_node].elems[0];
1459 re_node_set_empty (dfa->edests + clone_node);
1460 if (dfa->nodes[org_node].type == ANCHOR)
1461 {
1462 /* In case of the node has another constraint, append it. */
1463 if (org_node == root_node && clone_node != org_node)
1464 {
1465 /* ...but if the node is root_node itself, it means the
1466 epsilon closure have a loop, then tie it to the
1467 destination of the root_node. */
1468 ret = re_node_set_insert (dfa->edests + clone_node,
1469 org_dest);
1470 if (BE (ret < 0, 0))
1471 return REG_ESPACE;
1472 break;
1473 }
1474 constraint |= dfa->nodes[org_node].opr.ctx_type;
1475 }
1476 clone_dest = duplicate_node (dfa, org_dest, constraint);
1477 if (BE (clone_dest == -1, 0))
1478 return REG_ESPACE;
1479 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1480 if (BE (ret < 0, 0))
1481 return REG_ESPACE;
1482 }
1483 else /* dfa->edests[org_node].nelem == 2 */
1484 {
1485 /* In case of the node can epsilon-transit, and it has two
1486 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1487 org_dest = dfa->edests[org_node].elems[0];
1488 re_node_set_empty (dfa->edests + clone_node);
1489 /* Search for a duplicated node which satisfies the constraint. */
1490 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1491 if (clone_dest == -1)
1492 {
1493 /* There are no such a duplicated node, create a new one. */
1494 reg_errcode_t err;
1495 clone_dest = duplicate_node (dfa, org_dest, constraint);
1496 if (BE (clone_dest == -1, 0))
1497 return REG_ESPACE;
1498 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1499 if (BE (ret < 0, 0))
1500 return REG_ESPACE;
1501 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1502 root_node, constraint);
1503 if (BE (err != REG_NOERROR, 0))
1504 return err;
1505 }
1506 else
1507 {
1508 /* There are a duplicated node which satisfy the constraint,
1509 use it to avoid infinite loop. */
1510 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1511 if (BE (ret < 0, 0))
1512 return REG_ESPACE;
1513 }
1514
1515 org_dest = dfa->edests[org_node].elems[1];
1516 clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 if (BE (clone_dest == -1, 0))
1518 return REG_ESPACE;
1519 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520 if (BE (ret < 0, 0))
1521 return REG_ESPACE;
1522 }
1523 org_node = org_dest;
1524 clone_node = clone_dest;
1525 }
1526 return REG_NOERROR;
1527}
1528
1529/* Search for a node which is duplicated from the node ORG_NODE, and
1530 satisfies the constraint CONSTRAINT. */
1531
1532static int
1533search_duplicated_node (const re_dfa_t *dfa, int org_node,
1534 unsigned int constraint)
1535{
1536 int idx;
1537 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1538 {
1539 if (org_node == dfa->org_indices[idx]
1540 && constraint == dfa->nodes[idx].constraint)
1541 return idx; /* Found. */
1542 }
1543 return -1; /* Not found. */
1544}
1545
1546/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1547 Return the index of the new node, or -1 if insufficient storage is
1548 available. */
1549
1550static int
1551duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1552{
1553 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1554 if (BE (dup_idx != -1, 1))
1555 {
1556 dfa->nodes[dup_idx].constraint = constraint;
1557 if (dfa->nodes[org_idx].type == ANCHOR)
1558 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1559 dfa->nodes[dup_idx].duplicated = 1;
1560
1561 /* Store the index of the original node. */
1562 dfa->org_indices[dup_idx] = org_idx;
1563 }
1564 return dup_idx;
1565}
1566
1567static reg_errcode_t
1568calc_inveclosure (re_dfa_t *dfa)
1569{
1570 int src, idx, ret;
1571 for (idx = 0; idx < dfa->nodes_len; ++idx)
1572 re_node_set_init_empty (dfa->inveclosures + idx);
1573
1574 for (src = 0; src < dfa->nodes_len; ++src)
1575 {
1576 int *elems = dfa->eclosures[src].elems;
1577 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1578 {
1579 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1580 if (BE (ret == -1, 0))
1581 return REG_ESPACE;
1582 }
1583 }
1584
1585 return REG_NOERROR;
1586}
1587
1588/* Calculate "eclosure" for all the node in DFA. */
1589
1590static reg_errcode_t
1591calc_eclosure (re_dfa_t *dfa)
1592{
1593 int node_idx, incomplete;
1594#ifdef DEBUG
1595 assert (dfa->nodes_len > 0);
1596#endif
1597 incomplete = 0;
1598 /* For each nodes, calculate epsilon closure. */
1599 for (node_idx = 0; ; ++node_idx)
1600 {
1601 reg_errcode_t err;
1602 re_node_set eclosure_elem;
1603 if (node_idx == dfa->nodes_len)
1604 {
1605 if (!incomplete)
1606 break;
1607 incomplete = 0;
1608 node_idx = 0;
1609 }
1610
1611#ifdef DEBUG
1612 assert (dfa->eclosures[node_idx].nelem != -1);
1613#endif
1614
1615 /* If we have already calculated, skip it. */
1616 if (dfa->eclosures[node_idx].nelem != 0)
1617 continue;
1618 /* Calculate epsilon closure of `node_idx'. */
1619 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1620 if (BE (err != REG_NOERROR, 0))
1621 return err;
1622
1623 if (dfa->eclosures[node_idx].nelem == 0)
1624 {
1625 incomplete = 1;
1626 re_node_set_free (&eclosure_elem);
1627 }
1628 }
1629 return REG_NOERROR;
1630}
1631
1632/* Calculate epsilon closure of NODE. */
1633
1634static reg_errcode_t
1635calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1636{
1637 reg_errcode_t err;
1638 unsigned int constraint;
1639 int i, incomplete;
1640 re_node_set eclosure;
1641 incomplete = 0;
1642 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1643 if (BE (err != REG_NOERROR, 0))
1644 return err;
1645
1646 /* This indicates that we are calculating this node now.
1647 We reference this value to avoid infinite loop. */
1648 dfa->eclosures[node].nelem = -1;
1649
1650 constraint = ((dfa->nodes[node].type == ANCHOR)
1651 ? dfa->nodes[node].opr.ctx_type : 0);
1652 /* If the current node has constraints, duplicate all nodes.
1653 Since they must inherit the constraints. */
1654 if (constraint
1655 && dfa->edests[node].nelem
1656 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1657 {
1658 int org_node, cur_node;
1659 org_node = cur_node = node;
1660 err = duplicate_node_closure (dfa, node, node, node, constraint);
1661 if (BE (err != REG_NOERROR, 0))
1662 return err;
1663 }
1664
1665 /* Expand each epsilon destination nodes. */
1666 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1667 for (i = 0; i < dfa->edests[node].nelem; ++i)
1668 {
1669 re_node_set eclosure_elem;
1670 int edest = dfa->edests[node].elems[i];
1671 /* If calculating the epsilon closure of `edest' is in progress,
1672 return intermediate result. */
1673 if (dfa->eclosures[edest].nelem == -1)
1674 {
1675 incomplete = 1;
1676 continue;
1677 }
1678 /* If we haven't calculated the epsilon closure of `edest' yet,
1679 calculate now. Otherwise use calculated epsilon closure. */
1680 if (dfa->eclosures[edest].nelem == 0)
1681 {
1682 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1683 if (BE (err != REG_NOERROR, 0))
1684 return err;
1685 }
1686 else
1687 eclosure_elem = dfa->eclosures[edest];
1688 /* Merge the epsilon closure of `edest'. */
1689 re_node_set_merge (&eclosure, &eclosure_elem);
1690 /* If the epsilon closure of `edest' is incomplete,
1691 the epsilon closure of this node is also incomplete. */
1692 if (dfa->eclosures[edest].nelem == 0)
1693 {
1694 incomplete = 1;
1695 re_node_set_free (&eclosure_elem);
1696 }
1697 }
1698
1699 /* Epsilon closures include itself. */
1700 re_node_set_insert (&eclosure, node);
1701 if (incomplete && !root)
1702 dfa->eclosures[node].nelem = 0;
1703 else
1704 dfa->eclosures[node] = eclosure;
1705 *new_set = eclosure;
1706 return REG_NOERROR;
1707}
1708
1709
1710/* Functions for token which are used in the parser. */
1711
1712/* Fetch a token from INPUT.
1713 We must not use this function inside bracket expressions. */
1714
1715static void
1716internal_function
1717fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1718{
1719 re_string_skip_bytes (input, peek_token (result, input, syntax));
1720}
1721
1722/* Peek a token from INPUT, and return the length of the token.
1723 We must not use this function inside bracket expressions. */
1724
1725static int
1726internal_function
1727peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1728{
1729 unsigned char c;
1730
1731 if (re_string_eoi (input))
1732 {
1733 token->type = END_OF_RE;
1734 return 0;
1735 }
1736
1737 c = re_string_peek_byte (input, 0);
1738 token->opr.c = c;
1739
1740 token->word_char = 0;
1741#ifdef RE_ENABLE_I18N
1742 token->mb_partial = 0;
1743 if (input->mb_cur_max > 1 &&
1744 !re_string_first_byte (input, re_string_cur_idx (input)))
1745 {
1746 token->type = CHARACTER;
1747 token->mb_partial = 1;
1748 return 1;
1749 }
1750#endif
1751 if (c == '\\')
1752 {
1753 unsigned char c2;
1754 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1755 {
1756 token->type = BACK_SLASH;
1757 return 1;
1758 }
1759
1760 c2 = re_string_peek_byte_case (input, 1);
1761 token->opr.c = c2;
1762 token->type = CHARACTER;
1763#ifdef RE_ENABLE_I18N
1764 if (input->mb_cur_max > 1)
1765 {
1766 wint_t wc = re_string_wchar_at (input,
1767 re_string_cur_idx (input) + 1);
1768 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1769 }
1770 else
1771#endif
1772 token->word_char = IS_WORD_CHAR (c2) != 0;
1773
1774 switch (c2)
1775 {
1776 case '|':
1777 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1778 token->type = OP_ALT;
1779 break;
1780 case '1': case '2': case '3': case '4': case '5':
1781 case '6': case '7': case '8': case '9':
1782 if (!(syntax & RE_NO_BK_REFS))
1783 {
1784 token->type = OP_BACK_REF;
1785 token->opr.idx = c2 - '1';
1786 }
1787 break;
1788 case '<':
1789 if (!(syntax & RE_NO_GNU_OPS))
1790 {
1791 token->type = ANCHOR;
1792 token->opr.ctx_type = WORD_FIRST;
1793 }
1794 break;
1795 case '>':
1796 if (!(syntax & RE_NO_GNU_OPS))
1797 {
1798 token->type = ANCHOR;
1799 token->opr.ctx_type = WORD_LAST;
1800 }
1801 break;
1802 case 'b':
1803 if (!(syntax & RE_NO_GNU_OPS))
1804 {
1805 token->type = ANCHOR;
1806 token->opr.ctx_type = WORD_DELIM;
1807 }
1808 break;
1809 case 'B':
1810 if (!(syntax & RE_NO_GNU_OPS))
1811 {
1812 token->type = ANCHOR;
1813 token->opr.ctx_type = NOT_WORD_DELIM;
1814 }
1815 break;
1816 case 'w':
1817 if (!(syntax & RE_NO_GNU_OPS))
1818 token->type = OP_WORD;
1819 break;
1820 case 'W':
1821 if (!(syntax & RE_NO_GNU_OPS))
1822 token->type = OP_NOTWORD;
1823 break;
1824 case 's':
1825 if (!(syntax & RE_NO_GNU_OPS))
1826 token->type = OP_SPACE;
1827 break;
1828 case 'S':
1829 if (!(syntax & RE_NO_GNU_OPS))
1830 token->type = OP_NOTSPACE;
1831 break;
1832 case '`':
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 {
1835 token->type = ANCHOR;
1836 token->opr.ctx_type = BUF_FIRST;
1837 }
1838 break;
1839 case '\'':
1840 if (!(syntax & RE_NO_GNU_OPS))
1841 {
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = BUF_LAST;
1844 }
1845 break;
1846 case '(':
1847 if (!(syntax & RE_NO_BK_PARENS))
1848 token->type = OP_OPEN_SUBEXP;
1849 break;
1850 case ')':
1851 if (!(syntax & RE_NO_BK_PARENS))
1852 token->type = OP_CLOSE_SUBEXP;
1853 break;
1854 case '+':
1855 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1856 token->type = OP_DUP_PLUS;
1857 break;
1858 case '?':
1859 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1860 token->type = OP_DUP_QUESTION;
1861 break;
1862 case '{':
1863 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1864 token->type = OP_OPEN_DUP_NUM;
1865 break;
1866 case '}':
1867 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1868 token->type = OP_CLOSE_DUP_NUM;
1869 break;
1870 default:
1871 break;
1872 }
1873 return 2;
1874 }
1875
1876 token->type = CHARACTER;
1877#ifdef RE_ENABLE_I18N
1878 if (input->mb_cur_max > 1)
1879 {
1880 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1881 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1882 }
1883 else
1884#endif
1885 token->word_char = IS_WORD_CHAR (token->opr.c);
1886
1887 switch (c)
1888 {
1889 case '\n':
1890 if (syntax & RE_NEWLINE_ALT)
1891 token->type = OP_ALT;
1892 break;
1893 case '|':
1894 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1895 token->type = OP_ALT;
1896 break;
1897 case '*':
1898 token->type = OP_DUP_ASTERISK;
1899 break;
1900 case '+':
1901 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1902 token->type = OP_DUP_PLUS;
1903 break;
1904 case '?':
1905 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1906 token->type = OP_DUP_QUESTION;
1907 break;
1908 case '{':
1909 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1910 token->type = OP_OPEN_DUP_NUM;
1911 break;
1912 case '}':
1913 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1914 token->type = OP_CLOSE_DUP_NUM;
1915 break;
1916 case '(':
1917 if (syntax & RE_NO_BK_PARENS)
1918 token->type = OP_OPEN_SUBEXP;
1919 break;
1920 case ')':
1921 if (syntax & RE_NO_BK_PARENS)
1922 token->type = OP_CLOSE_SUBEXP;
1923 break;
1924 case '[':
1925 token->type = OP_OPEN_BRACKET;
1926 break;
1927 case '.':
1928 token->type = OP_PERIOD;
1929 break;
1930 case '^':
1931 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1932 re_string_cur_idx (input) != 0)
1933 {
1934 char prev = re_string_peek_byte (input, -1);
1935 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1936 break;
1937 }
1938 token->type = ANCHOR;
1939 token->opr.ctx_type = LINE_FIRST;
1940 break;
1941 case '$':
1942 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1943 re_string_cur_idx (input) + 1 != re_string_length (input))
1944 {
1945 re_token_t next;
1946 re_string_skip_bytes (input, 1);
1947 peek_token (&next, input, syntax);
1948 re_string_skip_bytes (input, -1);
1949 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1950 break;
1951 }
1952 token->type = ANCHOR;
1953 token->opr.ctx_type = LINE_LAST;
1954 break;
1955 default:
1956 break;
1957 }
1958 return 1;
1959}
1960
1961/* Peek a token from INPUT, and return the length of the token.
1962 We must not use this function out of bracket expressions. */
1963
1964static int
1965internal_function
1966peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1967{
1968 unsigned char c;
1969 if (re_string_eoi (input))
1970 {
1971 token->type = END_OF_RE;
1972 return 0;
1973 }
1974 c = re_string_peek_byte (input, 0);
1975 token->opr.c = c;
1976
1977#ifdef RE_ENABLE_I18N
1978 if (input->mb_cur_max > 1 &&
1979 !re_string_first_byte (input, re_string_cur_idx (input)))
1980 {
1981 token->type = CHARACTER;
1982 return 1;
1983 }
1984#endif /* RE_ENABLE_I18N */
1985
1986 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1987 && re_string_cur_idx (input) + 1 < re_string_length (input))
1988 {
1989 /* In this case, '\' escape a character. */
1990 unsigned char c2;
1991 re_string_skip_bytes (input, 1);
1992 c2 = re_string_peek_byte (input, 0);
1993 token->opr.c = c2;
1994 token->type = CHARACTER;
1995 return 1;
1996 }
1997 if (c == '[') /* '[' is a special char in a bracket exps. */
1998 {
1999 unsigned char c2;
2000 int token_len;
2001 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2002 c2 = re_string_peek_byte (input, 1);
2003 else
2004 c2 = 0;
2005 token->opr.c = c2;
2006 token_len = 2;
2007 switch (c2)
2008 {
2009 case '.':
2010 token->type = OP_OPEN_COLL_ELEM;
2011 break;
2012 case '=':
2013 token->type = OP_OPEN_EQUIV_CLASS;
2014 break;
2015 case ':':
2016 if (syntax & RE_CHAR_CLASSES)
2017 {
2018 token->type = OP_OPEN_CHAR_CLASS;
2019 break;
2020 }
2021 /* else fall through. */
2022 default:
2023 token->type = CHARACTER;
2024 token->opr.c = c;
2025 token_len = 1;
2026 break;
2027 }
2028 return token_len;
2029 }
2030 switch (c)
2031 {
2032 case '-':
2033 token->type = OP_CHARSET_RANGE;
2034 break;
2035 case ']':
2036 token->type = OP_CLOSE_BRACKET;
2037 break;
2038 case '^':
2039 token->type = OP_NON_MATCH_LIST;
2040 break;
2041 default:
2042 token->type = CHARACTER;
2043 }
2044 return 1;
2045}
2046
2047
2048/* Functions for parser. */
2049
2050/* Entry point of the parser.
2051 Parse the regular expression REGEXP and return the structure tree.
2052 If an error is occured, ERR is set by error code, and return NULL.
2053 This function build the following tree, from regular expression <reg_exp>:
2054 CAT
2055 / \
2056 / \
2057 <reg_exp> EOR
2058
2059 CAT means concatenation.
2060 EOR means end of regular expression. */
2061
2062static bin_tree_t *
2063parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2064 reg_errcode_t *err)
2065{
2066 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2067 bin_tree_t *tree, *eor, *root;
2068 re_token_t current_token;
2069 dfa->syntax = syntax;
2070 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2071 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2072 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2073 return NULL;
2074 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2075 if (tree != NULL)
2076 root = create_tree (dfa, tree, eor, CONCAT);
2077 else
2078 root = eor;
2079 if (BE (eor == NULL || root == NULL, 0))
2080 {
2081 *err = REG_ESPACE;
2082 return NULL;
2083 }
2084 return root;
2085}
2086
2087/* This function build the following tree, from regular expression
2088 <branch1>|<branch2>:
2089 ALT
2090 / \
2091 / \
2092 <branch1> <branch2>
2093
2094 ALT means alternative, which represents the operator `|'. */
2095
2096static bin_tree_t *
2097parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2098 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2099{
2100 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2101 bin_tree_t *tree, *branch = NULL;
2102 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2103 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2104 return NULL;
2105
2106 while (token->type == OP_ALT)
2107 {
2108 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2109 if (token->type != OP_ALT && token->type != END_OF_RE
2110 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2111 {
2112 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2113 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2114 return NULL;
2115 }
2116 else
2117 branch = NULL;
2118 tree = create_tree (dfa, tree, branch, OP_ALT);
2119 if (BE (tree == NULL, 0))
2120 {
2121 *err = REG_ESPACE;
2122 return NULL;
2123 }
2124 }
2125 return tree;
2126}
2127
2128/* This function build the following tree, from regular expression
2129 <exp1><exp2>:
2130 CAT
2131 / \
2132 / \
2133 <exp1> <exp2>
2134
2135 CAT means concatenation. */
2136
2137static bin_tree_t *
2138parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2140{
2141 bin_tree_t *tree, *exp;
2142 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2143 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2144 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145 return NULL;
2146
2147 while (token->type != OP_ALT && token->type != END_OF_RE
2148 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149 {
2150 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2151 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2152 {
2153 return NULL;
2154 }
2155 if (tree != NULL && exp != NULL)
2156 {
2157 tree = create_tree (dfa, tree, exp, CONCAT);
2158 if (tree == NULL)
2159 {
2160 *err = REG_ESPACE;
2161 return NULL;
2162 }
2163 }
2164 else if (tree == NULL)
2165 tree = exp;
2166 /* Otherwise exp == NULL, we don't need to create new tree. */
2167 }
2168 return tree;
2169}
2170
2171/* This function build the following tree, from regular expression a*:
2172 *
2173 |
2174 a
2175*/
2176
2177static bin_tree_t *
2178parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2179 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2180{
2181 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2182 bin_tree_t *tree;
2183 switch (token->type)
2184 {
2185 case CHARACTER:
2186 tree = create_token_tree (dfa, NULL, NULL, token);
2187 if (BE (tree == NULL, 0))
2188 {
2189 *err = REG_ESPACE;
2190 return NULL;
2191 }
2192#ifdef RE_ENABLE_I18N
2193 if (dfa->mb_cur_max > 1)
2194 {
2195 while (!re_string_eoi (regexp)
2196 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2197 {
2198 bin_tree_t *mbc_remain;
2199 fetch_token (token, regexp, syntax);
2200 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2201 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2202 if (BE (mbc_remain == NULL || tree == NULL, 0))
2203 {
2204 *err = REG_ESPACE;
2205 return NULL;
2206 }
2207 }
2208 }
2209#endif
2210 break;
2211 case OP_OPEN_SUBEXP:
2212 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2213 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2214 return NULL;
2215 break;
2216 case OP_OPEN_BRACKET:
2217 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2218 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2219 return NULL;
2220 break;
2221 case OP_BACK_REF:
2222 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2223 {
2224 *err = REG_ESUBREG;
2225 return NULL;
2226 }
2227 dfa->used_bkref_map |= 1 << token->opr.idx;
2228 tree = create_token_tree (dfa, NULL, NULL, token);
2229 if (BE (tree == NULL, 0))
2230 {
2231 *err = REG_ESPACE;
2232 return NULL;
2233 }
2234 ++dfa->nbackref;
2235 dfa->has_mb_node = 1;
2236 break;
2237 case OP_OPEN_DUP_NUM:
2238 if (syntax & RE_CONTEXT_INVALID_DUP)
2239 {
2240 *err = REG_BADRPT;
2241 return NULL;
2242 }
2243 /* FALLTHROUGH */
2244 case OP_DUP_ASTERISK:
2245 case OP_DUP_PLUS:
2246 case OP_DUP_QUESTION:
2247 if (syntax & RE_CONTEXT_INVALID_OPS)
2248 {
2249 *err = REG_BADRPT;
2250 return NULL;
2251 }
2252 else if (syntax & RE_CONTEXT_INDEP_OPS)
2253 {
2254 fetch_token (token, regexp, syntax);
2255 return parse_expression (regexp, preg, token, syntax, nest, err);
2256 }
2257 /* else fall through */
2258 case OP_CLOSE_SUBEXP:
2259 if ((token->type == OP_CLOSE_SUBEXP) &&
2260 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2261 {
2262 *err = REG_ERPAREN;
2263 return NULL;
2264 }
2265 /* else fall through */
2266 case OP_CLOSE_DUP_NUM:
2267 /* We treat it as a normal character. */
2268
2269 /* Then we can these characters as normal characters. */
2270 token->type = CHARACTER;
2271 /* mb_partial and word_char bits should be initialized already
2272 by peek_token. */
2273 tree = create_token_tree (dfa, NULL, NULL, token);
2274 if (BE (tree == NULL, 0))
2275 {
2276 *err = REG_ESPACE;
2277 return NULL;
2278 }
2279 break;
2280 case ANCHOR:
2281 if ((token->opr.ctx_type
2282 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2283 && dfa->word_ops_used == 0)
2284 init_word_char (dfa);
2285 if (token->opr.ctx_type == WORD_DELIM
2286 || token->opr.ctx_type == NOT_WORD_DELIM)
2287 {
2288 bin_tree_t *tree_first, *tree_last;
2289 if (token->opr.ctx_type == WORD_DELIM)
2290 {
2291 token->opr.ctx_type = WORD_FIRST;
2292 tree_first = create_token_tree (dfa, NULL, NULL, token);
2293 token->opr.ctx_type = WORD_LAST;
2294 }
2295 else
2296 {
2297 token->opr.ctx_type = INSIDE_WORD;
2298 tree_first = create_token_tree (dfa, NULL, NULL, token);
2299 token->opr.ctx_type = INSIDE_NOTWORD;
2300 }
2301 tree_last = create_token_tree (dfa, NULL, NULL, token);
2302 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2303 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2304 {
2305 *err = REG_ESPACE;
2306 return NULL;
2307 }
2308 }
2309 else
2310 {
2311 tree = create_token_tree (dfa, NULL, NULL, token);
2312 if (BE (tree == NULL, 0))
2313 {
2314 *err = REG_ESPACE;
2315 return NULL;
2316 }
2317 }
2318 /* We must return here, since ANCHORs can't be followed
2319 by repetition operators.
2320 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2321 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2322 fetch_token (token, regexp, syntax);
2323 return tree;
2324 case OP_PERIOD:
2325 tree = create_token_tree (dfa, NULL, NULL, token);
2326 if (BE (tree == NULL, 0))
2327 {
2328 *err = REG_ESPACE;
2329 return NULL;
2330 }
2331 if (dfa->mb_cur_max > 1)
2332 dfa->has_mb_node = 1;
2333 break;
2334 case OP_WORD:
2335 case OP_NOTWORD:
2336 tree = build_charclass_op (dfa, regexp->trans,
2337 (const unsigned char *) "alnum",
2338 (const unsigned char *) "_",
2339 token->type == OP_NOTWORD, err);
2340 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2341 return NULL;
2342 break;
2343 case OP_SPACE:
2344 case OP_NOTSPACE:
2345 tree = build_charclass_op (dfa, regexp->trans,
2346 (const unsigned char *) "space",
2347 (const unsigned char *) "",
2348 token->type == OP_NOTSPACE, err);
2349 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2350 return NULL;
2351 break;
2352 case OP_ALT:
2353 case END_OF_RE:
2354 return NULL;
2355 case BACK_SLASH:
2356 *err = REG_EESCAPE;
2357 return NULL;
2358 default:
2359 /* Must not happen? */
2360#ifdef DEBUG
2361 assert (0);
2362#endif
2363 return NULL;
2364 }
2365 fetch_token (token, regexp, syntax);
2366
2367 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2368 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2369 {
2370 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2371 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2372 return NULL;
2373 /* In BRE consecutive duplications are not allowed. */
2374 if ((syntax & RE_CONTEXT_INVALID_DUP)
2375 && (token->type == OP_DUP_ASTERISK
2376 || token->type == OP_OPEN_DUP_NUM))
2377 {
2378 *err = REG_BADRPT;
2379 return NULL;
2380 }
2381 }
2382
2383 return tree;
2384}
2385
2386/* This function build the following tree, from regular expression
2387 (<reg_exp>):
2388 SUBEXP
2389 |
2390 <reg_exp>
2391*/
2392
2393static bin_tree_t *
2394parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2395 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2396{
2397 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2398 bin_tree_t *tree;
2399 size_t cur_nsub;
2400 cur_nsub = preg->re_nsub++;
2401
2402 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2403
2404 /* The subexpression may be a null string. */
2405 if (token->type == OP_CLOSE_SUBEXP)
2406 tree = NULL;
2407 else
2408 {
2409 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2410 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2411 *err = REG_EPAREN;
2412 if (BE (*err != REG_NOERROR, 0))
2413 return NULL;
2414 }
2415
2416 if (cur_nsub <= '9' - '1')
2417 dfa->completed_bkref_map |= 1 << cur_nsub;
2418
2419 tree = create_tree (dfa, tree, NULL, SUBEXP);
2420 if (BE (tree == NULL, 0))
2421 {
2422 *err = REG_ESPACE;
2423 return NULL;
2424 }
2425 tree->token.opr.idx = cur_nsub;
2426 return tree;
2427}
2428
2429/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2430
2431static bin_tree_t *
2432parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2433 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2434{
2435 bin_tree_t *tree = NULL, *old_tree = NULL;
2436 int i, start, end, start_idx = re_string_cur_idx (regexp);
2437 re_token_t start_token = *token;
2438
2439 if (token->type == OP_OPEN_DUP_NUM)
2440 {
2441 end = 0;
2442 start = fetch_number (regexp, token, syntax);
2443 if (start == -1)
2444 {
2445 if (token->type == CHARACTER && token->opr.c == ',')
2446 start = 0; /* We treat "{,m}" as "{0,m}". */
2447 else
2448 {
2449 *err = REG_BADBR; /* <re>{} is invalid. */
2450 return NULL;
2451 }
2452 }
2453 if (BE (start != -2, 1))
2454 {
2455 /* We treat "{n}" as "{n,n}". */
2456 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2457 : ((token->type == CHARACTER && token->opr.c == ',')
2458 ? fetch_number (regexp, token, syntax) : -2));
2459 }
2460 if (BE (start == -2 || end == -2, 0))
2461 {
2462 /* Invalid sequence. */
2463 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2464 {
2465 if (token->type == END_OF_RE)
2466 *err = REG_EBRACE;
2467 else
2468 *err = REG_BADBR;
2469
2470 return NULL;
2471 }
2472
2473 /* If the syntax bit is set, rollback. */
2474 re_string_set_index (regexp, start_idx);
2475 *token = start_token;
2476 token->type = CHARACTER;
2477 /* mb_partial and word_char bits should be already initialized by
2478 peek_token. */
2479 return elem;
2480 }
2481
2482 if (BE (end != -1 && start > end, 0))
2483 {
2484 /* First number greater than second. */
2485 *err = REG_BADBR;
2486 return NULL;
2487 }
2488 }
2489 else
2490 {
2491 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2492 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2493 }
2494
2495 fetch_token (token, regexp, syntax);
2496
2497 if (BE (elem == NULL, 0))
2498 return NULL;
2499 if (BE (start == 0 && end == 0, 0))
2500 {
2501 postorder (elem, free_tree, NULL);
2502 return NULL;
2503 }
2504
2505 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2506 if (BE (start > 0, 0))
2507 {
2508 tree = elem;
2509 for (i = 2; i <= start; ++i)
2510 {
2511 elem = duplicate_tree (elem, dfa);
2512 tree = create_tree (dfa, tree, elem, CONCAT);
2513 if (BE (elem == NULL || tree == NULL, 0))
2514 goto parse_dup_op_espace;
2515 }
2516
2517 if (start == end)
2518 return tree;
2519
2520 /* Duplicate ELEM before it is marked optional. */
2521 elem = duplicate_tree (elem, dfa);
2522 old_tree = tree;
2523 }
2524 else
2525 old_tree = NULL;
2526
2527 if (elem->token.type == SUBEXP)
2528 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2529
2530 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2531 if (BE (tree == NULL, 0))
2532 goto parse_dup_op_espace;
2533
2534 /* This loop is actually executed only when end != -1,
2535 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2536 already created the start+1-th copy. */
2537 for (i = start + 2; i <= end; ++i)
2538 {
2539 elem = duplicate_tree (elem, dfa);
2540 tree = create_tree (dfa, tree, elem, CONCAT);
2541 if (BE (elem == NULL || tree == NULL, 0))
2542 goto parse_dup_op_espace;
2543
2544 tree = create_tree (dfa, tree, NULL, OP_ALT);
2545 if (BE (tree == NULL, 0))
2546 goto parse_dup_op_espace;
2547 }
2548
2549 if (old_tree)
2550 tree = create_tree (dfa, old_tree, tree, CONCAT);
2551
2552 return tree;
2553
2554 parse_dup_op_espace:
2555 *err = REG_ESPACE;
2556 return NULL;
2557}
2558
2559/* Size of the names for collating symbol/equivalence_class/character_class.
2560 I'm not sure, but maybe enough. */
2561#define BRACKET_NAME_BUF_SIZE 32
2562
2563#ifndef _LIBC
2564 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2565 Build the range expression which starts from START_ELEM, and ends
2566 at END_ELEM. The result are written to MBCSET and SBCSET.
2567 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2568 mbcset->range_ends, is a pointer argument sinse we may
2569 update it. */
2570
2571static reg_errcode_t
2572internal_function
2573# ifdef RE_ENABLE_I18N
2574build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2575 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2576# else /* not RE_ENABLE_I18N */
2577build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2578 bracket_elem_t *end_elem)
2579# endif /* not RE_ENABLE_I18N */
2580{
2581 unsigned int start_ch, end_ch;
2582 /* Equivalence Classes and Character Classes can't be a range start/end. */
2583 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2584 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2585 0))
2586 return REG_ERANGE;
2587
2588 /* We can handle no multi character collating elements without libc
2589 support. */
2590 if (BE ((start_elem->type == COLL_SYM
2591 && strlen ((char *) start_elem->opr.name) > 1)
2592 || (end_elem->type == COLL_SYM
2593 && strlen ((char *) end_elem->opr.name) > 1), 0))
2594 return REG_ECOLLATE;
2595
2596# ifdef RE_ENABLE_I18N
2597 {
2598 wchar_t wc;
2599 wint_t start_wc;
2600 wint_t end_wc;
2601 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2602
2603 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2604 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2605 : 0));
2606 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2607 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2608 : 0));
2609 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2610 ? __btowc (start_ch) : start_elem->opr.wch);
2611 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2612 ? __btowc (end_ch) : end_elem->opr.wch);
2613 if (start_wc == WEOF || end_wc == WEOF)
2614 return REG_ECOLLATE;
2615 cmp_buf[0] = start_wc;
2616 cmp_buf[4] = end_wc;
2617 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2618 return REG_ERANGE;
2619
2620 /* Got valid collation sequence values, add them as a new entry.
2621 However, for !_LIBC we have no collation elements: if the
2622 character set is single byte, the single byte character set
2623 that we build below suffices. parse_bracket_exp passes
2624 no MBCSET if dfa->mb_cur_max == 1. */
2625 if (mbcset)
2626 {
2627 /* Check the space of the arrays. */
2628 if (BE (*range_alloc == mbcset->nranges, 0))
2629 {
2630 /* There is not enough space, need realloc. */
2631 wchar_t *new_array_start, *new_array_end;
2632 int new_nranges;
2633
2634 /* +1 in case of mbcset->nranges is 0. */
2635 new_nranges = 2 * mbcset->nranges + 1;
2636 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2637 are NULL if *range_alloc == 0. */
2638 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2639 new_nranges);
2640 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2641 new_nranges);
2642
2643 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2644 return REG_ESPACE;
2645
2646 mbcset->range_starts = new_array_start;
2647 mbcset->range_ends = new_array_end;
2648 *range_alloc = new_nranges;
2649 }
2650
2651 mbcset->range_starts[mbcset->nranges] = start_wc;
2652 mbcset->range_ends[mbcset->nranges++] = end_wc;
2653 }
2654
2655 /* Build the table for single byte characters. */
2656 for (wc = 0; wc < SBC_MAX; ++wc)
2657 {
2658 cmp_buf[2] = wc;
2659 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2660 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2661 bitset_set (sbcset, wc);
2662 }
2663 }
2664# else /* not RE_ENABLE_I18N */
2665 {
2666 unsigned int ch;
2667 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2668 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2669 : 0));
2670 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2671 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2672 : 0));
2673 if (start_ch > end_ch)
2674 return REG_ERANGE;
2675 /* Build the table for single byte characters. */
2676 for (ch = 0; ch < SBC_MAX; ++ch)
2677 if (start_ch <= ch && ch <= end_ch)
2678 bitset_set (sbcset, ch);
2679 }
2680# endif /* not RE_ENABLE_I18N */
2681 return REG_NOERROR;
2682}
2683#endif /* not _LIBC */
2684
2685#ifndef _LIBC
2686/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2687 Build the collating element which is represented by NAME.
2688 The result are written to MBCSET and SBCSET.
2689 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2690 pointer argument since we may update it. */
2691
2692static reg_errcode_t
2693internal_function
2694# ifdef RE_ENABLE_I18N
2695build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2696 int *coll_sym_alloc, const unsigned char *name)
2697# else /* not RE_ENABLE_I18N */
2698build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2699# endif /* not RE_ENABLE_I18N */
2700{
2701 size_t name_len = strlen ((const char *) name);
2702 if (BE (name_len != 1, 0))
2703 return REG_ECOLLATE;
2704 else
2705 {
2706 bitset_set (sbcset, name[0]);
2707 return REG_NOERROR;
2708 }
2709}
2710#endif /* not _LIBC */
2711
2712/* This function parse bracket expression like "[abc]", "[a-c]",
2713 "[[.a-a.]]" etc. */
2714
2715static bin_tree_t *
2716parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2717 reg_syntax_t syntax, reg_errcode_t *err)
2718{
2719#ifdef _LIBC
2720 const unsigned char *collseqmb;
2721 const char *collseqwc;
2722 uint32_t nrules;
2723 int32_t table_size;
2724 const int32_t *symb_table;
2725 const unsigned char *extra;
2726
2727 /* Local function for parse_bracket_exp used in _LIBC environement.
2728 Seek the collating symbol entry correspondings to NAME.
2729 Return the index of the symbol in the SYMB_TABLE. */
2730
2731 auto inline int32_t
2732 __attribute ((always_inline))
2733 seek_collating_symbol_entry (name, name_len)
2734 const unsigned char *name;
2735 size_t name_len;
2736 {
2737 int32_t hash = elem_hash ((const char *) name, name_len);
2738 int32_t elem = hash % table_size;
2739 if (symb_table[2 * elem] != 0)
2740 {
2741 int32_t second = hash % (table_size - 2) + 1;
2742
2743 do
2744 {
2745 /* First compare the hashing value. */
2746 if (symb_table[2 * elem] == hash
2747 /* Compare the length of the name. */
2748 && name_len == extra[symb_table[2 * elem + 1]]
2749 /* Compare the name. */
2750 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2751 name_len) == 0)
2752 {
2753 /* Yep, this is the entry. */
2754 break;
2755 }
2756
2757 /* Next entry. */
2758 elem += second;
2759 }
2760 while (symb_table[2 * elem] != 0);
2761 }
2762 return elem;
2763 }
2764
2765 /* Local function for parse_bracket_exp used in _LIBC environement.
2766 Look up the collation sequence value of BR_ELEM.
2767 Return the value if succeeded, UINT_MAX otherwise. */
2768
2769 auto inline unsigned int
2770 __attribute ((always_inline))
2771 lookup_collation_sequence_value (br_elem)
2772 bracket_elem_t *br_elem;
2773 {
2774 if (br_elem->type == SB_CHAR)
2775 {
2776 /*
2777 if (MB_CUR_MAX == 1)
2778 */
2779 if (nrules == 0)
2780 return collseqmb[br_elem->opr.ch];
2781 else
2782 {
2783 wint_t wc = __btowc (br_elem->opr.ch);
2784 return __collseq_table_lookup (collseqwc, wc);
2785 }
2786 }
2787 else if (br_elem->type == MB_CHAR)
2788 {
2789 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2790 }
2791 else if (br_elem->type == COLL_SYM)
2792 {
2793 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2794 if (nrules != 0)
2795 {
2796 int32_t elem, idx;
2797 elem = seek_collating_symbol_entry (br_elem->opr.name,
2798 sym_name_len);
2799 if (symb_table[2 * elem] != 0)
2800 {
2801 /* We found the entry. */
2802 idx = symb_table[2 * elem + 1];
2803 /* Skip the name of collating element name. */
2804 idx += 1 + extra[idx];
2805 /* Skip the byte sequence of the collating element. */
2806 idx += 1 + extra[idx];
2807 /* Adjust for the alignment. */
2808 idx = (idx + 3) & ~3;
2809 /* Skip the multibyte collation sequence value. */
2810 idx += sizeof (unsigned int);
2811 /* Skip the wide char sequence of the collating element. */
2812 idx += sizeof (unsigned int) *
2813 (1 + *(unsigned int *) (extra + idx));
2814 /* Return the collation sequence value. */
2815 return *(unsigned int *) (extra + idx);
2816 }
2817 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2818 {
2819 /* No valid character. Match it as a single byte
2820 character. */
2821 return collseqmb[br_elem->opr.name[0]];
2822 }
2823 }
2824 else if (sym_name_len == 1)
2825 return collseqmb[br_elem->opr.name[0]];
2826 }
2827 return UINT_MAX;
2828 }
2829
2830 /* Local function for parse_bracket_exp used in _LIBC environement.
2831 Build the range expression which starts from START_ELEM, and ends
2832 at END_ELEM. The result are written to MBCSET and SBCSET.
2833 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2834 mbcset->range_ends, is a pointer argument sinse we may
2835 update it. */
2836
2837 auto inline reg_errcode_t
2838 __attribute ((always_inline))
2839 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2840 re_charset_t *mbcset;
2841 int *range_alloc;
2842 bitset_t sbcset;
2843 bracket_elem_t *start_elem, *end_elem;
2844 {
2845 unsigned int ch;
2846 uint32_t start_collseq;
2847 uint32_t end_collseq;
2848
2849 /* Equivalence Classes and Character Classes can't be a range
2850 start/end. */
2851 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2852 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2853 0))
2854 return REG_ERANGE;
2855
2856 start_collseq = lookup_collation_sequence_value (start_elem);
2857 end_collseq = lookup_collation_sequence_value (end_elem);
2858 /* Check start/end collation sequence values. */
2859 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2860 return REG_ECOLLATE;
2861 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2862 return REG_ERANGE;
2863
2864 /* Got valid collation sequence values, add them as a new entry.
2865 However, if we have no collation elements, and the character set
2866 is single byte, the single byte character set that we
2867 build below suffices. */
2868 if (nrules > 0 || dfa->mb_cur_max > 1)
2869 {
2870 /* Check the space of the arrays. */
2871 if (BE (*range_alloc == mbcset->nranges, 0))
2872 {
2873 /* There is not enough space, need realloc. */
2874 uint32_t *new_array_start;
2875 uint32_t *new_array_end;
2876 int new_nranges;
2877
2878 /* +1 in case of mbcset->nranges is 0. */
2879 new_nranges = 2 * mbcset->nranges + 1;
2880 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2881 new_nranges);
2882 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2883 new_nranges);
2884
2885 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2886 return REG_ESPACE;
2887
2888 mbcset->range_starts = new_array_start;
2889 mbcset->range_ends = new_array_end;
2890 *range_alloc = new_nranges;
2891 }
2892
2893 mbcset->range_starts[mbcset->nranges] = start_collseq;
2894 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2895 }
2896
2897 /* Build the table for single byte characters. */
2898 for (ch = 0; ch < SBC_MAX; ch++)
2899 {
2900 uint32_t ch_collseq;
2901 /*
2902 if (MB_CUR_MAX == 1)
2903 */
2904 if (nrules == 0)
2905 ch_collseq = collseqmb[ch];
2906 else
2907 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2908 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2909 bitset_set (sbcset, ch);
2910 }
2911 return REG_NOERROR;
2912 }
2913
2914 /* Local function for parse_bracket_exp used in _LIBC environement.
2915 Build the collating element which is represented by NAME.
2916 The result are written to MBCSET and SBCSET.
2917 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2918 pointer argument sinse we may update it. */
2919
2920 auto inline reg_errcode_t
2921 __attribute ((always_inline))
2922 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2923 re_charset_t *mbcset;
2924 int *coll_sym_alloc;
2925 bitset_t sbcset;
2926 const unsigned char *name;
2927 {
2928 int32_t elem, idx;
2929 size_t name_len = strlen ((const char *) name);
2930 if (nrules != 0)
2931 {
2932 elem = seek_collating_symbol_entry (name, name_len);
2933 if (symb_table[2 * elem] != 0)
2934 {
2935 /* We found the entry. */
2936 idx = symb_table[2 * elem + 1];
2937 /* Skip the name of collating element name. */
2938 idx += 1 + extra[idx];
2939 }
2940 else if (symb_table[2 * elem] == 0 && name_len == 1)
2941 {
2942 /* No valid character, treat it as a normal
2943 character. */
2944 bitset_set (sbcset, name[0]);
2945 return REG_NOERROR;
2946 }
2947 else
2948 return REG_ECOLLATE;
2949
2950 /* Got valid collation sequence, add it as a new entry. */
2951 /* Check the space of the arrays. */
2952 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2953 {
2954 /* Not enough, realloc it. */
2955 /* +1 in case of mbcset->ncoll_syms is 0. */
2956 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2957 /* Use realloc since mbcset->coll_syms is NULL
2958 if *alloc == 0. */
2959 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2960 new_coll_sym_alloc);
2961 if (BE (new_coll_syms == NULL, 0))
2962 return REG_ESPACE;
2963 mbcset->coll_syms = new_coll_syms;
2964 *coll_sym_alloc = new_coll_sym_alloc;
2965 }
2966 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2967 return REG_NOERROR;
2968 }
2969 else
2970 {
2971 if (BE (name_len != 1, 0))
2972 return REG_ECOLLATE;
2973 else
2974 {
2975 bitset_set (sbcset, name[0]);
2976 return REG_NOERROR;
2977 }
2978 }
2979 }
2980#endif
2981
2982 re_token_t br_token;
2983 re_bitset_ptr_t sbcset;
2984#ifdef RE_ENABLE_I18N
2985 re_charset_t *mbcset;
2986 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2987 int equiv_class_alloc = 0, char_class_alloc = 0;
2988#endif /* not RE_ENABLE_I18N */
2989 int non_match = 0;
2990 bin_tree_t *work_tree;
2991 int token_len;
2992 int first_round = 1;
2993#ifdef _LIBC
2994 collseqmb = (const unsigned char *)
2995 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2996 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2997 if (nrules)
2998 {
2999 /*
3000 if (MB_CUR_MAX > 1)
3001 */
3002 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3003 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3004 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3005 _NL_COLLATE_SYMB_TABLEMB);
3006 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3007 _NL_COLLATE_SYMB_EXTRAMB);
3008 }
3009#endif
3010 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3011#ifdef RE_ENABLE_I18N
3012 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3013#endif /* RE_ENABLE_I18N */
3014#ifdef RE_ENABLE_I18N
3015 if (BE (sbcset == NULL || mbcset == NULL, 0))
3016#else
3017 if (BE (sbcset == NULL, 0))
3018#endif /* RE_ENABLE_I18N */
3019 {
3020 *err = REG_ESPACE;
3021 return NULL;
3022 }
3023
3024 token_len = peek_token_bracket (token, regexp, syntax);
3025 if (BE (token->type == END_OF_RE, 0))
3026 {
3027 *err = REG_BADPAT;
3028 goto parse_bracket_exp_free_return;
3029 }
3030 if (token->type == OP_NON_MATCH_LIST)
3031 {
3032#ifdef RE_ENABLE_I18N
3033 mbcset->non_match = 1;
3034#endif /* not RE_ENABLE_I18N */
3035 non_match = 1;
3036 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3037 bitset_set (sbcset, '\0');
3038 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3039 token_len = peek_token_bracket (token, regexp, syntax);
3040 if (BE (token->type == END_OF_RE, 0))
3041 {
3042 *err = REG_BADPAT;
3043 goto parse_bracket_exp_free_return;
3044 }
3045 }
3046
3047 /* We treat the first ']' as a normal character. */
3048 if (token->type == OP_CLOSE_BRACKET)
3049 token->type = CHARACTER;
3050
3051 while (1)
3052 {
3053 bracket_elem_t start_elem, end_elem;
3054 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3055 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3056 reg_errcode_t ret;
3057 int token_len2 = 0, is_range_exp = 0;
3058 re_token_t token2;
3059
3060 start_elem.opr.name = start_name_buf;
3061 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3062 syntax, first_round);
3063 if (BE (ret != REG_NOERROR, 0))
3064 {
3065 *err = ret;
3066 goto parse_bracket_exp_free_return;
3067 }
3068 first_round = 0;
3069
3070 /* Get information about the next token. We need it in any case. */
3071 token_len = peek_token_bracket (token, regexp, syntax);
3072
3073 /* Do not check for ranges if we know they are not allowed. */
3074 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3075 {
3076 if (BE (token->type == END_OF_RE, 0))
3077 {
3078 *err = REG_EBRACK;
3079 goto parse_bracket_exp_free_return;
3080 }
3081 if (token->type == OP_CHARSET_RANGE)
3082 {
3083 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3084 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3085 if (BE (token2.type == END_OF_RE, 0))
3086 {
3087 *err = REG_EBRACK;
3088 goto parse_bracket_exp_free_return;
3089 }
3090 if (token2.type == OP_CLOSE_BRACKET)
3091 {
3092 /* We treat the last '-' as a normal character. */
3093 re_string_skip_bytes (regexp, -token_len);
3094 token->type = CHARACTER;
3095 }
3096 else
3097 is_range_exp = 1;
3098 }
3099 }
3100
3101 if (is_range_exp == 1)
3102 {
3103 end_elem.opr.name = end_name_buf;
3104 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3105 dfa, syntax, 1);
3106 if (BE (ret != REG_NOERROR, 0))
3107 {
3108 *err = ret;
3109 goto parse_bracket_exp_free_return;
3110 }
3111
3112 token_len = peek_token_bracket (token, regexp, syntax);
3113
3114#ifdef _LIBC
3115 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3116 &start_elem, &end_elem);
3117#else
3118# ifdef RE_ENABLE_I18N
3119 *err = build_range_exp (sbcset,
3120 dfa->mb_cur_max > 1 ? mbcset : NULL,
3121 &range_alloc, &start_elem, &end_elem);
3122# else
3123 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3124# endif
3125#endif /* RE_ENABLE_I18N */
3126 if (BE (*err != REG_NOERROR, 0))
3127 goto parse_bracket_exp_free_return;
3128 }
3129 else
3130 {
3131 switch (start_elem.type)
3132 {
3133 case SB_CHAR:
3134 bitset_set (sbcset, start_elem.opr.ch);
3135 break;
3136#ifdef RE_ENABLE_I18N
3137 case MB_CHAR:
3138 /* Check whether the array has enough space. */
3139 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3140 {
3141 wchar_t *new_mbchars;
3142 /* Not enough, realloc it. */
3143 /* +1 in case of mbcset->nmbchars is 0. */
3144 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3145 /* Use realloc since array is NULL if *alloc == 0. */
3146 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3147 mbchar_alloc);
3148 if (BE (new_mbchars == NULL, 0))
3149 goto parse_bracket_exp_espace;
3150 mbcset->mbchars = new_mbchars;
3151 }
3152 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3153 break;
3154#endif /* RE_ENABLE_I18N */
3155 case EQUIV_CLASS:
3156 *err = build_equiv_class (sbcset,
3157#ifdef RE_ENABLE_I18N
3158 mbcset, &equiv_class_alloc,
3159#endif /* RE_ENABLE_I18N */
3160 start_elem.opr.name);
3161 if (BE (*err != REG_NOERROR, 0))
3162 goto parse_bracket_exp_free_return;
3163 break;
3164 case COLL_SYM:
3165 *err = build_collating_symbol (sbcset,
3166#ifdef RE_ENABLE_I18N
3167 mbcset, &coll_sym_alloc,
3168#endif /* RE_ENABLE_I18N */
3169 start_elem.opr.name);
3170 if (BE (*err != REG_NOERROR, 0))
3171 goto parse_bracket_exp_free_return;
3172 break;
3173 case CHAR_CLASS:
3174 *err = build_charclass (regexp->trans, sbcset,
3175#ifdef RE_ENABLE_I18N
3176 mbcset, &char_class_alloc,
3177#endif /* RE_ENABLE_I18N */
3178 start_elem.opr.name, syntax);
3179 if (BE (*err != REG_NOERROR, 0))
3180 goto parse_bracket_exp_free_return;
3181 break;
3182 default:
3183 assert (0);
3184 break;
3185 }
3186 }
3187 if (BE (token->type == END_OF_RE, 0))
3188 {
3189 *err = REG_EBRACK;
3190 goto parse_bracket_exp_free_return;
3191 }
3192 if (token->type == OP_CLOSE_BRACKET)
3193 break;
3194 }
3195
3196 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3197
3198 /* If it is non-matching list. */
3199 if (non_match)
3200 bitset_not (sbcset);
3201
3202#ifdef RE_ENABLE_I18N
3203 /* Ensure only single byte characters are set. */
3204 if (dfa->mb_cur_max > 1)
3205 bitset_mask (sbcset, dfa->sb_char);
3206
3207 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3208 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3209 || mbcset->non_match)))
3210 {
3211 bin_tree_t *mbc_tree;
3212 int sbc_idx;
3213 /* Build a tree for complex bracket. */
3214 dfa->has_mb_node = 1;
3215 br_token.type = COMPLEX_BRACKET;
3216 br_token.opr.mbcset = mbcset;
3217 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218 if (BE (mbc_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3220 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3221 if (sbcset[sbc_idx])
3222 break;
3223 /* If there are no bits set in sbcset, there is no point
3224 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3225 if (sbc_idx < BITSET_WORDS)
3226 {
3227 /* Build a tree for simple bracket. */
3228 br_token.type = SIMPLE_BRACKET;
3229 br_token.opr.sbcset = sbcset;
3230 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3231 if (BE (work_tree == NULL, 0))
3232 goto parse_bracket_exp_espace;
3233
3234 /* Then join them by ALT node. */
3235 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3236 if (BE (work_tree == NULL, 0))
3237 goto parse_bracket_exp_espace;
3238 }
3239 else
3240 {
3241 re_free (sbcset);
3242 work_tree = mbc_tree;
3243 }
3244 }
3245 else
3246#endif /* not RE_ENABLE_I18N */
3247 {
3248#ifdef RE_ENABLE_I18N
3249 free_charset (mbcset);
3250#endif
3251 /* Build a tree for simple bracket. */
3252 br_token.type = SIMPLE_BRACKET;
3253 br_token.opr.sbcset = sbcset;
3254 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3255 if (BE (work_tree == NULL, 0))
3256 goto parse_bracket_exp_espace;
3257 }
3258 return work_tree;
3259
3260 parse_bracket_exp_espace:
3261 *err = REG_ESPACE;
3262 parse_bracket_exp_free_return:
3263 re_free (sbcset);
3264#ifdef RE_ENABLE_I18N
3265 free_charset (mbcset);
3266#endif /* RE_ENABLE_I18N */
3267 return NULL;
3268}
3269
3270/* Parse an element in the bracket expression. */
3271
3272static reg_errcode_t
3273parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3274 re_token_t *token, int token_len, re_dfa_t *dfa,
3275 reg_syntax_t syntax, int accept_hyphen)
3276{
3277#ifdef RE_ENABLE_I18N
3278 int cur_char_size;
3279 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3280 if (cur_char_size > 1)
3281 {
3282 elem->type = MB_CHAR;
3283 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3284 re_string_skip_bytes (regexp, cur_char_size);
3285 return REG_NOERROR;
3286 }
3287#endif /* RE_ENABLE_I18N */
3288 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3289 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3290 || token->type == OP_OPEN_EQUIV_CLASS)
3291 return parse_bracket_symbol (elem, regexp, token);
3292 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3293 {
3294 /* A '-' must only appear as anything but a range indicator before
3295 the closing bracket. Everything else is an error. */
3296 re_token_t token2;
3297 (void) peek_token_bracket (&token2, regexp, syntax);
3298 if (token2.type != OP_CLOSE_BRACKET)
3299 /* The actual error value is not standardized since this whole
3300 case is undefined. But ERANGE makes good sense. */
3301 return REG_ERANGE;
3302 }
3303 elem->type = SB_CHAR;
3304 elem->opr.ch = token->opr.c;
3305 return REG_NOERROR;
3306}
3307
3308/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3309 such as [:<character_class>:], [.<collating_element>.], and
3310 [=<equivalent_class>=]. */
3311
3312static reg_errcode_t
3313parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3314 re_token_t *token)
3315{
3316 unsigned char ch, delim = token->opr.c;
3317 int i = 0;
3318 if (re_string_eoi(regexp))
3319 return REG_EBRACK;
3320 for (;; ++i)
3321 {
3322 if (i >= BRACKET_NAME_BUF_SIZE)
3323 return REG_EBRACK;
3324 if (token->type == OP_OPEN_CHAR_CLASS)
3325 ch = re_string_fetch_byte_case (regexp);
3326 else
3327 ch = re_string_fetch_byte (regexp);
3328 if (re_string_eoi(regexp))
3329 return REG_EBRACK;
3330 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3331 break;
3332 elem->opr.name[i] = ch;
3333 }
3334 re_string_skip_bytes (regexp, 1);
3335 elem->opr.name[i] = '\0';
3336 switch (token->type)
3337 {
3338 case OP_OPEN_COLL_ELEM:
3339 elem->type = COLL_SYM;
3340 break;
3341 case OP_OPEN_EQUIV_CLASS:
3342 elem->type = EQUIV_CLASS;
3343 break;
3344 case OP_OPEN_CHAR_CLASS:
3345 elem->type = CHAR_CLASS;
3346 break;
3347 default:
3348 break;
3349 }
3350 return REG_NOERROR;
3351}
3352
3353 /* Helper function for parse_bracket_exp.
3354 Build the equivalence class which is represented by NAME.
3355 The result are written to MBCSET and SBCSET.
3356 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3357 is a pointer argument sinse we may update it. */
3358
3359static reg_errcode_t
3360#ifdef RE_ENABLE_I18N
3361build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3362 int *equiv_class_alloc, const unsigned char *name)
3363#else /* not RE_ENABLE_I18N */
3364build_equiv_class (bitset_t sbcset, const unsigned char *name)
3365#endif /* not RE_ENABLE_I18N */
3366{
3367#ifdef _LIBC
3368 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3369 if (nrules != 0)
3370 {
3371 const int32_t *table, *indirect;
3372 const unsigned char *weights, *extra, *cp;
3373 unsigned char char_buf[2];
3374 int32_t idx1, idx2;
3375 unsigned int ch;
3376 size_t len;
3377 /* This #include defines a local function! */
3378# include <locale/weight.h>
3379 /* Calculate the index for equivalence class. */
3380 cp = name;
3381 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3382 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3383 _NL_COLLATE_WEIGHTMB);
3384 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3385 _NL_COLLATE_EXTRAMB);
3386 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3387 _NL_COLLATE_INDIRECTMB);
3388 idx1 = findidx (&cp);
3389 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3390 /* This isn't a valid character. */
3391 return REG_ECOLLATE;
3392
3393 /* Build single byte matcing table for this equivalence class. */
3394 char_buf[1] = (unsigned char) '\0';
3395 len = weights[idx1];
3396 for (ch = 0; ch < SBC_MAX; ++ch)
3397 {
3398 char_buf[0] = ch;
3399 cp = char_buf;
3400 idx2 = findidx (&cp);
3401/*
3402 idx2 = table[ch];
3403*/
3404 if (idx2 == 0)
3405 /* This isn't a valid character. */
3406 continue;
3407 if (len == weights[idx2])
3408 {
3409 int cnt = 0;
3410 while (cnt <= len &&
3411 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3412 ++cnt;
3413
3414 if (cnt > len)
3415 bitset_set (sbcset, ch);
3416 }
3417 }
3418 /* Check whether the array has enough space. */
3419 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3420 {
3421 /* Not enough, realloc it. */
3422 /* +1 in case of mbcset->nequiv_classes is 0. */
3423 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3424 /* Use realloc since the array is NULL if *alloc == 0. */
3425 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3426 int32_t,
3427 new_equiv_class_alloc);
3428 if (BE (new_equiv_classes == NULL, 0))
3429 return REG_ESPACE;
3430 mbcset->equiv_classes = new_equiv_classes;
3431 *equiv_class_alloc = new_equiv_class_alloc;
3432 }
3433 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3434 }
3435 else
3436#endif /* _LIBC */
3437 {
3438 if (BE (strlen ((const char *) name) != 1, 0))
3439 return REG_ECOLLATE;
3440 bitset_set (sbcset, *name);
3441 }
3442 return REG_NOERROR;
3443}
3444
3445 /* Helper function for parse_bracket_exp.
3446 Build the character class which is represented by NAME.
3447 The result are written to MBCSET and SBCSET.
3448 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3449 is a pointer argument sinse we may update it. */
3450
3451static reg_errcode_t
3452#ifdef RE_ENABLE_I18N
3453build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3454 re_charset_t *mbcset, int *char_class_alloc,
3455 const unsigned char *class_name, reg_syntax_t syntax)
3456#else /* not RE_ENABLE_I18N */
3457build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3458 const unsigned char *class_name, reg_syntax_t syntax)
3459#endif /* not RE_ENABLE_I18N */
3460{
3461 int i;
3462 const char *name = (const char *) class_name;
3463
3464 /* In case of REG_ICASE "upper" and "lower" match the both of
3465 upper and lower cases. */
3466 if ((syntax & RE_ICASE)
3467 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3468 name = "alpha";
3469
3470#ifdef RE_ENABLE_I18N
3471 /* Check the space of the arrays. */
3472 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3473 {
3474 /* Not enough, realloc it. */
3475 /* +1 in case of mbcset->nchar_classes is 0. */
3476 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3477 /* Use realloc since array is NULL if *alloc == 0. */
3478 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3479 new_char_class_alloc);
3480 if (BE (new_char_classes == NULL, 0))
3481 return REG_ESPACE;
3482 mbcset->char_classes = new_char_classes;
3483 *char_class_alloc = new_char_class_alloc;
3484 }
3485 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3486#endif /* RE_ENABLE_I18N */
3487
3488#define BUILD_CHARCLASS_LOOP(ctype_func) \
3489 do { \
3490 if (BE (trans != NULL, 0)) \
3491 { \
3492 for (i = 0; i < SBC_MAX; ++i) \
3493 if (ctype_func (i)) \
3494 bitset_set (sbcset, trans[i]); \
3495 } \
3496 else \
3497 { \
3498 for (i = 0; i < SBC_MAX; ++i) \
3499 if (ctype_func (i)) \
3500 bitset_set (sbcset, i); \
3501 } \
3502 } while (0)
3503
3504 if (strcmp (name, "alnum") == 0)
3505 BUILD_CHARCLASS_LOOP (isalnum);
3506 else if (strcmp (name, "cntrl") == 0)
3507 BUILD_CHARCLASS_LOOP (iscntrl);
3508 else if (strcmp (name, "lower") == 0)
3509 BUILD_CHARCLASS_LOOP (islower);
3510 else if (strcmp (name, "space") == 0)
3511 BUILD_CHARCLASS_LOOP (isspace);
3512 else if (strcmp (name, "alpha") == 0)
3513 BUILD_CHARCLASS_LOOP (isalpha);
3514 else if (strcmp (name, "digit") == 0)
3515 BUILD_CHARCLASS_LOOP (isdigit);
3516 else if (strcmp (name, "print") == 0)
3517 BUILD_CHARCLASS_LOOP (isprint);
3518 else if (strcmp (name, "upper") == 0)
3519 BUILD_CHARCLASS_LOOP (isupper);
3520 else if (strcmp (name, "blank") == 0)
3521 BUILD_CHARCLASS_LOOP (isblank);
3522 else if (strcmp (name, "graph") == 0)
3523 BUILD_CHARCLASS_LOOP (isgraph);
3524 else if (strcmp (name, "punct") == 0)
3525 BUILD_CHARCLASS_LOOP (ispunct);
3526 else if (strcmp (name, "xdigit") == 0)
3527 BUILD_CHARCLASS_LOOP (isxdigit);
3528 else
3529 return REG_ECTYPE;
3530
3531 return REG_NOERROR;
3532}
3533
3534static bin_tree_t *
3535build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3536 const unsigned char *class_name,
3537 const unsigned char *extra, int non_match,
3538 reg_errcode_t *err)
3539{
3540 re_bitset_ptr_t sbcset;
3541#ifdef RE_ENABLE_I18N
3542 re_charset_t *mbcset;
3543 int alloc = 0;
3544#endif /* not RE_ENABLE_I18N */
3545 reg_errcode_t ret;
3546 re_token_t br_token;
3547 bin_tree_t *tree;
3548
3549 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3550#ifdef RE_ENABLE_I18N
3551 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3552#endif /* RE_ENABLE_I18N */
3553
3554#ifdef RE_ENABLE_I18N
3555 if (BE (sbcset == NULL || mbcset == NULL, 0))
3556#else /* not RE_ENABLE_I18N */
3557 if (BE (sbcset == NULL, 0))
3558#endif /* not RE_ENABLE_I18N */
3559 {
3560 *err = REG_ESPACE;
3561 return NULL;
3562 }
3563
3564 if (non_match)
3565 {
3566#ifdef RE_ENABLE_I18N
3567 /*
3568 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3569 bitset_set(cset->sbcset, '\0');
3570 */
3571 mbcset->non_match = 1;
3572#endif /* not RE_ENABLE_I18N */
3573 }
3574
3575 /* We don't care the syntax in this case. */
3576 ret = build_charclass (trans, sbcset,
3577#ifdef RE_ENABLE_I18N
3578 mbcset, &alloc,
3579#endif /* RE_ENABLE_I18N */
3580 class_name, 0);
3581
3582 if (BE (ret != REG_NOERROR, 0))
3583 {
3584 re_free (sbcset);
3585#ifdef RE_ENABLE_I18N
3586 free_charset (mbcset);
3587#endif /* RE_ENABLE_I18N */
3588 *err = ret;
3589 return NULL;
3590 }
3591 /* \w match '_' also. */
3592 for (; *extra; extra++)
3593 bitset_set (sbcset, *extra);
3594
3595 /* If it is non-matching list. */
3596 if (non_match)
3597 bitset_not (sbcset);
3598
3599#ifdef RE_ENABLE_I18N
3600 /* Ensure only single byte characters are set. */
3601 if (dfa->mb_cur_max > 1)
3602 bitset_mask (sbcset, dfa->sb_char);
3603#endif
3604
3605 /* Build a tree for simple bracket. */
3606 br_token.type = SIMPLE_BRACKET;
3607 br_token.opr.sbcset = sbcset;
3608 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3609 if (BE (tree == NULL, 0))
3610 goto build_word_op_espace;
3611
3612#ifdef RE_ENABLE_I18N
3613 if (dfa->mb_cur_max > 1)
3614 {
3615 bin_tree_t *mbc_tree;
3616 /* Build a tree for complex bracket. */
3617 br_token.type = COMPLEX_BRACKET;
3618 br_token.opr.mbcset = mbcset;
3619 dfa->has_mb_node = 1;
3620 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3621 if (BE (mbc_tree == NULL, 0))
3622 goto build_word_op_espace;
3623 /* Then join them by ALT node. */
3624 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3625 if (BE (mbc_tree != NULL, 1))
3626 return tree;
3627 }
3628 else
3629 {
3630 free_charset (mbcset);
3631 return tree;
3632 }
3633#else /* not RE_ENABLE_I18N */
3634 return tree;
3635#endif /* not RE_ENABLE_I18N */
3636
3637 build_word_op_espace:
3638 re_free (sbcset);
3639#ifdef RE_ENABLE_I18N
3640 free_charset (mbcset);
3641#endif /* RE_ENABLE_I18N */
3642 *err = REG_ESPACE;
3643 return NULL;
3644}
3645
3646/* This is intended for the expressions like "a{1,3}".
3647 Fetch a number from `input', and return the number.
3648 Return -1, if the number field is empty like "{,1}".
3649 Return -2, If an error is occured. */
3650
3651static int
3652fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3653{
3654 int num = -1;
3655 unsigned char c;
3656 while (1)
3657 {
3658 fetch_token (token, input, syntax);
3659 c = token->opr.c;
3660 if (BE (token->type == END_OF_RE, 0))
3661 return -2;
3662 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3663 break;
3664 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3665 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3666 num = (num > RE_DUP_MAX) ? -2 : num;
3667 }
3668 return num;
3669}
3670
3671
3672#ifdef RE_ENABLE_I18N
3673static void
3674free_charset (re_charset_t *cset)
3675{
3676 re_free (cset->mbchars);
3677# ifdef _LIBC
3678 re_free (cset->coll_syms);
3679 re_free (cset->equiv_classes);
3680 re_free (cset->range_starts);
3681 re_free (cset->range_ends);
3682# endif
3683 re_free (cset->char_classes);
3684 re_free (cset);
3685}
3686#endif /* RE_ENABLE_I18N */
3687
3688
3689/* Functions for binary tree operation. */
3690
3691/* Create a tree node. */
3692
3693static bin_tree_t *
3694create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3695 re_token_type_t type)
3696{
3697 re_token_t t;
3698 t.type = type;
3699 return create_token_tree (dfa, left, right, &t);
3700}
3701
3702static bin_tree_t *
3703create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3704 const re_token_t *token)
3705{
3706 bin_tree_t *tree;
3707 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3708 {
3709 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3710
3711 if (storage == NULL)
3712 return NULL;
3713 storage->next = dfa->str_tree_storage;
3714 dfa->str_tree_storage = storage;
3715 dfa->str_tree_storage_idx = 0;
3716 }
3717 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3718
3719 tree->parent = NULL;
3720 tree->left = left;
3721 tree->right = right;
3722 tree->token = *token;
3723 tree->token.duplicated = 0;
3724 tree->token.opt_subexp = 0;
3725 tree->first = NULL;
3726 tree->next = NULL;
3727 tree->node_idx = -1;
3728
3729 if (left != NULL)
3730 left->parent = tree;
3731 if (right != NULL)
3732 right->parent = tree;
3733 return tree;
3734}
3735
3736/* Mark the tree SRC as an optional subexpression.
3737 To be called from preorder or postorder. */
3738
3739static reg_errcode_t
3740mark_opt_subexp (void *extra, bin_tree_t *node)
3741{
3742 int idx = (int) (long) extra;
3743 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3744 node->token.opt_subexp = 1;
3745
3746 return REG_NOERROR;
3747}
3748
3749/* Free the allocated memory inside NODE. */
3750
3751static void
3752free_token (re_token_t *node)
3753{
3754#ifdef RE_ENABLE_I18N
3755 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3756 free_charset (node->opr.mbcset);
3757 else
3758#endif /* RE_ENABLE_I18N */
3759 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3760 re_free (node->opr.sbcset);
3761}
3762
3763/* Worker function for tree walking. Free the allocated memory inside NODE
3764 and its children. */
3765
3766static reg_errcode_t
3767free_tree (void *extra, bin_tree_t *node)
3768{
3769 free_token (&node->token);
3770 return REG_NOERROR;
3771}
3772
3773
3774/* Duplicate the node SRC, and return new node. This is a preorder
3775 visit similar to the one implemented by the generic visitor, but
3776 we need more infrastructure to maintain two parallel trees --- so,
3777 it's easier to duplicate. */
3778
3779static bin_tree_t *
3780duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3781{
3782 const bin_tree_t *node;
3783 bin_tree_t *dup_root;
3784 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3785
3786 for (node = root; ; )
3787 {
3788 /* Create a new tree and link it back to the current parent. */
3789 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3790 if (*p_new == NULL)
3791 return NULL;
3792 (*p_new)->parent = dup_node;
3793 (*p_new)->token.duplicated = 1;
3794 dup_node = *p_new;
3795
3796 /* Go to the left node, or up and to the right. */
3797 if (node->left)
3798 {
3799 node = node->left;
3800 p_new = &dup_node->left;
3801 }
3802 else
3803 {
3804 const bin_tree_t *prev = NULL;
3805 while (node->right == prev || node->right == NULL)
3806 {
3807 prev = node;
3808 node = node->parent;
3809 dup_node = dup_node->parent;
3810 if (!node)
3811 return dup_root;
3812 }
3813 node = node->right;
3814 p_new = &dup_node->right;
3815 }
3816 }
3817}
Note: See TracBrowser for help on using the repository browser.