source: trunk/src/sed/lib/regexec.c@ 1838

Last change on this file since 1838 was 606, checked in by bird, 19 years ago

Made sed build using MSC.

File size: 125.3 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
56 internal_function;
57static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
61 internal_function;
62static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
67 regmatch_t *regs,
68 re_node_set *eps_via_nodes)
69 internal_function;
70static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
75 internal_function;
76
77#ifdef RE_ENABLE_I18N
78static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
81 internal_function;
82#endif /* RE_ENABLE_I18N */
83static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
85 internal_function;
86static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
89 internal_function;
90static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
92 int str_idx,
93 re_node_set *dest_nodes)
94 internal_function;
95static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
98 internal_function;
99static int check_dst_limits (const re_match_context_t *mctx,
100 re_node_set *limits,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
106 internal_function;
107static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
114 re_node_set *limits,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
120 internal_function;
121static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
122 re_dfastate_t **dst,
123 re_dfastate_t **src, int num)
124 internal_function;
125static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
133 internal_function;
134static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
137#if 0
138static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
141 internal_function;
142#endif
143#ifdef RE_ENABLE_I18N
144static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
146 internal_function;
147#endif /* RE_ENABLE_I18N */
148static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
150 internal_function;
151static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
153 internal_function;
154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
158 internal_function;
159static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
166 int str_idx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
169 internal_function;
170static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
173 internal_function;
174static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
181 internal_function;
182static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184#ifdef RE_ENABLE_I18N
185static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
187 internal_function;
188# ifdef _LIBC
189static unsigned int find_collation_sequence_value (const unsigned char *mbs,
190 size_t name_len)
191 internal_function;
192# endif /* _LIBC */
193#endif /* RE_ENABLE_I18N */
194static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
195 const re_dfastate_t *state,
196 re_node_set *states_node,
197 bitset_t *states_ch) internal_function;
198static int check_node_accept (const re_match_context_t *mctx,
199 const re_token_t *node, int idx)
200 internal_function;
201static reg_errcode_t extend_buffers (re_match_context_t *mctx)
202 internal_function;
203
204
205/* Entry point for POSIX code. */
206
207/* regexec searches for a given pattern, specified by PREG, in the
208 string STRING.
209
210 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
211 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
212 least NMATCH elements, and we set them to the offsets of the
213 corresponding matched substrings.
214
215 EFLAGS specifies `execution flags' which affect matching: if
216 REG_NOTBOL is set, then ^ does not match at the beginning of the
217 string; if REG_NOTEOL is set, then $ does not match at the end.
218
219 We return 0 if we find a match and REG_NOMATCH if not. */
220
221int
222regexec (preg, string, nmatch, pmatch, eflags)
223 const regex_t *__restrict preg;
224 const char *__restrict string;
225 size_t nmatch;
226 regmatch_t pmatch[];
227 int eflags;
228{
229 reg_errcode_t err;
230 int start, length;
231 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
232
233 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
234 return REG_BADPAT;
235
236 if (eflags & REG_STARTEND)
237 {
238 start = pmatch[0].rm_so;
239 length = pmatch[0].rm_eo;
240 }
241 else
242 {
243 start = 0;
244 length = strlen (string);
245 }
246
247 __libc_lock_lock (dfa->lock);
248 if (preg->no_sub)
249 err = re_search_internal (preg, string, length, start, length - start,
250 length, 0, NULL, eflags);
251 else
252 err = re_search_internal (preg, string, length, start, length - start,
253 length, nmatch, pmatch, eflags);
254 __libc_lock_unlock (dfa->lock);
255 return err != REG_NOERROR;
256}
257
258#ifdef _LIBC
259# include <shlib-compat.h>
260versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
261
262# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
263__typeof__ (__regexec) __compat_regexec;
264
265int
266attribute_compat_text_section
267__compat_regexec (const regex_t *__restrict preg,
268 const char *__restrict string, size_t nmatch,
269 regmatch_t pmatch[], int eflags)
270{
271 return regexec (preg, string, nmatch, pmatch,
272 eflags & (REG_NOTBOL | REG_NOTEOL));
273}
274compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
275# endif
276#endif
277
278/* Entry points for GNU code. */
279
280/* re_match, re_search, re_match_2, re_search_2
281
282 The former two functions operate on STRING with length LENGTH,
283 while the later two operate on concatenation of STRING1 and STRING2
284 with lengths LENGTH1 and LENGTH2, respectively.
285
286 re_match() matches the compiled pattern in BUFP against the string,
287 starting at index START.
288
289 re_search() first tries matching at index START, then it tries to match
290 starting from index START + 1, and so on. The last start position tried
291 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
292 way as re_match().)
293
294 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
295 the first STOP characters of the concatenation of the strings should be
296 concerned.
297
298 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
299 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
300 computed relative to the concatenation, not relative to the individual
301 strings.)
302
303 On success, re_match* functions return the length of the match, re_search*
304 return the position of the start of the match. Return value -1 means no
305 match was found and -2 indicates an internal error. */
306
307int
308re_match (bufp, string, length, start, regs)
309 struct re_pattern_buffer *bufp;
310 const char *string;
311 int length, start;
312 struct re_registers *regs;
313{
314 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
315}
316#ifdef _LIBC
317weak_alias (__re_match, re_match)
318#endif
319
320int
321re_search (bufp, string, length, start, range, regs)
322 struct re_pattern_buffer *bufp;
323 const char *string;
324 int length, start, range;
325 struct re_registers *regs;
326{
327 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
328}
329#ifdef _LIBC
330weak_alias (__re_search, re_search)
331#endif
332
333int
334re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
335 struct re_pattern_buffer *bufp;
336 const char *string1, *string2;
337 int length1, length2, start, stop;
338 struct re_registers *regs;
339{
340 return re_search_2_stub (bufp, string1, length1, string2, length2,
341 start, 0, regs, stop, 1);
342}
343#ifdef _LIBC
344weak_alias (__re_match_2, re_match_2)
345#endif
346
347int
348re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
349 struct re_pattern_buffer *bufp;
350 const char *string1, *string2;
351 int length1, length2, start, range, stop;
352 struct re_registers *regs;
353{
354 return re_search_2_stub (bufp, string1, length1, string2, length2,
355 start, range, regs, stop, 0);
356}
357#ifdef _LIBC
358weak_alias (__re_search_2, re_search_2)
359#endif
360
361static int
362re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
363 stop, ret_len)
364 struct re_pattern_buffer *bufp;
365 const char *string1, *string2;
366 int length1, length2, start, range, stop, ret_len;
367 struct re_registers *regs;
368{
369 const char *str;
370 int rval;
371 int len = length1 + length2;
372 int free_str = 0;
373
374 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
375 return -2;
376
377 /* Concatenate the strings. */
378 if (length2 > 0)
379 if (length1 > 0)
380 {
381 char *s = re_malloc (char, len);
382
383 if (BE (s == NULL, 0))
384 return -2;
385#ifdef _LIBC
386 memcpy (__mempcpy (s, string1, length1), string2, length2);
387#else
388 memcpy (s, string1, length1);
389 memcpy (s + length1, string2, length2);
390#endif
391 str = s;
392 free_str = 1;
393 }
394 else
395 str = string2;
396 else
397 str = string1;
398
399 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
400 ret_len);
401 if (free_str)
402 re_free ((char *) str);
403 return rval;
404}
405
406/* The parameters have the same meaning as those of re_search.
407 Additional parameters:
408 If RET_LEN is nonzero the length of the match is returned (re_match style);
409 otherwise the position of the match is returned. */
410
411static int
412re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
413 struct re_pattern_buffer *bufp;
414 const char *string;
415 int length, start, range, stop, ret_len;
416 struct re_registers *regs;
417{
418 reg_errcode_t result;
419 regmatch_t *pmatch;
420 int nregs, rval;
421 int eflags = 0;
422 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
423
424 /* Check for out-of-range. */
425 if (BE (start < 0 || start > length, 0))
426 return -1;
427 if (BE (start + range > length, 0))
428 range = length - start;
429 else if (BE (start + range < 0, 0))
430 range = -start;
431
432 __libc_lock_lock (dfa->lock);
433
434 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
435 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
436
437 /* Compile fastmap if we haven't yet. */
438 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
439 re_compile_fastmap (bufp);
440
441 if (BE (bufp->no_sub, 0))
442 regs = NULL;
443
444 /* We need at least 1 register. */
445 if (regs == NULL)
446 nregs = 1;
447 else if (BE (bufp->regs_allocated == REGS_FIXED &&
448 regs->num_regs < bufp->re_nsub + 1, 0))
449 {
450 nregs = regs->num_regs;
451 if (BE (nregs < 1, 0))
452 {
453 /* Nothing can be copied to regs. */
454 regs = NULL;
455 nregs = 1;
456 }
457 }
458 else
459 nregs = bufp->re_nsub + 1;
460 pmatch = re_malloc (regmatch_t, nregs);
461 if (BE (pmatch == NULL, 0))
462 {
463 rval = -2;
464 goto out;
465 }
466
467 result = re_search_internal (bufp, string, length, start, range, stop,
468 nregs, pmatch, eflags);
469
470 rval = 0;
471
472 /* I hope we needn't fill ther regs with -1's when no match was found. */
473 if (result != REG_NOERROR)
474 rval = -1;
475 else if (regs != NULL)
476 {
477 /* If caller wants register contents data back, copy them. */
478 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
479 bufp->regs_allocated);
480 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
481 rval = -2;
482 }
483
484 if (BE (rval == 0, 1))
485 {
486 if (ret_len)
487 {
488 assert (pmatch[0].rm_so == start);
489 rval = pmatch[0].rm_eo - start;
490 }
491 else
492 rval = pmatch[0].rm_so;
493 }
494 re_free (pmatch);
495 out:
496 __libc_lock_unlock (dfa->lock);
497 return rval;
498}
499
500static unsigned
501re_copy_regs (regs, pmatch, nregs, regs_allocated)
502 struct re_registers *regs;
503 regmatch_t *pmatch;
504 int nregs, regs_allocated;
505{
506 int rval = REGS_REALLOCATE;
507 int i;
508 int need_regs = nregs + 1;
509 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
510 uses. */
511
512 /* Have the register data arrays been allocated? */
513 if (regs_allocated == REGS_UNALLOCATED)
514 { /* No. So allocate them with malloc. */
515 regs->start = re_malloc (regoff_t, need_regs);
516 regs->end = re_malloc (regoff_t, need_regs);
517 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
518 return REGS_UNALLOCATED;
519 regs->num_regs = need_regs;
520 }
521 else if (regs_allocated == REGS_REALLOCATE)
522 { /* Yes. If we need more elements than were already
523 allocated, reallocate them. If we need fewer, just
524 leave it alone. */
525 if (BE (need_regs > regs->num_regs, 0))
526 {
527 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
528 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
529 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
530 return REGS_UNALLOCATED;
531 regs->start = new_start;
532 regs->end = new_end;
533 regs->num_regs = need_regs;
534 }
535 }
536 else
537 {
538 assert (regs_allocated == REGS_FIXED);
539 /* This function may not be called with REGS_FIXED and nregs too big. */
540 assert (regs->num_regs >= nregs);
541 rval = REGS_FIXED;
542 }
543
544 /* Copy the regs. */
545 for (i = 0; i < nregs; ++i)
546 {
547 regs->start[i] = pmatch[i].rm_so;
548 regs->end[i] = pmatch[i].rm_eo;
549 }
550 for ( ; i < regs->num_regs; ++i)
551 regs->start[i] = regs->end[i] = -1;
552
553 return rval;
554}
555
556/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
557 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
558 this memory for recording register information. STARTS and ENDS
559 must be allocated using the malloc library routine, and must each
560 be at least NUM_REGS * sizeof (regoff_t) bytes long.
561
562 If NUM_REGS == 0, then subsequent matches should allocate their own
563 register data.
564
565 Unless this function is called, the first search or match using
566 PATTERN_BUFFER will allocate its own register data, without
567 freeing the old data. */
568
569void
570re_set_registers (bufp, regs, num_regs, starts, ends)
571 struct re_pattern_buffer *bufp;
572 struct re_registers *regs;
573 unsigned num_regs;
574 regoff_t *starts, *ends;
575{
576 if (num_regs)
577 {
578 bufp->regs_allocated = REGS_REALLOCATE;
579 regs->num_regs = num_regs;
580 regs->start = starts;
581 regs->end = ends;
582 }
583 else
584 {
585 bufp->regs_allocated = REGS_UNALLOCATED;
586 regs->num_regs = 0;
587 regs->start = regs->end = (regoff_t *) 0;
588 }
589}
590#ifdef _LIBC
591weak_alias (__re_set_registers, re_set_registers)
592#endif
593
594
595/* Entry points compatible with 4.2 BSD regex library. We don't define
596 them unless specifically requested. */
597
598#if defined _REGEX_RE_COMP || defined _LIBC
599int
600# ifdef _LIBC
601weak_function
602# endif
603re_exec (s)
604 const char *s;
605{
606 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
607}
608#endif /* _REGEX_RE_COMP */
609
610
611/* Internal entry point. */
612
613/* Searches for a compiled pattern PREG in the string STRING, whose
614 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
615 mingings with regexec. START, and RANGE have the same meanings
616 with re_search.
617 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
618 otherwise return the error code.
619 Note: We assume front end functions already check ranges.
620 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
621
622static reg_errcode_t
623re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
624 eflags)
625 const regex_t *preg;
626 const char *string;
627 int length, start, range, stop, eflags;
628 size_t nmatch;
629 regmatch_t pmatch[];
630{
631 reg_errcode_t err;
632 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
633 int left_lim, right_lim, incr;
634 int fl_longest_match, match_first, match_kind, match_last = -1;
635 int extra_nmatch;
636 int sb, ch;
637#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
638 re_match_context_t mctx = { .dfa = dfa };
639#else
640 re_match_context_t mctx;
641#endif
642 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
643 && range && !preg->can_be_null) ? preg->fastmap : NULL;
644 RE_TRANSLATE_TYPE t = preg->translate;
645
646#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
647 memset (&mctx, '\0', sizeof (re_match_context_t));
648 mctx.dfa = dfa;
649#endif
650
651 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
652 nmatch -= extra_nmatch;
653
654 /* Check if the DFA haven't been compiled. */
655 if (BE (preg->used == 0 || dfa->init_state == NULL
656 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
657 || dfa->init_state_begbuf == NULL, 0))
658 return REG_NOMATCH;
659
660#ifdef DEBUG
661 /* We assume front-end functions already check them. */
662 assert (start + range >= 0 && start + range <= length);
663#endif
664
665 /* If initial states with non-begbuf contexts have no elements,
666 the regex must be anchored. If preg->newline_anchor is set,
667 we'll never use init_state_nl, so do not check it. */
668 if (dfa->init_state->nodes.nelem == 0
669 && dfa->init_state_word->nodes.nelem == 0
670 && (dfa->init_state_nl->nodes.nelem == 0
671 || !preg->newline_anchor))
672 {
673 if (start != 0 && start + range != 0)
674 return REG_NOMATCH;
675 start = range = 0;
676 }
677
678 /* We must check the longest matching, if nmatch > 0. */
679 fl_longest_match = (nmatch != 0 || dfa->nbackref);
680
681 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
682 preg->translate, preg->syntax & RE_ICASE, dfa);
683 if (BE (err != REG_NOERROR, 0))
684 goto free_return;
685 mctx.input.stop = stop;
686 mctx.input.raw_stop = stop;
687 mctx.input.newline_anchor = preg->newline_anchor;
688
689 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
690 if (BE (err != REG_NOERROR, 0))
691 goto free_return;
692
693 /* We will log all the DFA states through which the dfa pass,
694 if nmatch > 1, or this dfa has "multibyte node", which is a
695 back-reference or a node which can accept multibyte character or
696 multi character collating element. */
697 if (nmatch > 1 || dfa->has_mb_node)
698 {
699 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
700 if (BE (mctx.state_log == NULL, 0))
701 {
702 err = REG_ESPACE;
703 goto free_return;
704 }
705 }
706 else
707 mctx.state_log = NULL;
708
709 match_first = start;
710 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
711 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
712
713 /* Check incrementally whether of not the input string match. */
714 incr = (range < 0) ? -1 : 1;
715 left_lim = (range < 0) ? start + range : start;
716 right_lim = (range < 0) ? start : start + range;
717 sb = dfa->mb_cur_max == 1;
718 match_kind =
719 (fastmap
720 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
721 | (range >= 0 ? 2 : 0)
722 | (t != NULL ? 1 : 0))
723 : 8);
724
725 for (;; match_first += incr)
726 {
727 err = REG_NOMATCH;
728 if (match_first < left_lim || right_lim < match_first)
729 goto free_return;
730
731 /* Advance as rapidly as possible through the string, until we
732 find a plausible place to start matching. This may be done
733 with varying efficiency, so there are various possibilities:
734 only the most common of them are specialized, in order to
735 save on code size. We use a switch statement for speed. */
736 switch (match_kind)
737 {
738 case 8:
739 /* No fastmap. */
740 break;
741
742 case 7:
743 /* Fastmap with single-byte translation, match forward. */
744 while (BE (match_first < right_lim, 1)
745 && !fastmap[t[(unsigned char) string[match_first]]])
746 ++match_first;
747 goto forward_match_found_start_or_reached_end;
748
749 case 6:
750 /* Fastmap without translation, match forward. */
751 while (BE (match_first < right_lim, 1)
752 && !fastmap[(unsigned char) string[match_first]])
753 ++match_first;
754
755 forward_match_found_start_or_reached_end:
756 if (BE (match_first == right_lim, 0))
757 {
758 ch = match_first >= length
759 ? 0 : (unsigned char) string[match_first];
760 if (!fastmap[t ? t[ch] : ch])
761 goto free_return;
762 }
763 break;
764
765 case 4:
766 case 5:
767 /* Fastmap without multi-byte translation, match backwards. */
768 while (match_first >= left_lim)
769 {
770 ch = match_first >= length
771 ? 0 : (unsigned char) string[match_first];
772 if (fastmap[t ? t[ch] : ch])
773 break;
774 --match_first;
775 }
776 if (match_first < left_lim)
777 goto free_return;
778 break;
779
780 default:
781 /* In this case, we can't determine easily the current byte,
782 since it might be a component byte of a multibyte
783 character. Then we use the constructed buffer instead. */
784 for (;;)
785 {
786 /* If MATCH_FIRST is out of the valid range, reconstruct the
787 buffers. */
788 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
789 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
790 {
791 err = re_string_reconstruct (&mctx.input, match_first,
792 eflags);
793 if (BE (err != REG_NOERROR, 0))
794 goto free_return;
795
796 offset = match_first - mctx.input.raw_mbs_idx;
797 }
798 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
799 Note that MATCH_FIRST must not be smaller than 0. */
800 ch = (match_first >= length
801 ? 0 : re_string_byte_at (&mctx.input, offset));
802 if (fastmap[ch])
803 break;
804 match_first += incr;
805 if (match_first < left_lim || match_first > right_lim)
806 {
807 err = REG_NOMATCH;
808 goto free_return;
809 }
810 }
811 break;
812 }
813
814 /* Reconstruct the buffers so that the matcher can assume that
815 the matching starts from the beginning of the buffer. */
816 err = re_string_reconstruct (&mctx.input, match_first, eflags);
817 if (BE (err != REG_NOERROR, 0))
818 goto free_return;
819
820#ifdef RE_ENABLE_I18N
821 /* Don't consider this char as a possible match start if it part,
822 yet isn't the head, of a multibyte character. */
823 if (!sb && !re_string_first_byte (&mctx.input, 0))
824 continue;
825#endif
826
827 /* It seems to be appropriate one, then use the matcher. */
828 /* We assume that the matching starts from 0. */
829 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
830 match_last = check_matching (&mctx, fl_longest_match,
831 range >= 0 ? &match_first : NULL);
832 if (match_last != -1)
833 {
834 if (BE (match_last == -2, 0))
835 {
836 err = REG_ESPACE;
837 goto free_return;
838 }
839 else
840 {
841 mctx.match_last = match_last;
842 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
843 {
844 re_dfastate_t *pstate = mctx.state_log[match_last];
845 mctx.last_node = check_halt_state_context (&mctx, pstate,
846 match_last);
847 }
848 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
849 || dfa->nbackref)
850 {
851 err = prune_impossible_nodes (&mctx);
852 if (err == REG_NOERROR)
853 break;
854 if (BE (err != REG_NOMATCH, 0))
855 goto free_return;
856 match_last = -1;
857 }
858 else
859 break; /* We found a match. */
860 }
861 }
862
863 match_ctx_clean (&mctx);
864 }
865
866#ifdef DEBUG
867 assert (match_last != -1);
868 assert (err == REG_NOERROR);
869#endif
870
871 /* Set pmatch[] if we need. */
872 if (nmatch > 0)
873 {
874 int reg_idx;
875
876 /* Initialize registers. */
877 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
878 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
879
880 /* Set the points where matching start/end. */
881 pmatch[0].rm_so = 0;
882 pmatch[0].rm_eo = mctx.match_last;
883
884 if (!preg->no_sub && nmatch > 1)
885 {
886 err = set_regs (preg, &mctx, nmatch, pmatch,
887 dfa->has_plural_match && dfa->nbackref > 0);
888 if (BE (err != REG_NOERROR, 0))
889 goto free_return;
890 }
891
892 /* At last, add the offset to the each registers, since we slided
893 the buffers so that we could assume that the matching starts
894 from 0. */
895 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
896 if (pmatch[reg_idx].rm_so != -1)
897 {
898#ifdef RE_ENABLE_I18N
899 if (BE (mctx.input.offsets_needed != 0, 0))
900 {
901 pmatch[reg_idx].rm_so =
902 (pmatch[reg_idx].rm_so == mctx.input.valid_len
903 ? mctx.input.valid_raw_len
904 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
905 pmatch[reg_idx].rm_eo =
906 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
907 ? mctx.input.valid_raw_len
908 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
909 }
910#else
911 assert (mctx.input.offsets_needed == 0);
912#endif
913 pmatch[reg_idx].rm_so += match_first;
914 pmatch[reg_idx].rm_eo += match_first;
915 }
916 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
917 {
918 pmatch[nmatch + reg_idx].rm_so = -1;
919 pmatch[nmatch + reg_idx].rm_eo = -1;
920 }
921
922 if (dfa->subexp_map)
923 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
924 if (dfa->subexp_map[reg_idx] != reg_idx)
925 {
926 pmatch[reg_idx + 1].rm_so
927 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
928 pmatch[reg_idx + 1].rm_eo
929 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
930 }
931 }
932
933 free_return:
934 re_free (mctx.state_log);
935 if (dfa->nbackref)
936 match_ctx_free (&mctx);
937 re_string_destruct (&mctx.input);
938 return err;
939}
940
941static reg_errcode_t
942prune_impossible_nodes (mctx)
943 re_match_context_t *mctx;
944{
945 const re_dfa_t *const dfa = mctx->dfa;
946 int halt_node, match_last;
947 reg_errcode_t ret;
948 re_dfastate_t **sifted_states;
949 re_dfastate_t **lim_states = NULL;
950 re_sift_context_t sctx;
951#ifdef DEBUG
952 assert (mctx->state_log != NULL);
953#endif
954 match_last = mctx->match_last;
955 halt_node = mctx->last_node;
956 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
957 if (BE (sifted_states == NULL, 0))
958 {
959 ret = REG_ESPACE;
960 goto free_return;
961 }
962 if (dfa->nbackref)
963 {
964 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
965 if (BE (lim_states == NULL, 0))
966 {
967 ret = REG_ESPACE;
968 goto free_return;
969 }
970 while (1)
971 {
972 memset (lim_states, '\0',
973 sizeof (re_dfastate_t *) * (match_last + 1));
974 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
975 match_last);
976 ret = sift_states_backward (mctx, &sctx);
977 re_node_set_free (&sctx.limits);
978 if (BE (ret != REG_NOERROR, 0))
979 goto free_return;
980 if (sifted_states[0] != NULL || lim_states[0] != NULL)
981 break;
982 do
983 {
984 --match_last;
985 if (match_last < 0)
986 {
987 ret = REG_NOMATCH;
988 goto free_return;
989 }
990 } while (mctx->state_log[match_last] == NULL
991 || !mctx->state_log[match_last]->halt);
992 halt_node = check_halt_state_context (mctx,
993 mctx->state_log[match_last],
994 match_last);
995 }
996 ret = merge_state_array (dfa, sifted_states, lim_states,
997 match_last + 1);
998 re_free (lim_states);
999 lim_states = NULL;
1000 if (BE (ret != REG_NOERROR, 0))
1001 goto free_return;
1002 }
1003 else
1004 {
1005 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1006 ret = sift_states_backward (mctx, &sctx);
1007 re_node_set_free (&sctx.limits);
1008 if (BE (ret != REG_NOERROR, 0))
1009 goto free_return;
1010 }
1011 re_free (mctx->state_log);
1012 mctx->state_log = sifted_states;
1013 sifted_states = NULL;
1014 mctx->last_node = halt_node;
1015 mctx->match_last = match_last;
1016 ret = REG_NOERROR;
1017 free_return:
1018 re_free (sifted_states);
1019 re_free (lim_states);
1020 return ret;
1021}
1022
1023/* Acquire an initial state and return it.
1024 We must select appropriate initial state depending on the context,
1025 since initial states may have constraints like "\<", "^", etc.. */
1026
1027static inline re_dfastate_t *
1028__attribute ((always_inline)) internal_function
1029acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1030 int idx)
1031{
1032 const re_dfa_t *const dfa = mctx->dfa;
1033 if (dfa->init_state->has_constraint)
1034 {
1035 unsigned int context;
1036 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1037 if (IS_WORD_CONTEXT (context))
1038 return dfa->init_state_word;
1039 else if (IS_ORDINARY_CONTEXT (context))
1040 return dfa->init_state;
1041 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1042 return dfa->init_state_begbuf;
1043 else if (IS_NEWLINE_CONTEXT (context))
1044 return dfa->init_state_nl;
1045 else if (IS_BEGBUF_CONTEXT (context))
1046 {
1047 /* It is relatively rare case, then calculate on demand. */
1048 return re_acquire_state_context (err, dfa,
1049 dfa->init_state->entrance_nodes,
1050 context);
1051 }
1052 else
1053 /* Must not happen? */
1054 return dfa->init_state;
1055 }
1056 else
1057 return dfa->init_state;
1058}
1059
1060/* Check whether the regular expression match input string INPUT or not,
1061 and return the index where the matching end, return -1 if not match,
1062 or return -2 in case of an error.
1063 FL_LONGEST_MATCH means we want the POSIX longest matching.
1064 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1065 next place where we may want to try matching.
1066 Note that the matcher assume that the maching starts from the current
1067 index of the buffer. */
1068
1069static int
1070internal_function
1071check_matching (re_match_context_t *mctx, int fl_longest_match,
1072 int *p_match_first)
1073{
1074 const re_dfa_t *const dfa = mctx->dfa;
1075 reg_errcode_t err;
1076 int match = 0;
1077 int match_last = -1;
1078 int cur_str_idx = re_string_cur_idx (&mctx->input);
1079 re_dfastate_t *cur_state;
1080 int at_init_state = p_match_first != NULL;
1081 int next_start_idx = cur_str_idx;
1082
1083 err = REG_NOERROR;
1084 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1085 /* An initial state must not be NULL (invalid). */
1086 if (BE (cur_state == NULL, 0))
1087 {
1088 assert (err == REG_ESPACE);
1089 return -2;
1090 }
1091
1092 if (mctx->state_log != NULL)
1093 {
1094 mctx->state_log[cur_str_idx] = cur_state;
1095
1096 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1097 later. E.g. Processing back references. */
1098 if (BE (dfa->nbackref, 0))
1099 {
1100 at_init_state = 0;
1101 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1102 if (BE (err != REG_NOERROR, 0))
1103 return err;
1104
1105 if (cur_state->has_backref)
1106 {
1107 err = transit_state_bkref (mctx, &cur_state->nodes);
1108 if (BE (err != REG_NOERROR, 0))
1109 return err;
1110 }
1111 }
1112 }
1113
1114 /* If the RE accepts NULL string. */
1115 if (BE (cur_state->halt, 0))
1116 {
1117 if (!cur_state->has_constraint
1118 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1119 {
1120 if (!fl_longest_match)
1121 return cur_str_idx;
1122 else
1123 {
1124 match_last = cur_str_idx;
1125 match = 1;
1126 }
1127 }
1128 }
1129
1130 while (!re_string_eoi (&mctx->input))
1131 {
1132 re_dfastate_t *old_state = cur_state;
1133 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1134
1135 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1136 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1137 && mctx->input.valid_len < mctx->input.len))
1138 {
1139 err = extend_buffers (mctx);
1140 if (BE (err != REG_NOERROR, 0))
1141 {
1142 assert (err == REG_ESPACE);
1143 return -2;
1144 }
1145 }
1146
1147 cur_state = transit_state (&err, mctx, cur_state);
1148 if (mctx->state_log != NULL)
1149 cur_state = merge_state_with_log (&err, mctx, cur_state);
1150
1151 if (cur_state == NULL)
1152 {
1153 /* Reached the invalid state or an error. Try to recover a valid
1154 state using the state log, if available and if we have not
1155 already found a valid (even if not the longest) match. */
1156 if (BE (err != REG_NOERROR, 0))
1157 return -2;
1158
1159 if (mctx->state_log == NULL
1160 || (match && !fl_longest_match)
1161 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1162 break;
1163 }
1164
1165 if (BE (at_init_state, 0))
1166 {
1167 if (old_state == cur_state)
1168 next_start_idx = next_char_idx;
1169 else
1170 at_init_state = 0;
1171 }
1172
1173 if (cur_state->halt)
1174 {
1175 /* Reached a halt state.
1176 Check the halt state can satisfy the current context. */
1177 if (!cur_state->has_constraint
1178 || check_halt_state_context (mctx, cur_state,
1179 re_string_cur_idx (&mctx->input)))
1180 {
1181 /* We found an appropriate halt state. */
1182 match_last = re_string_cur_idx (&mctx->input);
1183 match = 1;
1184
1185 /* We found a match, do not modify match_first below. */
1186 p_match_first = NULL;
1187 if (!fl_longest_match)
1188 break;
1189 }
1190 }
1191 }
1192
1193 if (p_match_first)
1194 *p_match_first += next_start_idx;
1195
1196 return match_last;
1197}
1198
1199/* Check NODE match the current context. */
1200
1201static int
1202internal_function
1203check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1204{
1205 re_token_type_t type = dfa->nodes[node].type;
1206 unsigned int constraint = dfa->nodes[node].constraint;
1207 if (type != END_OF_RE)
1208 return 0;
1209 if (!constraint)
1210 return 1;
1211 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1212 return 0;
1213 return 1;
1214}
1215
1216/* Check the halt state STATE match the current context.
1217 Return 0 if not match, if the node, STATE has, is a halt node and
1218 match the context, return the node. */
1219
1220static int
1221internal_function
1222check_halt_state_context (const re_match_context_t *mctx,
1223 const re_dfastate_t *state, int idx)
1224{
1225 int i;
1226 unsigned int context;
1227#ifdef DEBUG
1228 assert (state->halt);
1229#endif
1230 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1231 for (i = 0; i < state->nodes.nelem; ++i)
1232 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1233 return state->nodes.elems[i];
1234 return 0;
1235}
1236
1237/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1238 corresponding to the DFA).
1239 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1240 of errors. */
1241
1242static int
1243internal_function
1244proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1245 int *pidx, int node, re_node_set *eps_via_nodes,
1246 struct re_fail_stack_t *fs)
1247{
1248 const re_dfa_t *const dfa = mctx->dfa;
1249 int i, err;
1250 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1251 {
1252 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1253 re_node_set *edests = &dfa->edests[node];
1254 int dest_node;
1255 err = re_node_set_insert (eps_via_nodes, node);
1256 if (BE (err < 0, 0))
1257 return -2;
1258 /* Pick up a valid destination, or return -1 if none is found. */
1259 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1260 {
1261 int candidate = edests->elems[i];
1262 if (!re_node_set_contains (cur_nodes, candidate))
1263 continue;
1264 if (dest_node == -1)
1265 dest_node = candidate;
1266
1267 else
1268 {
1269 /* In order to avoid infinite loop like "(a*)*", return the second
1270 epsilon-transition if the first was already considered. */
1271 if (re_node_set_contains (eps_via_nodes, dest_node))
1272 return candidate;
1273
1274 /* Otherwise, push the second epsilon-transition on the fail stack. */
1275 else if (fs != NULL
1276 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1277 eps_via_nodes))
1278 return -2;
1279
1280 /* We know we are going to exit. */
1281 break;
1282 }
1283 }
1284 return dest_node;
1285 }
1286 else
1287 {
1288 int naccepted = 0;
1289 re_token_type_t type = dfa->nodes[node].type;
1290
1291#ifdef RE_ENABLE_I18N
1292 if (dfa->nodes[node].accept_mb)
1293 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1294 else
1295#endif /* RE_ENABLE_I18N */
1296 if (type == OP_BACK_REF)
1297 {
1298 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1299 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1300 if (fs != NULL)
1301 {
1302 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1303 return -1;
1304 else if (naccepted)
1305 {
1306 char *buf = (char *) re_string_get_buffer (&mctx->input);
1307 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1308 naccepted) != 0)
1309 return -1;
1310 }
1311 }
1312
1313 if (naccepted == 0)
1314 {
1315 int dest_node;
1316 err = re_node_set_insert (eps_via_nodes, node);
1317 if (BE (err < 0, 0))
1318 return -2;
1319 dest_node = dfa->edests[node].elems[0];
1320 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1321 dest_node))
1322 return dest_node;
1323 }
1324 }
1325
1326 if (naccepted != 0
1327 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1328 {
1329 int dest_node = dfa->nexts[node];
1330 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1331 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1332 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1333 dest_node)))
1334 return -1;
1335 re_node_set_empty (eps_via_nodes);
1336 return dest_node;
1337 }
1338 }
1339 return -1;
1340}
1341
1342static reg_errcode_t
1343internal_function
1344push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1345 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1346{
1347 reg_errcode_t err;
1348 int num = fs->num++;
1349 if (fs->num == fs->alloc)
1350 {
1351 struct re_fail_stack_ent_t *new_array;
1352 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1353 * fs->alloc * 2));
1354 if (new_array == NULL)
1355 return REG_ESPACE;
1356 fs->alloc *= 2;
1357 fs->stack = new_array;
1358 }
1359 fs->stack[num].idx = str_idx;
1360 fs->stack[num].node = dest_node;
1361 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1362 if (fs->stack[num].regs == NULL)
1363 return REG_ESPACE;
1364 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1365 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1366 return err;
1367}
1368
1369static int
1370internal_function
1371pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1372 regmatch_t *regs, re_node_set *eps_via_nodes)
1373{
1374 int num = --fs->num;
1375 assert (num >= 0);
1376 *pidx = fs->stack[num].idx;
1377 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1378 re_node_set_free (eps_via_nodes);
1379 re_free (fs->stack[num].regs);
1380 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1381 return fs->stack[num].node;
1382}
1383
1384/* Set the positions where the subexpressions are starts/ends to registers
1385 PMATCH.
1386 Note: We assume that pmatch[0] is already set, and
1387 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1388
1389static reg_errcode_t
1390internal_function
1391set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1392 regmatch_t *pmatch, int fl_backtrack)
1393{
1394 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1395 int idx, cur_node;
1396 re_node_set eps_via_nodes;
1397 struct re_fail_stack_t *fs;
1398 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1399 regmatch_t *prev_idx_match;
1400 int prev_idx_match_malloced = 0;
1401
1402#ifdef DEBUG
1403 assert (nmatch > 1);
1404 assert (mctx->state_log != NULL);
1405#endif
1406 if (fl_backtrack)
1407 {
1408 fs = &fs_body;
1409 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1410 if (fs->stack == NULL)
1411 return REG_ESPACE;
1412 }
1413 else
1414 fs = NULL;
1415
1416 cur_node = dfa->init_node;
1417 re_node_set_init_empty (&eps_via_nodes);
1418
1419 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1420 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1421 else
1422 {
1423 prev_idx_match = re_malloc (regmatch_t, nmatch);
1424 if (prev_idx_match == NULL)
1425 {
1426 free_fail_stack_return (fs);
1427 return REG_ESPACE;
1428 }
1429 prev_idx_match_malloced = 1;
1430 }
1431 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1432
1433 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1434 {
1435 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1436
1437 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1438 {
1439 int reg_idx;
1440 if (fs)
1441 {
1442 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1443 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1444 break;
1445 if (reg_idx == nmatch)
1446 {
1447 re_node_set_free (&eps_via_nodes);
1448 if (prev_idx_match_malloced)
1449 re_free (prev_idx_match);
1450 return free_fail_stack_return (fs);
1451 }
1452 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1453 &eps_via_nodes);
1454 }
1455 else
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 if (prev_idx_match_malloced)
1459 re_free (prev_idx_match);
1460 return REG_NOERROR;
1461 }
1462 }
1463
1464 /* Proceed to next node. */
1465 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1466 &eps_via_nodes, fs);
1467
1468 if (BE (cur_node < 0, 0))
1469 {
1470 if (BE (cur_node == -2, 0))
1471 {
1472 re_node_set_free (&eps_via_nodes);
1473 if (prev_idx_match_malloced)
1474 re_free (prev_idx_match);
1475 free_fail_stack_return (fs);
1476 return REG_ESPACE;
1477 }
1478 if (fs)
1479 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1480 &eps_via_nodes);
1481 else
1482 {
1483 re_node_set_free (&eps_via_nodes);
1484 if (prev_idx_match_malloced)
1485 re_free (prev_idx_match);
1486 return REG_NOMATCH;
1487 }
1488 }
1489 }
1490 re_node_set_free (&eps_via_nodes);
1491 if (prev_idx_match_malloced)
1492 re_free (prev_idx_match);
1493 return free_fail_stack_return (fs);
1494}
1495
1496static reg_errcode_t
1497internal_function
1498free_fail_stack_return (struct re_fail_stack_t *fs)
1499{
1500 if (fs)
1501 {
1502 int fs_idx;
1503 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1504 {
1505 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1506 re_free (fs->stack[fs_idx].regs);
1507 }
1508 re_free (fs->stack);
1509 }
1510 return REG_NOERROR;
1511}
1512
1513static void
1514internal_function
1515update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1516 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1517{
1518 int type = dfa->nodes[cur_node].type;
1519 if (type == OP_OPEN_SUBEXP)
1520 {
1521 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1522
1523 /* We are at the first node of this sub expression. */
1524 if (reg_num < nmatch)
1525 {
1526 pmatch[reg_num].rm_so = cur_idx;
1527 pmatch[reg_num].rm_eo = -1;
1528 }
1529 }
1530 else if (type == OP_CLOSE_SUBEXP)
1531 {
1532 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1533 if (reg_num < nmatch)
1534 {
1535 /* We are at the last node of this sub expression. */
1536 if (pmatch[reg_num].rm_so < cur_idx)
1537 {
1538 pmatch[reg_num].rm_eo = cur_idx;
1539 /* This is a non-empty match or we are not inside an optional
1540 subexpression. Accept this right away. */
1541 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1542 }
1543 else
1544 {
1545 if (dfa->nodes[cur_node].opt_subexp
1546 && prev_idx_match[reg_num].rm_so != -1)
1547 /* We transited through an empty match for an optional
1548 subexpression, like (a?)*, and this is not the subexp's
1549 first match. Copy back the old content of the registers
1550 so that matches of an inner subexpression are undone as
1551 well, like in ((a?))*. */
1552 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1553 else
1554 /* We completed a subexpression, but it may be part of
1555 an optional one, so do not update PREV_IDX_MATCH. */
1556 pmatch[reg_num].rm_eo = cur_idx;
1557 }
1558 }
1559 }
1560}
1561
1562/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1563 and sift the nodes in each states according to the following rules.
1564 Updated state_log will be wrote to STATE_LOG.
1565
1566 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1567 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1568 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1569 the LAST_NODE, we throw away the node `a'.
1570 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1571 string `s' and transit to `b':
1572 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1573 away the node `a'.
1574 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1575 thrown away, we throw away the node `a'.
1576 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1577 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1578 node `a'.
1579 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1580 we throw away the node `a'. */
1581
1582#define STATE_NODE_CONTAINS(state,node) \
1583 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1584
1585static reg_errcode_t
1586internal_function
1587sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1588{
1589 reg_errcode_t err;
1590 int null_cnt = 0;
1591 int str_idx = sctx->last_str_idx;
1592 re_node_set cur_dest;
1593
1594#ifdef DEBUG
1595 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1596#endif
1597
1598 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1599 transit to the last_node and the last_node itself. */
1600 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1601 if (BE (err != REG_NOERROR, 0))
1602 return err;
1603 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1604 if (BE (err != REG_NOERROR, 0))
1605 goto free_return;
1606
1607 /* Then check each states in the state_log. */
1608 while (str_idx > 0)
1609 {
1610 /* Update counters. */
1611 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1612 if (null_cnt > mctx->max_mb_elem_len)
1613 {
1614 memset (sctx->sifted_states, '\0',
1615 sizeof (re_dfastate_t *) * str_idx);
1616 re_node_set_free (&cur_dest);
1617 return REG_NOERROR;
1618 }
1619 re_node_set_empty (&cur_dest);
1620 --str_idx;
1621
1622 if (mctx->state_log[str_idx])
1623 {
1624 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1625 if (BE (err != REG_NOERROR, 0))
1626 goto free_return;
1627 }
1628
1629 /* Add all the nodes which satisfy the following conditions:
1630 - It can epsilon transit to a node in CUR_DEST.
1631 - It is in CUR_SRC.
1632 And update state_log. */
1633 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1634 if (BE (err != REG_NOERROR, 0))
1635 goto free_return;
1636 }
1637 err = REG_NOERROR;
1638 free_return:
1639 re_node_set_free (&cur_dest);
1640 return err;
1641}
1642
1643static reg_errcode_t
1644internal_function
1645build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1646 int str_idx, re_node_set *cur_dest)
1647{
1648 const re_dfa_t *const dfa = mctx->dfa;
1649 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1650 int i;
1651
1652 /* Then build the next sifted state.
1653 We build the next sifted state on `cur_dest', and update
1654 `sifted_states[str_idx]' with `cur_dest'.
1655 Note:
1656 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1657 `cur_src' points the node_set of the old `state_log[str_idx]'
1658 (with the epsilon nodes pre-filtered out). */
1659 for (i = 0; i < cur_src->nelem; i++)
1660 {
1661 int prev_node = cur_src->elems[i];
1662 int naccepted = 0;
1663 int ret;
1664
1665#ifdef DEBUG
1666 re_token_type_t type = dfa->nodes[prev_node].type;
1667 assert (!IS_EPSILON_NODE (type));
1668#endif
1669#ifdef RE_ENABLE_I18N
1670 /* If the node may accept `multi byte'. */
1671 if (dfa->nodes[prev_node].accept_mb)
1672 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1673 str_idx, sctx->last_str_idx);
1674#endif /* RE_ENABLE_I18N */
1675
1676 /* We don't check backreferences here.
1677 See update_cur_sifted_state(). */
1678 if (!naccepted
1679 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1680 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1681 dfa->nexts[prev_node]))
1682 naccepted = 1;
1683
1684 if (naccepted == 0)
1685 continue;
1686
1687 if (sctx->limits.nelem)
1688 {
1689 int to_idx = str_idx + naccepted;
1690 if (check_dst_limits (mctx, &sctx->limits,
1691 dfa->nexts[prev_node], to_idx,
1692 prev_node, str_idx))
1693 continue;
1694 }
1695 ret = re_node_set_insert (cur_dest, prev_node);
1696 if (BE (ret == -1, 0))
1697 return REG_ESPACE;
1698 }
1699
1700 return REG_NOERROR;
1701}
1702
1703/* Helper functions. */
1704
1705static reg_errcode_t
1706internal_function
1707clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1708{
1709 int top = mctx->state_log_top;
1710
1711 if (next_state_log_idx >= mctx->input.bufs_len
1712 || (next_state_log_idx >= mctx->input.valid_len
1713 && mctx->input.valid_len < mctx->input.len))
1714 {
1715 reg_errcode_t err;
1716 err = extend_buffers (mctx);
1717 if (BE (err != REG_NOERROR, 0))
1718 return err;
1719 }
1720
1721 if (top < next_state_log_idx)
1722 {
1723 memset (mctx->state_log + top + 1, '\0',
1724 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1725 mctx->state_log_top = next_state_log_idx;
1726 }
1727 return REG_NOERROR;
1728}
1729
1730static reg_errcode_t
1731internal_function
1732merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1733 re_dfastate_t **src, int num)
1734{
1735 int st_idx;
1736 reg_errcode_t err;
1737 for (st_idx = 0; st_idx < num; ++st_idx)
1738 {
1739 if (dst[st_idx] == NULL)
1740 dst[st_idx] = src[st_idx];
1741 else if (src[st_idx] != NULL)
1742 {
1743 re_node_set merged_set;
1744 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1745 &src[st_idx]->nodes);
1746 if (BE (err != REG_NOERROR, 0))
1747 return err;
1748 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1749 re_node_set_free (&merged_set);
1750 if (BE (err != REG_NOERROR, 0))
1751 return err;
1752 }
1753 }
1754 return REG_NOERROR;
1755}
1756
1757static reg_errcode_t
1758internal_function
1759update_cur_sifted_state (const re_match_context_t *mctx,
1760 re_sift_context_t *sctx, int str_idx,
1761 re_node_set *dest_nodes)
1762{
1763 const re_dfa_t *const dfa = mctx->dfa;
1764 reg_errcode_t err = REG_NOERROR;
1765 const re_node_set *candidates;
1766 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1767 : &mctx->state_log[str_idx]->nodes);
1768
1769 if (dest_nodes->nelem == 0)
1770 sctx->sifted_states[str_idx] = NULL;
1771 else
1772 {
1773 if (candidates)
1774 {
1775 /* At first, add the nodes which can epsilon transit to a node in
1776 DEST_NODE. */
1777 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1778 if (BE (err != REG_NOERROR, 0))
1779 return err;
1780
1781 /* Then, check the limitations in the current sift_context. */
1782 if (sctx->limits.nelem)
1783 {
1784 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1785 mctx->bkref_ents, str_idx);
1786 if (BE (err != REG_NOERROR, 0))
1787 return err;
1788 }
1789 }
1790
1791 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1792 if (BE (err != REG_NOERROR, 0))
1793 return err;
1794 }
1795
1796 if (candidates && mctx->state_log[str_idx]->has_backref)
1797 {
1798 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1799 if (BE (err != REG_NOERROR, 0))
1800 return err;
1801 }
1802 return REG_NOERROR;
1803}
1804
1805static reg_errcode_t
1806internal_function
1807add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1808 const re_node_set *candidates)
1809{
1810 reg_errcode_t err = REG_NOERROR;
1811 int i;
1812
1813 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1814 if (BE (err != REG_NOERROR, 0))
1815 return err;
1816
1817 if (!state->inveclosure.alloc)
1818 {
1819 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1820 if (BE (err != REG_NOERROR, 0))
1821 return REG_ESPACE;
1822 for (i = 0; i < dest_nodes->nelem; i++)
1823 re_node_set_merge (&state->inveclosure,
1824 dfa->inveclosures + dest_nodes->elems[i]);
1825 }
1826 return re_node_set_add_intersect (dest_nodes, candidates,
1827 &state->inveclosure);
1828}
1829
1830static reg_errcode_t
1831internal_function
1832sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1833 const re_node_set *candidates)
1834{
1835 int ecl_idx;
1836 reg_errcode_t err;
1837 re_node_set *inv_eclosure = dfa->inveclosures + node;
1838 re_node_set except_nodes;
1839 re_node_set_init_empty (&except_nodes);
1840 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1841 {
1842 int cur_node = inv_eclosure->elems[ecl_idx];
1843 if (cur_node == node)
1844 continue;
1845 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1846 {
1847 int edst1 = dfa->edests[cur_node].elems[0];
1848 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1849 ? dfa->edests[cur_node].elems[1] : -1);
1850 if ((!re_node_set_contains (inv_eclosure, edst1)
1851 && re_node_set_contains (dest_nodes, edst1))
1852 || (edst2 > 0
1853 && !re_node_set_contains (inv_eclosure, edst2)
1854 && re_node_set_contains (dest_nodes, edst2)))
1855 {
1856 err = re_node_set_add_intersect (&except_nodes, candidates,
1857 dfa->inveclosures + cur_node);
1858 if (BE (err != REG_NOERROR, 0))
1859 {
1860 re_node_set_free (&except_nodes);
1861 return err;
1862 }
1863 }
1864 }
1865 }
1866 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1867 {
1868 int cur_node = inv_eclosure->elems[ecl_idx];
1869 if (!re_node_set_contains (&except_nodes, cur_node))
1870 {
1871 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1872 re_node_set_remove_at (dest_nodes, idx);
1873 }
1874 }
1875 re_node_set_free (&except_nodes);
1876 return REG_NOERROR;
1877}
1878
1879static int
1880internal_function
1881check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1882 int dst_node, int dst_idx, int src_node, int src_idx)
1883{
1884 const re_dfa_t *const dfa = mctx->dfa;
1885 int lim_idx, src_pos, dst_pos;
1886
1887 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1888 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1889 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1890 {
1891 int subexp_idx;
1892 struct re_backref_cache_entry *ent;
1893 ent = mctx->bkref_ents + limits->elems[lim_idx];
1894 subexp_idx = dfa->nodes[ent->node].opr.idx;
1895
1896 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1897 subexp_idx, dst_node, dst_idx,
1898 dst_bkref_idx);
1899 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1900 subexp_idx, src_node, src_idx,
1901 src_bkref_idx);
1902
1903 /* In case of:
1904 <src> <dst> ( <subexp> )
1905 ( <subexp> ) <src> <dst>
1906 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1907 if (src_pos == dst_pos)
1908 continue; /* This is unrelated limitation. */
1909 else
1910 return 1;
1911 }
1912 return 0;
1913}
1914
1915static int
1916internal_function
1917check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1918 int subexp_idx, int from_node, int bkref_idx)
1919{
1920 const re_dfa_t *const dfa = mctx->dfa;
1921 const re_node_set *eclosures = dfa->eclosures + from_node;
1922 int node_idx;
1923
1924 /* Else, we are on the boundary: examine the nodes on the epsilon
1925 closure. */
1926 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1927 {
1928 int node = eclosures->elems[node_idx];
1929 switch (dfa->nodes[node].type)
1930 {
1931 case OP_BACK_REF:
1932 if (bkref_idx != -1)
1933 {
1934 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1935 do
1936 {
1937 int dst, cpos;
1938
1939 if (ent->node != node)
1940 continue;
1941
1942 if (subexp_idx < BITSET_WORD_BITS
1943 && !(ent->eps_reachable_subexps_map
1944 & ((bitset_word_t) 1 << subexp_idx)))
1945 continue;
1946
1947 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1948 OP_CLOSE_SUBEXP cases below. But, if the
1949 destination node is the same node as the source
1950 node, don't recurse because it would cause an
1951 infinite loop: a regex that exhibits this behavior
1952 is ()\1*\1* */
1953 dst = dfa->edests[node].elems[0];
1954 if (dst == from_node)
1955 {
1956 if (boundaries & 1)
1957 return -1;
1958 else /* if (boundaries & 2) */
1959 return 0;
1960 }
1961
1962 cpos =
1963 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1964 dst, bkref_idx);
1965 if (cpos == -1 /* && (boundaries & 1) */)
1966 return -1;
1967 if (cpos == 0 && (boundaries & 2))
1968 return 0;
1969
1970 if (subexp_idx < BITSET_WORD_BITS)
1971 ent->eps_reachable_subexps_map
1972 &= ~((bitset_word_t) 1 << subexp_idx);
1973 }
1974 while (ent++->more);
1975 }
1976 break;
1977
1978 case OP_OPEN_SUBEXP:
1979 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1980 return -1;
1981 break;
1982
1983 case OP_CLOSE_SUBEXP:
1984 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1985 return 0;
1986 break;
1987
1988 default:
1989 break;
1990 }
1991 }
1992
1993 return (boundaries & 2) ? 1 : 0;
1994}
1995
1996static int
1997internal_function
1998check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1999 int subexp_idx, int from_node, int str_idx,
2000 int bkref_idx)
2001{
2002 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2003 int boundaries;
2004
2005 /* If we are outside the range of the subexpression, return -1 or 1. */
2006 if (str_idx < lim->subexp_from)
2007 return -1;
2008
2009 if (lim->subexp_to < str_idx)
2010 return 1;
2011
2012 /* If we are within the subexpression, return 0. */
2013 boundaries = (str_idx == lim->subexp_from);
2014 boundaries |= (str_idx == lim->subexp_to) << 1;
2015 if (boundaries == 0)
2016 return 0;
2017
2018 /* Else, examine epsilon closure. */
2019 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2020 from_node, bkref_idx);
2021}
2022
2023/* Check the limitations of sub expressions LIMITS, and remove the nodes
2024 which are against limitations from DEST_NODES. */
2025
2026static reg_errcode_t
2027internal_function
2028check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2029 const re_node_set *candidates, re_node_set *limits,
2030 struct re_backref_cache_entry *bkref_ents, int str_idx)
2031{
2032 reg_errcode_t err;
2033 int node_idx, lim_idx;
2034
2035 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2036 {
2037 int subexp_idx;
2038 struct re_backref_cache_entry *ent;
2039 ent = bkref_ents + limits->elems[lim_idx];
2040
2041 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2042 continue; /* This is unrelated limitation. */
2043
2044 subexp_idx = dfa->nodes[ent->node].opr.idx;
2045 if (ent->subexp_to == str_idx)
2046 {
2047 int ops_node = -1;
2048 int cls_node = -1;
2049 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2050 {
2051 int node = dest_nodes->elems[node_idx];
2052 re_token_type_t type = dfa->nodes[node].type;
2053 if (type == OP_OPEN_SUBEXP
2054 && subexp_idx == dfa->nodes[node].opr.idx)
2055 ops_node = node;
2056 else if (type == OP_CLOSE_SUBEXP
2057 && subexp_idx == dfa->nodes[node].opr.idx)
2058 cls_node = node;
2059 }
2060
2061 /* Check the limitation of the open subexpression. */
2062 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2063 if (ops_node >= 0)
2064 {
2065 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2066 candidates);
2067 if (BE (err != REG_NOERROR, 0))
2068 return err;
2069 }
2070
2071 /* Check the limitation of the close subexpression. */
2072 if (cls_node >= 0)
2073 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2074 {
2075 int node = dest_nodes->elems[node_idx];
2076 if (!re_node_set_contains (dfa->inveclosures + node,
2077 cls_node)
2078 && !re_node_set_contains (dfa->eclosures + node,
2079 cls_node))
2080 {
2081 /* It is against this limitation.
2082 Remove it form the current sifted state. */
2083 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2084 candidates);
2085 if (BE (err != REG_NOERROR, 0))
2086 return err;
2087 --node_idx;
2088 }
2089 }
2090 }
2091 else /* (ent->subexp_to != str_idx) */
2092 {
2093 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2094 {
2095 int node = dest_nodes->elems[node_idx];
2096 re_token_type_t type = dfa->nodes[node].type;
2097 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2098 {
2099 if (subexp_idx != dfa->nodes[node].opr.idx)
2100 continue;
2101 /* It is against this limitation.
2102 Remove it form the current sifted state. */
2103 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2104 candidates);
2105 if (BE (err != REG_NOERROR, 0))
2106 return err;
2107 }
2108 }
2109 }
2110 }
2111 return REG_NOERROR;
2112}
2113
2114static reg_errcode_t
2115internal_function
2116sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2117 int str_idx, const re_node_set *candidates)
2118{
2119 const re_dfa_t *const dfa = mctx->dfa;
2120 reg_errcode_t err;
2121 int node_idx, node;
2122 re_sift_context_t local_sctx;
2123 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2124
2125 if (first_idx == -1)
2126 return REG_NOERROR;
2127
2128 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2129
2130 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2131 {
2132 int enabled_idx;
2133 re_token_type_t type;
2134 struct re_backref_cache_entry *entry;
2135 node = candidates->elems[node_idx];
2136 type = dfa->nodes[node].type;
2137 /* Avoid infinite loop for the REs like "()\1+". */
2138 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2139 continue;
2140 if (type != OP_BACK_REF)
2141 continue;
2142
2143 entry = mctx->bkref_ents + first_idx;
2144 enabled_idx = first_idx;
2145 do
2146 {
2147 int subexp_len;
2148 int to_idx;
2149 int dst_node;
2150 int ret;
2151 re_dfastate_t *cur_state;
2152
2153 if (entry->node != node)
2154 continue;
2155 subexp_len = entry->subexp_to - entry->subexp_from;
2156 to_idx = str_idx + subexp_len;
2157 dst_node = (subexp_len ? dfa->nexts[node]
2158 : dfa->edests[node].elems[0]);
2159
2160 if (to_idx > sctx->last_str_idx
2161 || sctx->sifted_states[to_idx] == NULL
2162 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2163 || check_dst_limits (mctx, &sctx->limits, node,
2164 str_idx, dst_node, to_idx))
2165 continue;
2166
2167 if (local_sctx.sifted_states == NULL)
2168 {
2169 local_sctx = *sctx;
2170 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2171 if (BE (err != REG_NOERROR, 0))
2172 goto free_return;
2173 }
2174 local_sctx.last_node = node;
2175 local_sctx.last_str_idx = str_idx;
2176 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2177 if (BE (ret < 0, 0))
2178 {
2179 err = REG_ESPACE;
2180 goto free_return;
2181 }
2182 cur_state = local_sctx.sifted_states[str_idx];
2183 err = sift_states_backward (mctx, &local_sctx);
2184 if (BE (err != REG_NOERROR, 0))
2185 goto free_return;
2186 if (sctx->limited_states != NULL)
2187 {
2188 err = merge_state_array (dfa, sctx->limited_states,
2189 local_sctx.sifted_states,
2190 str_idx + 1);
2191 if (BE (err != REG_NOERROR, 0))
2192 goto free_return;
2193 }
2194 local_sctx.sifted_states[str_idx] = cur_state;
2195 re_node_set_remove (&local_sctx.limits, enabled_idx);
2196
2197 /* mctx->bkref_ents may have changed, reload the pointer. */
2198 entry = mctx->bkref_ents + enabled_idx;
2199 }
2200 while (enabled_idx++, entry++->more);
2201 }
2202 err = REG_NOERROR;
2203 free_return:
2204 if (local_sctx.sifted_states != NULL)
2205 {
2206 re_node_set_free (&local_sctx.limits);
2207 }
2208
2209 return err;
2210}
2211
2212
2213#ifdef RE_ENABLE_I18N
2214static int
2215internal_function
2216sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2217 int node_idx, int str_idx, int max_str_idx)
2218{
2219 const re_dfa_t *const dfa = mctx->dfa;
2220 int naccepted;
2221 /* Check the node can accept `multi byte'. */
2222 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2223 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2224 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2225 dfa->nexts[node_idx]))
2226 /* The node can't accept the `multi byte', or the
2227 destination was already thrown away, then the node
2228 could't accept the current input `multi byte'. */
2229 naccepted = 0;
2230 /* Otherwise, it is sure that the node could accept
2231 `naccepted' bytes input. */
2232 return naccepted;
2233}
2234#endif /* RE_ENABLE_I18N */
2235
2236
2237
2238/* Functions for state transition. */
2239
2240/* Return the next state to which the current state STATE will transit by
2241 accepting the current input byte, and update STATE_LOG if necessary.
2242 If STATE can accept a multibyte char/collating element/back reference
2243 update the destination of STATE_LOG. */
2244
2245static re_dfastate_t *
2246internal_function
2247transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2248 re_dfastate_t *state)
2249{
2250 re_dfastate_t **trtable;
2251 unsigned char ch;
2252
2253#ifdef RE_ENABLE_I18N
2254 /* If the current state can accept multibyte. */
2255 if (BE (state->accept_mb, 0))
2256 {
2257 *err = transit_state_mb (mctx, state);
2258 if (BE (*err != REG_NOERROR, 0))
2259 return NULL;
2260 }
2261#endif /* RE_ENABLE_I18N */
2262
2263 /* Then decide the next state with the single byte. */
2264#if 0
2265 if (0)
2266 /* don't use transition table */
2267 return transit_state_sb (err, mctx, state);
2268#endif
2269
2270 /* Use transition table */
2271 ch = re_string_fetch_byte (&mctx->input);
2272 for (;;)
2273 {
2274 trtable = state->trtable;
2275 if (BE (trtable != NULL, 1))
2276 return trtable[ch];
2277
2278 trtable = state->word_trtable;
2279 if (BE (trtable != NULL, 1))
2280 {
2281 unsigned int context;
2282 context
2283 = re_string_context_at (&mctx->input,
2284 re_string_cur_idx (&mctx->input) - 1,
2285 mctx->eflags);
2286 if (IS_WORD_CONTEXT (context))
2287 return trtable[ch + SBC_MAX];
2288 else
2289 return trtable[ch];
2290 }
2291
2292 if (!build_trtable (mctx->dfa, state))
2293 {
2294 *err = REG_ESPACE;
2295 return NULL;
2296 }
2297
2298 /* Retry, we now have a transition table. */
2299 }
2300}
2301
2302/* Update the state_log if we need */
2303re_dfastate_t *
2304internal_function
2305merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2306 re_dfastate_t *next_state)
2307{
2308 const re_dfa_t *const dfa = mctx->dfa;
2309 int cur_idx = re_string_cur_idx (&mctx->input);
2310
2311 if (cur_idx > mctx->state_log_top)
2312 {
2313 mctx->state_log[cur_idx] = next_state;
2314 mctx->state_log_top = cur_idx;
2315 }
2316 else if (mctx->state_log[cur_idx] == 0)
2317 {
2318 mctx->state_log[cur_idx] = next_state;
2319 }
2320 else
2321 {
2322 re_dfastate_t *pstate;
2323 unsigned int context;
2324 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2325 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2326 the destination of a multibyte char/collating element/
2327 back reference. Then the next state is the union set of
2328 these destinations and the results of the transition table. */
2329 pstate = mctx->state_log[cur_idx];
2330 log_nodes = pstate->entrance_nodes;
2331 if (next_state != NULL)
2332 {
2333 table_nodes = next_state->entrance_nodes;
2334 *err = re_node_set_init_union (&next_nodes, table_nodes,
2335 log_nodes);
2336 if (BE (*err != REG_NOERROR, 0))
2337 return NULL;
2338 }
2339 else
2340 next_nodes = *log_nodes;
2341 /* Note: We already add the nodes of the initial state,
2342 then we don't need to add them here. */
2343
2344 context = re_string_context_at (&mctx->input,
2345 re_string_cur_idx (&mctx->input) - 1,
2346 mctx->eflags);
2347 next_state = mctx->state_log[cur_idx]
2348 = re_acquire_state_context (err, dfa, &next_nodes, context);
2349 /* We don't need to check errors here, since the return value of
2350 this function is next_state and ERR is already set. */
2351
2352 if (table_nodes != NULL)
2353 re_node_set_free (&next_nodes);
2354 }
2355
2356 if (BE (dfa->nbackref, 0) && next_state != NULL)
2357 {
2358 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2359 later. We must check them here, since the back references in the
2360 next state might use them. */
2361 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2362 cur_idx);
2363 if (BE (*err != REG_NOERROR, 0))
2364 return NULL;
2365
2366 /* If the next state has back references. */
2367 if (next_state->has_backref)
2368 {
2369 *err = transit_state_bkref (mctx, &next_state->nodes);
2370 if (BE (*err != REG_NOERROR, 0))
2371 return NULL;
2372 next_state = mctx->state_log[cur_idx];
2373 }
2374 }
2375
2376 return next_state;
2377}
2378
2379/* Skip bytes in the input that correspond to part of a
2380 multi-byte match, then look in the log for a state
2381 from which to restart matching. */
2382re_dfastate_t *
2383internal_function
2384find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2385{
2386 re_dfastate_t *cur_state;
2387 do
2388 {
2389 int max = mctx->state_log_top;
2390 int cur_str_idx = re_string_cur_idx (&mctx->input);
2391
2392 do
2393 {
2394 if (++cur_str_idx > max)
2395 return NULL;
2396 re_string_skip_bytes (&mctx->input, 1);
2397 }
2398 while (mctx->state_log[cur_str_idx] == NULL);
2399
2400 cur_state = merge_state_with_log (err, mctx, NULL);
2401 }
2402 while (*err == REG_NOERROR && cur_state == NULL);
2403 return cur_state;
2404}
2405
2406/* Helper functions for transit_state. */
2407
2408/* From the node set CUR_NODES, pick up the nodes whose types are
2409 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2410 expression. And register them to use them later for evaluating the
2411 correspoding back references. */
2412
2413static reg_errcode_t
2414internal_function
2415check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2416 int str_idx)
2417{
2418 const re_dfa_t *const dfa = mctx->dfa;
2419 int node_idx;
2420 reg_errcode_t err;
2421
2422 /* TODO: This isn't efficient.
2423 Because there might be more than one nodes whose types are
2424 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2425 nodes.
2426 E.g. RE: (a){2} */
2427 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2428 {
2429 int node = cur_nodes->elems[node_idx];
2430 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2431 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2432 && (dfa->used_bkref_map
2433 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2434 {
2435 err = match_ctx_add_subtop (mctx, node, str_idx);
2436 if (BE (err != REG_NOERROR, 0))
2437 return err;
2438 }
2439 }
2440 return REG_NOERROR;
2441}
2442
2443#if 0
2444/* Return the next state to which the current state STATE will transit by
2445 accepting the current input byte. */
2446
2447static re_dfastate_t *
2448transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2449 re_dfastate_t *state)
2450{
2451 const re_dfa_t *const dfa = mctx->dfa;
2452 re_node_set next_nodes;
2453 re_dfastate_t *next_state;
2454 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2455 unsigned int context;
2456
2457 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2458 if (BE (*err != REG_NOERROR, 0))
2459 return NULL;
2460 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2461 {
2462 int cur_node = state->nodes.elems[node_cnt];
2463 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2464 {
2465 *err = re_node_set_merge (&next_nodes,
2466 dfa->eclosures + dfa->nexts[cur_node]);
2467 if (BE (*err != REG_NOERROR, 0))
2468 {
2469 re_node_set_free (&next_nodes);
2470 return NULL;
2471 }
2472 }
2473 }
2474 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2475 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2476 /* We don't need to check errors here, since the return value of
2477 this function is next_state and ERR is already set. */
2478
2479 re_node_set_free (&next_nodes);
2480 re_string_skip_bytes (&mctx->input, 1);
2481 return next_state;
2482}
2483#endif
2484
2485#ifdef RE_ENABLE_I18N
2486static reg_errcode_t
2487internal_function
2488transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2489{
2490 const re_dfa_t *const dfa = mctx->dfa;
2491 reg_errcode_t err;
2492 int i;
2493
2494 for (i = 0; i < pstate->nodes.nelem; ++i)
2495 {
2496 re_node_set dest_nodes, *new_nodes;
2497 int cur_node_idx = pstate->nodes.elems[i];
2498 int naccepted, dest_idx;
2499 unsigned int context;
2500 re_dfastate_t *dest_state;
2501
2502 if (!dfa->nodes[cur_node_idx].accept_mb)
2503 continue;
2504
2505 if (dfa->nodes[cur_node_idx].constraint)
2506 {
2507 context = re_string_context_at (&mctx->input,
2508 re_string_cur_idx (&mctx->input),
2509 mctx->eflags);
2510 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2511 context))
2512 continue;
2513 }
2514
2515 /* How many bytes the node can accept? */
2516 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2517 re_string_cur_idx (&mctx->input));
2518 if (naccepted == 0)
2519 continue;
2520
2521 /* The node can accepts `naccepted' bytes. */
2522 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2523 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2524 : mctx->max_mb_elem_len);
2525 err = clean_state_log_if_needed (mctx, dest_idx);
2526 if (BE (err != REG_NOERROR, 0))
2527 return err;
2528#ifdef DEBUG
2529 assert (dfa->nexts[cur_node_idx] != -1);
2530#endif
2531 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2532
2533 dest_state = mctx->state_log[dest_idx];
2534 if (dest_state == NULL)
2535 dest_nodes = *new_nodes;
2536 else
2537 {
2538 err = re_node_set_init_union (&dest_nodes,
2539 dest_state->entrance_nodes, new_nodes);
2540 if (BE (err != REG_NOERROR, 0))
2541 return err;
2542 }
2543 context = re_string_context_at (&mctx->input, dest_idx - 1,
2544 mctx->eflags);
2545 mctx->state_log[dest_idx]
2546 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2547 if (dest_state != NULL)
2548 re_node_set_free (&dest_nodes);
2549 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2550 return err;
2551 }
2552 return REG_NOERROR;
2553}
2554#endif /* RE_ENABLE_I18N */
2555
2556static reg_errcode_t
2557internal_function
2558transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2559{
2560 const re_dfa_t *const dfa = mctx->dfa;
2561 reg_errcode_t err;
2562 int i;
2563 int cur_str_idx = re_string_cur_idx (&mctx->input);
2564
2565 for (i = 0; i < nodes->nelem; ++i)
2566 {
2567 int dest_str_idx, prev_nelem, bkc_idx;
2568 int node_idx = nodes->elems[i];
2569 unsigned int context;
2570 const re_token_t *node = dfa->nodes + node_idx;
2571 re_node_set *new_dest_nodes;
2572
2573 /* Check whether `node' is a backreference or not. */
2574 if (node->type != OP_BACK_REF)
2575 continue;
2576
2577 if (node->constraint)
2578 {
2579 context = re_string_context_at (&mctx->input, cur_str_idx,
2580 mctx->eflags);
2581 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2582 continue;
2583 }
2584
2585 /* `node' is a backreference.
2586 Check the substring which the substring matched. */
2587 bkc_idx = mctx->nbkref_ents;
2588 err = get_subexp (mctx, node_idx, cur_str_idx);
2589 if (BE (err != REG_NOERROR, 0))
2590 goto free_return;
2591
2592 /* And add the epsilon closures (which is `new_dest_nodes') of
2593 the backreference to appropriate state_log. */
2594#ifdef DEBUG
2595 assert (dfa->nexts[node_idx] != -1);
2596#endif
2597 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2598 {
2599 int subexp_len;
2600 re_dfastate_t *dest_state;
2601 struct re_backref_cache_entry *bkref_ent;
2602 bkref_ent = mctx->bkref_ents + bkc_idx;
2603 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2604 continue;
2605 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2606 new_dest_nodes = (subexp_len == 0
2607 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2608 : dfa->eclosures + dfa->nexts[node_idx]);
2609 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2610 - bkref_ent->subexp_from);
2611 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2612 mctx->eflags);
2613 dest_state = mctx->state_log[dest_str_idx];
2614 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2615 : mctx->state_log[cur_str_idx]->nodes.nelem);
2616 /* Add `new_dest_node' to state_log. */
2617 if (dest_state == NULL)
2618 {
2619 mctx->state_log[dest_str_idx]
2620 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2621 context);
2622 if (BE (mctx->state_log[dest_str_idx] == NULL
2623 && err != REG_NOERROR, 0))
2624 goto free_return;
2625 }
2626 else
2627 {
2628 re_node_set dest_nodes;
2629 err = re_node_set_init_union (&dest_nodes,
2630 dest_state->entrance_nodes,
2631 new_dest_nodes);
2632 if (BE (err != REG_NOERROR, 0))
2633 {
2634 re_node_set_free (&dest_nodes);
2635 goto free_return;
2636 }
2637 mctx->state_log[dest_str_idx]
2638 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2639 re_node_set_free (&dest_nodes);
2640 if (BE (mctx->state_log[dest_str_idx] == NULL
2641 && err != REG_NOERROR, 0))
2642 goto free_return;
2643 }
2644 /* We need to check recursively if the backreference can epsilon
2645 transit. */
2646 if (subexp_len == 0
2647 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2648 {
2649 err = check_subexp_matching_top (mctx, new_dest_nodes,
2650 cur_str_idx);
2651 if (BE (err != REG_NOERROR, 0))
2652 goto free_return;
2653 err = transit_state_bkref (mctx, new_dest_nodes);
2654 if (BE (err != REG_NOERROR, 0))
2655 goto free_return;
2656 }
2657 }
2658 }
2659 err = REG_NOERROR;
2660 free_return:
2661 return err;
2662}
2663
2664/* Enumerate all the candidates which the backreference BKREF_NODE can match
2665 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2666 Note that we might collect inappropriate candidates here.
2667 However, the cost of checking them strictly here is too high, then we
2668 delay these checking for prune_impossible_nodes(). */
2669
2670static reg_errcode_t
2671internal_function
2672get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2673{
2674 const re_dfa_t *const dfa = mctx->dfa;
2675 int subexp_num, sub_top_idx;
2676 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2677 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2678 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2679 if (cache_idx != -1)
2680 {
2681 const struct re_backref_cache_entry *entry
2682 = mctx->bkref_ents + cache_idx;
2683 do
2684 if (entry->node == bkref_node)
2685 return REG_NOERROR; /* We already checked it. */
2686 while (entry++->more);
2687 }
2688
2689 subexp_num = dfa->nodes[bkref_node].opr.idx;
2690
2691 /* For each sub expression */
2692 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2693 {
2694 reg_errcode_t err;
2695 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2696 re_sub_match_last_t *sub_last;
2697 int sub_last_idx, sl_str, bkref_str_off;
2698
2699 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2700 continue; /* It isn't related. */
2701
2702 sl_str = sub_top->str_idx;
2703 bkref_str_off = bkref_str_idx;
2704 /* At first, check the last node of sub expressions we already
2705 evaluated. */
2706 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2707 {
2708 int sl_str_diff;
2709 sub_last = sub_top->lasts[sub_last_idx];
2710 sl_str_diff = sub_last->str_idx - sl_str;
2711 /* The matched string by the sub expression match with the substring
2712 at the back reference? */
2713 if (sl_str_diff > 0)
2714 {
2715 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2716 {
2717 /* Not enough chars for a successful match. */
2718 if (bkref_str_off + sl_str_diff > mctx->input.len)
2719 break;
2720
2721 err = clean_state_log_if_needed (mctx,
2722 bkref_str_off
2723 + sl_str_diff);
2724 if (BE (err != REG_NOERROR, 0))
2725 return err;
2726 buf = (const char *) re_string_get_buffer (&mctx->input);
2727 }
2728 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2729 /* We don't need to search this sub expression any more. */
2730 break;
2731 }
2732 bkref_str_off += sl_str_diff;
2733 sl_str += sl_str_diff;
2734 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2735 bkref_str_idx);
2736
2737 /* Reload buf, since the preceding call might have reallocated
2738 the buffer. */
2739 buf = (const char *) re_string_get_buffer (&mctx->input);
2740
2741 if (err == REG_NOMATCH)
2742 continue;
2743 if (BE (err != REG_NOERROR, 0))
2744 return err;
2745 }
2746
2747 if (sub_last_idx < sub_top->nlasts)
2748 continue;
2749 if (sub_last_idx > 0)
2750 ++sl_str;
2751 /* Then, search for the other last nodes of the sub expression. */
2752 for (; sl_str <= bkref_str_idx; ++sl_str)
2753 {
2754 int cls_node, sl_str_off;
2755 const re_node_set *nodes;
2756 sl_str_off = sl_str - sub_top->str_idx;
2757 /* The matched string by the sub expression match with the substring
2758 at the back reference? */
2759 if (sl_str_off > 0)
2760 {
2761 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2762 {
2763 /* If we are at the end of the input, we cannot match. */
2764 if (bkref_str_off >= mctx->input.len)
2765 break;
2766
2767 err = extend_buffers (mctx);
2768 if (BE (err != REG_NOERROR, 0))
2769 return err;
2770
2771 buf = (const char *) re_string_get_buffer (&mctx->input);
2772 }
2773 if (buf [bkref_str_off++] != buf[sl_str - 1])
2774 break; /* We don't need to search this sub expression
2775 any more. */
2776 }
2777 if (mctx->state_log[sl_str] == NULL)
2778 continue;
2779 /* Does this state have a ')' of the sub expression? */
2780 nodes = &mctx->state_log[sl_str]->nodes;
2781 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2782 OP_CLOSE_SUBEXP);
2783 if (cls_node == -1)
2784 continue; /* No. */
2785 if (sub_top->path == NULL)
2786 {
2787 sub_top->path = calloc (sizeof (state_array_t),
2788 sl_str - sub_top->str_idx + 1);
2789 if (sub_top->path == NULL)
2790 return REG_ESPACE;
2791 }
2792 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2793 in the current context? */
2794 err = check_arrival (mctx, sub_top->path, sub_top->node,
2795 sub_top->str_idx, cls_node, sl_str,
2796 OP_CLOSE_SUBEXP);
2797 if (err == REG_NOMATCH)
2798 continue;
2799 if (BE (err != REG_NOERROR, 0))
2800 return err;
2801 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2802 if (BE (sub_last == NULL, 0))
2803 return REG_ESPACE;
2804 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2805 bkref_str_idx);
2806 if (err == REG_NOMATCH)
2807 continue;
2808 }
2809 }
2810 return REG_NOERROR;
2811}
2812
2813/* Helper functions for get_subexp(). */
2814
2815/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2816 If it can arrive, register the sub expression expressed with SUB_TOP
2817 and SUB_LAST. */
2818
2819static reg_errcode_t
2820internal_function
2821get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2822 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2823{
2824 reg_errcode_t err;
2825 int to_idx;
2826 /* Can the subexpression arrive the back reference? */
2827 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2828 sub_last->str_idx, bkref_node, bkref_str,
2829 OP_OPEN_SUBEXP);
2830 if (err != REG_NOERROR)
2831 return err;
2832 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2833 sub_last->str_idx);
2834 if (BE (err != REG_NOERROR, 0))
2835 return err;
2836 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2837 return clean_state_log_if_needed (mctx, to_idx);
2838}
2839
2840/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2841 Search '(' if FL_OPEN, or search ')' otherwise.
2842 TODO: This function isn't efficient...
2843 Because there might be more than one nodes whose types are
2844 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2845 nodes.
2846 E.g. RE: (a){2} */
2847
2848static int
2849internal_function
2850find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2851 int subexp_idx, int type)
2852{
2853 int cls_idx;
2854 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2855 {
2856 int cls_node = nodes->elems[cls_idx];
2857 const re_token_t *node = dfa->nodes + cls_node;
2858 if (node->type == type
2859 && node->opr.idx == subexp_idx)
2860 return cls_node;
2861 }
2862 return -1;
2863}
2864
2865/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2866 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2867 heavily reused.
2868 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2869
2870static reg_errcode_t
2871internal_function
2872check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2873 int top_str, int last_node, int last_str, int type)
2874{
2875 const re_dfa_t *const dfa = mctx->dfa;
2876 reg_errcode_t err = REG_NOERROR;
2877 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2878 re_dfastate_t *cur_state = NULL;
2879 re_node_set *cur_nodes, next_nodes;
2880 re_dfastate_t **backup_state_log;
2881 unsigned int context;
2882
2883 subexp_num = dfa->nodes[top_node].opr.idx;
2884 /* Extend the buffer if we need. */
2885 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2886 {
2887 re_dfastate_t **new_array;
2888 int old_alloc = path->alloc;
2889 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2890 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2891 if (BE (new_array == NULL, 0))
2892 {
2893 path->alloc = old_alloc;
2894 return REG_ESPACE;
2895 }
2896 path->array = new_array;
2897 memset (new_array + old_alloc, '\0',
2898 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2899 }
2900
2901#ifdef __GNUC__ /* silly buggers. */
2902 str_idx = path->next_idx ?: top_str;
2903#else
2904 str_idx = path->next_idx ? path->next_idx : top_str;
2905#endif
2906
2907 /* Temporary modify MCTX. */
2908 backup_state_log = mctx->state_log;
2909 backup_cur_idx = mctx->input.cur_idx;
2910 mctx->state_log = path->array;
2911 mctx->input.cur_idx = str_idx;
2912
2913 /* Setup initial node set. */
2914 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2915 if (str_idx == top_str)
2916 {
2917 err = re_node_set_init_1 (&next_nodes, top_node);
2918 if (BE (err != REG_NOERROR, 0))
2919 return err;
2920 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2921 if (BE (err != REG_NOERROR, 0))
2922 {
2923 re_node_set_free (&next_nodes);
2924 return err;
2925 }
2926 }
2927 else
2928 {
2929 cur_state = mctx->state_log[str_idx];
2930 if (cur_state && cur_state->has_backref)
2931 {
2932 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2933 if (BE (err != REG_NOERROR, 0))
2934 return err;
2935 }
2936 else
2937 re_node_set_init_empty (&next_nodes);
2938 }
2939 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2940 {
2941 if (next_nodes.nelem)
2942 {
2943 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2944 subexp_num, type);
2945 if (BE (err != REG_NOERROR, 0))
2946 {
2947 re_node_set_free (&next_nodes);
2948 return err;
2949 }
2950 }
2951 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2952 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2953 {
2954 re_node_set_free (&next_nodes);
2955 return err;
2956 }
2957 mctx->state_log[str_idx] = cur_state;
2958 }
2959
2960 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2961 {
2962 re_node_set_empty (&next_nodes);
2963 if (mctx->state_log[str_idx + 1])
2964 {
2965 err = re_node_set_merge (&next_nodes,
2966 &mctx->state_log[str_idx + 1]->nodes);
2967 if (BE (err != REG_NOERROR, 0))
2968 {
2969 re_node_set_free (&next_nodes);
2970 return err;
2971 }
2972 }
2973 if (cur_state)
2974 {
2975 err = check_arrival_add_next_nodes (mctx, str_idx,
2976 &cur_state->non_eps_nodes,
2977 &next_nodes);
2978 if (BE (err != REG_NOERROR, 0))
2979 {
2980 re_node_set_free (&next_nodes);
2981 return err;
2982 }
2983 }
2984 ++str_idx;
2985 if (next_nodes.nelem)
2986 {
2987 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2988 if (BE (err != REG_NOERROR, 0))
2989 {
2990 re_node_set_free (&next_nodes);
2991 return err;
2992 }
2993 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2994 subexp_num, type);
2995 if (BE (err != REG_NOERROR, 0))
2996 {
2997 re_node_set_free (&next_nodes);
2998 return err;
2999 }
3000 }
3001 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3002 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3003 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3004 {
3005 re_node_set_free (&next_nodes);
3006 return err;
3007 }
3008 mctx->state_log[str_idx] = cur_state;
3009 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3010 }
3011 re_node_set_free (&next_nodes);
3012 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3013 : &mctx->state_log[last_str]->nodes);
3014 path->next_idx = str_idx;
3015
3016 /* Fix MCTX. */
3017 mctx->state_log = backup_state_log;
3018 mctx->input.cur_idx = backup_cur_idx;
3019
3020 /* Then check the current node set has the node LAST_NODE. */
3021 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3022 return REG_NOERROR;
3023
3024 return REG_NOMATCH;
3025}
3026
3027/* Helper functions for check_arrival. */
3028
3029/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3030 to NEXT_NODES.
3031 TODO: This function is similar to the functions transit_state*(),
3032 however this function has many additional works.
3033 Can't we unify them? */
3034
3035static reg_errcode_t
3036internal_function
3037check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3038 re_node_set *cur_nodes, re_node_set *next_nodes)
3039{
3040 const re_dfa_t *const dfa = mctx->dfa;
3041 int result;
3042 int cur_idx;
3043 reg_errcode_t err = REG_NOERROR;
3044 re_node_set union_set;
3045 re_node_set_init_empty (&union_set);
3046 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3047 {
3048 int naccepted = 0;
3049 int cur_node = cur_nodes->elems[cur_idx];
3050#ifdef DEBUG
3051 re_token_type_t type = dfa->nodes[cur_node].type;
3052 assert (!IS_EPSILON_NODE (type));
3053#endif
3054#ifdef RE_ENABLE_I18N
3055 /* If the node may accept `multi byte'. */
3056 if (dfa->nodes[cur_node].accept_mb)
3057 {
3058 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3059 str_idx);
3060 if (naccepted > 1)
3061 {
3062 re_dfastate_t *dest_state;
3063 int next_node = dfa->nexts[cur_node];
3064 int next_idx = str_idx + naccepted;
3065 dest_state = mctx->state_log[next_idx];
3066 re_node_set_empty (&union_set);
3067 if (dest_state)
3068 {
3069 err = re_node_set_merge (&union_set, &dest_state->nodes);
3070 if (BE (err != REG_NOERROR, 0))
3071 {
3072 re_node_set_free (&union_set);
3073 return err;
3074 }
3075 }
3076 result = re_node_set_insert (&union_set, next_node);
3077 if (BE (result < 0, 0))
3078 {
3079 re_node_set_free (&union_set);
3080 return REG_ESPACE;
3081 }
3082 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3083 &union_set);
3084 if (BE (mctx->state_log[next_idx] == NULL
3085 && err != REG_NOERROR, 0))
3086 {
3087 re_node_set_free (&union_set);
3088 return err;
3089 }
3090 }
3091 }
3092#endif /* RE_ENABLE_I18N */
3093 if (naccepted
3094 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3095 {
3096 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3097 if (BE (result < 0, 0))
3098 {
3099 re_node_set_free (&union_set);
3100 return REG_ESPACE;
3101 }
3102 }
3103 }
3104 re_node_set_free (&union_set);
3105 return REG_NOERROR;
3106}
3107
3108/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3109 CUR_NODES, however exclude the nodes which are:
3110 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3111 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3112*/
3113
3114static reg_errcode_t
3115internal_function
3116check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3117 int ex_subexp, int type)
3118{
3119 reg_errcode_t err;
3120 int idx, outside_node;
3121 re_node_set new_nodes;
3122#ifdef DEBUG
3123 assert (cur_nodes->nelem);
3124#endif
3125 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3126 if (BE (err != REG_NOERROR, 0))
3127 return err;
3128 /* Create a new node set NEW_NODES with the nodes which are epsilon
3129 closures of the node in CUR_NODES. */
3130
3131 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3132 {
3133 int cur_node = cur_nodes->elems[idx];
3134 const re_node_set *eclosure = dfa->eclosures + cur_node;
3135 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3136 if (outside_node == -1)
3137 {
3138 /* There are no problematic nodes, just merge them. */
3139 err = re_node_set_merge (&new_nodes, eclosure);
3140 if (BE (err != REG_NOERROR, 0))
3141 {
3142 re_node_set_free (&new_nodes);
3143 return err;
3144 }
3145 }
3146 else
3147 {
3148 /* There are problematic nodes, re-calculate incrementally. */
3149 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3150 ex_subexp, type);
3151 if (BE (err != REG_NOERROR, 0))
3152 {
3153 re_node_set_free (&new_nodes);
3154 return err;
3155 }
3156 }
3157 }
3158 re_node_set_free (cur_nodes);
3159 *cur_nodes = new_nodes;
3160 return REG_NOERROR;
3161}
3162
3163/* Helper function for check_arrival_expand_ecl.
3164 Check incrementally the epsilon closure of TARGET, and if it isn't
3165 problematic append it to DST_NODES. */
3166
3167static reg_errcode_t
3168internal_function
3169check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3170 int target, int ex_subexp, int type)
3171{
3172 int cur_node;
3173 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3174 {
3175 int err;
3176
3177 if (dfa->nodes[cur_node].type == type
3178 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3179 {
3180 if (type == OP_CLOSE_SUBEXP)
3181 {
3182 err = re_node_set_insert (dst_nodes, cur_node);
3183 if (BE (err == -1, 0))
3184 return REG_ESPACE;
3185 }
3186 break;
3187 }
3188 err = re_node_set_insert (dst_nodes, cur_node);
3189 if (BE (err == -1, 0))
3190 return REG_ESPACE;
3191 if (dfa->edests[cur_node].nelem == 0)
3192 break;
3193 if (dfa->edests[cur_node].nelem == 2)
3194 {
3195 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3196 dfa->edests[cur_node].elems[1],
3197 ex_subexp, type);
3198 if (BE (err != REG_NOERROR, 0))
3199 return err;
3200 }
3201 cur_node = dfa->edests[cur_node].elems[0];
3202 }
3203 return REG_NOERROR;
3204}
3205
3206
3207/* For all the back references in the current state, calculate the
3208 destination of the back references by the appropriate entry
3209 in MCTX->BKREF_ENTS. */
3210
3211static reg_errcode_t
3212internal_function
3213expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3214 int cur_str, int subexp_num, int type)
3215{
3216 const re_dfa_t *const dfa = mctx->dfa;
3217 reg_errcode_t err;
3218 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3219 struct re_backref_cache_entry *ent;
3220
3221 if (cache_idx_start == -1)
3222 return REG_NOERROR;
3223
3224 restart:
3225 ent = mctx->bkref_ents + cache_idx_start;
3226 do
3227 {
3228 int to_idx, next_node;
3229
3230 /* Is this entry ENT is appropriate? */
3231 if (!re_node_set_contains (cur_nodes, ent->node))
3232 continue; /* No. */
3233
3234 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3235 /* Calculate the destination of the back reference, and append it
3236 to MCTX->STATE_LOG. */
3237 if (to_idx == cur_str)
3238 {
3239 /* The backreference did epsilon transit, we must re-check all the
3240 node in the current state. */
3241 re_node_set new_dests;
3242 reg_errcode_t err2, err3;
3243 next_node = dfa->edests[ent->node].elems[0];
3244 if (re_node_set_contains (cur_nodes, next_node))
3245 continue;
3246 err = re_node_set_init_1 (&new_dests, next_node);
3247 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3248 err3 = re_node_set_merge (cur_nodes, &new_dests);
3249 re_node_set_free (&new_dests);
3250 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3251 || err3 != REG_NOERROR, 0))
3252 {
3253 err = (err != REG_NOERROR ? err
3254 : (err2 != REG_NOERROR ? err2 : err3));
3255 return err;
3256 }
3257 /* TODO: It is still inefficient... */
3258 goto restart;
3259 }
3260 else
3261 {
3262 re_node_set union_set;
3263 next_node = dfa->nexts[ent->node];
3264 if (mctx->state_log[to_idx])
3265 {
3266 int ret;
3267 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3268 next_node))
3269 continue;
3270 err = re_node_set_init_copy (&union_set,
3271 &mctx->state_log[to_idx]->nodes);
3272 ret = re_node_set_insert (&union_set, next_node);
3273 if (BE (err != REG_NOERROR || ret < 0, 0))
3274 {
3275 re_node_set_free (&union_set);
3276 err = err != REG_NOERROR ? err : REG_ESPACE;
3277 return err;
3278 }
3279 }
3280 else
3281 {
3282 err = re_node_set_init_1 (&union_set, next_node);
3283 if (BE (err != REG_NOERROR, 0))
3284 return err;
3285 }
3286 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3287 re_node_set_free (&union_set);
3288 if (BE (mctx->state_log[to_idx] == NULL
3289 && err != REG_NOERROR, 0))
3290 return err;
3291 }
3292 }
3293 while (ent++->more);
3294 return REG_NOERROR;
3295}
3296
3297/* Build transition table for the state.
3298 Return 1 if succeeded, otherwise return NULL. */
3299
3300static int
3301internal_function
3302build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3303{
3304 reg_errcode_t err;
3305 int i, j, ch, need_word_trtable = 0;
3306 bitset_word_t elem, mask;
3307 bool dests_node_malloced = false;
3308 bool dest_states_malloced = false;
3309 int ndests; /* Number of the destination states from `state'. */
3310 re_dfastate_t **trtable;
3311 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3312 re_node_set follows, *dests_node;
3313 bitset_t *dests_ch;
3314 bitset_t acceptable;
3315
3316 struct dests_alloc
3317 {
3318 re_node_set dests_node[SBC_MAX];
3319 bitset_t dests_ch[SBC_MAX];
3320 } *dests_alloc;
3321
3322 /* We build DFA states which corresponds to the destination nodes
3323 from `state'. `dests_node[i]' represents the nodes which i-th
3324 destination state contains, and `dests_ch[i]' represents the
3325 characters which i-th destination state accepts. */
3326 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3327 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3328 else
3329 {
3330 dests_alloc = re_malloc (struct dests_alloc, 1);
3331 if (BE (dests_alloc == NULL, 0))
3332 return 0;
3333 dests_node_malloced = true;
3334 }
3335 dests_node = dests_alloc->dests_node;
3336 dests_ch = dests_alloc->dests_ch;
3337
3338 /* Initialize transiton table. */
3339 state->word_trtable = state->trtable = NULL;
3340
3341 /* At first, group all nodes belonging to `state' into several
3342 destinations. */
3343 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3344 if (BE (ndests <= 0, 0))
3345 {
3346 if (dests_node_malloced)
3347 free (dests_alloc);
3348 /* Return 0 in case of an error, 1 otherwise. */
3349 if (ndests == 0)
3350 {
3351 state->trtable = (re_dfastate_t **)
3352 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3353 return 1;
3354 }
3355 return 0;
3356 }
3357
3358 err = re_node_set_alloc (&follows, ndests + 1);
3359 if (BE (err != REG_NOERROR, 0))
3360 goto out_free;
3361
3362 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3363 + ndests * 3 * sizeof (re_dfastate_t *)))
3364 dest_states = (re_dfastate_t **)
3365 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3366 else
3367 {
3368 dest_states = (re_dfastate_t **)
3369 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3370 if (BE (dest_states == NULL, 0))
3371 {
3372out_free:
3373 if (dest_states_malloced)
3374 free (dest_states);
3375 re_node_set_free (&follows);
3376 for (i = 0; i < ndests; ++i)
3377 re_node_set_free (dests_node + i);
3378 if (dests_node_malloced)
3379 free (dests_alloc);
3380 return 0;
3381 }
3382 dest_states_malloced = true;
3383 }
3384 dest_states_word = dest_states + ndests;
3385 dest_states_nl = dest_states_word + ndests;
3386 bitset_empty (acceptable);
3387
3388 /* Then build the states for all destinations. */
3389 for (i = 0; i < ndests; ++i)
3390 {
3391 int next_node;
3392 re_node_set_empty (&follows);
3393 /* Merge the follows of this destination states. */
3394 for (j = 0; j < dests_node[i].nelem; ++j)
3395 {
3396 next_node = dfa->nexts[dests_node[i].elems[j]];
3397 if (next_node != -1)
3398 {
3399 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3400 if (BE (err != REG_NOERROR, 0))
3401 goto out_free;
3402 }
3403 }
3404 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3405 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3406 goto out_free;
3407 /* If the new state has context constraint,
3408 build appropriate states for these contexts. */
3409 if (dest_states[i]->has_constraint)
3410 {
3411 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3412 CONTEXT_WORD);
3413 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3414 goto out_free;
3415
3416 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3417 need_word_trtable = 1;
3418
3419 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3420 CONTEXT_NEWLINE);
3421 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3422 goto out_free;
3423 }
3424 else
3425 {
3426 dest_states_word[i] = dest_states[i];
3427 dest_states_nl[i] = dest_states[i];
3428 }
3429 bitset_merge (acceptable, dests_ch[i]);
3430 }
3431
3432 if (!BE (need_word_trtable, 0))
3433 {
3434 /* We don't care about whether the following character is a word
3435 character, or we are in a single-byte character set so we can
3436 discern by looking at the character code: allocate a
3437 256-entry transition table. */
3438 trtable = state->trtable =
3439 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3440 if (BE (trtable == NULL, 0))
3441 goto out_free;
3442
3443 /* For all characters ch...: */
3444 for (i = 0; i < BITSET_WORDS; ++i)
3445 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3446 elem;
3447 mask <<= 1, elem >>= 1, ++ch)
3448 if (BE (elem & 1, 0))
3449 {
3450 /* There must be exactly one destination which accepts
3451 character ch. See group_nodes_into_DFAstates. */
3452 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3453 ;
3454
3455 /* j-th destination accepts the word character ch. */
3456 if (dfa->word_char[i] & mask)
3457 trtable[ch] = dest_states_word[j];
3458 else
3459 trtable[ch] = dest_states[j];
3460 }
3461 }
3462 else
3463 {
3464 /* We care about whether the following character is a word
3465 character, and we are in a multi-byte character set: discern
3466 by looking at the character code: build two 256-entry
3467 transition tables, one starting at trtable[0] and one
3468 starting at trtable[SBC_MAX]. */
3469 trtable = state->word_trtable =
3470 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3471 if (BE (trtable == NULL, 0))
3472 goto out_free;
3473
3474 /* For all characters ch...: */
3475 for (i = 0; i < BITSET_WORDS; ++i)
3476 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3477 elem;
3478 mask <<= 1, elem >>= 1, ++ch)
3479 if (BE (elem & 1, 0))
3480 {
3481 /* There must be exactly one destination which accepts
3482 character ch. See group_nodes_into_DFAstates. */
3483 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3484 ;
3485
3486 /* j-th destination accepts the word character ch. */
3487 trtable[ch] = dest_states[j];
3488 trtable[ch + SBC_MAX] = dest_states_word[j];
3489 }
3490 }
3491
3492 /* new line */
3493 if (bitset_contain (acceptable, NEWLINE_CHAR))
3494 {
3495 /* The current state accepts newline character. */
3496 for (j = 0; j < ndests; ++j)
3497 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3498 {
3499 /* k-th destination accepts newline character. */
3500 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3501 if (need_word_trtable)
3502 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3503 /* There must be only one destination which accepts
3504 newline. See group_nodes_into_DFAstates. */
3505 break;
3506 }
3507 }
3508
3509 if (dest_states_malloced)
3510 free (dest_states);
3511
3512 re_node_set_free (&follows);
3513 for (i = 0; i < ndests; ++i)
3514 re_node_set_free (dests_node + i);
3515
3516 if (dests_node_malloced)
3517 free (dests_alloc);
3518
3519 return 1;
3520}
3521
3522/* Group all nodes belonging to STATE into several destinations.
3523 Then for all destinations, set the nodes belonging to the destination
3524 to DESTS_NODE[i] and set the characters accepted by the destination
3525 to DEST_CH[i]. This function return the number of destinations. */
3526
3527static int
3528internal_function
3529group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3530 re_node_set *dests_node, bitset_t *dests_ch)
3531{
3532 reg_errcode_t err;
3533 int result;
3534 int i, j, k;
3535 int ndests; /* Number of the destinations from `state'. */
3536 bitset_t accepts; /* Characters a node can accept. */
3537 const re_node_set *cur_nodes = &state->nodes;
3538 bitset_empty (accepts);
3539 ndests = 0;
3540
3541 /* For all the nodes belonging to `state', */
3542 for (i = 0; i < cur_nodes->nelem; ++i)
3543 {
3544 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3545 re_token_type_t type = node->type;
3546 unsigned int constraint = node->constraint;
3547
3548 /* Enumerate all single byte character this node can accept. */
3549 if (type == CHARACTER)
3550 bitset_set (accepts, node->opr.c);
3551 else if (type == SIMPLE_BRACKET)
3552 {
3553 bitset_merge (accepts, node->opr.sbcset);
3554 }
3555 else if (type == OP_PERIOD)
3556 {
3557#ifdef RE_ENABLE_I18N
3558 if (dfa->mb_cur_max > 1)
3559 bitset_merge (accepts, dfa->sb_char);
3560 else
3561#endif
3562 bitset_set_all (accepts);
3563 if (!(dfa->syntax & RE_DOT_NEWLINE))
3564 bitset_clear (accepts, '\n');
3565 if (dfa->syntax & RE_DOT_NOT_NULL)
3566 bitset_clear (accepts, '\0');
3567 }
3568#ifdef RE_ENABLE_I18N
3569 else if (type == OP_UTF8_PERIOD)
3570 {
3571 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3572 if (!(dfa->syntax & RE_DOT_NEWLINE))
3573 bitset_clear (accepts, '\n');
3574 if (dfa->syntax & RE_DOT_NOT_NULL)
3575 bitset_clear (accepts, '\0');
3576 }
3577#endif
3578 else
3579 continue;
3580
3581 /* Check the `accepts' and sift the characters which are not
3582 match it the context. */
3583 if (constraint)
3584 {
3585 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3586 {
3587 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3588 bitset_empty (accepts);
3589 if (accepts_newline)
3590 bitset_set (accepts, NEWLINE_CHAR);
3591 else
3592 continue;
3593 }
3594 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3595 {
3596 bitset_empty (accepts);
3597 continue;
3598 }
3599
3600 if (constraint & NEXT_WORD_CONSTRAINT)
3601 {
3602 bitset_word_t any_set = 0;
3603 if (type == CHARACTER && !node->word_char)
3604 {
3605 bitset_empty (accepts);
3606 continue;
3607 }
3608#ifdef RE_ENABLE_I18N
3609 if (dfa->mb_cur_max > 1)
3610 for (j = 0; j < BITSET_WORDS; ++j)
3611 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3612 else
3613#endif
3614 for (j = 0; j < BITSET_WORDS; ++j)
3615 any_set |= (accepts[j] &= dfa->word_char[j]);
3616 if (!any_set)
3617 continue;
3618 }
3619 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3620 {
3621 bitset_word_t any_set = 0;
3622 if (type == CHARACTER && node->word_char)
3623 {
3624 bitset_empty (accepts);
3625 continue;
3626 }
3627#ifdef RE_ENABLE_I18N
3628 if (dfa->mb_cur_max > 1)
3629 for (j = 0; j < BITSET_WORDS; ++j)
3630 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3631 else
3632#endif
3633 for (j = 0; j < BITSET_WORDS; ++j)
3634 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3635 if (!any_set)
3636 continue;
3637 }
3638 }
3639
3640 /* Then divide `accepts' into DFA states, or create a new
3641 state. Above, we make sure that accepts is not empty. */
3642 for (j = 0; j < ndests; ++j)
3643 {
3644 bitset_t intersec; /* Intersection sets, see below. */
3645 bitset_t remains;
3646 /* Flags, see below. */
3647 bitset_word_t has_intersec, not_subset, not_consumed;
3648
3649 /* Optimization, skip if this state doesn't accept the character. */
3650 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3651 continue;
3652
3653 /* Enumerate the intersection set of this state and `accepts'. */
3654 has_intersec = 0;
3655 for (k = 0; k < BITSET_WORDS; ++k)
3656 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3657 /* And skip if the intersection set is empty. */
3658 if (!has_intersec)
3659 continue;
3660
3661 /* Then check if this state is a subset of `accepts'. */
3662 not_subset = not_consumed = 0;
3663 for (k = 0; k < BITSET_WORDS; ++k)
3664 {
3665 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3666 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3667 }
3668
3669 /* If this state isn't a subset of `accepts', create a
3670 new group state, which has the `remains'. */
3671 if (not_subset)
3672 {
3673 bitset_copy (dests_ch[ndests], remains);
3674 bitset_copy (dests_ch[j], intersec);
3675 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3676 if (BE (err != REG_NOERROR, 0))
3677 goto error_return;
3678 ++ndests;
3679 }
3680
3681 /* Put the position in the current group. */
3682 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3683 if (BE (result < 0, 0))
3684 goto error_return;
3685
3686 /* If all characters are consumed, go to next node. */
3687 if (!not_consumed)
3688 break;
3689 }
3690 /* Some characters remain, create a new group. */
3691 if (j == ndests)
3692 {
3693 bitset_copy (dests_ch[ndests], accepts);
3694 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3695 if (BE (err != REG_NOERROR, 0))
3696 goto error_return;
3697 ++ndests;
3698 bitset_empty (accepts);
3699 }
3700 }
3701 return ndests;
3702 error_return:
3703 for (j = 0; j < ndests; ++j)
3704 re_node_set_free (dests_node + j);
3705 return -1;
3706}
3707
3708#ifdef RE_ENABLE_I18N
3709/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3710 Return the number of the bytes the node accepts.
3711 STR_IDX is the current index of the input string.
3712
3713 This function handles the nodes which can accept one character, or
3714 one collating element like '.', '[a-z]', opposite to the other nodes
3715 can only accept one byte. */
3716
3717static int
3718internal_function
3719check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3720 const re_string_t *input, int str_idx)
3721{
3722 const re_token_t *node = dfa->nodes + node_idx;
3723 int char_len, elem_len;
3724 int i;
3725
3726 if (BE (node->type == OP_UTF8_PERIOD, 0))
3727 {
3728 unsigned char c = re_string_byte_at (input, str_idx), d;
3729 if (BE (c < 0xc2, 1))
3730 return 0;
3731
3732 if (str_idx + 2 > input->len)
3733 return 0;
3734
3735 d = re_string_byte_at (input, str_idx + 1);
3736 if (c < 0xe0)
3737 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3738 else if (c < 0xf0)
3739 {
3740 char_len = 3;
3741 if (c == 0xe0 && d < 0xa0)
3742 return 0;
3743 }
3744 else if (c < 0xf8)
3745 {
3746 char_len = 4;
3747 if (c == 0xf0 && d < 0x90)
3748 return 0;
3749 }
3750 else if (c < 0xfc)
3751 {
3752 char_len = 5;
3753 if (c == 0xf8 && d < 0x88)
3754 return 0;
3755 }
3756 else if (c < 0xfe)
3757 {
3758 char_len = 6;
3759 if (c == 0xfc && d < 0x84)
3760 return 0;
3761 }
3762 else
3763 return 0;
3764
3765 if (str_idx + char_len > input->len)
3766 return 0;
3767
3768 for (i = 1; i < char_len; ++i)
3769 {
3770 d = re_string_byte_at (input, str_idx + i);
3771 if (d < 0x80 || d > 0xbf)
3772 return 0;
3773 }
3774 return char_len;
3775 }
3776
3777 char_len = re_string_char_size_at (input, str_idx);
3778 if (node->type == OP_PERIOD)
3779 {
3780 if (char_len <= 1)
3781 return 0;
3782 /* FIXME: I don't think this if is needed, as both '\n'
3783 and '\0' are char_len == 1. */
3784 /* '.' accepts any one character except the following two cases. */
3785 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3786 re_string_byte_at (input, str_idx) == '\n') ||
3787 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3788 re_string_byte_at (input, str_idx) == '\0'))
3789 return 0;
3790 return char_len;
3791 }
3792
3793 elem_len = re_string_elem_size_at (input, str_idx);
3794 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3795 return 0;
3796
3797 if (node->type == COMPLEX_BRACKET)
3798 {
3799 const re_charset_t *cset = node->opr.mbcset;
3800# ifdef _LIBC
3801 const unsigned char *pin
3802 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3803 int j;
3804 uint32_t nrules;
3805# endif /* _LIBC */
3806 int match_len = 0;
3807 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3808 ? re_string_wchar_at (input, str_idx) : 0);
3809
3810 /* match with multibyte character? */
3811 for (i = 0; i < cset->nmbchars; ++i)
3812 if (wc == cset->mbchars[i])
3813 {
3814 match_len = char_len;
3815 goto check_node_accept_bytes_match;
3816 }
3817 /* match with character_class? */
3818 for (i = 0; i < cset->nchar_classes; ++i)
3819 {
3820 wctype_t wt = cset->char_classes[i];
3821 if (__iswctype (wc, wt))
3822 {
3823 match_len = char_len;
3824 goto check_node_accept_bytes_match;
3825 }
3826 }
3827
3828# ifdef _LIBC
3829 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3830 if (nrules != 0)
3831 {
3832 unsigned int in_collseq = 0;
3833 const int32_t *table, *indirect;
3834 const unsigned char *weights, *extra;
3835 const char *collseqwc;
3836 int32_t idx;
3837 /* This #include defines a local function! */
3838# include <locale/weight.h>
3839
3840 /* match with collating_symbol? */
3841 if (cset->ncoll_syms)
3842 extra = (const unsigned char *)
3843 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3844 for (i = 0; i < cset->ncoll_syms; ++i)
3845 {
3846 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3847 /* Compare the length of input collating element and
3848 the length of current collating element. */
3849 if (*coll_sym != elem_len)
3850 continue;
3851 /* Compare each bytes. */
3852 for (j = 0; j < *coll_sym; j++)
3853 if (pin[j] != coll_sym[1 + j])
3854 break;
3855 if (j == *coll_sym)
3856 {
3857 /* Match if every bytes is equal. */
3858 match_len = j;
3859 goto check_node_accept_bytes_match;
3860 }
3861 }
3862
3863 if (cset->nranges)
3864 {
3865 if (elem_len <= char_len)
3866 {
3867 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3868 in_collseq = __collseq_table_lookup (collseqwc, wc);
3869 }
3870 else
3871 in_collseq = find_collation_sequence_value (pin, elem_len);
3872 }
3873 /* match with range expression? */
3874 for (i = 0; i < cset->nranges; ++i)
3875 if (cset->range_starts[i] <= in_collseq
3876 && in_collseq <= cset->range_ends[i])
3877 {
3878 match_len = elem_len;
3879 goto check_node_accept_bytes_match;
3880 }
3881
3882 /* match with equivalence_class? */
3883 if (cset->nequiv_classes)
3884 {
3885 const unsigned char *cp = pin;
3886 table = (const int32_t *)
3887 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3888 weights = (const unsigned char *)
3889 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3890 extra = (const unsigned char *)
3891 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3892 indirect = (const int32_t *)
3893 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3894 idx = findidx (&cp);
3895 if (idx > 0)
3896 for (i = 0; i < cset->nequiv_classes; ++i)
3897 {
3898 int32_t equiv_class_idx = cset->equiv_classes[i];
3899 size_t weight_len = weights[idx];
3900 if (weight_len == weights[equiv_class_idx])
3901 {
3902 int cnt = 0;
3903 while (cnt <= weight_len
3904 && (weights[equiv_class_idx + 1 + cnt]
3905 == weights[idx + 1 + cnt]))
3906 ++cnt;
3907 if (cnt > weight_len)
3908 {
3909 match_len = elem_len;
3910 goto check_node_accept_bytes_match;
3911 }
3912 }
3913 }
3914 }
3915 }
3916 else
3917# endif /* _LIBC */
3918 {
3919 /* match with range expression? */
3920#if __GNUC__ >= 2
3921 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3922#else
3923 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3924 cmp_buf[2] = wc;
3925#endif
3926 for (i = 0; i < cset->nranges; ++i)
3927 {
3928 cmp_buf[0] = cset->range_starts[i];
3929 cmp_buf[4] = cset->range_ends[i];
3930 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3931 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3932 {
3933 match_len = char_len;
3934 goto check_node_accept_bytes_match;
3935 }
3936 }
3937 }
3938 check_node_accept_bytes_match:
3939 if (!cset->non_match)
3940 return match_len;
3941 else
3942 {
3943 if (match_len > 0)
3944 return 0;
3945 else
3946 return (elem_len > char_len) ? elem_len : char_len;
3947 }
3948 }
3949 return 0;
3950}
3951
3952# ifdef _LIBC
3953static unsigned int
3954internal_function
3955find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3956{
3957 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3958 if (nrules == 0)
3959 {
3960 if (mbs_len == 1)
3961 {
3962 /* No valid character. Match it as a single byte character. */
3963 const unsigned char *collseq = (const unsigned char *)
3964 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3965 return collseq[mbs[0]];
3966 }
3967 return UINT_MAX;
3968 }
3969 else
3970 {
3971 int32_t idx;
3972 const unsigned char *extra = (const unsigned char *)
3973 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3974 int32_t extrasize = (const unsigned char *)
3975 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3976
3977 for (idx = 0; idx < extrasize;)
3978 {
3979 int mbs_cnt, found = 0;
3980 int32_t elem_mbs_len;
3981 /* Skip the name of collating element name. */
3982 idx = idx + extra[idx] + 1;
3983 elem_mbs_len = extra[idx++];
3984 if (mbs_len == elem_mbs_len)
3985 {
3986 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3987 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3988 break;
3989 if (mbs_cnt == elem_mbs_len)
3990 /* Found the entry. */
3991 found = 1;
3992 }
3993 /* Skip the byte sequence of the collating element. */
3994 idx += elem_mbs_len;
3995 /* Adjust for the alignment. */
3996 idx = (idx + 3) & ~3;
3997 /* Skip the collation sequence value. */
3998 idx += sizeof (uint32_t);
3999 /* Skip the wide char sequence of the collating element. */
4000 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4001 /* If we found the entry, return the sequence value. */
4002 if (found)
4003 return *(uint32_t *) (extra + idx);
4004 /* Skip the collation sequence value. */
4005 idx += sizeof (uint32_t);
4006 }
4007 return UINT_MAX;
4008 }
4009}
4010# endif /* _LIBC */
4011#endif /* RE_ENABLE_I18N */
4012
4013/* Check whether the node accepts the byte which is IDX-th
4014 byte of the INPUT. */
4015
4016static int
4017internal_function
4018check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4019 int idx)
4020{
4021 unsigned char ch;
4022 ch = re_string_byte_at (&mctx->input, idx);
4023 switch (node->type)
4024 {
4025 case CHARACTER:
4026 if (node->opr.c != ch)
4027 return 0;
4028 break;
4029
4030 case SIMPLE_BRACKET:
4031 if (!bitset_contain (node->opr.sbcset, ch))
4032 return 0;
4033 break;
4034
4035#ifdef RE_ENABLE_I18N
4036 case OP_UTF8_PERIOD:
4037 if (ch >= 0x80)
4038 return 0;
4039 /* FALLTHROUGH */
4040#endif
4041 case OP_PERIOD:
4042 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4043 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4044 return 0;
4045 break;
4046
4047 default:
4048 return 0;
4049 }
4050
4051 if (node->constraint)
4052 {
4053 /* The node has constraints. Check whether the current context
4054 satisfies the constraints. */
4055 unsigned int context = re_string_context_at (&mctx->input, idx,
4056 mctx->eflags);
4057 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4058 return 0;
4059 }
4060
4061 return 1;
4062}
4063
4064/* Extend the buffers, if the buffers have run out. */
4065
4066static reg_errcode_t
4067internal_function
4068extend_buffers (re_match_context_t *mctx)
4069{
4070 reg_errcode_t ret;
4071 re_string_t *pstr = &mctx->input;
4072
4073 /* Double the lengthes of the buffers. */
4074 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4075 if (BE (ret != REG_NOERROR, 0))
4076 return ret;
4077
4078 if (mctx->state_log != NULL)
4079 {
4080 /* And double the length of state_log. */
4081 /* XXX We have no indication of the size of this buffer. If this
4082 allocation fail we have no indication that the state_log array
4083 does not have the right size. */
4084 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4085 pstr->bufs_len + 1);
4086 if (BE (new_array == NULL, 0))
4087 return REG_ESPACE;
4088 mctx->state_log = new_array;
4089 }
4090
4091 /* Then reconstruct the buffers. */
4092 if (pstr->icase)
4093 {
4094#ifdef RE_ENABLE_I18N
4095 if (pstr->mb_cur_max > 1)
4096 {
4097 ret = build_wcs_upper_buffer (pstr);
4098 if (BE (ret != REG_NOERROR, 0))
4099 return ret;
4100 }
4101 else
4102#endif /* RE_ENABLE_I18N */
4103 build_upper_buffer (pstr);
4104 }
4105 else
4106 {
4107#ifdef RE_ENABLE_I18N
4108 if (pstr->mb_cur_max > 1)
4109 build_wcs_buffer (pstr);
4110 else
4111#endif /* RE_ENABLE_I18N */
4112 {
4113 if (pstr->trans != NULL)
4114 re_string_translate_buffer (pstr);
4115 }
4116 }
4117 return REG_NOERROR;
4118}
4119
4120
4121
4122/* Functions for matching context. */
4123
4124/* Initialize MCTX. */
4125
4126static reg_errcode_t
4127internal_function
4128match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4129{
4130 mctx->eflags = eflags;
4131 mctx->match_last = -1;
4132 if (n > 0)
4133 {
4134 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4135 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4136 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4137 return REG_ESPACE;
4138 }
4139 /* Already zero-ed by the caller.
4140 else
4141 mctx->bkref_ents = NULL;
4142 mctx->nbkref_ents = 0;
4143 mctx->nsub_tops = 0; */
4144 mctx->abkref_ents = n;
4145 mctx->max_mb_elem_len = 1;
4146 mctx->asub_tops = n;
4147 return REG_NOERROR;
4148}
4149
4150/* Clean the entries which depend on the current input in MCTX.
4151 This function must be invoked when the matcher changes the start index
4152 of the input, or changes the input string. */
4153
4154static void
4155internal_function
4156match_ctx_clean (re_match_context_t *mctx)
4157{
4158 int st_idx;
4159 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4160 {
4161 int sl_idx;
4162 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4163 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4164 {
4165 re_sub_match_last_t *last = top->lasts[sl_idx];
4166 re_free (last->path.array);
4167 re_free (last);
4168 }
4169 re_free (top->lasts);
4170 if (top->path)
4171 {
4172 re_free (top->path->array);
4173 re_free (top->path);
4174 }
4175 free (top);
4176 }
4177
4178 mctx->nsub_tops = 0;
4179 mctx->nbkref_ents = 0;
4180}
4181
4182/* Free all the memory associated with MCTX. */
4183
4184static void
4185internal_function
4186match_ctx_free (re_match_context_t *mctx)
4187{
4188 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4189 match_ctx_clean (mctx);
4190 re_free (mctx->sub_tops);
4191 re_free (mctx->bkref_ents);
4192}
4193
4194/* Add a new backreference entry to MCTX.
4195 Note that we assume that caller never call this function with duplicate
4196 entry, and call with STR_IDX which isn't smaller than any existing entry.
4197*/
4198
4199static reg_errcode_t
4200internal_function
4201match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4202 int to)
4203{
4204 if (mctx->nbkref_ents >= mctx->abkref_ents)
4205 {
4206 struct re_backref_cache_entry* new_entry;
4207 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4208 mctx->abkref_ents * 2);
4209 if (BE (new_entry == NULL, 0))
4210 {
4211 re_free (mctx->bkref_ents);
4212 return REG_ESPACE;
4213 }
4214 mctx->bkref_ents = new_entry;
4215 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4216 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4217 mctx->abkref_ents *= 2;
4218 }
4219 if (mctx->nbkref_ents > 0
4220 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4221 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4222
4223 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4224 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4225 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4226 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4227
4228 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4229 If bit N is clear, means that this entry won't epsilon-transition to
4230 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4231 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4232 such node.
4233
4234 A backreference does not epsilon-transition unless it is empty, so set
4235 to all zeros if FROM != TO. */
4236 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4237 = (from == to ? ~0 : 0);
4238
4239 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4240 if (mctx->max_mb_elem_len < to - from)
4241 mctx->max_mb_elem_len = to - from;
4242 return REG_NOERROR;
4243}
4244
4245/* Search for the first entry which has the same str_idx, or -1 if none is
4246 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4247
4248static int
4249internal_function
4250search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4251{
4252 int left, right, mid, last;
4253 last = right = mctx->nbkref_ents;
4254 for (left = 0; left < right;)
4255 {
4256 mid = (left + right) / 2;
4257 if (mctx->bkref_ents[mid].str_idx < str_idx)
4258 left = mid + 1;
4259 else
4260 right = mid;
4261 }
4262 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4263 return left;
4264 else
4265 return -1;
4266}
4267
4268/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4269 at STR_IDX. */
4270
4271static reg_errcode_t
4272internal_function
4273match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4274{
4275#ifdef DEBUG
4276 assert (mctx->sub_tops != NULL);
4277 assert (mctx->asub_tops > 0);
4278#endif
4279 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4280 {
4281 int new_asub_tops = mctx->asub_tops * 2;
4282 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4283 re_sub_match_top_t *,
4284 new_asub_tops);
4285 if (BE (new_array == NULL, 0))
4286 return REG_ESPACE;
4287 mctx->sub_tops = new_array;
4288 mctx->asub_tops = new_asub_tops;
4289 }
4290 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4291 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4292 return REG_ESPACE;
4293 mctx->sub_tops[mctx->nsub_tops]->node = node;
4294 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4295 return REG_NOERROR;
4296}
4297
4298/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4299 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4300
4301static re_sub_match_last_t *
4302internal_function
4303match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4304{
4305 re_sub_match_last_t *new_entry;
4306 if (BE (subtop->nlasts == subtop->alasts, 0))
4307 {
4308 int new_alasts = 2 * subtop->alasts + 1;
4309 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4310 re_sub_match_last_t *,
4311 new_alasts);
4312 if (BE (new_array == NULL, 0))
4313 return NULL;
4314 subtop->lasts = new_array;
4315 subtop->alasts = new_alasts;
4316 }
4317 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4318 if (BE (new_entry != NULL, 1))
4319 {
4320 subtop->lasts[subtop->nlasts] = new_entry;
4321 new_entry->node = node;
4322 new_entry->str_idx = str_idx;
4323 ++subtop->nlasts;
4324 }
4325 return new_entry;
4326}
4327
4328static void
4329internal_function
4330sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4331 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4332{
4333 sctx->sifted_states = sifted_sts;
4334 sctx->limited_states = limited_sts;
4335 sctx->last_node = last_node;
4336 sctx->last_str_idx = last_str_idx;
4337 re_node_set_init_empty (&sctx->limits);
4338}
Note: See TracBrowser for help on using the repository browser.