source: trunk/essentials/sys-apps/gawk/regexec.c@ 3590

Last change on this file since 3590 was 3076, checked in by bird, 19 years ago

gawk 3.1.5

File size: 125.7 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int dest_node, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
86
87#ifdef RE_ENABLE_I18N
88static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91#endif /* RE_ENABLE_I18N */
92static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141#if 0
142static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145#endif
146#ifdef RE_ENABLE_I18N
147static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149#endif /* RE_ENABLE_I18N */
150static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178static int build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180#ifdef RE_ENABLE_I18N
181static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183# ifdef _LIBC
184static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186# endif /* _LIBC */
187#endif /* RE_ENABLE_I18N */
188static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
195
196
197/* Entry point for POSIX code. */
198
199/* regexec searches for a given pattern, specified by PREG, in the
200 string STRING.
201
202 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
203 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
204 least NMATCH elements, and we set them to the offsets of the
205 corresponding matched substrings.
206
207 EFLAGS specifies `execution flags' which affect matching: if
208 REG_NOTBOL is set, then ^ does not match at the beginning of the
209 string; if REG_NOTEOL is set, then $ does not match at the end.
210
211 We return 0 if we find a match and REG_NOMATCH if not. */
212
213int
214regexec (preg, string, nmatch, pmatch, eflags)
215 const regex_t *__restrict preg;
216 const char *__restrict string;
217 size_t nmatch;
218 regmatch_t pmatch[];
219 int eflags;
220{
221 reg_errcode_t err;
222 int start, length;
223
224 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
225 return REG_BADPAT;
226
227 if (eflags & REG_STARTEND)
228 {
229 start = pmatch[0].rm_so;
230 length = pmatch[0].rm_eo;
231 }
232 else
233 {
234 start = 0;
235 length = strlen (string);
236 }
237 if (preg->no_sub)
238 err = re_search_internal (preg, string, length, start, length - start,
239 length, 0, NULL, eflags);
240 else
241 err = re_search_internal (preg, string, length, start, length - start,
242 length, nmatch, pmatch, eflags);
243 return err != REG_NOERROR;
244}
245
246#ifdef _LIBC
247# include <shlib-compat.h>
248versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
249
250# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
251__typeof__ (__regexec) __compat_regexec;
252
253int
254attribute_compat_text_section
255__compat_regexec (const regex_t *__restrict preg,
256 const char *__restrict string, size_t nmatch,
257 regmatch_t pmatch[], int eflags)
258{
259 return regexec (preg, string, nmatch, pmatch,
260 eflags & (REG_NOTBOL | REG_NOTEOL));
261}
262compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
263# endif
264#endif
265
266/* Entry points for GNU code. */
267
268/* re_match, re_search, re_match_2, re_search_2
269
270 The former two functions operate on STRING with length LENGTH,
271 while the later two operate on concatenation of STRING1 and STRING2
272 with lengths LENGTH1 and LENGTH2, respectively.
273
274 re_match() matches the compiled pattern in BUFP against the string,
275 starting at index START.
276
277 re_search() first tries matching at index START, then it tries to match
278 starting from index START + 1, and so on. The last start position tried
279 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
280 way as re_match().)
281
282 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
283 the first STOP characters of the concatenation of the strings should be
284 concerned.
285
286 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
287 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
288 computed relative to the concatenation, not relative to the individual
289 strings.)
290
291 On success, re_match* functions return the length of the match, re_search*
292 return the position of the start of the match. Return value -1 means no
293 match was found and -2 indicates an internal error. */
294
295int
296re_match (bufp, string, length, start, regs)
297 struct re_pattern_buffer *bufp;
298 const char *string;
299 int length, start;
300 struct re_registers *regs;
301{
302 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
303}
304#ifdef _LIBC
305weak_alias (__re_match, re_match)
306#endif
307
308int
309re_search (bufp, string, length, start, range, regs)
310 struct re_pattern_buffer *bufp;
311 const char *string;
312 int length, start, range;
313 struct re_registers *regs;
314{
315 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
316}
317#ifdef _LIBC
318weak_alias (__re_search, re_search)
319#endif
320
321int
322re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
323 struct re_pattern_buffer *bufp;
324 const char *string1, *string2;
325 int length1, length2, start, stop;
326 struct re_registers *regs;
327{
328 return re_search_2_stub (bufp, string1, length1, string2, length2,
329 start, 0, regs, stop, 1);
330}
331#ifdef _LIBC
332weak_alias (__re_match_2, re_match_2)
333#endif
334
335int
336re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
337 struct re_pattern_buffer *bufp;
338 const char *string1, *string2;
339 int length1, length2, start, range, stop;
340 struct re_registers *regs;
341{
342 return re_search_2_stub (bufp, string1, length1, string2, length2,
343 start, range, regs, stop, 0);
344}
345#ifdef _LIBC
346weak_alias (__re_search_2, re_search_2)
347#endif
348
349static int
350re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
351 stop, ret_len)
352 struct re_pattern_buffer *bufp;
353 const char *string1, *string2;
354 int length1, length2, start, range, stop, ret_len;
355 struct re_registers *regs;
356{
357 const char *str;
358 int rval;
359 int len = length1 + length2;
360 int free_str = 0;
361
362 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
363 return -2;
364
365 /* Concatenate the strings. */
366 if (length2 > 0)
367 if (length1 > 0)
368 {
369 char *s = re_malloc (char, len);
370
371 if (BE (s == NULL, 0))
372 return -2;
373 memcpy (s, string1, length1);
374 memcpy (s + length1, string2, length2);
375 str = s;
376 free_str = 1;
377 }
378 else
379 str = string2;
380 else
381 str = string1;
382
383 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
384 ret_len);
385 if (free_str)
386 re_free ((char *) str);
387 return rval;
388}
389
390/* The parameters have the same meaning as those of re_search.
391 Additional parameters:
392 If RET_LEN is nonzero the length of the match is returned (re_match style);
393 otherwise the position of the match is returned. */
394
395static int
396re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
397 struct re_pattern_buffer *bufp;
398 const char *string;
399 int length, start, range, stop, ret_len;
400 struct re_registers *regs;
401{
402 reg_errcode_t result;
403 regmatch_t *pmatch;
404 int nregs, rval;
405 int eflags = 0;
406
407 /* Check for out-of-range. */
408 if (BE (start < 0 || start > length, 0))
409 return -1;
410 if (BE (start + range > length, 0))
411 range = length - start;
412 else if (BE (start + range < 0, 0))
413 range = -start;
414
415 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
416 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
417
418 /* Compile fastmap if we haven't yet. */
419 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
420 re_compile_fastmap (bufp);
421
422 if (BE (bufp->no_sub, 0))
423 regs = NULL;
424
425 /* We need at least 1 register. */
426 if (regs == NULL)
427 nregs = 1;
428 else if (BE (bufp->regs_allocated == REGS_FIXED &&
429 regs->num_regs < bufp->re_nsub + 1, 0))
430 {
431 nregs = regs->num_regs;
432 if (BE (nregs < 1, 0))
433 {
434 /* Nothing can be copied to regs. */
435 regs = NULL;
436 nregs = 1;
437 }
438 }
439 else
440 nregs = bufp->re_nsub + 1;
441 pmatch = re_malloc (regmatch_t, nregs);
442 if (BE (pmatch == NULL, 0))
443 return -2;
444
445 result = re_search_internal (bufp, string, length, start, range, stop,
446 nregs, pmatch, eflags);
447
448 rval = 0;
449
450 /* I hope we needn't fill ther regs with -1's when no match was found. */
451 if (result != REG_NOERROR)
452 rval = -1;
453 else if (regs != NULL)
454 {
455 /* If caller wants register contents data back, copy them. */
456 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
457 bufp->regs_allocated);
458 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
459 rval = -2;
460 }
461
462 if (BE (rval == 0, 1))
463 {
464 if (ret_len)
465 {
466 assert (pmatch[0].rm_so == start);
467 rval = pmatch[0].rm_eo - start;
468 }
469 else
470 rval = pmatch[0].rm_so;
471 }
472 re_free (pmatch);
473 return rval;
474}
475
476static unsigned
477re_copy_regs (regs, pmatch, nregs, regs_allocated)
478 struct re_registers *regs;
479 regmatch_t *pmatch;
480 int nregs, regs_allocated;
481{
482 int rval = REGS_REALLOCATE;
483 int i;
484 int need_regs = nregs + 1;
485 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
486 uses. */
487
488 /* Have the register data arrays been allocated? */
489 if (regs_allocated == REGS_UNALLOCATED)
490 { /* No. So allocate them with malloc. */
491 regs->start = re_malloc (regoff_t, need_regs);
492 regs->end = re_malloc (regoff_t, need_regs);
493 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
494 return REGS_UNALLOCATED;
495 regs->num_regs = need_regs;
496 }
497 else if (regs_allocated == REGS_REALLOCATE)
498 { /* Yes. If we need more elements than were already
499 allocated, reallocate them. If we need fewer, just
500 leave it alone. */
501 if (BE (need_regs > regs->num_regs, 0))
502 {
503 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
504 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
505 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
506 return REGS_UNALLOCATED;
507 regs->start = new_start;
508 regs->end = new_end;
509 regs->num_regs = need_regs;
510 }
511 }
512 else
513 {
514 assert (regs_allocated == REGS_FIXED);
515 /* This function may not be called with REGS_FIXED and nregs too big. */
516 assert (regs->num_regs >= nregs);
517 rval = REGS_FIXED;
518 }
519
520 /* Copy the regs. */
521 for (i = 0; i < nregs; ++i)
522 {
523 regs->start[i] = pmatch[i].rm_so;
524 regs->end[i] = pmatch[i].rm_eo;
525 }
526 for ( ; i < regs->num_regs; ++i)
527 regs->start[i] = regs->end[i] = -1;
528
529 return rval;
530}
531
532/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
533 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
534 this memory for recording register information. STARTS and ENDS
535 must be allocated using the malloc library routine, and must each
536 be at least NUM_REGS * sizeof (regoff_t) bytes long.
537
538 If NUM_REGS == 0, then subsequent matches should allocate their own
539 register data.
540
541 Unless this function is called, the first search or match using
542 PATTERN_BUFFER will allocate its own register data, without
543 freeing the old data. */
544
545void
546re_set_registers (bufp, regs, num_regs, starts, ends)
547 struct re_pattern_buffer *bufp;
548 struct re_registers *regs;
549 unsigned num_regs;
550 regoff_t *starts, *ends;
551{
552 if (num_regs)
553 {
554 bufp->regs_allocated = REGS_REALLOCATE;
555 regs->num_regs = num_regs;
556 regs->start = starts;
557 regs->end = ends;
558 }
559 else
560 {
561 bufp->regs_allocated = REGS_UNALLOCATED;
562 regs->num_regs = 0;
563 regs->start = regs->end = (regoff_t *) 0;
564 }
565}
566#ifdef _LIBC
567weak_alias (__re_set_registers, re_set_registers)
568#endif
569
570
571/* Entry points compatible with 4.2 BSD regex library. We don't define
572 them unless specifically requested. */
573
574#if defined _REGEX_RE_COMP || defined _LIBC
575int
576# ifdef _LIBC
577weak_function
578# endif
579re_exec (s)
580 const char *s;
581{
582 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
583}
584#endif /* _REGEX_RE_COMP */
585
586
587/* Internal entry point. */
588
589/* Searches for a compiled pattern PREG in the string STRING, whose
590 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
591 mingings with regexec. START, and RANGE have the same meanings
592 with re_search.
593 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
594 otherwise return the error code.
595 Note: We assume front end functions already check ranges.
596 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
597
598static reg_errcode_t
599re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
600 eflags)
601 const regex_t *preg;
602 const char *string;
603 int length, start, range, stop, eflags;
604 size_t nmatch;
605 regmatch_t pmatch[];
606{
607 reg_errcode_t err;
608 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
609 int left_lim, right_lim, incr;
610 int fl_longest_match, match_first, match_kind, match_last = -1;
611 int extra_nmatch;
612 int sb, ch;
613#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
614 re_match_context_t mctx = { .dfa = dfa };
615#else
616 re_match_context_t mctx;
617#endif
618 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
619 && range && !preg->can_be_null) ? preg->fastmap : NULL;
620 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
621
622#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
623 memset (&mctx, '\0', sizeof (re_match_context_t));
624 mctx.dfa = dfa;
625#endif
626
627 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
628 nmatch -= extra_nmatch;
629
630 /* Check if the DFA haven't been compiled. */
631 if (BE (preg->used == 0 || dfa->init_state == NULL
632 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
633 || dfa->init_state_begbuf == NULL, 0))
634 return REG_NOMATCH;
635
636#ifdef DEBUG
637 /* We assume front-end functions already check them. */
638 assert (start + range >= 0 && start + range <= length);
639#endif
640
641 /* If initial states with non-begbuf contexts have no elements,
642 the regex must be anchored. If preg->newline_anchor is set,
643 we'll never use init_state_nl, so do not check it. */
644 if (dfa->init_state->nodes.nelem == 0
645 && dfa->init_state_word->nodes.nelem == 0
646 && (dfa->init_state_nl->nodes.nelem == 0
647 || !preg->newline_anchor))
648 {
649 if (start != 0 && start + range != 0)
650 return REG_NOMATCH;
651 start = range = 0;
652 }
653
654 /* We must check the longest matching, if nmatch > 0. */
655 fl_longest_match = (nmatch != 0 || dfa->nbackref);
656
657 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
658 preg->translate, preg->syntax & RE_ICASE, dfa);
659 if (BE (err != REG_NOERROR, 0))
660 goto free_return;
661 mctx.input.stop = stop;
662 mctx.input.raw_stop = stop;
663 mctx.input.newline_anchor = preg->newline_anchor;
664
665 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
666 if (BE (err != REG_NOERROR, 0))
667 goto free_return;
668
669 /* We will log all the DFA states through which the dfa pass,
670 if nmatch > 1, or this dfa has "multibyte node", which is a
671 back-reference or a node which can accept multibyte character or
672 multi character collating element. */
673 if (nmatch > 1 || dfa->has_mb_node)
674 {
675 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
676 if (BE (mctx.state_log == NULL, 0))
677 {
678 err = REG_ESPACE;
679 goto free_return;
680 }
681 }
682 else
683 mctx.state_log = NULL;
684
685 match_first = start;
686 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
687 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
688
689 /* Check incrementally whether of not the input string match. */
690 incr = (range < 0) ? -1 : 1;
691 left_lim = (range < 0) ? start + range : start;
692 right_lim = (range < 0) ? start : start + range;
693 sb = dfa->mb_cur_max == 1;
694 match_kind =
695 (fastmap
696 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
697 | (range >= 0 ? 2 : 0)
698 | (t != NULL ? 1 : 0))
699 : 8);
700
701 for (;; match_first += incr)
702 {
703 err = REG_NOMATCH;
704 if (match_first < left_lim || right_lim < match_first)
705 goto free_return;
706
707 /* Advance as rapidly as possible through the string, until we
708 find a plausible place to start matching. This may be done
709 with varying efficiency, so there are various possibilities:
710 only the most common of them are specialized, in order to
711 save on code size. We use a switch statement for speed. */
712 switch (match_kind)
713 {
714 case 8:
715 /* No fastmap. */
716 break;
717
718 case 7:
719 /* Fastmap with single-byte translation, match forward. */
720 while (BE (match_first < right_lim, 1)
721 && !fastmap[t[(unsigned char) string[match_first]]])
722 ++match_first;
723 goto forward_match_found_start_or_reached_end;
724
725 case 6:
726 /* Fastmap without translation, match forward. */
727 while (BE (match_first < right_lim, 1)
728 && !fastmap[(unsigned char) string[match_first]])
729 ++match_first;
730
731 forward_match_found_start_or_reached_end:
732 if (BE (match_first == right_lim, 0))
733 {
734 ch = match_first >= length
735 ? 0 : (unsigned char) string[match_first];
736 if (!fastmap[t ? t[ch] : ch])
737 goto free_return;
738 }
739 break;
740
741 case 4:
742 case 5:
743 /* Fastmap without multi-byte translation, match backwards. */
744 while (match_first >= left_lim)
745 {
746 ch = match_first >= length
747 ? 0 : (unsigned char) string[match_first];
748 if (fastmap[t ? t[ch] : ch])
749 break;
750 --match_first;
751 }
752 if (match_first < left_lim)
753 goto free_return;
754 break;
755
756 default:
757 /* In this case, we can't determine easily the current byte,
758 since it might be a component byte of a multibyte
759 character. Then we use the constructed buffer instead. */
760 for (;;)
761 {
762 /* If MATCH_FIRST is out of the valid range, reconstruct the
763 buffers. */
764 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
765 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
766 {
767 err = re_string_reconstruct (&mctx.input, match_first,
768 eflags);
769 if (BE (err != REG_NOERROR, 0))
770 goto free_return;
771
772 offset = match_first - mctx.input.raw_mbs_idx;
773 }
774 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
775 Note that MATCH_FIRST must not be smaller than 0. */
776 ch = (match_first >= length
777 ? 0 : re_string_byte_at (&mctx.input, offset));
778 if (fastmap[ch])
779 break;
780 match_first += incr;
781 if (match_first < left_lim || match_first > right_lim)
782 {
783 err = REG_NOMATCH;
784 goto free_return;
785 }
786 }
787 break;
788 }
789
790 /* Reconstruct the buffers so that the matcher can assume that
791 the matching starts from the beginning of the buffer. */
792 err = re_string_reconstruct (&mctx.input, match_first, eflags);
793 if (BE (err != REG_NOERROR, 0))
794 goto free_return;
795
796#ifdef RE_ENABLE_I18N
797 /* Don't consider this char as a possible match start if it part,
798 yet isn't the head, of a multibyte character. */
799 if (!sb && !re_string_first_byte (&mctx.input, 0))
800 continue;
801#endif
802
803 /* It seems to be appropriate one, then use the matcher. */
804 /* We assume that the matching starts from 0. */
805 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
806 match_last = check_matching (&mctx, fl_longest_match,
807 range >= 0 ? &match_first : NULL);
808 if (match_last != -1)
809 {
810 if (BE (match_last == -2, 0))
811 {
812 err = REG_ESPACE;
813 goto free_return;
814 }
815 else
816 {
817 mctx.match_last = match_last;
818 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
819 {
820 re_dfastate_t *pstate = mctx.state_log[match_last];
821 mctx.last_node = check_halt_state_context (&mctx, pstate,
822 match_last);
823 }
824 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
825 || dfa->nbackref)
826 {
827 err = prune_impossible_nodes (&mctx);
828 if (err == REG_NOERROR)
829 break;
830 if (BE (err != REG_NOMATCH, 0))
831 goto free_return;
832 match_last = -1;
833 }
834 else
835 break; /* We found a match. */
836 }
837 }
838
839 match_ctx_clean (&mctx);
840 }
841
842#ifdef DEBUG
843 assert (match_last != -1);
844 assert (err == REG_NOERROR);
845#endif
846
847 /* Set pmatch[] if we need. */
848 if (nmatch > 0)
849 {
850 int reg_idx;
851
852 /* Initialize registers. */
853 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
854 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
855
856 /* Set the points where matching start/end. */
857 pmatch[0].rm_so = 0;
858 pmatch[0].rm_eo = mctx.match_last;
859
860 if (!preg->no_sub && nmatch > 1)
861 {
862 err = set_regs (preg, &mctx, nmatch, pmatch,
863 dfa->has_plural_match && dfa->nbackref > 0);
864 if (BE (err != REG_NOERROR, 0))
865 goto free_return;
866 }
867
868 /* At last, add the offset to the each registers, since we slided
869 the buffers so that we could assume that the matching starts
870 from 0. */
871 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
872 if (pmatch[reg_idx].rm_so != -1)
873 {
874#ifdef RE_ENABLE_I18N
875 if (BE (mctx.input.offsets_needed != 0, 0))
876 {
877 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
878 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
879 else
880 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
881 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
882 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
883 else
884 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
885 }
886#else
887 assert (mctx.input.offsets_needed == 0);
888#endif
889 pmatch[reg_idx].rm_so += match_first;
890 pmatch[reg_idx].rm_eo += match_first;
891 }
892 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
893 {
894 pmatch[nmatch + reg_idx].rm_so = -1;
895 pmatch[nmatch + reg_idx].rm_eo = -1;
896 }
897
898 if (dfa->subexp_map)
899 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
900 if (dfa->subexp_map[reg_idx] != reg_idx)
901 {
902 pmatch[reg_idx + 1].rm_so
903 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
904 pmatch[reg_idx + 1].rm_eo
905 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
906 }
907 }
908
909 free_return:
910 re_free (mctx.state_log);
911 if (dfa->nbackref)
912 match_ctx_free (&mctx);
913 re_string_destruct (&mctx.input);
914 return err;
915}
916
917static reg_errcode_t
918prune_impossible_nodes (mctx)
919 re_match_context_t *mctx;
920{
921 re_dfa_t *const dfa = mctx->dfa;
922 int halt_node, match_last;
923 reg_errcode_t ret;
924 re_dfastate_t **sifted_states;
925 re_dfastate_t **lim_states = NULL;
926 re_sift_context_t sctx;
927#ifdef DEBUG
928 assert (mctx->state_log != NULL);
929#endif
930 match_last = mctx->match_last;
931 halt_node = mctx->last_node;
932 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
933 if (BE (sifted_states == NULL, 0))
934 {
935 ret = REG_ESPACE;
936 goto free_return;
937 }
938 if (dfa->nbackref)
939 {
940 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
941 if (BE (lim_states == NULL, 0))
942 {
943 ret = REG_ESPACE;
944 goto free_return;
945 }
946 while (1)
947 {
948 memset (lim_states, '\0',
949 sizeof (re_dfastate_t *) * (match_last + 1));
950 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
951 match_last);
952 ret = sift_states_backward (mctx, &sctx);
953 re_node_set_free (&sctx.limits);
954 if (BE (ret != REG_NOERROR, 0))
955 goto free_return;
956 if (sifted_states[0] != NULL || lim_states[0] != NULL)
957 break;
958 do
959 {
960 --match_last;
961 if (match_last < 0)
962 {
963 ret = REG_NOMATCH;
964 goto free_return;
965 }
966 } while (mctx->state_log[match_last] == NULL
967 || !mctx->state_log[match_last]->halt);
968 halt_node = check_halt_state_context (mctx,
969 mctx->state_log[match_last],
970 match_last);
971 }
972 ret = merge_state_array (dfa, sifted_states, lim_states,
973 match_last + 1);
974 re_free (lim_states);
975 lim_states = NULL;
976 if (BE (ret != REG_NOERROR, 0))
977 goto free_return;
978 }
979 else
980 {
981 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
982 ret = sift_states_backward (mctx, &sctx);
983 re_node_set_free (&sctx.limits);
984 if (BE (ret != REG_NOERROR, 0))
985 goto free_return;
986 }
987 re_free (mctx->state_log);
988 mctx->state_log = sifted_states;
989 sifted_states = NULL;
990 mctx->last_node = halt_node;
991 mctx->match_last = match_last;
992 ret = REG_NOERROR;
993 free_return:
994 re_free (sifted_states);
995 re_free (lim_states);
996 return ret;
997}
998
999/* Acquire an initial state and return it.
1000 We must select appropriate initial state depending on the context,
1001 since initial states may have constraints like "\<", "^", etc.. */
1002
1003static inline re_dfastate_t *
1004acquire_init_state_context (err, mctx, idx)
1005 reg_errcode_t *err;
1006 const re_match_context_t *mctx;
1007 int idx;
1008{
1009 re_dfa_t *const dfa = mctx->dfa;
1010 if (dfa->init_state->has_constraint)
1011 {
1012 unsigned int context;
1013 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1014 if (IS_WORD_CONTEXT (context))
1015 return dfa->init_state_word;
1016 else if (IS_ORDINARY_CONTEXT (context))
1017 return dfa->init_state;
1018 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1019 return dfa->init_state_begbuf;
1020 else if (IS_NEWLINE_CONTEXT (context))
1021 return dfa->init_state_nl;
1022 else if (IS_BEGBUF_CONTEXT (context))
1023 {
1024 /* It is relatively rare case, then calculate on demand. */
1025 return re_acquire_state_context (err, dfa,
1026 dfa->init_state->entrance_nodes,
1027 context);
1028 }
1029 else
1030 /* Must not happen? */
1031 return dfa->init_state;
1032 }
1033 else
1034 return dfa->init_state;
1035}
1036
1037/* Check whether the regular expression match input string INPUT or not,
1038 and return the index where the matching end, return -1 if not match,
1039 or return -2 in case of an error.
1040 FL_LONGEST_MATCH means we want the POSIX longest matching.
1041 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1042 next place where we may want to try matching.
1043 Note that the matcher assume that the maching starts from the current
1044 index of the buffer. */
1045
1046static int
1047check_matching (mctx, fl_longest_match, p_match_first)
1048 re_match_context_t *mctx;
1049 int fl_longest_match;
1050 int *p_match_first;
1051{
1052 re_dfa_t *const dfa = mctx->dfa;
1053 reg_errcode_t err;
1054 int match = 0;
1055 int match_last = -1;
1056 int cur_str_idx = re_string_cur_idx (&mctx->input);
1057 re_dfastate_t *cur_state;
1058 int at_init_state = p_match_first != NULL;
1059 int next_start_idx = cur_str_idx;
1060
1061 err = REG_NOERROR;
1062 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1063 /* An initial state must not be NULL (invalid). */
1064 if (BE (cur_state == NULL, 0))
1065 {
1066 assert (err == REG_ESPACE);
1067 return -2;
1068 }
1069
1070 if (mctx->state_log != NULL)
1071 {
1072 mctx->state_log[cur_str_idx] = cur_state;
1073
1074 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1075 later. E.g. Processing back references. */
1076 if (BE (dfa->nbackref, 0))
1077 {
1078 at_init_state = 0;
1079 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1080 if (BE (err != REG_NOERROR, 0))
1081 return err;
1082
1083 if (cur_state->has_backref)
1084 {
1085 err = transit_state_bkref (mctx, &cur_state->nodes);
1086 if (BE (err != REG_NOERROR, 0))
1087 return err;
1088 }
1089 }
1090 }
1091
1092 /* If the RE accepts NULL string. */
1093 if (BE (cur_state->halt, 0))
1094 {
1095 if (!cur_state->has_constraint
1096 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1097 {
1098 if (!fl_longest_match)
1099 return cur_str_idx;
1100 else
1101 {
1102 match_last = cur_str_idx;
1103 match = 1;
1104 }
1105 }
1106 }
1107
1108 while (!re_string_eoi (&mctx->input))
1109 {
1110 re_dfastate_t *old_state = cur_state;
1111 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1112
1113 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1114 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1115 && mctx->input.valid_len < mctx->input.len))
1116 {
1117 err = extend_buffers (mctx);
1118 if (BE (err != REG_NOERROR, 0))
1119 {
1120 assert (err == REG_ESPACE);
1121 return -2;
1122 }
1123 }
1124
1125 cur_state = transit_state (&err, mctx, cur_state);
1126 if (mctx->state_log != NULL)
1127 cur_state = merge_state_with_log (&err, mctx, cur_state);
1128
1129 if (cur_state == NULL)
1130 {
1131 /* Reached the invalid state or an error. Try to recover a valid
1132 state using the state log, if available and if we have not
1133 already found a valid (even if not the longest) match. */
1134 if (BE (err != REG_NOERROR, 0))
1135 return -2;
1136
1137 if (mctx->state_log == NULL
1138 || (match && !fl_longest_match)
1139 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1140 break;
1141 }
1142
1143 if (BE (at_init_state, 0))
1144 {
1145 if (old_state == cur_state)
1146 next_start_idx = next_char_idx;
1147 else
1148 at_init_state = 0;
1149 }
1150
1151 if (cur_state->halt)
1152 {
1153 /* Reached a halt state.
1154 Check the halt state can satisfy the current context. */
1155 if (!cur_state->has_constraint
1156 || check_halt_state_context (mctx, cur_state,
1157 re_string_cur_idx (&mctx->input)))
1158 {
1159 /* We found an appropriate halt state. */
1160 match_last = re_string_cur_idx (&mctx->input);
1161 match = 1;
1162
1163 /* We found a match, do not modify match_first below. */
1164 p_match_first = NULL;
1165 if (!fl_longest_match)
1166 break;
1167 }
1168 }
1169 }
1170
1171 if (p_match_first)
1172 *p_match_first += next_start_idx;
1173
1174 return match_last;
1175}
1176
1177/* Check NODE match the current context. */
1178
1179static int check_halt_node_context (dfa, node, context)
1180 const re_dfa_t *dfa;
1181 int node;
1182 unsigned int context;
1183{
1184 re_token_type_t type = dfa->nodes[node].type;
1185 unsigned int constraint = dfa->nodes[node].constraint;
1186 if (type != END_OF_RE)
1187 return 0;
1188 if (!constraint)
1189 return 1;
1190 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1191 return 0;
1192 return 1;
1193}
1194
1195/* Check the halt state STATE match the current context.
1196 Return 0 if not match, if the node, STATE has, is a halt node and
1197 match the context, return the node. */
1198
1199static int
1200check_halt_state_context (mctx, state, idx)
1201 const re_match_context_t *mctx;
1202 const re_dfastate_t *state;
1203 int idx;
1204{
1205 int i;
1206 unsigned int context;
1207#ifdef DEBUG
1208 assert (state->halt);
1209#endif
1210 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1211 for (i = 0; i < state->nodes.nelem; ++i)
1212 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1213 return state->nodes.elems[i];
1214 return 0;
1215}
1216
1217/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1218 corresponding to the DFA).
1219 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1220 of errors. */
1221
1222static int
1223proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1224 const re_match_context_t *mctx;
1225 regmatch_t *regs;
1226 int nregs, *pidx, node;
1227 re_node_set *eps_via_nodes;
1228 struct re_fail_stack_t *fs;
1229{
1230 re_dfa_t *const dfa = mctx->dfa;
1231 int i, err, dest_node;
1232 dest_node = -1;
1233 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1234 {
1235 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1236 re_node_set *edests = &dfa->edests[node];
1237 int dest_node;
1238 err = re_node_set_insert (eps_via_nodes, node);
1239 if (BE (err < 0, 0))
1240 return -2;
1241 /* Pick up a valid destination, or return -1 if none is found. */
1242 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1243 {
1244 int candidate = edests->elems[i];
1245 if (!re_node_set_contains (cur_nodes, candidate))
1246 continue;
1247 if (dest_node == -1)
1248 dest_node = candidate;
1249
1250 else
1251 {
1252 /* In order to avoid infinite loop like "(a*)*", return the second
1253 epsilon-transition if the first was already considered. */
1254 if (re_node_set_contains (eps_via_nodes, dest_node))
1255 return candidate;
1256
1257 /* Otherwise, push the second epsilon-transition on the fail stack. */
1258 else if (fs != NULL
1259 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1260 eps_via_nodes) != REG_NOERROR)
1261 return -2;
1262
1263 /* We know we are going to exit. */
1264 break;
1265 }
1266 }
1267 return dest_node;
1268 }
1269 else
1270 {
1271 int naccepted = 0;
1272 re_token_type_t type = dfa->nodes[node].type;
1273
1274#ifdef RE_ENABLE_I18N
1275 if (dfa->nodes[node].accept_mb)
1276 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1277 else
1278#endif /* RE_ENABLE_I18N */
1279 if (type == OP_BACK_REF)
1280 {
1281 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1282 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1283 if (fs != NULL)
1284 {
1285 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1286 return -1;
1287 else if (naccepted)
1288 {
1289 char *buf = (char *) re_string_get_buffer (&mctx->input);
1290 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1291 naccepted) != 0)
1292 return -1;
1293 }
1294 }
1295
1296 if (naccepted == 0)
1297 {
1298 err = re_node_set_insert (eps_via_nodes, node);
1299 if (BE (err < 0, 0))
1300 return -2;
1301 dest_node = dfa->edests[node].elems[0];
1302 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1303 dest_node))
1304 return dest_node;
1305 }
1306 }
1307
1308 if (naccepted != 0
1309 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1310 {
1311 dest_node = dfa->nexts[node];
1312 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1313 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1314 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1315 dest_node)))
1316 return -1;
1317 re_node_set_empty (eps_via_nodes);
1318 return dest_node;
1319 }
1320 }
1321 return -1;
1322}
1323
1324static reg_errcode_t
1325push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1326 struct re_fail_stack_t *fs;
1327 int str_idx, dest_node, nregs;
1328 regmatch_t *regs;
1329 re_node_set *eps_via_nodes;
1330{
1331 reg_errcode_t err;
1332 int num = fs->num++;
1333 if (fs->num == fs->alloc)
1334 {
1335 struct re_fail_stack_ent_t *new_array;
1336 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1337 * fs->alloc * 2));
1338 if (new_array == NULL)
1339 return REG_ESPACE;
1340 fs->alloc *= 2;
1341 fs->stack = new_array;
1342 }
1343 fs->stack[num].idx = str_idx;
1344 fs->stack[num].node = dest_node;
1345 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1346 if (fs->stack[num].regs == NULL)
1347 return REG_ESPACE;
1348 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1349 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1350 return err;
1351}
1352
1353static int
1354pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1355 struct re_fail_stack_t *fs;
1356 int *pidx, nregs;
1357 regmatch_t *regs;
1358 re_node_set *eps_via_nodes;
1359{
1360 int num = --fs->num;
1361 assert (num >= 0);
1362 *pidx = fs->stack[num].idx;
1363 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1364 re_node_set_free (eps_via_nodes);
1365 re_free (fs->stack[num].regs);
1366 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1367 return fs->stack[num].node;
1368}
1369
1370/* Set the positions where the subexpressions are starts/ends to registers
1371 PMATCH.
1372 Note: We assume that pmatch[0] is already set, and
1373 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1374
1375static reg_errcode_t
1376set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1377 const regex_t *preg;
1378 const re_match_context_t *mctx;
1379 size_t nmatch;
1380 regmatch_t *pmatch;
1381 int fl_backtrack;
1382{
1383 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1384 int idx, cur_node;
1385 re_node_set eps_via_nodes;
1386 struct re_fail_stack_t *fs;
1387 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1388 regmatch_t *prev_idx_match;
1389
1390#ifdef DEBUG
1391 assert (nmatch > 1);
1392 assert (mctx->state_log != NULL);
1393#endif
1394 if (fl_backtrack)
1395 {
1396 fs = &fs_body;
1397 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1398 if (fs->stack == NULL)
1399 return REG_ESPACE;
1400 }
1401 else
1402 fs = NULL;
1403
1404 cur_node = dfa->init_node;
1405 re_node_set_init_empty (&eps_via_nodes);
1406
1407 prev_idx_match = re_malloc (regmatch_t, nmatch);
1408 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1409
1410 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1411 {
1412 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1413
1414 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1415 {
1416 int reg_idx;
1417 if (fs)
1418 {
1419 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1420 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1421 break;
1422 if (reg_idx == nmatch)
1423 {
1424 re_node_set_free (&eps_via_nodes);
1425 re_free (prev_idx_match);
1426 return free_fail_stack_return (fs);
1427 }
1428 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1429 &eps_via_nodes);
1430 }
1431 else
1432 {
1433 re_node_set_free (&eps_via_nodes);
1434 re_free (prev_idx_match);
1435 return REG_NOERROR;
1436 }
1437 }
1438
1439 /* Proceed to next node. */
1440 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1441 &eps_via_nodes, fs);
1442
1443 if (BE (cur_node < 0, 0))
1444 {
1445 if (BE (cur_node == -2, 0))
1446 {
1447 re_node_set_free (&eps_via_nodes);
1448 free_fail_stack_return (fs);
1449 re_free (prev_idx_match);
1450 return REG_ESPACE;
1451 }
1452 if (fs)
1453 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454 &eps_via_nodes);
1455 else
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 re_free (prev_idx_match);
1459 return REG_NOMATCH;
1460 }
1461 }
1462 }
1463 re_node_set_free (&eps_via_nodes);
1464 re_free (prev_idx_match);
1465 return free_fail_stack_return (fs);
1466}
1467
1468static reg_errcode_t
1469free_fail_stack_return (fs)
1470 struct re_fail_stack_t *fs;
1471{
1472 if (fs)
1473 {
1474 int fs_idx;
1475 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1476 {
1477 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1478 re_free (fs->stack[fs_idx].regs);
1479 }
1480 re_free (fs->stack);
1481 }
1482 return REG_NOERROR;
1483}
1484
1485static void
1486update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1487 re_dfa_t *dfa;
1488 regmatch_t *pmatch, *prev_idx_match;
1489 int cur_node, cur_idx, nmatch;
1490{
1491 int type = dfa->nodes[cur_node].type;
1492 if (type == OP_OPEN_SUBEXP)
1493 {
1494 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1495
1496 /* We are at the first node of this sub expression. */
1497 if (reg_num < nmatch)
1498 {
1499 pmatch[reg_num].rm_so = cur_idx;
1500 pmatch[reg_num].rm_eo = -1;
1501 }
1502 }
1503 else if (type == OP_CLOSE_SUBEXP)
1504 {
1505 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1506 if (reg_num < nmatch)
1507 {
1508 /* We are at the last node of this sub expression. */
1509 if (pmatch[reg_num].rm_so < cur_idx)
1510 {
1511 pmatch[reg_num].rm_eo = cur_idx;
1512 /* This is a non-empty match or we are not inside an optional
1513 subexpression. Accept this right away. */
1514 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1515 }
1516 else
1517 {
1518 if (dfa->nodes[cur_node].opt_subexp
1519 && prev_idx_match[reg_num].rm_so != -1)
1520 /* We transited through an empty match for an optional
1521 subexpression, like (a?)*, and this is not the subexp's
1522 first match. Copy back the old content of the registers
1523 so that matches of an inner subexpression are undone as
1524 well, like in ((a?))*. */
1525 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1526 else
1527 /* We completed a subexpression, but it may be part of
1528 an optional one, so do not update PREV_IDX_MATCH. */
1529 pmatch[reg_num].rm_eo = cur_idx;
1530 }
1531 }
1532 }
1533}
1534
1535/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1536 and sift the nodes in each states according to the following rules.
1537 Updated state_log will be wrote to STATE_LOG.
1538
1539 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1540 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1541 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1542 the LAST_NODE, we throw away the node `a'.
1543 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1544 string `s' and transit to `b':
1545 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1546 away the node `a'.
1547 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1548 thrown away, we throw away the node `a'.
1549 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1550 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1551 node `a'.
1552 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1553 we throw away the node `a'. */
1554
1555#define STATE_NODE_CONTAINS(state,node) \
1556 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1557
1558static reg_errcode_t
1559sift_states_backward (mctx, sctx)
1560 re_match_context_t *mctx;
1561 re_sift_context_t *sctx;
1562{
1563 reg_errcode_t err;
1564 int null_cnt = 0;
1565 int str_idx = sctx->last_str_idx;
1566 re_node_set cur_dest;
1567
1568#ifdef DEBUG
1569 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1570#endif
1571
1572 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1573 transit to the last_node and the last_node itself. */
1574 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1575 if (BE (err != REG_NOERROR, 0))
1576 return err;
1577 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1578 if (BE (err != REG_NOERROR, 0))
1579 goto free_return;
1580
1581 /* Then check each states in the state_log. */
1582 while (str_idx > 0)
1583 {
1584 /* Update counters. */
1585 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1586 if (null_cnt > mctx->max_mb_elem_len)
1587 {
1588 memset (sctx->sifted_states, '\0',
1589 sizeof (re_dfastate_t *) * str_idx);
1590 re_node_set_free (&cur_dest);
1591 return REG_NOERROR;
1592 }
1593 re_node_set_empty (&cur_dest);
1594 --str_idx;
1595
1596 if (mctx->state_log[str_idx])
1597 {
1598 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1599 if (BE (err != REG_NOERROR, 0))
1600 goto free_return;
1601 }
1602
1603 /* Add all the nodes which satisfy the following conditions:
1604 - It can epsilon transit to a node in CUR_DEST.
1605 - It is in CUR_SRC.
1606 And update state_log. */
1607 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1608 if (BE (err != REG_NOERROR, 0))
1609 goto free_return;
1610 }
1611 err = REG_NOERROR;
1612 free_return:
1613 re_node_set_free (&cur_dest);
1614 return err;
1615}
1616
1617static reg_errcode_t
1618build_sifted_states (mctx, sctx, str_idx, cur_dest)
1619 re_match_context_t *mctx;
1620 re_sift_context_t *sctx;
1621 int str_idx;
1622 re_node_set *cur_dest;
1623{
1624 re_dfa_t *const dfa = mctx->dfa;
1625 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1626 int i;
1627
1628 /* Then build the next sifted state.
1629 We build the next sifted state on `cur_dest', and update
1630 `sifted_states[str_idx]' with `cur_dest'.
1631 Note:
1632 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1633 `cur_src' points the node_set of the old `state_log[str_idx]'
1634 (with the epsilon nodes pre-filtered out). */
1635 for (i = 0; i < cur_src->nelem; i++)
1636 {
1637 int prev_node = cur_src->elems[i];
1638 int naccepted = 0;
1639 int ret;
1640
1641#ifdef DEBUG
1642 re_token_type_t type = dfa->nodes[prev_node].type;
1643 assert (!IS_EPSILON_NODE (type));
1644#endif
1645#ifdef RE_ENABLE_I18N
1646 /* If the node may accept `multi byte'. */
1647 if (dfa->nodes[prev_node].accept_mb)
1648 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1649 str_idx, sctx->last_str_idx);
1650#endif /* RE_ENABLE_I18N */
1651
1652 /* We don't check backreferences here.
1653 See update_cur_sifted_state(). */
1654 if (!naccepted
1655 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1656 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1657 dfa->nexts[prev_node]))
1658 naccepted = 1;
1659
1660 if (naccepted == 0)
1661 continue;
1662
1663 if (sctx->limits.nelem)
1664 {
1665 int to_idx = str_idx + naccepted;
1666 if (check_dst_limits (mctx, &sctx->limits,
1667 dfa->nexts[prev_node], to_idx,
1668 prev_node, str_idx))
1669 continue;
1670 }
1671 ret = re_node_set_insert (cur_dest, prev_node);
1672 if (BE (ret == -1, 0))
1673 return REG_ESPACE;
1674 }
1675
1676 return REG_NOERROR;
1677}
1678
1679/* Helper functions. */
1680
1681static reg_errcode_t
1682clean_state_log_if_needed (mctx, next_state_log_idx)
1683 re_match_context_t *mctx;
1684 int next_state_log_idx;
1685{
1686 int top = mctx->state_log_top;
1687
1688 if (next_state_log_idx >= mctx->input.bufs_len
1689 || (next_state_log_idx >= mctx->input.valid_len
1690 && mctx->input.valid_len < mctx->input.len))
1691 {
1692 reg_errcode_t err;
1693 err = extend_buffers (mctx);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1696 }
1697
1698 if (top < next_state_log_idx)
1699 {
1700 memset (mctx->state_log + top + 1, '\0',
1701 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1702 mctx->state_log_top = next_state_log_idx;
1703 }
1704 return REG_NOERROR;
1705}
1706
1707static reg_errcode_t
1708merge_state_array (dfa, dst, src, num)
1709 re_dfa_t *dfa;
1710 re_dfastate_t **dst;
1711 re_dfastate_t **src;
1712 int num;
1713{
1714 int st_idx;
1715 reg_errcode_t err;
1716 for (st_idx = 0; st_idx < num; ++st_idx)
1717 {
1718 if (dst[st_idx] == NULL)
1719 dst[st_idx] = src[st_idx];
1720 else if (src[st_idx] != NULL)
1721 {
1722 re_node_set merged_set;
1723 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1724 &src[st_idx]->nodes);
1725 if (BE (err != REG_NOERROR, 0))
1726 return err;
1727 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1728 re_node_set_free (&merged_set);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1731 }
1732 }
1733 return REG_NOERROR;
1734}
1735
1736static reg_errcode_t
1737update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1738 re_match_context_t *mctx;
1739 re_sift_context_t *sctx;
1740 int str_idx;
1741 re_node_set *dest_nodes;
1742{
1743 re_dfa_t *const dfa = mctx->dfa;
1744 reg_errcode_t err;
1745 const re_node_set *candidates;
1746 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1747 : &mctx->state_log[str_idx]->nodes);
1748
1749 if (dest_nodes->nelem == 0)
1750 sctx->sifted_states[str_idx] = NULL;
1751 else
1752 {
1753 if (candidates)
1754 {
1755 /* At first, add the nodes which can epsilon transit to a node in
1756 DEST_NODE. */
1757 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1758 if (BE (err != REG_NOERROR, 0))
1759 return err;
1760
1761 /* Then, check the limitations in the current sift_context. */
1762 if (sctx->limits.nelem)
1763 {
1764 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1765 mctx->bkref_ents, str_idx);
1766 if (BE (err != REG_NOERROR, 0))
1767 return err;
1768 }
1769 }
1770
1771 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1772 if (BE (err != REG_NOERROR, 0))
1773 return err;
1774 }
1775
1776 if (candidates && mctx->state_log[str_idx]->has_backref)
1777 {
1778 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1779 if (BE (err != REG_NOERROR, 0))
1780 return err;
1781 }
1782 return REG_NOERROR;
1783}
1784
1785static reg_errcode_t
1786add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1787 re_dfa_t *dfa;
1788 re_node_set *dest_nodes;
1789 const re_node_set *candidates;
1790{
1791 reg_errcode_t err = REG_NOERROR;
1792 int i;
1793
1794 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1795 if (BE (err != REG_NOERROR, 0))
1796 return err;
1797
1798 if (!state->inveclosure.alloc)
1799 {
1800 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1801 if (BE (err != REG_NOERROR, 0))
1802 return REG_ESPACE;
1803 for (i = 0; i < dest_nodes->nelem; i++)
1804 re_node_set_merge (&state->inveclosure,
1805 dfa->inveclosures + dest_nodes->elems[i]);
1806 }
1807 return re_node_set_add_intersect (dest_nodes, candidates,
1808 &state->inveclosure);
1809}
1810
1811static reg_errcode_t
1812sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1813 re_dfa_t *dfa;
1814 int node;
1815 re_node_set *dest_nodes;
1816 const re_node_set *candidates;
1817{
1818 int ecl_idx;
1819 reg_errcode_t err;
1820 re_node_set *inv_eclosure = dfa->inveclosures + node;
1821 re_node_set except_nodes;
1822 re_node_set_init_empty (&except_nodes);
1823 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1824 {
1825 int cur_node = inv_eclosure->elems[ecl_idx];
1826 if (cur_node == node)
1827 continue;
1828 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1829 {
1830 int edst1 = dfa->edests[cur_node].elems[0];
1831 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1832 ? dfa->edests[cur_node].elems[1] : -1);
1833 if ((!re_node_set_contains (inv_eclosure, edst1)
1834 && re_node_set_contains (dest_nodes, edst1))
1835 || (edst2 > 0
1836 && !re_node_set_contains (inv_eclosure, edst2)
1837 && re_node_set_contains (dest_nodes, edst2)))
1838 {
1839 err = re_node_set_add_intersect (&except_nodes, candidates,
1840 dfa->inveclosures + cur_node);
1841 if (BE (err != REG_NOERROR, 0))
1842 {
1843 re_node_set_free (&except_nodes);
1844 return err;
1845 }
1846 }
1847 }
1848 }
1849 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1850 {
1851 int cur_node = inv_eclosure->elems[ecl_idx];
1852 if (!re_node_set_contains (&except_nodes, cur_node))
1853 {
1854 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1855 re_node_set_remove_at (dest_nodes, idx);
1856 }
1857 }
1858 re_node_set_free (&except_nodes);
1859 return REG_NOERROR;
1860}
1861
1862static int
1863check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1864 re_match_context_t *mctx;
1865 re_node_set *limits;
1866 int dst_node, dst_idx, src_node, src_idx;
1867{
1868 re_dfa_t *const dfa = mctx->dfa;
1869 int lim_idx, src_pos, dst_pos;
1870
1871 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1872 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1873 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1874 {
1875 int subexp_idx;
1876 struct re_backref_cache_entry *ent;
1877 ent = mctx->bkref_ents + limits->elems[lim_idx];
1878 subexp_idx = dfa->nodes[ent->node].opr.idx;
1879
1880 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881 subexp_idx, dst_node, dst_idx,
1882 dst_bkref_idx);
1883 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1884 subexp_idx, src_node, src_idx,
1885 src_bkref_idx);
1886
1887 /* In case of:
1888 <src> <dst> ( <subexp> )
1889 ( <subexp> ) <src> <dst>
1890 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1891 if (src_pos == dst_pos)
1892 continue; /* This is unrelated limitation. */
1893 else
1894 return 1;
1895 }
1896 return 0;
1897}
1898
1899static int
1900check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1901 re_match_context_t *mctx;
1902 int boundaries, subexp_idx, from_node, bkref_idx;
1903{
1904 re_dfa_t *const dfa = mctx->dfa;
1905 re_node_set *eclosures = dfa->eclosures + from_node;
1906 int node_idx;
1907
1908 /* Else, we are on the boundary: examine the nodes on the epsilon
1909 closure. */
1910 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1911 {
1912 int node = eclosures->elems[node_idx];
1913 switch (dfa->nodes[node].type)
1914 {
1915 case OP_BACK_REF:
1916 if (bkref_idx != -1)
1917 {
1918 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1919 do
1920 {
1921 int dst, cpos;
1922
1923 if (ent->node != node)
1924 continue;
1925
1926 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1927 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1928 continue;
1929
1930 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1931 OP_CLOSE_SUBEXP cases below. But, if the
1932 destination node is the same node as the source
1933 node, don't recurse because it would cause an
1934 infinite loop: a regex that exhibits this behavior
1935 is ()\1*\1* */
1936 dst = dfa->edests[node].elems[0];
1937 if (dst == from_node)
1938 {
1939 if (boundaries & 1)
1940 return -1;
1941 else /* if (boundaries & 2) */
1942 return 0;
1943 }
1944
1945 cpos =
1946 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1947 dst, bkref_idx);
1948 if (cpos == -1 /* && (boundaries & 1) */)
1949 return -1;
1950 if (cpos == 0 && (boundaries & 2))
1951 return 0;
1952
1953 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1954 }
1955 while (ent++->more);
1956 }
1957 break;
1958
1959 case OP_OPEN_SUBEXP:
1960 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961 return -1;
1962 break;
1963
1964 case OP_CLOSE_SUBEXP:
1965 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966 return 0;
1967 break;
1968
1969 default:
1970 break;
1971 }
1972 }
1973
1974 return (boundaries & 2) ? 1 : 0;
1975}
1976
1977static int
1978check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1979 re_match_context_t *mctx;
1980 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1981{
1982 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983 int boundaries;
1984
1985 /* If we are outside the range of the subexpression, return -1 or 1. */
1986 if (str_idx < lim->subexp_from)
1987 return -1;
1988
1989 if (lim->subexp_to < str_idx)
1990 return 1;
1991
1992 /* If we are within the subexpression, return 0. */
1993 boundaries = (str_idx == lim->subexp_from);
1994 boundaries |= (str_idx == lim->subexp_to) << 1;
1995 if (boundaries == 0)
1996 return 0;
1997
1998 /* Else, examine epsilon closure. */
1999 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000 from_node, bkref_idx);
2001}
2002
2003/* Check the limitations of sub expressions LIMITS, and remove the nodes
2004 which are against limitations from DEST_NODES. */
2005
2006static reg_errcode_t
2007check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2008 re_dfa_t *dfa;
2009 re_node_set *dest_nodes;
2010 const re_node_set *candidates;
2011 re_node_set *limits;
2012 struct re_backref_cache_entry *bkref_ents;
2013 int str_idx;
2014{
2015 reg_errcode_t err;
2016 int node_idx, lim_idx;
2017
2018 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2019 {
2020 int subexp_idx;
2021 struct re_backref_cache_entry *ent;
2022 ent = bkref_ents + limits->elems[lim_idx];
2023
2024 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2025 continue; /* This is unrelated limitation. */
2026
2027 subexp_idx = dfa->nodes[ent->node].opr.idx;
2028 if (ent->subexp_to == str_idx)
2029 {
2030 int ops_node = -1;
2031 int cls_node = -1;
2032 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2033 {
2034 int node = dest_nodes->elems[node_idx];
2035 re_token_type_t type = dfa->nodes[node].type;
2036 if (type == OP_OPEN_SUBEXP
2037 && subexp_idx == dfa->nodes[node].opr.idx)
2038 ops_node = node;
2039 else if (type == OP_CLOSE_SUBEXP
2040 && subexp_idx == dfa->nodes[node].opr.idx)
2041 cls_node = node;
2042 }
2043
2044 /* Check the limitation of the open subexpression. */
2045 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2046 if (ops_node >= 0)
2047 {
2048 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2049 candidates);
2050 if (BE (err != REG_NOERROR, 0))
2051 return err;
2052 }
2053
2054 /* Check the limitation of the close subexpression. */
2055 if (cls_node >= 0)
2056 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057 {
2058 int node = dest_nodes->elems[node_idx];
2059 if (!re_node_set_contains (dfa->inveclosures + node,
2060 cls_node)
2061 && !re_node_set_contains (dfa->eclosures + node,
2062 cls_node))
2063 {
2064 /* It is against this limitation.
2065 Remove it form the current sifted state. */
2066 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067 candidates);
2068 if (BE (err != REG_NOERROR, 0))
2069 return err;
2070 --node_idx;
2071 }
2072 }
2073 }
2074 else /* (ent->subexp_to != str_idx) */
2075 {
2076 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2077 {
2078 int node = dest_nodes->elems[node_idx];
2079 re_token_type_t type = dfa->nodes[node].type;
2080 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2081 {
2082 if (subexp_idx != dfa->nodes[node].opr.idx)
2083 continue;
2084 /* It is against this limitation.
2085 Remove it form the current sifted state. */
2086 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2087 candidates);
2088 if (BE (err != REG_NOERROR, 0))
2089 return err;
2090 }
2091 }
2092 }
2093 }
2094 return REG_NOERROR;
2095}
2096
2097static reg_errcode_t
2098sift_states_bkref (mctx, sctx, str_idx, candidates)
2099 re_match_context_t *mctx;
2100 re_sift_context_t *sctx;
2101 int str_idx;
2102 const re_node_set *candidates;
2103{
2104 re_dfa_t *const dfa = mctx->dfa;
2105 reg_errcode_t err;
2106 int node_idx, node;
2107 re_sift_context_t local_sctx;
2108 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2109
2110 if (first_idx == -1)
2111 return REG_NOERROR;
2112
2113 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2114
2115 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2116 {
2117 int enabled_idx;
2118 re_token_type_t type;
2119 struct re_backref_cache_entry *entry;
2120 node = candidates->elems[node_idx];
2121 type = dfa->nodes[node].type;
2122 /* Avoid infinite loop for the REs like "()\1+". */
2123 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2124 continue;
2125 if (type != OP_BACK_REF)
2126 continue;
2127
2128 entry = mctx->bkref_ents + first_idx;
2129 enabled_idx = first_idx;
2130 do
2131 {
2132 int subexp_len, to_idx, dst_node;
2133 re_dfastate_t *cur_state;
2134
2135 if (entry->node != node)
2136 continue;
2137 subexp_len = entry->subexp_to - entry->subexp_from;
2138 to_idx = str_idx + subexp_len;
2139 dst_node = (subexp_len ? dfa->nexts[node]
2140 : dfa->edests[node].elems[0]);
2141
2142 if (to_idx > sctx->last_str_idx
2143 || sctx->sifted_states[to_idx] == NULL
2144 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2145 || check_dst_limits (mctx, &sctx->limits, node,
2146 str_idx, dst_node, to_idx))
2147 continue;
2148
2149 if (local_sctx.sifted_states == NULL)
2150 {
2151 local_sctx = *sctx;
2152 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2153 if (BE (err != REG_NOERROR, 0))
2154 goto free_return;
2155 }
2156 local_sctx.last_node = node;
2157 local_sctx.last_str_idx = str_idx;
2158 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2159 if (BE (err < 0, 0))
2160 {
2161 err = REG_ESPACE;
2162 goto free_return;
2163 }
2164 cur_state = local_sctx.sifted_states[str_idx];
2165 err = sift_states_backward (mctx, &local_sctx);
2166 if (BE (err != REG_NOERROR, 0))
2167 goto free_return;
2168 if (sctx->limited_states != NULL)
2169 {
2170 err = merge_state_array (dfa, sctx->limited_states,
2171 local_sctx.sifted_states,
2172 str_idx + 1);
2173 if (BE (err != REG_NOERROR, 0))
2174 goto free_return;
2175 }
2176 local_sctx.sifted_states[str_idx] = cur_state;
2177 re_node_set_remove (&local_sctx.limits, enabled_idx);
2178
2179 /* mctx->bkref_ents may have changed, reload the pointer. */
2180 entry = mctx->bkref_ents + enabled_idx;
2181 }
2182 while (enabled_idx++, entry++->more);
2183 }
2184 err = REG_NOERROR;
2185 free_return:
2186 if (local_sctx.sifted_states != NULL)
2187 {
2188 re_node_set_free (&local_sctx.limits);
2189 }
2190
2191 return err;
2192}
2193
2194
2195#ifdef RE_ENABLE_I18N
2196static int
2197sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2198 const re_match_context_t *mctx;
2199 re_sift_context_t *sctx;
2200 int node_idx, str_idx, max_str_idx;
2201{
2202 re_dfa_t *const dfa = mctx->dfa;
2203 int naccepted;
2204 /* Check the node can accept `multi byte'. */
2205 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2206 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2207 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2208 dfa->nexts[node_idx]))
2209 /* The node can't accept the `multi byte', or the
2210 destination was already thrown away, then the node
2211 could't accept the current input `multi byte'. */
2212 naccepted = 0;
2213 /* Otherwise, it is sure that the node could accept
2214 `naccepted' bytes input. */
2215 return naccepted;
2216}
2217#endif /* RE_ENABLE_I18N */
2218
2219
2220
2221/* Functions for state transition. */
2222
2223/* Return the next state to which the current state STATE will transit by
2224 accepting the current input byte, and update STATE_LOG if necessary.
2225 If STATE can accept a multibyte char/collating element/back reference
2226 update the destination of STATE_LOG. */
2227
2228static re_dfastate_t *
2229transit_state (err, mctx, state)
2230 reg_errcode_t *err;
2231 re_match_context_t *mctx;
2232 re_dfastate_t *state;
2233{
2234 re_dfastate_t **trtable;
2235 unsigned char ch;
2236
2237#ifdef RE_ENABLE_I18N
2238 /* If the current state can accept multibyte. */
2239 if (BE (state->accept_mb, 0))
2240 {
2241 *err = transit_state_mb (mctx, state);
2242 if (BE (*err != REG_NOERROR, 0))
2243 return NULL;
2244 }
2245#endif /* RE_ENABLE_I18N */
2246
2247 /* Then decide the next state with the single byte. */
2248#if 0
2249 if (0)
2250 /* don't use transition table */
2251 return transit_state_sb (err, mctx, state);
2252#endif
2253
2254 /* Use transition table */
2255 ch = re_string_fetch_byte (&mctx->input);
2256 for (;;)
2257 {
2258 trtable = state->trtable;
2259 if (BE (trtable != NULL, 1))
2260 return trtable[ch];
2261
2262 trtable = state->word_trtable;
2263 if (BE (trtable != NULL, 1))
2264 {
2265 unsigned int context;
2266 context
2267 = re_string_context_at (&mctx->input,
2268 re_string_cur_idx (&mctx->input) - 1,
2269 mctx->eflags);
2270 if (IS_WORD_CONTEXT (context))
2271 return trtable[ch + SBC_MAX];
2272 else
2273 return trtable[ch];
2274 }
2275
2276 if (!build_trtable (mctx->dfa, state))
2277 {
2278 *err = REG_ESPACE;
2279 return NULL;
2280 }
2281
2282 /* Retry, we now have a transition table. */
2283 }
2284}
2285
2286/* Update the state_log if we need */
2287re_dfastate_t *
2288merge_state_with_log (err, mctx, next_state)
2289 reg_errcode_t *err;
2290 re_match_context_t *mctx;
2291 re_dfastate_t *next_state;
2292{
2293 re_dfa_t *const dfa = mctx->dfa;
2294 int cur_idx = re_string_cur_idx (&mctx->input);
2295
2296 if (cur_idx > mctx->state_log_top)
2297 {
2298 mctx->state_log[cur_idx] = next_state;
2299 mctx->state_log_top = cur_idx;
2300 }
2301 else if (mctx->state_log[cur_idx] == 0)
2302 {
2303 mctx->state_log[cur_idx] = next_state;
2304 }
2305 else
2306 {
2307 re_dfastate_t *pstate;
2308 unsigned int context;
2309 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2310 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2311 the destination of a multibyte char/collating element/
2312 back reference. Then the next state is the union set of
2313 these destinations and the results of the transition table. */
2314 pstate = mctx->state_log[cur_idx];
2315 log_nodes = pstate->entrance_nodes;
2316 if (next_state != NULL)
2317 {
2318 table_nodes = next_state->entrance_nodes;
2319 *err = re_node_set_init_union (&next_nodes, table_nodes,
2320 log_nodes);
2321 if (BE (*err != REG_NOERROR, 0))
2322 return NULL;
2323 }
2324 else
2325 next_nodes = *log_nodes;
2326 /* Note: We already add the nodes of the initial state,
2327 then we don't need to add them here. */
2328
2329 context = re_string_context_at (&mctx->input,
2330 re_string_cur_idx (&mctx->input) - 1,
2331 mctx->eflags);
2332 next_state = mctx->state_log[cur_idx]
2333 = re_acquire_state_context (err, dfa, &next_nodes, context);
2334 /* We don't need to check errors here, since the return value of
2335 this function is next_state and ERR is already set. */
2336
2337 if (table_nodes != NULL)
2338 re_node_set_free (&next_nodes);
2339 }
2340
2341 if (BE (dfa->nbackref, 0) && next_state != NULL)
2342 {
2343 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2344 later. We must check them here, since the back references in the
2345 next state might use them. */
2346 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2347 cur_idx);
2348 if (BE (*err != REG_NOERROR, 0))
2349 return NULL;
2350
2351 /* If the next state has back references. */
2352 if (next_state->has_backref)
2353 {
2354 *err = transit_state_bkref (mctx, &next_state->nodes);
2355 if (BE (*err != REG_NOERROR, 0))
2356 return NULL;
2357 next_state = mctx->state_log[cur_idx];
2358 }
2359 }
2360
2361 return next_state;
2362}
2363
2364/* Skip bytes in the input that correspond to part of a
2365 multi-byte match, then look in the log for a state
2366 from which to restart matching. */
2367re_dfastate_t *
2368find_recover_state (err, mctx)
2369 reg_errcode_t *err;
2370 re_match_context_t *mctx;
2371{
2372 re_dfastate_t *cur_state = NULL;
2373 do
2374 {
2375 int max = mctx->state_log_top;
2376 int cur_str_idx = re_string_cur_idx (&mctx->input);
2377
2378 do
2379 {
2380 if (++cur_str_idx > max)
2381 return NULL;
2382 re_string_skip_bytes (&mctx->input, 1);
2383 }
2384 while (mctx->state_log[cur_str_idx] == NULL);
2385
2386 cur_state = merge_state_with_log (err, mctx, NULL);
2387 }
2388 while (err == REG_NOERROR && cur_state == NULL);
2389 return cur_state;
2390}
2391
2392/* Helper functions for transit_state. */
2393
2394/* From the node set CUR_NODES, pick up the nodes whose types are
2395 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2396 expression. And register them to use them later for evaluating the
2397 correspoding back references. */
2398
2399static reg_errcode_t
2400check_subexp_matching_top (mctx, cur_nodes, str_idx)
2401 re_match_context_t *mctx;
2402 re_node_set *cur_nodes;
2403 int str_idx;
2404{
2405 re_dfa_t *const dfa = mctx->dfa;
2406 int node_idx;
2407 reg_errcode_t err;
2408
2409 /* TODO: This isn't efficient.
2410 Because there might be more than one nodes whose types are
2411 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2412 nodes.
2413 E.g. RE: (a){2} */
2414 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2415 {
2416 int node = cur_nodes->elems[node_idx];
2417 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2418 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2419 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2420 {
2421 err = match_ctx_add_subtop (mctx, node, str_idx);
2422 if (BE (err != REG_NOERROR, 0))
2423 return err;
2424 }
2425 }
2426 return REG_NOERROR;
2427}
2428
2429#if 0
2430/* Return the next state to which the current state STATE will transit by
2431 accepting the current input byte. */
2432
2433static re_dfastate_t *
2434transit_state_sb (err, mctx, state)
2435 reg_errcode_t *err;
2436 re_match_context_t *mctx;
2437 re_dfastate_t *state;
2438{
2439 re_dfa_t *const dfa = mctx->dfa;
2440 re_node_set next_nodes;
2441 re_dfastate_t *next_state;
2442 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2443 unsigned int context;
2444
2445 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2446 if (BE (*err != REG_NOERROR, 0))
2447 return NULL;
2448 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2449 {
2450 int cur_node = state->nodes.elems[node_cnt];
2451 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2452 {
2453 *err = re_node_set_merge (&next_nodes,
2454 dfa->eclosures + dfa->nexts[cur_node]);
2455 if (BE (*err != REG_NOERROR, 0))
2456 {
2457 re_node_set_free (&next_nodes);
2458 return NULL;
2459 }
2460 }
2461 }
2462 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2463 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2464 /* We don't need to check errors here, since the return value of
2465 this function is next_state and ERR is already set. */
2466
2467 re_node_set_free (&next_nodes);
2468 re_string_skip_bytes (&mctx->input, 1);
2469 return next_state;
2470}
2471#endif
2472
2473#ifdef RE_ENABLE_I18N
2474static reg_errcode_t
2475transit_state_mb (mctx, pstate)
2476 re_match_context_t *mctx;
2477 re_dfastate_t *pstate;
2478{
2479 re_dfa_t *const dfa = mctx->dfa;
2480 reg_errcode_t err;
2481 int i;
2482
2483 for (i = 0; i < pstate->nodes.nelem; ++i)
2484 {
2485 re_node_set dest_nodes, *new_nodes;
2486 int cur_node_idx = pstate->nodes.elems[i];
2487 int naccepted, dest_idx;
2488 unsigned int context;
2489 re_dfastate_t *dest_state;
2490
2491 if (!dfa->nodes[cur_node_idx].accept_mb)
2492 continue;
2493
2494 if (dfa->nodes[cur_node_idx].constraint)
2495 {
2496 context = re_string_context_at (&mctx->input,
2497 re_string_cur_idx (&mctx->input),
2498 mctx->eflags);
2499 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2500 context))
2501 continue;
2502 }
2503
2504 /* How many bytes the node can accept? */
2505 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2506 re_string_cur_idx (&mctx->input));
2507 if (naccepted == 0)
2508 continue;
2509
2510 /* The node can accepts `naccepted' bytes. */
2511 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2512 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2513 : mctx->max_mb_elem_len);
2514 err = clean_state_log_if_needed (mctx, dest_idx);
2515 if (BE (err != REG_NOERROR, 0))
2516 return err;
2517#ifdef DEBUG
2518 assert (dfa->nexts[cur_node_idx] != -1);
2519#endif
2520 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2521
2522 dest_state = mctx->state_log[dest_idx];
2523 if (dest_state == NULL)
2524 dest_nodes = *new_nodes;
2525 else
2526 {
2527 err = re_node_set_init_union (&dest_nodes,
2528 dest_state->entrance_nodes, new_nodes);
2529 if (BE (err != REG_NOERROR, 0))
2530 return err;
2531 }
2532 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2533 mctx->state_log[dest_idx]
2534 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2535 if (dest_state != NULL)
2536 re_node_set_free (&dest_nodes);
2537 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2538 return err;
2539 }
2540 return REG_NOERROR;
2541}
2542#endif /* RE_ENABLE_I18N */
2543
2544static reg_errcode_t
2545transit_state_bkref (mctx, nodes)
2546 re_match_context_t *mctx;
2547 const re_node_set *nodes;
2548{
2549 re_dfa_t *const dfa = mctx->dfa;
2550 reg_errcode_t err;
2551 int i;
2552 int cur_str_idx = re_string_cur_idx (&mctx->input);
2553
2554 for (i = 0; i < nodes->nelem; ++i)
2555 {
2556 int dest_str_idx, prev_nelem, bkc_idx;
2557 int node_idx = nodes->elems[i];
2558 unsigned int context;
2559 const re_token_t *node = dfa->nodes + node_idx;
2560 re_node_set *new_dest_nodes;
2561
2562 /* Check whether `node' is a backreference or not. */
2563 if (node->type != OP_BACK_REF)
2564 continue;
2565
2566 if (node->constraint)
2567 {
2568 context = re_string_context_at (&mctx->input, cur_str_idx,
2569 mctx->eflags);
2570 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2571 continue;
2572 }
2573
2574 /* `node' is a backreference.
2575 Check the substring which the substring matched. */
2576 bkc_idx = mctx->nbkref_ents;
2577 err = get_subexp (mctx, node_idx, cur_str_idx);
2578 if (BE (err != REG_NOERROR, 0))
2579 goto free_return;
2580
2581 /* And add the epsilon closures (which is `new_dest_nodes') of
2582 the backreference to appropriate state_log. */
2583#ifdef DEBUG
2584 assert (dfa->nexts[node_idx] != -1);
2585#endif
2586 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2587 {
2588 int subexp_len;
2589 re_dfastate_t *dest_state;
2590 struct re_backref_cache_entry *bkref_ent;
2591 bkref_ent = mctx->bkref_ents + bkc_idx;
2592 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2593 continue;
2594 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2595 new_dest_nodes = (subexp_len == 0
2596 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2597 : dfa->eclosures + dfa->nexts[node_idx]);
2598 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2599 - bkref_ent->subexp_from);
2600 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2601 mctx->eflags);
2602 dest_state = mctx->state_log[dest_str_idx];
2603 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2604 : mctx->state_log[cur_str_idx]->nodes.nelem);
2605 /* Add `new_dest_node' to state_log. */
2606 if (dest_state == NULL)
2607 {
2608 mctx->state_log[dest_str_idx]
2609 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2610 context);
2611 if (BE (mctx->state_log[dest_str_idx] == NULL
2612 && err != REG_NOERROR, 0))
2613 goto free_return;
2614 }
2615 else
2616 {
2617 re_node_set dest_nodes;
2618 err = re_node_set_init_union (&dest_nodes,
2619 dest_state->entrance_nodes,
2620 new_dest_nodes);
2621 if (BE (err != REG_NOERROR, 0))
2622 {
2623 re_node_set_free (&dest_nodes);
2624 goto free_return;
2625 }
2626 mctx->state_log[dest_str_idx]
2627 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2628 re_node_set_free (&dest_nodes);
2629 if (BE (mctx->state_log[dest_str_idx] == NULL
2630 && err != REG_NOERROR, 0))
2631 goto free_return;
2632 }
2633 /* We need to check recursively if the backreference can epsilon
2634 transit. */
2635 if (subexp_len == 0
2636 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2637 {
2638 err = check_subexp_matching_top (mctx, new_dest_nodes,
2639 cur_str_idx);
2640 if (BE (err != REG_NOERROR, 0))
2641 goto free_return;
2642 err = transit_state_bkref (mctx, new_dest_nodes);
2643 if (BE (err != REG_NOERROR, 0))
2644 goto free_return;
2645 }
2646 }
2647 }
2648 err = REG_NOERROR;
2649 free_return:
2650 return err;
2651}
2652
2653/* Enumerate all the candidates which the backreference BKREF_NODE can match
2654 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2655 Note that we might collect inappropriate candidates here.
2656 However, the cost of checking them strictly here is too high, then we
2657 delay these checking for prune_impossible_nodes(). */
2658
2659static reg_errcode_t
2660get_subexp (mctx, bkref_node, bkref_str_idx)
2661 re_match_context_t *mctx;
2662 int bkref_node, bkref_str_idx;
2663{
2664 re_dfa_t *const dfa = mctx->dfa;
2665 int subexp_num, sub_top_idx;
2666 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2667 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2668 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2669 if (cache_idx != -1)
2670 {
2671 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2672 do
2673 if (entry->node == bkref_node)
2674 return REG_NOERROR; /* We already checked it. */
2675 while (entry++->more);
2676 }
2677
2678 subexp_num = dfa->nodes[bkref_node].opr.idx;
2679
2680 /* For each sub expression */
2681 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2682 {
2683 reg_errcode_t err;
2684 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2685 re_sub_match_last_t *sub_last;
2686 int sub_last_idx, sl_str, bkref_str_off;
2687
2688 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2689 continue; /* It isn't related. */
2690
2691 sl_str = sub_top->str_idx;
2692 bkref_str_off = bkref_str_idx;
2693 /* At first, check the last node of sub expressions we already
2694 evaluated. */
2695 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2696 {
2697 int sl_str_diff;
2698 sub_last = sub_top->lasts[sub_last_idx];
2699 sl_str_diff = sub_last->str_idx - sl_str;
2700 /* The matched string by the sub expression match with the substring
2701 at the back reference? */
2702 if (sl_str_diff > 0)
2703 {
2704 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2705 {
2706 /* Not enough chars for a successful match. */
2707 if (bkref_str_off + sl_str_diff > mctx->input.len)
2708 break;
2709
2710 err = clean_state_log_if_needed (mctx,
2711 bkref_str_off
2712 + sl_str_diff);
2713 if (BE (err != REG_NOERROR, 0))
2714 return err;
2715 buf = (const char *) re_string_get_buffer (&mctx->input);
2716 }
2717 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2718 break; /* We don't need to search this sub expression any more. */
2719 }
2720 bkref_str_off += sl_str_diff;
2721 sl_str += sl_str_diff;
2722 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2723 bkref_str_idx);
2724
2725 /* Reload buf, since the preceding call might have reallocated
2726 the buffer. */
2727 buf = (const char *) re_string_get_buffer (&mctx->input);
2728
2729 if (err == REG_NOMATCH)
2730 continue;
2731 if (BE (err != REG_NOERROR, 0))
2732 return err;
2733 }
2734
2735 if (sub_last_idx < sub_top->nlasts)
2736 continue;
2737 if (sub_last_idx > 0)
2738 ++sl_str;
2739 /* Then, search for the other last nodes of the sub expression. */
2740 for (; sl_str <= bkref_str_idx; ++sl_str)
2741 {
2742 int cls_node, sl_str_off;
2743 const re_node_set *nodes;
2744 sl_str_off = sl_str - sub_top->str_idx;
2745 /* The matched string by the sub expression match with the substring
2746 at the back reference? */
2747 if (sl_str_off > 0)
2748 {
2749 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2750 {
2751 /* If we are at the end of the input, we cannot match. */
2752 if (bkref_str_off >= mctx->input.len)
2753 break;
2754
2755 err = extend_buffers (mctx);
2756 if (BE (err != REG_NOERROR, 0))
2757 return err;
2758
2759 buf = (const char *) re_string_get_buffer (&mctx->input);
2760 }
2761 if (buf [bkref_str_off++] != buf[sl_str - 1])
2762 break; /* We don't need to search this sub expression
2763 any more. */
2764 }
2765 if (mctx->state_log[sl_str] == NULL)
2766 continue;
2767 /* Does this state have a ')' of the sub expression? */
2768 nodes = &mctx->state_log[sl_str]->nodes;
2769 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2770 if (cls_node == -1)
2771 continue; /* No. */
2772 if (sub_top->path == NULL)
2773 {
2774 sub_top->path = calloc (sizeof (state_array_t),
2775 sl_str - sub_top->str_idx + 1);
2776 if (sub_top->path == NULL)
2777 return REG_ESPACE;
2778 }
2779 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2780 in the current context? */
2781 err = check_arrival (mctx, sub_top->path, sub_top->node,
2782 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2783 if (err == REG_NOMATCH)
2784 continue;
2785 if (BE (err != REG_NOERROR, 0))
2786 return err;
2787 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2788 if (BE (sub_last == NULL, 0))
2789 return REG_ESPACE;
2790 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2791 bkref_str_idx);
2792 if (err == REG_NOMATCH)
2793 continue;
2794 }
2795 }
2796 return REG_NOERROR;
2797}
2798
2799/* Helper functions for get_subexp(). */
2800
2801/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2802 If it can arrive, register the sub expression expressed with SUB_TOP
2803 and SUB_LAST. */
2804
2805static reg_errcode_t
2806get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2807 re_match_context_t *mctx;
2808 const re_sub_match_top_t *sub_top;
2809 re_sub_match_last_t *sub_last;
2810 int bkref_node, bkref_str;
2811{
2812 reg_errcode_t err;
2813 int to_idx;
2814 /* Can the subexpression arrive the back reference? */
2815 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2816 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2817 if (err != REG_NOERROR)
2818 return err;
2819 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2820 sub_last->str_idx);
2821 if (BE (err != REG_NOERROR, 0))
2822 return err;
2823 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2824 return clean_state_log_if_needed (mctx, to_idx);
2825}
2826
2827/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2828 Search '(' if FL_OPEN, or search ')' otherwise.
2829 TODO: This function isn't efficient...
2830 Because there might be more than one nodes whose types are
2831 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2832 nodes.
2833 E.g. RE: (a){2} */
2834
2835static int
2836find_subexp_node (dfa, nodes, subexp_idx, type)
2837 const re_dfa_t *dfa;
2838 const re_node_set *nodes;
2839 int subexp_idx, type;
2840{
2841 int cls_idx;
2842 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2843 {
2844 int cls_node = nodes->elems[cls_idx];
2845 const re_token_t *node = dfa->nodes + cls_node;
2846 if (node->type == type
2847 && node->opr.idx == subexp_idx)
2848 return cls_node;
2849 }
2850 return -1;
2851}
2852
2853/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2854 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2855 heavily reused.
2856 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2857
2858static reg_errcode_t
2859check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2860 type)
2861 re_match_context_t *mctx;
2862 state_array_t *path;
2863 int top_node, top_str, last_node, last_str, type;
2864{
2865 re_dfa_t *const dfa = mctx->dfa;
2866 reg_errcode_t err;
2867 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2868 re_dfastate_t *cur_state = NULL;
2869 re_node_set *cur_nodes, next_nodes;
2870 re_dfastate_t **backup_state_log;
2871 unsigned int context;
2872
2873 subexp_num = dfa->nodes[top_node].opr.idx;
2874 /* Extend the buffer if we need. */
2875 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2876 {
2877 re_dfastate_t **new_array;
2878 int old_alloc = path->alloc;
2879 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2880 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2881 if (new_array == NULL)
2882 {
2883 path->alloc = old_alloc;
2884 return REG_ESPACE;
2885 }
2886 path->array = new_array;
2887 memset (new_array + old_alloc, '\0',
2888 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2889 }
2890
2891 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2892
2893 /* Temporary modify MCTX. */
2894 backup_state_log = mctx->state_log;
2895 backup_cur_idx = mctx->input.cur_idx;
2896 mctx->state_log = path->array;
2897 mctx->input.cur_idx = str_idx;
2898
2899 /* Setup initial node set. */
2900 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2901 if (str_idx == top_str)
2902 {
2903 err = re_node_set_init_1 (&next_nodes, top_node);
2904 if (BE (err != REG_NOERROR, 0))
2905 return err;
2906 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2907 if (BE (err != REG_NOERROR, 0))
2908 {
2909 re_node_set_free (&next_nodes);
2910 return err;
2911 }
2912 }
2913 else
2914 {
2915 cur_state = mctx->state_log[str_idx];
2916 if (cur_state && cur_state->has_backref)
2917 {
2918 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2919 if (BE ( err != REG_NOERROR, 0))
2920 return err;
2921 }
2922 else
2923 re_node_set_init_empty (&next_nodes);
2924 }
2925 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2926 {
2927 if (next_nodes.nelem)
2928 {
2929 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2930 subexp_num, type);
2931 if (BE ( err != REG_NOERROR, 0))
2932 {
2933 re_node_set_free (&next_nodes);
2934 return err;
2935 }
2936 }
2937 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2938 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2939 {
2940 re_node_set_free (&next_nodes);
2941 return err;
2942 }
2943 mctx->state_log[str_idx] = cur_state;
2944 }
2945
2946 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2947 {
2948 re_node_set_empty (&next_nodes);
2949 if (mctx->state_log[str_idx + 1])
2950 {
2951 err = re_node_set_merge (&next_nodes,
2952 &mctx->state_log[str_idx + 1]->nodes);
2953 if (BE (err != REG_NOERROR, 0))
2954 {
2955 re_node_set_free (&next_nodes);
2956 return err;
2957 }
2958 }
2959 if (cur_state)
2960 {
2961 err = check_arrival_add_next_nodes (mctx, str_idx,
2962 &cur_state->non_eps_nodes, &next_nodes);
2963 if (BE (err != REG_NOERROR, 0))
2964 {
2965 re_node_set_free (&next_nodes);
2966 return err;
2967 }
2968 }
2969 ++str_idx;
2970 if (next_nodes.nelem)
2971 {
2972 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2973 if (BE (err != REG_NOERROR, 0))
2974 {
2975 re_node_set_free (&next_nodes);
2976 return err;
2977 }
2978 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2979 subexp_num, type);
2980 if (BE ( err != REG_NOERROR, 0))
2981 {
2982 re_node_set_free (&next_nodes);
2983 return err;
2984 }
2985 }
2986 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2987 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2988 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2989 {
2990 re_node_set_free (&next_nodes);
2991 return err;
2992 }
2993 mctx->state_log[str_idx] = cur_state;
2994 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2995 }
2996 re_node_set_free (&next_nodes);
2997 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2998 : &mctx->state_log[last_str]->nodes);
2999 path->next_idx = str_idx;
3000
3001 /* Fix MCTX. */
3002 mctx->state_log = backup_state_log;
3003 mctx->input.cur_idx = backup_cur_idx;
3004
3005 /* Then check the current node set has the node LAST_NODE. */
3006 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3007 return REG_NOERROR;
3008
3009 return REG_NOMATCH;
3010}
3011
3012/* Helper functions for check_arrival. */
3013
3014/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3015 to NEXT_NODES.
3016 TODO: This function is similar to the functions transit_state*(),
3017 however this function has many additional works.
3018 Can't we unify them? */
3019
3020static reg_errcode_t
3021check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3022 re_match_context_t *mctx;
3023 int str_idx;
3024 re_node_set *cur_nodes, *next_nodes;
3025{
3026 re_dfa_t *const dfa = mctx->dfa;
3027 int result;
3028 int cur_idx;
3029 re_node_set union_set;
3030 re_node_set_init_empty (&union_set);
3031 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3032 {
3033 int naccepted = 0;
3034 int cur_node = cur_nodes->elems[cur_idx];
3035#ifdef DEBUG
3036 re_token_type_t type = dfa->nodes[cur_node].type;
3037 assert (!IS_EPSILON_NODE (type));
3038#endif
3039#ifdef RE_ENABLE_I18N
3040 /* If the node may accept `multi byte'. */
3041 if (dfa->nodes[cur_node].accept_mb)
3042 {
3043 reg_errcode_t err;
3044
3045 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3046 str_idx);
3047 if (naccepted > 1)
3048 {
3049 re_dfastate_t *dest_state;
3050 int next_node = dfa->nexts[cur_node];
3051 int next_idx = str_idx + naccepted;
3052 dest_state = mctx->state_log[next_idx];
3053 re_node_set_empty (&union_set);
3054 if (dest_state)
3055 {
3056 err = re_node_set_merge (&union_set, &dest_state->nodes);
3057 if (BE (err != REG_NOERROR, 0))
3058 {
3059 re_node_set_free (&union_set);
3060 return err;
3061 }
3062 }
3063 result = re_node_set_insert (&union_set, next_node);
3064 if (BE (result < 0, 0))
3065 {
3066 re_node_set_free (&union_set);
3067 return REG_ESPACE;
3068 }
3069 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3070 &union_set);
3071 if (BE (mctx->state_log[next_idx] == NULL
3072 && err != REG_NOERROR, 0))
3073 {
3074 re_node_set_free (&union_set);
3075 return err;
3076 }
3077 }
3078 }
3079#endif /* RE_ENABLE_I18N */
3080 if (naccepted
3081 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3082 {
3083 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3084 if (BE (result < 0, 0))
3085 {
3086 re_node_set_free (&union_set);
3087 return REG_ESPACE;
3088 }
3089 }
3090 }
3091 re_node_set_free (&union_set);
3092 return REG_NOERROR;
3093}
3094
3095/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3096 CUR_NODES, however exclude the nodes which are:
3097 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3098 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3099*/
3100
3101static reg_errcode_t
3102check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3103 re_dfa_t *dfa;
3104 re_node_set *cur_nodes;
3105 int ex_subexp, type;
3106{
3107 reg_errcode_t err;
3108 int idx, outside_node;
3109 re_node_set new_nodes;
3110#ifdef DEBUG
3111 assert (cur_nodes->nelem);
3112#endif
3113 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3114 if (BE (err != REG_NOERROR, 0))
3115 return err;
3116 /* Create a new node set NEW_NODES with the nodes which are epsilon
3117 closures of the node in CUR_NODES. */
3118
3119 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3120 {
3121 int cur_node = cur_nodes->elems[idx];
3122 re_node_set *eclosure = dfa->eclosures + cur_node;
3123 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3124 if (outside_node == -1)
3125 {
3126 /* There are no problematic nodes, just merge them. */
3127 err = re_node_set_merge (&new_nodes, eclosure);
3128 if (BE (err != REG_NOERROR, 0))
3129 {
3130 re_node_set_free (&new_nodes);
3131 return err;
3132 }
3133 }
3134 else
3135 {
3136 /* There are problematic nodes, re-calculate incrementally. */
3137 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3138 ex_subexp, type);
3139 if (BE (err != REG_NOERROR, 0))
3140 {
3141 re_node_set_free (&new_nodes);
3142 return err;
3143 }
3144 }
3145 }
3146 re_node_set_free (cur_nodes);
3147 *cur_nodes = new_nodes;
3148 return REG_NOERROR;
3149}
3150
3151/* Helper function for check_arrival_expand_ecl.
3152 Check incrementally the epsilon closure of TARGET, and if it isn't
3153 problematic append it to DST_NODES. */
3154
3155static reg_errcode_t
3156check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3157 re_dfa_t *dfa;
3158 int target, ex_subexp, type;
3159 re_node_set *dst_nodes;
3160{
3161 int cur_node;
3162 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3163 {
3164 int err;
3165
3166 if (dfa->nodes[cur_node].type == type
3167 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3168 {
3169 if (type == OP_CLOSE_SUBEXP)
3170 {
3171 err = re_node_set_insert (dst_nodes, cur_node);
3172 if (BE (err == -1, 0))
3173 return REG_ESPACE;
3174 }
3175 break;
3176 }
3177 err = re_node_set_insert (dst_nodes, cur_node);
3178 if (BE (err == -1, 0))
3179 return REG_ESPACE;
3180 if (dfa->edests[cur_node].nelem == 0)
3181 break;
3182 if (dfa->edests[cur_node].nelem == 2)
3183 {
3184 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3185 dfa->edests[cur_node].elems[1],
3186 ex_subexp, type);
3187 if (BE (err != REG_NOERROR, 0))
3188 return err;
3189 }
3190 cur_node = dfa->edests[cur_node].elems[0];
3191 }
3192 return REG_NOERROR;
3193}
3194
3195
3196/* For all the back references in the current state, calculate the
3197 destination of the back references by the appropriate entry
3198 in MCTX->BKREF_ENTS. */
3199
3200static reg_errcode_t
3201expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3202 type)
3203 re_match_context_t *mctx;
3204 int cur_str, subexp_num, type;
3205 re_node_set *cur_nodes;
3206{
3207 re_dfa_t *const dfa = mctx->dfa;
3208 reg_errcode_t err;
3209 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3210 struct re_backref_cache_entry *ent;
3211
3212 if (cache_idx_start == -1)
3213 return REG_NOERROR;
3214
3215 restart:
3216 ent = mctx->bkref_ents + cache_idx_start;
3217 do
3218 {
3219 int to_idx, next_node;
3220
3221 /* Is this entry ENT is appropriate? */
3222 if (!re_node_set_contains (cur_nodes, ent->node))
3223 continue; /* No. */
3224
3225 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3226 /* Calculate the destination of the back reference, and append it
3227 to MCTX->STATE_LOG. */
3228 if (to_idx == cur_str)
3229 {
3230 /* The backreference did epsilon transit, we must re-check all the
3231 node in the current state. */
3232 re_node_set new_dests;
3233 reg_errcode_t err2, err3;
3234 next_node = dfa->edests[ent->node].elems[0];
3235 if (re_node_set_contains (cur_nodes, next_node))
3236 continue;
3237 err = re_node_set_init_1 (&new_dests, next_node);
3238 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3239 err3 = re_node_set_merge (cur_nodes, &new_dests);
3240 re_node_set_free (&new_dests);
3241 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3242 || err3 != REG_NOERROR, 0))
3243 {
3244 err = (err != REG_NOERROR ? err
3245 : (err2 != REG_NOERROR ? err2 : err3));
3246 return err;
3247 }
3248 /* TODO: It is still inefficient... */
3249 goto restart;
3250 }
3251 else
3252 {
3253 re_node_set union_set;
3254 next_node = dfa->nexts[ent->node];
3255 if (mctx->state_log[to_idx])
3256 {
3257 int ret;
3258 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3259 next_node))
3260 continue;
3261 err = re_node_set_init_copy (&union_set,
3262 &mctx->state_log[to_idx]->nodes);
3263 ret = re_node_set_insert (&union_set, next_node);
3264 if (BE (err != REG_NOERROR || ret < 0, 0))
3265 {
3266 re_node_set_free (&union_set);
3267 err = err != REG_NOERROR ? err : REG_ESPACE;
3268 return err;
3269 }
3270 }
3271 else
3272 {
3273 err = re_node_set_init_1 (&union_set, next_node);
3274 if (BE (err != REG_NOERROR, 0))
3275 return err;
3276 }
3277 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3278 re_node_set_free (&union_set);
3279 if (BE (mctx->state_log[to_idx] == NULL
3280 && err != REG_NOERROR, 0))
3281 return err;
3282 }
3283 }
3284 while (ent++->more);
3285 return REG_NOERROR;
3286}
3287
3288/* Build transition table for the state.
3289 Return 1 if succeeded, otherwise return NULL. */
3290
3291static int
3292build_trtable (dfa, state)
3293 re_dfa_t *dfa;
3294 re_dfastate_t *state;
3295{
3296 reg_errcode_t err;
3297 int i, j, ch, need_word_trtable = 0;
3298 unsigned int elem, mask;
3299 int dests_node_malloced = 0, dest_states_malloced = 0;
3300 int ndests; /* Number of the destination states from `state'. */
3301 re_dfastate_t **trtable;
3302 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3303 re_node_set follows, *dests_node;
3304 bitset *dests_ch;
3305 bitset acceptable;
3306
3307 /* We build DFA states which corresponds to the destination nodes
3308 from `state'. `dests_node[i]' represents the nodes which i-th
3309 destination state contains, and `dests_ch[i]' represents the
3310 characters which i-th destination state accepts. */
3311#ifdef _LIBC
3312 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3313 dests_node = (re_node_set *)
3314 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3315 else
3316#endif
3317 {
3318 dests_node = (re_node_set *)
3319 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3320 if (BE (dests_node == NULL, 0))
3321 return 0;
3322 dests_node_malloced = 1;
3323 }
3324 dests_ch = (bitset *) (dests_node + SBC_MAX);
3325
3326 /* Initialize transiton table. */
3327 state->word_trtable = state->trtable = NULL;
3328
3329 /* At first, group all nodes belonging to `state' into several
3330 destinations. */
3331 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3332 if (BE (ndests <= 0, 0))
3333 {
3334 if (dests_node_malloced)
3335 free (dests_node);
3336 /* Return 0 in case of an error, 1 otherwise. */
3337 if (ndests == 0)
3338 {
3339 state->trtable = (re_dfastate_t **)
3340 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3341 return 1;
3342 }
3343 return 0;
3344 }
3345
3346 err = re_node_set_alloc (&follows, ndests + 1);
3347 if (BE (err != REG_NOERROR, 0))
3348 goto out_free;
3349
3350#ifdef _LIBC
3351 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3352 + ndests * 3 * sizeof (re_dfastate_t *)))
3353 dest_states = (re_dfastate_t **)
3354 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3355 else
3356#endif
3357 {
3358 dest_states = (re_dfastate_t **)
3359 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3360 if (BE (dest_states == NULL, 0))
3361 {
3362out_free:
3363 if (dest_states_malloced)
3364 free (dest_states);
3365 re_node_set_free (&follows);
3366 for (i = 0; i < ndests; ++i)
3367 re_node_set_free (dests_node + i);
3368 if (dests_node_malloced)
3369 free (dests_node);
3370 return 0;
3371 }
3372 dest_states_malloced = 1;
3373 }
3374 dest_states_word = dest_states + ndests;
3375 dest_states_nl = dest_states_word + ndests;
3376 bitset_empty (acceptable);
3377
3378 /* Then build the states for all destinations. */
3379 for (i = 0; i < ndests; ++i)
3380 {
3381 int next_node;
3382 re_node_set_empty (&follows);
3383 /* Merge the follows of this destination states. */
3384 for (j = 0; j < dests_node[i].nelem; ++j)
3385 {
3386 next_node = dfa->nexts[dests_node[i].elems[j]];
3387 if (next_node != -1)
3388 {
3389 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3390 if (BE (err != REG_NOERROR, 0))
3391 goto out_free;
3392 }
3393 }
3394 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3395 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3396 goto out_free;
3397 /* If the new state has context constraint,
3398 build appropriate states for these contexts. */
3399 if (dest_states[i]->has_constraint)
3400 {
3401 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3402 CONTEXT_WORD);
3403 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3404 goto out_free;
3405
3406 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3407 need_word_trtable = 1;
3408
3409 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3410 CONTEXT_NEWLINE);
3411 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3412 goto out_free;
3413 }
3414 else
3415 {
3416 dest_states_word[i] = dest_states[i];
3417 dest_states_nl[i] = dest_states[i];
3418 }
3419 bitset_merge (acceptable, dests_ch[i]);
3420 }
3421
3422 if (!BE (need_word_trtable, 0))
3423 {
3424 /* We don't care about whether the following character is a word
3425 character, or we are in a single-byte character set so we can
3426 discern by looking at the character code: allocate a
3427 256-entry transition table. */
3428 trtable = state->trtable =
3429 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3430 if (BE (trtable == NULL, 0))
3431 goto out_free;
3432
3433 /* For all characters ch...: */
3434 for (i = 0; i < BITSET_UINTS; ++i)
3435 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3436 elem;
3437 mask <<= 1, elem >>= 1, ++ch)
3438 if (BE (elem & 1, 0))
3439 {
3440 /* There must be exactly one destination which accepts
3441 character ch. See group_nodes_into_DFAstates. */
3442 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3443 ;
3444
3445 /* j-th destination accepts the word character ch. */
3446 if (dfa->word_char[i] & mask)
3447 trtable[ch] = dest_states_word[j];
3448 else
3449 trtable[ch] = dest_states[j];
3450 }
3451 }
3452 else
3453 {
3454 /* We care about whether the following character is a word
3455 character, and we are in a multi-byte character set: discern
3456 by looking at the character code: build two 256-entry
3457 transition tables, one starting at trtable[0] and one
3458 starting at trtable[SBC_MAX]. */
3459 trtable = state->word_trtable =
3460 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3461 if (BE (trtable == NULL, 0))
3462 goto out_free;
3463
3464 /* For all characters ch...: */
3465 for (i = 0; i < BITSET_UINTS; ++i)
3466 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3467 elem;
3468 mask <<= 1, elem >>= 1, ++ch)
3469 if (BE (elem & 1, 0))
3470 {
3471 /* There must be exactly one destination which accepts
3472 character ch. See group_nodes_into_DFAstates. */
3473 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3474 ;
3475
3476 /* j-th destination accepts the word character ch. */
3477 trtable[ch] = dest_states[j];
3478 trtable[ch + SBC_MAX] = dest_states_word[j];
3479 }
3480 }
3481
3482 /* new line */
3483 if (bitset_contain (acceptable, NEWLINE_CHAR))
3484 {
3485 /* The current state accepts newline character. */
3486 for (j = 0; j < ndests; ++j)
3487 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3488 {
3489 /* k-th destination accepts newline character. */
3490 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3491 if (need_word_trtable)
3492 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3493 /* There must be only one destination which accepts
3494 newline. See group_nodes_into_DFAstates. */
3495 break;
3496 }
3497 }
3498
3499 if (dest_states_malloced)
3500 free (dest_states);
3501
3502 re_node_set_free (&follows);
3503 for (i = 0; i < ndests; ++i)
3504 re_node_set_free (dests_node + i);
3505
3506 if (dests_node_malloced)
3507 free (dests_node);
3508
3509 return 1;
3510}
3511
3512/* Group all nodes belonging to STATE into several destinations.
3513 Then for all destinations, set the nodes belonging to the destination
3514 to DESTS_NODE[i] and set the characters accepted by the destination
3515 to DEST_CH[i]. This function return the number of destinations. */
3516
3517static int
3518group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3519 re_dfa_t *dfa;
3520 const re_dfastate_t *state;
3521 re_node_set *dests_node;
3522 bitset *dests_ch;
3523{
3524 reg_errcode_t err;
3525 int result;
3526 int i, j, k;
3527 int ndests; /* Number of the destinations from `state'. */
3528 bitset accepts; /* Characters a node can accept. */
3529 const re_node_set *cur_nodes = &state->nodes;
3530 bitset_empty (accepts);
3531 ndests = 0;
3532
3533 /* For all the nodes belonging to `state', */
3534 for (i = 0; i < cur_nodes->nelem; ++i)
3535 {
3536 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3537 re_token_type_t type = node->type;
3538 unsigned int constraint = node->constraint;
3539
3540 /* Enumerate all single byte character this node can accept. */
3541 if (type == CHARACTER)
3542 bitset_set (accepts, node->opr.c);
3543 else if (type == SIMPLE_BRACKET)
3544 {
3545 bitset_merge (accepts, node->opr.sbcset);
3546 }
3547 else if (type == OP_PERIOD)
3548 {
3549#ifdef RE_ENABLE_I18N
3550 if (dfa->mb_cur_max > 1)
3551 bitset_merge (accepts, dfa->sb_char);
3552 else
3553#endif
3554 bitset_set_all (accepts);
3555 if (!(dfa->syntax & RE_DOT_NEWLINE))
3556 bitset_clear (accepts, '\n');
3557 if (dfa->syntax & RE_DOT_NOT_NULL)
3558 bitset_clear (accepts, '\0');
3559 }
3560#ifdef RE_ENABLE_I18N
3561 else if (type == OP_UTF8_PERIOD)
3562 {
3563 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3564 if (!(dfa->syntax & RE_DOT_NEWLINE))
3565 bitset_clear (accepts, '\n');
3566 if (dfa->syntax & RE_DOT_NOT_NULL)
3567 bitset_clear (accepts, '\0');
3568 }
3569#endif
3570 else
3571 continue;
3572
3573 /* Check the `accepts' and sift the characters which are not
3574 match it the context. */
3575 if (constraint)
3576 {
3577 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3578 {
3579 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3580 bitset_empty (accepts);
3581 if (accepts_newline)
3582 bitset_set (accepts, NEWLINE_CHAR);
3583 else
3584 continue;
3585 }
3586 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3587 {
3588 bitset_empty (accepts);
3589 continue;
3590 }
3591
3592 if (constraint & NEXT_WORD_CONSTRAINT)
3593 {
3594 unsigned int any_set = 0;
3595 if (type == CHARACTER && !node->word_char)
3596 {
3597 bitset_empty (accepts);
3598 continue;
3599 }
3600#ifdef RE_ENABLE_I18N
3601 if (dfa->mb_cur_max > 1)
3602 for (j = 0; j < BITSET_UINTS; ++j)
3603 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3604 else
3605#endif
3606 for (j = 0; j < BITSET_UINTS; ++j)
3607 any_set |= (accepts[j] &= dfa->word_char[j]);
3608 if (!any_set)
3609 continue;
3610 }
3611 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3612 {
3613 unsigned int any_set = 0;
3614 if (type == CHARACTER && node->word_char)
3615 {
3616 bitset_empty (accepts);
3617 continue;
3618 }
3619#ifdef RE_ENABLE_I18N
3620 if (dfa->mb_cur_max > 1)
3621 for (j = 0; j < BITSET_UINTS; ++j)
3622 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3623 else
3624#endif
3625 for (j = 0; j < BITSET_UINTS; ++j)
3626 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3627 if (!any_set)
3628 continue;
3629 }
3630 }
3631
3632 /* Then divide `accepts' into DFA states, or create a new
3633 state. Above, we make sure that accepts is not empty. */
3634 for (j = 0; j < ndests; ++j)
3635 {
3636 bitset intersec; /* Intersection sets, see below. */
3637 bitset remains;
3638 /* Flags, see below. */
3639 int has_intersec, not_subset, not_consumed;
3640
3641 /* Optimization, skip if this state doesn't accept the character. */
3642 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3643 continue;
3644
3645 /* Enumerate the intersection set of this state and `accepts'. */
3646 has_intersec = 0;
3647 for (k = 0; k < BITSET_UINTS; ++k)
3648 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3649 /* And skip if the intersection set is empty. */
3650 if (!has_intersec)
3651 continue;
3652
3653 /* Then check if this state is a subset of `accepts'. */
3654 not_subset = not_consumed = 0;
3655 for (k = 0; k < BITSET_UINTS; ++k)
3656 {
3657 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3658 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3659 }
3660
3661 /* If this state isn't a subset of `accepts', create a
3662 new group state, which has the `remains'. */
3663 if (not_subset)
3664 {
3665 bitset_copy (dests_ch[ndests], remains);
3666 bitset_copy (dests_ch[j], intersec);
3667 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3668 if (BE (err != REG_NOERROR, 0))
3669 goto error_return;
3670 ++ndests;
3671 }
3672
3673 /* Put the position in the current group. */
3674 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3675 if (BE (result < 0, 0))
3676 goto error_return;
3677
3678 /* If all characters are consumed, go to next node. */
3679 if (!not_consumed)
3680 break;
3681 }
3682 /* Some characters remain, create a new group. */
3683 if (j == ndests)
3684 {
3685 bitset_copy (dests_ch[ndests], accepts);
3686 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3687 if (BE (err != REG_NOERROR, 0))
3688 goto error_return;
3689 ++ndests;
3690 bitset_empty (accepts);
3691 }
3692 }
3693 return ndests;
3694 error_return:
3695 for (j = 0; j < ndests; ++j)
3696 re_node_set_free (dests_node + j);
3697 return -1;
3698}
3699
3700#ifdef RE_ENABLE_I18N
3701/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3702 Return the number of the bytes the node accepts.
3703 STR_IDX is the current index of the input string.
3704
3705 This function handles the nodes which can accept one character, or
3706 one collating element like '.', '[a-z]', opposite to the other nodes
3707 can only accept one byte. */
3708
3709static int
3710check_node_accept_bytes (dfa, node_idx, input, str_idx)
3711 re_dfa_t *dfa;
3712 int node_idx, str_idx;
3713 const re_string_t *input;
3714{
3715 const re_token_t *node = dfa->nodes + node_idx;
3716 int char_len, elem_len;
3717 int i;
3718
3719 if (BE (node->type == OP_UTF8_PERIOD, 0))
3720 {
3721 unsigned char c = re_string_byte_at (input, str_idx), d;
3722 if (BE (c < 0xc2, 1))
3723 return 0;
3724
3725 if (str_idx + 2 > input->len)
3726 return 0;
3727
3728 d = re_string_byte_at (input, str_idx + 1);
3729 if (c < 0xe0)
3730 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3731 else if (c < 0xf0)
3732 {
3733 char_len = 3;
3734 if (c == 0xe0 && d < 0xa0)
3735 return 0;
3736 }
3737 else if (c < 0xf8)
3738 {
3739 char_len = 4;
3740 if (c == 0xf0 && d < 0x90)
3741 return 0;
3742 }
3743 else if (c < 0xfc)
3744 {
3745 char_len = 5;
3746 if (c == 0xf8 && d < 0x88)
3747 return 0;
3748 }
3749 else if (c < 0xfe)
3750 {
3751 char_len = 6;
3752 if (c == 0xfc && d < 0x84)
3753 return 0;
3754 }
3755 else
3756 return 0;
3757
3758 if (str_idx + char_len > input->len)
3759 return 0;
3760
3761 for (i = 1; i < char_len; ++i)
3762 {
3763 d = re_string_byte_at (input, str_idx + i);
3764 if (d < 0x80 || d > 0xbf)
3765 return 0;
3766 }
3767 return char_len;
3768 }
3769
3770 char_len = re_string_char_size_at (input, str_idx);
3771 if (node->type == OP_PERIOD)
3772 {
3773 if (char_len <= 1)
3774 return 0;
3775 /* FIXME: I don't think this if is needed, as both '\n'
3776 and '\0' are char_len == 1. */
3777 /* '.' accepts any one character except the following two cases. */
3778 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3779 re_string_byte_at (input, str_idx) == '\n') ||
3780 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3781 re_string_byte_at (input, str_idx) == '\0'))
3782 return 0;
3783 return char_len;
3784 }
3785
3786 elem_len = re_string_elem_size_at (input, str_idx);
3787 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3788 return 0;
3789
3790 if (node->type == COMPLEX_BRACKET)
3791 {
3792 const re_charset_t *cset = node->opr.mbcset;
3793# ifdef _LIBC
3794 const unsigned char *pin = ((char *) re_string_get_buffer (input)
3795 + str_idx);
3796 int j;
3797 uint32_t nrules;
3798# endif /* _LIBC */
3799 int match_len = 0;
3800 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3801 ? re_string_wchar_at (input, str_idx) : 0);
3802
3803 /* match with multibyte character? */
3804 for (i = 0; i < cset->nmbchars; ++i)
3805 if (wc == cset->mbchars[i])
3806 {
3807 match_len = char_len;
3808 goto check_node_accept_bytes_match;
3809 }
3810 /* match with character_class? */
3811 for (i = 0; i < cset->nchar_classes; ++i)
3812 {
3813 wctype_t wt = cset->char_classes[i];
3814 if (__iswctype (wc, wt))
3815 {
3816 match_len = char_len;
3817 goto check_node_accept_bytes_match;
3818 }
3819 }
3820
3821# ifdef _LIBC
3822 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3823 if (nrules != 0)
3824 {
3825 unsigned int in_collseq = 0;
3826 const int32_t *table, *indirect;
3827 const unsigned char *weights, *extra;
3828 const char *collseqwc;
3829 int32_t idx;
3830 /* This #include defines a local function! */
3831# include <locale/weight.h>
3832
3833 /* match with collating_symbol? */
3834 if (cset->ncoll_syms)
3835 extra = (const unsigned char *)
3836 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3837 for (i = 0; i < cset->ncoll_syms; ++i)
3838 {
3839 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3840 /* Compare the length of input collating element and
3841 the length of current collating element. */
3842 if (*coll_sym != elem_len)
3843 continue;
3844 /* Compare each bytes. */
3845 for (j = 0; j < *coll_sym; j++)
3846 if (pin[j] != coll_sym[1 + j])
3847 break;
3848 if (j == *coll_sym)
3849 {
3850 /* Match if every bytes is equal. */
3851 match_len = j;
3852 goto check_node_accept_bytes_match;
3853 }
3854 }
3855
3856 if (cset->nranges)
3857 {
3858 if (elem_len <= char_len)
3859 {
3860 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3861 in_collseq = __collseq_table_lookup (collseqwc, wc);
3862 }
3863 else
3864 in_collseq = find_collation_sequence_value (pin, elem_len);
3865 }
3866 /* match with range expression? */
3867 for (i = 0; i < cset->nranges; ++i)
3868 if (cset->range_starts[i] <= in_collseq
3869 && in_collseq <= cset->range_ends[i])
3870 {
3871 match_len = elem_len;
3872 goto check_node_accept_bytes_match;
3873 }
3874
3875 /* match with equivalence_class? */
3876 if (cset->nequiv_classes)
3877 {
3878 const unsigned char *cp = pin;
3879 table = (const int32_t *)
3880 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3881 weights = (const unsigned char *)
3882 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3883 extra = (const unsigned char *)
3884 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3885 indirect = (const int32_t *)
3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3887 idx = findidx (&cp);
3888 if (idx > 0)
3889 for (i = 0; i < cset->nequiv_classes; ++i)
3890 {
3891 int32_t equiv_class_idx = cset->equiv_classes[i];
3892 size_t weight_len = weights[idx];
3893 if (weight_len == weights[equiv_class_idx])
3894 {
3895 int cnt = 0;
3896 while (cnt <= weight_len
3897 && (weights[equiv_class_idx + 1 + cnt]
3898 == weights[idx + 1 + cnt]))
3899 ++cnt;
3900 if (cnt > weight_len)
3901 {
3902 match_len = elem_len;
3903 goto check_node_accept_bytes_match;
3904 }
3905 }
3906 }
3907 }
3908 }
3909 else
3910# endif /* _LIBC */
3911 {
3912 /* match with range expression? */
3913#if __GNUC__ >= 2
3914 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3915#else
3916 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3917 cmp_buf[2] = wc;
3918#endif
3919 for (i = 0; i < cset->nranges; ++i)
3920 {
3921 cmp_buf[0] = cset->range_starts[i];
3922 cmp_buf[4] = cset->range_ends[i];
3923 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3924 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3925 {
3926 match_len = char_len;
3927 goto check_node_accept_bytes_match;
3928 }
3929 }
3930 }
3931 check_node_accept_bytes_match:
3932 if (!cset->non_match)
3933 return match_len;
3934 else
3935 {
3936 if (match_len > 0)
3937 return 0;
3938 else
3939 return (elem_len > char_len) ? elem_len : char_len;
3940 }
3941 }
3942 return 0;
3943}
3944
3945# ifdef _LIBC
3946static unsigned int
3947find_collation_sequence_value (mbs, mbs_len)
3948 const unsigned char *mbs;
3949 size_t mbs_len;
3950{
3951 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3952 if (nrules == 0)
3953 {
3954 if (mbs_len == 1)
3955 {
3956 /* No valid character. Match it as a single byte character. */
3957 const unsigned char *collseq = (const unsigned char *)
3958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3959 return collseq[mbs[0]];
3960 }
3961 return UINT_MAX;
3962 }
3963 else
3964 {
3965 int32_t idx;
3966 const unsigned char *extra = (const unsigned char *)
3967 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3968 int32_t extrasize = (const unsigned char *)
3969 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3970
3971 for (idx = 0; idx < extrasize;)
3972 {
3973 int mbs_cnt, found = 0;
3974 int32_t elem_mbs_len;
3975 /* Skip the name of collating element name. */
3976 idx = idx + extra[idx] + 1;
3977 elem_mbs_len = extra[idx++];
3978 if (mbs_len == elem_mbs_len)
3979 {
3980 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3981 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3982 break;
3983 if (mbs_cnt == elem_mbs_len)
3984 /* Found the entry. */
3985 found = 1;
3986 }
3987 /* Skip the byte sequence of the collating element. */
3988 idx += elem_mbs_len;
3989 /* Adjust for the alignment. */
3990 idx = (idx + 3) & ~3;
3991 /* Skip the collation sequence value. */
3992 idx += sizeof (uint32_t);
3993 /* Skip the wide char sequence of the collating element. */
3994 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3995 /* If we found the entry, return the sequence value. */
3996 if (found)
3997 return *(uint32_t *) (extra + idx);
3998 /* Skip the collation sequence value. */
3999 idx += sizeof (uint32_t);
4000 }
4001 return UINT_MAX;
4002 }
4003}
4004# endif /* _LIBC */
4005#endif /* RE_ENABLE_I18N */
4006
4007/* Check whether the node accepts the byte which is IDX-th
4008 byte of the INPUT. */
4009
4010static int
4011check_node_accept (mctx, node, idx)
4012 const re_match_context_t *mctx;
4013 const re_token_t *node;
4014 int idx;
4015{
4016 unsigned char ch;
4017 ch = re_string_byte_at (&mctx->input, idx);
4018 switch (node->type)
4019 {
4020 case CHARACTER:
4021 if (node->opr.c != ch)
4022 return 0;
4023 break;
4024
4025 case SIMPLE_BRACKET:
4026 if (!bitset_contain (node->opr.sbcset, ch))
4027 return 0;
4028 break;
4029
4030#ifdef RE_ENABLE_I18N
4031 case OP_UTF8_PERIOD:
4032 if (ch >= 0x80)
4033 return 0;
4034 /* FALLTHROUGH */
4035#endif
4036 case OP_PERIOD:
4037 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4038 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4039 return 0;
4040 break;
4041
4042 default:
4043 return 0;
4044 }
4045
4046 if (node->constraint)
4047 {
4048 /* The node has constraints. Check whether the current context
4049 satisfies the constraints. */
4050 unsigned int context = re_string_context_at (&mctx->input, idx,
4051 mctx->eflags);
4052 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4053 return 0;
4054 }
4055
4056 return 1;
4057}
4058
4059/* Extend the buffers, if the buffers have run out. */
4060
4061static reg_errcode_t
4062extend_buffers (mctx)
4063 re_match_context_t *mctx;
4064{
4065 reg_errcode_t ret;
4066 re_string_t *pstr = &mctx->input;
4067
4068 /* Double the lengthes of the buffers. */
4069 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4070 if (BE (ret != REG_NOERROR, 0))
4071 return ret;
4072
4073 if (mctx->state_log != NULL)
4074 {
4075 /* And double the length of state_log. */
4076 /* XXX We have no indication of the size of this buffer. If this
4077 allocation fail we have no indication that the state_log array
4078 does not have the right size. */
4079 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4080 pstr->bufs_len + 1);
4081 if (BE (new_array == NULL, 0))
4082 return REG_ESPACE;
4083 mctx->state_log = new_array;
4084 }
4085
4086 /* Then reconstruct the buffers. */
4087 if (pstr->icase)
4088 {
4089#ifdef RE_ENABLE_I18N
4090 if (pstr->mb_cur_max > 1)
4091 {
4092 ret = build_wcs_upper_buffer (pstr);
4093 if (BE (ret != REG_NOERROR, 0))
4094 return ret;
4095 }
4096 else
4097#endif /* RE_ENABLE_I18N */
4098 build_upper_buffer (pstr);
4099 }
4100 else
4101 {
4102#ifdef RE_ENABLE_I18N
4103 if (pstr->mb_cur_max > 1)
4104 build_wcs_buffer (pstr);
4105 else
4106#endif /* RE_ENABLE_I18N */
4107 {
4108 if (pstr->trans != NULL)
4109 re_string_translate_buffer (pstr);
4110 }
4111 }
4112 return REG_NOERROR;
4113}
4114
4115
4116
4117/* Functions for matching context. */
4118
4119/* Initialize MCTX. */
4120
4121static reg_errcode_t
4122match_ctx_init (mctx, eflags, n)
4123 re_match_context_t *mctx;
4124 int eflags, n;
4125{
4126 mctx->eflags = eflags;
4127 mctx->match_last = -1;
4128 if (n > 0)
4129 {
4130 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4131 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4132 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4133 return REG_ESPACE;
4134 }
4135 /* Already zero-ed by the caller.
4136 else
4137 mctx->bkref_ents = NULL;
4138 mctx->nbkref_ents = 0;
4139 mctx->nsub_tops = 0; */
4140 mctx->abkref_ents = n;
4141 mctx->max_mb_elem_len = 1;
4142 mctx->asub_tops = n;
4143 return REG_NOERROR;
4144}
4145
4146/* Clean the entries which depend on the current input in MCTX.
4147 This function must be invoked when the matcher changes the start index
4148 of the input, or changes the input string. */
4149
4150static void
4151match_ctx_clean (mctx)
4152 re_match_context_t *mctx;
4153{
4154 int st_idx;
4155 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4156 {
4157 int sl_idx;
4158 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4159 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4160 {
4161 re_sub_match_last_t *last = top->lasts[sl_idx];
4162 re_free (last->path.array);
4163 re_free (last);
4164 }
4165 re_free (top->lasts);
4166 if (top->path)
4167 {
4168 re_free (top->path->array);
4169 re_free (top->path);
4170 }
4171 free (top);
4172 }
4173
4174 mctx->nsub_tops = 0;
4175 mctx->nbkref_ents = 0;
4176}
4177
4178/* Free all the memory associated with MCTX. */
4179
4180static void
4181match_ctx_free (mctx)
4182 re_match_context_t *mctx;
4183{
4184 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4185 match_ctx_clean (mctx);
4186 re_free (mctx->sub_tops);
4187 re_free (mctx->bkref_ents);
4188}
4189
4190/* Add a new backreference entry to MCTX.
4191 Note that we assume that caller never call this function with duplicate
4192 entry, and call with STR_IDX which isn't smaller than any existing entry.
4193*/
4194
4195static reg_errcode_t
4196match_ctx_add_entry (mctx, node, str_idx, from, to)
4197 re_match_context_t *mctx;
4198 int node, str_idx, from, to;
4199{
4200 if (mctx->nbkref_ents >= mctx->abkref_ents)
4201 {
4202 struct re_backref_cache_entry* new_entry;
4203 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4204 mctx->abkref_ents * 2);
4205 if (BE (new_entry == NULL, 0))
4206 {
4207 re_free (mctx->bkref_ents);
4208 return REG_ESPACE;
4209 }
4210 mctx->bkref_ents = new_entry;
4211 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4212 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4213 mctx->abkref_ents *= 2;
4214 }
4215 if (mctx->nbkref_ents > 0
4216 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4217 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4218
4219 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4220 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4221 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4222 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4223
4224 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4225 If bit N is clear, means that this entry won't epsilon-transition to
4226 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4227 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4228 such node.
4229
4230 A backreference does not epsilon-transition unless it is empty, so set
4231 to all zeros if FROM != TO. */
4232 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4233 = (from == to ? ~0 : 0);
4234
4235 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4236 if (mctx->max_mb_elem_len < to - from)
4237 mctx->max_mb_elem_len = to - from;
4238 return REG_NOERROR;
4239}
4240
4241/* Search for the first entry which has the same str_idx, or -1 if none is
4242 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4243
4244static int
4245search_cur_bkref_entry (mctx, str_idx)
4246 re_match_context_t *mctx;
4247 int str_idx;
4248{
4249 int left, right, mid, last;
4250 last = right = mctx->nbkref_ents;
4251 for (left = 0; left < right;)
4252 {
4253 mid = (left + right) / 2;
4254 if (mctx->bkref_ents[mid].str_idx < str_idx)
4255 left = mid + 1;
4256 else
4257 right = mid;
4258 }
4259 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4260 return left;
4261 else
4262 return -1;
4263}
4264
4265/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4266 at STR_IDX. */
4267
4268static reg_errcode_t
4269match_ctx_add_subtop (mctx, node, str_idx)
4270 re_match_context_t *mctx;
4271 int node, str_idx;
4272{
4273#ifdef DEBUG
4274 assert (mctx->sub_tops != NULL);
4275 assert (mctx->asub_tops > 0);
4276#endif
4277 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4278 {
4279 int new_asub_tops = mctx->asub_tops * 2;
4280 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4281 re_sub_match_top_t *,
4282 new_asub_tops);
4283 if (BE (new_array == NULL, 0))
4284 return REG_ESPACE;
4285 mctx->sub_tops = new_array;
4286 mctx->asub_tops = new_asub_tops;
4287 }
4288 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4289 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4290 return REG_ESPACE;
4291 mctx->sub_tops[mctx->nsub_tops]->node = node;
4292 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4293 return REG_NOERROR;
4294}
4295
4296/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4297 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4298
4299static re_sub_match_last_t *
4300match_ctx_add_sublast (subtop, node, str_idx)
4301 re_sub_match_top_t *subtop;
4302 int node, str_idx;
4303{
4304 re_sub_match_last_t *new_entry;
4305 if (BE (subtop->nlasts == subtop->alasts, 0))
4306 {
4307 int new_alasts = 2 * subtop->alasts + 1;
4308 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4309 re_sub_match_last_t *,
4310 new_alasts);
4311 if (BE (new_array == NULL, 0))
4312 return NULL;
4313 subtop->lasts = new_array;
4314 subtop->alasts = new_alasts;
4315 }
4316 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4317 if (BE (new_entry != NULL, 1))
4318 {
4319 subtop->lasts[subtop->nlasts] = new_entry;
4320 new_entry->node = node;
4321 new_entry->str_idx = str_idx;
4322 ++subtop->nlasts;
4323 }
4324 return new_entry;
4325}
4326
4327static void
4328sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4329 re_sift_context_t *sctx;
4330 re_dfastate_t **sifted_sts, **limited_sts;
4331 int last_node, last_str_idx;
4332{
4333 sctx->sifted_states = sifted_sts;
4334 sctx->limited_states = limited_sts;
4335 sctx->last_node = last_node;
4336 sctx->last_str_idx = last_str_idx;
4337 re_node_set_init_empty (&sctx->limits);
4338}
Note: See TracBrowser for help on using the repository browser.