source: vendor/glibc/current/posix/regexec.c

Last change on this file was 2237, checked in by (none), 20 years ago

This commit was manufactured by cvs2svn to create branch 'GNU'.

  • Property cvs2svn:cvs-rev set to 1.1
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 125.8 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24static void match_ctx_free (re_match_context_t *cache) internal_function;
25static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
27 internal_function;
28static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
29 internal_function;
30static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
34 internal_function;
35static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
37 int last_str_idx)
38 internal_function;
39static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55static inline re_dfastate_t *acquire_init_state_context
56 (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
57 __attribute ((always_inline)) internal_function;
58static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
59 internal_function;
60static int check_matching (re_match_context_t *mctx, int fl_longest_match,
61 int *p_match_first)
62 internal_function;
63static int check_halt_node_context (const re_dfa_t *dfa, int node,
64 unsigned int context) internal_function;
65static int check_halt_state_context (const re_match_context_t *mctx,
66 const re_dfastate_t *state, int idx)
67 internal_function;
68static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
69 regmatch_t *prev_idx_match, int cur_node,
70 int cur_idx, int nmatch) internal_function;
71static int proceed_next_node (const re_match_context_t *mctx,
72 int nregs, regmatch_t *regs,
73 int *pidx, int node, re_node_set *eps_via_nodes,
74 struct re_fail_stack_t *fs) internal_function;
75static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
76 int str_idx, int dest_node, int nregs,
77 regmatch_t *regs,
78 re_node_set *eps_via_nodes) internal_function;
79static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
80 regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
81static reg_errcode_t set_regs (const regex_t *preg,
82 const re_match_context_t *mctx,
83 size_t nmatch, regmatch_t *pmatch,
84 int fl_backtrack) internal_function;
85static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
86
87#ifdef RE_ENABLE_I18N
88static int sift_states_iter_mb (const re_match_context_t *mctx,
89 re_sift_context_t *sctx,
90 int node_idx, int str_idx, int max_str_idx) internal_function;
91#endif /* RE_ENABLE_I18N */
92static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
93 re_sift_context_t *sctx) internal_function;
94static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
95 re_sift_context_t *sctx, int str_idx,
96 re_node_set *cur_dest) internal_function;
97static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
98 re_sift_context_t *sctx,
99 int str_idx,
100 re_node_set *dest_nodes) internal_function;
101static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
102 re_node_set *dest_nodes,
103 const re_node_set *candidates) internal_function;
104static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
105 re_node_set *dest_nodes,
106 const re_node_set *and_nodes) internal_function;
107static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
108 int dst_node, int dst_idx, int src_node,
109 int src_idx) internal_function;
110static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
111 int boundaries, int subexp_idx,
112 int from_node, int bkref_idx) internal_function;
113static int check_dst_limits_calc_pos (re_match_context_t *mctx,
114 int limit, int subexp_idx,
115 int node, int str_idx,
116 int bkref_idx) internal_function;
117static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
118 re_node_set *dest_nodes,
119 const re_node_set *candidates,
120 re_node_set *limits,
121 struct re_backref_cache_entry *bkref_ents,
122 int str_idx) internal_function;
123static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
124 re_sift_context_t *sctx,
125 int str_idx, const re_node_set *candidates) internal_function;
126static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
127 int next_state_log_idx) internal_function;
128static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
129 re_dfastate_t **src, int num) internal_function;
130static re_dfastate_t *find_recover_state (reg_errcode_t *err,
131 re_match_context_t *mctx) internal_function;
132static re_dfastate_t *transit_state (reg_errcode_t *err,
133 re_match_context_t *mctx,
134 re_dfastate_t *state) internal_function;
135static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
136 re_match_context_t *mctx,
137 re_dfastate_t *next_state) internal_function;
138static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
139 re_node_set *cur_nodes,
140 int str_idx) internal_function;
141#if 0
142static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
143 re_match_context_t *mctx,
144 re_dfastate_t *pstate) internal_function;
145#endif
146#ifdef RE_ENABLE_I18N
147static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
148 re_dfastate_t *pstate) internal_function;
149#endif /* RE_ENABLE_I18N */
150static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
151 const re_node_set *nodes) internal_function;
152static reg_errcode_t get_subexp (re_match_context_t *mctx,
153 int bkref_node, int bkref_str_idx) internal_function;
154static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str) internal_function;
158static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
159 int subexp_idx, int type) internal_function;
160static reg_errcode_t check_arrival (re_match_context_t *mctx,
161 state_array_t *path, int top_node,
162 int top_str, int last_node, int last_str,
163 int type) internal_function;
164static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
165 int str_idx,
166 re_node_set *cur_nodes,
167 re_node_set *next_nodes) internal_function;
168static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
169 re_node_set *cur_nodes,
170 int ex_subexp, int type) internal_function;
171static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
172 re_node_set *dst_nodes,
173 int target, int ex_subexp,
174 int type) internal_function;
175static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
176 re_node_set *cur_nodes, int cur_str,
177 int subexp_num, int type) internal_function;
178static int build_trtable (re_dfa_t *dfa,
179 re_dfastate_t *state) internal_function;
180#ifdef RE_ENABLE_I18N
181static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
182 const re_string_t *input, int idx) internal_function;
183# ifdef _LIBC
184static unsigned int find_collation_sequence_value (const unsigned char *mbs,
185 size_t name_len) internal_function;
186# endif /* _LIBC */
187#endif /* RE_ENABLE_I18N */
188static int group_nodes_into_DFAstates (re_dfa_t *dfa,
189 const re_dfastate_t *state,
190 re_node_set *states_node,
191 bitset *states_ch) internal_function;
192static int check_node_accept (const re_match_context_t *mctx,
193 const re_token_t *node, int idx) internal_function;
194static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
195
196
197/* Entry point for POSIX code. */
198
199/* regexec searches for a given pattern, specified by PREG, in the
200 string STRING.
201
202 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
203 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
204 least NMATCH elements, and we set them to the offsets of the
205 corresponding matched substrings.
206
207 EFLAGS specifies `execution flags' which affect matching: if
208 REG_NOTBOL is set, then ^ does not match at the beginning of the
209 string; if REG_NOTEOL is set, then $ does not match at the end.
210
211 We return 0 if we find a match and REG_NOMATCH if not. */
212
213int
214regexec (preg, string, nmatch, pmatch, eflags)
215 const regex_t *__restrict preg;
216 const char *__restrict string;
217 size_t nmatch;
218 regmatch_t pmatch[];
219 int eflags;
220{
221 reg_errcode_t err;
222 int start, length;
223 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
224
225 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
226 return REG_BADPAT;
227
228 if (eflags & REG_STARTEND)
229 {
230 start = pmatch[0].rm_so;
231 length = pmatch[0].rm_eo;
232 }
233 else
234 {
235 start = 0;
236 length = strlen (string);
237 }
238
239 __libc_lock_lock (dfa->lock);
240 if (preg->no_sub)
241 err = re_search_internal (preg, string, length, start, length - start,
242 length, 0, NULL, eflags);
243 else
244 err = re_search_internal (preg, string, length, start, length - start,
245 length, nmatch, pmatch, eflags);
246 __libc_lock_unlock (dfa->lock);
247 return err != REG_NOERROR;
248}
249
250#ifdef _LIBC
251# include <shlib-compat.h>
252versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
253
254# if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
255__typeof__ (__regexec) __compat_regexec;
256
257int
258attribute_compat_text_section
259__compat_regexec (const regex_t *__restrict preg,
260 const char *__restrict string, size_t nmatch,
261 regmatch_t pmatch[], int eflags)
262{
263 return regexec (preg, string, nmatch, pmatch,
264 eflags & (REG_NOTBOL | REG_NOTEOL));
265}
266compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
267# endif
268#endif
269
270/* Entry points for GNU code. */
271
272/* re_match, re_search, re_match_2, re_search_2
273
274 The former two functions operate on STRING with length LENGTH,
275 while the later two operate on concatenation of STRING1 and STRING2
276 with lengths LENGTH1 and LENGTH2, respectively.
277
278 re_match() matches the compiled pattern in BUFP against the string,
279 starting at index START.
280
281 re_search() first tries matching at index START, then it tries to match
282 starting from index START + 1, and so on. The last start position tried
283 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
284 way as re_match().)
285
286 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
287 the first STOP characters of the concatenation of the strings should be
288 concerned.
289
290 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
291 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
292 computed relative to the concatenation, not relative to the individual
293 strings.)
294
295 On success, re_match* functions return the length of the match, re_search*
296 return the position of the start of the match. Return value -1 means no
297 match was found and -2 indicates an internal error. */
298
299int
300re_match (bufp, string, length, start, regs)
301 struct re_pattern_buffer *bufp;
302 const char *string;
303 int length, start;
304 struct re_registers *regs;
305{
306 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
307}
308#ifdef _LIBC
309weak_alias (__re_match, re_match)
310#endif
311
312int
313re_search (bufp, string, length, start, range, regs)
314 struct re_pattern_buffer *bufp;
315 const char *string;
316 int length, start, range;
317 struct re_registers *regs;
318{
319 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
320}
321#ifdef _LIBC
322weak_alias (__re_search, re_search)
323#endif
324
325int
326re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
327 struct re_pattern_buffer *bufp;
328 const char *string1, *string2;
329 int length1, length2, start, stop;
330 struct re_registers *regs;
331{
332 return re_search_2_stub (bufp, string1, length1, string2, length2,
333 start, 0, regs, stop, 1);
334}
335#ifdef _LIBC
336weak_alias (__re_match_2, re_match_2)
337#endif
338
339int
340re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
341 struct re_pattern_buffer *bufp;
342 const char *string1, *string2;
343 int length1, length2, start, range, stop;
344 struct re_registers *regs;
345{
346 return re_search_2_stub (bufp, string1, length1, string2, length2,
347 start, range, regs, stop, 0);
348}
349#ifdef _LIBC
350weak_alias (__re_search_2, re_search_2)
351#endif
352
353static int
354re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
355 stop, ret_len)
356 struct re_pattern_buffer *bufp;
357 const char *string1, *string2;
358 int length1, length2, start, range, stop, ret_len;
359 struct re_registers *regs;
360{
361 const char *str;
362 int rval;
363 int len = length1 + length2;
364 int free_str = 0;
365
366 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
367 return -2;
368
369 /* Concatenate the strings. */
370 if (length2 > 0)
371 if (length1 > 0)
372 {
373 char *s = re_malloc (char, len);
374
375 if (BE (s == NULL, 0))
376 return -2;
377 memcpy (s, string1, length1);
378 memcpy (s + length1, string2, length2);
379 str = s;
380 free_str = 1;
381 }
382 else
383 str = string2;
384 else
385 str = string1;
386
387 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
388 ret_len);
389 if (free_str)
390 re_free ((char *) str);
391 return rval;
392}
393
394/* The parameters have the same meaning as those of re_search.
395 Additional parameters:
396 If RET_LEN is nonzero the length of the match is returned (re_match style);
397 otherwise the position of the match is returned. */
398
399static int
400re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
401 struct re_pattern_buffer *bufp;
402 const char *string;
403 int length, start, range, stop, ret_len;
404 struct re_registers *regs;
405{
406 reg_errcode_t result;
407 regmatch_t *pmatch;
408 int nregs, rval;
409 int eflags = 0;
410 re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
411
412 /* Check for out-of-range. */
413 if (BE (start < 0 || start > length, 0))
414 return -1;
415 if (BE (start + range > length, 0))
416 range = length - start;
417 else if (BE (start + range < 0, 0))
418 range = -start;
419
420 __libc_lock_lock (dfa->lock);
421
422 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
423 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
424
425 /* Compile fastmap if we haven't yet. */
426 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
427 re_compile_fastmap (bufp);
428
429 if (BE (bufp->no_sub, 0))
430 regs = NULL;
431
432 /* We need at least 1 register. */
433 if (regs == NULL)
434 nregs = 1;
435 else if (BE (bufp->regs_allocated == REGS_FIXED &&
436 regs->num_regs < bufp->re_nsub + 1, 0))
437 {
438 nregs = regs->num_regs;
439 if (BE (nregs < 1, 0))
440 {
441 /* Nothing can be copied to regs. */
442 regs = NULL;
443 nregs = 1;
444 }
445 }
446 else
447 nregs = bufp->re_nsub + 1;
448 pmatch = re_malloc (regmatch_t, nregs);
449 if (BE (pmatch == NULL, 0))
450 {
451 rval = -2;
452 goto out;
453 }
454
455 result = re_search_internal (bufp, string, length, start, range, stop,
456 nregs, pmatch, eflags);
457
458 rval = 0;
459
460 /* I hope we needn't fill ther regs with -1's when no match was found. */
461 if (result != REG_NOERROR)
462 rval = -1;
463 else if (regs != NULL)
464 {
465 /* If caller wants register contents data back, copy them. */
466 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
467 bufp->regs_allocated);
468 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
469 rval = -2;
470 }
471
472 if (BE (rval == 0, 1))
473 {
474 if (ret_len)
475 {
476 assert (pmatch[0].rm_so == start);
477 rval = pmatch[0].rm_eo - start;
478 }
479 else
480 rval = pmatch[0].rm_so;
481 }
482 re_free (pmatch);
483 out:
484 __libc_lock_unlock (dfa->lock);
485 return rval;
486}
487
488static unsigned
489re_copy_regs (regs, pmatch, nregs, regs_allocated)
490 struct re_registers *regs;
491 regmatch_t *pmatch;
492 int nregs, regs_allocated;
493{
494 int rval = REGS_REALLOCATE;
495 int i;
496 int need_regs = nregs + 1;
497 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
498 uses. */
499
500 /* Have the register data arrays been allocated? */
501 if (regs_allocated == REGS_UNALLOCATED)
502 { /* No. So allocate them with malloc. */
503 regs->start = re_malloc (regoff_t, need_regs);
504 regs->end = re_malloc (regoff_t, need_regs);
505 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
506 return REGS_UNALLOCATED;
507 regs->num_regs = need_regs;
508 }
509 else if (regs_allocated == REGS_REALLOCATE)
510 { /* Yes. If we need more elements than were already
511 allocated, reallocate them. If we need fewer, just
512 leave it alone. */
513 if (BE (need_regs > regs->num_regs, 0))
514 {
515 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
516 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
517 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
518 return REGS_UNALLOCATED;
519 regs->start = new_start;
520 regs->end = new_end;
521 regs->num_regs = need_regs;
522 }
523 }
524 else
525 {
526 assert (regs_allocated == REGS_FIXED);
527 /* This function may not be called with REGS_FIXED and nregs too big. */
528 assert (regs->num_regs >= nregs);
529 rval = REGS_FIXED;
530 }
531
532 /* Copy the regs. */
533 for (i = 0; i < nregs; ++i)
534 {
535 regs->start[i] = pmatch[i].rm_so;
536 regs->end[i] = pmatch[i].rm_eo;
537 }
538 for ( ; i < regs->num_regs; ++i)
539 regs->start[i] = regs->end[i] = -1;
540
541 return rval;
542}
543
544/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
545 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
546 this memory for recording register information. STARTS and ENDS
547 must be allocated using the malloc library routine, and must each
548 be at least NUM_REGS * sizeof (regoff_t) bytes long.
549
550 If NUM_REGS == 0, then subsequent matches should allocate their own
551 register data.
552
553 Unless this function is called, the first search or match using
554 PATTERN_BUFFER will allocate its own register data, without
555 freeing the old data. */
556
557void
558re_set_registers (bufp, regs, num_regs, starts, ends)
559 struct re_pattern_buffer *bufp;
560 struct re_registers *regs;
561 unsigned num_regs;
562 regoff_t *starts, *ends;
563{
564 if (num_regs)
565 {
566 bufp->regs_allocated = REGS_REALLOCATE;
567 regs->num_regs = num_regs;
568 regs->start = starts;
569 regs->end = ends;
570 }
571 else
572 {
573 bufp->regs_allocated = REGS_UNALLOCATED;
574 regs->num_regs = 0;
575 regs->start = regs->end = (regoff_t *) 0;
576 }
577}
578#ifdef _LIBC
579weak_alias (__re_set_registers, re_set_registers)
580#endif
581
582
583/* Entry points compatible with 4.2 BSD regex library. We don't define
584 them unless specifically requested. */
585
586#if defined _REGEX_RE_COMP || defined _LIBC
587int
588# ifdef _LIBC
589weak_function
590# endif
591re_exec (s)
592 const char *s;
593{
594 return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
595}
596#endif /* _REGEX_RE_COMP */
597
598
599/* Internal entry point. */
600
601/* Searches for a compiled pattern PREG in the string STRING, whose
602 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
603 mingings with regexec. START, and RANGE have the same meanings
604 with re_search.
605 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
606 otherwise return the error code.
607 Note: We assume front end functions already check ranges.
608 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
609
610static reg_errcode_t
611re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
612 eflags)
613 const regex_t *preg;
614 const char *string;
615 int length, start, range, stop, eflags;
616 size_t nmatch;
617 regmatch_t pmatch[];
618{
619 reg_errcode_t err;
620 re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
621 int left_lim, right_lim, incr;
622 int fl_longest_match, match_first, match_kind, match_last = -1;
623 int extra_nmatch;
624 int sb, ch;
625#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
626 re_match_context_t mctx = { .dfa = dfa };
627#else
628 re_match_context_t mctx;
629#endif
630 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
631 && range && !preg->can_be_null) ? preg->fastmap : NULL;
632 unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
633
634#if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
635 memset (&mctx, '\0', sizeof (re_match_context_t));
636 mctx.dfa = dfa;
637#endif
638
639 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
640 nmatch -= extra_nmatch;
641
642 /* Check if the DFA haven't been compiled. */
643 if (BE (preg->used == 0 || dfa->init_state == NULL
644 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
645 || dfa->init_state_begbuf == NULL, 0))
646 return REG_NOMATCH;
647
648#ifdef DEBUG
649 /* We assume front-end functions already check them. */
650 assert (start + range >= 0 && start + range <= length);
651#endif
652
653 /* If initial states with non-begbuf contexts have no elements,
654 the regex must be anchored. If preg->newline_anchor is set,
655 we'll never use init_state_nl, so do not check it. */
656 if (dfa->init_state->nodes.nelem == 0
657 && dfa->init_state_word->nodes.nelem == 0
658 && (dfa->init_state_nl->nodes.nelem == 0
659 || !preg->newline_anchor))
660 {
661 if (start != 0 && start + range != 0)
662 return REG_NOMATCH;
663 start = range = 0;
664 }
665
666 /* We must check the longest matching, if nmatch > 0. */
667 fl_longest_match = (nmatch != 0 || dfa->nbackref);
668
669 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
670 preg->translate, preg->syntax & RE_ICASE, dfa);
671 if (BE (err != REG_NOERROR, 0))
672 goto free_return;
673 mctx.input.stop = stop;
674 mctx.input.raw_stop = stop;
675 mctx.input.newline_anchor = preg->newline_anchor;
676
677 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
678 if (BE (err != REG_NOERROR, 0))
679 goto free_return;
680
681 /* We will log all the DFA states through which the dfa pass,
682 if nmatch > 1, or this dfa has "multibyte node", which is a
683 back-reference or a node which can accept multibyte character or
684 multi character collating element. */
685 if (nmatch > 1 || dfa->has_mb_node)
686 {
687 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
688 if (BE (mctx.state_log == NULL, 0))
689 {
690 err = REG_ESPACE;
691 goto free_return;
692 }
693 }
694 else
695 mctx.state_log = NULL;
696
697 match_first = start;
698 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
699 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
700
701 /* Check incrementally whether of not the input string match. */
702 incr = (range < 0) ? -1 : 1;
703 left_lim = (range < 0) ? start + range : start;
704 right_lim = (range < 0) ? start : start + range;
705 sb = dfa->mb_cur_max == 1;
706 match_kind =
707 (fastmap
708 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
709 | (range >= 0 ? 2 : 0)
710 | (t != NULL ? 1 : 0))
711 : 8);
712
713 for (;; match_first += incr)
714 {
715 err = REG_NOMATCH;
716 if (match_first < left_lim || right_lim < match_first)
717 goto free_return;
718
719 /* Advance as rapidly as possible through the string, until we
720 find a plausible place to start matching. This may be done
721 with varying efficiency, so there are various possibilities:
722 only the most common of them are specialized, in order to
723 save on code size. We use a switch statement for speed. */
724 switch (match_kind)
725 {
726 case 8:
727 /* No fastmap. */
728 break;
729
730 case 7:
731 /* Fastmap with single-byte translation, match forward. */
732 while (BE (match_first < right_lim, 1)
733 && !fastmap[t[(unsigned char) string[match_first]]])
734 ++match_first;
735 goto forward_match_found_start_or_reached_end;
736
737 case 6:
738 /* Fastmap without translation, match forward. */
739 while (BE (match_first < right_lim, 1)
740 && !fastmap[(unsigned char) string[match_first]])
741 ++match_first;
742
743 forward_match_found_start_or_reached_end:
744 if (BE (match_first == right_lim, 0))
745 {
746 ch = match_first >= length
747 ? 0 : (unsigned char) string[match_first];
748 if (!fastmap[t ? t[ch] : ch])
749 goto free_return;
750 }
751 break;
752
753 case 4:
754 case 5:
755 /* Fastmap without multi-byte translation, match backwards. */
756 while (match_first >= left_lim)
757 {
758 ch = match_first >= length
759 ? 0 : (unsigned char) string[match_first];
760 if (fastmap[t ? t[ch] : ch])
761 break;
762 --match_first;
763 }
764 if (match_first < left_lim)
765 goto free_return;
766 break;
767
768 default:
769 /* In this case, we can't determine easily the current byte,
770 since it might be a component byte of a multibyte
771 character. Then we use the constructed buffer instead. */
772 for (;;)
773 {
774 /* If MATCH_FIRST is out of the valid range, reconstruct the
775 buffers. */
776 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
777 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
778 {
779 err = re_string_reconstruct (&mctx.input, match_first,
780 eflags);
781 if (BE (err != REG_NOERROR, 0))
782 goto free_return;
783
784 offset = match_first - mctx.input.raw_mbs_idx;
785 }
786 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
787 Note that MATCH_FIRST must not be smaller than 0. */
788 ch = (match_first >= length
789 ? 0 : re_string_byte_at (&mctx.input, offset));
790 if (fastmap[ch])
791 break;
792 match_first += incr;
793 if (match_first < left_lim || match_first > right_lim)
794 {
795 err = REG_NOMATCH;
796 goto free_return;
797 }
798 }
799 break;
800 }
801
802 /* Reconstruct the buffers so that the matcher can assume that
803 the matching starts from the beginning of the buffer. */
804 err = re_string_reconstruct (&mctx.input, match_first, eflags);
805 if (BE (err != REG_NOERROR, 0))
806 goto free_return;
807
808#ifdef RE_ENABLE_I18N
809 /* Don't consider this char as a possible match start if it part,
810 yet isn't the head, of a multibyte character. */
811 if (!sb && !re_string_first_byte (&mctx.input, 0))
812 continue;
813#endif
814
815 /* It seems to be appropriate one, then use the matcher. */
816 /* We assume that the matching starts from 0. */
817 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
818 match_last = check_matching (&mctx, fl_longest_match,
819 range >= 0 ? &match_first : NULL);
820 if (match_last != -1)
821 {
822 if (BE (match_last == -2, 0))
823 {
824 err = REG_ESPACE;
825 goto free_return;
826 }
827 else
828 {
829 mctx.match_last = match_last;
830 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
831 {
832 re_dfastate_t *pstate = mctx.state_log[match_last];
833 mctx.last_node = check_halt_state_context (&mctx, pstate,
834 match_last);
835 }
836 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
837 || dfa->nbackref)
838 {
839 err = prune_impossible_nodes (&mctx);
840 if (err == REG_NOERROR)
841 break;
842 if (BE (err != REG_NOMATCH, 0))
843 goto free_return;
844 match_last = -1;
845 }
846 else
847 break; /* We found a match. */
848 }
849 }
850
851 match_ctx_clean (&mctx);
852 }
853
854#ifdef DEBUG
855 assert (match_last != -1);
856 assert (err == REG_NOERROR);
857#endif
858
859 /* Set pmatch[] if we need. */
860 if (nmatch > 0)
861 {
862 int reg_idx;
863
864 /* Initialize registers. */
865 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
866 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
867
868 /* Set the points where matching start/end. */
869 pmatch[0].rm_so = 0;
870 pmatch[0].rm_eo = mctx.match_last;
871
872 if (!preg->no_sub && nmatch > 1)
873 {
874 err = set_regs (preg, &mctx, nmatch, pmatch,
875 dfa->has_plural_match && dfa->nbackref > 0);
876 if (BE (err != REG_NOERROR, 0))
877 goto free_return;
878 }
879
880 /* At last, add the offset to the each registers, since we slided
881 the buffers so that we could assume that the matching starts
882 from 0. */
883 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
884 if (pmatch[reg_idx].rm_so != -1)
885 {
886#ifdef RE_ENABLE_I18N
887 if (BE (mctx.input.offsets_needed != 0, 0))
888 {
889 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
890 pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
891 else
892 pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
893 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
894 pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
895 else
896 pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
897 }
898#else
899 assert (mctx.input.offsets_needed == 0);
900#endif
901 pmatch[reg_idx].rm_so += match_first;
902 pmatch[reg_idx].rm_eo += match_first;
903 }
904 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
905 {
906 pmatch[nmatch + reg_idx].rm_so = -1;
907 pmatch[nmatch + reg_idx].rm_eo = -1;
908 }
909
910 if (dfa->subexp_map)
911 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
912 if (dfa->subexp_map[reg_idx] != reg_idx)
913 {
914 pmatch[reg_idx + 1].rm_so
915 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
916 pmatch[reg_idx + 1].rm_eo
917 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
918 }
919 }
920
921 free_return:
922 re_free (mctx.state_log);
923 if (dfa->nbackref)
924 match_ctx_free (&mctx);
925 re_string_destruct (&mctx.input);
926 return err;
927}
928
929static reg_errcode_t
930prune_impossible_nodes (mctx)
931 re_match_context_t *mctx;
932{
933 re_dfa_t *const dfa = mctx->dfa;
934 int halt_node, match_last;
935 reg_errcode_t ret;
936 re_dfastate_t **sifted_states;
937 re_dfastate_t **lim_states = NULL;
938 re_sift_context_t sctx;
939#ifdef DEBUG
940 assert (mctx->state_log != NULL);
941#endif
942 match_last = mctx->match_last;
943 halt_node = mctx->last_node;
944 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
945 if (BE (sifted_states == NULL, 0))
946 {
947 ret = REG_ESPACE;
948 goto free_return;
949 }
950 if (dfa->nbackref)
951 {
952 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
953 if (BE (lim_states == NULL, 0))
954 {
955 ret = REG_ESPACE;
956 goto free_return;
957 }
958 while (1)
959 {
960 memset (lim_states, '\0',
961 sizeof (re_dfastate_t *) * (match_last + 1));
962 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
963 match_last);
964 ret = sift_states_backward (mctx, &sctx);
965 re_node_set_free (&sctx.limits);
966 if (BE (ret != REG_NOERROR, 0))
967 goto free_return;
968 if (sifted_states[0] != NULL || lim_states[0] != NULL)
969 break;
970 do
971 {
972 --match_last;
973 if (match_last < 0)
974 {
975 ret = REG_NOMATCH;
976 goto free_return;
977 }
978 } while (mctx->state_log[match_last] == NULL
979 || !mctx->state_log[match_last]->halt);
980 halt_node = check_halt_state_context (mctx,
981 mctx->state_log[match_last],
982 match_last);
983 }
984 ret = merge_state_array (dfa, sifted_states, lim_states,
985 match_last + 1);
986 re_free (lim_states);
987 lim_states = NULL;
988 if (BE (ret != REG_NOERROR, 0))
989 goto free_return;
990 }
991 else
992 {
993 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
994 ret = sift_states_backward (mctx, &sctx);
995 re_node_set_free (&sctx.limits);
996 if (BE (ret != REG_NOERROR, 0))
997 goto free_return;
998 }
999 re_free (mctx->state_log);
1000 mctx->state_log = sifted_states;
1001 sifted_states = NULL;
1002 mctx->last_node = halt_node;
1003 mctx->match_last = match_last;
1004 ret = REG_NOERROR;
1005 free_return:
1006 re_free (sifted_states);
1007 re_free (lim_states);
1008 return ret;
1009}
1010
1011/* Acquire an initial state and return it.
1012 We must select appropriate initial state depending on the context,
1013 since initial states may have constraints like "\<", "^", etc.. */
1014
1015static inline re_dfastate_t *
1016acquire_init_state_context (err, mctx, idx)
1017 reg_errcode_t *err;
1018 const re_match_context_t *mctx;
1019 int idx;
1020{
1021 re_dfa_t *const dfa = mctx->dfa;
1022 if (dfa->init_state->has_constraint)
1023 {
1024 unsigned int context;
1025 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1026 if (IS_WORD_CONTEXT (context))
1027 return dfa->init_state_word;
1028 else if (IS_ORDINARY_CONTEXT (context))
1029 return dfa->init_state;
1030 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1031 return dfa->init_state_begbuf;
1032 else if (IS_NEWLINE_CONTEXT (context))
1033 return dfa->init_state_nl;
1034 else if (IS_BEGBUF_CONTEXT (context))
1035 {
1036 /* It is relatively rare case, then calculate on demand. */
1037 return re_acquire_state_context (err, dfa,
1038 dfa->init_state->entrance_nodes,
1039 context);
1040 }
1041 else
1042 /* Must not happen? */
1043 return dfa->init_state;
1044 }
1045 else
1046 return dfa->init_state;
1047}
1048
1049/* Check whether the regular expression match input string INPUT or not,
1050 and return the index where the matching end, return -1 if not match,
1051 or return -2 in case of an error.
1052 FL_LONGEST_MATCH means we want the POSIX longest matching.
1053 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1054 next place where we may want to try matching.
1055 Note that the matcher assume that the maching starts from the current
1056 index of the buffer. */
1057
1058static int
1059check_matching (mctx, fl_longest_match, p_match_first)
1060 re_match_context_t *mctx;
1061 int fl_longest_match;
1062 int *p_match_first;
1063{
1064 re_dfa_t *const dfa = mctx->dfa;
1065 reg_errcode_t err;
1066 int match = 0;
1067 int match_last = -1;
1068 int cur_str_idx = re_string_cur_idx (&mctx->input);
1069 re_dfastate_t *cur_state;
1070 int at_init_state = p_match_first != NULL;
1071 int next_start_idx = cur_str_idx;
1072
1073 err = REG_NOERROR;
1074 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1075 /* An initial state must not be NULL (invalid). */
1076 if (BE (cur_state == NULL, 0))
1077 {
1078 assert (err == REG_ESPACE);
1079 return -2;
1080 }
1081
1082 if (mctx->state_log != NULL)
1083 {
1084 mctx->state_log[cur_str_idx] = cur_state;
1085
1086 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1087 later. E.g. Processing back references. */
1088 if (BE (dfa->nbackref, 0))
1089 {
1090 at_init_state = 0;
1091 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1092 if (BE (err != REG_NOERROR, 0))
1093 return err;
1094
1095 if (cur_state->has_backref)
1096 {
1097 err = transit_state_bkref (mctx, &cur_state->nodes);
1098 if (BE (err != REG_NOERROR, 0))
1099 return err;
1100 }
1101 }
1102 }
1103
1104 /* If the RE accepts NULL string. */
1105 if (BE (cur_state->halt, 0))
1106 {
1107 if (!cur_state->has_constraint
1108 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1109 {
1110 if (!fl_longest_match)
1111 return cur_str_idx;
1112 else
1113 {
1114 match_last = cur_str_idx;
1115 match = 1;
1116 }
1117 }
1118 }
1119
1120 while (!re_string_eoi (&mctx->input))
1121 {
1122 re_dfastate_t *old_state = cur_state;
1123 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1124
1125 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1126 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1127 && mctx->input.valid_len < mctx->input.len))
1128 {
1129 err = extend_buffers (mctx);
1130 if (BE (err != REG_NOERROR, 0))
1131 {
1132 assert (err == REG_ESPACE);
1133 return -2;
1134 }
1135 }
1136
1137 cur_state = transit_state (&err, mctx, cur_state);
1138 if (mctx->state_log != NULL)
1139 cur_state = merge_state_with_log (&err, mctx, cur_state);
1140
1141 if (cur_state == NULL)
1142 {
1143 /* Reached the invalid state or an error. Try to recover a valid
1144 state using the state log, if available and if we have not
1145 already found a valid (even if not the longest) match. */
1146 if (BE (err != REG_NOERROR, 0))
1147 return -2;
1148
1149 if (mctx->state_log == NULL
1150 || (match && !fl_longest_match)
1151 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1152 break;
1153 }
1154
1155 if (BE (at_init_state, 0))
1156 {
1157 if (old_state == cur_state)
1158 next_start_idx = next_char_idx;
1159 else
1160 at_init_state = 0;
1161 }
1162
1163 if (cur_state->halt)
1164 {
1165 /* Reached a halt state.
1166 Check the halt state can satisfy the current context. */
1167 if (!cur_state->has_constraint
1168 || check_halt_state_context (mctx, cur_state,
1169 re_string_cur_idx (&mctx->input)))
1170 {
1171 /* We found an appropriate halt state. */
1172 match_last = re_string_cur_idx (&mctx->input);
1173 match = 1;
1174
1175 /* We found a match, do not modify match_first below. */
1176 p_match_first = NULL;
1177 if (!fl_longest_match)
1178 break;
1179 }
1180 }
1181 }
1182
1183 if (p_match_first)
1184 *p_match_first += next_start_idx;
1185
1186 return match_last;
1187}
1188
1189/* Check NODE match the current context. */
1190
1191static int check_halt_node_context (dfa, node, context)
1192 const re_dfa_t *dfa;
1193 int node;
1194 unsigned int context;
1195{
1196 re_token_type_t type = dfa->nodes[node].type;
1197 unsigned int constraint = dfa->nodes[node].constraint;
1198 if (type != END_OF_RE)
1199 return 0;
1200 if (!constraint)
1201 return 1;
1202 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1203 return 0;
1204 return 1;
1205}
1206
1207/* Check the halt state STATE match the current context.
1208 Return 0 if not match, if the node, STATE has, is a halt node and
1209 match the context, return the node. */
1210
1211static int
1212check_halt_state_context (mctx, state, idx)
1213 const re_match_context_t *mctx;
1214 const re_dfastate_t *state;
1215 int idx;
1216{
1217 int i;
1218 unsigned int context;
1219#ifdef DEBUG
1220 assert (state->halt);
1221#endif
1222 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1223 for (i = 0; i < state->nodes.nelem; ++i)
1224 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1225 return state->nodes.elems[i];
1226 return 0;
1227}
1228
1229/* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1230 corresponding to the DFA).
1231 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1232 of errors. */
1233
1234static int
1235proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1236 const re_match_context_t *mctx;
1237 regmatch_t *regs;
1238 int nregs, *pidx, node;
1239 re_node_set *eps_via_nodes;
1240 struct re_fail_stack_t *fs;
1241{
1242 re_dfa_t *const dfa = mctx->dfa;
1243 int i, err, dest_node;
1244 dest_node = -1;
1245 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1246 {
1247 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1248 re_node_set *edests = &dfa->edests[node];
1249 int dest_node;
1250 err = re_node_set_insert (eps_via_nodes, node);
1251 if (BE (err < 0, 0))
1252 return -2;
1253 /* Pick up a valid destination, or return -1 if none is found. */
1254 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1255 {
1256 int candidate = edests->elems[i];
1257 if (!re_node_set_contains (cur_nodes, candidate))
1258 continue;
1259 if (dest_node == -1)
1260 dest_node = candidate;
1261
1262 else
1263 {
1264 /* In order to avoid infinite loop like "(a*)*", return the second
1265 epsilon-transition if the first was already considered. */
1266 if (re_node_set_contains (eps_via_nodes, dest_node))
1267 return candidate;
1268
1269 /* Otherwise, push the second epsilon-transition on the fail stack. */
1270 else if (fs != NULL
1271 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1272 eps_via_nodes))
1273 return -2;
1274
1275 /* We know we are going to exit. */
1276 break;
1277 }
1278 }
1279 return dest_node;
1280 }
1281 else
1282 {
1283 int naccepted = 0;
1284 re_token_type_t type = dfa->nodes[node].type;
1285
1286#ifdef RE_ENABLE_I18N
1287 if (dfa->nodes[node].accept_mb)
1288 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1289 else
1290#endif /* RE_ENABLE_I18N */
1291 if (type == OP_BACK_REF)
1292 {
1293 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1294 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1295 if (fs != NULL)
1296 {
1297 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1298 return -1;
1299 else if (naccepted)
1300 {
1301 char *buf = (char *) re_string_get_buffer (&mctx->input);
1302 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1303 naccepted) != 0)
1304 return -1;
1305 }
1306 }
1307
1308 if (naccepted == 0)
1309 {
1310 err = re_node_set_insert (eps_via_nodes, node);
1311 if (BE (err < 0, 0))
1312 return -2;
1313 dest_node = dfa->edests[node].elems[0];
1314 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1315 dest_node))
1316 return dest_node;
1317 }
1318 }
1319
1320 if (naccepted != 0
1321 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1322 {
1323 dest_node = dfa->nexts[node];
1324 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1325 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1326 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1327 dest_node)))
1328 return -1;
1329 re_node_set_empty (eps_via_nodes);
1330 return dest_node;
1331 }
1332 }
1333 return -1;
1334}
1335
1336static reg_errcode_t
1337push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1338 struct re_fail_stack_t *fs;
1339 int str_idx, dest_node, nregs;
1340 regmatch_t *regs;
1341 re_node_set *eps_via_nodes;
1342{
1343 reg_errcode_t err;
1344 int num = fs->num++;
1345 if (fs->num == fs->alloc)
1346 {
1347 struct re_fail_stack_ent_t *new_array;
1348 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1349 * fs->alloc * 2));
1350 if (new_array == NULL)
1351 return REG_ESPACE;
1352 fs->alloc *= 2;
1353 fs->stack = new_array;
1354 }
1355 fs->stack[num].idx = str_idx;
1356 fs->stack[num].node = dest_node;
1357 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1358 if (fs->stack[num].regs == NULL)
1359 return REG_ESPACE;
1360 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1361 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1362 return err;
1363}
1364
1365static int
1366pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1367 struct re_fail_stack_t *fs;
1368 int *pidx, nregs;
1369 regmatch_t *regs;
1370 re_node_set *eps_via_nodes;
1371{
1372 int num = --fs->num;
1373 assert (num >= 0);
1374 *pidx = fs->stack[num].idx;
1375 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1376 re_node_set_free (eps_via_nodes);
1377 re_free (fs->stack[num].regs);
1378 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1379 return fs->stack[num].node;
1380}
1381
1382/* Set the positions where the subexpressions are starts/ends to registers
1383 PMATCH.
1384 Note: We assume that pmatch[0] is already set, and
1385 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1386
1387static reg_errcode_t
1388set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1389 const regex_t *preg;
1390 const re_match_context_t *mctx;
1391 size_t nmatch;
1392 regmatch_t *pmatch;
1393 int fl_backtrack;
1394{
1395 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1396 int idx, cur_node;
1397 re_node_set eps_via_nodes;
1398 struct re_fail_stack_t *fs;
1399 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1400 regmatch_t *prev_idx_match;
1401
1402#ifdef DEBUG
1403 assert (nmatch > 1);
1404 assert (mctx->state_log != NULL);
1405#endif
1406 if (fl_backtrack)
1407 {
1408 fs = &fs_body;
1409 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1410 if (fs->stack == NULL)
1411 return REG_ESPACE;
1412 }
1413 else
1414 fs = NULL;
1415
1416 cur_node = dfa->init_node;
1417 re_node_set_init_empty (&eps_via_nodes);
1418
1419 prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * nmatch);
1420 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1421
1422 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1423 {
1424 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1425
1426 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1427 {
1428 int reg_idx;
1429 if (fs)
1430 {
1431 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1432 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1433 break;
1434 if (reg_idx == nmatch)
1435 {
1436 re_node_set_free (&eps_via_nodes);
1437 return free_fail_stack_return (fs);
1438 }
1439 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1440 &eps_via_nodes);
1441 }
1442 else
1443 {
1444 re_node_set_free (&eps_via_nodes);
1445 return REG_NOERROR;
1446 }
1447 }
1448
1449 /* Proceed to next node. */
1450 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1451 &eps_via_nodes, fs);
1452
1453 if (BE (cur_node < 0, 0))
1454 {
1455 if (BE (cur_node == -2, 0))
1456 {
1457 re_node_set_free (&eps_via_nodes);
1458 free_fail_stack_return (fs);
1459 return REG_ESPACE;
1460 }
1461 if (fs)
1462 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1463 &eps_via_nodes);
1464 else
1465 {
1466 re_node_set_free (&eps_via_nodes);
1467 return REG_NOMATCH;
1468 }
1469 }
1470 }
1471 re_node_set_free (&eps_via_nodes);
1472 return free_fail_stack_return (fs);
1473}
1474
1475static reg_errcode_t
1476free_fail_stack_return (fs)
1477 struct re_fail_stack_t *fs;
1478{
1479 if (fs)
1480 {
1481 int fs_idx;
1482 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1483 {
1484 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1485 re_free (fs->stack[fs_idx].regs);
1486 }
1487 re_free (fs->stack);
1488 }
1489 return REG_NOERROR;
1490}
1491
1492static void
1493update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1494 re_dfa_t *dfa;
1495 regmatch_t *pmatch, *prev_idx_match;
1496 int cur_node, cur_idx, nmatch;
1497{
1498 int type = dfa->nodes[cur_node].type;
1499 if (type == OP_OPEN_SUBEXP)
1500 {
1501 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1502
1503 /* We are at the first node of this sub expression. */
1504 if (reg_num < nmatch)
1505 {
1506 pmatch[reg_num].rm_so = cur_idx;
1507 pmatch[reg_num].rm_eo = -1;
1508 }
1509 }
1510 else if (type == OP_CLOSE_SUBEXP)
1511 {
1512 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1513 if (reg_num < nmatch)
1514 {
1515 /* We are at the last node of this sub expression. */
1516 if (pmatch[reg_num].rm_so < cur_idx)
1517 {
1518 pmatch[reg_num].rm_eo = cur_idx;
1519 /* This is a non-empty match or we are not inside an optional
1520 subexpression. Accept this right away. */
1521 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1522 }
1523 else
1524 {
1525 if (dfa->nodes[cur_node].opt_subexp
1526 && prev_idx_match[reg_num].rm_so != -1)
1527 /* We transited through an empty match for an optional
1528 subexpression, like (a?)*, and this is not the subexp's
1529 first match. Copy back the old content of the registers
1530 so that matches of an inner subexpression are undone as
1531 well, like in ((a?))*. */
1532 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1533 else
1534 /* We completed a subexpression, but it may be part of
1535 an optional one, so do not update PREV_IDX_MATCH. */
1536 pmatch[reg_num].rm_eo = cur_idx;
1537 }
1538 }
1539 }
1540}
1541
1542/* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1543 and sift the nodes in each states according to the following rules.
1544 Updated state_log will be wrote to STATE_LOG.
1545
1546 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1547 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1548 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1549 the LAST_NODE, we throw away the node `a'.
1550 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1551 string `s' and transit to `b':
1552 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1553 away the node `a'.
1554 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1555 thrown away, we throw away the node `a'.
1556 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1557 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1558 node `a'.
1559 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1560 we throw away the node `a'. */
1561
1562#define STATE_NODE_CONTAINS(state,node) \
1563 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1564
1565static reg_errcode_t
1566sift_states_backward (mctx, sctx)
1567 re_match_context_t *mctx;
1568 re_sift_context_t *sctx;
1569{
1570 reg_errcode_t err;
1571 int null_cnt = 0;
1572 int str_idx = sctx->last_str_idx;
1573 re_node_set cur_dest;
1574
1575#ifdef DEBUG
1576 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577#endif
1578
1579 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1580 transit to the last_node and the last_node itself. */
1581 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1582 if (BE (err != REG_NOERROR, 0))
1583 return err;
1584 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1585 if (BE (err != REG_NOERROR, 0))
1586 goto free_return;
1587
1588 /* Then check each states in the state_log. */
1589 while (str_idx > 0)
1590 {
1591 /* Update counters. */
1592 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1593 if (null_cnt > mctx->max_mb_elem_len)
1594 {
1595 memset (sctx->sifted_states, '\0',
1596 sizeof (re_dfastate_t *) * str_idx);
1597 re_node_set_free (&cur_dest);
1598 return REG_NOERROR;
1599 }
1600 re_node_set_empty (&cur_dest);
1601 --str_idx;
1602
1603 if (mctx->state_log[str_idx])
1604 {
1605 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1606 if (BE (err != REG_NOERROR, 0))
1607 goto free_return;
1608 }
1609
1610 /* Add all the nodes which satisfy the following conditions:
1611 - It can epsilon transit to a node in CUR_DEST.
1612 - It is in CUR_SRC.
1613 And update state_log. */
1614 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615 if (BE (err != REG_NOERROR, 0))
1616 goto free_return;
1617 }
1618 err = REG_NOERROR;
1619 free_return:
1620 re_node_set_free (&cur_dest);
1621 return err;
1622}
1623
1624static reg_errcode_t
1625build_sifted_states (mctx, sctx, str_idx, cur_dest)
1626 re_match_context_t *mctx;
1627 re_sift_context_t *sctx;
1628 int str_idx;
1629 re_node_set *cur_dest;
1630{
1631 re_dfa_t *const dfa = mctx->dfa;
1632 re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1633 int i;
1634
1635 /* Then build the next sifted state.
1636 We build the next sifted state on `cur_dest', and update
1637 `sifted_states[str_idx]' with `cur_dest'.
1638 Note:
1639 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1640 `cur_src' points the node_set of the old `state_log[str_idx]'
1641 (with the epsilon nodes pre-filtered out). */
1642 for (i = 0; i < cur_src->nelem; i++)
1643 {
1644 int prev_node = cur_src->elems[i];
1645 int naccepted = 0;
1646 int ret;
1647
1648#ifdef DEBUG
1649 re_token_type_t type = dfa->nodes[prev_node].type;
1650 assert (!IS_EPSILON_NODE (type));
1651#endif
1652#ifdef RE_ENABLE_I18N
1653 /* If the node may accept `multi byte'. */
1654 if (dfa->nodes[prev_node].accept_mb)
1655 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1656 str_idx, sctx->last_str_idx);
1657#endif /* RE_ENABLE_I18N */
1658
1659 /* We don't check backreferences here.
1660 See update_cur_sifted_state(). */
1661 if (!naccepted
1662 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1663 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1664 dfa->nexts[prev_node]))
1665 naccepted = 1;
1666
1667 if (naccepted == 0)
1668 continue;
1669
1670 if (sctx->limits.nelem)
1671 {
1672 int to_idx = str_idx + naccepted;
1673 if (check_dst_limits (mctx, &sctx->limits,
1674 dfa->nexts[prev_node], to_idx,
1675 prev_node, str_idx))
1676 continue;
1677 }
1678 ret = re_node_set_insert (cur_dest, prev_node);
1679 if (BE (ret == -1, 0))
1680 return REG_ESPACE;
1681 }
1682
1683 return REG_NOERROR;
1684}
1685
1686/* Helper functions. */
1687
1688static reg_errcode_t
1689clean_state_log_if_needed (mctx, next_state_log_idx)
1690 re_match_context_t *mctx;
1691 int next_state_log_idx;
1692{
1693 int top = mctx->state_log_top;
1694
1695 if (next_state_log_idx >= mctx->input.bufs_len
1696 || (next_state_log_idx >= mctx->input.valid_len
1697 && mctx->input.valid_len < mctx->input.len))
1698 {
1699 reg_errcode_t err;
1700 err = extend_buffers (mctx);
1701 if (BE (err != REG_NOERROR, 0))
1702 return err;
1703 }
1704
1705 if (top < next_state_log_idx)
1706 {
1707 memset (mctx->state_log + top + 1, '\0',
1708 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1709 mctx->state_log_top = next_state_log_idx;
1710 }
1711 return REG_NOERROR;
1712}
1713
1714static reg_errcode_t
1715merge_state_array (dfa, dst, src, num)
1716 re_dfa_t *dfa;
1717 re_dfastate_t **dst;
1718 re_dfastate_t **src;
1719 int num;
1720{
1721 int st_idx;
1722 reg_errcode_t err;
1723 for (st_idx = 0; st_idx < num; ++st_idx)
1724 {
1725 if (dst[st_idx] == NULL)
1726 dst[st_idx] = src[st_idx];
1727 else if (src[st_idx] != NULL)
1728 {
1729 re_node_set merged_set;
1730 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1731 &src[st_idx]->nodes);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1735 re_node_set_free (&merged_set);
1736 if (BE (err != REG_NOERROR, 0))
1737 return err;
1738 }
1739 }
1740 return REG_NOERROR;
1741}
1742
1743static reg_errcode_t
1744update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1745 re_match_context_t *mctx;
1746 re_sift_context_t *sctx;
1747 int str_idx;
1748 re_node_set *dest_nodes;
1749{
1750 re_dfa_t *const dfa = mctx->dfa;
1751 reg_errcode_t err;
1752 const re_node_set *candidates;
1753 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1754 : &mctx->state_log[str_idx]->nodes);
1755
1756 if (dest_nodes->nelem == 0)
1757 sctx->sifted_states[str_idx] = NULL;
1758 else
1759 {
1760 if (candidates)
1761 {
1762 /* At first, add the nodes which can epsilon transit to a node in
1763 DEST_NODE. */
1764 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1765 if (BE (err != REG_NOERROR, 0))
1766 return err;
1767
1768 /* Then, check the limitations in the current sift_context. */
1769 if (sctx->limits.nelem)
1770 {
1771 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1772 mctx->bkref_ents, str_idx);
1773 if (BE (err != REG_NOERROR, 0))
1774 return err;
1775 }
1776 }
1777
1778 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1779 if (BE (err != REG_NOERROR, 0))
1780 return err;
1781 }
1782
1783 if (candidates && mctx->state_log[str_idx]->has_backref)
1784 {
1785 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1786 if (BE (err != REG_NOERROR, 0))
1787 return err;
1788 }
1789 return REG_NOERROR;
1790}
1791
1792static reg_errcode_t
1793add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1794 re_dfa_t *dfa;
1795 re_node_set *dest_nodes;
1796 const re_node_set *candidates;
1797{
1798 reg_errcode_t err = REG_NOERROR;
1799 int i;
1800
1801 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1802 if (BE (err != REG_NOERROR, 0))
1803 return err;
1804
1805 if (!state->inveclosure.alloc)
1806 {
1807 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1808 if (BE (err != REG_NOERROR, 0))
1809 return REG_ESPACE;
1810 for (i = 0; i < dest_nodes->nelem; i++)
1811 re_node_set_merge (&state->inveclosure,
1812 dfa->inveclosures + dest_nodes->elems[i]);
1813 }
1814 return re_node_set_add_intersect (dest_nodes, candidates,
1815 &state->inveclosure);
1816}
1817
1818static reg_errcode_t
1819sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1820 re_dfa_t *dfa;
1821 int node;
1822 re_node_set *dest_nodes;
1823 const re_node_set *candidates;
1824{
1825 int ecl_idx;
1826 reg_errcode_t err;
1827 re_node_set *inv_eclosure = dfa->inveclosures + node;
1828 re_node_set except_nodes;
1829 re_node_set_init_empty (&except_nodes);
1830 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1831 {
1832 int cur_node = inv_eclosure->elems[ecl_idx];
1833 if (cur_node == node)
1834 continue;
1835 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1836 {
1837 int edst1 = dfa->edests[cur_node].elems[0];
1838 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1839 ? dfa->edests[cur_node].elems[1] : -1);
1840 if ((!re_node_set_contains (inv_eclosure, edst1)
1841 && re_node_set_contains (dest_nodes, edst1))
1842 || (edst2 > 0
1843 && !re_node_set_contains (inv_eclosure, edst2)
1844 && re_node_set_contains (dest_nodes, edst2)))
1845 {
1846 err = re_node_set_add_intersect (&except_nodes, candidates,
1847 dfa->inveclosures + cur_node);
1848 if (BE (err != REG_NOERROR, 0))
1849 {
1850 re_node_set_free (&except_nodes);
1851 return err;
1852 }
1853 }
1854 }
1855 }
1856 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1857 {
1858 int cur_node = inv_eclosure->elems[ecl_idx];
1859 if (!re_node_set_contains (&except_nodes, cur_node))
1860 {
1861 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1862 re_node_set_remove_at (dest_nodes, idx);
1863 }
1864 }
1865 re_node_set_free (&except_nodes);
1866 return REG_NOERROR;
1867}
1868
1869static int
1870check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1871 re_match_context_t *mctx;
1872 re_node_set *limits;
1873 int dst_node, dst_idx, src_node, src_idx;
1874{
1875 re_dfa_t *const dfa = mctx->dfa;
1876 int lim_idx, src_pos, dst_pos;
1877
1878 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1879 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1880 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1881 {
1882 int subexp_idx;
1883 struct re_backref_cache_entry *ent;
1884 ent = mctx->bkref_ents + limits->elems[lim_idx];
1885 subexp_idx = dfa->nodes[ent->node].opr.idx;
1886
1887 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1888 subexp_idx, dst_node, dst_idx,
1889 dst_bkref_idx);
1890 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1891 subexp_idx, src_node, src_idx,
1892 src_bkref_idx);
1893
1894 /* In case of:
1895 <src> <dst> ( <subexp> )
1896 ( <subexp> ) <src> <dst>
1897 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1898 if (src_pos == dst_pos)
1899 continue; /* This is unrelated limitation. */
1900 else
1901 return 1;
1902 }
1903 return 0;
1904}
1905
1906static int
1907check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1908 re_match_context_t *mctx;
1909 int boundaries, subexp_idx, from_node, bkref_idx;
1910{
1911 re_dfa_t *const dfa = mctx->dfa;
1912 re_node_set *eclosures = dfa->eclosures + from_node;
1913 int node_idx;
1914
1915 /* Else, we are on the boundary: examine the nodes on the epsilon
1916 closure. */
1917 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1918 {
1919 int node = eclosures->elems[node_idx];
1920 switch (dfa->nodes[node].type)
1921 {
1922 case OP_BACK_REF:
1923 if (bkref_idx != -1)
1924 {
1925 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1926 do
1927 {
1928 int dst, cpos;
1929
1930 if (ent->node != node)
1931 continue;
1932
1933 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1934 && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1935 continue;
1936
1937 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1938 OP_CLOSE_SUBEXP cases below. But, if the
1939 destination node is the same node as the source
1940 node, don't recurse because it would cause an
1941 infinite loop: a regex that exhibits this behavior
1942 is ()\1*\1* */
1943 dst = dfa->edests[node].elems[0];
1944 if (dst == from_node)
1945 {
1946 if (boundaries & 1)
1947 return -1;
1948 else /* if (boundaries & 2) */
1949 return 0;
1950 }
1951
1952 cpos =
1953 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1954 dst, bkref_idx);
1955 if (cpos == -1 /* && (boundaries & 1) */)
1956 return -1;
1957 if (cpos == 0 && (boundaries & 2))
1958 return 0;
1959
1960 ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1961 }
1962 while (ent++->more);
1963 }
1964 break;
1965
1966 case OP_OPEN_SUBEXP:
1967 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1968 return -1;
1969 break;
1970
1971 case OP_CLOSE_SUBEXP:
1972 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1973 return 0;
1974 break;
1975
1976 default:
1977 break;
1978 }
1979 }
1980
1981 return (boundaries & 2) ? 1 : 0;
1982}
1983
1984static int
1985check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1986 re_match_context_t *mctx;
1987 int limit, subexp_idx, from_node, str_idx, bkref_idx;
1988{
1989 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1990 int boundaries;
1991
1992 /* If we are outside the range of the subexpression, return -1 or 1. */
1993 if (str_idx < lim->subexp_from)
1994 return -1;
1995
1996 if (lim->subexp_to < str_idx)
1997 return 1;
1998
1999 /* If we are within the subexpression, return 0. */
2000 boundaries = (str_idx == lim->subexp_from);
2001 boundaries |= (str_idx == lim->subexp_to) << 1;
2002 if (boundaries == 0)
2003 return 0;
2004
2005 /* Else, examine epsilon closure. */
2006 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2007 from_node, bkref_idx);
2008}
2009
2010/* Check the limitations of sub expressions LIMITS, and remove the nodes
2011 which are against limitations from DEST_NODES. */
2012
2013static reg_errcode_t
2014check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2015 re_dfa_t *dfa;
2016 re_node_set *dest_nodes;
2017 const re_node_set *candidates;
2018 re_node_set *limits;
2019 struct re_backref_cache_entry *bkref_ents;
2020 int str_idx;
2021{
2022 reg_errcode_t err;
2023 int node_idx, lim_idx;
2024
2025 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2026 {
2027 int subexp_idx;
2028 struct re_backref_cache_entry *ent;
2029 ent = bkref_ents + limits->elems[lim_idx];
2030
2031 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2032 continue; /* This is unrelated limitation. */
2033
2034 subexp_idx = dfa->nodes[ent->node].opr.idx;
2035 if (ent->subexp_to == str_idx)
2036 {
2037 int ops_node = -1;
2038 int cls_node = -1;
2039 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2040 {
2041 int node = dest_nodes->elems[node_idx];
2042 re_token_type_t type = dfa->nodes[node].type;
2043 if (type == OP_OPEN_SUBEXP
2044 && subexp_idx == dfa->nodes[node].opr.idx)
2045 ops_node = node;
2046 else if (type == OP_CLOSE_SUBEXP
2047 && subexp_idx == dfa->nodes[node].opr.idx)
2048 cls_node = node;
2049 }
2050
2051 /* Check the limitation of the open subexpression. */
2052 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2053 if (ops_node >= 0)
2054 {
2055 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2056 candidates);
2057 if (BE (err != REG_NOERROR, 0))
2058 return err;
2059 }
2060
2061 /* Check the limitation of the close subexpression. */
2062 if (cls_node >= 0)
2063 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2064 {
2065 int node = dest_nodes->elems[node_idx];
2066 if (!re_node_set_contains (dfa->inveclosures + node,
2067 cls_node)
2068 && !re_node_set_contains (dfa->eclosures + node,
2069 cls_node))
2070 {
2071 /* It is against this limitation.
2072 Remove it form the current sifted state. */
2073 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2074 candidates);
2075 if (BE (err != REG_NOERROR, 0))
2076 return err;
2077 --node_idx;
2078 }
2079 }
2080 }
2081 else /* (ent->subexp_to != str_idx) */
2082 {
2083 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2084 {
2085 int node = dest_nodes->elems[node_idx];
2086 re_token_type_t type = dfa->nodes[node].type;
2087 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2088 {
2089 if (subexp_idx != dfa->nodes[node].opr.idx)
2090 continue;
2091 /* It is against this limitation.
2092 Remove it form the current sifted state. */
2093 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2094 candidates);
2095 if (BE (err != REG_NOERROR, 0))
2096 return err;
2097 }
2098 }
2099 }
2100 }
2101 return REG_NOERROR;
2102}
2103
2104static reg_errcode_t
2105sift_states_bkref (mctx, sctx, str_idx, candidates)
2106 re_match_context_t *mctx;
2107 re_sift_context_t *sctx;
2108 int str_idx;
2109 const re_node_set *candidates;
2110{
2111 re_dfa_t *const dfa = mctx->dfa;
2112 reg_errcode_t err;
2113 int node_idx, node;
2114 re_sift_context_t local_sctx;
2115 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2116
2117 if (first_idx == -1)
2118 return REG_NOERROR;
2119
2120 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2121
2122 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2123 {
2124 int enabled_idx;
2125 re_token_type_t type;
2126 struct re_backref_cache_entry *entry;
2127 node = candidates->elems[node_idx];
2128 type = dfa->nodes[node].type;
2129 /* Avoid infinite loop for the REs like "()\1+". */
2130 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2131 continue;
2132 if (type != OP_BACK_REF)
2133 continue;
2134
2135 entry = mctx->bkref_ents + first_idx;
2136 enabled_idx = first_idx;
2137 do
2138 {
2139 int subexp_len, to_idx, dst_node;
2140 re_dfastate_t *cur_state;
2141
2142 if (entry->node != node)
2143 continue;
2144 subexp_len = entry->subexp_to - entry->subexp_from;
2145 to_idx = str_idx + subexp_len;
2146 dst_node = (subexp_len ? dfa->nexts[node]
2147 : dfa->edests[node].elems[0]);
2148
2149 if (to_idx > sctx->last_str_idx
2150 || sctx->sifted_states[to_idx] == NULL
2151 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2152 || check_dst_limits (mctx, &sctx->limits, node,
2153 str_idx, dst_node, to_idx))
2154 continue;
2155
2156 if (local_sctx.sifted_states == NULL)
2157 {
2158 local_sctx = *sctx;
2159 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2160 if (BE (err != REG_NOERROR, 0))
2161 goto free_return;
2162 }
2163 local_sctx.last_node = node;
2164 local_sctx.last_str_idx = str_idx;
2165 err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2166 if (BE (err < 0, 0))
2167 {
2168 err = REG_ESPACE;
2169 goto free_return;
2170 }
2171 cur_state = local_sctx.sifted_states[str_idx];
2172 err = sift_states_backward (mctx, &local_sctx);
2173 if (BE (err != REG_NOERROR, 0))
2174 goto free_return;
2175 if (sctx->limited_states != NULL)
2176 {
2177 err = merge_state_array (dfa, sctx->limited_states,
2178 local_sctx.sifted_states,
2179 str_idx + 1);
2180 if (BE (err != REG_NOERROR, 0))
2181 goto free_return;
2182 }
2183 local_sctx.sifted_states[str_idx] = cur_state;
2184 re_node_set_remove (&local_sctx.limits, enabled_idx);
2185
2186 /* mctx->bkref_ents may have changed, reload the pointer. */
2187 entry = mctx->bkref_ents + enabled_idx;
2188 }
2189 while (enabled_idx++, entry++->more);
2190 }
2191 err = REG_NOERROR;
2192 free_return:
2193 if (local_sctx.sifted_states != NULL)
2194 {
2195 re_node_set_free (&local_sctx.limits);
2196 }
2197
2198 return err;
2199}
2200
2201
2202#ifdef RE_ENABLE_I18N
2203static int
2204sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2205 const re_match_context_t *mctx;
2206 re_sift_context_t *sctx;
2207 int node_idx, str_idx, max_str_idx;
2208{
2209 re_dfa_t *const dfa = mctx->dfa;
2210 int naccepted;
2211 /* Check the node can accept `multi byte'. */
2212 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2213 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2214 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2215 dfa->nexts[node_idx]))
2216 /* The node can't accept the `multi byte', or the
2217 destination was already thrown away, then the node
2218 could't accept the current input `multi byte'. */
2219 naccepted = 0;
2220 /* Otherwise, it is sure that the node could accept
2221 `naccepted' bytes input. */
2222 return naccepted;
2223}
2224#endif /* RE_ENABLE_I18N */
2225
2226
2227
2228/* Functions for state transition. */
2229
2230/* Return the next state to which the current state STATE will transit by
2231 accepting the current input byte, and update STATE_LOG if necessary.
2232 If STATE can accept a multibyte char/collating element/back reference
2233 update the destination of STATE_LOG. */
2234
2235static re_dfastate_t *
2236transit_state (err, mctx, state)
2237 reg_errcode_t *err;
2238 re_match_context_t *mctx;
2239 re_dfastate_t *state;
2240{
2241 re_dfastate_t **trtable;
2242 unsigned char ch;
2243
2244#ifdef RE_ENABLE_I18N
2245 /* If the current state can accept multibyte. */
2246 if (BE (state->accept_mb, 0))
2247 {
2248 *err = transit_state_mb (mctx, state);
2249 if (BE (*err != REG_NOERROR, 0))
2250 return NULL;
2251 }
2252#endif /* RE_ENABLE_I18N */
2253
2254 /* Then decide the next state with the single byte. */
2255#if 0
2256 if (0)
2257 /* don't use transition table */
2258 return transit_state_sb (err, mctx, state);
2259#endif
2260
2261 /* Use transition table */
2262 ch = re_string_fetch_byte (&mctx->input);
2263 for (;;)
2264 {
2265 trtable = state->trtable;
2266 if (BE (trtable != NULL, 1))
2267 return trtable[ch];
2268
2269 trtable = state->word_trtable;
2270 if (BE (trtable != NULL, 1))
2271 {
2272 unsigned int context;
2273 context
2274 = re_string_context_at (&mctx->input,
2275 re_string_cur_idx (&mctx->input) - 1,
2276 mctx->eflags);
2277 if (IS_WORD_CONTEXT (context))
2278 return trtable[ch + SBC_MAX];
2279 else
2280 return trtable[ch];
2281 }
2282
2283 if (!build_trtable (mctx->dfa, state))
2284 {
2285 *err = REG_ESPACE;
2286 return NULL;
2287 }
2288
2289 /* Retry, we now have a transition table. */
2290 }
2291}
2292
2293/* Update the state_log if we need */
2294re_dfastate_t *
2295merge_state_with_log (err, mctx, next_state)
2296 reg_errcode_t *err;
2297 re_match_context_t *mctx;
2298 re_dfastate_t *next_state;
2299{
2300 re_dfa_t *const dfa = mctx->dfa;
2301 int cur_idx = re_string_cur_idx (&mctx->input);
2302
2303 if (cur_idx > mctx->state_log_top)
2304 {
2305 mctx->state_log[cur_idx] = next_state;
2306 mctx->state_log_top = cur_idx;
2307 }
2308 else if (mctx->state_log[cur_idx] == 0)
2309 {
2310 mctx->state_log[cur_idx] = next_state;
2311 }
2312 else
2313 {
2314 re_dfastate_t *pstate;
2315 unsigned int context;
2316 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2317 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2318 the destination of a multibyte char/collating element/
2319 back reference. Then the next state is the union set of
2320 these destinations and the results of the transition table. */
2321 pstate = mctx->state_log[cur_idx];
2322 log_nodes = pstate->entrance_nodes;
2323 if (next_state != NULL)
2324 {
2325 table_nodes = next_state->entrance_nodes;
2326 *err = re_node_set_init_union (&next_nodes, table_nodes,
2327 log_nodes);
2328 if (BE (*err != REG_NOERROR, 0))
2329 return NULL;
2330 }
2331 else
2332 next_nodes = *log_nodes;
2333 /* Note: We already add the nodes of the initial state,
2334 then we don't need to add them here. */
2335
2336 context = re_string_context_at (&mctx->input,
2337 re_string_cur_idx (&mctx->input) - 1,
2338 mctx->eflags);
2339 next_state = mctx->state_log[cur_idx]
2340 = re_acquire_state_context (err, dfa, &next_nodes, context);
2341 /* We don't need to check errors here, since the return value of
2342 this function is next_state and ERR is already set. */
2343
2344 if (table_nodes != NULL)
2345 re_node_set_free (&next_nodes);
2346 }
2347
2348 if (BE (dfa->nbackref, 0) && next_state != NULL)
2349 {
2350 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2351 later. We must check them here, since the back references in the
2352 next state might use them. */
2353 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2354 cur_idx);
2355 if (BE (*err != REG_NOERROR, 0))
2356 return NULL;
2357
2358 /* If the next state has back references. */
2359 if (next_state->has_backref)
2360 {
2361 *err = transit_state_bkref (mctx, &next_state->nodes);
2362 if (BE (*err != REG_NOERROR, 0))
2363 return NULL;
2364 next_state = mctx->state_log[cur_idx];
2365 }
2366 }
2367
2368 return next_state;
2369}
2370
2371/* Skip bytes in the input that correspond to part of a
2372 multi-byte match, then look in the log for a state
2373 from which to restart matching. */
2374re_dfastate_t *
2375find_recover_state (err, mctx)
2376 reg_errcode_t *err;
2377 re_match_context_t *mctx;
2378{
2379 re_dfastate_t *cur_state = NULL;
2380 do
2381 {
2382 int max = mctx->state_log_top;
2383 int cur_str_idx = re_string_cur_idx (&mctx->input);
2384
2385 do
2386 {
2387 if (++cur_str_idx > max)
2388 return NULL;
2389 re_string_skip_bytes (&mctx->input, 1);
2390 }
2391 while (mctx->state_log[cur_str_idx] == NULL);
2392
2393 cur_state = merge_state_with_log (err, mctx, NULL);
2394 }
2395 while (err == REG_NOERROR && cur_state == NULL);
2396 return cur_state;
2397}
2398
2399/* Helper functions for transit_state. */
2400
2401/* From the node set CUR_NODES, pick up the nodes whose types are
2402 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2403 expression. And register them to use them later for evaluating the
2404 correspoding back references. */
2405
2406static reg_errcode_t
2407check_subexp_matching_top (mctx, cur_nodes, str_idx)
2408 re_match_context_t *mctx;
2409 re_node_set *cur_nodes;
2410 int str_idx;
2411{
2412 re_dfa_t *const dfa = mctx->dfa;
2413 int node_idx;
2414 reg_errcode_t err;
2415
2416 /* TODO: This isn't efficient.
2417 Because there might be more than one nodes whose types are
2418 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2419 nodes.
2420 E.g. RE: (a){2} */
2421 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2422 {
2423 int node = cur_nodes->elems[node_idx];
2424 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2425 && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2426 && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2427 {
2428 err = match_ctx_add_subtop (mctx, node, str_idx);
2429 if (BE (err != REG_NOERROR, 0))
2430 return err;
2431 }
2432 }
2433 return REG_NOERROR;
2434}
2435
2436#if 0
2437/* Return the next state to which the current state STATE will transit by
2438 accepting the current input byte. */
2439
2440static re_dfastate_t *
2441transit_state_sb (err, mctx, state)
2442 reg_errcode_t *err;
2443 re_match_context_t *mctx;
2444 re_dfastate_t *state;
2445{
2446 re_dfa_t *const dfa = mctx->dfa;
2447 re_node_set next_nodes;
2448 re_dfastate_t *next_state;
2449 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2450 unsigned int context;
2451
2452 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2453 if (BE (*err != REG_NOERROR, 0))
2454 return NULL;
2455 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2456 {
2457 int cur_node = state->nodes.elems[node_cnt];
2458 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2459 {
2460 *err = re_node_set_merge (&next_nodes,
2461 dfa->eclosures + dfa->nexts[cur_node]);
2462 if (BE (*err != REG_NOERROR, 0))
2463 {
2464 re_node_set_free (&next_nodes);
2465 return NULL;
2466 }
2467 }
2468 }
2469 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2470 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2471 /* We don't need to check errors here, since the return value of
2472 this function is next_state and ERR is already set. */
2473
2474 re_node_set_free (&next_nodes);
2475 re_string_skip_bytes (&mctx->input, 1);
2476 return next_state;
2477}
2478#endif
2479
2480#ifdef RE_ENABLE_I18N
2481static reg_errcode_t
2482transit_state_mb (mctx, pstate)
2483 re_match_context_t *mctx;
2484 re_dfastate_t *pstate;
2485{
2486 re_dfa_t *const dfa = mctx->dfa;
2487 reg_errcode_t err;
2488 int i;
2489
2490 for (i = 0; i < pstate->nodes.nelem; ++i)
2491 {
2492 re_node_set dest_nodes, *new_nodes;
2493 int cur_node_idx = pstate->nodes.elems[i];
2494 int naccepted, dest_idx;
2495 unsigned int context;
2496 re_dfastate_t *dest_state;
2497
2498 if (!dfa->nodes[cur_node_idx].accept_mb)
2499 continue;
2500
2501 if (dfa->nodes[cur_node_idx].constraint)
2502 {
2503 context = re_string_context_at (&mctx->input,
2504 re_string_cur_idx (&mctx->input),
2505 mctx->eflags);
2506 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2507 context))
2508 continue;
2509 }
2510
2511 /* How many bytes the node can accept? */
2512 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2513 re_string_cur_idx (&mctx->input));
2514 if (naccepted == 0)
2515 continue;
2516
2517 /* The node can accepts `naccepted' bytes. */
2518 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2519 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2520 : mctx->max_mb_elem_len);
2521 err = clean_state_log_if_needed (mctx, dest_idx);
2522 if (BE (err != REG_NOERROR, 0))
2523 return err;
2524#ifdef DEBUG
2525 assert (dfa->nexts[cur_node_idx] != -1);
2526#endif
2527 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2528
2529 dest_state = mctx->state_log[dest_idx];
2530 if (dest_state == NULL)
2531 dest_nodes = *new_nodes;
2532 else
2533 {
2534 err = re_node_set_init_union (&dest_nodes,
2535 dest_state->entrance_nodes, new_nodes);
2536 if (BE (err != REG_NOERROR, 0))
2537 return err;
2538 }
2539 context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2540 mctx->state_log[dest_idx]
2541 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2542 if (dest_state != NULL)
2543 re_node_set_free (&dest_nodes);
2544 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2545 return err;
2546 }
2547 return REG_NOERROR;
2548}
2549#endif /* RE_ENABLE_I18N */
2550
2551static reg_errcode_t
2552transit_state_bkref (mctx, nodes)
2553 re_match_context_t *mctx;
2554 const re_node_set *nodes;
2555{
2556 re_dfa_t *const dfa = mctx->dfa;
2557 reg_errcode_t err;
2558 int i;
2559 int cur_str_idx = re_string_cur_idx (&mctx->input);
2560
2561 for (i = 0; i < nodes->nelem; ++i)
2562 {
2563 int dest_str_idx, prev_nelem, bkc_idx;
2564 int node_idx = nodes->elems[i];
2565 unsigned int context;
2566 const re_token_t *node = dfa->nodes + node_idx;
2567 re_node_set *new_dest_nodes;
2568
2569 /* Check whether `node' is a backreference or not. */
2570 if (node->type != OP_BACK_REF)
2571 continue;
2572
2573 if (node->constraint)
2574 {
2575 context = re_string_context_at (&mctx->input, cur_str_idx,
2576 mctx->eflags);
2577 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2578 continue;
2579 }
2580
2581 /* `node' is a backreference.
2582 Check the substring which the substring matched. */
2583 bkc_idx = mctx->nbkref_ents;
2584 err = get_subexp (mctx, node_idx, cur_str_idx);
2585 if (BE (err != REG_NOERROR, 0))
2586 goto free_return;
2587
2588 /* And add the epsilon closures (which is `new_dest_nodes') of
2589 the backreference to appropriate state_log. */
2590#ifdef DEBUG
2591 assert (dfa->nexts[node_idx] != -1);
2592#endif
2593 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2594 {
2595 int subexp_len;
2596 re_dfastate_t *dest_state;
2597 struct re_backref_cache_entry *bkref_ent;
2598 bkref_ent = mctx->bkref_ents + bkc_idx;
2599 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2600 continue;
2601 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2602 new_dest_nodes = (subexp_len == 0
2603 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2604 : dfa->eclosures + dfa->nexts[node_idx]);
2605 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2606 - bkref_ent->subexp_from);
2607 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2608 mctx->eflags);
2609 dest_state = mctx->state_log[dest_str_idx];
2610 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2611 : mctx->state_log[cur_str_idx]->nodes.nelem);
2612 /* Add `new_dest_node' to state_log. */
2613 if (dest_state == NULL)
2614 {
2615 mctx->state_log[dest_str_idx]
2616 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2617 context);
2618 if (BE (mctx->state_log[dest_str_idx] == NULL
2619 && err != REG_NOERROR, 0))
2620 goto free_return;
2621 }
2622 else
2623 {
2624 re_node_set dest_nodes;
2625 err = re_node_set_init_union (&dest_nodes,
2626 dest_state->entrance_nodes,
2627 new_dest_nodes);
2628 if (BE (err != REG_NOERROR, 0))
2629 {
2630 re_node_set_free (&dest_nodes);
2631 goto free_return;
2632 }
2633 mctx->state_log[dest_str_idx]
2634 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2635 re_node_set_free (&dest_nodes);
2636 if (BE (mctx->state_log[dest_str_idx] == NULL
2637 && err != REG_NOERROR, 0))
2638 goto free_return;
2639 }
2640 /* We need to check recursively if the backreference can epsilon
2641 transit. */
2642 if (subexp_len == 0
2643 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2644 {
2645 err = check_subexp_matching_top (mctx, new_dest_nodes,
2646 cur_str_idx);
2647 if (BE (err != REG_NOERROR, 0))
2648 goto free_return;
2649 err = transit_state_bkref (mctx, new_dest_nodes);
2650 if (BE (err != REG_NOERROR, 0))
2651 goto free_return;
2652 }
2653 }
2654 }
2655 err = REG_NOERROR;
2656 free_return:
2657 return err;
2658}
2659
2660/* Enumerate all the candidates which the backreference BKREF_NODE can match
2661 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2662 Note that we might collect inappropriate candidates here.
2663 However, the cost of checking them strictly here is too high, then we
2664 delay these checking for prune_impossible_nodes(). */
2665
2666static reg_errcode_t
2667get_subexp (mctx, bkref_node, bkref_str_idx)
2668 re_match_context_t *mctx;
2669 int bkref_node, bkref_str_idx;
2670{
2671 re_dfa_t *const dfa = mctx->dfa;
2672 int subexp_num, sub_top_idx;
2673 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2674 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2675 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2676 if (cache_idx != -1)
2677 {
2678 const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2679 do
2680 if (entry->node == bkref_node)
2681 return REG_NOERROR; /* We already checked it. */
2682 while (entry++->more);
2683 }
2684
2685 subexp_num = dfa->nodes[bkref_node].opr.idx;
2686
2687 /* For each sub expression */
2688 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2689 {
2690 reg_errcode_t err;
2691 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2692 re_sub_match_last_t *sub_last;
2693 int sub_last_idx, sl_str, bkref_str_off;
2694
2695 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2696 continue; /* It isn't related. */
2697
2698 sl_str = sub_top->str_idx;
2699 bkref_str_off = bkref_str_idx;
2700 /* At first, check the last node of sub expressions we already
2701 evaluated. */
2702 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2703 {
2704 int sl_str_diff;
2705 sub_last = sub_top->lasts[sub_last_idx];
2706 sl_str_diff = sub_last->str_idx - sl_str;
2707 /* The matched string by the sub expression match with the substring
2708 at the back reference? */
2709 if (sl_str_diff > 0)
2710 {
2711 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2712 {
2713 /* Not enough chars for a successful match. */
2714 if (bkref_str_off + sl_str_diff > mctx->input.len)
2715 break;
2716
2717 err = clean_state_log_if_needed (mctx,
2718 bkref_str_off
2719 + sl_str_diff);
2720 if (BE (err != REG_NOERROR, 0))
2721 return err;
2722 buf = (const char *) re_string_get_buffer (&mctx->input);
2723 }
2724 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2725 break; /* We don't need to search this sub expression any more. */
2726 }
2727 bkref_str_off += sl_str_diff;
2728 sl_str += sl_str_diff;
2729 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2730 bkref_str_idx);
2731
2732 /* Reload buf, since the preceding call might have reallocated
2733 the buffer. */
2734 buf = (const char *) re_string_get_buffer (&mctx->input);
2735
2736 if (err == REG_NOMATCH)
2737 continue;
2738 if (BE (err != REG_NOERROR, 0))
2739 return err;
2740 }
2741
2742 if (sub_last_idx < sub_top->nlasts)
2743 continue;
2744 if (sub_last_idx > 0)
2745 ++sl_str;
2746 /* Then, search for the other last nodes of the sub expression. */
2747 for (; sl_str <= bkref_str_idx; ++sl_str)
2748 {
2749 int cls_node, sl_str_off;
2750 const re_node_set *nodes;
2751 sl_str_off = sl_str - sub_top->str_idx;
2752 /* The matched string by the sub expression match with the substring
2753 at the back reference? */
2754 if (sl_str_off > 0)
2755 {
2756 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2757 {
2758 /* If we are at the end of the input, we cannot match. */
2759 if (bkref_str_off >= mctx->input.len)
2760 break;
2761
2762 err = extend_buffers (mctx);
2763 if (BE (err != REG_NOERROR, 0))
2764 return err;
2765
2766 buf = (const char *) re_string_get_buffer (&mctx->input);
2767 }
2768 if (buf [bkref_str_off++] != buf[sl_str - 1])
2769 break; /* We don't need to search this sub expression
2770 any more. */
2771 }
2772 if (mctx->state_log[sl_str] == NULL)
2773 continue;
2774 /* Does this state have a ')' of the sub expression? */
2775 nodes = &mctx->state_log[sl_str]->nodes;
2776 cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2777 if (cls_node == -1)
2778 continue; /* No. */
2779 if (sub_top->path == NULL)
2780 {
2781 sub_top->path = calloc (sizeof (state_array_t),
2782 sl_str - sub_top->str_idx + 1);
2783 if (sub_top->path == NULL)
2784 return REG_ESPACE;
2785 }
2786 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2787 in the current context? */
2788 err = check_arrival (mctx, sub_top->path, sub_top->node,
2789 sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2790 if (err == REG_NOMATCH)
2791 continue;
2792 if (BE (err != REG_NOERROR, 0))
2793 return err;
2794 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2795 if (BE (sub_last == NULL, 0))
2796 return REG_ESPACE;
2797 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2798 bkref_str_idx);
2799 if (err == REG_NOMATCH)
2800 continue;
2801 }
2802 }
2803 return REG_NOERROR;
2804}
2805
2806/* Helper functions for get_subexp(). */
2807
2808/* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2809 If it can arrive, register the sub expression expressed with SUB_TOP
2810 and SUB_LAST. */
2811
2812static reg_errcode_t
2813get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2814 re_match_context_t *mctx;
2815 const re_sub_match_top_t *sub_top;
2816 re_sub_match_last_t *sub_last;
2817 int bkref_node, bkref_str;
2818{
2819 reg_errcode_t err;
2820 int to_idx;
2821 /* Can the subexpression arrive the back reference? */
2822 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2823 sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2824 if (err != REG_NOERROR)
2825 return err;
2826 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2827 sub_last->str_idx);
2828 if (BE (err != REG_NOERROR, 0))
2829 return err;
2830 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2831 return clean_state_log_if_needed (mctx, to_idx);
2832}
2833
2834/* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2835 Search '(' if FL_OPEN, or search ')' otherwise.
2836 TODO: This function isn't efficient...
2837 Because there might be more than one nodes whose types are
2838 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2839 nodes.
2840 E.g. RE: (a){2} */
2841
2842static int
2843find_subexp_node (dfa, nodes, subexp_idx, type)
2844 const re_dfa_t *dfa;
2845 const re_node_set *nodes;
2846 int subexp_idx, type;
2847{
2848 int cls_idx;
2849 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2850 {
2851 int cls_node = nodes->elems[cls_idx];
2852 const re_token_t *node = dfa->nodes + cls_node;
2853 if (node->type == type
2854 && node->opr.idx == subexp_idx)
2855 return cls_node;
2856 }
2857 return -1;
2858}
2859
2860/* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2861 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2862 heavily reused.
2863 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2864
2865static reg_errcode_t
2866check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2867 type)
2868 re_match_context_t *mctx;
2869 state_array_t *path;
2870 int top_node, top_str, last_node, last_str, type;
2871{
2872 re_dfa_t *const dfa = mctx->dfa;
2873 reg_errcode_t err;
2874 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2875 re_dfastate_t *cur_state = NULL;
2876 re_node_set *cur_nodes, next_nodes;
2877 re_dfastate_t **backup_state_log;
2878 unsigned int context;
2879
2880 subexp_num = dfa->nodes[top_node].opr.idx;
2881 /* Extend the buffer if we need. */
2882 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2883 {
2884 re_dfastate_t **new_array;
2885 int old_alloc = path->alloc;
2886 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2887 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2888 if (new_array == NULL)
2889 {
2890 path->alloc = old_alloc;
2891 return REG_ESPACE;
2892 }
2893 path->array = new_array;
2894 memset (new_array + old_alloc, '\0',
2895 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2896 }
2897
2898 str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2899
2900 /* Temporary modify MCTX. */
2901 backup_state_log = mctx->state_log;
2902 backup_cur_idx = mctx->input.cur_idx;
2903 mctx->state_log = path->array;
2904 mctx->input.cur_idx = str_idx;
2905
2906 /* Setup initial node set. */
2907 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2908 if (str_idx == top_str)
2909 {
2910 err = re_node_set_init_1 (&next_nodes, top_node);
2911 if (BE (err != REG_NOERROR, 0))
2912 return err;
2913 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2914 if (BE (err != REG_NOERROR, 0))
2915 {
2916 re_node_set_free (&next_nodes);
2917 return err;
2918 }
2919 }
2920 else
2921 {
2922 cur_state = mctx->state_log[str_idx];
2923 if (cur_state && cur_state->has_backref)
2924 {
2925 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2926 if (BE ( err != REG_NOERROR, 0))
2927 return err;
2928 }
2929 else
2930 re_node_set_init_empty (&next_nodes);
2931 }
2932 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2933 {
2934 if (next_nodes.nelem)
2935 {
2936 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2937 subexp_num, type);
2938 if (BE ( err != REG_NOERROR, 0))
2939 {
2940 re_node_set_free (&next_nodes);
2941 return err;
2942 }
2943 }
2944 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2945 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2946 {
2947 re_node_set_free (&next_nodes);
2948 return err;
2949 }
2950 mctx->state_log[str_idx] = cur_state;
2951 }
2952
2953 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2954 {
2955 re_node_set_empty (&next_nodes);
2956 if (mctx->state_log[str_idx + 1])
2957 {
2958 err = re_node_set_merge (&next_nodes,
2959 &mctx->state_log[str_idx + 1]->nodes);
2960 if (BE (err != REG_NOERROR, 0))
2961 {
2962 re_node_set_free (&next_nodes);
2963 return err;
2964 }
2965 }
2966 if (cur_state)
2967 {
2968 err = check_arrival_add_next_nodes (mctx, str_idx,
2969 &cur_state->non_eps_nodes, &next_nodes);
2970 if (BE (err != REG_NOERROR, 0))
2971 {
2972 re_node_set_free (&next_nodes);
2973 return err;
2974 }
2975 }
2976 ++str_idx;
2977 if (next_nodes.nelem)
2978 {
2979 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2980 if (BE (err != REG_NOERROR, 0))
2981 {
2982 re_node_set_free (&next_nodes);
2983 return err;
2984 }
2985 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2986 subexp_num, type);
2987 if (BE ( err != REG_NOERROR, 0))
2988 {
2989 re_node_set_free (&next_nodes);
2990 return err;
2991 }
2992 }
2993 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2994 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2995 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2996 {
2997 re_node_set_free (&next_nodes);
2998 return err;
2999 }
3000 mctx->state_log[str_idx] = cur_state;
3001 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3002 }
3003 re_node_set_free (&next_nodes);
3004 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3005 : &mctx->state_log[last_str]->nodes);
3006 path->next_idx = str_idx;
3007
3008 /* Fix MCTX. */
3009 mctx->state_log = backup_state_log;
3010 mctx->input.cur_idx = backup_cur_idx;
3011
3012 /* Then check the current node set has the node LAST_NODE. */
3013 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3014 return REG_NOERROR;
3015
3016 return REG_NOMATCH;
3017}
3018
3019/* Helper functions for check_arrival. */
3020
3021/* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3022 to NEXT_NODES.
3023 TODO: This function is similar to the functions transit_state*(),
3024 however this function has many additional works.
3025 Can't we unify them? */
3026
3027static reg_errcode_t
3028check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3029 re_match_context_t *mctx;
3030 int str_idx;
3031 re_node_set *cur_nodes, *next_nodes;
3032{
3033 re_dfa_t *const dfa = mctx->dfa;
3034 int result;
3035 int cur_idx;
3036 reg_errcode_t err;
3037 re_node_set union_set;
3038 re_node_set_init_empty (&union_set);
3039 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3040 {
3041 int naccepted = 0;
3042 int cur_node = cur_nodes->elems[cur_idx];
3043#ifdef DEBUG
3044 re_token_type_t type = dfa->nodes[cur_node].type;
3045 assert (!IS_EPSILON_NODE (type));
3046#endif
3047#ifdef RE_ENABLE_I18N
3048 /* If the node may accept `multi byte'. */
3049 if (dfa->nodes[cur_node].accept_mb)
3050 {
3051 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3052 str_idx);
3053 if (naccepted > 1)
3054 {
3055 re_dfastate_t *dest_state;
3056 int next_node = dfa->nexts[cur_node];
3057 int next_idx = str_idx + naccepted;
3058 dest_state = mctx->state_log[next_idx];
3059 re_node_set_empty (&union_set);
3060 if (dest_state)
3061 {
3062 err = re_node_set_merge (&union_set, &dest_state->nodes);
3063 if (BE (err != REG_NOERROR, 0))
3064 {
3065 re_node_set_free (&union_set);
3066 return err;
3067 }
3068 }
3069 result = re_node_set_insert (&union_set, next_node);
3070 if (BE (result < 0, 0))
3071 {
3072 re_node_set_free (&union_set);
3073 return REG_ESPACE;
3074 }
3075 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3076 &union_set);
3077 if (BE (mctx->state_log[next_idx] == NULL
3078 && err != REG_NOERROR, 0))
3079 {
3080 re_node_set_free (&union_set);
3081 return err;
3082 }
3083 }
3084 }
3085#endif /* RE_ENABLE_I18N */
3086 if (naccepted
3087 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3088 {
3089 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3090 if (BE (result < 0, 0))
3091 {
3092 re_node_set_free (&union_set);
3093 return REG_ESPACE;
3094 }
3095 }
3096 }
3097 re_node_set_free (&union_set);
3098 return REG_NOERROR;
3099}
3100
3101/* For all the nodes in CUR_NODES, add the epsilon closures of them to
3102 CUR_NODES, however exclude the nodes which are:
3103 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3104 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3105*/
3106
3107static reg_errcode_t
3108check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3109 re_dfa_t *dfa;
3110 re_node_set *cur_nodes;
3111 int ex_subexp, type;
3112{
3113 reg_errcode_t err;
3114 int idx, outside_node;
3115 re_node_set new_nodes;
3116#ifdef DEBUG
3117 assert (cur_nodes->nelem);
3118#endif
3119 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3120 if (BE (err != REG_NOERROR, 0))
3121 return err;
3122 /* Create a new node set NEW_NODES with the nodes which are epsilon
3123 closures of the node in CUR_NODES. */
3124
3125 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3126 {
3127 int cur_node = cur_nodes->elems[idx];
3128 re_node_set *eclosure = dfa->eclosures + cur_node;
3129 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3130 if (outside_node == -1)
3131 {
3132 /* There are no problematic nodes, just merge them. */
3133 err = re_node_set_merge (&new_nodes, eclosure);
3134 if (BE (err != REG_NOERROR, 0))
3135 {
3136 re_node_set_free (&new_nodes);
3137 return err;
3138 }
3139 }
3140 else
3141 {
3142 /* There are problematic nodes, re-calculate incrementally. */
3143 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3144 ex_subexp, type);
3145 if (BE (err != REG_NOERROR, 0))
3146 {
3147 re_node_set_free (&new_nodes);
3148 return err;
3149 }
3150 }
3151 }
3152 re_node_set_free (cur_nodes);
3153 *cur_nodes = new_nodes;
3154 return REG_NOERROR;
3155}
3156
3157/* Helper function for check_arrival_expand_ecl.
3158 Check incrementally the epsilon closure of TARGET, and if it isn't
3159 problematic append it to DST_NODES. */
3160
3161static reg_errcode_t
3162check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3163 re_dfa_t *dfa;
3164 int target, ex_subexp, type;
3165 re_node_set *dst_nodes;
3166{
3167 int cur_node;
3168 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3169 {
3170 int err;
3171
3172 if (dfa->nodes[cur_node].type == type
3173 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3174 {
3175 if (type == OP_CLOSE_SUBEXP)
3176 {
3177 err = re_node_set_insert (dst_nodes, cur_node);
3178 if (BE (err == -1, 0))
3179 return REG_ESPACE;
3180 }
3181 break;
3182 }
3183 err = re_node_set_insert (dst_nodes, cur_node);
3184 if (BE (err == -1, 0))
3185 return REG_ESPACE;
3186 if (dfa->edests[cur_node].nelem == 0)
3187 break;
3188 if (dfa->edests[cur_node].nelem == 2)
3189 {
3190 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3191 dfa->edests[cur_node].elems[1],
3192 ex_subexp, type);
3193 if (BE (err != REG_NOERROR, 0))
3194 return err;
3195 }
3196 cur_node = dfa->edests[cur_node].elems[0];
3197 }
3198 return REG_NOERROR;
3199}
3200
3201
3202/* For all the back references in the current state, calculate the
3203 destination of the back references by the appropriate entry
3204 in MCTX->BKREF_ENTS. */
3205
3206static reg_errcode_t
3207expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3208 type)
3209 re_match_context_t *mctx;
3210 int cur_str, subexp_num, type;
3211 re_node_set *cur_nodes;
3212{
3213 re_dfa_t *const dfa = mctx->dfa;
3214 reg_errcode_t err;
3215 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3216 struct re_backref_cache_entry *ent;
3217
3218 if (cache_idx_start == -1)
3219 return REG_NOERROR;
3220
3221 restart:
3222 ent = mctx->bkref_ents + cache_idx_start;
3223 do
3224 {
3225 int to_idx, next_node;
3226
3227 /* Is this entry ENT is appropriate? */
3228 if (!re_node_set_contains (cur_nodes, ent->node))
3229 continue; /* No. */
3230
3231 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3232 /* Calculate the destination of the back reference, and append it
3233 to MCTX->STATE_LOG. */
3234 if (to_idx == cur_str)
3235 {
3236 /* The backreference did epsilon transit, we must re-check all the
3237 node in the current state. */
3238 re_node_set new_dests;
3239 reg_errcode_t err2, err3;
3240 next_node = dfa->edests[ent->node].elems[0];
3241 if (re_node_set_contains (cur_nodes, next_node))
3242 continue;
3243 err = re_node_set_init_1 (&new_dests, next_node);
3244 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3245 err3 = re_node_set_merge (cur_nodes, &new_dests);
3246 re_node_set_free (&new_dests);
3247 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3248 || err3 != REG_NOERROR, 0))
3249 {
3250 err = (err != REG_NOERROR ? err
3251 : (err2 != REG_NOERROR ? err2 : err3));
3252 return err;
3253 }
3254 /* TODO: It is still inefficient... */
3255 goto restart;
3256 }
3257 else
3258 {
3259 re_node_set union_set;
3260 next_node = dfa->nexts[ent->node];
3261 if (mctx->state_log[to_idx])
3262 {
3263 int ret;
3264 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3265 next_node))
3266 continue;
3267 err = re_node_set_init_copy (&union_set,
3268 &mctx->state_log[to_idx]->nodes);
3269 ret = re_node_set_insert (&union_set, next_node);
3270 if (BE (err != REG_NOERROR || ret < 0, 0))
3271 {
3272 re_node_set_free (&union_set);
3273 err = err != REG_NOERROR ? err : REG_ESPACE;
3274 return err;
3275 }
3276 }
3277 else
3278 {
3279 err = re_node_set_init_1 (&union_set, next_node);
3280 if (BE (err != REG_NOERROR, 0))
3281 return err;
3282 }
3283 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3284 re_node_set_free (&union_set);
3285 if (BE (mctx->state_log[to_idx] == NULL
3286 && err != REG_NOERROR, 0))
3287 return err;
3288 }
3289 }
3290 while (ent++->more);
3291 return REG_NOERROR;
3292}
3293
3294/* Build transition table for the state.
3295 Return 1 if succeeded, otherwise return NULL. */
3296
3297static int
3298build_trtable (dfa, state)
3299 re_dfa_t *dfa;
3300 re_dfastate_t *state;
3301{
3302 reg_errcode_t err;
3303 int i, j, ch, need_word_trtable = 0;
3304 unsigned int elem, mask;
3305 int dests_node_malloced = 0, dest_states_malloced = 0;
3306 int ndests; /* Number of the destination states from `state'. */
3307 re_dfastate_t **trtable;
3308 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3309 re_node_set follows, *dests_node;
3310 bitset *dests_ch;
3311 bitset acceptable;
3312
3313 /* We build DFA states which corresponds to the destination nodes
3314 from `state'. `dests_node[i]' represents the nodes which i-th
3315 destination state contains, and `dests_ch[i]' represents the
3316 characters which i-th destination state accepts. */
3317#ifdef _LIBC
3318 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3319 dests_node = (re_node_set *)
3320 alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3321 else
3322#endif
3323 {
3324 dests_node = (re_node_set *)
3325 malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3326 if (BE (dests_node == NULL, 0))
3327 return 0;
3328 dests_node_malloced = 1;
3329 }
3330 dests_ch = (bitset *) (dests_node + SBC_MAX);
3331
3332 /* Initialize transiton table. */
3333 state->word_trtable = state->trtable = NULL;
3334
3335 /* At first, group all nodes belonging to `state' into several
3336 destinations. */
3337 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3338 if (BE (ndests <= 0, 0))
3339 {
3340 if (dests_node_malloced)
3341 free (dests_node);
3342 /* Return 0 in case of an error, 1 otherwise. */
3343 if (ndests == 0)
3344 {
3345 state->trtable = (re_dfastate_t **)
3346 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3347 return 1;
3348 }
3349 return 0;
3350 }
3351
3352 err = re_node_set_alloc (&follows, ndests + 1);
3353 if (BE (err != REG_NOERROR, 0))
3354 goto out_free;
3355
3356#ifdef _LIBC
3357 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3358 + ndests * 3 * sizeof (re_dfastate_t *)))
3359 dest_states = (re_dfastate_t **)
3360 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3361 else
3362#endif
3363 {
3364 dest_states = (re_dfastate_t **)
3365 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3366 if (BE (dest_states == NULL, 0))
3367 {
3368out_free:
3369 if (dest_states_malloced)
3370 free (dest_states);
3371 re_node_set_free (&follows);
3372 for (i = 0; i < ndests; ++i)
3373 re_node_set_free (dests_node + i);
3374 if (dests_node_malloced)
3375 free (dests_node);
3376 return 0;
3377 }
3378 dest_states_malloced = 1;
3379 }
3380 dest_states_word = dest_states + ndests;
3381 dest_states_nl = dest_states_word + ndests;
3382 bitset_empty (acceptable);
3383
3384 /* Then build the states for all destinations. */
3385 for (i = 0; i < ndests; ++i)
3386 {
3387 int next_node;
3388 re_node_set_empty (&follows);
3389 /* Merge the follows of this destination states. */
3390 for (j = 0; j < dests_node[i].nelem; ++j)
3391 {
3392 next_node = dfa->nexts[dests_node[i].elems[j]];
3393 if (next_node != -1)
3394 {
3395 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3396 if (BE (err != REG_NOERROR, 0))
3397 goto out_free;
3398 }
3399 }
3400 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3401 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3402 goto out_free;
3403 /* If the new state has context constraint,
3404 build appropriate states for these contexts. */
3405 if (dest_states[i]->has_constraint)
3406 {
3407 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3408 CONTEXT_WORD);
3409 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3410 goto out_free;
3411
3412 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3413 need_word_trtable = 1;
3414
3415 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3416 CONTEXT_NEWLINE);
3417 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3418 goto out_free;
3419 }
3420 else
3421 {
3422 dest_states_word[i] = dest_states[i];
3423 dest_states_nl[i] = dest_states[i];
3424 }
3425 bitset_merge (acceptable, dests_ch[i]);
3426 }
3427
3428 if (!BE (need_word_trtable, 0))
3429 {
3430 /* We don't care about whether the following character is a word
3431 character, or we are in a single-byte character set so we can
3432 discern by looking at the character code: allocate a
3433 256-entry transition table. */
3434 trtable = state->trtable =
3435 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3436 if (BE (trtable == NULL, 0))
3437 goto out_free;
3438
3439 /* For all characters ch...: */
3440 for (i = 0; i < BITSET_UINTS; ++i)
3441 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3442 elem;
3443 mask <<= 1, elem >>= 1, ++ch)
3444 if (BE (elem & 1, 0))
3445 {
3446 /* There must be exactly one destination which accepts
3447 character ch. See group_nodes_into_DFAstates. */
3448 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3449 ;
3450
3451 /* j-th destination accepts the word character ch. */
3452 if (dfa->word_char[i] & mask)
3453 trtable[ch] = dest_states_word[j];
3454 else
3455 trtable[ch] = dest_states[j];
3456 }
3457 }
3458 else
3459 {
3460 /* We care about whether the following character is a word
3461 character, and we are in a multi-byte character set: discern
3462 by looking at the character code: build two 256-entry
3463 transition tables, one starting at trtable[0] and one
3464 starting at trtable[SBC_MAX]. */
3465 trtable = state->word_trtable =
3466 (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3467 if (BE (trtable == NULL, 0))
3468 goto out_free;
3469
3470 /* For all characters ch...: */
3471 for (i = 0; i < BITSET_UINTS; ++i)
3472 for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3473 elem;
3474 mask <<= 1, elem >>= 1, ++ch)
3475 if (BE (elem & 1, 0))
3476 {
3477 /* There must be exactly one destination which accepts
3478 character ch. See group_nodes_into_DFAstates. */
3479 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3480 ;
3481
3482 /* j-th destination accepts the word character ch. */
3483 trtable[ch] = dest_states[j];
3484 trtable[ch + SBC_MAX] = dest_states_word[j];
3485 }
3486 }
3487
3488 /* new line */
3489 if (bitset_contain (acceptable, NEWLINE_CHAR))
3490 {
3491 /* The current state accepts newline character. */
3492 for (j = 0; j < ndests; ++j)
3493 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3494 {
3495 /* k-th destination accepts newline character. */
3496 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3497 if (need_word_trtable)
3498 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3499 /* There must be only one destination which accepts
3500 newline. See group_nodes_into_DFAstates. */
3501 break;
3502 }
3503 }
3504
3505 if (dest_states_malloced)
3506 free (dest_states);
3507
3508 re_node_set_free (&follows);
3509 for (i = 0; i < ndests; ++i)
3510 re_node_set_free (dests_node + i);
3511
3512 if (dests_node_malloced)
3513 free (dests_node);
3514
3515 return 1;
3516}
3517
3518/* Group all nodes belonging to STATE into several destinations.
3519 Then for all destinations, set the nodes belonging to the destination
3520 to DESTS_NODE[i] and set the characters accepted by the destination
3521 to DEST_CH[i]. This function return the number of destinations. */
3522
3523static int
3524group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3525 re_dfa_t *dfa;
3526 const re_dfastate_t *state;
3527 re_node_set *dests_node;
3528 bitset *dests_ch;
3529{
3530 reg_errcode_t err;
3531 int result;
3532 int i, j, k;
3533 int ndests; /* Number of the destinations from `state'. */
3534 bitset accepts; /* Characters a node can accept. */
3535 const re_node_set *cur_nodes = &state->nodes;
3536 bitset_empty (accepts);
3537 ndests = 0;
3538
3539 /* For all the nodes belonging to `state', */
3540 for (i = 0; i < cur_nodes->nelem; ++i)
3541 {
3542 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3543 re_token_type_t type = node->type;
3544 unsigned int constraint = node->constraint;
3545
3546 /* Enumerate all single byte character this node can accept. */
3547 if (type == CHARACTER)
3548 bitset_set (accepts, node->opr.c);
3549 else if (type == SIMPLE_BRACKET)
3550 {
3551 bitset_merge (accepts, node->opr.sbcset);
3552 }
3553 else if (type == OP_PERIOD)
3554 {
3555#ifdef RE_ENABLE_I18N
3556 if (dfa->mb_cur_max > 1)
3557 bitset_merge (accepts, dfa->sb_char);
3558 else
3559#endif
3560 bitset_set_all (accepts);
3561 if (!(dfa->syntax & RE_DOT_NEWLINE))
3562 bitset_clear (accepts, '\n');
3563 if (dfa->syntax & RE_DOT_NOT_NULL)
3564 bitset_clear (accepts, '\0');
3565 }
3566#ifdef RE_ENABLE_I18N
3567 else if (type == OP_UTF8_PERIOD)
3568 {
3569 memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3570 if (!(dfa->syntax & RE_DOT_NEWLINE))
3571 bitset_clear (accepts, '\n');
3572 if (dfa->syntax & RE_DOT_NOT_NULL)
3573 bitset_clear (accepts, '\0');
3574 }
3575#endif
3576 else
3577 continue;
3578
3579 /* Check the `accepts' and sift the characters which are not
3580 match it the context. */
3581 if (constraint)
3582 {
3583 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3584 {
3585 int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3586 bitset_empty (accepts);
3587 if (accepts_newline)
3588 bitset_set (accepts, NEWLINE_CHAR);
3589 else
3590 continue;
3591 }
3592 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3593 {
3594 bitset_empty (accepts);
3595 continue;
3596 }
3597
3598 if (constraint & NEXT_WORD_CONSTRAINT)
3599 {
3600 unsigned int any_set = 0;
3601 if (type == CHARACTER && !node->word_char)
3602 {
3603 bitset_empty (accepts);
3604 continue;
3605 }
3606#ifdef RE_ENABLE_I18N
3607 if (dfa->mb_cur_max > 1)
3608 for (j = 0; j < BITSET_UINTS; ++j)
3609 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3610 else
3611#endif
3612 for (j = 0; j < BITSET_UINTS; ++j)
3613 any_set |= (accepts[j] &= dfa->word_char[j]);
3614 if (!any_set)
3615 continue;
3616 }
3617 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3618 {
3619 unsigned int any_set = 0;
3620 if (type == CHARACTER && node->word_char)
3621 {
3622 bitset_empty (accepts);
3623 continue;
3624 }
3625#ifdef RE_ENABLE_I18N
3626 if (dfa->mb_cur_max > 1)
3627 for (j = 0; j < BITSET_UINTS; ++j)
3628 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3629 else
3630#endif
3631 for (j = 0; j < BITSET_UINTS; ++j)
3632 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3633 if (!any_set)
3634 continue;
3635 }
3636 }
3637
3638 /* Then divide `accepts' into DFA states, or create a new
3639 state. Above, we make sure that accepts is not empty. */
3640 for (j = 0; j < ndests; ++j)
3641 {
3642 bitset intersec; /* Intersection sets, see below. */
3643 bitset remains;
3644 /* Flags, see below. */
3645 int has_intersec, not_subset, not_consumed;
3646
3647 /* Optimization, skip if this state doesn't accept the character. */
3648 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3649 continue;
3650
3651 /* Enumerate the intersection set of this state and `accepts'. */
3652 has_intersec = 0;
3653 for (k = 0; k < BITSET_UINTS; ++k)
3654 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3655 /* And skip if the intersection set is empty. */
3656 if (!has_intersec)
3657 continue;
3658
3659 /* Then check if this state is a subset of `accepts'. */
3660 not_subset = not_consumed = 0;
3661 for (k = 0; k < BITSET_UINTS; ++k)
3662 {
3663 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3664 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3665 }
3666
3667 /* If this state isn't a subset of `accepts', create a
3668 new group state, which has the `remains'. */
3669 if (not_subset)
3670 {
3671 bitset_copy (dests_ch[ndests], remains);
3672 bitset_copy (dests_ch[j], intersec);
3673 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3674 if (BE (err != REG_NOERROR, 0))
3675 goto error_return;
3676 ++ndests;
3677 }
3678
3679 /* Put the position in the current group. */
3680 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3681 if (BE (result < 0, 0))
3682 goto error_return;
3683
3684 /* If all characters are consumed, go to next node. */
3685 if (!not_consumed)
3686 break;
3687 }
3688 /* Some characters remain, create a new group. */
3689 if (j == ndests)
3690 {
3691 bitset_copy (dests_ch[ndests], accepts);
3692 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3693 if (BE (err != REG_NOERROR, 0))
3694 goto error_return;
3695 ++ndests;
3696 bitset_empty (accepts);
3697 }
3698 }
3699 return ndests;
3700 error_return:
3701 for (j = 0; j < ndests; ++j)
3702 re_node_set_free (dests_node + j);
3703 return -1;
3704}
3705
3706#ifdef RE_ENABLE_I18N
3707/* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3708 Return the number of the bytes the node accepts.
3709 STR_IDX is the current index of the input string.
3710
3711 This function handles the nodes which can accept one character, or
3712 one collating element like '.', '[a-z]', opposite to the other nodes
3713 can only accept one byte. */
3714
3715static int
3716check_node_accept_bytes (dfa, node_idx, input, str_idx)
3717 re_dfa_t *dfa;
3718 int node_idx, str_idx;
3719 const re_string_t *input;
3720{
3721 const re_token_t *node = dfa->nodes + node_idx;
3722 int char_len, elem_len;
3723 int i;
3724
3725 if (BE (node->type == OP_UTF8_PERIOD, 0))
3726 {
3727 unsigned char c = re_string_byte_at (input, str_idx), d;
3728 if (BE (c < 0xc2, 1))
3729 return 0;
3730
3731 if (str_idx + 2 > input->len)
3732 return 0;
3733
3734 d = re_string_byte_at (input, str_idx + 1);
3735 if (c < 0xe0)
3736 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3737 else if (c < 0xf0)
3738 {
3739 char_len = 3;
3740 if (c == 0xe0 && d < 0xa0)
3741 return 0;
3742 }
3743 else if (c < 0xf8)
3744 {
3745 char_len = 4;
3746 if (c == 0xf0 && d < 0x90)
3747 return 0;
3748 }
3749 else if (c < 0xfc)
3750 {
3751 char_len = 5;
3752 if (c == 0xf8 && d < 0x88)
3753 return 0;
3754 }
3755 else if (c < 0xfe)
3756 {
3757 char_len = 6;
3758 if (c == 0xfc && d < 0x84)
3759 return 0;
3760 }
3761 else
3762 return 0;
3763
3764 if (str_idx + char_len > input->len)
3765 return 0;
3766
3767 for (i = 1; i < char_len; ++i)
3768 {
3769 d = re_string_byte_at (input, str_idx + i);
3770 if (d < 0x80 || d > 0xbf)
3771 return 0;
3772 }
3773 return char_len;
3774 }
3775
3776 char_len = re_string_char_size_at (input, str_idx);
3777 if (node->type == OP_PERIOD)
3778 {
3779 if (char_len <= 1)
3780 return 0;
3781 /* FIXME: I don't think this if is needed, as both '\n'
3782 and '\0' are char_len == 1. */
3783 /* '.' accepts any one character except the following two cases. */
3784 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3785 re_string_byte_at (input, str_idx) == '\n') ||
3786 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3787 re_string_byte_at (input, str_idx) == '\0'))
3788 return 0;
3789 return char_len;
3790 }
3791
3792 elem_len = re_string_elem_size_at (input, str_idx);
3793 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3794 return 0;
3795
3796 if (node->type == COMPLEX_BRACKET)
3797 {
3798 const re_charset_t *cset = node->opr.mbcset;
3799# ifdef _LIBC
3800 const unsigned char *pin
3801 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3802 int j;
3803 uint32_t nrules;
3804# endif /* _LIBC */
3805 int match_len = 0;
3806 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3807 ? re_string_wchar_at (input, str_idx) : 0);
3808
3809 /* match with multibyte character? */
3810 for (i = 0; i < cset->nmbchars; ++i)
3811 if (wc == cset->mbchars[i])
3812 {
3813 match_len = char_len;
3814 goto check_node_accept_bytes_match;
3815 }
3816 /* match with character_class? */
3817 for (i = 0; i < cset->nchar_classes; ++i)
3818 {
3819 wctype_t wt = cset->char_classes[i];
3820 if (__iswctype (wc, wt))
3821 {
3822 match_len = char_len;
3823 goto check_node_accept_bytes_match;
3824 }
3825 }
3826
3827# ifdef _LIBC
3828 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3829 if (nrules != 0)
3830 {
3831 unsigned int in_collseq = 0;
3832 const int32_t *table, *indirect;
3833 const unsigned char *weights, *extra;
3834 const char *collseqwc;
3835 int32_t idx;
3836 /* This #include defines a local function! */
3837# include <locale/weight.h>
3838
3839 /* match with collating_symbol? */
3840 if (cset->ncoll_syms)
3841 extra = (const unsigned char *)
3842 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3843 for (i = 0; i < cset->ncoll_syms; ++i)
3844 {
3845 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3846 /* Compare the length of input collating element and
3847 the length of current collating element. */
3848 if (*coll_sym != elem_len)
3849 continue;
3850 /* Compare each bytes. */
3851 for (j = 0; j < *coll_sym; j++)
3852 if (pin[j] != coll_sym[1 + j])
3853 break;
3854 if (j == *coll_sym)
3855 {
3856 /* Match if every bytes is equal. */
3857 match_len = j;
3858 goto check_node_accept_bytes_match;
3859 }
3860 }
3861
3862 if (cset->nranges)
3863 {
3864 if (elem_len <= char_len)
3865 {
3866 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3867 in_collseq = __collseq_table_lookup (collseqwc, wc);
3868 }
3869 else
3870 in_collseq = find_collation_sequence_value (pin, elem_len);
3871 }
3872 /* match with range expression? */
3873 for (i = 0; i < cset->nranges; ++i)
3874 if (cset->range_starts[i] <= in_collseq
3875 && in_collseq <= cset->range_ends[i])
3876 {
3877 match_len = elem_len;
3878 goto check_node_accept_bytes_match;
3879 }
3880
3881 /* match with equivalence_class? */
3882 if (cset->nequiv_classes)
3883 {
3884 const unsigned char *cp = pin;
3885 table = (const int32_t *)
3886 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3887 weights = (const unsigned char *)
3888 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3889 extra = (const unsigned char *)
3890 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3891 indirect = (const int32_t *)
3892 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3893 idx = findidx (&cp);
3894 if (idx > 0)
3895 for (i = 0; i < cset->nequiv_classes; ++i)
3896 {
3897 int32_t equiv_class_idx = cset->equiv_classes[i];
3898 size_t weight_len = weights[idx];
3899 if (weight_len == weights[equiv_class_idx])
3900 {
3901 int cnt = 0;
3902 while (cnt <= weight_len
3903 && (weights[equiv_class_idx + 1 + cnt]
3904 == weights[idx + 1 + cnt]))
3905 ++cnt;
3906 if (cnt > weight_len)
3907 {
3908 match_len = elem_len;
3909 goto check_node_accept_bytes_match;
3910 }
3911 }
3912 }
3913 }
3914 }
3915 else
3916# endif /* _LIBC */
3917 {
3918 /* match with range expression? */
3919#if __GNUC__ >= 2
3920 wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3921#else
3922 wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3923 cmp_buf[2] = wc;
3924#endif
3925 for (i = 0; i < cset->nranges; ++i)
3926 {
3927 cmp_buf[0] = cset->range_starts[i];
3928 cmp_buf[4] = cset->range_ends[i];
3929 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3930 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3931 {
3932 match_len = char_len;
3933 goto check_node_accept_bytes_match;
3934 }
3935 }
3936 }
3937 check_node_accept_bytes_match:
3938 if (!cset->non_match)
3939 return match_len;
3940 else
3941 {
3942 if (match_len > 0)
3943 return 0;
3944 else
3945 return (elem_len > char_len) ? elem_len : char_len;
3946 }
3947 }
3948 return 0;
3949}
3950
3951# ifdef _LIBC
3952static unsigned int
3953find_collation_sequence_value (mbs, mbs_len)
3954 const unsigned char *mbs;
3955 size_t mbs_len;
3956{
3957 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3958 if (nrules == 0)
3959 {
3960 if (mbs_len == 1)
3961 {
3962 /* No valid character. Match it as a single byte character. */
3963 const unsigned char *collseq = (const unsigned char *)
3964 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3965 return collseq[mbs[0]];
3966 }
3967 return UINT_MAX;
3968 }
3969 else
3970 {
3971 int32_t idx;
3972 const unsigned char *extra = (const unsigned char *)
3973 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3974 int32_t extrasize = (const unsigned char *)
3975 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3976
3977 for (idx = 0; idx < extrasize;)
3978 {
3979 int mbs_cnt, found = 0;
3980 int32_t elem_mbs_len;
3981 /* Skip the name of collating element name. */
3982 idx = idx + extra[idx] + 1;
3983 elem_mbs_len = extra[idx++];
3984 if (mbs_len == elem_mbs_len)
3985 {
3986 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3987 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3988 break;
3989 if (mbs_cnt == elem_mbs_len)
3990 /* Found the entry. */
3991 found = 1;
3992 }
3993 /* Skip the byte sequence of the collating element. */
3994 idx += elem_mbs_len;
3995 /* Adjust for the alignment. */
3996 idx = (idx + 3) & ~3;
3997 /* Skip the collation sequence value. */
3998 idx += sizeof (uint32_t);
3999 /* Skip the wide char sequence of the collating element. */
4000 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4001 /* If we found the entry, return the sequence value. */
4002 if (found)
4003 return *(uint32_t *) (extra + idx);
4004 /* Skip the collation sequence value. */
4005 idx += sizeof (uint32_t);
4006 }
4007 return UINT_MAX;
4008 }
4009}
4010# endif /* _LIBC */
4011#endif /* RE_ENABLE_I18N */
4012
4013/* Check whether the node accepts the byte which is IDX-th
4014 byte of the INPUT. */
4015
4016static int
4017check_node_accept (mctx, node, idx)
4018 const re_match_context_t *mctx;
4019 const re_token_t *node;
4020 int idx;
4021{
4022 unsigned char ch;
4023 ch = re_string_byte_at (&mctx->input, idx);
4024 switch (node->type)
4025 {
4026 case CHARACTER:
4027 if (node->opr.c != ch)
4028 return 0;
4029 break;
4030
4031 case SIMPLE_BRACKET:
4032 if (!bitset_contain (node->opr.sbcset, ch))
4033 return 0;
4034 break;
4035
4036#ifdef RE_ENABLE_I18N
4037 case OP_UTF8_PERIOD:
4038 if (ch >= 0x80)
4039 return 0;
4040 /* FALLTHROUGH */
4041#endif
4042 case OP_PERIOD:
4043 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4044 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4045 return 0;
4046 break;
4047
4048 default:
4049 return 0;
4050 }
4051
4052 if (node->constraint)
4053 {
4054 /* The node has constraints. Check whether the current context
4055 satisfies the constraints. */
4056 unsigned int context = re_string_context_at (&mctx->input, idx,
4057 mctx->eflags);
4058 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4059 return 0;
4060 }
4061
4062 return 1;
4063}
4064
4065/* Extend the buffers, if the buffers have run out. */
4066
4067static reg_errcode_t
4068extend_buffers (mctx)
4069 re_match_context_t *mctx;
4070{
4071 reg_errcode_t ret;
4072 re_string_t *pstr = &mctx->input;
4073
4074 /* Double the lengthes of the buffers. */
4075 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4076 if (BE (ret != REG_NOERROR, 0))
4077 return ret;
4078
4079 if (mctx->state_log != NULL)
4080 {
4081 /* And double the length of state_log. */
4082 /* XXX We have no indication of the size of this buffer. If this
4083 allocation fail we have no indication that the state_log array
4084 does not have the right size. */
4085 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4086 pstr->bufs_len + 1);
4087 if (BE (new_array == NULL, 0))
4088 return REG_ESPACE;
4089 mctx->state_log = new_array;
4090 }
4091
4092 /* Then reconstruct the buffers. */
4093 if (pstr->icase)
4094 {
4095#ifdef RE_ENABLE_I18N
4096 if (pstr->mb_cur_max > 1)
4097 {
4098 ret = build_wcs_upper_buffer (pstr);
4099 if (BE (ret != REG_NOERROR, 0))
4100 return ret;
4101 }
4102 else
4103#endif /* RE_ENABLE_I18N */
4104 build_upper_buffer (pstr);
4105 }
4106 else
4107 {
4108#ifdef RE_ENABLE_I18N
4109 if (pstr->mb_cur_max > 1)
4110 build_wcs_buffer (pstr);
4111 else
4112#endif /* RE_ENABLE_I18N */
4113 {
4114 if (pstr->trans != NULL)
4115 re_string_translate_buffer (pstr);
4116 }
4117 }
4118 return REG_NOERROR;
4119}
4120
4121
4122
4123/* Functions for matching context. */
4124
4125/* Initialize MCTX. */
4126
4127static reg_errcode_t
4128match_ctx_init (mctx, eflags, n)
4129 re_match_context_t *mctx;
4130 int eflags, n;
4131{
4132 mctx->eflags = eflags;
4133 mctx->match_last = -1;
4134 if (n > 0)
4135 {
4136 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4137 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4138 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4139 return REG_ESPACE;
4140 }
4141 /* Already zero-ed by the caller.
4142 else
4143 mctx->bkref_ents = NULL;
4144 mctx->nbkref_ents = 0;
4145 mctx->nsub_tops = 0; */
4146 mctx->abkref_ents = n;
4147 mctx->max_mb_elem_len = 1;
4148 mctx->asub_tops = n;
4149 return REG_NOERROR;
4150}
4151
4152/* Clean the entries which depend on the current input in MCTX.
4153 This function must be invoked when the matcher changes the start index
4154 of the input, or changes the input string. */
4155
4156static void
4157match_ctx_clean (mctx)
4158 re_match_context_t *mctx;
4159{
4160 int st_idx;
4161 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4162 {
4163 int sl_idx;
4164 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4165 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4166 {
4167 re_sub_match_last_t *last = top->lasts[sl_idx];
4168 re_free (last->path.array);
4169 re_free (last);
4170 }
4171 re_free (top->lasts);
4172 if (top->path)
4173 {
4174 re_free (top->path->array);
4175 re_free (top->path);
4176 }
4177 free (top);
4178 }
4179
4180 mctx->nsub_tops = 0;
4181 mctx->nbkref_ents = 0;
4182}
4183
4184/* Free all the memory associated with MCTX. */
4185
4186static void
4187match_ctx_free (mctx)
4188 re_match_context_t *mctx;
4189{
4190 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4191 match_ctx_clean (mctx);
4192 re_free (mctx->sub_tops);
4193 re_free (mctx->bkref_ents);
4194}
4195
4196/* Add a new backreference entry to MCTX.
4197 Note that we assume that caller never call this function with duplicate
4198 entry, and call with STR_IDX which isn't smaller than any existing entry.
4199*/
4200
4201static reg_errcode_t
4202match_ctx_add_entry (mctx, node, str_idx, from, to)
4203 re_match_context_t *mctx;
4204 int node, str_idx, from, to;
4205{
4206 if (mctx->nbkref_ents >= mctx->abkref_ents)
4207 {
4208 struct re_backref_cache_entry* new_entry;
4209 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4210 mctx->abkref_ents * 2);
4211 if (BE (new_entry == NULL, 0))
4212 {
4213 re_free (mctx->bkref_ents);
4214 return REG_ESPACE;
4215 }
4216 mctx->bkref_ents = new_entry;
4217 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4218 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4219 mctx->abkref_ents *= 2;
4220 }
4221 if (mctx->nbkref_ents > 0
4222 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4223 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4224
4225 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4226 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4227 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4228 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4229
4230 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4231 If bit N is clear, means that this entry won't epsilon-transition to
4232 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4233 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4234 such node.
4235
4236 A backreference does not epsilon-transition unless it is empty, so set
4237 to all zeros if FROM != TO. */
4238 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4239 = (from == to ? ~0 : 0);
4240
4241 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4242 if (mctx->max_mb_elem_len < to - from)
4243 mctx->max_mb_elem_len = to - from;
4244 return REG_NOERROR;
4245}
4246
4247/* Search for the first entry which has the same str_idx, or -1 if none is
4248 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4249
4250static int
4251search_cur_bkref_entry (mctx, str_idx)
4252 re_match_context_t *mctx;
4253 int str_idx;
4254{
4255 int left, right, mid, last;
4256 last = right = mctx->nbkref_ents;
4257 for (left = 0; left < right;)
4258 {
4259 mid = (left + right) / 2;
4260 if (mctx->bkref_ents[mid].str_idx < str_idx)
4261 left = mid + 1;
4262 else
4263 right = mid;
4264 }
4265 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4266 return left;
4267 else
4268 return -1;
4269}
4270
4271/* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4272 at STR_IDX. */
4273
4274static reg_errcode_t
4275match_ctx_add_subtop (mctx, node, str_idx)
4276 re_match_context_t *mctx;
4277 int node, str_idx;
4278{
4279#ifdef DEBUG
4280 assert (mctx->sub_tops != NULL);
4281 assert (mctx->asub_tops > 0);
4282#endif
4283 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4284 {
4285 int new_asub_tops = mctx->asub_tops * 2;
4286 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4287 re_sub_match_top_t *,
4288 new_asub_tops);
4289 if (BE (new_array == NULL, 0))
4290 return REG_ESPACE;
4291 mctx->sub_tops = new_array;
4292 mctx->asub_tops = new_asub_tops;
4293 }
4294 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4295 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4296 return REG_ESPACE;
4297 mctx->sub_tops[mctx->nsub_tops]->node = node;
4298 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4299 return REG_NOERROR;
4300}
4301
4302/* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4303 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4304
4305static re_sub_match_last_t *
4306match_ctx_add_sublast (subtop, node, str_idx)
4307 re_sub_match_top_t *subtop;
4308 int node, str_idx;
4309{
4310 re_sub_match_last_t *new_entry;
4311 if (BE (subtop->nlasts == subtop->alasts, 0))
4312 {
4313 int new_alasts = 2 * subtop->alasts + 1;
4314 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4315 re_sub_match_last_t *,
4316 new_alasts);
4317 if (BE (new_array == NULL, 0))
4318 return NULL;
4319 subtop->lasts = new_array;
4320 subtop->alasts = new_alasts;
4321 }
4322 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4323 if (BE (new_entry != NULL, 1))
4324 {
4325 subtop->lasts[subtop->nlasts] = new_entry;
4326 new_entry->node = node;
4327 new_entry->str_idx = str_idx;
4328 ++subtop->nlasts;
4329 }
4330 return new_entry;
4331}
4332
4333static void
4334sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4335 re_sift_context_t *sctx;
4336 re_dfastate_t **sifted_sts, **limited_sts;
4337 int last_node, last_str_idx;
4338{
4339 sctx->sifted_states = sifted_sts;
4340 sctx->limited_states = limited_sts;
4341 sctx->last_node = last_node;
4342 sctx->last_str_idx = last_str_idx;
4343 re_node_set_init_empty (&sctx->limits);
4344}
Note: See TracBrowser for help on using the repository browser.