source: vendor/glibc/current/posix/regex_internal.c

Last change on this file was 2237, checked in by (none), 20 years ago

This commit was manufactured by cvs2svn to create branch 'GNU'.

  • Property cvs2svn:cvs-rev set to 1.1
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 44.0 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25#ifdef RE_ENABLE_I18N
26static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28#endif /* RE_ENABLE_I18N */
29static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 unsigned int hash) internal_function;
31static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32 const re_node_set *nodes,
33 unsigned int hash) internal_function;
34static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int context,
37 unsigned int hash) internal_function;
38static unsigned int inline calc_state_hash (const re_node_set *nodes,
39 unsigned int context) internal_function;
40
41
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
48re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
49 re_string_t *pstr;
50 const char *str;
51 int len, init_len, icase;
52 RE_TRANSLATE_TYPE trans;
53 const re_dfa_t *dfa;
54{
55 reg_errcode_t ret;
56 int init_buf_len;
57
58 /* Ensure at least one character fits into the buffers. */
59 if (init_len < dfa->mb_cur_max)
60 init_len = dfa->mb_cur_max;
61 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
62 re_string_construct_common (str, len, pstr, trans, icase, dfa);
63
64 ret = re_string_realloc_buffers (pstr, init_buf_len);
65 if (BE (ret != REG_NOERROR, 0))
66 return ret;
67
68 pstr->word_char = dfa->word_char;
69 pstr->word_ops_used = dfa->word_ops_used;
70 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
71 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
72 pstr->valid_raw_len = pstr->valid_len;
73 return REG_NOERROR;
74}
75
76/* This function allocate the buffers, and initialize them. */
77
78static reg_errcode_t
79re_string_construct (pstr, str, len, trans, icase, dfa)
80 re_string_t *pstr;
81 const char *str;
82 int len, icase;
83 RE_TRANSLATE_TYPE trans;
84 const re_dfa_t *dfa;
85{
86 reg_errcode_t ret;
87 memset (pstr, '\0', sizeof (re_string_t));
88 re_string_construct_common (str, len, pstr, trans, icase, dfa);
89
90 if (len > 0)
91 {
92 ret = re_string_realloc_buffers (pstr, len + 1);
93 if (BE (ret != REG_NOERROR, 0))
94 return ret;
95 }
96 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
97
98 if (icase)
99 {
100#ifdef RE_ENABLE_I18N
101 if (dfa->mb_cur_max > 1)
102 {
103 while (1)
104 {
105 ret = build_wcs_upper_buffer (pstr);
106 if (BE (ret != REG_NOERROR, 0))
107 return ret;
108 if (pstr->valid_raw_len >= len)
109 break;
110 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
111 break;
112 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
113 if (BE (ret != REG_NOERROR, 0))
114 return ret;
115 }
116 }
117 else
118#endif /* RE_ENABLE_I18N */
119 build_upper_buffer (pstr);
120 }
121 else
122 {
123#ifdef RE_ENABLE_I18N
124 if (dfa->mb_cur_max > 1)
125 build_wcs_buffer (pstr);
126 else
127#endif /* RE_ENABLE_I18N */
128 {
129 if (trans != NULL)
130 re_string_translate_buffer (pstr);
131 else
132 {
133 pstr->valid_len = pstr->bufs_len;
134 pstr->valid_raw_len = pstr->bufs_len;
135 }
136 }
137 }
138
139 return REG_NOERROR;
140}
141
142/* Helper functions for re_string_allocate, and re_string_construct. */
143
144static reg_errcode_t
145re_string_realloc_buffers (pstr, new_buf_len)
146 re_string_t *pstr;
147 int new_buf_len;
148{
149#ifdef RE_ENABLE_I18N
150 if (pstr->mb_cur_max > 1)
151 {
152 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
153 if (BE (new_array == NULL, 0))
154 return REG_ESPACE;
155 pstr->wcs = new_array;
156 if (pstr->offsets != NULL)
157 {
158 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
159 if (BE (new_array == NULL, 0))
160 return REG_ESPACE;
161 pstr->offsets = new_array;
162 }
163 }
164#endif /* RE_ENABLE_I18N */
165 if (pstr->mbs_allocated)
166 {
167 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_array == NULL, 0))
170 return REG_ESPACE;
171 pstr->mbs = new_array;
172 }
173 pstr->bufs_len = new_buf_len;
174 return REG_NOERROR;
175}
176
177
178static void
179re_string_construct_common (str, len, pstr, trans, icase, dfa)
180 const char *str;
181 int len;
182 re_string_t *pstr;
183 RE_TRANSLATE_TYPE trans;
184 int icase;
185 const re_dfa_t *dfa;
186{
187 pstr->raw_mbs = (const unsigned char *) str;
188 pstr->len = len;
189 pstr->raw_len = len;
190 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
191 pstr->icase = icase ? 1 : 0;
192 pstr->mbs_allocated = (trans != NULL || icase);
193 pstr->mb_cur_max = dfa->mb_cur_max;
194 pstr->is_utf8 = dfa->is_utf8;
195 pstr->map_notascii = dfa->map_notascii;
196 pstr->stop = pstr->len;
197 pstr->raw_stop = pstr->stop;
198}
199
200#ifdef RE_ENABLE_I18N
201
202/* Build wide character buffer PSTR->WCS.
203 If the byte sequence of the string are:
204 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
205 Then wide character buffer will be:
206 <wc1> , WEOF , <wc2> , WEOF , <wc3>
207 We use WEOF for padding, they indicate that the position isn't
208 a first byte of a multibyte character.
209
210 Note that this function assumes PSTR->VALID_LEN elements are already
211 built and starts from PSTR->VALID_LEN. */
212
213static void
214build_wcs_buffer (pstr)
215 re_string_t *pstr;
216{
217#ifdef _LIBC
218 unsigned char buf[MB_LEN_MAX];
219 assert (MB_LEN_MAX >= pstr->mb_cur_max);
220#else
221 unsigned char buf[64];
222#endif
223 mbstate_t prev_st;
224 int byte_idx, end_idx, remain_len;
225 size_t mbclen;
226
227 /* Build the buffers from pstr->valid_len to either pstr->len or
228 pstr->bufs_len. */
229 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
230 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
231 {
232 wchar_t wc;
233 const char *p;
234
235 remain_len = end_idx - byte_idx;
236 prev_st = pstr->cur_state;
237 /* Apply the translation if we need. */
238 if (BE (pstr->trans != NULL, 0))
239 {
240 int i, ch;
241
242 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
243 {
244 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
245 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
246 }
247 p = (const char *) buf;
248 }
249 else
250 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
251 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
252 if (BE (mbclen == (size_t) -2, 0))
253 {
254 /* The buffer doesn't have enough space, finish to build. */
255 pstr->cur_state = prev_st;
256 break;
257 }
258 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
259 {
260 /* We treat these cases as a singlebyte character. */
261 mbclen = 1;
262 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
263 if (BE (pstr->trans != NULL, 0))
264 wc = pstr->trans[wc];
265 pstr->cur_state = prev_st;
266 }
267
268 /* Write wide character and padding. */
269 pstr->wcs[byte_idx++] = wc;
270 /* Write paddings. */
271 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272 pstr->wcs[byte_idx++] = WEOF;
273 }
274 pstr->valid_len = byte_idx;
275 pstr->valid_raw_len = byte_idx;
276}
277
278/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279 but for REG_ICASE. */
280
281static int
282build_wcs_upper_buffer (pstr)
283 re_string_t *pstr;
284{
285 mbstate_t prev_st;
286 int src_idx, byte_idx, end_idx, remain_len;
287 size_t mbclen;
288#ifdef _LIBC
289 char buf[MB_LEN_MAX];
290 assert (MB_LEN_MAX >= pstr->mb_cur_max);
291#else
292 char buf[64];
293#endif
294
295 byte_idx = pstr->valid_len;
296 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
297
298 /* The following optimization assumes that ASCII characters can be
299 mapped to wide characters with a simple cast. */
300 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301 {
302 while (byte_idx < end_idx)
303 {
304 wchar_t wc;
305
306 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
307 && mbsinit (&pstr->cur_state))
308 {
309 /* In case of a singlebyte character. */
310 pstr->mbs[byte_idx]
311 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
312 /* The next step uses the assumption that wchar_t is encoded
313 ASCII-safe: all ASCII values can be converted like this. */
314 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
315 ++byte_idx;
316 continue;
317 }
318
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
321 mbclen = mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
324 if (BE (mbclen + 2 > 2, 1))
325 {
326 wchar_t wcu = wc;
327 if (iswlower (wc))
328 {
329 size_t mbcdlen;
330
331 wcu = towupper (wc);
332 mbcdlen = wcrtomb (buf, wcu, &prev_st);
333 if (BE (mbclen == mbcdlen, 1))
334 memcpy (pstr->mbs + byte_idx, buf, mbclen);
335 else
336 {
337 src_idx = byte_idx;
338 goto offsets_needed;
339 }
340 }
341 else
342 memcpy (pstr->mbs + byte_idx,
343 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
344 pstr->wcs[byte_idx++] = wcu;
345 /* Write paddings. */
346 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
347 pstr->wcs[byte_idx++] = WEOF;
348 }
349 else if (mbclen == (size_t) -1 || mbclen == 0)
350 {
351 /* It is an invalid character or '\0'. Just use the byte. */
352 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
353 pstr->mbs[byte_idx] = ch;
354 /* And also cast it to wide char. */
355 pstr->wcs[byte_idx++] = (wchar_t) ch;
356 if (BE (mbclen == (size_t) -1, 0))
357 pstr->cur_state = prev_st;
358 }
359 else
360 {
361 /* The buffer doesn't have enough space, finish to build. */
362 pstr->cur_state = prev_st;
363 break;
364 }
365 }
366 pstr->valid_len = byte_idx;
367 pstr->valid_raw_len = byte_idx;
368 return REG_NOERROR;
369 }
370 else
371 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
372 {
373 wchar_t wc;
374 const char *p;
375 offsets_needed:
376 remain_len = end_idx - byte_idx;
377 prev_st = pstr->cur_state;
378 if (BE (pstr->trans != NULL, 0))
379 {
380 int i, ch;
381
382 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
383 {
384 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385 buf[i] = pstr->trans[ch];
386 }
387 p = (const char *) buf;
388 }
389 else
390 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
391 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
392 if (BE (mbclen + 2 > 2, 1))
393 {
394 wchar_t wcu = wc;
395 if (iswlower (wc))
396 {
397 size_t mbcdlen;
398
399 wcu = towupper (wc);
400 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
401 if (BE (mbclen == mbcdlen, 1))
402 memcpy (pstr->mbs + byte_idx, buf, mbclen);
403 else if (mbcdlen != (size_t) -1)
404 {
405 size_t i;
406
407 if (byte_idx + mbcdlen > pstr->bufs_len)
408 {
409 pstr->cur_state = prev_st;
410 break;
411 }
412
413 if (pstr->offsets == NULL)
414 {
415 pstr->offsets = re_malloc (int, pstr->bufs_len);
416
417 if (pstr->offsets == NULL)
418 return REG_ESPACE;
419 }
420 if (!pstr->offsets_needed)
421 {
422 for (i = 0; i < (size_t) byte_idx; ++i)
423 pstr->offsets[i] = i;
424 pstr->offsets_needed = 1;
425 }
426
427 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
428 pstr->wcs[byte_idx] = wcu;
429 pstr->offsets[byte_idx] = src_idx;
430 for (i = 1; i < mbcdlen; ++i)
431 {
432 pstr->offsets[byte_idx + i]
433 = src_idx + (i < mbclen ? i : mbclen - 1);
434 pstr->wcs[byte_idx + i] = WEOF;
435 }
436 pstr->len += mbcdlen - mbclen;
437 if (pstr->raw_stop > src_idx)
438 pstr->stop += mbcdlen - mbclen;
439 end_idx = (pstr->bufs_len > pstr->len)
440 ? pstr->len : pstr->bufs_len;
441 byte_idx += mbcdlen;
442 src_idx += mbclen;
443 continue;
444 }
445 else
446 memcpy (pstr->mbs + byte_idx, p, mbclen);
447 }
448 else
449 memcpy (pstr->mbs + byte_idx, p, mbclen);
450
451 if (BE (pstr->offsets_needed != 0, 0))
452 {
453 size_t i;
454 for (i = 0; i < mbclen; ++i)
455 pstr->offsets[byte_idx + i] = src_idx + i;
456 }
457 src_idx += mbclen;
458
459 pstr->wcs[byte_idx++] = wcu;
460 /* Write paddings. */
461 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
462 pstr->wcs[byte_idx++] = WEOF;
463 }
464 else if (mbclen == (size_t) -1 || mbclen == 0)
465 {
466 /* It is an invalid character or '\0'. Just use the byte. */
467 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468
469 if (BE (pstr->trans != NULL, 0))
470 ch = pstr->trans [ch];
471 pstr->mbs[byte_idx] = ch;
472
473 if (BE (pstr->offsets_needed != 0, 0))
474 pstr->offsets[byte_idx] = src_idx;
475 ++src_idx;
476
477 /* And also cast it to wide char. */
478 pstr->wcs[byte_idx++] = (wchar_t) ch;
479 if (BE (mbclen == (size_t) -1, 0))
480 pstr->cur_state = prev_st;
481 }
482 else
483 {
484 /* The buffer doesn't have enough space, finish to build. */
485 pstr->cur_state = prev_st;
486 break;
487 }
488 }
489 pstr->valid_len = byte_idx;
490 pstr->valid_raw_len = src_idx;
491 return REG_NOERROR;
492}
493
494/* Skip characters until the index becomes greater than NEW_RAW_IDX.
495 Return the index. */
496
497static int
498re_string_skip_chars (pstr, new_raw_idx, last_wc)
499 re_string_t *pstr;
500 int new_raw_idx;
501 wint_t *last_wc;
502{
503 mbstate_t prev_st;
504 int rawbuf_idx;
505 size_t mbclen;
506 wchar_t wc = 0;
507
508 /* Skip the characters which are not necessary to check. */
509 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
510 rawbuf_idx < new_raw_idx;)
511 {
512 int remain_len;
513 remain_len = pstr->len - rawbuf_idx;
514 prev_st = pstr->cur_state;
515 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
516 remain_len, &pstr->cur_state);
517 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
518 {
519 /* We treat these cases as a singlebyte character. */
520 mbclen = 1;
521 pstr->cur_state = prev_st;
522 }
523 /* Then proceed the next character. */
524 rawbuf_idx += mbclen;
525 }
526 *last_wc = (wint_t) wc;
527 return rawbuf_idx;
528}
529#endif /* RE_ENABLE_I18N */
530
531/* Build the buffer PSTR->MBS, and apply the translation if we need.
532 This function is used in case of REG_ICASE. */
533
534static void
535build_upper_buffer (pstr)
536 re_string_t *pstr;
537{
538 int char_idx, end_idx;
539 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
540
541 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
542 {
543 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544 if (BE (pstr->trans != NULL, 0))
545 ch = pstr->trans[ch];
546 if (islower (ch))
547 pstr->mbs[char_idx] = toupper (ch);
548 else
549 pstr->mbs[char_idx] = ch;
550 }
551 pstr->valid_len = char_idx;
552 pstr->valid_raw_len = char_idx;
553}
554
555/* Apply TRANS to the buffer in PSTR. */
556
557static void
558re_string_translate_buffer (pstr)
559 re_string_t *pstr;
560{
561 int buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565 {
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
568 }
569
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
572}
573
574/* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
577
578static reg_errcode_t
579re_string_reconstruct (pstr, idx, eflags)
580 re_string_t *pstr;
581 int idx, eflags;
582{
583 int offset = idx - pstr->raw_mbs_idx;
584 if (BE (offset < 0, 0))
585 {
586 /* Reset buffer. */
587#ifdef RE_ENABLE_I18N
588 if (pstr->mb_cur_max > 1)
589 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590#endif /* RE_ENABLE_I18N */
591 pstr->len = pstr->raw_len;
592 pstr->stop = pstr->raw_stop;
593 pstr->valid_len = 0;
594 pstr->raw_mbs_idx = 0;
595 pstr->valid_raw_len = 0;
596 pstr->offsets_needed = 0;
597 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
599 if (!pstr->mbs_allocated)
600 pstr->mbs = (unsigned char *) pstr->raw_mbs;
601 offset = idx;
602 }
603
604 if (BE (offset != 0, 1))
605 {
606 /* Are the characters which are already checked remain? */
607 if (BE (offset < pstr->valid_raw_len, 1)
608#ifdef RE_ENABLE_I18N
609 /* Handling this would enlarge the code too much.
610 Accept a slowdown in that case. */
611 && pstr->offsets_needed == 0
612#endif
613 )
614 {
615 /* Yes, move them to the front of the buffer. */
616 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
617#ifdef RE_ENABLE_I18N
618 if (pstr->mb_cur_max > 1)
619 memmove (pstr->wcs, pstr->wcs + offset,
620 (pstr->valid_len - offset) * sizeof (wint_t));
621#endif /* RE_ENABLE_I18N */
622 if (BE (pstr->mbs_allocated, 0))
623 memmove (pstr->mbs, pstr->mbs + offset,
624 pstr->valid_len - offset);
625 pstr->valid_len -= offset;
626 pstr->valid_raw_len -= offset;
627#if DEBUG
628 assert (pstr->valid_len > 0);
629#endif
630 }
631 else
632 {
633 /* No, skip all characters until IDX. */
634#ifdef RE_ENABLE_I18N
635 if (BE (pstr->offsets_needed, 0))
636 {
637 pstr->len = pstr->raw_len - idx + offset;
638 pstr->stop = pstr->raw_stop - idx + offset;
639 pstr->offsets_needed = 0;
640 }
641#endif
642 pstr->valid_len = 0;
643 pstr->valid_raw_len = 0;
644#ifdef RE_ENABLE_I18N
645 if (pstr->mb_cur_max > 1)
646 {
647 int wcs_idx;
648 wint_t wc = WEOF;
649
650 if (pstr->is_utf8)
651 {
652 const unsigned char *raw, *p, *q, *end;
653
654 /* Special case UTF-8. Multi-byte chars start with any
655 byte other than 0x80 - 0xbf. */
656 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
657 end = raw + (offset - pstr->mb_cur_max);
658 for (p = raw + offset - 1; p >= end; --p)
659 if ((*p & 0xc0) != 0x80)
660 {
661 mbstate_t cur_state;
662 wchar_t wc2;
663 int mlen = raw + pstr->len - p;
664 unsigned char buf[6];
665
666 q = p;
667 if (BE (pstr->trans != NULL, 0))
668 {
669 int i = mlen < 6 ? mlen : 6;
670 while (--i >= 0)
671 buf[i] = pstr->trans[p[i]];
672 q = buf;
673 }
674 /* XXX Don't use mbrtowc, we know which conversion
675 to use (UTF-8 -> UCS4). */
676 memset (&cur_state, 0, sizeof (cur_state));
677 mlen = (mbrtowc (&wc2, (const char *) p, mlen,
678 &cur_state)
679 - (raw + offset - p));
680 if (mlen >= 0)
681 {
682 memset (&pstr->cur_state, '\0',
683 sizeof (mbstate_t));
684 pstr->valid_len = mlen;
685 wc = wc2;
686 }
687 break;
688 }
689 }
690
691 if (wc == WEOF)
692 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
693 if (BE (pstr->valid_len, 0))
694 {
695 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
696 pstr->wcs[wcs_idx] = WEOF;
697 if (pstr->mbs_allocated)
698 memset (pstr->mbs, 255, pstr->valid_len);
699 }
700 pstr->valid_raw_len = pstr->valid_len;
701 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
702 && IS_WIDE_WORD_CHAR (wc))
703 ? CONTEXT_WORD
704 : ((IS_WIDE_NEWLINE (wc)
705 && pstr->newline_anchor)
706 ? CONTEXT_NEWLINE : 0));
707 }
708 else
709#endif /* RE_ENABLE_I18N */
710 {
711 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
712 if (pstr->trans)
713 c = pstr->trans[c];
714 pstr->tip_context = (bitset_contain (pstr->word_char, c)
715 ? CONTEXT_WORD
716 : ((IS_NEWLINE (c) && pstr->newline_anchor)
717 ? CONTEXT_NEWLINE : 0));
718 }
719 }
720 if (!BE (pstr->mbs_allocated, 0))
721 pstr->mbs += offset;
722 }
723 pstr->raw_mbs_idx = idx;
724 pstr->len -= offset;
725 pstr->stop -= offset;
726
727 /* Then build the buffers. */
728#ifdef RE_ENABLE_I18N
729 if (pstr->mb_cur_max > 1)
730 {
731 if (pstr->icase)
732 {
733 int ret = build_wcs_upper_buffer (pstr);
734 if (BE (ret != REG_NOERROR, 0))
735 return ret;
736 }
737 else
738 build_wcs_buffer (pstr);
739 }
740 else
741#endif /* RE_ENABLE_I18N */
742 if (BE (pstr->mbs_allocated, 0))
743 {
744 if (pstr->icase)
745 build_upper_buffer (pstr);
746 else if (pstr->trans != NULL)
747 re_string_translate_buffer (pstr);
748 }
749 else
750 pstr->valid_len = pstr->len;
751
752 pstr->cur_idx = 0;
753 return REG_NOERROR;
754}
755
756static unsigned char
757re_string_peek_byte_case (pstr, idx)
758 const re_string_t *pstr;
759 int idx;
760{
761 int ch, off;
762
763 /* Handle the common (easiest) cases first. */
764 if (BE (!pstr->mbs_allocated, 1))
765 return re_string_peek_byte (pstr, idx);
766
767#ifdef RE_ENABLE_I18N
768 if (pstr->mb_cur_max > 1
769 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
770 return re_string_peek_byte (pstr, idx);
771#endif
772
773 off = pstr->cur_idx + idx;
774#ifdef RE_ENABLE_I18N
775 if (pstr->offsets_needed)
776 off = pstr->offsets[off];
777#endif
778
779 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
780
781#ifdef RE_ENABLE_I18N
782 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
783 this function returns CAPITAL LETTER I instead of first byte of
784 DOTLESS SMALL LETTER I. The latter would confuse the parser,
785 since peek_byte_case doesn't advance cur_idx in any way. */
786 if (pstr->offsets_needed && !isascii (ch))
787 return re_string_peek_byte (pstr, idx);
788#endif
789
790 return ch;
791}
792
793static unsigned char
794re_string_fetch_byte_case (pstr)
795 re_string_t *pstr;
796{
797 if (BE (!pstr->mbs_allocated, 1))
798 return re_string_fetch_byte (pstr);
799
800#ifdef RE_ENABLE_I18N
801 if (pstr->offsets_needed)
802 {
803 int off, ch;
804
805 /* For tr_TR.UTF-8 [[:islower:]] there is
806 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
807 in that case the whole multi-byte character and return
808 the original letter. On the other side, with
809 [[: DOTLESS SMALL LETTER I return [[:I, as doing
810 anything else would complicate things too much. */
811
812 if (!re_string_first_byte (pstr, pstr->cur_idx))
813 return re_string_fetch_byte (pstr);
814
815 off = pstr->offsets[pstr->cur_idx];
816 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
817
818 if (! isascii (ch))
819 return re_string_fetch_byte (pstr);
820
821 re_string_skip_bytes (pstr,
822 re_string_char_size_at (pstr, pstr->cur_idx));
823 return ch;
824 }
825#endif
826
827 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
828}
829
830static void
831re_string_destruct (pstr)
832 re_string_t *pstr;
833{
834#ifdef RE_ENABLE_I18N
835 re_free (pstr->wcs);
836 re_free (pstr->offsets);
837#endif /* RE_ENABLE_I18N */
838 if (pstr->mbs_allocated)
839 re_free (pstr->mbs);
840}
841
842/* Return the context at IDX in INPUT. */
843
844static unsigned int
845re_string_context_at (input, idx, eflags)
846 const re_string_t *input;
847 int idx, eflags;
848{
849 int c;
850 if (BE (idx < 0, 0))
851 /* In this case, we use the value stored in input->tip_context,
852 since we can't know the character in input->mbs[-1] here. */
853 return input->tip_context;
854 if (BE (idx == input->len, 0))
855 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
856 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
857#ifdef RE_ENABLE_I18N
858 if (input->mb_cur_max > 1)
859 {
860 wint_t wc;
861 int wc_idx = idx;
862 while(input->wcs[wc_idx] == WEOF)
863 {
864#ifdef DEBUG
865 /* It must not happen. */
866 assert (wc_idx >= 0);
867#endif
868 --wc_idx;
869 if (wc_idx < 0)
870 return input->tip_context;
871 }
872 wc = input->wcs[wc_idx];
873 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
874 return CONTEXT_WORD;
875 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
876 ? CONTEXT_NEWLINE : 0);
877 }
878 else
879#endif
880 {
881 c = re_string_byte_at (input, idx);
882 if (bitset_contain (input->word_char, c))
883 return CONTEXT_WORD;
884 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
885 }
886}
887
888
889/* Functions for set operation. */
890
891static reg_errcode_t
892re_node_set_alloc (set, size)
893 re_node_set *set;
894 int size;
895{
896 set->alloc = size;
897 set->nelem = 0;
898 set->elems = re_malloc (int, size);
899 if (BE (set->elems == NULL, 0))
900 return REG_ESPACE;
901 return REG_NOERROR;
902}
903
904static reg_errcode_t
905re_node_set_init_1 (set, elem)
906 re_node_set *set;
907 int elem;
908{
909 set->alloc = 1;
910 set->nelem = 1;
911 set->elems = re_malloc (int, 1);
912 if (BE (set->elems == NULL, 0))
913 {
914 set->alloc = set->nelem = 0;
915 return REG_ESPACE;
916 }
917 set->elems[0] = elem;
918 return REG_NOERROR;
919}
920
921static reg_errcode_t
922re_node_set_init_2 (set, elem1, elem2)
923 re_node_set *set;
924 int elem1, elem2;
925{
926 set->alloc = 2;
927 set->elems = re_malloc (int, 2);
928 if (BE (set->elems == NULL, 0))
929 return REG_ESPACE;
930 if (elem1 == elem2)
931 {
932 set->nelem = 1;
933 set->elems[0] = elem1;
934 }
935 else
936 {
937 set->nelem = 2;
938 if (elem1 < elem2)
939 {
940 set->elems[0] = elem1;
941 set->elems[1] = elem2;
942 }
943 else
944 {
945 set->elems[0] = elem2;
946 set->elems[1] = elem1;
947 }
948 }
949 return REG_NOERROR;
950}
951
952static reg_errcode_t
953re_node_set_init_copy (dest, src)
954 re_node_set *dest;
955 const re_node_set *src;
956{
957 dest->nelem = src->nelem;
958 if (src->nelem > 0)
959 {
960 dest->alloc = dest->nelem;
961 dest->elems = re_malloc (int, dest->alloc);
962 if (BE (dest->elems == NULL, 0))
963 {
964 dest->alloc = dest->nelem = 0;
965 return REG_ESPACE;
966 }
967 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
968 }
969 else
970 re_node_set_init_empty (dest);
971 return REG_NOERROR;
972}
973
974/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
975 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
976 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
977
978static reg_errcode_t
979re_node_set_add_intersect (dest, src1, src2)
980 re_node_set *dest;
981 const re_node_set *src1, *src2;
982{
983 int i1, i2, is, id, delta, sbase;
984 if (src1->nelem == 0 || src2->nelem == 0)
985 return REG_NOERROR;
986
987 /* We need dest->nelem + 2 * elems_in_intersection; this is a
988 conservative estimate. */
989 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
990 {
991 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
992 int *new_elems = re_realloc (dest->elems, int, new_alloc);
993 if (BE (new_elems == NULL, 0))
994 return REG_ESPACE;
995 dest->elems = new_elems;
996 dest->alloc = new_alloc;
997 }
998
999 /* Find the items in the intersection of SRC1 and SRC2, and copy
1000 into the top of DEST those that are not already in DEST itself. */
1001 sbase = dest->nelem + src1->nelem + src2->nelem;
1002 i1 = src1->nelem - 1;
1003 i2 = src2->nelem - 1;
1004 id = dest->nelem - 1;
1005 for (;;)
1006 {
1007 if (src1->elems[i1] == src2->elems[i2])
1008 {
1009 /* Try to find the item in DEST. Maybe we could binary search? */
1010 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1011 --id;
1012
1013 if (id < 0 || dest->elems[id] != src1->elems[i1])
1014 dest->elems[--sbase] = src1->elems[i1];
1015
1016 if (--i1 < 0 || --i2 < 0)
1017 break;
1018 }
1019
1020 /* Lower the highest of the two items. */
1021 else if (src1->elems[i1] < src2->elems[i2])
1022 {
1023 if (--i2 < 0)
1024 break;
1025 }
1026 else
1027 {
1028 if (--i1 < 0)
1029 break;
1030 }
1031 }
1032
1033 id = dest->nelem - 1;
1034 is = dest->nelem + src1->nelem + src2->nelem - 1;
1035 delta = is - sbase + 1;
1036
1037 /* Now copy. When DELTA becomes zero, the remaining
1038 DEST elements are already in place; this is more or
1039 less the same loop that is in re_node_set_merge. */
1040 dest->nelem += delta;
1041 if (delta > 0 && id >= 0)
1042 for (;;)
1043 {
1044 if (dest->elems[is] > dest->elems[id])
1045 {
1046 /* Copy from the top. */
1047 dest->elems[id + delta--] = dest->elems[is--];
1048 if (delta == 0)
1049 break;
1050 }
1051 else
1052 {
1053 /* Slide from the bottom. */
1054 dest->elems[id + delta] = dest->elems[id];
1055 if (--id < 0)
1056 break;
1057 }
1058 }
1059
1060 /* Copy remaining SRC elements. */
1061 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1062
1063 return REG_NOERROR;
1064}
1065
1066/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1067 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1068
1069static reg_errcode_t
1070re_node_set_init_union (dest, src1, src2)
1071 re_node_set *dest;
1072 const re_node_set *src1, *src2;
1073{
1074 int i1, i2, id;
1075 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1076 {
1077 dest->alloc = src1->nelem + src2->nelem;
1078 dest->elems = re_malloc (int, dest->alloc);
1079 if (BE (dest->elems == NULL, 0))
1080 return REG_ESPACE;
1081 }
1082 else
1083 {
1084 if (src1 != NULL && src1->nelem > 0)
1085 return re_node_set_init_copy (dest, src1);
1086 else if (src2 != NULL && src2->nelem > 0)
1087 return re_node_set_init_copy (dest, src2);
1088 else
1089 re_node_set_init_empty (dest);
1090 return REG_NOERROR;
1091 }
1092 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1093 {
1094 if (src1->elems[i1] > src2->elems[i2])
1095 {
1096 dest->elems[id++] = src2->elems[i2++];
1097 continue;
1098 }
1099 if (src1->elems[i1] == src2->elems[i2])
1100 ++i2;
1101 dest->elems[id++] = src1->elems[i1++];
1102 }
1103 if (i1 < src1->nelem)
1104 {
1105 memcpy (dest->elems + id, src1->elems + i1,
1106 (src1->nelem - i1) * sizeof (int));
1107 id += src1->nelem - i1;
1108 }
1109 else if (i2 < src2->nelem)
1110 {
1111 memcpy (dest->elems + id, src2->elems + i2,
1112 (src2->nelem - i2) * sizeof (int));
1113 id += src2->nelem - i2;
1114 }
1115 dest->nelem = id;
1116 return REG_NOERROR;
1117}
1118
1119/* Calculate the union set of the sets DEST and SRC. And store it to
1120 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1121
1122static reg_errcode_t
1123re_node_set_merge (dest, src)
1124 re_node_set *dest;
1125 const re_node_set *src;
1126{
1127 int is, id, sbase, delta;
1128 if (src == NULL || src->nelem == 0)
1129 return REG_NOERROR;
1130 if (dest->alloc < 2 * src->nelem + dest->nelem)
1131 {
1132 int new_alloc = 2 * (src->nelem + dest->alloc);
1133 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1134 if (BE (new_buffer == NULL, 0))
1135 return REG_ESPACE;
1136 dest->elems = new_buffer;
1137 dest->alloc = new_alloc;
1138 }
1139
1140 if (BE (dest->nelem == 0, 0))
1141 {
1142 dest->nelem = src->nelem;
1143 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1144 return REG_NOERROR;
1145 }
1146
1147 /* Copy into the top of DEST the items of SRC that are not
1148 found in DEST. Maybe we could binary search in DEST? */
1149 for (sbase = dest->nelem + 2 * src->nelem,
1150 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1151 {
1152 if (dest->elems[id] == src->elems[is])
1153 is--, id--;
1154 else if (dest->elems[id] < src->elems[is])
1155 dest->elems[--sbase] = src->elems[is--];
1156 else /* if (dest->elems[id] > src->elems[is]) */
1157 --id;
1158 }
1159
1160 if (is >= 0)
1161 {
1162 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1163 sbase -= is + 1;
1164 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1165 }
1166
1167 id = dest->nelem - 1;
1168 is = dest->nelem + 2 * src->nelem - 1;
1169 delta = is - sbase + 1;
1170 if (delta == 0)
1171 return REG_NOERROR;
1172
1173 /* Now copy. When DELTA becomes zero, the remaining
1174 DEST elements are already in place. */
1175 dest->nelem += delta;
1176 for (;;)
1177 {
1178 if (dest->elems[is] > dest->elems[id])
1179 {
1180 /* Copy from the top. */
1181 dest->elems[id + delta--] = dest->elems[is--];
1182 if (delta == 0)
1183 break;
1184 }
1185 else
1186 {
1187 /* Slide from the bottom. */
1188 dest->elems[id + delta] = dest->elems[id];
1189 if (--id < 0)
1190 {
1191 /* Copy remaining SRC elements. */
1192 memcpy (dest->elems, dest->elems + sbase,
1193 delta * sizeof (int));
1194 break;
1195 }
1196 }
1197 }
1198
1199 return REG_NOERROR;
1200}
1201
1202/* Insert the new element ELEM to the re_node_set* SET.
1203 SET should not already have ELEM.
1204 return -1 if an error is occured, return 1 otherwise. */
1205
1206static int
1207re_node_set_insert (set, elem)
1208 re_node_set *set;
1209 int elem;
1210{
1211 int idx;
1212 /* In case the set is empty. */
1213 if (set->alloc == 0)
1214 {
1215 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1216 return 1;
1217 else
1218 return -1;
1219 }
1220
1221 if (BE (set->nelem, 0) == 0)
1222 {
1223 /* We already guaranteed above that set->alloc != 0. */
1224 set->elems[0] = elem;
1225 ++set->nelem;
1226 return 1;
1227 }
1228
1229 /* Realloc if we need. */
1230 if (set->alloc == set->nelem)
1231 {
1232 int *new_array;
1233 set->alloc = set->alloc * 2;
1234 new_array = re_realloc (set->elems, int, set->alloc);
1235 if (BE (new_array == NULL, 0))
1236 return -1;
1237 set->elems = new_array;
1238 }
1239
1240 /* Move the elements which follows the new element. Test the
1241 first element separately to skip a check in the inner loop. */
1242 if (elem < set->elems[0])
1243 {
1244 idx = 0;
1245 for (idx = set->nelem; idx > 0; idx--)
1246 set->elems[idx] = set->elems[idx - 1];
1247 }
1248 else
1249 {
1250 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1251 set->elems[idx] = set->elems[idx - 1];
1252 }
1253
1254 /* Insert the new element. */
1255 set->elems[idx] = elem;
1256 ++set->nelem;
1257 return 1;
1258}
1259
1260/* Insert the new element ELEM to the re_node_set* SET.
1261 SET should not already have any element greater than or equal to ELEM.
1262 Return -1 if an error is occured, return 1 otherwise. */
1263
1264static int
1265re_node_set_insert_last (set, elem)
1266 re_node_set *set;
1267 int elem;
1268{
1269 /* Realloc if we need. */
1270 if (set->alloc == set->nelem)
1271 {
1272 int *new_array;
1273 set->alloc = (set->alloc + 1) * 2;
1274 new_array = re_realloc (set->elems, int, set->alloc);
1275 if (BE (new_array == NULL, 0))
1276 return -1;
1277 set->elems = new_array;
1278 }
1279
1280 /* Insert the new element. */
1281 set->elems[set->nelem++] = elem;
1282 return 1;
1283}
1284
1285/* Compare two node sets SET1 and SET2.
1286 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1287
1288static int
1289re_node_set_compare (set1, set2)
1290 const re_node_set *set1, *set2;
1291{
1292 int i;
1293 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1294 return 0;
1295 for (i = set1->nelem ; --i >= 0 ; )
1296 if (set1->elems[i] != set2->elems[i])
1297 return 0;
1298 return 1;
1299}
1300
1301/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1302
1303static int
1304re_node_set_contains (set, elem)
1305 const re_node_set *set;
1306 int elem;
1307{
1308 unsigned int idx, right, mid;
1309 if (set->nelem <= 0)
1310 return 0;
1311
1312 /* Binary search the element. */
1313 idx = 0;
1314 right = set->nelem - 1;
1315 while (idx < right)
1316 {
1317 mid = (idx + right) / 2;
1318 if (set->elems[mid] < elem)
1319 idx = mid + 1;
1320 else
1321 right = mid;
1322 }
1323 return set->elems[idx] == elem ? idx + 1 : 0;
1324}
1325
1326static void
1327re_node_set_remove_at (set, idx)
1328 re_node_set *set;
1329 int idx;
1330{
1331 if (idx < 0 || idx >= set->nelem)
1332 return;
1333 --set->nelem;
1334 for (; idx < set->nelem; idx++)
1335 set->elems[idx] = set->elems[idx + 1];
1336}
1337
1338
1339
1340/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1341 Or return -1, if an error will be occured. */
1342
1343static int
1344re_dfa_add_node (dfa, token)
1345 re_dfa_t *dfa;
1346 re_token_t token;
1347{
1348 int type = token.type;
1349 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1350 {
1351 int new_nodes_alloc = dfa->nodes_alloc * 2;
1352 int *new_nexts, *new_indices;
1353 re_node_set *new_edests, *new_eclosures;
1354
1355 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1356 new_nodes_alloc);
1357 if (BE (new_array == NULL, 0))
1358 return -1;
1359 dfa->nodes = new_array;
1360 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1361 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1362 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1363 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1364 if (BE (new_nexts == NULL || new_indices == NULL
1365 || new_edests == NULL || new_eclosures == NULL, 0))
1366 return -1;
1367 dfa->nexts = new_nexts;
1368 dfa->org_indices = new_indices;
1369 dfa->edests = new_edests;
1370 dfa->eclosures = new_eclosures;
1371 dfa->nodes_alloc = new_nodes_alloc;
1372 }
1373 dfa->nodes[dfa->nodes_len] = token;
1374 dfa->nodes[dfa->nodes_len].constraint = 0;
1375#ifdef RE_ENABLE_I18N
1376 dfa->nodes[dfa->nodes_len].accept_mb =
1377 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1378#endif
1379 dfa->nexts[dfa->nodes_len] = -1;
1380 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1381 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1382 return dfa->nodes_len++;
1383}
1384
1385static unsigned int inline
1386calc_state_hash (nodes, context)
1387 const re_node_set *nodes;
1388 unsigned int context;
1389{
1390 unsigned int hash = nodes->nelem + context;
1391 int i;
1392 for (i = 0 ; i < nodes->nelem ; i++)
1393 hash += nodes->elems[i];
1394 return hash;
1395}
1396
1397/* Search for the state whose node_set is equivalent to NODES.
1398 Return the pointer to the state, if we found it in the DFA.
1399 Otherwise create the new one and return it. In case of an error
1400 return NULL and set the error code in ERR.
1401 Note: - We assume NULL as the invalid state, then it is possible that
1402 return value is NULL and ERR is REG_NOERROR.
1403 - We never return non-NULL value in case of any errors, it is for
1404 optimization. */
1405
1406static re_dfastate_t*
1407re_acquire_state (err, dfa, nodes)
1408 reg_errcode_t *err;
1409 re_dfa_t *dfa;
1410 const re_node_set *nodes;
1411{
1412 unsigned int hash;
1413 re_dfastate_t *new_state;
1414 struct re_state_table_entry *spot;
1415 int i;
1416 if (BE (nodes->nelem == 0, 0))
1417 {
1418 *err = REG_NOERROR;
1419 return NULL;
1420 }
1421 hash = calc_state_hash (nodes, 0);
1422 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1423
1424 for (i = 0 ; i < spot->num ; i++)
1425 {
1426 re_dfastate_t *state = spot->array[i];
1427 if (hash != state->hash)
1428 continue;
1429 if (re_node_set_compare (&state->nodes, nodes))
1430 return state;
1431 }
1432
1433 /* There are no appropriate state in the dfa, create the new one. */
1434 new_state = create_ci_newstate (dfa, nodes, hash);
1435 if (BE (new_state != NULL, 1))
1436 return new_state;
1437 else
1438 {
1439 *err = REG_ESPACE;
1440 return NULL;
1441 }
1442}
1443
1444/* Search for the state whose node_set is equivalent to NODES and
1445 whose context is equivalent to CONTEXT.
1446 Return the pointer to the state, if we found it in the DFA.
1447 Otherwise create the new one and return it. In case of an error
1448 return NULL and set the error code in ERR.
1449 Note: - We assume NULL as the invalid state, then it is possible that
1450 return value is NULL and ERR is REG_NOERROR.
1451 - We never return non-NULL value in case of any errors, it is for
1452 optimization. */
1453
1454static re_dfastate_t*
1455re_acquire_state_context (err, dfa, nodes, context)
1456 reg_errcode_t *err;
1457 re_dfa_t *dfa;
1458 const re_node_set *nodes;
1459 unsigned int context;
1460{
1461 unsigned int hash;
1462 re_dfastate_t *new_state;
1463 struct re_state_table_entry *spot;
1464 int i;
1465 if (nodes->nelem == 0)
1466 {
1467 *err = REG_NOERROR;
1468 return NULL;
1469 }
1470 hash = calc_state_hash (nodes, context);
1471 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1472
1473 for (i = 0 ; i < spot->num ; i++)
1474 {
1475 re_dfastate_t *state = spot->array[i];
1476 if (state->hash == hash
1477 && state->context == context
1478 && re_node_set_compare (state->entrance_nodes, nodes))
1479 return state;
1480 }
1481 /* There are no appropriate state in `dfa', create the new one. */
1482 new_state = create_cd_newstate (dfa, nodes, context, hash);
1483 if (BE (new_state != NULL, 1))
1484 return new_state;
1485 else
1486 {
1487 *err = REG_ESPACE;
1488 return NULL;
1489 }
1490}
1491
1492/* Finish initialization of the new state NEWSTATE, and using its hash value
1493 HASH put in the appropriate bucket of DFA's state table. Return value
1494 indicates the error code if failed. */
1495
1496static reg_errcode_t
1497register_state (dfa, newstate, hash)
1498 re_dfa_t *dfa;
1499 re_dfastate_t *newstate;
1500 unsigned int hash;
1501{
1502 struct re_state_table_entry *spot;
1503 reg_errcode_t err;
1504 int i;
1505
1506 newstate->hash = hash;
1507 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1508 if (BE (err != REG_NOERROR, 0))
1509 return REG_ESPACE;
1510 for (i = 0; i < newstate->nodes.nelem; i++)
1511 {
1512 int elem = newstate->nodes.elems[i];
1513 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1514 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1515 }
1516
1517 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1518 if (BE (spot->alloc <= spot->num, 0))
1519 {
1520 int new_alloc = 2 * spot->num + 2;
1521 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1522 new_alloc);
1523 if (BE (new_array == NULL, 0))
1524 return REG_ESPACE;
1525 spot->array = new_array;
1526 spot->alloc = new_alloc;
1527 }
1528 spot->array[spot->num++] = newstate;
1529 return REG_NOERROR;
1530}
1531
1532/* Create the new state which is independ of contexts.
1533 Return the new state if succeeded, otherwise return NULL. */
1534
1535static re_dfastate_t *
1536create_ci_newstate (dfa, nodes, hash)
1537 re_dfa_t *dfa;
1538 const re_node_set *nodes;
1539 unsigned int hash;
1540{
1541 int i;
1542 reg_errcode_t err;
1543 re_dfastate_t *newstate;
1544
1545 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1546 if (BE (newstate == NULL, 0))
1547 return NULL;
1548 err = re_node_set_init_copy (&newstate->nodes, nodes);
1549 if (BE (err != REG_NOERROR, 0))
1550 {
1551 re_free (newstate);
1552 return NULL;
1553 }
1554
1555 newstate->entrance_nodes = &newstate->nodes;
1556 for (i = 0 ; i < nodes->nelem ; i++)
1557 {
1558 re_token_t *node = dfa->nodes + nodes->elems[i];
1559 re_token_type_t type = node->type;
1560 if (type == CHARACTER && !node->constraint)
1561 continue;
1562#ifdef RE_ENABLE_I18N
1563 newstate->accept_mb |= node->accept_mb;
1564#endif /* RE_ENABLE_I18N */
1565
1566 /* If the state has the halt node, the state is a halt state. */
1567 if (type == END_OF_RE)
1568 newstate->halt = 1;
1569 else if (type == OP_BACK_REF)
1570 newstate->has_backref = 1;
1571 else if (type == ANCHOR || node->constraint)
1572 newstate->has_constraint = 1;
1573 }
1574 err = register_state (dfa, newstate, hash);
1575 if (BE (err != REG_NOERROR, 0))
1576 {
1577 free_state (newstate);
1578 newstate = NULL;
1579 }
1580 return newstate;
1581}
1582
1583/* Create the new state which is depend on the context CONTEXT.
1584 Return the new state if succeeded, otherwise return NULL. */
1585
1586static re_dfastate_t *
1587create_cd_newstate (dfa, nodes, context, hash)
1588 re_dfa_t *dfa;
1589 const re_node_set *nodes;
1590 unsigned int context, hash;
1591{
1592 int i, nctx_nodes = 0;
1593 reg_errcode_t err;
1594 re_dfastate_t *newstate;
1595
1596 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1597 if (BE (newstate == NULL, 0))
1598 return NULL;
1599 err = re_node_set_init_copy (&newstate->nodes, nodes);
1600 if (BE (err != REG_NOERROR, 0))
1601 {
1602 re_free (newstate);
1603 return NULL;
1604 }
1605
1606 newstate->context = context;
1607 newstate->entrance_nodes = &newstate->nodes;
1608
1609 for (i = 0 ; i < nodes->nelem ; i++)
1610 {
1611 unsigned int constraint = 0;
1612 re_token_t *node = dfa->nodes + nodes->elems[i];
1613 re_token_type_t type = node->type;
1614 if (node->constraint)
1615 constraint = node->constraint;
1616
1617 if (type == CHARACTER && !constraint)
1618 continue;
1619#ifdef RE_ENABLE_I18N
1620 newstate->accept_mb |= node->accept_mb;
1621#endif /* RE_ENABLE_I18N */
1622
1623 /* If the state has the halt node, the state is a halt state. */
1624 if (type == END_OF_RE)
1625 newstate->halt = 1;
1626 else if (type == OP_BACK_REF)
1627 newstate->has_backref = 1;
1628 else if (type == ANCHOR)
1629 constraint = node->opr.ctx_type;
1630
1631 if (constraint)
1632 {
1633 if (newstate->entrance_nodes == &newstate->nodes)
1634 {
1635 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1636 if (BE (newstate->entrance_nodes == NULL, 0))
1637 {
1638 free_state (newstate);
1639 return NULL;
1640 }
1641 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1642 nctx_nodes = 0;
1643 newstate->has_constraint = 1;
1644 }
1645
1646 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1647 {
1648 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1649 ++nctx_nodes;
1650 }
1651 }
1652 }
1653 err = register_state (dfa, newstate, hash);
1654 if (BE (err != REG_NOERROR, 0))
1655 {
1656 free_state (newstate);
1657 newstate = NULL;
1658 }
1659 return newstate;
1660}
1661
1662static void
1663free_state (state)
1664 re_dfastate_t *state;
1665{
1666 re_node_set_free (&state->non_eps_nodes);
1667 re_node_set_free (&state->inveclosure);
1668 if (state->entrance_nodes != &state->nodes)
1669 {
1670 re_node_set_free (state->entrance_nodes);
1671 re_free (state->entrance_nodes);
1672 }
1673 re_node_set_free (&state->nodes);
1674 re_free (state->word_trtable);
1675 re_free (state->trtable);
1676 re_free (state);
1677}
Note: See TracBrowser for help on using the repository browser.