source: vendor/gawk/3.1.5/regex_internal.c

Last change on this file was 3076, checked in by bird, 18 years ago

gawk 3.1.5

File size: 44.0 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
20
21static void re_string_construct_common (const char *str, int len,
22 re_string_t *pstr,
23 RE_TRANSLATE_TYPE trans, int icase,
24 const re_dfa_t *dfa) internal_function;
25#ifdef RE_ENABLE_I18N
26static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 wint_t *last_wc) internal_function;
28#endif /* RE_ENABLE_I18N */
29static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 unsigned int hash) internal_function;
31static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32 const re_node_set *nodes,
33 unsigned int hash) internal_function;
34static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35 const re_node_set *nodes,
36 unsigned int context,
37 unsigned int hash) internal_function;
38static unsigned int inline calc_state_hash (const re_node_set *nodes,
39 unsigned int context) internal_function;
40
41
42/* Functions for string operation. */
43
44/* This function allocate the buffers. It is necessary to call
45 re_string_reconstruct before using the object. */
46
47static reg_errcode_t
48re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
49 re_string_t *pstr;
50 const char *str;
51 int len, init_len, icase;
52 RE_TRANSLATE_TYPE trans;
53 const re_dfa_t *dfa;
54{
55 reg_errcode_t ret;
56 int init_buf_len;
57
58 /* Ensure at least one character fits into the buffers. */
59 if (init_len < dfa->mb_cur_max)
60 init_len = dfa->mb_cur_max;
61 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
62 re_string_construct_common (str, len, pstr, trans, icase, dfa);
63
64 ret = re_string_realloc_buffers (pstr, init_buf_len);
65 if (BE (ret != REG_NOERROR, 0))
66 return ret;
67
68 pstr->word_char = dfa->word_char;
69 pstr->word_ops_used = dfa->word_ops_used;
70 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
71 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
72 pstr->valid_raw_len = pstr->valid_len;
73 return REG_NOERROR;
74}
75
76/* This function allocate the buffers, and initialize them. */
77
78static reg_errcode_t
79re_string_construct (pstr, str, len, trans, icase, dfa)
80 re_string_t *pstr;
81 const char *str;
82 int len, icase;
83 RE_TRANSLATE_TYPE trans;
84 const re_dfa_t *dfa;
85{
86 reg_errcode_t ret;
87 memset (pstr, '\0', sizeof (re_string_t));
88 re_string_construct_common (str, len, pstr, trans, icase, dfa);
89
90 if (len > 0)
91 {
92 ret = re_string_realloc_buffers (pstr, len + 1);
93 if (BE (ret != REG_NOERROR, 0))
94 return ret;
95 }
96 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
97
98 if (icase)
99 {
100#ifdef RE_ENABLE_I18N
101 if (dfa->mb_cur_max > 1)
102 {
103 while (1)
104 {
105 ret = build_wcs_upper_buffer (pstr);
106 if (BE (ret != REG_NOERROR, 0))
107 return ret;
108 if (pstr->valid_raw_len >= len)
109 break;
110 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
111 break;
112 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
113 if (BE (ret != REG_NOERROR, 0))
114 return ret;
115 }
116 }
117 else
118#endif /* RE_ENABLE_I18N */
119 build_upper_buffer (pstr);
120 }
121 else
122 {
123#ifdef RE_ENABLE_I18N
124 if (dfa->mb_cur_max > 1)
125 build_wcs_buffer (pstr);
126 else
127#endif /* RE_ENABLE_I18N */
128 {
129 if (trans != NULL)
130 re_string_translate_buffer (pstr);
131 else
132 {
133 pstr->valid_len = pstr->bufs_len;
134 pstr->valid_raw_len = pstr->bufs_len;
135 }
136 }
137 }
138
139 return REG_NOERROR;
140}
141
142/* Helper functions for re_string_allocate, and re_string_construct. */
143
144static reg_errcode_t
145re_string_realloc_buffers (pstr, new_buf_len)
146 re_string_t *pstr;
147 int new_buf_len;
148{
149#ifdef RE_ENABLE_I18N
150 if (pstr->mb_cur_max > 1)
151 {
152 wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
153 if (BE (new_array == NULL, 0))
154 return REG_ESPACE;
155 pstr->wcs = new_array;
156 if (pstr->offsets != NULL)
157 {
158 int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
159 if (BE (new_array == NULL, 0))
160 return REG_ESPACE;
161 pstr->offsets = new_array;
162 }
163 }
164#endif /* RE_ENABLE_I18N */
165 if (pstr->mbs_allocated)
166 {
167 unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
168 new_buf_len);
169 if (BE (new_array == NULL, 0))
170 return REG_ESPACE;
171 pstr->mbs = new_array;
172 }
173 pstr->bufs_len = new_buf_len;
174 return REG_NOERROR;
175}
176
177
178static void
179re_string_construct_common (str, len, pstr, trans, icase, dfa)
180 const char *str;
181 int len;
182 re_string_t *pstr;
183 RE_TRANSLATE_TYPE trans;
184 int icase;
185 const re_dfa_t *dfa;
186{
187 pstr->raw_mbs = (const unsigned char *) str;
188 pstr->len = len;
189 pstr->raw_len = len;
190 pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
191 pstr->icase = icase ? 1 : 0;
192 pstr->mbs_allocated = (trans != NULL || icase);
193 pstr->mb_cur_max = dfa->mb_cur_max;
194 pstr->is_utf8 = dfa->is_utf8;
195 pstr->map_notascii = dfa->map_notascii;
196 pstr->stop = pstr->len;
197 pstr->raw_stop = pstr->stop;
198}
199
200#ifdef RE_ENABLE_I18N
201
202/* Build wide character buffer PSTR->WCS.
203 If the byte sequence of the string are:
204 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
205 Then wide character buffer will be:
206 <wc1> , WEOF , <wc2> , WEOF , <wc3>
207 We use WEOF for padding, they indicate that the position isn't
208 a first byte of a multibyte character.
209
210 Note that this function assumes PSTR->VALID_LEN elements are already
211 built and starts from PSTR->VALID_LEN. */
212
213static void
214build_wcs_buffer (pstr)
215 re_string_t *pstr;
216{
217#ifdef _LIBC
218 unsigned char buf[pstr->mb_cur_max];
219#else
220 unsigned char buf[64];
221#endif
222 mbstate_t prev_st;
223 int byte_idx, end_idx, mbclen, remain_len;
224
225 /* Build the buffers from pstr->valid_len to either pstr->len or
226 pstr->bufs_len. */
227 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
228 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
229 {
230 wchar_t wc;
231 const char *p;
232
233 remain_len = end_idx - byte_idx;
234 prev_st = pstr->cur_state;
235 /* Apply the translation if we need. */
236 if (BE (pstr->trans != NULL, 0))
237 {
238 int i, ch;
239
240 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
241 {
242 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
243 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
244 }
245 p = (const char *) buf;
246 }
247 else
248 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
249 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
250 if (BE (mbclen == (size_t) -2, 0))
251 {
252 /* The buffer doesn't have enough space, finish to build. */
253 pstr->cur_state = prev_st;
254 break;
255 }
256 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
257 {
258 /* We treat these cases as a singlebyte character. */
259 mbclen = 1;
260 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
261 if (BE (pstr->trans != NULL, 0))
262 wc = pstr->trans[wc];
263 pstr->cur_state = prev_st;
264 }
265
266 /* Write wide character and padding. */
267 pstr->wcs[byte_idx++] = wc;
268 /* Write paddings. */
269 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270 pstr->wcs[byte_idx++] = WEOF;
271 }
272 pstr->valid_len = byte_idx;
273 pstr->valid_raw_len = byte_idx;
274}
275
276/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277 but for REG_ICASE. */
278
279static int
280build_wcs_upper_buffer (pstr)
281 re_string_t *pstr;
282{
283 mbstate_t prev_st;
284 int src_idx, byte_idx, end_idx, mbclen, remain_len;
285#ifdef _LIBC
286 unsigned char buf[pstr->mb_cur_max];
287#else
288 unsigned char buf[64];
289#endif
290
291 byte_idx = pstr->valid_len;
292 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293
294 /* The following optimization assumes that ASCII characters can be
295 mapped to wide characters with a simple cast. */
296 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297 {
298 while (byte_idx < end_idx)
299 {
300 wchar_t wc;
301
302 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303 && mbsinit (&pstr->cur_state))
304 {
305 /* In case of a singlebyte character. */
306 pstr->mbs[byte_idx]
307 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308 /* The next step uses the assumption that wchar_t is encoded
309 ASCII-safe: all ASCII values can be converted like this. */
310 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311 ++byte_idx;
312 continue;
313 }
314
315 remain_len = end_idx - byte_idx;
316 prev_st = pstr->cur_state;
317 mbclen = mbrtowc (&wc,
318 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319 + byte_idx), remain_len, &pstr->cur_state);
320 if (BE (mbclen > 0, 1))
321 {
322 wchar_t wcu = wc;
323 if (iswlower (wc))
324 {
325 int mbcdlen;
326
327 wcu = towupper (wc);
328 mbcdlen = wcrtomb ((char *)buf, wcu, &prev_st);
329 if (BE (mbclen == mbcdlen, 1))
330 memcpy (pstr->mbs + byte_idx, buf, mbclen);
331 else
332 {
333 src_idx = byte_idx;
334 goto offsets_needed;
335 }
336 }
337 else
338 memcpy (pstr->mbs + byte_idx,
339 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
340 pstr->wcs[byte_idx++] = wcu;
341 /* Write paddings. */
342 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
343 pstr->wcs[byte_idx++] = WEOF;
344 }
345 else if (mbclen == (size_t) -1 || mbclen == 0)
346 {
347 /* It is an invalid character or '\0'. Just use the byte. */
348 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
349 pstr->mbs[byte_idx] = ch;
350 /* And also cast it to wide char. */
351 pstr->wcs[byte_idx++] = (wchar_t) ch;
352 if (BE (mbclen == (size_t) -1, 0))
353 pstr->cur_state = prev_st;
354 }
355 else
356 {
357 /* The buffer doesn't have enough space, finish to build. */
358 pstr->cur_state = prev_st;
359 break;
360 }
361 }
362 pstr->valid_len = byte_idx;
363 pstr->valid_raw_len = byte_idx;
364 return REG_NOERROR;
365 }
366 else
367 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
368 {
369 wchar_t wc;
370 const char *p;
371
372 offsets_needed:
373 remain_len = end_idx - byte_idx;
374 prev_st = pstr->cur_state;
375 if (BE (pstr->trans != NULL, 0))
376 {
377 int i, ch;
378
379 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380 {
381 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382 buf[i] = pstr->trans[ch];
383 }
384 p = (const char *) buf;
385 }
386 else
387 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389 if (BE (mbclen > 0, 1))
390 {
391 wchar_t wcu = wc;
392 if (iswlower (wc))
393 {
394 int mbcdlen;
395
396 wcu = towupper (wc);
397 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
398 if (BE (mbclen == mbcdlen, 1))
399 memcpy (pstr->mbs + byte_idx, buf, mbclen);
400 else
401 {
402 int i;
403
404 if (byte_idx + mbcdlen > pstr->bufs_len)
405 {
406 pstr->cur_state = prev_st;
407 break;
408 }
409
410 if (pstr->offsets == NULL)
411 {
412 pstr->offsets = re_malloc (int, pstr->bufs_len);
413
414 if (pstr->offsets == NULL)
415 return REG_ESPACE;
416 }
417 if (!pstr->offsets_needed)
418 {
419 for (i = 0; i < byte_idx; ++i)
420 pstr->offsets[i] = i;
421 pstr->offsets_needed = 1;
422 }
423
424 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
425 pstr->wcs[byte_idx] = wcu;
426 pstr->offsets[byte_idx] = src_idx;
427 for (i = 1; i < mbcdlen; ++i)
428 {
429 pstr->offsets[byte_idx + i]
430 = src_idx + (i < mbclen ? i : mbclen - 1);
431 pstr->wcs[byte_idx + i] = WEOF;
432 }
433 pstr->len += mbcdlen - mbclen;
434 if (pstr->raw_stop > src_idx)
435 pstr->stop += mbcdlen - mbclen;
436 end_idx = (pstr->bufs_len > pstr->len)
437 ? pstr->len : pstr->bufs_len;
438 byte_idx += mbcdlen;
439 src_idx += mbclen;
440 continue;
441 }
442 }
443 else
444 memcpy (pstr->mbs + byte_idx, p, mbclen);
445
446 if (BE (pstr->offsets_needed != 0, 0))
447 {
448 int i;
449 for (i = 0; i < mbclen; ++i)
450 pstr->offsets[byte_idx + i] = src_idx + i;
451 }
452 src_idx += mbclen;
453
454 pstr->wcs[byte_idx++] = wcu;
455 /* Write paddings. */
456 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
457 pstr->wcs[byte_idx++] = WEOF;
458 }
459 else if (mbclen == (size_t) -1 || mbclen == 0)
460 {
461 /* It is an invalid character or '\0'. Just use the byte. */
462 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
463
464 if (BE (pstr->trans != NULL, 0))
465 ch = pstr->trans [ch];
466 pstr->mbs[byte_idx] = ch;
467
468 if (BE (pstr->offsets_needed != 0, 0))
469 pstr->offsets[byte_idx] = src_idx;
470 ++src_idx;
471
472 /* And also cast it to wide char. */
473 pstr->wcs[byte_idx++] = (wchar_t) ch;
474 if (BE (mbclen == (size_t) -1, 0))
475 pstr->cur_state = prev_st;
476 }
477 else
478 {
479 /* The buffer doesn't have enough space, finish to build. */
480 pstr->cur_state = prev_st;
481 break;
482 }
483 }
484 pstr->valid_len = byte_idx;
485 pstr->valid_raw_len = src_idx;
486 return REG_NOERROR;
487}
488
489/* Skip characters until the index becomes greater than NEW_RAW_IDX.
490 Return the index. */
491
492static int
493re_string_skip_chars (pstr, new_raw_idx, last_wc)
494 re_string_t *pstr;
495 int new_raw_idx;
496 wint_t *last_wc;
497{
498 mbstate_t prev_st;
499 int rawbuf_idx, mbclen;
500 wchar_t wc = 0;
501
502 /* Skip the characters which are not necessary to check. */
503 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504 rawbuf_idx < new_raw_idx;)
505 {
506 int remain_len;
507 remain_len = pstr->len - rawbuf_idx;
508 prev_st = pstr->cur_state;
509 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
510 remain_len, &pstr->cur_state);
511 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
512 {
513 /* We treat these cases as a singlebyte character. */
514 mbclen = 1;
515 pstr->cur_state = prev_st;
516 }
517 /* Then proceed the next character. */
518 rawbuf_idx += mbclen;
519 }
520 *last_wc = (wint_t) wc;
521 return rawbuf_idx;
522}
523#endif /* RE_ENABLE_I18N */
524
525/* Build the buffer PSTR->MBS, and apply the translation if we need.
526 This function is used in case of REG_ICASE. */
527
528static void
529build_upper_buffer (pstr)
530 re_string_t *pstr;
531{
532 int char_idx, end_idx;
533 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
534
535 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
536 {
537 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
538 if (BE (pstr->trans != NULL, 0))
539 ch = pstr->trans[ch];
540 if (islower (ch))
541 pstr->mbs[char_idx] = toupper (ch);
542 else
543 pstr->mbs[char_idx] = ch;
544 }
545 pstr->valid_len = char_idx;
546 pstr->valid_raw_len = char_idx;
547}
548
549/* Apply TRANS to the buffer in PSTR. */
550
551static void
552re_string_translate_buffer (pstr)
553 re_string_t *pstr;
554{
555 int buf_idx, end_idx;
556 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
557
558 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
559 {
560 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
561 pstr->mbs[buf_idx] = pstr->trans[ch];
562 }
563
564 pstr->valid_len = buf_idx;
565 pstr->valid_raw_len = buf_idx;
566}
567
568/* This function re-construct the buffers.
569 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
570 convert to upper case in case of REG_ICASE, apply translation. */
571
572static reg_errcode_t
573re_string_reconstruct (pstr, idx, eflags)
574 re_string_t *pstr;
575 int idx, eflags;
576{
577 int offset = idx - pstr->raw_mbs_idx;
578 if (BE (offset < 0, 0))
579 {
580 /* Reset buffer. */
581#ifdef RE_ENABLE_I18N
582 if (pstr->mb_cur_max > 1)
583 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584#endif /* RE_ENABLE_I18N */
585 pstr->len = pstr->raw_len;
586 pstr->stop = pstr->raw_stop;
587 pstr->valid_len = 0;
588 pstr->raw_mbs_idx = 0;
589 pstr->valid_raw_len = 0;
590 pstr->offsets_needed = 0;
591 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
593 if (!pstr->mbs_allocated)
594 pstr->mbs = (unsigned char *) pstr->raw_mbs;
595 offset = idx;
596 }
597
598 if (BE (offset != 0, 1))
599 {
600 /* Are the characters which are already checked remain? */
601 if (BE (offset < pstr->valid_raw_len, 1)
602#ifdef RE_ENABLE_I18N
603 /* Handling this would enlarge the code too much.
604 Accept a slowdown in that case. */
605 && pstr->offsets_needed == 0
606#endif
607 )
608 {
609 /* Yes, move them to the front of the buffer. */
610 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
611#ifdef RE_ENABLE_I18N
612 if (pstr->mb_cur_max > 1)
613 memmove (pstr->wcs, pstr->wcs + offset,
614 (pstr->valid_len - offset) * sizeof (wint_t));
615#endif /* RE_ENABLE_I18N */
616 if (BE (pstr->mbs_allocated, 0))
617 memmove (pstr->mbs, pstr->mbs + offset,
618 pstr->valid_len - offset);
619 pstr->valid_len -= offset;
620 pstr->valid_raw_len -= offset;
621#if DEBUG
622 assert (pstr->valid_len > 0);
623#endif
624 }
625 else
626 {
627 /* No, skip all characters until IDX. */
628#ifdef RE_ENABLE_I18N
629 if (BE (pstr->offsets_needed, 0))
630 {
631 pstr->len = pstr->raw_len - idx + offset;
632 pstr->stop = pstr->raw_stop - idx + offset;
633 pstr->offsets_needed = 0;
634 }
635#endif
636 pstr->valid_len = 0;
637 pstr->valid_raw_len = 0;
638#ifdef RE_ENABLE_I18N
639 if (pstr->mb_cur_max > 1)
640 {
641 int wcs_idx;
642 wint_t wc = WEOF;
643
644 if (pstr->is_utf8)
645 {
646 const unsigned char *raw, *p, *q, *end;
647
648 /* Special case UTF-8. Multi-byte chars start with any
649 byte other than 0x80 - 0xbf. */
650 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
651 end = raw + (offset - pstr->mb_cur_max);
652 for (p = raw + offset - 1; p >= end; --p)
653 if ((*p & 0xc0) != 0x80)
654 {
655 mbstate_t cur_state;
656 wchar_t wc2;
657 int mlen = raw + pstr->len - p;
658 unsigned char buf[6];
659
660 q = p;
661 if (BE (pstr->trans != NULL, 0))
662 {
663 int i = mlen < 6 ? mlen : 6;
664 while (--i >= 0)
665 buf[i] = pstr->trans[p[i]];
666 q = buf;
667 }
668 /* XXX Don't use mbrtowc, we know which conversion
669 to use (UTF-8 -> UCS4). */
670 memset (&cur_state, 0, sizeof (cur_state));
671 mlen = mbrtowc (&wc2, p, mlen, &cur_state)
672 - (raw + offset - p);
673 if (mlen >= 0)
674 {
675 memset (&pstr->cur_state, '\0',
676 sizeof (mbstate_t));
677 pstr->valid_len = mlen;
678 wc = wc2;
679 }
680 break;
681 }
682 }
683
684 if (wc == WEOF)
685 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
686 if (BE (pstr->valid_len, 0))
687 {
688 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
689 pstr->wcs[wcs_idx] = WEOF;
690 if (pstr->mbs_allocated)
691 memset (pstr->mbs, 255, pstr->valid_len);
692 }
693 pstr->valid_raw_len = pstr->valid_len;
694 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
695 && IS_WIDE_WORD_CHAR (wc))
696 ? CONTEXT_WORD
697 : ((IS_WIDE_NEWLINE (wc)
698 && pstr->newline_anchor)
699 ? CONTEXT_NEWLINE : 0));
700 }
701 else
702#endif /* RE_ENABLE_I18N */
703 {
704 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
705 if (pstr->trans)
706 c = pstr->trans[c];
707 pstr->tip_context = (bitset_contain (pstr->word_char, c)
708 ? CONTEXT_WORD
709 : ((IS_NEWLINE (c) && pstr->newline_anchor)
710 ? CONTEXT_NEWLINE : 0));
711 }
712 }
713 if (!BE (pstr->mbs_allocated, 0))
714 pstr->mbs += offset;
715 }
716 pstr->raw_mbs_idx = idx;
717 pstr->len -= offset;
718 pstr->stop -= offset;
719
720 /* Then build the buffers. */
721#ifdef RE_ENABLE_I18N
722 if (pstr->mb_cur_max > 1)
723 {
724 if (pstr->icase)
725 {
726 int ret = build_wcs_upper_buffer (pstr);
727 if (BE (ret != REG_NOERROR, 0))
728 return ret;
729 }
730 else
731 build_wcs_buffer (pstr);
732 }
733 else
734#endif /* RE_ENABLE_I18N */
735 if (BE (pstr->mbs_allocated, 0))
736 {
737 if (pstr->icase)
738 build_upper_buffer (pstr);
739 else if (pstr->trans != NULL)
740 re_string_translate_buffer (pstr);
741 }
742 else
743 pstr->valid_len = pstr->len;
744
745 pstr->cur_idx = 0;
746 return REG_NOERROR;
747}
748
749static unsigned char
750re_string_peek_byte_case (pstr, idx)
751 const re_string_t *pstr;
752 int idx;
753{
754 int ch, off;
755
756 /* Handle the common (easiest) cases first. */
757 if (BE (!pstr->mbs_allocated, 1))
758 return re_string_peek_byte (pstr, idx);
759
760#ifdef RE_ENABLE_I18N
761 if (pstr->mb_cur_max > 1
762 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
763 return re_string_peek_byte (pstr, idx);
764#endif
765
766 off = pstr->cur_idx + idx;
767#ifdef RE_ENABLE_I18N
768 if (pstr->offsets_needed)
769 off = pstr->offsets[off];
770#endif
771
772 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
773
774#ifdef RE_ENABLE_I18N
775 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
776 this function returns CAPITAL LETTER I instead of first byte of
777 DOTLESS SMALL LETTER I. The latter would confuse the parser,
778 since peek_byte_case doesn't advance cur_idx in any way. */
779 if (pstr->offsets_needed && !isascii (ch))
780 return re_string_peek_byte (pstr, idx);
781#endif
782
783 return ch;
784}
785
786static unsigned char
787re_string_fetch_byte_case (pstr)
788 re_string_t *pstr;
789{
790 if (BE (!pstr->mbs_allocated, 1))
791 return re_string_fetch_byte (pstr);
792
793#ifdef RE_ENABLE_I18N
794 if (pstr->offsets_needed)
795 {
796 int off, ch;
797
798 /* For tr_TR.UTF-8 [[:islower:]] there is
799 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
800 in that case the whole multi-byte character and return
801 the original letter. On the other side, with
802 [[: DOTLESS SMALL LETTER I return [[:I, as doing
803 anything else would complicate things too much. */
804
805 if (!re_string_first_byte (pstr, pstr->cur_idx))
806 return re_string_fetch_byte (pstr);
807
808 off = pstr->offsets[pstr->cur_idx];
809 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
810
811 if (! isascii (ch))
812 return re_string_fetch_byte (pstr);
813
814 re_string_skip_bytes (pstr,
815 re_string_char_size_at (pstr, pstr->cur_idx));
816 return ch;
817 }
818#endif
819
820 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
821}
822
823static void
824re_string_destruct (pstr)
825 re_string_t *pstr;
826{
827#ifdef RE_ENABLE_I18N
828 re_free (pstr->wcs);
829 re_free (pstr->offsets);
830#endif /* RE_ENABLE_I18N */
831 if (pstr->mbs_allocated)
832 re_free (pstr->mbs);
833}
834
835/* Return the context at IDX in INPUT. */
836
837static unsigned int
838re_string_context_at (input, idx, eflags)
839 const re_string_t *input;
840 int idx, eflags;
841{
842 int c;
843 if (BE (idx < 0, 0))
844 /* In this case, we use the value stored in input->tip_context,
845 since we can't know the character in input->mbs[-1] here. */
846 return input->tip_context;
847 if (BE (idx == input->len, 0))
848 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
849 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
850#ifdef RE_ENABLE_I18N
851 if (input->mb_cur_max > 1)
852 {
853 wint_t wc;
854 int wc_idx = idx;
855 while(input->wcs[wc_idx] == WEOF)
856 {
857#ifdef DEBUG
858 /* It must not happen. */
859 assert (wc_idx >= 0);
860#endif
861 --wc_idx;
862 if (wc_idx < 0)
863 return input->tip_context;
864 }
865 wc = input->wcs[wc_idx];
866 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
867 return CONTEXT_WORD;
868 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
869 ? CONTEXT_NEWLINE : 0);
870 }
871 else
872#endif
873 {
874 c = re_string_byte_at (input, idx);
875 if (bitset_contain (input->word_char, c))
876 return CONTEXT_WORD;
877 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
878 }
879}
880
881
882/* Functions for set operation. */
883
884static reg_errcode_t
885re_node_set_alloc (set, size)
886 re_node_set *set;
887 int size;
888{
889 /*
890 * ADR: valgrind says size can be 0, which then doesn't
891 * free the block of size 0. Harumph. This seems
892 * to work ok, though.
893 */
894 if (size == 0)
895 {
896 memset(set, 0, sizeof(*set));
897 return REG_NOERROR;
898 }
899 set->alloc = size;
900 set->nelem = 0;
901 set->elems = re_malloc (int, size);
902 if (BE (set->elems == NULL, 0))
903 return REG_ESPACE;
904 return REG_NOERROR;
905}
906
907static reg_errcode_t
908re_node_set_init_1 (set, elem)
909 re_node_set *set;
910 int elem;
911{
912 set->alloc = 1;
913 set->nelem = 1;
914 set->elems = re_malloc (int, 1);
915 if (BE (set->elems == NULL, 0))
916 {
917 set->alloc = set->nelem = 0;
918 return REG_ESPACE;
919 }
920 set->elems[0] = elem;
921 return REG_NOERROR;
922}
923
924static reg_errcode_t
925re_node_set_init_2 (set, elem1, elem2)
926 re_node_set *set;
927 int elem1, elem2;
928{
929 set->alloc = 2;
930 set->elems = re_malloc (int, 2);
931 if (BE (set->elems == NULL, 0))
932 return REG_ESPACE;
933 if (elem1 == elem2)
934 {
935 set->nelem = 1;
936 set->elems[0] = elem1;
937 }
938 else
939 {
940 set->nelem = 2;
941 if (elem1 < elem2)
942 {
943 set->elems[0] = elem1;
944 set->elems[1] = elem2;
945 }
946 else
947 {
948 set->elems[0] = elem2;
949 set->elems[1] = elem1;
950 }
951 }
952 return REG_NOERROR;
953}
954
955static reg_errcode_t
956re_node_set_init_copy (dest, src)
957 re_node_set *dest;
958 const re_node_set *src;
959{
960 dest->nelem = src->nelem;
961 if (src->nelem > 0)
962 {
963 dest->alloc = dest->nelem;
964 dest->elems = re_malloc (int, dest->alloc);
965 if (BE (dest->elems == NULL, 0))
966 {
967 dest->alloc = dest->nelem = 0;
968 return REG_ESPACE;
969 }
970 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
971 }
972 else
973 re_node_set_init_empty (dest);
974 return REG_NOERROR;
975}
976
977/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
978 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
979 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
980
981static reg_errcode_t
982re_node_set_add_intersect (dest, src1, src2)
983 re_node_set *dest;
984 const re_node_set *src1, *src2;
985{
986 int i1, i2, is, id, delta, sbase;
987 if (src1->nelem == 0 || src2->nelem == 0)
988 return REG_NOERROR;
989
990 /* We need dest->nelem + 2 * elems_in_intersection; this is a
991 conservative estimate. */
992 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
993 {
994 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
995 int *new_elems = re_realloc (dest->elems, int, new_alloc);
996 if (BE (new_elems == NULL, 0))
997 return REG_ESPACE;
998 dest->elems = new_elems;
999 dest->alloc = new_alloc;
1000 }
1001
1002 /* Find the items in the intersection of SRC1 and SRC2, and copy
1003 into the top of DEST those that are not already in DEST itself. */
1004 sbase = dest->nelem + src1->nelem + src2->nelem;
1005 i1 = src1->nelem - 1;
1006 i2 = src2->nelem - 1;
1007 id = dest->nelem - 1;
1008 for (;;)
1009 {
1010 if (src1->elems[i1] == src2->elems[i2])
1011 {
1012 /* Try to find the item in DEST. Maybe we could binary search? */
1013 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1014 --id;
1015
1016 if (id < 0 || dest->elems[id] != src1->elems[i1])
1017 dest->elems[--sbase] = src1->elems[i1];
1018
1019 if (--i1 < 0 || --i2 < 0)
1020 break;
1021 }
1022
1023 /* Lower the highest of the two items. */
1024 else if (src1->elems[i1] < src2->elems[i2])
1025 {
1026 if (--i2 < 0)
1027 break;
1028 }
1029 else
1030 {
1031 if (--i1 < 0)
1032 break;
1033 }
1034 }
1035
1036 id = dest->nelem - 1;
1037 is = dest->nelem + src1->nelem + src2->nelem - 1;
1038 delta = is - sbase + 1;
1039
1040 /* Now copy. When DELTA becomes zero, the remaining
1041 DEST elements are already in place; this is more or
1042 less the same loop that is in re_node_set_merge. */
1043 dest->nelem += delta;
1044 if (delta > 0 && id >= 0)
1045 for (;;)
1046 {
1047 if (dest->elems[is] > dest->elems[id])
1048 {
1049 /* Copy from the top. */
1050 dest->elems[id + delta--] = dest->elems[is--];
1051 if (delta == 0)
1052 break;
1053 }
1054 else
1055 {
1056 /* Slide from the bottom. */
1057 dest->elems[id + delta] = dest->elems[id];
1058 if (--id < 0)
1059 break;
1060 }
1061 }
1062
1063 /* Copy remaining SRC elements. */
1064 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1065
1066 return REG_NOERROR;
1067}
1068
1069/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1070 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1071
1072static reg_errcode_t
1073re_node_set_init_union (dest, src1, src2)
1074 re_node_set *dest;
1075 const re_node_set *src1, *src2;
1076{
1077 int i1, i2, id;
1078 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1079 {
1080 dest->alloc = src1->nelem + src2->nelem;
1081 dest->elems = re_malloc (int, dest->alloc);
1082 if (BE (dest->elems == NULL, 0))
1083 return REG_ESPACE;
1084 }
1085 else
1086 {
1087 if (src1 != NULL && src1->nelem > 0)
1088 return re_node_set_init_copy (dest, src1);
1089 else if (src2 != NULL && src2->nelem > 0)
1090 return re_node_set_init_copy (dest, src2);
1091 else
1092 re_node_set_init_empty (dest);
1093 return REG_NOERROR;
1094 }
1095 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1096 {
1097 if (src1->elems[i1] > src2->elems[i2])
1098 {
1099 dest->elems[id++] = src2->elems[i2++];
1100 continue;
1101 }
1102 if (src1->elems[i1] == src2->elems[i2])
1103 ++i2;
1104 dest->elems[id++] = src1->elems[i1++];
1105 }
1106 if (i1 < src1->nelem)
1107 {
1108 memcpy (dest->elems + id, src1->elems + i1,
1109 (src1->nelem - i1) * sizeof (int));
1110 id += src1->nelem - i1;
1111 }
1112 else if (i2 < src2->nelem)
1113 {
1114 memcpy (dest->elems + id, src2->elems + i2,
1115 (src2->nelem - i2) * sizeof (int));
1116 id += src2->nelem - i2;
1117 }
1118 dest->nelem = id;
1119 return REG_NOERROR;
1120}
1121
1122/* Calculate the union set of the sets DEST and SRC. And store it to
1123 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1124
1125static reg_errcode_t
1126re_node_set_merge (dest, src)
1127 re_node_set *dest;
1128 const re_node_set *src;
1129{
1130 int is, id, sbase, delta;
1131 if (src == NULL || src->nelem == 0)
1132 return REG_NOERROR;
1133 if (dest->alloc < 2 * src->nelem + dest->nelem)
1134 {
1135 int new_alloc = 2 * (src->nelem + dest->alloc);
1136 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1137 if (BE (new_buffer == NULL, 0))
1138 return REG_ESPACE;
1139 dest->elems = new_buffer;
1140 dest->alloc = new_alloc;
1141 }
1142
1143 if (BE (dest->nelem == 0, 0))
1144 {
1145 dest->nelem = src->nelem;
1146 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1147 return REG_NOERROR;
1148 }
1149
1150 /* Copy into the top of DEST the items of SRC that are not
1151 found in DEST. Maybe we could binary search in DEST? */
1152 for (sbase = dest->nelem + 2 * src->nelem,
1153 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1154 {
1155 if (dest->elems[id] == src->elems[is])
1156 is--, id--;
1157 else if (dest->elems[id] < src->elems[is])
1158 dest->elems[--sbase] = src->elems[is--];
1159 else /* if (dest->elems[id] > src->elems[is]) */
1160 --id;
1161 }
1162
1163 if (is >= 0)
1164 {
1165 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1166 sbase -= is + 1;
1167 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1168 }
1169
1170 id = dest->nelem - 1;
1171 is = dest->nelem + 2 * src->nelem - 1;
1172 delta = is - sbase + 1;
1173 if (delta == 0)
1174 return REG_NOERROR;
1175
1176 /* Now copy. When DELTA becomes zero, the remaining
1177 DEST elements are already in place. */
1178 dest->nelem += delta;
1179 for (;;)
1180 {
1181 if (dest->elems[is] > dest->elems[id])
1182 {
1183 /* Copy from the top. */
1184 dest->elems[id + delta--] = dest->elems[is--];
1185 if (delta == 0)
1186 break;
1187 }
1188 else
1189 {
1190 /* Slide from the bottom. */
1191 dest->elems[id + delta] = dest->elems[id];
1192 if (--id < 0)
1193 {
1194 /* Copy remaining SRC elements. */
1195 memcpy (dest->elems, dest->elems + sbase,
1196 delta * sizeof (int));
1197 break;
1198 }
1199 }
1200 }
1201
1202 return REG_NOERROR;
1203}
1204
1205/* Insert the new element ELEM to the re_node_set* SET.
1206 SET should not already have ELEM.
1207 return -1 if an error is occured, return 1 otherwise. */
1208
1209static int
1210re_node_set_insert (set, elem)
1211 re_node_set *set;
1212 int elem;
1213{
1214 int idx;
1215 /* In case the set is empty. */
1216 if (set->alloc == 0)
1217 {
1218 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1219 return 1;
1220 else
1221 return -1;
1222 }
1223
1224 if (BE (set->nelem, 0) == 0)
1225 {
1226 /* We already guaranteed above that set->alloc != 0. */
1227 set->elems[0] = elem;
1228 ++set->nelem;
1229 return 1;
1230 }
1231
1232 /* Realloc if we need. */
1233 if (set->alloc == set->nelem)
1234 {
1235 int *new_array;
1236 set->alloc = set->alloc * 2;
1237 new_array = re_realloc (set->elems, int, set->alloc);
1238 if (BE (new_array == NULL, 0))
1239 return -1;
1240 set->elems = new_array;
1241 }
1242
1243 /* Move the elements which follows the new element. Test the
1244 first element separately to skip a check in the inner loop. */
1245 if (elem < set->elems[0])
1246 {
1247 idx = 0;
1248 for (idx = set->nelem; idx > 0; idx--)
1249 set->elems[idx] = set->elems[idx - 1];
1250 }
1251 else
1252 {
1253 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1254 set->elems[idx] = set->elems[idx - 1];
1255 }
1256
1257 /* Insert the new element. */
1258 set->elems[idx] = elem;
1259 ++set->nelem;
1260 return 1;
1261}
1262
1263/* Insert the new element ELEM to the re_node_set* SET.
1264 SET should not already have any element greater than or equal to ELEM.
1265 Return -1 if an error is occured, return 1 otherwise. */
1266
1267static int
1268re_node_set_insert_last (set, elem)
1269 re_node_set *set;
1270 int elem;
1271{
1272 /* Realloc if we need. */
1273 if (set->alloc == set->nelem)
1274 {
1275 int *new_array;
1276 set->alloc = (set->alloc + 1) * 2;
1277 new_array = re_realloc (set->elems, int, set->alloc);
1278 if (BE (new_array == NULL, 0))
1279 return -1;
1280 set->elems = new_array;
1281 }
1282
1283 /* Insert the new element. */
1284 set->elems[set->nelem++] = elem;
1285 return 1;
1286}
1287
1288/* Compare two node sets SET1 and SET2.
1289 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1290
1291static int
1292re_node_set_compare (set1, set2)
1293 const re_node_set *set1, *set2;
1294{
1295 int i;
1296 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1297 return 0;
1298 for (i = set1->nelem ; --i >= 0 ; )
1299 if (set1->elems[i] != set2->elems[i])
1300 return 0;
1301 return 1;
1302}
1303
1304/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1305
1306static int
1307re_node_set_contains (set, elem)
1308 const re_node_set *set;
1309 int elem;
1310{
1311 unsigned int idx, right, mid;
1312 if (set->nelem <= 0)
1313 return 0;
1314
1315 /* Binary search the element. */
1316 idx = 0;
1317 right = set->nelem - 1;
1318 while (idx < right)
1319 {
1320 mid = (idx + right) / 2;
1321 if (set->elems[mid] < elem)
1322 idx = mid + 1;
1323 else
1324 right = mid;
1325 }
1326 return set->elems[idx] == elem ? idx + 1 : 0;
1327}
1328
1329static void
1330re_node_set_remove_at (set, idx)
1331 re_node_set *set;
1332 int idx;
1333{
1334 if (idx < 0 || idx >= set->nelem)
1335 return;
1336 --set->nelem;
1337 for (; idx < set->nelem; idx++)
1338 set->elems[idx] = set->elems[idx + 1];
1339}
1340
1341
1342
1343/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1344 Or return -1, if an error will be occured. */
1345
1346static int
1347re_dfa_add_node (dfa, token)
1348 re_dfa_t *dfa;
1349 re_token_t token;
1350{
1351 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1352 {
1353 int new_nodes_alloc = dfa->nodes_alloc * 2;
1354 int *new_nexts, *new_indices;
1355 re_node_set *new_edests, *new_eclosures;
1356
1357 re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1358 new_nodes_alloc);
1359 if (BE (new_array == NULL, 0))
1360 return -1;
1361 dfa->nodes = new_array;
1362 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1363 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1364 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1365 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1366 if (BE (new_nexts == NULL || new_indices == NULL
1367 || new_edests == NULL || new_eclosures == NULL, 0))
1368 return -1;
1369 dfa->nexts = new_nexts;
1370 dfa->org_indices = new_indices;
1371 dfa->edests = new_edests;
1372 dfa->eclosures = new_eclosures;
1373 dfa->nodes_alloc = new_nodes_alloc;
1374 }
1375 dfa->nodes[dfa->nodes_len] = token;
1376 dfa->nodes[dfa->nodes_len].constraint = 0;
1377#ifdef RE_ENABLE_I18N
1378 dfa->nodes[dfa->nodes_len].accept_mb =
1379 (token.type == OP_PERIOD && dfa->mb_cur_max > 1) || token.type == COMPLEX_BRACKET;
1380#endif
1381 dfa->nexts[dfa->nodes_len] = -1;
1382 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1383 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1384 return dfa->nodes_len++;
1385}
1386
1387static unsigned int inline
1388calc_state_hash (nodes, context)
1389 const re_node_set *nodes;
1390 unsigned int context;
1391{
1392 unsigned int hash = nodes->nelem + context;
1393 int i;
1394 for (i = 0 ; i < nodes->nelem ; i++)
1395 hash += nodes->elems[i];
1396 return hash;
1397}
1398
1399/* Search for the state whose node_set is equivalent to NODES.
1400 Return the pointer to the state, if we found it in the DFA.
1401 Otherwise create the new one and return it. In case of an error
1402 return NULL and set the error code in ERR.
1403 Note: - We assume NULL as the invalid state, then it is possible that
1404 return value is NULL and ERR is REG_NOERROR.
1405 - We never return non-NULL value in case of any errors, it is for
1406 optimization. */
1407
1408static re_dfastate_t*
1409re_acquire_state (err, dfa, nodes)
1410 reg_errcode_t *err;
1411 re_dfa_t *dfa;
1412 const re_node_set *nodes;
1413{
1414 unsigned int hash;
1415 re_dfastate_t *new_state;
1416 struct re_state_table_entry *spot;
1417 int i;
1418 if (BE (nodes->nelem == 0, 0))
1419 {
1420 *err = REG_NOERROR;
1421 return NULL;
1422 }
1423 hash = calc_state_hash (nodes, 0);
1424 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1425
1426 for (i = 0 ; i < spot->num ; i++)
1427 {
1428 re_dfastate_t *state = spot->array[i];
1429 if (hash != state->hash)
1430 continue;
1431 if (re_node_set_compare (&state->nodes, nodes))
1432 return state;
1433 }
1434
1435 /* There are no appropriate state in the dfa, create the new one. */
1436 new_state = create_ci_newstate (dfa, nodes, hash);
1437 if (BE (new_state != NULL, 1))
1438 return new_state;
1439 else
1440 {
1441 *err = REG_ESPACE;
1442 return NULL;
1443 }
1444}
1445
1446/* Search for the state whose node_set is equivalent to NODES and
1447 whose context is equivalent to CONTEXT.
1448 Return the pointer to the state, if we found it in the DFA.
1449 Otherwise create the new one and return it. In case of an error
1450 return NULL and set the error code in ERR.
1451 Note: - We assume NULL as the invalid state, then it is possible that
1452 return value is NULL and ERR is REG_NOERROR.
1453 - We never return non-NULL value in case of any errors, it is for
1454 optimization. */
1455
1456static re_dfastate_t*
1457re_acquire_state_context (err, dfa, nodes, context)
1458 reg_errcode_t *err;
1459 re_dfa_t *dfa;
1460 const re_node_set *nodes;
1461 unsigned int context;
1462{
1463 unsigned int hash;
1464 re_dfastate_t *new_state;
1465 struct re_state_table_entry *spot;
1466 int i;
1467 if (nodes->nelem == 0)
1468 {
1469 *err = REG_NOERROR;
1470 return NULL;
1471 }
1472 hash = calc_state_hash (nodes, context);
1473 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1474
1475 for (i = 0 ; i < spot->num ; i++)
1476 {
1477 re_dfastate_t *state = spot->array[i];
1478 if (state->hash == hash
1479 && state->context == context
1480 && re_node_set_compare (state->entrance_nodes, nodes))
1481 return state;
1482 }
1483 /* There are no appropriate state in `dfa', create the new one. */
1484 new_state = create_cd_newstate (dfa, nodes, context, hash);
1485 if (BE (new_state != NULL, 1))
1486 return new_state;
1487 else
1488 {
1489 *err = REG_ESPACE;
1490 return NULL;
1491 }
1492}
1493
1494/* Finish initialization of the new state NEWSTATE, and using its hash value
1495 HASH put in the appropriate bucket of DFA's state table. Return value
1496 indicates the error code if failed. */
1497
1498static reg_errcode_t
1499register_state (dfa, newstate, hash)
1500 re_dfa_t *dfa;
1501 re_dfastate_t *newstate;
1502 unsigned int hash;
1503{
1504 struct re_state_table_entry *spot;
1505 reg_errcode_t err;
1506 int i;
1507
1508 newstate->hash = hash;
1509 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1510 if (BE (err != REG_NOERROR, 0))
1511 return REG_ESPACE;
1512 for (i = 0; i < newstate->nodes.nelem; i++)
1513 {
1514 int elem = newstate->nodes.elems[i];
1515 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1516 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1517 }
1518
1519 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1520 if (BE (spot->alloc <= spot->num, 0))
1521 {
1522 int new_alloc = 2 * spot->num + 2;
1523 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1524 new_alloc);
1525 if (BE (new_array == NULL, 0))
1526 return REG_ESPACE;
1527 spot->array = new_array;
1528 spot->alloc = new_alloc;
1529 }
1530 spot->array[spot->num++] = newstate;
1531 return REG_NOERROR;
1532}
1533
1534/* Create the new state which is independ of contexts.
1535 Return the new state if succeeded, otherwise return NULL. */
1536
1537static re_dfastate_t *
1538create_ci_newstate (dfa, nodes, hash)
1539 re_dfa_t *dfa;
1540 const re_node_set *nodes;
1541 unsigned int hash;
1542{
1543 int i;
1544 reg_errcode_t err;
1545 re_dfastate_t *newstate;
1546
1547 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1548 if (BE (newstate == NULL, 0))
1549 return NULL;
1550 err = re_node_set_init_copy (&newstate->nodes, nodes);
1551 if (BE (err != REG_NOERROR, 0))
1552 {
1553 re_free (newstate);
1554 return NULL;
1555 }
1556
1557 newstate->entrance_nodes = &newstate->nodes;
1558 for (i = 0 ; i < nodes->nelem ; i++)
1559 {
1560 re_token_t *node = dfa->nodes + nodes->elems[i];
1561 re_token_type_t type = node->type;
1562 if (type == CHARACTER && !node->constraint)
1563 continue;
1564#ifdef RE_ENABLE_I18N
1565 newstate->accept_mb |= node->accept_mb;
1566#endif /* RE_ENABLE_I18N */
1567
1568 /* If the state has the halt node, the state is a halt state. */
1569 if (type == END_OF_RE)
1570 newstate->halt = 1;
1571 else if (type == OP_BACK_REF)
1572 newstate->has_backref = 1;
1573 else if (type == ANCHOR || node->constraint)
1574 newstate->has_constraint = 1;
1575 }
1576 err = register_state (dfa, newstate, hash);
1577 if (BE (err != REG_NOERROR, 0))
1578 {
1579 free_state (newstate);
1580 newstate = NULL;
1581 }
1582 return newstate;
1583}
1584
1585/* Create the new state which is depend on the context CONTEXT.
1586 Return the new state if succeeded, otherwise return NULL. */
1587
1588static re_dfastate_t *
1589create_cd_newstate (dfa, nodes, context, hash)
1590 re_dfa_t *dfa;
1591 const re_node_set *nodes;
1592 unsigned int context, hash;
1593{
1594 int i, nctx_nodes = 0;
1595 reg_errcode_t err;
1596 re_dfastate_t *newstate;
1597
1598 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1599 if (BE (newstate == NULL, 0))
1600 return NULL;
1601 err = re_node_set_init_copy (&newstate->nodes, nodes);
1602 if (BE (err != REG_NOERROR, 0))
1603 {
1604 re_free (newstate);
1605 return NULL;
1606 }
1607
1608 newstate->context = context;
1609 newstate->entrance_nodes = &newstate->nodes;
1610
1611 for (i = 0 ; i < nodes->nelem ; i++)
1612 {
1613 unsigned int constraint = 0;
1614 re_token_t *node = dfa->nodes + nodes->elems[i];
1615 re_token_type_t type = node->type;
1616 if (node->constraint)
1617 constraint = node->constraint;
1618
1619 if (type == CHARACTER && !constraint)
1620 continue;
1621#ifdef RE_ENABLE_I18N
1622 newstate->accept_mb |= node->accept_mb;
1623#endif /* RE_ENABLE_I18N */
1624
1625 /* If the state has the halt node, the state is a halt state. */
1626 if (type == END_OF_RE)
1627 newstate->halt = 1;
1628 else if (type == OP_BACK_REF)
1629 newstate->has_backref = 1;
1630 else if (type == ANCHOR)
1631 constraint = node->opr.ctx_type;
1632
1633 if (constraint)
1634 {
1635 if (newstate->entrance_nodes == &newstate->nodes)
1636 {
1637 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1638 if (BE (newstate->entrance_nodes == NULL, 0))
1639 {
1640 free_state (newstate);
1641 return NULL;
1642 }
1643 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1644 nctx_nodes = 0;
1645 newstate->has_constraint = 1;
1646 }
1647
1648 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1649 {
1650 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1651 ++nctx_nodes;
1652 }
1653 }
1654 }
1655 err = register_state (dfa, newstate, hash);
1656 if (BE (err != REG_NOERROR, 0))
1657 {
1658 free_state (newstate);
1659 newstate = NULL;
1660 }
1661 return newstate;
1662}
1663
1664static void
1665free_state (state)
1666 re_dfastate_t *state;
1667{
1668 re_node_set_free (&state->non_eps_nodes);
1669 re_node_set_free (&state->inveclosure);
1670 if (state->entrance_nodes != &state->nodes)
1671 {
1672 re_node_set_free (state->entrance_nodes);
1673 re_free (state->entrance_nodes);
1674 }
1675 re_node_set_free (&state->nodes);
1676 re_free (state->word_trtable);
1677 re_free (state->trtable);
1678 re_free (state);
1679}
Note: See TracBrowser for help on using the repository browser.