source: trunk/essentials/app-arch/tar/lib/regex_internal.c

Last change on this file was 3342, checked in by bird, 18 years ago

tar 1.16.1

File size: 46.9 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash) internal_function;
31
32
33/* Functions for string operation. */
34
35/* This function allocate the buffers. It is necessary to call
36 re_string_reconstruct before using the object. */
37
38static reg_errcode_t
39internal_function
40re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
41 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
42{
43 reg_errcode_t ret;
44 Idx init_buf_len;
45
46 /* Ensure at least one character fits into the buffers. */
47 if (init_len < dfa->mb_cur_max)
48 init_len = dfa->mb_cur_max;
49 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52 ret = re_string_realloc_buffers (pstr, init_buf_len);
53 if (BE (ret != REG_NOERROR, 0))
54 return ret;
55
56 pstr->word_char = dfa->word_char;
57 pstr->word_ops_used = dfa->word_ops_used;
58 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
60 pstr->valid_raw_len = pstr->valid_len;
61 return REG_NOERROR;
62}
63
64/* This function allocate the buffers, and initialize them. */
65
66static reg_errcode_t
67internal_function
68re_string_construct (re_string_t *pstr, const char *str, Idx len,
69 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
70{
71 reg_errcode_t ret;
72 memset (pstr, '\0', sizeof (re_string_t));
73 re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75 if (len > 0)
76 {
77 ret = re_string_realloc_buffers (pstr, len + 1);
78 if (BE (ret != REG_NOERROR, 0))
79 return ret;
80 }
81 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83 if (icase)
84 {
85#ifdef RE_ENABLE_I18N
86 if (dfa->mb_cur_max > 1)
87 {
88 while (1)
89 {
90 ret = build_wcs_upper_buffer (pstr);
91 if (BE (ret != REG_NOERROR, 0))
92 return ret;
93 if (pstr->valid_raw_len >= len)
94 break;
95 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 break;
97 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98 if (BE (ret != REG_NOERROR, 0))
99 return ret;
100 }
101 }
102 else
103#endif /* RE_ENABLE_I18N */
104 build_upper_buffer (pstr);
105 }
106 else
107 {
108#ifdef RE_ENABLE_I18N
109 if (dfa->mb_cur_max > 1)
110 build_wcs_buffer (pstr);
111 else
112#endif /* RE_ENABLE_I18N */
113 {
114 if (trans != NULL)
115 re_string_translate_buffer (pstr);
116 else
117 {
118 pstr->valid_len = pstr->bufs_len;
119 pstr->valid_raw_len = pstr->bufs_len;
120 }
121 }
122 }
123
124 return REG_NOERROR;
125}
126
127/* Helper functions for re_string_allocate, and re_string_construct. */
128
129static reg_errcode_t
130internal_function
131re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132{
133#ifdef RE_ENABLE_I18N
134 if (pstr->mb_cur_max > 1)
135 {
136 wint_t *new_wcs;
137
138 /* Avoid overflow. */
139 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
140 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141 return REG_ESPACE;
142
143 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144 if (BE (new_wcs == NULL, 0))
145 return REG_ESPACE;
146 pstr->wcs = new_wcs;
147 if (pstr->offsets != NULL)
148 {
149 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
150 if (BE (new_offsets == NULL, 0))
151 return REG_ESPACE;
152 pstr->offsets = new_offsets;
153 }
154 }
155#endif /* RE_ENABLE_I18N */
156 if (pstr->mbs_allocated)
157 {
158 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159 new_buf_len);
160 if (BE (new_mbs == NULL, 0))
161 return REG_ESPACE;
162 pstr->mbs = new_mbs;
163 }
164 pstr->bufs_len = new_buf_len;
165 return REG_NOERROR;
166}
167
168
169static void
170internal_function
171re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
172 RE_TRANSLATE_TYPE trans, bool icase,
173 const re_dfa_t *dfa)
174{
175 pstr->raw_mbs = (const unsigned char *) str;
176 pstr->len = len;
177 pstr->raw_len = len;
178 pstr->trans = trans;
179 pstr->icase = icase;
180 pstr->mbs_allocated = (trans != NULL || icase);
181 pstr->mb_cur_max = dfa->mb_cur_max;
182 pstr->is_utf8 = dfa->is_utf8;
183 pstr->map_notascii = dfa->map_notascii;
184 pstr->stop = pstr->len;
185 pstr->raw_stop = pstr->stop;
186}
187
188#ifdef RE_ENABLE_I18N
189
190/* Build wide character buffer PSTR->WCS.
191 If the byte sequence of the string are:
192 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193 Then wide character buffer will be:
194 <wc1> , WEOF , <wc2> , WEOF , <wc3>
195 We use WEOF for padding, they indicate that the position isn't
196 a first byte of a multibyte character.
197
198 Note that this function assumes PSTR->VALID_LEN elements are already
199 built and starts from PSTR->VALID_LEN. */
200
201static void
202internal_function
203build_wcs_buffer (re_string_t *pstr)
204{
205#ifdef _LIBC
206 unsigned char buf[MB_LEN_MAX];
207 assert (MB_LEN_MAX >= pstr->mb_cur_max);
208#else
209 unsigned char buf[64];
210#endif
211 mbstate_t prev_st;
212 Idx byte_idx, end_idx, remain_len;
213 size_t mbclen;
214
215 /* Build the buffers from pstr->valid_len to either pstr->len or
216 pstr->bufs_len. */
217 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219 {
220 wchar_t wc;
221 const char *p;
222
223 remain_len = end_idx - byte_idx;
224 prev_st = pstr->cur_state;
225 /* Apply the translation if we need. */
226 if (BE (pstr->trans != NULL, 0))
227 {
228 int i, ch;
229
230 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231 {
232 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234 }
235 p = (const char *) buf;
236 }
237 else
238 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240 if (BE (mbclen == (size_t) -2, 0))
241 {
242 /* The buffer doesn't have enough space, finish to build. */
243 pstr->cur_state = prev_st;
244 break;
245 }
246 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
247 {
248 /* We treat these cases as a singlebyte character. */
249 mbclen = 1;
250 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251 if (BE (pstr->trans != NULL, 0))
252 wc = pstr->trans[wc];
253 pstr->cur_state = prev_st;
254 }
255
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
261 }
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
264}
265
266/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
268
269static reg_errcode_t
270internal_function
271build_wcs_upper_buffer (re_string_t *pstr)
272{
273 mbstate_t prev_st;
274 Idx src_idx, byte_idx, end_idx, remain_len;
275 size_t mbclen;
276#ifdef _LIBC
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
279#else
280 char buf[64];
281#endif
282
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 {
290 while (byte_idx < end_idx)
291 {
292 wchar_t wc;
293
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
296 {
297 /* In case of a singlebyte character. */
298 pstr->mbs[byte_idx]
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 ++byte_idx;
304 continue;
305 }
306
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen < (size_t) -2, 1))
313 {
314 wchar_t wcu = wc;
315 if (iswlower (wc))
316 {
317 size_t mbcdlen;
318
319 wcu = towupper (wc);
320 mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 if (BE (mbclen == mbcdlen, 1))
322 memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 else
324 {
325 src_idx = byte_idx;
326 goto offsets_needed;
327 }
328 }
329 else
330 memcpy (pstr->mbs + byte_idx,
331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 pstr->wcs[byte_idx++] = wcu;
333 /* Write paddings. */
334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 pstr->wcs[byte_idx++] = WEOF;
336 }
337 else if (mbclen == (size_t) -1 || mbclen == 0)
338 {
339 /* It is an invalid character or '\0'. Just use the byte. */
340 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341 pstr->mbs[byte_idx] = ch;
342 /* And also cast it to wide char. */
343 pstr->wcs[byte_idx++] = (wchar_t) ch;
344 if (BE (mbclen == (size_t) -1, 0))
345 pstr->cur_state = prev_st;
346 }
347 else
348 {
349 /* The buffer doesn't have enough space, finish to build. */
350 pstr->cur_state = prev_st;
351 break;
352 }
353 }
354 pstr->valid_len = byte_idx;
355 pstr->valid_raw_len = byte_idx;
356 return REG_NOERROR;
357 }
358 else
359 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
360 {
361 wchar_t wc;
362 const char *p;
363 offsets_needed:
364 remain_len = end_idx - byte_idx;
365 prev_st = pstr->cur_state;
366 if (BE (pstr->trans != NULL, 0))
367 {
368 int i, ch;
369
370 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371 {
372 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373 buf[i] = pstr->trans[ch];
374 }
375 p = (const char *) buf;
376 }
377 else
378 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380 if (BE (mbclen < (size_t) -2, 1))
381 {
382 wchar_t wcu = wc;
383 if (iswlower (wc))
384 {
385 size_t mbcdlen;
386
387 wcu = towupper (wc);
388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
392 {
393 size_t i;
394
395 if (byte_idx + mbcdlen > pstr->bufs_len)
396 {
397 pstr->cur_state = prev_st;
398 break;
399 }
400
401 if (pstr->offsets == NULL)
402 {
403 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404
405 if (pstr->offsets == NULL)
406 return REG_ESPACE;
407 }
408 if (!pstr->offsets_needed)
409 {
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
413 }
414
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
419 {
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
423 }
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
429 byte_idx += mbcdlen;
430 src_idx += mbclen;
431 continue;
432 }
433 else
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
435 }
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
438
439 if (BE (pstr->offsets_needed != 0, 0))
440 {
441 size_t i;
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
444 }
445 src_idx += mbclen;
446
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
451 }
452 else if (mbclen == (size_t) -1 || mbclen == 0)
453 {
454 /* It is an invalid character or '\0'. Just use the byte. */
455 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
456
457 if (BE (pstr->trans != NULL, 0))
458 ch = pstr->trans [ch];
459 pstr->mbs[byte_idx] = ch;
460
461 if (BE (pstr->offsets_needed != 0, 0))
462 pstr->offsets[byte_idx] = src_idx;
463 ++src_idx;
464
465 /* And also cast it to wide char. */
466 pstr->wcs[byte_idx++] = (wchar_t) ch;
467 if (BE (mbclen == (size_t) -1, 0))
468 pstr->cur_state = prev_st;
469 }
470 else
471 {
472 /* The buffer doesn't have enough space, finish to build. */
473 pstr->cur_state = prev_st;
474 break;
475 }
476 }
477 pstr->valid_len = byte_idx;
478 pstr->valid_raw_len = src_idx;
479 return REG_NOERROR;
480}
481
482/* Skip characters until the index becomes greater than NEW_RAW_IDX.
483 Return the index. */
484
485static Idx
486internal_function
487re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
488{
489 mbstate_t prev_st;
490 Idx rawbuf_idx;
491 size_t mbclen;
492 wint_t wc = WEOF;
493
494 /* Skip the characters which are not necessary to check. */
495 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496 rawbuf_idx < new_raw_idx;)
497 {
498 wchar_t wc2;
499 Idx remain_len;
500 remain_len = pstr->len - rawbuf_idx;
501 prev_st = pstr->cur_state;
502 mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503 remain_len, &pstr->cur_state);
504 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 {
506 /* We treat these cases as a single byte character. */
507 if (mbclen == 0 || remain_len == 0)
508 wc = L'\0';
509 else
510 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511 mbclen = 1;
512 pstr->cur_state = prev_st;
513 }
514 else
515 wc = wc2;
516 /* Then proceed the next character. */
517 rawbuf_idx += mbclen;
518 }
519 *last_wc = wc;
520 return rawbuf_idx;
521}
522#endif /* RE_ENABLE_I18N */
523
524/* Build the buffer PSTR->MBS, and apply the translation if we need.
525 This function is used in case of REG_ICASE. */
526
527static void
528internal_function
529build_upper_buffer (re_string_t *pstr)
530{
531 Idx char_idx, end_idx;
532 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 {
536 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537 if (BE (pstr->trans != NULL, 0))
538 ch = pstr->trans[ch];
539 if (islower (ch))
540 pstr->mbs[char_idx] = toupper (ch);
541 else
542 pstr->mbs[char_idx] = ch;
543 }
544 pstr->valid_len = char_idx;
545 pstr->valid_raw_len = char_idx;
546}
547
548/* Apply TRANS to the buffer in PSTR. */
549
550static void
551internal_function
552re_string_translate_buffer (re_string_t *pstr)
553{
554 Idx buf_idx, end_idx;
555 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556
557 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558 {
559 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
560 pstr->mbs[buf_idx] = pstr->trans[ch];
561 }
562
563 pstr->valid_len = buf_idx;
564 pstr->valid_raw_len = buf_idx;
565}
566
567/* This function re-construct the buffers.
568 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
569 convert to upper case in case of REG_ICASE, apply translation. */
570
571static reg_errcode_t
572internal_function
573re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
574{
575 Idx offset;
576
577 if (BE (pstr->raw_mbs_idx <= idx, 0))
578 offset = idx - pstr->raw_mbs_idx;
579 else
580 {
581 /* Reset buffer. */
582#ifdef RE_ENABLE_I18N
583 if (pstr->mb_cur_max > 1)
584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585#endif /* RE_ENABLE_I18N */
586 pstr->len = pstr->raw_len;
587 pstr->stop = pstr->raw_stop;
588 pstr->valid_len = 0;
589 pstr->raw_mbs_idx = 0;
590 pstr->valid_raw_len = 0;
591 pstr->offsets_needed = 0;
592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594 if (!pstr->mbs_allocated)
595 pstr->mbs = (unsigned char *) pstr->raw_mbs;
596 offset = idx;
597 }
598
599 if (BE (offset != 0, 1))
600 {
601 /* Should the already checked characters be kept? */
602 if (BE (offset < pstr->valid_raw_len, 1))
603 {
604 /* Yes, move them to the front of the buffer. */
605#ifdef RE_ENABLE_I18N
606 if (BE (pstr->offsets_needed, 0))
607 {
608 Idx low = 0, high = pstr->valid_len, mid;
609 do
610 {
611 mid = (high + low) / 2;
612 if (pstr->offsets[mid] > offset)
613 high = mid;
614 else if (pstr->offsets[mid] < offset)
615 low = mid + 1;
616 else
617 break;
618 }
619 while (low < high);
620 if (pstr->offsets[mid] < offset)
621 ++mid;
622 pstr->tip_context = re_string_context_at (pstr, mid - 1,
623 eflags);
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr->valid_len > offset
629 && mid == offset && pstr->offsets[mid] == offset)
630 {
631 memmove (pstr->wcs, pstr->wcs + offset,
632 (pstr->valid_len - offset) * sizeof (wint_t));
633 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634 pstr->valid_len -= offset;
635 pstr->valid_raw_len -= offset;
636 for (low = 0; low < pstr->valid_len; low++)
637 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
638 }
639 else
640 {
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr->len = pstr->raw_len - idx + offset;
644 pstr->stop = pstr->raw_stop - idx + offset;
645 pstr->offsets_needed = 0;
646 while (mid > 0 && pstr->offsets[mid - 1] == offset)
647 --mid;
648 while (mid < pstr->valid_len)
649 if (pstr->wcs[mid] != WEOF)
650 break;
651 else
652 ++mid;
653 if (mid == pstr->valid_len)
654 pstr->valid_len = 0;
655 else
656 {
657 pstr->valid_len = pstr->offsets[mid] - offset;
658 if (pstr->valid_len)
659 {
660 for (low = 0; low < pstr->valid_len; ++low)
661 pstr->wcs[low] = WEOF;
662 memset (pstr->mbs, 255, pstr->valid_len);
663 }
664 }
665 pstr->valid_raw_len = pstr->valid_len;
666 }
667 }
668 else
669#endif
670 {
671 pstr->tip_context = re_string_context_at (pstr, offset - 1,
672 eflags);
673#ifdef RE_ENABLE_I18N
674 if (pstr->mb_cur_max > 1)
675 memmove (pstr->wcs, pstr->wcs + offset,
676 (pstr->valid_len - offset) * sizeof (wint_t));
677#endif /* RE_ENABLE_I18N */
678 if (BE (pstr->mbs_allocated, 0))
679 memmove (pstr->mbs, pstr->mbs + offset,
680 pstr->valid_len - offset);
681 pstr->valid_len -= offset;
682 pstr->valid_raw_len -= offset;
683#if DEBUG
684 assert (pstr->valid_len > 0);
685#endif
686 }
687 }
688 else
689 {
690 /* No, skip all characters until IDX. */
691 Idx prev_valid_len = pstr->valid_len;
692
693#ifdef RE_ENABLE_I18N
694 if (BE (pstr->offsets_needed, 0))
695 {
696 pstr->len = pstr->raw_len - idx + offset;
697 pstr->stop = pstr->raw_stop - idx + offset;
698 pstr->offsets_needed = 0;
699 }
700#endif
701 pstr->valid_len = 0;
702#ifdef RE_ENABLE_I18N
703 if (pstr->mb_cur_max > 1)
704 {
705 Idx wcs_idx;
706 wint_t wc = WEOF;
707
708 if (pstr->is_utf8)
709 {
710 const unsigned char *raw, *p, *q, *end;
711
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715 end = raw + (offset - pstr->mb_cur_max);
716 if (end < pstr->raw_mbs)
717 end = pstr->raw_mbs;
718 p = raw + offset - 1;
719#ifdef _LIBC
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p) && BE (pstr->trans == NULL, 1))
723 {
724 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
726 wc = (wchar_t) *p;
727 }
728 else
729#endif
730 for (; p >= end; --p)
731 if ((*p & 0xc0) != 0x80)
732 {
733 mbstate_t cur_state;
734 wchar_t wc2;
735 Idx mlen = raw + pstr->len - p;
736 unsigned char buf[6];
737 size_t mbclen;
738
739 q = p;
740 if (BE (pstr->trans != NULL, 0))
741 {
742 int i = mlen < 6 ? mlen : 6;
743 while (--i >= 0)
744 buf[i] = pstr->trans[p[i]];
745 q = buf;
746 }
747 /* XXX Don't use mbrtowc, we know which conversion
748 to use (UTF-8 -> UCS4). */
749 memset (&cur_state, 0, sizeof (cur_state));
750 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
751 &cur_state);
752 if (raw + offset - p <= mbclen
753 && mbclen < (size_t) -2)
754 {
755 memset (&pstr->cur_state, '\0',
756 sizeof (mbstate_t));
757 pstr->valid_len = mbclen - (raw + offset - p);
758 wc = wc2;
759 }
760 break;
761 }
762 }
763
764 if (wc == WEOF)
765 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766 if (wc == WEOF)
767 pstr->tip_context
768 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
769 else
770 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771 && IS_WIDE_WORD_CHAR (wc))
772 ? CONTEXT_WORD
773 : ((IS_WIDE_NEWLINE (wc)
774 && pstr->newline_anchor)
775 ? CONTEXT_NEWLINE : 0));
776 if (BE (pstr->valid_len, 0))
777 {
778 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779 pstr->wcs[wcs_idx] = WEOF;
780 if (pstr->mbs_allocated)
781 memset (pstr->mbs, 255, pstr->valid_len);
782 }
783 pstr->valid_raw_len = pstr->valid_len;
784 }
785 else
786#endif /* RE_ENABLE_I18N */
787 {
788 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789 pstr->valid_raw_len = 0;
790 if (pstr->trans)
791 c = pstr->trans[c];
792 pstr->tip_context = (bitset_contain (pstr->word_char, c)
793 ? CONTEXT_WORD
794 : ((IS_NEWLINE (c) && pstr->newline_anchor)
795 ? CONTEXT_NEWLINE : 0));
796 }
797 }
798 if (!BE (pstr->mbs_allocated, 0))
799 pstr->mbs += offset;
800 }
801 pstr->raw_mbs_idx = idx;
802 pstr->len -= offset;
803 pstr->stop -= offset;
804
805 /* Then build the buffers. */
806#ifdef RE_ENABLE_I18N
807 if (pstr->mb_cur_max > 1)
808 {
809 if (pstr->icase)
810 {
811 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812 if (BE (ret != REG_NOERROR, 0))
813 return ret;
814 }
815 else
816 build_wcs_buffer (pstr);
817 }
818 else
819#endif /* RE_ENABLE_I18N */
820 if (BE (pstr->mbs_allocated, 0))
821 {
822 if (pstr->icase)
823 build_upper_buffer (pstr);
824 else if (pstr->trans != NULL)
825 re_string_translate_buffer (pstr);
826 }
827 else
828 pstr->valid_len = pstr->len;
829
830 pstr->cur_idx = 0;
831 return REG_NOERROR;
832}
833
834static unsigned char
835internal_function __attribute ((pure))
836re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837{
838 int ch;
839 Idx off;
840
841 /* Handle the common (easiest) cases first. */
842 if (BE (!pstr->mbs_allocated, 1))
843 return re_string_peek_byte (pstr, idx);
844
845#ifdef RE_ENABLE_I18N
846 if (pstr->mb_cur_max > 1
847 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848 return re_string_peek_byte (pstr, idx);
849#endif
850
851 off = pstr->cur_idx + idx;
852#ifdef RE_ENABLE_I18N
853 if (pstr->offsets_needed)
854 off = pstr->offsets[off];
855#endif
856
857 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858
859#ifdef RE_ENABLE_I18N
860 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861 this function returns CAPITAL LETTER I instead of first byte of
862 DOTLESS SMALL LETTER I. The latter would confuse the parser,
863 since peek_byte_case doesn't advance cur_idx in any way. */
864 if (pstr->offsets_needed && !isascii (ch))
865 return re_string_peek_byte (pstr, idx);
866#endif
867
868 return ch;
869}
870
871static unsigned char
872internal_function __attribute ((pure))
873re_string_fetch_byte_case (re_string_t *pstr)
874{
875 if (BE (!pstr->mbs_allocated, 1))
876 return re_string_fetch_byte (pstr);
877
878#ifdef RE_ENABLE_I18N
879 if (pstr->offsets_needed)
880 {
881 Idx off;
882 int ch;
883
884 /* For tr_TR.UTF-8 [[:islower:]] there is
885 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886 in that case the whole multi-byte character and return
887 the original letter. On the other side, with
888 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 anything else would complicate things too much. */
890
891 if (!re_string_first_byte (pstr, pstr->cur_idx))
892 return re_string_fetch_byte (pstr);
893
894 off = pstr->offsets[pstr->cur_idx];
895 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897 if (! isascii (ch))
898 return re_string_fetch_byte (pstr);
899
900 re_string_skip_bytes (pstr,
901 re_string_char_size_at (pstr, pstr->cur_idx));
902 return ch;
903 }
904#endif
905
906 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907}
908
909static void
910internal_function
911re_string_destruct (re_string_t *pstr)
912{
913#ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
915 re_free (pstr->offsets);
916#endif /* RE_ENABLE_I18N */
917 if (pstr->mbs_allocated)
918 re_free (pstr->mbs);
919}
920
921/* Return the context at IDX in INPUT. */
922
923static unsigned int
924internal_function
925re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926{
927 int c;
928 if (BE (! REG_VALID_INDEX (idx), 0))
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input->tip_context;
932 if (BE (idx == input->len, 0))
933 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935#ifdef RE_ENABLE_I18N
936 if (input->mb_cur_max > 1)
937 {
938 wint_t wc;
939 Idx wc_idx = idx;
940 while(input->wcs[wc_idx] == WEOF)
941 {
942#ifdef DEBUG
943 /* It must not happen. */
944 assert (REG_VALID_INDEX (wc_idx));
945#endif
946 --wc_idx;
947 if (! REG_VALID_INDEX (wc_idx))
948 return input->tip_context;
949 }
950 wc = input->wcs[wc_idx];
951 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 return CONTEXT_WORD;
953 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 ? CONTEXT_NEWLINE : 0);
955 }
956 else
957#endif
958 {
959 c = re_string_byte_at (input, idx);
960 if (bitset_contain (input->word_char, c))
961 return CONTEXT_WORD;
962 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963 }
964}
965
966
967/* Functions for set operation. */
968
969static reg_errcode_t
970internal_function
971re_node_set_alloc (re_node_set *set, Idx size)
972{
973 set->alloc = size;
974 set->nelem = 0;
975 set->elems = re_malloc (Idx, size);
976 if (BE (set->elems == NULL, 0))
977 return REG_ESPACE;
978 return REG_NOERROR;
979}
980
981static reg_errcode_t
982internal_function
983re_node_set_init_1 (re_node_set *set, Idx elem)
984{
985 set->alloc = 1;
986 set->nelem = 1;
987 set->elems = re_malloc (Idx, 1);
988 if (BE (set->elems == NULL, 0))
989 {
990 set->alloc = set->nelem = 0;
991 return REG_ESPACE;
992 }
993 set->elems[0] = elem;
994 return REG_NOERROR;
995}
996
997static reg_errcode_t
998internal_function
999re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000{
1001 set->alloc = 2;
1002 set->elems = re_malloc (Idx, 2);
1003 if (BE (set->elems == NULL, 0))
1004 return REG_ESPACE;
1005 if (elem1 == elem2)
1006 {
1007 set->nelem = 1;
1008 set->elems[0] = elem1;
1009 }
1010 else
1011 {
1012 set->nelem = 2;
1013 if (elem1 < elem2)
1014 {
1015 set->elems[0] = elem1;
1016 set->elems[1] = elem2;
1017 }
1018 else
1019 {
1020 set->elems[0] = elem2;
1021 set->elems[1] = elem1;
1022 }
1023 }
1024 return REG_NOERROR;
1025}
1026
1027static reg_errcode_t
1028internal_function
1029re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1030{
1031 dest->nelem = src->nelem;
1032 if (src->nelem > 0)
1033 {
1034 dest->alloc = dest->nelem;
1035 dest->elems = re_malloc (Idx, dest->alloc);
1036 if (BE (dest->elems == NULL, 0))
1037 {
1038 dest->alloc = dest->nelem = 0;
1039 return REG_ESPACE;
1040 }
1041 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042 }
1043 else
1044 re_node_set_init_empty (dest);
1045 return REG_NOERROR;
1046}
1047
1048/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051
1052static reg_errcode_t
1053internal_function
1054re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1055 const re_node_set *src2)
1056{
1057 Idx i1, i2, is, id, delta, sbase;
1058 if (src1->nelem == 0 || src2->nelem == 0)
1059 return REG_NOERROR;
1060
1061 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1062 conservative estimate. */
1063 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064 {
1065 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067 if (BE (new_elems == NULL, 0))
1068 return REG_ESPACE;
1069 dest->elems = new_elems;
1070 dest->alloc = new_alloc;
1071 }
1072
1073 /* Find the items in the intersection of SRC1 and SRC2, and copy
1074 into the top of DEST those that are not already in DEST itself. */
1075 sbase = dest->nelem + src1->nelem + src2->nelem;
1076 i1 = src1->nelem - 1;
1077 i2 = src2->nelem - 1;
1078 id = dest->nelem - 1;
1079 for (;;)
1080 {
1081 if (src1->elems[i1] == src2->elems[i2])
1082 {
1083 /* Try to find the item in DEST. Maybe we could binary search? */
1084 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085 --id;
1086
1087 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1088 dest->elems[--sbase] = src1->elems[i1];
1089
1090 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1091 break;
1092 }
1093
1094 /* Lower the highest of the two items. */
1095 else if (src1->elems[i1] < src2->elems[i2])
1096 {
1097 if (! REG_VALID_INDEX (--i2))
1098 break;
1099 }
1100 else
1101 {
1102 if (! REG_VALID_INDEX (--i1))
1103 break;
1104 }
1105 }
1106
1107 id = dest->nelem - 1;
1108 is = dest->nelem + src1->nelem + src2->nelem - 1;
1109 delta = is - sbase + 1;
1110
1111 /* Now copy. When DELTA becomes zero, the remaining
1112 DEST elements are already in place; this is more or
1113 less the same loop that is in re_node_set_merge. */
1114 dest->nelem += delta;
1115 if (delta > 0 && REG_VALID_INDEX (id))
1116 for (;;)
1117 {
1118 if (dest->elems[is] > dest->elems[id])
1119 {
1120 /* Copy from the top. */
1121 dest->elems[id + delta--] = dest->elems[is--];
1122 if (delta == 0)
1123 break;
1124 }
1125 else
1126 {
1127 /* Slide from the bottom. */
1128 dest->elems[id + delta] = dest->elems[id];
1129 if (! REG_VALID_INDEX (--id))
1130 break;
1131 }
1132 }
1133
1134 /* Copy remaining SRC elements. */
1135 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137 return REG_NOERROR;
1138}
1139
1140/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142
1143static reg_errcode_t
1144internal_function
1145re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1146 const re_node_set *src2)
1147{
1148 Idx i1, i2, id;
1149 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150 {
1151 dest->alloc = src1->nelem + src2->nelem;
1152 dest->elems = re_malloc (Idx, dest->alloc);
1153 if (BE (dest->elems == NULL, 0))
1154 return REG_ESPACE;
1155 }
1156 else
1157 {
1158 if (src1 != NULL && src1->nelem > 0)
1159 return re_node_set_init_copy (dest, src1);
1160 else if (src2 != NULL && src2->nelem > 0)
1161 return re_node_set_init_copy (dest, src2);
1162 else
1163 re_node_set_init_empty (dest);
1164 return REG_NOERROR;
1165 }
1166 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167 {
1168 if (src1->elems[i1] > src2->elems[i2])
1169 {
1170 dest->elems[id++] = src2->elems[i2++];
1171 continue;
1172 }
1173 if (src1->elems[i1] == src2->elems[i2])
1174 ++i2;
1175 dest->elems[id++] = src1->elems[i1++];
1176 }
1177 if (i1 < src1->nelem)
1178 {
1179 memcpy (dest->elems + id, src1->elems + i1,
1180 (src1->nelem - i1) * sizeof (Idx));
1181 id += src1->nelem - i1;
1182 }
1183 else if (i2 < src2->nelem)
1184 {
1185 memcpy (dest->elems + id, src2->elems + i2,
1186 (src2->nelem - i2) * sizeof (Idx));
1187 id += src2->nelem - i2;
1188 }
1189 dest->nelem = id;
1190 return REG_NOERROR;
1191}
1192
1193/* Calculate the union set of the sets DEST and SRC. And store it to
1194 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195
1196static reg_errcode_t
1197internal_function
1198re_node_set_merge (re_node_set *dest, const re_node_set *src)
1199{
1200 Idx is, id, sbase, delta;
1201 if (src == NULL || src->nelem == 0)
1202 return REG_NOERROR;
1203 if (dest->alloc < 2 * src->nelem + dest->nelem)
1204 {
1205 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207 if (BE (new_buffer == NULL, 0))
1208 return REG_ESPACE;
1209 dest->elems = new_buffer;
1210 dest->alloc = new_alloc;
1211 }
1212
1213 if (BE (dest->nelem == 0, 0))
1214 {
1215 dest->nelem = src->nelem;
1216 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217 return REG_NOERROR;
1218 }
1219
1220 /* Copy into the top of DEST the items of SRC that are not
1221 found in DEST. Maybe we could binary search in DEST? */
1222 for (sbase = dest->nelem + 2 * src->nelem,
1223 is = src->nelem - 1, id = dest->nelem - 1;
1224 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225 {
1226 if (dest->elems[id] == src->elems[is])
1227 is--, id--;
1228 else if (dest->elems[id] < src->elems[is])
1229 dest->elems[--sbase] = src->elems[is--];
1230 else /* if (dest->elems[id] > src->elems[is]) */
1231 --id;
1232 }
1233
1234 if (REG_VALID_INDEX (is))
1235 {
1236 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1237 sbase -= is + 1;
1238 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239 }
1240
1241 id = dest->nelem - 1;
1242 is = dest->nelem + 2 * src->nelem - 1;
1243 delta = is - sbase + 1;
1244 if (delta == 0)
1245 return REG_NOERROR;
1246
1247 /* Now copy. When DELTA becomes zero, the remaining
1248 DEST elements are already in place. */
1249 dest->nelem += delta;
1250 for (;;)
1251 {
1252 if (dest->elems[is] > dest->elems[id])
1253 {
1254 /* Copy from the top. */
1255 dest->elems[id + delta--] = dest->elems[is--];
1256 if (delta == 0)
1257 break;
1258 }
1259 else
1260 {
1261 /* Slide from the bottom. */
1262 dest->elems[id + delta] = dest->elems[id];
1263 if (! REG_VALID_INDEX (--id))
1264 {
1265 /* Copy remaining SRC elements. */
1266 memcpy (dest->elems, dest->elems + sbase,
1267 delta * sizeof (Idx));
1268 break;
1269 }
1270 }
1271 }
1272
1273 return REG_NOERROR;
1274}
1275
1276/* Insert the new element ELEM to the re_node_set* SET.
1277 SET should not already have ELEM.
1278 Return true if successful. */
1279
1280static bool
1281internal_function
1282re_node_set_insert (re_node_set *set, Idx elem)
1283{
1284 Idx idx;
1285 /* In case the set is empty. */
1286 if (set->alloc == 0)
1287 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289 if (BE (set->nelem, 0) == 0)
1290 {
1291 /* We already guaranteed above that set->alloc != 0. */
1292 set->elems[0] = elem;
1293 ++set->nelem;
1294 return true;
1295 }
1296
1297 /* Realloc if we need. */
1298 if (set->alloc == set->nelem)
1299 {
1300 Idx *new_elems;
1301 set->alloc = set->alloc * 2;
1302 new_elems = re_realloc (set->elems, Idx, set->alloc);
1303 if (BE (new_elems == NULL, 0))
1304 return false;
1305 set->elems = new_elems;
1306 }
1307
1308 /* Move the elements which follows the new element. Test the
1309 first element separately to skip a check in the inner loop. */
1310 if (elem < set->elems[0])
1311 {
1312 idx = 0;
1313 for (idx = set->nelem; idx > 0; idx--)
1314 set->elems[idx] = set->elems[idx - 1];
1315 }
1316 else
1317 {
1318 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319 set->elems[idx] = set->elems[idx - 1];
1320 }
1321
1322 /* Insert the new element. */
1323 set->elems[idx] = elem;
1324 ++set->nelem;
1325 return true;
1326}
1327
1328/* Insert the new element ELEM to the re_node_set* SET.
1329 SET should not already have any element greater than or equal to ELEM.
1330 Return true if successful. */
1331
1332static bool
1333internal_function
1334re_node_set_insert_last (re_node_set *set, Idx elem)
1335{
1336 /* Realloc if we need. */
1337 if (set->alloc == set->nelem)
1338 {
1339 Idx *new_elems;
1340 set->alloc = (set->alloc + 1) * 2;
1341 new_elems = re_realloc (set->elems, Idx, set->alloc);
1342 if (BE (new_elems == NULL, 0))
1343 return false;
1344 set->elems = new_elems;
1345 }
1346
1347 /* Insert the new element. */
1348 set->elems[set->nelem++] = elem;
1349 return true;
1350}
1351
1352/* Compare two node sets SET1 and SET2.
1353 Return true if SET1 and SET2 are equivalent. */
1354
1355static bool
1356internal_function __attribute ((pure))
1357re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358{
1359 Idx i;
1360 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361 return false;
1362 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363 if (set1->elems[i] != set2->elems[i])
1364 return false;
1365 return true;
1366}
1367
1368/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369
1370static Idx
1371internal_function __attribute ((pure))
1372re_node_set_contains (const re_node_set *set, Idx elem)
1373{
1374 __re_size_t idx, right, mid;
1375 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376 return 0;
1377
1378 /* Binary search the element. */
1379 idx = 0;
1380 right = set->nelem - 1;
1381 while (idx < right)
1382 {
1383 mid = (idx + right) / 2;
1384 if (set->elems[mid] < elem)
1385 idx = mid + 1;
1386 else
1387 right = mid;
1388 }
1389 return set->elems[idx] == elem ? idx + 1 : 0;
1390}
1391
1392static void
1393internal_function
1394re_node_set_remove_at (re_node_set *set, Idx idx)
1395{
1396 if (idx < 0 || idx >= set->nelem)
1397 return;
1398 --set->nelem;
1399 for (; idx < set->nelem; idx++)
1400 set->elems[idx] = set->elems[idx + 1];
1401}
1402
1403
1404
1405/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406 Or return REG_MISSING if an error occurred. */
1407
1408static Idx
1409internal_function
1410re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1411{
1412 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413 {
1414 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1415 Idx *new_nexts, *new_indices;
1416 re_node_set *new_edests, *new_eclosures;
1417 re_token_t *new_nodes;
1418 size_t max_object_size =
1419 MAX (sizeof (re_token_t),
1420 MAX (sizeof (re_node_set),
1421 sizeof (Idx)));
1422
1423 /* Avoid overflows. */
1424 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1425 return REG_MISSING;
1426
1427 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1428 if (BE (new_nodes == NULL, 0))
1429 return REG_MISSING;
1430 dfa->nodes = new_nodes;
1431 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1432 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1433 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1434 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1435 if (BE (new_nexts == NULL || new_indices == NULL
1436 || new_edests == NULL || new_eclosures == NULL, 0))
1437 return REG_MISSING;
1438 dfa->nexts = new_nexts;
1439 dfa->org_indices = new_indices;
1440 dfa->edests = new_edests;
1441 dfa->eclosures = new_eclosures;
1442 dfa->nodes_alloc = new_nodes_alloc;
1443 }
1444 dfa->nodes[dfa->nodes_len] = token;
1445 dfa->nodes[dfa->nodes_len].constraint = 0;
1446#ifdef RE_ENABLE_I18N
1447 {
1448 int type = token.type;
1449 dfa->nodes[dfa->nodes_len].accept_mb =
1450 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1451 }
1452#endif
1453 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1454 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1455 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1456 return dfa->nodes_len++;
1457}
1458
1459static inline re_hashval_t
1460internal_function
1461calc_state_hash (const re_node_set *nodes, unsigned int context)
1462{
1463 re_hashval_t hash = nodes->nelem + context;
1464 Idx i;
1465 for (i = 0 ; i < nodes->nelem ; i++)
1466 hash += nodes->elems[i];
1467 return hash;
1468}
1469
1470/* Search for the state whose node_set is equivalent to NODES.
1471 Return the pointer to the state, if we found it in the DFA.
1472 Otherwise create the new one and return it. In case of an error
1473 return NULL and set the error code in ERR.
1474 Note: - We assume NULL as the invalid state, then it is possible that
1475 return value is NULL and ERR is REG_NOERROR.
1476 - We never return non-NULL value in case of any errors, it is for
1477 optimization. */
1478
1479static re_dfastate_t *
1480internal_function
1481re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1482 const re_node_set *nodes)
1483{
1484 re_hashval_t hash;
1485 re_dfastate_t *new_state;
1486 struct re_state_table_entry *spot;
1487 Idx i;
1488#ifdef lint
1489 /* Suppress bogus uninitialized-variable warnings. */
1490 *err = REG_NOERROR;
1491#endif
1492 if (BE (nodes->nelem == 0, 0))
1493 {
1494 *err = REG_NOERROR;
1495 return NULL;
1496 }
1497 hash = calc_state_hash (nodes, 0);
1498 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1499
1500 for (i = 0 ; i < spot->num ; i++)
1501 {
1502 re_dfastate_t *state = spot->array[i];
1503 if (hash != state->hash)
1504 continue;
1505 if (re_node_set_compare (&state->nodes, nodes))
1506 return state;
1507 }
1508
1509 /* There are no appropriate state in the dfa, create the new one. */
1510 new_state = create_ci_newstate (dfa, nodes, hash);
1511 if (BE (new_state == NULL, 0))
1512 *err = REG_ESPACE;
1513
1514 return new_state;
1515}
1516
1517/* Search for the state whose node_set is equivalent to NODES and
1518 whose context is equivalent to CONTEXT.
1519 Return the pointer to the state, if we found it in the DFA.
1520 Otherwise create the new one and return it. In case of an error
1521 return NULL and set the error code in ERR.
1522 Note: - We assume NULL as the invalid state, then it is possible that
1523 return value is NULL and ERR is REG_NOERROR.
1524 - We never return non-NULL value in case of any errors, it is for
1525 optimization. */
1526
1527static re_dfastate_t *
1528internal_function
1529re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1530 const re_node_set *nodes, unsigned int context)
1531{
1532 re_hashval_t hash;
1533 re_dfastate_t *new_state;
1534 struct re_state_table_entry *spot;
1535 Idx i;
1536#ifdef lint
1537 /* Suppress bogus uninitialized-variable warnings. */
1538 *err = REG_NOERROR;
1539#endif
1540 if (nodes->nelem == 0)
1541 {
1542 *err = REG_NOERROR;
1543 return NULL;
1544 }
1545 hash = calc_state_hash (nodes, context);
1546 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548 for (i = 0 ; i < spot->num ; i++)
1549 {
1550 re_dfastate_t *state = spot->array[i];
1551 if (state->hash == hash
1552 && state->context == context
1553 && re_node_set_compare (state->entrance_nodes, nodes))
1554 return state;
1555 }
1556 /* There are no appropriate state in `dfa', create the new one. */
1557 new_state = create_cd_newstate (dfa, nodes, context, hash);
1558 if (BE (new_state == NULL, 0))
1559 *err = REG_ESPACE;
1560
1561 return new_state;
1562}
1563
1564/* Finish initialization of the new state NEWSTATE, and using its hash value
1565 HASH put in the appropriate bucket of DFA's state table. Return value
1566 indicates the error code if failed. */
1567
1568static reg_errcode_t
1569register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1570 re_hashval_t hash)
1571{
1572 struct re_state_table_entry *spot;
1573 reg_errcode_t err;
1574 Idx i;
1575
1576 newstate->hash = hash;
1577 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578 if (BE (err != REG_NOERROR, 0))
1579 return REG_ESPACE;
1580 for (i = 0; i < newstate->nodes.nelem; i++)
1581 {
1582 Idx elem = newstate->nodes.elems[i];
1583 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1585 return REG_ESPACE;
1586 }
1587
1588 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589 if (BE (spot->alloc <= spot->num, 0))
1590 {
1591 Idx new_alloc = 2 * spot->num + 2;
1592 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1593 new_alloc);
1594 if (BE (new_array == NULL, 0))
1595 return REG_ESPACE;
1596 spot->array = new_array;
1597 spot->alloc = new_alloc;
1598 }
1599 spot->array[spot->num++] = newstate;
1600 return REG_NOERROR;
1601}
1602
1603static void
1604free_state (re_dfastate_t *state)
1605{
1606 re_node_set_free (&state->non_eps_nodes);
1607 re_node_set_free (&state->inveclosure);
1608 if (state->entrance_nodes != &state->nodes)
1609 {
1610 re_node_set_free (state->entrance_nodes);
1611 re_free (state->entrance_nodes);
1612 }
1613 re_node_set_free (&state->nodes);
1614 re_free (state->word_trtable);
1615 re_free (state->trtable);
1616 re_free (state);
1617}
1618
1619/* Create the new state which is independ of contexts.
1620 Return the new state if succeeded, otherwise return NULL. */
1621
1622static re_dfastate_t *
1623internal_function
1624create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1625 re_hashval_t hash)
1626{
1627 Idx i;
1628 reg_errcode_t err;
1629 re_dfastate_t *newstate;
1630
1631 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632 if (BE (newstate == NULL, 0))
1633 return NULL;
1634 err = re_node_set_init_copy (&newstate->nodes, nodes);
1635 if (BE (err != REG_NOERROR, 0))
1636 {
1637 re_free (newstate);
1638 return NULL;
1639 }
1640
1641 newstate->entrance_nodes = &newstate->nodes;
1642 for (i = 0 ; i < nodes->nelem ; i++)
1643 {
1644 re_token_t *node = dfa->nodes + nodes->elems[i];
1645 re_token_type_t type = node->type;
1646 if (type == CHARACTER && !node->constraint)
1647 continue;
1648#ifdef RE_ENABLE_I18N
1649 newstate->accept_mb |= node->accept_mb;
1650#endif /* RE_ENABLE_I18N */
1651
1652 /* If the state has the halt node, the state is a halt state. */
1653 if (type == END_OF_RE)
1654 newstate->halt = 1;
1655 else if (type == OP_BACK_REF)
1656 newstate->has_backref = 1;
1657 else if (type == ANCHOR || node->constraint)
1658 newstate->has_constraint = 1;
1659 }
1660 err = register_state (dfa, newstate, hash);
1661 if (BE (err != REG_NOERROR, 0))
1662 {
1663 free_state (newstate);
1664 newstate = NULL;
1665 }
1666 return newstate;
1667}
1668
1669/* Create the new state which is depend on the context CONTEXT.
1670 Return the new state if succeeded, otherwise return NULL. */
1671
1672static re_dfastate_t *
1673internal_function
1674create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1675 unsigned int context, re_hashval_t hash)
1676{
1677 Idx i, nctx_nodes = 0;
1678 reg_errcode_t err;
1679 re_dfastate_t *newstate;
1680
1681 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682 if (BE (newstate == NULL, 0))
1683 return NULL;
1684 err = re_node_set_init_copy (&newstate->nodes, nodes);
1685 if (BE (err != REG_NOERROR, 0))
1686 {
1687 re_free (newstate);
1688 return NULL;
1689 }
1690
1691 newstate->context = context;
1692 newstate->entrance_nodes = &newstate->nodes;
1693
1694 for (i = 0 ; i < nodes->nelem ; i++)
1695 {
1696 unsigned int constraint = 0;
1697 re_token_t *node = dfa->nodes + nodes->elems[i];
1698 re_token_type_t type = node->type;
1699 if (node->constraint)
1700 constraint = node->constraint;
1701
1702 if (type == CHARACTER && !constraint)
1703 continue;
1704#ifdef RE_ENABLE_I18N
1705 newstate->accept_mb |= node->accept_mb;
1706#endif /* RE_ENABLE_I18N */
1707
1708 /* If the state has the halt node, the state is a halt state. */
1709 if (type == END_OF_RE)
1710 newstate->halt = 1;
1711 else if (type == OP_BACK_REF)
1712 newstate->has_backref = 1;
1713 else if (type == ANCHOR)
1714 constraint = node->opr.ctx_type;
1715
1716 if (constraint)
1717 {
1718 if (newstate->entrance_nodes == &newstate->nodes)
1719 {
1720 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1721 if (BE (newstate->entrance_nodes == NULL, 0))
1722 {
1723 free_state (newstate);
1724 return NULL;
1725 }
1726 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1727 nctx_nodes = 0;
1728 newstate->has_constraint = 1;
1729 }
1730
1731 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1732 {
1733 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1734 ++nctx_nodes;
1735 }
1736 }
1737 }
1738 err = register_state (dfa, newstate, hash);
1739 if (BE (err != REG_NOERROR, 0))
1740 {
1741 free_state (newstate);
1742 newstate = NULL;
1743 }
1744 return newstate;
1745}
Note: See TracBrowser for help on using the repository browser.