Context Navigation

regex_internal.c

Visit:

Last change on this file was 3342, checked in by bird, 18 years ago
tar 1.16.1
File size: 46.9 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	This program is free software; you can redistribute it and/or modify
7	it under the terms of the GNU General Public License as published by
8	the Free Software Foundation; either version 2, or (at your option)
9	any later version.
10
11	This program is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License along
17	with this program; if not, write to the Free Software Foundation,
18	Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20	static void re_string_construct_common (const char *str, Idx len,
21	re_string_t *pstr,
22	RE_TRANSLATE_TYPE trans, bool icase,
23	const re_dfa_t *dfa) internal_function;
24	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
25	const re_node_set *nodes,
26	re_hashval_t hash) internal_function;
27	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
28	const re_node_set *nodes,
29	unsigned int context,
30	re_hashval_t hash) internal_function;
31
32
33	/* Functions for string operation. */
34
35	/* This function allocate the buffers. It is necessary to call
36	re_string_reconstruct before using the object. */
37
38	static reg_errcode_t
39	internal_function
40	re_string_allocate (re_string_t pstr, const char str, Idx len, Idx init_len,
41	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
42	{
43	reg_errcode_t ret;
44	Idx init_buf_len;
45
46	/* Ensure at least one character fits into the buffers. */
47	if (init_len < dfa->mb_cur_max)
48	init_len = dfa->mb_cur_max;
49	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
50	re_string_construct_common (str, len, pstr, trans, icase, dfa);
51
52	ret = re_string_realloc_buffers (pstr, init_buf_len);
53	if (BE (ret != REG_NOERROR, 0))
54	return ret;
55
56	pstr->word_char = dfa->word_char;
57	pstr->word_ops_used = dfa->word_ops_used;
58	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
59	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
60	pstr->valid_raw_len = pstr->valid_len;
61	return REG_NOERROR;
62	}
63
64	/* This function allocate the buffers, and initialize them. */
65
66	static reg_errcode_t
67	internal_function
68	re_string_construct (re_string_t pstr, const char str, Idx len,
69	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
70	{
71	reg_errcode_t ret;
72	memset (pstr, '\0', sizeof (re_string_t));
73	re_string_construct_common (str, len, pstr, trans, icase, dfa);
74
75	if (len > 0)
76	{
77	ret = re_string_realloc_buffers (pstr, len + 1);
78	if (BE (ret != REG_NOERROR, 0))
79	return ret;
80	}
81	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
82
83	if (icase)
84	{
85	#ifdef RE_ENABLE_I18N
86	if (dfa->mb_cur_max > 1)
87	{
88	while (1)
89	{
90	ret = build_wcs_upper_buffer (pstr);
91	if (BE (ret != REG_NOERROR, 0))
92	return ret;
93	if (pstr->valid_raw_len >= len)
94	break;
95	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96	break;
97	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
98	if (BE (ret != REG_NOERROR, 0))
99	return ret;
100	}
101	}
102	else
103	#endif /* RE_ENABLE_I18N */
104	build_upper_buffer (pstr);
105	}
106	else
107	{
108	#ifdef RE_ENABLE_I18N
109	if (dfa->mb_cur_max > 1)
110	build_wcs_buffer (pstr);
111	else
112	#endif /* RE_ENABLE_I18N */
113	{
114	if (trans != NULL)
115	re_string_translate_buffer (pstr);
116	else
117	{
118	pstr->valid_len = pstr->bufs_len;
119	pstr->valid_raw_len = pstr->bufs_len;
120	}
121	}
122	}
123
124	return REG_NOERROR;
125	}
126
127	/* Helper functions for re_string_allocate, and re_string_construct. */
128
129	static reg_errcode_t
130	internal_function
131	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
132	{
133	#ifdef RE_ENABLE_I18N
134	if (pstr->mb_cur_max > 1)
135	{
136	wint_t *new_wcs;
137
138	/* Avoid overflow. */
139	size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
140	if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
141	return REG_ESPACE;
142
143	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
144	if (BE (new_wcs == NULL, 0))
145	return REG_ESPACE;
146	pstr->wcs = new_wcs;
147	if (pstr->offsets != NULL)
148	{
149	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
150	if (BE (new_offsets == NULL, 0))
151	return REG_ESPACE;
152	pstr->offsets = new_offsets;
153	}
154	}
155	#endif /* RE_ENABLE_I18N */
156	if (pstr->mbs_allocated)
157	{
158	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
159	new_buf_len);
160	if (BE (new_mbs == NULL, 0))
161	return REG_ESPACE;
162	pstr->mbs = new_mbs;
163	}
164	pstr->bufs_len = new_buf_len;
165	return REG_NOERROR;
166	}
167
168
169	static void
170	internal_function
171	re_string_construct_common (const char str, Idx len, re_string_t pstr,
172	RE_TRANSLATE_TYPE trans, bool icase,
173	const re_dfa_t *dfa)
174	{
175	pstr->raw_mbs = (const unsigned char *) str;
176	pstr->len = len;
177	pstr->raw_len = len;
178	pstr->trans = trans;
179	pstr->icase = icase;
180	pstr->mbs_allocated = (trans != NULL \|\| icase);
181	pstr->mb_cur_max = dfa->mb_cur_max;
182	pstr->is_utf8 = dfa->is_utf8;
183	pstr->map_notascii = dfa->map_notascii;
184	pstr->stop = pstr->len;
185	pstr->raw_stop = pstr->stop;
186	}
187
188	#ifdef RE_ENABLE_I18N
189
190	/* Build wide character buffer PSTR->WCS.
191	If the byte sequence of the string are:
192	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
193	Then wide character buffer will be:
194	<wc1> , WEOF , <wc2> , WEOF , <wc3>
195	We use WEOF for padding, they indicate that the position isn't
196	a first byte of a multibyte character.
197
198	Note that this function assumes PSTR->VALID_LEN elements are already
199	built and starts from PSTR->VALID_LEN. */
200
201	static void
202	internal_function
203	build_wcs_buffer (re_string_t *pstr)
204	{
205	#ifdef _LIBC
206	unsigned char buf[MB_LEN_MAX];
207	assert (MB_LEN_MAX >= pstr->mb_cur_max);
208	#else
209	unsigned char buf[64];
210	#endif
211	mbstate_t prev_st;
212	Idx byte_idx, end_idx, remain_len;
213	size_t mbclen;
214
215	/* Build the buffers from pstr->valid_len to either pstr->len or
216	pstr->bufs_len. */
217	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
218	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
219	{
220	wchar_t wc;
221	const char *p;
222
223	remain_len = end_idx - byte_idx;
224	prev_st = pstr->cur_state;
225	/* Apply the translation if we need. */
226	if (BE (pstr->trans != NULL, 0))
227	{
228	int i, ch;
229
230	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
231	{
232	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
233	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
234	}
235	p = (const char *) buf;
236	}
237	else
238	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
239	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
240	if (BE (mbclen == (size_t) -2, 0))
241	{
242	/* The buffer doesn't have enough space, finish to build. */
243	pstr->cur_state = prev_st;
244	break;
245	}
246	else if (BE (mbclen == (size_t) -1 \|\| mbclen == 0, 0))
247	{
248	/* We treat these cases as a singlebyte character. */
249	mbclen = 1;
250	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
251	if (BE (pstr->trans != NULL, 0))
252	wc = pstr->trans[wc];
253	pstr->cur_state = prev_st;
254	}
255
256	/* Write wide character and padding. */
257	pstr->wcs[byte_idx++] = wc;
258	/* Write paddings. */
259	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260	pstr->wcs[byte_idx++] = WEOF;
261	}
262	pstr->valid_len = byte_idx;
263	pstr->valid_raw_len = byte_idx;
264	}
265
266	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267	but for REG_ICASE. */
268
269	static reg_errcode_t
270	internal_function
271	build_wcs_upper_buffer (re_string_t *pstr)
272	{
273	mbstate_t prev_st;
274	Idx src_idx, byte_idx, end_idx, remain_len;
275	size_t mbclen;
276	#ifdef _LIBC
277	char buf[MB_LEN_MAX];
278	assert (MB_LEN_MAX >= pstr->mb_cur_max);
279	#else
280	char buf[64];
281	#endif
282
283	byte_idx = pstr->valid_len;
284	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286	/* The following optimization assumes that ASCII characters can be
287	mapped to wide characters with a simple cast. */
288	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289	{
290	while (byte_idx < end_idx)
291	{
292	wchar_t wc;
293
294	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295	&& mbsinit (&pstr->cur_state))
296	{
297	/* In case of a singlebyte character. */
298	pstr->mbs[byte_idx]
299	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300	/* The next step uses the assumption that wchar_t is encoded
301	ASCII-safe: all ASCII values can be converted like this. */
302	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303	++byte_idx;
304	continue;
305	}
306
307	remain_len = end_idx - byte_idx;
308	prev_st = pstr->cur_state;
309	mbclen = mbrtowc (&wc,
310	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311	+ byte_idx), remain_len, &pstr->cur_state);
312	if (BE (mbclen < (size_t) -2, 1))
313	{
314	wchar_t wcu = wc;
315	if (iswlower (wc))
316	{
317	size_t mbcdlen;
318
319	wcu = towupper (wc);
320	mbcdlen = wcrtomb (buf, wcu, &prev_st);
321	if (BE (mbclen == mbcdlen, 1))
322	memcpy (pstr->mbs + byte_idx, buf, mbclen);
323	else
324	{
325	src_idx = byte_idx;
326	goto offsets_needed;
327	}
328	}
329	else
330	memcpy (pstr->mbs + byte_idx,
331	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332	pstr->wcs[byte_idx++] = wcu;
333	/* Write paddings. */
334	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335	pstr->wcs[byte_idx++] = WEOF;
336	}
337	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
338	{
339	/* It is an invalid character or '\0'. Just use the byte. */
340	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
341	pstr->mbs[byte_idx] = ch;
342	/* And also cast it to wide char. */
343	pstr->wcs[byte_idx++] = (wchar_t) ch;
344	if (BE (mbclen == (size_t) -1, 0))
345	pstr->cur_state = prev_st;
346	}
347	else
348	{
349	/* The buffer doesn't have enough space, finish to build. */
350	pstr->cur_state = prev_st;
351	break;
352	}
353	}
354	pstr->valid_len = byte_idx;
355	pstr->valid_raw_len = byte_idx;
356	return REG_NOERROR;
357	}
358	else
359	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
360	{
361	wchar_t wc;
362	const char *p;
363	offsets_needed:
364	remain_len = end_idx - byte_idx;
365	prev_st = pstr->cur_state;
366	if (BE (pstr->trans != NULL, 0))
367	{
368	int i, ch;
369
370	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
371	{
372	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
373	buf[i] = pstr->trans[ch];
374	}
375	p = (const char *) buf;
376	}
377	else
378	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
379	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
380	if (BE (mbclen < (size_t) -2, 1))
381	{
382	wchar_t wcu = wc;
383	if (iswlower (wc))
384	{
385	size_t mbcdlen;
386
387	wcu = towupper (wc);
388	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389	if (BE (mbclen == mbcdlen, 1))
390	memcpy (pstr->mbs + byte_idx, buf, mbclen);
391	else if (mbcdlen != (size_t) -1)
392	{
393	size_t i;
394
395	if (byte_idx + mbcdlen > pstr->bufs_len)
396	{
397	pstr->cur_state = prev_st;
398	break;
399	}
400
401	if (pstr->offsets == NULL)
402	{
403	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404
405	if (pstr->offsets == NULL)
406	return REG_ESPACE;
407	}
408	if (!pstr->offsets_needed)
409	{
410	for (i = 0; i < (size_t) byte_idx; ++i)
411	pstr->offsets[i] = i;
412	pstr->offsets_needed = 1;
413	}
414
415	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416	pstr->wcs[byte_idx] = wcu;
417	pstr->offsets[byte_idx] = src_idx;
418	for (i = 1; i < mbcdlen; ++i)
419	{
420	pstr->offsets[byte_idx + i]
421	= src_idx + (i < mbclen ? i : mbclen - 1);
422	pstr->wcs[byte_idx + i] = WEOF;
423	}
424	pstr->len += mbcdlen - mbclen;
425	if (pstr->raw_stop > src_idx)
426	pstr->stop += mbcdlen - mbclen;
427	end_idx = (pstr->bufs_len > pstr->len)
428	? pstr->len : pstr->bufs_len;
429	byte_idx += mbcdlen;
430	src_idx += mbclen;
431	continue;
432	}
433	else
434	memcpy (pstr->mbs + byte_idx, p, mbclen);
435	}
436	else
437	memcpy (pstr->mbs + byte_idx, p, mbclen);
438
439	if (BE (pstr->offsets_needed != 0, 0))
440	{
441	size_t i;
442	for (i = 0; i < mbclen; ++i)
443	pstr->offsets[byte_idx + i] = src_idx + i;
444	}
445	src_idx += mbclen;
446
447	pstr->wcs[byte_idx++] = wcu;
448	/* Write paddings. */
449	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450	pstr->wcs[byte_idx++] = WEOF;
451	}
452	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
453	{
454	/* It is an invalid character or '\0'. Just use the byte. */
455	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
456
457	if (BE (pstr->trans != NULL, 0))
458	ch = pstr->trans [ch];
459	pstr->mbs[byte_idx] = ch;
460
461	if (BE (pstr->offsets_needed != 0, 0))
462	pstr->offsets[byte_idx] = src_idx;
463	++src_idx;
464
465	/* And also cast it to wide char. */
466	pstr->wcs[byte_idx++] = (wchar_t) ch;
467	if (BE (mbclen == (size_t) -1, 0))
468	pstr->cur_state = prev_st;
469	}
470	else
471	{
472	/* The buffer doesn't have enough space, finish to build. */
473	pstr->cur_state = prev_st;
474	break;
475	}
476	}
477	pstr->valid_len = byte_idx;
478	pstr->valid_raw_len = src_idx;
479	return REG_NOERROR;
480	}
481
482	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
483	Return the index. */
484
485	static Idx
486	internal_function
487	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
488	{
489	mbstate_t prev_st;
490	Idx rawbuf_idx;
491	size_t mbclen;
492	wint_t wc = WEOF;
493
494	/* Skip the characters which are not necessary to check. */
495	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
496	rawbuf_idx < new_raw_idx;)
497	{
498	wchar_t wc2;
499	Idx remain_len;
500	remain_len = pstr->len - rawbuf_idx;
501	prev_st = pstr->cur_state;
502	mbclen = mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503	remain_len, &pstr->cur_state);
504	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
505	{
506	/* We treat these cases as a single byte character. */
507	if (mbclen == 0 \|\| remain_len == 0)
508	wc = L'\0';
509	else
510	wc = (unsigned char ) (pstr->raw_mbs + rawbuf_idx);
511	mbclen = 1;
512	pstr->cur_state = prev_st;
513	}
514	else
515	wc = wc2;
516	/* Then proceed the next character. */
517	rawbuf_idx += mbclen;
518	}
519	*last_wc = wc;
520	return rawbuf_idx;
521	}
522	#endif /* RE_ENABLE_I18N */
523
524	/* Build the buffer PSTR->MBS, and apply the translation if we need.
525	This function is used in case of REG_ICASE. */
526
527	static void
528	internal_function
529	build_upper_buffer (re_string_t *pstr)
530	{
531	Idx char_idx, end_idx;
532	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535	{
536	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537	if (BE (pstr->trans != NULL, 0))
538	ch = pstr->trans[ch];
539	if (islower (ch))
540	pstr->mbs[char_idx] = toupper (ch);
541	else
542	pstr->mbs[char_idx] = ch;
543	}
544	pstr->valid_len = char_idx;
545	pstr->valid_raw_len = char_idx;
546	}
547
548	/* Apply TRANS to the buffer in PSTR. */
549
550	static void
551	internal_function
552	re_string_translate_buffer (re_string_t *pstr)
553	{
554	Idx buf_idx, end_idx;
555	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556
557	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558	{
559	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
560	pstr->mbs[buf_idx] = pstr->trans[ch];
561	}
562
563	pstr->valid_len = buf_idx;
564	pstr->valid_raw_len = buf_idx;
565	}
566
567	/* This function re-construct the buffers.
568	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
569	convert to upper case in case of REG_ICASE, apply translation. */
570
571	static reg_errcode_t
572	internal_function
573	re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
574	{
575	Idx offset;
576
577	if (BE (pstr->raw_mbs_idx <= idx, 0))
578	offset = idx - pstr->raw_mbs_idx;
579	else
580	{
581	/* Reset buffer. */
582	#ifdef RE_ENABLE_I18N
583	if (pstr->mb_cur_max > 1)
584	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585	#endif /* RE_ENABLE_I18N */
586	pstr->len = pstr->raw_len;
587	pstr->stop = pstr->raw_stop;
588	pstr->valid_len = 0;
589	pstr->raw_mbs_idx = 0;
590	pstr->valid_raw_len = 0;
591	pstr->offsets_needed = 0;
592	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
594	if (!pstr->mbs_allocated)
595	pstr->mbs = (unsigned char *) pstr->raw_mbs;
596	offset = idx;
597	}
598
599	if (BE (offset != 0, 1))
600	{
601	/* Should the already checked characters be kept? */
602	if (BE (offset < pstr->valid_raw_len, 1))
603	{
604	/* Yes, move them to the front of the buffer. */
605	#ifdef RE_ENABLE_I18N
606	if (BE (pstr->offsets_needed, 0))
607	{
608	Idx low = 0, high = pstr->valid_len, mid;
609	do
610	{
611	mid = (high + low) / 2;
612	if (pstr->offsets[mid] > offset)
613	high = mid;
614	else if (pstr->offsets[mid] < offset)
615	low = mid + 1;
616	else
617	break;
618	}
619	while (low < high);
620	if (pstr->offsets[mid] < offset)
621	++mid;
622	pstr->tip_context = re_string_context_at (pstr, mid - 1,
623	eflags);
624	/* This can be quite complicated, so handle specially
625	only the common and easy case where the character with
626	different length representation of lower and upper
627	case is present at or after offset. */
628	if (pstr->valid_len > offset
629	&& mid == offset && pstr->offsets[mid] == offset)
630	{
631	memmove (pstr->wcs, pstr->wcs + offset,
632	(pstr->valid_len - offset) * sizeof (wint_t));
633	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634	pstr->valid_len -= offset;
635	pstr->valid_raw_len -= offset;
636	for (low = 0; low < pstr->valid_len; low++)
637	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
638	}
639	else
640	{
641	/* Otherwise, just find out how long the partial multibyte
642	character at offset is and fill it with WEOF/255. */
643	pstr->len = pstr->raw_len - idx + offset;
644	pstr->stop = pstr->raw_stop - idx + offset;
645	pstr->offsets_needed = 0;
646	while (mid > 0 && pstr->offsets[mid - 1] == offset)
647	--mid;
648	while (mid < pstr->valid_len)
649	if (pstr->wcs[mid] != WEOF)
650	break;
651	else
652	++mid;
653	if (mid == pstr->valid_len)
654	pstr->valid_len = 0;
655	else
656	{
657	pstr->valid_len = pstr->offsets[mid] - offset;
658	if (pstr->valid_len)
659	{
660	for (low = 0; low < pstr->valid_len; ++low)
661	pstr->wcs[low] = WEOF;
662	memset (pstr->mbs, 255, pstr->valid_len);
663	}
664	}
665	pstr->valid_raw_len = pstr->valid_len;
666	}
667	}
668	else
669	#endif
670	{
671	pstr->tip_context = re_string_context_at (pstr, offset - 1,
672	eflags);
673	#ifdef RE_ENABLE_I18N
674	if (pstr->mb_cur_max > 1)
675	memmove (pstr->wcs, pstr->wcs + offset,
676	(pstr->valid_len - offset) * sizeof (wint_t));
677	#endif /* RE_ENABLE_I18N */
678	if (BE (pstr->mbs_allocated, 0))
679	memmove (pstr->mbs, pstr->mbs + offset,
680	pstr->valid_len - offset);
681	pstr->valid_len -= offset;
682	pstr->valid_raw_len -= offset;
683	#if DEBUG
684	assert (pstr->valid_len > 0);
685	#endif
686	}
687	}
688	else
689	{
690	/* No, skip all characters until IDX. */
691	Idx prev_valid_len = pstr->valid_len;
692
693	#ifdef RE_ENABLE_I18N
694	if (BE (pstr->offsets_needed, 0))
695	{
696	pstr->len = pstr->raw_len - idx + offset;
697	pstr->stop = pstr->raw_stop - idx + offset;
698	pstr->offsets_needed = 0;
699	}
700	#endif
701	pstr->valid_len = 0;
702	#ifdef RE_ENABLE_I18N
703	if (pstr->mb_cur_max > 1)
704	{
705	Idx wcs_idx;
706	wint_t wc = WEOF;
707
708	if (pstr->is_utf8)
709	{
710	const unsigned char raw, p, q, end;
711
712	/* Special case UTF-8. Multi-byte chars start with any
713	byte other than 0x80 - 0xbf. */
714	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715	end = raw + (offset - pstr->mb_cur_max);
716	if (end < pstr->raw_mbs)
717	end = pstr->raw_mbs;
718	p = raw + offset - 1;
719	#ifdef _LIBC
720	/* We know the wchar_t encoding is UCS4, so for the simple
721	case, ASCII characters, skip the conversion step. */
722	if (isascii (*p) && BE (pstr->trans == NULL, 1))
723	{
724	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725	/* pstr->valid_len = 0; */
726	wc = (wchar_t) *p;
727	}
728	else
729	#endif
730	for (; p >= end; --p)
731	if ((*p & 0xc0) != 0x80)
732	{
733	mbstate_t cur_state;
734	wchar_t wc2;
735	Idx mlen = raw + pstr->len - p;
736	unsigned char buf[6];
737	size_t mbclen;
738
739	q = p;
740	if (BE (pstr->trans != NULL, 0))
741	{
742	int i = mlen < 6 ? mlen : 6;
743	while (--i >= 0)
744	buf[i] = pstr->trans[p[i]];
745	q = buf;
746	}
747	/* XXX Don't use mbrtowc, we know which conversion
748	to use (UTF-8 -> UCS4). */
749	memset (&cur_state, 0, sizeof (cur_state));
750	mbclen = mbrtowc (&wc2, (const char *) p, mlen,
751	&cur_state);
752	if (raw + offset - p <= mbclen
753	&& mbclen < (size_t) -2)
754	{
755	memset (&pstr->cur_state, '\0',
756	sizeof (mbstate_t));
757	pstr->valid_len = mbclen - (raw + offset - p);
758	wc = wc2;
759	}
760	break;
761	}
762	}
763
764	if (wc == WEOF)
765	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766	if (wc == WEOF)
767	pstr->tip_context
768	= re_string_context_at (pstr, prev_valid_len - 1, eflags);
769	else
770	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771	&& IS_WIDE_WORD_CHAR (wc))
772	? CONTEXT_WORD
773	: ((IS_WIDE_NEWLINE (wc)
774	&& pstr->newline_anchor)
775	? CONTEXT_NEWLINE : 0));
776	if (BE (pstr->valid_len, 0))
777	{
778	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779	pstr->wcs[wcs_idx] = WEOF;
780	if (pstr->mbs_allocated)
781	memset (pstr->mbs, 255, pstr->valid_len);
782	}
783	pstr->valid_raw_len = pstr->valid_len;
784	}
785	else
786	#endif /* RE_ENABLE_I18N */
787	{
788	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789	pstr->valid_raw_len = 0;
790	if (pstr->trans)
791	c = pstr->trans[c];
792	pstr->tip_context = (bitset_contain (pstr->word_char, c)
793	? CONTEXT_WORD
794	: ((IS_NEWLINE (c) && pstr->newline_anchor)
795	? CONTEXT_NEWLINE : 0));
796	}
797	}
798	if (!BE (pstr->mbs_allocated, 0))
799	pstr->mbs += offset;
800	}
801	pstr->raw_mbs_idx = idx;
802	pstr->len -= offset;
803	pstr->stop -= offset;
804
805	/* Then build the buffers. */
806	#ifdef RE_ENABLE_I18N
807	if (pstr->mb_cur_max > 1)
808	{
809	if (pstr->icase)
810	{
811	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812	if (BE (ret != REG_NOERROR, 0))
813	return ret;
814	}
815	else
816	build_wcs_buffer (pstr);
817	}
818	else
819	#endif /* RE_ENABLE_I18N */
820	if (BE (pstr->mbs_allocated, 0))
821	{
822	if (pstr->icase)
823	build_upper_buffer (pstr);
824	else if (pstr->trans != NULL)
825	re_string_translate_buffer (pstr);
826	}
827	else
828	pstr->valid_len = pstr->len;
829
830	pstr->cur_idx = 0;
831	return REG_NOERROR;
832	}
833
834	static unsigned char
835	internal_function __attribute ((pure))
836	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837	{
838	int ch;
839	Idx off;
840
841	/* Handle the common (easiest) cases first. */
842	if (BE (!pstr->mbs_allocated, 1))
843	return re_string_peek_byte (pstr, idx);
844
845	#ifdef RE_ENABLE_I18N
846	if (pstr->mb_cur_max > 1
847	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848	return re_string_peek_byte (pstr, idx);
849	#endif
850
851	off = pstr->cur_idx + idx;
852	#ifdef RE_ENABLE_I18N
853	if (pstr->offsets_needed)
854	off = pstr->offsets[off];
855	#endif
856
857	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858
859	#ifdef RE_ENABLE_I18N
860	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861	this function returns CAPITAL LETTER I instead of first byte of
862	DOTLESS SMALL LETTER I. The latter would confuse the parser,
863	since peek_byte_case doesn't advance cur_idx in any way. */
864	if (pstr->offsets_needed && !isascii (ch))
865	return re_string_peek_byte (pstr, idx);
866	#endif
867
868	return ch;
869	}
870
871	static unsigned char
872	internal_function __attribute ((pure))
873	re_string_fetch_byte_case (re_string_t *pstr)
874	{
875	if (BE (!pstr->mbs_allocated, 1))
876	return re_string_fetch_byte (pstr);
877
878	#ifdef RE_ENABLE_I18N
879	if (pstr->offsets_needed)
880	{
881	Idx off;
882	int ch;
883
884	/* For tr_TR.UTF-8 [[:islower:]] there is
885	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886	in that case the whole multi-byte character and return
887	the original letter. On the other side, with
888	[[: DOTLESS SMALL LETTER I return [[:I, as doing
889	anything else would complicate things too much. */
890
891	if (!re_string_first_byte (pstr, pstr->cur_idx))
892	return re_string_fetch_byte (pstr);
893
894	off = pstr->offsets[pstr->cur_idx];
895	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897	if (! isascii (ch))
898	return re_string_fetch_byte (pstr);
899
900	re_string_skip_bytes (pstr,
901	re_string_char_size_at (pstr, pstr->cur_idx));
902	return ch;
903	}
904	#endif
905
906	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907	}
908
909	static void
910	internal_function
911	re_string_destruct (re_string_t *pstr)
912	{
913	#ifdef RE_ENABLE_I18N
914	re_free (pstr->wcs);
915	re_free (pstr->offsets);
916	#endif /* RE_ENABLE_I18N */
917	if (pstr->mbs_allocated)
918	re_free (pstr->mbs);
919	}
920
921	/* Return the context at IDX in INPUT. */
922
923	static unsigned int
924	internal_function
925	re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926	{
927	int c;
928	if (BE (! REG_VALID_INDEX (idx), 0))
929	/* In this case, we use the value stored in input->tip_context,
930	since we can't know the character in input->mbs[-1] here. */
931	return input->tip_context;
932	if (BE (idx == input->len, 0))
933	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
935	#ifdef RE_ENABLE_I18N
936	if (input->mb_cur_max > 1)
937	{
938	wint_t wc;
939	Idx wc_idx = idx;
940	while(input->wcs[wc_idx] == WEOF)
941	{
942	#ifdef DEBUG
943	/* It must not happen. */
944	assert (REG_VALID_INDEX (wc_idx));
945	#endif
946	--wc_idx;
947	if (! REG_VALID_INDEX (wc_idx))
948	return input->tip_context;
949	}
950	wc = input->wcs[wc_idx];
951	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952	return CONTEXT_WORD;
953	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954	? CONTEXT_NEWLINE : 0);
955	}
956	else
957	#endif
958	{
959	c = re_string_byte_at (input, idx);
960	if (bitset_contain (input->word_char, c))
961	return CONTEXT_WORD;
962	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963	}
964	}
965
966
967	/* Functions for set operation. */
968
969	static reg_errcode_t
970	internal_function
971	re_node_set_alloc (re_node_set *set, Idx size)
972	{
973	set->alloc = size;
974	set->nelem = 0;
975	set->elems = re_malloc (Idx, size);
976	if (BE (set->elems == NULL, 0))
977	return REG_ESPACE;
978	return REG_NOERROR;
979	}
980
981	static reg_errcode_t
982	internal_function
983	re_node_set_init_1 (re_node_set *set, Idx elem)
984	{
985	set->alloc = 1;
986	set->nelem = 1;
987	set->elems = re_malloc (Idx, 1);
988	if (BE (set->elems == NULL, 0))
989	{
990	set->alloc = set->nelem = 0;
991	return REG_ESPACE;
992	}
993	set->elems[0] = elem;
994	return REG_NOERROR;
995	}
996
997	static reg_errcode_t
998	internal_function
999	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000	{
1001	set->alloc = 2;
1002	set->elems = re_malloc (Idx, 2);
1003	if (BE (set->elems == NULL, 0))
1004	return REG_ESPACE;
1005	if (elem1 == elem2)
1006	{
1007	set->nelem = 1;
1008	set->elems[0] = elem1;
1009	}
1010	else
1011	{
1012	set->nelem = 2;
1013	if (elem1 < elem2)
1014	{
1015	set->elems[0] = elem1;
1016	set->elems[1] = elem2;
1017	}
1018	else
1019	{
1020	set->elems[0] = elem2;
1021	set->elems[1] = elem1;
1022	}
1023	}
1024	return REG_NOERROR;
1025	}
1026
1027	static reg_errcode_t
1028	internal_function
1029	re_node_set_init_copy (re_node_set dest, const re_node_set src)
1030	{
1031	dest->nelem = src->nelem;
1032	if (src->nelem > 0)
1033	{
1034	dest->alloc = dest->nelem;
1035	dest->elems = re_malloc (Idx, dest->alloc);
1036	if (BE (dest->elems == NULL, 0))
1037	{
1038	dest->alloc = dest->nelem = 0;
1039	return REG_ESPACE;
1040	}
1041	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042	}
1043	else
1044	re_node_set_init_empty (dest);
1045	return REG_NOERROR;
1046	}
1047
1048	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051
1052	static reg_errcode_t
1053	internal_function
1054	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
1055	const re_node_set *src2)
1056	{
1057	Idx i1, i2, is, id, delta, sbase;
1058	if (src1->nelem == 0 \|\| src2->nelem == 0)
1059	return REG_NOERROR;
1060
1061	/* We need dest->nelem + 2 * elems_in_intersection; this is a
1062	conservative estimate. */
1063	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064	{
1065	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067	if (BE (new_elems == NULL, 0))
1068	return REG_ESPACE;
1069	dest->elems = new_elems;
1070	dest->alloc = new_alloc;
1071	}
1072
1073	/* Find the items in the intersection of SRC1 and SRC2, and copy
1074	into the top of DEST those that are not already in DEST itself. */
1075	sbase = dest->nelem + src1->nelem + src2->nelem;
1076	i1 = src1->nelem - 1;
1077	i2 = src2->nelem - 1;
1078	id = dest->nelem - 1;
1079	for (;;)
1080	{
1081	if (src1->elems[i1] == src2->elems[i2])
1082	{
1083	/* Try to find the item in DEST. Maybe we could binary search? */
1084	while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085	--id;
1086
1087	if (! REG_VALID_INDEX (id) \|\| dest->elems[id] != src1->elems[i1])
1088	dest->elems[--sbase] = src1->elems[i1];
1089
1090	if (! REG_VALID_INDEX (--i1) \|\| ! REG_VALID_INDEX (--i2))
1091	break;
1092	}
1093
1094	/* Lower the highest of the two items. */
1095	else if (src1->elems[i1] < src2->elems[i2])
1096	{
1097	if (! REG_VALID_INDEX (--i2))
1098	break;
1099	}
1100	else
1101	{
1102	if (! REG_VALID_INDEX (--i1))
1103	break;
1104	}
1105	}
1106
1107	id = dest->nelem - 1;
1108	is = dest->nelem + src1->nelem + src2->nelem - 1;
1109	delta = is - sbase + 1;
1110
1111	/* Now copy. When DELTA becomes zero, the remaining
1112	DEST elements are already in place; this is more or
1113	less the same loop that is in re_node_set_merge. */
1114	dest->nelem += delta;
1115	if (delta > 0 && REG_VALID_INDEX (id))
1116	for (;;)
1117	{
1118	if (dest->elems[is] > dest->elems[id])
1119	{
1120	/* Copy from the top. */
1121	dest->elems[id + delta--] = dest->elems[is--];
1122	if (delta == 0)
1123	break;
1124	}
1125	else
1126	{
1127	/* Slide from the bottom. */
1128	dest->elems[id + delta] = dest->elems[id];
1129	if (! REG_VALID_INDEX (--id))
1130	break;
1131	}
1132	}
1133
1134	/* Copy remaining SRC elements. */
1135	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137	return REG_NOERROR;
1138	}
1139
1140	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142
1143	static reg_errcode_t
1144	internal_function
1145	re_node_set_init_union (re_node_set dest, const re_node_set src1,
1146	const re_node_set *src2)
1147	{
1148	Idx i1, i2, id;
1149	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150	{
1151	dest->alloc = src1->nelem + src2->nelem;
1152	dest->elems = re_malloc (Idx, dest->alloc);
1153	if (BE (dest->elems == NULL, 0))
1154	return REG_ESPACE;
1155	}
1156	else
1157	{
1158	if (src1 != NULL && src1->nelem > 0)
1159	return re_node_set_init_copy (dest, src1);
1160	else if (src2 != NULL && src2->nelem > 0)
1161	return re_node_set_init_copy (dest, src2);
1162	else
1163	re_node_set_init_empty (dest);
1164	return REG_NOERROR;
1165	}
1166	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167	{
1168	if (src1->elems[i1] > src2->elems[i2])
1169	{
1170	dest->elems[id++] = src2->elems[i2++];
1171	continue;
1172	}
1173	if (src1->elems[i1] == src2->elems[i2])
1174	++i2;
1175	dest->elems[id++] = src1->elems[i1++];
1176	}
1177	if (i1 < src1->nelem)
1178	{
1179	memcpy (dest->elems + id, src1->elems + i1,
1180	(src1->nelem - i1) * sizeof (Idx));
1181	id += src1->nelem - i1;
1182	}
1183	else if (i2 < src2->nelem)
1184	{
1185	memcpy (dest->elems + id, src2->elems + i2,
1186	(src2->nelem - i2) * sizeof (Idx));
1187	id += src2->nelem - i2;
1188	}
1189	dest->nelem = id;
1190	return REG_NOERROR;
1191	}
1192
1193	/* Calculate the union set of the sets DEST and SRC. And store it to
1194	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195
1196	static reg_errcode_t
1197	internal_function
1198	re_node_set_merge (re_node_set dest, const re_node_set src)
1199	{
1200	Idx is, id, sbase, delta;
1201	if (src == NULL \|\| src->nelem == 0)
1202	return REG_NOERROR;
1203	if (dest->alloc < 2 * src->nelem + dest->nelem)
1204	{
1205	Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207	if (BE (new_buffer == NULL, 0))
1208	return REG_ESPACE;
1209	dest->elems = new_buffer;
1210	dest->alloc = new_alloc;
1211	}
1212
1213	if (BE (dest->nelem == 0, 0))
1214	{
1215	dest->nelem = src->nelem;
1216	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217	return REG_NOERROR;
1218	}
1219
1220	/* Copy into the top of DEST the items of SRC that are not
1221	found in DEST. Maybe we could binary search in DEST? */
1222	for (sbase = dest->nelem + 2 * src->nelem,
1223	is = src->nelem - 1, id = dest->nelem - 1;
1224	REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225	{
1226	if (dest->elems[id] == src->elems[is])
1227	is--, id--;
1228	else if (dest->elems[id] < src->elems[is])
1229	dest->elems[--sbase] = src->elems[is--];
1230	else /* if (dest->elems[id] > src->elems[is]) */
1231	--id;
1232	}
1233
1234	if (REG_VALID_INDEX (is))
1235	{
1236	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1237	sbase -= is + 1;
1238	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239	}
1240
1241	id = dest->nelem - 1;
1242	is = dest->nelem + 2 * src->nelem - 1;
1243	delta = is - sbase + 1;
1244	if (delta == 0)
1245	return REG_NOERROR;
1246
1247	/* Now copy. When DELTA becomes zero, the remaining
1248	DEST elements are already in place. */
1249	dest->nelem += delta;
1250	for (;;)
1251	{
1252	if (dest->elems[is] > dest->elems[id])
1253	{
1254	/* Copy from the top. */
1255	dest->elems[id + delta--] = dest->elems[is--];
1256	if (delta == 0)
1257	break;
1258	}
1259	else
1260	{
1261	/* Slide from the bottom. */
1262	dest->elems[id + delta] = dest->elems[id];
1263	if (! REG_VALID_INDEX (--id))
1264	{
1265	/* Copy remaining SRC elements. */
1266	memcpy (dest->elems, dest->elems + sbase,
1267	delta * sizeof (Idx));
1268	break;
1269	}
1270	}
1271	}
1272
1273	return REG_NOERROR;
1274	}
1275
1276	/* Insert the new element ELEM to the re_node_set* SET.
1277	SET should not already have ELEM.
1278	Return true if successful. */
1279
1280	static bool
1281	internal_function
1282	re_node_set_insert (re_node_set *set, Idx elem)
1283	{
1284	Idx idx;
1285	/* In case the set is empty. */
1286	if (set->alloc == 0)
1287	return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289	if (BE (set->nelem, 0) == 0)
1290	{
1291	/* We already guaranteed above that set->alloc != 0. */
1292	set->elems[0] = elem;
1293	++set->nelem;
1294	return true;
1295	}
1296
1297	/* Realloc if we need. */
1298	if (set->alloc == set->nelem)
1299	{
1300	Idx *new_elems;
1301	set->alloc = set->alloc * 2;
1302	new_elems = re_realloc (set->elems, Idx, set->alloc);
1303	if (BE (new_elems == NULL, 0))
1304	return false;
1305	set->elems = new_elems;
1306	}
1307
1308	/* Move the elements which follows the new element. Test the
1309	first element separately to skip a check in the inner loop. */
1310	if (elem < set->elems[0])
1311	{
1312	idx = 0;
1313	for (idx = set->nelem; idx > 0; idx--)
1314	set->elems[idx] = set->elems[idx - 1];
1315	}
1316	else
1317	{
1318	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319	set->elems[idx] = set->elems[idx - 1];
1320	}
1321
1322	/* Insert the new element. */
1323	set->elems[idx] = elem;
1324	++set->nelem;
1325	return true;
1326	}
1327
1328	/* Insert the new element ELEM to the re_node_set* SET.
1329	SET should not already have any element greater than or equal to ELEM.
1330	Return true if successful. */
1331
1332	static bool
1333	internal_function
1334	re_node_set_insert_last (re_node_set *set, Idx elem)
1335	{
1336	/* Realloc if we need. */
1337	if (set->alloc == set->nelem)
1338	{
1339	Idx *new_elems;
1340	set->alloc = (set->alloc + 1) * 2;
1341	new_elems = re_realloc (set->elems, Idx, set->alloc);
1342	if (BE (new_elems == NULL, 0))
1343	return false;
1344	set->elems = new_elems;
1345	}
1346
1347	/* Insert the new element. */
1348	set->elems[set->nelem++] = elem;
1349	return true;
1350	}
1351
1352	/* Compare two node sets SET1 and SET2.
1353	Return true if SET1 and SET2 are equivalent. */
1354
1355	static bool
1356	internal_function __attribute ((pure))
1357	re_node_set_compare (const re_node_set set1, const re_node_set set2)
1358	{
1359	Idx i;
1360	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1361	return false;
1362	for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363	if (set1->elems[i] != set2->elems[i])
1364	return false;
1365	return true;
1366	}
1367
1368	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369
1370	static Idx
1371	internal_function __attribute ((pure))
1372	re_node_set_contains (const re_node_set *set, Idx elem)
1373	{
1374	__re_size_t idx, right, mid;
1375	if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376	return 0;
1377
1378	/* Binary search the element. */
1379	idx = 0;
1380	right = set->nelem - 1;
1381	while (idx < right)
1382	{
1383	mid = (idx + right) / 2;
1384	if (set->elems[mid] < elem)
1385	idx = mid + 1;
1386	else
1387	right = mid;
1388	}
1389	return set->elems[idx] == elem ? idx + 1 : 0;
1390	}
1391
1392	static void
1393	internal_function
1394	re_node_set_remove_at (re_node_set *set, Idx idx)
1395	{
1396	if (idx < 0 \|\| idx >= set->nelem)
1397	return;
1398	--set->nelem;
1399	for (; idx < set->nelem; idx++)
1400	set->elems[idx] = set->elems[idx + 1];
1401	}
1402
1403
1404
1405	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406	Or return REG_MISSING if an error occurred. */
1407
1408	static Idx
1409	internal_function
1410	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1411	{
1412	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413	{
1414	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1415	Idx new_nexts, new_indices;
1416	re_node_set new_edests, new_eclosures;
1417	re_token_t *new_nodes;
1418	size_t max_object_size =
1419	MAX (sizeof (re_token_t),
1420	MAX (sizeof (re_node_set),
1421	sizeof (Idx)));
1422
1423	/* Avoid overflows. */
1424	if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1425	return REG_MISSING;
1426
1427	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1428	if (BE (new_nodes == NULL, 0))
1429	return REG_MISSING;
1430	dfa->nodes = new_nodes;
1431	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1432	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1433	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1434	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1435	if (BE (new_nexts == NULL \|\| new_indices == NULL
1436	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
1437	return REG_MISSING;
1438	dfa->nexts = new_nexts;
1439	dfa->org_indices = new_indices;
1440	dfa->edests = new_edests;
1441	dfa->eclosures = new_eclosures;
1442	dfa->nodes_alloc = new_nodes_alloc;
1443	}
1444	dfa->nodes[dfa->nodes_len] = token;
1445	dfa->nodes[dfa->nodes_len].constraint = 0;
1446	#ifdef RE_ENABLE_I18N
1447	{
1448	int type = token.type;
1449	dfa->nodes[dfa->nodes_len].accept_mb =
1450	(type == OP_PERIOD && dfa->mb_cur_max > 1) \|\| type == COMPLEX_BRACKET;
1451	}
1452	#endif
1453	dfa->nexts[dfa->nodes_len] = REG_MISSING;
1454	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1455	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1456	return dfa->nodes_len++;
1457	}
1458
1459	static inline re_hashval_t
1460	internal_function
1461	calc_state_hash (const re_node_set *nodes, unsigned int context)
1462	{
1463	re_hashval_t hash = nodes->nelem + context;
1464	Idx i;
1465	for (i = 0 ; i < nodes->nelem ; i++)
1466	hash += nodes->elems[i];
1467	return hash;
1468	}
1469
1470	/* Search for the state whose node_set is equivalent to NODES.
1471	Return the pointer to the state, if we found it in the DFA.
1472	Otherwise create the new one and return it. In case of an error
1473	return NULL and set the error code in ERR.
1474	Note: - We assume NULL as the invalid state, then it is possible that
1475	return value is NULL and ERR is REG_NOERROR.
1476	- We never return non-NULL value in case of any errors, it is for
1477	optimization. */
1478
1479	static re_dfastate_t *
1480	internal_function
1481	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
1482	const re_node_set *nodes)
1483	{
1484	re_hashval_t hash;
1485	re_dfastate_t *new_state;
1486	struct re_state_table_entry *spot;
1487	Idx i;
1488	#ifdef lint
1489	/* Suppress bogus uninitialized-variable warnings. */
1490	*err = REG_NOERROR;
1491	#endif
1492	if (BE (nodes->nelem == 0, 0))
1493	{
1494	*err = REG_NOERROR;
1495	return NULL;
1496	}
1497	hash = calc_state_hash (nodes, 0);
1498	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1499
1500	for (i = 0 ; i < spot->num ; i++)
1501	{
1502	re_dfastate_t *state = spot->array[i];
1503	if (hash != state->hash)
1504	continue;
1505	if (re_node_set_compare (&state->nodes, nodes))
1506	return state;
1507	}
1508
1509	/* There are no appropriate state in the dfa, create the new one. */
1510	new_state = create_ci_newstate (dfa, nodes, hash);
1511	if (BE (new_state == NULL, 0))
1512	*err = REG_ESPACE;
1513
1514	return new_state;
1515	}
1516
1517	/* Search for the state whose node_set is equivalent to NODES and
1518	whose context is equivalent to CONTEXT.
1519	Return the pointer to the state, if we found it in the DFA.
1520	Otherwise create the new one and return it. In case of an error
1521	return NULL and set the error code in ERR.
1522	Note: - We assume NULL as the invalid state, then it is possible that
1523	return value is NULL and ERR is REG_NOERROR.
1524	- We never return non-NULL value in case of any errors, it is for
1525	optimization. */
1526
1527	static re_dfastate_t *
1528	internal_function
1529	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
1530	const re_node_set *nodes, unsigned int context)
1531	{
1532	re_hashval_t hash;
1533	re_dfastate_t *new_state;
1534	struct re_state_table_entry *spot;
1535	Idx i;
1536	#ifdef lint
1537	/* Suppress bogus uninitialized-variable warnings. */
1538	*err = REG_NOERROR;
1539	#endif
1540	if (nodes->nelem == 0)
1541	{
1542	*err = REG_NOERROR;
1543	return NULL;
1544	}
1545	hash = calc_state_hash (nodes, context);
1546	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548	for (i = 0 ; i < spot->num ; i++)
1549	{
1550	re_dfastate_t *state = spot->array[i];
1551	if (state->hash == hash
1552	&& state->context == context
1553	&& re_node_set_compare (state->entrance_nodes, nodes))
1554	return state;
1555	}
1556	/* There are no appropriate state in `dfa', create the new one. */
1557	new_state = create_cd_newstate (dfa, nodes, context, hash);
1558	if (BE (new_state == NULL, 0))
1559	*err = REG_ESPACE;
1560
1561	return new_state;
1562	}
1563
1564	/* Finish initialization of the new state NEWSTATE, and using its hash value
1565	HASH put in the appropriate bucket of DFA's state table. Return value
1566	indicates the error code if failed. */
1567
1568	static reg_errcode_t
1569	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1570	re_hashval_t hash)
1571	{
1572	struct re_state_table_entry *spot;
1573	reg_errcode_t err;
1574	Idx i;
1575
1576	newstate->hash = hash;
1577	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1578	if (BE (err != REG_NOERROR, 0))
1579	return REG_ESPACE;
1580	for (i = 0; i < newstate->nodes.nelem; i++)
1581	{
1582	Idx elem = newstate->nodes.elems[i];
1583	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1584	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1585	return REG_ESPACE;
1586	}
1587
1588	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1589	if (BE (spot->alloc <= spot->num, 0))
1590	{
1591	Idx new_alloc = 2 * spot->num + 2;
1592	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1593	new_alloc);
1594	if (BE (new_array == NULL, 0))
1595	return REG_ESPACE;
1596	spot->array = new_array;
1597	spot->alloc = new_alloc;
1598	}
1599	spot->array[spot->num++] = newstate;
1600	return REG_NOERROR;
1601	}
1602
1603	static void
1604	free_state (re_dfastate_t *state)
1605	{
1606	re_node_set_free (&state->non_eps_nodes);
1607	re_node_set_free (&state->inveclosure);
1608	if (state->entrance_nodes != &state->nodes)
1609	{
1610	re_node_set_free (state->entrance_nodes);
1611	re_free (state->entrance_nodes);
1612	}
1613	re_node_set_free (&state->nodes);
1614	re_free (state->word_trtable);
1615	re_free (state->trtable);
1616	re_free (state);
1617	}
1618
1619	/* Create the new state which is independ of contexts.
1620	Return the new state if succeeded, otherwise return NULL. */
1621
1622	static re_dfastate_t *
1623	internal_function
1624	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
1625	re_hashval_t hash)
1626	{
1627	Idx i;
1628	reg_errcode_t err;
1629	re_dfastate_t *newstate;
1630
1631	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1632	if (BE (newstate == NULL, 0))
1633	return NULL;
1634	err = re_node_set_init_copy (&newstate->nodes, nodes);
1635	if (BE (err != REG_NOERROR, 0))
1636	{
1637	re_free (newstate);
1638	return NULL;
1639	}
1640
1641	newstate->entrance_nodes = &newstate->nodes;
1642	for (i = 0 ; i < nodes->nelem ; i++)
1643	{
1644	re_token_t *node = dfa->nodes + nodes->elems[i];
1645	re_token_type_t type = node->type;
1646	if (type == CHARACTER && !node->constraint)
1647	continue;
1648	#ifdef RE_ENABLE_I18N
1649	newstate->accept_mb \|= node->accept_mb;
1650	#endif /* RE_ENABLE_I18N */
1651
1652	/* If the state has the halt node, the state is a halt state. */
1653	if (type == END_OF_RE)
1654	newstate->halt = 1;
1655	else if (type == OP_BACK_REF)
1656	newstate->has_backref = 1;
1657	else if (type == ANCHOR \|\| node->constraint)
1658	newstate->has_constraint = 1;
1659	}
1660	err = register_state (dfa, newstate, hash);
1661	if (BE (err != REG_NOERROR, 0))
1662	{
1663	free_state (newstate);
1664	newstate = NULL;
1665	}
1666	return newstate;
1667	}
1668
1669	/* Create the new state which is depend on the context CONTEXT.
1670	Return the new state if succeeded, otherwise return NULL. */
1671
1672	static re_dfastate_t *
1673	internal_function
1674	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
1675	unsigned int context, re_hashval_t hash)
1676	{
1677	Idx i, nctx_nodes = 0;
1678	reg_errcode_t err;
1679	re_dfastate_t *newstate;
1680
1681	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1682	if (BE (newstate == NULL, 0))
1683	return NULL;
1684	err = re_node_set_init_copy (&newstate->nodes, nodes);
1685	if (BE (err != REG_NOERROR, 0))
1686	{
1687	re_free (newstate);
1688	return NULL;
1689	}
1690
1691	newstate->context = context;
1692	newstate->entrance_nodes = &newstate->nodes;
1693
1694	for (i = 0 ; i < nodes->nelem ; i++)
1695	{
1696	unsigned int constraint = 0;
1697	re_token_t *node = dfa->nodes + nodes->elems[i];
1698	re_token_type_t type = node->type;
1699	if (node->constraint)
1700	constraint = node->constraint;
1701
1702	if (type == CHARACTER && !constraint)
1703	continue;
1704	#ifdef RE_ENABLE_I18N
1705	newstate->accept_mb \|= node->accept_mb;
1706	#endif /* RE_ENABLE_I18N */
1707
1708	/* If the state has the halt node, the state is a halt state. */
1709	if (type == END_OF_RE)
1710	newstate->halt = 1;
1711	else if (type == OP_BACK_REF)
1712	newstate->has_backref = 1;
1713	else if (type == ANCHOR)
1714	constraint = node->opr.ctx_type;
1715
1716	if (constraint)
1717	{
1718	if (newstate->entrance_nodes == &newstate->nodes)
1719	{
1720	newstate->entrance_nodes = re_malloc (re_node_set, 1);
1721	if (BE (newstate->entrance_nodes == NULL, 0))
1722	{
1723	free_state (newstate);
1724	return NULL;
1725	}
1726	re_node_set_init_copy (newstate->entrance_nodes, nodes);
1727	nctx_nodes = 0;
1728	newstate->has_constraint = 1;
1729	}
1730
1731	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1732	{
1733	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1734	++nctx_nodes;
1735	}
1736	}
1737	}
1738	err = register_state (dfa, newstate, hash);
1739	if (BE (err != REG_NOERROR, 0))
1740	{
1741	free_state (newstate);
1742	newstate = NULL;
1743	}
1744	return newstate;
1745	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/app-arch/tar/lib/regex_internal.c

Download in other formats: