Context Navigation

regex_internal.c

Visit:

Last change on this file was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 44.0 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, write to the Free
18	Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19	02110-1301 USA. */
20
21	static void re_string_construct_common (const char *str, int len,
22	re_string_t *pstr,
23	RE_TRANSLATE_TYPE trans, int icase,
24	const re_dfa_t *dfa) internal_function;
25	#ifdef RE_ENABLE_I18N
26	static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27	wint_t *last_wc) internal_function;
28	#endif /* RE_ENABLE_I18N */
29	static reg_errcode_t register_state (re_dfa_t dfa, re_dfastate_t newstate,
30	unsigned int hash) internal_function;
31	static re_dfastate_t create_ci_newstate (re_dfa_t dfa,
32	const re_node_set *nodes,
33	unsigned int hash) internal_function;
34	static re_dfastate_t create_cd_newstate (re_dfa_t dfa,
35	const re_node_set *nodes,
36	unsigned int context,
37	unsigned int hash) internal_function;
38	static unsigned int inline calc_state_hash (const re_node_set *nodes,
39	unsigned int context) internal_function;
40
41
42	/* Functions for string operation. */
43
44	/* This function allocate the buffers. It is necessary to call
45	re_string_reconstruct before using the object. */
46
47	static reg_errcode_t
48	re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
49	re_string_t *pstr;
50	const char *str;
51	int len, init_len, icase;
52	RE_TRANSLATE_TYPE trans;
53	const re_dfa_t *dfa;
54	{
55	reg_errcode_t ret;
56	int init_buf_len;
57
58	/* Ensure at least one character fits into the buffers. */
59	if (init_len < dfa->mb_cur_max)
60	init_len = dfa->mb_cur_max;
61	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
62	re_string_construct_common (str, len, pstr, trans, icase, dfa);
63
64	ret = re_string_realloc_buffers (pstr, init_buf_len);
65	if (BE (ret != REG_NOERROR, 0))
66	return ret;
67
68	pstr->word_char = dfa->word_char;
69	pstr->word_ops_used = dfa->word_ops_used;
70	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
71	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
72	pstr->valid_raw_len = pstr->valid_len;
73	return REG_NOERROR;
74	}
75
76	/* This function allocate the buffers, and initialize them. */
77
78	static reg_errcode_t
79	re_string_construct (pstr, str, len, trans, icase, dfa)
80	re_string_t *pstr;
81	const char *str;
82	int len, icase;
83	RE_TRANSLATE_TYPE trans;
84	const re_dfa_t *dfa;
85	{
86	reg_errcode_t ret;
87	memset (pstr, '\0', sizeof (re_string_t));
88	re_string_construct_common (str, len, pstr, trans, icase, dfa);
89
90	if (len > 0)
91	{
92	ret = re_string_realloc_buffers (pstr, len + 1);
93	if (BE (ret != REG_NOERROR, 0))
94	return ret;
95	}
96	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
97
98	if (icase)
99	{
100	#ifdef RE_ENABLE_I18N
101	if (dfa->mb_cur_max > 1)
102	{
103	while (1)
104	{
105	ret = build_wcs_upper_buffer (pstr);
106	if (BE (ret != REG_NOERROR, 0))
107	return ret;
108	if (pstr->valid_raw_len >= len)
109	break;
110	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
111	break;
112	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
113	if (BE (ret != REG_NOERROR, 0))
114	return ret;
115	}
116	}
117	else
118	#endif /* RE_ENABLE_I18N */
119	build_upper_buffer (pstr);
120	}
121	else
122	{
123	#ifdef RE_ENABLE_I18N
124	if (dfa->mb_cur_max > 1)
125	build_wcs_buffer (pstr);
126	else
127	#endif /* RE_ENABLE_I18N */
128	{
129	if (trans != NULL)
130	re_string_translate_buffer (pstr);
131	else
132	{
133	pstr->valid_len = pstr->bufs_len;
134	pstr->valid_raw_len = pstr->bufs_len;
135	}
136	}
137	}
138
139	return REG_NOERROR;
140	}
141
142	/* Helper functions for re_string_allocate, and re_string_construct. */
143
144	static reg_errcode_t
145	re_string_realloc_buffers (pstr, new_buf_len)
146	re_string_t *pstr;
147	int new_buf_len;
148	{
149	#ifdef RE_ENABLE_I18N
150	if (pstr->mb_cur_max > 1)
151	{
152	wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
153	if (BE (new_array == NULL, 0))
154	return REG_ESPACE;
155	pstr->wcs = new_array;
156	if (pstr->offsets != NULL)
157	{
158	int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
159	if (BE (new_array == NULL, 0))
160	return REG_ESPACE;
161	pstr->offsets = new_array;
162	}
163	}
164	#endif /* RE_ENABLE_I18N */
165	if (pstr->mbs_allocated)
166	{
167	unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
168	new_buf_len);
169	if (BE (new_array == NULL, 0))
170	return REG_ESPACE;
171	pstr->mbs = new_array;
172	}
173	pstr->bufs_len = new_buf_len;
174	return REG_NOERROR;
175	}
176
177
178	static void
179	re_string_construct_common (str, len, pstr, trans, icase, dfa)
180	const char *str;
181	int len;
182	re_string_t *pstr;
183	RE_TRANSLATE_TYPE trans;
184	int icase;
185	const re_dfa_t *dfa;
186	{
187	pstr->raw_mbs = (const unsigned char *) str;
188	pstr->len = len;
189	pstr->raw_len = len;
190	pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
191	pstr->icase = icase ? 1 : 0;
192	pstr->mbs_allocated = (trans != NULL \|\| icase);
193	pstr->mb_cur_max = dfa->mb_cur_max;
194	pstr->is_utf8 = dfa->is_utf8;
195	pstr->map_notascii = dfa->map_notascii;
196	pstr->stop = pstr->len;
197	pstr->raw_stop = pstr->stop;
198	}
199
200	#ifdef RE_ENABLE_I18N
201
202	/* Build wide character buffer PSTR->WCS.
203	If the byte sequence of the string are:
204	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
205	Then wide character buffer will be:
206	<wc1> , WEOF , <wc2> , WEOF , <wc3>
207	We use WEOF for padding, they indicate that the position isn't
208	a first byte of a multibyte character.
209
210	Note that this function assumes PSTR->VALID_LEN elements are already
211	built and starts from PSTR->VALID_LEN. */
212
213	static void
214	build_wcs_buffer (pstr)
215	re_string_t *pstr;
216	{
217	#ifdef _LIBC
218	unsigned char buf[pstr->mb_cur_max];
219	#else
220	unsigned char buf[64];
221	#endif
222	mbstate_t prev_st;
223	int byte_idx, end_idx, mbclen, remain_len;
224
225	/* Build the buffers from pstr->valid_len to either pstr->len or
226	pstr->bufs_len. */
227	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
228	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
229	{
230	wchar_t wc;
231	const char *p;
232
233	remain_len = end_idx - byte_idx;
234	prev_st = pstr->cur_state;
235	/* Apply the translation if we need. */
236	if (BE (pstr->trans != NULL, 0))
237	{
238	int i, ch;
239
240	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
241	{
242	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
243	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
244	}
245	p = (const char *) buf;
246	}
247	else
248	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
249	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
250	if (BE (mbclen == (size_t) -2, 0))
251	{
252	/* The buffer doesn't have enough space, finish to build. */
253	pstr->cur_state = prev_st;
254	break;
255	}
256	else if (BE (mbclen == (size_t) -1 \|\| mbclen == 0, 0))
257	{
258	/* We treat these cases as a singlebyte character. */
259	mbclen = 1;
260	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
261	if (BE (pstr->trans != NULL, 0))
262	wc = pstr->trans[wc];
263	pstr->cur_state = prev_st;
264	}
265
266	/* Write wide character and padding. */
267	pstr->wcs[byte_idx++] = wc;
268	/* Write paddings. */
269	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
270	pstr->wcs[byte_idx++] = WEOF;
271	}
272	pstr->valid_len = byte_idx;
273	pstr->valid_raw_len = byte_idx;
274	}
275
276	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
277	but for REG_ICASE. */
278
279	static int
280	build_wcs_upper_buffer (pstr)
281	re_string_t *pstr;
282	{
283	mbstate_t prev_st;
284	int src_idx, byte_idx, end_idx, mbclen, remain_len;
285	#ifdef _LIBC
286	unsigned char buf[pstr->mb_cur_max];
287	#else
288	unsigned char buf[64];
289	#endif
290
291	byte_idx = pstr->valid_len;
292	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
293
294	/* The following optimization assumes that ASCII characters can be
295	mapped to wide characters with a simple cast. */
296	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
297	{
298	while (byte_idx < end_idx)
299	{
300	wchar_t wc;
301
302	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
303	&& mbsinit (&pstr->cur_state))
304	{
305	/* In case of a singlebyte character. */
306	pstr->mbs[byte_idx]
307	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
308	/* The next step uses the assumption that wchar_t is encoded
309	ASCII-safe: all ASCII values can be converted like this. */
310	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
311	++byte_idx;
312	continue;
313	}
314
315	remain_len = end_idx - byte_idx;
316	prev_st = pstr->cur_state;
317	mbclen = mbrtowc (&wc,
318	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
319	+ byte_idx), remain_len, &pstr->cur_state);
320	if (BE (mbclen > 0, 1))
321	{
322	wchar_t wcu = wc;
323	if (iswlower (wc))
324	{
325	int mbcdlen;
326
327	wcu = towupper (wc);
328	mbcdlen = wcrtomb ((char *)buf, wcu, &prev_st);
329	if (BE (mbclen == mbcdlen, 1))
330	memcpy (pstr->mbs + byte_idx, buf, mbclen);
331	else
332	{
333	src_idx = byte_idx;
334	goto offsets_needed;
335	}
336	}
337	else
338	memcpy (pstr->mbs + byte_idx,
339	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
340	pstr->wcs[byte_idx++] = wcu;
341	/* Write paddings. */
342	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
343	pstr->wcs[byte_idx++] = WEOF;
344	}
345	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
346	{
347	/* It is an invalid character or '\0'. Just use the byte. */
348	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
349	pstr->mbs[byte_idx] = ch;
350	/* And also cast it to wide char. */
351	pstr->wcs[byte_idx++] = (wchar_t) ch;
352	if (BE (mbclen == (size_t) -1, 0))
353	pstr->cur_state = prev_st;
354	}
355	else
356	{
357	/* The buffer doesn't have enough space, finish to build. */
358	pstr->cur_state = prev_st;
359	break;
360	}
361	}
362	pstr->valid_len = byte_idx;
363	pstr->valid_raw_len = byte_idx;
364	return REG_NOERROR;
365	}
366	else
367	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
368	{
369	wchar_t wc;
370	const char *p;
371
372	offsets_needed:
373	remain_len = end_idx - byte_idx;
374	prev_st = pstr->cur_state;
375	if (BE (pstr->trans != NULL, 0))
376	{
377	int i, ch;
378
379	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
380	{
381	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
382	buf[i] = pstr->trans[ch];
383	}
384	p = (const char *) buf;
385	}
386	else
387	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
388	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
389	if (BE (mbclen > 0, 1))
390	{
391	wchar_t wcu = wc;
392	if (iswlower (wc))
393	{
394	int mbcdlen;
395
396	wcu = towupper (wc);
397	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
398	if (BE (mbclen == mbcdlen, 1))
399	memcpy (pstr->mbs + byte_idx, buf, mbclen);
400	else
401	{
402	int i;
403
404	if (byte_idx + mbcdlen > pstr->bufs_len)
405	{
406	pstr->cur_state = prev_st;
407	break;
408	}
409
410	if (pstr->offsets == NULL)
411	{
412	pstr->offsets = re_malloc (int, pstr->bufs_len);
413
414	if (pstr->offsets == NULL)
415	return REG_ESPACE;
416	}
417	if (!pstr->offsets_needed)
418	{
419	for (i = 0; i < byte_idx; ++i)
420	pstr->offsets[i] = i;
421	pstr->offsets_needed = 1;
422	}
423
424	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
425	pstr->wcs[byte_idx] = wcu;
426	pstr->offsets[byte_idx] = src_idx;
427	for (i = 1; i < mbcdlen; ++i)
428	{
429	pstr->offsets[byte_idx + i]
430	= src_idx + (i < mbclen ? i : mbclen - 1);
431	pstr->wcs[byte_idx + i] = WEOF;
432	}
433	pstr->len += mbcdlen - mbclen;
434	if (pstr->raw_stop > src_idx)
435	pstr->stop += mbcdlen - mbclen;
436	end_idx = (pstr->bufs_len > pstr->len)
437	? pstr->len : pstr->bufs_len;
438	byte_idx += mbcdlen;
439	src_idx += mbclen;
440	continue;
441	}
442	}
443	else
444	memcpy (pstr->mbs + byte_idx, p, mbclen);
445
446	if (BE (pstr->offsets_needed != 0, 0))
447	{
448	int i;
449	for (i = 0; i < mbclen; ++i)
450	pstr->offsets[byte_idx + i] = src_idx + i;
451	}
452	src_idx += mbclen;
453
454	pstr->wcs[byte_idx++] = wcu;
455	/* Write paddings. */
456	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
457	pstr->wcs[byte_idx++] = WEOF;
458	}
459	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
460	{
461	/* It is an invalid character or '\0'. Just use the byte. */
462	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
463
464	if (BE (pstr->trans != NULL, 0))
465	ch = pstr->trans [ch];
466	pstr->mbs[byte_idx] = ch;
467
468	if (BE (pstr->offsets_needed != 0, 0))
469	pstr->offsets[byte_idx] = src_idx;
470	++src_idx;
471
472	/* And also cast it to wide char. */
473	pstr->wcs[byte_idx++] = (wchar_t) ch;
474	if (BE (mbclen == (size_t) -1, 0))
475	pstr->cur_state = prev_st;
476	}
477	else
478	{
479	/* The buffer doesn't have enough space, finish to build. */
480	pstr->cur_state = prev_st;
481	break;
482	}
483	}
484	pstr->valid_len = byte_idx;
485	pstr->valid_raw_len = src_idx;
486	return REG_NOERROR;
487	}
488
489	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
490	Return the index. */
491
492	static int
493	re_string_skip_chars (pstr, new_raw_idx, last_wc)
494	re_string_t *pstr;
495	int new_raw_idx;
496	wint_t *last_wc;
497	{
498	mbstate_t prev_st;
499	int rawbuf_idx, mbclen;
500	wchar_t wc = 0;
501
502	/* Skip the characters which are not necessary to check. */
503	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
504	rawbuf_idx < new_raw_idx;)
505	{
506	int remain_len;
507	remain_len = pstr->len - rawbuf_idx;
508	prev_st = pstr->cur_state;
509	mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
510	remain_len, &pstr->cur_state);
511	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
512	{
513	/* We treat these cases as a singlebyte character. */
514	mbclen = 1;
515	pstr->cur_state = prev_st;
516	}
517	/* Then proceed the next character. */
518	rawbuf_idx += mbclen;
519	}
520	*last_wc = (wint_t) wc;
521	return rawbuf_idx;
522	}
523	#endif /* RE_ENABLE_I18N */
524
525	/* Build the buffer PSTR->MBS, and apply the translation if we need.
526	This function is used in case of REG_ICASE. */
527
528	static void
529	build_upper_buffer (pstr)
530	re_string_t *pstr;
531	{
532	int char_idx, end_idx;
533	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
534
535	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
536	{
537	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
538	if (BE (pstr->trans != NULL, 0))
539	ch = pstr->trans[ch];
540	if (islower (ch))
541	pstr->mbs[char_idx] = toupper (ch);
542	else
543	pstr->mbs[char_idx] = ch;
544	}
545	pstr->valid_len = char_idx;
546	pstr->valid_raw_len = char_idx;
547	}
548
549	/* Apply TRANS to the buffer in PSTR. */
550
551	static void
552	re_string_translate_buffer (pstr)
553	re_string_t *pstr;
554	{
555	int buf_idx, end_idx;
556	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
557
558	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
559	{
560	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
561	pstr->mbs[buf_idx] = pstr->trans[ch];
562	}
563
564	pstr->valid_len = buf_idx;
565	pstr->valid_raw_len = buf_idx;
566	}
567
568	/* This function re-construct the buffers.
569	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
570	convert to upper case in case of REG_ICASE, apply translation. */
571
572	static reg_errcode_t
573	re_string_reconstruct (pstr, idx, eflags)
574	re_string_t *pstr;
575	int idx, eflags;
576	{
577	int offset = idx - pstr->raw_mbs_idx;
578	if (BE (offset < 0, 0))
579	{
580	/* Reset buffer. */
581	#ifdef RE_ENABLE_I18N
582	if (pstr->mb_cur_max > 1)
583	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584	#endif /* RE_ENABLE_I18N */
585	pstr->len = pstr->raw_len;
586	pstr->stop = pstr->raw_stop;
587	pstr->valid_len = 0;
588	pstr->raw_mbs_idx = 0;
589	pstr->valid_raw_len = 0;
590	pstr->offsets_needed = 0;
591	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
593	if (!pstr->mbs_allocated)
594	pstr->mbs = (unsigned char *) pstr->raw_mbs;
595	offset = idx;
596	}
597
598	if (BE (offset != 0, 1))
599	{
600	/* Are the characters which are already checked remain? */
601	if (BE (offset < pstr->valid_raw_len, 1)
602	#ifdef RE_ENABLE_I18N
603	/* Handling this would enlarge the code too much.
604	Accept a slowdown in that case. */
605	&& pstr->offsets_needed == 0
606	#endif
607	)
608	{
609	/* Yes, move them to the front of the buffer. */
610	pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
611	#ifdef RE_ENABLE_I18N
612	if (pstr->mb_cur_max > 1)
613	memmove (pstr->wcs, pstr->wcs + offset,
614	(pstr->valid_len - offset) * sizeof (wint_t));
615	#endif /* RE_ENABLE_I18N */
616	if (BE (pstr->mbs_allocated, 0))
617	memmove (pstr->mbs, pstr->mbs + offset,
618	pstr->valid_len - offset);
619	pstr->valid_len -= offset;
620	pstr->valid_raw_len -= offset;
621	#if DEBUG
622	assert (pstr->valid_len > 0);
623	#endif
624	}
625	else
626	{
627	/* No, skip all characters until IDX. */
628	#ifdef RE_ENABLE_I18N
629	if (BE (pstr->offsets_needed, 0))
630	{
631	pstr->len = pstr->raw_len - idx + offset;
632	pstr->stop = pstr->raw_stop - idx + offset;
633	pstr->offsets_needed = 0;
634	}
635	#endif
636	pstr->valid_len = 0;
637	pstr->valid_raw_len = 0;
638	#ifdef RE_ENABLE_I18N
639	if (pstr->mb_cur_max > 1)
640	{
641	int wcs_idx;
642	wint_t wc = WEOF;
643
644	if (pstr->is_utf8)
645	{
646	const unsigned char raw, p, q, end;
647
648	/* Special case UTF-8. Multi-byte chars start with any
649	byte other than 0x80 - 0xbf. */
650	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
651	end = raw + (offset - pstr->mb_cur_max);
652	for (p = raw + offset - 1; p >= end; --p)
653	if ((*p & 0xc0) != 0x80)
654	{
655	mbstate_t cur_state;
656	wchar_t wc2;
657	int mlen = raw + pstr->len - p;
658	unsigned char buf[6];
659
660	q = p;
661	if (BE (pstr->trans != NULL, 0))
662	{
663	int i = mlen < 6 ? mlen : 6;
664	while (--i >= 0)
665	buf[i] = pstr->trans[p[i]];
666	q = buf;
667	}
668	/* XXX Don't use mbrtowc, we know which conversion
669	to use (UTF-8 -> UCS4). */
670	memset (&cur_state, 0, sizeof (cur_state));
671	mlen = mbrtowc (&wc2, p, mlen, &cur_state)
672	- (raw + offset - p);
673	if (mlen >= 0)
674	{
675	memset (&pstr->cur_state, '\0',
676	sizeof (mbstate_t));
677	pstr->valid_len = mlen;
678	wc = wc2;
679	}
680	break;
681	}
682	}
683
684	if (wc == WEOF)
685	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
686	if (BE (pstr->valid_len, 0))
687	{
688	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
689	pstr->wcs[wcs_idx] = WEOF;
690	if (pstr->mbs_allocated)
691	memset (pstr->mbs, 255, pstr->valid_len);
692	}
693	pstr->valid_raw_len = pstr->valid_len;
694	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
695	&& IS_WIDE_WORD_CHAR (wc))
696	? CONTEXT_WORD
697	: ((IS_WIDE_NEWLINE (wc)
698	&& pstr->newline_anchor)
699	? CONTEXT_NEWLINE : 0));
700	}
701	else
702	#endif /* RE_ENABLE_I18N */
703	{
704	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
705	if (pstr->trans)
706	c = pstr->trans[c];
707	pstr->tip_context = (bitset_contain (pstr->word_char, c)
708	? CONTEXT_WORD
709	: ((IS_NEWLINE (c) && pstr->newline_anchor)
710	? CONTEXT_NEWLINE : 0));
711	}
712	}
713	if (!BE (pstr->mbs_allocated, 0))
714	pstr->mbs += offset;
715	}
716	pstr->raw_mbs_idx = idx;
717	pstr->len -= offset;
718	pstr->stop -= offset;
719
720	/* Then build the buffers. */
721	#ifdef RE_ENABLE_I18N
722	if (pstr->mb_cur_max > 1)
723	{
724	if (pstr->icase)
725	{
726	int ret = build_wcs_upper_buffer (pstr);
727	if (BE (ret != REG_NOERROR, 0))
728	return ret;
729	}
730	else
731	build_wcs_buffer (pstr);
732	}
733	else
734	#endif /* RE_ENABLE_I18N */
735	if (BE (pstr->mbs_allocated, 0))
736	{
737	if (pstr->icase)
738	build_upper_buffer (pstr);
739	else if (pstr->trans != NULL)
740	re_string_translate_buffer (pstr);
741	}
742	else
743	pstr->valid_len = pstr->len;
744
745	pstr->cur_idx = 0;
746	return REG_NOERROR;
747	}
748
749	static unsigned char
750	re_string_peek_byte_case (pstr, idx)
751	const re_string_t *pstr;
752	int idx;
753	{
754	int ch, off;
755
756	/* Handle the common (easiest) cases first. */
757	if (BE (!pstr->mbs_allocated, 1))
758	return re_string_peek_byte (pstr, idx);
759
760	#ifdef RE_ENABLE_I18N
761	if (pstr->mb_cur_max > 1
762	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
763	return re_string_peek_byte (pstr, idx);
764	#endif
765
766	off = pstr->cur_idx + idx;
767	#ifdef RE_ENABLE_I18N
768	if (pstr->offsets_needed)
769	off = pstr->offsets[off];
770	#endif
771
772	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
773
774	#ifdef RE_ENABLE_I18N
775	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
776	this function returns CAPITAL LETTER I instead of first byte of
777	DOTLESS SMALL LETTER I. The latter would confuse the parser,
778	since peek_byte_case doesn't advance cur_idx in any way. */
779	if (pstr->offsets_needed && !isascii (ch))
780	return re_string_peek_byte (pstr, idx);
781	#endif
782
783	return ch;
784	}
785
786	static unsigned char
787	re_string_fetch_byte_case (pstr)
788	re_string_t *pstr;
789	{
790	if (BE (!pstr->mbs_allocated, 1))
791	return re_string_fetch_byte (pstr);
792
793	#ifdef RE_ENABLE_I18N
794	if (pstr->offsets_needed)
795	{
796	int off, ch;
797
798	/* For tr_TR.UTF-8 [[:islower:]] there is
799	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
800	in that case the whole multi-byte character and return
801	the original letter. On the other side, with
802	[[: DOTLESS SMALL LETTER I return [[:I, as doing
803	anything else would complicate things too much. */
804
805	if (!re_string_first_byte (pstr, pstr->cur_idx))
806	return re_string_fetch_byte (pstr);
807
808	off = pstr->offsets[pstr->cur_idx];
809	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
810
811	if (! isascii (ch))
812	return re_string_fetch_byte (pstr);
813
814	re_string_skip_bytes (pstr,
815	re_string_char_size_at (pstr, pstr->cur_idx));
816	return ch;
817	}
818	#endif
819
820	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
821	}
822
823	static void
824	re_string_destruct (pstr)
825	re_string_t *pstr;
826	{
827	#ifdef RE_ENABLE_I18N
828	re_free (pstr->wcs);
829	re_free (pstr->offsets);
830	#endif /* RE_ENABLE_I18N */
831	if (pstr->mbs_allocated)
832	re_free (pstr->mbs);
833	}
834
835	/* Return the context at IDX in INPUT. */
836
837	static unsigned int
838	re_string_context_at (input, idx, eflags)
839	const re_string_t *input;
840	int idx, eflags;
841	{
842	int c;
843	if (BE (idx < 0, 0))
844	/* In this case, we use the value stored in input->tip_context,
845	since we can't know the character in input->mbs[-1] here. */
846	return input->tip_context;
847	if (BE (idx == input->len, 0))
848	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
849	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
850	#ifdef RE_ENABLE_I18N
851	if (input->mb_cur_max > 1)
852	{
853	wint_t wc;
854	int wc_idx = idx;
855	while(input->wcs[wc_idx] == WEOF)
856	{
857	#ifdef DEBUG
858	/* It must not happen. */
859	assert (wc_idx >= 0);
860	#endif
861	--wc_idx;
862	if (wc_idx < 0)
863	return input->tip_context;
864	}
865	wc = input->wcs[wc_idx];
866	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
867	return CONTEXT_WORD;
868	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
869	? CONTEXT_NEWLINE : 0);
870	}
871	else
872	#endif
873	{
874	c = re_string_byte_at (input, idx);
875	if (bitset_contain (input->word_char, c))
876	return CONTEXT_WORD;
877	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
878	}
879	}
880
881
882	/* Functions for set operation. */
883
884	static reg_errcode_t
885	re_node_set_alloc (set, size)
886	re_node_set *set;
887	int size;
888	{
889	/*
890	* ADR: valgrind says size can be 0, which then doesn't
891	* free the block of size 0. Harumph. This seems
892	* to work ok, though.
893	*/
894	if (size == 0)
895	{
896	memset(set, 0, sizeof(*set));
897	return REG_NOERROR;
898	}
899	set->alloc = size;
900	set->nelem = 0;
901	set->elems = re_malloc (int, size);
902	if (BE (set->elems == NULL, 0))
903	return REG_ESPACE;
904	return REG_NOERROR;
905	}
906
907	static reg_errcode_t
908	re_node_set_init_1 (set, elem)
909	re_node_set *set;
910	int elem;
911	{
912	set->alloc = 1;
913	set->nelem = 1;
914	set->elems = re_malloc (int, 1);
915	if (BE (set->elems == NULL, 0))
916	{
917	set->alloc = set->nelem = 0;
918	return REG_ESPACE;
919	}
920	set->elems[0] = elem;
921	return REG_NOERROR;
922	}
923
924	static reg_errcode_t
925	re_node_set_init_2 (set, elem1, elem2)
926	re_node_set *set;
927	int elem1, elem2;
928	{
929	set->alloc = 2;
930	set->elems = re_malloc (int, 2);
931	if (BE (set->elems == NULL, 0))
932	return REG_ESPACE;
933	if (elem1 == elem2)
934	{
935	set->nelem = 1;
936	set->elems[0] = elem1;
937	}
938	else
939	{
940	set->nelem = 2;
941	if (elem1 < elem2)
942	{
943	set->elems[0] = elem1;
944	set->elems[1] = elem2;
945	}
946	else
947	{
948	set->elems[0] = elem2;
949	set->elems[1] = elem1;
950	}
951	}
952	return REG_NOERROR;
953	}
954
955	static reg_errcode_t
956	re_node_set_init_copy (dest, src)
957	re_node_set *dest;
958	const re_node_set *src;
959	{
960	dest->nelem = src->nelem;
961	if (src->nelem > 0)
962	{
963	dest->alloc = dest->nelem;
964	dest->elems = re_malloc (int, dest->alloc);
965	if (BE (dest->elems == NULL, 0))
966	{
967	dest->alloc = dest->nelem = 0;
968	return REG_ESPACE;
969	}
970	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
971	}
972	else
973	re_node_set_init_empty (dest);
974	return REG_NOERROR;
975	}
976
977	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
978	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
979	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
980
981	static reg_errcode_t
982	re_node_set_add_intersect (dest, src1, src2)
983	re_node_set *dest;
984	const re_node_set src1, src2;
985	{
986	int i1, i2, is, id, delta, sbase;
987	if (src1->nelem == 0 \|\| src2->nelem == 0)
988	return REG_NOERROR;
989
990	/* We need dest->nelem + 2 * elems_in_intersection; this is a
991	conservative estimate. */
992	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
993	{
994	int new_alloc = src1->nelem + src2->nelem + dest->alloc;
995	int *new_elems = re_realloc (dest->elems, int, new_alloc);
996	if (BE (new_elems == NULL, 0))
997	return REG_ESPACE;
998	dest->elems = new_elems;
999	dest->alloc = new_alloc;
1000	}
1001
1002	/* Find the items in the intersection of SRC1 and SRC2, and copy
1003	into the top of DEST those that are not already in DEST itself. */
1004	sbase = dest->nelem + src1->nelem + src2->nelem;
1005	i1 = src1->nelem - 1;
1006	i2 = src2->nelem - 1;
1007	id = dest->nelem - 1;
1008	for (;;)
1009	{
1010	if (src1->elems[i1] == src2->elems[i2])
1011	{
1012	/* Try to find the item in DEST. Maybe we could binary search? */
1013	while (id >= 0 && dest->elems[id] > src1->elems[i1])
1014	--id;
1015
1016	if (id < 0 \|\| dest->elems[id] != src1->elems[i1])
1017	dest->elems[--sbase] = src1->elems[i1];
1018
1019	if (--i1 < 0 \|\| --i2 < 0)
1020	break;
1021	}
1022
1023	/* Lower the highest of the two items. */
1024	else if (src1->elems[i1] < src2->elems[i2])
1025	{
1026	if (--i2 < 0)
1027	break;
1028	}
1029	else
1030	{
1031	if (--i1 < 0)
1032	break;
1033	}
1034	}
1035
1036	id = dest->nelem - 1;
1037	is = dest->nelem + src1->nelem + src2->nelem - 1;
1038	delta = is - sbase + 1;
1039
1040	/* Now copy. When DELTA becomes zero, the remaining
1041	DEST elements are already in place; this is more or
1042	less the same loop that is in re_node_set_merge. */
1043	dest->nelem += delta;
1044	if (delta > 0 && id >= 0)
1045	for (;;)
1046	{
1047	if (dest->elems[is] > dest->elems[id])
1048	{
1049	/* Copy from the top. */
1050	dest->elems[id + delta--] = dest->elems[is--];
1051	if (delta == 0)
1052	break;
1053	}
1054	else
1055	{
1056	/* Slide from the bottom. */
1057	dest->elems[id + delta] = dest->elems[id];
1058	if (--id < 0)
1059	break;
1060	}
1061	}
1062
1063	/* Copy remaining SRC elements. */
1064	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1065
1066	return REG_NOERROR;
1067	}
1068
1069	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1070	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1071
1072	static reg_errcode_t
1073	re_node_set_init_union (dest, src1, src2)
1074	re_node_set *dest;
1075	const re_node_set src1, src2;
1076	{
1077	int i1, i2, id;
1078	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1079	{
1080	dest->alloc = src1->nelem + src2->nelem;
1081	dest->elems = re_malloc (int, dest->alloc);
1082	if (BE (dest->elems == NULL, 0))
1083	return REG_ESPACE;
1084	}
1085	else
1086	{
1087	if (src1 != NULL && src1->nelem > 0)
1088	return re_node_set_init_copy (dest, src1);
1089	else if (src2 != NULL && src2->nelem > 0)
1090	return re_node_set_init_copy (dest, src2);
1091	else
1092	re_node_set_init_empty (dest);
1093	return REG_NOERROR;
1094	}
1095	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1096	{
1097	if (src1->elems[i1] > src2->elems[i2])
1098	{
1099	dest->elems[id++] = src2->elems[i2++];
1100	continue;
1101	}
1102	if (src1->elems[i1] == src2->elems[i2])
1103	++i2;
1104	dest->elems[id++] = src1->elems[i1++];
1105	}
1106	if (i1 < src1->nelem)
1107	{
1108	memcpy (dest->elems + id, src1->elems + i1,
1109	(src1->nelem - i1) * sizeof (int));
1110	id += src1->nelem - i1;
1111	}
1112	else if (i2 < src2->nelem)
1113	{
1114	memcpy (dest->elems + id, src2->elems + i2,
1115	(src2->nelem - i2) * sizeof (int));
1116	id += src2->nelem - i2;
1117	}
1118	dest->nelem = id;
1119	return REG_NOERROR;
1120	}
1121
1122	/* Calculate the union set of the sets DEST and SRC. And store it to
1123	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1124
1125	static reg_errcode_t
1126	re_node_set_merge (dest, src)
1127	re_node_set *dest;
1128	const re_node_set *src;
1129	{
1130	int is, id, sbase, delta;
1131	if (src == NULL \|\| src->nelem == 0)
1132	return REG_NOERROR;
1133	if (dest->alloc < 2 * src->nelem + dest->nelem)
1134	{
1135	int new_alloc = 2 * (src->nelem + dest->alloc);
1136	int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1137	if (BE (new_buffer == NULL, 0))
1138	return REG_ESPACE;
1139	dest->elems = new_buffer;
1140	dest->alloc = new_alloc;
1141	}
1142
1143	if (BE (dest->nelem == 0, 0))
1144	{
1145	dest->nelem = src->nelem;
1146	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1147	return REG_NOERROR;
1148	}
1149
1150	/* Copy into the top of DEST the items of SRC that are not
1151	found in DEST. Maybe we could binary search in DEST? */
1152	for (sbase = dest->nelem + 2 * src->nelem,
1153	is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1154	{
1155	if (dest->elems[id] == src->elems[is])
1156	is--, id--;
1157	else if (dest->elems[id] < src->elems[is])
1158	dest->elems[--sbase] = src->elems[is--];
1159	else /* if (dest->elems[id] > src->elems[is]) */
1160	--id;
1161	}
1162
1163	if (is >= 0)
1164	{
1165	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1166	sbase -= is + 1;
1167	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1168	}
1169
1170	id = dest->nelem - 1;
1171	is = dest->nelem + 2 * src->nelem - 1;
1172	delta = is - sbase + 1;
1173	if (delta == 0)
1174	return REG_NOERROR;
1175
1176	/* Now copy. When DELTA becomes zero, the remaining
1177	DEST elements are already in place. */
1178	dest->nelem += delta;
1179	for (;;)
1180	{
1181	if (dest->elems[is] > dest->elems[id])
1182	{
1183	/* Copy from the top. */
1184	dest->elems[id + delta--] = dest->elems[is--];
1185	if (delta == 0)
1186	break;
1187	}
1188	else
1189	{
1190	/* Slide from the bottom. */
1191	dest->elems[id + delta] = dest->elems[id];
1192	if (--id < 0)
1193	{
1194	/* Copy remaining SRC elements. */
1195	memcpy (dest->elems, dest->elems + sbase,
1196	delta * sizeof (int));
1197	break;
1198	}
1199	}
1200	}
1201
1202	return REG_NOERROR;
1203	}
1204
1205	/* Insert the new element ELEM to the re_node_set* SET.
1206	SET should not already have ELEM.
1207	return -1 if an error is occured, return 1 otherwise. */
1208
1209	static int
1210	re_node_set_insert (set, elem)
1211	re_node_set *set;
1212	int elem;
1213	{
1214	int idx;
1215	/* In case the set is empty. */
1216	if (set->alloc == 0)
1217	{
1218	if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1219	return 1;
1220	else
1221	return -1;
1222	}
1223
1224	if (BE (set->nelem, 0) == 0)
1225	{
1226	/* We already guaranteed above that set->alloc != 0. */
1227	set->elems[0] = elem;
1228	++set->nelem;
1229	return 1;
1230	}
1231
1232	/* Realloc if we need. */
1233	if (set->alloc == set->nelem)
1234	{
1235	int *new_array;
1236	set->alloc = set->alloc * 2;
1237	new_array = re_realloc (set->elems, int, set->alloc);
1238	if (BE (new_array == NULL, 0))
1239	return -1;
1240	set->elems = new_array;
1241	}
1242
1243	/* Move the elements which follows the new element. Test the
1244	first element separately to skip a check in the inner loop. */
1245	if (elem < set->elems[0])
1246	{
1247	idx = 0;
1248	for (idx = set->nelem; idx > 0; idx--)
1249	set->elems[idx] = set->elems[idx - 1];
1250	}
1251	else
1252	{
1253	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1254	set->elems[idx] = set->elems[idx - 1];
1255	}
1256
1257	/* Insert the new element. */
1258	set->elems[idx] = elem;
1259	++set->nelem;
1260	return 1;
1261	}
1262
1263	/* Insert the new element ELEM to the re_node_set* SET.
1264	SET should not already have any element greater than or equal to ELEM.
1265	Return -1 if an error is occured, return 1 otherwise. */
1266
1267	static int
1268	re_node_set_insert_last (set, elem)
1269	re_node_set *set;
1270	int elem;
1271	{
1272	/* Realloc if we need. */
1273	if (set->alloc == set->nelem)
1274	{
1275	int *new_array;
1276	set->alloc = (set->alloc + 1) * 2;
1277	new_array = re_realloc (set->elems, int, set->alloc);
1278	if (BE (new_array == NULL, 0))
1279	return -1;
1280	set->elems = new_array;
1281	}
1282
1283	/* Insert the new element. */
1284	set->elems[set->nelem++] = elem;
1285	return 1;
1286	}
1287
1288	/* Compare two node sets SET1 and SET2.
1289	return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1290
1291	static int
1292	re_node_set_compare (set1, set2)
1293	const re_node_set set1, set2;
1294	{
1295	int i;
1296	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1297	return 0;
1298	for (i = set1->nelem ; --i >= 0 ; )
1299	if (set1->elems[i] != set2->elems[i])
1300	return 0;
1301	return 1;
1302	}
1303
1304	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1305
1306	static int
1307	re_node_set_contains (set, elem)
1308	const re_node_set *set;
1309	int elem;
1310	{
1311	unsigned int idx, right, mid;
1312	if (set->nelem <= 0)
1313	return 0;
1314
1315	/* Binary search the element. */
1316	idx = 0;
1317	right = set->nelem - 1;
1318	while (idx < right)
1319	{
1320	mid = (idx + right) / 2;
1321	if (set->elems[mid] < elem)
1322	idx = mid + 1;
1323	else
1324	right = mid;
1325	}
1326	return set->elems[idx] == elem ? idx + 1 : 0;
1327	}
1328
1329	static void
1330	re_node_set_remove_at (set, idx)
1331	re_node_set *set;
1332	int idx;
1333	{
1334	if (idx < 0 \|\| idx >= set->nelem)
1335	return;
1336	--set->nelem;
1337	for (; idx < set->nelem; idx++)
1338	set->elems[idx] = set->elems[idx + 1];
1339	}
1340
1341
1342
1343	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1344	Or return -1, if an error will be occured. */
1345
1346	static int
1347	re_dfa_add_node (dfa, token)
1348	re_dfa_t *dfa;
1349	re_token_t token;
1350	{
1351	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1352	{
1353	int new_nodes_alloc = dfa->nodes_alloc * 2;
1354	int new_nexts, new_indices;
1355	re_node_set new_edests, new_eclosures;
1356
1357	re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1358	new_nodes_alloc);
1359	if (BE (new_array == NULL, 0))
1360	return -1;
1361	dfa->nodes = new_array;
1362	new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1363	new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1364	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1365	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1366	if (BE (new_nexts == NULL \|\| new_indices == NULL
1367	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
1368	return -1;
1369	dfa->nexts = new_nexts;
1370	dfa->org_indices = new_indices;
1371	dfa->edests = new_edests;
1372	dfa->eclosures = new_eclosures;
1373	dfa->nodes_alloc = new_nodes_alloc;
1374	}
1375	dfa->nodes[dfa->nodes_len] = token;
1376	dfa->nodes[dfa->nodes_len].constraint = 0;
1377	#ifdef RE_ENABLE_I18N
1378	dfa->nodes[dfa->nodes_len].accept_mb =
1379	(token.type == OP_PERIOD && dfa->mb_cur_max > 1) \|\| token.type == COMPLEX_BRACKET;
1380	#endif
1381	dfa->nexts[dfa->nodes_len] = -1;
1382	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1383	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1384	return dfa->nodes_len++;
1385	}
1386
1387	static unsigned int inline
1388	calc_state_hash (nodes, context)
1389	const re_node_set *nodes;
1390	unsigned int context;
1391	{
1392	unsigned int hash = nodes->nelem + context;
1393	int i;
1394	for (i = 0 ; i < nodes->nelem ; i++)
1395	hash += nodes->elems[i];
1396	return hash;
1397	}
1398
1399	/* Search for the state whose node_set is equivalent to NODES.
1400	Return the pointer to the state, if we found it in the DFA.
1401	Otherwise create the new one and return it. In case of an error
1402	return NULL and set the error code in ERR.
1403	Note: - We assume NULL as the invalid state, then it is possible that
1404	return value is NULL and ERR is REG_NOERROR.
1405	- We never return non-NULL value in case of any errors, it is for
1406	optimization. */
1407
1408	static re_dfastate_t*
1409	re_acquire_state (err, dfa, nodes)
1410	reg_errcode_t *err;
1411	re_dfa_t *dfa;
1412	const re_node_set *nodes;
1413	{
1414	unsigned int hash;
1415	re_dfastate_t *new_state;
1416	struct re_state_table_entry *spot;
1417	int i;
1418	if (BE (nodes->nelem == 0, 0))
1419	{
1420	*err = REG_NOERROR;
1421	return NULL;
1422	}
1423	hash = calc_state_hash (nodes, 0);
1424	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1425
1426	for (i = 0 ; i < spot->num ; i++)
1427	{
1428	re_dfastate_t *state = spot->array[i];
1429	if (hash != state->hash)
1430	continue;
1431	if (re_node_set_compare (&state->nodes, nodes))
1432	return state;
1433	}
1434
1435	/* There are no appropriate state in the dfa, create the new one. */
1436	new_state = create_ci_newstate (dfa, nodes, hash);
1437	if (BE (new_state != NULL, 1))
1438	return new_state;
1439	else
1440	{
1441	*err = REG_ESPACE;
1442	return NULL;
1443	}
1444	}
1445
1446	/* Search for the state whose node_set is equivalent to NODES and
1447	whose context is equivalent to CONTEXT.
1448	Return the pointer to the state, if we found it in the DFA.
1449	Otherwise create the new one and return it. In case of an error
1450	return NULL and set the error code in ERR.
1451	Note: - We assume NULL as the invalid state, then it is possible that
1452	return value is NULL and ERR is REG_NOERROR.
1453	- We never return non-NULL value in case of any errors, it is for
1454	optimization. */
1455
1456	static re_dfastate_t*
1457	re_acquire_state_context (err, dfa, nodes, context)
1458	reg_errcode_t *err;
1459	re_dfa_t *dfa;
1460	const re_node_set *nodes;
1461	unsigned int context;
1462	{
1463	unsigned int hash;
1464	re_dfastate_t *new_state;
1465	struct re_state_table_entry *spot;
1466	int i;
1467	if (nodes->nelem == 0)
1468	{
1469	*err = REG_NOERROR;
1470	return NULL;
1471	}
1472	hash = calc_state_hash (nodes, context);
1473	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1474
1475	for (i = 0 ; i < spot->num ; i++)
1476	{
1477	re_dfastate_t *state = spot->array[i];
1478	if (state->hash == hash
1479	&& state->context == context
1480	&& re_node_set_compare (state->entrance_nodes, nodes))
1481	return state;
1482	}
1483	/* There are no appropriate state in `dfa', create the new one. */
1484	new_state = create_cd_newstate (dfa, nodes, context, hash);
1485	if (BE (new_state != NULL, 1))
1486	return new_state;
1487	else
1488	{
1489	*err = REG_ESPACE;
1490	return NULL;
1491	}
1492	}
1493
1494	/* Finish initialization of the new state NEWSTATE, and using its hash value
1495	HASH put in the appropriate bucket of DFA's state table. Return value
1496	indicates the error code if failed. */
1497
1498	static reg_errcode_t
1499	register_state (dfa, newstate, hash)
1500	re_dfa_t *dfa;
1501	re_dfastate_t *newstate;
1502	unsigned int hash;
1503	{
1504	struct re_state_table_entry *spot;
1505	reg_errcode_t err;
1506	int i;
1507
1508	newstate->hash = hash;
1509	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1510	if (BE (err != REG_NOERROR, 0))
1511	return REG_ESPACE;
1512	for (i = 0; i < newstate->nodes.nelem; i++)
1513	{
1514	int elem = newstate->nodes.elems[i];
1515	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1516	re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1517	}
1518
1519	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1520	if (BE (spot->alloc <= spot->num, 0))
1521	{
1522	int new_alloc = 2 * spot->num + 2;
1523	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1524	new_alloc);
1525	if (BE (new_array == NULL, 0))
1526	return REG_ESPACE;
1527	spot->array = new_array;
1528	spot->alloc = new_alloc;
1529	}
1530	spot->array[spot->num++] = newstate;
1531	return REG_NOERROR;
1532	}
1533
1534	/* Create the new state which is independ of contexts.
1535	Return the new state if succeeded, otherwise return NULL. */
1536
1537	static re_dfastate_t *
1538	create_ci_newstate (dfa, nodes, hash)
1539	re_dfa_t *dfa;
1540	const re_node_set *nodes;
1541	unsigned int hash;
1542	{
1543	int i;
1544	reg_errcode_t err;
1545	re_dfastate_t *newstate;
1546
1547	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1548	if (BE (newstate == NULL, 0))
1549	return NULL;
1550	err = re_node_set_init_copy (&newstate->nodes, nodes);
1551	if (BE (err != REG_NOERROR, 0))
1552	{
1553	re_free (newstate);
1554	return NULL;
1555	}
1556
1557	newstate->entrance_nodes = &newstate->nodes;
1558	for (i = 0 ; i < nodes->nelem ; i++)
1559	{
1560	re_token_t *node = dfa->nodes + nodes->elems[i];
1561	re_token_type_t type = node->type;
1562	if (type == CHARACTER && !node->constraint)
1563	continue;
1564	#ifdef RE_ENABLE_I18N
1565	newstate->accept_mb \|= node->accept_mb;
1566	#endif /* RE_ENABLE_I18N */
1567
1568	/* If the state has the halt node, the state is a halt state. */
1569	if (type == END_OF_RE)
1570	newstate->halt = 1;
1571	else if (type == OP_BACK_REF)
1572	newstate->has_backref = 1;
1573	else if (type == ANCHOR \|\| node->constraint)
1574	newstate->has_constraint = 1;
1575	}
1576	err = register_state (dfa, newstate, hash);
1577	if (BE (err != REG_NOERROR, 0))
1578	{
1579	free_state (newstate);
1580	newstate = NULL;
1581	}
1582	return newstate;
1583	}
1584
1585	/* Create the new state which is depend on the context CONTEXT.
1586	Return the new state if succeeded, otherwise return NULL. */
1587
1588	static re_dfastate_t *
1589	create_cd_newstate (dfa, nodes, context, hash)
1590	re_dfa_t *dfa;
1591	const re_node_set *nodes;
1592	unsigned int context, hash;
1593	{
1594	int i, nctx_nodes = 0;
1595	reg_errcode_t err;
1596	re_dfastate_t *newstate;
1597
1598	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1599	if (BE (newstate == NULL, 0))
1600	return NULL;
1601	err = re_node_set_init_copy (&newstate->nodes, nodes);
1602	if (BE (err != REG_NOERROR, 0))
1603	{
1604	re_free (newstate);
1605	return NULL;
1606	}
1607
1608	newstate->context = context;
1609	newstate->entrance_nodes = &newstate->nodes;
1610
1611	for (i = 0 ; i < nodes->nelem ; i++)
1612	{
1613	unsigned int constraint = 0;
1614	re_token_t *node = dfa->nodes + nodes->elems[i];
1615	re_token_type_t type = node->type;
1616	if (node->constraint)
1617	constraint = node->constraint;
1618
1619	if (type == CHARACTER && !constraint)
1620	continue;
1621	#ifdef RE_ENABLE_I18N
1622	newstate->accept_mb \|= node->accept_mb;
1623	#endif /* RE_ENABLE_I18N */
1624
1625	/* If the state has the halt node, the state is a halt state. */
1626	if (type == END_OF_RE)
1627	newstate->halt = 1;
1628	else if (type == OP_BACK_REF)
1629	newstate->has_backref = 1;
1630	else if (type == ANCHOR)
1631	constraint = node->opr.ctx_type;
1632
1633	if (constraint)
1634	{
1635	if (newstate->entrance_nodes == &newstate->nodes)
1636	{
1637	newstate->entrance_nodes = re_malloc (re_node_set, 1);
1638	if (BE (newstate->entrance_nodes == NULL, 0))
1639	{
1640	free_state (newstate);
1641	return NULL;
1642	}
1643	re_node_set_init_copy (newstate->entrance_nodes, nodes);
1644	nctx_nodes = 0;
1645	newstate->has_constraint = 1;
1646	}
1647
1648	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1649	{
1650	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1651	++nctx_nodes;
1652	}
1653	}
1654	}
1655	err = register_state (dfa, newstate, hash);
1656	if (BE (err != REG_NOERROR, 0))
1657	{
1658	free_state (newstate);
1659	newstate = NULL;
1660	}
1661	return newstate;
1662	}
1663
1664	static void
1665	free_state (state)
1666	re_dfastate_t *state;
1667	{
1668	re_node_set_free (&state->non_eps_nodes);
1669	re_node_set_free (&state->inveclosure);
1670	if (state->entrance_nodes != &state->nodes)
1671	{
1672	re_node_set_free (state->entrance_nodes);
1673	re_free (state->entrance_nodes);
1674	}
1675	re_node_set_free (&state->nodes);
1676	re_free (state->word_trtable);
1677	re_free (state->trtable);
1678	re_free (state);
1679	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/regex_internal.c

Download in other formats: