Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: vendor/glibc/current/posix/regex_internal.c

Visit:

Last change on this file was 2237, checked in by (none), 20 years ago
This commit was manufactured by cvs2svn to create branch 'GNU'.
Property cvs2svn:cvs-rev set to `1.1` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 44.0 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, write to the Free
18	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19	02111-1307 USA. */
20
21	static void re_string_construct_common (const char *str, int len,
22	re_string_t *pstr,
23	RE_TRANSLATE_TYPE trans, int icase,
24	const re_dfa_t *dfa) internal_function;
25	#ifdef RE_ENABLE_I18N
26	static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27	wint_t *last_wc) internal_function;
28	#endif /* RE_ENABLE_I18N */
29	static reg_errcode_t register_state (re_dfa_t dfa, re_dfastate_t newstate,
30	unsigned int hash) internal_function;
31	static re_dfastate_t create_ci_newstate (re_dfa_t dfa,
32	const re_node_set *nodes,
33	unsigned int hash) internal_function;
34	static re_dfastate_t create_cd_newstate (re_dfa_t dfa,
35	const re_node_set *nodes,
36	unsigned int context,
37	unsigned int hash) internal_function;
38	static unsigned int inline calc_state_hash (const re_node_set *nodes,
39	unsigned int context) internal_function;
40
41
42	/* Functions for string operation. */
43
44	/* This function allocate the buffers. It is necessary to call
45	re_string_reconstruct before using the object. */
46
47	static reg_errcode_t
48	re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
49	re_string_t *pstr;
50	const char *str;
51	int len, init_len, icase;
52	RE_TRANSLATE_TYPE trans;
53	const re_dfa_t *dfa;
54	{
55	reg_errcode_t ret;
56	int init_buf_len;
57
58	/* Ensure at least one character fits into the buffers. */
59	if (init_len < dfa->mb_cur_max)
60	init_len = dfa->mb_cur_max;
61	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
62	re_string_construct_common (str, len, pstr, trans, icase, dfa);
63
64	ret = re_string_realloc_buffers (pstr, init_buf_len);
65	if (BE (ret != REG_NOERROR, 0))
66	return ret;
67
68	pstr->word_char = dfa->word_char;
69	pstr->word_ops_used = dfa->word_ops_used;
70	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
71	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
72	pstr->valid_raw_len = pstr->valid_len;
73	return REG_NOERROR;
74	}
75
76	/* This function allocate the buffers, and initialize them. */
77
78	static reg_errcode_t
79	re_string_construct (pstr, str, len, trans, icase, dfa)
80	re_string_t *pstr;
81	const char *str;
82	int len, icase;
83	RE_TRANSLATE_TYPE trans;
84	const re_dfa_t *dfa;
85	{
86	reg_errcode_t ret;
87	memset (pstr, '\0', sizeof (re_string_t));
88	re_string_construct_common (str, len, pstr, trans, icase, dfa);
89
90	if (len > 0)
91	{
92	ret = re_string_realloc_buffers (pstr, len + 1);
93	if (BE (ret != REG_NOERROR, 0))
94	return ret;
95	}
96	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
97
98	if (icase)
99	{
100	#ifdef RE_ENABLE_I18N
101	if (dfa->mb_cur_max > 1)
102	{
103	while (1)
104	{
105	ret = build_wcs_upper_buffer (pstr);
106	if (BE (ret != REG_NOERROR, 0))
107	return ret;
108	if (pstr->valid_raw_len >= len)
109	break;
110	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
111	break;
112	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
113	if (BE (ret != REG_NOERROR, 0))
114	return ret;
115	}
116	}
117	else
118	#endif /* RE_ENABLE_I18N */
119	build_upper_buffer (pstr);
120	}
121	else
122	{
123	#ifdef RE_ENABLE_I18N
124	if (dfa->mb_cur_max > 1)
125	build_wcs_buffer (pstr);
126	else
127	#endif /* RE_ENABLE_I18N */
128	{
129	if (trans != NULL)
130	re_string_translate_buffer (pstr);
131	else
132	{
133	pstr->valid_len = pstr->bufs_len;
134	pstr->valid_raw_len = pstr->bufs_len;
135	}
136	}
137	}
138
139	return REG_NOERROR;
140	}
141
142	/* Helper functions for re_string_allocate, and re_string_construct. */
143
144	static reg_errcode_t
145	re_string_realloc_buffers (pstr, new_buf_len)
146	re_string_t *pstr;
147	int new_buf_len;
148	{
149	#ifdef RE_ENABLE_I18N
150	if (pstr->mb_cur_max > 1)
151	{
152	wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
153	if (BE (new_array == NULL, 0))
154	return REG_ESPACE;
155	pstr->wcs = new_array;
156	if (pstr->offsets != NULL)
157	{
158	int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
159	if (BE (new_array == NULL, 0))
160	return REG_ESPACE;
161	pstr->offsets = new_array;
162	}
163	}
164	#endif /* RE_ENABLE_I18N */
165	if (pstr->mbs_allocated)
166	{
167	unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
168	new_buf_len);
169	if (BE (new_array == NULL, 0))
170	return REG_ESPACE;
171	pstr->mbs = new_array;
172	}
173	pstr->bufs_len = new_buf_len;
174	return REG_NOERROR;
175	}
176
177
178	static void
179	re_string_construct_common (str, len, pstr, trans, icase, dfa)
180	const char *str;
181	int len;
182	re_string_t *pstr;
183	RE_TRANSLATE_TYPE trans;
184	int icase;
185	const re_dfa_t *dfa;
186	{
187	pstr->raw_mbs = (const unsigned char *) str;
188	pstr->len = len;
189	pstr->raw_len = len;
190	pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
191	pstr->icase = icase ? 1 : 0;
192	pstr->mbs_allocated = (trans != NULL \|\| icase);
193	pstr->mb_cur_max = dfa->mb_cur_max;
194	pstr->is_utf8 = dfa->is_utf8;
195	pstr->map_notascii = dfa->map_notascii;
196	pstr->stop = pstr->len;
197	pstr->raw_stop = pstr->stop;
198	}
199
200	#ifdef RE_ENABLE_I18N
201
202	/* Build wide character buffer PSTR->WCS.
203	If the byte sequence of the string are:
204	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
205	Then wide character buffer will be:
206	<wc1> , WEOF , <wc2> , WEOF , <wc3>
207	We use WEOF for padding, they indicate that the position isn't
208	a first byte of a multibyte character.
209
210	Note that this function assumes PSTR->VALID_LEN elements are already
211	built and starts from PSTR->VALID_LEN. */
212
213	static void
214	build_wcs_buffer (pstr)
215	re_string_t *pstr;
216	{
217	#ifdef _LIBC
218	unsigned char buf[MB_LEN_MAX];
219	assert (MB_LEN_MAX >= pstr->mb_cur_max);
220	#else
221	unsigned char buf[64];
222	#endif
223	mbstate_t prev_st;
224	int byte_idx, end_idx, remain_len;
225	size_t mbclen;
226
227	/* Build the buffers from pstr->valid_len to either pstr->len or
228	pstr->bufs_len. */
229	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
230	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
231	{
232	wchar_t wc;
233	const char *p;
234
235	remain_len = end_idx - byte_idx;
236	prev_st = pstr->cur_state;
237	/* Apply the translation if we need. */
238	if (BE (pstr->trans != NULL, 0))
239	{
240	int i, ch;
241
242	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
243	{
244	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
245	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
246	}
247	p = (const char *) buf;
248	}
249	else
250	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
251	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
252	if (BE (mbclen == (size_t) -2, 0))
253	{
254	/* The buffer doesn't have enough space, finish to build. */
255	pstr->cur_state = prev_st;
256	break;
257	}
258	else if (BE (mbclen == (size_t) -1 \|\| mbclen == 0, 0))
259	{
260	/* We treat these cases as a singlebyte character. */
261	mbclen = 1;
262	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
263	if (BE (pstr->trans != NULL, 0))
264	wc = pstr->trans[wc];
265	pstr->cur_state = prev_st;
266	}
267
268	/* Write wide character and padding. */
269	pstr->wcs[byte_idx++] = wc;
270	/* Write paddings. */
271	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272	pstr->wcs[byte_idx++] = WEOF;
273	}
274	pstr->valid_len = byte_idx;
275	pstr->valid_raw_len = byte_idx;
276	}
277
278	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279	but for REG_ICASE. */
280
281	static int
282	build_wcs_upper_buffer (pstr)
283	re_string_t *pstr;
284	{
285	mbstate_t prev_st;
286	int src_idx, byte_idx, end_idx, remain_len;
287	size_t mbclen;
288	#ifdef _LIBC
289	char buf[MB_LEN_MAX];
290	assert (MB_LEN_MAX >= pstr->mb_cur_max);
291	#else
292	char buf[64];
293	#endif
294
295	byte_idx = pstr->valid_len;
296	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
297
298	/* The following optimization assumes that ASCII characters can be
299	mapped to wide characters with a simple cast. */
300	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301	{
302	while (byte_idx < end_idx)
303	{
304	wchar_t wc;
305
306	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
307	&& mbsinit (&pstr->cur_state))
308	{
309	/* In case of a singlebyte character. */
310	pstr->mbs[byte_idx]
311	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
312	/* The next step uses the assumption that wchar_t is encoded
313	ASCII-safe: all ASCII values can be converted like this. */
314	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
315	++byte_idx;
316	continue;
317	}
318
319	remain_len = end_idx - byte_idx;
320	prev_st = pstr->cur_state;
321	mbclen = mbrtowc (&wc,
322	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323	+ byte_idx), remain_len, &pstr->cur_state);
324	if (BE (mbclen + 2 > 2, 1))
325	{
326	wchar_t wcu = wc;
327	if (iswlower (wc))
328	{
329	size_t mbcdlen;
330
331	wcu = towupper (wc);
332	mbcdlen = wcrtomb (buf, wcu, &prev_st);
333	if (BE (mbclen == mbcdlen, 1))
334	memcpy (pstr->mbs + byte_idx, buf, mbclen);
335	else
336	{
337	src_idx = byte_idx;
338	goto offsets_needed;
339	}
340	}
341	else
342	memcpy (pstr->mbs + byte_idx,
343	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
344	pstr->wcs[byte_idx++] = wcu;
345	/* Write paddings. */
346	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
347	pstr->wcs[byte_idx++] = WEOF;
348	}
349	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
350	{
351	/* It is an invalid character or '\0'. Just use the byte. */
352	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
353	pstr->mbs[byte_idx] = ch;
354	/* And also cast it to wide char. */
355	pstr->wcs[byte_idx++] = (wchar_t) ch;
356	if (BE (mbclen == (size_t) -1, 0))
357	pstr->cur_state = prev_st;
358	}
359	else
360	{
361	/* The buffer doesn't have enough space, finish to build. */
362	pstr->cur_state = prev_st;
363	break;
364	}
365	}
366	pstr->valid_len = byte_idx;
367	pstr->valid_raw_len = byte_idx;
368	return REG_NOERROR;
369	}
370	else
371	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
372	{
373	wchar_t wc;
374	const char *p;
375	offsets_needed:
376	remain_len = end_idx - byte_idx;
377	prev_st = pstr->cur_state;
378	if (BE (pstr->trans != NULL, 0))
379	{
380	int i, ch;
381
382	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
383	{
384	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
385	buf[i] = pstr->trans[ch];
386	}
387	p = (const char *) buf;
388	}
389	else
390	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
391	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
392	if (BE (mbclen + 2 > 2, 1))
393	{
394	wchar_t wcu = wc;
395	if (iswlower (wc))
396	{
397	size_t mbcdlen;
398
399	wcu = towupper (wc);
400	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
401	if (BE (mbclen == mbcdlen, 1))
402	memcpy (pstr->mbs + byte_idx, buf, mbclen);
403	else if (mbcdlen != (size_t) -1)
404	{
405	size_t i;
406
407	if (byte_idx + mbcdlen > pstr->bufs_len)
408	{
409	pstr->cur_state = prev_st;
410	break;
411	}
412
413	if (pstr->offsets == NULL)
414	{
415	pstr->offsets = re_malloc (int, pstr->bufs_len);
416
417	if (pstr->offsets == NULL)
418	return REG_ESPACE;
419	}
420	if (!pstr->offsets_needed)
421	{
422	for (i = 0; i < (size_t) byte_idx; ++i)
423	pstr->offsets[i] = i;
424	pstr->offsets_needed = 1;
425	}
426
427	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
428	pstr->wcs[byte_idx] = wcu;
429	pstr->offsets[byte_idx] = src_idx;
430	for (i = 1; i < mbcdlen; ++i)
431	{
432	pstr->offsets[byte_idx + i]
433	= src_idx + (i < mbclen ? i : mbclen - 1);
434	pstr->wcs[byte_idx + i] = WEOF;
435	}
436	pstr->len += mbcdlen - mbclen;
437	if (pstr->raw_stop > src_idx)
438	pstr->stop += mbcdlen - mbclen;
439	end_idx = (pstr->bufs_len > pstr->len)
440	? pstr->len : pstr->bufs_len;
441	byte_idx += mbcdlen;
442	src_idx += mbclen;
443	continue;
444	}
445	else
446	memcpy (pstr->mbs + byte_idx, p, mbclen);
447	}
448	else
449	memcpy (pstr->mbs + byte_idx, p, mbclen);
450
451	if (BE (pstr->offsets_needed != 0, 0))
452	{
453	size_t i;
454	for (i = 0; i < mbclen; ++i)
455	pstr->offsets[byte_idx + i] = src_idx + i;
456	}
457	src_idx += mbclen;
458
459	pstr->wcs[byte_idx++] = wcu;
460	/* Write paddings. */
461	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
462	pstr->wcs[byte_idx++] = WEOF;
463	}
464	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
465	{
466	/* It is an invalid character or '\0'. Just use the byte. */
467	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
468
469	if (BE (pstr->trans != NULL, 0))
470	ch = pstr->trans [ch];
471	pstr->mbs[byte_idx] = ch;
472
473	if (BE (pstr->offsets_needed != 0, 0))
474	pstr->offsets[byte_idx] = src_idx;
475	++src_idx;
476
477	/* And also cast it to wide char. */
478	pstr->wcs[byte_idx++] = (wchar_t) ch;
479	if (BE (mbclen == (size_t) -1, 0))
480	pstr->cur_state = prev_st;
481	}
482	else
483	{
484	/* The buffer doesn't have enough space, finish to build. */
485	pstr->cur_state = prev_st;
486	break;
487	}
488	}
489	pstr->valid_len = byte_idx;
490	pstr->valid_raw_len = src_idx;
491	return REG_NOERROR;
492	}
493
494	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
495	Return the index. */
496
497	static int
498	re_string_skip_chars (pstr, new_raw_idx, last_wc)
499	re_string_t *pstr;
500	int new_raw_idx;
501	wint_t *last_wc;
502	{
503	mbstate_t prev_st;
504	int rawbuf_idx;
505	size_t mbclen;
506	wchar_t wc = 0;
507
508	/* Skip the characters which are not necessary to check. */
509	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
510	rawbuf_idx < new_raw_idx;)
511	{
512	int remain_len;
513	remain_len = pstr->len - rawbuf_idx;
514	prev_st = pstr->cur_state;
515	mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
516	remain_len, &pstr->cur_state);
517	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
518	{
519	/* We treat these cases as a singlebyte character. */
520	mbclen = 1;
521	pstr->cur_state = prev_st;
522	}
523	/* Then proceed the next character. */
524	rawbuf_idx += mbclen;
525	}
526	*last_wc = (wint_t) wc;
527	return rawbuf_idx;
528	}
529	#endif /* RE_ENABLE_I18N */
530
531	/* Build the buffer PSTR->MBS, and apply the translation if we need.
532	This function is used in case of REG_ICASE. */
533
534	static void
535	build_upper_buffer (pstr)
536	re_string_t *pstr;
537	{
538	int char_idx, end_idx;
539	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
540
541	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
542	{
543	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
544	if (BE (pstr->trans != NULL, 0))
545	ch = pstr->trans[ch];
546	if (islower (ch))
547	pstr->mbs[char_idx] = toupper (ch);
548	else
549	pstr->mbs[char_idx] = ch;
550	}
551	pstr->valid_len = char_idx;
552	pstr->valid_raw_len = char_idx;
553	}
554
555	/* Apply TRANS to the buffer in PSTR. */
556
557	static void
558	re_string_translate_buffer (pstr)
559	re_string_t *pstr;
560	{
561	int buf_idx, end_idx;
562	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563
564	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565	{
566	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567	pstr->mbs[buf_idx] = pstr->trans[ch];
568	}
569
570	pstr->valid_len = buf_idx;
571	pstr->valid_raw_len = buf_idx;
572	}
573
574	/* This function re-construct the buffers.
575	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576	convert to upper case in case of REG_ICASE, apply translation. */
577
578	static reg_errcode_t
579	re_string_reconstruct (pstr, idx, eflags)
580	re_string_t *pstr;
581	int idx, eflags;
582	{
583	int offset = idx - pstr->raw_mbs_idx;
584	if (BE (offset < 0, 0))
585	{
586	/* Reset buffer. */
587	#ifdef RE_ENABLE_I18N
588	if (pstr->mb_cur_max > 1)
589	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
590	#endif /* RE_ENABLE_I18N */
591	pstr->len = pstr->raw_len;
592	pstr->stop = pstr->raw_stop;
593	pstr->valid_len = 0;
594	pstr->raw_mbs_idx = 0;
595	pstr->valid_raw_len = 0;
596	pstr->offsets_needed = 0;
597	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
598	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
599	if (!pstr->mbs_allocated)
600	pstr->mbs = (unsigned char *) pstr->raw_mbs;
601	offset = idx;
602	}
603
604	if (BE (offset != 0, 1))
605	{
606	/* Are the characters which are already checked remain? */
607	if (BE (offset < pstr->valid_raw_len, 1)
608	#ifdef RE_ENABLE_I18N
609	/* Handling this would enlarge the code too much.
610	Accept a slowdown in that case. */
611	&& pstr->offsets_needed == 0
612	#endif
613	)
614	{
615	/* Yes, move them to the front of the buffer. */
616	pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
617	#ifdef RE_ENABLE_I18N
618	if (pstr->mb_cur_max > 1)
619	memmove (pstr->wcs, pstr->wcs + offset,
620	(pstr->valid_len - offset) * sizeof (wint_t));
621	#endif /* RE_ENABLE_I18N */
622	if (BE (pstr->mbs_allocated, 0))
623	memmove (pstr->mbs, pstr->mbs + offset,
624	pstr->valid_len - offset);
625	pstr->valid_len -= offset;
626	pstr->valid_raw_len -= offset;
627	#if DEBUG
628	assert (pstr->valid_len > 0);
629	#endif
630	}
631	else
632	{
633	/* No, skip all characters until IDX. */
634	#ifdef RE_ENABLE_I18N
635	if (BE (pstr->offsets_needed, 0))
636	{
637	pstr->len = pstr->raw_len - idx + offset;
638	pstr->stop = pstr->raw_stop - idx + offset;
639	pstr->offsets_needed = 0;
640	}
641	#endif
642	pstr->valid_len = 0;
643	pstr->valid_raw_len = 0;
644	#ifdef RE_ENABLE_I18N
645	if (pstr->mb_cur_max > 1)
646	{
647	int wcs_idx;
648	wint_t wc = WEOF;
649
650	if (pstr->is_utf8)
651	{
652	const unsigned char raw, p, q, end;
653
654	/* Special case UTF-8. Multi-byte chars start with any
655	byte other than 0x80 - 0xbf. */
656	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
657	end = raw + (offset - pstr->mb_cur_max);
658	for (p = raw + offset - 1; p >= end; --p)
659	if ((*p & 0xc0) != 0x80)
660	{
661	mbstate_t cur_state;
662	wchar_t wc2;
663	int mlen = raw + pstr->len - p;
664	unsigned char buf[6];
665
666	q = p;
667	if (BE (pstr->trans != NULL, 0))
668	{
669	int i = mlen < 6 ? mlen : 6;
670	while (--i >= 0)
671	buf[i] = pstr->trans[p[i]];
672	q = buf;
673	}
674	/* XXX Don't use mbrtowc, we know which conversion
675	to use (UTF-8 -> UCS4). */
676	memset (&cur_state, 0, sizeof (cur_state));
677	mlen = (mbrtowc (&wc2, (const char *) p, mlen,
678	&cur_state)
679	- (raw + offset - p));
680	if (mlen >= 0)
681	{
682	memset (&pstr->cur_state, '\0',
683	sizeof (mbstate_t));
684	pstr->valid_len = mlen;
685	wc = wc2;
686	}
687	break;
688	}
689	}
690
691	if (wc == WEOF)
692	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
693	if (BE (pstr->valid_len, 0))
694	{
695	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
696	pstr->wcs[wcs_idx] = WEOF;
697	if (pstr->mbs_allocated)
698	memset (pstr->mbs, 255, pstr->valid_len);
699	}
700	pstr->valid_raw_len = pstr->valid_len;
701	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
702	&& IS_WIDE_WORD_CHAR (wc))
703	? CONTEXT_WORD
704	: ((IS_WIDE_NEWLINE (wc)
705	&& pstr->newline_anchor)
706	? CONTEXT_NEWLINE : 0));
707	}
708	else
709	#endif /* RE_ENABLE_I18N */
710	{
711	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
712	if (pstr->trans)
713	c = pstr->trans[c];
714	pstr->tip_context = (bitset_contain (pstr->word_char, c)
715	? CONTEXT_WORD
716	: ((IS_NEWLINE (c) && pstr->newline_anchor)
717	? CONTEXT_NEWLINE : 0));
718	}
719	}
720	if (!BE (pstr->mbs_allocated, 0))
721	pstr->mbs += offset;
722	}
723	pstr->raw_mbs_idx = idx;
724	pstr->len -= offset;
725	pstr->stop -= offset;
726
727	/* Then build the buffers. */
728	#ifdef RE_ENABLE_I18N
729	if (pstr->mb_cur_max > 1)
730	{
731	if (pstr->icase)
732	{
733	int ret = build_wcs_upper_buffer (pstr);
734	if (BE (ret != REG_NOERROR, 0))
735	return ret;
736	}
737	else
738	build_wcs_buffer (pstr);
739	}
740	else
741	#endif /* RE_ENABLE_I18N */
742	if (BE (pstr->mbs_allocated, 0))
743	{
744	if (pstr->icase)
745	build_upper_buffer (pstr);
746	else if (pstr->trans != NULL)
747	re_string_translate_buffer (pstr);
748	}
749	else
750	pstr->valid_len = pstr->len;
751
752	pstr->cur_idx = 0;
753	return REG_NOERROR;
754	}
755
756	static unsigned char
757	re_string_peek_byte_case (pstr, idx)
758	const re_string_t *pstr;
759	int idx;
760	{
761	int ch, off;
762
763	/* Handle the common (easiest) cases first. */
764	if (BE (!pstr->mbs_allocated, 1))
765	return re_string_peek_byte (pstr, idx);
766
767	#ifdef RE_ENABLE_I18N
768	if (pstr->mb_cur_max > 1
769	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
770	return re_string_peek_byte (pstr, idx);
771	#endif
772
773	off = pstr->cur_idx + idx;
774	#ifdef RE_ENABLE_I18N
775	if (pstr->offsets_needed)
776	off = pstr->offsets[off];
777	#endif
778
779	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
780
781	#ifdef RE_ENABLE_I18N
782	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
783	this function returns CAPITAL LETTER I instead of first byte of
784	DOTLESS SMALL LETTER I. The latter would confuse the parser,
785	since peek_byte_case doesn't advance cur_idx in any way. */
786	if (pstr->offsets_needed && !isascii (ch))
787	return re_string_peek_byte (pstr, idx);
788	#endif
789
790	return ch;
791	}
792
793	static unsigned char
794	re_string_fetch_byte_case (pstr)
795	re_string_t *pstr;
796	{
797	if (BE (!pstr->mbs_allocated, 1))
798	return re_string_fetch_byte (pstr);
799
800	#ifdef RE_ENABLE_I18N
801	if (pstr->offsets_needed)
802	{
803	int off, ch;
804
805	/* For tr_TR.UTF-8 [[:islower:]] there is
806	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
807	in that case the whole multi-byte character and return
808	the original letter. On the other side, with
809	[[: DOTLESS SMALL LETTER I return [[:I, as doing
810	anything else would complicate things too much. */
811
812	if (!re_string_first_byte (pstr, pstr->cur_idx))
813	return re_string_fetch_byte (pstr);
814
815	off = pstr->offsets[pstr->cur_idx];
816	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
817
818	if (! isascii (ch))
819	return re_string_fetch_byte (pstr);
820
821	re_string_skip_bytes (pstr,
822	re_string_char_size_at (pstr, pstr->cur_idx));
823	return ch;
824	}
825	#endif
826
827	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
828	}
829
830	static void
831	re_string_destruct (pstr)
832	re_string_t *pstr;
833	{
834	#ifdef RE_ENABLE_I18N
835	re_free (pstr->wcs);
836	re_free (pstr->offsets);
837	#endif /* RE_ENABLE_I18N */
838	if (pstr->mbs_allocated)
839	re_free (pstr->mbs);
840	}
841
842	/* Return the context at IDX in INPUT. */
843
844	static unsigned int
845	re_string_context_at (input, idx, eflags)
846	const re_string_t *input;
847	int idx, eflags;
848	{
849	int c;
850	if (BE (idx < 0, 0))
851	/* In this case, we use the value stored in input->tip_context,
852	since we can't know the character in input->mbs[-1] here. */
853	return input->tip_context;
854	if (BE (idx == input->len, 0))
855	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
856	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
857	#ifdef RE_ENABLE_I18N
858	if (input->mb_cur_max > 1)
859	{
860	wint_t wc;
861	int wc_idx = idx;
862	while(input->wcs[wc_idx] == WEOF)
863	{
864	#ifdef DEBUG
865	/* It must not happen. */
866	assert (wc_idx >= 0);
867	#endif
868	--wc_idx;
869	if (wc_idx < 0)
870	return input->tip_context;
871	}
872	wc = input->wcs[wc_idx];
873	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
874	return CONTEXT_WORD;
875	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
876	? CONTEXT_NEWLINE : 0);
877	}
878	else
879	#endif
880	{
881	c = re_string_byte_at (input, idx);
882	if (bitset_contain (input->word_char, c))
883	return CONTEXT_WORD;
884	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
885	}
886	}
887
888
889	/* Functions for set operation. */
890
891	static reg_errcode_t
892	re_node_set_alloc (set, size)
893	re_node_set *set;
894	int size;
895	{
896	set->alloc = size;
897	set->nelem = 0;
898	set->elems = re_malloc (int, size);
899	if (BE (set->elems == NULL, 0))
900	return REG_ESPACE;
901	return REG_NOERROR;
902	}
903
904	static reg_errcode_t
905	re_node_set_init_1 (set, elem)
906	re_node_set *set;
907	int elem;
908	{
909	set->alloc = 1;
910	set->nelem = 1;
911	set->elems = re_malloc (int, 1);
912	if (BE (set->elems == NULL, 0))
913	{
914	set->alloc = set->nelem = 0;
915	return REG_ESPACE;
916	}
917	set->elems[0] = elem;
918	return REG_NOERROR;
919	}
920
921	static reg_errcode_t
922	re_node_set_init_2 (set, elem1, elem2)
923	re_node_set *set;
924	int elem1, elem2;
925	{
926	set->alloc = 2;
927	set->elems = re_malloc (int, 2);
928	if (BE (set->elems == NULL, 0))
929	return REG_ESPACE;
930	if (elem1 == elem2)
931	{
932	set->nelem = 1;
933	set->elems[0] = elem1;
934	}
935	else
936	{
937	set->nelem = 2;
938	if (elem1 < elem2)
939	{
940	set->elems[0] = elem1;
941	set->elems[1] = elem2;
942	}
943	else
944	{
945	set->elems[0] = elem2;
946	set->elems[1] = elem1;
947	}
948	}
949	return REG_NOERROR;
950	}
951
952	static reg_errcode_t
953	re_node_set_init_copy (dest, src)
954	re_node_set *dest;
955	const re_node_set *src;
956	{
957	dest->nelem = src->nelem;
958	if (src->nelem > 0)
959	{
960	dest->alloc = dest->nelem;
961	dest->elems = re_malloc (int, dest->alloc);
962	if (BE (dest->elems == NULL, 0))
963	{
964	dest->alloc = dest->nelem = 0;
965	return REG_ESPACE;
966	}
967	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
968	}
969	else
970	re_node_set_init_empty (dest);
971	return REG_NOERROR;
972	}
973
974	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
975	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
976	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
977
978	static reg_errcode_t
979	re_node_set_add_intersect (dest, src1, src2)
980	re_node_set *dest;
981	const re_node_set src1, src2;
982	{
983	int i1, i2, is, id, delta, sbase;
984	if (src1->nelem == 0 \|\| src2->nelem == 0)
985	return REG_NOERROR;
986
987	/* We need dest->nelem + 2 * elems_in_intersection; this is a
988	conservative estimate. */
989	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
990	{
991	int new_alloc = src1->nelem + src2->nelem + dest->alloc;
992	int *new_elems = re_realloc (dest->elems, int, new_alloc);
993	if (BE (new_elems == NULL, 0))
994	return REG_ESPACE;
995	dest->elems = new_elems;
996	dest->alloc = new_alloc;
997	}
998
999	/* Find the items in the intersection of SRC1 and SRC2, and copy
1000	into the top of DEST those that are not already in DEST itself. */
1001	sbase = dest->nelem + src1->nelem + src2->nelem;
1002	i1 = src1->nelem - 1;
1003	i2 = src2->nelem - 1;
1004	id = dest->nelem - 1;
1005	for (;;)
1006	{
1007	if (src1->elems[i1] == src2->elems[i2])
1008	{
1009	/* Try to find the item in DEST. Maybe we could binary search? */
1010	while (id >= 0 && dest->elems[id] > src1->elems[i1])
1011	--id;
1012
1013	if (id < 0 \|\| dest->elems[id] != src1->elems[i1])
1014	dest->elems[--sbase] = src1->elems[i1];
1015
1016	if (--i1 < 0 \|\| --i2 < 0)
1017	break;
1018	}
1019
1020	/* Lower the highest of the two items. */
1021	else if (src1->elems[i1] < src2->elems[i2])
1022	{
1023	if (--i2 < 0)
1024	break;
1025	}
1026	else
1027	{
1028	if (--i1 < 0)
1029	break;
1030	}
1031	}
1032
1033	id = dest->nelem - 1;
1034	is = dest->nelem + src1->nelem + src2->nelem - 1;
1035	delta = is - sbase + 1;
1036
1037	/* Now copy. When DELTA becomes zero, the remaining
1038	DEST elements are already in place; this is more or
1039	less the same loop that is in re_node_set_merge. */
1040	dest->nelem += delta;
1041	if (delta > 0 && id >= 0)
1042	for (;;)
1043	{
1044	if (dest->elems[is] > dest->elems[id])
1045	{
1046	/* Copy from the top. */
1047	dest->elems[id + delta--] = dest->elems[is--];
1048	if (delta == 0)
1049	break;
1050	}
1051	else
1052	{
1053	/* Slide from the bottom. */
1054	dest->elems[id + delta] = dest->elems[id];
1055	if (--id < 0)
1056	break;
1057	}
1058	}
1059
1060	/* Copy remaining SRC elements. */
1061	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1062
1063	return REG_NOERROR;
1064	}
1065
1066	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1067	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1068
1069	static reg_errcode_t
1070	re_node_set_init_union (dest, src1, src2)
1071	re_node_set *dest;
1072	const re_node_set src1, src2;
1073	{
1074	int i1, i2, id;
1075	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1076	{
1077	dest->alloc = src1->nelem + src2->nelem;
1078	dest->elems = re_malloc (int, dest->alloc);
1079	if (BE (dest->elems == NULL, 0))
1080	return REG_ESPACE;
1081	}
1082	else
1083	{
1084	if (src1 != NULL && src1->nelem > 0)
1085	return re_node_set_init_copy (dest, src1);
1086	else if (src2 != NULL && src2->nelem > 0)
1087	return re_node_set_init_copy (dest, src2);
1088	else
1089	re_node_set_init_empty (dest);
1090	return REG_NOERROR;
1091	}
1092	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1093	{
1094	if (src1->elems[i1] > src2->elems[i2])
1095	{
1096	dest->elems[id++] = src2->elems[i2++];
1097	continue;
1098	}
1099	if (src1->elems[i1] == src2->elems[i2])
1100	++i2;
1101	dest->elems[id++] = src1->elems[i1++];
1102	}
1103	if (i1 < src1->nelem)
1104	{
1105	memcpy (dest->elems + id, src1->elems + i1,
1106	(src1->nelem - i1) * sizeof (int));
1107	id += src1->nelem - i1;
1108	}
1109	else if (i2 < src2->nelem)
1110	{
1111	memcpy (dest->elems + id, src2->elems + i2,
1112	(src2->nelem - i2) * sizeof (int));
1113	id += src2->nelem - i2;
1114	}
1115	dest->nelem = id;
1116	return REG_NOERROR;
1117	}
1118
1119	/* Calculate the union set of the sets DEST and SRC. And store it to
1120	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1121
1122	static reg_errcode_t
1123	re_node_set_merge (dest, src)
1124	re_node_set *dest;
1125	const re_node_set *src;
1126	{
1127	int is, id, sbase, delta;
1128	if (src == NULL \|\| src->nelem == 0)
1129	return REG_NOERROR;
1130	if (dest->alloc < 2 * src->nelem + dest->nelem)
1131	{
1132	int new_alloc = 2 * (src->nelem + dest->alloc);
1133	int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1134	if (BE (new_buffer == NULL, 0))
1135	return REG_ESPACE;
1136	dest->elems = new_buffer;
1137	dest->alloc = new_alloc;
1138	}
1139
1140	if (BE (dest->nelem == 0, 0))
1141	{
1142	dest->nelem = src->nelem;
1143	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1144	return REG_NOERROR;
1145	}
1146
1147	/* Copy into the top of DEST the items of SRC that are not
1148	found in DEST. Maybe we could binary search in DEST? */
1149	for (sbase = dest->nelem + 2 * src->nelem,
1150	is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1151	{
1152	if (dest->elems[id] == src->elems[is])
1153	is--, id--;
1154	else if (dest->elems[id] < src->elems[is])
1155	dest->elems[--sbase] = src->elems[is--];
1156	else /* if (dest->elems[id] > src->elems[is]) */
1157	--id;
1158	}
1159
1160	if (is >= 0)
1161	{
1162	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1163	sbase -= is + 1;
1164	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1165	}
1166
1167	id = dest->nelem - 1;
1168	is = dest->nelem + 2 * src->nelem - 1;
1169	delta = is - sbase + 1;
1170	if (delta == 0)
1171	return REG_NOERROR;
1172
1173	/* Now copy. When DELTA becomes zero, the remaining
1174	DEST elements are already in place. */
1175	dest->nelem += delta;
1176	for (;;)
1177	{
1178	if (dest->elems[is] > dest->elems[id])
1179	{
1180	/* Copy from the top. */
1181	dest->elems[id + delta--] = dest->elems[is--];
1182	if (delta == 0)
1183	break;
1184	}
1185	else
1186	{
1187	/* Slide from the bottom. */
1188	dest->elems[id + delta] = dest->elems[id];
1189	if (--id < 0)
1190	{
1191	/* Copy remaining SRC elements. */
1192	memcpy (dest->elems, dest->elems + sbase,
1193	delta * sizeof (int));
1194	break;
1195	}
1196	}
1197	}
1198
1199	return REG_NOERROR;
1200	}
1201
1202	/* Insert the new element ELEM to the re_node_set* SET.
1203	SET should not already have ELEM.
1204	return -1 if an error is occured, return 1 otherwise. */
1205
1206	static int
1207	re_node_set_insert (set, elem)
1208	re_node_set *set;
1209	int elem;
1210	{
1211	int idx;
1212	/* In case the set is empty. */
1213	if (set->alloc == 0)
1214	{
1215	if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1216	return 1;
1217	else
1218	return -1;
1219	}
1220
1221	if (BE (set->nelem, 0) == 0)
1222	{
1223	/* We already guaranteed above that set->alloc != 0. */
1224	set->elems[0] = elem;
1225	++set->nelem;
1226	return 1;
1227	}
1228
1229	/* Realloc if we need. */
1230	if (set->alloc == set->nelem)
1231	{
1232	int *new_array;
1233	set->alloc = set->alloc * 2;
1234	new_array = re_realloc (set->elems, int, set->alloc);
1235	if (BE (new_array == NULL, 0))
1236	return -1;
1237	set->elems = new_array;
1238	}
1239
1240	/* Move the elements which follows the new element. Test the
1241	first element separately to skip a check in the inner loop. */
1242	if (elem < set->elems[0])
1243	{
1244	idx = 0;
1245	for (idx = set->nelem; idx > 0; idx--)
1246	set->elems[idx] = set->elems[idx - 1];
1247	}
1248	else
1249	{
1250	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1251	set->elems[idx] = set->elems[idx - 1];
1252	}
1253
1254	/* Insert the new element. */
1255	set->elems[idx] = elem;
1256	++set->nelem;
1257	return 1;
1258	}
1259
1260	/* Insert the new element ELEM to the re_node_set* SET.
1261	SET should not already have any element greater than or equal to ELEM.
1262	Return -1 if an error is occured, return 1 otherwise. */
1263
1264	static int
1265	re_node_set_insert_last (set, elem)
1266	re_node_set *set;
1267	int elem;
1268	{
1269	/* Realloc if we need. */
1270	if (set->alloc == set->nelem)
1271	{
1272	int *new_array;
1273	set->alloc = (set->alloc + 1) * 2;
1274	new_array = re_realloc (set->elems, int, set->alloc);
1275	if (BE (new_array == NULL, 0))
1276	return -1;
1277	set->elems = new_array;
1278	}
1279
1280	/* Insert the new element. */
1281	set->elems[set->nelem++] = elem;
1282	return 1;
1283	}
1284
1285	/* Compare two node sets SET1 and SET2.
1286	return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1287
1288	static int
1289	re_node_set_compare (set1, set2)
1290	const re_node_set set1, set2;
1291	{
1292	int i;
1293	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1294	return 0;
1295	for (i = set1->nelem ; --i >= 0 ; )
1296	if (set1->elems[i] != set2->elems[i])
1297	return 0;
1298	return 1;
1299	}
1300
1301	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1302
1303	static int
1304	re_node_set_contains (set, elem)
1305	const re_node_set *set;
1306	int elem;
1307	{
1308	unsigned int idx, right, mid;
1309	if (set->nelem <= 0)
1310	return 0;
1311
1312	/* Binary search the element. */
1313	idx = 0;
1314	right = set->nelem - 1;
1315	while (idx < right)
1316	{
1317	mid = (idx + right) / 2;
1318	if (set->elems[mid] < elem)
1319	idx = mid + 1;
1320	else
1321	right = mid;
1322	}
1323	return set->elems[idx] == elem ? idx + 1 : 0;
1324	}
1325
1326	static void
1327	re_node_set_remove_at (set, idx)
1328	re_node_set *set;
1329	int idx;
1330	{
1331	if (idx < 0 \|\| idx >= set->nelem)
1332	return;
1333	--set->nelem;
1334	for (; idx < set->nelem; idx++)
1335	set->elems[idx] = set->elems[idx + 1];
1336	}
1337
1338
1339
1340	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1341	Or return -1, if an error will be occured. */
1342
1343	static int
1344	re_dfa_add_node (dfa, token)
1345	re_dfa_t *dfa;
1346	re_token_t token;
1347	{
1348	int type = token.type;
1349	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1350	{
1351	int new_nodes_alloc = dfa->nodes_alloc * 2;
1352	int new_nexts, new_indices;
1353	re_node_set new_edests, new_eclosures;
1354
1355	re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1356	new_nodes_alloc);
1357	if (BE (new_array == NULL, 0))
1358	return -1;
1359	dfa->nodes = new_array;
1360	new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1361	new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1362	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1363	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1364	if (BE (new_nexts == NULL \|\| new_indices == NULL
1365	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
1366	return -1;
1367	dfa->nexts = new_nexts;
1368	dfa->org_indices = new_indices;
1369	dfa->edests = new_edests;
1370	dfa->eclosures = new_eclosures;
1371	dfa->nodes_alloc = new_nodes_alloc;
1372	}
1373	dfa->nodes[dfa->nodes_len] = token;
1374	dfa->nodes[dfa->nodes_len].constraint = 0;
1375	#ifdef RE_ENABLE_I18N
1376	dfa->nodes[dfa->nodes_len].accept_mb =
1377	(type == OP_PERIOD && dfa->mb_cur_max > 1) \|\| type == COMPLEX_BRACKET;
1378	#endif
1379	dfa->nexts[dfa->nodes_len] = -1;
1380	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1381	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1382	return dfa->nodes_len++;
1383	}
1384
1385	static unsigned int inline
1386	calc_state_hash (nodes, context)
1387	const re_node_set *nodes;
1388	unsigned int context;
1389	{
1390	unsigned int hash = nodes->nelem + context;
1391	int i;
1392	for (i = 0 ; i < nodes->nelem ; i++)
1393	hash += nodes->elems[i];
1394	return hash;
1395	}
1396
1397	/* Search for the state whose node_set is equivalent to NODES.
1398	Return the pointer to the state, if we found it in the DFA.
1399	Otherwise create the new one and return it. In case of an error
1400	return NULL and set the error code in ERR.
1401	Note: - We assume NULL as the invalid state, then it is possible that
1402	return value is NULL and ERR is REG_NOERROR.
1403	- We never return non-NULL value in case of any errors, it is for
1404	optimization. */
1405
1406	static re_dfastate_t*
1407	re_acquire_state (err, dfa, nodes)
1408	reg_errcode_t *err;
1409	re_dfa_t *dfa;
1410	const re_node_set *nodes;
1411	{
1412	unsigned int hash;
1413	re_dfastate_t *new_state;
1414	struct re_state_table_entry *spot;
1415	int i;
1416	if (BE (nodes->nelem == 0, 0))
1417	{
1418	*err = REG_NOERROR;
1419	return NULL;
1420	}
1421	hash = calc_state_hash (nodes, 0);
1422	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1423
1424	for (i = 0 ; i < spot->num ; i++)
1425	{
1426	re_dfastate_t *state = spot->array[i];
1427	if (hash != state->hash)
1428	continue;
1429	if (re_node_set_compare (&state->nodes, nodes))
1430	return state;
1431	}
1432
1433	/* There are no appropriate state in the dfa, create the new one. */
1434	new_state = create_ci_newstate (dfa, nodes, hash);
1435	if (BE (new_state != NULL, 1))
1436	return new_state;
1437	else
1438	{
1439	*err = REG_ESPACE;
1440	return NULL;
1441	}
1442	}
1443
1444	/* Search for the state whose node_set is equivalent to NODES and
1445	whose context is equivalent to CONTEXT.
1446	Return the pointer to the state, if we found it in the DFA.
1447	Otherwise create the new one and return it. In case of an error
1448	return NULL and set the error code in ERR.
1449	Note: - We assume NULL as the invalid state, then it is possible that
1450	return value is NULL and ERR is REG_NOERROR.
1451	- We never return non-NULL value in case of any errors, it is for
1452	optimization. */
1453
1454	static re_dfastate_t*
1455	re_acquire_state_context (err, dfa, nodes, context)
1456	reg_errcode_t *err;
1457	re_dfa_t *dfa;
1458	const re_node_set *nodes;
1459	unsigned int context;
1460	{
1461	unsigned int hash;
1462	re_dfastate_t *new_state;
1463	struct re_state_table_entry *spot;
1464	int i;
1465	if (nodes->nelem == 0)
1466	{
1467	*err = REG_NOERROR;
1468	return NULL;
1469	}
1470	hash = calc_state_hash (nodes, context);
1471	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1472
1473	for (i = 0 ; i < spot->num ; i++)
1474	{
1475	re_dfastate_t *state = spot->array[i];
1476	if (state->hash == hash
1477	&& state->context == context
1478	&& re_node_set_compare (state->entrance_nodes, nodes))
1479	return state;
1480	}
1481	/* There are no appropriate state in `dfa', create the new one. */
1482	new_state = create_cd_newstate (dfa, nodes, context, hash);
1483	if (BE (new_state != NULL, 1))
1484	return new_state;
1485	else
1486	{
1487	*err = REG_ESPACE;
1488	return NULL;
1489	}
1490	}
1491
1492	/* Finish initialization of the new state NEWSTATE, and using its hash value
1493	HASH put in the appropriate bucket of DFA's state table. Return value
1494	indicates the error code if failed. */
1495
1496	static reg_errcode_t
1497	register_state (dfa, newstate, hash)
1498	re_dfa_t *dfa;
1499	re_dfastate_t *newstate;
1500	unsigned int hash;
1501	{
1502	struct re_state_table_entry *spot;
1503	reg_errcode_t err;
1504	int i;
1505
1506	newstate->hash = hash;
1507	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1508	if (BE (err != REG_NOERROR, 0))
1509	return REG_ESPACE;
1510	for (i = 0; i < newstate->nodes.nelem; i++)
1511	{
1512	int elem = newstate->nodes.elems[i];
1513	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1514	re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1515	}
1516
1517	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1518	if (BE (spot->alloc <= spot->num, 0))
1519	{
1520	int new_alloc = 2 * spot->num + 2;
1521	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1522	new_alloc);
1523	if (BE (new_array == NULL, 0))
1524	return REG_ESPACE;
1525	spot->array = new_array;
1526	spot->alloc = new_alloc;
1527	}
1528	spot->array[spot->num++] = newstate;
1529	return REG_NOERROR;
1530	}
1531
1532	/* Create the new state which is independ of contexts.
1533	Return the new state if succeeded, otherwise return NULL. */
1534
1535	static re_dfastate_t *
1536	create_ci_newstate (dfa, nodes, hash)
1537	re_dfa_t *dfa;
1538	const re_node_set *nodes;
1539	unsigned int hash;
1540	{
1541	int i;
1542	reg_errcode_t err;
1543	re_dfastate_t *newstate;
1544
1545	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1546	if (BE (newstate == NULL, 0))
1547	return NULL;
1548	err = re_node_set_init_copy (&newstate->nodes, nodes);
1549	if (BE (err != REG_NOERROR, 0))
1550	{
1551	re_free (newstate);
1552	return NULL;
1553	}
1554
1555	newstate->entrance_nodes = &newstate->nodes;
1556	for (i = 0 ; i < nodes->nelem ; i++)
1557	{
1558	re_token_t *node = dfa->nodes + nodes->elems[i];
1559	re_token_type_t type = node->type;
1560	if (type == CHARACTER && !node->constraint)
1561	continue;
1562	#ifdef RE_ENABLE_I18N
1563	newstate->accept_mb \|= node->accept_mb;
1564	#endif /* RE_ENABLE_I18N */
1565
1566	/* If the state has the halt node, the state is a halt state. */
1567	if (type == END_OF_RE)
1568	newstate->halt = 1;
1569	else if (type == OP_BACK_REF)
1570	newstate->has_backref = 1;
1571	else if (type == ANCHOR \|\| node->constraint)
1572	newstate->has_constraint = 1;
1573	}
1574	err = register_state (dfa, newstate, hash);
1575	if (BE (err != REG_NOERROR, 0))
1576	{
1577	free_state (newstate);
1578	newstate = NULL;
1579	}
1580	return newstate;
1581	}
1582
1583	/* Create the new state which is depend on the context CONTEXT.
1584	Return the new state if succeeded, otherwise return NULL. */
1585
1586	static re_dfastate_t *
1587	create_cd_newstate (dfa, nodes, context, hash)
1588	re_dfa_t *dfa;
1589	const re_node_set *nodes;
1590	unsigned int context, hash;
1591	{
1592	int i, nctx_nodes = 0;
1593	reg_errcode_t err;
1594	re_dfastate_t *newstate;
1595
1596	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1597	if (BE (newstate == NULL, 0))
1598	return NULL;
1599	err = re_node_set_init_copy (&newstate->nodes, nodes);
1600	if (BE (err != REG_NOERROR, 0))
1601	{
1602	re_free (newstate);
1603	return NULL;
1604	}
1605
1606	newstate->context = context;
1607	newstate->entrance_nodes = &newstate->nodes;
1608
1609	for (i = 0 ; i < nodes->nelem ; i++)
1610	{
1611	unsigned int constraint = 0;
1612	re_token_t *node = dfa->nodes + nodes->elems[i];
1613	re_token_type_t type = node->type;
1614	if (node->constraint)
1615	constraint = node->constraint;
1616
1617	if (type == CHARACTER && !constraint)
1618	continue;
1619	#ifdef RE_ENABLE_I18N
1620	newstate->accept_mb \|= node->accept_mb;
1621	#endif /* RE_ENABLE_I18N */
1622
1623	/* If the state has the halt node, the state is a halt state. */
1624	if (type == END_OF_RE)
1625	newstate->halt = 1;
1626	else if (type == OP_BACK_REF)
1627	newstate->has_backref = 1;
1628	else if (type == ANCHOR)
1629	constraint = node->opr.ctx_type;
1630
1631	if (constraint)
1632	{
1633	if (newstate->entrance_nodes == &newstate->nodes)
1634	{
1635	newstate->entrance_nodes = re_malloc (re_node_set, 1);
1636	if (BE (newstate->entrance_nodes == NULL, 0))
1637	{
1638	free_state (newstate);
1639	return NULL;
1640	}
1641	re_node_set_init_copy (newstate->entrance_nodes, nodes);
1642	nctx_nodes = 0;
1643	newstate->has_constraint = 1;
1644	}
1645
1646	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1647	{
1648	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1649	++nctx_nodes;
1650	}
1651	}
1652	}
1653	err = register_state (dfa, newstate, hash);
1654	if (BE (err != REG_NOERROR, 0))
1655	{
1656	free_state (newstate);
1657	newstate = NULL;
1658	}
1659	return newstate;
1660	}
1661
1662	static void
1663	free_state (state)
1664	re_dfastate_t *state;
1665	{
1666	re_node_set_free (&state->non_eps_nodes);
1667	re_node_set_free (&state->inveclosure);
1668	if (state->entrance_nodes != &state->nodes)
1669	{
1670	re_node_set_free (state->entrance_nodes);
1671	re_free (state->entrance_nodes);
1672	}
1673	re_node_set_free (&state->nodes);
1674	re_free (state->word_trtable);
1675	re_free (state->trtable);
1676	re_free (state);
1677	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: