Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

dfa.c@ 2817

Visit:

Last change on this file since 2817 was 2557, checked in by bird, 20 years ago
grep 2.5.1a
File size: 94.6 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; either version 2, or (at your option)
7	any later version.
8
9	This program is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License
15	along with this program; if not, write to the Free Software
16	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
17
18	/* Written June, 1988 by Mike Haertel
19	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
20
21	#ifdef HAVE_CONFIG_H
22	#include <config.h>
23	#endif
24
25	#include <assert.h>
26	#include <ctype.h>
27	#include <stdio.h>
28
29	#include <sys/types.h>
30	#ifdef STDC_HEADERS
31	#include <stdlib.h>
32	#else
33	extern char calloc(), malloc(), *realloc();
34	extern void free();
35	#endif
36
37	#if defined(HAVE_STRING_H) \|\| defined(STDC_HEADERS)
38	#include <string.h>
39	#else
40	#include <strings.h>
41	#endif
42
43	#if HAVE_SETLOCALE
44	# include <locale.h>
45	#endif
46
47	#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
48	/* We can handle multibyte string. */
49	# define MBS_SUPPORT
50	#endif
51
52	#ifdef MBS_SUPPORT
53	# include <wchar.h>
54	# include <wctype.h>
55	#endif
56
57	#ifndef DEBUG /* use the same approach as regex.c */
58	#undef assert
59	#define assert(e)
60	#endif /* DEBUG */
61
62	#ifndef isgraph
63	#define isgraph(C) (isprint(C) && !isspace(C))
64	#endif
65
66	#if defined (STDC_HEADERS) \|\| (!defined (isascii) && !defined (HAVE_ISASCII))
67	#define ISALPHA(C) isalpha(C)
68	#define ISUPPER(C) isupper(C)
69	#define ISLOWER(C) islower(C)
70	#define ISDIGIT(C) isdigit(C)
71	#define ISXDIGIT(C) isxdigit(C)
72	#define ISSPACE(C) isspace(C)
73	#define ISPUNCT(C) ispunct(C)
74	#define ISALNUM(C) isalnum(C)
75	#define ISPRINT(C) isprint(C)
76	#define ISGRAPH(C) isgraph(C)
77	#define ISCNTRL(C) iscntrl(C)
78	#else
79	#define ISALPHA(C) (isascii(C) && isalpha(C))
80	#define ISUPPER(C) (isascii(C) && isupper(C))
81	#define ISLOWER(C) (isascii(C) && islower(C))
82	#define ISDIGIT(C) (isascii(C) && isdigit(C))
83	#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
84	#define ISSPACE(C) (isascii(C) && isspace(C))
85	#define ISPUNCT(C) (isascii(C) && ispunct(C))
86	#define ISALNUM(C) (isascii(C) && isalnum(C))
87	#define ISPRINT(C) (isascii(C) && isprint(C))
88	#define ISGRAPH(C) (isascii(C) && isgraph(C))
89	#define ISCNTRL(C) (isascii(C) && iscntrl(C))
90	#endif
91
92	/* ISASCIIDIGIT differs from ISDIGIT, as follows:
93	- Its arg may be any int or unsigned int; it need not be an unsigned char.
94	- It's guaranteed to evaluate its argument exactly once.
95	- It's typically faster.
96	Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
97	only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
98	it's important to use the locale's definition of `digit' even when the
99	host does not conform to Posix. */
100	#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
101
102	/* If we (don't) have I18N. */
103	/* glibc defines _ */
104	#ifndef _
105	# ifdef HAVE_LIBINTL_H
106	# include <libintl.h>
107	# ifndef _
108	# define _(Str) gettext (Str)
109	# endif
110	# else
111	# define _(Str) (Str)
112	# endif
113	#endif
114
115	#include "regex.h"
116	#include "dfa.h"
117	#include "hard-locale.h"
118
119	/* HPUX, define those as macros in sys/param.h */
120	#ifdef setbit
121	# undef setbit
122	#endif
123	#ifdef clrbit
124	# undef clrbit
125	#endif
126
127	static void dfamust PARAMS ((struct dfa *dfa));
128	static void regexp PARAMS ((int toplevel));
129
130	static ptr_t
131	xcalloc (size_t n, size_t s)
132	{
133	ptr_t r = calloc(n, s);
134
135	if (!r)
136	dfaerror(_("Memory exhausted"));
137	return r;
138	}
139
140	static ptr_t
141	xmalloc (size_t n)
142	{
143	ptr_t r = malloc(n);
144
145	assert(n != 0);
146	if (!r)
147	dfaerror(_("Memory exhausted"));
148	return r;
149	}
150
151	static ptr_t
152	xrealloc (ptr_t p, size_t n)
153	{
154	ptr_t r = realloc(p, n);
155
156	assert(n != 0);
157	if (!r)
158	dfaerror(_("Memory exhausted"));
159	return r;
160	}
161
162	#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
163	#define MALLOC(p, t, n) ((p) = (t ) xmalloc((n) sizeof (t)))
164	#define REALLOC(p, t, n) ((p) = (t ) xrealloc((ptr_t) (p), (n) sizeof (t)))
165
166	/* Reallocate an array of type t if nalloc is too small for index. */
167	#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
168	if ((index) >= (nalloc)) \
169	{ \
170	do \
171	(nalloc) *= 2; \
172	while ((index) >= (nalloc)); \
173	REALLOC(p, t, nalloc); \
174	}
175
176	#ifdef DEBUG
177
178	static void
179	prtok (token t)
180	{
181	char const *s;
182
183	if (t < 0)
184	fprintf(stderr, "END");
185	else if (t < NOTCHAR)
186	fprintf(stderr, "%c", t);
187	else
188	{
189	switch (t)
190	{
191	case EMPTY: s = "EMPTY"; break;
192	case BACKREF: s = "BACKREF"; break;
193	case BEGLINE: s = "BEGLINE"; break;
194	case ENDLINE: s = "ENDLINE"; break;
195	case BEGWORD: s = "BEGWORD"; break;
196	case ENDWORD: s = "ENDWORD"; break;
197	case LIMWORD: s = "LIMWORD"; break;
198	case NOTLIMWORD: s = "NOTLIMWORD"; break;
199	case QMARK: s = "QMARK"; break;
200	case STAR: s = "STAR"; break;
201	case PLUS: s = "PLUS"; break;
202	case CAT: s = "CAT"; break;
203	case OR: s = "OR"; break;
204	case ORTOP: s = "ORTOP"; break;
205	case LPAREN: s = "LPAREN"; break;
206	case RPAREN: s = "RPAREN"; break;
207	case CRANGE: s = "CRANGE"; break;
208	#ifdef MBS_SUPPORT
209	case ANYCHAR: s = "ANYCHAR"; break;
210	case MBCSET: s = "MBCSET"; break;
211	#endif /* MBS_SUPPORT */
212	default: s = "CSET"; break;
213	}
214	fprintf(stderr, "%s", s);
215	}
216	}
217	#endif /* DEBUG */
218
219	/* Stuff pertaining to charclasses. */
220
221	static int
222	tstbit (unsigned b, charclass c)
223	{
224	return c[b / INTBITS] & 1 << b % INTBITS;
225	}
226
227	static void
228	setbit (unsigned b, charclass c)
229	{
230	c[b / INTBITS] \|= 1 << b % INTBITS;
231	}
232
233	static void
234	clrbit (unsigned b, charclass c)
235	{
236	c[b / INTBITS] &= ~(1 << b % INTBITS);
237	}
238
239	static void
240	copyset (charclass src, charclass dst)
241	{
242	memcpy (dst, src, sizeof (charclass));
243	}
244
245	static void
246	zeroset (charclass s)
247	{
248	memset (s, 0, sizeof (charclass));
249	}
250
251	static void
252	notset (charclass s)
253	{
254	int i;
255
256	for (i = 0; i < CHARCLASS_INTS; ++i)
257	s[i] = ~s[i];
258	}
259
260	static int
261	equal (charclass s1, charclass s2)
262	{
263	return memcmp (s1, s2, sizeof (charclass)) == 0;
264	}
265
266	/* A pointer to the current dfa is kept here during parsing. */
267	static struct dfa *dfa;
268
269	/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
270	static int
271	charclass_index (charclass s)
272	{
273	int i;
274
275	for (i = 0; i < dfa->cindex; ++i)
276	if (equal(s, dfa->charclasses[i]))
277	return i;
278	REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
279	++dfa->cindex;
280	copyset(s, dfa->charclasses[i]);
281	return i;
282	}
283
284	/* Syntax bits controlling the behavior of the lexical analyzer. */
285	static reg_syntax_t syntax_bits, syntax_bits_set;
286
287	/* Flag for case-folding letters into sets. */
288	static int case_fold;
289
290	/* End-of-line byte in data. */
291	static unsigned char eolbyte;
292
293	/* Entry point to set syntax options. */
294	void
295	dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
296	{
297	syntax_bits_set = 1;
298	syntax_bits = bits;
299	case_fold = fold;
300	eolbyte = eol;
301	}
302
303	/* Like setbit, but if case is folded, set both cases of a letter. */
304	static void
305	setbit_case_fold (unsigned b, charclass c)
306	{
307	setbit (b, c);
308	if (case_fold)
309	{
310	if (ISUPPER (b))
311	setbit (tolower (b), c);
312	else if (ISLOWER (b))
313	setbit (toupper (b), c);
314	}
315	}
316
317	/* Lexical analyzer. All the dross that deals with the obnoxious
318	GNU Regex syntax bits is located here. The poor, suffering
319	reader is referred to the GNU Regex documentation for the
320	meaning of the @#%!@#%^!@ syntax bits. */
321
322	static char const lexstart; / Pointer to beginning of input string. */
323	static char const lexptr; / Pointer to next input character. */
324	static int lexleft; /* Number of characters remaining. */
325	static token lasttok; /* Previous token returned; initially END. */
326	static int laststart; /* True if we're separated from beginning or (, \|
327	only by zero-width characters. */
328	static int parens; /* Count of outstanding left parens. */
329	static int minrep, maxrep; /* Repeat counts for {m,n}. */
330	static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
331
332	#ifdef MBS_SUPPORT
333	/* These variables are used only if (MB_CUR_MAX > 1). */
334	static mbstate_t mbs; /* Mbstate for mbrlen(). */
335	static int cur_mb_len; /* Byte length of the current scanning
336	multibyte character. */
337	static int cur_mb_index; /* Byte index of the current scanning multibyte
338	character.
339
340	singlebyte character : cur_mb_index = 0
341	multibyte character
342	1st byte : cur_mb_index = 1
343	2nd byte : cur_mb_index = 2
344	...
345	nth byte : cur_mb_index = n */
346	static unsigned char mblen_buf;/ Correspond to the input buffer in dfaexec().
347	Each element store the amount of remain
348	byte of corresponding multibyte character
349	in the input string. A element's value
350	is 0 if corresponding character is a
351	singlebyte chracter.
352	e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
353	mblen_buf : 0, 3, 2, 1
354	*/
355	static wchar_t inputwcs; / Wide character representation of input
356	string in dfaexec().
357	The length of this array is same as
358	the length of input string(char array).
359	inputstring[i] is a single-byte char,
360	or 1st byte of a multibyte char.
361	And inputwcs[i] is the codepoint. */
362	static unsigned char const buf_begin;/ refference to begin in dfaexec(). */
363	static unsigned char const buf_end; / refference to end in dfaexec(). */
364	#endif /* MBS_SUPPORT */
365
366	#ifdef MBS_SUPPORT
367	/* This function update cur_mb_len, and cur_mb_index.
368	p points current lexptr, len is the remaining buffer length. */
369	static void
370	update_mb_len_index (unsigned char const *p, int len)
371	{
372	/* If last character is a part of a multibyte character,
373	we update cur_mb_index. */
374	if (cur_mb_index)
375	cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
376	: cur_mb_index + 1;
377
378	/* If last character is a single byte character, or the
379	last portion of a multibyte character, we check whether
380	next character is a multibyte character or not. */
381	if (! cur_mb_index)
382	{
383	cur_mb_len = mbrlen(p, len, &mbs);
384	if (cur_mb_len > 1)
385	/* It is a multibyte character.
386	cur_mb_len was already set by mbrlen(). */
387	cur_mb_index = 1;
388	else if (cur_mb_len < 1)
389	/* Invalid sequence. We treat it as a singlebyte character.
390	cur_mb_index is aleady 0. */
391	cur_mb_len = 1;
392	/* Otherwise, cur_mb_len == 1, it is a singlebyte character.
393	cur_mb_index is aleady 0. */
394	}
395	}
396	#endif /* MBS_SUPPORT */
397
398	#ifdef MBS_SUPPORT
399	/* Note that characters become unsigned here. */
400	# define FETCH(c, eoferr) \
401	{ \
402	if (! lexleft) \
403	{ \
404	if (eoferr != 0) \
405	dfaerror (eoferr); \
406	else \
407	return lasttok = END; \
408	} \
409	if (MB_CUR_MAX > 1) \
410	update_mb_len_index(lexptr, lexleft); \
411	(c) = (unsigned char) *lexptr++; \
412	--lexleft; \
413	}
414
415	/* This function fetch a wide character, and update cur_mb_len,
416	used only if the current locale is a multibyte environment. */
417	static wchar_t
418	fetch_wc (char const *eoferr)
419	{
420	wchar_t wc;
421	if (! lexleft)
422	{
423	if (eoferr != 0)
424	dfaerror (eoferr);
425	else
426	return -1;
427	}
428
429	cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
430	if (cur_mb_len <= 0)
431	{
432	cur_mb_len = 1;
433	wc = *lexptr;
434	}
435	lexptr += cur_mb_len;
436	lexleft -= cur_mb_len;
437	return wc;
438	}
439	#else
440	/* Note that characters become unsigned here. */
441	# define FETCH(c, eoferr) \
442	{ \
443	if (! lexleft) \
444	{ \
445	if (eoferr != 0) \
446	dfaerror (eoferr); \
447	else \
448	return lasttok = END; \
449	} \
450	(c) = (unsigned char) *lexptr++; \
451	--lexleft; \
452	}
453	#endif /* MBS_SUPPORT */
454
455	#ifdef MBS_SUPPORT
456	/* Multibyte character handling sub-routin for lex.
457	This function parse a bracket expression and build a struct
458	mb_char_classes. */
459	static void
460	parse_bracket_exp_mb ()
461	{
462	wchar_t wc, wc1, wc2;
463
464	/* Work area to build a mb_char_classes. */
465	struct mb_char_classes *work_mbc;
466	int chars_al, range_sts_al, range_ends_al, ch_classes_al,
467	equivs_al, coll_elems_al;
468
469	REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
470	dfa->mbcsets_alloc, dfa->nmbcsets + 1);
471	/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
472	We will update dfa->multibyte_prop in addtok(), because we can't
473	decide the index in dfa->tokens[]. */
474
475	/* Initialize work are */
476	work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
477
478	chars_al = 1;
479	range_sts_al = range_ends_al = 0;
480	ch_classes_al = equivs_al = coll_elems_al = 0;
481	MALLOC(work_mbc->chars, wchar_t, chars_al);
482
483	work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
484	work_mbc->nequivs = work_mbc->ncoll_elems = 0;
485	work_mbc->chars = work_mbc->ch_classes = NULL;
486	work_mbc->range_sts = work_mbc->range_ends = NULL;
487	work_mbc->equivs = work_mbc->coll_elems = NULL;
488
489	wc = fetch_wc(_("Unbalanced ["));
490	if (wc == L'^')
491	{
492	wc = fetch_wc(_("Unbalanced ["));
493	work_mbc->invert = 1;
494	}
495	else
496	work_mbc->invert = 0;
497	do
498	{
499	wc1 = -1; /* mark wc1 is not initialized". */
500
501	/* Note that if we're looking at some other [:...:] construct,
502	we just treat it as a bunch of ordinary characters. We can do
503	this because we assume regex has checked for syntax errors before
504	dfa is ever called. */
505	if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
506	{
507	#define BRACKET_BUFFER_SIZE 128
508	char str[BRACKET_BUFFER_SIZE];
509	wc1 = wc;
510	wc = fetch_wc(_("Unbalanced ["));
511
512	/* If pattern contains `[[:', `[[.', or `[[='. */
513	if (cur_mb_len == 1 && (wc == L':' \|\| wc == L'.' \|\| wc == L'='))
514	{
515	unsigned char c;
516	unsigned char delim = (unsigned char)wc;
517	int len = 0;
518	for (;;)
519	{
520	if (! lexleft)
521	dfaerror (_("Unbalanced ["));
522	c = (unsigned char) *lexptr++;
523	--lexleft;
524
525	if ((c == delim && *lexptr == ']') \|\| lexleft == 0)
526	break;
527	if (len < BRACKET_BUFFER_SIZE)
528	str[len++] = c;
529	else
530	/* This is in any case an invalid class name. */
531	str[0] = '\0';
532	}
533	str[len] = '\0';
534
535	if (lexleft == 0)
536	{
537	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
538	work_mbc->nchars + 2);
539	work_mbc->chars[work_mbc->nchars++] = L'[';
540	work_mbc->chars[work_mbc->nchars++] = delim;
541	break;
542	}
543
544	if (--lexleft, *lexptr++ != ']')
545	dfaerror (_("Unbalanced ["));
546	if (delim == ':')
547	/* build character class. */
548	{
549	wctype_t wt;
550	/* Query the character class as wctype_t. */
551	wt = wctype (str);
552
553	if (ch_classes_al == 0)
554	MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);
555	REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
556	ch_classes_al,
557	work_mbc->nch_classes + 1);
558	work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
559
560	}
561	else if (delim == '=' \|\| delim == '.')
562	{
563	char *elem;
564	MALLOC(elem, char, len + 1);
565	strncpy(elem, str, len + 1);
566
567	if (delim == '=')
568	/* build equivalent class. */
569	{
570	if (equivs_al == 0)
571	MALLOC(work_mbc->equivs, char*, ++equivs_al);
572	REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
573	equivs_al,
574	work_mbc->nequivs + 1);
575	work_mbc->equivs[work_mbc->nequivs++] = elem;
576	}
577
578	if (delim == '.')
579	/* build collating element. */
580	{
581	if (coll_elems_al == 0)
582	MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
583	REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
584	coll_elems_al,
585	work_mbc->ncoll_elems + 1);
586	work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
587	}
588	}
589	wc = -1;
590	}
591	else
592	/* We treat '[' as a normal character here. */
593	{
594	wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
595	}
596	}
597	else
598	{
599	if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
600	wc = fetch_wc(("Unbalanced ["));
601	}
602
603	if (wc1 == -1)
604	wc1 = fetch_wc(_("Unbalanced ["));
605
606	if (wc1 == L'-')
607	/* build range characters. */
608	{
609	wc2 = fetch_wc(_("Unbalanced ["));
610	if (wc2 == L']')
611	{
612	/* In the case [x-], the - is an ordinary hyphen,
613	which is left in c1, the lookahead character. */
614	lexptr -= cur_mb_len;
615	lexleft += cur_mb_len;
616	wc2 = wc;
617	}
618	else
619	{
620	if (wc2 == L'\\'
621	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
622	wc2 = fetch_wc(_("Unbalanced ["));
623	wc1 = fetch_wc(_("Unbalanced ["));
624	}
625
626	if (range_sts_al == 0)
627	{
628	MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
629	MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
630	}
631	REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
632	range_sts_al, work_mbc->nranges + 1);
633	work_mbc->range_sts[work_mbc->nranges] = wc;
634	REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
635	range_ends_al, work_mbc->nranges + 1);
636	work_mbc->range_ends[work_mbc->nranges++] = wc2;
637	}
638	else if (wc != -1)
639	/* build normal characters. */
640	{
641	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
642	work_mbc->nchars + 1);
643	work_mbc->chars[work_mbc->nchars++] = wc;
644	}
645	}
646	while ((wc = wc1) != L']');
647	}
648	#endif /* MBS_SUPPORT */
649
650	#ifdef __STDC__
651	#define FUNC(F, P) static int F(int c) { return P(c); }
652	#else
653	#define FUNC(F, P) static int F(c) int c; { return P(c); }
654	#endif
655
656	FUNC(is_alpha, ISALPHA)
657	FUNC(is_upper, ISUPPER)
658	FUNC(is_lower, ISLOWER)
659	FUNC(is_digit, ISDIGIT)
660	FUNC(is_xdigit, ISXDIGIT)
661	FUNC(is_space, ISSPACE)
662	FUNC(is_punct, ISPUNCT)
663	FUNC(is_alnum, ISALNUM)
664	FUNC(is_print, ISPRINT)
665	FUNC(is_graph, ISGRAPH)
666	FUNC(is_cntrl, ISCNTRL)
667
668	static int
669	is_blank (int c)
670	{
671	return (c == ' ' \|\| c == '\t');
672	}
673
674	/* The following list maps the names of the Posix named character classes
675	to predicate functions that determine whether a given character is in
676	the class. The leading [ has already been eaten by the lexical analyzer. */
677	static struct {
678	const char *name;
679	int (*pred) PARAMS ((int));
680	} const prednames[] = {
681	{ ":alpha:]", is_alpha },
682	{ ":upper:]", is_upper },
683	{ ":lower:]", is_lower },
684	{ ":digit:]", is_digit },
685	{ ":xdigit:]", is_xdigit },
686	{ ":space:]", is_space },
687	{ ":punct:]", is_punct },
688	{ ":alnum:]", is_alnum },
689	{ ":print:]", is_print },
690	{ ":graph:]", is_graph },
691	{ ":cntrl:]", is_cntrl },
692	{ ":blank:]", is_blank },
693	{ 0 }
694	};
695
696	/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
697	#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) \|\| (C) == '_')
698
699	static int
700	looking_at (char const *s)
701	{
702	size_t len;
703
704	len = strlen(s);
705	if (lexleft < len)
706	return 0;
707	return strncmp(s, lexptr, len) == 0;
708	}
709
710	static token
711	lex (void)
712	{
713	unsigned c, c1, c2;
714	int backslash = 0, invert;
715	charclass ccl;
716	int i;
717
718	/* Basic plan: We fetch a character. If it's a backslash,
719	we set the backslash flag and go through the loop again.
720	On the plus side, this avoids having a duplicate of the
721	main switch inside the backslash case. On the minus side,
722	it means that just about every case begins with
723	"if (backslash) ...". */
724	for (i = 0; i < 2; ++i)
725	{
726	FETCH(c, 0);
727	#ifdef MBS_SUPPORT
728	if (MB_CUR_MAX > 1 && cur_mb_index)
729	/* If this is a part of a multi-byte character, we must treat
730	this byte data as a normal character.
731	e.g. In case of SJIS encoding, some character contains '\',
732	but they must not be backslash. */
733	goto normal_char;
734	#endif /* MBS_SUPPORT */
735	switch (c)
736	{
737	case '\\':
738	if (backslash)
739	goto normal_char;
740	if (lexleft == 0)
741	dfaerror(_("Unfinished \\ escape"));
742	backslash = 1;
743	break;
744
745	case '^':
746	if (backslash)
747	goto normal_char;
748	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
749	\|\| lasttok == END
750	\|\| lasttok == LPAREN
751	\|\| lasttok == OR)
752	return lasttok = BEGLINE;
753	goto normal_char;
754
755	case '$':
756	if (backslash)
757	goto normal_char;
758	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
759	\|\| lexleft == 0
760	\|\| (syntax_bits & RE_NO_BK_PARENS
761	? lexleft > 0 && *lexptr == ')'
762	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
763	\|\| (syntax_bits & RE_NO_BK_VBAR
764	? lexleft > 0 && *lexptr == '\|'
765	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '\|')
766	\|\| ((syntax_bits & RE_NEWLINE_ALT)
767	&& lexleft > 0 && *lexptr == '\n'))
768	return lasttok = ENDLINE;
769	goto normal_char;
770
771	case '1':
772	case '2':
773	case '3':
774	case '4':
775	case '5':
776	case '6':
777	case '7':
778	case '8':
779	case '9':
780	if (backslash && !(syntax_bits & RE_NO_BK_REFS))
781	{
782	laststart = 0;
783	return lasttok = BACKREF;
784	}
785	goto normal_char;
786
787	case '`':
788	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
789	return lasttok = BEGLINE; /* FIXME: should be beginning of string */
790	goto normal_char;
791
792	case '\'':
793	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
794	return lasttok = ENDLINE; /* FIXME: should be end of string */
795	goto normal_char;
796
797	case '<':
798	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
799	return lasttok = BEGWORD;
800	goto normal_char;
801
802	case '>':
803	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
804	return lasttok = ENDWORD;
805	goto normal_char;
806
807	case 'b':
808	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
809	return lasttok = LIMWORD;
810	goto normal_char;
811
812	case 'B':
813	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
814	return lasttok = NOTLIMWORD;
815	goto normal_char;
816
817	case '?':
818	if (syntax_bits & RE_LIMITED_OPS)
819	goto normal_char;
820	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
821	goto normal_char;
822	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
823	goto normal_char;
824	return lasttok = QMARK;
825
826	case '*':
827	if (backslash)
828	goto normal_char;
829	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
830	goto normal_char;
831	return lasttok = STAR;
832
833	case '+':
834	if (syntax_bits & RE_LIMITED_OPS)
835	goto normal_char;
836	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
837	goto normal_char;
838	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
839	goto normal_char;
840	return lasttok = PLUS;
841
842	case '{':
843	if (!(syntax_bits & RE_INTERVALS))
844	goto normal_char;
845	if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
846	goto normal_char;
847	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
848	goto normal_char;
849
850	if (syntax_bits & RE_NO_BK_BRACES)
851	{
852	/* Scan ahead for a valid interval; if it's not valid,
853	treat it as a literal '{'. */
854	int lo = -1, hi = -1;
855	char const *p = lexptr;
856	char const *lim = p + lexleft;
857	for (; p != lim && ISASCIIDIGIT (*p); p++)
858	lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
859	if (p != lim && *p == ',')
860	while (++p != lim && ISASCIIDIGIT (*p))
861	hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
862	else
863	hi = lo;
864	if (p == lim \|\| *p != '}'
865	\|\| lo < 0 \|\| RE_DUP_MAX < hi \|\| (0 <= hi && hi < lo))
866	goto normal_char;
867	}
868
869	minrep = 0;
870	/* Cases:
871	{M} - exact count
872	{M,} - minimum count, maximum is infinity
873	{M,N} - M through N */
874	FETCH(c, _("unfinished repeat count"));
875	if (ISASCIIDIGIT (c))
876	{
877	minrep = c - '0';
878	for (;;)
879	{
880	FETCH(c, _("unfinished repeat count"));
881	if (! ISASCIIDIGIT (c))
882	break;
883	minrep = 10 * minrep + c - '0';
884	}
885	}
886	else
887	dfaerror(_("malformed repeat count"));
888	if (c == ',')
889	{
890	FETCH (c, _("unfinished repeat count"));
891	if (! ISASCIIDIGIT (c))
892	maxrep = -1;
893	else
894	{
895	maxrep = c - '0';
896	for (;;)
897	{
898	FETCH (c, _("unfinished repeat count"));
899	if (! ISASCIIDIGIT (c))
900	break;
901	maxrep = 10 * maxrep + c - '0';
902	}
903	if (0 <= maxrep && maxrep < minrep)
904	dfaerror (_("malformed repeat count"));
905	}
906	}
907	else
908	maxrep = minrep;
909	if (!(syntax_bits & RE_NO_BK_BRACES))
910	{
911	if (c != '\\')
912	dfaerror(_("malformed repeat count"));
913	FETCH(c, _("unfinished repeat count"));
914	}
915	if (c != '}')
916	dfaerror(_("malformed repeat count"));
917	laststart = 0;
918	return lasttok = REPMN;
919
920	case '\|':
921	if (syntax_bits & RE_LIMITED_OPS)
922	goto normal_char;
923	if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
924	goto normal_char;
925	laststart = 1;
926	return lasttok = OR;
927
928	case '\n':
929	if (syntax_bits & RE_LIMITED_OPS
930	\|\| backslash
931	\|\| !(syntax_bits & RE_NEWLINE_ALT))
932	goto normal_char;
933	laststart = 1;
934	return lasttok = OR;
935
936	case '(':
937	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
938	goto normal_char;
939	++parens;
940	laststart = 1;
941	return lasttok = LPAREN;
942
943	case ')':
944	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
945	goto normal_char;
946	if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
947	goto normal_char;
948	--parens;
949	laststart = 0;
950	return lasttok = RPAREN;
951
952	case '.':
953	if (backslash)
954	goto normal_char;
955	#ifdef MBS_SUPPORT
956	if (MB_CUR_MAX > 1)
957	{
958	/* In multibyte environment period must match with a single
959	character not a byte. So we use ANYCHAR. */
960	laststart = 0;
961	return lasttok = ANYCHAR;
962	}
963	#endif /* MBS_SUPPORT */
964	zeroset(ccl);
965	notset(ccl);
966	if (!(syntax_bits & RE_DOT_NEWLINE))
967	clrbit(eolbyte, ccl);
968	if (syntax_bits & RE_DOT_NOT_NULL)
969	clrbit('\0', ccl);
970	laststart = 0;
971	return lasttok = CSET + charclass_index(ccl);
972
973	case 'w':
974	case 'W':
975	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
976	goto normal_char;
977	zeroset(ccl);
978	for (c2 = 0; c2 < NOTCHAR; ++c2)
979	if (IS_WORD_CONSTITUENT(c2))
980	setbit(c2, ccl);
981	if (c == 'W')
982	notset(ccl);
983	laststart = 0;
984	return lasttok = CSET + charclass_index(ccl);
985
986	case '[':
987	if (backslash)
988	goto normal_char;
989	laststart = 0;
990	#ifdef MBS_SUPPORT
991	if (MB_CUR_MAX > 1)
992	{
993	/* In multibyte environment a bracket expression may contain
994	multibyte characters, which must be treated as characters
995	(not bytes). So we parse it by parse_bracket_exp_mb(). */
996	parse_bracket_exp_mb();
997	return lasttok = MBCSET;
998	}
999	#endif
1000	zeroset(ccl);
1001	FETCH(c, _("Unbalanced ["));
1002	if (c == '^')
1003	{
1004	FETCH(c, _("Unbalanced ["));
1005	invert = 1;
1006	}
1007	else
1008	invert = 0;
1009	do
1010	{
1011	/* Nobody ever said this had to be fast. :-)
1012	Note that if we're looking at some other [:...:]
1013	construct, we just treat it as a bunch of ordinary
1014	characters. We can do this because we assume
1015	regex has checked for syntax errors before
1016	dfa is ever called. */
1017	if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1018	for (c1 = 0; prednames[c1].name; ++c1)
1019	if (looking_at(prednames[c1].name))
1020	{
1021	int (*pred) PARAMS ((int)) = prednames[c1].pred;
1022
1023	for (c2 = 0; c2 < NOTCHAR; ++c2)
1024	if ((*pred)(c2))
1025	setbit_case_fold (c2, ccl);
1026	lexptr += strlen(prednames[c1].name);
1027	lexleft -= strlen(prednames[c1].name);
1028	FETCH(c1, _("Unbalanced ["));
1029	goto skip;
1030	}
1031	if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1032	FETCH(c, _("Unbalanced ["));
1033	FETCH(c1, _("Unbalanced ["));
1034	if (c1 == '-')
1035	{
1036	FETCH(c2, _("Unbalanced ["));
1037	if (c2 == ']')
1038	{
1039	/* In the case [x-], the - is an ordinary hyphen,
1040	which is left in c1, the lookahead character. */
1041	--lexptr;
1042	++lexleft;
1043	}
1044	else
1045	{
1046	if (c2 == '\\'
1047	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1048	FETCH(c2, _("Unbalanced ["));
1049	FETCH(c1, _("Unbalanced ["));
1050	if (!hard_LC_COLLATE) {
1051	for (; c <= c2; c++)
1052	setbit_case_fold (c, ccl);
1053	} else {
1054	/* POSIX locales are painful - leave the decision to libc */
1055	char expr[6] = { '[', c, '-', c2, ']', '\0' };
1056	regex_t re;
1057	if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1058	for (c = 0; c < NOTCHAR; ++c) {
1059	char buf[2] = { c, '\0' };
1060	regmatch_t mat;
1061	if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1062	&& mat.rm_so == 0 && mat.rm_eo == 1)
1063	setbit_case_fold (c, ccl);
1064	}
1065	regfree (&re);
1066	}
1067	}
1068	continue;
1069	}
1070	}
1071
1072	setbit_case_fold (c, ccl);
1073
1074	skip:
1075	;
1076	}
1077	while ((c = c1) != ']');
1078	if (invert)
1079	{
1080	notset(ccl);
1081	if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1082	clrbit(eolbyte, ccl);
1083	}
1084	return lasttok = CSET + charclass_index(ccl);
1085
1086	default:
1087	normal_char:
1088	laststart = 0;
1089	if (case_fold && ISALPHA(c))
1090	{
1091	zeroset(ccl);
1092	setbit_case_fold (c, ccl);
1093	return lasttok = CSET + charclass_index(ccl);
1094	}
1095	return c;
1096	}
1097	}
1098
1099	/* The above loop should consume at most a backslash
1100	and some other character. */
1101	abort();
1102	return END; /* keeps pedantic compilers happy. */
1103	}
1104
1105	/* Recursive descent parser for regular expressions. */
1106
1107	static token tok; /* Lookahead token. */
1108	static int depth; /* Current depth of a hypothetical stack
1109	holding deferred productions. This is
1110	used to determine the depth that will be
1111	required of the real stack later on in
1112	dfaanalyze(). */
1113
1114	/* Add the given token to the parse tree, maintaining the depth count and
1115	updating the maximum depth if necessary. */
1116	static void
1117	addtok (token t)
1118	{
1119	#ifdef MBS_SUPPORT
1120	if (MB_CUR_MAX > 1)
1121	{
1122	REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1123	dfa->tindex);
1124	/* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
1125	if (t == MBCSET)
1126	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1127	else if (t < NOTCHAR)
1128	dfa->multibyte_prop[dfa->tindex]
1129	= (cur_mb_len == 1)? 3 /* single-byte char */
1130	: (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1131	+ ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1132	else
1133	/* It may be unnecesssary, but it is safer to treat other
1134	symbols as singlebyte characters. */
1135	dfa->multibyte_prop[dfa->tindex] = 3;
1136	}
1137	#endif
1138
1139	REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1140	dfa->tokens[dfa->tindex++] = t;
1141
1142	switch (t)
1143	{
1144	case QMARK:
1145	case STAR:
1146	case PLUS:
1147	break;
1148
1149	case CAT:
1150	case OR:
1151	case ORTOP:
1152	--depth;
1153	break;
1154
1155	default:
1156	++dfa->nleaves;
1157	case EMPTY:
1158	++depth;
1159	break;
1160	}
1161	if (depth > dfa->depth)
1162	dfa->depth = depth;
1163	}
1164
1165	/* The grammar understood by the parser is as follows.
1166
1167	regexp:
1168	regexp OR branch
1169	branch
1170
1171	branch:
1172	branch closure
1173	closure
1174
1175	closure:
1176	closure QMARK
1177	closure STAR
1178	closure PLUS
1179	closure REPMN
1180	atom
1181
1182	atom:
1183	<normal character>
1184	<multibyte character>
1185	ANYCHAR
1186	MBCSET
1187	CSET
1188	BACKREF
1189	BEGLINE
1190	ENDLINE
1191	BEGWORD
1192	ENDWORD
1193	LIMWORD
1194	NOTLIMWORD
1195	CRANGE
1196	LPAREN regexp RPAREN
1197	<empty>
1198
1199	The parser builds a parse tree in postfix form in an array of tokens. */
1200
1201	static void
1202	atom (void)
1203	{
1204	if ((tok >= 0 && tok < NOTCHAR) \|\| tok >= CSET \|\| tok == BACKREF
1205	\|\| tok == BEGLINE \|\| tok == ENDLINE \|\| tok == BEGWORD
1206	#ifdef MBS_SUPPORT
1207	\|\| tok == ANYCHAR \|\| tok == MBCSET /* MB_CUR_MAX > 1 */
1208	#endif /* MBS_SUPPORT */
1209	\|\| tok == ENDWORD \|\| tok == LIMWORD \|\| tok == NOTLIMWORD)
1210	{
1211	addtok(tok);
1212	tok = lex();
1213	#ifdef MBS_SUPPORT
1214	/* We treat a multibyte character as a single atom, so that DFA
1215	can treat a multibyte character as a single expression.
1216
1217	e.g. We construct following tree from "<mb1><mb2>".
1218	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1219	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1220	*/
1221	if (MB_CUR_MAX > 1)
1222	{
1223	while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1224	{
1225	addtok(tok);
1226	addtok(CAT);
1227	tok = lex();
1228	}
1229	}
1230	#endif /* MBS_SUPPORT */
1231	}
1232	else if (tok == CRANGE)
1233	{
1234	/* A character range like "[a-z]" in a locale other than "C" or
1235	"POSIX". This range might any sequence of one or more
1236	characters. Unfortunately the POSIX locale primitives give
1237	us no practical way to find what character sequences might be
1238	matched. Treat this approximately like "(.\1)" -- i.e. match
1239	one character, and then punt to the full matcher. */
1240	charclass ccl;
1241	zeroset (ccl);
1242	notset (ccl);
1243	addtok (CSET + charclass_index (ccl));
1244	addtok (BACKREF);
1245	addtok (CAT);
1246	tok = lex ();
1247	}
1248	else if (tok == LPAREN)
1249	{
1250	tok = lex();
1251	regexp(0);
1252	if (tok != RPAREN)
1253	dfaerror(_("Unbalanced ("));
1254	tok = lex();
1255	}
1256	else
1257	addtok(EMPTY);
1258	}
1259
1260	/* Return the number of tokens in the given subexpression. */
1261	static int
1262	nsubtoks (int tindex)
1263	{
1264	int ntoks1;
1265
1266	switch (dfa->tokens[tindex - 1])
1267	{
1268	default:
1269	return 1;
1270	case QMARK:
1271	case STAR:
1272	case PLUS:
1273	return 1 + nsubtoks(tindex - 1);
1274	case CAT:
1275	case OR:
1276	case ORTOP:
1277	ntoks1 = nsubtoks(tindex - 1);
1278	return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1279	}
1280	}
1281
1282	/* Copy the given subexpression to the top of the tree. */
1283	static void
1284	copytoks (int tindex, int ntokens)
1285	{
1286	int i;
1287
1288	for (i = 0; i < ntokens; ++i)
1289	addtok(dfa->tokens[tindex + i]);
1290	}
1291
1292	static void
1293	closure (void)
1294	{
1295	int tindex, ntokens, i;
1296
1297	atom();
1298	while (tok == QMARK \|\| tok == STAR \|\| tok == PLUS \|\| tok == REPMN)
1299	if (tok == REPMN)
1300	{
1301	ntokens = nsubtoks(dfa->tindex);
1302	tindex = dfa->tindex - ntokens;
1303	if (maxrep < 0)
1304	addtok(PLUS);
1305	if (minrep == 0)
1306	addtok(QMARK);
1307	for (i = 1; i < minrep; ++i)
1308	{
1309	copytoks(tindex, ntokens);
1310	addtok(CAT);
1311	}
1312	for (; i < maxrep; ++i)
1313	{
1314	copytoks(tindex, ntokens);
1315	addtok(QMARK);
1316	addtok(CAT);
1317	}
1318	tok = lex();
1319	}
1320	else
1321	{
1322	addtok(tok);
1323	tok = lex();
1324	}
1325	}
1326
1327	static void
1328	branch (void)
1329	{
1330	closure();
1331	while (tok != RPAREN && tok != OR && tok >= 0)
1332	{
1333	closure();
1334	addtok(CAT);
1335	}
1336	}
1337
1338	static void
1339	regexp (int toplevel)
1340	{
1341	branch();
1342	while (tok == OR)
1343	{
1344	tok = lex();
1345	branch();
1346	if (toplevel)
1347	addtok(ORTOP);
1348	else
1349	addtok(OR);
1350	}
1351	}
1352
1353	/* Main entry point for the parser. S is a string to be parsed, len is the
1354	length of the string, so s can include NUL characters. D is a pointer to
1355	the struct dfa to parse into. */
1356	void
1357	dfaparse (char const s, size_t len, struct dfa d)
1358	{
1359	dfa = d;
1360	lexstart = lexptr = s;
1361	lexleft = len;
1362	lasttok = END;
1363	laststart = 1;
1364	parens = 0;
1365	#if ENABLE_NLS
1366	hard_LC_COLLATE = hard_locale (LC_COLLATE);
1367	#endif
1368	#ifdef MBS_SUPPORT
1369	if (MB_CUR_MAX > 1)
1370	{
1371	cur_mb_index = 0;
1372	cur_mb_len = 0;
1373	memset(&mbs, 0, sizeof(mbstate_t));
1374	}
1375	#endif /* MBS_SUPPORT */
1376
1377	if (! syntax_bits_set)
1378	dfaerror(_("No syntax specified"));
1379
1380	tok = lex();
1381	depth = d->depth;
1382
1383	regexp(1);
1384
1385	if (tok != END)
1386	dfaerror(_("Unbalanced )"));
1387
1388	addtok(END - d->nregexps);
1389	addtok(CAT);
1390
1391	if (d->nregexps)
1392	addtok(ORTOP);
1393
1394	++d->nregexps;
1395	}
1396
1397	/* Some primitives for operating on sets of positions. */
1398
1399	/* Copy one set to another; the destination must be large enough. */
1400	static void
1401	copy (position_set const src, position_set dst)
1402	{
1403	int i;
1404
1405	for (i = 0; i < src->nelem; ++i)
1406	dst->elems[i] = src->elems[i];
1407	dst->nelem = src->nelem;
1408	}
1409
1410	/* Insert a position in a set. Position sets are maintained in sorted
1411	order according to index. If position already exists in the set with
1412	the same index then their constraints are logically or'd together.
1413	S->elems must point to an array large enough to hold the resulting set. */
1414	static void
1415	insert (position p, position_set *s)
1416	{
1417	int i;
1418	position t1, t2;
1419
1420	for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1421	continue;
1422	if (i < s->nelem && p.index == s->elems[i].index)
1423	s->elems[i].constraint \|= p.constraint;
1424	else
1425	{
1426	t1 = p;
1427	++s->nelem;
1428	while (i < s->nelem)
1429	{
1430	t2 = s->elems[i];
1431	s->elems[i++] = t1;
1432	t1 = t2;
1433	}
1434	}
1435	}
1436
1437	/* Merge two sets of positions into a third. The result is exactly as if
1438	the positions of both sets were inserted into an initially empty set. */
1439	static void
1440	merge (position_set const s1, position_set const s2, position_set *m)
1441	{
1442	int i = 0, j = 0;
1443
1444	m->nelem = 0;
1445	while (i < s1->nelem && j < s2->nelem)
1446	if (s1->elems[i].index > s2->elems[j].index)
1447	m->elems[m->nelem++] = s1->elems[i++];
1448	else if (s1->elems[i].index < s2->elems[j].index)
1449	m->elems[m->nelem++] = s2->elems[j++];
1450	else
1451	{
1452	m->elems[m->nelem] = s1->elems[i++];
1453	m->elems[m->nelem++].constraint \|= s2->elems[j++].constraint;
1454	}
1455	while (i < s1->nelem)
1456	m->elems[m->nelem++] = s1->elems[i++];
1457	while (j < s2->nelem)
1458	m->elems[m->nelem++] = s2->elems[j++];
1459	}
1460
1461	/* Delete a position from a set. */
1462	static void
1463	delete (position p, position_set *s)
1464	{
1465	int i;
1466
1467	for (i = 0; i < s->nelem; ++i)
1468	if (p.index == s->elems[i].index)
1469	break;
1470	if (i < s->nelem)
1471	for (--s->nelem; i < s->nelem; ++i)
1472	s->elems[i] = s->elems[i + 1];
1473	}
1474
1475	/* Find the index of the state corresponding to the given position set with
1476	the given preceding context, or create a new state if there is no such
1477	state. Newline and letter tell whether we got here on a newline or
1478	letter, respectively. */
1479	static int
1480	state_index (struct dfa d, position_set const s, int newline, int letter)
1481	{
1482	int hash = 0;
1483	int constraint;
1484	int i, j;
1485
1486	newline = newline ? 1 : 0;
1487	letter = letter ? 1 : 0;
1488
1489	for (i = 0; i < s->nelem; ++i)
1490	hash ^= s->elems[i].index + s->elems[i].constraint;
1491
1492	/* Try to find a state that exactly matches the proposed one. */
1493	for (i = 0; i < d->sindex; ++i)
1494	{
1495	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
1496	\|\| newline != d->states[i].newline \|\| letter != d->states[i].letter)
1497	continue;
1498	for (j = 0; j < s->nelem; ++j)
1499	if (s->elems[j].constraint
1500	!= d->states[i].elems.elems[j].constraint
1501	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
1502	break;
1503	if (j == s->nelem)
1504	return i;
1505	}
1506
1507	/* We'll have to create a new state. */
1508	REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1509	d->states[i].hash = hash;
1510	MALLOC(d->states[i].elems.elems, position, s->nelem);
1511	copy(s, &d->states[i].elems);
1512	d->states[i].newline = newline;
1513	d->states[i].letter = letter;
1514	d->states[i].backref = 0;
1515	d->states[i].constraint = 0;
1516	d->states[i].first_end = 0;
1517	#ifdef MBS_SUPPORT
1518	if (MB_CUR_MAX > 1)
1519	d->states[i].mbps.nelem = 0;
1520	#endif
1521	for (j = 0; j < s->nelem; ++j)
1522	if (d->tokens[s->elems[j].index] < 0)
1523	{
1524	constraint = s->elems[j].constraint;
1525	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1526	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1527	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1528	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1529	d->states[i].constraint \|= constraint;
1530	if (! d->states[i].first_end)
1531	d->states[i].first_end = d->tokens[s->elems[j].index];
1532	}
1533	else if (d->tokens[s->elems[j].index] == BACKREF)
1534	{
1535	d->states[i].constraint = NO_CONSTRAINT;
1536	d->states[i].backref = 1;
1537	}
1538
1539	++d->sindex;
1540
1541	return i;
1542	}
1543
1544	/* Find the epsilon closure of a set of positions. If any position of the set
1545	contains a symbol that matches the empty string in some context, replace
1546	that position with the elements of its follow labeled with an appropriate
1547	constraint. Repeat exhaustively until no funny positions are left.
1548	S->elems must be large enough to hold the result. */
1549	static void
1550	epsclosure (position_set s, struct dfa const d)
1551	{
1552	int i, j;
1553	int *visited;
1554	position p, old;
1555
1556	MALLOC(visited, int, d->tindex);
1557	for (i = 0; i < d->tindex; ++i)
1558	visited[i] = 0;
1559
1560	for (i = 0; i < s->nelem; ++i)
1561	if (d->tokens[s->elems[i].index] >= NOTCHAR
1562	&& d->tokens[s->elems[i].index] != BACKREF
1563	#ifdef MBS_SUPPORT
1564	&& d->tokens[s->elems[i].index] != ANYCHAR
1565	&& d->tokens[s->elems[i].index] != MBCSET
1566	#endif
1567	&& d->tokens[s->elems[i].index] < CSET)
1568	{
1569	old = s->elems[i];
1570	p.constraint = old.constraint;
1571	delete(s->elems[i], s);
1572	if (visited[old.index])
1573	{
1574	--i;
1575	continue;
1576	}
1577	visited[old.index] = 1;
1578	switch (d->tokens[old.index])
1579	{
1580	case BEGLINE:
1581	p.constraint &= BEGLINE_CONSTRAINT;
1582	break;
1583	case ENDLINE:
1584	p.constraint &= ENDLINE_CONSTRAINT;
1585	break;
1586	case BEGWORD:
1587	p.constraint &= BEGWORD_CONSTRAINT;
1588	break;
1589	case ENDWORD:
1590	p.constraint &= ENDWORD_CONSTRAINT;
1591	break;
1592	case LIMWORD:
1593	p.constraint &= LIMWORD_CONSTRAINT;
1594	break;
1595	case NOTLIMWORD:
1596	p.constraint &= NOTLIMWORD_CONSTRAINT;
1597	break;
1598	default:
1599	break;
1600	}
1601	for (j = 0; j < d->follows[old.index].nelem; ++j)
1602	{
1603	p.index = d->follows[old.index].elems[j].index;
1604	insert(p, s);
1605	}
1606	/* Force rescan to start at the beginning. */
1607	i = -1;
1608	}
1609
1610	free(visited);
1611	}
1612
1613	/* Perform bottom-up analysis on the parse tree, computing various functions.
1614	Note that at this point, we're pretending constructs like \< are real
1615	characters rather than constraints on what can follow them.
1616
1617	Nullable: A node is nullable if it is at the root of a regexp that can
1618	match the empty string.
1619	* EMPTY leaves are nullable.
1620	* No other leaf is nullable.
1621	* A QMARK or STAR node is nullable.
1622	* A PLUS node is nullable if its argument is nullable.
1623	* A CAT node is nullable if both its arguments are nullable.
1624	* An OR node is nullable if either argument is nullable.
1625
1626	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1627	that could correspond to the first character of a string matching the
1628	regexp rooted at the given node.
1629	* EMPTY leaves have empty firstpos.
1630	* The firstpos of a nonempty leaf is that leaf itself.
1631	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1632	argument.
1633	* The firstpos of a CAT node is the firstpos of the left argument, union
1634	the firstpos of the right if the left argument is nullable.
1635	* The firstpos of an OR node is the union of firstpos of each argument.
1636
1637	Lastpos: The lastpos of a node is the set of positions that could
1638	correspond to the last character of a string matching the regexp at
1639	the given node.
1640	* EMPTY leaves have empty lastpos.
1641	* The lastpos of a nonempty leaf is that leaf itself.
1642	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1643	argument.
1644	* The lastpos of a CAT node is the lastpos of its right argument, union
1645	the lastpos of the left if the right argument is nullable.
1646	* The lastpos of an OR node is the union of the lastpos of each argument.
1647
1648	Follow: The follow of a position is the set of positions that could
1649	correspond to the character following a character matching the node in
1650	a string matching the regexp. At this point we consider special symbols
1651	that match the empty string in some context to be just normal characters.
1652	Later, if we find that a special symbol is in a follow set, we will
1653	replace it with the elements of its follow, labeled with an appropriate
1654	constraint.
1655	* Every node in the firstpos of the argument of a STAR or PLUS node is in
1656	the follow of every node in the lastpos.
1657	* Every node in the firstpos of the second argument of a CAT node is in
1658	the follow of every node in the lastpos of the first argument.
1659
1660	Because of the postfix representation of the parse tree, the depth-first
1661	analysis is conveniently done by a linear scan with the aid of a stack.
1662	Sets are stored as arrays of the elements, obeying a stack-like allocation
1663	scheme; the number of elements in each set deeper in the stack can be
1664	used to determine the address of a particular set's array. */
1665	void
1666	dfaanalyze (struct dfa *d, int searchflag)
1667	{
1668	int nullable; / Nullable stack. */
1669	int nfirstpos; / Element count stack for firstpos sets. */
1670	position firstpos; / Array where firstpos elements are stored. */
1671	int nlastpos; / Element count stack for lastpos sets. */
1672	position lastpos; / Array where lastpos elements are stored. */
1673	int nalloc; / Sizes of arrays allocated to follow sets. */
1674	position_set tmp; /* Temporary set for merging sets. */
1675	position_set merged; /* Result of merging sets. */
1676	int wants_newline; /* True if some position wants newline info. */
1677	int *o_nullable;
1678	int o_nfirst, o_nlast;
1679	position o_firstpos, o_lastpos;
1680	int i, j;
1681	position *pos;
1682
1683	#ifdef DEBUG
1684	fprintf(stderr, "dfaanalyze:\n");
1685	for (i = 0; i < d->tindex; ++i)
1686	{
1687	fprintf(stderr, " %d:", i);
1688	prtok(d->tokens[i]);
1689	}
1690	putc('\n', stderr);
1691	#endif
1692
1693	d->searchflag = searchflag;
1694
1695	MALLOC(nullable, int, d->depth);
1696	o_nullable = nullable;
1697	MALLOC(nfirstpos, int, d->depth);
1698	o_nfirst = nfirstpos;
1699	MALLOC(firstpos, position, d->nleaves);
1700	o_firstpos = firstpos, firstpos += d->nleaves;
1701	MALLOC(nlastpos, int, d->depth);
1702	o_nlast = nlastpos;
1703	MALLOC(lastpos, position, d->nleaves);
1704	o_lastpos = lastpos, lastpos += d->nleaves;
1705	MALLOC(nalloc, int, d->tindex);
1706	for (i = 0; i < d->tindex; ++i)
1707	nalloc[i] = 0;
1708	MALLOC(merged.elems, position, d->nleaves);
1709
1710	CALLOC(d->follows, position_set, d->tindex);
1711
1712	for (i = 0; i < d->tindex; ++i)
1713	#ifdef DEBUG
1714	{ /* Nonsyntactic #ifdef goo... */
1715	#endif
1716	switch (d->tokens[i])
1717	{
1718	case EMPTY:
1719	/* The empty set is nullable. */
1720	*nullable++ = 1;
1721
1722	/* The firstpos and lastpos of the empty leaf are both empty. */
1723	nfirstpos++ = nlastpos++ = 0;
1724	break;
1725
1726	case STAR:
1727	case PLUS:
1728	/* Every element in the firstpos of the argument is in the follow
1729	of every element in the lastpos. */
1730	tmp.nelem = nfirstpos[-1];
1731	tmp.elems = firstpos;
1732	pos = lastpos;
1733	for (j = 0; j < nlastpos[-1]; ++j)
1734	{
1735	merge(&tmp, &d->follows[pos[j].index], &merged);
1736	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1737	nalloc[pos[j].index], merged.nelem - 1);
1738	copy(&merged, &d->follows[pos[j].index]);
1739	}
1740
1741	case QMARK:
1742	/* A QMARK or STAR node is automatically nullable. */
1743	if (d->tokens[i] != PLUS)
1744	nullable[-1] = 1;
1745	break;
1746
1747	case CAT:
1748	/* Every element in the firstpos of the second argument is in the
1749	follow of every element in the lastpos of the first argument. */
1750	tmp.nelem = nfirstpos[-1];
1751	tmp.elems = firstpos;
1752	pos = lastpos + nlastpos[-1];
1753	for (j = 0; j < nlastpos[-2]; ++j)
1754	{
1755	merge(&tmp, &d->follows[pos[j].index], &merged);
1756	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1757	nalloc[pos[j].index], merged.nelem - 1);
1758	copy(&merged, &d->follows[pos[j].index]);
1759	}
1760
1761	/* The firstpos of a CAT node is the firstpos of the first argument,
1762	union that of the second argument if the first is nullable. */
1763	if (nullable[-2])
1764	nfirstpos[-2] += nfirstpos[-1];
1765	else
1766	firstpos += nfirstpos[-1];
1767	--nfirstpos;
1768
1769	/* The lastpos of a CAT node is the lastpos of the second argument,
1770	union that of the first argument if the second is nullable. */
1771	if (nullable[-1])
1772	nlastpos[-2] += nlastpos[-1];
1773	else
1774	{
1775	pos = lastpos + nlastpos[-2];
1776	for (j = nlastpos[-1] - 1; j >= 0; --j)
1777	pos[j] = lastpos[j];
1778	lastpos += nlastpos[-2];
1779	nlastpos[-2] = nlastpos[-1];
1780	}
1781	--nlastpos;
1782
1783	/* A CAT node is nullable if both arguments are nullable. */
1784	nullable[-2] = nullable[-1] && nullable[-2];
1785	--nullable;
1786	break;
1787
1788	case OR:
1789	case ORTOP:
1790	/* The firstpos is the union of the firstpos of each argument. */
1791	nfirstpos[-2] += nfirstpos[-1];
1792	--nfirstpos;
1793
1794	/* The lastpos is the union of the lastpos of each argument. */
1795	nlastpos[-2] += nlastpos[-1];
1796	--nlastpos;
1797
1798	/* An OR node is nullable if either argument is nullable. */
1799	nullable[-2] = nullable[-1] \|\| nullable[-2];
1800	--nullable;
1801	break;
1802
1803	default:
1804	/* Anything else is a nonempty position. (Note that special
1805	constructs like \< are treated as nonempty strings here;
1806	an "epsilon closure" effectively makes them nullable later.
1807	Backreferences have to get a real position so we can detect
1808	transitions on them later. But they are nullable. */
1809	*nullable++ = d->tokens[i] == BACKREF;
1810
1811	/* This position is in its own firstpos and lastpos. */
1812	nfirstpos++ = nlastpos++ = 1;
1813	--firstpos, --lastpos;
1814	firstpos->index = lastpos->index = i;
1815	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1816
1817	/* Allocate the follow set for this position. */
1818	nalloc[i] = 1;
1819	MALLOC(d->follows[i].elems, position, nalloc[i]);
1820	break;
1821	}
1822	#ifdef DEBUG
1823	/* ... balance the above nonsyntactic #ifdef goo... */
1824	fprintf(stderr, "node %d:", i);
1825	prtok(d->tokens[i]);
1826	putc('\n', stderr);
1827	fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1828	fprintf(stderr, " firstpos:");
1829	for (j = nfirstpos[-1] - 1; j >= 0; --j)
1830	{
1831	fprintf(stderr, " %d:", firstpos[j].index);
1832	prtok(d->tokens[firstpos[j].index]);
1833	}
1834	fprintf(stderr, "\n lastpos:");
1835	for (j = nlastpos[-1] - 1; j >= 0; --j)
1836	{
1837	fprintf(stderr, " %d:", lastpos[j].index);
1838	prtok(d->tokens[lastpos[j].index]);
1839	}
1840	putc('\n', stderr);
1841	}
1842	#endif
1843
1844	/* For each follow set that is the follow set of a real position, replace
1845	it with its epsilon closure. */
1846	for (i = 0; i < d->tindex; ++i)
1847	if (d->tokens[i] < NOTCHAR \|\| d->tokens[i] == BACKREF
1848	#ifdef MBS_SUPPORT
1849	\|\| d->tokens[i] == ANYCHAR
1850	\|\| d->tokens[i] == MBCSET
1851	#endif
1852	\|\| d->tokens[i] >= CSET)
1853	{
1854	#ifdef DEBUG
1855	fprintf(stderr, "follows(%d:", i);
1856	prtok(d->tokens[i]);
1857	fprintf(stderr, "):");
1858	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1859	{
1860	fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1861	prtok(d->tokens[d->follows[i].elems[j].index]);
1862	}
1863	putc('\n', stderr);
1864	#endif
1865	copy(&d->follows[i], &merged);
1866	epsclosure(&merged, d);
1867	if (d->follows[i].nelem < merged.nelem)
1868	REALLOC(d->follows[i].elems, position, merged.nelem);
1869	copy(&merged, &d->follows[i]);
1870	}
1871
1872	/* Get the epsilon closure of the firstpos of the regexp. The result will
1873	be the set of positions of state 0. */
1874	merged.nelem = 0;
1875	for (i = 0; i < nfirstpos[-1]; ++i)
1876	insert(firstpos[i], &merged);
1877	epsclosure(&merged, d);
1878
1879	/* Check if any of the positions of state 0 will want newline context. */
1880	wants_newline = 0;
1881	for (i = 0; i < merged.nelem; ++i)
1882	if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1883	wants_newline = 1;
1884
1885	/* Build the initial state. */
1886	d->salloc = 1;
1887	d->sindex = 0;
1888	MALLOC(d->states, dfa_state, d->salloc);
1889	state_index(d, &merged, wants_newline, 0);
1890
1891	free(o_nullable);
1892	free(o_nfirst);
1893	free(o_firstpos);
1894	free(o_nlast);
1895	free(o_lastpos);
1896	free(nalloc);
1897	free(merged.elems);
1898	}
1899
1900	/* Find, for each character, the transition out of state s of d, and store
1901	it in the appropriate slot of trans.
1902
1903	We divide the positions of s into groups (positions can appear in more
1904	than one group). Each group is labeled with a set of characters that
1905	every position in the group matches (taking into account, if necessary,
1906	preceding context information of s). For each group, find the union
1907	of the its elements' follows. This set is the set of positions of the
1908	new state. For each character in the group's label, set the transition
1909	on this character to be to a state corresponding to the set's positions,
1910	and its associated backward context information, if necessary.
1911
1912	If we are building a searching matcher, we include the positions of state
1913	0 in every state.
1914
1915	The collection of groups is constructed by building an equivalence-class
1916	partition of the positions of s.
1917
1918	For each position, find the set of characters C that it matches. Eliminate
1919	any characters from C that fail on grounds of backward context.
1920
1921	Search through the groups, looking for a group whose label L has nonempty
1922	intersection with C. If L - C is nonempty, create a new group labeled
1923	L - C and having the same positions as the current group, and set L to
1924	the intersection of L and C. Insert the position in this group, set
1925	C = C - L, and resume scanning.
1926
1927	If after comparing with every group there are characters remaining in C,
1928	create a new group labeled with the characters of C and insert this
1929	position in that group. */
1930	void
1931	dfastate (int s, struct dfa *d, int trans[])
1932	{
1933	position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1934	charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1935	int ngrps = 0; /* Number of groups actually used. */
1936	position pos; /* Current position being considered. */
1937	charclass matches; /* Set of matching characters. */
1938	int matchesf; /* True if matches is nonempty. */
1939	charclass intersect; /* Intersection with some label set. */
1940	int intersectf; /* True if intersect is nonempty. */
1941	charclass leftovers; /* Stuff in the label that didn't match. */
1942	int leftoversf; /* True if leftovers is nonempty. */
1943	static charclass letters; /* Set of characters considered letters. */
1944	static charclass newline; /* Set of characters that aren't newline. */
1945	position_set follows; /* Union of the follows of some group. */
1946	position_set tmp; /* Temporary space for merging sets. */
1947	int state; /* New state. */
1948	int wants_newline; /* New state wants to know newline context. */
1949	int state_newline; /* New state on a newline transition. */
1950	int wants_letter; /* New state wants to know letter context. */
1951	int state_letter; /* New state on a letter transition. */
1952	static int initialized; /* Flag for static initialization. */
1953	#ifdef MBS_SUPPORT
1954	int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */
1955	#endif
1956	int i, j, k;
1957
1958	/* Initialize the set of letters, if necessary. */
1959	if (! initialized)
1960	{
1961	initialized = 1;
1962	for (i = 0; i < NOTCHAR; ++i)
1963	if (IS_WORD_CONSTITUENT(i))
1964	setbit(i, letters);
1965	setbit(eolbyte, newline);
1966	}
1967
1968	zeroset(matches);
1969
1970	for (i = 0; i < d->states[s].elems.nelem; ++i)
1971	{
1972	pos = d->states[s].elems.elems[i];
1973	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1974	setbit(d->tokens[pos.index], matches);
1975	else if (d->tokens[pos.index] >= CSET)
1976	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1977	#ifdef MBS_SUPPORT
1978	else if (d->tokens[pos.index] == ANYCHAR
1979	\|\| d->tokens[pos.index] == MBCSET)
1980	/* MB_CUR_MAX > 1 */
1981	{
1982	/* ANYCHAR and MBCSET must match with a single character, so we
1983	must put it to d->states[s].mbps, which contains the positions
1984	which can match with a single character not a byte. */
1985	if (d->states[s].mbps.nelem == 0)
1986	{
1987	MALLOC(d->states[s].mbps.elems, position,
1988	d->states[s].elems.nelem);
1989	}
1990	insert(pos, &(d->states[s].mbps));
1991	continue;
1992	}
1993	#endif /* MBS_SUPPORT */
1994	else
1995	continue;
1996
1997	/* Some characters may need to be eliminated from matches because
1998	they fail in the current context. */
1999	if (pos.constraint != 0xFF)
2000	{
2001	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2002	d->states[s].newline, 1))
2003	clrbit(eolbyte, matches);
2004	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2005	d->states[s].newline, 0))
2006	for (j = 0; j < CHARCLASS_INTS; ++j)
2007	matches[j] &= newline[j];
2008	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2009	d->states[s].letter, 1))
2010	for (j = 0; j < CHARCLASS_INTS; ++j)
2011	matches[j] &= ~letters[j];
2012	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2013	d->states[s].letter, 0))
2014	for (j = 0; j < CHARCLASS_INTS; ++j)
2015	matches[j] &= letters[j];
2016
2017	/* If there are no characters left, there's no point in going on. */
2018	for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2019	continue;
2020	if (j == CHARCLASS_INTS)
2021	continue;
2022	}
2023
2024	for (j = 0; j < ngrps; ++j)
2025	{
2026	/* If matches contains a single character only, and the current
2027	group's label doesn't contain that character, go on to the
2028	next group. */
2029	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2030	&& !tstbit(d->tokens[pos.index], labels[j]))
2031	continue;
2032
2033	/* Check if this group's label has a nonempty intersection with
2034	matches. */
2035	intersectf = 0;
2036	for (k = 0; k < CHARCLASS_INTS; ++k)
2037	(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2038	if (! intersectf)
2039	continue;
2040
2041	/* It does; now find the set differences both ways. */
2042	leftoversf = matchesf = 0;
2043	for (k = 0; k < CHARCLASS_INTS; ++k)
2044	{
2045	/* Even an optimizing compiler can't know this for sure. */
2046	int match = matches[k], label = labels[j][k];
2047
2048	(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2049	(matches[k] = match & ~label) ? (matchesf = 1) : 0;
2050	}
2051
2052	/* If there were leftovers, create a new group labeled with them. */
2053	if (leftoversf)
2054	{
2055	copyset(leftovers, labels[ngrps]);
2056	copyset(intersect, labels[j]);
2057	MALLOC(grps[ngrps].elems, position, d->nleaves);
2058	copy(&grps[j], &grps[ngrps]);
2059	++ngrps;
2060	}
2061
2062	/* Put the position in the current group. Note that there is no
2063	reason to call insert() here. */
2064	grps[j].elems[grps[j].nelem++] = pos;
2065
2066	/* If every character matching the current position has been
2067	accounted for, we're done. */
2068	if (! matchesf)
2069	break;
2070	}
2071
2072	/* If we've passed the last group, and there are still characters
2073	unaccounted for, then we'll have to create a new group. */
2074	if (j == ngrps)
2075	{
2076	copyset(matches, labels[ngrps]);
2077	zeroset(matches);
2078	MALLOC(grps[ngrps].elems, position, d->nleaves);
2079	grps[ngrps].nelem = 1;
2080	grps[ngrps].elems[0] = pos;
2081	++ngrps;
2082	}
2083	}
2084
2085	MALLOC(follows.elems, position, d->nleaves);
2086	MALLOC(tmp.elems, position, d->nleaves);
2087
2088	/* If we are a searching matcher, the default transition is to a state
2089	containing the positions of state 0, otherwise the default transition
2090	is to fail miserably. */
2091	if (d->searchflag)
2092	{
2093	wants_newline = 0;
2094	wants_letter = 0;
2095	for (i = 0; i < d->states[0].elems.nelem; ++i)
2096	{
2097	if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2098	wants_newline = 1;
2099	if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2100	wants_letter = 1;
2101	}
2102	copy(&d->states[0].elems, &follows);
2103	state = state_index(d, &follows, 0, 0);
2104	if (wants_newline)
2105	state_newline = state_index(d, &follows, 1, 0);
2106	else
2107	state_newline = state;
2108	if (wants_letter)
2109	state_letter = state_index(d, &follows, 0, 1);
2110	else
2111	state_letter = state;
2112	for (i = 0; i < NOTCHAR; ++i)
2113	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2114	trans[eolbyte] = state_newline;
2115	}
2116	else
2117	for (i = 0; i < NOTCHAR; ++i)
2118	trans[i] = -1;
2119
2120	for (i = 0; i < ngrps; ++i)
2121	{
2122	follows.nelem = 0;
2123
2124	/* Find the union of the follows of the positions of the group.
2125	This is a hideously inefficient loop. Fix it someday. */
2126	for (j = 0; j < grps[i].nelem; ++j)
2127	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2128	insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2129
2130	#ifdef MBS_SUPPORT
2131	if (MB_CUR_MAX > 1)
2132	{
2133	/* If a token in follows.elems is not 1st byte of a multibyte
2134	character, or the states of follows must accept the bytes
2135	which are not 1st byte of the multibyte character.
2136	Then, if a state of follows encounter a byte, it must not be
2137	a 1st byte of a multibyte character nor singlebyte character.
2138	We cansel to add state[0].follows to next state, because
2139	state[0] must accept 1st-byte
2140
2141	For example, we assume <sb a> is a certain singlebyte
2142	character, <mb A> is a certain multibyte character, and the
2143	codepoint of <sb a> equals the 2nd byte of the codepoint of
2144	<mb A>.
2145	When state[0] accepts <sb a>, state[i] transit to state[i+1]
2146	by accepting accepts 1st byte of <mb A>, and state[i+1]
2147	accepts 2nd byte of <mb A>, if state[i+1] encounter the
2148	codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2149	<mb A>, so we can not add state[0]. */
2150
2151	next_isnt_1st_byte = 0;
2152	for (j = 0; j < follows.nelem; ++j)
2153	{
2154	if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2155	{
2156	next_isnt_1st_byte = 1;
2157	break;
2158	}
2159	}
2160	}
2161	#endif
2162
2163	/* If we are building a searching matcher, throw in the positions
2164	of state 0 as well. */
2165	#ifdef MBS_SUPPORT
2166	if (d->searchflag && (MB_CUR_MAX == 1 \|\| !next_isnt_1st_byte))
2167	#else
2168	if (d->searchflag)
2169	#endif
2170	for (j = 0; j < d->states[0].elems.nelem; ++j)
2171	insert(d->states[0].elems.elems[j], &follows);
2172
2173	/* Find out if the new state will want any context information. */
2174	wants_newline = 0;
2175	if (tstbit(eolbyte, labels[i]))
2176	for (j = 0; j < follows.nelem; ++j)
2177	if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2178	wants_newline = 1;
2179
2180	wants_letter = 0;
2181	for (j = 0; j < CHARCLASS_INTS; ++j)
2182	if (labels[i][j] & letters[j])
2183	break;
2184	if (j < CHARCLASS_INTS)
2185	for (j = 0; j < follows.nelem; ++j)
2186	if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2187	wants_letter = 1;
2188
2189	/* Find the state(s) corresponding to the union of the follows. */
2190	state = state_index(d, &follows, 0, 0);
2191	if (wants_newline)
2192	state_newline = state_index(d, &follows, 1, 0);
2193	else
2194	state_newline = state;
2195	if (wants_letter)
2196	state_letter = state_index(d, &follows, 0, 1);
2197	else
2198	state_letter = state;
2199
2200	/* Set the transitions for each character in the current label. */
2201	for (j = 0; j < CHARCLASS_INTS; ++j)
2202	for (k = 0; k < INTBITS; ++k)
2203	if (labels[i][j] & 1 << k)
2204	{
2205	int c = j * INTBITS + k;
2206
2207	if (c == eolbyte)
2208	trans[c] = state_newline;
2209	else if (IS_WORD_CONSTITUENT(c))
2210	trans[c] = state_letter;
2211	else if (c < NOTCHAR)
2212	trans[c] = state;
2213	}
2214	}
2215
2216	for (i = 0; i < ngrps; ++i)
2217	free(grps[i].elems);
2218	free(follows.elems);
2219	free(tmp.elems);
2220	}
2221
2222	/* Some routines for manipulating a compiled dfa's transition tables.
2223	Each state may or may not have a transition table; if it does, and it
2224	is a non-accepting state, then d->trans[state] points to its table.
2225	If it is an accepting state then d->fails[state] points to its table.
2226	If it has no table at all, then d->trans[state] is NULL.
2227	TODO: Improve this comment, get rid of the unnecessary redundancy. */
2228
2229	static void
2230	build_state (int s, struct dfa *d)
2231	{
2232	int trans; / The new transition table. */
2233	int i;
2234
2235	/* Set an upper limit on the number of transition tables that will ever
2236	exist at once. 1024 is arbitrary. The idea is that the frequently
2237	used transition tables will be quickly rebuilt, whereas the ones that
2238	were only needed once or twice will be cleared away. */
2239	if (d->trcount >= 1024)
2240	{
2241	for (i = 0; i < d->tralloc; ++i)
2242	if (d->trans[i])
2243	{
2244	free((ptr_t) d->trans[i]);
2245	d->trans[i] = NULL;
2246	}
2247	else if (d->fails[i])
2248	{
2249	free((ptr_t) d->fails[i]);
2250	d->fails[i] = NULL;
2251	}
2252	d->trcount = 0;
2253	}
2254
2255	++d->trcount;
2256
2257	/* Set up the success bits for this state. */
2258	d->success[s] = 0;
2259	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2260	s, *d))
2261	d->success[s] \|= 4;
2262	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2263	s, *d))
2264	d->success[s] \|= 2;
2265	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2266	s, *d))
2267	d->success[s] \|= 1;
2268
2269	MALLOC(trans, int, NOTCHAR);
2270	dfastate(s, d, trans);
2271
2272	/* Now go through the new transition table, and make sure that the trans
2273	and fail arrays are allocated large enough to hold a pointer for the
2274	largest state mentioned in the table. */
2275	for (i = 0; i < NOTCHAR; ++i)
2276	if (trans[i] >= d->tralloc)
2277	{
2278	int oldalloc = d->tralloc;
2279
2280	while (trans[i] >= d->tralloc)
2281	d->tralloc *= 2;
2282	REALLOC(d->realtrans, int *, d->tralloc + 1);
2283	d->trans = d->realtrans + 1;
2284	REALLOC(d->fails, int *, d->tralloc);
2285	REALLOC(d->success, int, d->tralloc);
2286	while (oldalloc < d->tralloc)
2287	{
2288	d->trans[oldalloc] = NULL;
2289	d->fails[oldalloc++] = NULL;
2290	}
2291	}
2292
2293	/* Newline is a sentinel. */
2294	trans[eolbyte] = -1;
2295
2296	if (ACCEPTING(s, *d))
2297	d->fails[s] = trans;
2298	else
2299	d->trans[s] = trans;
2300	}
2301
2302	static void
2303	build_state_zero (struct dfa *d)
2304	{
2305	d->tralloc = 1;
2306	d->trcount = 0;
2307	CALLOC(d->realtrans, int *, d->tralloc + 1);
2308	d->trans = d->realtrans + 1;
2309	CALLOC(d->fails, int *, d->tralloc);
2310	MALLOC(d->success, int, d->tralloc);
2311	build_state(0, d);
2312	}
2313
2314	#ifdef MBS_SUPPORT
2315	/* Multibyte character handling sub-routins for dfaexec. */
2316
2317	/* Initial state may encounter the byte which is not a singlebyte character
2318	nor 1st byte of a multibyte character. But it is incorrect for initial
2319	state to accept such a byte.
2320	For example, in sjis encoding the regular expression like "\\" accepts
2321	the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2322	0x815c. Then Initial state must skip the bytes which are not a singlebyte
2323	character nor 1st byte of a multibyte character. */
2324	#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2325	if (s == 0) \
2326	{ \
2327	while (inputwcs[p - buf_begin] == 0 \
2328	&& mblen_buf[p - buf_begin] > 0 \
2329	&& p < buf_end) \
2330	++p; \
2331	if (p >= end) \
2332	{ \
2333	free(mblen_buf); \
2334	free(inputwcs); \
2335	return (size_t) -1; \
2336	} \
2337	}
2338
2339	static void
2340	realloc_trans_if_necessary(struct dfa *d, int new_state)
2341	{
2342	/* Make sure that the trans and fail arrays are allocated large enough
2343	to hold a pointer for the new state. */
2344	if (new_state >= d->tralloc)
2345	{
2346	int oldalloc = d->tralloc;
2347
2348	while (new_state >= d->tralloc)
2349	d->tralloc *= 2;
2350	REALLOC(d->realtrans, int *, d->tralloc + 1);
2351	d->trans = d->realtrans + 1;
2352	REALLOC(d->fails, int *, d->tralloc);
2353	REALLOC(d->success, int, d->tralloc);
2354	while (oldalloc < d->tralloc)
2355	{
2356	d->trans[oldalloc] = NULL;
2357	d->fails[oldalloc++] = NULL;
2358	}
2359	}
2360	}
2361
2362	/* Return values of transit_state_singlebyte(), and
2363	transit_state_consume_1char. */
2364	typedef enum
2365	{
2366	TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2367	TRANSIT_STATE_DONE, /* State transition has finished. */
2368	TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2369	} status_transit_state;
2370
2371	/* Consume a single byte and transit state from 's' to '*next_state'.
2372	This function is almost same as the state transition routin in dfaexec().
2373	But state transition is done just once, otherwise matching succeed or
2374	reach the end of the buffer. */
2375	static status_transit_state
2376	transit_state_singlebyte (struct dfa d, int s, unsigned char const p,
2377	int *next_state)
2378	{
2379	int *t;
2380	int works = s;
2381
2382	status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2383
2384	while (rval == TRANSIT_STATE_IN_PROGRESS)
2385	{
2386	if ((t = d->trans[works]) != NULL)
2387	{
2388	works = t[*p];
2389	rval = TRANSIT_STATE_DONE;
2390	if (works < 0)
2391	works = 0;
2392	}
2393	else if (works < 0)
2394	{
2395	if (p == buf_end)
2396	/* At the moment, it must not happen. */
2397	return TRANSIT_STATE_END_BUFFER;
2398	works = 0;
2399	}
2400	else if (d->fails[works])
2401	{
2402	works = d->fails[works][*p];
2403	rval = TRANSIT_STATE_DONE;
2404	}
2405	else
2406	{
2407	build_state(works, d);
2408	}
2409	}
2410	*next_state = works;
2411	return rval;
2412	}
2413
2414	/* Check whether period can match or not in the current context. If it can,
2415	return the amount of the bytes with which period can match, otherwise
2416	return 0.
2417	`pos' is the position of the period. `index' is the index from the
2418	buf_begin, and it is the current position in the buffer. */
2419	static int
2420	match_anychar (struct dfa *d, int s, position pos, int index)
2421	{
2422	int newline = 0;
2423	int letter = 0;
2424	wchar_t wc;
2425	int mbclen;
2426
2427	wc = inputwcs[index];
2428	mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2429
2430	/* Check context. */
2431	if (wc == (wchar_t)eolbyte)
2432	{
2433	if (!(syntax_bits & RE_DOT_NEWLINE))
2434	return 0;
2435	newline = 1;
2436	}
2437	else if (wc == (wchar_t)'\0')
2438	{
2439	if (syntax_bits & RE_DOT_NOT_NULL)
2440	return 0;
2441	newline = 1;
2442	}
2443
2444	if (iswalnum(wc) \|\| wc == L'_')
2445	letter = 1;
2446
2447	if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2448	newline, d->states[s].letter, letter))
2449	return 0;
2450
2451	return mbclen;
2452	}
2453
2454	/* Check whether bracket expression can match or not in the current context.
2455	If it can, return the amount of the bytes with which expression can match,
2456	otherwise return 0.
2457	`pos' is the position of the bracket expression. `index' is the index
2458	from the buf_begin, and it is the current position in the buffer. */
2459	int
2460	match_mb_charset (struct dfa *d, int s, position pos, int index)
2461	{
2462	int i;
2463	int match; /* Flag which represent that matching succeed. */
2464	int match_len; /* Length of the character (or collating element)
2465	with which this operator match. */
2466	int op_len; /* Length of the operator. */
2467	char buffer[128];
2468	wchar_t wcbuf[6];
2469
2470	/* Pointer to the structure to which we are currently reffering. */
2471	struct mb_char_classes *work_mbc;
2472
2473	int newline = 0;
2474	int letter = 0;
2475	wchar_t wc; /* Current reffering character. */
2476
2477	wc = inputwcs[index];
2478
2479	/* Check context. */
2480	if (wc == (wchar_t)eolbyte)
2481	{
2482	if (!(syntax_bits & RE_DOT_NEWLINE))
2483	return 0;
2484	newline = 1;
2485	}
2486	else if (wc == (wchar_t)'\0')
2487	{
2488	if (syntax_bits & RE_DOT_NOT_NULL)
2489	return 0;
2490	newline = 1;
2491	}
2492	if (iswalnum(wc) \|\| wc == L'_')
2493	letter = 1;
2494	if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2495	newline, d->states[s].letter, letter))
2496	return 0;
2497
2498	/* Assign the current reffering operator to work_mbc. */
2499	work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2500	match = !work_mbc->invert;
2501	match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2502
2503	/* match with a character class? */
2504	for (i = 0; i<work_mbc->nch_classes; i++)
2505	{
2506	if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2507	goto charset_matched;
2508	}
2509
2510	strncpy(buffer, buf_begin + index, match_len);
2511	buffer[match_len] = '\0';
2512
2513	/* match with an equivalent class? */
2514	for (i = 0; i<work_mbc->nequivs; i++)
2515	{
2516	op_len = strlen(work_mbc->equivs[i]);
2517	strncpy(buffer, buf_begin + index, op_len);
2518	buffer[op_len] = '\0';
2519	if (strcoll(work_mbc->equivs[i], buffer) == 0)
2520	{
2521	match_len = op_len;
2522	goto charset_matched;
2523	}
2524	}
2525
2526	/* match with a collating element? */
2527	for (i = 0; i<work_mbc->ncoll_elems; i++)
2528	{
2529	op_len = strlen(work_mbc->coll_elems[i]);
2530	strncpy(buffer, buf_begin + index, op_len);
2531	buffer[op_len] = '\0';
2532
2533	if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2534	{
2535	match_len = op_len;
2536	goto charset_matched;
2537	}
2538	}
2539
2540	wcbuf[0] = wc;
2541	wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2542
2543	/* match with a range? */
2544	for (i = 0; i<work_mbc->nranges; i++)
2545	{
2546	wcbuf[2] = work_mbc->range_sts[i];
2547	wcbuf[4] = work_mbc->range_ends[i];
2548
2549	if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2550	wcscoll(wcbuf+4, wcbuf) >= 0)
2551	goto charset_matched;
2552	}
2553
2554	/* match with a character? */
2555	for (i = 0; i<work_mbc->nchars; i++)
2556	{
2557	if (wc == work_mbc->chars[i])
2558	goto charset_matched;
2559	}
2560
2561	match = !match;
2562
2563	charset_matched:
2564	return match ? match_len : 0;
2565	}
2566
2567	/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2568	array which corresponds to `d->states[s].mbps.elem' and each element of
2569	the array contains the amount of the bytes with which the element can
2570	match.
2571	`index' is the index from the buf_begin, and it is the current position
2572	in the buffer.
2573	Caller MUST free the array which this function return. */
2574	static int*
2575	check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2576	{
2577	int i;
2578	int* rarray;
2579
2580	MALLOC(rarray, int, d->states[s].mbps.nelem);
2581	for (i = 0; i < d->states[s].mbps.nelem; ++i)
2582	{
2583	position pos = d->states[s].mbps.elems[i];
2584	switch(d->tokens[pos.index])
2585	{
2586	case ANYCHAR:
2587	rarray[i] = match_anychar(d, s, pos, index);
2588	break;
2589	case MBCSET:
2590	rarray[i] = match_mb_charset(d, s, pos, index);
2591	break;
2592	default:
2593	break; /* can not happen. */
2594	}
2595	}
2596	return rarray;
2597	}
2598
2599	/* Consume a single character and enumerate all of the positions which can
2600	be next position from the state `s'.
2601	`match_lens' is the input. It can be NULL, but it can also be the output
2602	of check_matching_with_multibyte_ops() for optimization.
2603	`mbclen' and `pps' are the output. `mbclen' is the length of the
2604	character consumed, and `pps' is the set this function enumerate. */
2605	static status_transit_state
2606	transit_state_consume_1char (struct dfa d, int s, unsigned char const *pp,
2607	int match_lens, int mbclen, position_set *pps)
2608	{
2609	int i, j;
2610	int s1, s2;
2611	int* work_mbls;
2612	status_transit_state rs = TRANSIT_STATE_DONE;
2613
2614	/* Calculate the length of the (single/multi byte) character
2615	to which p points. */
2616	mbclen = (mblen_buf[pp - buf_begin] == 0)? 1
2617	: mblen_buf[*pp - buf_begin];
2618
2619	/* Calculate the state which can be reached from the state `s' by
2620	consuming `mbclen' single bytes from the buffer. /
2621	s1 = s;
2622	for (i = 0; i < *mbclen; i++)
2623	{
2624	s2 = s1;
2625	rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2626	}
2627	/* Copy the positions contained by `s1' to the set `pps'. */
2628	copy(&(d->states[s1].elems), pps);
2629
2630	/* Check (inputed)match_lens, and initialize if it is NULL. */
2631	if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2632	work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2633	else
2634	work_mbls = match_lens;
2635
2636	/* Add all of the positions which can be reached from `s' by consuming
2637	a single character. */
2638	for (i = 0; i < d->states[s].mbps.nelem ; i++)
2639	{
2640	if (work_mbls[i] == *mbclen)
2641	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2642	j++)
2643	insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2644	pps);
2645	}
2646
2647	if (match_lens == NULL && work_mbls != NULL)
2648	free(work_mbls);
2649	return rs;
2650	}
2651
2652	/* Transit state from s, then return new state and update the pointer of the
2653	buffer. This function is for some operator which can match with a multi-
2654	byte character or a collating element(which may be multi characters). */
2655	static int
2656	transit_state (struct dfa d, int s, unsigned char const *pp)
2657	{
2658	int s1;
2659	int mbclen; /* The length of current input multibyte character. */
2660	int maxlen = 0;
2661	int i, j;
2662	int *match_lens = NULL;
2663	int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2664	position_set follows;
2665	unsigned char const p1 = pp;
2666	status_transit_state rs;
2667	wchar_t wc;
2668
2669	if (nelem > 0)
2670	/* This state has (a) multibyte operator(s).
2671	We check whether each of them can match or not. */
2672	{
2673	/* Note: caller must free the return value of this function. */
2674	match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2675
2676	for (i = 0; i < nelem; i++)
2677	/* Search the operator which match the longest string,
2678	in this state. */
2679	{
2680	if (match_lens[i] > maxlen)
2681	maxlen = match_lens[i];
2682	}
2683	}
2684
2685	if (nelem == 0 \|\| maxlen == 0)
2686	/* This state has no multibyte operator which can match.
2687	We need to check only one singlebyte character. */
2688	{
2689	status_transit_state rs;
2690	rs = transit_state_singlebyte(d, s, *pp, &s1);
2691
2692	/* We must update the pointer if state transition succeeded. */
2693	if (rs == TRANSIT_STATE_DONE)
2694	++*pp;
2695
2696	if (match_lens != NULL)
2697	free(match_lens);
2698	return s1;
2699	}
2700
2701	/* This state has some operators which can match a multibyte character. */
2702	follows.nelem = 0;
2703	MALLOC(follows.elems, position, d->nleaves);
2704
2705	/* `maxlen' may be longer than the length of a character, because it may
2706	not be a character but a (multi character) collating element.
2707	We enumerate all of the positions which `s' can reach by consuming
2708	`maxlen' bytes. */
2709	rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2710
2711	wc = inputwcs[*pp - mbclen - buf_begin];
2712	s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2713	realloc_trans_if_necessary(d, s1);
2714
2715	while (*pp - p1 < maxlen)
2716	{
2717	follows.nelem = 0;
2718	rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2719
2720	for (i = 0; i < nelem ; i++)
2721	{
2722	if (match_lens[i] == *pp - p1)
2723	for (j = 0;
2724	j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2725	insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2726	&follows);
2727	}
2728
2729	wc = inputwcs[*pp - mbclen - buf_begin];
2730	s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2731	realloc_trans_if_necessary(d, s1);
2732	}
2733	free(match_lens);
2734	free(follows.elems);
2735	return s1;
2736	}
2737
2738	#endif
2739
2740	/* Search through a buffer looking for a match to the given struct dfa.
2741	Find the first occurrence of a string matching the regexp in the buffer,
2742	and the shortest possible version thereof. Return the offset of the first
2743	character after the match, or (size_t) -1 if none is found. BEGIN points to
2744	the beginning of the buffer, and SIZE is the size of the buffer. If SIZE
2745	is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place
2746	where we're supposed to store a 1 if backreferencing happened and the
2747	match needs to be verified by a backtracking matcher. Otherwise
2748	we store a 0 in backref. /
2749	size_t
2750	dfaexec (struct dfa d, char const begin, size_t size, int *backref)
2751	{
2752	register int s; /* Current state. */
2753	register unsigned char const p; / Current input character. */
2754	register unsigned char const end; / One past the last input character. */
2755	register int *trans, t; /* Copy of d->trans so it can be optimized
2756	into a register. */
2757	register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2758	static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2759	static int sbit_init;
2760
2761	if (! sbit_init)
2762	{
2763	int i;
2764
2765	sbit_init = 1;
2766	for (i = 0; i < NOTCHAR; ++i)
2767	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2768	sbit[eol] = 4;
2769	}
2770
2771	if (! d->tralloc)
2772	build_state_zero(d);
2773
2774	s = 0;
2775	p = (unsigned char const *) begin;
2776	end = p + size;
2777	trans = d->trans;
2778
2779	#ifdef MBS_SUPPORT
2780	if (MB_CUR_MAX > 1)
2781	{
2782	int remain_bytes, i;
2783	buf_begin = begin;
2784	buf_end = end;
2785
2786	/* initialize mblen_buf, and inputwcs. */
2787	MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2);
2788	MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2);
2789	memset(&mbs, 0, sizeof(mbstate_t));
2790	remain_bytes = 0;
2791	for (i = 0; i < end - (unsigned char const *)begin + 1; i++)
2792	{
2793	if (remain_bytes == 0)
2794	{
2795	remain_bytes
2796	= mbrtowc(inputwcs + i, begin + i,
2797	end - (unsigned char const *)begin - i + 1, &mbs);
2798	if (remain_bytes <= 1)
2799	{
2800	remain_bytes = 0;
2801	inputwcs[i] = (wchar_t)begin[i];
2802	mblen_buf[i] = 0;
2803	}
2804	else
2805	{
2806	mblen_buf[i] = remain_bytes;
2807	remain_bytes--;
2808	}
2809	}
2810	else
2811	{
2812	mblen_buf[i] = remain_bytes;
2813	inputwcs[i] = 0;
2814	remain_bytes--;
2815	}
2816	}
2817	mblen_buf[i] = 0;
2818	inputwcs[i] = 0; /* sentinel */
2819	}
2820	#endif /* MBS_SUPPORT */
2821
2822	for (;;)
2823	{
2824	#ifdef MBS_SUPPORT
2825	if (MB_CUR_MAX > 1)
2826	while ((t = trans[s]))
2827	{
2828	if (d->states[s].mbps.nelem != 0)
2829	{
2830	/* Can match with a multibyte character( and multi character
2831	collating element). */
2832	unsigned char const *nextp;
2833
2834	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2835
2836	nextp = p;
2837	s = transit_state(d, s, &nextp);
2838	p = nextp;
2839
2840	/* Trans table might be updated. */
2841	trans = d->trans;
2842	}
2843	else
2844	{
2845	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2846	s = t[*p++];
2847	}
2848	}
2849	else
2850	#endif /* MBS_SUPPORT */
2851	while ((t = trans[s]))
2852	s = t[*p++];
2853
2854	if (s < 0)
2855	{
2856	if (p == end)
2857	{
2858	#ifdef MBS_SUPPORT
2859	if (MB_CUR_MAX > 1)
2860	{
2861	free(mblen_buf);
2862	free(inputwcs);
2863	}
2864	#endif /* MBS_SUPPORT */
2865	return (size_t) -1;
2866	}
2867	s = 0;
2868	}
2869	else if ((t = d->fails[s]))
2870	{
2871	if (d->success[s] & sbit[*p])
2872	{
2873	if (backref)
2874	*backref = (d->states[s].backref != 0);
2875	#ifdef MBS_SUPPORT
2876	if (MB_CUR_MAX > 1)
2877	{
2878	free(mblen_buf);
2879	free(inputwcs);
2880	}
2881	#endif /* MBS_SUPPORT */
2882	return (char const *) p - begin;
2883	}
2884
2885	#ifdef MBS_SUPPORT
2886	if (MB_CUR_MAX > 1)
2887	{
2888	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2889	if (d->states[s].mbps.nelem != 0)
2890	{
2891	/* Can match with a multibyte character( and multi
2892	character collating element). */
2893	unsigned char const *nextp;
2894	nextp = p;
2895	s = transit_state(d, s, &nextp);
2896	p = nextp;
2897
2898	/* Trans table might be updated. */
2899	trans = d->trans;
2900	}
2901	else
2902	s = t[*p++];
2903	}
2904	else
2905	#endif /* MBS_SUPPORT */
2906	s = t[*p++];
2907	}
2908	else
2909	{
2910	build_state(s, d);
2911	trans = d->trans;
2912	}
2913	}
2914	}
2915
2916	/* Initialize the components of a dfa that the other routines don't
2917	initialize for themselves. */
2918	void
2919	dfainit (struct dfa *d)
2920	{
2921	d->calloc = 1;
2922	MALLOC(d->charclasses, charclass, d->calloc);
2923	d->cindex = 0;
2924
2925	d->talloc = 1;
2926	MALLOC(d->tokens, token, d->talloc);
2927	d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2928	#ifdef MBS_SUPPORT
2929	if (MB_CUR_MAX > 1)
2930	{
2931	d->nmultibyte_prop = 1;
2932	MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
2933	d->nmbcsets = 0;
2934	d->mbcsets_alloc = 1;
2935	MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
2936	}
2937	#endif
2938
2939	d->searchflag = 0;
2940	d->tralloc = 0;
2941
2942	d->musts = 0;
2943	}
2944
2945	/* Parse and analyze a single string of the given length. */
2946	void
2947	dfacomp (char const s, size_t len, struct dfa d, int searchflag)
2948	{
2949	if (case_fold) /* dummy folding in service of dfamust() */
2950	{
2951	char *lcopy;
2952	int i;
2953
2954	lcopy = malloc(len);
2955	if (!lcopy)
2956	dfaerror(_("out of memory"));
2957
2958	/* This is a kludge. */
2959	case_fold = 0;
2960	for (i = 0; i < len; ++i)
2961	if (ISUPPER ((unsigned char) s[i]))
2962	lcopy[i] = tolower ((unsigned char) s[i]);
2963	else
2964	lcopy[i] = s[i];
2965
2966	dfainit(d);
2967	dfaparse(lcopy, len, d);
2968	free(lcopy);
2969	dfamust(d);
2970	d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2971	case_fold = 1;
2972	dfaparse(s, len, d);
2973	dfaanalyze(d, searchflag);
2974	}
2975	else
2976	{
2977	dfainit(d);
2978	dfaparse(s, len, d);
2979	dfamust(d);
2980	dfaanalyze(d, searchflag);
2981	}
2982	}
2983
2984	/* Free the storage held by the components of a dfa. */
2985	void
2986	dfafree (struct dfa *d)
2987	{
2988	int i;
2989	struct dfamust dm, ndm;
2990
2991	free((ptr_t) d->charclasses);
2992	free((ptr_t) d->tokens);
2993
2994	#ifdef MBS_SUPPORT
2995	if (MB_CUR_MAX > 1)
2996	{
2997	free((ptr_t) d->multibyte_prop);
2998	for (i = 0; i < d->nmbcsets; ++i)
2999	{
3000	int j;
3001	struct mb_char_classes *p = &(d->mbcsets[i]);
3002	if (p->chars != NULL)
3003	free(p->chars);
3004	if (p->ch_classes != NULL)
3005	free(p->ch_classes);
3006	if (p->range_sts != NULL)
3007	free(p->range_sts);
3008	if (p->range_ends != NULL)
3009	free(p->range_ends);
3010
3011	for (j = 0; j < p->nequivs; ++j)
3012	free(p->equivs[j]);
3013	if (p->equivs != NULL)
3014	free(p->equivs);
3015
3016	for (j = 0; j < p->ncoll_elems; ++j)
3017	free(p->coll_elems[j]);
3018	if (p->coll_elems != NULL)
3019	free(p->coll_elems);
3020	}
3021	free((ptr_t) d->mbcsets);
3022	}
3023	#endif /* MBS_SUPPORT */
3024
3025	for (i = 0; i < d->sindex; ++i)
3026	free((ptr_t) d->states[i].elems.elems);
3027	free((ptr_t) d->states);
3028	for (i = 0; i < d->tindex; ++i)
3029	if (d->follows[i].elems)
3030	free((ptr_t) d->follows[i].elems);
3031	free((ptr_t) d->follows);
3032	for (i = 0; i < d->tralloc; ++i)
3033	if (d->trans[i])
3034	free((ptr_t) d->trans[i]);
3035	else if (d->fails[i])
3036	free((ptr_t) d->fails[i]);
3037	if (d->realtrans) free((ptr_t) d->realtrans);
3038	if (d->fails) free((ptr_t) d->fails);
3039	if (d->success) free((ptr_t) d->success);
3040	for (dm = d->musts; dm; dm = ndm)
3041	{
3042	ndm = dm->next;
3043	free(dm->must);
3044	free((ptr_t) dm);
3045	}
3046	}
3047
3048	/* Having found the postfix representation of the regular expression,
3049	try to find a long sequence of characters that must appear in any line
3050	containing the r.e.
3051	Finding a "longest" sequence is beyond the scope here;
3052	we take an easy way out and hope for the best.
3053	(Take "(ab\|a)b"--please.)
3054
3055	We do a bottom-up calculation of sequences of characters that must appear
3056	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3057	representation:
3058	sequences that must appear at the left of the match ("left")
3059	sequences that must appear at the right of the match ("right")
3060	lists of sequences that must appear somewhere in the match ("in")
3061	sequences that must constitute the match ("is")
3062
3063	When we get to the root of the tree, we use one of the longest of its
3064	calculated "in" sequences as our answer. The sequence we find is returned in
3065	d->must (where "d" is the single argument passed to "dfamust");
3066	the length of the sequence is returned in d->mustn.
3067
3068	The sequences calculated for the various types of node (in pseudo ANSI c)
3069	are shown below. "p" is the operand of unary operators (and the left-hand
3070	operand of binary operators); "q" is the right-hand operand of binary
3071	operators.
3072
3073	"ZERO" means "a zero-length sequence" below.
3074
3075	Type left right is in
3076	---- ---- ----- -- --
3077	char c # c # c # c # c
3078
3079	ANYCHAR ZERO ZERO ZERO ZERO
3080
3081	MBCSET ZERO ZERO ZERO ZERO
3082
3083	CSET ZERO ZERO ZERO ZERO
3084
3085	STAR ZERO ZERO ZERO ZERO
3086
3087	QMARK ZERO ZERO ZERO ZERO
3088
3089	PLUS p->left p->right ZERO p->in
3090
3091	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3092	p->left : q->right : q->is!=ZERO) ? q->in plus
3093	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3094	ZERO
3095
3096	OR longest common longest common (do p->is and substrings common to
3097	leading trailing q->is have same p->in and q->in
3098	(sub)sequence (sub)sequence length and
3099	of p->left of p->right content) ?
3100	and q->left and q->right p->is : NULL
3101
3102	If there's anything else we recognize in the tree, all four sequences get set
3103	to zero-length sequences. If there's something we don't recognize in the tree,
3104	we just return a zero-length sequence.
3105
3106	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3107	'aaa')?
3108
3109	And. . .is it here or someplace that we might ponder "optimizations" such as
3110	egrep 'psi\|epsilon' -> egrep 'psi'
3111	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3112	(Yes, we now find "epsi" as a "string
3113	that must occur", but we might also
3114	simplify the entire r.e. being sought)
3115	grep '[c]' -> grep 'c'
3116	grep '(ab\|a)b' -> grep 'ab'
3117	grep 'ab*' -> grep 'a'
3118	grep 'a*b' -> grep 'b'
3119
3120	There are several issues:
3121
3122	Is optimization easy (enough)?
3123
3124	Does optimization actually accomplish anything,
3125	or is the automaton you get from "psi\|epsilon" (for example)
3126	the same as the one you get from "psi" (for example)?
3127
3128	Are optimizable r.e.'s likely to be used in real-life situations
3129	(something like 'ab*' is probably unlikely; something like is
3130	'psi\|epsilon' is likelier)? */
3131
3132	static char *
3133	icatalloc (char old, char new)
3134	{
3135	char *result;
3136	size_t oldsize, newsize;
3137
3138	newsize = (new == NULL) ? 0 : strlen(new);
3139	if (old == NULL)
3140	oldsize = 0;
3141	else if (newsize == 0)
3142	return old;
3143	else oldsize = strlen(old);
3144	if (old == NULL)
3145	result = (char *) malloc(newsize + 1);
3146	else
3147	result = (char ) realloc((void ) old, oldsize + newsize + 1);
3148	if (result != NULL && new != NULL)
3149	(void) strcpy(result + oldsize, new);
3150	return result;
3151	}
3152
3153	static char *
3154	icpyalloc (char *string)
3155	{
3156	return icatalloc((char *) NULL, string);
3157	}
3158
3159	static char *
3160	istrstr (char lookin, char lookfor)
3161	{
3162	char *cp;
3163	size_t len;
3164
3165	len = strlen(lookfor);
3166	for (cp = lookin; *cp != '\0'; ++cp)
3167	if (strncmp(cp, lookfor, len) == 0)
3168	return cp;
3169	return NULL;
3170	}
3171
3172	static void
3173	ifree (char *cp)
3174	{
3175	if (cp != NULL)
3176	free(cp);
3177	}
3178
3179	static void
3180	freelist (char **cpp)
3181	{
3182	int i;
3183
3184	if (cpp == NULL)
3185	return;
3186	for (i = 0; cpp[i] != NULL; ++i)
3187	{
3188	free(cpp[i]);
3189	cpp[i] = NULL;
3190	}
3191	}
3192
3193	static char **
3194	enlist (char *cpp, char new, size_t len)
3195	{
3196	int i, j;
3197
3198	if (cpp == NULL)
3199	return NULL;
3200	if ((new = icpyalloc(new)) == NULL)
3201	{
3202	freelist(cpp);
3203	return NULL;
3204	}
3205	new[len] = '\0';
3206	/* Is there already something in the list that's new (or longer)? */
3207	for (i = 0; cpp[i] != NULL; ++i)
3208	if (istrstr(cpp[i], new) != NULL)
3209	{
3210	free(new);
3211	return cpp;
3212	}
3213	/* Eliminate any obsoleted strings. */
3214	j = 0;
3215	while (cpp[j] != NULL)
3216	if (istrstr(new, cpp[j]) == NULL)
3217	++j;
3218	else
3219	{
3220	free(cpp[j]);
3221	if (--i == j)
3222	break;
3223	cpp[j] = cpp[i];
3224	cpp[i] = NULL;
3225	}
3226	/* Add the new string. */
3227	cpp = (char *) realloc((char ) cpp, (i + 2) * sizeof *cpp);
3228	if (cpp == NULL)
3229	return NULL;
3230	cpp[i] = new;
3231	cpp[i + 1] = NULL;
3232	return cpp;
3233	}
3234
3235	/* Given pointers to two strings, return a pointer to an allocated
3236	list of their distinct common substrings. Return NULL if something
3237	seems wild. */
3238	static char **
3239	comsubs (char left, char right)
3240	{
3241	char **cpp;
3242	char *lcp;
3243	char *rcp;
3244	size_t i, len;
3245
3246	if (left == NULL \|\| right == NULL)
3247	return NULL;
3248	cpp = (char *) malloc(sizeof cpp);
3249	if (cpp == NULL)
3250	return NULL;
3251	cpp[0] = NULL;
3252	for (lcp = left; *lcp != '\0'; ++lcp)
3253	{
3254	len = 0;
3255	rcp = strchr (right, *lcp);
3256	while (rcp != NULL)
3257	{
3258	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3259	continue;
3260	if (i > len)
3261	len = i;
3262	rcp = strchr (rcp + 1, *lcp);
3263	}
3264	if (len == 0)
3265	continue;
3266	if ((cpp = enlist(cpp, lcp, len)) == NULL)
3267	break;
3268	}
3269	return cpp;
3270	}
3271
3272	static char **
3273	addlists (char old, char new)
3274	{
3275	int i;
3276
3277	if (old == NULL \|\| new == NULL)
3278	return NULL;
3279	for (i = 0; new[i] != NULL; ++i)
3280	{
3281	old = enlist(old, new[i], strlen(new[i]));
3282	if (old == NULL)
3283	break;
3284	}
3285	return old;
3286	}
3287
3288	/* Given two lists of substrings, return a new list giving substrings
3289	common to both. */
3290	static char **
3291	inboth (char left, char right)
3292	{
3293	char **both;
3294	char **temp;
3295	int lnum, rnum;
3296
3297	if (left == NULL \|\| right == NULL)
3298	return NULL;
3299	both = (char *) malloc(sizeof both);
3300	if (both == NULL)
3301	return NULL;
3302	both[0] = NULL;
3303	for (lnum = 0; left[lnum] != NULL; ++lnum)
3304	{
3305	for (rnum = 0; right[rnum] != NULL; ++rnum)
3306	{
3307	temp = comsubs(left[lnum], right[rnum]);
3308	if (temp == NULL)
3309	{
3310	freelist(both);
3311	return NULL;
3312	}
3313	both = addlists(both, temp);
3314	freelist(temp);
3315	free(temp);
3316	if (both == NULL)
3317	return NULL;
3318	}
3319	}
3320	return both;
3321	}
3322
3323	typedef struct
3324	{
3325	char **in;
3326	char *left;
3327	char *right;
3328	char *is;
3329	} must;
3330
3331	static void
3332	resetmust (must *mp)
3333	{
3334	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3335	freelist(mp->in);
3336	}
3337
3338	static void
3339	dfamust (struct dfa *dfa)
3340	{
3341	must *musts;
3342	must *mp;
3343	char *result;
3344	int ri;
3345	int i;
3346	int exact;
3347	token t;
3348	static must must0;
3349	struct dfamust *dm;
3350	static char empty_string[] = "";
3351
3352	result = empty_string;
3353	exact = 0;
3354	musts = (must ) malloc((dfa->tindex + 1) sizeof *musts);
3355	if (musts == NULL)
3356	return;
3357	mp = musts;
3358	for (i = 0; i <= dfa->tindex; ++i)
3359	mp[i] = must0;
3360	for (i = 0; i <= dfa->tindex; ++i)
3361	{
3362	mp[i].in = (char *) malloc(sizeof mp[i].in);
3363	mp[i].left = malloc(2);
3364	mp[i].right = malloc(2);
3365	mp[i].is = malloc(2);
3366	if (mp[i].in == NULL \|\| mp[i].left == NULL \|\|
3367	mp[i].right == NULL \|\| mp[i].is == NULL)
3368	goto done;
3369	mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3370	mp[i].in[0] = NULL;
3371	}
3372	#ifdef DEBUG
3373	fprintf(stderr, "dfamust:\n");
3374	for (i = 0; i < dfa->tindex; ++i)
3375	{
3376	fprintf(stderr, " %d:", i);
3377	prtok(dfa->tokens[i]);
3378	}
3379	putc('\n', stderr);
3380	#endif
3381	for (ri = 0; ri < dfa->tindex; ++ri)
3382	{
3383	switch (t = dfa->tokens[ri])
3384	{
3385	case LPAREN:
3386	case RPAREN:
3387	goto done; /* "cannot happen" */
3388	case EMPTY:
3389	case BEGLINE:
3390	case ENDLINE:
3391	case BEGWORD:
3392	case ENDWORD:
3393	case LIMWORD:
3394	case NOTLIMWORD:
3395	case BACKREF:
3396	resetmust(mp);
3397	break;
3398	case STAR:
3399	case QMARK:
3400	if (mp <= musts)
3401	goto done; /* "cannot happen" */
3402	--mp;
3403	resetmust(mp);
3404	break;
3405	case OR:
3406	case ORTOP:
3407	if (mp < &musts[2])
3408	goto done; /* "cannot happen" */
3409	{
3410	char **new;
3411	must *lmp;
3412	must *rmp;
3413	int j, ln, rn, n;
3414
3415	rmp = --mp;
3416	lmp = --mp;
3417	/* Guaranteed to be. Unlikely, but. . . */
3418	if (strcmp(lmp->is, rmp->is) != 0)
3419	lmp->is[0] = '\0';
3420	/* Left side--easy */
3421	i = 0;
3422	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3423	++i;
3424	lmp->left[i] = '\0';
3425	/* Right side */
3426	ln = strlen(lmp->right);
3427	rn = strlen(rmp->right);
3428	n = ln;
3429	if (n > rn)
3430	n = rn;
3431	for (i = 0; i < n; ++i)
3432	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3433	break;
3434	for (j = 0; j < i; ++j)
3435	lmp->right[j] = lmp->right[(ln - i) + j];
3436	lmp->right[j] = '\0';
3437	new = inboth(lmp->in, rmp->in);
3438	if (new == NULL)
3439	goto done;
3440	freelist(lmp->in);
3441	free((char *) lmp->in);
3442	lmp->in = new;
3443	}
3444	break;
3445	case PLUS:
3446	if (mp <= musts)
3447	goto done; /* "cannot happen" */
3448	--mp;
3449	mp->is[0] = '\0';
3450	break;
3451	case END:
3452	if (mp != &musts[1])
3453	goto done; /* "cannot happen" */
3454	for (i = 0; musts[0].in[i] != NULL; ++i)
3455	if (strlen(musts[0].in[i]) > strlen(result))
3456	result = musts[0].in[i];
3457	if (strcmp(result, musts[0].is) == 0)
3458	exact = 1;
3459	goto done;
3460	case CAT:
3461	if (mp < &musts[2])
3462	goto done; /* "cannot happen" */
3463	{
3464	must *lmp;
3465	must *rmp;
3466
3467	rmp = --mp;
3468	lmp = --mp;
3469	/* In. Everything in left, plus everything in
3470	right, plus catenation of
3471	left's right and right's left. */
3472	lmp->in = addlists(lmp->in, rmp->in);
3473	if (lmp->in == NULL)
3474	goto done;
3475	if (lmp->right[0] != '\0' &&
3476	rmp->left[0] != '\0')
3477	{
3478	char *tp;
3479
3480	tp = icpyalloc(lmp->right);
3481	if (tp == NULL)
3482	goto done;
3483	tp = icatalloc(tp, rmp->left);
3484	if (tp == NULL)
3485	goto done;
3486	lmp->in = enlist(lmp->in, tp,
3487	strlen(tp));
3488	free(tp);
3489	if (lmp->in == NULL)
3490	goto done;
3491	}
3492	/* Left-hand */
3493	if (lmp->is[0] != '\0')
3494	{
3495	lmp->left = icatalloc(lmp->left,
3496	rmp->left);
3497	if (lmp->left == NULL)
3498	goto done;
3499	}
3500	/* Right-hand */
3501	if (rmp->is[0] == '\0')
3502	lmp->right[0] = '\0';
3503	lmp->right = icatalloc(lmp->right, rmp->right);
3504	if (lmp->right == NULL)
3505	goto done;
3506	/* Guaranteed to be */
3507	if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3508	{
3509	lmp->is = icatalloc(lmp->is, rmp->is);
3510	if (lmp->is == NULL)
3511	goto done;
3512	}
3513	else
3514	lmp->is[0] = '\0';
3515	}
3516	break;
3517	default:
3518	if (t < END)
3519	{
3520	/* "cannot happen" */
3521	goto done;
3522	}
3523	else if (t == '\0')
3524	{
3525	/* not on my shift */
3526	goto done;
3527	}
3528	else if (t >= CSET
3529	#ifdef MBS_SUPPORT
3530	\|\| t == ANYCHAR
3531	\|\| t == MBCSET
3532	#endif /* MBS_SUPPORT */
3533	)
3534	{
3535	/* easy enough */
3536	resetmust(mp);
3537	}
3538	else
3539	{
3540	/* plain character */
3541	resetmust(mp);
3542	mp->is[0] = mp->left[0] = mp->right[0] = t;
3543	mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3544	mp->in = enlist(mp->in, mp->is, (size_t)1);
3545	if (mp->in == NULL)
3546	goto done;
3547	}
3548	break;
3549	}
3550	#ifdef DEBUG
3551	fprintf(stderr, " node: %d:", ri);
3552	prtok(dfa->tokens[ri]);
3553	fprintf(stderr, "\n in:");
3554	for (i = 0; mp->in[i]; ++i)
3555	fprintf(stderr, " \"%s\"", mp->in[i]);
3556	fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3557	fprintf(stderr, " left: \"%s\"\n", mp->left);
3558	fprintf(stderr, " right: \"%s\"\n", mp->right);
3559	#endif
3560	++mp;
3561	}
3562	done:
3563	if (strlen(result))
3564	{
3565	dm = (struct dfamust *) malloc(sizeof (struct dfamust));
3566	dm->exact = exact;
3567	dm->must = malloc(strlen(result) + 1);
3568	strcpy(dm->must, result);
3569	dm->next = dfa->musts;
3570	dfa->musts = dm;
3571	}
3572	mp = musts;
3573	for (i = 0; i <= dfa->tindex; ++i)
3574	{
3575	freelist(mp[i].in);
3576	ifree((char *) mp[i].in);
3577	ifree(mp[i].left);
3578	ifree(mp[i].right);
3579	ifree(mp[i].is);
3580	}
3581	free((char *) mp);
3582	}
3583	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/grep/src/dfa.c@ 2817

Download in other formats: