Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

dfa.c@ 3090

Visit:

Last change on this file since 3090 was 3076, checked in by bird, 19 years ago
gawk 3.1.5
File size: 98.9 KB

Line
1	/* dfa.c - deterministic extended regexp routines for GNU
2	Copyright 1988, 1998, 2000, 2002, 2004, 2005 Free Software Foundation, Inc.
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; either version 2, or (at your option)
7	any later version.
8
9	This program is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License
15	along with this program; if not, write to the Free Software
16	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
17
18	/* Written June, 1988 by Mike Haertel
19	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
20
21	#ifdef HAVE_CONFIG_H
22	#include <config.h>
23	#endif
24
25	#include <assert.h>
26	#include <ctype.h>
27	#include <stdio.h>
28
29	#ifndef VMS
30	#include <sys/types.h>
31	#else
32	#include <stddef.h>
33	#endif
34	#ifdef STDC_HEADERS
35	#include <stdlib.h>
36	#else
37	extern char calloc(), malloc(), *realloc();
38	extern void free();
39	#endif
40
41	#if defined(HAVE_STRING_H) \|\| defined(STDC_HEADERS)
42	#include <string.h>
43	#else
44	#include <strings.h>
45	#endif
46
47	#if HAVE_SETLOCALE
48	# include <locale.h>
49	#endif
50
51
52	#ifndef DEBUG /* use the same approach as regex.c */
53	#undef assert
54	#define assert(e)
55	#endif /* DEBUG */
56
57	#ifndef isgraph
58	#define isgraph(C) (isprint(C) && !isspace(C))
59	#endif
60
61	#if defined (STDC_HEADERS) \|\| (!defined (isascii) && !defined (HAVE_ISASCII))
62	#define ISALPHA(C) isalpha(C)
63	#define ISUPPER(C) isupper(C)
64	#define ISLOWER(C) islower(C)
65	#define ISDIGIT(C) isdigit(C)
66	#define ISXDIGIT(C) isxdigit(C)
67	#define ISSPACE(C) isspace(C)
68	#define ISPUNCT(C) ispunct(C)
69	#define ISALNUM(C) isalnum(C)
70	#define ISPRINT(C) isprint(C)
71	#define ISGRAPH(C) isgraph(C)
72	#define ISCNTRL(C) iscntrl(C)
73	#else
74	#define ISALPHA(C) (isascii(C) && isalpha(C))
75	#define ISUPPER(C) (isascii(C) && isupper(C))
76	#define ISLOWER(C) (isascii(C) && islower(C))
77	#define ISDIGIT(C) (isascii(C) && isdigit(C))
78	#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
79	#define ISSPACE(C) (isascii(C) && isspace(C))
80	#define ISPUNCT(C) (isascii(C) && ispunct(C))
81	#define ISALNUM(C) (isascii(C) && isalnum(C))
82	#define ISPRINT(C) (isascii(C) && isprint(C))
83	#define ISGRAPH(C) (isascii(C) && isgraph(C))
84	#define ISCNTRL(C) (isascii(C) && iscntrl(C))
85	#endif
86
87	/* ISASCIIDIGIT differs from ISDIGIT, as follows:
88	- Its arg may be any int or unsigned int; it need not be an unsigned char.
89	- It's guaranteed to evaluate its argument exactly once.
90	- It's typically faster.
91	Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
92	only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
93	it's important to use the locale's definition of `digit' even when the
94	host does not conform to Posix. */
95	#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
96
97	/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
98	#include "gettext.h"
99	#define _(str) gettext (str)
100
101	#ifndef NO_MBSUPPORT
102	#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
103	#endif
104	#ifdef MBS_SUPPORT
105	/* We can handle multibyte strings. */
106	# include <wchar.h>
107	# include <wctype.h>
108	#endif
109
110	#include "regex.h"
111	#include "dfa.h"
112	#include "hard-locale.h"
113
114	/* HPUX, define those as macros in sys/param.h */
115	#ifdef setbit
116	# undef setbit
117	#endif
118	#ifdef clrbit
119	# undef clrbit
120	#endif
121
122	static void dfamust PARAMS ((struct dfa *dfa));
123
124	static ptr_t xcalloc PARAMS ((size_t n, size_t s));
125	static ptr_t xmalloc PARAMS ((size_t n));
126	static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
127	#ifdef DEBUG
128	static void prtok PARAMS ((token t));
129	#endif
130	static int tstbit PARAMS ((unsigned b, charclass c));
131	static void setbit PARAMS ((unsigned b, charclass c));
132	static void clrbit PARAMS ((unsigned b, charclass c));
133	static void copyset PARAMS ((charclass src, charclass dst));
134	static void zeroset PARAMS ((charclass s));
135	static void notset PARAMS ((charclass s));
136	static int equal PARAMS ((charclass s1, charclass s2));
137	static int charclass_index PARAMS ((charclass s));
138	static int looking_at PARAMS ((const char *s));
139	static token lex PARAMS ((void));
140	static void addtok PARAMS ((token t));
141	static void atom PARAMS ((void));
142	static int nsubtoks PARAMS ((int tindex));
143	static void copytoks PARAMS ((int tindex, int ntokens));
144	static void closure PARAMS ((void));
145	static void branch PARAMS ((void));
146	static void regexp PARAMS ((int toplevel));
147	static void copy PARAMS ((position_set const src, position_set dst));
148	static void insert PARAMS ((position p, position_set *s));
149	static void merge PARAMS ((position_set const s1, position_set const s2, position_set *m));
150	static void delete PARAMS ((position p, position_set *s));
151	static int state_index PARAMS ((struct dfa d, position_set const s,
152	int newline, int letter));
153	static void build_state PARAMS ((int s, struct dfa *d));
154	static void build_state_zero PARAMS ((struct dfa *d));
155	static char icatalloc PARAMS ((char old, char *new));
156	static char icpyalloc PARAMS ((char string));
157	static char istrstr PARAMS ((char lookin, char *lookfor));
158	static void ifree PARAMS ((char *cp));
159	static void freelist PARAMS ((char **cpp));
160	static char enlist PARAMS ((char cpp, char *new, size_t len));
161	static char *comsubs PARAMS ((char left, char *right));
162	static char addlists PARAMS ((char old, char **new));
163	static char inboth PARAMS ((char left, char **right));
164
165	static ptr_t
166	xcalloc (size_t n, size_t s)
167	{
168	ptr_t r = calloc(n, s);
169
170	if (!r)
171	dfaerror(_("Memory exhausted"));
172	return r;
173	}
174
175	static ptr_t
176	xmalloc (size_t n)
177	{
178	ptr_t r = malloc(n);
179
180	assert(n != 0);
181	if (!r)
182	dfaerror(_("Memory exhausted"));
183	return r;
184	}
185
186	static ptr_t
187	xrealloc (ptr_t p, size_t n)
188	{
189	ptr_t r = realloc(p, n);
190
191	assert(n != 0);
192	if (!r)
193	dfaerror(_("Memory exhausted"));
194	return r;
195	}
196
197	#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
198	#define MALLOC(p, t, n) ((p) = (t ) xmalloc((n) sizeof (t)))
199	#define REALLOC(p, t, n) ((p) = (t ) xrealloc((ptr_t) (p), (n) sizeof (t)))
200
201	/* Reallocate an array of type t if nalloc is too small for index. */
202	#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
203	if ((index) >= (nalloc)) \
204	{ \
205	do \
206	(nalloc) *= 2; \
207	while ((index) >= (nalloc)); \
208	REALLOC(p, t, nalloc); \
209	}
210
211	#ifdef DEBUG
212
213	static void
214	prtok (token t)
215	{
216	char const *s;
217
218	if (t < 0)
219	fprintf(stderr, "END");
220	else if (t < NOTCHAR)
221	fprintf(stderr, "%c", t);
222	else
223	{
224	switch (t)
225	{
226	case EMPTY: s = "EMPTY"; break;
227	case BACKREF: s = "BACKREF"; break;
228	case BEGLINE: s = "BEGLINE"; break;
229	case ENDLINE: s = "ENDLINE"; break;
230	case BEGWORD: s = "BEGWORD"; break;
231	case ENDWORD: s = "ENDWORD"; break;
232	case LIMWORD: s = "LIMWORD"; break;
233	case NOTLIMWORD: s = "NOTLIMWORD"; break;
234	case QMARK: s = "QMARK"; break;
235	case STAR: s = "STAR"; break;
236	case PLUS: s = "PLUS"; break;
237	case CAT: s = "CAT"; break;
238	case OR: s = "OR"; break;
239	case ORTOP: s = "ORTOP"; break;
240	case LPAREN: s = "LPAREN"; break;
241	case RPAREN: s = "RPAREN"; break;
242	case CRANGE: s = "CRANGE"; break;
243	#ifdef MBS_SUPPORT
244	case ANYCHAR: s = "ANYCHAR"; break;
245	case MBCSET: s = "MBCSET"; break;
246	#endif /* MBS_SUPPORT */
247	default: s = "CSET"; break;
248	}
249	fprintf(stderr, "%s", s);
250	}
251	}
252	#endif /* DEBUG */
253
254	/* Stuff pertaining to charclasses. */
255
256	static int
257	tstbit (unsigned b, charclass c)
258	{
259	return c[b / INTBITS] & 1 << b % INTBITS;
260	}
261
262	static void
263	setbit (unsigned b, charclass c)
264	{
265	c[b / INTBITS] \|= 1 << b % INTBITS;
266	}
267
268	static void
269	clrbit (unsigned b, charclass c)
270	{
271	c[b / INTBITS] &= ~(1 << b % INTBITS);
272	}
273
274	static void
275	copyset (charclass src, charclass dst)
276	{
277	memcpy (dst, src, sizeof (charclass));
278	}
279
280	static void
281	zeroset (charclass s)
282	{
283	memset (s, 0, sizeof (charclass));
284	}
285
286	static void
287	notset (charclass s)
288	{
289	int i;
290
291	for (i = 0; i < CHARCLASS_INTS; ++i)
292	s[i] = ~s[i];
293	}
294
295	static int
296	equal (charclass s1, charclass s2)
297	{
298	return memcmp (s1, s2, sizeof (charclass)) == 0;
299	}
300
301	/* A pointer to the current dfa is kept here during parsing. */
302	static struct dfa *dfa;
303
304	/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
305	static int
306	charclass_index (charclass s)
307	{
308	int i;
309
310	for (i = 0; i < dfa->cindex; ++i)
311	if (equal(s, dfa->charclasses[i]))
312	return i;
313	REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
314	++dfa->cindex;
315	copyset(s, dfa->charclasses[i]);
316	return i;
317	}
318
319	/* Syntax bits controlling the behavior of the lexical analyzer. */
320	static reg_syntax_t syntax_bits, syntax_bits_set;
321
322	/* Flag for case-folding letters into sets. */
323	static int case_fold;
324
325	/* End-of-line byte in data. */
326	static unsigned char eolbyte;
327
328	/* Entry point to set syntax options. */
329	void
330	dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
331	{
332	syntax_bits_set = 1;
333	syntax_bits = bits;
334	case_fold = fold;
335	eolbyte = eol;
336	}
337
338	/* Like setbit, but if case is folded, set both cases of a letter. */
339	static void
340	setbit_case_fold (unsigned b, charclass c)
341	{
342	setbit (b, c);
343	if (case_fold)
344	{
345	if (ISUPPER (b))
346	setbit (tolower (b), c);
347	else if (ISLOWER (b))
348	setbit (toupper (b), c);
349	}
350	}
351
352	/* Lexical analyzer. All the dross that deals with the obnoxious
353	GNU Regex syntax bits is located here. The poor, suffering
354	reader is referred to the GNU Regex documentation for the
355	meaning of the @#%!@#%^!@ syntax bits. */
356
357	static char const lexptr; / Pointer to next input character. */
358	static int lexleft; /* Number of characters remaining. */
359	static token lasttok; /* Previous token returned; initially END. */
360	static int laststart; /* True if we're separated from beginning or (, \|
361	only by zero-width characters. */
362	static int parens; /* Count of outstanding left parens. */
363	static int minrep, maxrep; /* Repeat counts for {m,n}. */
364	static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
365
366	#ifdef MBS_SUPPORT
367	/* These variables are used only if (MB_CUR_MAX > 1). */
368	static mbstate_t mbs; /* Mbstate for mbrlen(). */
369	static int cur_mb_len; /* Byte length of the current scanning
370	multibyte character. */
371	static int cur_mb_index; /* Byte index of the current scanning multibyte
372	character.
373
374	single byte character : cur_mb_index = 0
375	multibyte character
376	1st byte : cur_mb_index = 1
377	2nd byte : cur_mb_index = 2
378	...
379	nth byte : cur_mb_index = n */
380	static unsigned char mblen_buf;/ Correspond to the input buffer in dfaexec().
381	Each element store the amount of remain
382	byte of corresponding multibyte character
383	in the input string. A element's value
384	is 0 if corresponding character is a
385	single byte chracter.
386	e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
387	mblen_buf : 0, 3, 2, 1
388	*/
389	static wchar_t inputwcs; / Wide character representation of input
390	string in dfaexec().
391	The length of this array is same as
392	the length of input string(char array).
393	inputstring[i] is a single-byte char,
394	or 1st byte of a multibyte char.
395	And inputwcs[i] is the codepoint. */
396	static unsigned char const buf_begin; / reference to begin in dfaexec(). */
397	static unsigned char const buf_end; / reference to end in dfaexec(). */
398	#endif /* MBS_SUPPORT */
399
400	#ifdef MBS_SUPPORT
401	/* This function update cur_mb_len, and cur_mb_index.
402	p points current lexptr, len is the remaining buffer length. */
403	static void
404	update_mb_len_index (unsigned char const *p, int len)
405	{
406	/* If last character is a part of a multibyte character,
407	we update cur_mb_index. */
408	if (cur_mb_index)
409	cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
410	: cur_mb_index + 1;
411
412	/* If last character is a single byte character, or the
413	last portion of a multibyte character, we check whether
414	next character is a multibyte character or not. */
415	if (! cur_mb_index)
416	{
417	cur_mb_len = mbrlen(p, len, &mbs);
418	if (cur_mb_len > 1)
419	/* It is a multibyte character.
420	cur_mb_len was already set by mbrlen(). */
421	cur_mb_index = 1;
422	else if (cur_mb_len < 1)
423	/* Invalid sequence. We treat it as a single byte character.
424	cur_mb_index is aleady 0. */
425	cur_mb_len = 1;
426	/* Otherwise, cur_mb_len == 1, it is a single byte character.
427	cur_mb_index is aleady 0. */
428	}
429	}
430	#endif /* MBS_SUPPORT */
431
432	#ifdef MBS_SUPPORT
433	/* Note that characters become unsigned here. */
434	# define FETCH(c, eoferr) \
435	{ \
436	if (! lexleft) \
437	{ \
438	if (eoferr != 0) \
439	dfaerror (eoferr); \
440	else \
441	return lasttok = END; \
442	} \
443	if (MB_CUR_MAX > 1) \
444	update_mb_len_index(lexptr, lexleft); \
445	(c) = (unsigned char) *lexptr++; \
446	--lexleft; \
447	}
448
449	/* This function fetch a wide character, and update cur_mb_len,
450	used only if the current locale is a multibyte environment. */
451	static wint_t
452	fetch_wc (char const *eoferr)
453	{
454	wchar_t wc;
455	if (! lexleft)
456	{
457	if (eoferr != 0)
458	dfaerror (eoferr);
459	else
460	return WEOF;
461	}
462
463	cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
464	if (cur_mb_len <= 0)
465	{
466	cur_mb_len = 1;
467	wc = *lexptr;
468	}
469	lexptr += cur_mb_len;
470	lexleft -= cur_mb_len;
471	return wc;
472	}
473	#else
474	/* Note that characters become unsigned here. */
475	# define FETCH(c, eoferr) \
476	{ \
477	if (! lexleft) \
478	{ \
479	if (eoferr != 0) \
480	dfaerror (eoferr); \
481	else \
482	return lasttok = END; \
483	} \
484	(c) = (unsigned char) *lexptr++; \
485	--lexleft; \
486	}
487	#endif /* MBS_SUPPORT */
488
489	#ifdef MBS_SUPPORT
490	/* Multibyte character handling sub-routine for lex.
491	This function parse a bracket expression and build a struct
492	mb_char_classes. */
493	static void
494	parse_bracket_exp_mb ()
495	{
496	wint_t wc, wc1, wc2;
497
498	/* Work area to build a mb_char_classes. */
499	struct mb_char_classes *work_mbc;
500	int chars_al, range_sts_al, range_ends_al, ch_classes_al,
501	equivs_al, coll_elems_al;
502
503	REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
504	dfa->mbcsets_alloc, dfa->nmbcsets + 1);
505	/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
506	We will update dfa->multibyte_prop[] in addtok(), because we can't
507	decide the index in dfa->tokens[]. */
508
509	/* Initialize work are */
510	work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
511
512	chars_al = 1;
513	range_sts_al = range_ends_al = 0;
514	ch_classes_al = equivs_al = coll_elems_al = 0;
515	MALLOC(work_mbc->chars, wchar_t, chars_al);
516
517	work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
518	work_mbc->nequivs = work_mbc->ncoll_elems = 0;
519	work_mbc->chars = NULL;
520	work_mbc->ch_classes = NULL;
521	work_mbc->range_sts = work_mbc->range_ends = NULL;
522	work_mbc->equivs = work_mbc->coll_elems = NULL;
523
524	wc = fetch_wc(_("Unbalanced ["));
525	if (wc == L'^')
526	{
527	wc = fetch_wc(_("Unbalanced ["));
528	work_mbc->invert = 1;
529	}
530	else
531	work_mbc->invert = 0;
532	do
533	{
534	wc1 = WEOF; /* mark wc1 is not initialized". */
535
536	/* Note that if we're looking at some other [:...:] construct,
537	we just treat it as a bunch of ordinary characters. We can do
538	this because we assume regex has checked for syntax errors before
539	dfa is ever called. */
540	if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
541	{
542	#define BRACKET_BUFFER_SIZE 128
543	char str[BRACKET_BUFFER_SIZE];
544	wc1 = wc;
545	wc = fetch_wc(_("Unbalanced ["));
546
547	/* If pattern contains `[[:', `[[.', or `[[='. */
548	if (cur_mb_len == 1 && (wc == L':' \|\| wc == L'.' \|\| wc == L'='))
549	{
550	unsigned char c;
551	unsigned char delim = (unsigned char)wc;
552	int len = 0;
553	for (;;)
554	{
555	if (! lexleft)
556	dfaerror (_("Unbalanced ["));
557	c = (unsigned char) *lexptr++;
558	--lexleft;
559
560	if ((c == delim && *lexptr == ']') \|\| lexleft == 0)
561	break;
562	if (len < BRACKET_BUFFER_SIZE)
563	str[len++] = c;
564	else
565	/* This is in any case an invalid class name. */
566	str[0] = '\0';
567	}
568	str[len] = '\0';
569
570	if (lexleft == 0)
571	{
572	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
573	work_mbc->nchars + 2);
574	work_mbc->chars[work_mbc->nchars++] = L'[';
575	work_mbc->chars[work_mbc->nchars++] = delim;
576	break;
577	}
578
579	if (--lexleft, *lexptr++ != ']')
580	dfaerror (_("Unbalanced ["));
581	if (delim == ':')
582	/* build character class. */
583	{
584	wctype_t wt;
585	/* Query the character class as wctype_t. */
586	if (case_fold && (strcmp(str, "upper") == 0 \|\| strcmp(str, "lower") == 0))
587	strcpy(str, "alpha");
588
589	wt = wctype (str);
590
591	if (ch_classes_al == 0)
592	MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
593	REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
594	ch_classes_al,
595	work_mbc->nch_classes + 1);
596	work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
597
598	}
599	else if (delim == '=' \|\| delim == '.')
600	{
601	char *elem;
602	MALLOC(elem, char, len + 1);
603	strncpy(elem, str, len + 1);
604
605	if (delim == '=')
606	/* build equivalent class. */
607	{
608	if (equivs_al == 0)
609	MALLOC(work_mbc->equivs, char*, ++equivs_al);
610	REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
611	equivs_al,
612	work_mbc->nequivs + 1);
613	work_mbc->equivs[work_mbc->nequivs++] = elem;
614	}
615
616	if (delim == '.')
617	/* build collating element. */
618	{
619	if (coll_elems_al == 0)
620	MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
621	REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
622	coll_elems_al,
623	work_mbc->ncoll_elems + 1);
624	work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
625	}
626	}
627	wc = wc1 = WEOF;
628	}
629	else
630	/* We treat '[' as a normal character here. */
631	{
632	wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
633	}
634	}
635	else
636	{
637	if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
638	wc = fetch_wc(("Unbalanced ["));
639	}
640
641	if (wc1 == WEOF)
642	wc1 = fetch_wc(_("Unbalanced ["));
643
644	if (wc1 == L'-')
645	/* build range characters. */
646	{
647	wc2 = fetch_wc(_("Unbalanced ["));
648	if (wc2 == L']')
649	{
650	/* In the case [x-], the - is an ordinary hyphen,
651	which is left in c1, the lookahead character. */
652	lexptr -= cur_mb_len;
653	lexleft += cur_mb_len;
654	wc2 = wc;
655	}
656	else
657	{
658	if (wc2 == L'\\'
659	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
660	wc2 = fetch_wc(_("Unbalanced ["));
661	wc1 = fetch_wc(_("Unbalanced ["));
662	}
663
664	if (range_sts_al == 0)
665	{
666	MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
667	MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
668	}
669	REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
670	range_sts_al, work_mbc->nranges + 1);
671	work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
672	REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
673	range_ends_al, work_mbc->nranges + 1);
674	work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
675	if (case_fold && (iswlower((wint_t)wc) \|\| iswupper((wint_t)wc))
676	&& (iswlower((wint_t)wc2) \|\| iswupper((wint_t)wc2)))
677	{
678	wint_t altcase;
679	altcase = wc;
680	if (iswlower((wint_t)wc))
681	altcase = towupper((wint_t)wc);
682	else
683	altcase = towlower((wint_t)wc);
684	REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
685	range_sts_al, work_mbc->nranges + 1);
686	work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
687
688	altcase = wc2;
689	if (iswlower((wint_t)wc2))
690	altcase = towupper((wint_t)wc2);
691	else
692	altcase = towlower((wint_t)wc2);
693	REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
694	range_ends_al, work_mbc->nranges + 1);
695	work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
696	}
697	}
698	else if (wc != WEOF)
699	/* build normal characters. */
700	{
701	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
702	work_mbc->nchars + 1);
703	work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
704	if (case_fold && (iswlower(wc) \|\| iswupper(wc)))
705	{
706	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
707	work_mbc->nchars + 1);
708	work_mbc->chars[work_mbc->nchars++] =
709	(wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
710	}
711	}
712	}
713	while ((wc = wc1) != L']');
714	}
715	#endif /* MBS_SUPPORT */
716
717	#ifdef __STDC__
718	#define FUNC(F, P) static int F(int c) { return P(c); }
719	#else
720	#define FUNC(F, P) static int F(c) int c; { return P(c); }
721	#endif
722
723	FUNC(is_alpha, ISALPHA)
724	FUNC(is_upper, ISUPPER)
725	FUNC(is_lower, ISLOWER)
726	FUNC(is_digit, ISDIGIT)
727	FUNC(is_xdigit, ISXDIGIT)
728	FUNC(is_space, ISSPACE)
729	FUNC(is_punct, ISPUNCT)
730	FUNC(is_alnum, ISALNUM)
731	FUNC(is_print, ISPRINT)
732	FUNC(is_graph, ISGRAPH)
733	FUNC(is_cntrl, ISCNTRL)
734
735	static int
736	is_blank (int c)
737	{
738	return (c == ' ' \|\| c == '\t');
739	}
740
741	/* The following list maps the names of the Posix named character classes
742	to predicate functions that determine whether a given character is in
743	the class. The leading [ has already been eaten by the lexical analyzer. */
744	static struct {
745	const char *name;
746	int (*pred) PARAMS ((int));
747	} const prednames[] = {
748	{ ":alpha:]", is_alpha },
749	{ ":upper:]", is_upper },
750	{ ":lower:]", is_lower },
751	{ ":digit:]", is_digit },
752	{ ":xdigit:]", is_xdigit },
753	{ ":space:]", is_space },
754	{ ":punct:]", is_punct },
755	{ ":alnum:]", is_alnum },
756	{ ":print:]", is_print },
757	{ ":graph:]", is_graph },
758	{ ":cntrl:]", is_cntrl },
759	{ ":blank:]", is_blank },
760	{ 0 }
761	};
762
763	/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
764	#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) \|\| (C) == '_')
765
766	static int
767	looking_at (char const *s)
768	{
769	size_t len;
770
771	len = strlen(s);
772	if (lexleft < len)
773	return 0;
774	return strncmp(s, lexptr, len) == 0;
775	}
776
777	static token
778	lex (void)
779	{
780	unsigned c, c1, c2;
781	int backslash = 0, invert;
782	charclass ccl;
783	int i;
784
785	/* Basic plan: We fetch a character. If it's a backslash,
786	we set the backslash flag and go through the loop again.
787	On the plus side, this avoids having a duplicate of the
788	main switch inside the backslash case. On the minus side,
789	it means that just about every case begins with
790	"if (backslash) ...". */
791	for (i = 0; i < 2; ++i)
792	{
793	FETCH(c, 0);
794	#ifdef MBS_SUPPORT
795	if (MB_CUR_MAX > 1 && cur_mb_index)
796	/* If this is a part of a multi-byte character, we must treat
797	this byte data as a normal character.
798	e.g. In case of SJIS encoding, some character contains '\',
799	but they must not be backslash. */
800	goto normal_char;
801	#endif /* MBS_SUPPORT */
802	switch (c)
803	{
804	case '\\':
805	if (backslash)
806	goto normal_char;
807	if (lexleft == 0)
808	dfaerror(_("Unfinished \\ escape"));
809	backslash = 1;
810	break;
811
812	case '^':
813	if (backslash)
814	goto normal_char;
815	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
816	\|\| lasttok == END
817	\|\| lasttok == LPAREN
818	\|\| lasttok == OR)
819	return lasttok = BEGLINE;
820	goto normal_char;
821
822	case '$':
823	if (backslash)
824	goto normal_char;
825	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
826	\|\| lexleft == 0
827	\|\| (syntax_bits & RE_NO_BK_PARENS
828	? lexleft > 0 && *lexptr == ')'
829	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
830	\|\| (syntax_bits & RE_NO_BK_VBAR
831	? lexleft > 0 && *lexptr == '\|'
832	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '\|')
833	\|\| ((syntax_bits & RE_NEWLINE_ALT)
834	&& lexleft > 0 && *lexptr == '\n'))
835	return lasttok = ENDLINE;
836	goto normal_char;
837
838	case '1':
839	case '2':
840	case '3':
841	case '4':
842	case '5':
843	case '6':
844	case '7':
845	case '8':
846	case '9':
847	if (backslash && !(syntax_bits & RE_NO_BK_REFS))
848	{
849	laststart = 0;
850	return lasttok = BACKREF;
851	}
852	goto normal_char;
853
854	case '`':
855	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
856	return lasttok = BEGLINE; /* FIXME: should be beginning of string */
857	goto normal_char;
858
859	case '\'':
860	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
861	return lasttok = ENDLINE; /* FIXME: should be end of string */
862	goto normal_char;
863
864	case '<':
865	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
866	return lasttok = BEGWORD;
867	goto normal_char;
868
869	case '>':
870	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
871	return lasttok = ENDWORD;
872	goto normal_char;
873
874	case 'b':
875	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
876	return lasttok = LIMWORD;
877	goto normal_char;
878
879	case 'B':
880	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
881	return lasttok = NOTLIMWORD;
882	goto normal_char;
883
884	case '?':
885	if (syntax_bits & RE_LIMITED_OPS)
886	goto normal_char;
887	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
888	goto normal_char;
889	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
890	goto normal_char;
891	return lasttok = QMARK;
892
893	case '*':
894	if (backslash)
895	goto normal_char;
896	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
897	goto normal_char;
898	return lasttok = STAR;
899
900	case '+':
901	if (syntax_bits & RE_LIMITED_OPS)
902	goto normal_char;
903	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
904	goto normal_char;
905	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
906	goto normal_char;
907	return lasttok = PLUS;
908
909	case '{':
910	if (!(syntax_bits & RE_INTERVALS))
911	goto normal_char;
912	if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
913	goto normal_char;
914	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
915	goto normal_char;
916
917	if (syntax_bits & RE_NO_BK_BRACES)
918	{
919	/* Scan ahead for a valid interval; if it's not valid,
920	treat it as a literal '{'. */
921	int lo = -1, hi = -1;
922	char const *p = lexptr;
923	char const *lim = p + lexleft;
924	for (; p != lim && ISASCIIDIGIT (*p); p++)
925	lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
926	if (p != lim && *p == ',')
927	while (++p != lim && ISASCIIDIGIT (*p))
928	hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
929	else
930	hi = lo;
931	if (p == lim \|\| *p != '}'
932	\|\| lo < 0 \|\| RE_DUP_MAX < hi \|\| (0 <= hi && hi < lo))
933	goto normal_char;
934	}
935
936	minrep = 0;
937	/* Cases:
938	{M} - exact count
939	{M,} - minimum count, maximum is infinity
940	{M,N} - M through N */
941	FETCH(c, _("unfinished repeat count"));
942	if (ISASCIIDIGIT (c))
943	{
944	minrep = c - '0';
945	for (;;)
946	{
947	FETCH(c, _("unfinished repeat count"));
948	if (! ISASCIIDIGIT (c))
949	break;
950	minrep = 10 * minrep + c - '0';
951	}
952	}
953	else
954	dfaerror(_("malformed repeat count"));
955	if (c == ',')
956	{
957	FETCH (c, _("unfinished repeat count"));
958	if (! ISASCIIDIGIT (c))
959	maxrep = -1;
960	else
961	{
962	maxrep = c - '0';
963	for (;;)
964	{
965	FETCH (c, _("unfinished repeat count"));
966	if (! ISASCIIDIGIT (c))
967	break;
968	maxrep = 10 * maxrep + c - '0';
969	}
970	if (0 <= maxrep && maxrep < minrep)
971	dfaerror (_("malformed repeat count"));
972	}
973	}
974	else
975	maxrep = minrep;
976	if (!(syntax_bits & RE_NO_BK_BRACES))
977	{
978	if (c != '\\')
979	dfaerror(_("malformed repeat count"));
980	FETCH(c, _("unfinished repeat count"));
981	}
982	if (c != '}')
983	dfaerror(_("malformed repeat count"));
984	laststart = 0;
985	return lasttok = REPMN;
986
987	case '\|':
988	if (syntax_bits & RE_LIMITED_OPS)
989	goto normal_char;
990	if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
991	goto normal_char;
992	laststart = 1;
993	return lasttok = OR;
994
995	case '\n':
996	if (syntax_bits & RE_LIMITED_OPS
997	\|\| backslash
998	\|\| !(syntax_bits & RE_NEWLINE_ALT))
999	goto normal_char;
1000	laststart = 1;
1001	return lasttok = OR;
1002
1003	case '(':
1004	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1005	goto normal_char;
1006	++parens;
1007	laststart = 1;
1008	return lasttok = LPAREN;
1009
1010	case ')':
1011	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1012	goto normal_char;
1013	if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1014	goto normal_char;
1015	--parens;
1016	laststart = 0;
1017	return lasttok = RPAREN;
1018
1019	case '.':
1020	if (backslash)
1021	goto normal_char;
1022	#ifdef MBS_SUPPORT
1023	if (MB_CUR_MAX > 1)
1024	{
1025	/* In multibyte environment period must match with a single
1026	character not a byte. So we use ANYCHAR. */
1027	laststart = 0;
1028	return lasttok = ANYCHAR;
1029	}
1030	#endif /* MBS_SUPPORT */
1031	zeroset(ccl);
1032	notset(ccl);
1033	if (!(syntax_bits & RE_DOT_NEWLINE))
1034	clrbit(eolbyte, ccl);
1035	if (syntax_bits & RE_DOT_NOT_NULL)
1036	clrbit('\0', ccl);
1037	laststart = 0;
1038	return lasttok = CSET + charclass_index(ccl);
1039
1040	#ifndef GAWK
1041	case 's':
1042	case 'S':
1043	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
1044	goto normal_char;
1045	zeroset(ccl);
1046	for (c2 = 0; c2 < NOTCHAR; ++c2)
1047	if (ISSPACE(c2))
1048	setbit(c2, ccl);
1049	if (c == 'S')
1050	notset(ccl);
1051	laststart = 0;
1052	return lasttok = CSET + charclass_index(ccl);
1053	#endif
1054
1055	case 'w':
1056	case 'W':
1057	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
1058	goto normal_char;
1059	zeroset(ccl);
1060	for (c2 = 0; c2 < NOTCHAR; ++c2)
1061	if (IS_WORD_CONSTITUENT(c2))
1062	setbit(c2, ccl);
1063	if (c == 'W')
1064	notset(ccl);
1065	laststart = 0;
1066	return lasttok = CSET + charclass_index(ccl);
1067
1068	case '[':
1069	if (backslash)
1070	goto normal_char;
1071	laststart = 0;
1072	#ifdef MBS_SUPPORT
1073	if (MB_CUR_MAX > 1)
1074	{
1075	/* In multibyte environment a bracket expression may contain
1076	multibyte characters, which must be treated as characters
1077	(not bytes). So we parse it by parse_bracket_exp_mb(). */
1078	parse_bracket_exp_mb();
1079	return lasttok = MBCSET;
1080	}
1081	#endif
1082	zeroset(ccl);
1083	FETCH(c, _("Unbalanced ["));
1084	if (c == '^')
1085	{
1086	FETCH(c, _("Unbalanced ["));
1087	invert = 1;
1088	}
1089	else
1090	invert = 0;
1091	do
1092	{
1093	/* Nobody ever said this had to be fast. :-)
1094	Note that if we're looking at some other [:...:]
1095	construct, we just treat it as a bunch of ordinary
1096	characters. We can do this because we assume
1097	regex has checked for syntax errors before
1098	dfa is ever called. */
1099	if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1100	for (c1 = 0; prednames[c1].name; ++c1)
1101	if (looking_at(prednames[c1].name))
1102	{
1103	int (*pred) PARAMS ((int)) = prednames[c1].pred;
1104
1105	for (c2 = 0; c2 < NOTCHAR; ++c2)
1106	if ((*pred)(c2))
1107	setbit_case_fold (c2, ccl);
1108	lexptr += strlen(prednames[c1].name);
1109	lexleft -= strlen(prednames[c1].name);
1110	FETCH(c1, _("Unbalanced ["));
1111	goto skip;
1112	}
1113	if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1114	FETCH(c, _("Unbalanced ["));
1115	FETCH(c1, _("Unbalanced ["));
1116	if (c1 == '-')
1117	{
1118	FETCH(c2, _("Unbalanced ["));
1119	if (c2 == ']')
1120	{
1121	/* In the case [x-], the - is an ordinary hyphen,
1122	which is left in c1, the lookahead character. */
1123	--lexptr;
1124	++lexleft;
1125	}
1126	else
1127	{
1128	if (c2 == '\\'
1129	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1130	FETCH(c2, _("Unbalanced ["));
1131	FETCH(c1, _("Unbalanced ["));
1132	if (!hard_LC_COLLATE) {
1133	for (; c <= c2; c++)
1134	setbit_case_fold (c, ccl);
1135	} else {
1136	/* POSIX locales are painful - leave the decision to libc */
1137	regex_t re;
1138	char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
1139
1140	expr[0] = '['; expr[1] = c; expr[2] = '-';
1141	expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
1142	if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1143	for (c = 0; c < NOTCHAR; ++c) {
1144	regmatch_t mat;
1145	char buf[2]; /* = { c, '\0' }; */
1146
1147	buf[0] = c; buf[1] = '\0';
1148	if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1149	&& mat.rm_so == 0 && mat.rm_eo == 1)
1150	setbit_case_fold (c, ccl);
1151	}
1152	regfree (&re);
1153	}
1154	}
1155	continue;
1156	}
1157	}
1158
1159	setbit_case_fold (c, ccl);
1160
1161	skip:
1162	;
1163	}
1164	while ((c = c1) != ']');
1165	if (invert)
1166	{
1167	notset(ccl);
1168	if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1169	clrbit(eolbyte, ccl);
1170	}
1171	return lasttok = CSET + charclass_index(ccl);
1172
1173	default:
1174	normal_char:
1175	laststart = 0;
1176	if (case_fold && ISALPHA(c))
1177	{
1178	zeroset(ccl);
1179	setbit_case_fold (c, ccl);
1180	return lasttok = CSET + charclass_index(ccl);
1181	}
1182	return lasttok = c;
1183	}
1184	}
1185
1186	/* The above loop should consume at most a backslash
1187	and some other character. */
1188	abort();
1189	return END; /* keeps pedantic compilers happy. */
1190	}
1191
1192	/* Recursive descent parser for regular expressions. */
1193
1194	static token tok; /* Lookahead token. */
1195	static int depth; /* Current depth of a hypothetical stack
1196	holding deferred productions. This is
1197	used to determine the depth that will be
1198	required of the real stack later on in
1199	dfaanalyze(). */
1200
1201	/* Add the given token to the parse tree, maintaining the depth count and
1202	updating the maximum depth if necessary. */
1203	static void
1204	addtok (token t)
1205	{
1206	#ifdef MBS_SUPPORT
1207	if (MB_CUR_MAX > 1)
1208	{
1209	REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1210	dfa->tindex);
1211	/* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
1212	if (t == MBCSET)
1213	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1214	else if (t < NOTCHAR)
1215	dfa->multibyte_prop[dfa->tindex]
1216	= (cur_mb_len == 1)? 3 /* single-byte char */
1217	: (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1218	+ ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1219	else
1220	/* It may be unnecessary, but it is safer to treat other
1221	symbols as single byte characters. */
1222	dfa->multibyte_prop[dfa->tindex] = 3;
1223	}
1224	#endif
1225
1226	REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1227	dfa->tokens[dfa->tindex++] = t;
1228
1229	switch (t)
1230	{
1231	case QMARK:
1232	case STAR:
1233	case PLUS:
1234	break;
1235
1236	case CAT:
1237	case OR:
1238	case ORTOP:
1239	--depth;
1240	break;
1241
1242	default:
1243	++dfa->nleaves;
1244	case EMPTY:
1245	++depth;
1246	break;
1247	}
1248	if (depth > dfa->depth)
1249	dfa->depth = depth;
1250	}
1251
1252	/* The grammar understood by the parser is as follows.
1253
1254	regexp:
1255	regexp OR branch
1256	branch
1257
1258	branch:
1259	branch closure
1260	closure
1261
1262	closure:
1263	closure QMARK
1264	closure STAR
1265	closure PLUS
1266	closure REPMN
1267	atom
1268
1269	atom:
1270	<normal character>
1271	<multibyte character>
1272	ANYCHAR
1273	MBCSET
1274	CSET
1275	BACKREF
1276	BEGLINE
1277	ENDLINE
1278	BEGWORD
1279	ENDWORD
1280	LIMWORD
1281	NOTLIMWORD
1282	CRANGE
1283	LPAREN regexp RPAREN
1284	<empty>
1285
1286	The parser builds a parse tree in postfix form in an array of tokens. */
1287
1288	static void
1289	atom (void)
1290	{
1291	if ((tok >= 0 && tok < NOTCHAR) \|\| tok >= CSET \|\| tok == BACKREF
1292	\|\| tok == BEGLINE \|\| tok == ENDLINE \|\| tok == BEGWORD
1293	#ifdef MBS_SUPPORT
1294	\|\| tok == ANYCHAR \|\| tok == MBCSET /* MB_CUR_MAX > 1 */
1295	#endif /* MBS_SUPPORT */
1296	\|\| tok == ENDWORD \|\| tok == LIMWORD \|\| tok == NOTLIMWORD)
1297	{
1298	addtok(tok);
1299	tok = lex();
1300	#ifdef MBS_SUPPORT
1301	/* We treat a multibyte character as a single atom, so that DFA
1302	can treat a multibyte character as a single expression.
1303
1304	e.g. We construct following tree from "<mb1><mb2>".
1305	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1306	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1307	*/
1308	if (MB_CUR_MAX > 1)
1309	{
1310	while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1311	{
1312	addtok(tok);
1313	addtok(CAT);
1314	tok = lex();
1315	}
1316	}
1317	#endif /* MBS_SUPPORT */
1318	}
1319	else if (tok == CRANGE)
1320	{
1321	/* A character range like "[a-z]" in a locale other than "C" or
1322	"POSIX". This range might any sequence of one or more
1323	characters. Unfortunately the POSIX locale primitives give
1324	us no practical way to find what character sequences might be
1325	matched. Treat this approximately like "(.\1)" -- i.e. match
1326	one character, and then punt to the full matcher. */
1327	charclass ccl;
1328	zeroset (ccl);
1329	notset (ccl);
1330	addtok (CSET + charclass_index (ccl));
1331	addtok (BACKREF);
1332	addtok (CAT);
1333	tok = lex ();
1334	}
1335	else if (tok == LPAREN)
1336	{
1337	tok = lex();
1338	regexp(0);
1339	if (tok != RPAREN)
1340	dfaerror(_("Unbalanced ("));
1341	tok = lex();
1342	}
1343	else
1344	addtok(EMPTY);
1345	}
1346
1347	/* Return the number of tokens in the given subexpression. */
1348	static int
1349	nsubtoks (int tindex)
1350	{
1351	int ntoks1;
1352
1353	switch (dfa->tokens[tindex - 1])
1354	{
1355	default:
1356	return 1;
1357	case QMARK:
1358	case STAR:
1359	case PLUS:
1360	return 1 + nsubtoks(tindex - 1);
1361	case CAT:
1362	case OR:
1363	case ORTOP:
1364	ntoks1 = nsubtoks(tindex - 1);
1365	return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1366	}
1367	}
1368
1369	/* Copy the given subexpression to the top of the tree. */
1370	static void
1371	copytoks (int tindex, int ntokens)
1372	{
1373	int i;
1374
1375	for (i = 0; i < ntokens; ++i)
1376	addtok(dfa->tokens[tindex + i]);
1377	}
1378
1379	static void
1380	closure (void)
1381	{
1382	int tindex, ntokens, i;
1383
1384	atom();
1385	while (tok == QMARK \|\| tok == STAR \|\| tok == PLUS \|\| tok == REPMN)
1386	if (tok == REPMN)
1387	{
1388	ntokens = nsubtoks(dfa->tindex);
1389	tindex = dfa->tindex - ntokens;
1390	if (maxrep < 0)
1391	addtok(PLUS);
1392	if (minrep == 0)
1393	addtok(QMARK);
1394	for (i = 1; i < minrep; ++i)
1395	{
1396	copytoks(tindex, ntokens);
1397	addtok(CAT);
1398	}
1399	for (; i < maxrep; ++i)
1400	{
1401	copytoks(tindex, ntokens);
1402	addtok(QMARK);
1403	addtok(CAT);
1404	}
1405	tok = lex();
1406	}
1407	else
1408	{
1409	addtok(tok);
1410	tok = lex();
1411	}
1412	}
1413
1414	static void
1415	branch (void)
1416	{
1417	closure();
1418	while (tok != RPAREN && tok != OR && tok >= 0)
1419	{
1420	closure();
1421	addtok(CAT);
1422	}
1423	}
1424
1425	static void
1426	regexp (int toplevel)
1427	{
1428	branch();
1429	while (tok == OR)
1430	{
1431	tok = lex();
1432	branch();
1433	if (toplevel)
1434	addtok(ORTOP);
1435	else
1436	addtok(OR);
1437	}
1438	}
1439
1440	/* Main entry point for the parser. S is a string to be parsed, len is the
1441	length of the string, so s can include NUL characters. D is a pointer to
1442	the struct dfa to parse into. */
1443	void
1444	dfaparse (char const s, size_t len, struct dfa d)
1445	{
1446	dfa = d;
1447	lexptr = s;
1448	lexleft = len;
1449	lasttok = END;
1450	laststart = 1;
1451	parens = 0;
1452	#ifdef LC_COLLATE
1453	hard_LC_COLLATE = hard_locale (LC_COLLATE);
1454	#endif
1455	#ifdef MBS_SUPPORT
1456	if (MB_CUR_MAX > 1)
1457	{
1458	cur_mb_index = 0;
1459	cur_mb_len = 0;
1460	memset(&mbs, 0, sizeof(mbstate_t));
1461	}
1462	#endif /* MBS_SUPPORT */
1463
1464	if (! syntax_bits_set)
1465	dfaerror(_("No syntax specified"));
1466
1467	tok = lex();
1468	depth = d->depth;
1469
1470	regexp(1);
1471
1472	if (tok != END)
1473	dfaerror(_("Unbalanced )"));
1474
1475	addtok(END - d->nregexps);
1476	addtok(CAT);
1477
1478	if (d->nregexps)
1479	addtok(ORTOP);
1480
1481	++d->nregexps;
1482	}
1483
1484	/* Some primitives for operating on sets of positions. */
1485
1486	/* Copy one set to another; the destination must be large enough. */
1487	static void
1488	copy (position_set const src, position_set dst)
1489	{
1490	int i;
1491
1492	for (i = 0; i < src->nelem; ++i)
1493	dst->elems[i] = src->elems[i];
1494	dst->nelem = src->nelem;
1495	}
1496
1497	/* Insert a position in a set. Position sets are maintained in sorted
1498	order according to index. If position already exists in the set with
1499	the same index then their constraints are logically or'd together.
1500	S->elems must point to an array large enough to hold the resulting set. */
1501	static void
1502	insert (position p, position_set *s)
1503	{
1504	int i;
1505	position t1, t2;
1506
1507	for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1508	continue;
1509	if (i < s->nelem && p.index == s->elems[i].index)
1510	s->elems[i].constraint \|= p.constraint;
1511	else
1512	{
1513	t1 = p;
1514	++s->nelem;
1515	while (i < s->nelem)
1516	{
1517	t2 = s->elems[i];
1518	s->elems[i++] = t1;
1519	t1 = t2;
1520	}
1521	}
1522	}
1523
1524	/* Merge two sets of positions into a third. The result is exactly as if
1525	the positions of both sets were inserted into an initially empty set. */
1526	static void
1527	merge (position_set const s1, position_set const s2, position_set *m)
1528	{
1529	int i = 0, j = 0;
1530
1531	m->nelem = 0;
1532	while (i < s1->nelem && j < s2->nelem)
1533	if (s1->elems[i].index > s2->elems[j].index)
1534	m->elems[m->nelem++] = s1->elems[i++];
1535	else if (s1->elems[i].index < s2->elems[j].index)
1536	m->elems[m->nelem++] = s2->elems[j++];
1537	else
1538	{
1539	m->elems[m->nelem] = s1->elems[i++];
1540	m->elems[m->nelem++].constraint \|= s2->elems[j++].constraint;
1541	}
1542	while (i < s1->nelem)
1543	m->elems[m->nelem++] = s1->elems[i++];
1544	while (j < s2->nelem)
1545	m->elems[m->nelem++] = s2->elems[j++];
1546	}
1547
1548	/* Delete a position from a set. */
1549	static void
1550	delete (position p, position_set *s)
1551	{
1552	int i;
1553
1554	for (i = 0; i < s->nelem; ++i)
1555	if (p.index == s->elems[i].index)
1556	break;
1557	if (i < s->nelem)
1558	for (--s->nelem; i < s->nelem; ++i)
1559	s->elems[i] = s->elems[i + 1];
1560	}
1561
1562	/* Find the index of the state corresponding to the given position set with
1563	the given preceding context, or create a new state if there is no such
1564	state. Newline and letter tell whether we got here on a newline or
1565	letter, respectively. */
1566	static int
1567	state_index (struct dfa d, position_set const s, int newline, int letter)
1568	{
1569	int hash = 0;
1570	int constraint;
1571	int i, j;
1572
1573	newline = newline ? 1 : 0;
1574	letter = letter ? 1 : 0;
1575
1576	for (i = 0; i < s->nelem; ++i)
1577	hash ^= s->elems[i].index + s->elems[i].constraint;
1578
1579	/* Try to find a state that exactly matches the proposed one. */
1580	for (i = 0; i < d->sindex; ++i)
1581	{
1582	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
1583	\|\| newline != d->states[i].newline \|\| letter != d->states[i].letter)
1584	continue;
1585	for (j = 0; j < s->nelem; ++j)
1586	if (s->elems[j].constraint
1587	!= d->states[i].elems.elems[j].constraint
1588	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
1589	break;
1590	if (j == s->nelem)
1591	return i;
1592	}
1593
1594	/* We'll have to create a new state. */
1595	REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1596	d->states[i].hash = hash;
1597	MALLOC(d->states[i].elems.elems, position, s->nelem);
1598	copy(s, &d->states[i].elems);
1599	d->states[i].newline = newline;
1600	d->states[i].letter = letter;
1601	d->states[i].backref = 0;
1602	d->states[i].constraint = 0;
1603	d->states[i].first_end = 0;
1604	#ifdef MBS_SUPPORT
1605	if (MB_CUR_MAX > 1)
1606	d->states[i].mbps.nelem = 0;
1607	#endif
1608	for (j = 0; j < s->nelem; ++j)
1609	if (d->tokens[s->elems[j].index] < 0)
1610	{
1611	constraint = s->elems[j].constraint;
1612	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1613	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1614	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1615	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1616	d->states[i].constraint \|= constraint;
1617	if (! d->states[i].first_end)
1618	d->states[i].first_end = d->tokens[s->elems[j].index];
1619	}
1620	else if (d->tokens[s->elems[j].index] == BACKREF)
1621	{
1622	d->states[i].constraint = NO_CONSTRAINT;
1623	d->states[i].backref = 1;
1624	}
1625
1626	++d->sindex;
1627
1628	return i;
1629	}
1630
1631	/* Find the epsilon closure of a set of positions. If any position of the set
1632	contains a symbol that matches the empty string in some context, replace
1633	that position with the elements of its follow labeled with an appropriate
1634	constraint. Repeat exhaustively until no funny positions are left.
1635	S->elems must be large enough to hold the result. */
1636	static void
1637	epsclosure (position_set s, struct dfa const d)
1638	{
1639	int i, j;
1640	int *visited;
1641	position p, old;
1642
1643	MALLOC(visited, int, d->tindex);
1644	for (i = 0; i < d->tindex; ++i)
1645	visited[i] = 0;
1646
1647	for (i = 0; i < s->nelem; ++i)
1648	if (d->tokens[s->elems[i].index] >= NOTCHAR
1649	&& d->tokens[s->elems[i].index] != BACKREF
1650	#ifdef MBS_SUPPORT
1651	&& d->tokens[s->elems[i].index] != ANYCHAR
1652	&& d->tokens[s->elems[i].index] != MBCSET
1653	#endif
1654	&& d->tokens[s->elems[i].index] < CSET)
1655	{
1656	old = s->elems[i];
1657	p.constraint = old.constraint;
1658	delete(s->elems[i], s);
1659	if (visited[old.index])
1660	{
1661	--i;
1662	continue;
1663	}
1664	visited[old.index] = 1;
1665	switch (d->tokens[old.index])
1666	{
1667	case BEGLINE:
1668	p.constraint &= BEGLINE_CONSTRAINT;
1669	break;
1670	case ENDLINE:
1671	p.constraint &= ENDLINE_CONSTRAINT;
1672	break;
1673	case BEGWORD:
1674	p.constraint &= BEGWORD_CONSTRAINT;
1675	break;
1676	case ENDWORD:
1677	p.constraint &= ENDWORD_CONSTRAINT;
1678	break;
1679	case LIMWORD:
1680	p.constraint &= LIMWORD_CONSTRAINT;
1681	break;
1682	case NOTLIMWORD:
1683	p.constraint &= NOTLIMWORD_CONSTRAINT;
1684	break;
1685	default:
1686	break;
1687	}
1688	for (j = 0; j < d->follows[old.index].nelem; ++j)
1689	{
1690	p.index = d->follows[old.index].elems[j].index;
1691	insert(p, s);
1692	}
1693	/* Force rescan to start at the beginning. */
1694	i = -1;
1695	}
1696
1697	free(visited);
1698	}
1699
1700	/* Perform bottom-up analysis on the parse tree, computing various functions.
1701	Note that at this point, we're pretending constructs like \< are real
1702	characters rather than constraints on what can follow them.
1703
1704	Nullable: A node is nullable if it is at the root of a regexp that can
1705	match the empty string.
1706	* EMPTY leaves are nullable.
1707	* No other leaf is nullable.
1708	* A QMARK or STAR node is nullable.
1709	* A PLUS node is nullable if its argument is nullable.
1710	* A CAT node is nullable if both its arguments are nullable.
1711	* An OR node is nullable if either argument is nullable.
1712
1713	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1714	that could correspond to the first character of a string matching the
1715	regexp rooted at the given node.
1716	* EMPTY leaves have empty firstpos.
1717	* The firstpos of a nonempty leaf is that leaf itself.
1718	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1719	argument.
1720	* The firstpos of a CAT node is the firstpos of the left argument, union
1721	the firstpos of the right if the left argument is nullable.
1722	* The firstpos of an OR node is the union of firstpos of each argument.
1723
1724	Lastpos: The lastpos of a node is the set of positions that could
1725	correspond to the last character of a string matching the regexp at
1726	the given node.
1727	* EMPTY leaves have empty lastpos.
1728	* The lastpos of a nonempty leaf is that leaf itself.
1729	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1730	argument.
1731	* The lastpos of a CAT node is the lastpos of its right argument, union
1732	the lastpos of the left if the right argument is nullable.
1733	* The lastpos of an OR node is the union of the lastpos of each argument.
1734
1735	Follow: The follow of a position is the set of positions that could
1736	correspond to the character following a character matching the node in
1737	a string matching the regexp. At this point we consider special symbols
1738	that match the empty string in some context to be just normal characters.
1739	Later, if we find that a special symbol is in a follow set, we will
1740	replace it with the elements of its follow, labeled with an appropriate
1741	constraint.
1742	* Every node in the firstpos of the argument of a STAR or PLUS node is in
1743	the follow of every node in the lastpos.
1744	* Every node in the firstpos of the second argument of a CAT node is in
1745	the follow of every node in the lastpos of the first argument.
1746
1747	Because of the postfix representation of the parse tree, the depth-first
1748	analysis is conveniently done by a linear scan with the aid of a stack.
1749	Sets are stored as arrays of the elements, obeying a stack-like allocation
1750	scheme; the number of elements in each set deeper in the stack can be
1751	used to determine the address of a particular set's array. */
1752	void
1753	dfaanalyze (struct dfa *d, int searchflag)
1754	{
1755	int nullable; / Nullable stack. */
1756	int nfirstpos; / Element count stack for firstpos sets. */
1757	position firstpos; / Array where firstpos elements are stored. */
1758	int nlastpos; / Element count stack for lastpos sets. */
1759	position lastpos; / Array where lastpos elements are stored. */
1760	int nalloc; / Sizes of arrays allocated to follow sets. */
1761	position_set tmp; /* Temporary set for merging sets. */
1762	position_set merged; /* Result of merging sets. */
1763	int wants_newline; /* True if some position wants newline info. */
1764	int *o_nullable;
1765	int o_nfirst, o_nlast;
1766	position o_firstpos, o_lastpos;
1767	int i, j;
1768	position *pos;
1769
1770	#ifdef DEBUG
1771	fprintf(stderr, "dfaanalyze:\n");
1772	for (i = 0; i < d->tindex; ++i)
1773	{
1774	fprintf(stderr, " %d:", i);
1775	prtok(d->tokens[i]);
1776	}
1777	putc('\n', stderr);
1778	#endif
1779
1780	d->searchflag = searchflag;
1781
1782	MALLOC(nullable, int, d->depth);
1783	o_nullable = nullable;
1784	MALLOC(nfirstpos, int, d->depth);
1785	o_nfirst = nfirstpos;
1786	MALLOC(firstpos, position, d->nleaves);
1787	o_firstpos = firstpos, firstpos += d->nleaves;
1788	MALLOC(nlastpos, int, d->depth);
1789	o_nlast = nlastpos;
1790	MALLOC(lastpos, position, d->nleaves);
1791	o_lastpos = lastpos, lastpos += d->nleaves;
1792	MALLOC(nalloc, int, d->tindex);
1793	for (i = 0; i < d->tindex; ++i)
1794	nalloc[i] = 0;
1795	MALLOC(merged.elems, position, d->nleaves);
1796
1797	CALLOC(d->follows, position_set, d->tindex);
1798
1799	for (i = 0; i < d->tindex; ++i)
1800	#ifdef DEBUG
1801	{ /* Nonsyntactic #ifdef goo... */
1802	#endif
1803	switch (d->tokens[i])
1804	{
1805	case EMPTY:
1806	/* The empty set is nullable. */
1807	*nullable++ = 1;
1808
1809	/* The firstpos and lastpos of the empty leaf are both empty. */
1810	nfirstpos++ = nlastpos++ = 0;
1811	break;
1812
1813	case STAR:
1814	case PLUS:
1815	/* Every element in the firstpos of the argument is in the follow
1816	of every element in the lastpos. */
1817	tmp.nelem = nfirstpos[-1];
1818	tmp.elems = firstpos;
1819	pos = lastpos;
1820	for (j = 0; j < nlastpos[-1]; ++j)
1821	{
1822	merge(&tmp, &d->follows[pos[j].index], &merged);
1823	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1824	nalloc[pos[j].index], merged.nelem - 1);
1825	copy(&merged, &d->follows[pos[j].index]);
1826	}
1827
1828	case QMARK:
1829	/* A QMARK or STAR node is automatically nullable. */
1830	if (d->tokens[i] != PLUS)
1831	nullable[-1] = 1;
1832	break;
1833
1834	case CAT:
1835	/* Every element in the firstpos of the second argument is in the
1836	follow of every element in the lastpos of the first argument. */
1837	tmp.nelem = nfirstpos[-1];
1838	tmp.elems = firstpos;
1839	pos = lastpos + nlastpos[-1];
1840	for (j = 0; j < nlastpos[-2]; ++j)
1841	{
1842	merge(&tmp, &d->follows[pos[j].index], &merged);
1843	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1844	nalloc[pos[j].index], merged.nelem - 1);
1845	copy(&merged, &d->follows[pos[j].index]);
1846	}
1847
1848	/* The firstpos of a CAT node is the firstpos of the first argument,
1849	union that of the second argument if the first is nullable. */
1850	if (nullable[-2])
1851	nfirstpos[-2] += nfirstpos[-1];
1852	else
1853	firstpos += nfirstpos[-1];
1854	--nfirstpos;
1855
1856	/* The lastpos of a CAT node is the lastpos of the second argument,
1857	union that of the first argument if the second is nullable. */
1858	if (nullable[-1])
1859	nlastpos[-2] += nlastpos[-1];
1860	else
1861	{
1862	pos = lastpos + nlastpos[-2];
1863	for (j = nlastpos[-1] - 1; j >= 0; --j)
1864	pos[j] = lastpos[j];
1865	lastpos += nlastpos[-2];
1866	nlastpos[-2] = nlastpos[-1];
1867	}
1868	--nlastpos;
1869
1870	/* A CAT node is nullable if both arguments are nullable. */
1871	nullable[-2] = nullable[-1] && nullable[-2];
1872	--nullable;
1873	break;
1874
1875	case OR:
1876	case ORTOP:
1877	/* The firstpos is the union of the firstpos of each argument. */
1878	nfirstpos[-2] += nfirstpos[-1];
1879	--nfirstpos;
1880
1881	/* The lastpos is the union of the lastpos of each argument. */
1882	nlastpos[-2] += nlastpos[-1];
1883	--nlastpos;
1884
1885	/* An OR node is nullable if either argument is nullable. */
1886	nullable[-2] = nullable[-1] \|\| nullable[-2];
1887	--nullable;
1888	break;
1889
1890	default:
1891	/* Anything else is a nonempty position. (Note that special
1892	constructs like \< are treated as nonempty strings here;
1893	an "epsilon closure" effectively makes them nullable later.
1894	Backreferences have to get a real position so we can detect
1895	transitions on them later. But they are nullable. */
1896	*nullable++ = d->tokens[i] == BACKREF;
1897
1898	/* This position is in its own firstpos and lastpos. */
1899	nfirstpos++ = nlastpos++ = 1;
1900	--firstpos, --lastpos;
1901	firstpos->index = lastpos->index = i;
1902	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1903
1904	/* Allocate the follow set for this position. */
1905	nalloc[i] = 1;
1906	MALLOC(d->follows[i].elems, position, nalloc[i]);
1907	break;
1908	}
1909	#ifdef DEBUG
1910	/* ... balance the above nonsyntactic #ifdef goo... */
1911	fprintf(stderr, "node %d:", i);
1912	prtok(d->tokens[i]);
1913	putc('\n', stderr);
1914	fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1915	fprintf(stderr, " firstpos:");
1916	for (j = nfirstpos[-1] - 1; j >= 0; --j)
1917	{
1918	fprintf(stderr, " %d:", firstpos[j].index);
1919	prtok(d->tokens[firstpos[j].index]);
1920	}
1921	fprintf(stderr, "\n lastpos:");
1922	for (j = nlastpos[-1] - 1; j >= 0; --j)
1923	{
1924	fprintf(stderr, " %d:", lastpos[j].index);
1925	prtok(d->tokens[lastpos[j].index]);
1926	}
1927	putc('\n', stderr);
1928	}
1929	#endif
1930
1931	/* For each follow set that is the follow set of a real position, replace
1932	it with its epsilon closure. */
1933	for (i = 0; i < d->tindex; ++i)
1934	if (d->tokens[i] < NOTCHAR \|\| d->tokens[i] == BACKREF
1935	#ifdef MBS_SUPPORT
1936	\|\| d->tokens[i] == ANYCHAR
1937	\|\| d->tokens[i] == MBCSET
1938	#endif
1939	\|\| d->tokens[i] >= CSET)
1940	{
1941	#ifdef DEBUG
1942	fprintf(stderr, "follows(%d:", i);
1943	prtok(d->tokens[i]);
1944	fprintf(stderr, "):");
1945	for (j = d->follows[i].nelem - 1; j >= 0; --j)
1946	{
1947	fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1948	prtok(d->tokens[d->follows[i].elems[j].index]);
1949	}
1950	putc('\n', stderr);
1951	#endif
1952	copy(&d->follows[i], &merged);
1953	epsclosure(&merged, d);
1954	if (d->follows[i].nelem < merged.nelem)
1955	REALLOC(d->follows[i].elems, position, merged.nelem);
1956	copy(&merged, &d->follows[i]);
1957	}
1958
1959	/* Get the epsilon closure of the firstpos of the regexp. The result will
1960	be the set of positions of state 0. */
1961	merged.nelem = 0;
1962	for (i = 0; i < nfirstpos[-1]; ++i)
1963	insert(firstpos[i], &merged);
1964	epsclosure(&merged, d);
1965
1966	/* Check if any of the positions of state 0 will want newline context. */
1967	wants_newline = 0;
1968	for (i = 0; i < merged.nelem; ++i)
1969	if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1970	wants_newline = 1;
1971
1972	/* Build the initial state. */
1973	d->salloc = 1;
1974	d->sindex = 0;
1975	MALLOC(d->states, dfa_state, d->salloc);
1976	state_index(d, &merged, wants_newline, 0);
1977
1978	free(o_nullable);
1979	free(o_nfirst);
1980	free(o_firstpos);
1981	free(o_nlast);
1982	free(o_lastpos);
1983	free(nalloc);
1984	free(merged.elems);
1985	}
1986
1987	/* Find, for each character, the transition out of state s of d, and store
1988	it in the appropriate slot of trans.
1989
1990	We divide the positions of s into groups (positions can appear in more
1991	than one group). Each group is labeled with a set of characters that
1992	every position in the group matches (taking into account, if necessary,
1993	preceding context information of s). For each group, find the union
1994	of the its elements' follows. This set is the set of positions of the
1995	new state. For each character in the group's label, set the transition
1996	on this character to be to a state corresponding to the set's positions,
1997	and its associated backward context information, if necessary.
1998
1999	If we are building a searching matcher, we include the positions of state
2000	0 in every state.
2001
2002	The collection of groups is constructed by building an equivalence-class
2003	partition of the positions of s.
2004
2005	For each position, find the set of characters C that it matches. Eliminate
2006	any characters from C that fail on grounds of backward context.
2007
2008	Search through the groups, looking for a group whose label L has nonempty
2009	intersection with C. If L - C is nonempty, create a new group labeled
2010	L - C and having the same positions as the current group, and set L to
2011	the intersection of L and C. Insert the position in this group, set
2012	C = C - L, and resume scanning.
2013
2014	If after comparing with every group there are characters remaining in C,
2015	create a new group labeled with the characters of C and insert this
2016	position in that group. */
2017	void
2018	dfastate (int s, struct dfa *d, int trans[])
2019	{
2020	position_set grps[NOTCHAR]; /* As many as will ever be needed. */
2021	charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
2022	int ngrps = 0; /* Number of groups actually used. */
2023	position pos; /* Current position being considered. */
2024	charclass matches; /* Set of matching characters. */
2025	int matchesf; /* True if matches is nonempty. */
2026	charclass intersect; /* Intersection with some label set. */
2027	int intersectf; /* True if intersect is nonempty. */
2028	charclass leftovers; /* Stuff in the label that didn't match. */
2029	int leftoversf; /* True if leftovers is nonempty. */
2030	static charclass letters; /* Set of characters considered letters. */
2031	static charclass newline; /* Set of characters that aren't newline. */
2032	position_set follows; /* Union of the follows of some group. */
2033	position_set tmp; /* Temporary space for merging sets. */
2034	int state; /* New state. */
2035	int wants_newline; /* New state wants to know newline context. */
2036	int state_newline; /* New state on a newline transition. */
2037	int wants_letter; /* New state wants to know letter context. */
2038	int state_letter; /* New state on a letter transition. */
2039	static int initialized; /* Flag for static initialization. */
2040	#ifdef MBS_SUPPORT
2041	int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2042	#endif
2043	int i, j, k;
2044
2045	/* Initialize the set of letters, if necessary. */
2046	if (! initialized)
2047	{
2048	initialized = 1;
2049	for (i = 0; i < NOTCHAR; ++i)
2050	if (IS_WORD_CONSTITUENT(i))
2051	setbit(i, letters);
2052	setbit(eolbyte, newline);
2053	}
2054
2055	zeroset(matches);
2056
2057	for (i = 0; i < d->states[s].elems.nelem; ++i)
2058	{
2059	pos = d->states[s].elems.elems[i];
2060	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2061	setbit(d->tokens[pos.index], matches);
2062	else if (d->tokens[pos.index] >= CSET)
2063	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
2064	#ifdef MBS_SUPPORT
2065	else if (d->tokens[pos.index] == ANYCHAR
2066	\|\| d->tokens[pos.index] == MBCSET)
2067	/* MB_CUR_MAX > 1 */
2068	{
2069	/* ANYCHAR and MBCSET must match with a single character, so we
2070	must put it to d->states[s].mbps, which contains the positions
2071	which can match with a single character not a byte. */
2072	if (d->states[s].mbps.nelem == 0)
2073	{
2074	MALLOC(d->states[s].mbps.elems, position,
2075	d->states[s].elems.nelem);
2076	}
2077	insert(pos, &(d->states[s].mbps));
2078	continue;
2079	}
2080	#endif /* MBS_SUPPORT */
2081	else
2082	continue;
2083
2084	/* Some characters may need to be eliminated from matches because
2085	they fail in the current context. */
2086	if (pos.constraint != 0xFF)
2087	{
2088	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2089	d->states[s].newline, 1))
2090	clrbit(eolbyte, matches);
2091	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2092	d->states[s].newline, 0))
2093	for (j = 0; j < CHARCLASS_INTS; ++j)
2094	matches[j] &= newline[j];
2095	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2096	d->states[s].letter, 1))
2097	for (j = 0; j < CHARCLASS_INTS; ++j)
2098	matches[j] &= ~letters[j];
2099	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2100	d->states[s].letter, 0))
2101	for (j = 0; j < CHARCLASS_INTS; ++j)
2102	matches[j] &= letters[j];
2103
2104	/* If there are no characters left, there's no point in going on. */
2105	for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2106	continue;
2107	if (j == CHARCLASS_INTS)
2108	continue;
2109	}
2110
2111	for (j = 0; j < ngrps; ++j)
2112	{
2113	/* If matches contains a single character only, and the current
2114	group's label doesn't contain that character, go on to the
2115	next group. */
2116	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2117	&& !tstbit(d->tokens[pos.index], labels[j]))
2118	continue;
2119
2120	/* Check if this group's label has a nonempty intersection with
2121	matches. */
2122	intersectf = 0;
2123	for (k = 0; k < CHARCLASS_INTS; ++k)
2124	(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2125	if (! intersectf)
2126	continue;
2127
2128	/* It does; now find the set differences both ways. */
2129	leftoversf = matchesf = 0;
2130	for (k = 0; k < CHARCLASS_INTS; ++k)
2131	{
2132	/* Even an optimizing compiler can't know this for sure. */
2133	int match = matches[k], label = labels[j][k];
2134
2135	(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2136	(matches[k] = match & ~label) ? (matchesf = 1) : 0;
2137	}
2138
2139	/* If there were leftovers, create a new group labeled with them. */
2140	if (leftoversf)
2141	{
2142	copyset(leftovers, labels[ngrps]);
2143	copyset(intersect, labels[j]);
2144	MALLOC(grps[ngrps].elems, position, d->nleaves);
2145	copy(&grps[j], &grps[ngrps]);
2146	++ngrps;
2147	}
2148
2149	/* Put the position in the current group. Note that there is no
2150	reason to call insert() here. */
2151	grps[j].elems[grps[j].nelem++] = pos;
2152
2153	/* If every character matching the current position has been
2154	accounted for, we're done. */
2155	if (! matchesf)
2156	break;
2157	}
2158
2159	/* If we've passed the last group, and there are still characters
2160	unaccounted for, then we'll have to create a new group. */
2161	if (j == ngrps)
2162	{
2163	copyset(matches, labels[ngrps]);
2164	zeroset(matches);
2165	MALLOC(grps[ngrps].elems, position, d->nleaves);
2166	grps[ngrps].nelem = 1;
2167	grps[ngrps].elems[0] = pos;
2168	++ngrps;
2169	}
2170	}
2171
2172	MALLOC(follows.elems, position, d->nleaves);
2173	MALLOC(tmp.elems, position, d->nleaves);
2174
2175	/* If we are a searching matcher, the default transition is to a state
2176	containing the positions of state 0, otherwise the default transition
2177	is to fail miserably. */
2178	if (d->searchflag)
2179	{
2180	wants_newline = 0;
2181	wants_letter = 0;
2182	for (i = 0; i < d->states[0].elems.nelem; ++i)
2183	{
2184	if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2185	wants_newline = 1;
2186	if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2187	wants_letter = 1;
2188	}
2189	copy(&d->states[0].elems, &follows);
2190	state = state_index(d, &follows, 0, 0);
2191	if (wants_newline)
2192	state_newline = state_index(d, &follows, 1, 0);
2193	else
2194	state_newline = state;
2195	if (wants_letter)
2196	state_letter = state_index(d, &follows, 0, 1);
2197	else
2198	state_letter = state;
2199	for (i = 0; i < NOTCHAR; ++i)
2200	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2201	trans[eolbyte] = state_newline;
2202	}
2203	else
2204	for (i = 0; i < NOTCHAR; ++i)
2205	trans[i] = -1;
2206
2207	for (i = 0; i < ngrps; ++i)
2208	{
2209	follows.nelem = 0;
2210
2211	/* Find the union of the follows of the positions of the group.
2212	This is a hideously inefficient loop. Fix it someday. */
2213	for (j = 0; j < grps[i].nelem; ++j)
2214	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2215	insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2216
2217	#ifdef MBS_SUPPORT
2218	if (MB_CUR_MAX > 1)
2219	{
2220	/* If a token in follows.elems is not 1st byte of a multibyte
2221	character, or the states of follows must accept the bytes
2222	which are not 1st byte of the multibyte character.
2223	Then, if a state of follows encounter a byte, it must not be
2224	a 1st byte of a multibyte character nor single byte character.
2225	We cansel to add state[0].follows to next state, because
2226	state[0] must accept 1st-byte
2227
2228	For example, we assume <sb a> is a certain single byte
2229	character, <mb A> is a certain multibyte character, and the
2230	codepoint of <sb a> equals the 2nd byte of the codepoint of
2231	<mb A>.
2232	When state[0] accepts <sb a>, state[i] transit to state[i+1]
2233	by accepting accepts 1st byte of <mb A>, and state[i+1]
2234	accepts 2nd byte of <mb A>, if state[i+1] encounter the
2235	codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2236	<mb A>, so we can not add state[0]. */
2237
2238	next_isnt_1st_byte = 0;
2239	for (j = 0; j < follows.nelem; ++j)
2240	{
2241	if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2242	{
2243	next_isnt_1st_byte = 1;
2244	break;
2245	}
2246	}
2247	}
2248	#endif
2249
2250	/* If we are building a searching matcher, throw in the positions
2251	of state 0 as well. */
2252	#ifdef MBS_SUPPORT
2253	if (d->searchflag && (MB_CUR_MAX == 1 \|\| !next_isnt_1st_byte))
2254	#else
2255	if (d->searchflag)
2256	#endif
2257	for (j = 0; j < d->states[0].elems.nelem; ++j)
2258	insert(d->states[0].elems.elems[j], &follows);
2259
2260	/* Find out if the new state will want any context information. */
2261	wants_newline = 0;
2262	if (tstbit(eolbyte, labels[i]))
2263	for (j = 0; j < follows.nelem; ++j)
2264	if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2265	wants_newline = 1;
2266
2267	wants_letter = 0;
2268	for (j = 0; j < CHARCLASS_INTS; ++j)
2269	if (labels[i][j] & letters[j])
2270	break;
2271	if (j < CHARCLASS_INTS)
2272	for (j = 0; j < follows.nelem; ++j)
2273	if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2274	wants_letter = 1;
2275
2276	/* Find the state(s) corresponding to the union of the follows. */
2277	state = state_index(d, &follows, 0, 0);
2278	if (wants_newline)
2279	state_newline = state_index(d, &follows, 1, 0);
2280	else
2281	state_newline = state;
2282	if (wants_letter)
2283	state_letter = state_index(d, &follows, 0, 1);
2284	else
2285	state_letter = state;
2286
2287	/* Set the transitions for each character in the current label. */
2288	for (j = 0; j < CHARCLASS_INTS; ++j)
2289	for (k = 0; k < INTBITS; ++k)
2290	if (labels[i][j] & 1 << k)
2291	{
2292	int c = j * INTBITS + k;
2293
2294	if (c == eolbyte)
2295	trans[c] = state_newline;
2296	else if (IS_WORD_CONSTITUENT(c))
2297	trans[c] = state_letter;
2298	else if (c < NOTCHAR)
2299	trans[c] = state;
2300	}
2301	}
2302
2303	for (i = 0; i < ngrps; ++i)
2304	free(grps[i].elems);
2305	free(follows.elems);
2306	free(tmp.elems);
2307	}
2308
2309	/* Some routines for manipulating a compiled dfa's transition tables.
2310	Each state may or may not have a transition table; if it does, and it
2311	is a non-accepting state, then d->trans[state] points to its table.
2312	If it is an accepting state then d->fails[state] points to its table.
2313	If it has no table at all, then d->trans[state] is NULL.
2314	TODO: Improve this comment, get rid of the unnecessary redundancy. */
2315
2316	static void
2317	build_state (int s, struct dfa *d)
2318	{
2319	int trans; / The new transition table. */
2320	int i;
2321
2322	/* Set an upper limit on the number of transition tables that will ever
2323	exist at once. 1024 is arbitrary. The idea is that the frequently
2324	used transition tables will be quickly rebuilt, whereas the ones that
2325	were only needed once or twice will be cleared away. */
2326	if (d->trcount >= 1024)
2327	{
2328	for (i = 0; i < d->tralloc; ++i)
2329	if (d->trans[i])
2330	{
2331	free((ptr_t) d->trans[i]);
2332	d->trans[i] = NULL;
2333	}
2334	else if (d->fails[i])
2335	{
2336	free((ptr_t) d->fails[i]);
2337	d->fails[i] = NULL;
2338	}
2339	d->trcount = 0;
2340	}
2341
2342	++d->trcount;
2343
2344	/* Set up the success bits for this state. */
2345	d->success[s] = 0;
2346	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2347	s, *d))
2348	d->success[s] \|= 4;
2349	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2350	s, *d))
2351	d->success[s] \|= 2;
2352	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2353	s, *d))
2354	d->success[s] \|= 1;
2355
2356	MALLOC(trans, int, NOTCHAR);
2357	dfastate(s, d, trans);
2358
2359	/* Now go through the new transition table, and make sure that the trans
2360	and fail arrays are allocated large enough to hold a pointer for the
2361	largest state mentioned in the table. */
2362	for (i = 0; i < NOTCHAR; ++i)
2363	if (trans[i] >= d->tralloc)
2364	{
2365	int oldalloc = d->tralloc;
2366
2367	while (trans[i] >= d->tralloc)
2368	d->tralloc *= 2;
2369	REALLOC(d->realtrans, int *, d->tralloc + 1);
2370	d->trans = d->realtrans + 1;
2371	REALLOC(d->fails, int *, d->tralloc);
2372	REALLOC(d->success, int, d->tralloc);
2373	REALLOC(d->newlines, int, d->tralloc);
2374	while (oldalloc < d->tralloc)
2375	{
2376	d->trans[oldalloc] = NULL;
2377	d->fails[oldalloc++] = NULL;
2378	}
2379	}
2380
2381	/* Keep the newline transition in a special place so we can use it as
2382	a sentinel. */
2383	d->newlines[s] = trans[eolbyte];
2384	trans[eolbyte] = -1;
2385
2386	if (ACCEPTING(s, *d))
2387	d->fails[s] = trans;
2388	else
2389	d->trans[s] = trans;
2390	}
2391
2392	static void
2393	build_state_zero (struct dfa *d)
2394	{
2395	d->tralloc = 1;
2396	d->trcount = 0;
2397	CALLOC(d->realtrans, int *, d->tralloc + 1);
2398	d->trans = d->realtrans + 1;
2399	CALLOC(d->fails, int *, d->tralloc);
2400	MALLOC(d->success, int, d->tralloc);
2401	MALLOC(d->newlines, int, d->tralloc);
2402	build_state(0, d);
2403	}
2404
2405	#ifdef MBS_SUPPORT
2406	/* Multibyte character handling sub-routines for dfaexec. */
2407
2408	/* Initial state may encounter the byte which is not a single byte character
2409	nor 1st byte of a multibyte character. But it is incorrect for initial
2410	state to accept such a byte.
2411	For example, in sjis encoding the regular expression like "\\" accepts
2412	the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2413	0x815c. Then Initial state must skip the bytes which are not a single byte
2414	character nor 1st byte of a multibyte character. */
2415	#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2416	if (s == 0) \
2417	{ \
2418	while (inputwcs[p - buf_begin] == 0 \
2419	&& mblen_buf[p - buf_begin] > 0 \
2420	&& (unsigned char const *)p < buf_end) \
2421	++p; \
2422	if ((char *)p >= end) \
2423	{ \
2424	free(mblen_buf); \
2425	free(inputwcs); \
2426	return NULL; \
2427	} \
2428	}
2429
2430	static void
2431	realloc_trans_if_necessary(struct dfa *d, int new_state)
2432	{
2433	/* Make sure that the trans and fail arrays are allocated large enough
2434	to hold a pointer for the new state. */
2435	if (new_state >= d->tralloc)
2436	{
2437	int oldalloc = d->tralloc;
2438
2439	while (new_state >= d->tralloc)
2440	d->tralloc *= 2;
2441	REALLOC(d->realtrans, int *, d->tralloc + 1);
2442	d->trans = d->realtrans + 1;
2443	REALLOC(d->fails, int *, d->tralloc);
2444	REALLOC(d->success, int, d->tralloc);
2445	REALLOC(d->newlines, int, d->tralloc);
2446	while (oldalloc < d->tralloc)
2447	{
2448	d->trans[oldalloc] = NULL;
2449	d->fails[oldalloc++] = NULL;
2450	}
2451	}
2452	}
2453
2454	/* Return values of transit_state_singlebyte(), and
2455	transit_state_consume_1char. */
2456	typedef enum
2457	{
2458	TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2459	TRANSIT_STATE_DONE, /* State transition has finished. */
2460	TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2461	} status_transit_state;
2462
2463	/* Consume a single byte and transit state from 's' to '*next_state'.
2464	This function is almost same as the state transition routin in dfaexec().
2465	But state transition is done just once, otherwise matching succeed or
2466	reach the end of the buffer. */
2467	static status_transit_state
2468	transit_state_singlebyte (struct dfa d, int s, unsigned char const p,
2469	int *next_state)
2470	{
2471	int *t;
2472	int works = s;
2473
2474	status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2475
2476	while (rval == TRANSIT_STATE_IN_PROGRESS)
2477	{
2478	if ((t = d->trans[works]) != NULL)
2479	{
2480	works = t[*p];
2481	rval = TRANSIT_STATE_DONE;
2482	if (works < 0)
2483	works = 0;
2484	}
2485	else if (works < 0)
2486	{
2487	if (p == buf_end)
2488	/* At the moment, it must not happen. */
2489	return TRANSIT_STATE_END_BUFFER;
2490	works = 0;
2491	}
2492	else if (d->fails[works])
2493	{
2494	works = d->fails[works][*p];
2495	rval = TRANSIT_STATE_DONE;
2496	}
2497	else
2498	{
2499	build_state(works, d);
2500	}
2501	}
2502	*next_state = works;
2503	return rval;
2504	}
2505
2506	/* Check whether period can match or not in the current context. If it can,
2507	return the amount of the bytes with which period can match, otherwise
2508	return 0.
2509	`pos' is the position of the period. `index' is the index from the
2510	buf_begin, and it is the current position in the buffer. */
2511	static int
2512	match_anychar (struct dfa *d, int s, position pos, int index)
2513	{
2514	int newline = 0;
2515	int letter = 0;
2516	wchar_t wc;
2517	int mbclen;
2518
2519	wc = inputwcs[index];
2520	mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2521
2522	/* Check context. */
2523	if (wc == (wchar_t)eolbyte)
2524	{
2525	if (!(syntax_bits & RE_DOT_NEWLINE))
2526	return 0;
2527	newline = 1;
2528	}
2529	else if (wc == (wchar_t)'\0')
2530	{
2531	if (syntax_bits & RE_DOT_NOT_NULL)
2532	return 0;
2533	newline = 1;
2534	}
2535
2536	if (iswalnum(wc) \|\| wc == L'_')
2537	letter = 1;
2538
2539	if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2540	newline, d->states[s].letter, letter))
2541	return 0;
2542
2543	return mbclen;
2544	}
2545
2546	/* Check whether bracket expression can match or not in the current context.
2547	If it can, return the amount of the bytes with which expression can match,
2548	otherwise return 0.
2549	`pos' is the position of the bracket expression. `index' is the index
2550	from the buf_begin, and it is the current position in the buffer. */
2551	int
2552	match_mb_charset (struct dfa *d, int s, position pos, int index)
2553	{
2554	int i;
2555	int match; /* Flag which represent that matching succeed. */
2556	int match_len; /* Length of the character (or collating element)
2557	with which this operator match. */
2558	int op_len; /* Length of the operator. */
2559	char buffer[128];
2560	wchar_t wcbuf[6];
2561
2562	/* Pointer to the structure to which we are currently refering. */
2563	struct mb_char_classes *work_mbc;
2564
2565	int newline = 0;
2566	int letter = 0;
2567	wchar_t wc; /* Current refering character. */
2568
2569	wc = inputwcs[index];
2570
2571	/* Check context. */
2572	if (wc == (wchar_t)eolbyte)
2573	{
2574	if (!(syntax_bits & RE_DOT_NEWLINE))
2575	return 0;
2576	newline = 1;
2577	}
2578	else if (wc == (wchar_t)'\0')
2579	{
2580	if (syntax_bits & RE_DOT_NOT_NULL)
2581	return 0;
2582	newline = 1;
2583	}
2584	if (iswalnum(wc) \|\| wc == L'_')
2585	letter = 1;
2586	if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2587	newline, d->states[s].letter, letter))
2588	return 0;
2589
2590	/* Assign the current refering operator to work_mbc. */
2591	work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2592	match = !work_mbc->invert;
2593	match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2594
2595	/* match with a character class? */
2596	for (i = 0; i<work_mbc->nch_classes; i++)
2597	{
2598	if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2599	goto charset_matched;
2600	}
2601
2602	strncpy(buffer, buf_begin + index, match_len);
2603	buffer[match_len] = '\0';
2604
2605	/* match with an equivalent class? */
2606	for (i = 0; i<work_mbc->nequivs; i++)
2607	{
2608	op_len = strlen(work_mbc->equivs[i]);
2609	strncpy(buffer, buf_begin + index, op_len);
2610	buffer[op_len] = '\0';
2611	if (strcoll(work_mbc->equivs[i], buffer) == 0)
2612	{
2613	match_len = op_len;
2614	goto charset_matched;
2615	}
2616	}
2617
2618	/* match with a collating element? */
2619	for (i = 0; i<work_mbc->ncoll_elems; i++)
2620	{
2621	op_len = strlen(work_mbc->coll_elems[i]);
2622	strncpy(buffer, buf_begin + index, op_len);
2623	buffer[op_len] = '\0';
2624
2625	if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2626	{
2627	match_len = op_len;
2628	goto charset_matched;
2629	}
2630	}
2631
2632	wcbuf[0] = wc;
2633	wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2634
2635	/* match with a range? */
2636	for (i = 0; i<work_mbc->nranges; i++)
2637	{
2638	wcbuf[2] = work_mbc->range_sts[i];
2639	wcbuf[4] = work_mbc->range_ends[i];
2640
2641	if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2642	wcscoll(wcbuf+4, wcbuf) >= 0)
2643	goto charset_matched;
2644	}
2645
2646	/* match with a character? */
2647	for (i = 0; i<work_mbc->nchars; i++)
2648	{
2649	if (wc == work_mbc->chars[i])
2650	goto charset_matched;
2651	}
2652
2653	match = !match;
2654
2655	charset_matched:
2656	return match ? match_len : 0;
2657	}
2658
2659	/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2660	array which corresponds to `d->states[s].mbps.elem' and each element of
2661	the array contains the amount of the bytes with which the element can
2662	match.
2663	`index' is the index from the buf_begin, and it is the current position
2664	in the buffer.
2665	Caller MUST free the array which this function return. */
2666	static int*
2667	check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2668	{
2669	int i;
2670	int* rarray;
2671
2672	MALLOC(rarray, int, d->states[s].mbps.nelem);
2673	for (i = 0; i < d->states[s].mbps.nelem; ++i)
2674	{
2675	position pos = d->states[s].mbps.elems[i];
2676	switch(d->tokens[pos.index])
2677	{
2678	case ANYCHAR:
2679	rarray[i] = match_anychar(d, s, pos, index);
2680	break;
2681	case MBCSET:
2682	rarray[i] = match_mb_charset(d, s, pos, index);
2683	break;
2684	default:
2685	break; /* can not happen. */
2686	}
2687	}
2688	return rarray;
2689	}
2690
2691	/* Consume a single character and enumerate all of the positions which can
2692	be next position from the state `s'.
2693	`match_lens' is the input. It can be NULL, but it can also be the output
2694	of check_matching_with_multibyte_ops() for optimization.
2695	`mbclen' and `pps' are the output. `mbclen' is the length of the
2696	character consumed, and `pps' is the set this function enumerate. */
2697	static status_transit_state
2698	transit_state_consume_1char (struct dfa d, int s, unsigned char const *pp,
2699	int match_lens, int mbclen, position_set *pps)
2700	{
2701	int i, j;
2702	int s1, s2;
2703	int* work_mbls;
2704	status_transit_state rs = TRANSIT_STATE_DONE;
2705
2706	/* Calculate the length of the (single/multi byte) character
2707	to which p points. */
2708	mbclen = (mblen_buf[pp - buf_begin] == 0)? 1
2709	: mblen_buf[*pp - buf_begin];
2710
2711	/* Calculate the state which can be reached from the state `s' by
2712	consuming `mbclen' single bytes from the buffer. /
2713	s1 = s;
2714	for (i = 0; i < *mbclen; i++)
2715	{
2716	s2 = s1;
2717	rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2718	}
2719	/* Copy the positions contained by `s1' to the set `pps'. */
2720	copy(&(d->states[s1].elems), pps);
2721
2722	/* Check (inputed)match_lens, and initialize if it is NULL. */
2723	if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2724	work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2725	else
2726	work_mbls = match_lens;
2727
2728	/* Add all of the positions which can be reached from `s' by consuming
2729	a single character. */
2730	for (i = 0; i < d->states[s].mbps.nelem ; i++)
2731	{
2732	if (work_mbls[i] == *mbclen)
2733	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2734	j++)
2735	insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2736	pps);
2737	}
2738
2739	if (match_lens == NULL && work_mbls != NULL)
2740	free(work_mbls);
2741	return rs;
2742	}
2743
2744	/* Transit state from s, then return new state and update the pointer of the
2745	buffer. This function is for some operator which can match with a multi-
2746	byte character or a collating element (which may be multi characters). */
2747	static int
2748	transit_state (struct dfa d, int s, unsigned char const *pp)
2749	{
2750	int s1;
2751	int mbclen; /* The length of current input multibyte character. */
2752	int maxlen = 0;
2753	int i, j;
2754	int *match_lens = NULL;
2755	int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2756	position_set follows;
2757	unsigned char const p1 = pp;
2758	status_transit_state rs;
2759	wchar_t wc;
2760
2761	if (nelem > 0)
2762	/* This state has (a) multibyte operator(s).
2763	We check whether each of them can match or not. */
2764	{
2765	/* Note: caller must free the return value of this function. */
2766	match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2767
2768	for (i = 0; i < nelem; i++)
2769	/* Search the operator which match the longest string,
2770	in this state. */
2771	{
2772	if (match_lens[i] > maxlen)
2773	maxlen = match_lens[i];
2774	}
2775	}
2776
2777	if (nelem == 0 \|\| maxlen == 0)
2778	/* This state has no multibyte operator which can match.
2779	We need to check only one single byte character. */
2780	{
2781	status_transit_state rs;
2782	rs = transit_state_singlebyte(d, s, *pp, &s1);
2783
2784	/* We must update the pointer if state transition succeeded. */
2785	if (rs == TRANSIT_STATE_DONE)
2786	++*pp;
2787
2788	if (match_lens != NULL)
2789	free(match_lens);
2790	return s1;
2791	}
2792
2793	/* This state has some operators which can match a multibyte character. */
2794	follows.nelem = 0;
2795	MALLOC(follows.elems, position, d->nleaves);
2796
2797	/* `maxlen' may be longer than the length of a character, because it may
2798	not be a character but a (multi character) collating element.
2799	We enumerate all of the positions which `s' can reach by consuming
2800	`maxlen' bytes. */
2801	rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2802
2803	wc = inputwcs[*pp - mbclen - buf_begin];
2804	s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2805	realloc_trans_if_necessary(d, s1);
2806
2807	while (*pp - p1 < maxlen)
2808	{
2809	follows.nelem = 0;
2810	rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2811
2812	for (i = 0; i < nelem ; i++)
2813	{
2814	if (match_lens[i] == *pp - p1)
2815	for (j = 0;
2816	j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2817	insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2818	&follows);
2819	}
2820
2821	wc = inputwcs[*pp - mbclen - buf_begin];
2822	s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2823	realloc_trans_if_necessary(d, s1);
2824	}
2825	free(match_lens);
2826	free(follows.elems);
2827	return s1;
2828	}
2829
2830	#endif /* MBS_SUPPORT */
2831
2832	/* Search through a buffer looking for a match to the given struct dfa.
2833	Find the first occurrence of a string matching the regexp in the buffer,
2834	and the shortest possible version thereof. Return a pointer to the first
2835	character after the match, or NULL if none is found. Begin points to
2836	the beginning of the buffer, and end points to the first character after
2837	its end. We store a newline in *end to act as a sentinel, so end had
2838	better point somewhere valid. Newline is a flag indicating whether to
2839	allow newlines to be in the matching string. If count is non-
2840	NULL it points to a place we're supposed to increment every time we
2841	see a newline. Finally, if backref is non-NULL it points to a place
2842	where we're supposed to store a 1 if backreferencing happened and the
2843	match needs to be verified by a backtracking matcher. Otherwise
2844	we store a 0 in backref. /
2845	char *
2846	dfaexec (struct dfa d, char const begin, char *end,
2847	int newline, int count, int backref)
2848	{
2849	register int s, s1, tmp; /* Current state. */
2850	register unsigned char const p; / Current input character. */
2851	register int *trans, t; /* Copy of d->trans so it can be optimized
2852	into a register. */
2853	register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2854	static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2855	static int sbit_init;
2856
2857	if (! sbit_init)
2858	{
2859	int i;
2860
2861	sbit_init = 1;
2862	for (i = 0; i < NOTCHAR; ++i)
2863	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2864	sbit[eol] = 4;
2865	}
2866
2867	if (! d->tralloc)
2868	build_state_zero(d);
2869
2870	s = s1 = 0;
2871	p = (unsigned char const *) begin;
2872	trans = d->trans;
2873	*end = eol;
2874
2875	#ifdef MBS_SUPPORT
2876	if (MB_CUR_MAX > 1)
2877	{
2878	int remain_bytes, i;
2879	buf_begin = begin;
2880	buf_end = end;
2881
2882	/* initialize mblen_buf, and inputwcs. */
2883	MALLOC(mblen_buf, unsigned char, end - begin + 2);
2884	MALLOC(inputwcs, wchar_t, end - begin + 2);
2885	memset(&mbs, 0, sizeof(mbstate_t));
2886	remain_bytes = 0;
2887	for (i = 0; i < end - begin + 1; i++)
2888	{
2889	if (remain_bytes == 0)
2890	{
2891	remain_bytes
2892	= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
2893	if (remain_bytes <= 1)
2894	{
2895	remain_bytes = 0;
2896	inputwcs[i] = (wchar_t)begin[i];
2897	mblen_buf[i] = 0;
2898	}
2899	else
2900	{
2901	mblen_buf[i] = remain_bytes;
2902	remain_bytes--;
2903	}
2904	}
2905	else
2906	{
2907	mblen_buf[i] = remain_bytes;
2908	inputwcs[i] = 0;
2909	remain_bytes--;
2910	}
2911	}
2912	mblen_buf[i] = 0;
2913	inputwcs[i] = 0; /* sentinel */
2914	}
2915	#endif /* MBS_SUPPORT */
2916
2917	for (;;)
2918	{
2919	#ifdef MBS_SUPPORT
2920	if (MB_CUR_MAX > 1)
2921	while ((t = trans[s]))
2922	{
2923	if ((char *) p > end)
2924	break;
2925	s1 = s;
2926	if (d->states[s].mbps.nelem != 0)
2927	{
2928	/* Can match with a multibyte character (and multi character
2929	collating element). */
2930	unsigned char const *nextp;
2931
2932	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2933
2934	nextp = p;
2935	s = transit_state(d, s, &nextp);
2936	p = (unsigned char *)nextp;
2937
2938	/* Trans table might be updated. */
2939	trans = d->trans;
2940	}
2941	else
2942	{
2943	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2944	s = t[*p++];
2945	}
2946	}
2947	else
2948	#endif /* MBS_SUPPORT */
2949	while ((t = trans[s]) != 0) { /* hand-optimized loop */
2950	s1 = t[*p++];
2951	if ((t = trans[s1]) == 0) {
2952	tmp = s ; s = s1 ; s1 = tmp ; /* swap */
2953	break;
2954	}
2955	s = t[*p++];
2956	}
2957
2958	if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
2959	{
2960	if (d->success[s] & sbit[*p])
2961	{
2962	if (backref)
2963	*backref = (d->states[s].backref != 0);
2964	#ifdef MBS_SUPPORT
2965	if (MB_CUR_MAX > 1)
2966	{
2967	free(mblen_buf);
2968	free(inputwcs);
2969	}
2970	#endif /* MBS_SUPPORT */
2971	return (char *) p;
2972	}
2973
2974	s1 = s;
2975	#ifdef MBS_SUPPORT
2976	if (MB_CUR_MAX > 1)
2977	{
2978	unsigned char const *nextp;
2979	nextp = p;
2980	s = transit_state(d, s, &nextp);
2981	p = (unsigned char *)nextp;
2982
2983	/* Trans table might be updated. */
2984	trans = d->trans;
2985	}
2986	else
2987	#endif /* MBS_SUPPORT */
2988	s = d->fails[s][*p++];
2989	continue;
2990	}
2991
2992	/* If the previous character was a newline, count it. */
2993	if (count && (char *) p <= end && p[-1] == eol)
2994	++*count;
2995
2996	/* Check if we've run off the end of the buffer. */
2997	if ((char *) p > end)
2998	{
2999	#ifdef MBS_SUPPORT
3000	if (MB_CUR_MAX > 1)
3001	{
3002	free(mblen_buf);
3003	free(inputwcs);
3004	}
3005	#endif /* MBS_SUPPORT */
3006	return NULL;
3007	}
3008
3009	if (s >= 0)
3010	{
3011	build_state(s, d);
3012	trans = d->trans;
3013	continue;
3014	}
3015
3016	if (p[-1] == eol && newline)
3017	{
3018	s = d->newlines[s1];
3019	continue;
3020	}
3021
3022	s = 0;
3023	}
3024	}
3025
3026	/* Initialize the components of a dfa that the other routines don't
3027	initialize for themselves. */
3028	void
3029	dfainit (struct dfa *d)
3030	{
3031	d->calloc = 1;
3032	MALLOC(d->charclasses, charclass, d->calloc);
3033	d->cindex = 0;
3034
3035	d->talloc = 1;
3036	MALLOC(d->tokens, token, d->talloc);
3037	d->tindex = d->depth = d->nleaves = d->nregexps = 0;
3038	#ifdef MBS_SUPPORT
3039	if (MB_CUR_MAX > 1)
3040	{
3041	d->nmultibyte_prop = 1;
3042	MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
3043	d->nmbcsets = 0;
3044	d->mbcsets_alloc = 1;
3045	MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
3046	}
3047	#endif
3048
3049	d->searchflag = 0;
3050	d->tralloc = 0;
3051
3052	d->musts = 0;
3053	d->realtrans = 0;
3054	d->fails = 0;
3055	d->newlines = 0;
3056	d->success = 0;
3057	}
3058
3059	/* Parse and analyze a single string of the given length. */
3060	void
3061	dfacomp (char const s, size_t len, struct dfa d, int searchflag)
3062	{
3063	if (case_fold) /* dummy folding in service of dfamust() */
3064	{
3065	char *lcopy;
3066	int i;
3067
3068	lcopy = malloc(len);
3069	if (!lcopy)
3070	dfaerror(_("memory exhausted"));
3071
3072	/* This is a kludge. */
3073	case_fold = 0;
3074	for (i = 0; i < len; ++i)
3075	if (ISUPPER ((unsigned char) s[i]))
3076	lcopy[i] = tolower ((unsigned char) s[i]);
3077	else
3078	lcopy[i] = s[i];
3079
3080	dfainit(d);
3081	dfaparse(lcopy, len, d);
3082	free(lcopy);
3083	dfamust(d);
3084	d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
3085	case_fold = 1;
3086	dfaparse(s, len, d);
3087	dfaanalyze(d, searchflag);
3088	}
3089	else
3090	{
3091	dfainit(d);
3092	dfaparse(s, len, d);
3093	dfamust(d);
3094	dfaanalyze(d, searchflag);
3095	}
3096	}
3097
3098	/* Free the storage held by the components of a dfa. */
3099	void
3100	dfafree (struct dfa *d)
3101	{
3102	int i;
3103	struct dfamust dm, ndm;
3104
3105	free((ptr_t) d->charclasses);
3106	free((ptr_t) d->tokens);
3107
3108	#ifdef MBS_SUPPORT
3109	if (MB_CUR_MAX > 1)
3110	{
3111	free((ptr_t) d->multibyte_prop);
3112	for (i = 0; i < d->nmbcsets; ++i)
3113	{
3114	int j;
3115	struct mb_char_classes *p = &(d->mbcsets[i]);
3116	if (p->chars != NULL)
3117	free(p->chars);
3118	if (p->ch_classes != NULL)
3119	free(p->ch_classes);
3120	if (p->range_sts != NULL)
3121	free(p->range_sts);
3122	if (p->range_ends != NULL)
3123	free(p->range_ends);
3124
3125	for (j = 0; j < p->nequivs; ++j)
3126	free(p->equivs[j]);
3127	if (p->equivs != NULL)
3128	free(p->equivs);
3129
3130	for (j = 0; j < p->ncoll_elems; ++j)
3131	free(p->coll_elems[j]);
3132	if (p->coll_elems != NULL)
3133	free(p->coll_elems);
3134	}
3135	free((ptr_t) d->mbcsets);
3136	}
3137	#endif /* MBS_SUPPORT */
3138
3139	for (i = 0; i < d->sindex; ++i)
3140	free((ptr_t) d->states[i].elems.elems);
3141	free((ptr_t) d->states);
3142	for (i = 0; i < d->tindex; ++i)
3143	if (d->follows[i].elems)
3144	free((ptr_t) d->follows[i].elems);
3145	free((ptr_t) d->follows);
3146	for (i = 0; i < d->tralloc; ++i)
3147	if (d->trans[i])
3148	free((ptr_t) d->trans[i]);
3149	else if (d->fails[i])
3150	free((ptr_t) d->fails[i]);
3151	if (d->realtrans) free((ptr_t) d->realtrans);
3152	if (d->fails) free((ptr_t) d->fails);
3153	if (d->newlines) free((ptr_t) d->newlines);
3154	if (d->success) free((ptr_t) d->success);
3155	for (dm = d->musts; dm; dm = ndm)
3156	{
3157	ndm = dm->next;
3158	free(dm->must);
3159	free((ptr_t) dm);
3160	}
3161	}
3162
3163	/* Having found the postfix representation of the regular expression,
3164	try to find a long sequence of characters that must appear in any line
3165	containing the r.e.
3166	Finding a "longest" sequence is beyond the scope here;
3167	we take an easy way out and hope for the best.
3168	(Take "(ab\|a)b"--please.)
3169
3170	We do a bottom-up calculation of sequences of characters that must appear
3171	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3172	representation:
3173	sequences that must appear at the left of the match ("left")
3174	sequences that must appear at the right of the match ("right")
3175	lists of sequences that must appear somewhere in the match ("in")
3176	sequences that must constitute the match ("is")
3177
3178	When we get to the root of the tree, we use one of the longest of its
3179	calculated "in" sequences as our answer. The sequence we find is returned in
3180	d->must (where "d" is the single argument passed to "dfamust");
3181	the length of the sequence is returned in d->mustn.
3182
3183	The sequences calculated for the various types of node (in pseudo ANSI c)
3184	are shown below. "p" is the operand of unary operators (and the left-hand
3185	operand of binary operators); "q" is the right-hand operand of binary
3186	operators.
3187
3188	"ZERO" means "a zero-length sequence" below.
3189
3190	Type left right is in
3191	---- ---- ----- -- --
3192	char c # c # c # c # c
3193
3194	ANYCHAR ZERO ZERO ZERO ZERO
3195
3196	MBCSET ZERO ZERO ZERO ZERO
3197
3198	CSET ZERO ZERO ZERO ZERO
3199
3200	STAR ZERO ZERO ZERO ZERO
3201
3202	QMARK ZERO ZERO ZERO ZERO
3203
3204	PLUS p->left p->right ZERO p->in
3205
3206	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3207	p->left : q->right : q->is!=ZERO) ? q->in plus
3208	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3209	ZERO
3210
3211	OR longest common longest common (do p->is and substrings common to
3212	leading trailing q->is have same p->in and q->in
3213	(sub)sequence (sub)sequence length and
3214	of p->left of p->right content) ?
3215	and q->left and q->right p->is : NULL
3216
3217	If there's anything else we recognize in the tree, all four sequences get set
3218	to zero-length sequences. If there's something we don't recognize in the tree,
3219	we just return a zero-length sequence.
3220
3221	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3222	'aaa')?
3223
3224	And. . .is it here or someplace that we might ponder "optimizations" such as
3225	egrep 'psi\|epsilon' -> egrep 'psi'
3226	egrep 'pepsi\|epsilon' -> egrep 'epsi'
3227	(Yes, we now find "epsi" as a "string
3228	that must occur", but we might also
3229	simplify the entire r.e. being sought)
3230	grep '[c]' -> grep 'c'
3231	grep '(ab\|a)b' -> grep 'ab'
3232	grep 'ab*' -> grep 'a'
3233	grep 'a*b' -> grep 'b'
3234
3235	There are several issues:
3236
3237	Is optimization easy (enough)?
3238
3239	Does optimization actually accomplish anything,
3240	or is the automaton you get from "psi\|epsilon" (for example)
3241	the same as the one you get from "psi" (for example)?
3242
3243	Are optimizable r.e.'s likely to be used in real-life situations
3244	(something like 'ab*' is probably unlikely; something like is
3245	'psi\|epsilon' is likelier)? */
3246
3247	static char *
3248	icatalloc (char old, char new)
3249	{
3250	char *result;
3251	size_t oldsize, newsize;
3252
3253	newsize = (new == NULL) ? 0 : strlen(new);
3254	if (old == NULL)
3255	oldsize = 0;
3256	else if (newsize == 0)
3257	return old;
3258	else oldsize = strlen(old);
3259	if (old == NULL)
3260	result = (char *) malloc(newsize + 1);
3261	else
3262	result = (char ) realloc((void ) old, oldsize + newsize + 1);
3263	if (result != NULL && new != NULL)
3264	(void) strcpy(result + oldsize, new);
3265	return result;
3266	}
3267
3268	static char *
3269	icpyalloc (char *string)
3270	{
3271	return icatalloc((char *) NULL, string);
3272	}
3273
3274	static char *
3275	istrstr (char lookin, char lookfor)
3276	{
3277	char *cp;
3278	size_t len;
3279
3280	len = strlen(lookfor);
3281	for (cp = lookin; *cp != '\0'; ++cp)
3282	if (strncmp(cp, lookfor, len) == 0)
3283	return cp;
3284	return NULL;
3285	}
3286
3287	static void
3288	ifree (char *cp)
3289	{
3290	if (cp != NULL)
3291	free(cp);
3292	}
3293
3294	static void
3295	freelist (char **cpp)
3296	{
3297	int i;
3298
3299	if (cpp == NULL)
3300	return;
3301	for (i = 0; cpp[i] != NULL; ++i)
3302	{
3303	free(cpp[i]);
3304	cpp[i] = NULL;
3305	}
3306	}
3307
3308	static char **
3309	enlist (char *cpp, char new, size_t len)
3310	{
3311	int i, j;
3312
3313	if (cpp == NULL)
3314	return NULL;
3315	if ((new = icpyalloc(new)) == NULL)
3316	{
3317	freelist(cpp);
3318	return NULL;
3319	}
3320	new[len] = '\0';
3321	/* Is there already something in the list that's new (or longer)? */
3322	for (i = 0; cpp[i] != NULL; ++i)
3323	if (istrstr(cpp[i], new) != NULL)
3324	{
3325	free(new);
3326	return cpp;
3327	}
3328	/* Eliminate any obsoleted strings. */
3329	j = 0;
3330	while (cpp[j] != NULL)
3331	if (istrstr(new, cpp[j]) == NULL)
3332	++j;
3333	else
3334	{
3335	free(cpp[j]);
3336	if (--i == j)
3337	break;
3338	cpp[j] = cpp[i];
3339	cpp[i] = NULL;
3340	}
3341	/* Add the new string. */
3342	cpp = (char *) realloc((char ) cpp, (i + 2) * sizeof *cpp);
3343	if (cpp == NULL)
3344	return NULL;
3345	cpp[i] = new;
3346	cpp[i + 1] = NULL;
3347	return cpp;
3348	}
3349
3350	/* Given pointers to two strings, return a pointer to an allocated
3351	list of their distinct common substrings. Return NULL if something
3352	seems wild. */
3353	static char **
3354	comsubs (char left, char right)
3355	{
3356	char **cpp;
3357	char *lcp;
3358	char *rcp;
3359	size_t i, len;
3360
3361	if (left == NULL \|\| right == NULL)
3362	return NULL;
3363	cpp = (char *) malloc(sizeof cpp);
3364	if (cpp == NULL)
3365	return NULL;
3366	cpp[0] = NULL;
3367	for (lcp = left; *lcp != '\0'; ++lcp)
3368	{
3369	len = 0;
3370	rcp = strchr (right, *lcp);
3371	while (rcp != NULL)
3372	{
3373	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3374	continue;
3375	if (i > len)
3376	len = i;
3377	rcp = strchr (rcp + 1, *lcp);
3378	}
3379	if (len == 0)
3380	continue;
3381	if ((cpp = enlist(cpp, lcp, len)) == NULL)
3382	break;
3383	}
3384	return cpp;
3385	}
3386
3387	static char **
3388	addlists (char old, char new)
3389	{
3390	int i;
3391
3392	if (old == NULL \|\| new == NULL)
3393	return NULL;
3394	for (i = 0; new[i] != NULL; ++i)
3395	{
3396	old = enlist(old, new[i], strlen(new[i]));
3397	if (old == NULL)
3398	break;
3399	}
3400	return old;
3401	}
3402
3403	/* Given two lists of substrings, return a new list giving substrings
3404	common to both. */
3405	static char **
3406	inboth (char left, char right)
3407	{
3408	char **both;
3409	char **temp;
3410	int lnum, rnum;
3411
3412	if (left == NULL \|\| right == NULL)
3413	return NULL;
3414	both = (char *) malloc(sizeof both);
3415	if (both == NULL)
3416	return NULL;
3417	both[0] = NULL;
3418	for (lnum = 0; left[lnum] != NULL; ++lnum)
3419	{
3420	for (rnum = 0; right[rnum] != NULL; ++rnum)
3421	{
3422	temp = comsubs(left[lnum], right[rnum]);
3423	if (temp == NULL)
3424	{
3425	freelist(both);
3426	return NULL;
3427	}
3428	both = addlists(both, temp);
3429	freelist(temp);
3430	free(temp);
3431	if (both == NULL)
3432	return NULL;
3433	}
3434	}
3435	return both;
3436	}
3437
3438	typedef struct
3439	{
3440	char **in;
3441	char *left;
3442	char *right;
3443	char *is;
3444	} must;
3445
3446	static void
3447	resetmust (must *mp)
3448	{
3449	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3450	freelist(mp->in);
3451	}
3452
3453	static void
3454	dfamust (struct dfa *dfa)
3455	{
3456	must *musts;
3457	must *mp;
3458	char *result;
3459	int ri;
3460	int i;
3461	int exact;
3462	token t;
3463	static must must0;
3464	struct dfamust *dm;
3465	static char empty_string[] = "";
3466
3467	result = empty_string;
3468	exact = 0;
3469	musts = (must ) malloc((dfa->tindex + 1) sizeof *musts);
3470	if (musts == NULL)
3471	return;
3472	mp = musts;
3473	for (i = 0; i <= dfa->tindex; ++i)
3474	mp[i] = must0;
3475	for (i = 0; i <= dfa->tindex; ++i)
3476	{
3477	mp[i].in = (char *) malloc(sizeof mp[i].in);
3478	mp[i].left = malloc(2);
3479	mp[i].right = malloc(2);
3480	mp[i].is = malloc(2);
3481	if (mp[i].in == NULL \|\| mp[i].left == NULL \|\|
3482	mp[i].right == NULL \|\| mp[i].is == NULL)
3483	goto done;
3484	mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3485	mp[i].in[0] = NULL;
3486	}
3487	#ifdef DEBUG
3488	fprintf(stderr, "dfamust:\n");
3489	for (i = 0; i < dfa->tindex; ++i)
3490	{
3491	fprintf(stderr, " %d:", i);
3492	prtok(dfa->tokens[i]);
3493	}
3494	putc('\n', stderr);
3495	#endif
3496	for (ri = 0; ri < dfa->tindex; ++ri)
3497	{
3498	switch (t = dfa->tokens[ri])
3499	{
3500	case LPAREN:
3501	case RPAREN:
3502	goto done; /* "cannot happen" */
3503	case EMPTY:
3504	case BEGLINE:
3505	case ENDLINE:
3506	case BEGWORD:
3507	case ENDWORD:
3508	case LIMWORD:
3509	case NOTLIMWORD:
3510	case BACKREF:
3511	resetmust(mp);
3512	break;
3513	case STAR:
3514	case QMARK:
3515	if (mp <= musts)
3516	goto done; /* "cannot happen" */
3517	--mp;
3518	resetmust(mp);
3519	break;
3520	case OR:
3521	case ORTOP:
3522	if (mp < &musts[2])
3523	goto done; /* "cannot happen" */
3524	{
3525	char **new;
3526	must *lmp;
3527	must *rmp;
3528	int j, ln, rn, n;
3529
3530	rmp = --mp;
3531	lmp = --mp;
3532	/* Guaranteed to be. Unlikely, but. . . */
3533	if (strcmp(lmp->is, rmp->is) != 0)
3534	lmp->is[0] = '\0';
3535	/* Left side--easy */
3536	i = 0;
3537	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3538	++i;
3539	lmp->left[i] = '\0';
3540	/* Right side */
3541	ln = strlen(lmp->right);
3542	rn = strlen(rmp->right);
3543	n = ln;
3544	if (n > rn)
3545	n = rn;
3546	for (i = 0; i < n; ++i)
3547	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3548	break;
3549	for (j = 0; j < i; ++j)
3550	lmp->right[j] = lmp->right[(ln - i) + j];
3551	lmp->right[j] = '\0';
3552	new = inboth(lmp->in, rmp->in);
3553	if (new == NULL)
3554	goto done;
3555	freelist(lmp->in);
3556	free((char *) lmp->in);
3557	lmp->in = new;
3558	}
3559	break;
3560	case PLUS:
3561	if (mp <= musts)
3562	goto done; /* "cannot happen" */
3563	--mp;
3564	mp->is[0] = '\0';
3565	break;
3566	case END:
3567	if (mp != &musts[1])
3568	goto done; /* "cannot happen" */
3569	for (i = 0; musts[0].in[i] != NULL; ++i)
3570	if (strlen(musts[0].in[i]) > strlen(result))
3571	result = musts[0].in[i];
3572	if (strcmp(result, musts[0].is) == 0)
3573	exact = 1;
3574	goto done;
3575	case CAT:
3576	if (mp < &musts[2])
3577	goto done; /* "cannot happen" */
3578	{
3579	must *lmp;
3580	must *rmp;
3581
3582	rmp = --mp;
3583	lmp = --mp;
3584	/* In. Everything in left, plus everything in
3585	right, plus catenation of
3586	left's right and right's left. */
3587	lmp->in = addlists(lmp->in, rmp->in);
3588	if (lmp->in == NULL)
3589	goto done;
3590	if (lmp->right[0] != '\0' &&
3591	rmp->left[0] != '\0')
3592	{
3593	char *tp;
3594
3595	tp = icpyalloc(lmp->right);
3596	if (tp == NULL)
3597	goto done;
3598	tp = icatalloc(tp, rmp->left);
3599	if (tp == NULL)
3600	goto done;
3601	lmp->in = enlist(lmp->in, tp,
3602	strlen(tp));
3603	free(tp);
3604	if (lmp->in == NULL)
3605	goto done;
3606	}
3607	/* Left-hand */
3608	if (lmp->is[0] != '\0')
3609	{
3610	lmp->left = icatalloc(lmp->left,
3611	rmp->left);
3612	if (lmp->left == NULL)
3613	goto done;
3614	}
3615	/* Right-hand */
3616	if (rmp->is[0] == '\0')
3617	lmp->right[0] = '\0';
3618	lmp->right = icatalloc(lmp->right, rmp->right);
3619	if (lmp->right == NULL)
3620	goto done;
3621	/* Guaranteed to be */
3622	if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3623	{
3624	lmp->is = icatalloc(lmp->is, rmp->is);
3625	if (lmp->is == NULL)
3626	goto done;
3627	}
3628	else
3629	lmp->is[0] = '\0';
3630	}
3631	break;
3632	default:
3633	if (t < END)
3634	{
3635	/* "cannot happen" */
3636	goto done;
3637	}
3638	else if (t == '\0')
3639	{
3640	/* not on my shift */
3641	goto done;
3642	}
3643	else if (t >= CSET
3644	#ifdef MBS_SUPPORT
3645	\|\| t == ANYCHAR
3646	\|\| t == MBCSET
3647	#endif /* MBS_SUPPORT */
3648	)
3649	{
3650	/* easy enough */
3651	resetmust(mp);
3652	}
3653	else
3654	{
3655	/* plain character */
3656	resetmust(mp);
3657	mp->is[0] = mp->left[0] = mp->right[0] = t;
3658	mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3659	mp->in = enlist(mp->in, mp->is, (size_t)1);
3660	if (mp->in == NULL)
3661	goto done;
3662	}
3663	break;
3664	}
3665	#ifdef DEBUG
3666	fprintf(stderr, " node: %d:", ri);
3667	prtok(dfa->tokens[ri]);
3668	fprintf(stderr, "\n in:");
3669	for (i = 0; mp->in[i]; ++i)
3670	fprintf(stderr, " \"%s\"", mp->in[i]);
3671	fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3672	fprintf(stderr, " left: \"%s\"\n", mp->left);
3673	fprintf(stderr, " right: \"%s\"\n", mp->right);
3674	#endif
3675	++mp;
3676	}
3677	done:
3678	if (strlen(result))
3679	{
3680	MALLOC(dm, struct dfamust, 1);
3681	dm->exact = exact;
3682	MALLOC(dm->must, char, strlen(result) + 1);
3683	strcpy(dm->must, result);
3684	dm->next = dfa->musts;
3685	dfa->musts = dm;
3686	}
3687	mp = musts;
3688	for (i = 0; i <= dfa->tindex; ++i)
3689	{
3690	freelist(mp[i].in);
3691	ifree((char *) mp[i].in);
3692	ifree(mp[i].left);
3693	ifree(mp[i].right);
3694	ifree(mp[i].is);
3695	}
3696	free((char *) mp);
3697	}
3698	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/sys-apps/gawk/dfa.c@ 3090

Download in other formats: