Context Navigation

dfa.c

Visit:

Last change on this file was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 98.9 KB

Rev	Line
[3076]	1	/* dfa.c - deterministic extended regexp routines for GNU
	2	Copyright 1988, 1998, 2000, 2002, 2004, 2005 Free Software Foundation, Inc.
	3
	4	This program is free software; you can redistribute it and/or modify
	5	it under the terms of the GNU General Public License as published by
	6	the Free Software Foundation; either version 2, or (at your option)
	7	any later version.
	8
	9	This program is distributed in the hope that it will be useful,
	10	but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	GNU General Public License for more details.
	13
	14	You should have received a copy of the GNU General Public License
	15	along with this program; if not, write to the Free Software
	16	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
	17
	18	/* Written June, 1988 by Mike Haertel
	19	Modified July, 1988 by Arthur David Olson to assist BMG speedups */
	20
	21	#ifdef HAVE_CONFIG_H
	22	#include <config.h>
	23	#endif
	24
	25	#include <assert.h>
	26	#include <ctype.h>
	27	#include <stdio.h>
	28
	29	#ifndef VMS
	30	#include <sys/types.h>
	31	#else
	32	#include <stddef.h>
	33	#endif
	34	#ifdef STDC_HEADERS
	35	#include <stdlib.h>
	36	#else
	37	extern char calloc(), malloc(), *realloc();
	38	extern void free();
	39	#endif
	40
	41	#if defined(HAVE_STRING_H) \|\| defined(STDC_HEADERS)
	42	#include <string.h>
	43	#else
	44	#include <strings.h>
	45	#endif
	46
	47	#if HAVE_SETLOCALE
	48	# include <locale.h>
	49	#endif
	50
	51
	52	#ifndef DEBUG /* use the same approach as regex.c */
	53	#undef assert
	54	#define assert(e)
	55	#endif /* DEBUG */
	56
	57	#ifndef isgraph
	58	#define isgraph(C) (isprint(C) && !isspace(C))
	59	#endif
	60
	61	#if defined (STDC_HEADERS) \|\| (!defined (isascii) && !defined (HAVE_ISASCII))
	62	#define ISALPHA(C) isalpha(C)
	63	#define ISUPPER(C) isupper(C)
	64	#define ISLOWER(C) islower(C)
	65	#define ISDIGIT(C) isdigit(C)
	66	#define ISXDIGIT(C) isxdigit(C)
	67	#define ISSPACE(C) isspace(C)
	68	#define ISPUNCT(C) ispunct(C)
	69	#define ISALNUM(C) isalnum(C)
	70	#define ISPRINT(C) isprint(C)
	71	#define ISGRAPH(C) isgraph(C)
	72	#define ISCNTRL(C) iscntrl(C)
	73	#else
	74	#define ISALPHA(C) (isascii(C) && isalpha(C))
	75	#define ISUPPER(C) (isascii(C) && isupper(C))
	76	#define ISLOWER(C) (isascii(C) && islower(C))
	77	#define ISDIGIT(C) (isascii(C) && isdigit(C))
	78	#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
	79	#define ISSPACE(C) (isascii(C) && isspace(C))
	80	#define ISPUNCT(C) (isascii(C) && ispunct(C))
	81	#define ISALNUM(C) (isascii(C) && isalnum(C))
	82	#define ISPRINT(C) (isascii(C) && isprint(C))
	83	#define ISGRAPH(C) (isascii(C) && isgraph(C))
	84	#define ISCNTRL(C) (isascii(C) && iscntrl(C))
	85	#endif
	86
	87	/* ISASCIIDIGIT differs from ISDIGIT, as follows:
	88	- Its arg may be any int or unsigned int; it need not be an unsigned char.
	89	- It's guaranteed to evaluate its argument exactly once.
	90	- It's typically faster.
	91	Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
	92	only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
	93	it's important to use the locale's definition of `digit' even when the
	94	host does not conform to Posix. */
	95	#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
	96
	97	/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
	98	#include "gettext.h"
	99	#define _(str) gettext (str)
	100
	101	#ifndef NO_MBSUPPORT
	102	#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
	103	#endif
	104	#ifdef MBS_SUPPORT
	105	/* We can handle multibyte strings. */
	106	# include <wchar.h>
	107	# include <wctype.h>
	108	#endif
	109
	110	#include "regex.h"
	111	#include "dfa.h"
	112	#include "hard-locale.h"
	113
	114	/* HPUX, define those as macros in sys/param.h */
	115	#ifdef setbit
	116	# undef setbit
	117	#endif
	118	#ifdef clrbit
	119	# undef clrbit
	120	#endif
	121
	122	static void dfamust PARAMS ((struct dfa *dfa));
	123
	124	static ptr_t xcalloc PARAMS ((size_t n, size_t s));
	125	static ptr_t xmalloc PARAMS ((size_t n));
	126	static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
	127	#ifdef DEBUG
	128	static void prtok PARAMS ((token t));
	129	#endif
	130	static int tstbit PARAMS ((unsigned b, charclass c));
	131	static void setbit PARAMS ((unsigned b, charclass c));
	132	static void clrbit PARAMS ((unsigned b, charclass c));
	133	static void copyset PARAMS ((charclass src, charclass dst));
	134	static void zeroset PARAMS ((charclass s));
	135	static void notset PARAMS ((charclass s));
	136	static int equal PARAMS ((charclass s1, charclass s2));
	137	static int charclass_index PARAMS ((charclass s));
	138	static int looking_at PARAMS ((const char *s));
	139	static token lex PARAMS ((void));
	140	static void addtok PARAMS ((token t));
	141	static void atom PARAMS ((void));
	142	static int nsubtoks PARAMS ((int tindex));
	143	static void copytoks PARAMS ((int tindex, int ntokens));
	144	static void closure PARAMS ((void));
	145	static void branch PARAMS ((void));
	146	static void regexp PARAMS ((int toplevel));
	147	static void copy PARAMS ((position_set const src, position_set dst));
	148	static void insert PARAMS ((position p, position_set *s));
	149	static void merge PARAMS ((position_set const s1, position_set const s2, position_set *m));
	150	static void delete PARAMS ((position p, position_set *s));
	151	static int state_index PARAMS ((struct dfa d, position_set const s,
	152	int newline, int letter));
	153	static void build_state PARAMS ((int s, struct dfa *d));
	154	static void build_state_zero PARAMS ((struct dfa *d));
	155	static char icatalloc PARAMS ((char old, char *new));
	156	static char icpyalloc PARAMS ((char string));
	157	static char istrstr PARAMS ((char lookin, char *lookfor));
	158	static void ifree PARAMS ((char *cp));
	159	static void freelist PARAMS ((char **cpp));
	160	static char enlist PARAMS ((char cpp, char *new, size_t len));
	161	static char *comsubs PARAMS ((char left, char *right));
	162	static char addlists PARAMS ((char old, char **new));
	163	static char inboth PARAMS ((char left, char **right));
	164
	165	static ptr_t
	166	xcalloc (size_t n, size_t s)
	167	{
	168	ptr_t r = calloc(n, s);
	169
	170	if (!r)
	171	dfaerror(_("Memory exhausted"));
	172	return r;
	173	}
	174
	175	static ptr_t
	176	xmalloc (size_t n)
	177	{
	178	ptr_t r = malloc(n);
	179
	180	assert(n != 0);
	181	if (!r)
	182	dfaerror(_("Memory exhausted"));
	183	return r;
	184	}
	185
	186	static ptr_t
	187	xrealloc (ptr_t p, size_t n)
	188	{
	189	ptr_t r = realloc(p, n);
	190
	191	assert(n != 0);
	192	if (!r)
	193	dfaerror(_("Memory exhausted"));
	194	return r;
	195	}
	196
	197	#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
	198	#define MALLOC(p, t, n) ((p) = (t ) xmalloc((n) sizeof (t)))
	199	#define REALLOC(p, t, n) ((p) = (t ) xrealloc((ptr_t) (p), (n) sizeof (t)))
	200
	201	/* Reallocate an array of type t if nalloc is too small for index. */
	202	#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
	203	if ((index) >= (nalloc)) \
	204	{ \
	205	do \
	206	(nalloc) *= 2; \
	207	while ((index) >= (nalloc)); \
	208	REALLOC(p, t, nalloc); \
	209	}
	210
	211	#ifdef DEBUG
	212
	213	static void
	214	prtok (token t)
	215	{
	216	char const *s;
	217
	218	if (t < 0)
	219	fprintf(stderr, "END");
	220	else if (t < NOTCHAR)
	221	fprintf(stderr, "%c", t);
	222	else
	223	{
	224	switch (t)
	225	{
	226	case EMPTY: s = "EMPTY"; break;
	227	case BACKREF: s = "BACKREF"; break;
	228	case BEGLINE: s = "BEGLINE"; break;
	229	case ENDLINE: s = "ENDLINE"; break;
	230	case BEGWORD: s = "BEGWORD"; break;
	231	case ENDWORD: s = "ENDWORD"; break;
	232	case LIMWORD: s = "LIMWORD"; break;
	233	case NOTLIMWORD: s = "NOTLIMWORD"; break;
	234	case QMARK: s = "QMARK"; break;
	235	case STAR: s = "STAR"; break;
	236	case PLUS: s = "PLUS"; break;
	237	case CAT: s = "CAT"; break;
	238	case OR: s = "OR"; break;
	239	case ORTOP: s = "ORTOP"; break;
	240	case LPAREN: s = "LPAREN"; break;
	241	case RPAREN: s = "RPAREN"; break;
	242	case CRANGE: s = "CRANGE"; break;
	243	#ifdef MBS_SUPPORT
	244	case ANYCHAR: s = "ANYCHAR"; break;
	245	case MBCSET: s = "MBCSET"; break;
	246	#endif /* MBS_SUPPORT */
	247	default: s = "CSET"; break;
	248	}
	249	fprintf(stderr, "%s", s);
	250	}
	251	}
	252	#endif /* DEBUG */
	253
	254	/* Stuff pertaining to charclasses. */
	255
	256	static int
	257	tstbit (unsigned b, charclass c)
	258	{
	259	return c[b / INTBITS] & 1 << b % INTBITS;
	260	}
	261
	262	static void
	263	setbit (unsigned b, charclass c)
	264	{
	265	c[b / INTBITS] \|= 1 << b % INTBITS;
	266	}
	267
	268	static void
	269	clrbit (unsigned b, charclass c)
	270	{
	271	c[b / INTBITS] &= ~(1 << b % INTBITS);
	272	}
	273
	274	static void
	275	copyset (charclass src, charclass dst)
	276	{
	277	memcpy (dst, src, sizeof (charclass));
	278	}
	279
	280	static void
	281	zeroset (charclass s)
	282	{
	283	memset (s, 0, sizeof (charclass));
	284	}
	285
	286	static void
	287	notset (charclass s)
	288	{
	289	int i;
	290
	291	for (i = 0; i < CHARCLASS_INTS; ++i)
	292	s[i] = ~s[i];
	293	}
	294
	295	static int
	296	equal (charclass s1, charclass s2)
	297	{
	298	return memcmp (s1, s2, sizeof (charclass)) == 0;
	299	}
	300
	301	/* A pointer to the current dfa is kept here during parsing. */
	302	static struct dfa *dfa;
	303
	304	/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
	305	static int
	306	charclass_index (charclass s)
	307	{
	308	int i;
	309
	310	for (i = 0; i < dfa->cindex; ++i)
	311	if (equal(s, dfa->charclasses[i]))
	312	return i;
	313	REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
	314	++dfa->cindex;
	315	copyset(s, dfa->charclasses[i]);
	316	return i;
	317	}
	318
	319	/* Syntax bits controlling the behavior of the lexical analyzer. */
	320	static reg_syntax_t syntax_bits, syntax_bits_set;
	321
	322	/* Flag for case-folding letters into sets. */
	323	static int case_fold;
	324
	325	/* End-of-line byte in data. */
	326	static unsigned char eolbyte;
	327
	328	/* Entry point to set syntax options. */
	329	void
	330	dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
	331	{
	332	syntax_bits_set = 1;
	333	syntax_bits = bits;
	334	case_fold = fold;
	335	eolbyte = eol;
	336	}
	337
	338	/* Like setbit, but if case is folded, set both cases of a letter. */
	339	static void
	340	setbit_case_fold (unsigned b, charclass c)
	341	{
	342	setbit (b, c);
	343	if (case_fold)
	344	{
	345	if (ISUPPER (b))
	346	setbit (tolower (b), c);
	347	else if (ISLOWER (b))
	348	setbit (toupper (b), c);
	349	}
	350	}
	351
	352	/* Lexical analyzer. All the dross that deals with the obnoxious
	353	GNU Regex syntax bits is located here. The poor, suffering
	354	reader is referred to the GNU Regex documentation for the
	355	meaning of the @#%!@#%^!@ syntax bits. */
	356
	357	static char const lexptr; / Pointer to next input character. */
	358	static int lexleft; /* Number of characters remaining. */
	359	static token lasttok; /* Previous token returned; initially END. */
	360	static int laststart; /* True if we're separated from beginning or (, \|
	361	only by zero-width characters. */
	362	static int parens; /* Count of outstanding left parens. */
	363	static int minrep, maxrep; /* Repeat counts for {m,n}. */
	364	static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
	365
	366	#ifdef MBS_SUPPORT
	367	/* These variables are used only if (MB_CUR_MAX > 1). */
	368	static mbstate_t mbs; /* Mbstate for mbrlen(). */
	369	static int cur_mb_len; /* Byte length of the current scanning
	370	multibyte character. */
	371	static int cur_mb_index; /* Byte index of the current scanning multibyte
	372	character.
	373
	374	single byte character : cur_mb_index = 0
	375	multibyte character
	376	1st byte : cur_mb_index = 1
	377	2nd byte : cur_mb_index = 2
	378	...
	379	nth byte : cur_mb_index = n */
	380	static unsigned char mblen_buf;/ Correspond to the input buffer in dfaexec().
	381	Each element store the amount of remain
	382	byte of corresponding multibyte character
	383	in the input string. A element's value
	384	is 0 if corresponding character is a
	385	single byte chracter.
	386	e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
	387	mblen_buf : 0, 3, 2, 1
	388	*/
	389	static wchar_t inputwcs; / Wide character representation of input
	390	string in dfaexec().
	391	The length of this array is same as
	392	the length of input string(char array).
	393	inputstring[i] is a single-byte char,
	394	or 1st byte of a multibyte char.
	395	And inputwcs[i] is the codepoint. */
	396	static unsigned char const buf_begin; / reference to begin in dfaexec(). */
	397	static unsigned char const buf_end; / reference to end in dfaexec(). */
	398	#endif /* MBS_SUPPORT */
	399
	400	#ifdef MBS_SUPPORT
	401	/* This function update cur_mb_len, and cur_mb_index.
	402	p points current lexptr, len is the remaining buffer length. */
	403	static void
	404	update_mb_len_index (unsigned char const *p, int len)
	405	{
	406	/* If last character is a part of a multibyte character,
	407	we update cur_mb_index. */
	408	if (cur_mb_index)
	409	cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
	410	: cur_mb_index + 1;
	411
	412	/* If last character is a single byte character, or the
	413	last portion of a multibyte character, we check whether
	414	next character is a multibyte character or not. */
	415	if (! cur_mb_index)
	416	{
	417	cur_mb_len = mbrlen(p, len, &mbs);
	418	if (cur_mb_len > 1)
	419	/* It is a multibyte character.
	420	cur_mb_len was already set by mbrlen(). */
	421	cur_mb_index = 1;
	422	else if (cur_mb_len < 1)
	423	/* Invalid sequence. We treat it as a single byte character.
	424	cur_mb_index is aleady 0. */
	425	cur_mb_len = 1;
	426	/* Otherwise, cur_mb_len == 1, it is a single byte character.
	427	cur_mb_index is aleady 0. */
	428	}
	429	}
	430	#endif /* MBS_SUPPORT */
	431
	432	#ifdef MBS_SUPPORT
	433	/* Note that characters become unsigned here. */
	434	# define FETCH(c, eoferr) \
	435	{ \
	436	if (! lexleft) \
	437	{ \
	438	if (eoferr != 0) \
	439	dfaerror (eoferr); \
	440	else \
	441	return lasttok = END; \
	442	} \
	443	if (MB_CUR_MAX > 1) \
	444	update_mb_len_index(lexptr, lexleft); \
	445	(c) = (unsigned char) *lexptr++; \
	446	--lexleft; \
	447	}
	448
	449	/* This function fetch a wide character, and update cur_mb_len,
	450	used only if the current locale is a multibyte environment. */
	451	static wint_t
	452	fetch_wc (char const *eoferr)
	453	{
	454	wchar_t wc;
	455	if (! lexleft)
	456	{
	457	if (eoferr != 0)
	458	dfaerror (eoferr);
	459	else
	460	return WEOF;
	461	}
	462
	463	cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
	464	if (cur_mb_len <= 0)
	465	{
	466	cur_mb_len = 1;
	467	wc = *lexptr;
	468	}
	469	lexptr += cur_mb_len;
	470	lexleft -= cur_mb_len;
	471	return wc;
	472	}
	473	#else
	474	/* Note that characters become unsigned here. */
	475	# define FETCH(c, eoferr) \
	476	{ \
	477	if (! lexleft) \
	478	{ \
	479	if (eoferr != 0) \
	480	dfaerror (eoferr); \
	481	else \
	482	return lasttok = END; \
	483	} \
	484	(c) = (unsigned char) *lexptr++; \
	485	--lexleft; \
	486	}
	487	#endif /* MBS_SUPPORT */
	488
	489	#ifdef MBS_SUPPORT
	490	/* Multibyte character handling sub-routine for lex.
	491	This function parse a bracket expression and build a struct
	492	mb_char_classes. */
	493	static void
	494	parse_bracket_exp_mb ()
	495	{
	496	wint_t wc, wc1, wc2;
	497
	498	/* Work area to build a mb_char_classes. */
	499	struct mb_char_classes *work_mbc;
	500	int chars_al, range_sts_al, range_ends_al, ch_classes_al,
	501	equivs_al, coll_elems_al;
	502
	503	REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
	504	dfa->mbcsets_alloc, dfa->nmbcsets + 1);
	505	/* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
	506	We will update dfa->multibyte_prop[] in addtok(), because we can't
	507	decide the index in dfa->tokens[]. */
	508
	509	/* Initialize work are */
	510	work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
	511
	512	chars_al = 1;
	513	range_sts_al = range_ends_al = 0;
	514	ch_classes_al = equivs_al = coll_elems_al = 0;
	515	MALLOC(work_mbc->chars, wchar_t, chars_al);
	516
	517	work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
	518	work_mbc->nequivs = work_mbc->ncoll_elems = 0;
	519	work_mbc->chars = NULL;
	520	work_mbc->ch_classes = NULL;
	521	work_mbc->range_sts = work_mbc->range_ends = NULL;
	522	work_mbc->equivs = work_mbc->coll_elems = NULL;
	523
	524	wc = fetch_wc(_("Unbalanced ["));
	525	if (wc == L'^')
	526	{
	527	wc = fetch_wc(_("Unbalanced ["));
	528	work_mbc->invert = 1;
	529	}
	530	else
	531	work_mbc->invert = 0;
	532	do
	533	{
	534	wc1 = WEOF; /* mark wc1 is not initialized". */
	535
	536	/* Note that if we're looking at some other [:...:] construct,
	537	we just treat it as a bunch of ordinary characters. We can do
	538	this because we assume regex has checked for syntax errors before
	539	dfa is ever called. */
	540	if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
	541	{
	542	#define BRACKET_BUFFER_SIZE 128
	543	char str[BRACKET_BUFFER_SIZE];
	544	wc1 = wc;
	545	wc = fetch_wc(_("Unbalanced ["));
	546
	547	/* If pattern contains `[[:', `[[.', or `[[='. */
	548	if (cur_mb_len == 1 && (wc == L':' \|\| wc == L'.' \|\| wc == L'='))
	549	{
	550	unsigned char c;
	551	unsigned char delim = (unsigned char)wc;
	552	int len = 0;
	553	for (;;)
	554	{
	555	if (! lexleft)
	556	dfaerror (_("Unbalanced ["));
	557	c = (unsigned char) *lexptr++;
	558	--lexleft;
	559
	560	if ((c == delim && *lexptr == ']') \|\| lexleft == 0)
	561	break;
	562	if (len < BRACKET_BUFFER_SIZE)
	563	str[len++] = c;
	564	else
	565	/* This is in any case an invalid class name. */
	566	str[0] = '\0';
	567	}
	568	str[len] = '\0';
	569
	570	if (lexleft == 0)
	571	{
	572	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
	573	work_mbc->nchars + 2);
	574	work_mbc->chars[work_mbc->nchars++] = L'[';
	575	work_mbc->chars[work_mbc->nchars++] = delim;
	576	break;
	577	}
	578
	579	if (--lexleft, *lexptr++ != ']')
	580	dfaerror (_("Unbalanced ["));
	581	if (delim == ':')
	582	/* build character class. */
	583	{
	584	wctype_t wt;
	585	/* Query the character class as wctype_t. */
	586	if (case_fold && (strcmp(str, "upper") == 0 \|\| strcmp(str, "lower") == 0))
	587	strcpy(str, "alpha");
	588
	589	wt = wctype (str);
	590
	591	if (ch_classes_al == 0)
	592	MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
	593	REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
	594	ch_classes_al,
	595	work_mbc->nch_classes + 1);
	596	work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
	597
	598	}
	599	else if (delim == '=' \|\| delim == '.')
	600	{
	601	char *elem;
	602	MALLOC(elem, char, len + 1);
	603	strncpy(elem, str, len + 1);
	604
	605	if (delim == '=')
	606	/* build equivalent class. */
	607	{
	608	if (equivs_al == 0)
	609	MALLOC(work_mbc->equivs, char*, ++equivs_al);
	610	REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
	611	equivs_al,
	612	work_mbc->nequivs + 1);
	613	work_mbc->equivs[work_mbc->nequivs++] = elem;
	614	}
	615
	616	if (delim == '.')
	617	/* build collating element. */
	618	{
	619	if (coll_elems_al == 0)
	620	MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
	621	REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
	622	coll_elems_al,
	623	work_mbc->ncoll_elems + 1);
	624	work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
	625	}
	626	}
	627	wc = wc1 = WEOF;
	628	}
	629	else
	630	/* We treat '[' as a normal character here. */
	631	{
	632	wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
	633	}
	634	}
	635	else
	636	{
	637	if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
	638	wc = fetch_wc(("Unbalanced ["));
	639	}
	640
	641	if (wc1 == WEOF)
	642	wc1 = fetch_wc(_("Unbalanced ["));
	643
	644	if (wc1 == L'-')
	645	/* build range characters. */
	646	{
	647	wc2 = fetch_wc(_("Unbalanced ["));
	648	if (wc2 == L']')
	649	{
	650	/* In the case [x-], the - is an ordinary hyphen,
	651	which is left in c1, the lookahead character. */
	652	lexptr -= cur_mb_len;
	653	lexleft += cur_mb_len;
	654	wc2 = wc;
	655	}
	656	else
	657	{
	658	if (wc2 == L'\\'
	659	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
	660	wc2 = fetch_wc(_("Unbalanced ["));
	661	wc1 = fetch_wc(_("Unbalanced ["));
	662	}
	663
	664	if (range_sts_al == 0)
	665	{
	666	MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
	667	MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
	668	}
	669	REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
	670	range_sts_al, work_mbc->nranges + 1);
	671	work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
	672	REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
	673	range_ends_al, work_mbc->nranges + 1);
	674	work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
	675	if (case_fold && (iswlower((wint_t)wc) \|\| iswupper((wint_t)wc))
	676	&& (iswlower((wint_t)wc2) \|\| iswupper((wint_t)wc2)))
	677	{
	678	wint_t altcase;
	679	altcase = wc;
	680	if (iswlower((wint_t)wc))
	681	altcase = towupper((wint_t)wc);
	682	else
	683	altcase = towlower((wint_t)wc);
	684	REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
	685	range_sts_al, work_mbc->nranges + 1);
	686	work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
	687
	688	altcase = wc2;
	689	if (iswlower((wint_t)wc2))
	690	altcase = towupper((wint_t)wc2);
	691	else
	692	altcase = towlower((wint_t)wc2);
	693	REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
	694	range_ends_al, work_mbc->nranges + 1);
	695	work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
	696	}
	697	}
	698	else if (wc != WEOF)
	699	/* build normal characters. */
	700	{
	701	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
	702	work_mbc->nchars + 1);
	703	work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
	704	if (case_fold && (iswlower(wc) \|\| iswupper(wc)))
	705	{
	706	REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
	707	work_mbc->nchars + 1);
	708	work_mbc->chars[work_mbc->nchars++] =
	709	(wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
	710	}
	711	}
	712	}
	713	while ((wc = wc1) != L']');
	714	}
	715	#endif /* MBS_SUPPORT */
	716
	717	#ifdef __STDC__
	718	#define FUNC(F, P) static int F(int c) { return P(c); }
	719	#else
	720	#define FUNC(F, P) static int F(c) int c; { return P(c); }
	721	#endif
	722
	723	FUNC(is_alpha, ISALPHA)
	724	FUNC(is_upper, ISUPPER)
	725	FUNC(is_lower, ISLOWER)
	726	FUNC(is_digit, ISDIGIT)
	727	FUNC(is_xdigit, ISXDIGIT)
	728	FUNC(is_space, ISSPACE)
	729	FUNC(is_punct, ISPUNCT)
	730	FUNC(is_alnum, ISALNUM)
	731	FUNC(is_print, ISPRINT)
	732	FUNC(is_graph, ISGRAPH)
	733	FUNC(is_cntrl, ISCNTRL)
	734
	735	static int
	736	is_blank (int c)
	737	{
	738	return (c == ' ' \|\| c == '\t');
	739	}
	740
	741	/* The following list maps the names of the Posix named character classes
	742	to predicate functions that determine whether a given character is in
	743	the class. The leading [ has already been eaten by the lexical analyzer. */
	744	static struct {
	745	const char *name;
	746	int (*pred) PARAMS ((int));
	747	} const prednames[] = {
	748	{ ":alpha:]", is_alpha },
	749	{ ":upper:]", is_upper },
	750	{ ":lower:]", is_lower },
	751	{ ":digit:]", is_digit },
	752	{ ":xdigit:]", is_xdigit },
	753	{ ":space:]", is_space },
	754	{ ":punct:]", is_punct },
	755	{ ":alnum:]", is_alnum },
	756	{ ":print:]", is_print },
	757	{ ":graph:]", is_graph },
	758	{ ":cntrl:]", is_cntrl },
	759	{ ":blank:]", is_blank },
	760	{ 0 }
	761	};
	762
	763	/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
	764	#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) \|\| (C) == '_')
	765
	766	static int
	767	looking_at (char const *s)
	768	{
	769	size_t len;
	770
	771	len = strlen(s);
	772	if (lexleft < len)
	773	return 0;
	774	return strncmp(s, lexptr, len) == 0;
	775	}
	776
	777	static token
	778	lex (void)
	779	{
	780	unsigned c, c1, c2;
	781	int backslash = 0, invert;
	782	charclass ccl;
	783	int i;
	784
	785	/* Basic plan: We fetch a character. If it's a backslash,
	786	we set the backslash flag and go through the loop again.
	787	On the plus side, this avoids having a duplicate of the
	788	main switch inside the backslash case. On the minus side,
	789	it means that just about every case begins with
	790	"if (backslash) ...". */
	791	for (i = 0; i < 2; ++i)
	792	{
	793	FETCH(c, 0);
	794	#ifdef MBS_SUPPORT
	795	if (MB_CUR_MAX > 1 && cur_mb_index)
	796	/* If this is a part of a multi-byte character, we must treat
	797	this byte data as a normal character.
	798	e.g. In case of SJIS encoding, some character contains '\',
	799	but they must not be backslash. */
	800	goto normal_char;
	801	#endif /* MBS_SUPPORT */
	802	switch (c)
	803	{
	804	case '\\':
	805	if (backslash)
	806	goto normal_char;
	807	if (lexleft == 0)
	808	dfaerror(_("Unfinished \\ escape"));
	809	backslash = 1;
	810	break;
	811
	812	case '^':
	813	if (backslash)
	814	goto normal_char;
	815	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
	816	\|\| lasttok == END
	817	\|\| lasttok == LPAREN
	818	\|\| lasttok == OR)
	819	return lasttok = BEGLINE;
	820	goto normal_char;
	821
	822	case '$':
	823	if (backslash)
	824	goto normal_char;
	825	if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
	826	\|\| lexleft == 0
	827	\|\| (syntax_bits & RE_NO_BK_PARENS
	828	? lexleft > 0 && *lexptr == ')'
	829	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
	830	\|\| (syntax_bits & RE_NO_BK_VBAR
	831	? lexleft > 0 && *lexptr == '\|'
	832	: lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '\|')
	833	\|\| ((syntax_bits & RE_NEWLINE_ALT)
	834	&& lexleft > 0 && *lexptr == '\n'))
	835	return lasttok = ENDLINE;
	836	goto normal_char;
	837
	838	case '1':
	839	case '2':
	840	case '3':
	841	case '4':
	842	case '5':
	843	case '6':
	844	case '7':
	845	case '8':
	846	case '9':
	847	if (backslash && !(syntax_bits & RE_NO_BK_REFS))
	848	{
	849	laststart = 0;
	850	return lasttok = BACKREF;
	851	}
	852	goto normal_char;
	853
	854	case '`':
	855	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
	856	return lasttok = BEGLINE; /* FIXME: should be beginning of string */
	857	goto normal_char;
	858
	859	case '\'':
	860	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
	861	return lasttok = ENDLINE; /* FIXME: should be end of string */
	862	goto normal_char;
	863
	864	case '<':
	865	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
	866	return lasttok = BEGWORD;
	867	goto normal_char;
	868
	869	case '>':
	870	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
	871	return lasttok = ENDWORD;
	872	goto normal_char;
	873
	874	case 'b':
	875	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
	876	return lasttok = LIMWORD;
	877	goto normal_char;
	878
	879	case 'B':
	880	if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
	881	return lasttok = NOTLIMWORD;
	882	goto normal_char;
	883
	884	case '?':
	885	if (syntax_bits & RE_LIMITED_OPS)
	886	goto normal_char;
	887	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
	888	goto normal_char;
	889	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
	890	goto normal_char;
	891	return lasttok = QMARK;
	892
	893	case '*':
	894	if (backslash)
	895	goto normal_char;
	896	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
	897	goto normal_char;
	898	return lasttok = STAR;
	899
	900	case '+':
	901	if (syntax_bits & RE_LIMITED_OPS)
	902	goto normal_char;
	903	if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
	904	goto normal_char;
	905	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
	906	goto normal_char;
	907	return lasttok = PLUS;
	908
	909	case '{':
	910	if (!(syntax_bits & RE_INTERVALS))
	911	goto normal_char;
	912	if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
	913	goto normal_char;
	914	if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
	915	goto normal_char;
	916
	917	if (syntax_bits & RE_NO_BK_BRACES)
	918	{
	919	/* Scan ahead for a valid interval; if it's not valid,
	920	treat it as a literal '{'. */
	921	int lo = -1, hi = -1;
	922	char const *p = lexptr;
	923	char const *lim = p + lexleft;
	924	for (; p != lim && ISASCIIDIGIT (*p); p++)
	925	lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
	926	if (p != lim && *p == ',')
	927	while (++p != lim && ISASCIIDIGIT (*p))
	928	hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
	929	else
	930	hi = lo;
	931	if (p == lim \|\| *p != '}'
	932	\|\| lo < 0 \|\| RE_DUP_MAX < hi \|\| (0 <= hi && hi < lo))
	933	goto normal_char;
	934	}
	935
	936	minrep = 0;
	937	/* Cases:
	938	{M} - exact count
	939	{M,} - minimum count, maximum is infinity
	940	{M,N} - M through N */
	941	FETCH(c, _("unfinished repeat count"));
	942	if (ISASCIIDIGIT (c))
	943	{
	944	minrep = c - '0';
	945	for (;;)
	946	{
	947	FETCH(c, _("unfinished repeat count"));
	948	if (! ISASCIIDIGIT (c))
	949	break;
	950	minrep = 10 * minrep + c - '0';
	951	}
	952	}
	953	else
	954	dfaerror(_("malformed repeat count"));
	955	if (c == ',')
	956	{
	957	FETCH (c, _("unfinished repeat count"));
	958	if (! ISASCIIDIGIT (c))
	959	maxrep = -1;
	960	else
	961	{
	962	maxrep = c - '0';
	963	for (;;)
	964	{
	965	FETCH (c, _("unfinished repeat count"));
	966	if (! ISASCIIDIGIT (c))
	967	break;
	968	maxrep = 10 * maxrep + c - '0';
	969	}
	970	if (0 <= maxrep && maxrep < minrep)
	971	dfaerror (_("malformed repeat count"));
	972	}
	973	}
	974	else
	975	maxrep = minrep;
	976	if (!(syntax_bits & RE_NO_BK_BRACES))
	977	{
	978	if (c != '\\')
	979	dfaerror(_("malformed repeat count"));
	980	FETCH(c, _("unfinished repeat count"));
	981	}
	982	if (c != '}')
	983	dfaerror(_("malformed repeat count"));
	984	laststart = 0;
	985	return lasttok = REPMN;
	986
	987	case '\|':
	988	if (syntax_bits & RE_LIMITED_OPS)
	989	goto normal_char;
	990	if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
	991	goto normal_char;
	992	laststart = 1;
	993	return lasttok = OR;
	994
	995	case '\n':
	996	if (syntax_bits & RE_LIMITED_OPS
	997	\|\| backslash
	998	\|\| !(syntax_bits & RE_NEWLINE_ALT))
	999	goto normal_char;
	1000	laststart = 1;
	1001	return lasttok = OR;
	1002
	1003	case '(':
	1004	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
	1005	goto normal_char;
	1006	++parens;
	1007	laststart = 1;
	1008	return lasttok = LPAREN;
	1009
	1010	case ')':
	1011	if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
	1012	goto normal_char;
	1013	if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
	1014	goto normal_char;
	1015	--parens;
	1016	laststart = 0;
	1017	return lasttok = RPAREN;
	1018
	1019	case '.':
	1020	if (backslash)
	1021	goto normal_char;
	1022	#ifdef MBS_SUPPORT
	1023	if (MB_CUR_MAX > 1)
	1024	{
	1025	/* In multibyte environment period must match with a single
	1026	character not a byte. So we use ANYCHAR. */
	1027	laststart = 0;
	1028	return lasttok = ANYCHAR;
	1029	}
	1030	#endif /* MBS_SUPPORT */
	1031	zeroset(ccl);
	1032	notset(ccl);
	1033	if (!(syntax_bits & RE_DOT_NEWLINE))
	1034	clrbit(eolbyte, ccl);
	1035	if (syntax_bits & RE_DOT_NOT_NULL)
	1036	clrbit('\0', ccl);
	1037	laststart = 0;
	1038	return lasttok = CSET + charclass_index(ccl);
	1039
	1040	#ifndef GAWK
	1041	case 's':
	1042	case 'S':
	1043	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
	1044	goto normal_char;
	1045	zeroset(ccl);
	1046	for (c2 = 0; c2 < NOTCHAR; ++c2)
	1047	if (ISSPACE(c2))
	1048	setbit(c2, ccl);
	1049	if (c == 'S')
	1050	notset(ccl);
	1051	laststart = 0;
	1052	return lasttok = CSET + charclass_index(ccl);
	1053	#endif
	1054
	1055	case 'w':
	1056	case 'W':
	1057	if (!backslash \|\| (syntax_bits & RE_NO_GNU_OPS))
	1058	goto normal_char;
	1059	zeroset(ccl);
	1060	for (c2 = 0; c2 < NOTCHAR; ++c2)
	1061	if (IS_WORD_CONSTITUENT(c2))
	1062	setbit(c2, ccl);
	1063	if (c == 'W')
	1064	notset(ccl);
	1065	laststart = 0;
	1066	return lasttok = CSET + charclass_index(ccl);
	1067
	1068	case '[':
	1069	if (backslash)
	1070	goto normal_char;
	1071	laststart = 0;
	1072	#ifdef MBS_SUPPORT
	1073	if (MB_CUR_MAX > 1)
	1074	{
	1075	/* In multibyte environment a bracket expression may contain
	1076	multibyte characters, which must be treated as characters
	1077	(not bytes). So we parse it by parse_bracket_exp_mb(). */
	1078	parse_bracket_exp_mb();
	1079	return lasttok = MBCSET;
	1080	}
	1081	#endif
	1082	zeroset(ccl);
	1083	FETCH(c, _("Unbalanced ["));
	1084	if (c == '^')
	1085	{
	1086	FETCH(c, _("Unbalanced ["));
	1087	invert = 1;
	1088	}
	1089	else
	1090	invert = 0;
	1091	do
	1092	{
	1093	/* Nobody ever said this had to be fast. :-)
	1094	Note that if we're looking at some other [:...:]
	1095	construct, we just treat it as a bunch of ordinary
	1096	characters. We can do this because we assume
	1097	regex has checked for syntax errors before
	1098	dfa is ever called. */
	1099	if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
	1100	for (c1 = 0; prednames[c1].name; ++c1)
	1101	if (looking_at(prednames[c1].name))
	1102	{
	1103	int (*pred) PARAMS ((int)) = prednames[c1].pred;
	1104
	1105	for (c2 = 0; c2 < NOTCHAR; ++c2)
	1106	if ((*pred)(c2))
	1107	setbit_case_fold (c2, ccl);
	1108	lexptr += strlen(prednames[c1].name);
	1109	lexleft -= strlen(prednames[c1].name);
	1110	FETCH(c1, _("Unbalanced ["));
	1111	goto skip;
	1112	}
	1113	if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
	1114	FETCH(c, _("Unbalanced ["));
	1115	FETCH(c1, _("Unbalanced ["));
	1116	if (c1 == '-')
	1117	{
	1118	FETCH(c2, _("Unbalanced ["));
	1119	if (c2 == ']')
	1120	{
	1121	/* In the case [x-], the - is an ordinary hyphen,
	1122	which is left in c1, the lookahead character. */
	1123	--lexptr;
	1124	++lexleft;
	1125	}
	1126	else
	1127	{
	1128	if (c2 == '\\'
	1129	&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
	1130	FETCH(c2, _("Unbalanced ["));
	1131	FETCH(c1, _("Unbalanced ["));
	1132	if (!hard_LC_COLLATE) {
	1133	for (; c <= c2; c++)
	1134	setbit_case_fold (c, ccl);
	1135	} else {
	1136	/* POSIX locales are painful - leave the decision to libc */
	1137	regex_t re;
	1138	char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
	1139
	1140	expr[0] = '['; expr[1] = c; expr[2] = '-';
	1141	expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
	1142	if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
	1143	for (c = 0; c < NOTCHAR; ++c) {
	1144	regmatch_t mat;
	1145	char buf[2]; /* = { c, '\0' }; */
	1146
	1147	buf[0] = c; buf[1] = '\0';
	1148	if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
	1149	&& mat.rm_so == 0 && mat.rm_eo == 1)
	1150	setbit_case_fold (c, ccl);
	1151	}
	1152	regfree (&re);
	1153	}
	1154	}
	1155	continue;
	1156	}
	1157	}
	1158
	1159	setbit_case_fold (c, ccl);
	1160
	1161	skip:
	1162	;
	1163	}
	1164	while ((c = c1) != ']');
	1165	if (invert)
	1166	{
	1167	notset(ccl);
	1168	if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
	1169	clrbit(eolbyte, ccl);
	1170	}
	1171	return lasttok = CSET + charclass_index(ccl);
	1172
	1173	default:
	1174	normal_char:
	1175	laststart = 0;
	1176	if (case_fold && ISALPHA(c))
	1177	{
	1178	zeroset(ccl);
	1179	setbit_case_fold (c, ccl);
	1180	return lasttok = CSET + charclass_index(ccl);
	1181	}
	1182	return lasttok = c;
	1183	}
	1184	}
	1185
	1186	/* The above loop should consume at most a backslash
	1187	and some other character. */
	1188	abort();
	1189	return END; /* keeps pedantic compilers happy. */
	1190	}
	1191
	1192	/* Recursive descent parser for regular expressions. */
	1193
	1194	static token tok; /* Lookahead token. */
	1195	static int depth; /* Current depth of a hypothetical stack
	1196	holding deferred productions. This is
	1197	used to determine the depth that will be
	1198	required of the real stack later on in
	1199	dfaanalyze(). */
	1200
	1201	/* Add the given token to the parse tree, maintaining the depth count and
	1202	updating the maximum depth if necessary. */
	1203	static void
	1204	addtok (token t)
	1205	{
	1206	#ifdef MBS_SUPPORT
	1207	if (MB_CUR_MAX > 1)
	1208	{
	1209	REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
	1210	dfa->tindex);
	1211	/* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
	1212	if (t == MBCSET)
	1213	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
	1214	else if (t < NOTCHAR)
	1215	dfa->multibyte_prop[dfa->tindex]
	1216	= (cur_mb_len == 1)? 3 /* single-byte char */
	1217	: (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
	1218	+ ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
	1219	else
	1220	/* It may be unnecessary, but it is safer to treat other
	1221	symbols as single byte characters. */
	1222	dfa->multibyte_prop[dfa->tindex] = 3;
	1223	}
	1224	#endif
	1225
	1226	REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
	1227	dfa->tokens[dfa->tindex++] = t;
	1228
	1229	switch (t)
	1230	{
	1231	case QMARK:
	1232	case STAR:
	1233	case PLUS:
	1234	break;
	1235
	1236	case CAT:
	1237	case OR:
	1238	case ORTOP:
	1239	--depth;
	1240	break;
	1241
	1242	default:
	1243	++dfa->nleaves;
	1244	case EMPTY:
	1245	++depth;
	1246	break;
	1247	}
	1248	if (depth > dfa->depth)
	1249	dfa->depth = depth;
	1250	}
	1251
	1252	/* The grammar understood by the parser is as follows.
	1253
	1254	regexp:
	1255	regexp OR branch
	1256	branch
	1257
	1258	branch:
	1259	branch closure
	1260	closure
	1261
	1262	closure:
	1263	closure QMARK
	1264	closure STAR
	1265	closure PLUS
	1266	closure REPMN
	1267	atom
	1268
	1269	atom:
	1270	<normal character>
	1271	<multibyte character>
	1272	ANYCHAR
	1273	MBCSET
	1274	CSET
	1275	BACKREF
	1276	BEGLINE
	1277	ENDLINE
	1278	BEGWORD
	1279	ENDWORD
	1280	LIMWORD
	1281	NOTLIMWORD
	1282	CRANGE
	1283	LPAREN regexp RPAREN
	1284	<empty>
	1285
	1286	The parser builds a parse tree in postfix form in an array of tokens. */
	1287
	1288	static void
	1289	atom (void)
	1290	{
	1291	if ((tok >= 0 && tok < NOTCHAR) \|\| tok >= CSET \|\| tok == BACKREF
	1292	\|\| tok == BEGLINE \|\| tok == ENDLINE \|\| tok == BEGWORD
	1293	#ifdef MBS_SUPPORT
	1294	\|\| tok == ANYCHAR \|\| tok == MBCSET /* MB_CUR_MAX > 1 */
	1295	#endif /* MBS_SUPPORT */
	1296	\|\| tok == ENDWORD \|\| tok == LIMWORD \|\| tok == NOTLIMWORD)
	1297	{
	1298	addtok(tok);
	1299	tok = lex();
	1300	#ifdef MBS_SUPPORT
	1301	/* We treat a multibyte character as a single atom, so that DFA
	1302	can treat a multibyte character as a single expression.
	1303
	1304	e.g. We construct following tree from "<mb1><mb2>".
	1305	<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
	1306	<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
	1307	*/
	1308	if (MB_CUR_MAX > 1)
	1309	{
	1310	while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
	1311	{
	1312	addtok(tok);
	1313	addtok(CAT);
	1314	tok = lex();
	1315	}
	1316	}
	1317	#endif /* MBS_SUPPORT */
	1318	}
	1319	else if (tok == CRANGE)
	1320	{
	1321	/* A character range like "[a-z]" in a locale other than "C" or
	1322	"POSIX". This range might any sequence of one or more
	1323	characters. Unfortunately the POSIX locale primitives give
	1324	us no practical way to find what character sequences might be
	1325	matched. Treat this approximately like "(.\1)" -- i.e. match
	1326	one character, and then punt to the full matcher. */
	1327	charclass ccl;
	1328	zeroset (ccl);
	1329	notset (ccl);
	1330	addtok (CSET + charclass_index (ccl));
	1331	addtok (BACKREF);
	1332	addtok (CAT);
	1333	tok = lex ();
	1334	}
	1335	else if (tok == LPAREN)
	1336	{
	1337	tok = lex();
	1338	regexp(0);
	1339	if (tok != RPAREN)
	1340	dfaerror(_("Unbalanced ("));
	1341	tok = lex();
	1342	}
	1343	else
	1344	addtok(EMPTY);
	1345	}
	1346
	1347	/* Return the number of tokens in the given subexpression. */
	1348	static int
	1349	nsubtoks (int tindex)
	1350	{
	1351	int ntoks1;
	1352
	1353	switch (dfa->tokens[tindex - 1])
	1354	{
	1355	default:
	1356	return 1;
	1357	case QMARK:
	1358	case STAR:
	1359	case PLUS:
	1360	return 1 + nsubtoks(tindex - 1);
	1361	case CAT:
	1362	case OR:
	1363	case ORTOP:
	1364	ntoks1 = nsubtoks(tindex - 1);
	1365	return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
	1366	}
	1367	}
	1368
	1369	/* Copy the given subexpression to the top of the tree. */
	1370	static void
	1371	copytoks (int tindex, int ntokens)
	1372	{
	1373	int i;
	1374
	1375	for (i = 0; i < ntokens; ++i)
	1376	addtok(dfa->tokens[tindex + i]);
	1377	}
	1378
	1379	static void
	1380	closure (void)
	1381	{
	1382	int tindex, ntokens, i;
	1383
	1384	atom();
	1385	while (tok == QMARK \|\| tok == STAR \|\| tok == PLUS \|\| tok == REPMN)
	1386	if (tok == REPMN)
	1387	{
	1388	ntokens = nsubtoks(dfa->tindex);
	1389	tindex = dfa->tindex - ntokens;
	1390	if (maxrep < 0)
	1391	addtok(PLUS);
	1392	if (minrep == 0)
	1393	addtok(QMARK);
	1394	for (i = 1; i < minrep; ++i)
	1395	{
	1396	copytoks(tindex, ntokens);
	1397	addtok(CAT);
	1398	}
	1399	for (; i < maxrep; ++i)
	1400	{
	1401	copytoks(tindex, ntokens);
	1402	addtok(QMARK);
	1403	addtok(CAT);
	1404	}
	1405	tok = lex();
	1406	}
	1407	else
	1408	{
	1409	addtok(tok);
	1410	tok = lex();
	1411	}
	1412	}
	1413
	1414	static void
	1415	branch (void)
	1416	{
	1417	closure();
	1418	while (tok != RPAREN && tok != OR && tok >= 0)
	1419	{
	1420	closure();
	1421	addtok(CAT);
	1422	}
	1423	}
	1424
	1425	static void
	1426	regexp (int toplevel)
	1427	{
	1428	branch();
	1429	while (tok == OR)
	1430	{
	1431	tok = lex();
	1432	branch();
	1433	if (toplevel)
	1434	addtok(ORTOP);
	1435	else
	1436	addtok(OR);
	1437	}
	1438	}
	1439
	1440	/* Main entry point for the parser. S is a string to be parsed, len is the
	1441	length of the string, so s can include NUL characters. D is a pointer to
	1442	the struct dfa to parse into. */
	1443	void
	1444	dfaparse (char const s, size_t len, struct dfa d)
	1445	{
	1446	dfa = d;
	1447	lexptr = s;
	1448	lexleft = len;
	1449	lasttok = END;
	1450	laststart = 1;
	1451	parens = 0;
	1452	#ifdef LC_COLLATE
	1453	hard_LC_COLLATE = hard_locale (LC_COLLATE);
	1454	#endif
	1455	#ifdef MBS_SUPPORT
	1456	if (MB_CUR_MAX > 1)
	1457	{
	1458	cur_mb_index = 0;
	1459	cur_mb_len = 0;
	1460	memset(&mbs, 0, sizeof(mbstate_t));
	1461	}
	1462	#endif /* MBS_SUPPORT */
	1463
	1464	if (! syntax_bits_set)
	1465	dfaerror(_("No syntax specified"));
	1466
	1467	tok = lex();
	1468	depth = d->depth;
	1469
	1470	regexp(1);
	1471
	1472	if (tok != END)
	1473	dfaerror(_("Unbalanced )"));
	1474
	1475	addtok(END - d->nregexps);
	1476	addtok(CAT);
	1477
	1478	if (d->nregexps)
	1479	addtok(ORTOP);
	1480
	1481	++d->nregexps;
	1482	}
	1483
	1484	/* Some primitives for operating on sets of positions. */
	1485
	1486	/* Copy one set to another; the destination must be large enough. */
	1487	static void
	1488	copy (position_set const src, position_set dst)
	1489	{
	1490	int i;
	1491
	1492	for (i = 0; i < src->nelem; ++i)
	1493	dst->elems[i] = src->elems[i];
	1494	dst->nelem = src->nelem;
	1495	}
	1496
	1497	/* Insert a position in a set. Position sets are maintained in sorted
	1498	order according to index. If position already exists in the set with
	1499	the same index then their constraints are logically or'd together.
	1500	S->elems must point to an array large enough to hold the resulting set. */
	1501	static void
	1502	insert (position p, position_set *s)
	1503	{
	1504	int i;
	1505	position t1, t2;
	1506
	1507	for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
	1508	continue;
	1509	if (i < s->nelem && p.index == s->elems[i].index)
	1510	s->elems[i].constraint \|= p.constraint;
	1511	else
	1512	{
	1513	t1 = p;
	1514	++s->nelem;
	1515	while (i < s->nelem)
	1516	{
	1517	t2 = s->elems[i];
	1518	s->elems[i++] = t1;
	1519	t1 = t2;
	1520	}
	1521	}
	1522	}
	1523
	1524	/* Merge two sets of positions into a third. The result is exactly as if
	1525	the positions of both sets were inserted into an initially empty set. */
	1526	static void
	1527	merge (position_set const s1, position_set const s2, position_set *m)
	1528	{
	1529	int i = 0, j = 0;
	1530
	1531	m->nelem = 0;
	1532	while (i < s1->nelem && j < s2->nelem)
	1533	if (s1->elems[i].index > s2->elems[j].index)
	1534	m->elems[m->nelem++] = s1->elems[i++];
	1535	else if (s1->elems[i].index < s2->elems[j].index)
	1536	m->elems[m->nelem++] = s2->elems[j++];
	1537	else
	1538	{
	1539	m->elems[m->nelem] = s1->elems[i++];
	1540	m->elems[m->nelem++].constraint \|= s2->elems[j++].constraint;
	1541	}
	1542	while (i < s1->nelem)
	1543	m->elems[m->nelem++] = s1->elems[i++];
	1544	while (j < s2->nelem)
	1545	m->elems[m->nelem++] = s2->elems[j++];
	1546	}
	1547
	1548	/* Delete a position from a set. */
	1549	static void
	1550	delete (position p, position_set *s)
	1551	{
	1552	int i;
	1553
	1554	for (i = 0; i < s->nelem; ++i)
	1555	if (p.index == s->elems[i].index)
	1556	break;
	1557	if (i < s->nelem)
	1558	for (--s->nelem; i < s->nelem; ++i)
	1559	s->elems[i] = s->elems[i + 1];
	1560	}
	1561
	1562	/* Find the index of the state corresponding to the given position set with
	1563	the given preceding context, or create a new state if there is no such
	1564	state. Newline and letter tell whether we got here on a newline or
	1565	letter, respectively. */
	1566	static int
	1567	state_index (struct dfa d, position_set const s, int newline, int letter)
	1568	{
	1569	int hash = 0;
	1570	int constraint;
	1571	int i, j;
	1572
	1573	newline = newline ? 1 : 0;
	1574	letter = letter ? 1 : 0;
	1575
	1576	for (i = 0; i < s->nelem; ++i)
	1577	hash ^= s->elems[i].index + s->elems[i].constraint;
	1578
	1579	/* Try to find a state that exactly matches the proposed one. */
	1580	for (i = 0; i < d->sindex; ++i)
	1581	{
	1582	if (hash != d->states[i].hash \|\| s->nelem != d->states[i].elems.nelem
	1583	\|\| newline != d->states[i].newline \|\| letter != d->states[i].letter)
	1584	continue;
	1585	for (j = 0; j < s->nelem; ++j)
	1586	if (s->elems[j].constraint
	1587	!= d->states[i].elems.elems[j].constraint
	1588	\|\| s->elems[j].index != d->states[i].elems.elems[j].index)
	1589	break;
	1590	if (j == s->nelem)
	1591	return i;
	1592	}
	1593
	1594	/* We'll have to create a new state. */
	1595	REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
	1596	d->states[i].hash = hash;
	1597	MALLOC(d->states[i].elems.elems, position, s->nelem);
	1598	copy(s, &d->states[i].elems);
	1599	d->states[i].newline = newline;
	1600	d->states[i].letter = letter;
	1601	d->states[i].backref = 0;
	1602	d->states[i].constraint = 0;
	1603	d->states[i].first_end = 0;
	1604	#ifdef MBS_SUPPORT
	1605	if (MB_CUR_MAX > 1)
	1606	d->states[i].mbps.nelem = 0;
	1607	#endif
	1608	for (j = 0; j < s->nelem; ++j)
	1609	if (d->tokens[s->elems[j].index] < 0)
	1610	{
	1611	constraint = s->elems[j].constraint;
	1612	if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
	1613	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
	1614	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
	1615	\|\| SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
	1616	d->states[i].constraint \|= constraint;
	1617	if (! d->states[i].first_end)
	1618	d->states[i].first_end = d->tokens[s->elems[j].index];
	1619	}
	1620	else if (d->tokens[s->elems[j].index] == BACKREF)
	1621	{
	1622	d->states[i].constraint = NO_CONSTRAINT;
	1623	d->states[i].backref = 1;
	1624	}
	1625
	1626	++d->sindex;
	1627
	1628	return i;
	1629	}
	1630
	1631	/* Find the epsilon closure of a set of positions. If any position of the set
	1632	contains a symbol that matches the empty string in some context, replace
	1633	that position with the elements of its follow labeled with an appropriate
	1634	constraint. Repeat exhaustively until no funny positions are left.
	1635	S->elems must be large enough to hold the result. */
	1636	static void
	1637	epsclosure (position_set s, struct dfa const d)
	1638	{
	1639	int i, j;
	1640	int *visited;
	1641	position p, old;
	1642
	1643	MALLOC(visited, int, d->tindex);
	1644	for (i = 0; i < d->tindex; ++i)
	1645	visited[i] = 0;
	1646
	1647	for (i = 0; i < s->nelem; ++i)
	1648	if (d->tokens[s->elems[i].index] >= NOTCHAR
	1649	&& d->tokens[s->elems[i].index] != BACKREF
	1650	#ifdef MBS_SUPPORT
	1651	&& d->tokens[s->elems[i].index] != ANYCHAR
	1652	&& d->tokens[s->elems[i].index] != MBCSET
	1653	#endif
	1654	&& d->tokens[s->elems[i].index] < CSET)
	1655	{
	1656	old = s->elems[i];
	1657	p.constraint = old.constraint;
	1658	delete(s->elems[i], s);
	1659	if (visited[old.index])
	1660	{
	1661	--i;
	1662	continue;
	1663	}
	1664	visited[old.index] = 1;
	1665	switch (d->tokens[old.index])
	1666	{
	1667	case BEGLINE:
	1668	p.constraint &= BEGLINE_CONSTRAINT;
	1669	break;
	1670	case ENDLINE:
	1671	p.constraint &= ENDLINE_CONSTRAINT;
	1672	break;
	1673	case BEGWORD:
	1674	p.constraint &= BEGWORD_CONSTRAINT;
	1675	break;
	1676	case ENDWORD:
	1677	p.constraint &= ENDWORD_CONSTRAINT;
	1678	break;
	1679	case LIMWORD:
	1680	p.constraint &= LIMWORD_CONSTRAINT;
	1681	break;
	1682	case NOTLIMWORD:
	1683	p.constraint &= NOTLIMWORD_CONSTRAINT;
	1684	break;
	1685	default:
	1686	break;
	1687	}
	1688	for (j = 0; j < d->follows[old.index].nelem; ++j)
	1689	{
	1690	p.index = d->follows[old.index].elems[j].index;
	1691	insert(p, s);
	1692	}
	1693	/* Force rescan to start at the beginning. */
	1694	i = -1;
	1695	}
	1696
	1697	free(visited);
	1698	}
	1699
	1700	/* Perform bottom-up analysis on the parse tree, computing various functions.
	1701	Note that at this point, we're pretending constructs like \< are real
	1702	characters rather than constraints on what can follow them.
	1703
	1704	Nullable: A node is nullable if it is at the root of a regexp that can
	1705	match the empty string.
	1706	* EMPTY leaves are nullable.
	1707	* No other leaf is nullable.
	1708	* A QMARK or STAR node is nullable.
	1709	* A PLUS node is nullable if its argument is nullable.
	1710	* A CAT node is nullable if both its arguments are nullable.
	1711	* An OR node is nullable if either argument is nullable.
	1712
	1713	Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
	1714	that could correspond to the first character of a string matching the
	1715	regexp rooted at the given node.
	1716	* EMPTY leaves have empty firstpos.
	1717	* The firstpos of a nonempty leaf is that leaf itself.
	1718	* The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
	1719	argument.
	1720	* The firstpos of a CAT node is the firstpos of the left argument, union
	1721	the firstpos of the right if the left argument is nullable.
	1722	* The firstpos of an OR node is the union of firstpos of each argument.
	1723
	1724	Lastpos: The lastpos of a node is the set of positions that could
	1725	correspond to the last character of a string matching the regexp at
	1726	the given node.
	1727	* EMPTY leaves have empty lastpos.
	1728	* The lastpos of a nonempty leaf is that leaf itself.
	1729	* The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
	1730	argument.
	1731	* The lastpos of a CAT node is the lastpos of its right argument, union
	1732	the lastpos of the left if the right argument is nullable.
	1733	* The lastpos of an OR node is the union of the lastpos of each argument.
	1734
	1735	Follow: The follow of a position is the set of positions that could
	1736	correspond to the character following a character matching the node in
	1737	a string matching the regexp. At this point we consider special symbols
	1738	that match the empty string in some context to be just normal characters.
	1739	Later, if we find that a special symbol is in a follow set, we will
	1740	replace it with the elements of its follow, labeled with an appropriate
	1741	constraint.
	1742	* Every node in the firstpos of the argument of a STAR or PLUS node is in
	1743	the follow of every node in the lastpos.
	1744	* Every node in the firstpos of the second argument of a CAT node is in
	1745	the follow of every node in the lastpos of the first argument.
	1746
	1747	Because of the postfix representation of the parse tree, the depth-first
	1748	analysis is conveniently done by a linear scan with the aid of a stack.
	1749	Sets are stored as arrays of the elements, obeying a stack-like allocation
	1750	scheme; the number of elements in each set deeper in the stack can be
	1751	used to determine the address of a particular set's array. */
	1752	void
	1753	dfaanalyze (struct dfa *d, int searchflag)
	1754	{
	1755	int nullable; / Nullable stack. */
	1756	int nfirstpos; / Element count stack for firstpos sets. */
	1757	position firstpos; / Array where firstpos elements are stored. */
	1758	int nlastpos; / Element count stack for lastpos sets. */
	1759	position lastpos; / Array where lastpos elements are stored. */
	1760	int nalloc; / Sizes of arrays allocated to follow sets. */
	1761	position_set tmp; /* Temporary set for merging sets. */
	1762	position_set merged; /* Result of merging sets. */
	1763	int wants_newline; /* True if some position wants newline info. */
	1764	int *o_nullable;
	1765	int o_nfirst, o_nlast;
	1766	position o_firstpos, o_lastpos;
	1767	int i, j;
	1768	position *pos;
	1769
	1770	#ifdef DEBUG
	1771	fprintf(stderr, "dfaanalyze:\n");
	1772	for (i = 0; i < d->tindex; ++i)
	1773	{
	1774	fprintf(stderr, " %d:", i);
	1775	prtok(d->tokens[i]);
	1776	}
	1777	putc('\n', stderr);
	1778	#endif
	1779
	1780	d->searchflag = searchflag;
	1781
	1782	MALLOC(nullable, int, d->depth);
	1783	o_nullable = nullable;
	1784	MALLOC(nfirstpos, int, d->depth);
	1785	o_nfirst = nfirstpos;
	1786	MALLOC(firstpos, position, d->nleaves);
	1787	o_firstpos = firstpos, firstpos += d->nleaves;
	1788	MALLOC(nlastpos, int, d->depth);
	1789	o_nlast = nlastpos;
	1790	MALLOC(lastpos, position, d->nleaves);
	1791	o_lastpos = lastpos, lastpos += d->nleaves;
	1792	MALLOC(nalloc, int, d->tindex);
	1793	for (i = 0; i < d->tindex; ++i)
	1794	nalloc[i] = 0;
	1795	MALLOC(merged.elems, position, d->nleaves);
	1796
	1797	CALLOC(d->follows, position_set, d->tindex);
	1798
	1799	for (i = 0; i < d->tindex; ++i)
	1800	#ifdef DEBUG
	1801	{ /* Nonsyntactic #ifdef goo... */
	1802	#endif
	1803	switch (d->tokens[i])
	1804	{
	1805	case EMPTY:
	1806	/* The empty set is nullable. */
	1807	*nullable++ = 1;
	1808
	1809	/* The firstpos and lastpos of the empty leaf are both empty. */
	1810	nfirstpos++ = nlastpos++ = 0;
	1811	break;
	1812
	1813	case STAR:
	1814	case PLUS:
	1815	/* Every element in the firstpos of the argument is in the follow
	1816	of every element in the lastpos. */
	1817	tmp.nelem = nfirstpos[-1];
	1818	tmp.elems = firstpos;
	1819	pos = lastpos;
	1820	for (j = 0; j < nlastpos[-1]; ++j)
	1821	{
	1822	merge(&tmp, &d->follows[pos[j].index], &merged);
	1823	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
	1824	nalloc[pos[j].index], merged.nelem - 1);
	1825	copy(&merged, &d->follows[pos[j].index]);
	1826	}
	1827
	1828	case QMARK:
	1829	/* A QMARK or STAR node is automatically nullable. */
	1830	if (d->tokens[i] != PLUS)
	1831	nullable[-1] = 1;
	1832	break;
	1833
	1834	case CAT:
	1835	/* Every element in the firstpos of the second argument is in the
	1836	follow of every element in the lastpos of the first argument. */
	1837	tmp.nelem = nfirstpos[-1];
	1838	tmp.elems = firstpos;
	1839	pos = lastpos + nlastpos[-1];
	1840	for (j = 0; j < nlastpos[-2]; ++j)
	1841	{
	1842	merge(&tmp, &d->follows[pos[j].index], &merged);
	1843	REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
	1844	nalloc[pos[j].index], merged.nelem - 1);
	1845	copy(&merged, &d->follows[pos[j].index]);
	1846	}
	1847
	1848	/* The firstpos of a CAT node is the firstpos of the first argument,
	1849	union that of the second argument if the first is nullable. */
	1850	if (nullable[-2])
	1851	nfirstpos[-2] += nfirstpos[-1];
	1852	else
	1853	firstpos += nfirstpos[-1];
	1854	--nfirstpos;
	1855
	1856	/* The lastpos of a CAT node is the lastpos of the second argument,
	1857	union that of the first argument if the second is nullable. */
	1858	if (nullable[-1])
	1859	nlastpos[-2] += nlastpos[-1];
	1860	else
	1861	{
	1862	pos = lastpos + nlastpos[-2];
	1863	for (j = nlastpos[-1] - 1; j >= 0; --j)
	1864	pos[j] = lastpos[j];
	1865	lastpos += nlastpos[-2];
	1866	nlastpos[-2] = nlastpos[-1];
	1867	}
	1868	--nlastpos;
	1869
	1870	/* A CAT node is nullable if both arguments are nullable. */
	1871	nullable[-2] = nullable[-1] && nullable[-2];
	1872	--nullable;
	1873	break;
	1874
	1875	case OR:
	1876	case ORTOP:
	1877	/* The firstpos is the union of the firstpos of each argument. */
	1878	nfirstpos[-2] += nfirstpos[-1];
	1879	--nfirstpos;
	1880
	1881	/* The lastpos is the union of the lastpos of each argument. */
	1882	nlastpos[-2] += nlastpos[-1];
	1883	--nlastpos;
	1884
	1885	/* An OR node is nullable if either argument is nullable. */
	1886	nullable[-2] = nullable[-1] \|\| nullable[-2];
	1887	--nullable;
	1888	break;
	1889
	1890	default:
	1891	/* Anything else is a nonempty position. (Note that special
	1892	constructs like \< are treated as nonempty strings here;
	1893	an "epsilon closure" effectively makes them nullable later.
	1894	Backreferences have to get a real position so we can detect
	1895	transitions on them later. But they are nullable. */
	1896	*nullable++ = d->tokens[i] == BACKREF;
	1897
	1898	/* This position is in its own firstpos and lastpos. */
	1899	nfirstpos++ = nlastpos++ = 1;
	1900	--firstpos, --lastpos;
	1901	firstpos->index = lastpos->index = i;
	1902	firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
	1903
	1904	/* Allocate the follow set for this position. */
	1905	nalloc[i] = 1;
	1906	MALLOC(d->follows[i].elems, position, nalloc[i]);
	1907	break;
	1908	}
	1909	#ifdef DEBUG
	1910	/* ... balance the above nonsyntactic #ifdef goo... */
	1911	fprintf(stderr, "node %d:", i);
	1912	prtok(d->tokens[i]);
	1913	putc('\n', stderr);
	1914	fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
	1915	fprintf(stderr, " firstpos:");
	1916	for (j = nfirstpos[-1] - 1; j >= 0; --j)
	1917	{
	1918	fprintf(stderr, " %d:", firstpos[j].index);
	1919	prtok(d->tokens[firstpos[j].index]);
	1920	}
	1921	fprintf(stderr, "\n lastpos:");
	1922	for (j = nlastpos[-1] - 1; j >= 0; --j)
	1923	{
	1924	fprintf(stderr, " %d:", lastpos[j].index);
	1925	prtok(d->tokens[lastpos[j].index]);
	1926	}
	1927	putc('\n', stderr);
	1928	}
	1929	#endif
	1930
	1931	/* For each follow set that is the follow set of a real position, replace
	1932	it with its epsilon closure. */
	1933	for (i = 0; i < d->tindex; ++i)
	1934	if (d->tokens[i] < NOTCHAR \|\| d->tokens[i] == BACKREF
	1935	#ifdef MBS_SUPPORT
	1936	\|\| d->tokens[i] == ANYCHAR
	1937	\|\| d->tokens[i] == MBCSET
	1938	#endif
	1939	\|\| d->tokens[i] >= CSET)
	1940	{
	1941	#ifdef DEBUG
	1942	fprintf(stderr, "follows(%d:", i);
	1943	prtok(d->tokens[i]);
	1944	fprintf(stderr, "):");
	1945	for (j = d->follows[i].nelem - 1; j >= 0; --j)
	1946	{
	1947	fprintf(stderr, " %d:", d->follows[i].elems[j].index);
	1948	prtok(d->tokens[d->follows[i].elems[j].index]);
	1949	}
	1950	putc('\n', stderr);
	1951	#endif
	1952	copy(&d->follows[i], &merged);
	1953	epsclosure(&merged, d);
	1954	if (d->follows[i].nelem < merged.nelem)
	1955	REALLOC(d->follows[i].elems, position, merged.nelem);
	1956	copy(&merged, &d->follows[i]);
	1957	}
	1958
	1959	/* Get the epsilon closure of the firstpos of the regexp. The result will
	1960	be the set of positions of state 0. */
	1961	merged.nelem = 0;
	1962	for (i = 0; i < nfirstpos[-1]; ++i)
	1963	insert(firstpos[i], &merged);
	1964	epsclosure(&merged, d);
	1965
	1966	/* Check if any of the positions of state 0 will want newline context. */
	1967	wants_newline = 0;
	1968	for (i = 0; i < merged.nelem; ++i)
	1969	if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
	1970	wants_newline = 1;
	1971
	1972	/* Build the initial state. */
	1973	d->salloc = 1;
	1974	d->sindex = 0;
	1975	MALLOC(d->states, dfa_state, d->salloc);
	1976	state_index(d, &merged, wants_newline, 0);
	1977
	1978	free(o_nullable);
	1979	free(o_nfirst);
	1980	free(o_firstpos);
	1981	free(o_nlast);
	1982	free(o_lastpos);
	1983	free(nalloc);
	1984	free(merged.elems);
	1985	}
	1986
	1987	/* Find, for each character, the transition out of state s of d, and store
	1988	it in the appropriate slot of trans.
	1989
	1990	We divide the positions of s into groups (positions can appear in more
	1991	than one group). Each group is labeled with a set of characters that
	1992	every position in the group matches (taking into account, if necessary,
	1993	preceding context information of s). For each group, find the union
	1994	of the its elements' follows. This set is the set of positions of the
	1995	new state. For each character in the group's label, set the transition
	1996	on this character to be to a state corresponding to the set's positions,
	1997	and its associated backward context information, if necessary.
	1998
	1999	If we are building a searching matcher, we include the positions of state
	2000	0 in every state.
	2001
	2002	The collection of groups is constructed by building an equivalence-class
	2003	partition of the positions of s.
	2004
	2005	For each position, find the set of characters C that it matches. Eliminate
	2006	any characters from C that fail on grounds of backward context.
	2007
	2008	Search through the groups, looking for a group whose label L has nonempty
	2009	intersection with C. If L - C is nonempty, create a new group labeled
	2010	L - C and having the same positions as the current group, and set L to
	2011	the intersection of L and C. Insert the position in this group, set
	2012	C = C - L, and resume scanning.
	2013
	2014	If after comparing with every group there are characters remaining in C,
	2015	create a new group labeled with the characters of C and insert this
	2016	position in that group. */
	2017	void
	2018	dfastate (int s, struct dfa *d, int trans[])
	2019	{
	2020	position_set grps[NOTCHAR]; /* As many as will ever be needed. */
	2021	charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
	2022	int ngrps = 0; /* Number of groups actually used. */
	2023	position pos; /* Current position being considered. */
	2024	charclass matches; /* Set of matching characters. */
	2025	int matchesf; /* True if matches is nonempty. */
	2026	charclass intersect; /* Intersection with some label set. */
	2027	int intersectf; /* True if intersect is nonempty. */
	2028	charclass leftovers; /* Stuff in the label that didn't match. */
	2029	int leftoversf; /* True if leftovers is nonempty. */
	2030	static charclass letters; /* Set of characters considered letters. */
	2031	static charclass newline; /* Set of characters that aren't newline. */
	2032	position_set follows; /* Union of the follows of some group. */
	2033	position_set tmp; /* Temporary space for merging sets. */
	2034	int state; /* New state. */
	2035	int wants_newline; /* New state wants to know newline context. */
	2036	int state_newline; /* New state on a newline transition. */
	2037	int wants_letter; /* New state wants to know letter context. */
	2038	int state_letter; /* New state on a letter transition. */
	2039	static int initialized; /* Flag for static initialization. */
	2040	#ifdef MBS_SUPPORT
	2041	int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
	2042	#endif
	2043	int i, j, k;
	2044
	2045	/* Initialize the set of letters, if necessary. */
	2046	if (! initialized)
	2047	{
	2048	initialized = 1;
	2049	for (i = 0; i < NOTCHAR; ++i)
	2050	if (IS_WORD_CONSTITUENT(i))
	2051	setbit(i, letters);
	2052	setbit(eolbyte, newline);
	2053	}
	2054
	2055	zeroset(matches);
	2056
	2057	for (i = 0; i < d->states[s].elems.nelem; ++i)
	2058	{
	2059	pos = d->states[s].elems.elems[i];
	2060	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
	2061	setbit(d->tokens[pos.index], matches);
	2062	else if (d->tokens[pos.index] >= CSET)
	2063	copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
	2064	#ifdef MBS_SUPPORT
	2065	else if (d->tokens[pos.index] == ANYCHAR
	2066	\|\| d->tokens[pos.index] == MBCSET)
	2067	/* MB_CUR_MAX > 1 */
	2068	{
	2069	/* ANYCHAR and MBCSET must match with a single character, so we
	2070	must put it to d->states[s].mbps, which contains the positions
	2071	which can match with a single character not a byte. */
	2072	if (d->states[s].mbps.nelem == 0)
	2073	{
	2074	MALLOC(d->states[s].mbps.elems, position,
	2075	d->states[s].elems.nelem);
	2076	}
	2077	insert(pos, &(d->states[s].mbps));
	2078	continue;
	2079	}
	2080	#endif /* MBS_SUPPORT */
	2081	else
	2082	continue;
	2083
	2084	/* Some characters may need to be eliminated from matches because
	2085	they fail in the current context. */
	2086	if (pos.constraint != 0xFF)
	2087	{
	2088	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
	2089	d->states[s].newline, 1))
	2090	clrbit(eolbyte, matches);
	2091	if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
	2092	d->states[s].newline, 0))
	2093	for (j = 0; j < CHARCLASS_INTS; ++j)
	2094	matches[j] &= newline[j];
	2095	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
	2096	d->states[s].letter, 1))
	2097	for (j = 0; j < CHARCLASS_INTS; ++j)
	2098	matches[j] &= ~letters[j];
	2099	if (! MATCHES_LETTER_CONTEXT(pos.constraint,
	2100	d->states[s].letter, 0))
	2101	for (j = 0; j < CHARCLASS_INTS; ++j)
	2102	matches[j] &= letters[j];
	2103
	2104	/* If there are no characters left, there's no point in going on. */
	2105	for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
	2106	continue;
	2107	if (j == CHARCLASS_INTS)
	2108	continue;
	2109	}
	2110
	2111	for (j = 0; j < ngrps; ++j)
	2112	{
	2113	/* If matches contains a single character only, and the current
	2114	group's label doesn't contain that character, go on to the
	2115	next group. */
	2116	if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
	2117	&& !tstbit(d->tokens[pos.index], labels[j]))
	2118	continue;
	2119
	2120	/* Check if this group's label has a nonempty intersection with
	2121	matches. */
	2122	intersectf = 0;
	2123	for (k = 0; k < CHARCLASS_INTS; ++k)
	2124	(intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
	2125	if (! intersectf)
	2126	continue;
	2127
	2128	/* It does; now find the set differences both ways. */
	2129	leftoversf = matchesf = 0;
	2130	for (k = 0; k < CHARCLASS_INTS; ++k)
	2131	{
	2132	/* Even an optimizing compiler can't know this for sure. */
	2133	int match = matches[k], label = labels[j][k];
	2134
	2135	(leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
	2136	(matches[k] = match & ~label) ? (matchesf = 1) : 0;
	2137	}
	2138
	2139	/* If there were leftovers, create a new group labeled with them. */
	2140	if (leftoversf)
	2141	{
	2142	copyset(leftovers, labels[ngrps]);
	2143	copyset(intersect, labels[j]);
	2144	MALLOC(grps[ngrps].elems, position, d->nleaves);
	2145	copy(&grps[j], &grps[ngrps]);
	2146	++ngrps;
	2147	}
	2148
	2149	/* Put the position in the current group. Note that there is no
	2150	reason to call insert() here. */
	2151	grps[j].elems[grps[j].nelem++] = pos;
	2152
	2153	/* If every character matching the current position has been
	2154	accounted for, we're done. */
	2155	if (! matchesf)
	2156	break;
	2157	}
	2158
	2159	/* If we've passed the last group, and there are still characters
	2160	unaccounted for, then we'll have to create a new group. */
	2161	if (j == ngrps)
	2162	{
	2163	copyset(matches, labels[ngrps]);
	2164	zeroset(matches);
	2165	MALLOC(grps[ngrps].elems, position, d->nleaves);
	2166	grps[ngrps].nelem = 1;
	2167	grps[ngrps].elems[0] = pos;
	2168	++ngrps;
	2169	}
	2170	}
	2171
	2172	MALLOC(follows.elems, position, d->nleaves);
	2173	MALLOC(tmp.elems, position, d->nleaves);
	2174
	2175	/* If we are a searching matcher, the default transition is to a state
	2176	containing the positions of state 0, otherwise the default transition
	2177	is to fail miserably. */
	2178	if (d->searchflag)
	2179	{
	2180	wants_newline = 0;
	2181	wants_letter = 0;
	2182	for (i = 0; i < d->states[0].elems.nelem; ++i)
	2183	{
	2184	if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
	2185	wants_newline = 1;
	2186	if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
	2187	wants_letter = 1;
	2188	}
	2189	copy(&d->states[0].elems, &follows);
	2190	state = state_index(d, &follows, 0, 0);
	2191	if (wants_newline)
	2192	state_newline = state_index(d, &follows, 1, 0);
	2193	else
	2194	state_newline = state;
	2195	if (wants_letter)
	2196	state_letter = state_index(d, &follows, 0, 1);
	2197	else
	2198	state_letter = state;
	2199	for (i = 0; i < NOTCHAR; ++i)
	2200	trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
	2201	trans[eolbyte] = state_newline;
	2202	}
	2203	else
	2204	for (i = 0; i < NOTCHAR; ++i)
	2205	trans[i] = -1;
	2206
	2207	for (i = 0; i < ngrps; ++i)
	2208	{
	2209	follows.nelem = 0;
	2210
	2211	/* Find the union of the follows of the positions of the group.
	2212	This is a hideously inefficient loop. Fix it someday. */
	2213	for (j = 0; j < grps[i].nelem; ++j)
	2214	for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
	2215	insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
	2216
	2217	#ifdef MBS_SUPPORT
	2218	if (MB_CUR_MAX > 1)
	2219	{
	2220	/* If a token in follows.elems is not 1st byte of a multibyte
	2221	character, or the states of follows must accept the bytes
	2222	which are not 1st byte of the multibyte character.
	2223	Then, if a state of follows encounter a byte, it must not be
	2224	a 1st byte of a multibyte character nor single byte character.
	2225	We cansel to add state[0].follows to next state, because
	2226	state[0] must accept 1st-byte
	2227
	2228	For example, we assume <sb a> is a certain single byte
	2229	character, <mb A> is a certain multibyte character, and the
	2230	codepoint of <sb a> equals the 2nd byte of the codepoint of
	2231	<mb A>.
	2232	When state[0] accepts <sb a>, state[i] transit to state[i+1]
	2233	by accepting accepts 1st byte of <mb A>, and state[i+1]
	2234	accepts 2nd byte of <mb A>, if state[i+1] encounter the
	2235	codepoint of <sb a>, it must not be <sb a> but 2nd byte of
	2236	<mb A>, so we can not add state[0]. */
	2237
	2238	next_isnt_1st_byte = 0;
	2239	for (j = 0; j < follows.nelem; ++j)
	2240	{
	2241	if (!(d->multibyte_prop[follows.elems[j].index] & 1))
	2242	{
	2243	next_isnt_1st_byte = 1;
	2244	break;
	2245	}
	2246	}
	2247	}
	2248	#endif
	2249
	2250	/* If we are building a searching matcher, throw in the positions
	2251	of state 0 as well. */
	2252	#ifdef MBS_SUPPORT
	2253	if (d->searchflag && (MB_CUR_MAX == 1 \|\| !next_isnt_1st_byte))
	2254	#else
	2255	if (d->searchflag)
	2256	#endif
	2257	for (j = 0; j < d->states[0].elems.nelem; ++j)
	2258	insert(d->states[0].elems.elems[j], &follows);
	2259
	2260	/* Find out if the new state will want any context information. */
	2261	wants_newline = 0;
	2262	if (tstbit(eolbyte, labels[i]))
	2263	for (j = 0; j < follows.nelem; ++j)
	2264	if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
	2265	wants_newline = 1;
	2266
	2267	wants_letter = 0;
	2268	for (j = 0; j < CHARCLASS_INTS; ++j)
	2269	if (labels[i][j] & letters[j])
	2270	break;
	2271	if (j < CHARCLASS_INTS)
	2272	for (j = 0; j < follows.nelem; ++j)
	2273	if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
	2274	wants_letter = 1;
	2275
	2276	/* Find the state(s) corresponding to the union of the follows. */
	2277	state = state_index(d, &follows, 0, 0);
	2278	if (wants_newline)
	2279	state_newline = state_index(d, &follows, 1, 0);
	2280	else
	2281	state_newline = state;
	2282	if (wants_letter)
	2283	state_letter = state_index(d, &follows, 0, 1);
	2284	else
	2285	state_letter = state;
	2286
	2287	/* Set the transitions for each character in the current label. */
	2288	for (j = 0; j < CHARCLASS_INTS; ++j)
	2289	for (k = 0; k < INTBITS; ++k)
	2290	if (labels[i][j] & 1 << k)
	2291	{
	2292	int c = j * INTBITS + k;
	2293
	2294	if (c == eolbyte)
	2295	trans[c] = state_newline;
	2296	else if (IS_WORD_CONSTITUENT(c))
	2297	trans[c] = state_letter;
	2298	else if (c < NOTCHAR)
	2299	trans[c] = state;
	2300	}
	2301	}
	2302
	2303	for (i = 0; i < ngrps; ++i)
	2304	free(grps[i].elems);
	2305	free(follows.elems);
	2306	free(tmp.elems);
	2307	}
	2308
	2309	/* Some routines for manipulating a compiled dfa's transition tables.
	2310	Each state may or may not have a transition table; if it does, and it
	2311	is a non-accepting state, then d->trans[state] points to its table.
	2312	If it is an accepting state then d->fails[state] points to its table.
	2313	If it has no table at all, then d->trans[state] is NULL.
	2314	TODO: Improve this comment, get rid of the unnecessary redundancy. */
	2315
	2316	static void
	2317	build_state (int s, struct dfa *d)
	2318	{
	2319	int trans; / The new transition table. */
	2320	int i;
	2321
	2322	/* Set an upper limit on the number of transition tables that will ever
	2323	exist at once. 1024 is arbitrary. The idea is that the frequently
	2324	used transition tables will be quickly rebuilt, whereas the ones that
	2325	were only needed once or twice will be cleared away. */
	2326	if (d->trcount >= 1024)
	2327	{
	2328	for (i = 0; i < d->tralloc; ++i)
	2329	if (d->trans[i])
	2330	{
	2331	free((ptr_t) d->trans[i]);
	2332	d->trans[i] = NULL;
	2333	}
	2334	else if (d->fails[i])
	2335	{
	2336	free((ptr_t) d->fails[i]);
	2337	d->fails[i] = NULL;
	2338	}
	2339	d->trcount = 0;
	2340	}
	2341
	2342	++d->trcount;
	2343
	2344	/* Set up the success bits for this state. */
	2345	d->success[s] = 0;
	2346	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
	2347	s, *d))
	2348	d->success[s] \|= 4;
	2349	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
	2350	s, *d))
	2351	d->success[s] \|= 2;
	2352	if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
	2353	s, *d))
	2354	d->success[s] \|= 1;
	2355
	2356	MALLOC(trans, int, NOTCHAR);
	2357	dfastate(s, d, trans);
	2358
	2359	/* Now go through the new transition table, and make sure that the trans
	2360	and fail arrays are allocated large enough to hold a pointer for the
	2361	largest state mentioned in the table. */
	2362	for (i = 0; i < NOTCHAR; ++i)
	2363	if (trans[i] >= d->tralloc)
	2364	{
	2365	int oldalloc = d->tralloc;
	2366
	2367	while (trans[i] >= d->tralloc)
	2368	d->tralloc *= 2;
	2369	REALLOC(d->realtrans, int *, d->tralloc + 1);
	2370	d->trans = d->realtrans + 1;
	2371	REALLOC(d->fails, int *, d->tralloc);
	2372	REALLOC(d->success, int, d->tralloc);
	2373	REALLOC(d->newlines, int, d->tralloc);
	2374	while (oldalloc < d->tralloc)
	2375	{
	2376	d->trans[oldalloc] = NULL;
	2377	d->fails[oldalloc++] = NULL;
	2378	}
	2379	}
	2380
	2381	/* Keep the newline transition in a special place so we can use it as
	2382	a sentinel. */
	2383	d->newlines[s] = trans[eolbyte];
	2384	trans[eolbyte] = -1;
	2385
	2386	if (ACCEPTING(s, *d))
	2387	d->fails[s] = trans;
	2388	else
	2389	d->trans[s] = trans;
	2390	}
	2391
	2392	static void
	2393	build_state_zero (struct dfa *d)
	2394	{
	2395	d->tralloc = 1;
	2396	d->trcount = 0;
	2397	CALLOC(d->realtrans, int *, d->tralloc + 1);
	2398	d->trans = d->realtrans + 1;
	2399	CALLOC(d->fails, int *, d->tralloc);
	2400	MALLOC(d->success, int, d->tralloc);
	2401	MALLOC(d->newlines, int, d->tralloc);
	2402	build_state(0, d);
	2403	}
	2404
	2405	#ifdef MBS_SUPPORT
	2406	/* Multibyte character handling sub-routines for dfaexec. */
	2407
	2408	/* Initial state may encounter the byte which is not a single byte character
	2409	nor 1st byte of a multibyte character. But it is incorrect for initial
	2410	state to accept such a byte.
	2411	For example, in sjis encoding the regular expression like "\\" accepts
	2412	the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
	2413	0x815c. Then Initial state must skip the bytes which are not a single byte
	2414	character nor 1st byte of a multibyte character. */
	2415	#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
	2416	if (s == 0) \
	2417	{ \
	2418	while (inputwcs[p - buf_begin] == 0 \
	2419	&& mblen_buf[p - buf_begin] > 0 \
	2420	&& (unsigned char const *)p < buf_end) \
	2421	++p; \
	2422	if ((char *)p >= end) \
	2423	{ \
	2424	free(mblen_buf); \
	2425	free(inputwcs); \
	2426	return NULL; \
	2427	} \
	2428	}
	2429
	2430	static void
	2431	realloc_trans_if_necessary(struct dfa *d, int new_state)
	2432	{
	2433	/* Make sure that the trans and fail arrays are allocated large enough
	2434	to hold a pointer for the new state. */
	2435	if (new_state >= d->tralloc)
	2436	{
	2437	int oldalloc = d->tralloc;
	2438
	2439	while (new_state >= d->tralloc)
	2440	d->tralloc *= 2;
	2441	REALLOC(d->realtrans, int *, d->tralloc + 1);
	2442	d->trans = d->realtrans + 1;
	2443	REALLOC(d->fails, int *, d->tralloc);
	2444	REALLOC(d->success, int, d->tralloc);
	2445	REALLOC(d->newlines, int, d->tralloc);
	2446	while (oldalloc < d->tralloc)
	2447	{
	2448	d->trans[oldalloc] = NULL;
	2449	d->fails[oldalloc++] = NULL;
	2450	}
	2451	}
	2452	}
	2453
	2454	/* Return values of transit_state_singlebyte(), and
	2455	transit_state_consume_1char. */
	2456	typedef enum
	2457	{
	2458	TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
	2459	TRANSIT_STATE_DONE, /* State transition has finished. */
	2460	TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
	2461	} status_transit_state;
	2462
	2463	/* Consume a single byte and transit state from 's' to '*next_state'.
	2464	This function is almost same as the state transition routin in dfaexec().
	2465	But state transition is done just once, otherwise matching succeed or
	2466	reach the end of the buffer. */
	2467	static status_transit_state
	2468	transit_state_singlebyte (struct dfa d, int s, unsigned char const p,
	2469	int *next_state)
	2470	{
	2471	int *t;
	2472	int works = s;
	2473
	2474	status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
	2475
	2476	while (rval == TRANSIT_STATE_IN_PROGRESS)
	2477	{
	2478	if ((t = d->trans[works]) != NULL)
	2479	{
	2480	works = t[*p];
	2481	rval = TRANSIT_STATE_DONE;
	2482	if (works < 0)
	2483	works = 0;
	2484	}
	2485	else if (works < 0)
	2486	{
	2487	if (p == buf_end)
	2488	/* At the moment, it must not happen. */
	2489	return TRANSIT_STATE_END_BUFFER;
	2490	works = 0;
	2491	}
	2492	else if (d->fails[works])
	2493	{
	2494	works = d->fails[works][*p];
	2495	rval = TRANSIT_STATE_DONE;
	2496	}
	2497	else
	2498	{
	2499	build_state(works, d);
	2500	}
	2501	}
	2502	*next_state = works;
	2503	return rval;
	2504	}
	2505
	2506	/* Check whether period can match or not in the current context. If it can,
	2507	return the amount of the bytes with which period can match, otherwise
	2508	return 0.
	2509	`pos' is the position of the period. `index' is the index from the
	2510	buf_begin, and it is the current position in the buffer. */
	2511	static int
	2512	match_anychar (struct dfa *d, int s, position pos, int index)
	2513	{
	2514	int newline = 0;
	2515	int letter = 0;
	2516	wchar_t wc;
	2517	int mbclen;
	2518
	2519	wc = inputwcs[index];
	2520	mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
	2521
	2522	/* Check context. */
	2523	if (wc == (wchar_t)eolbyte)
	2524	{
	2525	if (!(syntax_bits & RE_DOT_NEWLINE))
	2526	return 0;
	2527	newline = 1;
	2528	}
	2529	else if (wc == (wchar_t)'\0')
	2530	{
	2531	if (syntax_bits & RE_DOT_NOT_NULL)
	2532	return 0;
	2533	newline = 1;
	2534	}
	2535
	2536	if (iswalnum(wc) \|\| wc == L'_')
	2537	letter = 1;
	2538
	2539	if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
	2540	newline, d->states[s].letter, letter))
	2541	return 0;
	2542
	2543	return mbclen;
	2544	}
	2545
	2546	/* Check whether bracket expression can match or not in the current context.
	2547	If it can, return the amount of the bytes with which expression can match,
	2548	otherwise return 0.
	2549	`pos' is the position of the bracket expression. `index' is the index
	2550	from the buf_begin, and it is the current position in the buffer. */
	2551	int
	2552	match_mb_charset (struct dfa *d, int s, position pos, int index)
	2553	{
	2554	int i;
	2555	int match; /* Flag which represent that matching succeed. */
	2556	int match_len; /* Length of the character (or collating element)
	2557	with which this operator match. */
	2558	int op_len; /* Length of the operator. */
	2559	char buffer[128];
	2560	wchar_t wcbuf[6];
	2561
	2562	/* Pointer to the structure to which we are currently refering. */
	2563	struct mb_char_classes *work_mbc;
	2564
	2565	int newline = 0;
	2566	int letter = 0;
	2567	wchar_t wc; /* Current refering character. */
	2568
	2569	wc = inputwcs[index];
	2570
	2571	/* Check context. */
	2572	if (wc == (wchar_t)eolbyte)
	2573	{
	2574	if (!(syntax_bits & RE_DOT_NEWLINE))
	2575	return 0;
	2576	newline = 1;
	2577	}
	2578	else if (wc == (wchar_t)'\0')
	2579	{
	2580	if (syntax_bits & RE_DOT_NOT_NULL)
	2581	return 0;
	2582	newline = 1;
	2583	}
	2584	if (iswalnum(wc) \|\| wc == L'_')
	2585	letter = 1;
	2586	if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
	2587	newline, d->states[s].letter, letter))
	2588	return 0;
	2589
	2590	/* Assign the current refering operator to work_mbc. */
	2591	work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
	2592	match = !work_mbc->invert;
	2593	match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
	2594
	2595	/* match with a character class? */
	2596	for (i = 0; i<work_mbc->nch_classes; i++)
	2597	{
	2598	if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
	2599	goto charset_matched;
	2600	}
	2601
	2602	strncpy(buffer, buf_begin + index, match_len);
	2603	buffer[match_len] = '\0';
	2604
	2605	/* match with an equivalent class? */
	2606	for (i = 0; i<work_mbc->nequivs; i++)
	2607	{
	2608	op_len = strlen(work_mbc->equivs[i]);
	2609	strncpy(buffer, buf_begin + index, op_len);
	2610	buffer[op_len] = '\0';
	2611	if (strcoll(work_mbc->equivs[i], buffer) == 0)
	2612	{
	2613	match_len = op_len;
	2614	goto charset_matched;
	2615	}
	2616	}
	2617
	2618	/* match with a collating element? */
	2619	for (i = 0; i<work_mbc->ncoll_elems; i++)
	2620	{
	2621	op_len = strlen(work_mbc->coll_elems[i]);
	2622	strncpy(buffer, buf_begin + index, op_len);
	2623	buffer[op_len] = '\0';
	2624
	2625	if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
	2626	{
	2627	match_len = op_len;
	2628	goto charset_matched;
	2629	}
	2630	}
	2631
	2632	wcbuf[0] = wc;
	2633	wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
	2634
	2635	/* match with a range? */
	2636	for (i = 0; i<work_mbc->nranges; i++)
	2637	{
	2638	wcbuf[2] = work_mbc->range_sts[i];
	2639	wcbuf[4] = work_mbc->range_ends[i];
	2640
	2641	if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
	2642	wcscoll(wcbuf+4, wcbuf) >= 0)
	2643	goto charset_matched;
	2644	}
	2645
	2646	/* match with a character? */
	2647	for (i = 0; i<work_mbc->nchars; i++)
	2648	{
	2649	if (wc == work_mbc->chars[i])
	2650	goto charset_matched;
	2651	}
	2652
	2653	match = !match;
	2654
	2655	charset_matched:
	2656	return match ? match_len : 0;
	2657	}
	2658
	2659	/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
	2660	array which corresponds to `d->states[s].mbps.elem' and each element of
	2661	the array contains the amount of the bytes with which the element can
	2662	match.
	2663	`index' is the index from the buf_begin, and it is the current position
	2664	in the buffer.
	2665	Caller MUST free the array which this function return. */
	2666	static int*
	2667	check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
	2668	{
	2669	int i;
	2670	int* rarray;
	2671
	2672	MALLOC(rarray, int, d->states[s].mbps.nelem);
	2673	for (i = 0; i < d->states[s].mbps.nelem; ++i)
	2674	{
	2675	position pos = d->states[s].mbps.elems[i];
	2676	switch(d->tokens[pos.index])
	2677	{
	2678	case ANYCHAR:
	2679	rarray[i] = match_anychar(d, s, pos, index);
	2680	break;
	2681	case MBCSET:
	2682	rarray[i] = match_mb_charset(d, s, pos, index);
	2683	break;
	2684	default:
	2685	break; /* can not happen. */
	2686	}
	2687	}
	2688	return rarray;
	2689	}
	2690
	2691	/* Consume a single character and enumerate all of the positions which can
	2692	be next position from the state `s'.
	2693	`match_lens' is the input. It can be NULL, but it can also be the output
	2694	of check_matching_with_multibyte_ops() for optimization.
	2695	`mbclen' and `pps' are the output. `mbclen' is the length of the
	2696	character consumed, and `pps' is the set this function enumerate. */
	2697	static status_transit_state
	2698	transit_state_consume_1char (struct dfa d, int s, unsigned char const *pp,
	2699	int match_lens, int mbclen, position_set *pps)
	2700	{
	2701	int i, j;
	2702	int s1, s2;
	2703	int* work_mbls;
	2704	status_transit_state rs = TRANSIT_STATE_DONE;
	2705
	2706	/* Calculate the length of the (single/multi byte) character
	2707	to which p points. */
	2708	mbclen = (mblen_buf[pp - buf_begin] == 0)? 1
	2709	: mblen_buf[*pp - buf_begin];
	2710
	2711	/* Calculate the state which can be reached from the state `s' by
	2712	consuming `mbclen' single bytes from the buffer. /
	2713	s1 = s;
	2714	for (i = 0; i < *mbclen; i++)
	2715	{
	2716	s2 = s1;
	2717	rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
	2718	}
	2719	/* Copy the positions contained by `s1' to the set `pps'. */
	2720	copy(&(d->states[s1].elems), pps);
	2721
	2722	/* Check (inputed)match_lens, and initialize if it is NULL. */
	2723	if (match_lens == NULL && d->states[s].mbps.nelem != 0)
	2724	work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
	2725	else
	2726	work_mbls = match_lens;
	2727
	2728	/* Add all of the positions which can be reached from `s' by consuming
	2729	a single character. */
	2730	for (i = 0; i < d->states[s].mbps.nelem ; i++)
	2731	{
	2732	if (work_mbls[i] == *mbclen)
	2733	for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
	2734	j++)
	2735	insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
	2736	pps);
	2737	}
	2738
	2739	if (match_lens == NULL && work_mbls != NULL)
	2740	free(work_mbls);
	2741	return rs;
	2742	}
	2743
	2744	/* Transit state from s, then return new state and update the pointer of the
	2745	buffer. This function is for some operator which can match with a multi-
	2746	byte character or a collating element (which may be multi characters). */
	2747	static int
	2748	transit_state (struct dfa d, int s, unsigned char const *pp)
	2749	{
	2750	int s1;
	2751	int mbclen; /* The length of current input multibyte character. */
	2752	int maxlen = 0;
	2753	int i, j;
	2754	int *match_lens = NULL;
	2755	int nelem = d->states[s].mbps.nelem; /* Just a alias. */
	2756	position_set follows;
	2757	unsigned char const p1 = pp;
	2758	status_transit_state rs;
	2759	wchar_t wc;
	2760
	2761	if (nelem > 0)
	2762	/* This state has (a) multibyte operator(s).
	2763	We check whether each of them can match or not. */
	2764	{
	2765	/* Note: caller must free the return value of this function. */
	2766	match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
	2767
	2768	for (i = 0; i < nelem; i++)
	2769	/* Search the operator which match the longest string,
	2770	in this state. */
	2771	{
	2772	if (match_lens[i] > maxlen)
	2773	maxlen = match_lens[i];
	2774	}
	2775	}
	2776
	2777	if (nelem == 0 \|\| maxlen == 0)
	2778	/* This state has no multibyte operator which can match.
	2779	We need to check only one single byte character. */
	2780	{
	2781	status_transit_state rs;
	2782	rs = transit_state_singlebyte(d, s, *pp, &s1);
	2783
	2784	/* We must update the pointer if state transition succeeded. */
	2785	if (rs == TRANSIT_STATE_DONE)
	2786	++*pp;
	2787
	2788	if (match_lens != NULL)
	2789	free(match_lens);
	2790	return s1;
	2791	}
	2792
	2793	/* This state has some operators which can match a multibyte character. */
	2794	follows.nelem = 0;
	2795	MALLOC(follows.elems, position, d->nleaves);
	2796
	2797	/* `maxlen' may be longer than the length of a character, because it may
	2798	not be a character but a (multi character) collating element.
	2799	We enumerate all of the positions which `s' can reach by consuming
	2800	`maxlen' bytes. */
	2801	rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
	2802
	2803	wc = inputwcs[*pp - mbclen - buf_begin];
	2804	s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
	2805	realloc_trans_if_necessary(d, s1);
	2806
	2807	while (*pp - p1 < maxlen)
	2808	{
	2809	follows.nelem = 0;
	2810	rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
	2811
	2812	for (i = 0; i < nelem ; i++)
	2813	{
	2814	if (match_lens[i] == *pp - p1)
	2815	for (j = 0;
	2816	j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
	2817	insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
	2818	&follows);
	2819	}
	2820
	2821	wc = inputwcs[*pp - mbclen - buf_begin];
	2822	s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
	2823	realloc_trans_if_necessary(d, s1);
	2824	}
	2825	free(match_lens);
	2826	free(follows.elems);
	2827	return s1;
	2828	}
	2829
	2830	#endif /* MBS_SUPPORT */
	2831
	2832	/* Search through a buffer looking for a match to the given struct dfa.
	2833	Find the first occurrence of a string matching the regexp in the buffer,
	2834	and the shortest possible version thereof. Return a pointer to the first
	2835	character after the match, or NULL if none is found. Begin points to
	2836	the beginning of the buffer, and end points to the first character after
	2837	its end. We store a newline in *end to act as a sentinel, so end had
	2838	better point somewhere valid. Newline is a flag indicating whether to
	2839	allow newlines to be in the matching string. If count is non-
	2840	NULL it points to a place we're supposed to increment every time we
	2841	see a newline. Finally, if backref is non-NULL it points to a place
	2842	where we're supposed to store a 1 if backreferencing happened and the
	2843	match needs to be verified by a backtracking matcher. Otherwise
	2844	we store a 0 in backref. /
	2845	char *
	2846	dfaexec (struct dfa d, char const begin, char *end,
	2847	int newline, int count, int backref)
	2848	{
	2849	register int s, s1, tmp; /* Current state. */
	2850	register unsigned char const p; / Current input character. */
	2851	register int *trans, t; /* Copy of d->trans so it can be optimized
	2852	into a register. */
	2853	register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
	2854	static int sbit[NOTCHAR]; /* Table for anding with d->success. */
	2855	static int sbit_init;
	2856
	2857	if (! sbit_init)
	2858	{
	2859	int i;
	2860
	2861	sbit_init = 1;
	2862	for (i = 0; i < NOTCHAR; ++i)
	2863	sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
	2864	sbit[eol] = 4;
	2865	}
	2866
	2867	if (! d->tralloc)
	2868	build_state_zero(d);
	2869
	2870	s = s1 = 0;
	2871	p = (unsigned char const *) begin;
	2872	trans = d->trans;
	2873	*end = eol;
	2874
	2875	#ifdef MBS_SUPPORT
	2876	if (MB_CUR_MAX > 1)
	2877	{
	2878	int remain_bytes, i;
	2879	buf_begin = begin;
	2880	buf_end = end;
	2881
	2882	/* initialize mblen_buf, and inputwcs. */
	2883	MALLOC(mblen_buf, unsigned char, end - begin + 2);
	2884	MALLOC(inputwcs, wchar_t, end - begin + 2);
	2885	memset(&mbs, 0, sizeof(mbstate_t));
	2886	remain_bytes = 0;
	2887	for (i = 0; i < end - begin + 1; i++)
	2888	{
	2889	if (remain_bytes == 0)
	2890	{
	2891	remain_bytes
	2892	= mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
	2893	if (remain_bytes <= 1)
	2894	{
	2895	remain_bytes = 0;
	2896	inputwcs[i] = (wchar_t)begin[i];
	2897	mblen_buf[i] = 0;
	2898	}
	2899	else
	2900	{
	2901	mblen_buf[i] = remain_bytes;
	2902	remain_bytes--;
	2903	}
	2904	}
	2905	else
	2906	{
	2907	mblen_buf[i] = remain_bytes;
	2908	inputwcs[i] = 0;
	2909	remain_bytes--;
	2910	}
	2911	}
	2912	mblen_buf[i] = 0;
	2913	inputwcs[i] = 0; /* sentinel */
	2914	}
	2915	#endif /* MBS_SUPPORT */
	2916
	2917	for (;;)
	2918	{
	2919	#ifdef MBS_SUPPORT
	2920	if (MB_CUR_MAX > 1)
	2921	while ((t = trans[s]))
	2922	{
	2923	if ((char *) p > end)
	2924	break;
	2925	s1 = s;
	2926	if (d->states[s].mbps.nelem != 0)
	2927	{
	2928	/* Can match with a multibyte character (and multi character
	2929	collating element). */
	2930	unsigned char const *nextp;
	2931
	2932	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
	2933
	2934	nextp = p;
	2935	s = transit_state(d, s, &nextp);
	2936	p = (unsigned char *)nextp;
	2937
	2938	/* Trans table might be updated. */
	2939	trans = d->trans;
	2940	}
	2941	else
	2942	{
	2943	SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
	2944	s = t[*p++];
	2945	}
	2946	}
	2947	else
	2948	#endif /* MBS_SUPPORT */
	2949	while ((t = trans[s]) != 0) { /* hand-optimized loop */
	2950	s1 = t[*p++];
	2951	if ((t = trans[s1]) == 0) {
	2952	tmp = s ; s = s1 ; s1 = tmp ; /* swap */
	2953	break;
	2954	}
	2955	s = t[*p++];
	2956	}
	2957
	2958	if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
	2959	{
	2960	if (d->success[s] & sbit[*p])
	2961	{
	2962	if (backref)
	2963	*backref = (d->states[s].backref != 0);
	2964	#ifdef MBS_SUPPORT
	2965	if (MB_CUR_MAX > 1)
	2966	{
	2967	free(mblen_buf);
	2968	free(inputwcs);
	2969	}
	2970	#endif /* MBS_SUPPORT */
	2971	return (char *) p;
	2972	}
	2973
	2974	s1 = s;
	2975	#ifdef MBS_SUPPORT
	2976	if (MB_CUR_MAX > 1)
	2977	{
	2978	unsigned char const *nextp;
	2979	nextp = p;
	2980	s = transit_state(d, s, &nextp);
	2981	p = (unsigned char *)nextp;
	2982
	2983	/* Trans table might be updated. */
	2984	trans = d->trans;
	2985	}
	2986	else
	2987	#endif /* MBS_SUPPORT */
	2988	s = d->fails[s][*p++];
	2989	continue;
	2990	}
	2991
	2992	/* If the previous character was a newline, count it. */
	2993	if (count && (char *) p <= end && p[-1] == eol)
	2994	++*count;
	2995
	2996	/* Check if we've run off the end of the buffer. */
	2997	if ((char *) p > end)
	2998	{
	2999	#ifdef MBS_SUPPORT
	3000	if (MB_CUR_MAX > 1)
	3001	{
	3002	free(mblen_buf);
	3003	free(inputwcs);
	3004	}
	3005	#endif /* MBS_SUPPORT */
	3006	return NULL;
	3007	}
	3008
	3009	if (s >= 0)
	3010	{
	3011	build_state(s, d);
	3012	trans = d->trans;
	3013	continue;
	3014	}
	3015
	3016	if (p[-1] == eol && newline)
	3017	{
	3018	s = d->newlines[s1];
	3019	continue;
	3020	}
	3021
	3022	s = 0;
	3023	}
	3024	}
	3025
	3026	/* Initialize the components of a dfa that the other routines don't
	3027	initialize for themselves. */
	3028	void
	3029	dfainit (struct dfa *d)
	3030	{
	3031	d->calloc = 1;
	3032	MALLOC(d->charclasses, charclass, d->calloc);
	3033	d->cindex = 0;
	3034
	3035	d->talloc = 1;
	3036	MALLOC(d->tokens, token, d->talloc);
	3037	d->tindex = d->depth = d->nleaves = d->nregexps = 0;
	3038	#ifdef MBS_SUPPORT
	3039	if (MB_CUR_MAX > 1)
	3040	{
	3041	d->nmultibyte_prop = 1;
	3042	MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
	3043	d->nmbcsets = 0;
	3044	d->mbcsets_alloc = 1;
	3045	MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
	3046	}
	3047	#endif
	3048
	3049	d->searchflag = 0;
	3050	d->tralloc = 0;
	3051
	3052	d->musts = 0;
	3053	d->realtrans = 0;
	3054	d->fails = 0;
	3055	d->newlines = 0;
	3056	d->success = 0;
	3057	}
	3058
	3059	/* Parse and analyze a single string of the given length. */
	3060	void
	3061	dfacomp (char const s, size_t len, struct dfa d, int searchflag)
	3062	{
	3063	if (case_fold) /* dummy folding in service of dfamust() */
	3064	{
	3065	char *lcopy;
	3066	int i;
	3067
	3068	lcopy = malloc(len);
	3069	if (!lcopy)
	3070	dfaerror(_("memory exhausted"));
	3071
	3072	/* This is a kludge. */
	3073	case_fold = 0;
	3074	for (i = 0; i < len; ++i)
	3075	if (ISUPPER ((unsigned char) s[i]))
	3076	lcopy[i] = tolower ((unsigned char) s[i]);
	3077	else
	3078	lcopy[i] = s[i];
	3079
	3080	dfainit(d);
	3081	dfaparse(lcopy, len, d);
	3082	free(lcopy);
	3083	dfamust(d);
	3084	d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
	3085	case_fold = 1;
	3086	dfaparse(s, len, d);
	3087	dfaanalyze(d, searchflag);
	3088	}
	3089	else
	3090	{
	3091	dfainit(d);
	3092	dfaparse(s, len, d);
	3093	dfamust(d);
	3094	dfaanalyze(d, searchflag);
	3095	}
	3096	}
	3097
	3098	/* Free the storage held by the components of a dfa. */
	3099	void
	3100	dfafree (struct dfa *d)
	3101	{
	3102	int i;
	3103	struct dfamust dm, ndm;
	3104
	3105	free((ptr_t) d->charclasses);
	3106	free((ptr_t) d->tokens);
	3107
	3108	#ifdef MBS_SUPPORT
	3109	if (MB_CUR_MAX > 1)
	3110	{
	3111	free((ptr_t) d->multibyte_prop);
	3112	for (i = 0; i < d->nmbcsets; ++i)
	3113	{
	3114	int j;
	3115	struct mb_char_classes *p = &(d->mbcsets[i]);
	3116	if (p->chars != NULL)
	3117	free(p->chars);
	3118	if (p->ch_classes != NULL)
	3119	free(p->ch_classes);
	3120	if (p->range_sts != NULL)
	3121	free(p->range_sts);
	3122	if (p->range_ends != NULL)
	3123	free(p->range_ends);
	3124
	3125	for (j = 0; j < p->nequivs; ++j)
	3126	free(p->equivs[j]);
	3127	if (p->equivs != NULL)
	3128	free(p->equivs);
	3129
	3130	for (j = 0; j < p->ncoll_elems; ++j)
	3131	free(p->coll_elems[j]);
	3132	if (p->coll_elems != NULL)
	3133	free(p->coll_elems);
	3134	}
	3135	free((ptr_t) d->mbcsets);
	3136	}
	3137	#endif /* MBS_SUPPORT */
	3138
	3139	for (i = 0; i < d->sindex; ++i)
	3140	free((ptr_t) d->states[i].elems.elems);
	3141	free((ptr_t) d->states);
	3142	for (i = 0; i < d->tindex; ++i)
	3143	if (d->follows[i].elems)
	3144	free((ptr_t) d->follows[i].elems);
	3145	free((ptr_t) d->follows);
	3146	for (i = 0; i < d->tralloc; ++i)
	3147	if (d->trans[i])
	3148	free((ptr_t) d->trans[i]);
	3149	else if (d->fails[i])
	3150	free((ptr_t) d->fails[i]);
	3151	if (d->realtrans) free((ptr_t) d->realtrans);
	3152	if (d->fails) free((ptr_t) d->fails);
	3153	if (d->newlines) free((ptr_t) d->newlines);
	3154	if (d->success) free((ptr_t) d->success);
	3155	for (dm = d->musts; dm; dm = ndm)
	3156	{
	3157	ndm = dm->next;
	3158	free(dm->must);
	3159	free((ptr_t) dm);
	3160	}
	3161	}
	3162
	3163	/* Having found the postfix representation of the regular expression,
	3164	try to find a long sequence of characters that must appear in any line
	3165	containing the r.e.
	3166	Finding a "longest" sequence is beyond the scope here;
	3167	we take an easy way out and hope for the best.
	3168	(Take "(ab\|a)b"--please.)
	3169
	3170	We do a bottom-up calculation of sequences of characters that must appear
	3171	in matches of r.e.'s represented by trees rooted at the nodes of the postfix
	3172	representation:
	3173	sequences that must appear at the left of the match ("left")
	3174	sequences that must appear at the right of the match ("right")
	3175	lists of sequences that must appear somewhere in the match ("in")
	3176	sequences that must constitute the match ("is")
	3177
	3178	When we get to the root of the tree, we use one of the longest of its
	3179	calculated "in" sequences as our answer. The sequence we find is returned in
	3180	d->must (where "d" is the single argument passed to "dfamust");
	3181	the length of the sequence is returned in d->mustn.
	3182
	3183	The sequences calculated for the various types of node (in pseudo ANSI c)
	3184	are shown below. "p" is the operand of unary operators (and the left-hand
	3185	operand of binary operators); "q" is the right-hand operand of binary
	3186	operators.
	3187
	3188	"ZERO" means "a zero-length sequence" below.
	3189
	3190	Type left right is in
	3191	---- ---- ----- -- --
	3192	char c # c # c # c # c
	3193
	3194	ANYCHAR ZERO ZERO ZERO ZERO
	3195
	3196	MBCSET ZERO ZERO ZERO ZERO
	3197
	3198	CSET ZERO ZERO ZERO ZERO
	3199
	3200	STAR ZERO ZERO ZERO ZERO
	3201
	3202	QMARK ZERO ZERO ZERO ZERO
	3203
	3204	PLUS p->left p->right ZERO p->in
	3205
	3206	CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
	3207	p->left : q->right : q->is!=ZERO) ? q->in plus
	3208	p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
	3209	ZERO
	3210
	3211	OR longest common longest common (do p->is and substrings common to
	3212	leading trailing q->is have same p->in and q->in
	3213	(sub)sequence (sub)sequence length and
	3214	of p->left of p->right content) ?
	3215	and q->left and q->right p->is : NULL
	3216
	3217	If there's anything else we recognize in the tree, all four sequences get set
	3218	to zero-length sequences. If there's something we don't recognize in the tree,
	3219	we just return a zero-length sequence.
	3220
	3221	Break ties in favor of infrequent letters (choosing 'zzz' in preference to
	3222	'aaa')?
	3223
	3224	And. . .is it here or someplace that we might ponder "optimizations" such as
	3225	egrep 'psi\|epsilon' -> egrep 'psi'
	3226	egrep 'pepsi\|epsilon' -> egrep 'epsi'
	3227	(Yes, we now find "epsi" as a "string
	3228	that must occur", but we might also
	3229	simplify the entire r.e. being sought)
	3230	grep '[c]' -> grep 'c'
	3231	grep '(ab\|a)b' -> grep 'ab'
	3232	grep 'ab*' -> grep 'a'
	3233	grep 'a*b' -> grep 'b'
	3234
	3235	There are several issues:
	3236
	3237	Is optimization easy (enough)?
	3238
	3239	Does optimization actually accomplish anything,
	3240	or is the automaton you get from "psi\|epsilon" (for example)
	3241	the same as the one you get from "psi" (for example)?
	3242
	3243	Are optimizable r.e.'s likely to be used in real-life situations
	3244	(something like 'ab*' is probably unlikely; something like is
	3245	'psi\|epsilon' is likelier)? */
	3246
	3247	static char *
	3248	icatalloc (char old, char new)
	3249	{
	3250	char *result;
	3251	size_t oldsize, newsize;
	3252
	3253	newsize = (new == NULL) ? 0 : strlen(new);
	3254	if (old == NULL)
	3255	oldsize = 0;
	3256	else if (newsize == 0)
	3257	return old;
	3258	else oldsize = strlen(old);
	3259	if (old == NULL)
	3260	result = (char *) malloc(newsize + 1);
	3261	else
	3262	result = (char ) realloc((void ) old, oldsize + newsize + 1);
	3263	if (result != NULL && new != NULL)
	3264	(void) strcpy(result + oldsize, new);
	3265	return result;
	3266	}
	3267
	3268	static char *
	3269	icpyalloc (char *string)
	3270	{
	3271	return icatalloc((char *) NULL, string);
	3272	}
	3273
	3274	static char *
	3275	istrstr (char lookin, char lookfor)
	3276	{
	3277	char *cp;
	3278	size_t len;
	3279
	3280	len = strlen(lookfor);
	3281	for (cp = lookin; *cp != '\0'; ++cp)
	3282	if (strncmp(cp, lookfor, len) == 0)
	3283	return cp;
	3284	return NULL;
	3285	}
	3286
	3287	static void
	3288	ifree (char *cp)
	3289	{
	3290	if (cp != NULL)
	3291	free(cp);
	3292	}
	3293
	3294	static void
	3295	freelist (char **cpp)
	3296	{
	3297	int i;
	3298
	3299	if (cpp == NULL)
	3300	return;
	3301	for (i = 0; cpp[i] != NULL; ++i)
	3302	{
	3303	free(cpp[i]);
	3304	cpp[i] = NULL;
	3305	}
	3306	}
	3307
	3308	static char **
	3309	enlist (char *cpp, char new, size_t len)
	3310	{
	3311	int i, j;
	3312
	3313	if (cpp == NULL)
	3314	return NULL;
	3315	if ((new = icpyalloc(new)) == NULL)
	3316	{
	3317	freelist(cpp);
	3318	return NULL;
	3319	}
	3320	new[len] = '\0';
	3321	/* Is there already something in the list that's new (or longer)? */
	3322	for (i = 0; cpp[i] != NULL; ++i)
	3323	if (istrstr(cpp[i], new) != NULL)
	3324	{
	3325	free(new);
	3326	return cpp;
	3327	}
	3328	/* Eliminate any obsoleted strings. */
	3329	j = 0;
	3330	while (cpp[j] != NULL)
	3331	if (istrstr(new, cpp[j]) == NULL)
	3332	++j;
	3333	else
	3334	{
	3335	free(cpp[j]);
	3336	if (--i == j)
	3337	break;
	3338	cpp[j] = cpp[i];
	3339	cpp[i] = NULL;
	3340	}
	3341	/* Add the new string. */
	3342	cpp = (char *) realloc((char ) cpp, (i + 2) * sizeof *cpp);
	3343	if (cpp == NULL)
	3344	return NULL;
	3345	cpp[i] = new;
	3346	cpp[i + 1] = NULL;
	3347	return cpp;
	3348	}
	3349
	3350	/* Given pointers to two strings, return a pointer to an allocated
	3351	list of their distinct common substrings. Return NULL if something
	3352	seems wild. */
	3353	static char **
	3354	comsubs (char left, char right)
	3355	{
	3356	char **cpp;
	3357	char *lcp;
	3358	char *rcp;
	3359	size_t i, len;
	3360
	3361	if (left == NULL \|\| right == NULL)
	3362	return NULL;
	3363	cpp = (char *) malloc(sizeof cpp);
	3364	if (cpp == NULL)
	3365	return NULL;
	3366	cpp[0] = NULL;
	3367	for (lcp = left; *lcp != '\0'; ++lcp)
	3368	{
	3369	len = 0;
	3370	rcp = strchr (right, *lcp);
	3371	while (rcp != NULL)
	3372	{
	3373	for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
	3374	continue;
	3375	if (i > len)
	3376	len = i;
	3377	rcp = strchr (rcp + 1, *lcp);
	3378	}
	3379	if (len == 0)
	3380	continue;
	3381	if ((cpp = enlist(cpp, lcp, len)) == NULL)
	3382	break;
	3383	}
	3384	return cpp;
	3385	}
	3386
	3387	static char **
	3388	addlists (char old, char new)
	3389	{
	3390	int i;
	3391
	3392	if (old == NULL \|\| new == NULL)
	3393	return NULL;
	3394	for (i = 0; new[i] != NULL; ++i)
	3395	{
	3396	old = enlist(old, new[i], strlen(new[i]));
	3397	if (old == NULL)
	3398	break;
	3399	}
	3400	return old;
	3401	}
	3402
	3403	/* Given two lists of substrings, return a new list giving substrings
	3404	common to both. */
	3405	static char **
	3406	inboth (char left, char right)
	3407	{
	3408	char **both;
	3409	char **temp;
	3410	int lnum, rnum;
	3411
	3412	if (left == NULL \|\| right == NULL)
	3413	return NULL;
	3414	both = (char *) malloc(sizeof both);
	3415	if (both == NULL)
	3416	return NULL;
	3417	both[0] = NULL;
	3418	for (lnum = 0; left[lnum] != NULL; ++lnum)
	3419	{
	3420	for (rnum = 0; right[rnum] != NULL; ++rnum)
	3421	{
	3422	temp = comsubs(left[lnum], right[rnum]);
	3423	if (temp == NULL)
	3424	{
	3425	freelist(both);
	3426	return NULL;
	3427	}
	3428	both = addlists(both, temp);
	3429	freelist(temp);
	3430	free(temp);
	3431	if (both == NULL)
	3432	return NULL;
	3433	}
	3434	}
	3435	return both;
	3436	}
	3437
	3438	typedef struct
	3439	{
	3440	char **in;
	3441	char *left;
	3442	char *right;
	3443	char *is;
	3444	} must;
	3445
	3446	static void
	3447	resetmust (must *mp)
	3448	{
	3449	mp->left[0] = mp->right[0] = mp->is[0] = '\0';
	3450	freelist(mp->in);
	3451	}
	3452
	3453	static void
	3454	dfamust (struct dfa *dfa)
	3455	{
	3456	must *musts;
	3457	must *mp;
	3458	char *result;
	3459	int ri;
	3460	int i;
	3461	int exact;
	3462	token t;
	3463	static must must0;
	3464	struct dfamust *dm;
	3465	static char empty_string[] = "";
	3466
	3467	result = empty_string;
	3468	exact = 0;
	3469	musts = (must ) malloc((dfa->tindex + 1) sizeof *musts);
	3470	if (musts == NULL)
	3471	return;
	3472	mp = musts;
	3473	for (i = 0; i <= dfa->tindex; ++i)
	3474	mp[i] = must0;
	3475	for (i = 0; i <= dfa->tindex; ++i)
	3476	{
	3477	mp[i].in = (char *) malloc(sizeof mp[i].in);
	3478	mp[i].left = malloc(2);
	3479	mp[i].right = malloc(2);
	3480	mp[i].is = malloc(2);
	3481	if (mp[i].in == NULL \|\| mp[i].left == NULL \|\|
	3482	mp[i].right == NULL \|\| mp[i].is == NULL)
	3483	goto done;
	3484	mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
	3485	mp[i].in[0] = NULL;
	3486	}
	3487	#ifdef DEBUG
	3488	fprintf(stderr, "dfamust:\n");
	3489	for (i = 0; i < dfa->tindex; ++i)
	3490	{
	3491	fprintf(stderr, " %d:", i);
	3492	prtok(dfa->tokens[i]);
	3493	}
	3494	putc('\n', stderr);
	3495	#endif
	3496	for (ri = 0; ri < dfa->tindex; ++ri)
	3497	{
	3498	switch (t = dfa->tokens[ri])
	3499	{
	3500	case LPAREN:
	3501	case RPAREN:
	3502	goto done; /* "cannot happen" */
	3503	case EMPTY:
	3504	case BEGLINE:
	3505	case ENDLINE:
	3506	case BEGWORD:
	3507	case ENDWORD:
	3508	case LIMWORD:
	3509	case NOTLIMWORD:
	3510	case BACKREF:
	3511	resetmust(mp);
	3512	break;
	3513	case STAR:
	3514	case QMARK:
	3515	if (mp <= musts)
	3516	goto done; /* "cannot happen" */
	3517	--mp;
	3518	resetmust(mp);
	3519	break;
	3520	case OR:
	3521	case ORTOP:
	3522	if (mp < &musts[2])
	3523	goto done; /* "cannot happen" */
	3524	{
	3525	char **new;
	3526	must *lmp;
	3527	must *rmp;
	3528	int j, ln, rn, n;
	3529
	3530	rmp = --mp;
	3531	lmp = --mp;
	3532	/* Guaranteed to be. Unlikely, but. . . */
	3533	if (strcmp(lmp->is, rmp->is) != 0)
	3534	lmp->is[0] = '\0';
	3535	/* Left side--easy */
	3536	i = 0;
	3537	while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
	3538	++i;
	3539	lmp->left[i] = '\0';
	3540	/* Right side */
	3541	ln = strlen(lmp->right);
	3542	rn = strlen(rmp->right);
	3543	n = ln;
	3544	if (n > rn)
	3545	n = rn;
	3546	for (i = 0; i < n; ++i)
	3547	if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
	3548	break;
	3549	for (j = 0; j < i; ++j)
	3550	lmp->right[j] = lmp->right[(ln - i) + j];
	3551	lmp->right[j] = '\0';
	3552	new = inboth(lmp->in, rmp->in);
	3553	if (new == NULL)
	3554	goto done;
	3555	freelist(lmp->in);
	3556	free((char *) lmp->in);
	3557	lmp->in = new;
	3558	}
	3559	break;
	3560	case PLUS:
	3561	if (mp <= musts)
	3562	goto done; /* "cannot happen" */
	3563	--mp;
	3564	mp->is[0] = '\0';
	3565	break;
	3566	case END:
	3567	if (mp != &musts[1])
	3568	goto done; /* "cannot happen" */
	3569	for (i = 0; musts[0].in[i] != NULL; ++i)
	3570	if (strlen(musts[0].in[i]) > strlen(result))
	3571	result = musts[0].in[i];
	3572	if (strcmp(result, musts[0].is) == 0)
	3573	exact = 1;
	3574	goto done;
	3575	case CAT:
	3576	if (mp < &musts[2])
	3577	goto done; /* "cannot happen" */
	3578	{
	3579	must *lmp;
	3580	must *rmp;
	3581
	3582	rmp = --mp;
	3583	lmp = --mp;
	3584	/* In. Everything in left, plus everything in
	3585	right, plus catenation of
	3586	left's right and right's left. */
	3587	lmp->in = addlists(lmp->in, rmp->in);
	3588	if (lmp->in == NULL)
	3589	goto done;
	3590	if (lmp->right[0] != '\0' &&
	3591	rmp->left[0] != '\0')
	3592	{
	3593	char *tp;
	3594
	3595	tp = icpyalloc(lmp->right);
	3596	if (tp == NULL)
	3597	goto done;
	3598	tp = icatalloc(tp, rmp->left);
	3599	if (tp == NULL)
	3600	goto done;
	3601	lmp->in = enlist(lmp->in, tp,
	3602	strlen(tp));
	3603	free(tp);
	3604	if (lmp->in == NULL)
	3605	goto done;
	3606	}
	3607	/* Left-hand */
	3608	if (lmp->is[0] != '\0')
	3609	{
	3610	lmp->left = icatalloc(lmp->left,
	3611	rmp->left);
	3612	if (lmp->left == NULL)
	3613	goto done;
	3614	}
	3615	/* Right-hand */
	3616	if (rmp->is[0] == '\0')
	3617	lmp->right[0] = '\0';
	3618	lmp->right = icatalloc(lmp->right, rmp->right);
	3619	if (lmp->right == NULL)
	3620	goto done;
	3621	/* Guaranteed to be */
	3622	if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
	3623	{
	3624	lmp->is = icatalloc(lmp->is, rmp->is);
	3625	if (lmp->is == NULL)
	3626	goto done;
	3627	}
	3628	else
	3629	lmp->is[0] = '\0';
	3630	}
	3631	break;
	3632	default:
	3633	if (t < END)
	3634	{
	3635	/* "cannot happen" */
	3636	goto done;
	3637	}
	3638	else if (t == '\0')
	3639	{
	3640	/* not on my shift */
	3641	goto done;
	3642	}
	3643	else if (t >= CSET
	3644	#ifdef MBS_SUPPORT
	3645	\|\| t == ANYCHAR
	3646	\|\| t == MBCSET
	3647	#endif /* MBS_SUPPORT */
	3648	)
	3649	{
	3650	/* easy enough */
	3651	resetmust(mp);
	3652	}
	3653	else
	3654	{
	3655	/* plain character */
	3656	resetmust(mp);
	3657	mp->is[0] = mp->left[0] = mp->right[0] = t;
	3658	mp->is[1] = mp->left[1] = mp->right[1] = '\0';
	3659	mp->in = enlist(mp->in, mp->is, (size_t)1);
	3660	if (mp->in == NULL)
	3661	goto done;
	3662	}
	3663	break;
	3664	}
	3665	#ifdef DEBUG
	3666	fprintf(stderr, " node: %d:", ri);
	3667	prtok(dfa->tokens[ri]);
	3668	fprintf(stderr, "\n in:");
	3669	for (i = 0; mp->in[i]; ++i)
	3670	fprintf(stderr, " \"%s\"", mp->in[i]);
	3671	fprintf(stderr, "\n is: \"%s\"\n", mp->is);
	3672	fprintf(stderr, " left: \"%s\"\n", mp->left);
	3673	fprintf(stderr, " right: \"%s\"\n", mp->right);
	3674	#endif
	3675	++mp;
	3676	}
	3677	done:
	3678	if (strlen(result))
	3679	{
	3680	MALLOC(dm, struct dfamust, 1);
	3681	dm->exact = exact;
	3682	MALLOC(dm->must, char, strlen(result) + 1);
	3683	strcpy(dm->must, result);
	3684	dm->next = dfa->musts;
	3685	dfa->musts = dm;
	3686	}
	3687	mp = musts;
	3688	for (i = 0; i <= dfa->tindex; ++i)
	3689	{
	3690	freelist(mp[i].in);
	3691	ifree((char *) mp[i].in);
	3692	ifree(mp[i].left);
	3693	ifree(mp[i].right);
	3694	ifree(mp[i].is);
	3695	}
	3696	free((char *) mp);
	3697	}
	3698	/* vim:set shiftwidth=2: */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/dfa.c

Download in other formats: