Context Navigation

source: trunk/src/helpers/regexp.c@ 178

Visit:

Last change on this file since 178 was 178, checked in by umoeller, 23 years ago
More pager fixes.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 77.9 KB

Line
1
2	/*
3	*@@sourcefile regexp.c:
4	* extended regular expressions, taken from "Andy's Extended
5	* Regular Expressions", as written by Andy Key
6	* (nyangau@interalpha.co.uk) and released into the public
7	* domain, plus adjustments for the XWP helpers.
8	*
9	* Usage: All C programs; not OS/2-specific.
10	*
11	* Function prefixes:
12	* -- rxp* regular expression functions.
13	*
14	* Regular expression matching is done in the following stages:
15	*
16	* 1) Call rxpCompile to parse the expression into a recursive
17	* tree of matches.
18	*
19	* This tree is converted into a finite state machine (FSM).
20	* A second FSM is built from the first but with epsilon
21	* moves removed to elimate lockups and increase speed.
22	*
23	* The input string can be used to drive the FSM through all
24	* the possible routes. The largest (or smallest) amount of
25	* input string required to reach the finish state is recorded
26	* since an extended regular expression is deemed to match as
27	* much input string as possible.
28	*
29	* 2) Call one of rxpMatch, rxpMatch_fwd, or rxpMatch_bwd to
30	* perform a match.
31	*
32	* 3) Call rxpFree to free the compiled ERE.
33	*
34	* Beware: The matching routine is highly recursive and can
35	* require around 20 to 30 bytes per character in the source string
36	* to match it. Thus you should try to limit the length of the source
37	* string and/or allow a stack size of at least 20 (to 30) * max string
38	* length. In addition, add 2KB to allow for the use of several nested
39	* sub-expressions in the matching.
40	*
41	*@@header "helpers\regexp.h"
42	*@@added V0.9.19 (2002-04-17) [umoeller]
43	*/
44
45	/*
46	* Copyright (C) 2002 Ulrich Mller.
47	* This file is part of the "XWorkplace helpers" source package.
48	* This is free software; you can redistribute it and/or modify
49	* it under the terms of the GNU General Public License as published
50	* by the Free Software Foundation, in version 2 as it comes in the
51	* "COPYING" file of the XWorkplace main distribution.
52	* This program is distributed in the hope that it will be useful,
53	* but WITHOUT ANY WARRANTY; without even the implied warranty of
54	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
55	* GNU General Public License for more details.
56	*/
57
58	#include <stdio.h>
59	#include <ctype.h>
60	#include <stdlib.h>
61	#include <string.h>
62	#include <malloc.h>
63	#include <memory.h>
64	#define ERE_C
65	#include "helpers\regexp.h"
66
67	#define isword(c) (isalnum(c)\|\|(c)=='_')
68
69	static int val_of_hex(char c)
70	{
71	if (c >= '0' && c <= '9')
72	return c - '0';
73	if (c >= 'a' && c <= 'f')
74	return c - 'a' + 10;
75	if (c >= 'A' && c <= 'F')
76	return c - 'A' + 10;
77	return 0; // Shouldn't get here
78	}
79
80	static int escaped(const char *s)
81	{
82	if (s[0] == 'x' && isxdigit(s[1]))
83	// \x followed by 1 or 2 hex digits
84	{
85	int n = val_of_hex(s[1]);
86
87	if (isxdigit(s[2]))
88	n = n * 16 + val_of_hex(s[2]);
89	return n;
90	}
91	else
92	switch (s[0])
93	{
94	case 'n':
95	return '\n';
96	case 't':
97	return '\t';
98	case 'r':
99	return '\r';
100	case 'b':
101	return '\b';
102	case 'f':
103	return '\f';
104	case 'e':
105	return 0x1b;
106	default:
107	return (unsigned char)s[0];
108	}
109	}
110
111	static const char past_escaped(const char s)
112	{
113	if (s[0] == 'x' && isxdigit(s[1]))
114	return isxdigit(s[2]) ? s + 3 : s + 2;
115	else
116	return s + 1;
117	}
118
119	#define zero_cclass(cclass) memset(cclass, 0, 0x100 >> 3)
120
121	static unsigned char bits[] =
122	{0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
123
124	#define add_to_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] \|= bits[(unsigned char)(n) & 7])
125
126	#define remove_from_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] &= ~bits[(unsigned char)(n) & 7])
127
128	static void invert_cclass(unsigned char *cclass)
129	{
130	int i;
131
132	for (i = 0; i < (0x100 >> 3); i++)
133	cclass[i] ^= 0xff;
134	}
135
136	#define match_cclass(n, cclass) ((cclass[(unsigned char)(n)>>3] & bits[(unsigned char)(n) & 7]) != 0)
137
138	#define CHL_EOS 0
139	#define CHL_END_CCLASS (-1)
140	#define CHL_COMP (-2)
141	#define CHL_RANGE (-3)
142	#define CHL_POSIX_COLLATING (-4)
143	#define CHL_POSIX_EQUIVALENCE (-5)
144	#define CHL_POSIX_CCLASS_BAD (-6)
145	#define CHL_POSIX_CCLASS_BASE (-30)
146	#define CHL_POSIX_CCLASS_END (-30+20)
147
148	/* I use my own wrapper functions so I can take their addresses.
149	* Remember, isalnum etc. can be implemented as macros... */
150	static BOOLEAN my_isalnum(int ch)
151	{
152	return isalnum(ch);
153	}
154	static BOOLEAN my_isalpha(int ch)
155	{
156	return isalpha(ch);
157	}
158	static BOOLEAN my_isblank(int ch)
159	{
160	return ch == ' ' \|\| ch == '\t';
161	}
162	static BOOLEAN my_iscntrl(int ch)
163	{
164	return iscntrl(ch);
165	}
166	static BOOLEAN my_isdigit(int ch)
167	{
168	return isdigit(ch);
169	}
170	static BOOLEAN my_islower(int ch)
171	{
172	return islower(ch);
173	}
174	static BOOLEAN my_isprint(int ch)
175	{
176	return isprint(ch);
177	}
178	static BOOLEAN my_ispunct(int ch)
179	{
180	return ispunct(ch);
181	}
182	static BOOLEAN my_isspace(int ch)
183	{
184	return isspace(ch);
185	}
186	static BOOLEAN my_isupper(int ch)
187	{
188	return isupper(ch);
189	}
190	static BOOLEAN my_isxdigit(int ch)
191	{
192	return isxdigit(ch);
193	}
194
195	typedef struct
196	{
197	int len_name;
198	const char *name;
199	BOOLEAN(*iscclass) (int ch);
200	}
201	POSIX_CCLASS;
202
203	static POSIX_CCLASS posix_cclass[] =
204	{
205	5, "alnum", my_isalnum,
206	5, "alpha", my_isalpha,
207	5, "blank", my_isblank,
208	5, "cntrl", my_iscntrl,
209	5, "digit", my_isdigit,
210	5, "lower", my_islower,
211	5, "print", my_isprint,
212	5, "punct", my_ispunct,
213	5, "space", my_isspace,
214	5, "upper", my_isupper,
215	6, "xdigit", my_isxdigit,
216	};
217
218	static int find_posix(const char *str)
219	{
220	int p;
221
222	if (str[0] != '[')
223	return str[0];
224	if (str[1] == '.')
225	return CHL_POSIX_COLLATING;
226	if (str[1] == '=')
227	return CHL_POSIX_EQUIVALENCE;
228	if (str[1] != ':')
229	return str[0];
230	for (p = 0; p < sizeof(posix_cclass) / sizeof(posix_cclass[0]); p++)
231	if (!memcmp(str + 2, posix_cclass[p].name, posix_cclass[p].len_name) &&
232	str[2 + posix_cclass[p].len_name] == ':' &&
233	str[2 + posix_cclass[p].len_name + 1] == ']')
234	return CHL_POSIX_CCLASS_BASE + p;
235	return CHL_POSIX_CCLASS_BAD;
236	}
237
238	static int cclass_thisch(const char *str)
239	{
240	switch (str[0])
241	{
242	case '\\':
243	return escaped(str + 1);
244	case ']':
245	return CHL_END_CCLASS;
246	case '^':
247	return CHL_COMP;
248	case '-':
249	return CHL_RANGE;
250	case '\0':
251	return CHL_EOS;
252	default:
253	return find_posix(str);
254	}
255	}
256
257	static const char cclass_nextch(const char str)
258	{
259	int p;
260
261	if (str[0] == '\\')
262	return past_escaped(str + 1);
263	p = find_posix(str);
264	if (CHL_POSIX_CCLASS_BASE <= p && p < CHL_POSIX_CCLASS_END)
265	return str + 2 + posix_cclass[p - CHL_POSIX_CCLASS_BASE].len_name + 2;
266	else
267	return str + 1;
268	}
269
270	static unsigned char compile_cclass(const char str,
271	const char **str_after,
272	int erecf,
273	int *rc)
274	{
275	unsigned char *cclass;
276	BOOLEAN complement;
277	int i, c, last_c = -1;
278
279	if ((cclass = (unsigned char *)malloc(0x100 >> 3)) == NULL)
280	{
281	*rc = ERROR_NOT_ENOUGH_MEMORY;
282	return NULL;
283	}
284	zero_cclass(cclass);
285
286	complement = (cclass_thisch(str) == CHL_COMP);
287	if (complement)
288	str = cclass_nextch(str);
289
290	while ((c = cclass_thisch(str)) != CHL_EOS && c != CHL_END_CCLASS)
291	{
292	if (CHL_POSIX_CCLASS_BASE <= c && c < CHL_POSIX_CCLASS_END)
293	{
294	for (i = 0; i < 0x100; i++)
295	if (posix_cclass[c - CHL_POSIX_CCLASS_BASE].iscclass(i))
296	add_to_cclass(i, cclass);
297	last_c = -1;
298	}
299	else
300	switch (c)
301	{
302	/...sCHL_POSIX_\42\ \45\ error cases:32: /
303	case CHL_POSIX_COLLATING:
304	free(cclass);
305	*rc = EREE_POSIX_COLLATING;
306	return NULL;
307	case CHL_POSIX_EQUIVALENCE:
308	free(cclass);
309	*rc = EREE_POSIX_EQUIVALENCE;
310	return NULL;
311	case CHL_POSIX_CCLASS_BAD:
312	free(cclass);
313	*rc = EREE_POSIX_CCLASS_BAD;
314	return NULL;
315	/...sCHL_RANGE \45\ range:32: /
316	case CHL_RANGE:
317	if (last_c == -1)
318	// Unexpected at this point
319	{
320	free(cclass);
321	*rc = EREE_UNEX_RANGE;
322	return NULL;
323	}
324	str = cclass_nextch(str);
325	if ((c = cclass_thisch(str)) == CHL_EOS \|\| c == CHL_END_CCLASS)
326	// Not followed by anything
327	{
328	free(cclass);
329	*rc = EREE_UNF_RANGE;
330	return NULL;
331	}
332	for (i = last_c + 1; i <= c; i++)
333	add_to_cclass(i, cclass);
334	last_c = c;
335	break;
336
337	#pragma info(nogen) // do not warn here
338
339	/...sCHL_COMP \45\ complement:32: /
340	case CHL_COMP:
341	c = '^';
342	// Fall through
343	/...sdefault \45\ individual entry:32: /
344	default:
345	if (erecf & ERECF_TOLOWER)
346	c = tolower(c);
347	add_to_cclass(c, cclass);
348	last_c = c;
349	break;
350
351	#pragma info(restore)
352
353	}
354	str = cclass_nextch(str);
355	}
356
357	if (c == CHL_EOS)
358	{
359	free(cclass);
360	*rc = EREE_UNF_CCLASS;
361	return NULL;
362	}
363
364	if (complement)
365	invert_cclass(cclass);
366
367	remove_from_cclass(0, cclass);
368
369	*str_after = cclass_nextch(str);
370	return cclass;
371	}
372
373	#define delete_cclass(cclass) free(cclass)
374
375	/* A 'match' embodies all of regular expression functionality.
376	* Matches can be defined in terms of each other. */
377
378	typedef unsigned char MTYPE;
379
380	#define MTYPE_NULL ((MTYPE) 0)
381	#define MTYPE_CHAR ((MTYPE) 1)
382	#define MTYPE_NCHAR ((MTYPE) 2)
383	#define MTYPE_STRING ((MTYPE) 3)
384	#define MTYPE_CCLASS ((MTYPE) 4)
385	#define MTYPE_WORD ((MTYPE) 5)
386	#define MTYPE_NWORD ((MTYPE) 6)
387	#define MTYPE_DOT ((MTYPE) 7)
388	#define MTYPE_QUERY ((MTYPE) 8)
389	#define MTYPE_PLUS ((MTYPE) 9)
390	#define MTYPE_STAR ((MTYPE) 10)
391	#define MTYPE_CREP ((MTYPE) 11)
392	#define MTYPE_OR ((MTYPE) 12)
393	#define MTYPE_CAT ((MTYPE) 13)
394	#define MTYPE_SUB ((MTYPE) 14)
395	#define MTYPE_SOL ((MTYPE) 15)
396	#define MTYPE_EOL ((MTYPE) 16)
397	#define MTYPE_SOW ((MTYPE) 17)
398	#define MTYPE_EOW ((MTYPE) 18)
399	#define MTYPE_IW ((MTYPE) 19)
400	#define MTYPE_EW ((MTYPE) 20)
401	#define MTYPE_BACK ((MTYPE) 21)
402
403	typedef struct match_struct MATCH;
404
405	struct match_struct
406	{
407	MTYPE mtype;
408	union
409	{
410	char character;
411	unsigned char *cclass;
412	MATCH *match;
413	MATCH *matchs[2];
414	int n_span;
415	char *string;
416	struct
417	{
418	unsigned m, n;
419	MATCH *match;
420	}
421	crep;
422	}
423	u;
424	};
425
426	static MATCH null_match =
427	{MTYPE_NULL};
428
429	#define NULL_MATCH (&null_match)
430
431	static void delete_match(MATCH * match)
432	{
433	if (match == NULL_MATCH)
434	return;
435	switch (match->mtype)
436	{
437	case MTYPE_STRING:
438	free(match->u.string);
439	break;
440	case MTYPE_CCLASS:
441	delete_cclass(match->u.cclass);
442	break;
443	case MTYPE_QUERY:
444	case MTYPE_PLUS:
445	case MTYPE_STAR:
446	delete_match(match->u.match);
447	break;
448	case MTYPE_CREP:
449	delete_match(match->u.crep.match);
450	break;
451	case MTYPE_OR:
452	case MTYPE_CAT:
453	delete_match(match->u.matchs[0]);
454	delete_match(match->u.matchs[1]);
455	break;
456	case MTYPE_SUB:
457	delete_match(match->u.match);
458	break;
459	}
460	free(match);
461	}
462
463	/*
464	*@@ shortest_match:
465	* determines the shortest possible match length for a given match
466	* tree. In the following example it is 3, thus allowing us to only
467	* consider 4 positions for regular expression matching in a 6
468	* character line.
469	*
470	+ aa*bc
471	+
472	+ 123456
473	+ ---
474	+ ---
475	+ ---
476	+ ---
477	*/
478
479	static unsigned shortest_match(const MATCH * match)
480	{
481	unsigned a, b;
482
483	switch (match->mtype)
484	{
485	case MTYPE_NULL:
486	return 0;
487	case MTYPE_CHAR:
488	return 1;
489	case MTYPE_NCHAR:
490	return 1;
491	case MTYPE_STRING:
492	return (unsigned char)match->u.string[0];
493	case MTYPE_CCLASS:
494	return 1;
495	case MTYPE_WORD:
496	return 1;
497	case MTYPE_NWORD:
498	return 1;
499	case MTYPE_DOT:
500	return 1;
501	case MTYPE_QUERY:
502	return 0;
503	case MTYPE_PLUS:
504	return shortest_match(match->u.match);
505	case MTYPE_STAR:
506	return 0;
507	case MTYPE_CREP:
508	return match->u.crep.m *
509	shortest_match(match->u.crep.match);
510	case MTYPE_OR:
511	a = shortest_match(match->u.matchs[0]);
512	b = shortest_match(match->u.matchs[1]);
513	return a <= b ? a : b;
514	case MTYPE_CAT:
515	return shortest_match(match->u.matchs[0]) +
516	shortest_match(match->u.matchs[1]);
517	case MTYPE_SUB:
518	return shortest_match(match->u.match);
519	case MTYPE_SOL:
520	return 0;
521	case MTYPE_EOL:
522	return 0;
523	case MTYPE_SOW:
524	return 0;
525	case MTYPE_EOW:
526	return 0;
527	case MTYPE_IW:
528	return 0;
529	case MTYPE_EW:
530	return 0;
531	case MTYPE_BACK:
532	return 0;
533	}
534	return 0; // Should never happen
535	}
536
537	static BOOLEAN got_backrefs(MATCH * match)
538	{
539	switch (match->mtype)
540	{
541	case MTYPE_QUERY:
542	case MTYPE_PLUS:
543	case MTYPE_STAR:
544	return got_backrefs(match->u.match);
545	case MTYPE_CREP:
546	return got_backrefs(match->u.crep.match);
547	case MTYPE_OR:
548	case MTYPE_CAT:
549	return got_backrefs(match->u.matchs[0]) \|\|
550	got_backrefs(match->u.matchs[1]);
551	case MTYPE_BACK:
552	return TRUE;
553	}
554	return FALSE;
555	}
556
557	static MATCH remove_subs(MATCH match)
558	{
559	switch (match->mtype)
560	{
561	case MTYPE_QUERY:
562	case MTYPE_PLUS:
563	case MTYPE_STAR:
564	match->u.match = remove_subs(match->u.match);
565	break;
566	case MTYPE_CREP:
567	match->u.crep.match = remove_subs(match->u.crep.match);
568	break;
569	case MTYPE_OR:
570	case MTYPE_CAT:
571	match->u.matchs[0] = remove_subs(match->u.matchs[0]);
572	match->u.matchs[1] = remove_subs(match->u.matchs[1]);
573	break;
574	case MTYPE_SUB:
575	{
576	MATCH *m = match;
577
578	match = match->u.match;
579	free(m);
580	}
581	break;
582	}
583	return match;
584	}
585
586	static int count_sub(const MATCH * match)
587	{
588	switch (match->mtype)
589	{
590	case MTYPE_SUB:
591	return 1 + count_sub(match->u.match);
592	case MTYPE_QUERY:
593	case MTYPE_PLUS:
594	case MTYPE_STAR:
595	return count_sub(match->u.match);
596	case MTYPE_CREP:
597	return count_sub(match->u.crep.match);
598	case MTYPE_OR:
599	case MTYPE_CAT:
600	return count_sub(match->u.matchs[0]) +
601	count_sub(match->u.matchs[1]);
602	}
603	return 0;
604	}
605
606	/...scompiling matches:0: /
607	/*
608	*
609	* <term> ::= <character> \| <class> \| ( <match> )
610	* <factor> ::= <term>? \| <term>+ \| <term>* \| <term>
611	* <factors> ::= <factor> { <factor> }
612	* <match> ::= <factors> { \|<factors> }
613	*
614	*/
615
616	/* We have an optimisation here in that if we can see a span of 'boring'
617	* characters, we return them as a single entity. We mustn't do this for the
618	* last in the span, as this may have modifiers applied to it, as in the
619	* example string "abcd", where the d has the modifier, but abc do not.
620	* We only do 255 chars at once, as the length is stored in a byte.
621	* Reducing 255 levels of per-character recursive searching to 1 level with
622	* a string compare has got to be a massive saving. */
623
624	#define STRING_MAX 255
625
626	#define CH_NOT_BASE 1000
627	#define CH_NOT_END (1000+0x100)
628	#define CH_EOS 0
629	#define CH_DOT (-1)
630	#define CH_WORD (-2)
631	#define CH_NWORD (-3)
632	#define CH_LSQR (-4)
633	#define CH_RSQR (-5)
634	#define CH_LPAR (-6)
635	#define CH_RPAR (-7)
636	#define CH_LCUR (-8)
637	#define CH_RCUR (-9)
638	#define CH_QUERY (-10)
639	#define CH_PLUS (-11)
640	#define CH_STAR (-12)
641	#define CH_OR (-13)
642	#define CH_SOL (-14)
643	#define CH_EOL (-15)
644	#define CH_SOW (-16)
645	#define CH_EOW (-17)
646	#define CH_IW (-18)
647	#define CH_EW (-19)
648	#define CH_BAD_TILDE (-20)
649	#define CH_STRING_BASE (-1000)
650	#define CH_STRING_END (-1000+STRING_MAX)
651	#define CH_BACK_BASE (-2000)
652	#define CH_BACK_END (-2000+9)
653
654	static int boring_string(const char *s)
655	{
656	int n = 0;
657
658	while (*s &&
659	strchr("\\.[](){}?+\|^$<>~", s) == NULL
660	&& n < STRING_MAX)
661	{
662	s++;
663	n++;
664	}
665	return n;
666	}
667
668	static int thisch(const char *str)
669	{
670	int n;
671
672	switch (str[0])
673	{
674	case '\\':
675	if (str[1] >= '1' && str[1] <= '9')
676	return CH_BACK_BASE + (str[1] - '1');
677	switch (str[1])
678	{
679	case '`':
680	return CH_SOL;
681	case '\'':
682	return CH_EOL;
683	case '<':
684	return CH_SOW;
685	case '>':
686	return CH_EOW;
687	case 'w':
688	return CH_WORD;
689	case 'W':
690	return CH_NWORD;
691	case 'B':
692	return CH_IW;
693	case 'y':
694	return CH_EW;
695	default:
696	return escaped(str + 1);
697	}
698
699	case '~':
700	if (str[1] == '\\')
701	return CH_NOT_BASE + escaped(str + 2);
702	else if (str[1] != '\0')
703	return CH_NOT_BASE + (unsigned char)str[1];
704	else
705	return '~';
706	case '.':
707	return CH_DOT;
708	case '[':
709	return CH_LSQR;
710	case ']':
711	return CH_RSQR;
712	case '(':
713	return CH_LPAR;
714	case ')':
715	return CH_RPAR;
716	case '{':
717	return CH_LCUR;
718	case '}':
719	return CH_RCUR;
720	case '?':
721	return CH_QUERY;
722	case '+':
723	return CH_PLUS;
724	case '*':
725	return CH_STAR;
726	case '\|':
727	return CH_OR;
728	case '^':
729	return CH_SOL;
730	case '$':
731	return CH_EOL;
732	}
733
734	n = boring_string(str);
735	if (n < 3)
736	return (unsigned char)str[0];
737	else
738	return CH_STRING_BASE + n - 1;
739	}
740
741	static const char nextch(const char str)
742	{
743	int n;
744
745	switch (str[0])
746	{
747	case '\\':
748	if (str[1] >= '1' && str[1] <= '9')
749	return str + 2;
750	switch (str[1])
751	{
752	case '`':
753	case '\'':
754	case '<':
755	case '>':
756	case 'w':
757	case 'W':
758	case 'B':
759	case 'y':
760	return str + 2;
761	default:
762	return past_escaped(str + 1);
763	}
764
765	case '~':
766	if (str[1] == '\\')
767	return past_escaped(str + 2);
768	else if (str[1] != '\0')
769	return str + 2;
770	else
771	return str + 1;
772	case '.':
773	case '[':
774	case '(':
775	case ')':
776	case '{':
777	case '}':
778	case '?':
779	case '+':
780	case '*':
781	case '\|':
782	case '^':
783	case '$':
784	return str + 1;
785	}
786
787	n = boring_string(str);
788	if (n < 3)
789	return str + 1;
790	else
791	return str + n - 1;
792	}
793
794	static MATCH compile_match(const char str, const char *str_after, int erecf, int rc);
795
796	static const char scan_number(const char str, unsigned *num)
797	{
798	if (!isdigit(*str))
799	return NULL;
800	*num = 0;
801	do
802	(num) = (num) * 10 + (*str++ - '0');
803	while (isdigit(*str));
804	return str;
805	}
806
807	static MATCH create_match(int rc)
808	{
809	MATCH *match;
810
811	if ((match = (MATCH *) malloc(sizeof(MATCH))) == NULL)
812	{
813	*rc = ERROR_NOT_ENOUGH_MEMORY;
814	return NULL;
815	}
816	return match;
817	}
818
819	static MATCH compile_term(const char str, const char *str_after, int erecf, int rc)
820	{
821	MATCH *match;
822	int c;
823
824	c = thisch(str);
825	switch (c)
826	{
827	case CH_RSQR:
828	*rc = EREE_UNEX_RSQR;
829	return NULL;
830	case CH_QUERY:
831	*rc = EREE_UNEX_QUERY;
832	return NULL;
833	case CH_PLUS:
834	*rc = EREE_UNEX_PLUS;
835	return NULL;
836	case CH_STAR:
837	*rc = EREE_UNEX_STAR;
838	return NULL;
839	case CH_LCUR:
840	*rc = EREE_UNEX_LCUR;
841	return NULL;
842	case CH_RCUR:
843	*rc = EREE_UNEX_RCUR;
844	return NULL;
845	}
846
847	if (c == CH_EOS \|\| c == CH_OR \|\| c == CH_RPAR)
848	{
849	*str_after = str;
850	return NULL_MATCH;
851	}
852
853	if ((match = create_match(rc)) == NULL)
854	return NULL;
855
856	if (CH_NOT_BASE <= c && c < CH_NOT_END)
857	/...snot a specific character:16: /
858	{
859	char ch = (char)(c - CH_NOT_BASE);
860
861	if (erecf & ERECF_TOLOWER)
862	ch = (char)tolower(ch);
863	match->mtype = MTYPE_NCHAR;
864	match->u.character = ch;
865	str = nextch(str);
866	}
867	else if (CH_STRING_BASE <= c && c < CH_STRING_END)
868	/...sa string of non\45\special characters:16: /
869	{
870	unsigned len = c - CH_STRING_BASE;
871
872	match->mtype = MTYPE_STRING;
873	match->u.string = (char *)malloc(1 + len);
874	if (match->u.string == NULL)
875	{
876	free(match);
877	*rc = ERROR_NOT_ENOUGH_MEMORY;
878	return NULL;
879	}
880	match->u.string[0] = (char)len;
881	memcpy(match->u.string + 1, str, len);
882	if (erecf & ERECF_TOLOWER)
883	{
884	unsigned i;
885
886	for (i = 1; i <= len; i++)
887	match->u.string[i] = (char)tolower(match->u.string[i]);
888	}
889	str = nextch(str);
890	}
891	else if (CH_BACK_BASE <= c && c < CH_BACK_END)
892	/...sa backreference:16: /
893	{
894	match->mtype = MTYPE_BACK;
895	match->u.n_span = c - CH_BACK_BASE;
896	str = nextch(str);
897	}
898	else
899	switch (c)
900	{
901	/...sCH_LSQR \45\ character class:24: /
902	case CH_LSQR:
903	match->mtype = MTYPE_CCLASS;
904	str = nextch(str);
905	if ((match->u.cclass = compile_cclass(str, &str, erecf, rc)) == NULL)
906	{
907	free(match);
908	return NULL;
909	}
910	break;
911	/...sCH_DOT \45\ any character:24: /
912	case CH_DOT:
913	match->mtype = MTYPE_DOT;
914	str = nextch(str);
915	break;
916	/...sCH_WORD \45\ \92\w:24: /
917	case CH_WORD:
918	match->mtype = MTYPE_WORD;
919	str = nextch(str);
920	break;
921	/...sCH_NWORD \45\ \92\W:24: /
922	case CH_NWORD:
923	match->mtype = MTYPE_NWORD;
924	str = nextch(str);
925	break;
926	/...sCH_LPAR \45\ nested regular expression:24: /
927	case CH_LPAR:
928	{
929	MATCH *sub_match;
930
931	str = nextch(str);
932	if ((sub_match = compile_match(str, &str, erecf, rc)) == NULL)
933	{
934	free(match);
935	return NULL;
936	}
937	if (!got_backrefs(sub_match))
938	sub_match = remove_subs(sub_match);
939	if (thisch(str) != CH_RPAR)
940	{
941	*rc = EREE_UNF_SUB;
942	return NULL;
943	}
944	str = nextch(str);
945	match->mtype = MTYPE_SUB;
946	match->u.match = sub_match;
947	}
948	break;
949	/...sCH_SOL \45\ \94\:24: /
950	case CH_SOL:
951	match->mtype = MTYPE_SOL;
952	str = nextch(str);
953	break;
954	/...sCH_EOL \45\ \36\:24: /
955	case CH_EOL:
956	match->mtype = MTYPE_EOL;
957	str = nextch(str);
958	break;
959	/...sCH_SOW \45\ \92\\60\:24: /
960	case CH_SOW:
961	match->mtype = MTYPE_SOW;
962	str = nextch(str);
963	break;
964	/...sCH_EOW \45\ \92\\62\:24: /
965	case CH_EOW:
966	match->mtype = MTYPE_EOW;
967	str = nextch(str);
968	break;
969	/...sCH_IW \45\ \92\B:24: /
970	case CH_IW:
971	match->mtype = MTYPE_IW;
972	str = nextch(str);
973	break;
974	/...sCH_EW \45\ \92\y:24: /
975	case CH_EW:
976	match->mtype = MTYPE_EW;
977	str = nextch(str);
978	break;
979	/...sdefault \45\ any old character:24: /
980	default:
981	{
982	char ch = (char)c;
983
984	if (erecf & ERECF_TOLOWER)
985	ch = (char)tolower(ch);
986	match->mtype = MTYPE_CHAR;
987	match->u.character = ch;
988	str = nextch(str);
989	}
990	break;
991	}
992
993	*str_after = str;
994	return match;
995	}
996
997	static MTYPE repeat_type_of(int c)
998	{
999	switch (c)
1000	{
1001	case CH_QUERY:
1002	return MTYPE_QUERY;
1003	case CH_PLUS:
1004	return MTYPE_PLUS;
1005	case CH_STAR:
1006	return MTYPE_STAR;
1007	case CH_LCUR:
1008	return MTYPE_CREP;
1009	default:
1010	return (MTYPE) - 1;
1011	}
1012	}
1013
1014	static MATCH compile_factor(const char str, const char *str_after, int erecf, int rc)
1015	{
1016	MATCH match, parent;
1017	MTYPE repeat_mtype;
1018
1019	if ((match = compile_term(str, &str, erecf, rc)) == NULL)
1020	return NULL;
1021
1022	while ((repeat_mtype = repeat_type_of(thisch(str))) != (MTYPE) - 1)
1023	/...smatch is to be repeated:16: /
1024	{
1025	if ((parent = create_match(rc)) == NULL)
1026	{
1027	delete_match(match);
1028	return NULL;
1029	}
1030
1031	parent->mtype = repeat_mtype;
1032	str = nextch(str);
1033	if (repeat_mtype == MTYPE_CREP)
1034	{
1035	parent->u.crep.match = match;
1036	if ((str = scan_number(str, &(parent->u.crep.m))) == NULL)
1037	{
1038	delete_match(match);
1039	free(parent);
1040	*rc = EREE_BAD_CREP_M;
1041	return NULL;
1042	}
1043	parent->u.crep.n = parent->u.crep.m;
1044	if (*str == ',')
1045	{
1046	++str;
1047	if (*str != '}')
1048	{
1049	if ((str = scan_number(str, &(parent->u.crep.n))) == NULL)
1050	{
1051	delete_match(match);
1052	free(parent);
1053	*rc = EREE_BAD_CREP_N;
1054	return NULL;
1055	}
1056	}
1057	else
1058	parent->u.crep.n = (unsigned)~0;
1059	}
1060	if (*str != '}')
1061	{
1062	delete_match(match);
1063	free(parent);
1064	*rc = EREE_UNF_CREP;
1065	return NULL;
1066	}
1067	++str;
1068	if (parent->u.crep.m > parent->u.crep.n)
1069	{
1070	delete_match(match);
1071	free(parent);
1072	*rc = EREE_BAD_CREP;
1073	return NULL;
1074	}
1075	}
1076	else
1077	parent->u.match = match;
1078	match = parent;
1079	}
1080
1081	*str_after = str;
1082
1083	return match;
1084	}
1085
1086	static MATCH compile_factors(const char str, const char *str_after, int erecf, int rc)
1087	{
1088	MATCH *match;
1089	int c;
1090
1091	if ((match = compile_factor(str, &str, erecf, rc)) == NULL)
1092	return NULL;
1093
1094	while ((c = thisch(str)) != CH_EOS && c != CH_RPAR && c != CH_OR)
1095	/...sconsider catenation of more factors:16: /
1096	{
1097	MATCH sibling, parent;
1098
1099	if ((sibling = compile_factor(str, &str, erecf, rc)) == NULL)
1100	{
1101	delete_match(match);
1102	return NULL;
1103	}
1104
1105	if ((parent = create_match(rc)) == NULL)
1106	{
1107	delete_match(sibling);
1108	delete_match(match);
1109	return NULL;
1110	}
1111	parent->mtype = MTYPE_CAT;
1112	parent->u.matchs[0] = match;
1113	parent->u.matchs[1] = sibling;
1114	match = parent;
1115	}
1116
1117	*str_after = str;
1118	return match;
1119	}
1120
1121	/...scompile_match \45\ factors\124\factors:0: /
1122	static MATCH compile_match(const char str, const char *str_after, int erecf, int rc)
1123	{
1124	MATCH *match;
1125
1126	if ((match = compile_factors(str, &str, erecf, rc)) == NULL)
1127	return NULL;
1128
1129	while (thisch(str) == CH_OR)
1130	/...sfind sibling and or it in:16: /
1131	{
1132	MATCH sibling, parent;
1133
1134	str = nextch(str);
1135	if ((sibling = compile_factors(str, &str, erecf, rc)) == NULL)
1136	{
1137	delete_match(match);
1138	return NULL;
1139	}
1140	if ((parent = create_match(rc)) == NULL)
1141	{
1142	delete_match(sibling);
1143	delete_match(match);
1144	return NULL;
1145	}
1146	parent->mtype = MTYPE_OR;
1147	parent->u.matchs[0] = match;
1148	parent->u.matchs[1] = sibling;
1149	match = parent;
1150	}
1151
1152	*str_after = str;
1153
1154	return match;
1155	}
1156
1157	#ifdef DEBUG
1158	/...sprint_tree:0: /
1159	/...sdo_indent:0: /
1160	static void do_indent(int indent)
1161	{
1162	while (indent--)
1163	putchar('\t');
1164	}
1165
1166	static void print_tree(const MATCH * match, int indent)
1167	{
1168	do_indent(indent);
1169	switch (match->mtype)
1170	{
1171	case MTYPE_NULL:
1172	printf("null\n");
1173	break;
1174	case MTYPE_CHAR:
1175	printf("%c\n", match->u.character);
1176	break;
1177	case MTYPE_NCHAR:
1178	printf("~%c\n", match->u.character);
1179	break;
1180	case MTYPE_STRING:
1181	printf("%.s\n",
1182	(unsigned char)match->u.string[0],
1183	(unsigned char)match->u.string[0],
1184	match->u.string + 1);
1185	break;
1186	case MTYPE_CCLASS:
1187	{
1188	int i;
1189
1190	printf("[");
1191	for (i = 0; i < 0x100; i++)
1192	if (match_cclass(i, match->u.cclass))
1193	printf("%c", i);
1194	printf("]\n");
1195	}
1196	break;
1197	case MTYPE_DOT:
1198	printf(".\n");
1199	break;
1200	case MTYPE_WORD:
1201	printf("\\w\n");
1202	break;
1203	case MTYPE_NWORD:
1204	printf("\\W\n");
1205	break;
1206	case MTYPE_QUERY:
1207	printf("?\n");
1208	print_tree(match->u.match, indent + 1);
1209	break;
1210	case MTYPE_PLUS:
1211	printf("+\n");
1212	print_tree(match->u.match, indent + 1);
1213	break;
1214	case MTYPE_STAR:
1215	printf("*\n");
1216	print_tree(match->u.match, indent + 1);
1217	break;
1218	case MTYPE_CREP:
1219	printf("{%u,%u}\n",
1220	match->u.crep.m,
1221	match->u.crep.n);
1222	print_tree(match->u.crep.match, indent + 1);
1223	break;
1224	case MTYPE_OR:
1225	printf("\|\n");
1226	print_tree(match->u.matchs[0], indent + 1);
1227	print_tree(match->u.matchs[1], indent + 1);
1228	break;
1229	case MTYPE_CAT:
1230	printf("CAT\n");
1231	print_tree(match->u.matchs[0], indent + 1);
1232	print_tree(match->u.matchs[1], indent + 1);
1233	break;
1234	case MTYPE_SUB:
1235	printf("SUB\n");
1236	print_tree(match->u.match, indent + 1);
1237	break;
1238	case MTYPE_SOL:
1239	printf("^\n");
1240	break;
1241	case MTYPE_EOL:
1242	printf("$\n");
1243	break;
1244	case MTYPE_SOW:
1245	printf("\\<\n");
1246	break;
1247	case MTYPE_EOW:
1248	printf("\\>\n");
1249	break;
1250	case MTYPE_IW:
1251	printf("\\B\n");
1252	break;
1253	case MTYPE_EW:
1254	printf("\\y\n");
1255	break;
1256	case MTYPE_BACK:
1257	printf("\\%d\n", match->u.n_span + 1);
1258	break;
1259	}
1260	}
1261	#endif
1262
1263	/...sfinite state machine:0: /
1264	typedef unsigned char ETYPE; // Edge type
1265
1266	#define ETYPE_CHAR ((ETYPE) 0) // Can advance if given character
1267	#define ETYPE_NCHAR ((ETYPE) 1) // Can advance if not given char.
1268	#define ETYPE_STRING ((ETYPE) 2) // Can advance if strings match
1269	#define ETYPE_DOT ((ETYPE) 3) // Can advance if any character
1270	#define ETYPE_CCLASS ((ETYPE) 4) // Can advance if any in class
1271	#define ETYPE_WORD ((ETYPE) 5) // Can advance if word constituent
1272	#define ETYPE_NWORD ((ETYPE) 6) // Can advance if not word constit.
1273	#define ETYPE_EPSILON ((ETYPE) 7) // Can advance without reading input
1274	#define ETYPE_SOL ((ETYPE) 8) // Matches if at start of line
1275	#define ETYPE_EOL ((ETYPE) 9) // Matches if at end of line
1276	#define ETYPE_SOW ((ETYPE) 10) // Matches if at start of word
1277	#define ETYPE_EOW ((ETYPE) 11) // Matches if at end of word
1278	#define ETYPE_IW ((ETYPE) 12) // Matches if within word
1279	#define ETYPE_EW ((ETYPE) 13) // Matches if at word start or end
1280	#define ETYPE_SSUB ((ETYPE) 14) // Records passage over (
1281	#define ETYPE_ESUB ((ETYPE) 15) // Records passage over )
1282	#define ETYPE_BACK ((ETYPE) 16) // Back reference
1283
1284	typedef struct
1285	{
1286	ETYPE etype; // Edge type, (an ETYPE_ no)
1287	union
1288	{
1289	char character; // Character to use if ETYPE_CHAR
1290	unsigned char *cclass; // Class to use if ETYPE_CCLASS
1291	char *string; // len, then chars if ETYPE_STRING
1292	int n_span; // Used if ETYPE_BACK
1293	}
1294	u;
1295	const char *gate; // Used if type of epsilon move
1296	int to_state; // State to go to if test succeeds
1297	int next_edge; // Next test to try after this
1298	}
1299	EDGE;
1300
1301	/* Under DOS, restrict FSM size to 13KB approx.. On other environments memory
1302	* isn't such an issue, so allow something closer to 32KB. Of course, a better
1303	* implementation would dynamically grow the FSM size as needed. Something for
1304	* the future perhaps... */
1305
1306	#ifdef xxDOS
1307	#define MAX_STATES 200
1308	#define MAX_EDGES 600
1309	#else
1310	#define MAX_STATES 500
1311	#define MAX_EDGES 1500
1312	#endif
1313
1314	#define FLAG_FINISH 0x01
1315	#define FLAG_VISITED 0x02
1316	#define FLAG_REACHABLE 0x04
1317
1318	typedef struct
1319	{
1320	int n_states;
1321	int state_first_edges[MAX_STATES];
1322	unsigned char state_flags[MAX_STATES];
1323	int n_edges;
1324	EDGE edges[MAX_EDGES];
1325	}
1326	FSM;
1327
1328	/...screate_fsm:0: /
1329	static FSM create_fsm(int rc)
1330	{
1331	FSM *fsm;
1332	int i;
1333
1334	if ((fsm = (FSM *) malloc(sizeof(FSM))) == NULL)
1335	{
1336	*rc = ERROR_NOT_ENOUGH_MEMORY;
1337	return NULL;
1338	}
1339
1340	fsm->n_states = 0;
1341	fsm->n_edges = 0;
1342
1343	for (i = 0; i < MAX_STATES; i++)
1344	{
1345	fsm->state_first_edges[i] = -1;
1346	fsm->state_flags[i] = 0;
1347	}
1348
1349	return fsm;
1350	}
1351	/...sdelete_fsm:0: /
1352	#define delete_fsm(fsm) free(fsm)
1353
1354	/...smalloc_state:0: /
1355	static int malloc_state(FSM * fsm)
1356	{
1357	if (fsm->n_states == MAX_STATES)
1358	return -1;
1359	else
1360	return fsm->n_states++;
1361	}
1362	/...smalloc_edge:0: /
1363	/* Allocate space for new supplied edge and fill it in.
1364	* If already exists then don't bother (duplicates waste search time).
1365	* Return TRUE if all went ok. */
1366
1367	static BOOLEAN malloc_edge(int s, EDGE * edge, FSM * fsm)
1368	{
1369	int edge_no, n_edges = fsm->n_edges++;
1370
1371	// See if edge already exists
1372
1373	for (edge_no = fsm->state_first_edges[s];
1374	edge_no != -1;
1375	edge_no = fsm->edges[edge_no].next_edge)
1376	// Do we already have this edge
1377	if (fsm->edges[edge_no].to_state == edge->to_state &&
1378	fsm->edges[edge_no].etype == edge->etype)
1379	// An edge of this type already exists
1380	switch (edge->etype)
1381	{
1382	/...sETYPE_CHAR\47\NCHAR:32: /
1383	case ETYPE_CHAR:
1384	case ETYPE_NCHAR:
1385	if (edge->u.character == fsm->edges[edge_no].u.character)
1386	return TRUE;
1387	break;
1388	/...sETYPE_STRING:32: /
1389	case ETYPE_STRING:
1390	if ((unsigned char)fsm->edges[edge_no].u.string[0] ==
1391	(unsigned char)edge->u.string[0] &&
1392	!memcmp(fsm->edges[edge_no].u.string + 1,
1393	edge->u.string + 1,
1394	(unsigned char)edge->u.string[0]))
1395	return TRUE;
1396	break;
1397	/...sETYPE_CCLASS:32: /
1398	case ETYPE_CCLASS:
1399	if (edge->u.cclass == fsm->edges[edge_no].u.cclass)
1400	return TRUE;
1401	break;
1402	/...sETYPE_DOT\47\EPSILON\47\SOL\47\EOL etc\46\\46\:32: /
1403	case ETYPE_DOT:
1404	case ETYPE_EPSILON:
1405	case ETYPE_SOL:
1406	case ETYPE_EOL:
1407	case ETYPE_SOW:
1408	case ETYPE_EOW:
1409	case ETYPE_IW:
1410	case ETYPE_EW:
1411	case ETYPE_SSUB:
1412	case ETYPE_ESUB:
1413	return TRUE;
1414	/...sETYPE_BACK:32: /
1415	case ETYPE_BACK:
1416	if (edge->u.n_span == fsm->edges[edge_no].u.n_span)
1417	return TRUE;
1418	break;
1419	}
1420
1421	// Going to have to add the edge
1422
1423	if (n_edges >= MAX_EDGES)
1424	return FALSE;
1425
1426	memcpy(&(fsm->edges[n_edges]), edge, sizeof(EDGE));
1427	fsm->edges[n_edges].next_edge = fsm->state_first_edges[s];
1428	fsm->state_first_edges[s] = n_edges;
1429	return TRUE;
1430	}
1431	/...smake_fsm_from_match:0: /
1432	/...sadd_edge_to_fsm_character:0: /
1433	static BOOLEAN add_edge_to_fsm_character(int s, int f, FSM * fsm, char character)
1434	{
1435	EDGE edge;
1436
1437	edge.etype = ETYPE_CHAR;
1438	edge.u.character = character;
1439	edge.to_state = f;
1440	return malloc_edge(s, &edge, fsm);
1441	}
1442	/...sadd_edge_to_fsm_ncharacter:0: /
1443	static BOOLEAN add_edge_to_fsm_ncharacter(int s, int f, FSM * fsm, char character)
1444	{
1445	EDGE edge;
1446
1447	edge.etype = ETYPE_NCHAR;
1448	edge.u.character = character;
1449	edge.to_state = f;
1450	return malloc_edge(s, &edge, fsm);
1451	}
1452	/...sadd_edge_to_fsm_string:0: /
1453	static BOOLEAN add_edge_to_fsm_string(int s, int f, FSM * fsm, char *string)
1454	{
1455	EDGE edge;
1456
1457	edge.etype = ETYPE_STRING;
1458	edge.u.string = string;
1459	edge.to_state = f;
1460	return malloc_edge(s, &edge, fsm);
1461	}
1462	/...sadd_edge_to_fsm_cclass:0: /
1463	static BOOLEAN add_edge_to_fsm_cclass(int s, int f, FSM * fsm, unsigned char *cclass)
1464	{
1465	EDGE edge;
1466
1467	edge.etype = ETYPE_CCLASS;
1468	edge.u.cclass = cclass;
1469	edge.to_state = f;
1470	return malloc_edge(s, &edge, fsm);
1471	}
1472	/...sadd_edge_to_fsm_dot:0: /
1473	static BOOLEAN add_edge_to_fsm_dot(int s, int f, FSM * fsm)
1474	{
1475	EDGE edge;
1476
1477	edge.etype = ETYPE_DOT;
1478	edge.to_state = f;
1479	return malloc_edge(s, &edge, fsm);
1480	}
1481	/...sadd_edge_to_fsm_word:0: /
1482	static BOOLEAN add_edge_to_fsm_word(int s, int f, FSM * fsm)
1483	{
1484	EDGE edge;
1485
1486	edge.etype = ETYPE_WORD;
1487	edge.to_state = f;
1488	return malloc_edge(s, &edge, fsm);
1489	}
1490	/...sadd_edge_to_fsm_nword:0: /
1491	static BOOLEAN add_edge_to_fsm_nword(int s, int f, FSM * fsm)
1492	{
1493	EDGE edge;
1494
1495	edge.etype = ETYPE_NWORD;
1496	edge.to_state = f;
1497	return malloc_edge(s, &edge, fsm);
1498	}
1499	/...sadd_edge_to_fsm_epsilon:0: /
1500	static BOOLEAN add_edge_to_fsm_epsilon(int s, int f, FSM * fsm)
1501	{
1502	EDGE edge;
1503
1504	edge.etype = ETYPE_EPSILON;
1505	edge.to_state = f;
1506	return malloc_edge(s, &edge, fsm);
1507	}
1508	/...sadd_edge_to_fsm_special:0: /
1509	static BOOLEAN add_edge_to_fsm_special(int s, int f, FSM * fsm, ETYPE etype)
1510	{
1511	EDGE edge;
1512
1513	edge.etype = etype;
1514	edge.to_state = f;
1515	edge.gate = NULL;
1516	return malloc_edge(s, &edge, fsm);
1517	}
1518	/...sadd_edge_to_fsm_back:0: /
1519	static BOOLEAN add_edge_to_fsm_back(int s, int f, FSM * fsm, int n_span)
1520	{
1521	EDGE edge;
1522
1523	edge.etype = ETYPE_BACK;
1524	edge.to_state = f;
1525	edge.gate = NULL;
1526	edge.u.n_span = n_span;
1527	return malloc_edge(s, &edge, fsm);
1528	}
1529
1530	static BOOLEAN make_fsm_from_match(MATCH * match, FSM * fsm, int s, int f)
1531	{
1532	int n1, n2, n3, n4, i;
1533
1534	switch (match->mtype)
1535	{
1536	/...sMTYPE_NULL:16: /
1537	/*
1538	* e
1539	* S ----> F
1540	*
1541	*/
1542
1543	case MTYPE_NULL:
1544	return (*s = malloc_state(fsm)) != -1 &&
1545	(*f = malloc_state(fsm)) != -1 &&
1546	add_edge_to_fsm_epsilon(s, f, fsm);
1547	/...sMTYPE_CHAR:16: /
1548	/*
1549	* c
1550	* S ----> F
1551	*
1552	*/
1553
1554	case MTYPE_CHAR:
1555	return (*s = malloc_state(fsm)) != -1 &&
1556	(*f = malloc_state(fsm)) != -1 &&
1557	add_edge_to_fsm_character(s, f, fsm, match->u.character);
1558	/...sMTYPE_NCHAR:16: /
1559	/*
1560	* ~c
1561	* S ----> F
1562	*
1563	*/
1564
1565	case MTYPE_NCHAR:
1566	return (*s = malloc_state(fsm)) != -1 &&
1567	(*f = malloc_state(fsm)) != -1 &&
1568	add_edge_to_fsm_ncharacter(s, f, fsm, match->u.character);
1569	/...sMTYPE_STRING:16: /
1570	/*
1571	* string
1572	* S ----> F
1573	*
1574	*/
1575
1576	case MTYPE_STRING:
1577	return (*s = malloc_state(fsm)) != -1 &&
1578	(*f = malloc_state(fsm)) != -1 &&
1579	add_edge_to_fsm_string(s, f, fsm, match->u.string);
1580
1581	/...sMTYPE_CCLASS:16: /
1582	/*
1583	* cclass
1584	* S ----> F
1585	*
1586	*/
1587
1588	case MTYPE_CCLASS:
1589	return (*s = malloc_state(fsm)) != -1 &&
1590	(*f = malloc_state(fsm)) != -1 &&
1591	add_edge_to_fsm_cclass(s, f, fsm, match->u.cclass);
1592
1593	/...sMTYPE_DOT:16: /
1594	/*
1595	* any
1596	* S ----> F
1597	*
1598	*/
1599
1600	case MTYPE_DOT:
1601	return (*s = malloc_state(fsm)) != -1 &&
1602	(*f = malloc_state(fsm)) != -1 &&
1603	add_edge_to_fsm_dot(s, f, fsm);
1604
1605	/...sMTYPE_WORD:16: /
1606	/*
1607	* word
1608	* S ----> F
1609	*
1610	*/
1611
1612	case MTYPE_WORD:
1613	return (*s = malloc_state(fsm)) != -1 &&
1614	(*f = malloc_state(fsm)) != -1 &&
1615	add_edge_to_fsm_word(s, f, fsm);
1616
1617	/...sMTYPE_NWORD:16: /
1618	/*
1619	* !word
1620	* S ----> F
1621	*
1622	*/
1623
1624	case MTYPE_NWORD:
1625	return (*s = malloc_state(fsm)) != -1 &&
1626	(*f = malloc_state(fsm)) != -1 &&
1627	add_edge_to_fsm_nword(s, f, fsm);
1628
1629	/...sMTYPE_QUERY:16: /
1630	/*
1631	* e
1632	* S ----> F
1633	* \| ^
1634	* \| e \| e
1635	* v \|
1636	* [n1 n2]
1637	*
1638	*/
1639
1640	case MTYPE_QUERY:
1641	if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1642	return FALSE;
1643	return (*s = malloc_state(fsm)) != -1 &&
1644	(*f = malloc_state(fsm)) != -1 &&
1645	add_edge_to_fsm_epsilon(s, f, fsm) &&
1646	add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1647	add_edge_to_fsm_epsilon(n2, *f, fsm);
1648
1649	/...sMTYPE_PLUS:16: /
1650	/*
1651	*
1652	* [S F]
1653	* ^ e \|
1654	* +---+
1655	*
1656	*/
1657
1658	case MTYPE_PLUS:
1659	if (!make_fsm_from_match(match->u.match, fsm, s, f))
1660	return FALSE;
1661	return add_edge_to_fsm_epsilon(f, s, fsm);
1662
1663	/...sMTYPE_STAR:16: /
1664	/*
1665	*
1666	* +---- S/F
1667	* \| ^
1668	* \| e \| e
1669	* v \|
1670	* [n1 n2]
1671	*
1672	*/
1673
1674	case MTYPE_STAR:
1675	if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1676	return FALSE;
1677	return (s = f = malloc_state(fsm)) != -1 &&
1678	add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1679	add_edge_to_fsm_epsilon(n2, *f, fsm);
1680
1681	/...sMTYPE_CREP:16: /
1682	/*
1683	* e e e e
1684	* S-->[n1 n2]-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,4}
1685	* \| \| \|
1686	* \| e \| e \| e
1687	* v \| \|
1688	* F<---------+----------+
1689	*
1690	* e
1691	* +----+
1692	* \| \|
1693	* e e e v \|
1694	* S-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,}
1695	* \| \|
1696	* \| e \| e
1697	* v \|
1698	* F<---------+
1699	*/
1700
1701	case MTYPE_CREP:
1702	if ((*s = malloc_state(fsm)) == -1 \|\|
1703	(*f = malloc_state(fsm)) == -1)
1704	return FALSE;
1705	n3 = *s;
1706	for (i = 0; i < (int)match->u.crep.m; i++)
1707	{
1708	if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1709	return FALSE;
1710	if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1711	return FALSE;
1712	n3 = n2;
1713	}
1714	if (!add_edge_to_fsm_epsilon(n3, *f, fsm))
1715	return FALSE;
1716	if (match->u.crep.n != ~0)
1717	for (; i < (int)match->u.crep.n; i++)
1718	{
1719	if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1720	return FALSE;
1721	if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1722	return FALSE;
1723	if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1724	return FALSE;
1725	n3 = n2;
1726	}
1727	else
1728	{
1729	if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1730	return FALSE;
1731	if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1732	return FALSE;
1733	if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1734	return FALSE;
1735	if (!add_edge_to_fsm_epsilon(n2, n1, fsm))
1736	return FALSE;
1737	}
1738	return TRUE;
1739
1740	/...sMTYPE_OR:16: /
1741	/*
1742	* e e
1743	* +--->[n1 n2]----+
1744	* \| v
1745	* S F
1746	* \| e e ^
1747	* +--->[n3 n4]----+
1748	*
1749	*/
1750
1751	case MTYPE_OR:
1752	if (!make_fsm_from_match(match->u.matchs[0], fsm, &n1, &n2) \|\|
1753	!make_fsm_from_match(match->u.matchs[1], fsm, &n3, &n4))
1754	return FALSE;
1755	return (*s = malloc_state(fsm)) != -1 &&
1756	(*f = malloc_state(fsm)) != -1 &&
1757	add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1758	add_edge_to_fsm_epsilon(*s, n3, fsm) &&
1759	add_edge_to_fsm_epsilon(n2, *f, fsm) &&
1760	add_edge_to_fsm_epsilon(n4, *f, fsm);
1761
1762	/...sMTYPE_CAT:16: /
1763	/*
1764	* e
1765	* [S n1]---->[n2 F]
1766	*
1767	*/
1768
1769	case MTYPE_CAT:
1770	if (!make_fsm_from_match(match->u.matchs[0], fsm, s, &n1) \|\|
1771	!make_fsm_from_match(match->u.matchs[1], fsm, &n2, f))
1772	return FALSE;
1773	return add_edge_to_fsm_epsilon(n1, n2, fsm);
1774
1775	/...sMTYPE_SUB:16: /
1776	/*
1777	* ssub esub
1778	* S---->[n1 n2]---->F
1779	*
1780	*/
1781
1782	case MTYPE_SUB:
1783	if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1784	return FALSE;
1785	return (*s = malloc_state(fsm)) != -1 &&
1786	(*f = malloc_state(fsm)) != -1 &&
1787	add_edge_to_fsm_special(*s, n1, fsm, ETYPE_SSUB) &&
1788	add_edge_to_fsm_special(n2, *f, fsm, ETYPE_ESUB);
1789
1790	/...sMTYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW:16: /
1791	/*
1792	* special
1793	* S ----> F
1794	*
1795	*/
1796
1797	case MTYPE_SOL:
1798	return (*s = malloc_state(fsm)) != -1 &&
1799	(*f = malloc_state(fsm)) != -1 &&
1800	add_edge_to_fsm_special(s, f, fsm, ETYPE_SOL);
1801	case MTYPE_EOL:
1802	return (*s = malloc_state(fsm)) != -1 &&
1803	(*f = malloc_state(fsm)) != -1 &&
1804	add_edge_to_fsm_special(s, f, fsm, ETYPE_EOL);
1805	case MTYPE_SOW:
1806	return (*s = malloc_state(fsm)) != -1 &&
1807	(*f = malloc_state(fsm)) != -1 &&
1808	add_edge_to_fsm_special(s, f, fsm, ETYPE_SOW);
1809	case MTYPE_EOW:
1810	return (*s = malloc_state(fsm)) != -1 &&
1811	(*f = malloc_state(fsm)) != -1 &&
1812	add_edge_to_fsm_special(s, f, fsm, ETYPE_EOW);
1813	case MTYPE_IW:
1814	return (*s = malloc_state(fsm)) != -1 &&
1815	(*f = malloc_state(fsm)) != -1 &&
1816	add_edge_to_fsm_special(s, f, fsm, ETYPE_IW);
1817	case MTYPE_EW:
1818	return (*s = malloc_state(fsm)) != -1 &&
1819	(*f = malloc_state(fsm)) != -1 &&
1820	add_edge_to_fsm_special(s, f, fsm, ETYPE_EW);
1821
1822	/...sMTYPE_BACK:16: /
1823	case MTYPE_BACK:
1824	return (*s = malloc_state(fsm)) != -1 &&
1825	(*f = malloc_state(fsm)) != -1 &&
1826	add_edge_to_fsm_back(s, f, fsm, match->u.n_span);
1827
1828	}
1829	return FALSE; // Should never happen
1830	}
1831
1832	/...sremove_epsilons:0: /
1833	/* The problems with epsilon moves are :-
1834	* 1) They can gang up on you in groups to form loops of states that require
1835	* no input to go all the way around them. Any function attempting a
1836	* recursive search of the FSM will recurse forever.
1837	* 2) They can slow the recognition process by as much as a factor of 2.
1838	* eg:
1839	* a e b e c
1840	* O ----> O ----> O ----> O ----> O ----> O Is slow
1841	*
1842	* a b c
1843	* O ----> O ----> O ----> O Is faster
1844	*
1845	*/
1846
1847	/...sfinish_states:0: /
1848	/...sis_finish_reachable:0: /
1849	/* When we recurse we mark the current state as visited to stop infinite
1850	* recursion on loops of epsilon moves. */
1851
1852	static BOOLEAN is_finish_reachable(FSM * fsm, int state_no)
1853	{
1854	int edge_no;
1855	BOOLEAN ok;
1856
1857	if (fsm->state_flags[state_no] & FLAG_VISITED)
1858	// Been here already
1859	return FALSE;
1860
1861	if (fsm->state_flags[state_no] & FLAG_FINISH)
1862	// At finish
1863	return TRUE;
1864
1865	for (edge_no = fsm->state_first_edges[state_no];
1866	edge_no != -1;
1867	edge_no = fsm->edges[edge_no].next_edge)
1868	if (fsm->edges[edge_no].etype == ETYPE_EPSILON)
1869	{
1870	fsm->state_flags[state_no] \|= FLAG_VISITED;
1871	ok = is_finish_reachable(fsm, fsm->edges[edge_no].to_state);
1872	fsm->state_flags[state_no] &= ~FLAG_VISITED;
1873	if (ok)
1874	return TRUE;
1875	}
1876
1877
1878	return FALSE;
1879	}
1880
1881
1882	static void finish_states(int f, FSM * fsm, FSM * fsm_without)
1883	{
1884	int state_no;
1885
1886	fsm->state_flags[f] = FLAG_FINISH;
1887	fsm_without->state_flags[f] = FLAG_FINISH;
1888	for (state_no = 0; state_no < fsm->n_states; state_no++)
1889	if (is_finish_reachable(fsm, state_no))
1890	fsm_without->state_flags[state_no] = FLAG_FINISH;
1891	}
1892
1893	/...sdetermine_reachable:0: /
1894	static void determine_reachable(int s, FSM * fsm, FSM * fsm_without)
1895	{
1896	int edge_no, to_state;
1897
1898	fsm_without->state_flags[s] \|= FLAG_REACHABLE;
1899	for (edge_no = 0; edge_no < fsm->n_edges; edge_no++)
1900	if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1901	{
1902	to_state = fsm->edges[edge_no].to_state;
1903	fsm_without->state_flags[to_state] \|= FLAG_REACHABLE;
1904	}
1905	}
1906
1907	/...scopy_non_epsilons:0: /
1908	static void copy_non_epsilons(FSM * fsm, FSM * fsm_without)
1909	{
1910	int state_no, edge_no;
1911
1912	for (state_no = 0; state_no < fsm->n_states; state_no++)
1913	if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1914	for (edge_no = fsm->state_first_edges[state_no];
1915	edge_no != -1;
1916	edge_no = fsm->edges[edge_no].next_edge)
1917	if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1918	malloc_edge(state_no, &(fsm->edges[edge_no]), fsm_without);
1919	}
1920
1921	/...sfollow_epsilons:0: /
1922	/...scopy_edges_reachable:0: /
1923	/* What this says is :-
1924	* If state A can reach state B, by an epsilon move, and
1925	* state B can reach state C then
1926	* state A can reach state C *
1927	*
1928	* If the state B to state C involves an epsilon move then
1929	* A can reach whatever C can reach too (by recursion)
1930	*/
1931
1932	static BOOLEAN copy_edges_reachable(
1933	FSM * fsm,
1934	FSM * fsm_without,
1935	int state_no_to, /* AK: Bad identifier, might better be
1936	* described as 'original reachable state' */
1937	int state_no_from
1938	)
1939	{
1940	int edge_no;
1941	BOOLEAN ok;
1942
1943	if (fsm->state_flags[state_no_from] & FLAG_VISITED)
1944	// Been here already, therefore all copied from here ok
1945	return TRUE;
1946
1947	for (edge_no = fsm->state_first_edges[state_no_from];
1948	edge_no != -1;
1949	edge_no = fsm->edges[edge_no].next_edge)
1950	if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1951	// Had better add this edge
1952	/...sadd this edge to the \39\to\39\ state:24: /
1953	{
1954	if (!malloc_edge(state_no_to, &(fsm->edges[edge_no]), fsm_without))
1955	return (FALSE);
1956	}
1957
1958	else
1959	{
1960	fsm->state_flags[state_no_from] \|= FLAG_VISITED;
1961	ok = copy_edges_reachable(fsm, fsm_without, state_no_to, fsm->edges[edge_no].to_state);
1962	fsm->state_flags[state_no_from] &= ~FLAG_VISITED;
1963	if (!ok)
1964	return FALSE;
1965	}
1966
1967	return TRUE;
1968	}
1969
1970
1971	static BOOLEAN follow_epsilons(FSM * fsm, FSM * fsm_without)
1972	{
1973	int state_no;
1974
1975	for (state_no = 0; state_no < fsm->n_states; state_no++)
1976	if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1977	if (!copy_edges_reachable(fsm, fsm_without, state_no, state_no))
1978	return FALSE;
1979	return TRUE;
1980	}
1981
1982
1983	static BOOLEAN remove_epsilons(int s, int f, FSM * fsm, FSM * fsm_without)
1984	{
1985	// FSM with no epsilon moves will have the same number of states
1986
1987	fsm_without->n_states = fsm->n_states;
1988
1989	// Mark state f as a finish state in the new FSM
1990	// Any state with epsilon move(s) to state f is also a finish state
1991
1992	finish_states(f, fsm, fsm_without);
1993
1994	// Determine which states can be reached by non-epsilon moves. Add
1995	// to this set, the start state. The resulting states should have
1996	// their edges considered, but the other will not be reachable. This
1997	// is because they will be bypassed by follow_epsilons().
1998
1999	determine_reachable(s, fsm, fsm_without);
2000
2001	// Copy across all reachable, non epsilon moves to new FSM
2002
2003	copy_non_epsilons(fsm, fsm_without);
2004
2005	// For all states, determine all other states that can be reached by
2006	// epsilon moves and add the edges leading from them to us
2007
2008	return follow_epsilons(fsm, fsm_without);
2009	}
2010
2011	/...smatch_fsm:0: /
2012	/* Stack requirements per call, (one call per character in string!).
2013	*
2014	* eg: 16 bit OS/2, large model :-
2015	* Return address + Stack frame + SI and DI + Arguments + Locals
2016	* 2 + 2 + 2+2 + 4+2+4+4 + 2 = 24
2017	*
2018	* eg: 16 bit DOS large model :-
2019	* Return address + Stack frame + SI and DI + Arguments + Locals
2020	* 2 + 2 + 2+2 + 4+2 + 2 = 16
2021	*
2022	* eg: RS/6000 AIX :-
2023	* Massive stack frame per call, massive program stack size
2024	* net effect - room for approx 200 levels before core dump!
2025	*
2026	* In addition, we can consume additional stack for every ETYPE_SSUB
2027	* (and ETYPE_ESUB). Guessing this requirement to be approx 100 bytes,
2028	* we place an arbitrary limit of 20 (ie: 2KB) on subexpressions in an ERE.
2029	* This limit was enforced earlier during match tree parsing.
2030	*
2031	*/
2032
2033	typedef struct substruct SUBS;
2034	struct substruct
2035	{
2036	int n_spans;
2037	ERE_SPAN spans[MAX_SPANS];
2038	SUBS *next;
2039	};
2040
2041	#define MAX_SUBS 20
2042
2043	typedef struct
2044	{
2045	FSM *fsm;
2046	int eremf;
2047	const char *str_init;
2048	const char *str_limit;
2049	const char *str_best;
2050	SUBS *subs;
2051	SUBS *subs_base;
2052	ERE_MATCHINFO *mi;
2053	}
2054	CONTEXT;
2055
2056	#define NR
2057
2058	static void NR walk_fsm(const char str, int state_no, CONTEXT cx);
2059
2060	static void NR walk_fsm_gated(const char str, CONTEXT cx, EDGE * e)
2061	{
2062	if (e->gate != str)
2063	// Avoid looping via this edge
2064	{
2065	const char *gate = e->gate;
2066
2067	e->gate = str;
2068	walk_fsm(str, e->to_state, cx);
2069	e->gate = gate;
2070	}
2071	}
2072
2073	static void NR walk_fsm_ssub(const char str, CONTEXT cx, EDGE * e)
2074	{
2075	SUBS subs;
2076
2077	if (cx->subs->n_spans < MAX_SPANS)
2078	cx->subs->spans[cx->subs->n_spans].pos = str - cx->str_init;
2079
2080	subs.n_spans = 0;
2081	subs.next = cx->subs;
2082	cx->subs = &subs;
2083	walk_fsm_gated(str, cx, e);
2084	cx->subs = subs.next;
2085	}
2086
2087	static void NR walk_fsm_esub(const char str, CONTEXT cx, EDGE * e)
2088	{
2089	SUBS *subs = cx->subs;
2090
2091	cx->subs = cx->subs->next;
2092
2093	if (cx->subs->n_spans < MAX_SPANS)
2094	cx->subs->spans[cx->subs->n_spans].len =
2095	(str - cx->str_init) - cx->subs->spans[cx->subs->n_spans].pos;
2096	++(cx->subs->n_spans);
2097
2098	walk_fsm_gated(str, cx, e);
2099
2100	--(cx->subs->n_spans);
2101	cx->subs = subs;
2102	}
2103
2104	static void NR walk_fsm(const char str, int state_no, CONTEXT cx)
2105	{
2106	int edge_no;
2107
2108	if (cx->fsm->state_flags[state_no] & FLAG_FINISH)
2109	// Got to finishing state, may have got a better match than before
2110	{
2111	if ((cx->str_best == NULL \|\|
2112	(cx->str_best < str) == (cx->eremf & EREMF_SHORTEST) == 0) &&
2113	str <= cx->str_limit)
2114	{
2115	cx->str_best = str;
2116	if (cx->mi != NULL)
2117	{
2118	int i;
2119
2120	cx->mi->n_spans = cx->subs_base->n_spans;
2121	for (i = 0; i < cx->mi->n_spans; i++)
2122	cx->mi->spans[i] = cx->subs_base->spans[i];
2123	}
2124	}
2125	if (cx->eremf & EREMF_ANY)
2126	return;
2127	// Continue, as may be able to get a better match
2128	}
2129
2130	for (edge_no = cx->fsm->state_first_edges[state_no];
2131	edge_no != -1;
2132	edge_no = cx->fsm->edges[edge_no].next_edge)
2133	// Consider taking a step along an edge to a new state
2134	{
2135	EDGE *e = &(cx->fsm->edges[edge_no]);
2136
2137	switch (e->etype)
2138	{
2139	/...sETYPE_CHAR \45\ if matches character\44\ we can advance:24: /
2140	case ETYPE_CHAR:
2141	if (*str == e->u.character)
2142	walk_fsm(str + 1, e->to_state, cx);
2143	break;
2144
2145	/...sETYPE_NCHAR \45\ if not matches character\44\ we can advance:24: /
2146	case ETYPE_NCHAR:
2147	if (str != '\0' && str != e->u.character)
2148	walk_fsm(str + 1, e->to_state, cx);
2149	break;
2150
2151	/...sETYPE_STRING \45\ if matches string\44\ we can advance:24: /
2152	case ETYPE_STRING:
2153	{
2154	unsigned len = (unsigned char)e->u.string[0];
2155
2156	if (!memcmp(str, e->u.string + 1, len))
2157	walk_fsm(str + len, e->to_state, cx);
2158	}
2159	break;
2160
2161	/...sETYPE_DOT \45\ if got any character\44\ we can advance:24: /
2162	case ETYPE_DOT:
2163	if (*str != '\0')
2164	walk_fsm(str + 1, e->to_state, cx);
2165	break;
2166
2167	/...sETYPE_WORD \45\ if got word constituent\44\ we can advance:24: /
2168	case ETYPE_WORD:
2169	if (str != '\0' && isword(str))
2170	walk_fsm(str + 1, e->to_state, cx);
2171	break;
2172
2173	/...sETYPE_NWORD \45\ if got non word constituent\44\ we can advance:24: /
2174	case ETYPE_NWORD:
2175	if (str != '\0' && !isword(str))
2176	walk_fsm(str + 1, e->to_state, cx);
2177	break;
2178
2179	/...sETYPE_CCLASS \45\ if in the class\44\ we can advance:24: /
2180	case ETYPE_CCLASS:
2181	if (match_cclass(*str, e->u.cclass))
2182	walk_fsm(str + 1, e->to_state, cx);
2183	break;
2184
2185	/...sETYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW \45\ special epsilon moves:24: /
2186	case ETYPE_SOL:
2187	if (str == cx->str_init)
2188	walk_fsm_gated(str, cx, e);
2189	break;
2190	case ETYPE_EOL:
2191	if (*str == '\0')
2192	walk_fsm_gated(str, cx, e);
2193	break;
2194	case ETYPE_SOW:
2195	if (isword(str[0]) &&
2196	((cx->str_init < str && !isword(str[-1])) \|\|
2197	(cx->str_init == str)))
2198	walk_fsm_gated(str, cx, e);
2199	break;
2200	case ETYPE_EOW:
2201	if ((cx->str_init < str && isword(str[-1])) &&
2202	(str[0] == '\0' \|\| !isword(str[0])))
2203	walk_fsm_gated(str, cx, e);
2204	break;
2205	case ETYPE_IW:
2206	if (cx->str_init < str && isword(str[-1]) &&
2207	str[0] != '\0' && isword(str[0]))
2208	walk_fsm_gated(str, cx, e);
2209	break;
2210	case ETYPE_EW:
2211	if (isword(str[0]) &&
2212	((cx->str_init < str && !isword(str[-1])) \|\|
2213	(cx->str_init == str)))
2214	walk_fsm_gated(str, cx, e);
2215	else if ((cx->str_init < str && isword(str[-1])) &&
2216	(str[0] == '\0' \|\| !isword(str[0])))
2217	walk_fsm_gated(str, cx, e);
2218	break;
2219
2220	/...sETYPE_SSUB\47\ESUB \45\ handle nested subexpression:24: /
2221	case ETYPE_SSUB:
2222	walk_fsm_ssub(str, cx, e);
2223	break;
2224	case ETYPE_ESUB:
2225	walk_fsm_esub(str, cx, e);
2226	break;
2227
2228	/...sETYPE_BACK \45\ check backreference:24: /
2229	case ETYPE_BACK:
2230	if (e->u.n_span < cx->subs->n_spans)
2231	{
2232	int len = cx->subs->spans[e->u.n_span].len;
2233
2234	if (!memcmp(str, cx->str_init + cx->subs->spans[e->u.n_span].pos, len))
2235	walk_fsm_gated(str + len, cx, e);
2236	}
2237	break;
2238
2239	}
2240	}
2241	}
2242
2243	static const char match_fsm(FSM fsm,
2244	int eremf,
2245	const char *str,
2246	int posn,
2247	int limit,
2248	int state_no,
2249	ERE_MATCHINFO * mi)
2250	{
2251	CONTEXT cx;
2252	SUBS subs;
2253
2254	cx.fsm = fsm;
2255	cx.eremf = eremf;
2256	cx.str_init = str;
2257	cx.str_limit = str + limit;
2258	cx.str_best = NULL;
2259	cx.subs = &subs;
2260	cx.subs_base = &subs;
2261	cx.mi = mi;
2262	subs.n_spans = 0;
2263	walk_fsm(str + posn, state_no, &cx);
2264	return cx.str_best;
2265	}
2266
2267
2268	#ifdef DEBUG
2269	/...sprint_fsm:0: /
2270	static void print_fsm(FSM * fsm, int s, BOOLEAN not_just_reachable)
2271	{
2272	int state_no, edge_no;
2273
2274	printf("Starting state %02d\n", s);
2275	for (state_no = 0; state_no < fsm->n_states; state_no++)
2276	if ((fsm->state_flags[state_no] & FLAG_REACHABLE) != 0 \|\|
2277	not_just_reachable)
2278	{
2279	printf("%02d:%c\t", state_no, ((fsm->state_flags[state_no] & FLAG_FINISH) != 0) ? 'F' : ' ');
2280	for (edge_no = fsm->state_first_edges[state_no];
2281	edge_no != -1;
2282	edge_no = fsm->edges[edge_no].next_edge)
2283	/...sshow edge:32: /
2284	{
2285	EDGE *e = &(fsm->edges[edge_no]);
2286
2287	switch (e->etype)
2288	{
2289	case ETYPE_CHAR:
2290	printf("%c", e->u.character);
2291	break;
2292	case ETYPE_NCHAR:
2293	printf("~%c", e->u.character);
2294	break;
2295	case ETYPE_STRING:
2296	printf("%.s",
2297	(unsigned char)e->u.string[0],
2298	(unsigned char)e->u.string[0],
2299	e->u.string + 1);
2300	break;
2301	case ETYPE_DOT:
2302	printf(".");
2303	break;
2304	case ETYPE_CCLASS:
2305	printf("[");
2306	break;
2307	case ETYPE_WORD:
2308	printf("\\w");
2309	break;
2310	case ETYPE_NWORD:
2311	printf("\\W");
2312	break;
2313	case ETYPE_EPSILON:
2314	printf("e");
2315	break;
2316	case ETYPE_SOL:
2317	printf("^");
2318	break;
2319	case ETYPE_EOL:
2320	printf("$");
2321	break;
2322	case ETYPE_SOW:
2323	printf("\\<");
2324	break;
2325	case ETYPE_EOW:
2326	printf("\\>");
2327	break;
2328	case ETYPE_IW:
2329	printf("\\B");
2330	break;
2331	case ETYPE_EW:
2332	printf("\\y");
2333	break;
2334	case ETYPE_SSUB:
2335	printf("(");
2336	break;
2337	case ETYPE_ESUB:
2338	printf(")");
2339	break;
2340	case ETYPE_BACK:
2341	printf("\\%d", e->u.n_span + 1);
2342	break;
2343	}
2344	printf("->%02d\t", e->to_state);
2345	}
2346
2347	printf("\n");
2348	}
2349	}
2350
2351	#endif
2352
2353	/...sextended regular expressions:0: /
2354	/* An ERE knows its original match tree and also the FSM it is compiled into.
2355	* Using a (largely epsilon move free) FSM makes for faster searching. */
2356
2357	typedef struct
2358	{
2359	MATCH *match; // Parse tree for expression
2360	int shortest_match; // Shortest match possible
2361	FSM *fsm; // Compiled FSM
2362	int s; // Start state for FSM
2363	}
2364	ERE;
2365
2366	/*
2367	*@@ rxpCompile:
2368	* compiles the regular expression str for later matching.
2369	*
2370	* If ERECF_TOLOWER is passed with erecf, every character
2371	* (or range of characters) to be matched are stored in the
2372	* compiled ERE in lower case. Therefore, if strings to be
2373	* matched are passed in lower case also, the result is a
2374	* case-insensitive match.
2375	*/
2376
2377	ERE* rxpCompile(const char *str,
2378	int erecf,
2379	int *rc) // out: error code
2380	{
2381	ERE *ere;
2382	const char *str_after;
2383	FSM *fsm;
2384	int s, f;
2385
2386	*rc = NO_ERROR;
2387
2388	if ((ere = (ERE *) malloc(sizeof(ERE))) == NULL)
2389	{
2390	*rc = ERROR_NOT_ENOUGH_MEMORY;
2391	return NULL;
2392	}
2393
2394	if ((ere->match = compile_match(str, &str_after, erecf, rc)) == NULL)
2395	{
2396	free(ere);
2397	return NULL;
2398	}
2399
2400	if (thisch(str_after) == CH_RPAR)
2401	{
2402	delete_match(ere->match);
2403	free(ere);
2404	*rc = EREE_UNEX_RPAR;
2405	return NULL;
2406	}
2407
2408	if (count_sub(ere->match) > MAX_SUBS)
2409	{
2410	delete_match(ere->match);
2411	free(ere);
2412	*rc = EREE_TOO_MANY_SUB;
2413	return NULL;
2414	}
2415
2416	#ifdef DEBUG
2417	print_tree(ere->match, 0);
2418	#endif
2419
2420	ere->shortest_match = (int)shortest_match(ere->match);
2421
2422	if ((fsm = create_fsm(rc)) == NULL)
2423	{
2424	delete_match(ere->match);
2425	free(ere);
2426	return NULL;
2427	}
2428
2429	if (!make_fsm_from_match(ere->match, fsm, &s, &f))
2430	{
2431	delete_fsm(fsm);
2432	delete_match(ere->match);
2433	free(ere);
2434	*rc = EREE_COMPILE_FSM;
2435	return NULL;
2436	}
2437
2438	#ifdef DEBUG
2439	print_fsm(fsm, s, TRUE);
2440	#endif
2441
2442	if ((ere->fsm = create_fsm(rc)) == NULL)
2443	{
2444	delete_fsm(fsm);
2445	delete_match(ere->match);
2446	free(ere);
2447	*rc = EREE_COMPILE_FSM;
2448	return NULL;
2449	}
2450
2451	if (!remove_epsilons(s, f, fsm, ere->fsm))
2452	{
2453	delete_fsm(ere->fsm);
2454	delete_fsm(fsm);
2455	delete_match(ere->match);
2456	free(ere);
2457	*rc = EREE_COMPILE_FSM;
2458	return NULL;
2459	}
2460
2461	delete_fsm(fsm);
2462
2463	ere->s = s;
2464
2465	#ifdef DEBUG
2466	print_fsm(ere->fsm, s, FALSE);
2467	#endif
2468
2469	return ere;
2470	}
2471
2472	/*
2473	*@@ rxpMinLen:
2474	*
2475	*/
2476
2477	int rxpMinLen(const ERE * ere)
2478	{
2479	return ere->shortest_match;
2480	}
2481
2482	/*
2483	*@@ rxpMatch:
2484	* returns the number of characters in the match, starting from
2485	* pos characters into the string to be searched. Details of
2486	* sub-matches can also be returend. Returns -1 if no match.
2487	*
2488	* If EREMF_SHORTEST is passed with eremf, the code looks for
2489	* the shortest match, instead of the longest match.
2490	*
2491	* If EREMF_ANY is passed with eremf, the code doesn't try to
2492	* find the longest (or shortest) match, it will return with the
2493	* first match it finds (which could be of any length).
2494	* This can speed up matching.
2495	*/
2496
2497	int rxpMatch(const ERE * ere,
2498	int eremf,
2499	const char *str,
2500	int pos,
2501	ERE_MATCHINFO * mi)
2502	{
2503	int len = pos + strlen(str + pos);
2504	const char *str_best;
2505
2506	if ((str_best = match_fsm(ere->fsm, eremf, str, pos, len, ere->s, mi)) == NULL)
2507	return -1;
2508	return (str_best - str) - pos;
2509	}
2510
2511	/*
2512	*@@ rxpMatch_fwd:
2513	* match forwards within a string from a specified start position.
2514	*
2515	* If a match, return TRUE, and also return pos and len of the match.
2516	*
2517	* If EREMF_SHORTEST is passed with eremf, the code looks for
2518	* the shortest match, instead of the longest match.
2519	*
2520	* If EREMF_ANY is passed with eremf, the code doesn't try to
2521	* find the longest (or shortest) match, it will return with the
2522	* first match it finds (which could be of any length).
2523	* This can speed up matching.
2524	*/
2525
2526	BOOLEAN rxpMatch_fwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2527	int eremf, // in: EREMF_* flags
2528	const char *str, // in: string to test
2529	int pos, // in: start position
2530	int *pos_match, // out: position of match
2531	int *len_match, // out: length of match
2532	ERE_MATCHINFO *mi) // out: match info (for rxpSubsWith)
2533	{
2534	int len = pos + strlen(str + pos);
2535	int i;
2536
2537	for (i = pos; i <= len - ere->shortest_match; i++)
2538	{
2539	const char *str_best;
2540
2541	if ((str_best = match_fsm(ere->fsm, eremf, str, i, len, ere->s, mi)) != NULL)
2542	{
2543	*pos_match = i;
2544	*len_match = (str_best - str) - i;
2545	return TRUE;
2546	}
2547	}
2548	return FALSE;
2549	}
2550
2551	/*
2552	*@@ rxpMatch_bwd:
2553	* match backwards within a string not passing a
2554	* specified end position.
2555	*
2556	* If a match, return TRUE, and also return pos and
2557	* len of the match. We need to consider matches from
2558	* the beginning of the line. We want the one which
2559	* ends up in the rightmost position. Of those which
2560	* end up equally far right, we want the one which
2561	* extends the furthest (or shortest if EREMF_SHORTEST)
2562	* left. See how we get this as a side effect of the
2563	* loop ordering and the '>= + delta' test.
2564	*
2565	* This may not look as efficient as scanning the string
2566	* backwards, but note that this would require a reversed
2567	* ERE too, and we can't reverse EREs as they may
2568	* contain backreferences.
2569	*
2570	* If EREMF_SHORTEST is passed with eremf, the code looks for
2571	* the shortest match, instead of the longest match.
2572	*
2573	* If EREMF_ANY is passed with eremf, the code doesn't try to
2574	* find the longest (or shortest) match, it will return with the
2575	* first match it finds (which could be of any length).
2576	* This can speed up matching.
2577	*/
2578
2579	BOOLEAN rxpMatch_bwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2580	int eremf, // in: EREMF_* flags
2581	const char *str, // in: string to test
2582	int pos, // in: start position
2583	int *pos_match, // out: position of match
2584	int *len_match, // out: length of match
2585	ERE_MATCHINFO * mi) // out: match info (for rxpSubsWith)
2586	{
2587	int i;
2588	int delta = (eremf & EREMF_SHORTEST) ? 0 : 1;
2589	const char *rightmost = NULL;
2590	ERE_MATCHINFO mi2;
2591
2592	for (i = 0; i <= pos - ere->shortest_match; i++)
2593	{
2594	const char *str_best;
2595
2596	if ((str_best = match_fsm(ere->fsm, eremf, str, i, pos, ere->s, &mi2)) != NULL)
2597	{
2598	if (rightmost == NULL \|\|
2599	str_best >= rightmost + delta)
2600	{
2601	*pos_match = i;
2602	*len_match = (str_best - str) - i;
2603	rightmost = str_best;
2604	if (mi != NULL)
2605	{
2606	mi->n_spans = mi2.n_spans;
2607	for (i = 0; i < mi->n_spans; i++)
2608	mi->spans[i] = mi2.spans[i];
2609	}
2610	}
2611	}
2612	}
2613	return rightmost != NULL;
2614	}
2615
2616	/*
2617	*@@ rxpFree:
2618	* frees all resources allocated by rxpCompile.
2619	*/
2620
2621	void rxpFree(ERE * ere)
2622	{
2623	if (ere)
2624	{
2625	delete_match(ere->match);
2626	delete_fsm(ere->fsm);
2627	free(ere);
2628	}
2629	}
2630
2631	/*
2632	*@@ rxpSubsWith:
2633	* perform a substitution based upon an earlier found match.
2634	* This allows for implementing a "find and replace" function.
2635	*/
2636
2637	BOOLEAN rxpSubsWith(const char *str, // in: original string searched (same as str given to rxpMatch_fwd)
2638	int pos, // in: span of the entire match (pos_match from rxpMatch_fwd)
2639	int len, // in: span of the entire match (len_match from rxpMatch_fwd)
2640	ERE_MATCHINFO *mi, // in: details of match sub-spans (as from rxpMatch_fwd)
2641	const char *with, // in: replacement string with \1 etc.
2642	char *out, // out: buffer for string substitutions
2643	int len_out, // in: sizeof *out
2644	int *rc) // out: error, if FALSE returned
2645	{
2646	int i = 0;
2647	int j;
2648
2649	memcpy(out, str, pos);
2650	i += pos;
2651	while (*with != '\0')
2652	{
2653	const char *rep;
2654	int len_rep;
2655
2656	if (*with != '\\')
2657	{
2658	rep = with++;
2659	len_rep = 1;
2660	}
2661	else
2662	{
2663	++with;
2664	if (with >= '1' && with <= '9')
2665	{
2666	int span = *with - '1';
2667
2668	++with;
2669	if (span >= mi->n_spans)
2670	{
2671	*rc = EREE_BAD_BACKREF;
2672	return FALSE;
2673	}
2674	rep = str + mi->spans[span].pos;
2675	len_rep = mi->spans[span].len;
2676	}
2677	else if (*with != '\0')
2678	{
2679	rep = with++;
2680	len_rep = 1;
2681	}
2682	else
2683	{
2684	*rc = EREE_BAD_BACKSLASH;
2685	return FALSE;
2686	}
2687	}
2688	if (i + len_rep > len_out)
2689	{
2690	*rc = EREE_SUBS_LEN;
2691	return FALSE;
2692	}
2693	memcpy(out + i, rep, len_rep);
2694	i += len_rep;
2695	}
2696	j = pos + len + strlen(str + pos + len);
2697	if (i + j > len_out)
2698	{
2699	*rc = EREE_SUBS_LEN;
2700	return FALSE;
2701	}
2702	memcpy(out + i, str + pos + len, j);
2703	i += j;
2704	out[i] = '\0';
2705	return TRUE;
2706	}
2707
2708	#ifdef __TESTCASE__
2709
2710	int main(int argc, char *argv[])
2711	{
2712	ERE *ere;
2713	ERE_MATCHINFO mi;
2714	int rc;
2715
2716	const char pcsz, pcszEre;
2717
2718	if (argc != 3)
2719	{
2720	printf("Usage: regexp <teststring> <ere>\n");
2721	exit(1);
2722	}
2723
2724	pcsz = argv[1];
2725	pcszEre = argv[2];
2726
2727	printf("matching \"%s\" against \"%s\"\n",
2728	pcsz,
2729	pcszEre);
2730	fflush(stdout);
2731
2732	if (!(ere = rxpCompile(pcszEre,
2733	0,
2734	&rc)))
2735	{
2736	printf("Error %d in rxpCompile: %s\n", rc, rxpError(rc));
2737	exit(rc);
2738	}
2739
2740	{
2741	int pos, length;
2742	rc = rxpMatch_fwd(ere,
2743	0,
2744	pcsz,
2745	0,
2746	&pos,
2747	&length,
2748	&mi);
2749
2750	if (rc == 0)
2751	printf("no match\n");
2752	else
2753	printf("found at pos %d, length %d\n", pos, length);
2754	}
2755
2756	return 0;
2757	}
2758
2759	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats: