Context Navigation

source: branches/branch-1-0/src/helpers/regexp.c@ 368

Visit:

Last change on this file since 368 was 222, checked in by umoeller, 23 years ago
Minor adjustments for new static handling.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 78.0 KB

Line
1
2	/*
3	*@@sourcefile regexp.c:
4	* extended regular expressions, taken from "Andy's Extended
5	* Regular Expressions", as written by Andy Key
6	* (nyangau@interalpha.co.uk) and released into the public
7	* domain, plus adjustments for the XWP helpers.
8	*
9	* Usage: All C programs; not OS/2-specific.
10	*
11	* Function prefixes:
12	* -- rxp* regular expression functions.
13	*
14	* Regular expression matching is done in the following stages:
15	*
16	* 1) Call rxpCompile to parse the expression into a recursive
17	* tree of matches.
18	*
19	* This tree is converted into a finite state machine (FSM).
20	* A second FSM is built from the first but with epsilon
21	* moves removed to elimate lockups and increase speed.
22	*
23	* The input string can be used to drive the FSM through all
24	* the possible routes. The largest (or smallest) amount of
25	* input string required to reach the finish state is recorded
26	* since an extended regular expression is deemed to match as
27	* much input string as possible.
28	*
29	* 2) Call one of rxpMatch, rxpMatch_fwd, or rxpMatch_bwd to
30	* perform a match.
31	*
32	* 3) Call rxpFree to free the compiled ERE.
33	*
34	* Beware: The matching routine is highly recursive and can
35	* require around 20 to 30 bytes per character in the source string
36	* to match it. Thus you should try to limit the length of the source
37	* string and/or allow a stack size of at least 20 (to 30) * max string
38	* length. In addition, add 2KB to allow for the use of several nested
39	* sub-expressions in the matching.
40	*
41	*@@header "helpers\regexp.h"
42	*@@added V0.9.19 (2002-04-17) [umoeller]
43	*/
44
45	/*
46	* Copyright (C) 2002 Ulrich Mller.
47	* This file is part of the "XWorkplace helpers" source package.
48	* This is free software; you can redistribute it and/or modify
49	* it under the terms of the GNU General Public License as published
50	* by the Free Software Foundation, in version 2 as it comes in the
51	* "COPYING" file of the XWorkplace main distribution.
52	* This program is distributed in the hope that it will be useful,
53	* but WITHOUT ANY WARRANTY; without even the implied warranty of
54	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
55	* GNU General Public License for more details.
56	*/
57
58	#include <stdio.h>
59	#include <ctype.h>
60	#include <stdlib.h>
61	#include <string.h>
62	#include <malloc.h>
63	#include <memory.h>
64
65	#include "setup.h" // code generation and debugging options
66
67	#define ERE_C
68	#include "helpers\regexp.h"
69
70	#define isword(c) (isalnum(c)\|\|(c)=='_')
71
72	STATIC int val_of_hex(char c)
73	{
74	if (c >= '0' && c <= '9')
75	return c - '0';
76	if (c >= 'a' && c <= 'f')
77	return c - 'a' + 10;
78	if (c >= 'A' && c <= 'F')
79	return c - 'A' + 10;
80	return 0; // Shouldn't get here
81	}
82
83	STATIC int escaped(const char *s)
84	{
85	if (s[0] == 'x' && isxdigit(s[1]))
86	// \x followed by 1 or 2 hex digits
87	{
88	int n = val_of_hex(s[1]);
89
90	if (isxdigit(s[2]))
91	n = n * 16 + val_of_hex(s[2]);
92	return n;
93	}
94	else
95	switch (s[0])
96	{
97	case 'n':
98	return '\n';
99	case 't':
100	return '\t';
101	case 'r':
102	return '\r';
103	case 'b':
104	return '\b';
105	case 'f':
106	return '\f';
107	case 'e':
108	return 0x1b;
109	default:
110	return (unsigned char)s[0];
111	}
112	}
113
114	STATIC const char past_escaped(const char s)
115	{
116	if (s[0] == 'x' && isxdigit(s[1]))
117	return isxdigit(s[2]) ? s + 3 : s + 2;
118	else
119	return s + 1;
120	}
121
122	#define zero_cclass(cclass) memset(cclass, 0, 0x100 >> 3)
123
124	STATIC unsigned char bits[] =
125	{0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
126
127	#define add_to_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] \|= bits[(unsigned char)(n) & 7])
128
129	#define remove_from_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] &= ~bits[(unsigned char)(n) & 7])
130
131	STATIC void invert_cclass(unsigned char *cclass)
132	{
133	int i;
134
135	for (i = 0; i < (0x100 >> 3); i++)
136	cclass[i] ^= 0xff;
137	}
138
139	#define match_cclass(n, cclass) ((cclass[(unsigned char)(n)>>3] & bits[(unsigned char)(n) & 7]) != 0)
140
141	#define CHL_EOS 0
142	#define CHL_END_CCLASS (-1)
143	#define CHL_COMP (-2)
144	#define CHL_RANGE (-3)
145	#define CHL_POSIX_COLLATING (-4)
146	#define CHL_POSIX_EQUIVALENCE (-5)
147	#define CHL_POSIX_CCLASS_BAD (-6)
148	#define CHL_POSIX_CCLASS_BASE (-30)
149	#define CHL_POSIX_CCLASS_END (-30+20)
150
151	/* I use my own wrapper functions so I can take their addresses.
152	* Remember, isalnum etc. can be implemented as macros... */
153	STATIC BOOLEAN my_isalnum(int ch)
154	{
155	return isalnum(ch);
156	}
157	STATIC BOOLEAN my_isalpha(int ch)
158	{
159	return isalpha(ch);
160	}
161	STATIC BOOLEAN my_isblank(int ch)
162	{
163	return ch == ' ' \|\| ch == '\t';
164	}
165	STATIC BOOLEAN my_iscntrl(int ch)
166	{
167	return iscntrl(ch);
168	}
169	STATIC BOOLEAN my_isdigit(int ch)
170	{
171	return isdigit(ch);
172	}
173	STATIC BOOLEAN my_islower(int ch)
174	{
175	return islower(ch);
176	}
177	STATIC BOOLEAN my_isprint(int ch)
178	{
179	return isprint(ch);
180	}
181	STATIC BOOLEAN my_ispunct(int ch)
182	{
183	return ispunct(ch);
184	}
185	STATIC BOOLEAN my_isspace(int ch)
186	{
187	return isspace(ch);
188	}
189	STATIC BOOLEAN my_isupper(int ch)
190	{
191	return isupper(ch);
192	}
193	STATIC BOOLEAN my_isxdigit(int ch)
194	{
195	return isxdigit(ch);
196	}
197
198	typedef struct
199	{
200	int len_name;
201	const char *name;
202	BOOLEAN(*iscclass) (int ch);
203	}
204	POSIX_CCLASS;
205
206	STATIC POSIX_CCLASS posix_cclass[] =
207	{
208	5, "alnum", my_isalnum,
209	5, "alpha", my_isalpha,
210	5, "blank", my_isblank,
211	5, "cntrl", my_iscntrl,
212	5, "digit", my_isdigit,
213	5, "lower", my_islower,
214	5, "print", my_isprint,
215	5, "punct", my_ispunct,
216	5, "space", my_isspace,
217	5, "upper", my_isupper,
218	6, "xdigit", my_isxdigit,
219	};
220
221	STATIC int find_posix(const char *str)
222	{
223	int p;
224
225	if (str[0] != '[')
226	return str[0];
227	if (str[1] == '.')
228	return CHL_POSIX_COLLATING;
229	if (str[1] == '=')
230	return CHL_POSIX_EQUIVALENCE;
231	if (str[1] != ':')
232	return str[0];
233	for (p = 0; p < sizeof(posix_cclass) / sizeof(posix_cclass[0]); p++)
234	if (!memcmp(str + 2, posix_cclass[p].name, posix_cclass[p].len_name) &&
235	str[2 + posix_cclass[p].len_name] == ':' &&
236	str[2 + posix_cclass[p].len_name + 1] == ']')
237	return CHL_POSIX_CCLASS_BASE + p;
238	return CHL_POSIX_CCLASS_BAD;
239	}
240
241	STATIC int cclass_thisch(const char *str)
242	{
243	switch (str[0])
244	{
245	case '\\':
246	return escaped(str + 1);
247	case ']':
248	return CHL_END_CCLASS;
249	case '^':
250	return CHL_COMP;
251	case '-':
252	return CHL_RANGE;
253	case '\0':
254	return CHL_EOS;
255	default:
256	return find_posix(str);
257	}
258	}
259
260	STATIC const char cclass_nextch(const char str)
261	{
262	int p;
263
264	if (str[0] == '\\')
265	return past_escaped(str + 1);
266	p = find_posix(str);
267	if (CHL_POSIX_CCLASS_BASE <= p && p < CHL_POSIX_CCLASS_END)
268	return str + 2 + posix_cclass[p - CHL_POSIX_CCLASS_BASE].len_name + 2;
269	else
270	return str + 1;
271	}
272
273	STATIC unsigned char compile_cclass(const char str,
274	const char **str_after,
275	int erecf,
276	int *rc)
277	{
278	unsigned char *cclass;
279	BOOLEAN complement;
280	int i, c, last_c = -1;
281
282	if ((cclass = (unsigned char *)malloc(0x100 >> 3)) == NULL)
283	{
284	*rc = ERROR_NOT_ENOUGH_MEMORY;
285	return NULL;
286	}
287	zero_cclass(cclass);
288
289	complement = (cclass_thisch(str) == CHL_COMP);
290	if (complement)
291	str = cclass_nextch(str);
292
293	while ((c = cclass_thisch(str)) != CHL_EOS && c != CHL_END_CCLASS)
294	{
295	if (CHL_POSIX_CCLASS_BASE <= c && c < CHL_POSIX_CCLASS_END)
296	{
297	for (i = 0; i < 0x100; i++)
298	if (posix_cclass[c - CHL_POSIX_CCLASS_BASE].iscclass(i))
299	add_to_cclass(i, cclass);
300	last_c = -1;
301	}
302	else
303	switch (c)
304	{
305	/...sCHL_POSIX_\42\ \45\ error cases:32: /
306	case CHL_POSIX_COLLATING:
307	free(cclass);
308	*rc = EREE_POSIX_COLLATING;
309	return NULL;
310	case CHL_POSIX_EQUIVALENCE:
311	free(cclass);
312	*rc = EREE_POSIX_EQUIVALENCE;
313	return NULL;
314	case CHL_POSIX_CCLASS_BAD:
315	free(cclass);
316	*rc = EREE_POSIX_CCLASS_BAD;
317	return NULL;
318	/...sCHL_RANGE \45\ range:32: /
319	case CHL_RANGE:
320	if (last_c == -1)
321	// Unexpected at this point
322	{
323	free(cclass);
324	*rc = EREE_UNEX_RANGE;
325	return NULL;
326	}
327	str = cclass_nextch(str);
328	if ((c = cclass_thisch(str)) == CHL_EOS \|\| c == CHL_END_CCLASS)
329	// Not followed by anything
330	{
331	free(cclass);
332	*rc = EREE_UNF_RANGE;
333	return NULL;
334	}
335	for (i = last_c + 1; i <= c; i++)
336	add_to_cclass(i, cclass);
337	last_c = c;
338	break;
339
340	#pragma info(nogen) // do not warn here
341
342	/...sCHL_COMP \45\ complement:32: /
343	case CHL_COMP:
344	c = '^';
345	// Fall through
346	/...sdefault \45\ individual entry:32: /
347	default:
348	if (erecf & ERECF_TOLOWER)
349	c = tolower(c);
350	add_to_cclass(c, cclass);
351	last_c = c;
352	break;
353
354	#pragma info(restore)
355
356	}
357	str = cclass_nextch(str);
358	}
359
360	if (c == CHL_EOS)
361	{
362	free(cclass);
363	*rc = EREE_UNF_CCLASS;
364	return NULL;
365	}
366
367	if (complement)
368	invert_cclass(cclass);
369
370	remove_from_cclass(0, cclass);
371
372	*str_after = cclass_nextch(str);
373	return cclass;
374	}
375
376	#define delete_cclass(cclass) free(cclass)
377
378	/* A 'match' embodies all of regular expression functionality.
379	* Matches can be defined in terms of each other. */
380
381	typedef unsigned char MTYPE;
382
383	#define MTYPE_NULL ((MTYPE) 0)
384	#define MTYPE_CHAR ((MTYPE) 1)
385	#define MTYPE_NCHAR ((MTYPE) 2)
386	#define MTYPE_STRING ((MTYPE) 3)
387	#define MTYPE_CCLASS ((MTYPE) 4)
388	#define MTYPE_WORD ((MTYPE) 5)
389	#define MTYPE_NWORD ((MTYPE) 6)
390	#define MTYPE_DOT ((MTYPE) 7)
391	#define MTYPE_QUERY ((MTYPE) 8)
392	#define MTYPE_PLUS ((MTYPE) 9)
393	#define MTYPE_STAR ((MTYPE) 10)
394	#define MTYPE_CREP ((MTYPE) 11)
395	#define MTYPE_OR ((MTYPE) 12)
396	#define MTYPE_CAT ((MTYPE) 13)
397	#define MTYPE_SUB ((MTYPE) 14)
398	#define MTYPE_SOL ((MTYPE) 15)
399	#define MTYPE_EOL ((MTYPE) 16)
400	#define MTYPE_SOW ((MTYPE) 17)
401	#define MTYPE_EOW ((MTYPE) 18)
402	#define MTYPE_IW ((MTYPE) 19)
403	#define MTYPE_EW ((MTYPE) 20)
404	#define MTYPE_BACK ((MTYPE) 21)
405
406	typedef struct match_struct MATCH;
407
408	struct match_struct
409	{
410	MTYPE mtype;
411	union
412	{
413	char character;
414	unsigned char *cclass;
415	MATCH *match;
416	MATCH *matchs[2];
417	int n_span;
418	char *string;
419	struct
420	{
421	unsigned m, n;
422	MATCH *match;
423	}
424	crep;
425	}
426	u;
427	};
428
429	STATIC MATCH null_match =
430	{MTYPE_NULL};
431
432	#define NULL_MATCH (&null_match)
433
434	STATIC void delete_match(MATCH * match)
435	{
436	if (match == NULL_MATCH)
437	return;
438	switch (match->mtype)
439	{
440	case MTYPE_STRING:
441	free(match->u.string);
442	break;
443	case MTYPE_CCLASS:
444	delete_cclass(match->u.cclass);
445	break;
446	case MTYPE_QUERY:
447	case MTYPE_PLUS:
448	case MTYPE_STAR:
449	delete_match(match->u.match);
450	break;
451	case MTYPE_CREP:
452	delete_match(match->u.crep.match);
453	break;
454	case MTYPE_OR:
455	case MTYPE_CAT:
456	delete_match(match->u.matchs[0]);
457	delete_match(match->u.matchs[1]);
458	break;
459	case MTYPE_SUB:
460	delete_match(match->u.match);
461	break;
462	}
463	free(match);
464	}
465
466	/*
467	*@@ shortest_match:
468	* determines the shortest possible match length for a given match
469	* tree. In the following example it is 3, thus allowing us to only
470	* consider 4 positions for regular expression matching in a 6
471	* character line.
472	*
473	+ aa*bc
474	+
475	+ 123456
476	+ ---
477	+ ---
478	+ ---
479	+ ---
480	*/
481
482	STATIC unsigned shortest_match(const MATCH * match)
483	{
484	unsigned a, b;
485
486	switch (match->mtype)
487	{
488	case MTYPE_NULL:
489	return 0;
490	case MTYPE_CHAR:
491	return 1;
492	case MTYPE_NCHAR:
493	return 1;
494	case MTYPE_STRING:
495	return (unsigned char)match->u.string[0];
496	case MTYPE_CCLASS:
497	return 1;
498	case MTYPE_WORD:
499	return 1;
500	case MTYPE_NWORD:
501	return 1;
502	case MTYPE_DOT:
503	return 1;
504	case MTYPE_QUERY:
505	return 0;
506	case MTYPE_PLUS:
507	return shortest_match(match->u.match);
508	case MTYPE_STAR:
509	return 0;
510	case MTYPE_CREP:
511	return match->u.crep.m *
512	shortest_match(match->u.crep.match);
513	case MTYPE_OR:
514	a = shortest_match(match->u.matchs[0]);
515	b = shortest_match(match->u.matchs[1]);
516	return a <= b ? a : b;
517	case MTYPE_CAT:
518	return shortest_match(match->u.matchs[0]) +
519	shortest_match(match->u.matchs[1]);
520	case MTYPE_SUB:
521	return shortest_match(match->u.match);
522	case MTYPE_SOL:
523	return 0;
524	case MTYPE_EOL:
525	return 0;
526	case MTYPE_SOW:
527	return 0;
528	case MTYPE_EOW:
529	return 0;
530	case MTYPE_IW:
531	return 0;
532	case MTYPE_EW:
533	return 0;
534	case MTYPE_BACK:
535	return 0;
536	}
537	return 0; // Should never happen
538	}
539
540	STATIC BOOLEAN got_backrefs(MATCH * match)
541	{
542	switch (match->mtype)
543	{
544	case MTYPE_QUERY:
545	case MTYPE_PLUS:
546	case MTYPE_STAR:
547	return got_backrefs(match->u.match);
548	case MTYPE_CREP:
549	return got_backrefs(match->u.crep.match);
550	case MTYPE_OR:
551	case MTYPE_CAT:
552	return got_backrefs(match->u.matchs[0]) \|\|
553	got_backrefs(match->u.matchs[1]);
554	case MTYPE_BACK:
555	return TRUE;
556	}
557	return FALSE;
558	}
559
560	STATIC MATCH remove_subs(MATCH match)
561	{
562	switch (match->mtype)
563	{
564	case MTYPE_QUERY:
565	case MTYPE_PLUS:
566	case MTYPE_STAR:
567	match->u.match = remove_subs(match->u.match);
568	break;
569	case MTYPE_CREP:
570	match->u.crep.match = remove_subs(match->u.crep.match);
571	break;
572	case MTYPE_OR:
573	case MTYPE_CAT:
574	match->u.matchs[0] = remove_subs(match->u.matchs[0]);
575	match->u.matchs[1] = remove_subs(match->u.matchs[1]);
576	break;
577	case MTYPE_SUB:
578	{
579	MATCH *m = match;
580
581	match = match->u.match;
582	free(m);
583	}
584	break;
585	}
586	return match;
587	}
588
589	STATIC int count_sub(const MATCH * match)
590	{
591	switch (match->mtype)
592	{
593	case MTYPE_SUB:
594	return 1 + count_sub(match->u.match);
595	case MTYPE_QUERY:
596	case MTYPE_PLUS:
597	case MTYPE_STAR:
598	return count_sub(match->u.match);
599	case MTYPE_CREP:
600	return count_sub(match->u.crep.match);
601	case MTYPE_OR:
602	case MTYPE_CAT:
603	return count_sub(match->u.matchs[0]) +
604	count_sub(match->u.matchs[1]);
605	}
606	return 0;
607	}
608
609	/...scompiling matches:0: /
610	/*
611	*
612	* <term> ::= <character> \| <class> \| ( <match> )
613	* <factor> ::= <term>? \| <term>+ \| <term>* \| <term>
614	* <factors> ::= <factor> { <factor> }
615	* <match> ::= <factors> { \|<factors> }
616	*
617	*/
618
619	/* We have an optimisation here in that if we can see a span of 'boring'
620	* characters, we return them as a single entity. We mustn't do this for the
621	* last in the span, as this may have modifiers applied to it, as in the
622	* example string "abcd", where the d has the modifier, but abc do not.
623	* We only do 255 chars at once, as the length is stored in a byte.
624	* Reducing 255 levels of per-character recursive searching to 1 level with
625	* a string compare has got to be a massive saving. */
626
627	#define STRING_MAX 255
628
629	#define CH_NOT_BASE 1000
630	#define CH_NOT_END (1000+0x100)
631	#define CH_EOS 0
632	#define CH_DOT (-1)
633	#define CH_WORD (-2)
634	#define CH_NWORD (-3)
635	#define CH_LSQR (-4)
636	#define CH_RSQR (-5)
637	#define CH_LPAR (-6)
638	#define CH_RPAR (-7)
639	#define CH_LCUR (-8)
640	#define CH_RCUR (-9)
641	#define CH_QUERY (-10)
642	#define CH_PLUS (-11)
643	#define CH_STAR (-12)
644	#define CH_OR (-13)
645	#define CH_SOL (-14)
646	#define CH_EOL (-15)
647	#define CH_SOW (-16)
648	#define CH_EOW (-17)
649	#define CH_IW (-18)
650	#define CH_EW (-19)
651	#define CH_BAD_TILDE (-20)
652	#define CH_STRING_BASE (-1000)
653	#define CH_STRING_END (-1000+STRING_MAX)
654	#define CH_BACK_BASE (-2000)
655	#define CH_BACK_END (-2000+9)
656
657	STATIC int boring_string(const char *s)
658	{
659	int n = 0;
660
661	while (*s &&
662	strchr("\\.[](){}?+\|^$<>~", s) == NULL
663	&& n < STRING_MAX)
664	{
665	s++;
666	n++;
667	}
668	return n;
669	}
670
671	STATIC int thisch(const char *str)
672	{
673	int n;
674
675	switch (str[0])
676	{
677	case '\\':
678	if (str[1] >= '1' && str[1] <= '9')
679	return CH_BACK_BASE + (str[1] - '1');
680	switch (str[1])
681	{
682	case '`':
683	return CH_SOL;
684	case '\'':
685	return CH_EOL;
686	case '<':
687	return CH_SOW;
688	case '>':
689	return CH_EOW;
690	case 'w':
691	return CH_WORD;
692	case 'W':
693	return CH_NWORD;
694	case 'B':
695	return CH_IW;
696	case 'y':
697	return CH_EW;
698	default:
699	return escaped(str + 1);
700	}
701
702	case '~':
703	if (str[1] == '\\')
704	return CH_NOT_BASE + escaped(str + 2);
705	else if (str[1] != '\0')
706	return CH_NOT_BASE + (unsigned char)str[1];
707	else
708	return '~';
709	case '.':
710	return CH_DOT;
711	case '[':
712	return CH_LSQR;
713	case ']':
714	return CH_RSQR;
715	case '(':
716	return CH_LPAR;
717	case ')':
718	return CH_RPAR;
719	case '{':
720	return CH_LCUR;
721	case '}':
722	return CH_RCUR;
723	case '?':
724	return CH_QUERY;
725	case '+':
726	return CH_PLUS;
727	case '*':
728	return CH_STAR;
729	case '\|':
730	return CH_OR;
731	case '^':
732	return CH_SOL;
733	case '$':
734	return CH_EOL;
735	}
736
737	n = boring_string(str);
738	if (n < 3)
739	return (unsigned char)str[0];
740	else
741	return CH_STRING_BASE + n - 1;
742	}
743
744	STATIC const char nextch(const char str)
745	{
746	int n;
747
748	switch (str[0])
749	{
750	case '\\':
751	if (str[1] >= '1' && str[1] <= '9')
752	return str + 2;
753	switch (str[1])
754	{
755	case '`':
756	case '\'':
757	case '<':
758	case '>':
759	case 'w':
760	case 'W':
761	case 'B':
762	case 'y':
763	return str + 2;
764	default:
765	return past_escaped(str + 1);
766	}
767
768	case '~':
769	if (str[1] == '\\')
770	return past_escaped(str + 2);
771	else if (str[1] != '\0')
772	return str + 2;
773	else
774	return str + 1;
775	case '.':
776	case '[':
777	case '(':
778	case ')':
779	case '{':
780	case '}':
781	case '?':
782	case '+':
783	case '*':
784	case '\|':
785	case '^':
786	case '$':
787	return str + 1;
788	}
789
790	n = boring_string(str);
791	if (n < 3)
792	return str + 1;
793	else
794	return str + n - 1;
795	}
796
797	STATIC MATCH compile_match(const char str, const char *str_after, int erecf, int rc);
798
799	STATIC const char scan_number(const char str, unsigned *num)
800	{
801	if (!isdigit(*str))
802	return NULL;
803	*num = 0;
804	do
805	(num) = (num) * 10 + (*str++ - '0');
806	while (isdigit(*str));
807	return str;
808	}
809
810	STATIC MATCH create_match(int rc)
811	{
812	MATCH *match;
813
814	if ((match = (MATCH *) malloc(sizeof(MATCH))) == NULL)
815	{
816	*rc = ERROR_NOT_ENOUGH_MEMORY;
817	return NULL;
818	}
819	return match;
820	}
821
822	STATIC MATCH compile_term(const char str, const char *str_after, int erecf, int rc)
823	{
824	MATCH *match;
825	int c;
826
827	c = thisch(str);
828	switch (c)
829	{
830	case CH_RSQR:
831	*rc = EREE_UNEX_RSQR;
832	return NULL;
833	case CH_QUERY:
834	*rc = EREE_UNEX_QUERY;
835	return NULL;
836	case CH_PLUS:
837	*rc = EREE_UNEX_PLUS;
838	return NULL;
839	case CH_STAR:
840	*rc = EREE_UNEX_STAR;
841	return NULL;
842	case CH_LCUR:
843	*rc = EREE_UNEX_LCUR;
844	return NULL;
845	case CH_RCUR:
846	*rc = EREE_UNEX_RCUR;
847	return NULL;
848	}
849
850	if (c == CH_EOS \|\| c == CH_OR \|\| c == CH_RPAR)
851	{
852	*str_after = str;
853	return NULL_MATCH;
854	}
855
856	if ((match = create_match(rc)) == NULL)
857	return NULL;
858
859	if (CH_NOT_BASE <= c && c < CH_NOT_END)
860	/...snot a specific character:16: /
861	{
862	char ch = (char)(c - CH_NOT_BASE);
863
864	if (erecf & ERECF_TOLOWER)
865	ch = (char)tolower(ch);
866	match->mtype = MTYPE_NCHAR;
867	match->u.character = ch;
868	str = nextch(str);
869	}
870	else if (CH_STRING_BASE <= c && c < CH_STRING_END)
871	/...sa string of non\45\special characters:16: /
872	{
873	unsigned len = c - CH_STRING_BASE;
874
875	match->mtype = MTYPE_STRING;
876	match->u.string = (char *)malloc(1 + len);
877	if (match->u.string == NULL)
878	{
879	free(match);
880	*rc = ERROR_NOT_ENOUGH_MEMORY;
881	return NULL;
882	}
883	match->u.string[0] = (char)len;
884	memcpy(match->u.string + 1, str, len);
885	if (erecf & ERECF_TOLOWER)
886	{
887	unsigned i;
888
889	for (i = 1; i <= len; i++)
890	match->u.string[i] = (char)tolower(match->u.string[i]);
891	}
892	str = nextch(str);
893	}
894	else if (CH_BACK_BASE <= c && c < CH_BACK_END)
895	/...sa backreference:16: /
896	{
897	match->mtype = MTYPE_BACK;
898	match->u.n_span = c - CH_BACK_BASE;
899	str = nextch(str);
900	}
901	else
902	switch (c)
903	{
904	/...sCH_LSQR \45\ character class:24: /
905	case CH_LSQR:
906	match->mtype = MTYPE_CCLASS;
907	str = nextch(str);
908	if ((match->u.cclass = compile_cclass(str, &str, erecf, rc)) == NULL)
909	{
910	free(match);
911	return NULL;
912	}
913	break;
914	/...sCH_DOT \45\ any character:24: /
915	case CH_DOT:
916	match->mtype = MTYPE_DOT;
917	str = nextch(str);
918	break;
919	/...sCH_WORD \45\ \92\w:24: /
920	case CH_WORD:
921	match->mtype = MTYPE_WORD;
922	str = nextch(str);
923	break;
924	/...sCH_NWORD \45\ \92\W:24: /
925	case CH_NWORD:
926	match->mtype = MTYPE_NWORD;
927	str = nextch(str);
928	break;
929	/...sCH_LPAR \45\ nested regular expression:24: /
930	case CH_LPAR:
931	{
932	MATCH *sub_match;
933
934	str = nextch(str);
935	if ((sub_match = compile_match(str, &str, erecf, rc)) == NULL)
936	{
937	free(match);
938	return NULL;
939	}
940	if (!got_backrefs(sub_match))
941	sub_match = remove_subs(sub_match);
942	if (thisch(str) != CH_RPAR)
943	{
944	*rc = EREE_UNF_SUB;
945	return NULL;
946	}
947	str = nextch(str);
948	match->mtype = MTYPE_SUB;
949	match->u.match = sub_match;
950	}
951	break;
952	/...sCH_SOL \45\ \94\:24: /
953	case CH_SOL:
954	match->mtype = MTYPE_SOL;
955	str = nextch(str);
956	break;
957	/...sCH_EOL \45\ \36\:24: /
958	case CH_EOL:
959	match->mtype = MTYPE_EOL;
960	str = nextch(str);
961	break;
962	/...sCH_SOW \45\ \92\\60\:24: /
963	case CH_SOW:
964	match->mtype = MTYPE_SOW;
965	str = nextch(str);
966	break;
967	/...sCH_EOW \45\ \92\\62\:24: /
968	case CH_EOW:
969	match->mtype = MTYPE_EOW;
970	str = nextch(str);
971	break;
972	/...sCH_IW \45\ \92\B:24: /
973	case CH_IW:
974	match->mtype = MTYPE_IW;
975	str = nextch(str);
976	break;
977	/...sCH_EW \45\ \92\y:24: /
978	case CH_EW:
979	match->mtype = MTYPE_EW;
980	str = nextch(str);
981	break;
982	/...sdefault \45\ any old character:24: /
983	default:
984	{
985	char ch = (char)c;
986
987	if (erecf & ERECF_TOLOWER)
988	ch = (char)tolower(ch);
989	match->mtype = MTYPE_CHAR;
990	match->u.character = ch;
991	str = nextch(str);
992	}
993	break;
994	}
995
996	*str_after = str;
997	return match;
998	}
999
1000	STATIC MTYPE repeat_type_of(int c)
1001	{
1002	switch (c)
1003	{
1004	case CH_QUERY:
1005	return MTYPE_QUERY;
1006	case CH_PLUS:
1007	return MTYPE_PLUS;
1008	case CH_STAR:
1009	return MTYPE_STAR;
1010	case CH_LCUR:
1011	return MTYPE_CREP;
1012	default:
1013	return (MTYPE) - 1;
1014	}
1015	}
1016
1017	STATIC MATCH compile_factor(const char str, const char *str_after, int erecf, int rc)
1018	{
1019	MATCH match, parent;
1020	MTYPE repeat_mtype;
1021
1022	if ((match = compile_term(str, &str, erecf, rc)) == NULL)
1023	return NULL;
1024
1025	while ((repeat_mtype = repeat_type_of(thisch(str))) != (MTYPE) - 1)
1026	/...smatch is to be repeated:16: /
1027	{
1028	if ((parent = create_match(rc)) == NULL)
1029	{
1030	delete_match(match);
1031	return NULL;
1032	}
1033
1034	parent->mtype = repeat_mtype;
1035	str = nextch(str);
1036	if (repeat_mtype == MTYPE_CREP)
1037	{
1038	parent->u.crep.match = match;
1039	if ((str = scan_number(str, &(parent->u.crep.m))) == NULL)
1040	{
1041	delete_match(match);
1042	free(parent);
1043	*rc = EREE_BAD_CREP_M;
1044	return NULL;
1045	}
1046	parent->u.crep.n = parent->u.crep.m;
1047	if (*str == ',')
1048	{
1049	++str;
1050	if (*str != '}')
1051	{
1052	if ((str = scan_number(str, &(parent->u.crep.n))) == NULL)
1053	{
1054	delete_match(match);
1055	free(parent);
1056	*rc = EREE_BAD_CREP_N;
1057	return NULL;
1058	}
1059	}
1060	else
1061	parent->u.crep.n = (unsigned)~0;
1062	}
1063	if (*str != '}')
1064	{
1065	delete_match(match);
1066	free(parent);
1067	*rc = EREE_UNF_CREP;
1068	return NULL;
1069	}
1070	++str;
1071	if (parent->u.crep.m > parent->u.crep.n)
1072	{
1073	delete_match(match);
1074	free(parent);
1075	*rc = EREE_BAD_CREP;
1076	return NULL;
1077	}
1078	}
1079	else
1080	parent->u.match = match;
1081	match = parent;
1082	}
1083
1084	*str_after = str;
1085
1086	return match;
1087	}
1088
1089	STATIC MATCH compile_factors(const char str, const char *str_after, int erecf, int rc)
1090	{
1091	MATCH *match;
1092	int c;
1093
1094	if ((match = compile_factor(str, &str, erecf, rc)) == NULL)
1095	return NULL;
1096
1097	while ((c = thisch(str)) != CH_EOS && c != CH_RPAR && c != CH_OR)
1098	/...sconsider catenation of more factors:16: /
1099	{
1100	MATCH sibling, parent;
1101
1102	if ((sibling = compile_factor(str, &str, erecf, rc)) == NULL)
1103	{
1104	delete_match(match);
1105	return NULL;
1106	}
1107
1108	if ((parent = create_match(rc)) == NULL)
1109	{
1110	delete_match(sibling);
1111	delete_match(match);
1112	return NULL;
1113	}
1114	parent->mtype = MTYPE_CAT;
1115	parent->u.matchs[0] = match;
1116	parent->u.matchs[1] = sibling;
1117	match = parent;
1118	}
1119
1120	*str_after = str;
1121	return match;
1122	}
1123
1124	/...scompile_match \45\ factors\124\factors:0: /
1125	STATIC MATCH compile_match(const char str, const char *str_after, int erecf, int rc)
1126	{
1127	MATCH *match;
1128
1129	if ((match = compile_factors(str, &str, erecf, rc)) == NULL)
1130	return NULL;
1131
1132	while (thisch(str) == CH_OR)
1133	/...sfind sibling and or it in:16: /
1134	{
1135	MATCH sibling, parent;
1136
1137	str = nextch(str);
1138	if ((sibling = compile_factors(str, &str, erecf, rc)) == NULL)
1139	{
1140	delete_match(match);
1141	return NULL;
1142	}
1143	if ((parent = create_match(rc)) == NULL)
1144	{
1145	delete_match(sibling);
1146	delete_match(match);
1147	return NULL;
1148	}
1149	parent->mtype = MTYPE_OR;
1150	parent->u.matchs[0] = match;
1151	parent->u.matchs[1] = sibling;
1152	match = parent;
1153	}
1154
1155	*str_after = str;
1156
1157	return match;
1158	}
1159
1160	#ifdef DEBUG
1161	/...sprint_tree:0: /
1162	/...sdo_indent:0: /
1163	STATIC void do_indent(int indent)
1164	{
1165	while (indent--)
1166	putchar('\t');
1167	}
1168
1169	STATIC void print_tree(const MATCH * match, int indent)
1170	{
1171	do_indent(indent);
1172	switch (match->mtype)
1173	{
1174	case MTYPE_NULL:
1175	printf("null\n");
1176	break;
1177	case MTYPE_CHAR:
1178	printf("%c\n", match->u.character);
1179	break;
1180	case MTYPE_NCHAR:
1181	printf("~%c\n", match->u.character);
1182	break;
1183	case MTYPE_STRING:
1184	printf("%.s\n",
1185	(unsigned char)match->u.string[0],
1186	(unsigned char)match->u.string[0],
1187	match->u.string + 1);
1188	break;
1189	case MTYPE_CCLASS:
1190	{
1191	int i;
1192
1193	printf("[");
1194	for (i = 0; i < 0x100; i++)
1195	if (match_cclass(i, match->u.cclass))
1196	printf("%c", i);
1197	printf("]\n");
1198	}
1199	break;
1200	case MTYPE_DOT:
1201	printf(".\n");
1202	break;
1203	case MTYPE_WORD:
1204	printf("\\w\n");
1205	break;
1206	case MTYPE_NWORD:
1207	printf("\\W\n");
1208	break;
1209	case MTYPE_QUERY:
1210	printf("?\n");
1211	print_tree(match->u.match, indent + 1);
1212	break;
1213	case MTYPE_PLUS:
1214	printf("+\n");
1215	print_tree(match->u.match, indent + 1);
1216	break;
1217	case MTYPE_STAR:
1218	printf("*\n");
1219	print_tree(match->u.match, indent + 1);
1220	break;
1221	case MTYPE_CREP:
1222	printf("{%u,%u}\n",
1223	match->u.crep.m,
1224	match->u.crep.n);
1225	print_tree(match->u.crep.match, indent + 1);
1226	break;
1227	case MTYPE_OR:
1228	printf("\|\n");
1229	print_tree(match->u.matchs[0], indent + 1);
1230	print_tree(match->u.matchs[1], indent + 1);
1231	break;
1232	case MTYPE_CAT:
1233	printf("CAT\n");
1234	print_tree(match->u.matchs[0], indent + 1);
1235	print_tree(match->u.matchs[1], indent + 1);
1236	break;
1237	case MTYPE_SUB:
1238	printf("SUB\n");
1239	print_tree(match->u.match, indent + 1);
1240	break;
1241	case MTYPE_SOL:
1242	printf("^\n");
1243	break;
1244	case MTYPE_EOL:
1245	printf("$\n");
1246	break;
1247	case MTYPE_SOW:
1248	printf("\\<\n");
1249	break;
1250	case MTYPE_EOW:
1251	printf("\\>\n");
1252	break;
1253	case MTYPE_IW:
1254	printf("\\B\n");
1255	break;
1256	case MTYPE_EW:
1257	printf("\\y\n");
1258	break;
1259	case MTYPE_BACK:
1260	printf("\\%d\n", match->u.n_span + 1);
1261	break;
1262	}
1263	}
1264	#endif
1265
1266	/...sfinite state machine:0: /
1267	typedef unsigned char ETYPE; // Edge type
1268
1269	#define ETYPE_CHAR ((ETYPE) 0) // Can advance if given character
1270	#define ETYPE_NCHAR ((ETYPE) 1) // Can advance if not given char.
1271	#define ETYPE_STRING ((ETYPE) 2) // Can advance if strings match
1272	#define ETYPE_DOT ((ETYPE) 3) // Can advance if any character
1273	#define ETYPE_CCLASS ((ETYPE) 4) // Can advance if any in class
1274	#define ETYPE_WORD ((ETYPE) 5) // Can advance if word constituent
1275	#define ETYPE_NWORD ((ETYPE) 6) // Can advance if not word constit.
1276	#define ETYPE_EPSILON ((ETYPE) 7) // Can advance without reading input
1277	#define ETYPE_SOL ((ETYPE) 8) // Matches if at start of line
1278	#define ETYPE_EOL ((ETYPE) 9) // Matches if at end of line
1279	#define ETYPE_SOW ((ETYPE) 10) // Matches if at start of word
1280	#define ETYPE_EOW ((ETYPE) 11) // Matches if at end of word
1281	#define ETYPE_IW ((ETYPE) 12) // Matches if within word
1282	#define ETYPE_EW ((ETYPE) 13) // Matches if at word start or end
1283	#define ETYPE_SSUB ((ETYPE) 14) // Records passage over (
1284	#define ETYPE_ESUB ((ETYPE) 15) // Records passage over )
1285	#define ETYPE_BACK ((ETYPE) 16) // Back reference
1286
1287	typedef struct
1288	{
1289	ETYPE etype; // Edge type, (an ETYPE_ no)
1290	union
1291	{
1292	char character; // Character to use if ETYPE_CHAR
1293	unsigned char *cclass; // Class to use if ETYPE_CCLASS
1294	char *string; // len, then chars if ETYPE_STRING
1295	int n_span; // Used if ETYPE_BACK
1296	}
1297	u;
1298	const char *gate; // Used if type of epsilon move
1299	int to_state; // State to go to if test succeeds
1300	int next_edge; // Next test to try after this
1301	}
1302	EDGE;
1303
1304	/* Under DOS, restrict FSM size to 13KB approx.. On other environments memory
1305	* isn't such an issue, so allow something closer to 32KB. Of course, a better
1306	* implementation would dynamically grow the FSM size as needed. Something for
1307	* the future perhaps... */
1308
1309	#ifdef xxDOS
1310	#define MAX_STATES 200
1311	#define MAX_EDGES 600
1312	#else
1313	#define MAX_STATES 500
1314	#define MAX_EDGES 1500
1315	#endif
1316
1317	#define FLAG_FINISH 0x01
1318	#define FLAG_VISITED 0x02
1319	#define FLAG_REACHABLE 0x04
1320
1321	typedef struct
1322	{
1323	int n_states;
1324	int state_first_edges[MAX_STATES];
1325	unsigned char state_flags[MAX_STATES];
1326	int n_edges;
1327	EDGE edges[MAX_EDGES];
1328	}
1329	FSM;
1330
1331	/...screate_fsm:0: /
1332	STATIC FSM create_fsm(int rc)
1333	{
1334	FSM *fsm;
1335	int i;
1336
1337	if ((fsm = (FSM *) malloc(sizeof(FSM))) == NULL)
1338	{
1339	*rc = ERROR_NOT_ENOUGH_MEMORY;
1340	return NULL;
1341	}
1342
1343	fsm->n_states = 0;
1344	fsm->n_edges = 0;
1345
1346	for (i = 0; i < MAX_STATES; i++)
1347	{
1348	fsm->state_first_edges[i] = -1;
1349	fsm->state_flags[i] = 0;
1350	}
1351
1352	return fsm;
1353	}
1354	/...sdelete_fsm:0: /
1355	#define delete_fsm(fsm) free(fsm)
1356
1357	/...smalloc_state:0: /
1358	STATIC int malloc_state(FSM * fsm)
1359	{
1360	if (fsm->n_states == MAX_STATES)
1361	return -1;
1362	else
1363	return fsm->n_states++;
1364	}
1365	/...smalloc_edge:0: /
1366	/* Allocate space for new supplied edge and fill it in.
1367	* If already exists then don't bother (duplicates waste search time).
1368	* Return TRUE if all went ok. */
1369
1370	STATIC BOOLEAN malloc_edge(int s, EDGE * edge, FSM * fsm)
1371	{
1372	int edge_no, n_edges = fsm->n_edges++;
1373
1374	// See if edge already exists
1375
1376	for (edge_no = fsm->state_first_edges[s];
1377	edge_no != -1;
1378	edge_no = fsm->edges[edge_no].next_edge)
1379	// Do we already have this edge
1380	if (fsm->edges[edge_no].to_state == edge->to_state &&
1381	fsm->edges[edge_no].etype == edge->etype)
1382	// An edge of this type already exists
1383	switch (edge->etype)
1384	{
1385	/...sETYPE_CHAR\47\NCHAR:32: /
1386	case ETYPE_CHAR:
1387	case ETYPE_NCHAR:
1388	if (edge->u.character == fsm->edges[edge_no].u.character)
1389	return TRUE;
1390	break;
1391	/...sETYPE_STRING:32: /
1392	case ETYPE_STRING:
1393	if ((unsigned char)fsm->edges[edge_no].u.string[0] ==
1394	(unsigned char)edge->u.string[0] &&
1395	!memcmp(fsm->edges[edge_no].u.string + 1,
1396	edge->u.string + 1,
1397	(unsigned char)edge->u.string[0]))
1398	return TRUE;
1399	break;
1400	/...sETYPE_CCLASS:32: /
1401	case ETYPE_CCLASS:
1402	if (edge->u.cclass == fsm->edges[edge_no].u.cclass)
1403	return TRUE;
1404	break;
1405	/...sETYPE_DOT\47\EPSILON\47\SOL\47\EOL etc\46\\46\:32: /
1406	case ETYPE_DOT:
1407	case ETYPE_EPSILON:
1408	case ETYPE_SOL:
1409	case ETYPE_EOL:
1410	case ETYPE_SOW:
1411	case ETYPE_EOW:
1412	case ETYPE_IW:
1413	case ETYPE_EW:
1414	case ETYPE_SSUB:
1415	case ETYPE_ESUB:
1416	return TRUE;
1417	/...sETYPE_BACK:32: /
1418	case ETYPE_BACK:
1419	if (edge->u.n_span == fsm->edges[edge_no].u.n_span)
1420	return TRUE;
1421	break;
1422	}
1423
1424	// Going to have to add the edge
1425
1426	if (n_edges >= MAX_EDGES)
1427	return FALSE;
1428
1429	memcpy(&(fsm->edges[n_edges]), edge, sizeof(EDGE));
1430	fsm->edges[n_edges].next_edge = fsm->state_first_edges[s];
1431	fsm->state_first_edges[s] = n_edges;
1432	return TRUE;
1433	}
1434	/...smake_fsm_from_match:0: /
1435	/...sadd_edge_to_fsm_character:0: /
1436	STATIC BOOLEAN add_edge_to_fsm_character(int s, int f, FSM * fsm, char character)
1437	{
1438	EDGE edge;
1439
1440	edge.etype = ETYPE_CHAR;
1441	edge.u.character = character;
1442	edge.to_state = f;
1443	return malloc_edge(s, &edge, fsm);
1444	}
1445	/...sadd_edge_to_fsm_ncharacter:0: /
1446	STATIC BOOLEAN add_edge_to_fsm_ncharacter(int s, int f, FSM * fsm, char character)
1447	{
1448	EDGE edge;
1449
1450	edge.etype = ETYPE_NCHAR;
1451	edge.u.character = character;
1452	edge.to_state = f;
1453	return malloc_edge(s, &edge, fsm);
1454	}
1455	/...sadd_edge_to_fsm_string:0: /
1456	STATIC BOOLEAN add_edge_to_fsm_string(int s, int f, FSM * fsm, char *string)
1457	{
1458	EDGE edge;
1459
1460	edge.etype = ETYPE_STRING;
1461	edge.u.string = string;
1462	edge.to_state = f;
1463	return malloc_edge(s, &edge, fsm);
1464	}
1465	/...sadd_edge_to_fsm_cclass:0: /
1466	STATIC BOOLEAN add_edge_to_fsm_cclass(int s, int f, FSM * fsm, unsigned char *cclass)
1467	{
1468	EDGE edge;
1469
1470	edge.etype = ETYPE_CCLASS;
1471	edge.u.cclass = cclass;
1472	edge.to_state = f;
1473	return malloc_edge(s, &edge, fsm);
1474	}
1475	/...sadd_edge_to_fsm_dot:0: /
1476	STATIC BOOLEAN add_edge_to_fsm_dot(int s, int f, FSM * fsm)
1477	{
1478	EDGE edge;
1479
1480	edge.etype = ETYPE_DOT;
1481	edge.to_state = f;
1482	return malloc_edge(s, &edge, fsm);
1483	}
1484	/...sadd_edge_to_fsm_word:0: /
1485	STATIC BOOLEAN add_edge_to_fsm_word(int s, int f, FSM * fsm)
1486	{
1487	EDGE edge;
1488
1489	edge.etype = ETYPE_WORD;
1490	edge.to_state = f;
1491	return malloc_edge(s, &edge, fsm);
1492	}
1493	/...sadd_edge_to_fsm_nword:0: /
1494	STATIC BOOLEAN add_edge_to_fsm_nword(int s, int f, FSM * fsm)
1495	{
1496	EDGE edge;
1497
1498	edge.etype = ETYPE_NWORD;
1499	edge.to_state = f;
1500	return malloc_edge(s, &edge, fsm);
1501	}
1502	/...sadd_edge_to_fsm_epsilon:0: /
1503	STATIC BOOLEAN add_edge_to_fsm_epsilon(int s, int f, FSM * fsm)
1504	{
1505	EDGE edge;
1506
1507	edge.etype = ETYPE_EPSILON;
1508	edge.to_state = f;
1509	return malloc_edge(s, &edge, fsm);
1510	}
1511	/...sadd_edge_to_fsm_special:0: /
1512	STATIC BOOLEAN add_edge_to_fsm_special(int s, int f, FSM * fsm, ETYPE etype)
1513	{
1514	EDGE edge;
1515
1516	edge.etype = etype;
1517	edge.to_state = f;
1518	edge.gate = NULL;
1519	return malloc_edge(s, &edge, fsm);
1520	}
1521	/...sadd_edge_to_fsm_back:0: /
1522	STATIC BOOLEAN add_edge_to_fsm_back(int s, int f, FSM * fsm, int n_span)
1523	{
1524	EDGE edge;
1525
1526	edge.etype = ETYPE_BACK;
1527	edge.to_state = f;
1528	edge.gate = NULL;
1529	edge.u.n_span = n_span;
1530	return malloc_edge(s, &edge, fsm);
1531	}
1532
1533	STATIC BOOLEAN make_fsm_from_match(MATCH * match, FSM * fsm, int s, int f)
1534	{
1535	int n1, n2, n3, n4, i;
1536
1537	switch (match->mtype)
1538	{
1539	/...sMTYPE_NULL:16: /
1540	/*
1541	* e
1542	* S ----> F
1543	*
1544	*/
1545
1546	case MTYPE_NULL:
1547	return (*s = malloc_state(fsm)) != -1 &&
1548	(*f = malloc_state(fsm)) != -1 &&
1549	add_edge_to_fsm_epsilon(s, f, fsm);
1550	/...sMTYPE_CHAR:16: /
1551	/*
1552	* c
1553	* S ----> F
1554	*
1555	*/
1556
1557	case MTYPE_CHAR:
1558	return (*s = malloc_state(fsm)) != -1 &&
1559	(*f = malloc_state(fsm)) != -1 &&
1560	add_edge_to_fsm_character(s, f, fsm, match->u.character);
1561	/...sMTYPE_NCHAR:16: /
1562	/*
1563	* ~c
1564	* S ----> F
1565	*
1566	*/
1567
1568	case MTYPE_NCHAR:
1569	return (*s = malloc_state(fsm)) != -1 &&
1570	(*f = malloc_state(fsm)) != -1 &&
1571	add_edge_to_fsm_ncharacter(s, f, fsm, match->u.character);
1572	/...sMTYPE_STRING:16: /
1573	/*
1574	* string
1575	* S ----> F
1576	*
1577	*/
1578
1579	case MTYPE_STRING:
1580	return (*s = malloc_state(fsm)) != -1 &&
1581	(*f = malloc_state(fsm)) != -1 &&
1582	add_edge_to_fsm_string(s, f, fsm, match->u.string);
1583
1584	/...sMTYPE_CCLASS:16: /
1585	/*
1586	* cclass
1587	* S ----> F
1588	*
1589	*/
1590
1591	case MTYPE_CCLASS:
1592	return (*s = malloc_state(fsm)) != -1 &&
1593	(*f = malloc_state(fsm)) != -1 &&
1594	add_edge_to_fsm_cclass(s, f, fsm, match->u.cclass);
1595
1596	/...sMTYPE_DOT:16: /
1597	/*
1598	* any
1599	* S ----> F
1600	*
1601	*/
1602
1603	case MTYPE_DOT:
1604	return (*s = malloc_state(fsm)) != -1 &&
1605	(*f = malloc_state(fsm)) != -1 &&
1606	add_edge_to_fsm_dot(s, f, fsm);
1607
1608	/...sMTYPE_WORD:16: /
1609	/*
1610	* word
1611	* S ----> F
1612	*
1613	*/
1614
1615	case MTYPE_WORD:
1616	return (*s = malloc_state(fsm)) != -1 &&
1617	(*f = malloc_state(fsm)) != -1 &&
1618	add_edge_to_fsm_word(s, f, fsm);
1619
1620	/...sMTYPE_NWORD:16: /
1621	/*
1622	* !word
1623	* S ----> F
1624	*
1625	*/
1626
1627	case MTYPE_NWORD:
1628	return (*s = malloc_state(fsm)) != -1 &&
1629	(*f = malloc_state(fsm)) != -1 &&
1630	add_edge_to_fsm_nword(s, f, fsm);
1631
1632	/...sMTYPE_QUERY:16: /
1633	/*
1634	* e
1635	* S ----> F
1636	* \| ^
1637	* \| e \| e
1638	* v \|
1639	* [n1 n2]
1640	*
1641	*/
1642
1643	case MTYPE_QUERY:
1644	if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1645	return FALSE;
1646	return (*s = malloc_state(fsm)) != -1 &&
1647	(*f = malloc_state(fsm)) != -1 &&
1648	add_edge_to_fsm_epsilon(s, f, fsm) &&
1649	add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1650	add_edge_to_fsm_epsilon(n2, *f, fsm);
1651
1652	/...sMTYPE_PLUS:16: /
1653	/*
1654	*
1655	* [S F]
1656	* ^ e \|
1657	* +---+
1658	*
1659	*/
1660
1661	case MTYPE_PLUS:
1662	if (!make_fsm_from_match(match->u.match, fsm, s, f))
1663	return FALSE;
1664	return add_edge_to_fsm_epsilon(f, s, fsm);
1665
1666	/...sMTYPE_STAR:16: /
1667	/*
1668	*
1669	* +---- S/F
1670	* \| ^
1671	* \| e \| e
1672	* v \|
1673	* [n1 n2]
1674	*
1675	*/
1676
1677	case MTYPE_STAR:
1678	if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1679	return FALSE;
1680	return (s = f = malloc_state(fsm)) != -1 &&
1681	add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1682	add_edge_to_fsm_epsilon(n2, *f, fsm);
1683
1684	/...sMTYPE_CREP:16: /
1685	/*
1686	* e e e e
1687	* S-->[n1 n2]-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,4}
1688	* \| \| \|
1689	* \| e \| e \| e
1690	* v \| \|
1691	* F<---------+----------+
1692	*
1693	* e
1694	* +----+
1695	* \| \|
1696	* e e e v \|
1697	* S-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,}
1698	* \| \|
1699	* \| e \| e
1700	* v \|
1701	* F<---------+
1702	*/
1703
1704	case MTYPE_CREP:
1705	if ((*s = malloc_state(fsm)) == -1 \|\|
1706	(*f = malloc_state(fsm)) == -1)
1707	return FALSE;
1708	n3 = *s;
1709	for (i = 0; i < (int)match->u.crep.m; i++)
1710	{
1711	if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1712	return FALSE;
1713	if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1714	return FALSE;
1715	n3 = n2;
1716	}
1717	if (!add_edge_to_fsm_epsilon(n3, *f, fsm))
1718	return FALSE;
1719	if (match->u.crep.n != ~0)
1720	for (; i < (int)match->u.crep.n; i++)
1721	{
1722	if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1723	return FALSE;
1724	if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1725	return FALSE;
1726	if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1727	return FALSE;
1728	n3 = n2;
1729	}
1730	else
1731	{
1732	if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1733	return FALSE;
1734	if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1735	return FALSE;
1736	if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1737	return FALSE;
1738	if (!add_edge_to_fsm_epsilon(n2, n1, fsm))
1739	return FALSE;
1740	}
1741	return TRUE;
1742
1743	/...sMTYPE_OR:16: /
1744	/*
1745	* e e
1746	* +--->[n1 n2]----+
1747	* \| v
1748	* S F
1749	* \| e e ^
1750	* +--->[n3 n4]----+
1751	*
1752	*/
1753
1754	case MTYPE_OR:
1755	if (!make_fsm_from_match(match->u.matchs[0], fsm, &n1, &n2) \|\|
1756	!make_fsm_from_match(match->u.matchs[1], fsm, &n3, &n4))
1757	return FALSE;
1758	return (*s = malloc_state(fsm)) != -1 &&
1759	(*f = malloc_state(fsm)) != -1 &&
1760	add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1761	add_edge_to_fsm_epsilon(*s, n3, fsm) &&
1762	add_edge_to_fsm_epsilon(n2, *f, fsm) &&
1763	add_edge_to_fsm_epsilon(n4, *f, fsm);
1764
1765	/...sMTYPE_CAT:16: /
1766	/*
1767	* e
1768	* [S n1]---->[n2 F]
1769	*
1770	*/
1771
1772	case MTYPE_CAT:
1773	if (!make_fsm_from_match(match->u.matchs[0], fsm, s, &n1) \|\|
1774	!make_fsm_from_match(match->u.matchs[1], fsm, &n2, f))
1775	return FALSE;
1776	return add_edge_to_fsm_epsilon(n1, n2, fsm);
1777
1778	/...sMTYPE_SUB:16: /
1779	/*
1780	* ssub esub
1781	* S---->[n1 n2]---->F
1782	*
1783	*/
1784
1785	case MTYPE_SUB:
1786	if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1787	return FALSE;
1788	return (*s = malloc_state(fsm)) != -1 &&
1789	(*f = malloc_state(fsm)) != -1 &&
1790	add_edge_to_fsm_special(*s, n1, fsm, ETYPE_SSUB) &&
1791	add_edge_to_fsm_special(n2, *f, fsm, ETYPE_ESUB);
1792
1793	/...sMTYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW:16: /
1794	/*
1795	* special
1796	* S ----> F
1797	*
1798	*/
1799
1800	case MTYPE_SOL:
1801	return (*s = malloc_state(fsm)) != -1 &&
1802	(*f = malloc_state(fsm)) != -1 &&
1803	add_edge_to_fsm_special(s, f, fsm, ETYPE_SOL);
1804	case MTYPE_EOL:
1805	return (*s = malloc_state(fsm)) != -1 &&
1806	(*f = malloc_state(fsm)) != -1 &&
1807	add_edge_to_fsm_special(s, f, fsm, ETYPE_EOL);
1808	case MTYPE_SOW:
1809	return (*s = malloc_state(fsm)) != -1 &&
1810	(*f = malloc_state(fsm)) != -1 &&
1811	add_edge_to_fsm_special(s, f, fsm, ETYPE_SOW);
1812	case MTYPE_EOW:
1813	return (*s = malloc_state(fsm)) != -1 &&
1814	(*f = malloc_state(fsm)) != -1 &&
1815	add_edge_to_fsm_special(s, f, fsm, ETYPE_EOW);
1816	case MTYPE_IW:
1817	return (*s = malloc_state(fsm)) != -1 &&
1818	(*f = malloc_state(fsm)) != -1 &&
1819	add_edge_to_fsm_special(s, f, fsm, ETYPE_IW);
1820	case MTYPE_EW:
1821	return (*s = malloc_state(fsm)) != -1 &&
1822	(*f = malloc_state(fsm)) != -1 &&
1823	add_edge_to_fsm_special(s, f, fsm, ETYPE_EW);
1824
1825	/...sMTYPE_BACK:16: /
1826	case MTYPE_BACK:
1827	return (*s = malloc_state(fsm)) != -1 &&
1828	(*f = malloc_state(fsm)) != -1 &&
1829	add_edge_to_fsm_back(s, f, fsm, match->u.n_span);
1830
1831	}
1832	return FALSE; // Should never happen
1833	}
1834
1835	/...sremove_epsilons:0: /
1836	/* The problems with epsilon moves are :-
1837	* 1) They can gang up on you in groups to form loops of states that require
1838	* no input to go all the way around them. Any function attempting a
1839	* recursive search of the FSM will recurse forever.
1840	* 2) They can slow the recognition process by as much as a factor of 2.
1841	* eg:
1842	* a e b e c
1843	* O ----> O ----> O ----> O ----> O ----> O Is slow
1844	*
1845	* a b c
1846	* O ----> O ----> O ----> O Is faster
1847	*
1848	*/
1849
1850	/...sfinish_states:0: /
1851	/...sis_finish_reachable:0: /
1852	/* When we recurse we mark the current state as visited to stop infinite
1853	* recursion on loops of epsilon moves. */
1854
1855	STATIC BOOLEAN is_finish_reachable(FSM * fsm, int state_no)
1856	{
1857	int edge_no;
1858	BOOLEAN ok;
1859
1860	if (fsm->state_flags[state_no] & FLAG_VISITED)
1861	// Been here already
1862	return FALSE;
1863
1864	if (fsm->state_flags[state_no] & FLAG_FINISH)
1865	// At finish
1866	return TRUE;
1867
1868	for (edge_no = fsm->state_first_edges[state_no];
1869	edge_no != -1;
1870	edge_no = fsm->edges[edge_no].next_edge)
1871	if (fsm->edges[edge_no].etype == ETYPE_EPSILON)
1872	{
1873	fsm->state_flags[state_no] \|= FLAG_VISITED;
1874	ok = is_finish_reachable(fsm, fsm->edges[edge_no].to_state);
1875	fsm->state_flags[state_no] &= ~FLAG_VISITED;
1876	if (ok)
1877	return TRUE;
1878	}
1879
1880
1881	return FALSE;
1882	}
1883
1884
1885	STATIC void finish_states(int f, FSM * fsm, FSM * fsm_without)
1886	{
1887	int state_no;
1888
1889	fsm->state_flags[f] = FLAG_FINISH;
1890	fsm_without->state_flags[f] = FLAG_FINISH;
1891	for (state_no = 0; state_no < fsm->n_states; state_no++)
1892	if (is_finish_reachable(fsm, state_no))
1893	fsm_without->state_flags[state_no] = FLAG_FINISH;
1894	}
1895
1896	/...sdetermine_reachable:0: /
1897	STATIC void determine_reachable(int s, FSM * fsm, FSM * fsm_without)
1898	{
1899	int edge_no, to_state;
1900
1901	fsm_without->state_flags[s] \|= FLAG_REACHABLE;
1902	for (edge_no = 0; edge_no < fsm->n_edges; edge_no++)
1903	if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1904	{
1905	to_state = fsm->edges[edge_no].to_state;
1906	fsm_without->state_flags[to_state] \|= FLAG_REACHABLE;
1907	}
1908	}
1909
1910	/...scopy_non_epsilons:0: /
1911	STATIC void copy_non_epsilons(FSM * fsm, FSM * fsm_without)
1912	{
1913	int state_no, edge_no;
1914
1915	for (state_no = 0; state_no < fsm->n_states; state_no++)
1916	if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1917	for (edge_no = fsm->state_first_edges[state_no];
1918	edge_no != -1;
1919	edge_no = fsm->edges[edge_no].next_edge)
1920	if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1921	malloc_edge(state_no, &(fsm->edges[edge_no]), fsm_without);
1922	}
1923
1924	/...sfollow_epsilons:0: /
1925	/...scopy_edges_reachable:0: /
1926	/* What this says is :-
1927	* If state A can reach state B, by an epsilon move, and
1928	* state B can reach state C then
1929	* state A can reach state C *
1930	*
1931	* If the state B to state C involves an epsilon move then
1932	* A can reach whatever C can reach too (by recursion)
1933	*/
1934
1935	STATIC BOOLEAN copy_edges_reachable(
1936	FSM * fsm,
1937	FSM * fsm_without,
1938	int state_no_to, /* AK: Bad identifier, might better be
1939	* described as 'original reachable state' */
1940	int state_no_from
1941	)
1942	{
1943	int edge_no;
1944	BOOLEAN ok;
1945
1946	if (fsm->state_flags[state_no_from] & FLAG_VISITED)
1947	// Been here already, therefore all copied from here ok
1948	return TRUE;
1949
1950	for (edge_no = fsm->state_first_edges[state_no_from];
1951	edge_no != -1;
1952	edge_no = fsm->edges[edge_no].next_edge)
1953	if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1954	// Had better add this edge
1955	/...sadd this edge to the \39\to\39\ state:24: /
1956	{
1957	if (!malloc_edge(state_no_to, &(fsm->edges[edge_no]), fsm_without))
1958	return (FALSE);
1959	}
1960
1961	else
1962	{
1963	fsm->state_flags[state_no_from] \|= FLAG_VISITED;
1964	ok = copy_edges_reachable(fsm, fsm_without, state_no_to, fsm->edges[edge_no].to_state);
1965	fsm->state_flags[state_no_from] &= ~FLAG_VISITED;
1966	if (!ok)
1967	return FALSE;
1968	}
1969
1970	return TRUE;
1971	}
1972
1973
1974	STATIC BOOLEAN follow_epsilons(FSM * fsm, FSM * fsm_without)
1975	{
1976	int state_no;
1977
1978	for (state_no = 0; state_no < fsm->n_states; state_no++)
1979	if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1980	if (!copy_edges_reachable(fsm, fsm_without, state_no, state_no))
1981	return FALSE;
1982	return TRUE;
1983	}
1984
1985
1986	STATIC BOOLEAN remove_epsilons(int s, int f, FSM * fsm, FSM * fsm_without)
1987	{
1988	// FSM with no epsilon moves will have the same number of states
1989
1990	fsm_without->n_states = fsm->n_states;
1991
1992	// Mark state f as a finish state in the new FSM
1993	// Any state with epsilon move(s) to state f is also a finish state
1994
1995	finish_states(f, fsm, fsm_without);
1996
1997	// Determine which states can be reached by non-epsilon moves. Add
1998	// to this set, the start state. The resulting states should have
1999	// their edges considered, but the other will not be reachable. This
2000	// is because they will be bypassed by follow_epsilons().
2001
2002	determine_reachable(s, fsm, fsm_without);
2003
2004	// Copy across all reachable, non epsilon moves to new FSM
2005
2006	copy_non_epsilons(fsm, fsm_without);
2007
2008	// For all states, determine all other states that can be reached by
2009	// epsilon moves and add the edges leading from them to us
2010
2011	return follow_epsilons(fsm, fsm_without);
2012	}
2013
2014	/...smatch_fsm:0: /
2015	/* Stack requirements per call, (one call per character in string!).
2016	*
2017	* eg: 16 bit OS/2, large model :-
2018	* Return address + Stack frame + SI and DI + Arguments + Locals
2019	* 2 + 2 + 2+2 + 4+2+4+4 + 2 = 24
2020	*
2021	* eg: 16 bit DOS large model :-
2022	* Return address + Stack frame + SI and DI + Arguments + Locals
2023	* 2 + 2 + 2+2 + 4+2 + 2 = 16
2024	*
2025	* eg: RS/6000 AIX :-
2026	* Massive stack frame per call, massive program stack size
2027	* net effect - room for approx 200 levels before core dump!
2028	*
2029	* In addition, we can consume additional stack for every ETYPE_SSUB
2030	* (and ETYPE_ESUB). Guessing this requirement to be approx 100 bytes,
2031	* we place an arbitrary limit of 20 (ie: 2KB) on subexpressions in an ERE.
2032	* This limit was enforced earlier during match tree parsing.
2033	*
2034	*/
2035
2036	typedef struct substruct SUBS;
2037	struct substruct
2038	{
2039	int n_spans;
2040	ERE_SPAN spans[MAX_SPANS];
2041	SUBS *next;
2042	};
2043
2044	#define MAX_SUBS 20
2045
2046	typedef struct
2047	{
2048	FSM *fsm;
2049	int eremf;
2050	const char *str_init;
2051	const char *str_limit;
2052	const char *str_best;
2053	SUBS *subs;
2054	SUBS *subs_base;
2055	ERE_MATCHINFO *mi;
2056	}
2057	CONTEXT;
2058
2059	#define NR
2060
2061	STATIC void NR walk_fsm(const char str, int state_no, CONTEXT cx);
2062
2063	STATIC void NR walk_fsm_gated(const char str, CONTEXT cx, EDGE * e)
2064	{
2065	if (e->gate != str)
2066	// Avoid looping via this edge
2067	{
2068	const char *gate = e->gate;
2069
2070	e->gate = str;
2071	walk_fsm(str, e->to_state, cx);
2072	e->gate = gate;
2073	}
2074	}
2075
2076	STATIC void NR walk_fsm_ssub(const char str, CONTEXT cx, EDGE * e)
2077	{
2078	SUBS subs;
2079
2080	if (cx->subs->n_spans < MAX_SPANS)
2081	cx->subs->spans[cx->subs->n_spans].pos = str - cx->str_init;
2082
2083	subs.n_spans = 0;
2084	subs.next = cx->subs;
2085	cx->subs = &subs;
2086	walk_fsm_gated(str, cx, e);
2087	cx->subs = subs.next;
2088	}
2089
2090	STATIC void NR walk_fsm_esub(const char str, CONTEXT cx, EDGE * e)
2091	{
2092	SUBS *subs = cx->subs;
2093
2094	cx->subs = cx->subs->next;
2095
2096	if (cx->subs->n_spans < MAX_SPANS)
2097	cx->subs->spans[cx->subs->n_spans].len =
2098	(str - cx->str_init) - cx->subs->spans[cx->subs->n_spans].pos;
2099	++(cx->subs->n_spans);
2100
2101	walk_fsm_gated(str, cx, e);
2102
2103	--(cx->subs->n_spans);
2104	cx->subs = subs;
2105	}
2106
2107	STATIC void NR walk_fsm(const char str, int state_no, CONTEXT cx)
2108	{
2109	int edge_no;
2110
2111	if (cx->fsm->state_flags[state_no] & FLAG_FINISH)
2112	// Got to finishing state, may have got a better match than before
2113	{
2114	if ((cx->str_best == NULL \|\|
2115	(cx->str_best < str) == (cx->eremf & EREMF_SHORTEST) == 0) &&
2116	str <= cx->str_limit)
2117	{
2118	cx->str_best = str;
2119	if (cx->mi != NULL)
2120	{
2121	int i;
2122
2123	cx->mi->n_spans = cx->subs_base->n_spans;
2124	for (i = 0; i < cx->mi->n_spans; i++)
2125	cx->mi->spans[i] = cx->subs_base->spans[i];
2126	}
2127	}
2128	if (cx->eremf & EREMF_ANY)
2129	return;
2130	// Continue, as may be able to get a better match
2131	}
2132
2133	for (edge_no = cx->fsm->state_first_edges[state_no];
2134	edge_no != -1;
2135	edge_no = cx->fsm->edges[edge_no].next_edge)
2136	// Consider taking a step along an edge to a new state
2137	{
2138	EDGE *e = &(cx->fsm->edges[edge_no]);
2139
2140	switch (e->etype)
2141	{
2142	/...sETYPE_CHAR \45\ if matches character\44\ we can advance:24: /
2143	case ETYPE_CHAR:
2144	if (*str == e->u.character)
2145	walk_fsm(str + 1, e->to_state, cx);
2146	break;
2147
2148	/...sETYPE_NCHAR \45\ if not matches character\44\ we can advance:24: /
2149	case ETYPE_NCHAR:
2150	if (str != '\0' && str != e->u.character)
2151	walk_fsm(str + 1, e->to_state, cx);
2152	break;
2153
2154	/...sETYPE_STRING \45\ if matches string\44\ we can advance:24: /
2155	case ETYPE_STRING:
2156	{
2157	unsigned len = (unsigned char)e->u.string[0];
2158
2159	if (!memcmp(str, e->u.string + 1, len))
2160	walk_fsm(str + len, e->to_state, cx);
2161	}
2162	break;
2163
2164	/...sETYPE_DOT \45\ if got any character\44\ we can advance:24: /
2165	case ETYPE_DOT:
2166	if (*str != '\0')
2167	walk_fsm(str + 1, e->to_state, cx);
2168	break;
2169
2170	/...sETYPE_WORD \45\ if got word constituent\44\ we can advance:24: /
2171	case ETYPE_WORD:
2172	if (str != '\0' && isword(str))
2173	walk_fsm(str + 1, e->to_state, cx);
2174	break;
2175
2176	/...sETYPE_NWORD \45\ if got non word constituent\44\ we can advance:24: /
2177	case ETYPE_NWORD:
2178	if (str != '\0' && !isword(str))
2179	walk_fsm(str + 1, e->to_state, cx);
2180	break;
2181
2182	/...sETYPE_CCLASS \45\ if in the class\44\ we can advance:24: /
2183	case ETYPE_CCLASS:
2184	if (match_cclass(*str, e->u.cclass))
2185	walk_fsm(str + 1, e->to_state, cx);
2186	break;
2187
2188	/...sETYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW \45\ special epsilon moves:24: /
2189	case ETYPE_SOL:
2190	if (str == cx->str_init)
2191	walk_fsm_gated(str, cx, e);
2192	break;
2193	case ETYPE_EOL:
2194	if (*str == '\0')
2195	walk_fsm_gated(str, cx, e);
2196	break;
2197	case ETYPE_SOW:
2198	if (isword(str[0]) &&
2199	((cx->str_init < str && !isword(str[-1])) \|\|
2200	(cx->str_init == str)))
2201	walk_fsm_gated(str, cx, e);
2202	break;
2203	case ETYPE_EOW:
2204	if ((cx->str_init < str && isword(str[-1])) &&
2205	(str[0] == '\0' \|\| !isword(str[0])))
2206	walk_fsm_gated(str, cx, e);
2207	break;
2208	case ETYPE_IW:
2209	if (cx->str_init < str && isword(str[-1]) &&
2210	str[0] != '\0' && isword(str[0]))
2211	walk_fsm_gated(str, cx, e);
2212	break;
2213	case ETYPE_EW:
2214	if (isword(str[0]) &&
2215	((cx->str_init < str && !isword(str[-1])) \|\|
2216	(cx->str_init == str)))
2217	walk_fsm_gated(str, cx, e);
2218	else if ((cx->str_init < str && isword(str[-1])) &&
2219	(str[0] == '\0' \|\| !isword(str[0])))
2220	walk_fsm_gated(str, cx, e);
2221	break;
2222
2223	/...sETYPE_SSUB\47\ESUB \45\ handle nested subexpression:24: /
2224	case ETYPE_SSUB:
2225	walk_fsm_ssub(str, cx, e);
2226	break;
2227	case ETYPE_ESUB:
2228	walk_fsm_esub(str, cx, e);
2229	break;
2230
2231	/...sETYPE_BACK \45\ check backreference:24: /
2232	case ETYPE_BACK:
2233	if (e->u.n_span < cx->subs->n_spans)
2234	{
2235	int len = cx->subs->spans[e->u.n_span].len;
2236
2237	if (!memcmp(str, cx->str_init + cx->subs->spans[e->u.n_span].pos, len))
2238	walk_fsm_gated(str + len, cx, e);
2239	}
2240	break;
2241
2242	}
2243	}
2244	}
2245
2246	STATIC const char match_fsm(FSM fsm,
2247	int eremf,
2248	const char *str,
2249	int posn,
2250	int limit,
2251	int state_no,
2252	ERE_MATCHINFO * mi)
2253	{
2254	CONTEXT cx;
2255	SUBS subs;
2256
2257	cx.fsm = fsm;
2258	cx.eremf = eremf;
2259	cx.str_init = str;
2260	cx.str_limit = str + limit;
2261	cx.str_best = NULL;
2262	cx.subs = &subs;
2263	cx.subs_base = &subs;
2264	cx.mi = mi;
2265	subs.n_spans = 0;
2266	walk_fsm(str + posn, state_no, &cx);
2267	return cx.str_best;
2268	}
2269
2270
2271	#ifdef DEBUG
2272	/...sprint_fsm:0: /
2273	STATIC void print_fsm(FSM * fsm, int s, BOOLEAN not_just_reachable)
2274	{
2275	int state_no, edge_no;
2276
2277	printf("Starting state %02d\n", s);
2278	for (state_no = 0; state_no < fsm->n_states; state_no++)
2279	if ((fsm->state_flags[state_no] & FLAG_REACHABLE) != 0 \|\|
2280	not_just_reachable)
2281	{
2282	printf("%02d:%c\t", state_no, ((fsm->state_flags[state_no] & FLAG_FINISH) != 0) ? 'F' : ' ');
2283	for (edge_no = fsm->state_first_edges[state_no];
2284	edge_no != -1;
2285	edge_no = fsm->edges[edge_no].next_edge)
2286	/...sshow edge:32: /
2287	{
2288	EDGE *e = &(fsm->edges[edge_no]);
2289
2290	switch (e->etype)
2291	{
2292	case ETYPE_CHAR:
2293	printf("%c", e->u.character);
2294	break;
2295	case ETYPE_NCHAR:
2296	printf("~%c", e->u.character);
2297	break;
2298	case ETYPE_STRING:
2299	printf("%.s",
2300	(unsigned char)e->u.string[0],
2301	(unsigned char)e->u.string[0],
2302	e->u.string + 1);
2303	break;
2304	case ETYPE_DOT:
2305	printf(".");
2306	break;
2307	case ETYPE_CCLASS:
2308	printf("[");
2309	break;
2310	case ETYPE_WORD:
2311	printf("\\w");
2312	break;
2313	case ETYPE_NWORD:
2314	printf("\\W");
2315	break;
2316	case ETYPE_EPSILON:
2317	printf("e");
2318	break;
2319	case ETYPE_SOL:
2320	printf("^");
2321	break;
2322	case ETYPE_EOL:
2323	printf("$");
2324	break;
2325	case ETYPE_SOW:
2326	printf("\\<");
2327	break;
2328	case ETYPE_EOW:
2329	printf("\\>");
2330	break;
2331	case ETYPE_IW:
2332	printf("\\B");
2333	break;
2334	case ETYPE_EW:
2335	printf("\\y");
2336	break;
2337	case ETYPE_SSUB:
2338	printf("(");
2339	break;
2340	case ETYPE_ESUB:
2341	printf(")");
2342	break;
2343	case ETYPE_BACK:
2344	printf("\\%d", e->u.n_span + 1);
2345	break;
2346	}
2347	printf("->%02d\t", e->to_state);
2348	}
2349
2350	printf("\n");
2351	}
2352	}
2353
2354	#endif
2355
2356	/...sextended regular expressions:0: /
2357	/* An ERE knows its original match tree and also the FSM it is compiled into.
2358	* Using a (largely epsilon move free) FSM makes for faster searching. */
2359
2360	typedef struct
2361	{
2362	MATCH *match; // Parse tree for expression
2363	int shortest_match; // Shortest match possible
2364	FSM *fsm; // Compiled FSM
2365	int s; // Start state for FSM
2366	}
2367	ERE;
2368
2369	/*
2370	*@@ rxpCompile:
2371	* compiles the regular expression str for later matching.
2372	*
2373	* If ERECF_TOLOWER is passed with erecf, every character
2374	* (or range of characters) to be matched are stored in the
2375	* compiled ERE in lower case. Therefore, if strings to be
2376	* matched are passed in lower case also, the result is a
2377	* case-insensitive match.
2378	*/
2379
2380	ERE* rxpCompile(const char *str,
2381	int erecf,
2382	int *rc) // out: error code
2383	{
2384	ERE *ere;
2385	const char *str_after;
2386	FSM *fsm;
2387	int s, f;
2388
2389	*rc = NO_ERROR;
2390
2391	if ((ere = (ERE *) malloc(sizeof(ERE))) == NULL)
2392	{
2393	*rc = ERROR_NOT_ENOUGH_MEMORY;
2394	return NULL;
2395	}
2396
2397	if ((ere->match = compile_match(str, &str_after, erecf, rc)) == NULL)
2398	{
2399	free(ere);
2400	return NULL;
2401	}
2402
2403	if (thisch(str_after) == CH_RPAR)
2404	{
2405	delete_match(ere->match);
2406	free(ere);
2407	*rc = EREE_UNEX_RPAR;
2408	return NULL;
2409	}
2410
2411	if (count_sub(ere->match) > MAX_SUBS)
2412	{
2413	delete_match(ere->match);
2414	free(ere);
2415	*rc = EREE_TOO_MANY_SUB;
2416	return NULL;
2417	}
2418
2419	#ifdef DEBUG
2420	print_tree(ere->match, 0);
2421	#endif
2422
2423	ere->shortest_match = (int)shortest_match(ere->match);
2424
2425	if ((fsm = create_fsm(rc)) == NULL)
2426	{
2427	delete_match(ere->match);
2428	free(ere);
2429	return NULL;
2430	}
2431
2432	if (!make_fsm_from_match(ere->match, fsm, &s, &f))
2433	{
2434	delete_fsm(fsm);
2435	delete_match(ere->match);
2436	free(ere);
2437	*rc = EREE_COMPILE_FSM;
2438	return NULL;
2439	}
2440
2441	#ifdef DEBUG
2442	print_fsm(fsm, s, TRUE);
2443	#endif
2444
2445	if ((ere->fsm = create_fsm(rc)) == NULL)
2446	{
2447	delete_fsm(fsm);
2448	delete_match(ere->match);
2449	free(ere);
2450	*rc = EREE_COMPILE_FSM;
2451	return NULL;
2452	}
2453
2454	if (!remove_epsilons(s, f, fsm, ere->fsm))
2455	{
2456	delete_fsm(ere->fsm);
2457	delete_fsm(fsm);
2458	delete_match(ere->match);
2459	free(ere);
2460	*rc = EREE_COMPILE_FSM;
2461	return NULL;
2462	}
2463
2464	delete_fsm(fsm);
2465
2466	ere->s = s;
2467
2468	#ifdef DEBUG
2469	print_fsm(ere->fsm, s, FALSE);
2470	#endif
2471
2472	return ere;
2473	}
2474
2475	/*
2476	*@@ rxpMinLen:
2477	*
2478	*/
2479
2480	int rxpMinLen(const ERE * ere)
2481	{
2482	return ere->shortest_match;
2483	}
2484
2485	/*
2486	*@@ rxpMatch:
2487	* returns the number of characters in the match, starting from
2488	* pos characters into the string to be searched. Details of
2489	* sub-matches can also be returend. Returns -1 if no match.
2490	*
2491	* If EREMF_SHORTEST is passed with eremf, the code looks for
2492	* the shortest match, instead of the longest match.
2493	*
2494	* If EREMF_ANY is passed with eremf, the code doesn't try to
2495	* find the longest (or shortest) match, it will return with the
2496	* first match it finds (which could be of any length).
2497	* This can speed up matching.
2498	*/
2499
2500	int rxpMatch(const ERE * ere,
2501	int eremf,
2502	const char *str,
2503	int pos,
2504	ERE_MATCHINFO * mi)
2505	{
2506	int len = pos + strlen(str + pos);
2507	const char *str_best;
2508
2509	if ((str_best = match_fsm(ere->fsm, eremf, str, pos, len, ere->s, mi)) == NULL)
2510	return -1;
2511	return (str_best - str) - pos;
2512	}
2513
2514	/*
2515	*@@ rxpMatch_fwd:
2516	* match forwards within a string from a specified start position.
2517	*
2518	* If a match, return TRUE, and also return pos and len of the match.
2519	*
2520	* If EREMF_SHORTEST is passed with eremf, the code looks for
2521	* the shortest match, instead of the longest match.
2522	*
2523	* If EREMF_ANY is passed with eremf, the code doesn't try to
2524	* find the longest (or shortest) match, it will return with the
2525	* first match it finds (which could be of any length).
2526	* This can speed up matching.
2527	*/
2528
2529	BOOLEAN rxpMatch_fwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2530	int eremf, // in: EREMF_* flags
2531	const char *str, // in: string to test
2532	int pos, // in: start position
2533	int *pos_match, // out: position of match
2534	int *len_match, // out: length of match
2535	ERE_MATCHINFO *mi) // out: match info (for rxpSubsWith)
2536	{
2537	int len = pos + strlen(str + pos);
2538	int i;
2539
2540	for (i = pos; i <= len - ere->shortest_match; i++)
2541	{
2542	const char *str_best;
2543
2544	if ((str_best = match_fsm(ere->fsm, eremf, str, i, len, ere->s, mi)) != NULL)
2545	{
2546	*pos_match = i;
2547	*len_match = (str_best - str) - i;
2548	return TRUE;
2549	}
2550	}
2551	return FALSE;
2552	}
2553
2554	/*
2555	*@@ rxpMatch_bwd:
2556	* match backwards within a string not passing a
2557	* specified end position.
2558	*
2559	* If a match, return TRUE, and also return pos and
2560	* len of the match. We need to consider matches from
2561	* the beginning of the line. We want the one which
2562	* ends up in the rightmost position. Of those which
2563	* end up equally far right, we want the one which
2564	* extends the furthest (or shortest if EREMF_SHORTEST)
2565	* left. See how we get this as a side effect of the
2566	* loop ordering and the '>= + delta' test.
2567	*
2568	* This may not look as efficient as scanning the string
2569	* backwards, but note that this would require a reversed
2570	* ERE too, and we can't reverse EREs as they may
2571	* contain backreferences.
2572	*
2573	* If EREMF_SHORTEST is passed with eremf, the code looks for
2574	* the shortest match, instead of the longest match.
2575	*
2576	* If EREMF_ANY is passed with eremf, the code doesn't try to
2577	* find the longest (or shortest) match, it will return with the
2578	* first match it finds (which could be of any length).
2579	* This can speed up matching.
2580	*/
2581
2582	BOOLEAN rxpMatch_bwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2583	int eremf, // in: EREMF_* flags
2584	const char *str, // in: string to test
2585	int pos, // in: start position
2586	int *pos_match, // out: position of match
2587	int *len_match, // out: length of match
2588	ERE_MATCHINFO * mi) // out: match info (for rxpSubsWith)
2589	{
2590	int i;
2591	int delta = (eremf & EREMF_SHORTEST) ? 0 : 1;
2592	const char *rightmost = NULL;
2593	ERE_MATCHINFO mi2;
2594
2595	for (i = 0; i <= pos - ere->shortest_match; i++)
2596	{
2597	const char *str_best;
2598
2599	if ((str_best = match_fsm(ere->fsm, eremf, str, i, pos, ere->s, &mi2)) != NULL)
2600	{
2601	if (rightmost == NULL \|\|
2602	str_best >= rightmost + delta)
2603	{
2604	*pos_match = i;
2605	*len_match = (str_best - str) - i;
2606	rightmost = str_best;
2607	if (mi != NULL)
2608	{
2609	mi->n_spans = mi2.n_spans;
2610	for (i = 0; i < mi->n_spans; i++)
2611	mi->spans[i] = mi2.spans[i];
2612	}
2613	}
2614	}
2615	}
2616	return rightmost != NULL;
2617	}
2618
2619	/*
2620	*@@ rxpFree:
2621	* frees all resources allocated by rxpCompile.
2622	*/
2623
2624	void rxpFree(ERE * ere)
2625	{
2626	if (ere)
2627	{
2628	delete_match(ere->match);
2629	delete_fsm(ere->fsm);
2630	free(ere);
2631	}
2632	}
2633
2634	/*
2635	*@@ rxpSubsWith:
2636	* perform a substitution based upon an earlier found match.
2637	* This allows for implementing a "find and replace" function.
2638	*/
2639
2640	BOOLEAN rxpSubsWith(const char *str, // in: original string searched (same as str given to rxpMatch_fwd)
2641	int pos, // in: span of the entire match (pos_match from rxpMatch_fwd)
2642	int len, // in: span of the entire match (len_match from rxpMatch_fwd)
2643	ERE_MATCHINFO *mi, // in: details of match sub-spans (as from rxpMatch_fwd)
2644	const char *with, // in: replacement string with \1 etc.
2645	char *out, // out: buffer for string substitutions
2646	int len_out, // in: sizeof *out
2647	int *rc) // out: error, if FALSE returned
2648	{
2649	int i = 0;
2650	int j;
2651
2652	memcpy(out, str, pos);
2653	i += pos;
2654	while (*with != '\0')
2655	{
2656	const char *rep;
2657	int len_rep;
2658
2659	if (*with != '\\')
2660	{
2661	rep = with++;
2662	len_rep = 1;
2663	}
2664	else
2665	{
2666	++with;
2667	if (with >= '1' && with <= '9')
2668	{
2669	int span = *with - '1';
2670
2671	++with;
2672	if (span >= mi->n_spans)
2673	{
2674	*rc = EREE_BAD_BACKREF;
2675	return FALSE;
2676	}
2677	rep = str + mi->spans[span].pos;
2678	len_rep = mi->spans[span].len;
2679	}
2680	else if (*with != '\0')
2681	{
2682	rep = with++;
2683	len_rep = 1;
2684	}
2685	else
2686	{
2687	*rc = EREE_BAD_BACKSLASH;
2688	return FALSE;
2689	}
2690	}
2691	if (i + len_rep > len_out)
2692	{
2693	*rc = EREE_SUBS_LEN;
2694	return FALSE;
2695	}
2696	memcpy(out + i, rep, len_rep);
2697	i += len_rep;
2698	}
2699	j = pos + len + strlen(str + pos + len);
2700	if (i + j > len_out)
2701	{
2702	*rc = EREE_SUBS_LEN;
2703	return FALSE;
2704	}
2705	memcpy(out + i, str + pos + len, j);
2706	i += j;
2707	out[i] = '\0';
2708	return TRUE;
2709	}
2710
2711	#ifdef __TESTCASE__
2712
2713	int main(int argc, char *argv[])
2714	{
2715	ERE *ere;
2716	ERE_MATCHINFO mi;
2717	int rc;
2718
2719	const char pcsz, pcszEre;
2720
2721	if (argc != 3)
2722	{
2723	printf("Usage: regexp <teststring> <ere>\n");
2724	exit(1);
2725	}
2726
2727	pcsz = argv[1];
2728	pcszEre = argv[2];
2729
2730	printf("matching \"%s\" against \"%s\"\n",
2731	pcsz,
2732	pcszEre);
2733	fflush(stdout);
2734
2735	if (!(ere = rxpCompile(pcszEre,
2736	0,
2737	&rc)))
2738	{
2739	printf("Error %d in rxpCompile: %s\n", rc, rxpError(rc));
2740	exit(rc);
2741	}
2742
2743	{
2744	int pos, length;
2745	rc = rxpMatch_fwd(ere,
2746	0,
2747	pcsz,
2748	0,
2749	&pos,
2750	&length,
2751	&mi);
2752
2753	if (rc == 0)
2754	printf("no match\n");
2755	else
2756	printf("found at pos %d, length %d\n", pos, length);
2757	}
2758
2759	return 0;
2760	}
2761
2762	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats: