source: trunk/src/helpers/regexp.c@ 178

Last change on this file since 178 was 178, checked in by umoeller, 23 years ago

More pager fixes.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 77.9 KB
Line 
1
2/*
3 *@@sourcefile regexp.c:
4 * extended regular expressions, taken from "Andy's Extended
5 * Regular Expressions", as written by Andy Key
6 * (nyangau@interalpha.co.uk) and released into the public
7 * domain, plus adjustments for the XWP helpers.
8 *
9 * Usage: All C programs; not OS/2-specific.
10 *
11 * Function prefixes:
12 * -- rxp* regular expression functions.
13 *
14 * Regular expression matching is done in the following stages:
15 *
16 * 1) Call rxpCompile to parse the expression into a recursive
17 * tree of matches.
18 *
19 * This tree is converted into a finite state machine (FSM).
20 * A second FSM is built from the first but with epsilon
21 * moves removed to elimate lockups and increase speed.
22 *
23 * The input string can be used to drive the FSM through all
24 * the possible routes. The largest (or smallest) amount of
25 * input string required to reach the finish state is recorded
26 * since an extended regular expression is deemed to match as
27 * much input string as possible.
28 *
29 * 2) Call one of rxpMatch, rxpMatch_fwd, or rxpMatch_bwd to
30 * perform a match.
31 *
32 * 3) Call rxpFree to free the compiled ERE.
33 *
34 * Beware: The matching routine is highly recursive and can
35 * require around 20 to 30 bytes per character in the source string
36 * to match it. Thus you should try to limit the length of the source
37 * string and/or allow a stack size of at least 20 (to 30) * max string
38 * length. In addition, add 2KB to allow for the use of several nested
39 * sub-expressions in the matching.
40 *
41 *@@header "helpers\regexp.h"
42 *@@added V0.9.19 (2002-04-17) [umoeller]
43 */
44
45/*
46 * Copyright (C) 2002 Ulrich M”ller.
47 * This file is part of the "XWorkplace helpers" source package.
48 * This is free software; you can redistribute it and/or modify
49 * it under the terms of the GNU General Public License as published
50 * by the Free Software Foundation, in version 2 as it comes in the
51 * "COPYING" file of the XWorkplace main distribution.
52 * This program is distributed in the hope that it will be useful,
53 * but WITHOUT ANY WARRANTY; without even the implied warranty of
54 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
55 * GNU General Public License for more details.
56 */
57
58#include <stdio.h>
59#include <ctype.h>
60#include <stdlib.h>
61#include <string.h>
62#include <malloc.h>
63#include <memory.h>
64#define ERE_C
65#include "helpers\regexp.h"
66
67#define isword(c) (isalnum(c)||(c)=='_')
68
69static int val_of_hex(char c)
70{
71 if (c >= '0' && c <= '9')
72 return c - '0';
73 if (c >= 'a' && c <= 'f')
74 return c - 'a' + 10;
75 if (c >= 'A' && c <= 'F')
76 return c - 'A' + 10;
77 return 0; // Shouldn't get here
78}
79
80static int escaped(const char *s)
81{
82 if (s[0] == 'x' && isxdigit(s[1]))
83 // \x followed by 1 or 2 hex digits
84 {
85 int n = val_of_hex(s[1]);
86
87 if (isxdigit(s[2]))
88 n = n * 16 + val_of_hex(s[2]);
89 return n;
90 }
91 else
92 switch (s[0])
93 {
94 case 'n':
95 return '\n';
96 case 't':
97 return '\t';
98 case 'r':
99 return '\r';
100 case 'b':
101 return '\b';
102 case 'f':
103 return '\f';
104 case 'e':
105 return 0x1b;
106 default:
107 return (unsigned char)s[0];
108 }
109}
110
111static const char *past_escaped(const char *s)
112{
113 if (s[0] == 'x' && isxdigit(s[1]))
114 return isxdigit(s[2]) ? s + 3 : s + 2;
115 else
116 return s + 1;
117}
118
119#define zero_cclass(cclass) memset(cclass, 0, 0x100 >> 3)
120
121static unsigned char bits[] =
122{0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
123
124#define add_to_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] |= bits[(unsigned char)(n) & 7])
125
126#define remove_from_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] &= ~bits[(unsigned char)(n) & 7])
127
128static void invert_cclass(unsigned char *cclass)
129{
130 int i;
131
132 for (i = 0; i < (0x100 >> 3); i++)
133 cclass[i] ^= 0xff;
134}
135
136#define match_cclass(n, cclass) ((cclass[(unsigned char)(n)>>3] & bits[(unsigned char)(n) & 7]) != 0)
137
138#define CHL_EOS 0
139#define CHL_END_CCLASS (-1)
140#define CHL_COMP (-2)
141#define CHL_RANGE (-3)
142#define CHL_POSIX_COLLATING (-4)
143#define CHL_POSIX_EQUIVALENCE (-5)
144#define CHL_POSIX_CCLASS_BAD (-6)
145#define CHL_POSIX_CCLASS_BASE (-30)
146#define CHL_POSIX_CCLASS_END (-30+20)
147
148/* I use my own wrapper functions so I can take their addresses.
149 * Remember, isalnum etc. can be implemented as macros... */
150static BOOLEAN my_isalnum(int ch)
151{
152 return isalnum(ch);
153}
154static BOOLEAN my_isalpha(int ch)
155{
156 return isalpha(ch);
157}
158static BOOLEAN my_isblank(int ch)
159{
160 return ch == ' ' || ch == '\t';
161}
162static BOOLEAN my_iscntrl(int ch)
163{
164 return iscntrl(ch);
165}
166static BOOLEAN my_isdigit(int ch)
167{
168 return isdigit(ch);
169}
170static BOOLEAN my_islower(int ch)
171{
172 return islower(ch);
173}
174static BOOLEAN my_isprint(int ch)
175{
176 return isprint(ch);
177}
178static BOOLEAN my_ispunct(int ch)
179{
180 return ispunct(ch);
181}
182static BOOLEAN my_isspace(int ch)
183{
184 return isspace(ch);
185}
186static BOOLEAN my_isupper(int ch)
187{
188 return isupper(ch);
189}
190static BOOLEAN my_isxdigit(int ch)
191{
192 return isxdigit(ch);
193}
194
195typedef struct
196{
197 int len_name;
198 const char *name;
199 BOOLEAN(*iscclass) (int ch);
200}
201POSIX_CCLASS;
202
203static POSIX_CCLASS posix_cclass[] =
204{
205 5, "alnum", my_isalnum,
206 5, "alpha", my_isalpha,
207 5, "blank", my_isblank,
208 5, "cntrl", my_iscntrl,
209 5, "digit", my_isdigit,
210 5, "lower", my_islower,
211 5, "print", my_isprint,
212 5, "punct", my_ispunct,
213 5, "space", my_isspace,
214 5, "upper", my_isupper,
215 6, "xdigit", my_isxdigit,
216};
217
218static int find_posix(const char *str)
219{
220 int p;
221
222 if (str[0] != '[')
223 return str[0];
224 if (str[1] == '.')
225 return CHL_POSIX_COLLATING;
226 if (str[1] == '=')
227 return CHL_POSIX_EQUIVALENCE;
228 if (str[1] != ':')
229 return str[0];
230 for (p = 0; p < sizeof(posix_cclass) / sizeof(posix_cclass[0]); p++)
231 if (!memcmp(str + 2, posix_cclass[p].name, posix_cclass[p].len_name) &&
232 str[2 + posix_cclass[p].len_name] == ':' &&
233 str[2 + posix_cclass[p].len_name + 1] == ']')
234 return CHL_POSIX_CCLASS_BASE + p;
235 return CHL_POSIX_CCLASS_BAD;
236}
237
238static int cclass_thisch(const char *str)
239{
240 switch (str[0])
241 {
242 case '\\':
243 return escaped(str + 1);
244 case ']':
245 return CHL_END_CCLASS;
246 case '^':
247 return CHL_COMP;
248 case '-':
249 return CHL_RANGE;
250 case '\0':
251 return CHL_EOS;
252 default:
253 return find_posix(str);
254 }
255}
256
257static const char *cclass_nextch(const char *str)
258{
259 int p;
260
261 if (str[0] == '\\')
262 return past_escaped(str + 1);
263 p = find_posix(str);
264 if (CHL_POSIX_CCLASS_BASE <= p && p < CHL_POSIX_CCLASS_END)
265 return str + 2 + posix_cclass[p - CHL_POSIX_CCLASS_BASE].len_name + 2;
266 else
267 return str + 1;
268}
269
270static unsigned char *compile_cclass(const char *str,
271 const char **str_after,
272 int erecf,
273 int *rc)
274{
275 unsigned char *cclass;
276 BOOLEAN complement;
277 int i, c, last_c = -1;
278
279 if ((cclass = (unsigned char *)malloc(0x100 >> 3)) == NULL)
280 {
281 *rc = ERROR_NOT_ENOUGH_MEMORY;
282 return NULL;
283 }
284 zero_cclass(cclass);
285
286 complement = (cclass_thisch(str) == CHL_COMP);
287 if (complement)
288 str = cclass_nextch(str);
289
290 while ((c = cclass_thisch(str)) != CHL_EOS && c != CHL_END_CCLASS)
291 {
292 if (CHL_POSIX_CCLASS_BASE <= c && c < CHL_POSIX_CCLASS_END)
293 {
294 for (i = 0; i < 0x100; i++)
295 if (posix_cclass[c - CHL_POSIX_CCLASS_BASE].iscclass(i))
296 add_to_cclass(i, cclass);
297 last_c = -1;
298 }
299 else
300 switch (c)
301 {
302/*...sCHL_POSIX_\42\ \45\ error cases:32: */
303 case CHL_POSIX_COLLATING:
304 free(cclass);
305 *rc = EREE_POSIX_COLLATING;
306 return NULL;
307 case CHL_POSIX_EQUIVALENCE:
308 free(cclass);
309 *rc = EREE_POSIX_EQUIVALENCE;
310 return NULL;
311 case CHL_POSIX_CCLASS_BAD:
312 free(cclass);
313 *rc = EREE_POSIX_CCLASS_BAD;
314 return NULL;
315/*...sCHL_RANGE \45\ range:32: */
316 case CHL_RANGE:
317 if (last_c == -1)
318 // Unexpected at this point
319 {
320 free(cclass);
321 *rc = EREE_UNEX_RANGE;
322 return NULL;
323 }
324 str = cclass_nextch(str);
325 if ((c = cclass_thisch(str)) == CHL_EOS || c == CHL_END_CCLASS)
326 // Not followed by anything
327 {
328 free(cclass);
329 *rc = EREE_UNF_RANGE;
330 return NULL;
331 }
332 for (i = last_c + 1; i <= c; i++)
333 add_to_cclass(i, cclass);
334 last_c = c;
335 break;
336
337#pragma info(nogen) // do not warn here
338
339/*...sCHL_COMP \45\ complement:32: */
340 case CHL_COMP:
341 c = '^';
342 // Fall through
343/*...sdefault \45\ individual entry:32: */
344 default:
345 if (erecf & ERECF_TOLOWER)
346 c = tolower(c);
347 add_to_cclass(c, cclass);
348 last_c = c;
349 break;
350
351#pragma info(restore)
352
353 }
354 str = cclass_nextch(str);
355 }
356
357 if (c == CHL_EOS)
358 {
359 free(cclass);
360 *rc = EREE_UNF_CCLASS;
361 return NULL;
362 }
363
364 if (complement)
365 invert_cclass(cclass);
366
367 remove_from_cclass(0, cclass);
368
369 *str_after = cclass_nextch(str);
370 return cclass;
371}
372
373#define delete_cclass(cclass) free(cclass)
374
375/* A 'match' embodies all of regular expression functionality.
376 * Matches can be defined in terms of each other. */
377
378typedef unsigned char MTYPE;
379
380#define MTYPE_NULL ((MTYPE) 0)
381#define MTYPE_CHAR ((MTYPE) 1)
382#define MTYPE_NCHAR ((MTYPE) 2)
383#define MTYPE_STRING ((MTYPE) 3)
384#define MTYPE_CCLASS ((MTYPE) 4)
385#define MTYPE_WORD ((MTYPE) 5)
386#define MTYPE_NWORD ((MTYPE) 6)
387#define MTYPE_DOT ((MTYPE) 7)
388#define MTYPE_QUERY ((MTYPE) 8)
389#define MTYPE_PLUS ((MTYPE) 9)
390#define MTYPE_STAR ((MTYPE) 10)
391#define MTYPE_CREP ((MTYPE) 11)
392#define MTYPE_OR ((MTYPE) 12)
393#define MTYPE_CAT ((MTYPE) 13)
394#define MTYPE_SUB ((MTYPE) 14)
395#define MTYPE_SOL ((MTYPE) 15)
396#define MTYPE_EOL ((MTYPE) 16)
397#define MTYPE_SOW ((MTYPE) 17)
398#define MTYPE_EOW ((MTYPE) 18)
399#define MTYPE_IW ((MTYPE) 19)
400#define MTYPE_EW ((MTYPE) 20)
401#define MTYPE_BACK ((MTYPE) 21)
402
403typedef struct match_struct MATCH;
404
405struct match_struct
406{
407 MTYPE mtype;
408 union
409 {
410 char character;
411 unsigned char *cclass;
412 MATCH *match;
413 MATCH *matchs[2];
414 int n_span;
415 char *string;
416 struct
417 {
418 unsigned m, n;
419 MATCH *match;
420 }
421 crep;
422 }
423 u;
424};
425
426static MATCH null_match =
427{MTYPE_NULL};
428
429#define NULL_MATCH (&null_match)
430
431static void delete_match(MATCH * match)
432{
433 if (match == NULL_MATCH)
434 return;
435 switch (match->mtype)
436 {
437 case MTYPE_STRING:
438 free(match->u.string);
439 break;
440 case MTYPE_CCLASS:
441 delete_cclass(match->u.cclass);
442 break;
443 case MTYPE_QUERY:
444 case MTYPE_PLUS:
445 case MTYPE_STAR:
446 delete_match(match->u.match);
447 break;
448 case MTYPE_CREP:
449 delete_match(match->u.crep.match);
450 break;
451 case MTYPE_OR:
452 case MTYPE_CAT:
453 delete_match(match->u.matchs[0]);
454 delete_match(match->u.matchs[1]);
455 break;
456 case MTYPE_SUB:
457 delete_match(match->u.match);
458 break;
459 }
460 free(match);
461}
462
463/*
464 *@@ shortest_match:
465 * determines the shortest possible match length for a given match
466 * tree. In the following example it is 3, thus allowing us to only
467 * consider 4 positions for regular expression matching in a 6
468 * character line.
469 *
470 + aa*bc
471 +
472 + 123456
473 + ---
474 + ---
475 + ---
476 + ---
477 */
478
479static unsigned shortest_match(const MATCH * match)
480{
481 unsigned a, b;
482
483 switch (match->mtype)
484 {
485 case MTYPE_NULL:
486 return 0;
487 case MTYPE_CHAR:
488 return 1;
489 case MTYPE_NCHAR:
490 return 1;
491 case MTYPE_STRING:
492 return (unsigned char)match->u.string[0];
493 case MTYPE_CCLASS:
494 return 1;
495 case MTYPE_WORD:
496 return 1;
497 case MTYPE_NWORD:
498 return 1;
499 case MTYPE_DOT:
500 return 1;
501 case MTYPE_QUERY:
502 return 0;
503 case MTYPE_PLUS:
504 return shortest_match(match->u.match);
505 case MTYPE_STAR:
506 return 0;
507 case MTYPE_CREP:
508 return match->u.crep.m *
509 shortest_match(match->u.crep.match);
510 case MTYPE_OR:
511 a = shortest_match(match->u.matchs[0]);
512 b = shortest_match(match->u.matchs[1]);
513 return a <= b ? a : b;
514 case MTYPE_CAT:
515 return shortest_match(match->u.matchs[0]) +
516 shortest_match(match->u.matchs[1]);
517 case MTYPE_SUB:
518 return shortest_match(match->u.match);
519 case MTYPE_SOL:
520 return 0;
521 case MTYPE_EOL:
522 return 0;
523 case MTYPE_SOW:
524 return 0;
525 case MTYPE_EOW:
526 return 0;
527 case MTYPE_IW:
528 return 0;
529 case MTYPE_EW:
530 return 0;
531 case MTYPE_BACK:
532 return 0;
533 }
534 return 0; // Should never happen
535}
536
537static BOOLEAN got_backrefs(MATCH * match)
538{
539 switch (match->mtype)
540 {
541 case MTYPE_QUERY:
542 case MTYPE_PLUS:
543 case MTYPE_STAR:
544 return got_backrefs(match->u.match);
545 case MTYPE_CREP:
546 return got_backrefs(match->u.crep.match);
547 case MTYPE_OR:
548 case MTYPE_CAT:
549 return got_backrefs(match->u.matchs[0]) ||
550 got_backrefs(match->u.matchs[1]);
551 case MTYPE_BACK:
552 return TRUE;
553 }
554 return FALSE;
555}
556
557static MATCH *remove_subs(MATCH * match)
558{
559 switch (match->mtype)
560 {
561 case MTYPE_QUERY:
562 case MTYPE_PLUS:
563 case MTYPE_STAR:
564 match->u.match = remove_subs(match->u.match);
565 break;
566 case MTYPE_CREP:
567 match->u.crep.match = remove_subs(match->u.crep.match);
568 break;
569 case MTYPE_OR:
570 case MTYPE_CAT:
571 match->u.matchs[0] = remove_subs(match->u.matchs[0]);
572 match->u.matchs[1] = remove_subs(match->u.matchs[1]);
573 break;
574 case MTYPE_SUB:
575 {
576 MATCH *m = match;
577
578 match = match->u.match;
579 free(m);
580 }
581 break;
582 }
583 return match;
584}
585
586static int count_sub(const MATCH * match)
587{
588 switch (match->mtype)
589 {
590 case MTYPE_SUB:
591 return 1 + count_sub(match->u.match);
592 case MTYPE_QUERY:
593 case MTYPE_PLUS:
594 case MTYPE_STAR:
595 return count_sub(match->u.match);
596 case MTYPE_CREP:
597 return count_sub(match->u.crep.match);
598 case MTYPE_OR:
599 case MTYPE_CAT:
600 return count_sub(match->u.matchs[0]) +
601 count_sub(match->u.matchs[1]);
602 }
603 return 0;
604}
605
606/*...scompiling matches:0: */
607/*
608 *
609 * <term> ::= <character> | <class> | ( <match> )
610 * <factor> ::= <term>? | <term>+ | <term>* | <term>
611 * <factors> ::= <factor> { <factor> }
612 * <match> ::= <factors> { |<factors> }
613 *
614 */
615
616/* We have an optimisation here in that if we can see a span of 'boring'
617 * characters, we return them as a single entity. We mustn't do this for the
618 * last in the span, as this may have modifiers applied to it, as in the
619 * example string "abcd*", where the d has the * modifier, but abc do not.
620 * We only do 255 chars at once, as the length is stored in a byte.
621 * Reducing 255 levels of per-character recursive searching to 1 level with
622 * a string compare has got to be a massive saving. */
623
624#define STRING_MAX 255
625
626#define CH_NOT_BASE 1000
627#define CH_NOT_END (1000+0x100)
628#define CH_EOS 0
629#define CH_DOT (-1)
630#define CH_WORD (-2)
631#define CH_NWORD (-3)
632#define CH_LSQR (-4)
633#define CH_RSQR (-5)
634#define CH_LPAR (-6)
635#define CH_RPAR (-7)
636#define CH_LCUR (-8)
637#define CH_RCUR (-9)
638#define CH_QUERY (-10)
639#define CH_PLUS (-11)
640#define CH_STAR (-12)
641#define CH_OR (-13)
642#define CH_SOL (-14)
643#define CH_EOL (-15)
644#define CH_SOW (-16)
645#define CH_EOW (-17)
646#define CH_IW (-18)
647#define CH_EW (-19)
648#define CH_BAD_TILDE (-20)
649#define CH_STRING_BASE (-1000)
650#define CH_STRING_END (-1000+STRING_MAX)
651#define CH_BACK_BASE (-2000)
652#define CH_BACK_END (-2000+9)
653
654static int boring_string(const char *s)
655{
656 int n = 0;
657
658 while (*s &&
659 strchr("\\.[](){}?+*|^$<>~", *s) == NULL
660 && n < STRING_MAX)
661 {
662 s++;
663 n++;
664 }
665 return n;
666}
667
668static int thisch(const char *str)
669{
670 int n;
671
672 switch (str[0])
673 {
674 case '\\':
675 if (str[1] >= '1' && str[1] <= '9')
676 return CH_BACK_BASE + (str[1] - '1');
677 switch (str[1])
678 {
679 case '`':
680 return CH_SOL;
681 case '\'':
682 return CH_EOL;
683 case '<':
684 return CH_SOW;
685 case '>':
686 return CH_EOW;
687 case 'w':
688 return CH_WORD;
689 case 'W':
690 return CH_NWORD;
691 case 'B':
692 return CH_IW;
693 case 'y':
694 return CH_EW;
695 default:
696 return escaped(str + 1);
697 }
698
699 case '~':
700 if (str[1] == '\\')
701 return CH_NOT_BASE + escaped(str + 2);
702 else if (str[1] != '\0')
703 return CH_NOT_BASE + (unsigned char)str[1];
704 else
705 return '~';
706 case '.':
707 return CH_DOT;
708 case '[':
709 return CH_LSQR;
710 case ']':
711 return CH_RSQR;
712 case '(':
713 return CH_LPAR;
714 case ')':
715 return CH_RPAR;
716 case '{':
717 return CH_LCUR;
718 case '}':
719 return CH_RCUR;
720 case '?':
721 return CH_QUERY;
722 case '+':
723 return CH_PLUS;
724 case '*':
725 return CH_STAR;
726 case '|':
727 return CH_OR;
728 case '^':
729 return CH_SOL;
730 case '$':
731 return CH_EOL;
732 }
733
734 n = boring_string(str);
735 if (n < 3)
736 return (unsigned char)str[0];
737 else
738 return CH_STRING_BASE + n - 1;
739}
740
741static const char *nextch(const char *str)
742{
743 int n;
744
745 switch (str[0])
746 {
747 case '\\':
748 if (str[1] >= '1' && str[1] <= '9')
749 return str + 2;
750 switch (str[1])
751 {
752 case '`':
753 case '\'':
754 case '<':
755 case '>':
756 case 'w':
757 case 'W':
758 case 'B':
759 case 'y':
760 return str + 2;
761 default:
762 return past_escaped(str + 1);
763 }
764
765 case '~':
766 if (str[1] == '\\')
767 return past_escaped(str + 2);
768 else if (str[1] != '\0')
769 return str + 2;
770 else
771 return str + 1;
772 case '.':
773 case '[':
774 case '(':
775 case ')':
776 case '{':
777 case '}':
778 case '?':
779 case '+':
780 case '*':
781 case '|':
782 case '^':
783 case '$':
784 return str + 1;
785 }
786
787 n = boring_string(str);
788 if (n < 3)
789 return str + 1;
790 else
791 return str + n - 1;
792}
793
794static MATCH *compile_match(const char *str, const char **str_after, int erecf, int *rc);
795
796static const char *scan_number(const char *str, unsigned *num)
797{
798 if (!isdigit(*str))
799 return NULL;
800 *num = 0;
801 do
802 (*num) = (*num) * 10 + (*str++ - '0');
803 while (isdigit(*str));
804 return str;
805}
806
807static MATCH *create_match(int *rc)
808{
809 MATCH *match;
810
811 if ((match = (MATCH *) malloc(sizeof(MATCH))) == NULL)
812 {
813 *rc = ERROR_NOT_ENOUGH_MEMORY;
814 return NULL;
815 }
816 return match;
817}
818
819static MATCH *compile_term(const char *str, const char **str_after, int erecf, int *rc)
820{
821 MATCH *match;
822 int c;
823
824 c = thisch(str);
825 switch (c)
826 {
827 case CH_RSQR:
828 *rc = EREE_UNEX_RSQR;
829 return NULL;
830 case CH_QUERY:
831 *rc = EREE_UNEX_QUERY;
832 return NULL;
833 case CH_PLUS:
834 *rc = EREE_UNEX_PLUS;
835 return NULL;
836 case CH_STAR:
837 *rc = EREE_UNEX_STAR;
838 return NULL;
839 case CH_LCUR:
840 *rc = EREE_UNEX_LCUR;
841 return NULL;
842 case CH_RCUR:
843 *rc = EREE_UNEX_RCUR;
844 return NULL;
845 }
846
847 if (c == CH_EOS || c == CH_OR || c == CH_RPAR)
848 {
849 *str_after = str;
850 return NULL_MATCH;
851 }
852
853 if ((match = create_match(rc)) == NULL)
854 return NULL;
855
856 if (CH_NOT_BASE <= c && c < CH_NOT_END)
857/*...snot a specific character:16: */
858 {
859 char ch = (char)(c - CH_NOT_BASE);
860
861 if (erecf & ERECF_TOLOWER)
862 ch = (char)tolower(ch);
863 match->mtype = MTYPE_NCHAR;
864 match->u.character = ch;
865 str = nextch(str);
866 }
867 else if (CH_STRING_BASE <= c && c < CH_STRING_END)
868/*...sa string of non\45\special characters:16: */
869 {
870 unsigned len = c - CH_STRING_BASE;
871
872 match->mtype = MTYPE_STRING;
873 match->u.string = (char *)malloc(1 + len);
874 if (match->u.string == NULL)
875 {
876 free(match);
877 *rc = ERROR_NOT_ENOUGH_MEMORY;
878 return NULL;
879 }
880 match->u.string[0] = (char)len;
881 memcpy(match->u.string + 1, str, len);
882 if (erecf & ERECF_TOLOWER)
883 {
884 unsigned i;
885
886 for (i = 1; i <= len; i++)
887 match->u.string[i] = (char)tolower(match->u.string[i]);
888 }
889 str = nextch(str);
890 }
891 else if (CH_BACK_BASE <= c && c < CH_BACK_END)
892/*...sa backreference:16: */
893 {
894 match->mtype = MTYPE_BACK;
895 match->u.n_span = c - CH_BACK_BASE;
896 str = nextch(str);
897 }
898 else
899 switch (c)
900 {
901/*...sCH_LSQR \45\ character class:24: */
902 case CH_LSQR:
903 match->mtype = MTYPE_CCLASS;
904 str = nextch(str);
905 if ((match->u.cclass = compile_cclass(str, &str, erecf, rc)) == NULL)
906 {
907 free(match);
908 return NULL;
909 }
910 break;
911/*...sCH_DOT \45\ any character:24: */
912 case CH_DOT:
913 match->mtype = MTYPE_DOT;
914 str = nextch(str);
915 break;
916/*...sCH_WORD \45\ \92\w:24: */
917 case CH_WORD:
918 match->mtype = MTYPE_WORD;
919 str = nextch(str);
920 break;
921/*...sCH_NWORD \45\ \92\W:24: */
922 case CH_NWORD:
923 match->mtype = MTYPE_NWORD;
924 str = nextch(str);
925 break;
926/*...sCH_LPAR \45\ nested regular expression:24: */
927 case CH_LPAR:
928 {
929 MATCH *sub_match;
930
931 str = nextch(str);
932 if ((sub_match = compile_match(str, &str, erecf, rc)) == NULL)
933 {
934 free(match);
935 return NULL;
936 }
937 if (!got_backrefs(sub_match))
938 sub_match = remove_subs(sub_match);
939 if (thisch(str) != CH_RPAR)
940 {
941 *rc = EREE_UNF_SUB;
942 return NULL;
943 }
944 str = nextch(str);
945 match->mtype = MTYPE_SUB;
946 match->u.match = sub_match;
947 }
948 break;
949/*...sCH_SOL \45\ \94\:24: */
950 case CH_SOL:
951 match->mtype = MTYPE_SOL;
952 str = nextch(str);
953 break;
954/*...sCH_EOL \45\ \36\:24: */
955 case CH_EOL:
956 match->mtype = MTYPE_EOL;
957 str = nextch(str);
958 break;
959/*...sCH_SOW \45\ \92\\60\:24: */
960 case CH_SOW:
961 match->mtype = MTYPE_SOW;
962 str = nextch(str);
963 break;
964/*...sCH_EOW \45\ \92\\62\:24: */
965 case CH_EOW:
966 match->mtype = MTYPE_EOW;
967 str = nextch(str);
968 break;
969/*...sCH_IW \45\ \92\B:24: */
970 case CH_IW:
971 match->mtype = MTYPE_IW;
972 str = nextch(str);
973 break;
974/*...sCH_EW \45\ \92\y:24: */
975 case CH_EW:
976 match->mtype = MTYPE_EW;
977 str = nextch(str);
978 break;
979/*...sdefault \45\ any old character:24: */
980 default:
981 {
982 char ch = (char)c;
983
984 if (erecf & ERECF_TOLOWER)
985 ch = (char)tolower(ch);
986 match->mtype = MTYPE_CHAR;
987 match->u.character = ch;
988 str = nextch(str);
989 }
990 break;
991 }
992
993 *str_after = str;
994 return match;
995}
996
997static MTYPE repeat_type_of(int c)
998{
999 switch (c)
1000 {
1001 case CH_QUERY:
1002 return MTYPE_QUERY;
1003 case CH_PLUS:
1004 return MTYPE_PLUS;
1005 case CH_STAR:
1006 return MTYPE_STAR;
1007 case CH_LCUR:
1008 return MTYPE_CREP;
1009 default:
1010 return (MTYPE) - 1;
1011 }
1012}
1013
1014static MATCH *compile_factor(const char *str, const char **str_after, int erecf, int *rc)
1015{
1016 MATCH *match, *parent;
1017 MTYPE repeat_mtype;
1018
1019 if ((match = compile_term(str, &str, erecf, rc)) == NULL)
1020 return NULL;
1021
1022 while ((repeat_mtype = repeat_type_of(thisch(str))) != (MTYPE) - 1)
1023/*...smatch is to be repeated:16: */
1024 {
1025 if ((parent = create_match(rc)) == NULL)
1026 {
1027 delete_match(match);
1028 return NULL;
1029 }
1030
1031 parent->mtype = repeat_mtype;
1032 str = nextch(str);
1033 if (repeat_mtype == MTYPE_CREP)
1034 {
1035 parent->u.crep.match = match;
1036 if ((str = scan_number(str, &(parent->u.crep.m))) == NULL)
1037 {
1038 delete_match(match);
1039 free(parent);
1040 *rc = EREE_BAD_CREP_M;
1041 return NULL;
1042 }
1043 parent->u.crep.n = parent->u.crep.m;
1044 if (*str == ',')
1045 {
1046 ++str;
1047 if (*str != '}')
1048 {
1049 if ((str = scan_number(str, &(parent->u.crep.n))) == NULL)
1050 {
1051 delete_match(match);
1052 free(parent);
1053 *rc = EREE_BAD_CREP_N;
1054 return NULL;
1055 }
1056 }
1057 else
1058 parent->u.crep.n = (unsigned)~0;
1059 }
1060 if (*str != '}')
1061 {
1062 delete_match(match);
1063 free(parent);
1064 *rc = EREE_UNF_CREP;
1065 return NULL;
1066 }
1067 ++str;
1068 if (parent->u.crep.m > parent->u.crep.n)
1069 {
1070 delete_match(match);
1071 free(parent);
1072 *rc = EREE_BAD_CREP;
1073 return NULL;
1074 }
1075 }
1076 else
1077 parent->u.match = match;
1078 match = parent;
1079 }
1080
1081 *str_after = str;
1082
1083 return match;
1084}
1085
1086static MATCH *compile_factors(const char *str, const char **str_after, int erecf, int *rc)
1087{
1088 MATCH *match;
1089 int c;
1090
1091 if ((match = compile_factor(str, &str, erecf, rc)) == NULL)
1092 return NULL;
1093
1094 while ((c = thisch(str)) != CH_EOS && c != CH_RPAR && c != CH_OR)
1095/*...sconsider catenation of more factors:16: */
1096 {
1097 MATCH *sibling, *parent;
1098
1099 if ((sibling = compile_factor(str, &str, erecf, rc)) == NULL)
1100 {
1101 delete_match(match);
1102 return NULL;
1103 }
1104
1105 if ((parent = create_match(rc)) == NULL)
1106 {
1107 delete_match(sibling);
1108 delete_match(match);
1109 return NULL;
1110 }
1111 parent->mtype = MTYPE_CAT;
1112 parent->u.matchs[0] = match;
1113 parent->u.matchs[1] = sibling;
1114 match = parent;
1115 }
1116
1117 *str_after = str;
1118 return match;
1119}
1120
1121/*...scompile_match \45\ factors\124\factors:0: */
1122static MATCH *compile_match(const char *str, const char **str_after, int erecf, int *rc)
1123{
1124 MATCH *match;
1125
1126 if ((match = compile_factors(str, &str, erecf, rc)) == NULL)
1127 return NULL;
1128
1129 while (thisch(str) == CH_OR)
1130/*...sfind sibling and or it in:16: */
1131 {
1132 MATCH *sibling, *parent;
1133
1134 str = nextch(str);
1135 if ((sibling = compile_factors(str, &str, erecf, rc)) == NULL)
1136 {
1137 delete_match(match);
1138 return NULL;
1139 }
1140 if ((parent = create_match(rc)) == NULL)
1141 {
1142 delete_match(sibling);
1143 delete_match(match);
1144 return NULL;
1145 }
1146 parent->mtype = MTYPE_OR;
1147 parent->u.matchs[0] = match;
1148 parent->u.matchs[1] = sibling;
1149 match = parent;
1150 }
1151
1152 *str_after = str;
1153
1154 return match;
1155}
1156
1157#ifdef DEBUG
1158/*...sprint_tree:0: */
1159/*...sdo_indent:0: */
1160static void do_indent(int indent)
1161{
1162 while (indent--)
1163 putchar('\t');
1164}
1165
1166static void print_tree(const MATCH * match, int indent)
1167{
1168 do_indent(indent);
1169 switch (match->mtype)
1170 {
1171 case MTYPE_NULL:
1172 printf("null\n");
1173 break;
1174 case MTYPE_CHAR:
1175 printf("%c\n", match->u.character);
1176 break;
1177 case MTYPE_NCHAR:
1178 printf("~%c\n", match->u.character);
1179 break;
1180 case MTYPE_STRING:
1181 printf("%*.*s\n",
1182 (unsigned char)match->u.string[0],
1183 (unsigned char)match->u.string[0],
1184 match->u.string + 1);
1185 break;
1186 case MTYPE_CCLASS:
1187 {
1188 int i;
1189
1190 printf("[");
1191 for (i = 0; i < 0x100; i++)
1192 if (match_cclass(i, match->u.cclass))
1193 printf("%c", i);
1194 printf("]\n");
1195 }
1196 break;
1197 case MTYPE_DOT:
1198 printf(".\n");
1199 break;
1200 case MTYPE_WORD:
1201 printf("\\w\n");
1202 break;
1203 case MTYPE_NWORD:
1204 printf("\\W\n");
1205 break;
1206 case MTYPE_QUERY:
1207 printf("?\n");
1208 print_tree(match->u.match, indent + 1);
1209 break;
1210 case MTYPE_PLUS:
1211 printf("+\n");
1212 print_tree(match->u.match, indent + 1);
1213 break;
1214 case MTYPE_STAR:
1215 printf("*\n");
1216 print_tree(match->u.match, indent + 1);
1217 break;
1218 case MTYPE_CREP:
1219 printf("{%u,%u}\n",
1220 match->u.crep.m,
1221 match->u.crep.n);
1222 print_tree(match->u.crep.match, indent + 1);
1223 break;
1224 case MTYPE_OR:
1225 printf("|\n");
1226 print_tree(match->u.matchs[0], indent + 1);
1227 print_tree(match->u.matchs[1], indent + 1);
1228 break;
1229 case MTYPE_CAT:
1230 printf("CAT\n");
1231 print_tree(match->u.matchs[0], indent + 1);
1232 print_tree(match->u.matchs[1], indent + 1);
1233 break;
1234 case MTYPE_SUB:
1235 printf("SUB\n");
1236 print_tree(match->u.match, indent + 1);
1237 break;
1238 case MTYPE_SOL:
1239 printf("^\n");
1240 break;
1241 case MTYPE_EOL:
1242 printf("$\n");
1243 break;
1244 case MTYPE_SOW:
1245 printf("\\<\n");
1246 break;
1247 case MTYPE_EOW:
1248 printf("\\>\n");
1249 break;
1250 case MTYPE_IW:
1251 printf("\\B\n");
1252 break;
1253 case MTYPE_EW:
1254 printf("\\y\n");
1255 break;
1256 case MTYPE_BACK:
1257 printf("\\%d\n", match->u.n_span + 1);
1258 break;
1259 }
1260}
1261#endif
1262
1263/*...sfinite state machine:0: */
1264typedef unsigned char ETYPE; // Edge type
1265
1266#define ETYPE_CHAR ((ETYPE) 0) // Can advance if given character
1267#define ETYPE_NCHAR ((ETYPE) 1) // Can advance if not given char.
1268#define ETYPE_STRING ((ETYPE) 2) // Can advance if strings match
1269#define ETYPE_DOT ((ETYPE) 3) // Can advance if any character
1270#define ETYPE_CCLASS ((ETYPE) 4) // Can advance if any in class
1271#define ETYPE_WORD ((ETYPE) 5) // Can advance if word constituent
1272#define ETYPE_NWORD ((ETYPE) 6) // Can advance if not word constit.
1273#define ETYPE_EPSILON ((ETYPE) 7) // Can advance without reading input
1274#define ETYPE_SOL ((ETYPE) 8) // Matches if at start of line
1275#define ETYPE_EOL ((ETYPE) 9) // Matches if at end of line
1276#define ETYPE_SOW ((ETYPE) 10) // Matches if at start of word
1277#define ETYPE_EOW ((ETYPE) 11) // Matches if at end of word
1278#define ETYPE_IW ((ETYPE) 12) // Matches if within word
1279#define ETYPE_EW ((ETYPE) 13) // Matches if at word start or end
1280#define ETYPE_SSUB ((ETYPE) 14) // Records passage over (
1281#define ETYPE_ESUB ((ETYPE) 15) // Records passage over )
1282#define ETYPE_BACK ((ETYPE) 16) // Back reference
1283
1284typedef struct
1285{
1286 ETYPE etype; // Edge type, (an ETYPE_ no)
1287 union
1288 {
1289 char character; // Character to use if ETYPE_CHAR
1290 unsigned char *cclass; // Class to use if ETYPE_CCLASS
1291 char *string; // len, then chars if ETYPE_STRING
1292 int n_span; // Used if ETYPE_BACK
1293 }
1294 u;
1295 const char *gate; // Used if type of epsilon move
1296 int to_state; // State to go to if test succeeds
1297 int next_edge; // Next test to try after this
1298}
1299EDGE;
1300
1301/* Under DOS, restrict FSM size to 13KB approx.. On other environments memory
1302 * isn't such an issue, so allow something closer to 32KB. Of course, a better
1303 * implementation would dynamically grow the FSM size as needed. Something for
1304 * the future perhaps... */
1305
1306#ifdef xxDOS
1307#define MAX_STATES 200
1308#define MAX_EDGES 600
1309#else
1310#define MAX_STATES 500
1311#define MAX_EDGES 1500
1312#endif
1313
1314#define FLAG_FINISH 0x01
1315#define FLAG_VISITED 0x02
1316#define FLAG_REACHABLE 0x04
1317
1318typedef struct
1319{
1320 int n_states;
1321 int state_first_edges[MAX_STATES];
1322 unsigned char state_flags[MAX_STATES];
1323 int n_edges;
1324 EDGE edges[MAX_EDGES];
1325}
1326FSM;
1327
1328/*...screate_fsm:0: */
1329static FSM *create_fsm(int *rc)
1330{
1331 FSM *fsm;
1332 int i;
1333
1334 if ((fsm = (FSM *) malloc(sizeof(FSM))) == NULL)
1335 {
1336 *rc = ERROR_NOT_ENOUGH_MEMORY;
1337 return NULL;
1338 }
1339
1340 fsm->n_states = 0;
1341 fsm->n_edges = 0;
1342
1343 for (i = 0; i < MAX_STATES; i++)
1344 {
1345 fsm->state_first_edges[i] = -1;
1346 fsm->state_flags[i] = 0;
1347 }
1348
1349 return fsm;
1350}
1351/*...sdelete_fsm:0: */
1352#define delete_fsm(fsm) free(fsm)
1353
1354/*...smalloc_state:0: */
1355static int malloc_state(FSM * fsm)
1356{
1357 if (fsm->n_states == MAX_STATES)
1358 return -1;
1359 else
1360 return fsm->n_states++;
1361}
1362/*...smalloc_edge:0: */
1363/* Allocate space for new supplied edge and fill it in.
1364 * If already exists then don't bother (duplicates waste search time).
1365 * Return TRUE if all went ok. */
1366
1367static BOOLEAN malloc_edge(int s, EDGE * edge, FSM * fsm)
1368{
1369 int edge_no, n_edges = fsm->n_edges++;
1370
1371 // See if edge already exists
1372
1373 for (edge_no = fsm->state_first_edges[s];
1374 edge_no != -1;
1375 edge_no = fsm->edges[edge_no].next_edge)
1376 // Do we already have this edge
1377 if (fsm->edges[edge_no].to_state == edge->to_state &&
1378 fsm->edges[edge_no].etype == edge->etype)
1379 // An edge of this type already exists
1380 switch (edge->etype)
1381 {
1382/*...sETYPE_CHAR\47\NCHAR:32: */
1383 case ETYPE_CHAR:
1384 case ETYPE_NCHAR:
1385 if (edge->u.character == fsm->edges[edge_no].u.character)
1386 return TRUE;
1387 break;
1388/*...sETYPE_STRING:32: */
1389 case ETYPE_STRING:
1390 if ((unsigned char)fsm->edges[edge_no].u.string[0] ==
1391 (unsigned char)edge->u.string[0] &&
1392 !memcmp(fsm->edges[edge_no].u.string + 1,
1393 edge->u.string + 1,
1394 (unsigned char)edge->u.string[0]))
1395 return TRUE;
1396 break;
1397/*...sETYPE_CCLASS:32: */
1398 case ETYPE_CCLASS:
1399 if (edge->u.cclass == fsm->edges[edge_no].u.cclass)
1400 return TRUE;
1401 break;
1402/*...sETYPE_DOT\47\EPSILON\47\SOL\47\EOL etc\46\\46\:32: */
1403 case ETYPE_DOT:
1404 case ETYPE_EPSILON:
1405 case ETYPE_SOL:
1406 case ETYPE_EOL:
1407 case ETYPE_SOW:
1408 case ETYPE_EOW:
1409 case ETYPE_IW:
1410 case ETYPE_EW:
1411 case ETYPE_SSUB:
1412 case ETYPE_ESUB:
1413 return TRUE;
1414/*...sETYPE_BACK:32: */
1415 case ETYPE_BACK:
1416 if (edge->u.n_span == fsm->edges[edge_no].u.n_span)
1417 return TRUE;
1418 break;
1419 }
1420
1421 // Going to have to add the edge
1422
1423 if (n_edges >= MAX_EDGES)
1424 return FALSE;
1425
1426 memcpy(&(fsm->edges[n_edges]), edge, sizeof(EDGE));
1427 fsm->edges[n_edges].next_edge = fsm->state_first_edges[s];
1428 fsm->state_first_edges[s] = n_edges;
1429 return TRUE;
1430}
1431/*...smake_fsm_from_match:0: */
1432/*...sadd_edge_to_fsm_character:0: */
1433static BOOLEAN add_edge_to_fsm_character(int s, int f, FSM * fsm, char character)
1434{
1435 EDGE edge;
1436
1437 edge.etype = ETYPE_CHAR;
1438 edge.u.character = character;
1439 edge.to_state = f;
1440 return malloc_edge(s, &edge, fsm);
1441}
1442/*...sadd_edge_to_fsm_ncharacter:0: */
1443static BOOLEAN add_edge_to_fsm_ncharacter(int s, int f, FSM * fsm, char character)
1444{
1445 EDGE edge;
1446
1447 edge.etype = ETYPE_NCHAR;
1448 edge.u.character = character;
1449 edge.to_state = f;
1450 return malloc_edge(s, &edge, fsm);
1451}
1452/*...sadd_edge_to_fsm_string:0: */
1453static BOOLEAN add_edge_to_fsm_string(int s, int f, FSM * fsm, char *string)
1454{
1455 EDGE edge;
1456
1457 edge.etype = ETYPE_STRING;
1458 edge.u.string = string;
1459 edge.to_state = f;
1460 return malloc_edge(s, &edge, fsm);
1461}
1462/*...sadd_edge_to_fsm_cclass:0: */
1463static BOOLEAN add_edge_to_fsm_cclass(int s, int f, FSM * fsm, unsigned char *cclass)
1464{
1465 EDGE edge;
1466
1467 edge.etype = ETYPE_CCLASS;
1468 edge.u.cclass = cclass;
1469 edge.to_state = f;
1470 return malloc_edge(s, &edge, fsm);
1471}
1472/*...sadd_edge_to_fsm_dot:0: */
1473static BOOLEAN add_edge_to_fsm_dot(int s, int f, FSM * fsm)
1474{
1475 EDGE edge;
1476
1477 edge.etype = ETYPE_DOT;
1478 edge.to_state = f;
1479 return malloc_edge(s, &edge, fsm);
1480}
1481/*...sadd_edge_to_fsm_word:0: */
1482static BOOLEAN add_edge_to_fsm_word(int s, int f, FSM * fsm)
1483{
1484 EDGE edge;
1485
1486 edge.etype = ETYPE_WORD;
1487 edge.to_state = f;
1488 return malloc_edge(s, &edge, fsm);
1489}
1490/*...sadd_edge_to_fsm_nword:0: */
1491static BOOLEAN add_edge_to_fsm_nword(int s, int f, FSM * fsm)
1492{
1493 EDGE edge;
1494
1495 edge.etype = ETYPE_NWORD;
1496 edge.to_state = f;
1497 return malloc_edge(s, &edge, fsm);
1498}
1499/*...sadd_edge_to_fsm_epsilon:0: */
1500static BOOLEAN add_edge_to_fsm_epsilon(int s, int f, FSM * fsm)
1501{
1502 EDGE edge;
1503
1504 edge.etype = ETYPE_EPSILON;
1505 edge.to_state = f;
1506 return malloc_edge(s, &edge, fsm);
1507}
1508/*...sadd_edge_to_fsm_special:0: */
1509static BOOLEAN add_edge_to_fsm_special(int s, int f, FSM * fsm, ETYPE etype)
1510{
1511 EDGE edge;
1512
1513 edge.etype = etype;
1514 edge.to_state = f;
1515 edge.gate = NULL;
1516 return malloc_edge(s, &edge, fsm);
1517}
1518/*...sadd_edge_to_fsm_back:0: */
1519static BOOLEAN add_edge_to_fsm_back(int s, int f, FSM * fsm, int n_span)
1520{
1521 EDGE edge;
1522
1523 edge.etype = ETYPE_BACK;
1524 edge.to_state = f;
1525 edge.gate = NULL;
1526 edge.u.n_span = n_span;
1527 return malloc_edge(s, &edge, fsm);
1528}
1529
1530static BOOLEAN make_fsm_from_match(MATCH * match, FSM * fsm, int *s, int *f)
1531{
1532 int n1, n2, n3, n4, i;
1533
1534 switch (match->mtype)
1535 {
1536/*...sMTYPE_NULL:16: */
1537/*
1538 * e
1539 * S ----> F
1540 *
1541 */
1542
1543 case MTYPE_NULL:
1544 return (*s = malloc_state(fsm)) != -1 &&
1545 (*f = malloc_state(fsm)) != -1 &&
1546 add_edge_to_fsm_epsilon(*s, *f, fsm);
1547/*...sMTYPE_CHAR:16: */
1548/*
1549 * c
1550 * S ----> F
1551 *
1552 */
1553
1554 case MTYPE_CHAR:
1555 return (*s = malloc_state(fsm)) != -1 &&
1556 (*f = malloc_state(fsm)) != -1 &&
1557 add_edge_to_fsm_character(*s, *f, fsm, match->u.character);
1558/*...sMTYPE_NCHAR:16: */
1559/*
1560 * ~c
1561 * S ----> F
1562 *
1563 */
1564
1565 case MTYPE_NCHAR:
1566 return (*s = malloc_state(fsm)) != -1 &&
1567 (*f = malloc_state(fsm)) != -1 &&
1568 add_edge_to_fsm_ncharacter(*s, *f, fsm, match->u.character);
1569/*...sMTYPE_STRING:16: */
1570/*
1571 * string
1572 * S ----> F
1573 *
1574 */
1575
1576 case MTYPE_STRING:
1577 return (*s = malloc_state(fsm)) != -1 &&
1578 (*f = malloc_state(fsm)) != -1 &&
1579 add_edge_to_fsm_string(*s, *f, fsm, match->u.string);
1580
1581/*...sMTYPE_CCLASS:16: */
1582/*
1583 * cclass
1584 * S ----> F
1585 *
1586 */
1587
1588 case MTYPE_CCLASS:
1589 return (*s = malloc_state(fsm)) != -1 &&
1590 (*f = malloc_state(fsm)) != -1 &&
1591 add_edge_to_fsm_cclass(*s, *f, fsm, match->u.cclass);
1592
1593/*...sMTYPE_DOT:16: */
1594/*
1595 * any
1596 * S ----> F
1597 *
1598 */
1599
1600 case MTYPE_DOT:
1601 return (*s = malloc_state(fsm)) != -1 &&
1602 (*f = malloc_state(fsm)) != -1 &&
1603 add_edge_to_fsm_dot(*s, *f, fsm);
1604
1605/*...sMTYPE_WORD:16: */
1606/*
1607 * word
1608 * S ----> F
1609 *
1610 */
1611
1612 case MTYPE_WORD:
1613 return (*s = malloc_state(fsm)) != -1 &&
1614 (*f = malloc_state(fsm)) != -1 &&
1615 add_edge_to_fsm_word(*s, *f, fsm);
1616
1617/*...sMTYPE_NWORD:16: */
1618/*
1619 * !word
1620 * S ----> F
1621 *
1622 */
1623
1624 case MTYPE_NWORD:
1625 return (*s = malloc_state(fsm)) != -1 &&
1626 (*f = malloc_state(fsm)) != -1 &&
1627 add_edge_to_fsm_nword(*s, *f, fsm);
1628
1629/*...sMTYPE_QUERY:16: */
1630/*
1631 * e
1632 * S ----> F
1633 * | ^
1634 * | e | e
1635 * v |
1636 * [n1 n2]
1637 *
1638 */
1639
1640 case MTYPE_QUERY:
1641 if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1642 return FALSE;
1643 return (*s = malloc_state(fsm)) != -1 &&
1644 (*f = malloc_state(fsm)) != -1 &&
1645 add_edge_to_fsm_epsilon(*s, *f, fsm) &&
1646 add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1647 add_edge_to_fsm_epsilon(n2, *f, fsm);
1648
1649/*...sMTYPE_PLUS:16: */
1650/*
1651 *
1652 * [S F]
1653 * ^ e |
1654 * +---+
1655 *
1656 */
1657
1658 case MTYPE_PLUS:
1659 if (!make_fsm_from_match(match->u.match, fsm, s, f))
1660 return FALSE;
1661 return add_edge_to_fsm_epsilon(*f, *s, fsm);
1662
1663/*...sMTYPE_STAR:16: */
1664/*
1665 *
1666 * +---- S/F
1667 * | ^
1668 * | e | e
1669 * v |
1670 * [n1 n2]
1671 *
1672 */
1673
1674 case MTYPE_STAR:
1675 if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1676 return FALSE;
1677 return (*s = *f = malloc_state(fsm)) != -1 &&
1678 add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1679 add_edge_to_fsm_epsilon(n2, *f, fsm);
1680
1681/*...sMTYPE_CREP:16: */
1682/*
1683 * e e e e
1684 * S-->[n1 n2]-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,4}
1685 * | | |
1686 * | e | e | e
1687 * v | |
1688 * F<---------+----------+
1689 *
1690 * e
1691 * +----+
1692 * | |
1693 * e e e v |
1694 * S-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,}
1695 * | |
1696 * | e | e
1697 * v |
1698 * F<---------+
1699 */
1700
1701 case MTYPE_CREP:
1702 if ((*s = malloc_state(fsm)) == -1 ||
1703 (*f = malloc_state(fsm)) == -1)
1704 return FALSE;
1705 n3 = *s;
1706 for (i = 0; i < (int)match->u.crep.m; i++)
1707 {
1708 if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1709 return FALSE;
1710 if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1711 return FALSE;
1712 n3 = n2;
1713 }
1714 if (!add_edge_to_fsm_epsilon(n3, *f, fsm))
1715 return FALSE;
1716 if (match->u.crep.n != ~0)
1717 for (; i < (int)match->u.crep.n; i++)
1718 {
1719 if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1720 return FALSE;
1721 if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1722 return FALSE;
1723 if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1724 return FALSE;
1725 n3 = n2;
1726 }
1727 else
1728 {
1729 if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1730 return FALSE;
1731 if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1732 return FALSE;
1733 if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1734 return FALSE;
1735 if (!add_edge_to_fsm_epsilon(n2, n1, fsm))
1736 return FALSE;
1737 }
1738 return TRUE;
1739
1740/*...sMTYPE_OR:16: */
1741/*
1742 * e e
1743 * +--->[n1 n2]----+
1744 * | v
1745 * S F
1746 * | e e ^
1747 * +--->[n3 n4]----+
1748 *
1749 */
1750
1751 case MTYPE_OR:
1752 if (!make_fsm_from_match(match->u.matchs[0], fsm, &n1, &n2) ||
1753 !make_fsm_from_match(match->u.matchs[1], fsm, &n3, &n4))
1754 return FALSE;
1755 return (*s = malloc_state(fsm)) != -1 &&
1756 (*f = malloc_state(fsm)) != -1 &&
1757 add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1758 add_edge_to_fsm_epsilon(*s, n3, fsm) &&
1759 add_edge_to_fsm_epsilon(n2, *f, fsm) &&
1760 add_edge_to_fsm_epsilon(n4, *f, fsm);
1761
1762/*...sMTYPE_CAT:16: */
1763/*
1764 * e
1765 * [S n1]---->[n2 F]
1766 *
1767 */
1768
1769 case MTYPE_CAT:
1770 if (!make_fsm_from_match(match->u.matchs[0], fsm, s, &n1) ||
1771 !make_fsm_from_match(match->u.matchs[1], fsm, &n2, f))
1772 return FALSE;
1773 return add_edge_to_fsm_epsilon(n1, n2, fsm);
1774
1775/*...sMTYPE_SUB:16: */
1776/*
1777 * ssub esub
1778 * S---->[n1 n2]---->F
1779 *
1780 */
1781
1782 case MTYPE_SUB:
1783 if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1784 return FALSE;
1785 return (*s = malloc_state(fsm)) != -1 &&
1786 (*f = malloc_state(fsm)) != -1 &&
1787 add_edge_to_fsm_special(*s, n1, fsm, ETYPE_SSUB) &&
1788 add_edge_to_fsm_special(n2, *f, fsm, ETYPE_ESUB);
1789
1790/*...sMTYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW:16: */
1791/*
1792 * special
1793 * S ----> F
1794 *
1795 */
1796
1797 case MTYPE_SOL:
1798 return (*s = malloc_state(fsm)) != -1 &&
1799 (*f = malloc_state(fsm)) != -1 &&
1800 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_SOL);
1801 case MTYPE_EOL:
1802 return (*s = malloc_state(fsm)) != -1 &&
1803 (*f = malloc_state(fsm)) != -1 &&
1804 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_EOL);
1805 case MTYPE_SOW:
1806 return (*s = malloc_state(fsm)) != -1 &&
1807 (*f = malloc_state(fsm)) != -1 &&
1808 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_SOW);
1809 case MTYPE_EOW:
1810 return (*s = malloc_state(fsm)) != -1 &&
1811 (*f = malloc_state(fsm)) != -1 &&
1812 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_EOW);
1813 case MTYPE_IW:
1814 return (*s = malloc_state(fsm)) != -1 &&
1815 (*f = malloc_state(fsm)) != -1 &&
1816 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_IW);
1817 case MTYPE_EW:
1818 return (*s = malloc_state(fsm)) != -1 &&
1819 (*f = malloc_state(fsm)) != -1 &&
1820 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_EW);
1821
1822/*...sMTYPE_BACK:16: */
1823 case MTYPE_BACK:
1824 return (*s = malloc_state(fsm)) != -1 &&
1825 (*f = malloc_state(fsm)) != -1 &&
1826 add_edge_to_fsm_back(*s, *f, fsm, match->u.n_span);
1827
1828 }
1829 return FALSE; // Should never happen
1830}
1831
1832/*...sremove_epsilons:0: */
1833/* The problems with epsilon moves are :-
1834 * 1) They can gang up on you in groups to form loops of states that require
1835 * no input to go all the way around them. Any function attempting a
1836 * recursive search of the FSM will recurse forever.
1837 * 2) They can slow the recognition process by as much as a factor of 2.
1838 * eg:
1839 * a e b e c
1840 * O ----> O ----> O ----> O ----> O ----> O Is slow
1841 *
1842 * a b c
1843 * O ----> O ----> O ----> O Is faster
1844 *
1845 */
1846
1847/*...sfinish_states:0: */
1848/*...sis_finish_reachable:0: */
1849/* When we recurse we mark the current state as visited to stop infinite
1850 * recursion on loops of epsilon moves. */
1851
1852static BOOLEAN is_finish_reachable(FSM * fsm, int state_no)
1853{
1854 int edge_no;
1855 BOOLEAN ok;
1856
1857 if (fsm->state_flags[state_no] & FLAG_VISITED)
1858 // Been here already
1859 return FALSE;
1860
1861 if (fsm->state_flags[state_no] & FLAG_FINISH)
1862 // At finish
1863 return TRUE;
1864
1865 for (edge_no = fsm->state_first_edges[state_no];
1866 edge_no != -1;
1867 edge_no = fsm->edges[edge_no].next_edge)
1868 if (fsm->edges[edge_no].etype == ETYPE_EPSILON)
1869 {
1870 fsm->state_flags[state_no] |= FLAG_VISITED;
1871 ok = is_finish_reachable(fsm, fsm->edges[edge_no].to_state);
1872 fsm->state_flags[state_no] &= ~FLAG_VISITED;
1873 if (ok)
1874 return TRUE;
1875 }
1876
1877
1878 return FALSE;
1879}
1880
1881
1882static void finish_states(int f, FSM * fsm, FSM * fsm_without)
1883{
1884 int state_no;
1885
1886 fsm->state_flags[f] = FLAG_FINISH;
1887 fsm_without->state_flags[f] = FLAG_FINISH;
1888 for (state_no = 0; state_no < fsm->n_states; state_no++)
1889 if (is_finish_reachable(fsm, state_no))
1890 fsm_without->state_flags[state_no] = FLAG_FINISH;
1891}
1892
1893/*...sdetermine_reachable:0: */
1894static void determine_reachable(int s, FSM * fsm, FSM * fsm_without)
1895{
1896 int edge_no, to_state;
1897
1898 fsm_without->state_flags[s] |= FLAG_REACHABLE;
1899 for (edge_no = 0; edge_no < fsm->n_edges; edge_no++)
1900 if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1901 {
1902 to_state = fsm->edges[edge_no].to_state;
1903 fsm_without->state_flags[to_state] |= FLAG_REACHABLE;
1904 }
1905}
1906
1907/*...scopy_non_epsilons:0: */
1908static void copy_non_epsilons(FSM * fsm, FSM * fsm_without)
1909{
1910 int state_no, edge_no;
1911
1912 for (state_no = 0; state_no < fsm->n_states; state_no++)
1913 if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1914 for (edge_no = fsm->state_first_edges[state_no];
1915 edge_no != -1;
1916 edge_no = fsm->edges[edge_no].next_edge)
1917 if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1918 malloc_edge(state_no, &(fsm->edges[edge_no]), fsm_without);
1919}
1920
1921/*...sfollow_epsilons:0: */
1922/*...scopy_edges_reachable:0: */
1923/* What this says is :-
1924 * If state A can reach state B, by an epsilon move, and
1925 * state B can reach state C then
1926 * state A can reach state C *
1927 *
1928 * If the state B to state C involves an epsilon move then
1929 * A can reach whatever C can reach too (by recursion)
1930 */
1931
1932static BOOLEAN copy_edges_reachable(
1933 FSM * fsm,
1934 FSM * fsm_without,
1935 int state_no_to, /* AK: Bad identifier, might better be
1936 * described as 'original reachable state' */
1937 int state_no_from
1938)
1939{
1940 int edge_no;
1941 BOOLEAN ok;
1942
1943 if (fsm->state_flags[state_no_from] & FLAG_VISITED)
1944 // Been here already, therefore all copied from here ok
1945 return TRUE;
1946
1947 for (edge_no = fsm->state_first_edges[state_no_from];
1948 edge_no != -1;
1949 edge_no = fsm->edges[edge_no].next_edge)
1950 if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1951 // Had better add this edge
1952/*...sadd this edge to the \39\to\39\ state:24: */
1953 {
1954 if (!malloc_edge(state_no_to, &(fsm->edges[edge_no]), fsm_without))
1955 return (FALSE);
1956 }
1957
1958 else
1959 {
1960 fsm->state_flags[state_no_from] |= FLAG_VISITED;
1961 ok = copy_edges_reachable(fsm, fsm_without, state_no_to, fsm->edges[edge_no].to_state);
1962 fsm->state_flags[state_no_from] &= ~FLAG_VISITED;
1963 if (!ok)
1964 return FALSE;
1965 }
1966
1967 return TRUE;
1968}
1969
1970
1971static BOOLEAN follow_epsilons(FSM * fsm, FSM * fsm_without)
1972{
1973 int state_no;
1974
1975 for (state_no = 0; state_no < fsm->n_states; state_no++)
1976 if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1977 if (!copy_edges_reachable(fsm, fsm_without, state_no, state_no))
1978 return FALSE;
1979 return TRUE;
1980}
1981
1982
1983static BOOLEAN remove_epsilons(int s, int f, FSM * fsm, FSM * fsm_without)
1984{
1985 // FSM with no epsilon moves will have the same number of states
1986
1987 fsm_without->n_states = fsm->n_states;
1988
1989 // Mark state f as a finish state in the new FSM
1990 // Any state with epsilon move(s) to state f is also a finish state
1991
1992 finish_states(f, fsm, fsm_without);
1993
1994 // Determine which states can be reached by non-epsilon moves. Add
1995 // to this set, the start state. The resulting states should have
1996 // their edges considered, but the other will not be reachable. This
1997 // is because they will be bypassed by follow_epsilons().
1998
1999 determine_reachable(s, fsm, fsm_without);
2000
2001 // Copy across all reachable, non epsilon moves to new FSM
2002
2003 copy_non_epsilons(fsm, fsm_without);
2004
2005 // For all states, determine all other states that can be reached by
2006 // epsilon moves and add the edges leading from them to us
2007
2008 return follow_epsilons(fsm, fsm_without);
2009}
2010
2011/*...smatch_fsm:0: */
2012/* Stack requirements per call, (one call per character in string!).
2013 *
2014 * eg: 16 bit OS/2, large model :-
2015 * Return address + Stack frame + SI and DI + Arguments + Locals
2016 * 2 + 2 + 2+2 + 4+2+4+4 + 2 = 24
2017 *
2018 * eg: 16 bit DOS large model :-
2019 * Return address + Stack frame + SI and DI + Arguments + Locals
2020 * 2 + 2 + 2+2 + 4+2 + 2 = 16
2021 *
2022 * eg: RS/6000 AIX :-
2023 * Massive stack frame per call, massive program stack size
2024 * net effect - room for approx 200 levels before core dump!
2025 *
2026 * In addition, we can consume additional stack for every ETYPE_SSUB
2027 * (and ETYPE_ESUB). Guessing this requirement to be approx 100 bytes,
2028 * we place an arbitrary limit of 20 (ie: 2KB) on subexpressions in an ERE.
2029 * This limit was enforced earlier during match tree parsing.
2030 *
2031 */
2032
2033typedef struct substruct SUBS;
2034struct substruct
2035{
2036 int n_spans;
2037 ERE_SPAN spans[MAX_SPANS];
2038 SUBS *next;
2039};
2040
2041#define MAX_SUBS 20
2042
2043typedef struct
2044{
2045 FSM *fsm;
2046 int eremf;
2047 const char *str_init;
2048 const char *str_limit;
2049 const char *str_best;
2050 SUBS *subs;
2051 SUBS *subs_base;
2052 ERE_MATCHINFO *mi;
2053}
2054CONTEXT;
2055
2056#define NR
2057
2058static void NR walk_fsm(const char *str, int state_no, CONTEXT * cx);
2059
2060static void NR walk_fsm_gated(const char *str, CONTEXT * cx, EDGE * e)
2061{
2062 if (e->gate != str)
2063 // Avoid looping via this edge
2064 {
2065 const char *gate = e->gate;
2066
2067 e->gate = str;
2068 walk_fsm(str, e->to_state, cx);
2069 e->gate = gate;
2070 }
2071}
2072
2073static void NR walk_fsm_ssub(const char *str, CONTEXT * cx, EDGE * e)
2074{
2075 SUBS subs;
2076
2077 if (cx->subs->n_spans < MAX_SPANS)
2078 cx->subs->spans[cx->subs->n_spans].pos = str - cx->str_init;
2079
2080 subs.n_spans = 0;
2081 subs.next = cx->subs;
2082 cx->subs = &subs;
2083 walk_fsm_gated(str, cx, e);
2084 cx->subs = subs.next;
2085}
2086
2087static void NR walk_fsm_esub(const char *str, CONTEXT * cx, EDGE * e)
2088{
2089 SUBS *subs = cx->subs;
2090
2091 cx->subs = cx->subs->next;
2092
2093 if (cx->subs->n_spans < MAX_SPANS)
2094 cx->subs->spans[cx->subs->n_spans].len =
2095 (str - cx->str_init) - cx->subs->spans[cx->subs->n_spans].pos;
2096 ++(cx->subs->n_spans);
2097
2098 walk_fsm_gated(str, cx, e);
2099
2100 --(cx->subs->n_spans);
2101 cx->subs = subs;
2102}
2103
2104static void NR walk_fsm(const char *str, int state_no, CONTEXT * cx)
2105{
2106 int edge_no;
2107
2108 if (cx->fsm->state_flags[state_no] & FLAG_FINISH)
2109 // Got to finishing state, may have got a better match than before
2110 {
2111 if ((cx->str_best == NULL ||
2112 (cx->str_best < str) == (cx->eremf & EREMF_SHORTEST) == 0) &&
2113 str <= cx->str_limit)
2114 {
2115 cx->str_best = str;
2116 if (cx->mi != NULL)
2117 {
2118 int i;
2119
2120 cx->mi->n_spans = cx->subs_base->n_spans;
2121 for (i = 0; i < cx->mi->n_spans; i++)
2122 cx->mi->spans[i] = cx->subs_base->spans[i];
2123 }
2124 }
2125 if (cx->eremf & EREMF_ANY)
2126 return;
2127 // Continue, as may be able to get a better match
2128 }
2129
2130 for (edge_no = cx->fsm->state_first_edges[state_no];
2131 edge_no != -1;
2132 edge_no = cx->fsm->edges[edge_no].next_edge)
2133 // Consider taking a step along an edge to a new state
2134 {
2135 EDGE *e = &(cx->fsm->edges[edge_no]);
2136
2137 switch (e->etype)
2138 {
2139/*...sETYPE_CHAR \45\ if matches character\44\ we can advance:24: */
2140 case ETYPE_CHAR:
2141 if (*str == e->u.character)
2142 walk_fsm(str + 1, e->to_state, cx);
2143 break;
2144
2145/*...sETYPE_NCHAR \45\ if not matches character\44\ we can advance:24: */
2146 case ETYPE_NCHAR:
2147 if (*str != '\0' && *str != e->u.character)
2148 walk_fsm(str + 1, e->to_state, cx);
2149 break;
2150
2151/*...sETYPE_STRING \45\ if matches string\44\ we can advance:24: */
2152 case ETYPE_STRING:
2153 {
2154 unsigned len = (unsigned char)e->u.string[0];
2155
2156 if (!memcmp(str, e->u.string + 1, len))
2157 walk_fsm(str + len, e->to_state, cx);
2158 }
2159 break;
2160
2161/*...sETYPE_DOT \45\ if got any character\44\ we can advance:24: */
2162 case ETYPE_DOT:
2163 if (*str != '\0')
2164 walk_fsm(str + 1, e->to_state, cx);
2165 break;
2166
2167/*...sETYPE_WORD \45\ if got word constituent\44\ we can advance:24: */
2168 case ETYPE_WORD:
2169 if (*str != '\0' && isword(*str))
2170 walk_fsm(str + 1, e->to_state, cx);
2171 break;
2172
2173/*...sETYPE_NWORD \45\ if got non word constituent\44\ we can advance:24: */
2174 case ETYPE_NWORD:
2175 if (*str != '\0' && !isword(*str))
2176 walk_fsm(str + 1, e->to_state, cx);
2177 break;
2178
2179/*...sETYPE_CCLASS \45\ if in the class\44\ we can advance:24: */
2180 case ETYPE_CCLASS:
2181 if (match_cclass(*str, e->u.cclass))
2182 walk_fsm(str + 1, e->to_state, cx);
2183 break;
2184
2185/*...sETYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW \45\ special epsilon moves:24: */
2186 case ETYPE_SOL:
2187 if (str == cx->str_init)
2188 walk_fsm_gated(str, cx, e);
2189 break;
2190 case ETYPE_EOL:
2191 if (*str == '\0')
2192 walk_fsm_gated(str, cx, e);
2193 break;
2194 case ETYPE_SOW:
2195 if (isword(str[0]) &&
2196 ((cx->str_init < str && !isword(str[-1])) ||
2197 (cx->str_init == str)))
2198 walk_fsm_gated(str, cx, e);
2199 break;
2200 case ETYPE_EOW:
2201 if ((cx->str_init < str && isword(str[-1])) &&
2202 (str[0] == '\0' || !isword(str[0])))
2203 walk_fsm_gated(str, cx, e);
2204 break;
2205 case ETYPE_IW:
2206 if (cx->str_init < str && isword(str[-1]) &&
2207 str[0] != '\0' && isword(str[0]))
2208 walk_fsm_gated(str, cx, e);
2209 break;
2210 case ETYPE_EW:
2211 if (isword(str[0]) &&
2212 ((cx->str_init < str && !isword(str[-1])) ||
2213 (cx->str_init == str)))
2214 walk_fsm_gated(str, cx, e);
2215 else if ((cx->str_init < str && isword(str[-1])) &&
2216 (str[0] == '\0' || !isword(str[0])))
2217 walk_fsm_gated(str, cx, e);
2218 break;
2219
2220/*...sETYPE_SSUB\47\ESUB \45\ handle nested subexpression:24: */
2221 case ETYPE_SSUB:
2222 walk_fsm_ssub(str, cx, e);
2223 break;
2224 case ETYPE_ESUB:
2225 walk_fsm_esub(str, cx, e);
2226 break;
2227
2228/*...sETYPE_BACK \45\ check backreference:24: */
2229 case ETYPE_BACK:
2230 if (e->u.n_span < cx->subs->n_spans)
2231 {
2232 int len = cx->subs->spans[e->u.n_span].len;
2233
2234 if (!memcmp(str, cx->str_init + cx->subs->spans[e->u.n_span].pos, len))
2235 walk_fsm_gated(str + len, cx, e);
2236 }
2237 break;
2238
2239 }
2240 }
2241}
2242
2243static const char *match_fsm(FSM * fsm,
2244 int eremf,
2245 const char *str,
2246 int posn,
2247 int limit,
2248 int state_no,
2249 ERE_MATCHINFO * mi)
2250{
2251 CONTEXT cx;
2252 SUBS subs;
2253
2254 cx.fsm = fsm;
2255 cx.eremf = eremf;
2256 cx.str_init = str;
2257 cx.str_limit = str + limit;
2258 cx.str_best = NULL;
2259 cx.subs = &subs;
2260 cx.subs_base = &subs;
2261 cx.mi = mi;
2262 subs.n_spans = 0;
2263 walk_fsm(str + posn, state_no, &cx);
2264 return cx.str_best;
2265}
2266
2267
2268#ifdef DEBUG
2269/*...sprint_fsm:0: */
2270static void print_fsm(FSM * fsm, int s, BOOLEAN not_just_reachable)
2271{
2272 int state_no, edge_no;
2273
2274 printf("Starting state %02d\n", s);
2275 for (state_no = 0; state_no < fsm->n_states; state_no++)
2276 if ((fsm->state_flags[state_no] & FLAG_REACHABLE) != 0 ||
2277 not_just_reachable)
2278 {
2279 printf("%02d:%c\t", state_no, ((fsm->state_flags[state_no] & FLAG_FINISH) != 0) ? 'F' : ' ');
2280 for (edge_no = fsm->state_first_edges[state_no];
2281 edge_no != -1;
2282 edge_no = fsm->edges[edge_no].next_edge)
2283/*...sshow edge:32: */
2284 {
2285 EDGE *e = &(fsm->edges[edge_no]);
2286
2287 switch (e->etype)
2288 {
2289 case ETYPE_CHAR:
2290 printf("%c", e->u.character);
2291 break;
2292 case ETYPE_NCHAR:
2293 printf("~%c", e->u.character);
2294 break;
2295 case ETYPE_STRING:
2296 printf("%*.*s",
2297 (unsigned char)e->u.string[0],
2298 (unsigned char)e->u.string[0],
2299 e->u.string + 1);
2300 break;
2301 case ETYPE_DOT:
2302 printf(".");
2303 break;
2304 case ETYPE_CCLASS:
2305 printf("[");
2306 break;
2307 case ETYPE_WORD:
2308 printf("\\w");
2309 break;
2310 case ETYPE_NWORD:
2311 printf("\\W");
2312 break;
2313 case ETYPE_EPSILON:
2314 printf("e");
2315 break;
2316 case ETYPE_SOL:
2317 printf("^");
2318 break;
2319 case ETYPE_EOL:
2320 printf("$");
2321 break;
2322 case ETYPE_SOW:
2323 printf("\\<");
2324 break;
2325 case ETYPE_EOW:
2326 printf("\\>");
2327 break;
2328 case ETYPE_IW:
2329 printf("\\B");
2330 break;
2331 case ETYPE_EW:
2332 printf("\\y");
2333 break;
2334 case ETYPE_SSUB:
2335 printf("(");
2336 break;
2337 case ETYPE_ESUB:
2338 printf(")");
2339 break;
2340 case ETYPE_BACK:
2341 printf("\\%d", e->u.n_span + 1);
2342 break;
2343 }
2344 printf("->%02d\t", e->to_state);
2345 }
2346
2347 printf("\n");
2348 }
2349}
2350
2351#endif
2352
2353/*...sextended regular expressions:0: */
2354/* An ERE knows its original match tree and also the FSM it is compiled into.
2355 * Using a (largely epsilon move free) FSM makes for faster searching. */
2356
2357typedef struct
2358{
2359 MATCH *match; // Parse tree for expression
2360 int shortest_match; // Shortest match possible
2361 FSM *fsm; // Compiled FSM
2362 int s; // Start state for FSM
2363}
2364ERE;
2365
2366/*
2367 *@@ rxpCompile:
2368 * compiles the regular expression str for later matching.
2369 *
2370 * If ERECF_TOLOWER is passed with erecf, every character
2371 * (or range of characters) to be matched are stored in the
2372 * compiled ERE in lower case. Therefore, if strings to be
2373 * matched are passed in lower case also, the result is a
2374 * case-insensitive match.
2375 */
2376
2377ERE* rxpCompile(const char *str,
2378 int erecf,
2379 int *rc) // out: error code
2380{
2381 ERE *ere;
2382 const char *str_after;
2383 FSM *fsm;
2384 int s, f;
2385
2386 *rc = NO_ERROR;
2387
2388 if ((ere = (ERE *) malloc(sizeof(ERE))) == NULL)
2389 {
2390 *rc = ERROR_NOT_ENOUGH_MEMORY;
2391 return NULL;
2392 }
2393
2394 if ((ere->match = compile_match(str, &str_after, erecf, rc)) == NULL)
2395 {
2396 free(ere);
2397 return NULL;
2398 }
2399
2400 if (thisch(str_after) == CH_RPAR)
2401 {
2402 delete_match(ere->match);
2403 free(ere);
2404 *rc = EREE_UNEX_RPAR;
2405 return NULL;
2406 }
2407
2408 if (count_sub(ere->match) > MAX_SUBS)
2409 {
2410 delete_match(ere->match);
2411 free(ere);
2412 *rc = EREE_TOO_MANY_SUB;
2413 return NULL;
2414 }
2415
2416#ifdef DEBUG
2417 print_tree(ere->match, 0);
2418#endif
2419
2420 ere->shortest_match = (int)shortest_match(ere->match);
2421
2422 if ((fsm = create_fsm(rc)) == NULL)
2423 {
2424 delete_match(ere->match);
2425 free(ere);
2426 return NULL;
2427 }
2428
2429 if (!make_fsm_from_match(ere->match, fsm, &s, &f))
2430 {
2431 delete_fsm(fsm);
2432 delete_match(ere->match);
2433 free(ere);
2434 *rc = EREE_COMPILE_FSM;
2435 return NULL;
2436 }
2437
2438#ifdef DEBUG
2439 print_fsm(fsm, s, TRUE);
2440#endif
2441
2442 if ((ere->fsm = create_fsm(rc)) == NULL)
2443 {
2444 delete_fsm(fsm);
2445 delete_match(ere->match);
2446 free(ere);
2447 *rc = EREE_COMPILE_FSM;
2448 return NULL;
2449 }
2450
2451 if (!remove_epsilons(s, f, fsm, ere->fsm))
2452 {
2453 delete_fsm(ere->fsm);
2454 delete_fsm(fsm);
2455 delete_match(ere->match);
2456 free(ere);
2457 *rc = EREE_COMPILE_FSM;
2458 return NULL;
2459 }
2460
2461 delete_fsm(fsm);
2462
2463 ere->s = s;
2464
2465#ifdef DEBUG
2466 print_fsm(ere->fsm, s, FALSE);
2467#endif
2468
2469 return ere;
2470}
2471
2472/*
2473 *@@ rxpMinLen:
2474 *
2475 */
2476
2477int rxpMinLen(const ERE * ere)
2478{
2479 return ere->shortest_match;
2480}
2481
2482/*
2483 *@@ rxpMatch:
2484 * returns the number of characters in the match, starting from
2485 * pos characters into the string to be searched. Details of
2486 * sub-matches can also be returend. Returns -1 if no match.
2487 *
2488 * If EREMF_SHORTEST is passed with eremf, the code looks for
2489 * the shortest match, instead of the longest match.
2490 *
2491 * If EREMF_ANY is passed with eremf, the code doesn't try to
2492 * find the longest (or shortest) match, it will return with the
2493 * first match it finds (which could be of any length).
2494 * This can speed up matching.
2495 */
2496
2497int rxpMatch(const ERE * ere,
2498 int eremf,
2499 const char *str,
2500 int pos,
2501 ERE_MATCHINFO * mi)
2502{
2503 int len = pos + strlen(str + pos);
2504 const char *str_best;
2505
2506 if ((str_best = match_fsm(ere->fsm, eremf, str, pos, len, ere->s, mi)) == NULL)
2507 return -1;
2508 return (str_best - str) - pos;
2509}
2510
2511/*
2512 *@@ rxpMatch_fwd:
2513 * match forwards within a string from a specified start position.
2514 *
2515 * If a match, return TRUE, and also return pos and len of the match.
2516 *
2517 * If EREMF_SHORTEST is passed with eremf, the code looks for
2518 * the shortest match, instead of the longest match.
2519 *
2520 * If EREMF_ANY is passed with eremf, the code doesn't try to
2521 * find the longest (or shortest) match, it will return with the
2522 * first match it finds (which could be of any length).
2523 * This can speed up matching.
2524 */
2525
2526BOOLEAN rxpMatch_fwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2527 int eremf, // in: EREMF_* flags
2528 const char *str, // in: string to test
2529 int pos, // in: start position
2530 int *pos_match, // out: position of match
2531 int *len_match, // out: length of match
2532 ERE_MATCHINFO *mi) // out: match info (for rxpSubsWith)
2533{
2534 int len = pos + strlen(str + pos);
2535 int i;
2536
2537 for (i = pos; i <= len - ere->shortest_match; i++)
2538 {
2539 const char *str_best;
2540
2541 if ((str_best = match_fsm(ere->fsm, eremf, str, i, len, ere->s, mi)) != NULL)
2542 {
2543 *pos_match = i;
2544 *len_match = (str_best - str) - i;
2545 return TRUE;
2546 }
2547 }
2548 return FALSE;
2549}
2550
2551/*
2552 *@@ rxpMatch_bwd:
2553 * match backwards within a string not passing a
2554 * specified end position.
2555 *
2556 * If a match, return TRUE, and also return pos and
2557 * len of the match. We need to consider matches from
2558 * the beginning of the line. We want the one which
2559 * ends up in the rightmost position. Of those which
2560 * end up equally far right, we want the one which
2561 * extends the furthest (or shortest if EREMF_SHORTEST)
2562 * left. See how we get this as a side effect of the
2563 * loop ordering and the '>= + delta' test.
2564 *
2565 * This may not look as efficient as scanning the string
2566 * backwards, but note that this would require a reversed
2567 * ERE too, and we can't reverse EREs as they may
2568 * contain backreferences.
2569 *
2570 * If EREMF_SHORTEST is passed with eremf, the code looks for
2571 * the shortest match, instead of the longest match.
2572 *
2573 * If EREMF_ANY is passed with eremf, the code doesn't try to
2574 * find the longest (or shortest) match, it will return with the
2575 * first match it finds (which could be of any length).
2576 * This can speed up matching.
2577 */
2578
2579BOOLEAN rxpMatch_bwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2580 int eremf, // in: EREMF_* flags
2581 const char *str, // in: string to test
2582 int pos, // in: start position
2583 int *pos_match, // out: position of match
2584 int *len_match, // out: length of match
2585 ERE_MATCHINFO * mi) // out: match info (for rxpSubsWith)
2586{
2587 int i;
2588 int delta = (eremf & EREMF_SHORTEST) ? 0 : 1;
2589 const char *rightmost = NULL;
2590 ERE_MATCHINFO mi2;
2591
2592 for (i = 0; i <= pos - ere->shortest_match; i++)
2593 {
2594 const char *str_best;
2595
2596 if ((str_best = match_fsm(ere->fsm, eremf, str, i, pos, ere->s, &mi2)) != NULL)
2597 {
2598 if (rightmost == NULL ||
2599 str_best >= rightmost + delta)
2600 {
2601 *pos_match = i;
2602 *len_match = (str_best - str) - i;
2603 rightmost = str_best;
2604 if (mi != NULL)
2605 {
2606 mi->n_spans = mi2.n_spans;
2607 for (i = 0; i < mi->n_spans; i++)
2608 mi->spans[i] = mi2.spans[i];
2609 }
2610 }
2611 }
2612 }
2613 return rightmost != NULL;
2614}
2615
2616/*
2617 *@@ rxpFree:
2618 * frees all resources allocated by rxpCompile.
2619 */
2620
2621void rxpFree(ERE * ere)
2622{
2623 if (ere)
2624 {
2625 delete_match(ere->match);
2626 delete_fsm(ere->fsm);
2627 free(ere);
2628 }
2629}
2630
2631/*
2632 *@@ rxpSubsWith:
2633 * perform a substitution based upon an earlier found match.
2634 * This allows for implementing a "find and replace" function.
2635 */
2636
2637BOOLEAN rxpSubsWith(const char *str, // in: original string searched (same as str given to rxpMatch_fwd)
2638 int pos, // in: span of the entire match (pos_match from rxpMatch_fwd)
2639 int len, // in: span of the entire match (len_match from rxpMatch_fwd)
2640 ERE_MATCHINFO *mi, // in: details of match sub-spans (as from rxpMatch_fwd)
2641 const char *with, // in: replacement string with \1 etc.
2642 char *out, // out: buffer for string substitutions
2643 int len_out, // in: sizeof *out
2644 int *rc) // out: error, if FALSE returned
2645{
2646 int i = 0;
2647 int j;
2648
2649 memcpy(out, str, pos);
2650 i += pos;
2651 while (*with != '\0')
2652 {
2653 const char *rep;
2654 int len_rep;
2655
2656 if (*with != '\\')
2657 {
2658 rep = with++;
2659 len_rep = 1;
2660 }
2661 else
2662 {
2663 ++with;
2664 if (*with >= '1' && *with <= '9')
2665 {
2666 int span = *with - '1';
2667
2668 ++with;
2669 if (span >= mi->n_spans)
2670 {
2671 *rc = EREE_BAD_BACKREF;
2672 return FALSE;
2673 }
2674 rep = str + mi->spans[span].pos;
2675 len_rep = mi->spans[span].len;
2676 }
2677 else if (*with != '\0')
2678 {
2679 rep = with++;
2680 len_rep = 1;
2681 }
2682 else
2683 {
2684 *rc = EREE_BAD_BACKSLASH;
2685 return FALSE;
2686 }
2687 }
2688 if (i + len_rep > len_out)
2689 {
2690 *rc = EREE_SUBS_LEN;
2691 return FALSE;
2692 }
2693 memcpy(out + i, rep, len_rep);
2694 i += len_rep;
2695 }
2696 j = pos + len + strlen(str + pos + len);
2697 if (i + j > len_out)
2698 {
2699 *rc = EREE_SUBS_LEN;
2700 return FALSE;
2701 }
2702 memcpy(out + i, str + pos + len, j);
2703 i += j;
2704 out[i] = '\0';
2705 return TRUE;
2706}
2707
2708#ifdef __TESTCASE__
2709
2710int main(int argc, char *argv[])
2711{
2712 ERE *ere;
2713 ERE_MATCHINFO mi;
2714 int rc;
2715
2716 const char *pcsz, *pcszEre;
2717
2718 if (argc != 3)
2719 {
2720 printf("Usage: regexp <teststring> <ere>\n");
2721 exit(1);
2722 }
2723
2724 pcsz = argv[1];
2725 pcszEre = argv[2];
2726
2727 printf("matching \"%s\" against \"%s\"\n",
2728 pcsz,
2729 pcszEre);
2730 fflush(stdout);
2731
2732 if (!(ere = rxpCompile(pcszEre,
2733 0,
2734 &rc)))
2735 {
2736 printf("Error %d in rxpCompile: %s\n", rc, rxpError(rc));
2737 exit(rc);
2738 }
2739
2740 {
2741 int pos, length;
2742 rc = rxpMatch_fwd(ere,
2743 0,
2744 pcsz,
2745 0,
2746 &pos,
2747 &length,
2748 &mi);
2749
2750 if (rc == 0)
2751 printf("no match\n");
2752 else
2753 printf("found at pos %d, length %d\n", pos, length);
2754 }
2755
2756 return 0;
2757}
2758
2759#endif
Note: See TracBrowser for help on using the repository browser.