source: branches/branch-1-0/src/helpers/regexp.c@ 297

Last change on this file since 297 was 222, checked in by umoeller, 23 years ago

Minor adjustments for new static handling.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 78.0 KB
Line 
1
2/*
3 *@@sourcefile regexp.c:
4 * extended regular expressions, taken from "Andy's Extended
5 * Regular Expressions", as written by Andy Key
6 * (nyangau@interalpha.co.uk) and released into the public
7 * domain, plus adjustments for the XWP helpers.
8 *
9 * Usage: All C programs; not OS/2-specific.
10 *
11 * Function prefixes:
12 * -- rxp* regular expression functions.
13 *
14 * Regular expression matching is done in the following stages:
15 *
16 * 1) Call rxpCompile to parse the expression into a recursive
17 * tree of matches.
18 *
19 * This tree is converted into a finite state machine (FSM).
20 * A second FSM is built from the first but with epsilon
21 * moves removed to elimate lockups and increase speed.
22 *
23 * The input string can be used to drive the FSM through all
24 * the possible routes. The largest (or smallest) amount of
25 * input string required to reach the finish state is recorded
26 * since an extended regular expression is deemed to match as
27 * much input string as possible.
28 *
29 * 2) Call one of rxpMatch, rxpMatch_fwd, or rxpMatch_bwd to
30 * perform a match.
31 *
32 * 3) Call rxpFree to free the compiled ERE.
33 *
34 * Beware: The matching routine is highly recursive and can
35 * require around 20 to 30 bytes per character in the source string
36 * to match it. Thus you should try to limit the length of the source
37 * string and/or allow a stack size of at least 20 (to 30) * max string
38 * length. In addition, add 2KB to allow for the use of several nested
39 * sub-expressions in the matching.
40 *
41 *@@header "helpers\regexp.h"
42 *@@added V0.9.19 (2002-04-17) [umoeller]
43 */
44
45/*
46 * Copyright (C) 2002 Ulrich M”ller.
47 * This file is part of the "XWorkplace helpers" source package.
48 * This is free software; you can redistribute it and/or modify
49 * it under the terms of the GNU General Public License as published
50 * by the Free Software Foundation, in version 2 as it comes in the
51 * "COPYING" file of the XWorkplace main distribution.
52 * This program is distributed in the hope that it will be useful,
53 * but WITHOUT ANY WARRANTY; without even the implied warranty of
54 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
55 * GNU General Public License for more details.
56 */
57
58#include <stdio.h>
59#include <ctype.h>
60#include <stdlib.h>
61#include <string.h>
62#include <malloc.h>
63#include <memory.h>
64
65#include "setup.h" // code generation and debugging options
66
67#define ERE_C
68#include "helpers\regexp.h"
69
70#define isword(c) (isalnum(c)||(c)=='_')
71
72STATIC int val_of_hex(char c)
73{
74 if (c >= '0' && c <= '9')
75 return c - '0';
76 if (c >= 'a' && c <= 'f')
77 return c - 'a' + 10;
78 if (c >= 'A' && c <= 'F')
79 return c - 'A' + 10;
80 return 0; // Shouldn't get here
81}
82
83STATIC int escaped(const char *s)
84{
85 if (s[0] == 'x' && isxdigit(s[1]))
86 // \x followed by 1 or 2 hex digits
87 {
88 int n = val_of_hex(s[1]);
89
90 if (isxdigit(s[2]))
91 n = n * 16 + val_of_hex(s[2]);
92 return n;
93 }
94 else
95 switch (s[0])
96 {
97 case 'n':
98 return '\n';
99 case 't':
100 return '\t';
101 case 'r':
102 return '\r';
103 case 'b':
104 return '\b';
105 case 'f':
106 return '\f';
107 case 'e':
108 return 0x1b;
109 default:
110 return (unsigned char)s[0];
111 }
112}
113
114STATIC const char *past_escaped(const char *s)
115{
116 if (s[0] == 'x' && isxdigit(s[1]))
117 return isxdigit(s[2]) ? s + 3 : s + 2;
118 else
119 return s + 1;
120}
121
122#define zero_cclass(cclass) memset(cclass, 0, 0x100 >> 3)
123
124STATIC unsigned char bits[] =
125{0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
126
127#define add_to_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] |= bits[(unsigned char)(n) & 7])
128
129#define remove_from_cclass(n, cclass) (cclass[(unsigned char)(n)>>3] &= ~bits[(unsigned char)(n) & 7])
130
131STATIC void invert_cclass(unsigned char *cclass)
132{
133 int i;
134
135 for (i = 0; i < (0x100 >> 3); i++)
136 cclass[i] ^= 0xff;
137}
138
139#define match_cclass(n, cclass) ((cclass[(unsigned char)(n)>>3] & bits[(unsigned char)(n) & 7]) != 0)
140
141#define CHL_EOS 0
142#define CHL_END_CCLASS (-1)
143#define CHL_COMP (-2)
144#define CHL_RANGE (-3)
145#define CHL_POSIX_COLLATING (-4)
146#define CHL_POSIX_EQUIVALENCE (-5)
147#define CHL_POSIX_CCLASS_BAD (-6)
148#define CHL_POSIX_CCLASS_BASE (-30)
149#define CHL_POSIX_CCLASS_END (-30+20)
150
151/* I use my own wrapper functions so I can take their addresses.
152 * Remember, isalnum etc. can be implemented as macros... */
153STATIC BOOLEAN my_isalnum(int ch)
154{
155 return isalnum(ch);
156}
157STATIC BOOLEAN my_isalpha(int ch)
158{
159 return isalpha(ch);
160}
161STATIC BOOLEAN my_isblank(int ch)
162{
163 return ch == ' ' || ch == '\t';
164}
165STATIC BOOLEAN my_iscntrl(int ch)
166{
167 return iscntrl(ch);
168}
169STATIC BOOLEAN my_isdigit(int ch)
170{
171 return isdigit(ch);
172}
173STATIC BOOLEAN my_islower(int ch)
174{
175 return islower(ch);
176}
177STATIC BOOLEAN my_isprint(int ch)
178{
179 return isprint(ch);
180}
181STATIC BOOLEAN my_ispunct(int ch)
182{
183 return ispunct(ch);
184}
185STATIC BOOLEAN my_isspace(int ch)
186{
187 return isspace(ch);
188}
189STATIC BOOLEAN my_isupper(int ch)
190{
191 return isupper(ch);
192}
193STATIC BOOLEAN my_isxdigit(int ch)
194{
195 return isxdigit(ch);
196}
197
198typedef struct
199{
200 int len_name;
201 const char *name;
202 BOOLEAN(*iscclass) (int ch);
203}
204POSIX_CCLASS;
205
206STATIC POSIX_CCLASS posix_cclass[] =
207{
208 5, "alnum", my_isalnum,
209 5, "alpha", my_isalpha,
210 5, "blank", my_isblank,
211 5, "cntrl", my_iscntrl,
212 5, "digit", my_isdigit,
213 5, "lower", my_islower,
214 5, "print", my_isprint,
215 5, "punct", my_ispunct,
216 5, "space", my_isspace,
217 5, "upper", my_isupper,
218 6, "xdigit", my_isxdigit,
219};
220
221STATIC int find_posix(const char *str)
222{
223 int p;
224
225 if (str[0] != '[')
226 return str[0];
227 if (str[1] == '.')
228 return CHL_POSIX_COLLATING;
229 if (str[1] == '=')
230 return CHL_POSIX_EQUIVALENCE;
231 if (str[1] != ':')
232 return str[0];
233 for (p = 0; p < sizeof(posix_cclass) / sizeof(posix_cclass[0]); p++)
234 if (!memcmp(str + 2, posix_cclass[p].name, posix_cclass[p].len_name) &&
235 str[2 + posix_cclass[p].len_name] == ':' &&
236 str[2 + posix_cclass[p].len_name + 1] == ']')
237 return CHL_POSIX_CCLASS_BASE + p;
238 return CHL_POSIX_CCLASS_BAD;
239}
240
241STATIC int cclass_thisch(const char *str)
242{
243 switch (str[0])
244 {
245 case '\\':
246 return escaped(str + 1);
247 case ']':
248 return CHL_END_CCLASS;
249 case '^':
250 return CHL_COMP;
251 case '-':
252 return CHL_RANGE;
253 case '\0':
254 return CHL_EOS;
255 default:
256 return find_posix(str);
257 }
258}
259
260STATIC const char *cclass_nextch(const char *str)
261{
262 int p;
263
264 if (str[0] == '\\')
265 return past_escaped(str + 1);
266 p = find_posix(str);
267 if (CHL_POSIX_CCLASS_BASE <= p && p < CHL_POSIX_CCLASS_END)
268 return str + 2 + posix_cclass[p - CHL_POSIX_CCLASS_BASE].len_name + 2;
269 else
270 return str + 1;
271}
272
273STATIC unsigned char *compile_cclass(const char *str,
274 const char **str_after,
275 int erecf,
276 int *rc)
277{
278 unsigned char *cclass;
279 BOOLEAN complement;
280 int i, c, last_c = -1;
281
282 if ((cclass = (unsigned char *)malloc(0x100 >> 3)) == NULL)
283 {
284 *rc = ERROR_NOT_ENOUGH_MEMORY;
285 return NULL;
286 }
287 zero_cclass(cclass);
288
289 complement = (cclass_thisch(str) == CHL_COMP);
290 if (complement)
291 str = cclass_nextch(str);
292
293 while ((c = cclass_thisch(str)) != CHL_EOS && c != CHL_END_CCLASS)
294 {
295 if (CHL_POSIX_CCLASS_BASE <= c && c < CHL_POSIX_CCLASS_END)
296 {
297 for (i = 0; i < 0x100; i++)
298 if (posix_cclass[c - CHL_POSIX_CCLASS_BASE].iscclass(i))
299 add_to_cclass(i, cclass);
300 last_c = -1;
301 }
302 else
303 switch (c)
304 {
305/*...sCHL_POSIX_\42\ \45\ error cases:32: */
306 case CHL_POSIX_COLLATING:
307 free(cclass);
308 *rc = EREE_POSIX_COLLATING;
309 return NULL;
310 case CHL_POSIX_EQUIVALENCE:
311 free(cclass);
312 *rc = EREE_POSIX_EQUIVALENCE;
313 return NULL;
314 case CHL_POSIX_CCLASS_BAD:
315 free(cclass);
316 *rc = EREE_POSIX_CCLASS_BAD;
317 return NULL;
318/*...sCHL_RANGE \45\ range:32: */
319 case CHL_RANGE:
320 if (last_c == -1)
321 // Unexpected at this point
322 {
323 free(cclass);
324 *rc = EREE_UNEX_RANGE;
325 return NULL;
326 }
327 str = cclass_nextch(str);
328 if ((c = cclass_thisch(str)) == CHL_EOS || c == CHL_END_CCLASS)
329 // Not followed by anything
330 {
331 free(cclass);
332 *rc = EREE_UNF_RANGE;
333 return NULL;
334 }
335 for (i = last_c + 1; i <= c; i++)
336 add_to_cclass(i, cclass);
337 last_c = c;
338 break;
339
340#pragma info(nogen) // do not warn here
341
342/*...sCHL_COMP \45\ complement:32: */
343 case CHL_COMP:
344 c = '^';
345 // Fall through
346/*...sdefault \45\ individual entry:32: */
347 default:
348 if (erecf & ERECF_TOLOWER)
349 c = tolower(c);
350 add_to_cclass(c, cclass);
351 last_c = c;
352 break;
353
354#pragma info(restore)
355
356 }
357 str = cclass_nextch(str);
358 }
359
360 if (c == CHL_EOS)
361 {
362 free(cclass);
363 *rc = EREE_UNF_CCLASS;
364 return NULL;
365 }
366
367 if (complement)
368 invert_cclass(cclass);
369
370 remove_from_cclass(0, cclass);
371
372 *str_after = cclass_nextch(str);
373 return cclass;
374}
375
376#define delete_cclass(cclass) free(cclass)
377
378/* A 'match' embodies all of regular expression functionality.
379 * Matches can be defined in terms of each other. */
380
381typedef unsigned char MTYPE;
382
383#define MTYPE_NULL ((MTYPE) 0)
384#define MTYPE_CHAR ((MTYPE) 1)
385#define MTYPE_NCHAR ((MTYPE) 2)
386#define MTYPE_STRING ((MTYPE) 3)
387#define MTYPE_CCLASS ((MTYPE) 4)
388#define MTYPE_WORD ((MTYPE) 5)
389#define MTYPE_NWORD ((MTYPE) 6)
390#define MTYPE_DOT ((MTYPE) 7)
391#define MTYPE_QUERY ((MTYPE) 8)
392#define MTYPE_PLUS ((MTYPE) 9)
393#define MTYPE_STAR ((MTYPE) 10)
394#define MTYPE_CREP ((MTYPE) 11)
395#define MTYPE_OR ((MTYPE) 12)
396#define MTYPE_CAT ((MTYPE) 13)
397#define MTYPE_SUB ((MTYPE) 14)
398#define MTYPE_SOL ((MTYPE) 15)
399#define MTYPE_EOL ((MTYPE) 16)
400#define MTYPE_SOW ((MTYPE) 17)
401#define MTYPE_EOW ((MTYPE) 18)
402#define MTYPE_IW ((MTYPE) 19)
403#define MTYPE_EW ((MTYPE) 20)
404#define MTYPE_BACK ((MTYPE) 21)
405
406typedef struct match_struct MATCH;
407
408struct match_struct
409{
410 MTYPE mtype;
411 union
412 {
413 char character;
414 unsigned char *cclass;
415 MATCH *match;
416 MATCH *matchs[2];
417 int n_span;
418 char *string;
419 struct
420 {
421 unsigned m, n;
422 MATCH *match;
423 }
424 crep;
425 }
426 u;
427};
428
429STATIC MATCH null_match =
430{MTYPE_NULL};
431
432#define NULL_MATCH (&null_match)
433
434STATIC void delete_match(MATCH * match)
435{
436 if (match == NULL_MATCH)
437 return;
438 switch (match->mtype)
439 {
440 case MTYPE_STRING:
441 free(match->u.string);
442 break;
443 case MTYPE_CCLASS:
444 delete_cclass(match->u.cclass);
445 break;
446 case MTYPE_QUERY:
447 case MTYPE_PLUS:
448 case MTYPE_STAR:
449 delete_match(match->u.match);
450 break;
451 case MTYPE_CREP:
452 delete_match(match->u.crep.match);
453 break;
454 case MTYPE_OR:
455 case MTYPE_CAT:
456 delete_match(match->u.matchs[0]);
457 delete_match(match->u.matchs[1]);
458 break;
459 case MTYPE_SUB:
460 delete_match(match->u.match);
461 break;
462 }
463 free(match);
464}
465
466/*
467 *@@ shortest_match:
468 * determines the shortest possible match length for a given match
469 * tree. In the following example it is 3, thus allowing us to only
470 * consider 4 positions for regular expression matching in a 6
471 * character line.
472 *
473 + aa*bc
474 +
475 + 123456
476 + ---
477 + ---
478 + ---
479 + ---
480 */
481
482STATIC unsigned shortest_match(const MATCH * match)
483{
484 unsigned a, b;
485
486 switch (match->mtype)
487 {
488 case MTYPE_NULL:
489 return 0;
490 case MTYPE_CHAR:
491 return 1;
492 case MTYPE_NCHAR:
493 return 1;
494 case MTYPE_STRING:
495 return (unsigned char)match->u.string[0];
496 case MTYPE_CCLASS:
497 return 1;
498 case MTYPE_WORD:
499 return 1;
500 case MTYPE_NWORD:
501 return 1;
502 case MTYPE_DOT:
503 return 1;
504 case MTYPE_QUERY:
505 return 0;
506 case MTYPE_PLUS:
507 return shortest_match(match->u.match);
508 case MTYPE_STAR:
509 return 0;
510 case MTYPE_CREP:
511 return match->u.crep.m *
512 shortest_match(match->u.crep.match);
513 case MTYPE_OR:
514 a = shortest_match(match->u.matchs[0]);
515 b = shortest_match(match->u.matchs[1]);
516 return a <= b ? a : b;
517 case MTYPE_CAT:
518 return shortest_match(match->u.matchs[0]) +
519 shortest_match(match->u.matchs[1]);
520 case MTYPE_SUB:
521 return shortest_match(match->u.match);
522 case MTYPE_SOL:
523 return 0;
524 case MTYPE_EOL:
525 return 0;
526 case MTYPE_SOW:
527 return 0;
528 case MTYPE_EOW:
529 return 0;
530 case MTYPE_IW:
531 return 0;
532 case MTYPE_EW:
533 return 0;
534 case MTYPE_BACK:
535 return 0;
536 }
537 return 0; // Should never happen
538}
539
540STATIC BOOLEAN got_backrefs(MATCH * match)
541{
542 switch (match->mtype)
543 {
544 case MTYPE_QUERY:
545 case MTYPE_PLUS:
546 case MTYPE_STAR:
547 return got_backrefs(match->u.match);
548 case MTYPE_CREP:
549 return got_backrefs(match->u.crep.match);
550 case MTYPE_OR:
551 case MTYPE_CAT:
552 return got_backrefs(match->u.matchs[0]) ||
553 got_backrefs(match->u.matchs[1]);
554 case MTYPE_BACK:
555 return TRUE;
556 }
557 return FALSE;
558}
559
560STATIC MATCH *remove_subs(MATCH * match)
561{
562 switch (match->mtype)
563 {
564 case MTYPE_QUERY:
565 case MTYPE_PLUS:
566 case MTYPE_STAR:
567 match->u.match = remove_subs(match->u.match);
568 break;
569 case MTYPE_CREP:
570 match->u.crep.match = remove_subs(match->u.crep.match);
571 break;
572 case MTYPE_OR:
573 case MTYPE_CAT:
574 match->u.matchs[0] = remove_subs(match->u.matchs[0]);
575 match->u.matchs[1] = remove_subs(match->u.matchs[1]);
576 break;
577 case MTYPE_SUB:
578 {
579 MATCH *m = match;
580
581 match = match->u.match;
582 free(m);
583 }
584 break;
585 }
586 return match;
587}
588
589STATIC int count_sub(const MATCH * match)
590{
591 switch (match->mtype)
592 {
593 case MTYPE_SUB:
594 return 1 + count_sub(match->u.match);
595 case MTYPE_QUERY:
596 case MTYPE_PLUS:
597 case MTYPE_STAR:
598 return count_sub(match->u.match);
599 case MTYPE_CREP:
600 return count_sub(match->u.crep.match);
601 case MTYPE_OR:
602 case MTYPE_CAT:
603 return count_sub(match->u.matchs[0]) +
604 count_sub(match->u.matchs[1]);
605 }
606 return 0;
607}
608
609/*...scompiling matches:0: */
610/*
611 *
612 * <term> ::= <character> | <class> | ( <match> )
613 * <factor> ::= <term>? | <term>+ | <term>* | <term>
614 * <factors> ::= <factor> { <factor> }
615 * <match> ::= <factors> { |<factors> }
616 *
617 */
618
619/* We have an optimisation here in that if we can see a span of 'boring'
620 * characters, we return them as a single entity. We mustn't do this for the
621 * last in the span, as this may have modifiers applied to it, as in the
622 * example string "abcd*", where the d has the * modifier, but abc do not.
623 * We only do 255 chars at once, as the length is stored in a byte.
624 * Reducing 255 levels of per-character recursive searching to 1 level with
625 * a string compare has got to be a massive saving. */
626
627#define STRING_MAX 255
628
629#define CH_NOT_BASE 1000
630#define CH_NOT_END (1000+0x100)
631#define CH_EOS 0
632#define CH_DOT (-1)
633#define CH_WORD (-2)
634#define CH_NWORD (-3)
635#define CH_LSQR (-4)
636#define CH_RSQR (-5)
637#define CH_LPAR (-6)
638#define CH_RPAR (-7)
639#define CH_LCUR (-8)
640#define CH_RCUR (-9)
641#define CH_QUERY (-10)
642#define CH_PLUS (-11)
643#define CH_STAR (-12)
644#define CH_OR (-13)
645#define CH_SOL (-14)
646#define CH_EOL (-15)
647#define CH_SOW (-16)
648#define CH_EOW (-17)
649#define CH_IW (-18)
650#define CH_EW (-19)
651#define CH_BAD_TILDE (-20)
652#define CH_STRING_BASE (-1000)
653#define CH_STRING_END (-1000+STRING_MAX)
654#define CH_BACK_BASE (-2000)
655#define CH_BACK_END (-2000+9)
656
657STATIC int boring_string(const char *s)
658{
659 int n = 0;
660
661 while (*s &&
662 strchr("\\.[](){}?+*|^$<>~", *s) == NULL
663 && n < STRING_MAX)
664 {
665 s++;
666 n++;
667 }
668 return n;
669}
670
671STATIC int thisch(const char *str)
672{
673 int n;
674
675 switch (str[0])
676 {
677 case '\\':
678 if (str[1] >= '1' && str[1] <= '9')
679 return CH_BACK_BASE + (str[1] - '1');
680 switch (str[1])
681 {
682 case '`':
683 return CH_SOL;
684 case '\'':
685 return CH_EOL;
686 case '<':
687 return CH_SOW;
688 case '>':
689 return CH_EOW;
690 case 'w':
691 return CH_WORD;
692 case 'W':
693 return CH_NWORD;
694 case 'B':
695 return CH_IW;
696 case 'y':
697 return CH_EW;
698 default:
699 return escaped(str + 1);
700 }
701
702 case '~':
703 if (str[1] == '\\')
704 return CH_NOT_BASE + escaped(str + 2);
705 else if (str[1] != '\0')
706 return CH_NOT_BASE + (unsigned char)str[1];
707 else
708 return '~';
709 case '.':
710 return CH_DOT;
711 case '[':
712 return CH_LSQR;
713 case ']':
714 return CH_RSQR;
715 case '(':
716 return CH_LPAR;
717 case ')':
718 return CH_RPAR;
719 case '{':
720 return CH_LCUR;
721 case '}':
722 return CH_RCUR;
723 case '?':
724 return CH_QUERY;
725 case '+':
726 return CH_PLUS;
727 case '*':
728 return CH_STAR;
729 case '|':
730 return CH_OR;
731 case '^':
732 return CH_SOL;
733 case '$':
734 return CH_EOL;
735 }
736
737 n = boring_string(str);
738 if (n < 3)
739 return (unsigned char)str[0];
740 else
741 return CH_STRING_BASE + n - 1;
742}
743
744STATIC const char *nextch(const char *str)
745{
746 int n;
747
748 switch (str[0])
749 {
750 case '\\':
751 if (str[1] >= '1' && str[1] <= '9')
752 return str + 2;
753 switch (str[1])
754 {
755 case '`':
756 case '\'':
757 case '<':
758 case '>':
759 case 'w':
760 case 'W':
761 case 'B':
762 case 'y':
763 return str + 2;
764 default:
765 return past_escaped(str + 1);
766 }
767
768 case '~':
769 if (str[1] == '\\')
770 return past_escaped(str + 2);
771 else if (str[1] != '\0')
772 return str + 2;
773 else
774 return str + 1;
775 case '.':
776 case '[':
777 case '(':
778 case ')':
779 case '{':
780 case '}':
781 case '?':
782 case '+':
783 case '*':
784 case '|':
785 case '^':
786 case '$':
787 return str + 1;
788 }
789
790 n = boring_string(str);
791 if (n < 3)
792 return str + 1;
793 else
794 return str + n - 1;
795}
796
797STATIC MATCH *compile_match(const char *str, const char **str_after, int erecf, int *rc);
798
799STATIC const char *scan_number(const char *str, unsigned *num)
800{
801 if (!isdigit(*str))
802 return NULL;
803 *num = 0;
804 do
805 (*num) = (*num) * 10 + (*str++ - '0');
806 while (isdigit(*str));
807 return str;
808}
809
810STATIC MATCH *create_match(int *rc)
811{
812 MATCH *match;
813
814 if ((match = (MATCH *) malloc(sizeof(MATCH))) == NULL)
815 {
816 *rc = ERROR_NOT_ENOUGH_MEMORY;
817 return NULL;
818 }
819 return match;
820}
821
822STATIC MATCH *compile_term(const char *str, const char **str_after, int erecf, int *rc)
823{
824 MATCH *match;
825 int c;
826
827 c = thisch(str);
828 switch (c)
829 {
830 case CH_RSQR:
831 *rc = EREE_UNEX_RSQR;
832 return NULL;
833 case CH_QUERY:
834 *rc = EREE_UNEX_QUERY;
835 return NULL;
836 case CH_PLUS:
837 *rc = EREE_UNEX_PLUS;
838 return NULL;
839 case CH_STAR:
840 *rc = EREE_UNEX_STAR;
841 return NULL;
842 case CH_LCUR:
843 *rc = EREE_UNEX_LCUR;
844 return NULL;
845 case CH_RCUR:
846 *rc = EREE_UNEX_RCUR;
847 return NULL;
848 }
849
850 if (c == CH_EOS || c == CH_OR || c == CH_RPAR)
851 {
852 *str_after = str;
853 return NULL_MATCH;
854 }
855
856 if ((match = create_match(rc)) == NULL)
857 return NULL;
858
859 if (CH_NOT_BASE <= c && c < CH_NOT_END)
860/*...snot a specific character:16: */
861 {
862 char ch = (char)(c - CH_NOT_BASE);
863
864 if (erecf & ERECF_TOLOWER)
865 ch = (char)tolower(ch);
866 match->mtype = MTYPE_NCHAR;
867 match->u.character = ch;
868 str = nextch(str);
869 }
870 else if (CH_STRING_BASE <= c && c < CH_STRING_END)
871/*...sa string of non\45\special characters:16: */
872 {
873 unsigned len = c - CH_STRING_BASE;
874
875 match->mtype = MTYPE_STRING;
876 match->u.string = (char *)malloc(1 + len);
877 if (match->u.string == NULL)
878 {
879 free(match);
880 *rc = ERROR_NOT_ENOUGH_MEMORY;
881 return NULL;
882 }
883 match->u.string[0] = (char)len;
884 memcpy(match->u.string + 1, str, len);
885 if (erecf & ERECF_TOLOWER)
886 {
887 unsigned i;
888
889 for (i = 1; i <= len; i++)
890 match->u.string[i] = (char)tolower(match->u.string[i]);
891 }
892 str = nextch(str);
893 }
894 else if (CH_BACK_BASE <= c && c < CH_BACK_END)
895/*...sa backreference:16: */
896 {
897 match->mtype = MTYPE_BACK;
898 match->u.n_span = c - CH_BACK_BASE;
899 str = nextch(str);
900 }
901 else
902 switch (c)
903 {
904/*...sCH_LSQR \45\ character class:24: */
905 case CH_LSQR:
906 match->mtype = MTYPE_CCLASS;
907 str = nextch(str);
908 if ((match->u.cclass = compile_cclass(str, &str, erecf, rc)) == NULL)
909 {
910 free(match);
911 return NULL;
912 }
913 break;
914/*...sCH_DOT \45\ any character:24: */
915 case CH_DOT:
916 match->mtype = MTYPE_DOT;
917 str = nextch(str);
918 break;
919/*...sCH_WORD \45\ \92\w:24: */
920 case CH_WORD:
921 match->mtype = MTYPE_WORD;
922 str = nextch(str);
923 break;
924/*...sCH_NWORD \45\ \92\W:24: */
925 case CH_NWORD:
926 match->mtype = MTYPE_NWORD;
927 str = nextch(str);
928 break;
929/*...sCH_LPAR \45\ nested regular expression:24: */
930 case CH_LPAR:
931 {
932 MATCH *sub_match;
933
934 str = nextch(str);
935 if ((sub_match = compile_match(str, &str, erecf, rc)) == NULL)
936 {
937 free(match);
938 return NULL;
939 }
940 if (!got_backrefs(sub_match))
941 sub_match = remove_subs(sub_match);
942 if (thisch(str) != CH_RPAR)
943 {
944 *rc = EREE_UNF_SUB;
945 return NULL;
946 }
947 str = nextch(str);
948 match->mtype = MTYPE_SUB;
949 match->u.match = sub_match;
950 }
951 break;
952/*...sCH_SOL \45\ \94\:24: */
953 case CH_SOL:
954 match->mtype = MTYPE_SOL;
955 str = nextch(str);
956 break;
957/*...sCH_EOL \45\ \36\:24: */
958 case CH_EOL:
959 match->mtype = MTYPE_EOL;
960 str = nextch(str);
961 break;
962/*...sCH_SOW \45\ \92\\60\:24: */
963 case CH_SOW:
964 match->mtype = MTYPE_SOW;
965 str = nextch(str);
966 break;
967/*...sCH_EOW \45\ \92\\62\:24: */
968 case CH_EOW:
969 match->mtype = MTYPE_EOW;
970 str = nextch(str);
971 break;
972/*...sCH_IW \45\ \92\B:24: */
973 case CH_IW:
974 match->mtype = MTYPE_IW;
975 str = nextch(str);
976 break;
977/*...sCH_EW \45\ \92\y:24: */
978 case CH_EW:
979 match->mtype = MTYPE_EW;
980 str = nextch(str);
981 break;
982/*...sdefault \45\ any old character:24: */
983 default:
984 {
985 char ch = (char)c;
986
987 if (erecf & ERECF_TOLOWER)
988 ch = (char)tolower(ch);
989 match->mtype = MTYPE_CHAR;
990 match->u.character = ch;
991 str = nextch(str);
992 }
993 break;
994 }
995
996 *str_after = str;
997 return match;
998}
999
1000STATIC MTYPE repeat_type_of(int c)
1001{
1002 switch (c)
1003 {
1004 case CH_QUERY:
1005 return MTYPE_QUERY;
1006 case CH_PLUS:
1007 return MTYPE_PLUS;
1008 case CH_STAR:
1009 return MTYPE_STAR;
1010 case CH_LCUR:
1011 return MTYPE_CREP;
1012 default:
1013 return (MTYPE) - 1;
1014 }
1015}
1016
1017STATIC MATCH *compile_factor(const char *str, const char **str_after, int erecf, int *rc)
1018{
1019 MATCH *match, *parent;
1020 MTYPE repeat_mtype;
1021
1022 if ((match = compile_term(str, &str, erecf, rc)) == NULL)
1023 return NULL;
1024
1025 while ((repeat_mtype = repeat_type_of(thisch(str))) != (MTYPE) - 1)
1026/*...smatch is to be repeated:16: */
1027 {
1028 if ((parent = create_match(rc)) == NULL)
1029 {
1030 delete_match(match);
1031 return NULL;
1032 }
1033
1034 parent->mtype = repeat_mtype;
1035 str = nextch(str);
1036 if (repeat_mtype == MTYPE_CREP)
1037 {
1038 parent->u.crep.match = match;
1039 if ((str = scan_number(str, &(parent->u.crep.m))) == NULL)
1040 {
1041 delete_match(match);
1042 free(parent);
1043 *rc = EREE_BAD_CREP_M;
1044 return NULL;
1045 }
1046 parent->u.crep.n = parent->u.crep.m;
1047 if (*str == ',')
1048 {
1049 ++str;
1050 if (*str != '}')
1051 {
1052 if ((str = scan_number(str, &(parent->u.crep.n))) == NULL)
1053 {
1054 delete_match(match);
1055 free(parent);
1056 *rc = EREE_BAD_CREP_N;
1057 return NULL;
1058 }
1059 }
1060 else
1061 parent->u.crep.n = (unsigned)~0;
1062 }
1063 if (*str != '}')
1064 {
1065 delete_match(match);
1066 free(parent);
1067 *rc = EREE_UNF_CREP;
1068 return NULL;
1069 }
1070 ++str;
1071 if (parent->u.crep.m > parent->u.crep.n)
1072 {
1073 delete_match(match);
1074 free(parent);
1075 *rc = EREE_BAD_CREP;
1076 return NULL;
1077 }
1078 }
1079 else
1080 parent->u.match = match;
1081 match = parent;
1082 }
1083
1084 *str_after = str;
1085
1086 return match;
1087}
1088
1089STATIC MATCH *compile_factors(const char *str, const char **str_after, int erecf, int *rc)
1090{
1091 MATCH *match;
1092 int c;
1093
1094 if ((match = compile_factor(str, &str, erecf, rc)) == NULL)
1095 return NULL;
1096
1097 while ((c = thisch(str)) != CH_EOS && c != CH_RPAR && c != CH_OR)
1098/*...sconsider catenation of more factors:16: */
1099 {
1100 MATCH *sibling, *parent;
1101
1102 if ((sibling = compile_factor(str, &str, erecf, rc)) == NULL)
1103 {
1104 delete_match(match);
1105 return NULL;
1106 }
1107
1108 if ((parent = create_match(rc)) == NULL)
1109 {
1110 delete_match(sibling);
1111 delete_match(match);
1112 return NULL;
1113 }
1114 parent->mtype = MTYPE_CAT;
1115 parent->u.matchs[0] = match;
1116 parent->u.matchs[1] = sibling;
1117 match = parent;
1118 }
1119
1120 *str_after = str;
1121 return match;
1122}
1123
1124/*...scompile_match \45\ factors\124\factors:0: */
1125STATIC MATCH *compile_match(const char *str, const char **str_after, int erecf, int *rc)
1126{
1127 MATCH *match;
1128
1129 if ((match = compile_factors(str, &str, erecf, rc)) == NULL)
1130 return NULL;
1131
1132 while (thisch(str) == CH_OR)
1133/*...sfind sibling and or it in:16: */
1134 {
1135 MATCH *sibling, *parent;
1136
1137 str = nextch(str);
1138 if ((sibling = compile_factors(str, &str, erecf, rc)) == NULL)
1139 {
1140 delete_match(match);
1141 return NULL;
1142 }
1143 if ((parent = create_match(rc)) == NULL)
1144 {
1145 delete_match(sibling);
1146 delete_match(match);
1147 return NULL;
1148 }
1149 parent->mtype = MTYPE_OR;
1150 parent->u.matchs[0] = match;
1151 parent->u.matchs[1] = sibling;
1152 match = parent;
1153 }
1154
1155 *str_after = str;
1156
1157 return match;
1158}
1159
1160#ifdef DEBUG
1161/*...sprint_tree:0: */
1162/*...sdo_indent:0: */
1163STATIC void do_indent(int indent)
1164{
1165 while (indent--)
1166 putchar('\t');
1167}
1168
1169STATIC void print_tree(const MATCH * match, int indent)
1170{
1171 do_indent(indent);
1172 switch (match->mtype)
1173 {
1174 case MTYPE_NULL:
1175 printf("null\n");
1176 break;
1177 case MTYPE_CHAR:
1178 printf("%c\n", match->u.character);
1179 break;
1180 case MTYPE_NCHAR:
1181 printf("~%c\n", match->u.character);
1182 break;
1183 case MTYPE_STRING:
1184 printf("%*.*s\n",
1185 (unsigned char)match->u.string[0],
1186 (unsigned char)match->u.string[0],
1187 match->u.string + 1);
1188 break;
1189 case MTYPE_CCLASS:
1190 {
1191 int i;
1192
1193 printf("[");
1194 for (i = 0; i < 0x100; i++)
1195 if (match_cclass(i, match->u.cclass))
1196 printf("%c", i);
1197 printf("]\n");
1198 }
1199 break;
1200 case MTYPE_DOT:
1201 printf(".\n");
1202 break;
1203 case MTYPE_WORD:
1204 printf("\\w\n");
1205 break;
1206 case MTYPE_NWORD:
1207 printf("\\W\n");
1208 break;
1209 case MTYPE_QUERY:
1210 printf("?\n");
1211 print_tree(match->u.match, indent + 1);
1212 break;
1213 case MTYPE_PLUS:
1214 printf("+\n");
1215 print_tree(match->u.match, indent + 1);
1216 break;
1217 case MTYPE_STAR:
1218 printf("*\n");
1219 print_tree(match->u.match, indent + 1);
1220 break;
1221 case MTYPE_CREP:
1222 printf("{%u,%u}\n",
1223 match->u.crep.m,
1224 match->u.crep.n);
1225 print_tree(match->u.crep.match, indent + 1);
1226 break;
1227 case MTYPE_OR:
1228 printf("|\n");
1229 print_tree(match->u.matchs[0], indent + 1);
1230 print_tree(match->u.matchs[1], indent + 1);
1231 break;
1232 case MTYPE_CAT:
1233 printf("CAT\n");
1234 print_tree(match->u.matchs[0], indent + 1);
1235 print_tree(match->u.matchs[1], indent + 1);
1236 break;
1237 case MTYPE_SUB:
1238 printf("SUB\n");
1239 print_tree(match->u.match, indent + 1);
1240 break;
1241 case MTYPE_SOL:
1242 printf("^\n");
1243 break;
1244 case MTYPE_EOL:
1245 printf("$\n");
1246 break;
1247 case MTYPE_SOW:
1248 printf("\\<\n");
1249 break;
1250 case MTYPE_EOW:
1251 printf("\\>\n");
1252 break;
1253 case MTYPE_IW:
1254 printf("\\B\n");
1255 break;
1256 case MTYPE_EW:
1257 printf("\\y\n");
1258 break;
1259 case MTYPE_BACK:
1260 printf("\\%d\n", match->u.n_span + 1);
1261 break;
1262 }
1263}
1264#endif
1265
1266/*...sfinite state machine:0: */
1267typedef unsigned char ETYPE; // Edge type
1268
1269#define ETYPE_CHAR ((ETYPE) 0) // Can advance if given character
1270#define ETYPE_NCHAR ((ETYPE) 1) // Can advance if not given char.
1271#define ETYPE_STRING ((ETYPE) 2) // Can advance if strings match
1272#define ETYPE_DOT ((ETYPE) 3) // Can advance if any character
1273#define ETYPE_CCLASS ((ETYPE) 4) // Can advance if any in class
1274#define ETYPE_WORD ((ETYPE) 5) // Can advance if word constituent
1275#define ETYPE_NWORD ((ETYPE) 6) // Can advance if not word constit.
1276#define ETYPE_EPSILON ((ETYPE) 7) // Can advance without reading input
1277#define ETYPE_SOL ((ETYPE) 8) // Matches if at start of line
1278#define ETYPE_EOL ((ETYPE) 9) // Matches if at end of line
1279#define ETYPE_SOW ((ETYPE) 10) // Matches if at start of word
1280#define ETYPE_EOW ((ETYPE) 11) // Matches if at end of word
1281#define ETYPE_IW ((ETYPE) 12) // Matches if within word
1282#define ETYPE_EW ((ETYPE) 13) // Matches if at word start or end
1283#define ETYPE_SSUB ((ETYPE) 14) // Records passage over (
1284#define ETYPE_ESUB ((ETYPE) 15) // Records passage over )
1285#define ETYPE_BACK ((ETYPE) 16) // Back reference
1286
1287typedef struct
1288{
1289 ETYPE etype; // Edge type, (an ETYPE_ no)
1290 union
1291 {
1292 char character; // Character to use if ETYPE_CHAR
1293 unsigned char *cclass; // Class to use if ETYPE_CCLASS
1294 char *string; // len, then chars if ETYPE_STRING
1295 int n_span; // Used if ETYPE_BACK
1296 }
1297 u;
1298 const char *gate; // Used if type of epsilon move
1299 int to_state; // State to go to if test succeeds
1300 int next_edge; // Next test to try after this
1301}
1302EDGE;
1303
1304/* Under DOS, restrict FSM size to 13KB approx.. On other environments memory
1305 * isn't such an issue, so allow something closer to 32KB. Of course, a better
1306 * implementation would dynamically grow the FSM size as needed. Something for
1307 * the future perhaps... */
1308
1309#ifdef xxDOS
1310#define MAX_STATES 200
1311#define MAX_EDGES 600
1312#else
1313#define MAX_STATES 500
1314#define MAX_EDGES 1500
1315#endif
1316
1317#define FLAG_FINISH 0x01
1318#define FLAG_VISITED 0x02
1319#define FLAG_REACHABLE 0x04
1320
1321typedef struct
1322{
1323 int n_states;
1324 int state_first_edges[MAX_STATES];
1325 unsigned char state_flags[MAX_STATES];
1326 int n_edges;
1327 EDGE edges[MAX_EDGES];
1328}
1329FSM;
1330
1331/*...screate_fsm:0: */
1332STATIC FSM *create_fsm(int *rc)
1333{
1334 FSM *fsm;
1335 int i;
1336
1337 if ((fsm = (FSM *) malloc(sizeof(FSM))) == NULL)
1338 {
1339 *rc = ERROR_NOT_ENOUGH_MEMORY;
1340 return NULL;
1341 }
1342
1343 fsm->n_states = 0;
1344 fsm->n_edges = 0;
1345
1346 for (i = 0; i < MAX_STATES; i++)
1347 {
1348 fsm->state_first_edges[i] = -1;
1349 fsm->state_flags[i] = 0;
1350 }
1351
1352 return fsm;
1353}
1354/*...sdelete_fsm:0: */
1355#define delete_fsm(fsm) free(fsm)
1356
1357/*...smalloc_state:0: */
1358STATIC int malloc_state(FSM * fsm)
1359{
1360 if (fsm->n_states == MAX_STATES)
1361 return -1;
1362 else
1363 return fsm->n_states++;
1364}
1365/*...smalloc_edge:0: */
1366/* Allocate space for new supplied edge and fill it in.
1367 * If already exists then don't bother (duplicates waste search time).
1368 * Return TRUE if all went ok. */
1369
1370STATIC BOOLEAN malloc_edge(int s, EDGE * edge, FSM * fsm)
1371{
1372 int edge_no, n_edges = fsm->n_edges++;
1373
1374 // See if edge already exists
1375
1376 for (edge_no = fsm->state_first_edges[s];
1377 edge_no != -1;
1378 edge_no = fsm->edges[edge_no].next_edge)
1379 // Do we already have this edge
1380 if (fsm->edges[edge_no].to_state == edge->to_state &&
1381 fsm->edges[edge_no].etype == edge->etype)
1382 // An edge of this type already exists
1383 switch (edge->etype)
1384 {
1385/*...sETYPE_CHAR\47\NCHAR:32: */
1386 case ETYPE_CHAR:
1387 case ETYPE_NCHAR:
1388 if (edge->u.character == fsm->edges[edge_no].u.character)
1389 return TRUE;
1390 break;
1391/*...sETYPE_STRING:32: */
1392 case ETYPE_STRING:
1393 if ((unsigned char)fsm->edges[edge_no].u.string[0] ==
1394 (unsigned char)edge->u.string[0] &&
1395 !memcmp(fsm->edges[edge_no].u.string + 1,
1396 edge->u.string + 1,
1397 (unsigned char)edge->u.string[0]))
1398 return TRUE;
1399 break;
1400/*...sETYPE_CCLASS:32: */
1401 case ETYPE_CCLASS:
1402 if (edge->u.cclass == fsm->edges[edge_no].u.cclass)
1403 return TRUE;
1404 break;
1405/*...sETYPE_DOT\47\EPSILON\47\SOL\47\EOL etc\46\\46\:32: */
1406 case ETYPE_DOT:
1407 case ETYPE_EPSILON:
1408 case ETYPE_SOL:
1409 case ETYPE_EOL:
1410 case ETYPE_SOW:
1411 case ETYPE_EOW:
1412 case ETYPE_IW:
1413 case ETYPE_EW:
1414 case ETYPE_SSUB:
1415 case ETYPE_ESUB:
1416 return TRUE;
1417/*...sETYPE_BACK:32: */
1418 case ETYPE_BACK:
1419 if (edge->u.n_span == fsm->edges[edge_no].u.n_span)
1420 return TRUE;
1421 break;
1422 }
1423
1424 // Going to have to add the edge
1425
1426 if (n_edges >= MAX_EDGES)
1427 return FALSE;
1428
1429 memcpy(&(fsm->edges[n_edges]), edge, sizeof(EDGE));
1430 fsm->edges[n_edges].next_edge = fsm->state_first_edges[s];
1431 fsm->state_first_edges[s] = n_edges;
1432 return TRUE;
1433}
1434/*...smake_fsm_from_match:0: */
1435/*...sadd_edge_to_fsm_character:0: */
1436STATIC BOOLEAN add_edge_to_fsm_character(int s, int f, FSM * fsm, char character)
1437{
1438 EDGE edge;
1439
1440 edge.etype = ETYPE_CHAR;
1441 edge.u.character = character;
1442 edge.to_state = f;
1443 return malloc_edge(s, &edge, fsm);
1444}
1445/*...sadd_edge_to_fsm_ncharacter:0: */
1446STATIC BOOLEAN add_edge_to_fsm_ncharacter(int s, int f, FSM * fsm, char character)
1447{
1448 EDGE edge;
1449
1450 edge.etype = ETYPE_NCHAR;
1451 edge.u.character = character;
1452 edge.to_state = f;
1453 return malloc_edge(s, &edge, fsm);
1454}
1455/*...sadd_edge_to_fsm_string:0: */
1456STATIC BOOLEAN add_edge_to_fsm_string(int s, int f, FSM * fsm, char *string)
1457{
1458 EDGE edge;
1459
1460 edge.etype = ETYPE_STRING;
1461 edge.u.string = string;
1462 edge.to_state = f;
1463 return malloc_edge(s, &edge, fsm);
1464}
1465/*...sadd_edge_to_fsm_cclass:0: */
1466STATIC BOOLEAN add_edge_to_fsm_cclass(int s, int f, FSM * fsm, unsigned char *cclass)
1467{
1468 EDGE edge;
1469
1470 edge.etype = ETYPE_CCLASS;
1471 edge.u.cclass = cclass;
1472 edge.to_state = f;
1473 return malloc_edge(s, &edge, fsm);
1474}
1475/*...sadd_edge_to_fsm_dot:0: */
1476STATIC BOOLEAN add_edge_to_fsm_dot(int s, int f, FSM * fsm)
1477{
1478 EDGE edge;
1479
1480 edge.etype = ETYPE_DOT;
1481 edge.to_state = f;
1482 return malloc_edge(s, &edge, fsm);
1483}
1484/*...sadd_edge_to_fsm_word:0: */
1485STATIC BOOLEAN add_edge_to_fsm_word(int s, int f, FSM * fsm)
1486{
1487 EDGE edge;
1488
1489 edge.etype = ETYPE_WORD;
1490 edge.to_state = f;
1491 return malloc_edge(s, &edge, fsm);
1492}
1493/*...sadd_edge_to_fsm_nword:0: */
1494STATIC BOOLEAN add_edge_to_fsm_nword(int s, int f, FSM * fsm)
1495{
1496 EDGE edge;
1497
1498 edge.etype = ETYPE_NWORD;
1499 edge.to_state = f;
1500 return malloc_edge(s, &edge, fsm);
1501}
1502/*...sadd_edge_to_fsm_epsilon:0: */
1503STATIC BOOLEAN add_edge_to_fsm_epsilon(int s, int f, FSM * fsm)
1504{
1505 EDGE edge;
1506
1507 edge.etype = ETYPE_EPSILON;
1508 edge.to_state = f;
1509 return malloc_edge(s, &edge, fsm);
1510}
1511/*...sadd_edge_to_fsm_special:0: */
1512STATIC BOOLEAN add_edge_to_fsm_special(int s, int f, FSM * fsm, ETYPE etype)
1513{
1514 EDGE edge;
1515
1516 edge.etype = etype;
1517 edge.to_state = f;
1518 edge.gate = NULL;
1519 return malloc_edge(s, &edge, fsm);
1520}
1521/*...sadd_edge_to_fsm_back:0: */
1522STATIC BOOLEAN add_edge_to_fsm_back(int s, int f, FSM * fsm, int n_span)
1523{
1524 EDGE edge;
1525
1526 edge.etype = ETYPE_BACK;
1527 edge.to_state = f;
1528 edge.gate = NULL;
1529 edge.u.n_span = n_span;
1530 return malloc_edge(s, &edge, fsm);
1531}
1532
1533STATIC BOOLEAN make_fsm_from_match(MATCH * match, FSM * fsm, int *s, int *f)
1534{
1535 int n1, n2, n3, n4, i;
1536
1537 switch (match->mtype)
1538 {
1539/*...sMTYPE_NULL:16: */
1540/*
1541 * e
1542 * S ----> F
1543 *
1544 */
1545
1546 case MTYPE_NULL:
1547 return (*s = malloc_state(fsm)) != -1 &&
1548 (*f = malloc_state(fsm)) != -1 &&
1549 add_edge_to_fsm_epsilon(*s, *f, fsm);
1550/*...sMTYPE_CHAR:16: */
1551/*
1552 * c
1553 * S ----> F
1554 *
1555 */
1556
1557 case MTYPE_CHAR:
1558 return (*s = malloc_state(fsm)) != -1 &&
1559 (*f = malloc_state(fsm)) != -1 &&
1560 add_edge_to_fsm_character(*s, *f, fsm, match->u.character);
1561/*...sMTYPE_NCHAR:16: */
1562/*
1563 * ~c
1564 * S ----> F
1565 *
1566 */
1567
1568 case MTYPE_NCHAR:
1569 return (*s = malloc_state(fsm)) != -1 &&
1570 (*f = malloc_state(fsm)) != -1 &&
1571 add_edge_to_fsm_ncharacter(*s, *f, fsm, match->u.character);
1572/*...sMTYPE_STRING:16: */
1573/*
1574 * string
1575 * S ----> F
1576 *
1577 */
1578
1579 case MTYPE_STRING:
1580 return (*s = malloc_state(fsm)) != -1 &&
1581 (*f = malloc_state(fsm)) != -1 &&
1582 add_edge_to_fsm_string(*s, *f, fsm, match->u.string);
1583
1584/*...sMTYPE_CCLASS:16: */
1585/*
1586 * cclass
1587 * S ----> F
1588 *
1589 */
1590
1591 case MTYPE_CCLASS:
1592 return (*s = malloc_state(fsm)) != -1 &&
1593 (*f = malloc_state(fsm)) != -1 &&
1594 add_edge_to_fsm_cclass(*s, *f, fsm, match->u.cclass);
1595
1596/*...sMTYPE_DOT:16: */
1597/*
1598 * any
1599 * S ----> F
1600 *
1601 */
1602
1603 case MTYPE_DOT:
1604 return (*s = malloc_state(fsm)) != -1 &&
1605 (*f = malloc_state(fsm)) != -1 &&
1606 add_edge_to_fsm_dot(*s, *f, fsm);
1607
1608/*...sMTYPE_WORD:16: */
1609/*
1610 * word
1611 * S ----> F
1612 *
1613 */
1614
1615 case MTYPE_WORD:
1616 return (*s = malloc_state(fsm)) != -1 &&
1617 (*f = malloc_state(fsm)) != -1 &&
1618 add_edge_to_fsm_word(*s, *f, fsm);
1619
1620/*...sMTYPE_NWORD:16: */
1621/*
1622 * !word
1623 * S ----> F
1624 *
1625 */
1626
1627 case MTYPE_NWORD:
1628 return (*s = malloc_state(fsm)) != -1 &&
1629 (*f = malloc_state(fsm)) != -1 &&
1630 add_edge_to_fsm_nword(*s, *f, fsm);
1631
1632/*...sMTYPE_QUERY:16: */
1633/*
1634 * e
1635 * S ----> F
1636 * | ^
1637 * | e | e
1638 * v |
1639 * [n1 n2]
1640 *
1641 */
1642
1643 case MTYPE_QUERY:
1644 if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1645 return FALSE;
1646 return (*s = malloc_state(fsm)) != -1 &&
1647 (*f = malloc_state(fsm)) != -1 &&
1648 add_edge_to_fsm_epsilon(*s, *f, fsm) &&
1649 add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1650 add_edge_to_fsm_epsilon(n2, *f, fsm);
1651
1652/*...sMTYPE_PLUS:16: */
1653/*
1654 *
1655 * [S F]
1656 * ^ e |
1657 * +---+
1658 *
1659 */
1660
1661 case MTYPE_PLUS:
1662 if (!make_fsm_from_match(match->u.match, fsm, s, f))
1663 return FALSE;
1664 return add_edge_to_fsm_epsilon(*f, *s, fsm);
1665
1666/*...sMTYPE_STAR:16: */
1667/*
1668 *
1669 * +---- S/F
1670 * | ^
1671 * | e | e
1672 * v |
1673 * [n1 n2]
1674 *
1675 */
1676
1677 case MTYPE_STAR:
1678 if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1679 return FALSE;
1680 return (*s = *f = malloc_state(fsm)) != -1 &&
1681 add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1682 add_edge_to_fsm_epsilon(n2, *f, fsm);
1683
1684/*...sMTYPE_CREP:16: */
1685/*
1686 * e e e e
1687 * S-->[n1 n2]-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,4}
1688 * | | |
1689 * | e | e | e
1690 * v | |
1691 * F<---------+----------+
1692 *
1693 * e
1694 * +----+
1695 * | |
1696 * e e e v |
1697 * S-->[n1 n2]-->[n1 n2]-->[n1 n2] <re>{2,}
1698 * | |
1699 * | e | e
1700 * v |
1701 * F<---------+
1702 */
1703
1704 case MTYPE_CREP:
1705 if ((*s = malloc_state(fsm)) == -1 ||
1706 (*f = malloc_state(fsm)) == -1)
1707 return FALSE;
1708 n3 = *s;
1709 for (i = 0; i < (int)match->u.crep.m; i++)
1710 {
1711 if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1712 return FALSE;
1713 if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1714 return FALSE;
1715 n3 = n2;
1716 }
1717 if (!add_edge_to_fsm_epsilon(n3, *f, fsm))
1718 return FALSE;
1719 if (match->u.crep.n != ~0)
1720 for (; i < (int)match->u.crep.n; i++)
1721 {
1722 if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1723 return FALSE;
1724 if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1725 return FALSE;
1726 if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1727 return FALSE;
1728 n3 = n2;
1729 }
1730 else
1731 {
1732 if (!make_fsm_from_match(match->u.crep.match, fsm, &n1, &n2))
1733 return FALSE;
1734 if (!add_edge_to_fsm_epsilon(n3, n1, fsm))
1735 return FALSE;
1736 if (!add_edge_to_fsm_epsilon(n2, *f, fsm))
1737 return FALSE;
1738 if (!add_edge_to_fsm_epsilon(n2, n1, fsm))
1739 return FALSE;
1740 }
1741 return TRUE;
1742
1743/*...sMTYPE_OR:16: */
1744/*
1745 * e e
1746 * +--->[n1 n2]----+
1747 * | v
1748 * S F
1749 * | e e ^
1750 * +--->[n3 n4]----+
1751 *
1752 */
1753
1754 case MTYPE_OR:
1755 if (!make_fsm_from_match(match->u.matchs[0], fsm, &n1, &n2) ||
1756 !make_fsm_from_match(match->u.matchs[1], fsm, &n3, &n4))
1757 return FALSE;
1758 return (*s = malloc_state(fsm)) != -1 &&
1759 (*f = malloc_state(fsm)) != -1 &&
1760 add_edge_to_fsm_epsilon(*s, n1, fsm) &&
1761 add_edge_to_fsm_epsilon(*s, n3, fsm) &&
1762 add_edge_to_fsm_epsilon(n2, *f, fsm) &&
1763 add_edge_to_fsm_epsilon(n4, *f, fsm);
1764
1765/*...sMTYPE_CAT:16: */
1766/*
1767 * e
1768 * [S n1]---->[n2 F]
1769 *
1770 */
1771
1772 case MTYPE_CAT:
1773 if (!make_fsm_from_match(match->u.matchs[0], fsm, s, &n1) ||
1774 !make_fsm_from_match(match->u.matchs[1], fsm, &n2, f))
1775 return FALSE;
1776 return add_edge_to_fsm_epsilon(n1, n2, fsm);
1777
1778/*...sMTYPE_SUB:16: */
1779/*
1780 * ssub esub
1781 * S---->[n1 n2]---->F
1782 *
1783 */
1784
1785 case MTYPE_SUB:
1786 if (!make_fsm_from_match(match->u.match, fsm, &n1, &n2))
1787 return FALSE;
1788 return (*s = malloc_state(fsm)) != -1 &&
1789 (*f = malloc_state(fsm)) != -1 &&
1790 add_edge_to_fsm_special(*s, n1, fsm, ETYPE_SSUB) &&
1791 add_edge_to_fsm_special(n2, *f, fsm, ETYPE_ESUB);
1792
1793/*...sMTYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW:16: */
1794/*
1795 * special
1796 * S ----> F
1797 *
1798 */
1799
1800 case MTYPE_SOL:
1801 return (*s = malloc_state(fsm)) != -1 &&
1802 (*f = malloc_state(fsm)) != -1 &&
1803 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_SOL);
1804 case MTYPE_EOL:
1805 return (*s = malloc_state(fsm)) != -1 &&
1806 (*f = malloc_state(fsm)) != -1 &&
1807 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_EOL);
1808 case MTYPE_SOW:
1809 return (*s = malloc_state(fsm)) != -1 &&
1810 (*f = malloc_state(fsm)) != -1 &&
1811 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_SOW);
1812 case MTYPE_EOW:
1813 return (*s = malloc_state(fsm)) != -1 &&
1814 (*f = malloc_state(fsm)) != -1 &&
1815 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_EOW);
1816 case MTYPE_IW:
1817 return (*s = malloc_state(fsm)) != -1 &&
1818 (*f = malloc_state(fsm)) != -1 &&
1819 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_IW);
1820 case MTYPE_EW:
1821 return (*s = malloc_state(fsm)) != -1 &&
1822 (*f = malloc_state(fsm)) != -1 &&
1823 add_edge_to_fsm_special(*s, *f, fsm, ETYPE_EW);
1824
1825/*...sMTYPE_BACK:16: */
1826 case MTYPE_BACK:
1827 return (*s = malloc_state(fsm)) != -1 &&
1828 (*f = malloc_state(fsm)) != -1 &&
1829 add_edge_to_fsm_back(*s, *f, fsm, match->u.n_span);
1830
1831 }
1832 return FALSE; // Should never happen
1833}
1834
1835/*...sremove_epsilons:0: */
1836/* The problems with epsilon moves are :-
1837 * 1) They can gang up on you in groups to form loops of states that require
1838 * no input to go all the way around them. Any function attempting a
1839 * recursive search of the FSM will recurse forever.
1840 * 2) They can slow the recognition process by as much as a factor of 2.
1841 * eg:
1842 * a e b e c
1843 * O ----> O ----> O ----> O ----> O ----> O Is slow
1844 *
1845 * a b c
1846 * O ----> O ----> O ----> O Is faster
1847 *
1848 */
1849
1850/*...sfinish_states:0: */
1851/*...sis_finish_reachable:0: */
1852/* When we recurse we mark the current state as visited to stop infinite
1853 * recursion on loops of epsilon moves. */
1854
1855STATIC BOOLEAN is_finish_reachable(FSM * fsm, int state_no)
1856{
1857 int edge_no;
1858 BOOLEAN ok;
1859
1860 if (fsm->state_flags[state_no] & FLAG_VISITED)
1861 // Been here already
1862 return FALSE;
1863
1864 if (fsm->state_flags[state_no] & FLAG_FINISH)
1865 // At finish
1866 return TRUE;
1867
1868 for (edge_no = fsm->state_first_edges[state_no];
1869 edge_no != -1;
1870 edge_no = fsm->edges[edge_no].next_edge)
1871 if (fsm->edges[edge_no].etype == ETYPE_EPSILON)
1872 {
1873 fsm->state_flags[state_no] |= FLAG_VISITED;
1874 ok = is_finish_reachable(fsm, fsm->edges[edge_no].to_state);
1875 fsm->state_flags[state_no] &= ~FLAG_VISITED;
1876 if (ok)
1877 return TRUE;
1878 }
1879
1880
1881 return FALSE;
1882}
1883
1884
1885STATIC void finish_states(int f, FSM * fsm, FSM * fsm_without)
1886{
1887 int state_no;
1888
1889 fsm->state_flags[f] = FLAG_FINISH;
1890 fsm_without->state_flags[f] = FLAG_FINISH;
1891 for (state_no = 0; state_no < fsm->n_states; state_no++)
1892 if (is_finish_reachable(fsm, state_no))
1893 fsm_without->state_flags[state_no] = FLAG_FINISH;
1894}
1895
1896/*...sdetermine_reachable:0: */
1897STATIC void determine_reachable(int s, FSM * fsm, FSM * fsm_without)
1898{
1899 int edge_no, to_state;
1900
1901 fsm_without->state_flags[s] |= FLAG_REACHABLE;
1902 for (edge_no = 0; edge_no < fsm->n_edges; edge_no++)
1903 if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1904 {
1905 to_state = fsm->edges[edge_no].to_state;
1906 fsm_without->state_flags[to_state] |= FLAG_REACHABLE;
1907 }
1908}
1909
1910/*...scopy_non_epsilons:0: */
1911STATIC void copy_non_epsilons(FSM * fsm, FSM * fsm_without)
1912{
1913 int state_no, edge_no;
1914
1915 for (state_no = 0; state_no < fsm->n_states; state_no++)
1916 if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1917 for (edge_no = fsm->state_first_edges[state_no];
1918 edge_no != -1;
1919 edge_no = fsm->edges[edge_no].next_edge)
1920 if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1921 malloc_edge(state_no, &(fsm->edges[edge_no]), fsm_without);
1922}
1923
1924/*...sfollow_epsilons:0: */
1925/*...scopy_edges_reachable:0: */
1926/* What this says is :-
1927 * If state A can reach state B, by an epsilon move, and
1928 * state B can reach state C then
1929 * state A can reach state C *
1930 *
1931 * If the state B to state C involves an epsilon move then
1932 * A can reach whatever C can reach too (by recursion)
1933 */
1934
1935STATIC BOOLEAN copy_edges_reachable(
1936 FSM * fsm,
1937 FSM * fsm_without,
1938 int state_no_to, /* AK: Bad identifier, might better be
1939 * described as 'original reachable state' */
1940 int state_no_from
1941)
1942{
1943 int edge_no;
1944 BOOLEAN ok;
1945
1946 if (fsm->state_flags[state_no_from] & FLAG_VISITED)
1947 // Been here already, therefore all copied from here ok
1948 return TRUE;
1949
1950 for (edge_no = fsm->state_first_edges[state_no_from];
1951 edge_no != -1;
1952 edge_no = fsm->edges[edge_no].next_edge)
1953 if (fsm->edges[edge_no].etype != ETYPE_EPSILON)
1954 // Had better add this edge
1955/*...sadd this edge to the \39\to\39\ state:24: */
1956 {
1957 if (!malloc_edge(state_no_to, &(fsm->edges[edge_no]), fsm_without))
1958 return (FALSE);
1959 }
1960
1961 else
1962 {
1963 fsm->state_flags[state_no_from] |= FLAG_VISITED;
1964 ok = copy_edges_reachable(fsm, fsm_without, state_no_to, fsm->edges[edge_no].to_state);
1965 fsm->state_flags[state_no_from] &= ~FLAG_VISITED;
1966 if (!ok)
1967 return FALSE;
1968 }
1969
1970 return TRUE;
1971}
1972
1973
1974STATIC BOOLEAN follow_epsilons(FSM * fsm, FSM * fsm_without)
1975{
1976 int state_no;
1977
1978 for (state_no = 0; state_no < fsm->n_states; state_no++)
1979 if (fsm_without->state_flags[state_no] & FLAG_REACHABLE)
1980 if (!copy_edges_reachable(fsm, fsm_without, state_no, state_no))
1981 return FALSE;
1982 return TRUE;
1983}
1984
1985
1986STATIC BOOLEAN remove_epsilons(int s, int f, FSM * fsm, FSM * fsm_without)
1987{
1988 // FSM with no epsilon moves will have the same number of states
1989
1990 fsm_without->n_states = fsm->n_states;
1991
1992 // Mark state f as a finish state in the new FSM
1993 // Any state with epsilon move(s) to state f is also a finish state
1994
1995 finish_states(f, fsm, fsm_without);
1996
1997 // Determine which states can be reached by non-epsilon moves. Add
1998 // to this set, the start state. The resulting states should have
1999 // their edges considered, but the other will not be reachable. This
2000 // is because they will be bypassed by follow_epsilons().
2001
2002 determine_reachable(s, fsm, fsm_without);
2003
2004 // Copy across all reachable, non epsilon moves to new FSM
2005
2006 copy_non_epsilons(fsm, fsm_without);
2007
2008 // For all states, determine all other states that can be reached by
2009 // epsilon moves and add the edges leading from them to us
2010
2011 return follow_epsilons(fsm, fsm_without);
2012}
2013
2014/*...smatch_fsm:0: */
2015/* Stack requirements per call, (one call per character in string!).
2016 *
2017 * eg: 16 bit OS/2, large model :-
2018 * Return address + Stack frame + SI and DI + Arguments + Locals
2019 * 2 + 2 + 2+2 + 4+2+4+4 + 2 = 24
2020 *
2021 * eg: 16 bit DOS large model :-
2022 * Return address + Stack frame + SI and DI + Arguments + Locals
2023 * 2 + 2 + 2+2 + 4+2 + 2 = 16
2024 *
2025 * eg: RS/6000 AIX :-
2026 * Massive stack frame per call, massive program stack size
2027 * net effect - room for approx 200 levels before core dump!
2028 *
2029 * In addition, we can consume additional stack for every ETYPE_SSUB
2030 * (and ETYPE_ESUB). Guessing this requirement to be approx 100 bytes,
2031 * we place an arbitrary limit of 20 (ie: 2KB) on subexpressions in an ERE.
2032 * This limit was enforced earlier during match tree parsing.
2033 *
2034 */
2035
2036typedef struct substruct SUBS;
2037struct substruct
2038{
2039 int n_spans;
2040 ERE_SPAN spans[MAX_SPANS];
2041 SUBS *next;
2042};
2043
2044#define MAX_SUBS 20
2045
2046typedef struct
2047{
2048 FSM *fsm;
2049 int eremf;
2050 const char *str_init;
2051 const char *str_limit;
2052 const char *str_best;
2053 SUBS *subs;
2054 SUBS *subs_base;
2055 ERE_MATCHINFO *mi;
2056}
2057CONTEXT;
2058
2059#define NR
2060
2061STATIC void NR walk_fsm(const char *str, int state_no, CONTEXT * cx);
2062
2063STATIC void NR walk_fsm_gated(const char *str, CONTEXT * cx, EDGE * e)
2064{
2065 if (e->gate != str)
2066 // Avoid looping via this edge
2067 {
2068 const char *gate = e->gate;
2069
2070 e->gate = str;
2071 walk_fsm(str, e->to_state, cx);
2072 e->gate = gate;
2073 }
2074}
2075
2076STATIC void NR walk_fsm_ssub(const char *str, CONTEXT * cx, EDGE * e)
2077{
2078 SUBS subs;
2079
2080 if (cx->subs->n_spans < MAX_SPANS)
2081 cx->subs->spans[cx->subs->n_spans].pos = str - cx->str_init;
2082
2083 subs.n_spans = 0;
2084 subs.next = cx->subs;
2085 cx->subs = &subs;
2086 walk_fsm_gated(str, cx, e);
2087 cx->subs = subs.next;
2088}
2089
2090STATIC void NR walk_fsm_esub(const char *str, CONTEXT * cx, EDGE * e)
2091{
2092 SUBS *subs = cx->subs;
2093
2094 cx->subs = cx->subs->next;
2095
2096 if (cx->subs->n_spans < MAX_SPANS)
2097 cx->subs->spans[cx->subs->n_spans].len =
2098 (str - cx->str_init) - cx->subs->spans[cx->subs->n_spans].pos;
2099 ++(cx->subs->n_spans);
2100
2101 walk_fsm_gated(str, cx, e);
2102
2103 --(cx->subs->n_spans);
2104 cx->subs = subs;
2105}
2106
2107STATIC void NR walk_fsm(const char *str, int state_no, CONTEXT * cx)
2108{
2109 int edge_no;
2110
2111 if (cx->fsm->state_flags[state_no] & FLAG_FINISH)
2112 // Got to finishing state, may have got a better match than before
2113 {
2114 if ((cx->str_best == NULL ||
2115 (cx->str_best < str) == (cx->eremf & EREMF_SHORTEST) == 0) &&
2116 str <= cx->str_limit)
2117 {
2118 cx->str_best = str;
2119 if (cx->mi != NULL)
2120 {
2121 int i;
2122
2123 cx->mi->n_spans = cx->subs_base->n_spans;
2124 for (i = 0; i < cx->mi->n_spans; i++)
2125 cx->mi->spans[i] = cx->subs_base->spans[i];
2126 }
2127 }
2128 if (cx->eremf & EREMF_ANY)
2129 return;
2130 // Continue, as may be able to get a better match
2131 }
2132
2133 for (edge_no = cx->fsm->state_first_edges[state_no];
2134 edge_no != -1;
2135 edge_no = cx->fsm->edges[edge_no].next_edge)
2136 // Consider taking a step along an edge to a new state
2137 {
2138 EDGE *e = &(cx->fsm->edges[edge_no]);
2139
2140 switch (e->etype)
2141 {
2142/*...sETYPE_CHAR \45\ if matches character\44\ we can advance:24: */
2143 case ETYPE_CHAR:
2144 if (*str == e->u.character)
2145 walk_fsm(str + 1, e->to_state, cx);
2146 break;
2147
2148/*...sETYPE_NCHAR \45\ if not matches character\44\ we can advance:24: */
2149 case ETYPE_NCHAR:
2150 if (*str != '\0' && *str != e->u.character)
2151 walk_fsm(str + 1, e->to_state, cx);
2152 break;
2153
2154/*...sETYPE_STRING \45\ if matches string\44\ we can advance:24: */
2155 case ETYPE_STRING:
2156 {
2157 unsigned len = (unsigned char)e->u.string[0];
2158
2159 if (!memcmp(str, e->u.string + 1, len))
2160 walk_fsm(str + len, e->to_state, cx);
2161 }
2162 break;
2163
2164/*...sETYPE_DOT \45\ if got any character\44\ we can advance:24: */
2165 case ETYPE_DOT:
2166 if (*str != '\0')
2167 walk_fsm(str + 1, e->to_state, cx);
2168 break;
2169
2170/*...sETYPE_WORD \45\ if got word constituent\44\ we can advance:24: */
2171 case ETYPE_WORD:
2172 if (*str != '\0' && isword(*str))
2173 walk_fsm(str + 1, e->to_state, cx);
2174 break;
2175
2176/*...sETYPE_NWORD \45\ if got non word constituent\44\ we can advance:24: */
2177 case ETYPE_NWORD:
2178 if (*str != '\0' && !isword(*str))
2179 walk_fsm(str + 1, e->to_state, cx);
2180 break;
2181
2182/*...sETYPE_CCLASS \45\ if in the class\44\ we can advance:24: */
2183 case ETYPE_CCLASS:
2184 if (match_cclass(*str, e->u.cclass))
2185 walk_fsm(str + 1, e->to_state, cx);
2186 break;
2187
2188/*...sETYPE_SOL\47\EOL\47\SOW\47\EOW\47\IW\47\EW \45\ special epsilon moves:24: */
2189 case ETYPE_SOL:
2190 if (str == cx->str_init)
2191 walk_fsm_gated(str, cx, e);
2192 break;
2193 case ETYPE_EOL:
2194 if (*str == '\0')
2195 walk_fsm_gated(str, cx, e);
2196 break;
2197 case ETYPE_SOW:
2198 if (isword(str[0]) &&
2199 ((cx->str_init < str && !isword(str[-1])) ||
2200 (cx->str_init == str)))
2201 walk_fsm_gated(str, cx, e);
2202 break;
2203 case ETYPE_EOW:
2204 if ((cx->str_init < str && isword(str[-1])) &&
2205 (str[0] == '\0' || !isword(str[0])))
2206 walk_fsm_gated(str, cx, e);
2207 break;
2208 case ETYPE_IW:
2209 if (cx->str_init < str && isword(str[-1]) &&
2210 str[0] != '\0' && isword(str[0]))
2211 walk_fsm_gated(str, cx, e);
2212 break;
2213 case ETYPE_EW:
2214 if (isword(str[0]) &&
2215 ((cx->str_init < str && !isword(str[-1])) ||
2216 (cx->str_init == str)))
2217 walk_fsm_gated(str, cx, e);
2218 else if ((cx->str_init < str && isword(str[-1])) &&
2219 (str[0] == '\0' || !isword(str[0])))
2220 walk_fsm_gated(str, cx, e);
2221 break;
2222
2223/*...sETYPE_SSUB\47\ESUB \45\ handle nested subexpression:24: */
2224 case ETYPE_SSUB:
2225 walk_fsm_ssub(str, cx, e);
2226 break;
2227 case ETYPE_ESUB:
2228 walk_fsm_esub(str, cx, e);
2229 break;
2230
2231/*...sETYPE_BACK \45\ check backreference:24: */
2232 case ETYPE_BACK:
2233 if (e->u.n_span < cx->subs->n_spans)
2234 {
2235 int len = cx->subs->spans[e->u.n_span].len;
2236
2237 if (!memcmp(str, cx->str_init + cx->subs->spans[e->u.n_span].pos, len))
2238 walk_fsm_gated(str + len, cx, e);
2239 }
2240 break;
2241
2242 }
2243 }
2244}
2245
2246STATIC const char *match_fsm(FSM * fsm,
2247 int eremf,
2248 const char *str,
2249 int posn,
2250 int limit,
2251 int state_no,
2252 ERE_MATCHINFO * mi)
2253{
2254 CONTEXT cx;
2255 SUBS subs;
2256
2257 cx.fsm = fsm;
2258 cx.eremf = eremf;
2259 cx.str_init = str;
2260 cx.str_limit = str + limit;
2261 cx.str_best = NULL;
2262 cx.subs = &subs;
2263 cx.subs_base = &subs;
2264 cx.mi = mi;
2265 subs.n_spans = 0;
2266 walk_fsm(str + posn, state_no, &cx);
2267 return cx.str_best;
2268}
2269
2270
2271#ifdef DEBUG
2272/*...sprint_fsm:0: */
2273STATIC void print_fsm(FSM * fsm, int s, BOOLEAN not_just_reachable)
2274{
2275 int state_no, edge_no;
2276
2277 printf("Starting state %02d\n", s);
2278 for (state_no = 0; state_no < fsm->n_states; state_no++)
2279 if ((fsm->state_flags[state_no] & FLAG_REACHABLE) != 0 ||
2280 not_just_reachable)
2281 {
2282 printf("%02d:%c\t", state_no, ((fsm->state_flags[state_no] & FLAG_FINISH) != 0) ? 'F' : ' ');
2283 for (edge_no = fsm->state_first_edges[state_no];
2284 edge_no != -1;
2285 edge_no = fsm->edges[edge_no].next_edge)
2286/*...sshow edge:32: */
2287 {
2288 EDGE *e = &(fsm->edges[edge_no]);
2289
2290 switch (e->etype)
2291 {
2292 case ETYPE_CHAR:
2293 printf("%c", e->u.character);
2294 break;
2295 case ETYPE_NCHAR:
2296 printf("~%c", e->u.character);
2297 break;
2298 case ETYPE_STRING:
2299 printf("%*.*s",
2300 (unsigned char)e->u.string[0],
2301 (unsigned char)e->u.string[0],
2302 e->u.string + 1);
2303 break;
2304 case ETYPE_DOT:
2305 printf(".");
2306 break;
2307 case ETYPE_CCLASS:
2308 printf("[");
2309 break;
2310 case ETYPE_WORD:
2311 printf("\\w");
2312 break;
2313 case ETYPE_NWORD:
2314 printf("\\W");
2315 break;
2316 case ETYPE_EPSILON:
2317 printf("e");
2318 break;
2319 case ETYPE_SOL:
2320 printf("^");
2321 break;
2322 case ETYPE_EOL:
2323 printf("$");
2324 break;
2325 case ETYPE_SOW:
2326 printf("\\<");
2327 break;
2328 case ETYPE_EOW:
2329 printf("\\>");
2330 break;
2331 case ETYPE_IW:
2332 printf("\\B");
2333 break;
2334 case ETYPE_EW:
2335 printf("\\y");
2336 break;
2337 case ETYPE_SSUB:
2338 printf("(");
2339 break;
2340 case ETYPE_ESUB:
2341 printf(")");
2342 break;
2343 case ETYPE_BACK:
2344 printf("\\%d", e->u.n_span + 1);
2345 break;
2346 }
2347 printf("->%02d\t", e->to_state);
2348 }
2349
2350 printf("\n");
2351 }
2352}
2353
2354#endif
2355
2356/*...sextended regular expressions:0: */
2357/* An ERE knows its original match tree and also the FSM it is compiled into.
2358 * Using a (largely epsilon move free) FSM makes for faster searching. */
2359
2360typedef struct
2361{
2362 MATCH *match; // Parse tree for expression
2363 int shortest_match; // Shortest match possible
2364 FSM *fsm; // Compiled FSM
2365 int s; // Start state for FSM
2366}
2367ERE;
2368
2369/*
2370 *@@ rxpCompile:
2371 * compiles the regular expression str for later matching.
2372 *
2373 * If ERECF_TOLOWER is passed with erecf, every character
2374 * (or range of characters) to be matched are stored in the
2375 * compiled ERE in lower case. Therefore, if strings to be
2376 * matched are passed in lower case also, the result is a
2377 * case-insensitive match.
2378 */
2379
2380ERE* rxpCompile(const char *str,
2381 int erecf,
2382 int *rc) // out: error code
2383{
2384 ERE *ere;
2385 const char *str_after;
2386 FSM *fsm;
2387 int s, f;
2388
2389 *rc = NO_ERROR;
2390
2391 if ((ere = (ERE *) malloc(sizeof(ERE))) == NULL)
2392 {
2393 *rc = ERROR_NOT_ENOUGH_MEMORY;
2394 return NULL;
2395 }
2396
2397 if ((ere->match = compile_match(str, &str_after, erecf, rc)) == NULL)
2398 {
2399 free(ere);
2400 return NULL;
2401 }
2402
2403 if (thisch(str_after) == CH_RPAR)
2404 {
2405 delete_match(ere->match);
2406 free(ere);
2407 *rc = EREE_UNEX_RPAR;
2408 return NULL;
2409 }
2410
2411 if (count_sub(ere->match) > MAX_SUBS)
2412 {
2413 delete_match(ere->match);
2414 free(ere);
2415 *rc = EREE_TOO_MANY_SUB;
2416 return NULL;
2417 }
2418
2419#ifdef DEBUG
2420 print_tree(ere->match, 0);
2421#endif
2422
2423 ere->shortest_match = (int)shortest_match(ere->match);
2424
2425 if ((fsm = create_fsm(rc)) == NULL)
2426 {
2427 delete_match(ere->match);
2428 free(ere);
2429 return NULL;
2430 }
2431
2432 if (!make_fsm_from_match(ere->match, fsm, &s, &f))
2433 {
2434 delete_fsm(fsm);
2435 delete_match(ere->match);
2436 free(ere);
2437 *rc = EREE_COMPILE_FSM;
2438 return NULL;
2439 }
2440
2441#ifdef DEBUG
2442 print_fsm(fsm, s, TRUE);
2443#endif
2444
2445 if ((ere->fsm = create_fsm(rc)) == NULL)
2446 {
2447 delete_fsm(fsm);
2448 delete_match(ere->match);
2449 free(ere);
2450 *rc = EREE_COMPILE_FSM;
2451 return NULL;
2452 }
2453
2454 if (!remove_epsilons(s, f, fsm, ere->fsm))
2455 {
2456 delete_fsm(ere->fsm);
2457 delete_fsm(fsm);
2458 delete_match(ere->match);
2459 free(ere);
2460 *rc = EREE_COMPILE_FSM;
2461 return NULL;
2462 }
2463
2464 delete_fsm(fsm);
2465
2466 ere->s = s;
2467
2468#ifdef DEBUG
2469 print_fsm(ere->fsm, s, FALSE);
2470#endif
2471
2472 return ere;
2473}
2474
2475/*
2476 *@@ rxpMinLen:
2477 *
2478 */
2479
2480int rxpMinLen(const ERE * ere)
2481{
2482 return ere->shortest_match;
2483}
2484
2485/*
2486 *@@ rxpMatch:
2487 * returns the number of characters in the match, starting from
2488 * pos characters into the string to be searched. Details of
2489 * sub-matches can also be returend. Returns -1 if no match.
2490 *
2491 * If EREMF_SHORTEST is passed with eremf, the code looks for
2492 * the shortest match, instead of the longest match.
2493 *
2494 * If EREMF_ANY is passed with eremf, the code doesn't try to
2495 * find the longest (or shortest) match, it will return with the
2496 * first match it finds (which could be of any length).
2497 * This can speed up matching.
2498 */
2499
2500int rxpMatch(const ERE * ere,
2501 int eremf,
2502 const char *str,
2503 int pos,
2504 ERE_MATCHINFO * mi)
2505{
2506 int len = pos + strlen(str + pos);
2507 const char *str_best;
2508
2509 if ((str_best = match_fsm(ere->fsm, eremf, str, pos, len, ere->s, mi)) == NULL)
2510 return -1;
2511 return (str_best - str) - pos;
2512}
2513
2514/*
2515 *@@ rxpMatch_fwd:
2516 * match forwards within a string from a specified start position.
2517 *
2518 * If a match, return TRUE, and also return pos and len of the match.
2519 *
2520 * If EREMF_SHORTEST is passed with eremf, the code looks for
2521 * the shortest match, instead of the longest match.
2522 *
2523 * If EREMF_ANY is passed with eremf, the code doesn't try to
2524 * find the longest (or shortest) match, it will return with the
2525 * first match it finds (which could be of any length).
2526 * This can speed up matching.
2527 */
2528
2529BOOLEAN rxpMatch_fwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2530 int eremf, // in: EREMF_* flags
2531 const char *str, // in: string to test
2532 int pos, // in: start position
2533 int *pos_match, // out: position of match
2534 int *len_match, // out: length of match
2535 ERE_MATCHINFO *mi) // out: match info (for rxpSubsWith)
2536{
2537 int len = pos + strlen(str + pos);
2538 int i;
2539
2540 for (i = pos; i <= len - ere->shortest_match; i++)
2541 {
2542 const char *str_best;
2543
2544 if ((str_best = match_fsm(ere->fsm, eremf, str, i, len, ere->s, mi)) != NULL)
2545 {
2546 *pos_match = i;
2547 *len_match = (str_best - str) - i;
2548 return TRUE;
2549 }
2550 }
2551 return FALSE;
2552}
2553
2554/*
2555 *@@ rxpMatch_bwd:
2556 * match backwards within a string not passing a
2557 * specified end position.
2558 *
2559 * If a match, return TRUE, and also return pos and
2560 * len of the match. We need to consider matches from
2561 * the beginning of the line. We want the one which
2562 * ends up in the rightmost position. Of those which
2563 * end up equally far right, we want the one which
2564 * extends the furthest (or shortest if EREMF_SHORTEST)
2565 * left. See how we get this as a side effect of the
2566 * loop ordering and the '>= + delta' test.
2567 *
2568 * This may not look as efficient as scanning the string
2569 * backwards, but note that this would require a reversed
2570 * ERE too, and we can't reverse EREs as they may
2571 * contain backreferences.
2572 *
2573 * If EREMF_SHORTEST is passed with eremf, the code looks for
2574 * the shortest match, instead of the longest match.
2575 *
2576 * If EREMF_ANY is passed with eremf, the code doesn't try to
2577 * find the longest (or shortest) match, it will return with the
2578 * first match it finds (which could be of any length).
2579 * This can speed up matching.
2580 */
2581
2582BOOLEAN rxpMatch_bwd(const ERE *ere, // in: compiled ERE (from rxpCompile)
2583 int eremf, // in: EREMF_* flags
2584 const char *str, // in: string to test
2585 int pos, // in: start position
2586 int *pos_match, // out: position of match
2587 int *len_match, // out: length of match
2588 ERE_MATCHINFO * mi) // out: match info (for rxpSubsWith)
2589{
2590 int i;
2591 int delta = (eremf & EREMF_SHORTEST) ? 0 : 1;
2592 const char *rightmost = NULL;
2593 ERE_MATCHINFO mi2;
2594
2595 for (i = 0; i <= pos - ere->shortest_match; i++)
2596 {
2597 const char *str_best;
2598
2599 if ((str_best = match_fsm(ere->fsm, eremf, str, i, pos, ere->s, &mi2)) != NULL)
2600 {
2601 if (rightmost == NULL ||
2602 str_best >= rightmost + delta)
2603 {
2604 *pos_match = i;
2605 *len_match = (str_best - str) - i;
2606 rightmost = str_best;
2607 if (mi != NULL)
2608 {
2609 mi->n_spans = mi2.n_spans;
2610 for (i = 0; i < mi->n_spans; i++)
2611 mi->spans[i] = mi2.spans[i];
2612 }
2613 }
2614 }
2615 }
2616 return rightmost != NULL;
2617}
2618
2619/*
2620 *@@ rxpFree:
2621 * frees all resources allocated by rxpCompile.
2622 */
2623
2624void rxpFree(ERE * ere)
2625{
2626 if (ere)
2627 {
2628 delete_match(ere->match);
2629 delete_fsm(ere->fsm);
2630 free(ere);
2631 }
2632}
2633
2634/*
2635 *@@ rxpSubsWith:
2636 * perform a substitution based upon an earlier found match.
2637 * This allows for implementing a "find and replace" function.
2638 */
2639
2640BOOLEAN rxpSubsWith(const char *str, // in: original string searched (same as str given to rxpMatch_fwd)
2641 int pos, // in: span of the entire match (pos_match from rxpMatch_fwd)
2642 int len, // in: span of the entire match (len_match from rxpMatch_fwd)
2643 ERE_MATCHINFO *mi, // in: details of match sub-spans (as from rxpMatch_fwd)
2644 const char *with, // in: replacement string with \1 etc.
2645 char *out, // out: buffer for string substitutions
2646 int len_out, // in: sizeof *out
2647 int *rc) // out: error, if FALSE returned
2648{
2649 int i = 0;
2650 int j;
2651
2652 memcpy(out, str, pos);
2653 i += pos;
2654 while (*with != '\0')
2655 {
2656 const char *rep;
2657 int len_rep;
2658
2659 if (*with != '\\')
2660 {
2661 rep = with++;
2662 len_rep = 1;
2663 }
2664 else
2665 {
2666 ++with;
2667 if (*with >= '1' && *with <= '9')
2668 {
2669 int span = *with - '1';
2670
2671 ++with;
2672 if (span >= mi->n_spans)
2673 {
2674 *rc = EREE_BAD_BACKREF;
2675 return FALSE;
2676 }
2677 rep = str + mi->spans[span].pos;
2678 len_rep = mi->spans[span].len;
2679 }
2680 else if (*with != '\0')
2681 {
2682 rep = with++;
2683 len_rep = 1;
2684 }
2685 else
2686 {
2687 *rc = EREE_BAD_BACKSLASH;
2688 return FALSE;
2689 }
2690 }
2691 if (i + len_rep > len_out)
2692 {
2693 *rc = EREE_SUBS_LEN;
2694 return FALSE;
2695 }
2696 memcpy(out + i, rep, len_rep);
2697 i += len_rep;
2698 }
2699 j = pos + len + strlen(str + pos + len);
2700 if (i + j > len_out)
2701 {
2702 *rc = EREE_SUBS_LEN;
2703 return FALSE;
2704 }
2705 memcpy(out + i, str + pos + len, j);
2706 i += j;
2707 out[i] = '\0';
2708 return TRUE;
2709}
2710
2711#ifdef __TESTCASE__
2712
2713int main(int argc, char *argv[])
2714{
2715 ERE *ere;
2716 ERE_MATCHINFO mi;
2717 int rc;
2718
2719 const char *pcsz, *pcszEre;
2720
2721 if (argc != 3)
2722 {
2723 printf("Usage: regexp <teststring> <ere>\n");
2724 exit(1);
2725 }
2726
2727 pcsz = argv[1];
2728 pcszEre = argv[2];
2729
2730 printf("matching \"%s\" against \"%s\"\n",
2731 pcsz,
2732 pcszEre);
2733 fflush(stdout);
2734
2735 if (!(ere = rxpCompile(pcszEre,
2736 0,
2737 &rc)))
2738 {
2739 printf("Error %d in rxpCompile: %s\n", rc, rxpError(rc));
2740 exit(rc);
2741 }
2742
2743 {
2744 int pos, length;
2745 rc = rxpMatch_fwd(ere,
2746 0,
2747 pcsz,
2748 0,
2749 &pos,
2750 &length,
2751 &mi);
2752
2753 if (rc == 0)
2754 printf("no match\n");
2755 else
2756 printf("found at pos %d, length %d\n", pos, length);
2757 }
2758
2759 return 0;
2760}
2761
2762#endif
Note: See TracBrowser for help on using the repository browser.