source: trunk/essentials/sys-apps/gawk/dfa.c@ 3846

Last change on this file since 3846 was 3076, checked in by bird, 19 years ago

gawk 3.1.5

File size: 98.9 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright 1988, 1998, 2000, 2002, 2004, 2005 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
17
18/* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
20
21#ifdef HAVE_CONFIG_H
22#include <config.h>
23#endif
24
25#include <assert.h>
26#include <ctype.h>
27#include <stdio.h>
28
29#ifndef VMS
30#include <sys/types.h>
31#else
32#include <stddef.h>
33#endif
34#ifdef STDC_HEADERS
35#include <stdlib.h>
36#else
37extern char *calloc(), *malloc(), *realloc();
38extern void free();
39#endif
40
41#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
42#include <string.h>
43#else
44#include <strings.h>
45#endif
46
47#if HAVE_SETLOCALE
48# include <locale.h>
49#endif
50
51
52#ifndef DEBUG /* use the same approach as regex.c */
53#undef assert
54#define assert(e)
55#endif /* DEBUG */
56
57#ifndef isgraph
58#define isgraph(C) (isprint(C) && !isspace(C))
59#endif
60
61#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
62#define ISALPHA(C) isalpha(C)
63#define ISUPPER(C) isupper(C)
64#define ISLOWER(C) islower(C)
65#define ISDIGIT(C) isdigit(C)
66#define ISXDIGIT(C) isxdigit(C)
67#define ISSPACE(C) isspace(C)
68#define ISPUNCT(C) ispunct(C)
69#define ISALNUM(C) isalnum(C)
70#define ISPRINT(C) isprint(C)
71#define ISGRAPH(C) isgraph(C)
72#define ISCNTRL(C) iscntrl(C)
73#else
74#define ISALPHA(C) (isascii(C) && isalpha(C))
75#define ISUPPER(C) (isascii(C) && isupper(C))
76#define ISLOWER(C) (isascii(C) && islower(C))
77#define ISDIGIT(C) (isascii(C) && isdigit(C))
78#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
79#define ISSPACE(C) (isascii(C) && isspace(C))
80#define ISPUNCT(C) (isascii(C) && ispunct(C))
81#define ISALNUM(C) (isascii(C) && isalnum(C))
82#define ISPRINT(C) (isascii(C) && isprint(C))
83#define ISGRAPH(C) (isascii(C) && isgraph(C))
84#define ISCNTRL(C) (isascii(C) && iscntrl(C))
85#endif
86
87/* ISASCIIDIGIT differs from ISDIGIT, as follows:
88 - Its arg may be any int or unsigned int; it need not be an unsigned char.
89 - It's guaranteed to evaluate its argument exactly once.
90 - It's typically faster.
91 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
92 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
93 it's important to use the locale's definition of `digit' even when the
94 host does not conform to Posix. */
95#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
96
97/* gettext.h ensures that we don't use gettext if ENABLE_NLS is not defined */
98#include "gettext.h"
99#define _(str) gettext (str)
100
101#ifndef NO_MBSUPPORT
102#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
103#endif
104#ifdef MBS_SUPPORT
105/* We can handle multibyte strings. */
106# include <wchar.h>
107# include <wctype.h>
108#endif
109
110#include "regex.h"
111#include "dfa.h"
112#include "hard-locale.h"
113
114/* HPUX, define those as macros in sys/param.h */
115#ifdef setbit
116# undef setbit
117#endif
118#ifdef clrbit
119# undef clrbit
120#endif
121
122static void dfamust PARAMS ((struct dfa *dfa));
123
124static ptr_t xcalloc PARAMS ((size_t n, size_t s));
125static ptr_t xmalloc PARAMS ((size_t n));
126static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
127#ifdef DEBUG
128static void prtok PARAMS ((token t));
129#endif
130static int tstbit PARAMS ((unsigned b, charclass c));
131static void setbit PARAMS ((unsigned b, charclass c));
132static void clrbit PARAMS ((unsigned b, charclass c));
133static void copyset PARAMS ((charclass src, charclass dst));
134static void zeroset PARAMS ((charclass s));
135static void notset PARAMS ((charclass s));
136static int equal PARAMS ((charclass s1, charclass s2));
137static int charclass_index PARAMS ((charclass s));
138static int looking_at PARAMS ((const char *s));
139static token lex PARAMS ((void));
140static void addtok PARAMS ((token t));
141static void atom PARAMS ((void));
142static int nsubtoks PARAMS ((int tindex));
143static void copytoks PARAMS ((int tindex, int ntokens));
144static void closure PARAMS ((void));
145static void branch PARAMS ((void));
146static void regexp PARAMS ((int toplevel));
147static void copy PARAMS ((position_set const *src, position_set *dst));
148static void insert PARAMS ((position p, position_set *s));
149static void merge PARAMS ((position_set const *s1, position_set const *s2, position_set *m));
150static void delete PARAMS ((position p, position_set *s));
151static int state_index PARAMS ((struct dfa *d, position_set const *s,
152 int newline, int letter));
153static void build_state PARAMS ((int s, struct dfa *d));
154static void build_state_zero PARAMS ((struct dfa *d));
155static char *icatalloc PARAMS ((char *old, char *new));
156static char *icpyalloc PARAMS ((char *string));
157static char *istrstr PARAMS ((char *lookin, char *lookfor));
158static void ifree PARAMS ((char *cp));
159static void freelist PARAMS ((char **cpp));
160static char **enlist PARAMS ((char **cpp, char *new, size_t len));
161static char **comsubs PARAMS ((char *left, char *right));
162static char **addlists PARAMS ((char **old, char **new));
163static char **inboth PARAMS ((char **left, char **right));
164
165static ptr_t
166xcalloc (size_t n, size_t s)
167{
168 ptr_t r = calloc(n, s);
169
170 if (!r)
171 dfaerror(_("Memory exhausted"));
172 return r;
173}
174
175static ptr_t
176xmalloc (size_t n)
177{
178 ptr_t r = malloc(n);
179
180 assert(n != 0);
181 if (!r)
182 dfaerror(_("Memory exhausted"));
183 return r;
184}
185
186static ptr_t
187xrealloc (ptr_t p, size_t n)
188{
189 ptr_t r = realloc(p, n);
190
191 assert(n != 0);
192 if (!r)
193 dfaerror(_("Memory exhausted"));
194 return r;
195}
196
197#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
198#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
199#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
200
201/* Reallocate an array of type t if nalloc is too small for index. */
202#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
203 if ((index) >= (nalloc)) \
204 { \
205 do \
206 (nalloc) *= 2; \
207 while ((index) >= (nalloc)); \
208 REALLOC(p, t, nalloc); \
209 }
210
211#ifdef DEBUG
212
213static void
214prtok (token t)
215{
216 char const *s;
217
218 if (t < 0)
219 fprintf(stderr, "END");
220 else if (t < NOTCHAR)
221 fprintf(stderr, "%c", t);
222 else
223 {
224 switch (t)
225 {
226 case EMPTY: s = "EMPTY"; break;
227 case BACKREF: s = "BACKREF"; break;
228 case BEGLINE: s = "BEGLINE"; break;
229 case ENDLINE: s = "ENDLINE"; break;
230 case BEGWORD: s = "BEGWORD"; break;
231 case ENDWORD: s = "ENDWORD"; break;
232 case LIMWORD: s = "LIMWORD"; break;
233 case NOTLIMWORD: s = "NOTLIMWORD"; break;
234 case QMARK: s = "QMARK"; break;
235 case STAR: s = "STAR"; break;
236 case PLUS: s = "PLUS"; break;
237 case CAT: s = "CAT"; break;
238 case OR: s = "OR"; break;
239 case ORTOP: s = "ORTOP"; break;
240 case LPAREN: s = "LPAREN"; break;
241 case RPAREN: s = "RPAREN"; break;
242 case CRANGE: s = "CRANGE"; break;
243#ifdef MBS_SUPPORT
244 case ANYCHAR: s = "ANYCHAR"; break;
245 case MBCSET: s = "MBCSET"; break;
246#endif /* MBS_SUPPORT */
247 default: s = "CSET"; break;
248 }
249 fprintf(stderr, "%s", s);
250 }
251}
252#endif /* DEBUG */
253
254/* Stuff pertaining to charclasses. */
255
256static int
257tstbit (unsigned b, charclass c)
258{
259 return c[b / INTBITS] & 1 << b % INTBITS;
260}
261
262static void
263setbit (unsigned b, charclass c)
264{
265 c[b / INTBITS] |= 1 << b % INTBITS;
266}
267
268static void
269clrbit (unsigned b, charclass c)
270{
271 c[b / INTBITS] &= ~(1 << b % INTBITS);
272}
273
274static void
275copyset (charclass src, charclass dst)
276{
277 memcpy (dst, src, sizeof (charclass));
278}
279
280static void
281zeroset (charclass s)
282{
283 memset (s, 0, sizeof (charclass));
284}
285
286static void
287notset (charclass s)
288{
289 int i;
290
291 for (i = 0; i < CHARCLASS_INTS; ++i)
292 s[i] = ~s[i];
293}
294
295static int
296equal (charclass s1, charclass s2)
297{
298 return memcmp (s1, s2, sizeof (charclass)) == 0;
299}
300
301/* A pointer to the current dfa is kept here during parsing. */
302static struct dfa *dfa;
303
304/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
305static int
306charclass_index (charclass s)
307{
308 int i;
309
310 for (i = 0; i < dfa->cindex; ++i)
311 if (equal(s, dfa->charclasses[i]))
312 return i;
313 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
314 ++dfa->cindex;
315 copyset(s, dfa->charclasses[i]);
316 return i;
317}
318
319/* Syntax bits controlling the behavior of the lexical analyzer. */
320static reg_syntax_t syntax_bits, syntax_bits_set;
321
322/* Flag for case-folding letters into sets. */
323static int case_fold;
324
325/* End-of-line byte in data. */
326static unsigned char eolbyte;
327
328/* Entry point to set syntax options. */
329void
330dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
331{
332 syntax_bits_set = 1;
333 syntax_bits = bits;
334 case_fold = fold;
335 eolbyte = eol;
336}
337
338/* Like setbit, but if case is folded, set both cases of a letter. */
339static void
340setbit_case_fold (unsigned b, charclass c)
341{
342 setbit (b, c);
343 if (case_fold)
344 {
345 if (ISUPPER (b))
346 setbit (tolower (b), c);
347 else if (ISLOWER (b))
348 setbit (toupper (b), c);
349 }
350}
351
352/* Lexical analyzer. All the dross that deals with the obnoxious
353 GNU Regex syntax bits is located here. The poor, suffering
354 reader is referred to the GNU Regex documentation for the
355 meaning of the @#%!@#%^!@ syntax bits. */
356
357static char const *lexptr; /* Pointer to next input character. */
358static int lexleft; /* Number of characters remaining. */
359static token lasttok; /* Previous token returned; initially END. */
360static int laststart; /* True if we're separated from beginning or (, |
361 only by zero-width characters. */
362static int parens; /* Count of outstanding left parens. */
363static int minrep, maxrep; /* Repeat counts for {m,n}. */
364static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
365
366#ifdef MBS_SUPPORT
367/* These variables are used only if (MB_CUR_MAX > 1). */
368static mbstate_t mbs; /* Mbstate for mbrlen(). */
369static int cur_mb_len; /* Byte length of the current scanning
370 multibyte character. */
371static int cur_mb_index; /* Byte index of the current scanning multibyte
372 character.
373
374 single byte character : cur_mb_index = 0
375 multibyte character
376 1st byte : cur_mb_index = 1
377 2nd byte : cur_mb_index = 2
378 ...
379 nth byte : cur_mb_index = n */
380static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
381 Each element store the amount of remain
382 byte of corresponding multibyte character
383 in the input string. A element's value
384 is 0 if corresponding character is a
385 single byte chracter.
386 e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>
387 mblen_buf : 0, 3, 2, 1
388 */
389static wchar_t *inputwcs; /* Wide character representation of input
390 string in dfaexec().
391 The length of this array is same as
392 the length of input string(char array).
393 inputstring[i] is a single-byte char,
394 or 1st byte of a multibyte char.
395 And inputwcs[i] is the codepoint. */
396static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
397static unsigned char const *buf_end; /* reference to end in dfaexec(). */
398#endif /* MBS_SUPPORT */
399
400#ifdef MBS_SUPPORT
401/* This function update cur_mb_len, and cur_mb_index.
402 p points current lexptr, len is the remaining buffer length. */
403static void
404update_mb_len_index (unsigned char const *p, int len)
405{
406 /* If last character is a part of a multibyte character,
407 we update cur_mb_index. */
408 if (cur_mb_index)
409 cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
410 : cur_mb_index + 1;
411
412 /* If last character is a single byte character, or the
413 last portion of a multibyte character, we check whether
414 next character is a multibyte character or not. */
415 if (! cur_mb_index)
416 {
417 cur_mb_len = mbrlen(p, len, &mbs);
418 if (cur_mb_len > 1)
419 /* It is a multibyte character.
420 cur_mb_len was already set by mbrlen(). */
421 cur_mb_index = 1;
422 else if (cur_mb_len < 1)
423 /* Invalid sequence. We treat it as a single byte character.
424 cur_mb_index is aleady 0. */
425 cur_mb_len = 1;
426 /* Otherwise, cur_mb_len == 1, it is a single byte character.
427 cur_mb_index is aleady 0. */
428 }
429}
430#endif /* MBS_SUPPORT */
431
432#ifdef MBS_SUPPORT
433/* Note that characters become unsigned here. */
434# define FETCH(c, eoferr) \
435 { \
436 if (! lexleft) \
437 { \
438 if (eoferr != 0) \
439 dfaerror (eoferr); \
440 else \
441 return lasttok = END; \
442 } \
443 if (MB_CUR_MAX > 1) \
444 update_mb_len_index(lexptr, lexleft); \
445 (c) = (unsigned char) *lexptr++; \
446 --lexleft; \
447 }
448
449/* This function fetch a wide character, and update cur_mb_len,
450 used only if the current locale is a multibyte environment. */
451static wint_t
452fetch_wc (char const *eoferr)
453{
454 wchar_t wc;
455 if (! lexleft)
456 {
457 if (eoferr != 0)
458 dfaerror (eoferr);
459 else
460 return WEOF;
461 }
462
463 cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
464 if (cur_mb_len <= 0)
465 {
466 cur_mb_len = 1;
467 wc = *lexptr;
468 }
469 lexptr += cur_mb_len;
470 lexleft -= cur_mb_len;
471 return wc;
472}
473#else
474/* Note that characters become unsigned here. */
475# define FETCH(c, eoferr) \
476 { \
477 if (! lexleft) \
478 { \
479 if (eoferr != 0) \
480 dfaerror (eoferr); \
481 else \
482 return lasttok = END; \
483 } \
484 (c) = (unsigned char) *lexptr++; \
485 --lexleft; \
486 }
487#endif /* MBS_SUPPORT */
488
489#ifdef MBS_SUPPORT
490/* Multibyte character handling sub-routine for lex.
491 This function parse a bracket expression and build a struct
492 mb_char_classes. */
493static void
494parse_bracket_exp_mb ()
495{
496 wint_t wc, wc1, wc2;
497
498 /* Work area to build a mb_char_classes. */
499 struct mb_char_classes *work_mbc;
500 int chars_al, range_sts_al, range_ends_al, ch_classes_al,
501 equivs_al, coll_elems_al;
502
503 REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
504 dfa->mbcsets_alloc, dfa->nmbcsets + 1);
505 /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
506 We will update dfa->multibyte_prop[] in addtok(), because we can't
507 decide the index in dfa->tokens[]. */
508
509 /* Initialize work are */
510 work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
511
512 chars_al = 1;
513 range_sts_al = range_ends_al = 0;
514 ch_classes_al = equivs_al = coll_elems_al = 0;
515 MALLOC(work_mbc->chars, wchar_t, chars_al);
516
517 work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
518 work_mbc->nequivs = work_mbc->ncoll_elems = 0;
519 work_mbc->chars = NULL;
520 work_mbc->ch_classes = NULL;
521 work_mbc->range_sts = work_mbc->range_ends = NULL;
522 work_mbc->equivs = work_mbc->coll_elems = NULL;
523
524 wc = fetch_wc(_("Unbalanced ["));
525 if (wc == L'^')
526 {
527 wc = fetch_wc(_("Unbalanced ["));
528 work_mbc->invert = 1;
529 }
530 else
531 work_mbc->invert = 0;
532 do
533 {
534 wc1 = WEOF; /* mark wc1 is not initialized". */
535
536 /* Note that if we're looking at some other [:...:] construct,
537 we just treat it as a bunch of ordinary characters. We can do
538 this because we assume regex has checked for syntax errors before
539 dfa is ever called. */
540 if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
541 {
542#define BRACKET_BUFFER_SIZE 128
543 char str[BRACKET_BUFFER_SIZE];
544 wc1 = wc;
545 wc = fetch_wc(_("Unbalanced ["));
546
547 /* If pattern contains `[[:', `[[.', or `[[='. */
548 if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
549 {
550 unsigned char c;
551 unsigned char delim = (unsigned char)wc;
552 int len = 0;
553 for (;;)
554 {
555 if (! lexleft)
556 dfaerror (_("Unbalanced ["));
557 c = (unsigned char) *lexptr++;
558 --lexleft;
559
560 if ((c == delim && *lexptr == ']') || lexleft == 0)
561 break;
562 if (len < BRACKET_BUFFER_SIZE)
563 str[len++] = c;
564 else
565 /* This is in any case an invalid class name. */
566 str[0] = '\0';
567 }
568 str[len] = '\0';
569
570 if (lexleft == 0)
571 {
572 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
573 work_mbc->nchars + 2);
574 work_mbc->chars[work_mbc->nchars++] = L'[';
575 work_mbc->chars[work_mbc->nchars++] = delim;
576 break;
577 }
578
579 if (--lexleft, *lexptr++ != ']')
580 dfaerror (_("Unbalanced ["));
581 if (delim == ':')
582 /* build character class. */
583 {
584 wctype_t wt;
585 /* Query the character class as wctype_t. */
586 if (case_fold && (strcmp(str, "upper") == 0 || strcmp(str, "lower") == 0))
587 strcpy(str, "alpha");
588
589 wt = wctype (str);
590
591 if (ch_classes_al == 0)
592 MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
593 REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
594 ch_classes_al,
595 work_mbc->nch_classes + 1);
596 work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
597
598 }
599 else if (delim == '=' || delim == '.')
600 {
601 char *elem;
602 MALLOC(elem, char, len + 1);
603 strncpy(elem, str, len + 1);
604
605 if (delim == '=')
606 /* build equivalent class. */
607 {
608 if (equivs_al == 0)
609 MALLOC(work_mbc->equivs, char*, ++equivs_al);
610 REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
611 equivs_al,
612 work_mbc->nequivs + 1);
613 work_mbc->equivs[work_mbc->nequivs++] = elem;
614 }
615
616 if (delim == '.')
617 /* build collating element. */
618 {
619 if (coll_elems_al == 0)
620 MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
621 REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
622 coll_elems_al,
623 work_mbc->ncoll_elems + 1);
624 work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
625 }
626 }
627 wc = wc1 = WEOF;
628 }
629 else
630 /* We treat '[' as a normal character here. */
631 {
632 wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
633 }
634 }
635 else
636 {
637 if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
638 wc = fetch_wc(("Unbalanced ["));
639 }
640
641 if (wc1 == WEOF)
642 wc1 = fetch_wc(_("Unbalanced ["));
643
644 if (wc1 == L'-')
645 /* build range characters. */
646 {
647 wc2 = fetch_wc(_("Unbalanced ["));
648 if (wc2 == L']')
649 {
650 /* In the case [x-], the - is an ordinary hyphen,
651 which is left in c1, the lookahead character. */
652 lexptr -= cur_mb_len;
653 lexleft += cur_mb_len;
654 wc2 = wc;
655 }
656 else
657 {
658 if (wc2 == L'\\'
659 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
660 wc2 = fetch_wc(_("Unbalanced ["));
661 wc1 = fetch_wc(_("Unbalanced ["));
662 }
663
664 if (range_sts_al == 0)
665 {
666 MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
667 MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
668 }
669 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
670 range_sts_al, work_mbc->nranges + 1);
671 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
672 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
673 range_ends_al, work_mbc->nranges + 1);
674 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
675 if (case_fold && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
676 && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2)))
677 {
678 wint_t altcase;
679 altcase = wc;
680 if (iswlower((wint_t)wc))
681 altcase = towupper((wint_t)wc);
682 else
683 altcase = towlower((wint_t)wc);
684 REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
685 range_sts_al, work_mbc->nranges + 1);
686 work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
687
688 altcase = wc2;
689 if (iswlower((wint_t)wc2))
690 altcase = towupper((wint_t)wc2);
691 else
692 altcase = towlower((wint_t)wc2);
693 REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
694 range_ends_al, work_mbc->nranges + 1);
695 work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
696 }
697 }
698 else if (wc != WEOF)
699 /* build normal characters. */
700 {
701 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
702 work_mbc->nchars + 1);
703 work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
704 if (case_fold && (iswlower(wc) || iswupper(wc)))
705 {
706 REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
707 work_mbc->nchars + 1);
708 work_mbc->chars[work_mbc->nchars++] =
709 (wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
710 }
711 }
712 }
713 while ((wc = wc1) != L']');
714}
715#endif /* MBS_SUPPORT */
716
717#ifdef __STDC__
718#define FUNC(F, P) static int F(int c) { return P(c); }
719#else
720#define FUNC(F, P) static int F(c) int c; { return P(c); }
721#endif
722
723FUNC(is_alpha, ISALPHA)
724FUNC(is_upper, ISUPPER)
725FUNC(is_lower, ISLOWER)
726FUNC(is_digit, ISDIGIT)
727FUNC(is_xdigit, ISXDIGIT)
728FUNC(is_space, ISSPACE)
729FUNC(is_punct, ISPUNCT)
730FUNC(is_alnum, ISALNUM)
731FUNC(is_print, ISPRINT)
732FUNC(is_graph, ISGRAPH)
733FUNC(is_cntrl, ISCNTRL)
734
735static int
736is_blank (int c)
737{
738 return (c == ' ' || c == '\t');
739}
740
741/* The following list maps the names of the Posix named character classes
742 to predicate functions that determine whether a given character is in
743 the class. The leading [ has already been eaten by the lexical analyzer. */
744static struct {
745 const char *name;
746 int (*pred) PARAMS ((int));
747} const prednames[] = {
748 { ":alpha:]", is_alpha },
749 { ":upper:]", is_upper },
750 { ":lower:]", is_lower },
751 { ":digit:]", is_digit },
752 { ":xdigit:]", is_xdigit },
753 { ":space:]", is_space },
754 { ":punct:]", is_punct },
755 { ":alnum:]", is_alnum },
756 { ":print:]", is_print },
757 { ":graph:]", is_graph },
758 { ":cntrl:]", is_cntrl },
759 { ":blank:]", is_blank },
760 { 0 }
761};
762
763/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
764#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
765
766static int
767looking_at (char const *s)
768{
769 size_t len;
770
771 len = strlen(s);
772 if (lexleft < len)
773 return 0;
774 return strncmp(s, lexptr, len) == 0;
775}
776
777static token
778lex (void)
779{
780 unsigned c, c1, c2;
781 int backslash = 0, invert;
782 charclass ccl;
783 int i;
784
785 /* Basic plan: We fetch a character. If it's a backslash,
786 we set the backslash flag and go through the loop again.
787 On the plus side, this avoids having a duplicate of the
788 main switch inside the backslash case. On the minus side,
789 it means that just about every case begins with
790 "if (backslash) ...". */
791 for (i = 0; i < 2; ++i)
792 {
793 FETCH(c, 0);
794#ifdef MBS_SUPPORT
795 if (MB_CUR_MAX > 1 && cur_mb_index)
796 /* If this is a part of a multi-byte character, we must treat
797 this byte data as a normal character.
798 e.g. In case of SJIS encoding, some character contains '\',
799 but they must not be backslash. */
800 goto normal_char;
801#endif /* MBS_SUPPORT */
802 switch (c)
803 {
804 case '\\':
805 if (backslash)
806 goto normal_char;
807 if (lexleft == 0)
808 dfaerror(_("Unfinished \\ escape"));
809 backslash = 1;
810 break;
811
812 case '^':
813 if (backslash)
814 goto normal_char;
815 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
816 || lasttok == END
817 || lasttok == LPAREN
818 || lasttok == OR)
819 return lasttok = BEGLINE;
820 goto normal_char;
821
822 case '$':
823 if (backslash)
824 goto normal_char;
825 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
826 || lexleft == 0
827 || (syntax_bits & RE_NO_BK_PARENS
828 ? lexleft > 0 && *lexptr == ')'
829 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
830 || (syntax_bits & RE_NO_BK_VBAR
831 ? lexleft > 0 && *lexptr == '|'
832 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
833 || ((syntax_bits & RE_NEWLINE_ALT)
834 && lexleft > 0 && *lexptr == '\n'))
835 return lasttok = ENDLINE;
836 goto normal_char;
837
838 case '1':
839 case '2':
840 case '3':
841 case '4':
842 case '5':
843 case '6':
844 case '7':
845 case '8':
846 case '9':
847 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
848 {
849 laststart = 0;
850 return lasttok = BACKREF;
851 }
852 goto normal_char;
853
854 case '`':
855 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
856 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
857 goto normal_char;
858
859 case '\'':
860 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
861 return lasttok = ENDLINE; /* FIXME: should be end of string */
862 goto normal_char;
863
864 case '<':
865 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
866 return lasttok = BEGWORD;
867 goto normal_char;
868
869 case '>':
870 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
871 return lasttok = ENDWORD;
872 goto normal_char;
873
874 case 'b':
875 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
876 return lasttok = LIMWORD;
877 goto normal_char;
878
879 case 'B':
880 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
881 return lasttok = NOTLIMWORD;
882 goto normal_char;
883
884 case '?':
885 if (syntax_bits & RE_LIMITED_OPS)
886 goto normal_char;
887 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
888 goto normal_char;
889 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
890 goto normal_char;
891 return lasttok = QMARK;
892
893 case '*':
894 if (backslash)
895 goto normal_char;
896 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
897 goto normal_char;
898 return lasttok = STAR;
899
900 case '+':
901 if (syntax_bits & RE_LIMITED_OPS)
902 goto normal_char;
903 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
904 goto normal_char;
905 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
906 goto normal_char;
907 return lasttok = PLUS;
908
909 case '{':
910 if (!(syntax_bits & RE_INTERVALS))
911 goto normal_char;
912 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
913 goto normal_char;
914 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
915 goto normal_char;
916
917 if (syntax_bits & RE_NO_BK_BRACES)
918 {
919 /* Scan ahead for a valid interval; if it's not valid,
920 treat it as a literal '{'. */
921 int lo = -1, hi = -1;
922 char const *p = lexptr;
923 char const *lim = p + lexleft;
924 for (; p != lim && ISASCIIDIGIT (*p); p++)
925 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
926 if (p != lim && *p == ',')
927 while (++p != lim && ISASCIIDIGIT (*p))
928 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
929 else
930 hi = lo;
931 if (p == lim || *p != '}'
932 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
933 goto normal_char;
934 }
935
936 minrep = 0;
937 /* Cases:
938 {M} - exact count
939 {M,} - minimum count, maximum is infinity
940 {M,N} - M through N */
941 FETCH(c, _("unfinished repeat count"));
942 if (ISASCIIDIGIT (c))
943 {
944 minrep = c - '0';
945 for (;;)
946 {
947 FETCH(c, _("unfinished repeat count"));
948 if (! ISASCIIDIGIT (c))
949 break;
950 minrep = 10 * minrep + c - '0';
951 }
952 }
953 else
954 dfaerror(_("malformed repeat count"));
955 if (c == ',')
956 {
957 FETCH (c, _("unfinished repeat count"));
958 if (! ISASCIIDIGIT (c))
959 maxrep = -1;
960 else
961 {
962 maxrep = c - '0';
963 for (;;)
964 {
965 FETCH (c, _("unfinished repeat count"));
966 if (! ISASCIIDIGIT (c))
967 break;
968 maxrep = 10 * maxrep + c - '0';
969 }
970 if (0 <= maxrep && maxrep < minrep)
971 dfaerror (_("malformed repeat count"));
972 }
973 }
974 else
975 maxrep = minrep;
976 if (!(syntax_bits & RE_NO_BK_BRACES))
977 {
978 if (c != '\\')
979 dfaerror(_("malformed repeat count"));
980 FETCH(c, _("unfinished repeat count"));
981 }
982 if (c != '}')
983 dfaerror(_("malformed repeat count"));
984 laststart = 0;
985 return lasttok = REPMN;
986
987 case '|':
988 if (syntax_bits & RE_LIMITED_OPS)
989 goto normal_char;
990 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
991 goto normal_char;
992 laststart = 1;
993 return lasttok = OR;
994
995 case '\n':
996 if (syntax_bits & RE_LIMITED_OPS
997 || backslash
998 || !(syntax_bits & RE_NEWLINE_ALT))
999 goto normal_char;
1000 laststart = 1;
1001 return lasttok = OR;
1002
1003 case '(':
1004 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1005 goto normal_char;
1006 ++parens;
1007 laststart = 1;
1008 return lasttok = LPAREN;
1009
1010 case ')':
1011 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
1012 goto normal_char;
1013 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1014 goto normal_char;
1015 --parens;
1016 laststart = 0;
1017 return lasttok = RPAREN;
1018
1019 case '.':
1020 if (backslash)
1021 goto normal_char;
1022#ifdef MBS_SUPPORT
1023 if (MB_CUR_MAX > 1)
1024 {
1025 /* In multibyte environment period must match with a single
1026 character not a byte. So we use ANYCHAR. */
1027 laststart = 0;
1028 return lasttok = ANYCHAR;
1029 }
1030#endif /* MBS_SUPPORT */
1031 zeroset(ccl);
1032 notset(ccl);
1033 if (!(syntax_bits & RE_DOT_NEWLINE))
1034 clrbit(eolbyte, ccl);
1035 if (syntax_bits & RE_DOT_NOT_NULL)
1036 clrbit('\0', ccl);
1037 laststart = 0;
1038 return lasttok = CSET + charclass_index(ccl);
1039
1040#ifndef GAWK
1041 case 's':
1042 case 'S':
1043 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1044 goto normal_char;
1045 zeroset(ccl);
1046 for (c2 = 0; c2 < NOTCHAR; ++c2)
1047 if (ISSPACE(c2))
1048 setbit(c2, ccl);
1049 if (c == 'S')
1050 notset(ccl);
1051 laststart = 0;
1052 return lasttok = CSET + charclass_index(ccl);
1053#endif
1054
1055 case 'w':
1056 case 'W':
1057 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
1058 goto normal_char;
1059 zeroset(ccl);
1060 for (c2 = 0; c2 < NOTCHAR; ++c2)
1061 if (IS_WORD_CONSTITUENT(c2))
1062 setbit(c2, ccl);
1063 if (c == 'W')
1064 notset(ccl);
1065 laststart = 0;
1066 return lasttok = CSET + charclass_index(ccl);
1067
1068 case '[':
1069 if (backslash)
1070 goto normal_char;
1071 laststart = 0;
1072#ifdef MBS_SUPPORT
1073 if (MB_CUR_MAX > 1)
1074 {
1075 /* In multibyte environment a bracket expression may contain
1076 multibyte characters, which must be treated as characters
1077 (not bytes). So we parse it by parse_bracket_exp_mb(). */
1078 parse_bracket_exp_mb();
1079 return lasttok = MBCSET;
1080 }
1081#endif
1082 zeroset(ccl);
1083 FETCH(c, _("Unbalanced ["));
1084 if (c == '^')
1085 {
1086 FETCH(c, _("Unbalanced ["));
1087 invert = 1;
1088 }
1089 else
1090 invert = 0;
1091 do
1092 {
1093 /* Nobody ever said this had to be fast. :-)
1094 Note that if we're looking at some other [:...:]
1095 construct, we just treat it as a bunch of ordinary
1096 characters. We can do this because we assume
1097 regex has checked for syntax errors before
1098 dfa is ever called. */
1099 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
1100 for (c1 = 0; prednames[c1].name; ++c1)
1101 if (looking_at(prednames[c1].name))
1102 {
1103 int (*pred) PARAMS ((int)) = prednames[c1].pred;
1104
1105 for (c2 = 0; c2 < NOTCHAR; ++c2)
1106 if ((*pred)(c2))
1107 setbit_case_fold (c2, ccl);
1108 lexptr += strlen(prednames[c1].name);
1109 lexleft -= strlen(prednames[c1].name);
1110 FETCH(c1, _("Unbalanced ["));
1111 goto skip;
1112 }
1113 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1114 FETCH(c, _("Unbalanced ["));
1115 FETCH(c1, _("Unbalanced ["));
1116 if (c1 == '-')
1117 {
1118 FETCH(c2, _("Unbalanced ["));
1119 if (c2 == ']')
1120 {
1121 /* In the case [x-], the - is an ordinary hyphen,
1122 which is left in c1, the lookahead character. */
1123 --lexptr;
1124 ++lexleft;
1125 }
1126 else
1127 {
1128 if (c2 == '\\'
1129 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1130 FETCH(c2, _("Unbalanced ["));
1131 FETCH(c1, _("Unbalanced ["));
1132 if (!hard_LC_COLLATE) {
1133 for (; c <= c2; c++)
1134 setbit_case_fold (c, ccl);
1135 } else {
1136 /* POSIX locales are painful - leave the decision to libc */
1137 regex_t re;
1138 char expr[6]; /* = { '[', c, '-', c2, ']', '\0' }; */
1139
1140 expr[0] = '['; expr[1] = c; expr[2] = '-';
1141 expr[3] = c2; expr[4] = ']'; expr[5] = '\0';
1142 if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {
1143 for (c = 0; c < NOTCHAR; ++c) {
1144 regmatch_t mat;
1145 char buf[2]; /* = { c, '\0' }; */
1146
1147 buf[0] = c; buf[1] = '\0';
1148 if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR
1149 && mat.rm_so == 0 && mat.rm_eo == 1)
1150 setbit_case_fold (c, ccl);
1151 }
1152 regfree (&re);
1153 }
1154 }
1155 continue;
1156 }
1157 }
1158
1159 setbit_case_fold (c, ccl);
1160
1161 skip:
1162 ;
1163 }
1164 while ((c = c1) != ']');
1165 if (invert)
1166 {
1167 notset(ccl);
1168 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1169 clrbit(eolbyte, ccl);
1170 }
1171 return lasttok = CSET + charclass_index(ccl);
1172
1173 default:
1174 normal_char:
1175 laststart = 0;
1176 if (case_fold && ISALPHA(c))
1177 {
1178 zeroset(ccl);
1179 setbit_case_fold (c, ccl);
1180 return lasttok = CSET + charclass_index(ccl);
1181 }
1182 return lasttok = c;
1183 }
1184 }
1185
1186 /* The above loop should consume at most a backslash
1187 and some other character. */
1188 abort();
1189 return END; /* keeps pedantic compilers happy. */
1190}
1191
1192/* Recursive descent parser for regular expressions. */
1193
1194static token tok; /* Lookahead token. */
1195static int depth; /* Current depth of a hypothetical stack
1196 holding deferred productions. This is
1197 used to determine the depth that will be
1198 required of the real stack later on in
1199 dfaanalyze(). */
1200
1201/* Add the given token to the parse tree, maintaining the depth count and
1202 updating the maximum depth if necessary. */
1203static void
1204addtok (token t)
1205{
1206#ifdef MBS_SUPPORT
1207 if (MB_CUR_MAX > 1)
1208 {
1209 REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
1210 dfa->tindex);
1211 /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
1212 if (t == MBCSET)
1213 dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
1214 else if (t < NOTCHAR)
1215 dfa->multibyte_prop[dfa->tindex]
1216 = (cur_mb_len == 1)? 3 /* single-byte char */
1217 : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
1218 + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
1219 else
1220 /* It may be unnecessary, but it is safer to treat other
1221 symbols as single byte characters. */
1222 dfa->multibyte_prop[dfa->tindex] = 3;
1223 }
1224#endif
1225
1226 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
1227 dfa->tokens[dfa->tindex++] = t;
1228
1229 switch (t)
1230 {
1231 case QMARK:
1232 case STAR:
1233 case PLUS:
1234 break;
1235
1236 case CAT:
1237 case OR:
1238 case ORTOP:
1239 --depth;
1240 break;
1241
1242 default:
1243 ++dfa->nleaves;
1244 case EMPTY:
1245 ++depth;
1246 break;
1247 }
1248 if (depth > dfa->depth)
1249 dfa->depth = depth;
1250}
1251
1252/* The grammar understood by the parser is as follows.
1253
1254 regexp:
1255 regexp OR branch
1256 branch
1257
1258 branch:
1259 branch closure
1260 closure
1261
1262 closure:
1263 closure QMARK
1264 closure STAR
1265 closure PLUS
1266 closure REPMN
1267 atom
1268
1269 atom:
1270 <normal character>
1271 <multibyte character>
1272 ANYCHAR
1273 MBCSET
1274 CSET
1275 BACKREF
1276 BEGLINE
1277 ENDLINE
1278 BEGWORD
1279 ENDWORD
1280 LIMWORD
1281 NOTLIMWORD
1282 CRANGE
1283 LPAREN regexp RPAREN
1284 <empty>
1285
1286 The parser builds a parse tree in postfix form in an array of tokens. */
1287
1288static void
1289atom (void)
1290{
1291 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
1292 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
1293#ifdef MBS_SUPPORT
1294 || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
1295#endif /* MBS_SUPPORT */
1296 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
1297 {
1298 addtok(tok);
1299 tok = lex();
1300#ifdef MBS_SUPPORT
1301 /* We treat a multibyte character as a single atom, so that DFA
1302 can treat a multibyte character as a single expression.
1303
1304 e.g. We construct following tree from "<mb1><mb2>".
1305 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1306 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
1307 */
1308 if (MB_CUR_MAX > 1)
1309 {
1310 while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
1311 {
1312 addtok(tok);
1313 addtok(CAT);
1314 tok = lex();
1315 }
1316 }
1317#endif /* MBS_SUPPORT */
1318 }
1319 else if (tok == CRANGE)
1320 {
1321 /* A character range like "[a-z]" in a locale other than "C" or
1322 "POSIX". This range might any sequence of one or more
1323 characters. Unfortunately the POSIX locale primitives give
1324 us no practical way to find what character sequences might be
1325 matched. Treat this approximately like "(.\1)" -- i.e. match
1326 one character, and then punt to the full matcher. */
1327 charclass ccl;
1328 zeroset (ccl);
1329 notset (ccl);
1330 addtok (CSET + charclass_index (ccl));
1331 addtok (BACKREF);
1332 addtok (CAT);
1333 tok = lex ();
1334 }
1335 else if (tok == LPAREN)
1336 {
1337 tok = lex();
1338 regexp(0);
1339 if (tok != RPAREN)
1340 dfaerror(_("Unbalanced ("));
1341 tok = lex();
1342 }
1343 else
1344 addtok(EMPTY);
1345}
1346
1347/* Return the number of tokens in the given subexpression. */
1348static int
1349nsubtoks (int tindex)
1350{
1351 int ntoks1;
1352
1353 switch (dfa->tokens[tindex - 1])
1354 {
1355 default:
1356 return 1;
1357 case QMARK:
1358 case STAR:
1359 case PLUS:
1360 return 1 + nsubtoks(tindex - 1);
1361 case CAT:
1362 case OR:
1363 case ORTOP:
1364 ntoks1 = nsubtoks(tindex - 1);
1365 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
1366 }
1367}
1368
1369/* Copy the given subexpression to the top of the tree. */
1370static void
1371copytoks (int tindex, int ntokens)
1372{
1373 int i;
1374
1375 for (i = 0; i < ntokens; ++i)
1376 addtok(dfa->tokens[tindex + i]);
1377}
1378
1379static void
1380closure (void)
1381{
1382 int tindex, ntokens, i;
1383
1384 atom();
1385 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
1386 if (tok == REPMN)
1387 {
1388 ntokens = nsubtoks(dfa->tindex);
1389 tindex = dfa->tindex - ntokens;
1390 if (maxrep < 0)
1391 addtok(PLUS);
1392 if (minrep == 0)
1393 addtok(QMARK);
1394 for (i = 1; i < minrep; ++i)
1395 {
1396 copytoks(tindex, ntokens);
1397 addtok(CAT);
1398 }
1399 for (; i < maxrep; ++i)
1400 {
1401 copytoks(tindex, ntokens);
1402 addtok(QMARK);
1403 addtok(CAT);
1404 }
1405 tok = lex();
1406 }
1407 else
1408 {
1409 addtok(tok);
1410 tok = lex();
1411 }
1412}
1413
1414static void
1415branch (void)
1416{
1417 closure();
1418 while (tok != RPAREN && tok != OR && tok >= 0)
1419 {
1420 closure();
1421 addtok(CAT);
1422 }
1423}
1424
1425static void
1426regexp (int toplevel)
1427{
1428 branch();
1429 while (tok == OR)
1430 {
1431 tok = lex();
1432 branch();
1433 if (toplevel)
1434 addtok(ORTOP);
1435 else
1436 addtok(OR);
1437 }
1438}
1439
1440/* Main entry point for the parser. S is a string to be parsed, len is the
1441 length of the string, so s can include NUL characters. D is a pointer to
1442 the struct dfa to parse into. */
1443void
1444dfaparse (char const *s, size_t len, struct dfa *d)
1445{
1446 dfa = d;
1447 lexptr = s;
1448 lexleft = len;
1449 lasttok = END;
1450 laststart = 1;
1451 parens = 0;
1452#ifdef LC_COLLATE
1453 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1454#endif
1455#ifdef MBS_SUPPORT
1456 if (MB_CUR_MAX > 1)
1457 {
1458 cur_mb_index = 0;
1459 cur_mb_len = 0;
1460 memset(&mbs, 0, sizeof(mbstate_t));
1461 }
1462#endif /* MBS_SUPPORT */
1463
1464 if (! syntax_bits_set)
1465 dfaerror(_("No syntax specified"));
1466
1467 tok = lex();
1468 depth = d->depth;
1469
1470 regexp(1);
1471
1472 if (tok != END)
1473 dfaerror(_("Unbalanced )"));
1474
1475 addtok(END - d->nregexps);
1476 addtok(CAT);
1477
1478 if (d->nregexps)
1479 addtok(ORTOP);
1480
1481 ++d->nregexps;
1482}
1483
1484/* Some primitives for operating on sets of positions. */
1485
1486/* Copy one set to another; the destination must be large enough. */
1487static void
1488copy (position_set const *src, position_set *dst)
1489{
1490 int i;
1491
1492 for (i = 0; i < src->nelem; ++i)
1493 dst->elems[i] = src->elems[i];
1494 dst->nelem = src->nelem;
1495}
1496
1497/* Insert a position in a set. Position sets are maintained in sorted
1498 order according to index. If position already exists in the set with
1499 the same index then their constraints are logically or'd together.
1500 S->elems must point to an array large enough to hold the resulting set. */
1501static void
1502insert (position p, position_set *s)
1503{
1504 int i;
1505 position t1, t2;
1506
1507 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1508 continue;
1509 if (i < s->nelem && p.index == s->elems[i].index)
1510 s->elems[i].constraint |= p.constraint;
1511 else
1512 {
1513 t1 = p;
1514 ++s->nelem;
1515 while (i < s->nelem)
1516 {
1517 t2 = s->elems[i];
1518 s->elems[i++] = t1;
1519 t1 = t2;
1520 }
1521 }
1522}
1523
1524/* Merge two sets of positions into a third. The result is exactly as if
1525 the positions of both sets were inserted into an initially empty set. */
1526static void
1527merge (position_set const *s1, position_set const *s2, position_set *m)
1528{
1529 int i = 0, j = 0;
1530
1531 m->nelem = 0;
1532 while (i < s1->nelem && j < s2->nelem)
1533 if (s1->elems[i].index > s2->elems[j].index)
1534 m->elems[m->nelem++] = s1->elems[i++];
1535 else if (s1->elems[i].index < s2->elems[j].index)
1536 m->elems[m->nelem++] = s2->elems[j++];
1537 else
1538 {
1539 m->elems[m->nelem] = s1->elems[i++];
1540 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1541 }
1542 while (i < s1->nelem)
1543 m->elems[m->nelem++] = s1->elems[i++];
1544 while (j < s2->nelem)
1545 m->elems[m->nelem++] = s2->elems[j++];
1546}
1547
1548/* Delete a position from a set. */
1549static void
1550delete (position p, position_set *s)
1551{
1552 int i;
1553
1554 for (i = 0; i < s->nelem; ++i)
1555 if (p.index == s->elems[i].index)
1556 break;
1557 if (i < s->nelem)
1558 for (--s->nelem; i < s->nelem; ++i)
1559 s->elems[i] = s->elems[i + 1];
1560}
1561
1562/* Find the index of the state corresponding to the given position set with
1563 the given preceding context, or create a new state if there is no such
1564 state. Newline and letter tell whether we got here on a newline or
1565 letter, respectively. */
1566static int
1567state_index (struct dfa *d, position_set const *s, int newline, int letter)
1568{
1569 int hash = 0;
1570 int constraint;
1571 int i, j;
1572
1573 newline = newline ? 1 : 0;
1574 letter = letter ? 1 : 0;
1575
1576 for (i = 0; i < s->nelem; ++i)
1577 hash ^= s->elems[i].index + s->elems[i].constraint;
1578
1579 /* Try to find a state that exactly matches the proposed one. */
1580 for (i = 0; i < d->sindex; ++i)
1581 {
1582 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1583 || newline != d->states[i].newline || letter != d->states[i].letter)
1584 continue;
1585 for (j = 0; j < s->nelem; ++j)
1586 if (s->elems[j].constraint
1587 != d->states[i].elems.elems[j].constraint
1588 || s->elems[j].index != d->states[i].elems.elems[j].index)
1589 break;
1590 if (j == s->nelem)
1591 return i;
1592 }
1593
1594 /* We'll have to create a new state. */
1595 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1596 d->states[i].hash = hash;
1597 MALLOC(d->states[i].elems.elems, position, s->nelem);
1598 copy(s, &d->states[i].elems);
1599 d->states[i].newline = newline;
1600 d->states[i].letter = letter;
1601 d->states[i].backref = 0;
1602 d->states[i].constraint = 0;
1603 d->states[i].first_end = 0;
1604#ifdef MBS_SUPPORT
1605 if (MB_CUR_MAX > 1)
1606 d->states[i].mbps.nelem = 0;
1607#endif
1608 for (j = 0; j < s->nelem; ++j)
1609 if (d->tokens[s->elems[j].index] < 0)
1610 {
1611 constraint = s->elems[j].constraint;
1612 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1613 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1614 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1615 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1616 d->states[i].constraint |= constraint;
1617 if (! d->states[i].first_end)
1618 d->states[i].first_end = d->tokens[s->elems[j].index];
1619 }
1620 else if (d->tokens[s->elems[j].index] == BACKREF)
1621 {
1622 d->states[i].constraint = NO_CONSTRAINT;
1623 d->states[i].backref = 1;
1624 }
1625
1626 ++d->sindex;
1627
1628 return i;
1629}
1630
1631/* Find the epsilon closure of a set of positions. If any position of the set
1632 contains a symbol that matches the empty string in some context, replace
1633 that position with the elements of its follow labeled with an appropriate
1634 constraint. Repeat exhaustively until no funny positions are left.
1635 S->elems must be large enough to hold the result. */
1636static void
1637epsclosure (position_set *s, struct dfa const *d)
1638{
1639 int i, j;
1640 int *visited;
1641 position p, old;
1642
1643 MALLOC(visited, int, d->tindex);
1644 for (i = 0; i < d->tindex; ++i)
1645 visited[i] = 0;
1646
1647 for (i = 0; i < s->nelem; ++i)
1648 if (d->tokens[s->elems[i].index] >= NOTCHAR
1649 && d->tokens[s->elems[i].index] != BACKREF
1650#ifdef MBS_SUPPORT
1651 && d->tokens[s->elems[i].index] != ANYCHAR
1652 && d->tokens[s->elems[i].index] != MBCSET
1653#endif
1654 && d->tokens[s->elems[i].index] < CSET)
1655 {
1656 old = s->elems[i];
1657 p.constraint = old.constraint;
1658 delete(s->elems[i], s);
1659 if (visited[old.index])
1660 {
1661 --i;
1662 continue;
1663 }
1664 visited[old.index] = 1;
1665 switch (d->tokens[old.index])
1666 {
1667 case BEGLINE:
1668 p.constraint &= BEGLINE_CONSTRAINT;
1669 break;
1670 case ENDLINE:
1671 p.constraint &= ENDLINE_CONSTRAINT;
1672 break;
1673 case BEGWORD:
1674 p.constraint &= BEGWORD_CONSTRAINT;
1675 break;
1676 case ENDWORD:
1677 p.constraint &= ENDWORD_CONSTRAINT;
1678 break;
1679 case LIMWORD:
1680 p.constraint &= LIMWORD_CONSTRAINT;
1681 break;
1682 case NOTLIMWORD:
1683 p.constraint &= NOTLIMWORD_CONSTRAINT;
1684 break;
1685 default:
1686 break;
1687 }
1688 for (j = 0; j < d->follows[old.index].nelem; ++j)
1689 {
1690 p.index = d->follows[old.index].elems[j].index;
1691 insert(p, s);
1692 }
1693 /* Force rescan to start at the beginning. */
1694 i = -1;
1695 }
1696
1697 free(visited);
1698}
1699
1700/* Perform bottom-up analysis on the parse tree, computing various functions.
1701 Note that at this point, we're pretending constructs like \< are real
1702 characters rather than constraints on what can follow them.
1703
1704 Nullable: A node is nullable if it is at the root of a regexp that can
1705 match the empty string.
1706 * EMPTY leaves are nullable.
1707 * No other leaf is nullable.
1708 * A QMARK or STAR node is nullable.
1709 * A PLUS node is nullable if its argument is nullable.
1710 * A CAT node is nullable if both its arguments are nullable.
1711 * An OR node is nullable if either argument is nullable.
1712
1713 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1714 that could correspond to the first character of a string matching the
1715 regexp rooted at the given node.
1716 * EMPTY leaves have empty firstpos.
1717 * The firstpos of a nonempty leaf is that leaf itself.
1718 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1719 argument.
1720 * The firstpos of a CAT node is the firstpos of the left argument, union
1721 the firstpos of the right if the left argument is nullable.
1722 * The firstpos of an OR node is the union of firstpos of each argument.
1723
1724 Lastpos: The lastpos of a node is the set of positions that could
1725 correspond to the last character of a string matching the regexp at
1726 the given node.
1727 * EMPTY leaves have empty lastpos.
1728 * The lastpos of a nonempty leaf is that leaf itself.
1729 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1730 argument.
1731 * The lastpos of a CAT node is the lastpos of its right argument, union
1732 the lastpos of the left if the right argument is nullable.
1733 * The lastpos of an OR node is the union of the lastpos of each argument.
1734
1735 Follow: The follow of a position is the set of positions that could
1736 correspond to the character following a character matching the node in
1737 a string matching the regexp. At this point we consider special symbols
1738 that match the empty string in some context to be just normal characters.
1739 Later, if we find that a special symbol is in a follow set, we will
1740 replace it with the elements of its follow, labeled with an appropriate
1741 constraint.
1742 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1743 the follow of every node in the lastpos.
1744 * Every node in the firstpos of the second argument of a CAT node is in
1745 the follow of every node in the lastpos of the first argument.
1746
1747 Because of the postfix representation of the parse tree, the depth-first
1748 analysis is conveniently done by a linear scan with the aid of a stack.
1749 Sets are stored as arrays of the elements, obeying a stack-like allocation
1750 scheme; the number of elements in each set deeper in the stack can be
1751 used to determine the address of a particular set's array. */
1752void
1753dfaanalyze (struct dfa *d, int searchflag)
1754{
1755 int *nullable; /* Nullable stack. */
1756 int *nfirstpos; /* Element count stack for firstpos sets. */
1757 position *firstpos; /* Array where firstpos elements are stored. */
1758 int *nlastpos; /* Element count stack for lastpos sets. */
1759 position *lastpos; /* Array where lastpos elements are stored. */
1760 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1761 position_set tmp; /* Temporary set for merging sets. */
1762 position_set merged; /* Result of merging sets. */
1763 int wants_newline; /* True if some position wants newline info. */
1764 int *o_nullable;
1765 int *o_nfirst, *o_nlast;
1766 position *o_firstpos, *o_lastpos;
1767 int i, j;
1768 position *pos;
1769
1770#ifdef DEBUG
1771 fprintf(stderr, "dfaanalyze:\n");
1772 for (i = 0; i < d->tindex; ++i)
1773 {
1774 fprintf(stderr, " %d:", i);
1775 prtok(d->tokens[i]);
1776 }
1777 putc('\n', stderr);
1778#endif
1779
1780 d->searchflag = searchflag;
1781
1782 MALLOC(nullable, int, d->depth);
1783 o_nullable = nullable;
1784 MALLOC(nfirstpos, int, d->depth);
1785 o_nfirst = nfirstpos;
1786 MALLOC(firstpos, position, d->nleaves);
1787 o_firstpos = firstpos, firstpos += d->nleaves;
1788 MALLOC(nlastpos, int, d->depth);
1789 o_nlast = nlastpos;
1790 MALLOC(lastpos, position, d->nleaves);
1791 o_lastpos = lastpos, lastpos += d->nleaves;
1792 MALLOC(nalloc, int, d->tindex);
1793 for (i = 0; i < d->tindex; ++i)
1794 nalloc[i] = 0;
1795 MALLOC(merged.elems, position, d->nleaves);
1796
1797 CALLOC(d->follows, position_set, d->tindex);
1798
1799 for (i = 0; i < d->tindex; ++i)
1800#ifdef DEBUG
1801 { /* Nonsyntactic #ifdef goo... */
1802#endif
1803 switch (d->tokens[i])
1804 {
1805 case EMPTY:
1806 /* The empty set is nullable. */
1807 *nullable++ = 1;
1808
1809 /* The firstpos and lastpos of the empty leaf are both empty. */
1810 *nfirstpos++ = *nlastpos++ = 0;
1811 break;
1812
1813 case STAR:
1814 case PLUS:
1815 /* Every element in the firstpos of the argument is in the follow
1816 of every element in the lastpos. */
1817 tmp.nelem = nfirstpos[-1];
1818 tmp.elems = firstpos;
1819 pos = lastpos;
1820 for (j = 0; j < nlastpos[-1]; ++j)
1821 {
1822 merge(&tmp, &d->follows[pos[j].index], &merged);
1823 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1824 nalloc[pos[j].index], merged.nelem - 1);
1825 copy(&merged, &d->follows[pos[j].index]);
1826 }
1827
1828 case QMARK:
1829 /* A QMARK or STAR node is automatically nullable. */
1830 if (d->tokens[i] != PLUS)
1831 nullable[-1] = 1;
1832 break;
1833
1834 case CAT:
1835 /* Every element in the firstpos of the second argument is in the
1836 follow of every element in the lastpos of the first argument. */
1837 tmp.nelem = nfirstpos[-1];
1838 tmp.elems = firstpos;
1839 pos = lastpos + nlastpos[-1];
1840 for (j = 0; j < nlastpos[-2]; ++j)
1841 {
1842 merge(&tmp, &d->follows[pos[j].index], &merged);
1843 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1844 nalloc[pos[j].index], merged.nelem - 1);
1845 copy(&merged, &d->follows[pos[j].index]);
1846 }
1847
1848 /* The firstpos of a CAT node is the firstpos of the first argument,
1849 union that of the second argument if the first is nullable. */
1850 if (nullable[-2])
1851 nfirstpos[-2] += nfirstpos[-1];
1852 else
1853 firstpos += nfirstpos[-1];
1854 --nfirstpos;
1855
1856 /* The lastpos of a CAT node is the lastpos of the second argument,
1857 union that of the first argument if the second is nullable. */
1858 if (nullable[-1])
1859 nlastpos[-2] += nlastpos[-1];
1860 else
1861 {
1862 pos = lastpos + nlastpos[-2];
1863 for (j = nlastpos[-1] - 1; j >= 0; --j)
1864 pos[j] = lastpos[j];
1865 lastpos += nlastpos[-2];
1866 nlastpos[-2] = nlastpos[-1];
1867 }
1868 --nlastpos;
1869
1870 /* A CAT node is nullable if both arguments are nullable. */
1871 nullable[-2] = nullable[-1] && nullable[-2];
1872 --nullable;
1873 break;
1874
1875 case OR:
1876 case ORTOP:
1877 /* The firstpos is the union of the firstpos of each argument. */
1878 nfirstpos[-2] += nfirstpos[-1];
1879 --nfirstpos;
1880
1881 /* The lastpos is the union of the lastpos of each argument. */
1882 nlastpos[-2] += nlastpos[-1];
1883 --nlastpos;
1884
1885 /* An OR node is nullable if either argument is nullable. */
1886 nullable[-2] = nullable[-1] || nullable[-2];
1887 --nullable;
1888 break;
1889
1890 default:
1891 /* Anything else is a nonempty position. (Note that special
1892 constructs like \< are treated as nonempty strings here;
1893 an "epsilon closure" effectively makes them nullable later.
1894 Backreferences have to get a real position so we can detect
1895 transitions on them later. But they are nullable. */
1896 *nullable++ = d->tokens[i] == BACKREF;
1897
1898 /* This position is in its own firstpos and lastpos. */
1899 *nfirstpos++ = *nlastpos++ = 1;
1900 --firstpos, --lastpos;
1901 firstpos->index = lastpos->index = i;
1902 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1903
1904 /* Allocate the follow set for this position. */
1905 nalloc[i] = 1;
1906 MALLOC(d->follows[i].elems, position, nalloc[i]);
1907 break;
1908 }
1909#ifdef DEBUG
1910 /* ... balance the above nonsyntactic #ifdef goo... */
1911 fprintf(stderr, "node %d:", i);
1912 prtok(d->tokens[i]);
1913 putc('\n', stderr);
1914 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1915 fprintf(stderr, " firstpos:");
1916 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1917 {
1918 fprintf(stderr, " %d:", firstpos[j].index);
1919 prtok(d->tokens[firstpos[j].index]);
1920 }
1921 fprintf(stderr, "\n lastpos:");
1922 for (j = nlastpos[-1] - 1; j >= 0; --j)
1923 {
1924 fprintf(stderr, " %d:", lastpos[j].index);
1925 prtok(d->tokens[lastpos[j].index]);
1926 }
1927 putc('\n', stderr);
1928 }
1929#endif
1930
1931 /* For each follow set that is the follow set of a real position, replace
1932 it with its epsilon closure. */
1933 for (i = 0; i < d->tindex; ++i)
1934 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1935#ifdef MBS_SUPPORT
1936 || d->tokens[i] == ANYCHAR
1937 || d->tokens[i] == MBCSET
1938#endif
1939 || d->tokens[i] >= CSET)
1940 {
1941#ifdef DEBUG
1942 fprintf(stderr, "follows(%d:", i);
1943 prtok(d->tokens[i]);
1944 fprintf(stderr, "):");
1945 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1946 {
1947 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1948 prtok(d->tokens[d->follows[i].elems[j].index]);
1949 }
1950 putc('\n', stderr);
1951#endif
1952 copy(&d->follows[i], &merged);
1953 epsclosure(&merged, d);
1954 if (d->follows[i].nelem < merged.nelem)
1955 REALLOC(d->follows[i].elems, position, merged.nelem);
1956 copy(&merged, &d->follows[i]);
1957 }
1958
1959 /* Get the epsilon closure of the firstpos of the regexp. The result will
1960 be the set of positions of state 0. */
1961 merged.nelem = 0;
1962 for (i = 0; i < nfirstpos[-1]; ++i)
1963 insert(firstpos[i], &merged);
1964 epsclosure(&merged, d);
1965
1966 /* Check if any of the positions of state 0 will want newline context. */
1967 wants_newline = 0;
1968 for (i = 0; i < merged.nelem; ++i)
1969 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1970 wants_newline = 1;
1971
1972 /* Build the initial state. */
1973 d->salloc = 1;
1974 d->sindex = 0;
1975 MALLOC(d->states, dfa_state, d->salloc);
1976 state_index(d, &merged, wants_newline, 0);
1977
1978 free(o_nullable);
1979 free(o_nfirst);
1980 free(o_firstpos);
1981 free(o_nlast);
1982 free(o_lastpos);
1983 free(nalloc);
1984 free(merged.elems);
1985}
1986
1987/* Find, for each character, the transition out of state s of d, and store
1988 it in the appropriate slot of trans.
1989
1990 We divide the positions of s into groups (positions can appear in more
1991 than one group). Each group is labeled with a set of characters that
1992 every position in the group matches (taking into account, if necessary,
1993 preceding context information of s). For each group, find the union
1994 of the its elements' follows. This set is the set of positions of the
1995 new state. For each character in the group's label, set the transition
1996 on this character to be to a state corresponding to the set's positions,
1997 and its associated backward context information, if necessary.
1998
1999 If we are building a searching matcher, we include the positions of state
2000 0 in every state.
2001
2002 The collection of groups is constructed by building an equivalence-class
2003 partition of the positions of s.
2004
2005 For each position, find the set of characters C that it matches. Eliminate
2006 any characters from C that fail on grounds of backward context.
2007
2008 Search through the groups, looking for a group whose label L has nonempty
2009 intersection with C. If L - C is nonempty, create a new group labeled
2010 L - C and having the same positions as the current group, and set L to
2011 the intersection of L and C. Insert the position in this group, set
2012 C = C - L, and resume scanning.
2013
2014 If after comparing with every group there are characters remaining in C,
2015 create a new group labeled with the characters of C and insert this
2016 position in that group. */
2017void
2018dfastate (int s, struct dfa *d, int trans[])
2019{
2020 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
2021 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
2022 int ngrps = 0; /* Number of groups actually used. */
2023 position pos; /* Current position being considered. */
2024 charclass matches; /* Set of matching characters. */
2025 int matchesf; /* True if matches is nonempty. */
2026 charclass intersect; /* Intersection with some label set. */
2027 int intersectf; /* True if intersect is nonempty. */
2028 charclass leftovers; /* Stuff in the label that didn't match. */
2029 int leftoversf; /* True if leftovers is nonempty. */
2030 static charclass letters; /* Set of characters considered letters. */
2031 static charclass newline; /* Set of characters that aren't newline. */
2032 position_set follows; /* Union of the follows of some group. */
2033 position_set tmp; /* Temporary space for merging sets. */
2034 int state; /* New state. */
2035 int wants_newline; /* New state wants to know newline context. */
2036 int state_newline; /* New state on a newline transition. */
2037 int wants_letter; /* New state wants to know letter context. */
2038 int state_letter; /* New state on a letter transition. */
2039 static int initialized; /* Flag for static initialization. */
2040#ifdef MBS_SUPPORT
2041 int next_isnt_1st_byte = 0; /* Flag if we can't add state0. */
2042#endif
2043 int i, j, k;
2044
2045 /* Initialize the set of letters, if necessary. */
2046 if (! initialized)
2047 {
2048 initialized = 1;
2049 for (i = 0; i < NOTCHAR; ++i)
2050 if (IS_WORD_CONSTITUENT(i))
2051 setbit(i, letters);
2052 setbit(eolbyte, newline);
2053 }
2054
2055 zeroset(matches);
2056
2057 for (i = 0; i < d->states[s].elems.nelem; ++i)
2058 {
2059 pos = d->states[s].elems.elems[i];
2060 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2061 setbit(d->tokens[pos.index], matches);
2062 else if (d->tokens[pos.index] >= CSET)
2063 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
2064#ifdef MBS_SUPPORT
2065 else if (d->tokens[pos.index] == ANYCHAR
2066 || d->tokens[pos.index] == MBCSET)
2067 /* MB_CUR_MAX > 1 */
2068 {
2069 /* ANYCHAR and MBCSET must match with a single character, so we
2070 must put it to d->states[s].mbps, which contains the positions
2071 which can match with a single character not a byte. */
2072 if (d->states[s].mbps.nelem == 0)
2073 {
2074 MALLOC(d->states[s].mbps.elems, position,
2075 d->states[s].elems.nelem);
2076 }
2077 insert(pos, &(d->states[s].mbps));
2078 continue;
2079 }
2080#endif /* MBS_SUPPORT */
2081 else
2082 continue;
2083
2084 /* Some characters may need to be eliminated from matches because
2085 they fail in the current context. */
2086 if (pos.constraint != 0xFF)
2087 {
2088 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2089 d->states[s].newline, 1))
2090 clrbit(eolbyte, matches);
2091 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
2092 d->states[s].newline, 0))
2093 for (j = 0; j < CHARCLASS_INTS; ++j)
2094 matches[j] &= newline[j];
2095 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2096 d->states[s].letter, 1))
2097 for (j = 0; j < CHARCLASS_INTS; ++j)
2098 matches[j] &= ~letters[j];
2099 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
2100 d->states[s].letter, 0))
2101 for (j = 0; j < CHARCLASS_INTS; ++j)
2102 matches[j] &= letters[j];
2103
2104 /* If there are no characters left, there's no point in going on. */
2105 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
2106 continue;
2107 if (j == CHARCLASS_INTS)
2108 continue;
2109 }
2110
2111 for (j = 0; j < ngrps; ++j)
2112 {
2113 /* If matches contains a single character only, and the current
2114 group's label doesn't contain that character, go on to the
2115 next group. */
2116 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
2117 && !tstbit(d->tokens[pos.index], labels[j]))
2118 continue;
2119
2120 /* Check if this group's label has a nonempty intersection with
2121 matches. */
2122 intersectf = 0;
2123 for (k = 0; k < CHARCLASS_INTS; ++k)
2124 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
2125 if (! intersectf)
2126 continue;
2127
2128 /* It does; now find the set differences both ways. */
2129 leftoversf = matchesf = 0;
2130 for (k = 0; k < CHARCLASS_INTS; ++k)
2131 {
2132 /* Even an optimizing compiler can't know this for sure. */
2133 int match = matches[k], label = labels[j][k];
2134
2135 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
2136 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
2137 }
2138
2139 /* If there were leftovers, create a new group labeled with them. */
2140 if (leftoversf)
2141 {
2142 copyset(leftovers, labels[ngrps]);
2143 copyset(intersect, labels[j]);
2144 MALLOC(grps[ngrps].elems, position, d->nleaves);
2145 copy(&grps[j], &grps[ngrps]);
2146 ++ngrps;
2147 }
2148
2149 /* Put the position in the current group. Note that there is no
2150 reason to call insert() here. */
2151 grps[j].elems[grps[j].nelem++] = pos;
2152
2153 /* If every character matching the current position has been
2154 accounted for, we're done. */
2155 if (! matchesf)
2156 break;
2157 }
2158
2159 /* If we've passed the last group, and there are still characters
2160 unaccounted for, then we'll have to create a new group. */
2161 if (j == ngrps)
2162 {
2163 copyset(matches, labels[ngrps]);
2164 zeroset(matches);
2165 MALLOC(grps[ngrps].elems, position, d->nleaves);
2166 grps[ngrps].nelem = 1;
2167 grps[ngrps].elems[0] = pos;
2168 ++ngrps;
2169 }
2170 }
2171
2172 MALLOC(follows.elems, position, d->nleaves);
2173 MALLOC(tmp.elems, position, d->nleaves);
2174
2175 /* If we are a searching matcher, the default transition is to a state
2176 containing the positions of state 0, otherwise the default transition
2177 is to fail miserably. */
2178 if (d->searchflag)
2179 {
2180 wants_newline = 0;
2181 wants_letter = 0;
2182 for (i = 0; i < d->states[0].elems.nelem; ++i)
2183 {
2184 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
2185 wants_newline = 1;
2186 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
2187 wants_letter = 1;
2188 }
2189 copy(&d->states[0].elems, &follows);
2190 state = state_index(d, &follows, 0, 0);
2191 if (wants_newline)
2192 state_newline = state_index(d, &follows, 1, 0);
2193 else
2194 state_newline = state;
2195 if (wants_letter)
2196 state_letter = state_index(d, &follows, 0, 1);
2197 else
2198 state_letter = state;
2199 for (i = 0; i < NOTCHAR; ++i)
2200 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
2201 trans[eolbyte] = state_newline;
2202 }
2203 else
2204 for (i = 0; i < NOTCHAR; ++i)
2205 trans[i] = -1;
2206
2207 for (i = 0; i < ngrps; ++i)
2208 {
2209 follows.nelem = 0;
2210
2211 /* Find the union of the follows of the positions of the group.
2212 This is a hideously inefficient loop. Fix it someday. */
2213 for (j = 0; j < grps[i].nelem; ++j)
2214 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
2215 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
2216
2217#ifdef MBS_SUPPORT
2218 if (MB_CUR_MAX > 1)
2219 {
2220 /* If a token in follows.elems is not 1st byte of a multibyte
2221 character, or the states of follows must accept the bytes
2222 which are not 1st byte of the multibyte character.
2223 Then, if a state of follows encounter a byte, it must not be
2224 a 1st byte of a multibyte character nor single byte character.
2225 We cansel to add state[0].follows to next state, because
2226 state[0] must accept 1st-byte
2227
2228 For example, we assume <sb a> is a certain single byte
2229 character, <mb A> is a certain multibyte character, and the
2230 codepoint of <sb a> equals the 2nd byte of the codepoint of
2231 <mb A>.
2232 When state[0] accepts <sb a>, state[i] transit to state[i+1]
2233 by accepting accepts 1st byte of <mb A>, and state[i+1]
2234 accepts 2nd byte of <mb A>, if state[i+1] encounter the
2235 codepoint of <sb a>, it must not be <sb a> but 2nd byte of
2236 <mb A>, so we can not add state[0]. */
2237
2238 next_isnt_1st_byte = 0;
2239 for (j = 0; j < follows.nelem; ++j)
2240 {
2241 if (!(d->multibyte_prop[follows.elems[j].index] & 1))
2242 {
2243 next_isnt_1st_byte = 1;
2244 break;
2245 }
2246 }
2247 }
2248#endif
2249
2250 /* If we are building a searching matcher, throw in the positions
2251 of state 0 as well. */
2252#ifdef MBS_SUPPORT
2253 if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte))
2254#else
2255 if (d->searchflag)
2256#endif
2257 for (j = 0; j < d->states[0].elems.nelem; ++j)
2258 insert(d->states[0].elems.elems[j], &follows);
2259
2260 /* Find out if the new state will want any context information. */
2261 wants_newline = 0;
2262 if (tstbit(eolbyte, labels[i]))
2263 for (j = 0; j < follows.nelem; ++j)
2264 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
2265 wants_newline = 1;
2266
2267 wants_letter = 0;
2268 for (j = 0; j < CHARCLASS_INTS; ++j)
2269 if (labels[i][j] & letters[j])
2270 break;
2271 if (j < CHARCLASS_INTS)
2272 for (j = 0; j < follows.nelem; ++j)
2273 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
2274 wants_letter = 1;
2275
2276 /* Find the state(s) corresponding to the union of the follows. */
2277 state = state_index(d, &follows, 0, 0);
2278 if (wants_newline)
2279 state_newline = state_index(d, &follows, 1, 0);
2280 else
2281 state_newline = state;
2282 if (wants_letter)
2283 state_letter = state_index(d, &follows, 0, 1);
2284 else
2285 state_letter = state;
2286
2287 /* Set the transitions for each character in the current label. */
2288 for (j = 0; j < CHARCLASS_INTS; ++j)
2289 for (k = 0; k < INTBITS; ++k)
2290 if (labels[i][j] & 1 << k)
2291 {
2292 int c = j * INTBITS + k;
2293
2294 if (c == eolbyte)
2295 trans[c] = state_newline;
2296 else if (IS_WORD_CONSTITUENT(c))
2297 trans[c] = state_letter;
2298 else if (c < NOTCHAR)
2299 trans[c] = state;
2300 }
2301 }
2302
2303 for (i = 0; i < ngrps; ++i)
2304 free(grps[i].elems);
2305 free(follows.elems);
2306 free(tmp.elems);
2307}
2308
2309/* Some routines for manipulating a compiled dfa's transition tables.
2310 Each state may or may not have a transition table; if it does, and it
2311 is a non-accepting state, then d->trans[state] points to its table.
2312 If it is an accepting state then d->fails[state] points to its table.
2313 If it has no table at all, then d->trans[state] is NULL.
2314 TODO: Improve this comment, get rid of the unnecessary redundancy. */
2315
2316static void
2317build_state (int s, struct dfa *d)
2318{
2319 int *trans; /* The new transition table. */
2320 int i;
2321
2322 /* Set an upper limit on the number of transition tables that will ever
2323 exist at once. 1024 is arbitrary. The idea is that the frequently
2324 used transition tables will be quickly rebuilt, whereas the ones that
2325 were only needed once or twice will be cleared away. */
2326 if (d->trcount >= 1024)
2327 {
2328 for (i = 0; i < d->tralloc; ++i)
2329 if (d->trans[i])
2330 {
2331 free((ptr_t) d->trans[i]);
2332 d->trans[i] = NULL;
2333 }
2334 else if (d->fails[i])
2335 {
2336 free((ptr_t) d->fails[i]);
2337 d->fails[i] = NULL;
2338 }
2339 d->trcount = 0;
2340 }
2341
2342 ++d->trcount;
2343
2344 /* Set up the success bits for this state. */
2345 d->success[s] = 0;
2346 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
2347 s, *d))
2348 d->success[s] |= 4;
2349 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
2350 s, *d))
2351 d->success[s] |= 2;
2352 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
2353 s, *d))
2354 d->success[s] |= 1;
2355
2356 MALLOC(trans, int, NOTCHAR);
2357 dfastate(s, d, trans);
2358
2359 /* Now go through the new transition table, and make sure that the trans
2360 and fail arrays are allocated large enough to hold a pointer for the
2361 largest state mentioned in the table. */
2362 for (i = 0; i < NOTCHAR; ++i)
2363 if (trans[i] >= d->tralloc)
2364 {
2365 int oldalloc = d->tralloc;
2366
2367 while (trans[i] >= d->tralloc)
2368 d->tralloc *= 2;
2369 REALLOC(d->realtrans, int *, d->tralloc + 1);
2370 d->trans = d->realtrans + 1;
2371 REALLOC(d->fails, int *, d->tralloc);
2372 REALLOC(d->success, int, d->tralloc);
2373 REALLOC(d->newlines, int, d->tralloc);
2374 while (oldalloc < d->tralloc)
2375 {
2376 d->trans[oldalloc] = NULL;
2377 d->fails[oldalloc++] = NULL;
2378 }
2379 }
2380
2381 /* Keep the newline transition in a special place so we can use it as
2382 a sentinel. */
2383 d->newlines[s] = trans[eolbyte];
2384 trans[eolbyte] = -1;
2385
2386 if (ACCEPTING(s, *d))
2387 d->fails[s] = trans;
2388 else
2389 d->trans[s] = trans;
2390}
2391
2392static void
2393build_state_zero (struct dfa *d)
2394{
2395 d->tralloc = 1;
2396 d->trcount = 0;
2397 CALLOC(d->realtrans, int *, d->tralloc + 1);
2398 d->trans = d->realtrans + 1;
2399 CALLOC(d->fails, int *, d->tralloc);
2400 MALLOC(d->success, int, d->tralloc);
2401 MALLOC(d->newlines, int, d->tralloc);
2402 build_state(0, d);
2403}
2404
2405#ifdef MBS_SUPPORT
2406/* Multibyte character handling sub-routines for dfaexec. */
2407
2408/* Initial state may encounter the byte which is not a single byte character
2409 nor 1st byte of a multibyte character. But it is incorrect for initial
2410 state to accept such a byte.
2411 For example, in sjis encoding the regular expression like "\\" accepts
2412 the codepoint 0x5c, but should not accept the 2nd byte of the codepoint
2413 0x815c. Then Initial state must skip the bytes which are not a single byte
2414 character nor 1st byte of a multibyte character. */
2415#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
2416 if (s == 0) \
2417 { \
2418 while (inputwcs[p - buf_begin] == 0 \
2419 && mblen_buf[p - buf_begin] > 0 \
2420 && (unsigned char const *)p < buf_end) \
2421 ++p; \
2422 if ((char *)p >= end) \
2423 { \
2424 free(mblen_buf); \
2425 free(inputwcs); \
2426 return NULL; \
2427 } \
2428 }
2429
2430static void
2431realloc_trans_if_necessary(struct dfa *d, int new_state)
2432{
2433 /* Make sure that the trans and fail arrays are allocated large enough
2434 to hold a pointer for the new state. */
2435 if (new_state >= d->tralloc)
2436 {
2437 int oldalloc = d->tralloc;
2438
2439 while (new_state >= d->tralloc)
2440 d->tralloc *= 2;
2441 REALLOC(d->realtrans, int *, d->tralloc + 1);
2442 d->trans = d->realtrans + 1;
2443 REALLOC(d->fails, int *, d->tralloc);
2444 REALLOC(d->success, int, d->tralloc);
2445 REALLOC(d->newlines, int, d->tralloc);
2446 while (oldalloc < d->tralloc)
2447 {
2448 d->trans[oldalloc] = NULL;
2449 d->fails[oldalloc++] = NULL;
2450 }
2451 }
2452}
2453
2454/* Return values of transit_state_singlebyte(), and
2455 transit_state_consume_1char. */
2456typedef enum
2457{
2458 TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */
2459 TRANSIT_STATE_DONE, /* State transition has finished. */
2460 TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */
2461} status_transit_state;
2462
2463/* Consume a single byte and transit state from 's' to '*next_state'.
2464 This function is almost same as the state transition routin in dfaexec().
2465 But state transition is done just once, otherwise matching succeed or
2466 reach the end of the buffer. */
2467static status_transit_state
2468transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p,
2469 int *next_state)
2470{
2471 int *t;
2472 int works = s;
2473
2474 status_transit_state rval = TRANSIT_STATE_IN_PROGRESS;
2475
2476 while (rval == TRANSIT_STATE_IN_PROGRESS)
2477 {
2478 if ((t = d->trans[works]) != NULL)
2479 {
2480 works = t[*p];
2481 rval = TRANSIT_STATE_DONE;
2482 if (works < 0)
2483 works = 0;
2484 }
2485 else if (works < 0)
2486 {
2487 if (p == buf_end)
2488 /* At the moment, it must not happen. */
2489 return TRANSIT_STATE_END_BUFFER;
2490 works = 0;
2491 }
2492 else if (d->fails[works])
2493 {
2494 works = d->fails[works][*p];
2495 rval = TRANSIT_STATE_DONE;
2496 }
2497 else
2498 {
2499 build_state(works, d);
2500 }
2501 }
2502 *next_state = works;
2503 return rval;
2504}
2505
2506/* Check whether period can match or not in the current context. If it can,
2507 return the amount of the bytes with which period can match, otherwise
2508 return 0.
2509 `pos' is the position of the period. `index' is the index from the
2510 buf_begin, and it is the current position in the buffer. */
2511static int
2512match_anychar (struct dfa *d, int s, position pos, int index)
2513{
2514 int newline = 0;
2515 int letter = 0;
2516 wchar_t wc;
2517 int mbclen;
2518
2519 wc = inputwcs[index];
2520 mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2521
2522 /* Check context. */
2523 if (wc == (wchar_t)eolbyte)
2524 {
2525 if (!(syntax_bits & RE_DOT_NEWLINE))
2526 return 0;
2527 newline = 1;
2528 }
2529 else if (wc == (wchar_t)'\0')
2530 {
2531 if (syntax_bits & RE_DOT_NOT_NULL)
2532 return 0;
2533 newline = 1;
2534 }
2535
2536 if (iswalnum(wc) || wc == L'_')
2537 letter = 1;
2538
2539 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2540 newline, d->states[s].letter, letter))
2541 return 0;
2542
2543 return mbclen;
2544}
2545
2546/* Check whether bracket expression can match or not in the current context.
2547 If it can, return the amount of the bytes with which expression can match,
2548 otherwise return 0.
2549 `pos' is the position of the bracket expression. `index' is the index
2550 from the buf_begin, and it is the current position in the buffer. */
2551int
2552match_mb_charset (struct dfa *d, int s, position pos, int index)
2553{
2554 int i;
2555 int match; /* Flag which represent that matching succeed. */
2556 int match_len; /* Length of the character (or collating element)
2557 with which this operator match. */
2558 int op_len; /* Length of the operator. */
2559 char buffer[128];
2560 wchar_t wcbuf[6];
2561
2562 /* Pointer to the structure to which we are currently refering. */
2563 struct mb_char_classes *work_mbc;
2564
2565 int newline = 0;
2566 int letter = 0;
2567 wchar_t wc; /* Current refering character. */
2568
2569 wc = inputwcs[index];
2570
2571 /* Check context. */
2572 if (wc == (wchar_t)eolbyte)
2573 {
2574 if (!(syntax_bits & RE_DOT_NEWLINE))
2575 return 0;
2576 newline = 1;
2577 }
2578 else if (wc == (wchar_t)'\0')
2579 {
2580 if (syntax_bits & RE_DOT_NOT_NULL)
2581 return 0;
2582 newline = 1;
2583 }
2584 if (iswalnum(wc) || wc == L'_')
2585 letter = 1;
2586 if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline,
2587 newline, d->states[s].letter, letter))
2588 return 0;
2589
2590 /* Assign the current refering operator to work_mbc. */
2591 work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
2592 match = !work_mbc->invert;
2593 match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
2594
2595 /* match with a character class? */
2596 for (i = 0; i<work_mbc->nch_classes; i++)
2597 {
2598 if (iswctype((wint_t)wc, work_mbc->ch_classes[i]))
2599 goto charset_matched;
2600 }
2601
2602 strncpy(buffer, buf_begin + index, match_len);
2603 buffer[match_len] = '\0';
2604
2605 /* match with an equivalent class? */
2606 for (i = 0; i<work_mbc->nequivs; i++)
2607 {
2608 op_len = strlen(work_mbc->equivs[i]);
2609 strncpy(buffer, buf_begin + index, op_len);
2610 buffer[op_len] = '\0';
2611 if (strcoll(work_mbc->equivs[i], buffer) == 0)
2612 {
2613 match_len = op_len;
2614 goto charset_matched;
2615 }
2616 }
2617
2618 /* match with a collating element? */
2619 for (i = 0; i<work_mbc->ncoll_elems; i++)
2620 {
2621 op_len = strlen(work_mbc->coll_elems[i]);
2622 strncpy(buffer, buf_begin + index, op_len);
2623 buffer[op_len] = '\0';
2624
2625 if (strcoll(work_mbc->coll_elems[i], buffer) == 0)
2626 {
2627 match_len = op_len;
2628 goto charset_matched;
2629 }
2630 }
2631
2632 wcbuf[0] = wc;
2633 wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
2634
2635 /* match with a range? */
2636 for (i = 0; i<work_mbc->nranges; i++)
2637 {
2638 wcbuf[2] = work_mbc->range_sts[i];
2639 wcbuf[4] = work_mbc->range_ends[i];
2640
2641 if (wcscoll(wcbuf, wcbuf+2) >= 0 &&
2642 wcscoll(wcbuf+4, wcbuf) >= 0)
2643 goto charset_matched;
2644 }
2645
2646 /* match with a character? */
2647 for (i = 0; i<work_mbc->nchars; i++)
2648 {
2649 if (wc == work_mbc->chars[i])
2650 goto charset_matched;
2651 }
2652
2653 match = !match;
2654
2655 charset_matched:
2656 return match ? match_len : 0;
2657}
2658
2659/* Check each of `d->states[s].mbps.elem' can match or not. Then return the
2660 array which corresponds to `d->states[s].mbps.elem' and each element of
2661 the array contains the amount of the bytes with which the element can
2662 match.
2663 `index' is the index from the buf_begin, and it is the current position
2664 in the buffer.
2665 Caller MUST free the array which this function return. */
2666static int*
2667check_matching_with_multibyte_ops (struct dfa *d, int s, int index)
2668{
2669 int i;
2670 int* rarray;
2671
2672 MALLOC(rarray, int, d->states[s].mbps.nelem);
2673 for (i = 0; i < d->states[s].mbps.nelem; ++i)
2674 {
2675 position pos = d->states[s].mbps.elems[i];
2676 switch(d->tokens[pos.index])
2677 {
2678 case ANYCHAR:
2679 rarray[i] = match_anychar(d, s, pos, index);
2680 break;
2681 case MBCSET:
2682 rarray[i] = match_mb_charset(d, s, pos, index);
2683 break;
2684 default:
2685 break; /* can not happen. */
2686 }
2687 }
2688 return rarray;
2689}
2690
2691/* Consume a single character and enumerate all of the positions which can
2692 be next position from the state `s'.
2693 `match_lens' is the input. It can be NULL, but it can also be the output
2694 of check_matching_with_multibyte_ops() for optimization.
2695 `mbclen' and `pps' are the output. `mbclen' is the length of the
2696 character consumed, and `pps' is the set this function enumerate. */
2697static status_transit_state
2698transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp,
2699 int *match_lens, int *mbclen, position_set *pps)
2700{
2701 int i, j;
2702 int s1, s2;
2703 int* work_mbls;
2704 status_transit_state rs = TRANSIT_STATE_DONE;
2705
2706 /* Calculate the length of the (single/multi byte) character
2707 to which p points. */
2708 *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
2709 : mblen_buf[*pp - buf_begin];
2710
2711 /* Calculate the state which can be reached from the state `s' by
2712 consuming `*mbclen' single bytes from the buffer. */
2713 s1 = s;
2714 for (i = 0; i < *mbclen; i++)
2715 {
2716 s2 = s1;
2717 rs = transit_state_singlebyte(d, s2, (*pp)++, &s1);
2718 }
2719 /* Copy the positions contained by `s1' to the set `pps'. */
2720 copy(&(d->states[s1].elems), pps);
2721
2722 /* Check (inputed)match_lens, and initialize if it is NULL. */
2723 if (match_lens == NULL && d->states[s].mbps.nelem != 0)
2724 work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2725 else
2726 work_mbls = match_lens;
2727
2728 /* Add all of the positions which can be reached from `s' by consuming
2729 a single character. */
2730 for (i = 0; i < d->states[s].mbps.nelem ; i++)
2731 {
2732 if (work_mbls[i] == *mbclen)
2733 for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
2734 j++)
2735 insert(d->follows[d->states[s].mbps.elems[i].index].elems[j],
2736 pps);
2737 }
2738
2739 if (match_lens == NULL && work_mbls != NULL)
2740 free(work_mbls);
2741 return rs;
2742}
2743
2744/* Transit state from s, then return new state and update the pointer of the
2745 buffer. This function is for some operator which can match with a multi-
2746 byte character or a collating element (which may be multi characters). */
2747static int
2748transit_state (struct dfa *d, int s, unsigned char const **pp)
2749{
2750 int s1;
2751 int mbclen; /* The length of current input multibyte character. */
2752 int maxlen = 0;
2753 int i, j;
2754 int *match_lens = NULL;
2755 int nelem = d->states[s].mbps.nelem; /* Just a alias. */
2756 position_set follows;
2757 unsigned char const *p1 = *pp;
2758 status_transit_state rs;
2759 wchar_t wc;
2760
2761 if (nelem > 0)
2762 /* This state has (a) multibyte operator(s).
2763 We check whether each of them can match or not. */
2764 {
2765 /* Note: caller must free the return value of this function. */
2766 match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin);
2767
2768 for (i = 0; i < nelem; i++)
2769 /* Search the operator which match the longest string,
2770 in this state. */
2771 {
2772 if (match_lens[i] > maxlen)
2773 maxlen = match_lens[i];
2774 }
2775 }
2776
2777 if (nelem == 0 || maxlen == 0)
2778 /* This state has no multibyte operator which can match.
2779 We need to check only one single byte character. */
2780 {
2781 status_transit_state rs;
2782 rs = transit_state_singlebyte(d, s, *pp, &s1);
2783
2784 /* We must update the pointer if state transition succeeded. */
2785 if (rs == TRANSIT_STATE_DONE)
2786 ++*pp;
2787
2788 if (match_lens != NULL)
2789 free(match_lens);
2790 return s1;
2791 }
2792
2793 /* This state has some operators which can match a multibyte character. */
2794 follows.nelem = 0;
2795 MALLOC(follows.elems, position, d->nleaves);
2796
2797 /* `maxlen' may be longer than the length of a character, because it may
2798 not be a character but a (multi character) collating element.
2799 We enumerate all of the positions which `s' can reach by consuming
2800 `maxlen' bytes. */
2801 rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
2802
2803 wc = inputwcs[*pp - mbclen - buf_begin];
2804 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2805 realloc_trans_if_necessary(d, s1);
2806
2807 while (*pp - p1 < maxlen)
2808 {
2809 follows.nelem = 0;
2810 rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows);
2811
2812 for (i = 0; i < nelem ; i++)
2813 {
2814 if (match_lens[i] == *pp - p1)
2815 for (j = 0;
2816 j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++)
2817 insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j],
2818 &follows);
2819 }
2820
2821 wc = inputwcs[*pp - mbclen - buf_begin];
2822 s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
2823 realloc_trans_if_necessary(d, s1);
2824 }
2825 free(match_lens);
2826 free(follows.elems);
2827 return s1;
2828}
2829
2830#endif /* MBS_SUPPORT */
2831
2832/* Search through a buffer looking for a match to the given struct dfa.
2833 Find the first occurrence of a string matching the regexp in the buffer,
2834 and the shortest possible version thereof. Return a pointer to the first
2835 character after the match, or NULL if none is found. Begin points to
2836 the beginning of the buffer, and end points to the first character after
2837 its end. We store a newline in *end to act as a sentinel, so end had
2838 better point somewhere valid. Newline is a flag indicating whether to
2839 allow newlines to be in the matching string. If count is non-
2840 NULL it points to a place we're supposed to increment every time we
2841 see a newline. Finally, if backref is non-NULL it points to a place
2842 where we're supposed to store a 1 if backreferencing happened and the
2843 match needs to be verified by a backtracking matcher. Otherwise
2844 we store a 0 in *backref. */
2845char *
2846dfaexec (struct dfa *d, char const *begin, char *end,
2847 int newline, int *count, int *backref)
2848{
2849 register int s, s1, tmp; /* Current state. */
2850 register unsigned char const *p; /* Current input character. */
2851 register int **trans, *t; /* Copy of d->trans so it can be optimized
2852 into a register. */
2853 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
2854 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
2855 static int sbit_init;
2856
2857 if (! sbit_init)
2858 {
2859 int i;
2860
2861 sbit_init = 1;
2862 for (i = 0; i < NOTCHAR; ++i)
2863 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
2864 sbit[eol] = 4;
2865 }
2866
2867 if (! d->tralloc)
2868 build_state_zero(d);
2869
2870 s = s1 = 0;
2871 p = (unsigned char const *) begin;
2872 trans = d->trans;
2873 *end = eol;
2874
2875#ifdef MBS_SUPPORT
2876 if (MB_CUR_MAX > 1)
2877 {
2878 int remain_bytes, i;
2879 buf_begin = begin;
2880 buf_end = end;
2881
2882 /* initialize mblen_buf, and inputwcs. */
2883 MALLOC(mblen_buf, unsigned char, end - begin + 2);
2884 MALLOC(inputwcs, wchar_t, end - begin + 2);
2885 memset(&mbs, 0, sizeof(mbstate_t));
2886 remain_bytes = 0;
2887 for (i = 0; i < end - begin + 1; i++)
2888 {
2889 if (remain_bytes == 0)
2890 {
2891 remain_bytes
2892 = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
2893 if (remain_bytes <= 1)
2894 {
2895 remain_bytes = 0;
2896 inputwcs[i] = (wchar_t)begin[i];
2897 mblen_buf[i] = 0;
2898 }
2899 else
2900 {
2901 mblen_buf[i] = remain_bytes;
2902 remain_bytes--;
2903 }
2904 }
2905 else
2906 {
2907 mblen_buf[i] = remain_bytes;
2908 inputwcs[i] = 0;
2909 remain_bytes--;
2910 }
2911 }
2912 mblen_buf[i] = 0;
2913 inputwcs[i] = 0; /* sentinel */
2914 }
2915#endif /* MBS_SUPPORT */
2916
2917 for (;;)
2918 {
2919#ifdef MBS_SUPPORT
2920 if (MB_CUR_MAX > 1)
2921 while ((t = trans[s]))
2922 {
2923 if ((char *) p > end)
2924 break;
2925 s1 = s;
2926 if (d->states[s].mbps.nelem != 0)
2927 {
2928 /* Can match with a multibyte character (and multi character
2929 collating element). */
2930 unsigned char const *nextp;
2931
2932 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2933
2934 nextp = p;
2935 s = transit_state(d, s, &nextp);
2936 p = (unsigned char *)nextp;
2937
2938 /* Trans table might be updated. */
2939 trans = d->trans;
2940 }
2941 else
2942 {
2943 SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
2944 s = t[*p++];
2945 }
2946 }
2947 else
2948#endif /* MBS_SUPPORT */
2949 while ((t = trans[s]) != 0) { /* hand-optimized loop */
2950 s1 = t[*p++];
2951 if ((t = trans[s1]) == 0) {
2952 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
2953 break;
2954 }
2955 s = t[*p++];
2956 }
2957
2958 if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
2959 {
2960 if (d->success[s] & sbit[*p])
2961 {
2962 if (backref)
2963 *backref = (d->states[s].backref != 0);
2964#ifdef MBS_SUPPORT
2965 if (MB_CUR_MAX > 1)
2966 {
2967 free(mblen_buf);
2968 free(inputwcs);
2969 }
2970#endif /* MBS_SUPPORT */
2971 return (char *) p;
2972 }
2973
2974 s1 = s;
2975#ifdef MBS_SUPPORT
2976 if (MB_CUR_MAX > 1)
2977 {
2978 unsigned char const *nextp;
2979 nextp = p;
2980 s = transit_state(d, s, &nextp);
2981 p = (unsigned char *)nextp;
2982
2983 /* Trans table might be updated. */
2984 trans = d->trans;
2985 }
2986 else
2987#endif /* MBS_SUPPORT */
2988 s = d->fails[s][*p++];
2989 continue;
2990 }
2991
2992 /* If the previous character was a newline, count it. */
2993 if (count && (char *) p <= end && p[-1] == eol)
2994 ++*count;
2995
2996 /* Check if we've run off the end of the buffer. */
2997 if ((char *) p > end)
2998 {
2999#ifdef MBS_SUPPORT
3000 if (MB_CUR_MAX > 1)
3001 {
3002 free(mblen_buf);
3003 free(inputwcs);
3004 }
3005#endif /* MBS_SUPPORT */
3006 return NULL;
3007 }
3008
3009 if (s >= 0)
3010 {
3011 build_state(s, d);
3012 trans = d->trans;
3013 continue;
3014 }
3015
3016 if (p[-1] == eol && newline)
3017 {
3018 s = d->newlines[s1];
3019 continue;
3020 }
3021
3022 s = 0;
3023 }
3024}
3025
3026/* Initialize the components of a dfa that the other routines don't
3027 initialize for themselves. */
3028void
3029dfainit (struct dfa *d)
3030{
3031 d->calloc = 1;
3032 MALLOC(d->charclasses, charclass, d->calloc);
3033 d->cindex = 0;
3034
3035 d->talloc = 1;
3036 MALLOC(d->tokens, token, d->talloc);
3037 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
3038#ifdef MBS_SUPPORT
3039 if (MB_CUR_MAX > 1)
3040 {
3041 d->nmultibyte_prop = 1;
3042 MALLOC(d->multibyte_prop, int, d->nmultibyte_prop);
3043 d->nmbcsets = 0;
3044 d->mbcsets_alloc = 1;
3045 MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc);
3046 }
3047#endif
3048
3049 d->searchflag = 0;
3050 d->tralloc = 0;
3051
3052 d->musts = 0;
3053 d->realtrans = 0;
3054 d->fails = 0;
3055 d->newlines = 0;
3056 d->success = 0;
3057}
3058
3059/* Parse and analyze a single string of the given length. */
3060void
3061dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
3062{
3063 if (case_fold) /* dummy folding in service of dfamust() */
3064 {
3065 char *lcopy;
3066 int i;
3067
3068 lcopy = malloc(len);
3069 if (!lcopy)
3070 dfaerror(_("memory exhausted"));
3071
3072 /* This is a kludge. */
3073 case_fold = 0;
3074 for (i = 0; i < len; ++i)
3075 if (ISUPPER ((unsigned char) s[i]))
3076 lcopy[i] = tolower ((unsigned char) s[i]);
3077 else
3078 lcopy[i] = s[i];
3079
3080 dfainit(d);
3081 dfaparse(lcopy, len, d);
3082 free(lcopy);
3083 dfamust(d);
3084 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
3085 case_fold = 1;
3086 dfaparse(s, len, d);
3087 dfaanalyze(d, searchflag);
3088 }
3089 else
3090 {
3091 dfainit(d);
3092 dfaparse(s, len, d);
3093 dfamust(d);
3094 dfaanalyze(d, searchflag);
3095 }
3096}
3097
3098/* Free the storage held by the components of a dfa. */
3099void
3100dfafree (struct dfa *d)
3101{
3102 int i;
3103 struct dfamust *dm, *ndm;
3104
3105 free((ptr_t) d->charclasses);
3106 free((ptr_t) d->tokens);
3107
3108#ifdef MBS_SUPPORT
3109 if (MB_CUR_MAX > 1)
3110 {
3111 free((ptr_t) d->multibyte_prop);
3112 for (i = 0; i < d->nmbcsets; ++i)
3113 {
3114 int j;
3115 struct mb_char_classes *p = &(d->mbcsets[i]);
3116 if (p->chars != NULL)
3117 free(p->chars);
3118 if (p->ch_classes != NULL)
3119 free(p->ch_classes);
3120 if (p->range_sts != NULL)
3121 free(p->range_sts);
3122 if (p->range_ends != NULL)
3123 free(p->range_ends);
3124
3125 for (j = 0; j < p->nequivs; ++j)
3126 free(p->equivs[j]);
3127 if (p->equivs != NULL)
3128 free(p->equivs);
3129
3130 for (j = 0; j < p->ncoll_elems; ++j)
3131 free(p->coll_elems[j]);
3132 if (p->coll_elems != NULL)
3133 free(p->coll_elems);
3134 }
3135 free((ptr_t) d->mbcsets);
3136 }
3137#endif /* MBS_SUPPORT */
3138
3139 for (i = 0; i < d->sindex; ++i)
3140 free((ptr_t) d->states[i].elems.elems);
3141 free((ptr_t) d->states);
3142 for (i = 0; i < d->tindex; ++i)
3143 if (d->follows[i].elems)
3144 free((ptr_t) d->follows[i].elems);
3145 free((ptr_t) d->follows);
3146 for (i = 0; i < d->tralloc; ++i)
3147 if (d->trans[i])
3148 free((ptr_t) d->trans[i]);
3149 else if (d->fails[i])
3150 free((ptr_t) d->fails[i]);
3151 if (d->realtrans) free((ptr_t) d->realtrans);
3152 if (d->fails) free((ptr_t) d->fails);
3153 if (d->newlines) free((ptr_t) d->newlines);
3154 if (d->success) free((ptr_t) d->success);
3155 for (dm = d->musts; dm; dm = ndm)
3156 {
3157 ndm = dm->next;
3158 free(dm->must);
3159 free((ptr_t) dm);
3160 }
3161}
3162
3163/* Having found the postfix representation of the regular expression,
3164 try to find a long sequence of characters that must appear in any line
3165 containing the r.e.
3166 Finding a "longest" sequence is beyond the scope here;
3167 we take an easy way out and hope for the best.
3168 (Take "(ab|a)b"--please.)
3169
3170 We do a bottom-up calculation of sequences of characters that must appear
3171 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3172 representation:
3173 sequences that must appear at the left of the match ("left")
3174 sequences that must appear at the right of the match ("right")
3175 lists of sequences that must appear somewhere in the match ("in")
3176 sequences that must constitute the match ("is")
3177
3178 When we get to the root of the tree, we use one of the longest of its
3179 calculated "in" sequences as our answer. The sequence we find is returned in
3180 d->must (where "d" is the single argument passed to "dfamust");
3181 the length of the sequence is returned in d->mustn.
3182
3183 The sequences calculated for the various types of node (in pseudo ANSI c)
3184 are shown below. "p" is the operand of unary operators (and the left-hand
3185 operand of binary operators); "q" is the right-hand operand of binary
3186 operators.
3187
3188 "ZERO" means "a zero-length sequence" below.
3189
3190 Type left right is in
3191 ---- ---- ----- -- --
3192 char c # c # c # c # c
3193
3194 ANYCHAR ZERO ZERO ZERO ZERO
3195
3196 MBCSET ZERO ZERO ZERO ZERO
3197
3198 CSET ZERO ZERO ZERO ZERO
3199
3200 STAR ZERO ZERO ZERO ZERO
3201
3202 QMARK ZERO ZERO ZERO ZERO
3203
3204 PLUS p->left p->right ZERO p->in
3205
3206 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3207 p->left : q->right : q->is!=ZERO) ? q->in plus
3208 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3209 ZERO
3210
3211 OR longest common longest common (do p->is and substrings common to
3212 leading trailing q->is have same p->in and q->in
3213 (sub)sequence (sub)sequence length and
3214 of p->left of p->right content) ?
3215 and q->left and q->right p->is : NULL
3216
3217 If there's anything else we recognize in the tree, all four sequences get set
3218 to zero-length sequences. If there's something we don't recognize in the tree,
3219 we just return a zero-length sequence.
3220
3221 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3222 'aaa')?
3223
3224 And. . .is it here or someplace that we might ponder "optimizations" such as
3225 egrep 'psi|epsilon' -> egrep 'psi'
3226 egrep 'pepsi|epsilon' -> egrep 'epsi'
3227 (Yes, we now find "epsi" as a "string
3228 that must occur", but we might also
3229 simplify the *entire* r.e. being sought)
3230 grep '[c]' -> grep 'c'
3231 grep '(ab|a)b' -> grep 'ab'
3232 grep 'ab*' -> grep 'a'
3233 grep 'a*b' -> grep 'b'
3234
3235 There are several issues:
3236
3237 Is optimization easy (enough)?
3238
3239 Does optimization actually accomplish anything,
3240 or is the automaton you get from "psi|epsilon" (for example)
3241 the same as the one you get from "psi" (for example)?
3242
3243 Are optimizable r.e.'s likely to be used in real-life situations
3244 (something like 'ab*' is probably unlikely; something like is
3245 'psi|epsilon' is likelier)? */
3246
3247static char *
3248icatalloc (char *old, char *new)
3249{
3250 char *result;
3251 size_t oldsize, newsize;
3252
3253 newsize = (new == NULL) ? 0 : strlen(new);
3254 if (old == NULL)
3255 oldsize = 0;
3256 else if (newsize == 0)
3257 return old;
3258 else oldsize = strlen(old);
3259 if (old == NULL)
3260 result = (char *) malloc(newsize + 1);
3261 else
3262 result = (char *) realloc((void *) old, oldsize + newsize + 1);
3263 if (result != NULL && new != NULL)
3264 (void) strcpy(result + oldsize, new);
3265 return result;
3266}
3267
3268static char *
3269icpyalloc (char *string)
3270{
3271 return icatalloc((char *) NULL, string);
3272}
3273
3274static char *
3275istrstr (char *lookin, char *lookfor)
3276{
3277 char *cp;
3278 size_t len;
3279
3280 len = strlen(lookfor);
3281 for (cp = lookin; *cp != '\0'; ++cp)
3282 if (strncmp(cp, lookfor, len) == 0)
3283 return cp;
3284 return NULL;
3285}
3286
3287static void
3288ifree (char *cp)
3289{
3290 if (cp != NULL)
3291 free(cp);
3292}
3293
3294static void
3295freelist (char **cpp)
3296{
3297 int i;
3298
3299 if (cpp == NULL)
3300 return;
3301 for (i = 0; cpp[i] != NULL; ++i)
3302 {
3303 free(cpp[i]);
3304 cpp[i] = NULL;
3305 }
3306}
3307
3308static char **
3309enlist (char **cpp, char *new, size_t len)
3310{
3311 int i, j;
3312
3313 if (cpp == NULL)
3314 return NULL;
3315 if ((new = icpyalloc(new)) == NULL)
3316 {
3317 freelist(cpp);
3318 return NULL;
3319 }
3320 new[len] = '\0';
3321 /* Is there already something in the list that's new (or longer)? */
3322 for (i = 0; cpp[i] != NULL; ++i)
3323 if (istrstr(cpp[i], new) != NULL)
3324 {
3325 free(new);
3326 return cpp;
3327 }
3328 /* Eliminate any obsoleted strings. */
3329 j = 0;
3330 while (cpp[j] != NULL)
3331 if (istrstr(new, cpp[j]) == NULL)
3332 ++j;
3333 else
3334 {
3335 free(cpp[j]);
3336 if (--i == j)
3337 break;
3338 cpp[j] = cpp[i];
3339 cpp[i] = NULL;
3340 }
3341 /* Add the new string. */
3342 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
3343 if (cpp == NULL)
3344 return NULL;
3345 cpp[i] = new;
3346 cpp[i + 1] = NULL;
3347 return cpp;
3348}
3349
3350/* Given pointers to two strings, return a pointer to an allocated
3351 list of their distinct common substrings. Return NULL if something
3352 seems wild. */
3353static char **
3354comsubs (char *left, char *right)
3355{
3356 char **cpp;
3357 char *lcp;
3358 char *rcp;
3359 size_t i, len;
3360
3361 if (left == NULL || right == NULL)
3362 return NULL;
3363 cpp = (char **) malloc(sizeof *cpp);
3364 if (cpp == NULL)
3365 return NULL;
3366 cpp[0] = NULL;
3367 for (lcp = left; *lcp != '\0'; ++lcp)
3368 {
3369 len = 0;
3370 rcp = strchr (right, *lcp);
3371 while (rcp != NULL)
3372 {
3373 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3374 continue;
3375 if (i > len)
3376 len = i;
3377 rcp = strchr (rcp + 1, *lcp);
3378 }
3379 if (len == 0)
3380 continue;
3381 if ((cpp = enlist(cpp, lcp, len)) == NULL)
3382 break;
3383 }
3384 return cpp;
3385}
3386
3387static char **
3388addlists (char **old, char **new)
3389{
3390 int i;
3391
3392 if (old == NULL || new == NULL)
3393 return NULL;
3394 for (i = 0; new[i] != NULL; ++i)
3395 {
3396 old = enlist(old, new[i], strlen(new[i]));
3397 if (old == NULL)
3398 break;
3399 }
3400 return old;
3401}
3402
3403/* Given two lists of substrings, return a new list giving substrings
3404 common to both. */
3405static char **
3406inboth (char **left, char **right)
3407{
3408 char **both;
3409 char **temp;
3410 int lnum, rnum;
3411
3412 if (left == NULL || right == NULL)
3413 return NULL;
3414 both = (char **) malloc(sizeof *both);
3415 if (both == NULL)
3416 return NULL;
3417 both[0] = NULL;
3418 for (lnum = 0; left[lnum] != NULL; ++lnum)
3419 {
3420 for (rnum = 0; right[rnum] != NULL; ++rnum)
3421 {
3422 temp = comsubs(left[lnum], right[rnum]);
3423 if (temp == NULL)
3424 {
3425 freelist(both);
3426 return NULL;
3427 }
3428 both = addlists(both, temp);
3429 freelist(temp);
3430 free(temp);
3431 if (both == NULL)
3432 return NULL;
3433 }
3434 }
3435 return both;
3436}
3437
3438typedef struct
3439{
3440 char **in;
3441 char *left;
3442 char *right;
3443 char *is;
3444} must;
3445
3446static void
3447resetmust (must *mp)
3448{
3449 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3450 freelist(mp->in);
3451}
3452
3453static void
3454dfamust (struct dfa *dfa)
3455{
3456 must *musts;
3457 must *mp;
3458 char *result;
3459 int ri;
3460 int i;
3461 int exact;
3462 token t;
3463 static must must0;
3464 struct dfamust *dm;
3465 static char empty_string[] = "";
3466
3467 result = empty_string;
3468 exact = 0;
3469 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
3470 if (musts == NULL)
3471 return;
3472 mp = musts;
3473 for (i = 0; i <= dfa->tindex; ++i)
3474 mp[i] = must0;
3475 for (i = 0; i <= dfa->tindex; ++i)
3476 {
3477 mp[i].in = (char **) malloc(sizeof *mp[i].in);
3478 mp[i].left = malloc(2);
3479 mp[i].right = malloc(2);
3480 mp[i].is = malloc(2);
3481 if (mp[i].in == NULL || mp[i].left == NULL ||
3482 mp[i].right == NULL || mp[i].is == NULL)
3483 goto done;
3484 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
3485 mp[i].in[0] = NULL;
3486 }
3487#ifdef DEBUG
3488 fprintf(stderr, "dfamust:\n");
3489 for (i = 0; i < dfa->tindex; ++i)
3490 {
3491 fprintf(stderr, " %d:", i);
3492 prtok(dfa->tokens[i]);
3493 }
3494 putc('\n', stderr);
3495#endif
3496 for (ri = 0; ri < dfa->tindex; ++ri)
3497 {
3498 switch (t = dfa->tokens[ri])
3499 {
3500 case LPAREN:
3501 case RPAREN:
3502 goto done; /* "cannot happen" */
3503 case EMPTY:
3504 case BEGLINE:
3505 case ENDLINE:
3506 case BEGWORD:
3507 case ENDWORD:
3508 case LIMWORD:
3509 case NOTLIMWORD:
3510 case BACKREF:
3511 resetmust(mp);
3512 break;
3513 case STAR:
3514 case QMARK:
3515 if (mp <= musts)
3516 goto done; /* "cannot happen" */
3517 --mp;
3518 resetmust(mp);
3519 break;
3520 case OR:
3521 case ORTOP:
3522 if (mp < &musts[2])
3523 goto done; /* "cannot happen" */
3524 {
3525 char **new;
3526 must *lmp;
3527 must *rmp;
3528 int j, ln, rn, n;
3529
3530 rmp = --mp;
3531 lmp = --mp;
3532 /* Guaranteed to be. Unlikely, but. . . */
3533 if (strcmp(lmp->is, rmp->is) != 0)
3534 lmp->is[0] = '\0';
3535 /* Left side--easy */
3536 i = 0;
3537 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3538 ++i;
3539 lmp->left[i] = '\0';
3540 /* Right side */
3541 ln = strlen(lmp->right);
3542 rn = strlen(rmp->right);
3543 n = ln;
3544 if (n > rn)
3545 n = rn;
3546 for (i = 0; i < n; ++i)
3547 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3548 break;
3549 for (j = 0; j < i; ++j)
3550 lmp->right[j] = lmp->right[(ln - i) + j];
3551 lmp->right[j] = '\0';
3552 new = inboth(lmp->in, rmp->in);
3553 if (new == NULL)
3554 goto done;
3555 freelist(lmp->in);
3556 free((char *) lmp->in);
3557 lmp->in = new;
3558 }
3559 break;
3560 case PLUS:
3561 if (mp <= musts)
3562 goto done; /* "cannot happen" */
3563 --mp;
3564 mp->is[0] = '\0';
3565 break;
3566 case END:
3567 if (mp != &musts[1])
3568 goto done; /* "cannot happen" */
3569 for (i = 0; musts[0].in[i] != NULL; ++i)
3570 if (strlen(musts[0].in[i]) > strlen(result))
3571 result = musts[0].in[i];
3572 if (strcmp(result, musts[0].is) == 0)
3573 exact = 1;
3574 goto done;
3575 case CAT:
3576 if (mp < &musts[2])
3577 goto done; /* "cannot happen" */
3578 {
3579 must *lmp;
3580 must *rmp;
3581
3582 rmp = --mp;
3583 lmp = --mp;
3584 /* In. Everything in left, plus everything in
3585 right, plus catenation of
3586 left's right and right's left. */
3587 lmp->in = addlists(lmp->in, rmp->in);
3588 if (lmp->in == NULL)
3589 goto done;
3590 if (lmp->right[0] != '\0' &&
3591 rmp->left[0] != '\0')
3592 {
3593 char *tp;
3594
3595 tp = icpyalloc(lmp->right);
3596 if (tp == NULL)
3597 goto done;
3598 tp = icatalloc(tp, rmp->left);
3599 if (tp == NULL)
3600 goto done;
3601 lmp->in = enlist(lmp->in, tp,
3602 strlen(tp));
3603 free(tp);
3604 if (lmp->in == NULL)
3605 goto done;
3606 }
3607 /* Left-hand */
3608 if (lmp->is[0] != '\0')
3609 {
3610 lmp->left = icatalloc(lmp->left,
3611 rmp->left);
3612 if (lmp->left == NULL)
3613 goto done;
3614 }
3615 /* Right-hand */
3616 if (rmp->is[0] == '\0')
3617 lmp->right[0] = '\0';
3618 lmp->right = icatalloc(lmp->right, rmp->right);
3619 if (lmp->right == NULL)
3620 goto done;
3621 /* Guaranteed to be */
3622 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
3623 {
3624 lmp->is = icatalloc(lmp->is, rmp->is);
3625 if (lmp->is == NULL)
3626 goto done;
3627 }
3628 else
3629 lmp->is[0] = '\0';
3630 }
3631 break;
3632 default:
3633 if (t < END)
3634 {
3635 /* "cannot happen" */
3636 goto done;
3637 }
3638 else if (t == '\0')
3639 {
3640 /* not on *my* shift */
3641 goto done;
3642 }
3643 else if (t >= CSET
3644#ifdef MBS_SUPPORT
3645 || t == ANYCHAR
3646 || t == MBCSET
3647#endif /* MBS_SUPPORT */
3648 )
3649 {
3650 /* easy enough */
3651 resetmust(mp);
3652 }
3653 else
3654 {
3655 /* plain character */
3656 resetmust(mp);
3657 mp->is[0] = mp->left[0] = mp->right[0] = t;
3658 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
3659 mp->in = enlist(mp->in, mp->is, (size_t)1);
3660 if (mp->in == NULL)
3661 goto done;
3662 }
3663 break;
3664 }
3665#ifdef DEBUG
3666 fprintf(stderr, " node: %d:", ri);
3667 prtok(dfa->tokens[ri]);
3668 fprintf(stderr, "\n in:");
3669 for (i = 0; mp->in[i]; ++i)
3670 fprintf(stderr, " \"%s\"", mp->in[i]);
3671 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
3672 fprintf(stderr, " left: \"%s\"\n", mp->left);
3673 fprintf(stderr, " right: \"%s\"\n", mp->right);
3674#endif
3675 ++mp;
3676 }
3677 done:
3678 if (strlen(result))
3679 {
3680 MALLOC(dm, struct dfamust, 1);
3681 dm->exact = exact;
3682 MALLOC(dm->must, char, strlen(result) + 1);
3683 strcpy(dm->must, result);
3684 dm->next = dfa->musts;
3685 dfa->musts = dm;
3686 }
3687 mp = musts;
3688 for (i = 0; i <= dfa->tindex; ++i)
3689 {
3690 freelist(mp[i].in);
3691 ifree((char *) mp[i].in);
3692 ifree(mp[i].left);
3693 ifree(mp[i].right);
3694 ifree(mp[i].is);
3695 }
3696 free((char *) mp);
3697}
3698/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.