source: trunk/src/grep/lib/dfa.c@ 3658

Last change on this file since 3658 was 3658, checked in by bird, 9 months ago

grep/dfa.c: Workaround for Visual C++ 2022 (amd64) optimizer bug. (same as sed)

  • Property svn:eol-style set to native
File size: 134.7 KB
Line 
1/* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3 Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
19
20/* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
22
23#include <config.h>
24
25#include "dfa.h"
26
27#include "flexmember.h"
28#include "idx.h"
29#include "verify.h"
30
31#include <assert.h>
32#include <ctype.h>
33#include <stdint.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <limits.h>
37#include <string.h>
38
39/* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
40 understandably cannot deduce that the input comes from a
41 well-formed regular expression. There's little point to the
42 runtime overhead of 'assert' instead of 'assume_nonnull' when the
43 MMU will check anyway. */
44#define assume_nonnull(x) assume ((x) != NULL)
45
46static bool
47streq (char const *a, char const *b)
48{
49 return strcmp (a, b) == 0;
50}
51
52static bool
53isasciidigit (char c)
54{
55 return '0' <= c && c <= '9';
56}
57
58#include "gettext.h"
59#define _(str) gettext (str)
60
61#include <wchar.h>
62
63#include "xalloc.h"
64#include "localeinfo.h"
65
66#ifndef FALLTHROUGH
67# if 201710L < __STDC_VERSION__
68# define FALLTHROUGH [[__fallthrough__]]
69# elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
70# define FALLTHROUGH __attribute__ ((__fallthrough__))
71# else
72# define FALLTHROUGH ((void) 0)
73# endif
74#endif
75
76#ifndef MIN
77# define MIN(a,b) ((a) < (b) ? (a) : (b))
78#endif
79
80/* HPUX defines these as macros in sys/param.h. */
81#ifdef setbit
82# undef setbit
83#endif
84#ifdef clrbit
85# undef clrbit
86#endif
87
88/* For code that does not use Gnulib’s isblank module. */
89#if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
90# define isblank dfa_isblank
91static int
92isblank (int c)
93{
94 return c == ' ' || c == '\t';
95}
96#endif
97
98/* First integer value that is greater than any character code. */
99enum { NOTCHAR = 1 << CHAR_BIT };
100
101#ifdef UINT_LEAST64_MAX
102
103/* Number of bits used in a charclass word. */
104enum { CHARCLASS_WORD_BITS = 64 };
105
106/* This represents part of a character class. It must be unsigned and
107 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
108typedef uint_least64_t charclass_word;
109
110/* Part of a charclass initializer that represents 64 bits' worth of a
111 charclass, where LO and HI are the low and high-order 32 bits of
112 the 64-bit quantity. */
113# define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
114
115#else
116/* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
117enum { CHARCLASS_WORD_BITS = 32 };
118typedef unsigned long charclass_word;
119# define CHARCLASS_PAIR(lo, hi) lo, hi
120#endif
121
122/* An initializer for a charclass whose 32-bit words are A through H. */
123#define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
124 {{ \
125 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
126 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127 }}
128
129/* The maximum useful value of a charclass_word; all used bits are 1. */
130static charclass_word const CHARCLASS_WORD_MASK
131 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
132
133/* Number of words required to hold a bit for every character. */
134enum
135{
136 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137};
138
139/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
140typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
141
142/* Convert a possibly-signed character to an unsigned character. This is
143 a bit safer than casting to unsigned char, since it catches some type
144 errors that the cast doesn't. */
145static unsigned char
146to_uchar (char ch)
147{
148 return ch;
149}
150
151/* Contexts tell us whether a character is a newline or a word constituent.
152 Word-constituent characters are those that satisfy iswalnum, plus '_'.
153 Each character has a single CTX_* value; bitmasks of CTX_* values denote
154 a particular character class.
155
156 A state also stores a context value, which is a bitmask of CTX_* values.
157 A state's context represents a set of characters that the state's
158 predecessors must match. For example, a state whose context does not
159 include CTX_LETTER will never have transitions where the previous
160 character is a word constituent. A state whose context is CTX_ANY
161 might have transitions from any character. */
162
163enum
164 {
165 CTX_NONE = 1,
166 CTX_LETTER = 2,
167 CTX_NEWLINE = 4,
168 CTX_ANY = 7
169 };
170
171/* Sometimes characters can only be matched depending on the surrounding
172 context. Such context decisions depend on what the previous character
173 was, and the value of the current (lookahead) character. Context
174 dependent constraints are encoded as 9-bit integers. Each bit that
175 is set indicates that the constraint succeeds in the corresponding
176 context.
177
178 bit 6-8 - valid contexts when next character is CTX_NEWLINE
179 bit 3-5 - valid contexts when next character is CTX_LETTER
180 bit 0-2 - valid contexts when next character is CTX_NONE
181
182 succeeds_in_context determines whether a given constraint
183 succeeds in a particular context. Prev is a bitmask of possible
184 context values for the previous character, curr is the (single-bit)
185 context value for the lookahead character. */
186static int
187newline_constraint (int constraint)
188{
189 return (constraint >> 6) & 7;
190}
191static int
192letter_constraint (int constraint)
193{
194 return (constraint >> 3) & 7;
195}
196static int
197other_constraint (int constraint)
198{
199 return constraint & 7;
200}
201
202static bool
203succeeds_in_context (int constraint, int prev, int curr)
204{
205 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
206 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
207 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
208 & prev);
209}
210
211/* The following describe what a constraint depends on. */
212static bool
213prev_newline_dependent (int constraint)
214{
215 return ((constraint ^ constraint >> 2) & 0111) != 0;
216}
217static bool
218prev_letter_dependent (int constraint)
219{
220 return ((constraint ^ constraint >> 1) & 0111) != 0;
221}
222
223/* Tokens that match the empty string subject to some constraint actually
224 work by applying that constraint to determine what may follow them,
225 taking into account what has gone before. The following values are
226 the constraints corresponding to the special tokens previously defined. */
227enum
228 {
229 NO_CONSTRAINT = 0777,
230 BEGLINE_CONSTRAINT = 0444,
231 ENDLINE_CONSTRAINT = 0700,
232 BEGWORD_CONSTRAINT = 0050,
233 ENDWORD_CONSTRAINT = 0202,
234 LIMWORD_CONSTRAINT = 0252,
235 NOTLIMWORD_CONSTRAINT = 0525
236 };
237
238/* The regexp is parsed into an array of tokens in postfix form. Some tokens
239 are operators and others are terminal symbols. Most (but not all) of these
240 codes are returned by the lexical analyzer. */
241
242typedef ptrdiff_t token;
243static token const TOKEN_MAX = PTRDIFF_MAX;
244
245/* States are indexed by state_num values. These are normally
246 nonnegative but -1 is used as a special value. */
247typedef ptrdiff_t state_num;
248
249/* Predefined token values. */
250enum
251{
252 END = -1, /* END is a terminal symbol that matches the
253 end of input; any value of END or less in
254 the parse tree is such a symbol. Accepting
255 states of the DFA are those that would have
256 a transition on END. This is -1, not some
257 more-negative value, to tweak the speed of
258 comparisons to END. */
259
260 /* Ordinary character values are terminal symbols that match themselves. */
261
262 /* CSET must come last in the following list of special tokens. Otherwise,
263 the list order matters only for performance. Related special tokens
264 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
265 || CSET <= t) can be done with a single machine-level comparison. */
266
267 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
268 the empty string. */
269
270 QMARK, /* QMARK is an operator of one argument that
271 matches zero or one occurrences of its
272 argument. */
273
274 STAR, /* STAR is an operator of one argument that
275 matches the Kleene closure (zero or more
276 occurrences) of its argument. */
277
278 PLUS, /* PLUS is an operator of one argument that
279 matches the positive closure (one or more
280 occurrences) of its argument. */
281
282 REPMN, /* REPMN is a lexical token corresponding
283 to the {m,n} construct. REPMN never
284 appears in the compiled token vector. */
285
286 CAT, /* CAT is an operator of two arguments that
287 matches the concatenation of its
288 arguments. CAT is never returned by the
289 lexical analyzer. */
290
291 OR, /* OR is an operator of two arguments that
292 matches either of its arguments. */
293
294 LPAREN, /* LPAREN never appears in the parse tree,
295 it is only a lexeme. */
296
297 RPAREN, /* RPAREN never appears in the parse tree. */
298
299#if defined(KMK_GREP) && defined(KBUILD_OS_WINDOWS)
300# define WCHAR DFA_WCHAR
301#endif
302 WCHAR, /* Only returned by lex. wctok contains
303 the wide character representation. */
304
305 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
306 a valid multibyte (or single byte) character.
307 It is used only if MB_CUR_MAX > 1. */
308
309 BEG, /* BEG is an initial symbol that matches the
310 beginning of input. */
311
312 BEGLINE, /* BEGLINE is a terminal symbol that matches
313 the empty string at the beginning of a
314 line. */
315
316 ENDLINE, /* ENDLINE is a terminal symbol that matches
317 the empty string at the end of a line. */
318
319 BEGWORD, /* BEGWORD is a terminal symbol that matches
320 the empty string at the beginning of a
321 word. */
322
323 ENDWORD, /* ENDWORD is a terminal symbol that matches
324 the empty string at the end of a word. */
325
326 LIMWORD, /* LIMWORD is a terminal symbol that matches
327 the empty string at the beginning or the
328 end of a word. */
329
330 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
331 matches the empty string not at
332 the beginning or end of a word. */
333
334 BACKREF, /* BACKREF is generated by \<digit>
335 or by any other construct that
336 is not completely handled. If the scanner
337 detects a transition on backref, it returns
338 a kind of "semi-success" indicating that
339 the match will have to be verified with
340 a backtracking matcher. */
341
342 MBCSET, /* MBCSET is similar to CSET, but for
343 multibyte characters. */
344
345 CSET /* CSET and (and any value greater) is a
346 terminal symbol that matches any of a
347 class of characters. */
348};
349
350
351/* States of the recognizer correspond to sets of positions in the parse
352 tree, together with the constraints under which they may be matched.
353 So a position is encoded as an index into the parse tree together with
354 a constraint. */
355typedef struct
356{
357 idx_t index; /* Index into the parse array. */
358 unsigned int constraint; /* Constraint for matching this position. */
359} position;
360
361/* Sets of positions are stored as arrays. */
362typedef struct
363{
364 position *elems; /* Elements of this position set. */
365 idx_t nelem; /* Number of elements in this set. */
366 idx_t alloc; /* Number of elements allocated in ELEMS. */
367} position_set;
368
369/* A state of the dfa consists of a set of positions, some flags,
370 and the token value of the lowest-numbered position of the state that
371 contains an END token. */
372typedef struct
373{
374 size_t hash; /* Hash of the positions of this state. */
375 position_set elems; /* Positions this state could match. */
376 unsigned char context; /* Context from previous state. */
377 unsigned short constraint; /* Constraint for this state to accept. */
378 position_set mbps; /* Positions which can match multibyte
379 characters or the follows, e.g., period.
380 Used only if MB_CUR_MAX > 1. */
381 state_num mb_trindex; /* Index of this state in MB_TRANS, or
382 negative if the state does not have
383 ANYCHAR. */
384} dfa_state;
385
386/* Maximum for any transition table count. This should be at least 3,
387 for the initial state setup. */
388enum { MAX_TRCOUNT = 1024 };
389
390/* A bracket operator.
391 e.g., [a-c], [[:alpha:]], etc. */
392struct mb_char_classes
393{
394 ptrdiff_t cset;
395 bool invert;
396 wchar_t *chars; /* Normal characters. */
397 idx_t nchars;
398 idx_t nchars_alloc;
399};
400
401struct regex_syntax
402{
403 /* Syntax bits controlling the behavior of the lexical analyzer. */
404 reg_syntax_t syntax_bits;
405 bool syntax_bits_set;
406
407 /* Flag for case-folding letters into sets. */
408 bool case_fold;
409
410 /* True if ^ and $ match only the start and end of data, and do not match
411 end-of-line within data. */
412 bool anchor;
413
414 /* End-of-line byte in data. */
415 unsigned char eolbyte;
416
417 /* Cache of char-context values. */
418 char sbit[NOTCHAR];
419
420 /* If never_trail[B], the byte B cannot be a non-initial byte in a
421 multibyte character. */
422 bool never_trail[NOTCHAR];
423
424 /* Set of characters considered letters. */
425 charclass letters;
426
427 /* Set of characters that are newline. */
428 charclass newline;
429};
430
431/* Lexical analyzer. All the dross that deals with the obnoxious
432 GNU Regex syntax bits is located here. The poor, suffering
433 reader is referred to the GNU Regex documentation for the
434 meaning of the @#%!@#%^!@ syntax bits. */
435struct lexer_state
436{
437 char const *ptr; /* Pointer to next input character. */
438 idx_t left; /* Number of characters remaining. */
439 token lasttok; /* Previous token returned; initially END. */
440 idx_t parens; /* Count of outstanding left parens. */
441 int minrep, maxrep; /* Repeat counts for {m,n}. */
442
443 /* Wide character representation of the current multibyte character,
444 or WEOF if there was an encoding error. Used only if
445 MB_CUR_MAX > 1. */
446 wint_t wctok;
447
448 /* The most recently analyzed multibyte bracket expression. */
449 struct mb_char_classes brack;
450
451 /* We're separated from beginning or (, | only by zero-width characters. */
452 bool laststart;
453};
454
455/* Recursive descent parser for regular expressions. */
456
457struct parser_state
458{
459 token tok; /* Lookahead token. */
460 idx_t depth; /* Current depth of a hypothetical stack
461 holding deferred productions. This is
462 used to determine the depth that will be
463 required of the real stack later on in
464 dfaanalyze. */
465};
466
467/* A compiled regular expression. */
468struct dfa
469{
470 /* Fields filled by the scanner. */
471 charclass *charclasses; /* Array of character sets for CSET tokens. */
472 idx_t cindex; /* Index for adding new charclasses. */
473 idx_t calloc; /* Number of charclasses allocated. */
474 ptrdiff_t canychar; /* Index of anychar class, or -1. */
475
476 /* Scanner state */
477 struct lexer_state lex;
478
479 /* Parser state */
480 struct parser_state parse;
481
482 /* Fields filled by the parser. */
483 token *tokens; /* Postfix parse array. */
484 idx_t tindex; /* Index for adding new tokens. */
485 idx_t talloc; /* Number of tokens currently allocated. */
486 idx_t depth; /* Depth required of an evaluation stack
487 used for depth-first traversal of the
488 parse tree. */
489 idx_t nleaves; /* Number of non-EMPTY leaves
490 in the parse tree. */
491 idx_t nregexps; /* Count of parallel regexps being built
492 with dfaparse. */
493 bool fast; /* The DFA is fast. */
494 bool epsilon; /* Does a token match only the empty string? */
495 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
496 mbstate_t mbs; /* Multibyte conversion state. */
497
498 /* The following are valid only if MB_CUR_MAX > 1. */
499
500 /* The value of multibyte_prop[i] is defined by following rule.
501 if tokens[i] < NOTCHAR
502 bit 0 : tokens[i] is the first byte of a character, including
503 single-byte characters.
504 bit 1 : tokens[i] is the last byte of a character, including
505 single-byte characters.
506
507 e.g.
508 tokens
509 = 'single_byte_a', 'multi_byte_A', single_byte_b'
510 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
511 multibyte_prop
512 = 3 , 1 , 0 , 2 , 3
513 */
514 char *multibyte_prop;
515
516 /* Fields filled by the superset. */
517 struct dfa *superset; /* Hint of the dfa. */
518
519 /* Fields filled by the state builder. */
520 dfa_state *states; /* States of the dfa. */
521 state_num sindex; /* Index for adding new states. */
522 idx_t salloc; /* Number of states currently allocated. */
523
524 /* Fields filled by the parse tree->NFA conversion. */
525 position_set *follows; /* Array of follow sets, indexed by position
526 index. The follow of a position is the set
527 of positions containing characters that
528 could conceivably follow a character
529 matching the given position in a string
530 matching the regexp. Allocated to the
531 maximum possible position index. */
532 bool searchflag; /* We are supposed to build a searching
533 as opposed to an exact matcher. A searching
534 matcher finds the first and shortest string
535 matching a regexp anywhere in the buffer,
536 whereas an exact matcher finds the longest
537 string matching, but anchored to the
538 beginning of the buffer. */
539
540 /* Fields filled by dfaanalyze. */
541 int *constraints; /* Array of union of accepting constraints
542 in the follow of a position. */
543 int *separates; /* Array of contexts on follow of a
544 position. */
545
546 /* Fields filled by dfaexec. */
547 state_num tralloc; /* Number of transition tables that have
548 slots so far, not counting trans[-1] and
549 trans[-2]. */
550 int trcount; /* Number of transition tables that have
551 been built, other than for initial
552 states. */
553 int min_trcount; /* Number of initial states. Equivalently,
554 the minimum state number for which trcount
555 counts transitions. */
556 state_num **trans; /* Transition tables for states that can
557 never accept. If the transitions for a
558 state have not yet been computed, or the
559 state could possibly accept, its entry in
560 this table is NULL. This points to two
561 past the start of the allocated array,
562 and trans[-1] and trans[-2] are always
563 NULL. */
564 state_num **fails; /* Transition tables after failing to accept
565 on a state that potentially could do so.
566 If trans[i] is non-null, fails[i] must
567 be null. */
568 char *success; /* Table of acceptance conditions used in
569 dfaexec and computed in build_state. */
570 state_num *newlines; /* Transitions on newlines. The entry for a
571 newline in any transition table is always
572 -1 so we can count lines without wasting
573 too many cycles. The transition for a
574 newline is stored separately and handled
575 as a special case. Newline is also used
576 as a sentinel at the end of the buffer. */
577 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
578 context in multibyte locales, in which we
579 do not distinguish between their contexts,
580 as not supported word. */
581 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
582 state_num **mb_trans; /* Transition tables for states with
583 ANYCHAR. */
584 state_num mb_trcount; /* Number of transition tables for states with
585 ANYCHAR that have actually been built. */
586
587 /* Syntax configuration. This is near the end so that dfacopysyntax
588 can memset up to here. */
589 struct regex_syntax syntax;
590
591 /* Information derived from the locale. This is at the end so that
592 a quick memset need not clear it specially. */
593
594 /* dfaexec implementation. */
595 char *(*dfaexec) (struct dfa *, char const *, char *,
596 bool, ptrdiff_t *, bool *);
597
598 /* Other cached information derived from the locale. */
599 struct localeinfo localeinfo;
600};
601
602/* User access to dfa internals. */
603
604/* S could possibly be an accepting state of R. */
605static bool
606accepting (state_num s, struct dfa const *r)
607{
608 return r->states[s].constraint != 0;
609}
610
611/* STATE accepts in the specified context. */
612static bool
613accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
614{
615 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
616}
617
618static void regexp (struct dfa *dfa);
619
620/* Store into *PWC the result of converting the leading bytes of the
621 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
622 and updating the conversion state in *D. On conversion error,
623 convert just a single byte, to WEOF. Return the number of bytes
624 converted.
625
626 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
627
628 * PWC points to wint_t, not to wchar_t.
629 * The last arg is a dfa *D instead of merely a multibyte conversion
630 state D->mbs.
631 * N is idx_t not size_t, and must be at least 1.
632 * S[N - 1] must be a sentinel byte.
633 * Shift encodings are not supported.
634 * The return value is always in the range 1..N.
635 * D->mbs is always valid afterwards.
636 * *PWC is always set to something. */
637static int
638mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
639{
640 unsigned char uc = s[0];
641 wint_t wc = d->localeinfo.sbctowc[uc];
642
643 if (wc == WEOF)
644 {
645 wchar_t wch;
646 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
647 if (0 < nbytes && nbytes < (size_t) -2)
648 {
649 *pwc = wch;
650 return nbytes;
651 }
652 memset (&d->mbs, 0, sizeof d->mbs);
653 }
654
655 *pwc = wc;
656 return 1;
657}
658
659#ifdef DEBUG
660
661static void
662prtok (token t)
663{
664 if (t <= END)
665 fprintf (stderr, "END");
666 else if (0 <= t && t < NOTCHAR)
667 {
668 unsigned int ch = t;
669 fprintf (stderr, "0x%02x", ch);
670 }
671 else
672 {
673 char const *s;
674 switch (t)
675 {
676 case BEG:
677 s = "BEG";
678 break;
679 case EMPTY:
680 s = "EMPTY";
681 break;
682 case BACKREF:
683 s = "BACKREF";
684 break;
685 case BEGLINE:
686 s = "BEGLINE";
687 break;
688 case ENDLINE:
689 s = "ENDLINE";
690 break;
691 case BEGWORD:
692 s = "BEGWORD";
693 break;
694 case ENDWORD:
695 s = "ENDWORD";
696 break;
697 case LIMWORD:
698 s = "LIMWORD";
699 break;
700 case NOTLIMWORD:
701 s = "NOTLIMWORD";
702 break;
703 case QMARK:
704 s = "QMARK";
705 break;
706 case STAR:
707 s = "STAR";
708 break;
709 case PLUS:
710 s = "PLUS";
711 break;
712 case CAT:
713 s = "CAT";
714 break;
715 case OR:
716 s = "OR";
717 break;
718 case LPAREN:
719 s = "LPAREN";
720 break;
721 case RPAREN:
722 s = "RPAREN";
723 break;
724 case ANYCHAR:
725 s = "ANYCHAR";
726 break;
727 case MBCSET:
728 s = "MBCSET";
729 break;
730 default:
731 s = "CSET";
732 break;
733 }
734 fprintf (stderr, "%s", s);
735 }
736}
737#endif /* DEBUG */
738
739/* Stuff pertaining to charclasses. */
740
741static bool
742tstbit (unsigned int b, charclass const *c)
743{
744 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
745}
746
747static void
748setbit (unsigned int b, charclass *c)
749{
750 charclass_word one = 1;
751 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
752}
753
754static void
755clrbit (unsigned int b, charclass *c)
756{
757 charclass_word one = 1;
758 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
759}
760
761static void
762zeroset (charclass *s)
763{
764 memset (s, 0, sizeof *s);
765}
766
767static void
768fillset (charclass *s)
769{
770 for (int i = 0; i < CHARCLASS_WORDS; i++)
771 s->w[i] = CHARCLASS_WORD_MASK;
772}
773
774static void
775notset (charclass *s)
776{
777 for (int i = 0; i < CHARCLASS_WORDS; ++i)
778 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
779}
780
781static bool
782equal (charclass const *s1, charclass const *s2)
783{
784 charclass_word w = 0;
785 for (int i = 0; i < CHARCLASS_WORDS; i++)
786 w |= s1->w[i] ^ s2->w[i];
787 return w == 0;
788}
789
790static bool
791emptyset (charclass const *s)
792{
793 charclass_word w = 0;
794 for (int i = 0; i < CHARCLASS_WORDS; i++)
795 w |= s->w[i];
796 return w == 0;
797}
798
799/* Ensure that the array addressed by PA holds at least I + 1 items.
800 Either return PA, or reallocate the array and return its new address.
801 Although PA may be null, the returned value is never null.
802
803 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
804 is updated on reallocation. If PA is null, *NITEMS must be zero.
805 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
806 ITEM_SIZE is the size of one item; it must be positive.
807 Avoid O(N**2) behavior on arrays growing linearly. */
808static void *
809maybe_realloc (void *pa, idx_t i, idx_t *nitems,
810 ptrdiff_t nitems_max, idx_t item_size)
811{
812 if (i < *nitems)
813 return pa;
814 return xpalloc (pa, nitems, 1, nitems_max, item_size);
815}
816
817/* In DFA D, find the index of charclass S, or allocate a new one. */
818static idx_t
819charclass_index (struct dfa *d, charclass const *s)
820{
821 idx_t i;
822
823 for (i = 0; i < d->cindex; ++i)
824 if (equal (s, &d->charclasses[i]))
825 return i;
826 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
827 TOKEN_MAX - CSET, sizeof *d->charclasses);
828 ++d->cindex;
829 d->charclasses[i] = *s;
830 return i;
831}
832
833static bool
834unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
835{
836 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
837}
838
839static int
840char_context (struct dfa const *dfa, unsigned char c)
841{
842 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
843 return CTX_NEWLINE;
844 if (unibyte_word_constituent (dfa, c))
845 return CTX_LETTER;
846 return CTX_NONE;
847}
848
849/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
850 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
851 this may happen when folding case in weird Turkish locales where
852 dotless i/dotted I are not included in the chosen character set.
853 Return whether a bit was set in the charclass. */
854static bool
855setbit_wc (wint_t wc, charclass *c)
856{
857 int b = wctob (wc);
858 if (b < 0)
859 return false;
860
861 setbit (b, c);
862 return true;
863}
864
865/* Set a bit for B and its case variants in the charclass C.
866 MB_CUR_MAX must be 1. */
867static void
868setbit_case_fold_c (int b, charclass *c)
869{
870 int ub = toupper (b);
871 for (int i = 0; i < NOTCHAR; i++)
872 if (toupper (i) == ub)
873 setbit (i, c);
874}
875
876/* Fetch the next lexical input character from the pattern. There
877 must at least one byte of pattern input. Set DFA->lex.wctok to the
878 value of the character or to WEOF depending on whether the input is
879 a valid multibyte character (possibly of length 1). Then return
880 the next input byte value, except return EOF if the input is a
881 multibyte character of length greater than 1. */
882static int
883fetch_wc (struct dfa *dfa)
884{
885 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
886 dfa);
887 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
888 dfa->lex.ptr += nbytes;
889 dfa->lex.left -= nbytes;
890 return c;
891}
892
893/* If there is no more input, report an error about unbalanced brackets.
894 Otherwise, behave as with fetch_wc (DFA). */
895static int
896bracket_fetch_wc (struct dfa *dfa)
897{
898 if (! dfa->lex.left)
899 dfaerror (_("unbalanced ["));
900 return fetch_wc (dfa);
901}
902
903typedef int predicate (int);
904
905/* The following list maps the names of the Posix named character classes
906 to predicate functions that determine whether a given character is in
907 the class. The leading [ has already been eaten by the lexical
908 analyzer. */
909struct dfa_ctype
910{
911 const char *name;
912 predicate *func;
913 bool single_byte_only;
914};
915
916static const struct dfa_ctype prednames[] = {
917 {"alpha", isalpha, false},
918 {"upper", isupper, false},
919 {"lower", islower, false},
920 {"digit", isdigit, true},
921 {"xdigit", isxdigit, false},
922 {"space", isspace, false},
923 {"punct", ispunct, false},
924 {"alnum", isalnum, false},
925 {"print", isprint, false},
926 {"graph", isgraph, false},
927 {"cntrl", iscntrl, false},
928 {"blank", isblank, false},
929 {NULL, NULL, false}
930};
931
932static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
933find_pred (const char *str)
934{
935 for (int i = 0; prednames[i].name; i++)
936 if (streq (str, prednames[i].name))
937 return &prednames[i];
938 return NULL;
939}
940
941/* Parse a bracket expression, which possibly includes multibyte
942 characters. */
943static token
944parse_bracket_exp (struct dfa *dfa)
945{
946 /* This is a bracket expression that dfaexec is known to
947 process correctly. */
948 bool known_bracket_exp = true;
949
950 /* Used to warn about [:space:].
951 Bit 0 = first character is a colon.
952 Bit 1 = last character is a colon.
953 Bit 2 = includes any other character but a colon.
954 Bit 3 = includes ranges, char/equiv classes or collation elements. */
955 int colon_warning_state;
956
957 dfa->lex.brack.nchars = 0;
958 charclass ccl;
959 zeroset (&ccl);
960 int c = bracket_fetch_wc (dfa);
961 bool invert = c == '^';
962 if (invert)
963 {
964 c = bracket_fetch_wc (dfa);
965 known_bracket_exp = dfa->localeinfo.simple;
966 }
967 wint_t wc = dfa->lex.wctok;
968 int c1;
969 wint_t wc1;
970 colon_warning_state = (c == ':');
971 do
972 {
973 c1 = NOTCHAR; /* Mark c1 as not initialized. */
974 colon_warning_state &= ~2;
975
976 /* Note that if we're looking at some other [:...:] construct,
977 we just treat it as a bunch of ordinary characters. We can do
978 this because we assume regex has checked for syntax errors before
979 dfa is ever called. */
980 if (c == '[')
981 {
982 c1 = bracket_fetch_wc (dfa);
983 wc1 = dfa->lex.wctok;
984
985 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
986 || c1 == '.' || c1 == '=')
987 {
988 enum { MAX_BRACKET_STRING_LEN = 32 };
989 char str[MAX_BRACKET_STRING_LEN + 1];
990 int len = 0;
991 for (;;)
992 {
993 c = bracket_fetch_wc (dfa);
994 if (dfa->lex.left == 0
995 || (c == c1 && dfa->lex.ptr[0] == ']'))
996 break;
997 if (len < MAX_BRACKET_STRING_LEN)
998 str[len++] = c;
999 else
1000 /* This is in any case an invalid class name. */
1001 str[0] = '\0';
1002 }
1003 str[len] = '\0';
1004
1005 /* Fetch bracket. */
1006 c = bracket_fetch_wc (dfa);
1007 wc = dfa->lex.wctok;
1008 if (c1 == ':')
1009 /* Build character class. POSIX allows character
1010 classes to match multicharacter collating elements,
1011 but the regex code does not support that, so do not
1012 worry about that possibility. */
1013 {
1014 char const *class
1015 = (dfa->syntax.case_fold && (streq (str, "upper")
1016 || streq (str, "lower"))
1017 ? "alpha" : str);
1018 const struct dfa_ctype *pred = find_pred (class);
1019 if (!pred)
1020 dfaerror (_("invalid character class"));
1021
1022 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1023 known_bracket_exp = false;
1024 else
1025 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1026 if (pred->func (c2))
1027 setbit (c2, &ccl);
1028 }
1029 else
1030 known_bracket_exp = false;
1031
1032 colon_warning_state |= 8;
1033
1034 /* Fetch new lookahead character. */
1035 c1 = bracket_fetch_wc (dfa);
1036 wc1 = dfa->lex.wctok;
1037 continue;
1038 }
1039
1040 /* We treat '[' as a normal character here. c/c1/wc/wc1
1041 are already set up. */
1042 }
1043
1044 if (c == '\\'
1045 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1046 {
1047 c = bracket_fetch_wc (dfa);
1048 wc = dfa->lex.wctok;
1049 }
1050
1051 if (c1 == NOTCHAR)
1052 {
1053 c1 = bracket_fetch_wc (dfa);
1054 wc1 = dfa->lex.wctok;
1055 }
1056
1057 if (c1 == '-')
1058 /* build range characters. */
1059 {
1060 int c2 = bracket_fetch_wc (dfa);
1061 wint_t wc2 = dfa->lex.wctok;
1062
1063 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1064 Treat it like [-a[.aa.]] while parsing it, and
1065 remember that the set is unknown. */
1066 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1067 {
1068 known_bracket_exp = false;
1069 c2 = ']';
1070 }
1071
1072 if (c2 == ']')
1073 {
1074 /* In the case [x-], the - is an ordinary hyphen,
1075 which is left in c1, the lookahead character. */
1076 dfa->lex.ptr--;
1077 dfa->lex.left++;
1078 }
1079 else
1080 {
1081 if (c2 == '\\' && (dfa->syntax.syntax_bits
1082 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1083 {
1084 c2 = bracket_fetch_wc (dfa);
1085 wc2 = dfa->lex.wctok;
1086 }
1087
1088 colon_warning_state |= 8;
1089 c1 = bracket_fetch_wc (dfa);
1090 wc1 = dfa->lex.wctok;
1091
1092 /* Treat [x-y] as a range if x != y. */
1093 if (wc != wc2 || wc == WEOF)
1094 {
1095 if (dfa->localeinfo.simple
1096 || (isasciidigit (c) & isasciidigit (c2)))
1097 {
1098 for (int ci = c; ci <= c2; ci++)
1099 if (dfa->syntax.case_fold && isalpha (ci))
1100 setbit_case_fold_c (ci, &ccl);
1101 else
1102 setbit (ci, &ccl);
1103 }
1104 else
1105 known_bracket_exp = false;
1106
1107 continue;
1108 }
1109 }
1110 }
1111
1112 colon_warning_state |= (c == ':') ? 2 : 4;
1113
1114 if (!dfa->localeinfo.multibyte)
1115 {
1116 if (dfa->syntax.case_fold && isalpha (c))
1117 setbit_case_fold_c (c, &ccl);
1118 else
1119 setbit (c, &ccl);
1120 continue;
1121 }
1122
1123 if (wc == WEOF)
1124 known_bracket_exp = false;
1125 else
1126 {
1127 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1128 int n = (dfa->syntax.case_fold
1129 ? case_folded_counterparts (wc, folded + 1) + 1
1130 : 1);
1131 folded[0] = wc;
1132 for (int i = 0; i < n; i++)
1133 if (!setbit_wc (folded[i], &ccl))
1134 {
1135 dfa->lex.brack.chars
1136 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1137 &dfa->lex.brack.nchars_alloc, -1,
1138 sizeof *dfa->lex.brack.chars);
1139 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1140 }
1141 }
1142 }
1143 while ((wc = wc1, (c = c1) != ']'));
1144
1145 if (colon_warning_state == 7)
1146 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1147
1148 if (! known_bracket_exp)
1149 return BACKREF;
1150
1151 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1152 {
1153 dfa->lex.brack.invert = invert;
1154 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1155 return MBCSET;
1156 }
1157
1158 if (invert)
1159 {
1160 notset (&ccl);
1161 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1162 clrbit ('\n', &ccl);
1163 }
1164
1165 return CSET + charclass_index (dfa, &ccl);
1166}
1167
1168struct lexptr
1169{
1170 char const *ptr;
1171 idx_t left;
1172};
1173
1174static void
1175push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1176{
1177 ls->ptr = dfa->lex.ptr;
1178 ls->left = dfa->lex.left;
1179 dfa->lex.ptr = s;
1180 dfa->lex.left = strlen (s);
1181}
1182
1183static void
1184pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1185{
1186 dfa->lex.ptr = ls->ptr;
1187 dfa->lex.left = ls->left;
1188}
1189
1190static token
1191lex (struct dfa *dfa)
1192{
1193 bool backslash = false;
1194
1195 /* Basic plan: We fetch a character. If it's a backslash,
1196 we set the backslash flag and go through the loop again.
1197 On the plus side, this avoids having a duplicate of the
1198 main switch inside the backslash case. On the minus side,
1199 it means that just about every case begins with
1200 "if (backslash) ...". */
1201 for (int i = 0; i < 2; ++i)
1202 {
1203 if (! dfa->lex.left)
1204 return dfa->lex.lasttok = END;
1205 int c = fetch_wc (dfa);
1206
1207 switch (c)
1208 {
1209 case '\\':
1210 if (backslash)
1211 goto normal_char;
1212 if (dfa->lex.left == 0)
1213 dfaerror (_("unfinished \\ escape"));
1214 backslash = true;
1215 break;
1216
1217 case '^':
1218 if (backslash)
1219 goto normal_char;
1220 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1221 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1222 || dfa->lex.lasttok == OR)
1223 return dfa->lex.lasttok = BEGLINE;
1224 goto normal_char;
1225
1226 case '$':
1227 if (backslash)
1228 goto normal_char;
1229 /* kmk: cl v19.29.30139/amd64 messes this function up when optimizing
1230 for speed, workaround is to optimize it for size instead. The
1231 symptom is that the following SED expression fail to match:
1232 s/^[0-9a-fA-F]\{1,\} \(00[0-9a-fA-F]*\) ABS *notype *External *| \([^.]\{1,\}\)\.\(.*$\)/ 1=\1 2=\2 3=\3/
1233
1234 Seems the exact problem is that it gets the indexing here wrong:
1235 dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS) & (dfa->lex.ptr[0] == '\\')]
1236 It forgets to do the ` dfa->lex.ptr[0] == '\\' ` part and instead
1237 ANDs with a register initialized to zero. Rewriting the
1238 expressions using the tinary operator works around the problem,
1239 although the resulting code is a lot bulkier.
1240 */
1241 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1242 || dfa->lex.left == 0
1243#ifdef _MSC_VER /* see above */
1244 || (!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1245 ? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == ')'
1246 : dfa->lex.left > 0 && dfa->lex.ptr[0] == ')')
1247#else
1248 || ((dfa->lex.left
1249 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1250 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1251 & (dfa->lex.ptr[0] == '\\')]
1252 == ')'))
1253#endif
1254#ifdef _MSC_VER /* see above */
1255 || (!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1256 ? dfa->lex.left > 1 && dfa->lex.ptr[dfa->lex.ptr[0] == '\\'] == '|'
1257 : dfa->lex.left > 0 && dfa->lex.ptr[0] == '|')
1258#else
1259 || ((dfa->lex.left
1260 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1261 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1262 & (dfa->lex.ptr[0] == '\\')]
1263 == '|'))
1264#endif
1265 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1266 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1267 return dfa->lex.lasttok = ENDLINE;
1268 goto normal_char;
1269
1270 case '1':
1271 case '2':
1272 case '3':
1273 case '4':
1274 case '5':
1275 case '6':
1276 case '7':
1277 case '8':
1278 case '9':
1279 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1280 {
1281 dfa->lex.laststart = false;
1282 return dfa->lex.lasttok = BACKREF;
1283 }
1284 goto normal_char;
1285
1286 case '`':
1287 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1288 {
1289 /* FIXME: should be beginning of string */
1290 return dfa->lex.lasttok = BEGLINE;
1291 }
1292 goto normal_char;
1293
1294 case '\'':
1295 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1296 {
1297 /* FIXME: should be end of string */
1298 return dfa->lex.lasttok = ENDLINE;
1299 }
1300 goto normal_char;
1301
1302 case '<':
1303 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1304 return dfa->lex.lasttok = BEGWORD;
1305 goto normal_char;
1306
1307 case '>':
1308 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1309 return dfa->lex.lasttok = ENDWORD;
1310 goto normal_char;
1311
1312 case 'b':
1313 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1314 return dfa->lex.lasttok = LIMWORD;
1315 goto normal_char;
1316
1317 case 'B':
1318 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1319 return dfa->lex.lasttok = NOTLIMWORD;
1320 goto normal_char;
1321
1322 case '?':
1323 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1324 goto normal_char;
1325 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1326 goto normal_char;
1327 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1328 && dfa->lex.laststart)
1329 goto normal_char;
1330 return dfa->lex.lasttok = QMARK;
1331
1332 case '*':
1333 if (backslash)
1334 goto normal_char;
1335 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1336 && dfa->lex.laststart)
1337 goto normal_char;
1338 return dfa->lex.lasttok = STAR;
1339
1340 case '+':
1341 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1342 goto normal_char;
1343 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1344 goto normal_char;
1345 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1346 && dfa->lex.laststart)
1347 goto normal_char;
1348 return dfa->lex.lasttok = PLUS;
1349
1350 case '{':
1351 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1352 goto normal_char;
1353 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1354 goto normal_char;
1355 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1356 && dfa->lex.laststart)
1357 goto normal_char;
1358
1359 /* Cases:
1360 {M} - exact count
1361 {M,} - minimum count, maximum is infinity
1362 {,N} - 0 through N
1363 {,} - 0 to infinity (same as '*')
1364 {M,N} - M through N */
1365 {
1366 char const *p = dfa->lex.ptr;
1367 char const *lim = p + dfa->lex.left;
1368 dfa->lex.minrep = dfa->lex.maxrep = -1;
1369 for (; p != lim && isasciidigit (*p); p++)
1370 dfa->lex.minrep = (dfa->lex.minrep < 0
1371 ? *p - '0'
1372 : MIN (RE_DUP_MAX + 1,
1373 dfa->lex.minrep * 10 + *p - '0'));
1374 if (p != lim)
1375 {
1376 if (*p != ',')
1377 dfa->lex.maxrep = dfa->lex.minrep;
1378 else
1379 {
1380 if (dfa->lex.minrep < 0)
1381 dfa->lex.minrep = 0;
1382 while (++p != lim && isasciidigit (*p))
1383 dfa->lex.maxrep
1384 = (dfa->lex.maxrep < 0
1385 ? *p - '0'
1386 : MIN (RE_DUP_MAX + 1,
1387 dfa->lex.maxrep * 10 + *p - '0'));
1388 }
1389 }
1390 if (! ((! backslash || (p != lim && *p++ == '\\'))
1391 && p != lim && *p++ == '}'
1392 && 0 <= dfa->lex.minrep
1393 && (dfa->lex.maxrep < 0
1394 || dfa->lex.minrep <= dfa->lex.maxrep)))
1395 {
1396 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1397 goto normal_char;
1398 dfaerror (_("invalid content of \\{\\}"));
1399 }
1400 if (RE_DUP_MAX < dfa->lex.maxrep)
1401 dfaerror (_("regular expression too big"));
1402 dfa->lex.ptr = p;
1403 dfa->lex.left = lim - p;
1404 }
1405 dfa->lex.laststart = false;
1406 return dfa->lex.lasttok = REPMN;
1407
1408 case '|':
1409 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1410 goto normal_char;
1411 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1412 goto normal_char;
1413 dfa->lex.laststart = true;
1414 return dfa->lex.lasttok = OR;
1415
1416 case '\n':
1417 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1418 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1419 goto normal_char;
1420 dfa->lex.laststart = true;
1421 return dfa->lex.lasttok = OR;
1422
1423 case '(':
1424 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1425 goto normal_char;
1426 dfa->lex.parens++;
1427 dfa->lex.laststart = true;
1428 return dfa->lex.lasttok = LPAREN;
1429
1430 case ')':
1431 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1432 goto normal_char;
1433 if (dfa->lex.parens == 0
1434 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1435 goto normal_char;
1436 dfa->lex.parens--;
1437 dfa->lex.laststart = false;
1438 return dfa->lex.lasttok = RPAREN;
1439
1440 case '.':
1441 if (backslash)
1442 goto normal_char;
1443 if (dfa->canychar < 0)
1444 {
1445 charclass ccl;
1446 fillset (&ccl);
1447 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1448 clrbit ('\n', &ccl);
1449 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1450 clrbit ('\0', &ccl);
1451 if (dfa->localeinfo.multibyte)
1452 for (int c2 = 0; c2 < NOTCHAR; c2++)
1453 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1454 clrbit (c2, &ccl);
1455 dfa->canychar = charclass_index (dfa, &ccl);
1456 }
1457 dfa->lex.laststart = false;
1458 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1459 ? ANYCHAR
1460 : CSET + dfa->canychar);
1461
1462 case 's':
1463 case 'S':
1464 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1465 goto normal_char;
1466 if (!dfa->localeinfo.multibyte)
1467 {
1468 charclass ccl;
1469 zeroset (&ccl);
1470 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1471 if (isspace (c2))
1472 setbit (c2, &ccl);
1473 if (c == 'S')
1474 notset (&ccl);
1475 dfa->lex.laststart = false;
1476 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1477 }
1478
1479 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1480 add_utf8_anychar, makes sense. */
1481
1482 /* \s and \S are documented to be equivalent to [[:space:]] and
1483 [^[:space:]] respectively, so tell the lexer to process those
1484 strings, each minus its "already processed" '['. */
1485 {
1486 struct lexptr ls;
1487 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1488 dfa->lex.lasttok = parse_bracket_exp (dfa);
1489 pop_lex_state (dfa, &ls);
1490 }
1491
1492 dfa->lex.laststart = false;
1493 return dfa->lex.lasttok;
1494
1495 case 'w':
1496 case 'W':
1497 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1498 goto normal_char;
1499
1500 if (!dfa->localeinfo.multibyte)
1501 {
1502 charclass ccl;
1503 zeroset (&ccl);
1504 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1505 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1506 setbit (c2, &ccl);
1507 if (c == 'W')
1508 notset (&ccl);
1509 dfa->lex.laststart = false;
1510 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1511 }
1512
1513 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1514 add_utf8_anychar, makes sense. */
1515
1516 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1517 [^_[:alnum:]] respectively, so tell the lexer to process those
1518 strings, each minus its "already processed" '['. */
1519 {
1520 struct lexptr ls;
1521 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1522 dfa->lex.lasttok = parse_bracket_exp (dfa);
1523 pop_lex_state (dfa, &ls);
1524 }
1525
1526 dfa->lex.laststart = false;
1527 return dfa->lex.lasttok;
1528
1529 case '[':
1530 if (backslash)
1531 goto normal_char;
1532 dfa->lex.laststart = false;
1533 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1534
1535 default:
1536 normal_char:
1537 dfa->lex.laststart = false;
1538 /* For multibyte character sets, folding is done in atom. Always
1539 return WCHAR. */
1540 if (dfa->localeinfo.multibyte)
1541 return dfa->lex.lasttok = WCHAR;
1542
1543 if (dfa->syntax.case_fold && isalpha (c))
1544 {
1545 charclass ccl;
1546 zeroset (&ccl);
1547 setbit_case_fold_c (c, &ccl);
1548 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1549 }
1550
1551 return dfa->lex.lasttok = c;
1552 }
1553 }
1554
1555 /* The above loop should consume at most a backslash
1556 and some other character. */
1557 abort ();
1558 return END; /* keeps pedantic compilers happy. */
1559}
1560
1561static void
1562addtok_mb (struct dfa *dfa, token t, char mbprop)
1563{
1564 if (dfa->talloc == dfa->tindex)
1565 {
1566 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1567 sizeof *dfa->tokens);
1568 if (dfa->localeinfo.multibyte)
1569 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1570 sizeof *dfa->multibyte_prop);
1571 }
1572 if (dfa->localeinfo.multibyte)
1573 dfa->multibyte_prop[dfa->tindex] = mbprop;
1574 dfa->tokens[dfa->tindex++] = t;
1575
1576 switch (t)
1577 {
1578 case QMARK:
1579 case STAR:
1580 case PLUS:
1581 break;
1582
1583 case CAT:
1584 case OR:
1585 dfa->parse.depth--;
1586 break;
1587
1588 case EMPTY:
1589 dfa->epsilon = true;
1590 goto increment_depth;
1591
1592 case BACKREF:
1593 dfa->fast = false;
1594 goto increment_nleaves;
1595
1596 case BEGLINE:
1597 case ENDLINE:
1598 case BEGWORD:
1599 case ENDWORD:
1600 case LIMWORD:
1601 case NOTLIMWORD:
1602 dfa->epsilon = true;
1603 FALLTHROUGH;
1604 default:
1605 increment_nleaves:
1606 dfa->nleaves++;
1607 increment_depth:
1608 dfa->parse.depth++;
1609 if (dfa->depth < dfa->parse.depth)
1610 dfa->depth = dfa->parse.depth;
1611 break;
1612 }
1613}
1614
1615static void addtok_wc (struct dfa *dfa, wint_t wc);
1616
1617/* Add the given token to the parse tree, maintaining the depth count and
1618 updating the maximum depth if necessary. */
1619static void
1620addtok (struct dfa *dfa, token t)
1621{
1622 if (dfa->localeinfo.multibyte && t == MBCSET)
1623 {
1624 bool need_or = false;
1625
1626 /* Extract wide characters into alternations for better performance.
1627 This does not require UTF-8. */
1628 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1629 {
1630 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1631 if (need_or)
1632 addtok (dfa, OR);
1633 need_or = true;
1634 }
1635 dfa->lex.brack.nchars = 0;
1636
1637 /* Wide characters have been handled above, so it is possible
1638 that the set is empty now. Do nothing in that case. */
1639 if (dfa->lex.brack.cset != -1)
1640 {
1641 addtok (dfa, CSET + dfa->lex.brack.cset);
1642 if (need_or)
1643 addtok (dfa, OR);
1644 }
1645 }
1646 else
1647 {
1648 addtok_mb (dfa, t, 3);
1649 }
1650}
1651
1652/* We treat a multibyte character as a single atom, so that DFA
1653 can treat a multibyte character as a single expression.
1654
1655 e.g., we construct the following tree from "<mb1><mb2>".
1656 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1657 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1658static void
1659addtok_wc (struct dfa *dfa, wint_t wc)
1660{
1661 unsigned char buf[MB_LEN_MAX];
1662 mbstate_t s = { 0 };
1663 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1664 int buflen;
1665
1666 if (stored_bytes != (size_t) -1)
1667 buflen = stored_bytes;
1668 else
1669 {
1670 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1671 the addtok_mb call altogether can corrupt the heap. */
1672 buflen = 1;
1673 buf[0] = 0;
1674 }
1675
1676 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1677 for (int i = 1; i < buflen; i++)
1678 {
1679 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1680 addtok (dfa, CAT);
1681 }
1682}
1683
1684static void
1685add_utf8_anychar (struct dfa *dfa)
1686{
1687 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1688 UTF-8 byte sequence has been defined as follows:
1689
1690 ([\x00-\x7f]
1691 |[\xc2-\xdf][\x80-\xbf]
1692 |[\xe0][\xa0-\xbf][\x80-\xbf]
1693 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1694 |[\xed][\x80-\x9f][\x80-\xbf]
1695 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1696 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1697 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1698
1699 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1700 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1701 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1702 H = [\x80-\x9f], I = [\xf0],
1703 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1704
1705 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1706
1707 /* Mnemonics for classes containing two or more bytes. */
1708 enum { A, B, C, E, F, H, J, K, M };
1709
1710 /* Mnemonics for single-byte tokens. */
1711 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1712
1713 static charclass const utf8_classes[] = {
1714 /* A. 00-7f: 1-byte sequence. */
1715 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1716
1717 /* B. c2-df: 1st byte of a 2-byte sequence. */
1718 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1719
1720 /* C. 80-bf: non-leading bytes. */
1721 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1722
1723 /* D. e0 (just a token). */
1724
1725 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1726 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1727
1728 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1729 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1730
1731 /* G. ed (just a token). */
1732
1733 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1734 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1735
1736 /* I. f0 (just a token). */
1737
1738 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1739 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1740
1741 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1742 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1743
1744 /* L. f4 (just a token). */
1745
1746 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1747 CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1748 };
1749
1750 /* Define the character classes that are needed below. */
1751 if (dfa->utf8_anychar_classes[0] == 0)
1752 {
1753 charclass c = utf8_classes[0];
1754 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1755 clrbit ('\n', &c);
1756 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1757 clrbit ('\0', &c);
1758 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1759
1760 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1761 dfa->utf8_anychar_classes[i]
1762 = CSET + charclass_index (dfa, &utf8_classes[i]);
1763 }
1764
1765 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1766 The token buffer is in reverse Polish order, so we get
1767 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1768 C CAT OR C CAT OR C CAT OR". */
1769 addtok (dfa, dfa->utf8_anychar_classes[A]);
1770 addtok (dfa, dfa->utf8_anychar_classes[B]);
1771 addtok (dfa, D_token);
1772 addtok (dfa, dfa->utf8_anychar_classes[E]);
1773 addtok (dfa, CAT);
1774 addtok (dfa, OR);
1775 addtok (dfa, G_token);
1776 addtok (dfa, dfa->utf8_anychar_classes[H]);
1777 addtok (dfa, CAT);
1778 addtok (dfa, OR);
1779 addtok (dfa, dfa->utf8_anychar_classes[F]);
1780 addtok (dfa, I_token);
1781 addtok (dfa, dfa->utf8_anychar_classes[J]);
1782 addtok (dfa, CAT);
1783 addtok (dfa, OR);
1784 addtok (dfa, L_token);
1785 addtok (dfa, dfa->utf8_anychar_classes[M]);
1786 addtok (dfa, CAT);
1787 addtok (dfa, OR);
1788 addtok (dfa, dfa->utf8_anychar_classes[K]);
1789 for (int i = 0; i < 3; i++)
1790 {
1791 addtok (dfa, dfa->utf8_anychar_classes[C]);
1792 addtok (dfa, CAT);
1793 addtok (dfa, OR);
1794 }
1795}
1796
1797/* The grammar understood by the parser is as follows.
1798
1799 regexp:
1800 regexp OR branch
1801 branch
1802
1803 branch:
1804 branch closure
1805 closure
1806
1807 closure:
1808 closure QMARK
1809 closure STAR
1810 closure PLUS
1811 closure REPMN
1812 atom
1813
1814 atom:
1815 <normal character>
1816 <multibyte character>
1817 ANYCHAR
1818 MBCSET
1819 CSET
1820 BACKREF
1821 BEGLINE
1822 ENDLINE
1823 BEGWORD
1824 ENDWORD
1825 LIMWORD
1826 NOTLIMWORD
1827 LPAREN regexp RPAREN
1828 <empty>
1829
1830 The parser builds a parse tree in postfix form in an array of tokens. */
1831
1832static void
1833atom (struct dfa *dfa)
1834{
1835 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1836 || dfa->parse.tok >= CSET
1837 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1838 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1839 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1840 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1841 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1842 {
1843 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1844 {
1845 /* For UTF-8 expand the period to a series of CSETs that define a
1846 valid UTF-8 character. This avoids using the slow multibyte
1847 path. I'm pretty sure it would be both profitable and correct to
1848 do it for any encoding; however, the optimization must be done
1849 manually as it is done above in add_utf8_anychar. So, let's
1850 start with UTF-8: it is the most used, and the structure of the
1851 encoding makes the correctness more obvious. */
1852 add_utf8_anychar (dfa);
1853 }
1854 else
1855 addtok (dfa, dfa->parse.tok);
1856 dfa->parse.tok = lex (dfa);
1857 }
1858 else if (dfa->parse.tok == WCHAR)
1859 {
1860 if (dfa->lex.wctok == WEOF)
1861 addtok (dfa, BACKREF);
1862 else
1863 {
1864 addtok_wc (dfa, dfa->lex.wctok);
1865
1866 if (dfa->syntax.case_fold)
1867 {
1868 wchar_t folded[CASE_FOLDED_BUFSIZE];
1869 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1870 for (int i = 0; i < n; i++)
1871 {
1872 addtok_wc (dfa, folded[i]);
1873 addtok (dfa, OR);
1874 }
1875 }
1876 }
1877
1878 dfa->parse.tok = lex (dfa);
1879 }
1880 else if (dfa->parse.tok == LPAREN)
1881 {
1882 dfa->parse.tok = lex (dfa);
1883 regexp (dfa);
1884 if (dfa->parse.tok != RPAREN)
1885 dfaerror (_("unbalanced ("));
1886 dfa->parse.tok = lex (dfa);
1887 }
1888 else
1889 addtok (dfa, EMPTY);
1890}
1891
1892/* Return the number of tokens in the given subexpression. */
1893static idx_t _GL_ATTRIBUTE_PURE
1894nsubtoks (struct dfa const *dfa, idx_t tindex)
1895{
1896 switch (dfa->tokens[tindex - 1])
1897 {
1898 default:
1899 return 1;
1900 case QMARK:
1901 case STAR:
1902 case PLUS:
1903 return 1 + nsubtoks (dfa, tindex - 1);
1904 case CAT:
1905 case OR:
1906 {
1907 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1908 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1909 }
1910 }
1911}
1912
1913/* Copy the given subexpression to the top of the tree. */
1914static void
1915copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1916{
1917 if (dfa->localeinfo.multibyte)
1918 for (idx_t i = 0; i < ntokens; i++)
1919 addtok_mb (dfa, dfa->tokens[tindex + i],
1920 dfa->multibyte_prop[tindex + i]);
1921 else
1922 for (idx_t i = 0; i < ntokens; i++)
1923 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1924}
1925
1926static void
1927closure (struct dfa *dfa)
1928{
1929 atom (dfa);
1930 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1931 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1932 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1933 {
1934 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1935 idx_t tindex = dfa->tindex - ntokens;
1936 if (dfa->lex.maxrep < 0)
1937 addtok (dfa, PLUS);
1938 if (dfa->lex.minrep == 0)
1939 addtok (dfa, QMARK);
1940 int i;
1941 for (i = 1; i < dfa->lex.minrep; i++)
1942 {
1943 copytoks (dfa, tindex, ntokens);
1944 addtok (dfa, CAT);
1945 }
1946 for (; i < dfa->lex.maxrep; i++)
1947 {
1948 copytoks (dfa, tindex, ntokens);
1949 addtok (dfa, QMARK);
1950 addtok (dfa, CAT);
1951 }
1952 dfa->parse.tok = lex (dfa);
1953 }
1954 else if (dfa->parse.tok == REPMN)
1955 {
1956 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1957 dfa->parse.tok = lex (dfa);
1958 closure (dfa);
1959 }
1960 else
1961 {
1962 addtok (dfa, dfa->parse.tok);
1963 dfa->parse.tok = lex (dfa);
1964 }
1965}
1966
1967static void
1968branch (struct dfa* dfa)
1969{
1970 closure (dfa);
1971 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1972 && dfa->parse.tok >= 0)
1973 {
1974 closure (dfa);
1975 addtok (dfa, CAT);
1976 }
1977}
1978
1979static void
1980regexp (struct dfa *dfa)
1981{
1982 branch (dfa);
1983 while (dfa->parse.tok == OR)
1984 {
1985 dfa->parse.tok = lex (dfa);
1986 branch (dfa);
1987 addtok (dfa, OR);
1988 }
1989}
1990
1991/* Parse a string S of length LEN into D. S can include NUL characters.
1992 This is the main entry point for the parser. */
1993void
1994dfaparse (char const *s, idx_t len, struct dfa *d)
1995{
1996 d->lex.ptr = s;
1997 d->lex.left = len;
1998 d->lex.lasttok = END;
1999 d->lex.laststart = true;
2000
2001 if (!d->syntax.syntax_bits_set)
2002 dfaerror (_("no syntax specified"));
2003
2004 if (!d->nregexps)
2005 addtok (d, BEG);
2006
2007 d->parse.tok = lex (d);
2008 d->parse.depth = d->depth;
2009
2010 regexp (d);
2011
2012 if (d->parse.tok != END)
2013 dfaerror (_("unbalanced )"));
2014
2015 addtok (d, END - d->nregexps);
2016 addtok (d, CAT);
2017
2018 if (d->nregexps)
2019 addtok (d, OR);
2020
2021 ++d->nregexps;
2022}
2023
2024/* Some primitives for operating on sets of positions. */
2025
2026/* Copy one set to another. */
2027static void
2028copy (position_set const *src, position_set *dst)
2029{
2030 if (dst->alloc < src->nelem)
2031 {
2032 free (dst->elems);
2033 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2034 sizeof *dst->elems);
2035 }
2036 dst->nelem = src->nelem;
2037 if (src->nelem != 0)
2038 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2039}
2040
2041static void
2042alloc_position_set (position_set *s, idx_t size)
2043{
2044 s->elems = xnmalloc (size, sizeof *s->elems);
2045 s->alloc = size;
2046 s->nelem = 0;
2047}
2048
2049/* Insert position P in set S. S is maintained in sorted order on
2050 decreasing index. If there is already an entry in S with P.index
2051 then merge (logically-OR) P's constraints into the one in S.
2052 S->elems must point to an array large enough to hold the resulting set. */
2053static void
2054insert (position p, position_set *s)
2055{
2056 idx_t count = s->nelem;
2057 idx_t lo = 0, hi = count;
2058 while (lo < hi)
2059 {
2060 idx_t mid = (lo + hi) >> 1;
2061 if (s->elems[mid].index < p.index)
2062 lo = mid + 1;
2063 else if (s->elems[mid].index == p.index)
2064 {
2065 s->elems[mid].constraint |= p.constraint;
2066 return;
2067 }
2068 else
2069 hi = mid;
2070 }
2071
2072 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2073 for (idx_t i = count; i > lo; i--)
2074 s->elems[i] = s->elems[i - 1];
2075 s->elems[lo] = p;
2076 ++s->nelem;
2077}
2078
2079static void
2080append (position p, position_set *s)
2081{
2082 idx_t count = s->nelem;
2083 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2084 s->elems[s->nelem++] = p;
2085}
2086
2087/* Merge S1 and S2 (with the additional constraint C2) into M. The
2088 result is as if the positions of S1, and of S2 with the additional
2089 constraint C2, were inserted into an initially empty set. */
2090static void
2091merge_constrained (position_set const *s1, position_set const *s2,
2092 unsigned int c2, position_set *m)
2093{
2094 idx_t i = 0, j = 0;
2095
2096 if (m->alloc - s1->nelem < s2->nelem)
2097 {
2098 free (m->elems);
2099 m->alloc = s1->nelem;
2100 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2101 }
2102 m->nelem = 0;
2103 while (i < s1->nelem || j < s2->nelem)
2104 if (! (j < s2->nelem)
2105 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2106 {
2107 unsigned int c = ((i < s1->nelem && j < s2->nelem
2108 && s1->elems[i].index == s2->elems[j].index)
2109 ? s2->elems[j++].constraint & c2
2110 : 0);
2111 m->elems[m->nelem].index = s1->elems[i].index;
2112 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2113 }
2114 else
2115 {
2116 if (s2->elems[j].constraint & c2)
2117 {
2118 m->elems[m->nelem].index = s2->elems[j].index;
2119 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2120 }
2121 j++;
2122 }
2123}
2124
2125/* Merge two sets of positions into a third. The result is exactly as if
2126 the positions of both sets were inserted into an initially empty set. */
2127static void
2128merge (position_set const *s1, position_set const *s2, position_set *m)
2129{
2130 merge_constrained (s1, s2, -1, m);
2131}
2132
2133/* Merge into DST all the elements of SRC, possibly destroying
2134 the contents of the temporary M. */
2135static void
2136merge2 (position_set *dst, position_set const *src, position_set *m)
2137{
2138 if (src->nelem < 4)
2139 {
2140 for (idx_t i = 0; i < src->nelem; i++)
2141 insert (src->elems[i], dst);
2142 }
2143 else
2144 {
2145 merge (src, dst, m);
2146 copy (m, dst);
2147 }
2148}
2149
2150/* Delete a position from a set. Return the nonzero constraint of the
2151 deleted position, or zero if there was no such position. */
2152static unsigned int
2153delete (idx_t del, position_set *s)
2154{
2155 idx_t count = s->nelem;
2156 idx_t lo = 0, hi = count;
2157 while (lo < hi)
2158 {
2159 idx_t mid = (lo + hi) >> 1;
2160 if (s->elems[mid].index < del)
2161 lo = mid + 1;
2162 else if (s->elems[mid].index == del)
2163 {
2164 unsigned int c = s->elems[mid].constraint;
2165 idx_t i;
2166 for (i = mid; i + 1 < count; i++)
2167 s->elems[i] = s->elems[i + 1];
2168 s->nelem = i;
2169 return c;
2170 }
2171 else
2172 hi = mid;
2173 }
2174 return 0;
2175}
2176
2177/* Replace a position with the followed set. */
2178static void
2179replace (position_set *dst, idx_t del, position_set *add,
2180 unsigned int constraint, position_set *tmp)
2181{
2182 unsigned int c = delete (del, dst) & constraint;
2183
2184 if (c)
2185 {
2186 copy (dst, tmp);
2187 merge_constrained (tmp, add, c, dst);
2188 }
2189}
2190
2191/* Find the index of the state corresponding to the given position set with
2192 the given preceding context, or create a new state if there is no such
2193 state. Context tells whether we got here on a newline or letter. */
2194static state_num
2195state_index (struct dfa *d, position_set const *s, int context)
2196{
2197 size_t hash = 0;
2198 int constraint = 0;
2199 state_num i;
2200
2201 for (i = 0; i < s->nelem; ++i)
2202 {
2203 idx_t ind = s->elems[i].index;
2204 hash ^= ind + s->elems[i].constraint;
2205 }
2206
2207 /* Try to find a state that exactly matches the proposed one. */
2208 for (i = 0; i < d->sindex; ++i)
2209 {
2210 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2211 || context != d->states[i].context)
2212 continue;
2213 state_num j;
2214 for (j = 0; j < s->nelem; ++j)
2215 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2216 || s->elems[j].index != d->states[i].elems.elems[j].index)
2217 break;
2218 if (j == s->nelem)
2219 return i;
2220 }
2221
2222#ifdef DEBUG
2223 fprintf (stderr, "new state %td\n nextpos:", i);
2224 for (state_num j = 0; j < s->nelem; j++)
2225 {
2226 fprintf (stderr, " %td:", s->elems[j].index);
2227 prtok (d->tokens[s->elems[j].index]);
2228 }
2229 fprintf (stderr, "\n context:");
2230 if (context ^ CTX_ANY)
2231 {
2232 if (context & CTX_NONE)
2233 fprintf (stderr, " CTX_NONE");
2234 if (context & CTX_LETTER)
2235 fprintf (stderr, " CTX_LETTER");
2236 if (context & CTX_NEWLINE)
2237 fprintf (stderr, " CTX_NEWLINE");
2238 }
2239 else
2240 fprintf (stderr, " CTX_ANY");
2241 fprintf (stderr, "\n");
2242#endif
2243
2244 for (state_num j = 0; j < s->nelem; j++)
2245 {
2246 int c = d->constraints[s->elems[j].index];
2247
2248 if (c != 0)
2249 {
2250 if (succeeds_in_context (c, context, CTX_ANY))
2251 constraint |= c;
2252 }
2253 else if (d->tokens[s->elems[j].index] == BACKREF)
2254 constraint = NO_CONSTRAINT;
2255 }
2256
2257
2258 /* Create a new state. */
2259 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2260 sizeof *d->states);
2261 d->states[i].hash = hash;
2262 alloc_position_set (&d->states[i].elems, s->nelem);
2263 copy (s, &d->states[i].elems);
2264 d->states[i].context = context;
2265 d->states[i].constraint = constraint;
2266 d->states[i].mbps.nelem = 0;
2267 d->states[i].mbps.elems = NULL;
2268 d->states[i].mb_trindex = -1;
2269
2270 ++d->sindex;
2271
2272 return i;
2273}
2274
2275/* Find the epsilon closure of D's set of positions. If any position of the set
2276 contains a symbol that matches the empty string in some context, replace
2277 that position with the elements of its follow labeled with an appropriate
2278 constraint. Repeat exhaustively until no funny positions are left.
2279 S->elems must be large enough to hold the result. BACKWARD is D's
2280 backward set; use and update it too. */
2281static void
2282epsclosure (struct dfa const *d, position_set *backward)
2283{
2284 position_set tmp;
2285 alloc_position_set (&tmp, d->nleaves);
2286 for (idx_t i = 0; i < d->tindex; i++)
2287 if (0 < d->follows[i].nelem)
2288 {
2289 unsigned int constraint;
2290 switch (d->tokens[i])
2291 {
2292 default:
2293 continue;
2294
2295 case BEGLINE:
2296 constraint = BEGLINE_CONSTRAINT;
2297 break;
2298 case ENDLINE:
2299 constraint = ENDLINE_CONSTRAINT;
2300 break;
2301 case BEGWORD:
2302 constraint = BEGWORD_CONSTRAINT;
2303 break;
2304 case ENDWORD:
2305 constraint = ENDWORD_CONSTRAINT;
2306 break;
2307 case LIMWORD:
2308 constraint = LIMWORD_CONSTRAINT;
2309 break;
2310 case NOTLIMWORD:
2311 constraint = NOTLIMWORD_CONSTRAINT;
2312 break;
2313 case EMPTY:
2314 constraint = NO_CONSTRAINT;
2315 break;
2316 }
2317
2318 delete (i, &d->follows[i]);
2319
2320 for (idx_t j = 0; j < backward[i].nelem; j++)
2321 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2322 constraint, &tmp);
2323 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2324 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2325 NO_CONSTRAINT, &tmp);
2326 }
2327 free (tmp.elems);
2328}
2329
2330/* Returns the set of contexts for which there is at least one
2331 character included in C. */
2332
2333static int
2334charclass_context (struct dfa const *dfa, charclass const *c)
2335{
2336 int context = 0;
2337
2338 for (int j = 0; j < CHARCLASS_WORDS; j++)
2339 {
2340 if (c->w[j] & dfa->syntax.newline.w[j])
2341 context |= CTX_NEWLINE;
2342 if (c->w[j] & dfa->syntax.letters.w[j])
2343 context |= CTX_LETTER;
2344 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2345 context |= CTX_NONE;
2346 }
2347
2348 return context;
2349}
2350
2351/* Returns the contexts on which the position set S depends. Each context
2352 in the set of returned contexts (let's call it SC) may have a different
2353 follow set than other contexts in SC, and also different from the
2354 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2355 in the complement set will have the same follow set. */
2356
2357static int _GL_ATTRIBUTE_PURE
2358state_separate_contexts (struct dfa *d, position_set const *s)
2359{
2360 int separate_contexts = 0;
2361
2362 for (idx_t j = 0; j < s->nelem; j++)
2363 separate_contexts |= d->separates[s->elems[j].index];
2364
2365 return separate_contexts;
2366}
2367
2368enum
2369{
2370 /* Single token is repeated. It is distinguished from non-repeated. */
2371 OPT_REPEAT = (1 << 0),
2372
2373 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2374 node is not merged. */
2375 OPT_LPAREN = (1 << 1),
2376
2377 /* Multiple branches are joined. The node is not merged. */
2378 OPT_RPAREN = (1 << 2),
2379
2380 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2381 flag is turned on. */
2382 OPT_WALKED = (1 << 3),
2383
2384 /* The node is queued. The node is not queued again. */
2385 OPT_QUEUED = (1 << 4)
2386};
2387
2388static void
2389merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2390 position_set *merged)
2391{
2392 position_set *follows = d->follows;
2393 idx_t nelem = 0;
2394
2395 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2396 {
2397 idx_t sindex = follows[tindex].elems[i].index;
2398
2399 /* Skip the node as pruned in future. */
2400 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2401 if (iconstraint == 0)
2402 continue;
2403
2404 if (d->tokens[follows[tindex].elems[i].index] <= END)
2405 {
2406 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2407 continue;
2408 }
2409
2410 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2411 {
2412 idx_t j;
2413
2414 for (j = 0; j < nelem; j++)
2415 {
2416 idx_t dindex = follows[tindex].elems[j].index;
2417
2418 if (dindex == tindex)
2419 continue;
2420
2421 if (follows[tindex].elems[j].constraint != iconstraint)
2422 continue;
2423
2424 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2425 continue;
2426
2427 if (d->tokens[sindex] != d->tokens[dindex])
2428 continue;
2429
2430 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2431 continue;
2432
2433 if (flags[sindex] & OPT_REPEAT)
2434 delete (sindex, &follows[sindex]);
2435
2436 merge2 (&follows[dindex], &follows[sindex], merged);
2437
2438 break;
2439 }
2440
2441 if (j < nelem)
2442 continue;
2443 }
2444
2445 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2446 flags[sindex] |= OPT_QUEUED;
2447 }
2448
2449 follows[tindex].nelem = nelem;
2450}
2451
2452static int
2453compare (const void *a, const void *b)
2454{
2455 position const *p = a, *q = b;
2456 return (p->index > q->index) - (p->index < q->index);
2457}
2458
2459static void
2460reorder_tokens (struct dfa *d)
2461{
2462 idx_t nleaves = 0;
2463 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2464 map[0] = nleaves++;
2465 for (idx_t i = 1; i < d->tindex; i++)
2466 map[i] = -1;
2467
2468 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2469 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2470 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2471 char *multibyte_prop = (d->localeinfo.multibyte
2472 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2473 : NULL);
2474
2475 for (idx_t i = 0; i < d->tindex; i++)
2476 {
2477 if (map[i] < 0)
2478 {
2479 free (d->follows[i].elems);
2480 d->follows[i].elems = NULL;
2481 d->follows[i].nelem = 0;
2482 continue;
2483 }
2484
2485 tokens[map[i]] = d->tokens[i];
2486 follows[map[i]] = d->follows[i];
2487 constraints[map[i]] = d->constraints[i];
2488
2489 if (multibyte_prop != NULL)
2490 multibyte_prop[map[i]] = d->multibyte_prop[i];
2491
2492 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2493 {
2494 if (map[d->follows[i].elems[j].index] == -1)
2495 map[d->follows[i].elems[j].index] = nleaves++;
2496
2497 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2498 }
2499
2500 qsort (d->follows[i].elems, d->follows[i].nelem,
2501 sizeof *d->follows[i].elems, compare);
2502 }
2503
2504 for (idx_t i = 0; i < nleaves; i++)
2505 {
2506 d->tokens[i] = tokens[i];
2507 d->follows[i] = follows[i];
2508 d->constraints[i] = constraints[i];
2509
2510 if (multibyte_prop != NULL)
2511 d->multibyte_prop[i] = multibyte_prop[i];
2512 }
2513
2514 d->tindex = d->nleaves = nleaves;
2515
2516 free (tokens);
2517 free (follows);
2518 free (constraints);
2519 free (multibyte_prop);
2520 free (map);
2521}
2522
2523static void
2524dfaoptimize (struct dfa *d)
2525{
2526 char *flags = xizalloc (d->tindex);
2527
2528 for (idx_t i = 0; i < d->tindex; i++)
2529 {
2530 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2531 {
2532 if (d->follows[i].elems[j].index == i)
2533 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2534 else if (d->follows[i].elems[j].index < i)
2535 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2536 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2537 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2538 else
2539 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2540 }
2541 }
2542
2543 flags[0] |= OPT_QUEUED;
2544
2545 position_set merged0;
2546 position_set *merged = &merged0;
2547 alloc_position_set (merged, d->nleaves);
2548
2549 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2550
2551 for (idx_t i = 0; i < d->tindex; i++)
2552 if (flags[i] & OPT_QUEUED)
2553 merge_nfa_state (d, i, flags, merged);
2554
2555 reorder_tokens (d);
2556
2557 free (merged->elems);
2558 free (flags);
2559}
2560
2561/* Perform bottom-up analysis on the parse tree, computing various functions.
2562 Note that at this point, we're pretending constructs like \< are real
2563 characters rather than constraints on what can follow them.
2564
2565 Nullable: A node is nullable if it is at the root of a regexp that can
2566 match the empty string.
2567 * EMPTY leaves are nullable.
2568 * No other leaf is nullable.
2569 * A QMARK or STAR node is nullable.
2570 * A PLUS node is nullable if its argument is nullable.
2571 * A CAT node is nullable if both its arguments are nullable.
2572 * An OR node is nullable if either argument is nullable.
2573
2574 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2575 that could correspond to the first character of a string matching the
2576 regexp rooted at the given node.
2577 * EMPTY leaves have empty firstpos.
2578 * The firstpos of a nonempty leaf is that leaf itself.
2579 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2580 argument.
2581 * The firstpos of a CAT node is the firstpos of the left argument, union
2582 the firstpos of the right if the left argument is nullable.
2583 * The firstpos of an OR node is the union of firstpos of each argument.
2584
2585 Lastpos: The lastpos of a node is the set of positions that could
2586 correspond to the last character of a string matching the regexp at
2587 the given node.
2588 * EMPTY leaves have empty lastpos.
2589 * The lastpos of a nonempty leaf is that leaf itself.
2590 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2591 argument.
2592 * The lastpos of a CAT node is the lastpos of its right argument, union
2593 the lastpos of the left if the right argument is nullable.
2594 * The lastpos of an OR node is the union of the lastpos of each argument.
2595
2596 Follow: The follow of a position is the set of positions that could
2597 correspond to the character following a character matching the node in
2598 a string matching the regexp. At this point we consider special symbols
2599 that match the empty string in some context to be just normal characters.
2600 Later, if we find that a special symbol is in a follow set, we will
2601 replace it with the elements of its follow, labeled with an appropriate
2602 constraint.
2603 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2604 the follow of every node in the lastpos.
2605 * Every node in the firstpos of the second argument of a CAT node is in
2606 the follow of every node in the lastpos of the first argument.
2607
2608 Because of the postfix representation of the parse tree, the depth-first
2609 analysis is conveniently done by a linear scan with the aid of a stack.
2610 Sets are stored as arrays of the elements, obeying a stack-like allocation
2611 scheme; the number of elements in each set deeper in the stack can be
2612 used to determine the address of a particular set's array. */
2613static void
2614dfaanalyze (struct dfa *d, bool searchflag)
2615{
2616 /* Array allocated to hold position sets. */
2617 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2618 /* Firstpos and lastpos elements. */
2619 position *firstpos = posalloc;
2620 position *lastpos = firstpos + d->nleaves;
2621 position pos;
2622 position_set tmp;
2623
2624 /* Stack for element counts and nullable flags. */
2625 struct
2626 {
2627 /* Whether the entry is nullable. */
2628 bool nullable;
2629
2630 /* Counts of firstpos and lastpos sets. */
2631 idx_t nfirstpos;
2632 idx_t nlastpos;
2633 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2634
2635 position_set merged; /* Result of merging sets. */
2636
2637 addtok (d, CAT);
2638 idx_t tindex = d->tindex;
2639
2640#ifdef DEBUG
2641 fprintf (stderr, "dfaanalyze:\n");
2642 for (idx_t i = 0; i < tindex; i++)
2643 {
2644 fprintf (stderr, " %td:", i);
2645 prtok (d->tokens[i]);
2646 }
2647 putc ('\n', stderr);
2648#endif
2649
2650 d->searchflag = searchflag;
2651 alloc_position_set (&merged, d->nleaves);
2652 d->follows = xicalloc (tindex, sizeof *d->follows);
2653 position_set *backward
2654 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2655
2656 for (idx_t i = 0; i < tindex; i++)
2657 {
2658 switch (d->tokens[i])
2659 {
2660 case EMPTY:
2661 /* The empty set is nullable. */
2662 stk->nullable = true;
2663
2664 /* The firstpos and lastpos of the empty leaf are both empty. */
2665 stk->nfirstpos = stk->nlastpos = 0;
2666 stk++;
2667 break;
2668
2669 case STAR:
2670 case PLUS:
2671 /* Every element in the lastpos of the argument is in the backward
2672 set of every element in the firstpos. */
2673 if (d->epsilon)
2674 {
2675 tmp.elems = lastpos - stk[-1].nlastpos;
2676 tmp.nelem = stk[-1].nlastpos;
2677 for (position *p = firstpos - stk[-1].nfirstpos;
2678 p < firstpos; p++)
2679 merge2 (&backward[p->index], &tmp, &merged);
2680 }
2681
2682 /* Every element in the firstpos of the argument is in the follow
2683 of every element in the lastpos. */
2684 {
2685 tmp.elems = firstpos - stk[-1].nfirstpos;
2686 tmp.nelem = stk[-1].nfirstpos;
2687 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2688 merge2 (&d->follows[p->index], &tmp, &merged);
2689 }
2690 FALLTHROUGH;
2691 case QMARK:
2692 /* A QMARK or STAR node is automatically nullable. */
2693 if (d->tokens[i] != PLUS)
2694 stk[-1].nullable = true;
2695 break;
2696
2697 case CAT:
2698 /* Every element in the lastpos of the first argument is in
2699 the backward set of every element in the firstpos of the
2700 second argument. */
2701 if (backward)
2702 {
2703 tmp.nelem = stk[-2].nlastpos;
2704 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2705 for (position *p = firstpos - stk[-1].nfirstpos;
2706 p < firstpos; p++)
2707 merge2 (&backward[p->index], &tmp, &merged);
2708 }
2709
2710 /* Every element in the firstpos of the second argument is in the
2711 follow of every element in the lastpos of the first argument. */
2712 {
2713 tmp.nelem = stk[-1].nfirstpos;
2714 tmp.elems = firstpos - stk[-1].nfirstpos;
2715 for (position *plim = lastpos - stk[-1].nlastpos,
2716 *p = plim - stk[-2].nlastpos;
2717 p < plim; p++)
2718 merge2 (&d->follows[p->index], &tmp, &merged);
2719 }
2720
2721 /* The firstpos of a CAT node is the firstpos of the first argument,
2722 union that of the second argument if the first is nullable. */
2723 if (stk[-2].nullable)
2724 stk[-2].nfirstpos += stk[-1].nfirstpos;
2725 else
2726 firstpos -= stk[-1].nfirstpos;
2727
2728 /* The lastpos of a CAT node is the lastpos of the second argument,
2729 union that of the first argument if the second is nullable. */
2730 if (stk[-1].nullable)
2731 stk[-2].nlastpos += stk[-1].nlastpos;
2732 else
2733 {
2734 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2735 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2736 p[j] = p[j + stk[-2].nlastpos];
2737 lastpos -= stk[-2].nlastpos;
2738 stk[-2].nlastpos = stk[-1].nlastpos;
2739 }
2740
2741 /* A CAT node is nullable if both arguments are nullable. */
2742 stk[-2].nullable &= stk[-1].nullable;
2743 stk--;
2744 break;
2745
2746 case OR:
2747 /* The firstpos is the union of the firstpos of each argument. */
2748 stk[-2].nfirstpos += stk[-1].nfirstpos;
2749
2750 /* The lastpos is the union of the lastpos of each argument. */
2751 stk[-2].nlastpos += stk[-1].nlastpos;
2752
2753 /* An OR node is nullable if either argument is nullable. */
2754 stk[-2].nullable |= stk[-1].nullable;
2755 stk--;
2756 break;
2757
2758 default:
2759 /* Anything else is a nonempty position. (Note that special
2760 constructs like \< are treated as nonempty strings here;
2761 an "epsilon closure" effectively makes them nullable later.
2762 Backreferences have to get a real position so we can detect
2763 transitions on them later. But they are nullable. */
2764 stk->nullable = d->tokens[i] == BACKREF;
2765
2766 /* This position is in its own firstpos and lastpos. */
2767 stk->nfirstpos = stk->nlastpos = 1;
2768 stk++;
2769
2770 firstpos->index = lastpos->index = i;
2771 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2772 firstpos++, lastpos++;
2773
2774 break;
2775 }
2776#ifdef DEBUG
2777 /* ... balance the above nonsyntactic #ifdef goo... */
2778 fprintf (stderr, "node %td:", i);
2779 prtok (d->tokens[i]);
2780 putc ('\n', stderr);
2781 fprintf (stderr,
2782 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2783 fprintf (stderr, " firstpos:");
2784 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2785 {
2786 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2787 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2788 }
2789 fprintf (stderr, "\n lastpos:");
2790 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2791 {
2792 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2793 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2794 }
2795 putc ('\n', stderr);
2796#endif
2797 }
2798
2799 if (backward)
2800 {
2801 /* For each follow set that is the follow set of a real position,
2802 replace it with its epsilon closure. */
2803 epsclosure (d, backward);
2804
2805 for (idx_t i = 0; i < tindex; i++)
2806 free (backward[i].elems);
2807 free (backward);
2808 }
2809
2810 dfaoptimize (d);
2811
2812#ifdef DEBUG
2813 for (idx_t i = 0; i < tindex; i++)
2814 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2815 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2816 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2817 {
2818 fprintf (stderr, "follows(%td:", i);
2819 prtok (d->tokens[i]);
2820 fprintf (stderr, "):");
2821 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2822 {
2823 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2824 prtok (d->tokens[d->follows[i].elems[j].index]);
2825 }
2826 putc ('\n', stderr);
2827 }
2828#endif
2829
2830 pos.index = 0;
2831 pos.constraint = NO_CONSTRAINT;
2832
2833 alloc_position_set (&tmp, 1);
2834
2835 append (pos, &tmp);
2836
2837 d->separates = xicalloc (tindex, sizeof *d->separates);
2838
2839 for (idx_t i = 0; i < tindex; i++)
2840 {
2841 if (prev_newline_dependent (d->constraints[i]))
2842 d->separates[i] |= CTX_NEWLINE;
2843 if (prev_letter_dependent (d->constraints[i]))
2844 d->separates[i] |= CTX_LETTER;
2845
2846 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2847 {
2848 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2849 d->separates[i] |= CTX_NEWLINE;
2850 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2851 d->separates[i] |= CTX_LETTER;
2852 }
2853 }
2854
2855 /* Context wanted by some position. */
2856 int separate_contexts = state_separate_contexts (d, &tmp);
2857
2858 /* Build the initial state. */
2859 if (separate_contexts & CTX_NEWLINE)
2860 state_index (d, &tmp, CTX_NEWLINE);
2861 d->initstate_notbol = d->min_trcount
2862 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2863 if (separate_contexts & CTX_LETTER)
2864 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2865 d->min_trcount++;
2866 d->trcount = 0;
2867
2868 free (posalloc);
2869 free (stkalloc);
2870 free (merged.elems);
2871 free (tmp.elems);
2872}
2873
2874/* Make sure D's state arrays are large enough to hold NEW_STATE. */
2875static void
2876realloc_trans_if_necessary (struct dfa *d)
2877{
2878 state_num oldalloc = d->tralloc;
2879 if (oldalloc < d->sindex)
2880 {
2881 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2882 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2883 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2884 -1, sizeof *realtrans);
2885 realtrans[0] = realtrans[1] = NULL;
2886 d->trans = realtrans + 2;
2887 idx_t newalloc = d->tralloc = newalloc1 - 2;
2888 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2889 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2890 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2891 if (d->localeinfo.multibyte)
2892 {
2893 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2894 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2895 if (oldalloc == 0)
2896 realtrans[0] = realtrans[1] = NULL;
2897 d->mb_trans = realtrans + 2;
2898 }
2899 for (; oldalloc < newalloc; oldalloc++)
2900 {
2901 d->trans[oldalloc] = NULL;
2902 d->fails[oldalloc] = NULL;
2903 if (d->localeinfo.multibyte)
2904 d->mb_trans[oldalloc] = NULL;
2905 }
2906 }
2907}
2908
2909/*
2910 Calculate the transition table for a new state derived from state s
2911 for a compiled dfa d after input character uc, and return the new
2912 state number.
2913
2914 Do not worry about all possible input characters; calculate just the group
2915 of positions that match uc. Label it with the set of characters that
2916 every position in the group matches (taking into account, if necessary,
2917 preceding context information of s). Then find the union
2918 of these positions' follows, i.e., the set of positions of the
2919 new state. For each character in the group's label, set the transition
2920 on this character to be to a state corresponding to the set's positions,
2921 and its associated backward context information, if necessary.
2922
2923 When building a searching matcher, include the positions of state
2924 0 in every state.
2925
2926 The group is constructed by building an equivalence-class
2927 partition of the positions of s.
2928
2929 For each position, find the set of characters C that it matches. Eliminate
2930 any characters from C that fail on grounds of backward context.
2931
2932 Check whether the group's label L has nonempty
2933 intersection with C. If L - C is nonempty, create a new group labeled
2934 L - C and having the same positions as the current group, and set L to
2935 the intersection of L and C. Insert the position in the group, set
2936 C = C - L, and resume scanning.
2937
2938 If after comparing with every group there are characters remaining in C,
2939 create a new group labeled with the characters of C and insert this
2940 position in that group. */
2941
2942static state_num
2943build_state (state_num s, struct dfa *d, unsigned char uc)
2944{
2945 position_set follows; /* Union of the follows for each
2946 position of the current state. */
2947 position_set group; /* Positions that match the input char. */
2948 position_set tmp; /* Temporary space for merging sets. */
2949 state_num state; /* New state. */
2950 state_num state_newline; /* New state on a newline transition. */
2951 state_num state_letter; /* New state on a letter transition. */
2952
2953#ifdef DEBUG
2954 fprintf (stderr, "build state %td\n", s);
2955#endif
2956
2957 /* A pointer to the new transition table, and the table itself. */
2958 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2959 state_num *trans = *ptrans;
2960
2961 if (!trans)
2962 {
2963 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2964 transition tables that can exist at once, other than for
2965 initial states. Often-used transition tables are quickly
2966 rebuilt, whereas rarely-used ones are cleared away. */
2967 if (MAX_TRCOUNT <= d->trcount)
2968 {
2969 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2970 {
2971 free (d->trans[i]);
2972 free (d->fails[i]);
2973 d->trans[i] = d->fails[i] = NULL;
2974 }
2975 d->trcount = 0;
2976 }
2977
2978 d->trcount++;
2979 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2980
2981 /* Fill transition table with a default value which means that the
2982 transited state has not been calculated yet. */
2983 for (int i = 0; i < NOTCHAR; i++)
2984 trans[i] = -2;
2985 }
2986
2987 /* Set up the success bits for this state. */
2988 d->success[s] = 0;
2989 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2990 d->success[s] |= CTX_NEWLINE;
2991 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2992 d->success[s] |= CTX_LETTER;
2993 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2994 d->success[s] |= CTX_NONE;
2995
2996 alloc_position_set (&follows, d->nleaves);
2997
2998 /* Find the union of the follows of the positions of the group.
2999 This is a hideously inefficient loop. Fix it someday. */
3000 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3001 for (idx_t k = 0;
3002 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3003 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3004 &follows);
3005
3006 /* Positions that match the input char. */
3007 alloc_position_set (&group, d->nleaves);
3008
3009 /* The group's label. */
3010 charclass label;
3011 fillset (&label);
3012
3013 for (idx_t i = 0; i < follows.nelem; i++)
3014 {
3015 charclass matches; /* Set of matching characters. */
3016 position pos = follows.elems[i];
3017 bool matched = false;
3018 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3019 {
3020 zeroset (&matches);
3021 setbit (d->tokens[pos.index], &matches);
3022 if (d->tokens[pos.index] == uc)
3023 matched = true;
3024 }
3025 else if (d->tokens[pos.index] >= CSET)
3026 {
3027 matches = d->charclasses[d->tokens[pos.index] - CSET];
3028 if (tstbit (uc, &matches))
3029 matched = true;
3030 }
3031 else if (d->tokens[pos.index] == ANYCHAR)
3032 {
3033 matches = d->charclasses[d->canychar];
3034 if (tstbit (uc, &matches))
3035 matched = true;
3036
3037 /* ANYCHAR must match with a single character, so we must put
3038 it to D->states[s].mbps which contains the positions which
3039 can match with a single character not a byte. If all
3040 positions which has ANYCHAR does not depend on context of
3041 next character, we put the follows instead of it to
3042 D->states[s].mbps to optimize. */
3043 if (succeeds_in_context (pos.constraint, d->states[s].context,
3044 CTX_NONE))
3045 {
3046 if (d->states[s].mbps.nelem == 0)
3047 alloc_position_set (&d->states[s].mbps, 1);
3048 insert (pos, &d->states[s].mbps);
3049 }
3050 }
3051 else
3052 continue;
3053
3054 /* Some characters may need to be eliminated from matches because
3055 they fail in the current context. */
3056 if (pos.constraint != NO_CONSTRAINT)
3057 {
3058 if (!succeeds_in_context (pos.constraint,
3059 d->states[s].context, CTX_NEWLINE))
3060 for (int j = 0; j < CHARCLASS_WORDS; j++)
3061 matches.w[j] &= ~d->syntax.newline.w[j];
3062 if (!succeeds_in_context (pos.constraint,
3063 d->states[s].context, CTX_LETTER))
3064 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3065 matches.w[j] &= ~d->syntax.letters.w[j];
3066 if (!succeeds_in_context (pos.constraint,
3067 d->states[s].context, CTX_NONE))
3068 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3069 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3070
3071 /* If there are no characters left, there's no point in going on. */
3072 if (emptyset (&matches))
3073 continue;
3074
3075 /* If we have reset the bit that made us declare "matched", reset
3076 that indicator, too. This is required to avoid an infinite loop
3077 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3078 if (!tstbit (uc, &matches))
3079 matched = false;
3080 }
3081
3082#ifdef DEBUG
3083 fprintf (stderr, " nextpos %td:", pos.index);
3084 prtok (d->tokens[pos.index]);
3085 fprintf (stderr, " of");
3086 for (unsigned j = 0; j < NOTCHAR; j++)
3087 if (tstbit (j, &matches))
3088 fprintf (stderr, " 0x%02x", j);
3089 fprintf (stderr, "\n");
3090#endif
3091
3092 if (matched)
3093 {
3094 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3095 label.w[k] &= matches.w[k];
3096 append (pos, &group);
3097 }
3098 else
3099 {
3100 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3101 label.w[k] &= ~matches.w[k];
3102 }
3103 }
3104
3105 alloc_position_set (&tmp, d->nleaves);
3106
3107 if (group.nelem > 0)
3108 {
3109 /* If we are building a searching matcher, throw in the positions
3110 of state 0 as well, if possible. */
3111 if (d->searchflag)
3112 {
3113 /* If a token in follows.elems is not 1st byte of a multibyte
3114 character, or the states of follows must accept the bytes
3115 which are not 1st byte of the multibyte character.
3116 Then, if a state of follows encounters a byte, it must not be
3117 a 1st byte of a multibyte character nor a single byte character.
3118 In this case, do not add state[0].follows to next state, because
3119 state[0] must accept 1st-byte.
3120
3121 For example, suppose <sb a> is a certain single byte character,
3122 <mb A> is a certain multibyte character, and the codepoint of
3123 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3124 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3125 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3126 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3127 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3128 not add state[0]. */
3129
3130 bool mergeit = !d->localeinfo.multibyte;
3131 if (!mergeit)
3132 {
3133 mergeit = true;
3134 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3135 mergeit &= d->multibyte_prop[group.elems[j].index];
3136 }
3137 if (mergeit)
3138 merge2 (&group, &d->states[0].elems, &tmp);
3139 }
3140
3141 /* Find out if the new state will want any context information,
3142 by calculating possible contexts that the group can match,
3143 and separate contexts that the new state wants to know. */
3144 int possible_contexts = charclass_context (d, &label);
3145 int separate_contexts = state_separate_contexts (d, &group);
3146
3147 /* Find the state(s) corresponding to the union of the follows. */
3148 if (possible_contexts & ~separate_contexts)
3149 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3150 else
3151 state = -1;
3152 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3153 state_newline = state_index (d, &group, CTX_NEWLINE);
3154 else
3155 state_newline = state;
3156 if (separate_contexts & possible_contexts & CTX_LETTER)
3157 state_letter = state_index (d, &group, CTX_LETTER);
3158 else
3159 state_letter = state;
3160
3161 /* Reallocate now, to reallocate any newline transition properly. */
3162 realloc_trans_if_necessary (d);
3163 }
3164
3165 /* If we are a searching matcher, the default transition is to a state
3166 containing the positions of state 0, otherwise the default transition
3167 is to fail miserably. */
3168 else if (d->searchflag)
3169 {
3170 state_newline = 0;
3171 state_letter = d->min_trcount - 1;
3172 state = d->initstate_notbol;
3173 }
3174 else
3175 {
3176 state_newline = -1;
3177 state_letter = -1;
3178 state = -1;
3179 }
3180
3181 /* Set the transitions for each character in the label. */
3182 for (int i = 0; i < NOTCHAR; i++)
3183 if (tstbit (i, &label))
3184 switch (d->syntax.sbit[i])
3185 {
3186 case CTX_NEWLINE:
3187 trans[i] = state_newline;
3188 break;
3189 case CTX_LETTER:
3190 trans[i] = state_letter;
3191 break;
3192 default:
3193 trans[i] = state;
3194 break;
3195 }
3196
3197#ifdef DEBUG
3198 fprintf (stderr, "trans table %td", s);
3199 for (int i = 0; i < NOTCHAR; ++i)
3200 {
3201 if (!(i & 0xf))
3202 fprintf (stderr, "\n");
3203 fprintf (stderr, " %2td", trans[i]);
3204 }
3205 fprintf (stderr, "\n");
3206#endif
3207
3208 free (group.elems);
3209 free (follows.elems);
3210 free (tmp.elems);
3211
3212 /* Keep the newline transition in a special place so we can use it as
3213 a sentinel. */
3214 if (tstbit (d->syntax.eolbyte, &label))
3215 {
3216 d->newlines[s] = trans[d->syntax.eolbyte];
3217 trans[d->syntax.eolbyte] = -1;
3218 }
3219
3220 return trans[uc];
3221}
3222
3223/* Multibyte character handling sub-routines for dfaexec. */
3224
3225/* Consume a single byte and transit state from 's' to '*next_state'.
3226 This function is almost same as the state transition routin in dfaexec.
3227 But state transition is done just once, otherwise matching succeed or
3228 reach the end of the buffer. */
3229static state_num
3230transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3231{
3232 state_num *t;
3233
3234 if (d->trans[s])
3235 t = d->trans[s];
3236 else if (d->fails[s])
3237 t = d->fails[s];
3238 else
3239 {
3240 build_state (s, d, **pp);
3241 if (d->trans[s])
3242 t = d->trans[s];
3243 else
3244 {
3245 t = d->fails[s];
3246 assert (t);
3247 }
3248 }
3249
3250 if (t[**pp] == -2)
3251 build_state (s, d, **pp);
3252
3253 return t[*(*pp)++];
3254}
3255
3256/* Transit state from s, then return new state and update the pointer of
3257 the buffer. This function is for a period operator which can match a
3258 multi-byte character. */
3259static state_num
3260transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3261 unsigned char const *end)
3262{
3263 wint_t wc;
3264
3265 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3266
3267 /* This state has some operators which can match a multibyte character. */
3268 d->mb_follows.nelem = 0;
3269
3270 /* Calculate the state which can be reached from the state 's' by
3271 consuming 'mbclen' single bytes from the buffer. */
3272 state_num s1 = s;
3273 int mbci;
3274 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3275 s = transit_state_singlebyte (d, s, pp);
3276 *pp += mbclen - mbci;
3277
3278 if (wc == WEOF)
3279 {
3280 /* It is an invalid character, so ANYCHAR is not accepted. */
3281 return s;
3282 }
3283
3284 /* If all positions which have ANYCHAR do not depend on the context
3285 of the next character, calculate the next state with
3286 pre-calculated follows and cache the result. */
3287 if (d->states[s1].mb_trindex < 0)
3288 {
3289 if (MAX_TRCOUNT <= d->mb_trcount)
3290 {
3291 state_num s3;
3292 for (s3 = -1; s3 < d->tralloc; s3++)
3293 {
3294 free (d->mb_trans[s3]);
3295 d->mb_trans[s3] = NULL;
3296 }
3297
3298 for (state_num i = 0; i < d->sindex; i++)
3299 d->states[i].mb_trindex = -1;
3300 d->mb_trcount = 0;
3301 }
3302 d->states[s1].mb_trindex = d->mb_trcount++;
3303 }
3304
3305 if (! d->mb_trans[s])
3306 {
3307 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3308 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3309 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3310 for (int i = 0; i < MAX_TRCOUNT; i++)
3311 d->mb_trans[s][i] = -1;
3312 }
3313 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3314 return d->mb_trans[s][d->states[s1].mb_trindex];
3315
3316 if (s == -1)
3317 copy (&d->states[s1].mbps, &d->mb_follows);
3318 else
3319 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3320
3321 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3322 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3323 realloc_trans_if_necessary (d);
3324
3325 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3326
3327 return s2;
3328}
3329
3330/* The initial state may encounter a byte which is not a single byte character
3331 nor the first byte of a multibyte character. But it is incorrect for the
3332 initial state to accept such a byte. For example, in Shift JIS the regular
3333 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3334 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3335 that are not a single byte character nor the first byte of a multibyte
3336 character.
3337
3338 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3339 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3340 the result is greater than P, set *WCP to the final wide character
3341 processed, or to WEOF if no wide character is processed. Otherwise,
3342 if WCP is non-NULL, *WCP may or may not be updated.
3343
3344 Both P and MBP must be no larger than END. */
3345static unsigned char const *
3346skip_remains_mb (struct dfa *d, unsigned char const *p,
3347 unsigned char const *mbp, char const *end)
3348{
3349 if (d->syntax.never_trail[*p])
3350 return p;
3351 while (mbp < p)
3352 {
3353 wint_t wc;
3354 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3355 end - (char const *) mbp, d);
3356 }
3357 return mbp;
3358}
3359
3360/* Search through a buffer looking for a match to the struct dfa *D.
3361 Find the first occurrence of a string matching the regexp in the
3362 buffer, and the shortest possible version thereof. Return a pointer to
3363 the first character after the match, or NULL if none is found. BEGIN
3364 points to the beginning of the buffer, and END points to the first byte
3365 after its end. Note however that we store a sentinel byte (usually
3366 newline) in *END, so the actual buffer must be one byte longer.
3367 When ALLOW_NL, newlines may appear in the matching string.
3368 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3369 If MULTIBYTE, the input consists of multibyte characters and/or
3370 encoding-error bytes. Otherwise, it consists of single-byte characters.
3371 Here is the list of features that make this DFA matcher punt:
3372 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3373 - [^...] in non-simple locale
3374 - [[=foo=]] or [[.foo.]]
3375 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3376 - back-reference: (.)\1
3377 - word-delimiter in multibyte locale: \<, \>, \b, \B
3378 See struct localeinfo.simple for the definition of "simple locale". */
3379
3380static inline char *
3381dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3382 ptrdiff_t *count, bool multibyte)
3383{
3384 if (MAX_TRCOUNT <= d->sindex)
3385 {
3386 for (state_num s = d->min_trcount; s < d->sindex; s++)
3387 {
3388 free (d->states[s].elems.elems);
3389 free (d->states[s].mbps.elems);
3390 }
3391 d->sindex = d->min_trcount;
3392
3393 if (d->trans)
3394 {
3395 for (state_num s = 0; s < d->tralloc; s++)
3396 {
3397 free (d->trans[s]);
3398 free (d->fails[s]);
3399 d->trans[s] = d->fails[s] = NULL;
3400 }
3401 d->trcount = 0;
3402 }
3403
3404 if (d->localeinfo.multibyte && d->mb_trans)
3405 {
3406 for (state_num s = -1; s < d->tralloc; s++)
3407 {
3408 free (d->mb_trans[s]);
3409 d->mb_trans[s] = NULL;
3410 }
3411 for (state_num s = 0; s < d->min_trcount; s++)
3412 d->states[s].mb_trindex = -1;
3413 d->mb_trcount = 0;
3414 }
3415 }
3416
3417 if (!d->tralloc)
3418 realloc_trans_if_necessary (d);
3419
3420 /* Current state. */
3421 state_num s = 0, s1 = 0;
3422
3423 /* Current input character. */
3424 unsigned char const *p = (unsigned char const *) begin;
3425 unsigned char const *mbp = p;
3426
3427 /* Copy of d->trans so it can be optimized into a register. */
3428 state_num **trans = d->trans;
3429 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3430 unsigned char saved_end = *(unsigned char *) end;
3431 *end = eol;
3432
3433 if (multibyte)
3434 {
3435 memset (&d->mbs, 0, sizeof d->mbs);
3436 if (d->mb_follows.alloc == 0)
3437 alloc_position_set (&d->mb_follows, d->nleaves);
3438 }
3439
3440 idx_t nlcount = 0;
3441 for (;;)
3442 {
3443 state_num *t;
3444 while ((t = trans[s]) != NULL)
3445 {
3446 if (s < d->min_trcount)
3447 {
3448 if (!multibyte || d->states[s].mbps.nelem == 0)
3449 {
3450 while (t[*p] == s)
3451 p++;
3452 }
3453 if (multibyte)
3454 p = mbp = skip_remains_mb (d, p, mbp, end);
3455 }
3456
3457 if (multibyte)
3458 {
3459 s1 = s;
3460
3461 if (d->states[s].mbps.nelem == 0
3462 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3463 {
3464 /* If an input character does not match ANYCHAR, do it
3465 like a single-byte character. */
3466 s = t[*p++];
3467 }
3468 else
3469 {
3470 s = transit_state (d, s, &p, (unsigned char *) end);
3471 mbp = p;
3472 trans = d->trans;
3473 }
3474 }
3475 else
3476 {
3477 s1 = t[*p++];
3478 t = trans[s1];
3479 if (! t)
3480 {
3481 state_num tmp = s;
3482 s = s1;
3483 s1 = tmp; /* swap */
3484 break;
3485 }
3486 if (s < d->min_trcount)
3487 {
3488 while (t[*p] == s1)
3489 p++;
3490 }
3491 s = t[*p++];
3492 }
3493 }
3494
3495 if (s < 0)
3496 {
3497 if (s == -2)
3498 {
3499 s = build_state (s1, d, p[-1]);
3500 trans = d->trans;
3501 }
3502 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3503 {
3504 /* The previous character was a newline. Count it, and skip
3505 checking of multibyte character boundary until here. */
3506 nlcount++;
3507 mbp = p;
3508
3509 s = (allow_nl ? d->newlines[s1]
3510 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3511 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3512 : d->initstate_notbol);
3513 }
3514 else
3515 {
3516 p = NULL;
3517 goto done;
3518 }
3519 }
3520 else if (d->fails[s])
3521 {
3522 if ((d->success[s] & d->syntax.sbit[*p])
3523 || ((char *) p == end
3524 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3525 d)))
3526 goto done;
3527
3528 if (multibyte && s < d->min_trcount)
3529 p = mbp = skip_remains_mb (d, p, mbp, end);
3530
3531 s1 = s;
3532 if (!multibyte || d->states[s].mbps.nelem == 0
3533 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3534 {
3535 /* If a input character does not match ANYCHAR, do it
3536 like a single-byte character. */
3537 s = d->fails[s][*p++];
3538 }
3539 else
3540 {
3541 s = transit_state (d, s, &p, (unsigned char *) end);
3542 mbp = p;
3543 trans = d->trans;
3544 }
3545 }
3546 else
3547 {
3548 build_state (s, d, p[0]);
3549 trans = d->trans;
3550 }
3551 }
3552
3553 done:
3554 if (count)
3555 *count += nlcount;
3556 *end = saved_end;
3557 return (char *) p;
3558}
3559
3560/* Specialized versions of dfaexec for multibyte and single-byte cases.
3561 This is for performance, as dfaexec_main is an inline function. */
3562
3563static char *
3564dfaexec_mb (struct dfa *d, char const *begin, char *end,
3565 bool allow_nl, ptrdiff_t *count, bool *backref)
3566{
3567 return dfaexec_main (d, begin, end, allow_nl, count, true);
3568}
3569
3570static char *
3571dfaexec_sb (struct dfa *d, char const *begin, char *end,
3572 bool allow_nl, ptrdiff_t *count, bool *backref)
3573{
3574 return dfaexec_main (d, begin, end, allow_nl, count, false);
3575}
3576
3577/* Always set *BACKREF and return BEGIN. Use this wrapper for
3578 any regexp that uses a construct not supported by this code. */
3579static char *
3580dfaexec_noop (struct dfa *d, char const *begin, char *end,
3581 bool allow_nl, ptrdiff_t *count, bool *backref)
3582{
3583 *backref = true;
3584 return (char *) begin;
3585}
3586
3587/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3588 but faster and set *BACKREF if the DFA code does not support this
3589 regexp usage. */
3590
3591char *
3592dfaexec (struct dfa *d, char const *begin, char *end,
3593 bool allow_nl, ptrdiff_t *count, bool *backref)
3594{
3595 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3596}
3597
3598struct dfa *
3599dfasuperset (struct dfa const *d)
3600{
3601 return d->superset;
3602}
3603
3604bool
3605dfaisfast (struct dfa const *d)
3606{
3607 return d->fast;
3608}
3609
3610static void
3611free_mbdata (struct dfa *d)
3612{
3613 free (d->multibyte_prop);
3614 free (d->lex.brack.chars);
3615 free (d->mb_follows.elems);
3616
3617 if (d->mb_trans)
3618 {
3619 state_num s;
3620 for (s = -1; s < d->tralloc; s++)
3621 free (d->mb_trans[s]);
3622 free (d->mb_trans - 2);
3623 }
3624}
3625
3626/* Return true if every construct in D is supported by this DFA matcher. */
3627bool
3628dfasupported (struct dfa const *d)
3629{
3630 for (idx_t i = 0; i < d->tindex; i++)
3631 {
3632 switch (d->tokens[i])
3633 {
3634 case BEGWORD:
3635 case ENDWORD:
3636 case LIMWORD:
3637 case NOTLIMWORD:
3638 if (!d->localeinfo.multibyte)
3639 continue;
3640 FALLTHROUGH;
3641 case BACKREF:
3642 case MBCSET:
3643 return false;
3644 }
3645 }
3646 return true;
3647}
3648
3649/* Disable use of the superset DFA if it is not likely to help
3650 performance. */
3651static void
3652maybe_disable_superset_dfa (struct dfa *d)
3653{
3654 if (!d->localeinfo.using_utf8)
3655 return;
3656
3657 bool have_backref = false;
3658 for (idx_t i = 0; i < d->tindex; i++)
3659 {
3660 switch (d->tokens[i])
3661 {
3662 case ANYCHAR:
3663 /* Lowered. */
3664 abort ();
3665 case BACKREF:
3666 have_backref = true;
3667 break;
3668 case MBCSET:
3669 /* Requires multi-byte algorithm. */
3670 return;
3671 default:
3672 break;
3673 }
3674 }
3675
3676 if (!have_backref && d->superset)
3677 {
3678 /* The superset DFA is not likely to be much faster, so remove it. */
3679 dfafree (d->superset);
3680 free (d->superset);
3681 d->superset = NULL;
3682 }
3683
3684 free_mbdata (d);
3685 d->localeinfo.multibyte = false;
3686 d->dfaexec = dfaexec_sb;
3687 d->fast = true;
3688}
3689
3690static void
3691dfassbuild (struct dfa *d)
3692{
3693 struct dfa *sup = dfaalloc ();
3694
3695 *sup = *d;
3696 sup->localeinfo.multibyte = false;
3697 sup->dfaexec = dfaexec_sb;
3698 sup->multibyte_prop = NULL;
3699 sup->superset = NULL;
3700 sup->states = NULL;
3701 sup->sindex = 0;
3702 sup->constraints = NULL;
3703 sup->separates = NULL;
3704 sup->follows = NULL;
3705 sup->tralloc = 0;
3706 sup->trans = NULL;
3707 sup->fails = NULL;
3708 sup->success = NULL;
3709 sup->newlines = NULL;
3710
3711 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3712 if (d->cindex)
3713 {
3714 memcpy (sup->charclasses, d->charclasses,
3715 d->cindex * sizeof *sup->charclasses);
3716 }
3717
3718 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3719 sup->talloc = d->tindex * 2;
3720
3721 bool have_achar = false;
3722 bool have_nchar = false;
3723 idx_t j;
3724 for (idx_t i = j = 0; i < d->tindex; i++)
3725 {
3726 switch (d->tokens[i])
3727 {
3728 case ANYCHAR:
3729 case MBCSET:
3730 case BACKREF:
3731 {
3732 charclass ccl;
3733 fillset (&ccl);
3734 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3735 sup->tokens[j++] = STAR;
3736 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3737 || d->tokens[i + 1] == PLUS)
3738 i++;
3739 have_achar = true;
3740 }
3741 break;
3742 case BEGWORD:
3743 case ENDWORD:
3744 case LIMWORD:
3745 case NOTLIMWORD:
3746 if (d->localeinfo.multibyte)
3747 {
3748 /* These constraints aren't supported in a multibyte locale.
3749 Ignore them in the superset DFA. */
3750 sup->tokens[j++] = EMPTY;
3751 break;
3752 }
3753 FALLTHROUGH;
3754 default:
3755 sup->tokens[j++] = d->tokens[i];
3756 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3757 || d->tokens[i] >= CSET)
3758 have_nchar = true;
3759 break;
3760 }
3761 }
3762 sup->tindex = j;
3763
3764 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3765 d->superset = sup;
3766 else
3767 {
3768 dfafree (sup);
3769 free (sup);
3770 }
3771}
3772
3773/* Parse a string S of length LEN into D (but skip this step if S is null).
3774 Then analyze D and build a matcher for it.
3775 SEARCHFLAG says whether to build a searching or an exact matcher. */
3776void
3777dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3778{
3779 if (s != NULL)
3780 dfaparse (s, len, d);
3781
3782 dfassbuild (d);
3783
3784 if (dfasupported (d))
3785 {
3786 maybe_disable_superset_dfa (d);
3787 dfaanalyze (d, searchflag);
3788 }
3789 else
3790 {
3791 d->dfaexec = dfaexec_noop;
3792 }
3793
3794 if (d->superset)
3795 {
3796 d->fast = true;
3797 dfaanalyze (d->superset, searchflag);
3798 }
3799}
3800
3801/* Free the storage held by the components of a dfa. */
3802void
3803dfafree (struct dfa *d)
3804{
3805 free (d->charclasses);
3806 free (d->tokens);
3807
3808 if (d->localeinfo.multibyte)
3809 free_mbdata (d);
3810
3811 free (d->constraints);
3812 free (d->separates);
3813
3814 for (idx_t i = 0; i < d->sindex; i++)
3815 {
3816 free (d->states[i].elems.elems);
3817 free (d->states[i].mbps.elems);
3818 }
3819 free (d->states);
3820
3821 if (d->follows)
3822 {
3823 for (idx_t i = 0; i < d->tindex; i++)
3824 free (d->follows[i].elems);
3825 free (d->follows);
3826 }
3827
3828 if (d->trans)
3829 {
3830 for (idx_t i = 0; i < d->tralloc; i++)
3831 {
3832 free (d->trans[i]);
3833 free (d->fails[i]);
3834 }
3835
3836 free (d->trans - 2);
3837 free (d->fails);
3838 free (d->newlines);
3839 free (d->success);
3840 }
3841
3842 if (d->superset)
3843 {
3844 dfafree (d->superset);
3845 free (d->superset);
3846 }
3847}
3848
3849/* Having found the postfix representation of the regular expression,
3850 try to find a long sequence of characters that must appear in any line
3851 containing the r.e.
3852 Finding a "longest" sequence is beyond the scope here;
3853 we take an easy way out and hope for the best.
3854 (Take "(ab|a)b"--please.)
3855
3856 We do a bottom-up calculation of sequences of characters that must appear
3857 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3858 representation:
3859 sequences that must appear at the left of the match ("left")
3860 sequences that must appear at the right of the match ("right")
3861 lists of sequences that must appear somewhere in the match ("in")
3862 sequences that must constitute the match ("is")
3863
3864 When we get to the root of the tree, we use one of the longest of its
3865 calculated "in" sequences as our answer.
3866
3867 The sequences calculated for the various types of node (in pseudo ANSI c)
3868 are shown below. "p" is the operand of unary operators (and the left-hand
3869 operand of binary operators); "q" is the right-hand operand of binary
3870 operators.
3871
3872 "ZERO" means "a zero-length sequence" below.
3873
3874 Type left right is in
3875 ---- ---- ----- -- --
3876 char c # c # c # c # c
3877
3878 ANYCHAR ZERO ZERO ZERO ZERO
3879
3880 MBCSET ZERO ZERO ZERO ZERO
3881
3882 CSET ZERO ZERO ZERO ZERO
3883
3884 STAR ZERO ZERO ZERO ZERO
3885
3886 QMARK ZERO ZERO ZERO ZERO
3887
3888 PLUS p->left p->right ZERO p->in
3889
3890 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3891 p->left : q->right : q->is!=ZERO) ? q->in plus
3892 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3893 ZERO
3894
3895 OR longest common longest common (do p->is and substrings common
3896 leading trailing to q->is have same p->in and
3897 (sub)sequence (sub)sequence q->in length and content) ?
3898 of p->left of p->right
3899 and q->left and q->right p->is : NULL
3900
3901 If there's anything else we recognize in the tree, all four sequences get set
3902 to zero-length sequences. If there's something we don't recognize in the
3903 tree, we just return a zero-length sequence.
3904
3905 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3906 'aaa')?
3907
3908 And ... is it here or someplace that we might ponder "optimizations" such as
3909 egrep 'psi|epsilon' -> egrep 'psi'
3910 egrep 'pepsi|epsilon' -> egrep 'epsi'
3911 (Yes, we now find "epsi" as a "string
3912 that must occur", but we might also
3913 simplify the *entire* r.e. being sought)
3914 grep '[c]' -> grep 'c'
3915 grep '(ab|a)b' -> grep 'ab'
3916 grep 'ab*' -> grep 'a'
3917 grep 'a*b' -> grep 'b'
3918
3919 There are several issues:
3920
3921 Is optimization easy (enough)?
3922
3923 Does optimization actually accomplish anything,
3924 or is the automaton you get from "psi|epsilon" (for example)
3925 the same as the one you get from "psi" (for example)?
3926
3927 Are optimizable r.e.'s likely to be used in real-life situations
3928 (something like 'ab*' is probably unlikely; something like is
3929 'psi|epsilon' is likelier)? */
3930
3931static char *
3932icatalloc (char *old, char const *new)
3933{
3934 idx_t newsize = strlen (new);
3935 if (newsize == 0)
3936 return old;
3937 idx_t oldsize = strlen (old);
3938 char *result = xirealloc (old, oldsize + newsize + 1);
3939 memcpy (result + oldsize, new, newsize + 1);
3940 return result;
3941}
3942
3943static void
3944freelist (char **cpp)
3945{
3946 while (*cpp)
3947 free (*cpp++);
3948}
3949
3950static char **
3951enlistnew (char **cpp, char *new)
3952{
3953 /* Is there already something in the list that's new (or longer)? */
3954 idx_t i;
3955 for (i = 0; cpp[i] != NULL; i++)
3956 if (strstr (cpp[i], new) != NULL)
3957 {
3958 free (new);
3959 return cpp;
3960 }
3961 /* Eliminate any obsoleted strings. */
3962 for (idx_t j = 0; cpp[j] != NULL; )
3963 if (strstr (new, cpp[j]) == NULL)
3964 ++j;
3965 else
3966 {
3967 free (cpp[j]);
3968 if (--i == j)
3969 break;
3970 cpp[j] = cpp[i];
3971 cpp[i] = NULL;
3972 }
3973 /* Add the new string. */
3974 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
3975 cpp[i] = new;
3976 cpp[i + 1] = NULL;
3977 return cpp;
3978}
3979
3980static char **
3981enlist (char **cpp, char const *str, idx_t len)
3982{
3983 return enlistnew (cpp, ximemdup0 (str, len));
3984}
3985
3986/* Given pointers to two strings, return a pointer to an allocated
3987 list of their distinct common substrings. */
3988static char **
3989comsubs (char *left, char const *right)
3990{
3991 char **cpp = xzalloc (sizeof *cpp);
3992
3993 for (char *lcp = left; *lcp != '\0'; lcp++)
3994 {
3995 idx_t len = 0;
3996 char *rcp = strchr (right, *lcp);
3997 while (rcp != NULL)
3998 {
3999 idx_t i;
4000 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4001 continue;
4002 if (i > len)
4003 len = i;
4004 rcp = strchr (rcp + 1, *lcp);
4005 }
4006 if (len != 0)
4007 cpp = enlist (cpp, lcp, len);
4008 }
4009 return cpp;
4010}
4011
4012static char **
4013addlists (char **old, char **new)
4014{
4015 for (; *new; new++)
4016 old = enlistnew (old, xstrdup (*new));
4017 return old;
4018}
4019
4020/* Given two lists of substrings, return a new list giving substrings
4021 common to both. */
4022static char **
4023inboth (char **left, char **right)
4024{
4025 char **both = xzalloc (sizeof *both);
4026
4027 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4028 {
4029 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4030 {
4031 char **temp = comsubs (left[lnum], right[rnum]);
4032 both = addlists (both, temp);
4033 freelist (temp);
4034 free (temp);
4035 }
4036 }
4037 return both;
4038}
4039
4040typedef struct must must;
4041
4042struct must
4043{
4044 char **in;
4045 char *left;
4046 char *right;
4047 char *is;
4048 bool begline;
4049 bool endline;
4050 must *prev;
4051};
4052
4053static must *
4054allocmust (must *mp, idx_t size)
4055{
4056 must *new_mp = xmalloc (sizeof *new_mp);
4057 new_mp->in = xzalloc (sizeof *new_mp->in);
4058 new_mp->left = xizalloc (size);
4059 new_mp->right = xizalloc (size);
4060 new_mp->is = xizalloc (size);
4061 new_mp->begline = false;
4062 new_mp->endline = false;
4063 new_mp->prev = mp;
4064 return new_mp;
4065}
4066
4067static void
4068resetmust (must *mp)
4069{
4070 freelist (mp->in);
4071 mp->in[0] = NULL;
4072 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4073 mp->begline = false;
4074 mp->endline = false;
4075}
4076
4077static void
4078freemust (must *mp)
4079{
4080 freelist (mp->in);
4081 free (mp->in);
4082 free (mp->left);
4083 free (mp->right);
4084 free (mp->is);
4085 free (mp);
4086}
4087
4088struct dfamust *
4089dfamust (struct dfa const *d)
4090{
4091 must *mp = NULL;
4092 char const *result = "";
4093 bool exact = false;
4094 bool begline = false;
4095 bool endline = false;
4096 bool need_begline = false;
4097 bool need_endline = false;
4098 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4099
4100 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4101 {
4102 token t = d->tokens[ri];
4103 switch (t)
4104 {
4105 case BEGLINE:
4106 mp = allocmust (mp, 2);
4107 mp->begline = true;
4108 need_begline = true;
4109 break;
4110 case ENDLINE:
4111 mp = allocmust (mp, 2);
4112 mp->endline = true;
4113 need_endline = true;
4114 break;
4115 case LPAREN:
4116 case RPAREN:
4117 assert (!"neither LPAREN nor RPAREN may appear here");
4118
4119 case EMPTY:
4120 case BEGWORD:
4121 case ENDWORD:
4122 case LIMWORD:
4123 case NOTLIMWORD:
4124 case BACKREF:
4125 case ANYCHAR:
4126 case MBCSET:
4127 mp = allocmust (mp, 2);
4128 break;
4129
4130 case STAR:
4131 case QMARK:
4132 assume_nonnull (mp);
4133 resetmust (mp);
4134 break;
4135
4136 case OR:
4137 {
4138 char **new;
4139 must *rmp = mp;
4140 assume_nonnull (rmp);
4141 must *lmp = mp = mp->prev;
4142 assume_nonnull (lmp);
4143 idx_t j, ln, rn, n;
4144
4145 /* Guaranteed to be. Unlikely, but ... */
4146 if (streq (lmp->is, rmp->is))
4147 {
4148 lmp->begline &= rmp->begline;
4149 lmp->endline &= rmp->endline;
4150 }
4151 else
4152 {
4153 lmp->is[0] = '\0';
4154 lmp->begline = false;
4155 lmp->endline = false;
4156 }
4157 /* Left side--easy */
4158 idx_t i = 0;
4159 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4160 ++i;
4161 lmp->left[i] = '\0';
4162 /* Right side */
4163 ln = strlen (lmp->right);
4164 rn = strlen (rmp->right);
4165 n = ln;
4166 if (n > rn)
4167 n = rn;
4168 for (i = 0; i < n; ++i)
4169 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4170 break;
4171 for (j = 0; j < i; ++j)
4172 lmp->right[j] = lmp->right[(ln - i) + j];
4173 lmp->right[j] = '\0';
4174 new = inboth (lmp->in, rmp->in);
4175 freelist (lmp->in);
4176 free (lmp->in);
4177 lmp->in = new;
4178 freemust (rmp);
4179 }
4180 break;
4181
4182 case PLUS:
4183 assume_nonnull (mp);
4184 mp->is[0] = '\0';
4185 break;
4186
4187 case END:
4188 assume_nonnull (mp);
4189 assert (!mp->prev);
4190 for (idx_t i = 0; mp->in[i] != NULL; i++)
4191 if (strlen (mp->in[i]) > strlen (result))
4192 result = mp->in[i];
4193 if (streq (result, mp->is))
4194 {
4195 if ((!need_begline || mp->begline) && (!need_endline
4196 || mp->endline))
4197 exact = true;
4198 begline = mp->begline;
4199 endline = mp->endline;
4200 }
4201 goto done;
4202
4203 case CAT:
4204 {
4205 must *rmp = mp;
4206 assume_nonnull (rmp);
4207 must *lmp = mp = mp->prev;
4208 assume_nonnull (lmp);
4209
4210 /* In. Everything in left, plus everything in
4211 right, plus concatenation of
4212 left's right and right's left. */
4213 lmp->in = addlists (lmp->in, rmp->in);
4214 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4215 {
4216 idx_t lrlen = strlen (lmp->right);
4217 idx_t rllen = strlen (rmp->left);
4218 char *tp = ximalloc (lrlen + rllen + 1);
4219 memcpy (tp + lrlen, rmp->left, rllen + 1);
4220 memcpy (tp, lmp->right, lrlen);
4221 lmp->in = enlistnew (lmp->in, tp);
4222 }
4223 /* Left-hand */
4224 if (lmp->is[0] != '\0')
4225 lmp->left = icatalloc (lmp->left, rmp->left);
4226 /* Right-hand */
4227 if (rmp->is[0] == '\0')
4228 lmp->right[0] = '\0';
4229 lmp->right = icatalloc (lmp->right, rmp->right);
4230 /* Guaranteed to be */
4231 if ((lmp->is[0] != '\0' || lmp->begline)
4232 && (rmp->is[0] != '\0' || rmp->endline))
4233 {
4234 lmp->is = icatalloc (lmp->is, rmp->is);
4235 lmp->endline = rmp->endline;
4236 }
4237 else
4238 {
4239 lmp->is[0] = '\0';
4240 lmp->begline = false;
4241 lmp->endline = false;
4242 }
4243 freemust (rmp);
4244 }
4245 break;
4246
4247 case '\0':
4248 /* Not on *my* shift. */
4249 goto done;
4250
4251 default:
4252 if (CSET <= t)
4253 {
4254 /* If T is a singleton, or if case-folding in a unibyte
4255 locale and T's members all case-fold to the same char,
4256 convert T to one of its members. Otherwise, do
4257 nothing further with T. */
4258 charclass *ccl = &d->charclasses[t - CSET];
4259 int j;
4260 for (j = 0; j < NOTCHAR; j++)
4261 if (tstbit (j, ccl))
4262 break;
4263 if (! (j < NOTCHAR))
4264 {
4265 mp = allocmust (mp, 2);
4266 break;
4267 }
4268 t = j;
4269 while (++j < NOTCHAR)
4270 if (tstbit (j, ccl)
4271 && ! (case_fold_unibyte
4272 && toupper (j) == toupper (t)))
4273 break;
4274 if (j < NOTCHAR)
4275 {
4276 mp = allocmust (mp, 2);
4277 break;
4278 }
4279 }
4280
4281 idx_t rj = ri + 2;
4282 if (d->tokens[ri + 1] == CAT)
4283 {
4284 for (; rj < d->tindex - 1; rj += 2)
4285 {
4286 if ((rj != ri && (d->tokens[rj] <= 0
4287 || NOTCHAR <= d->tokens[rj]))
4288 || d->tokens[rj + 1] != CAT)
4289 break;
4290 }
4291 }
4292 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4293 mp->is[0] = mp->left[0] = mp->right[0]
4294 = case_fold_unibyte ? toupper (t) : t;
4295
4296 idx_t i;
4297 for (i = 1; ri + 2 < rj; i++)
4298 {
4299 ri += 2;
4300 t = d->tokens[ri];
4301 mp->is[i] = mp->left[i] = mp->right[i]
4302 = case_fold_unibyte ? toupper (t) : t;
4303 }
4304 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4305 mp->in = enlist (mp->in, mp->is, i);
4306 break;
4307 }
4308 }
4309 done:;
4310
4311 struct dfamust *dm = NULL;
4312 if (*result)
4313 {
4314 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4315 dm->exact = exact;
4316 dm->begline = begline;
4317 dm->endline = endline;
4318 strcpy (dm->must, result);
4319 }
4320
4321 while (mp)
4322 {
4323 must *prev = mp->prev;
4324 freemust (mp);
4325 mp = prev;
4326 }
4327
4328 return dm;
4329}
4330
4331void
4332dfamustfree (struct dfamust *dm)
4333{
4334 free (dm);
4335}
4336
4337struct dfa *
4338dfaalloc (void)
4339{
4340 return xmalloc (sizeof (struct dfa));
4341}
4342
4343/* Initialize DFA. */
4344void
4345dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4346 reg_syntax_t bits, int dfaopts)
4347{
4348 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4349 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4350 dfa->localeinfo = *linfo;
4351
4352 dfa->fast = !dfa->localeinfo.multibyte;
4353
4354 dfa->canychar = -1;
4355 dfa->syntax.syntax_bits_set = true;
4356 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4357 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4358 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4359 dfa->syntax.syntax_bits = bits;
4360
4361 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4362 {
4363 unsigned char uc = i;
4364
4365 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4366 switch (dfa->syntax.sbit[uc])
4367 {
4368 case CTX_LETTER:
4369 setbit (uc, &dfa->syntax.letters);
4370 break;
4371 case CTX_NEWLINE:
4372 setbit (uc, &dfa->syntax.newline);
4373 break;
4374 }
4375
4376 /* POSIX requires that the five bytes in "\n\r./" (including the
4377 terminating NUL) cannot occur inside a multibyte character. */
4378 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4379 ? (uc & 0xc0) != 0x80
4380 : strchr ("\n\r./", uc) != NULL);
4381 }
4382}
4383
4384/* Initialize TO by copying FROM's syntax settings. */
4385void
4386dfacopysyntax (struct dfa *to, struct dfa const *from)
4387{
4388 memset (to, 0, offsetof (struct dfa, syntax));
4389 to->canychar = -1;
4390 to->fast = from->fast;
4391 to->syntax = from->syntax;
4392 to->dfaexec = from->dfaexec;
4393 to->localeinfo = from->localeinfo;
4394}
4395
4396/* vim:set shiftwidth=2: */
Note: See TracBrowser for help on using the repository browser.