1 | /* dfa.h - declarations for GNU deterministic regexp compiler
|
---|
2 | Copyright (C) 1988, 1998, 2002, 2004 Free Software Foundation, Inc.
|
---|
3 |
|
---|
4 | This program is free software; you can redistribute it and/or modify
|
---|
5 | it under the terms of the GNU General Public License as published by
|
---|
6 | the Free Software Foundation; either version 2, or (at your option)
|
---|
7 | any later version.
|
---|
8 |
|
---|
9 | This program is distributed in the hope that it will be useful,
|
---|
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
12 | GNU General Public License for more details.
|
---|
13 |
|
---|
14 | You should have received a copy of the GNU General Public License
|
---|
15 | along with this program; if not, write to the Free Software
|
---|
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
|
---|
17 |
|
---|
18 | /* Written June, 1988 by Mike Haertel */
|
---|
19 |
|
---|
20 | /* FIXME:
|
---|
21 | 2. We should not export so much of the DFA internals.
|
---|
22 | In addition to clobbering modularity, we eat up valuable
|
---|
23 | name space. */
|
---|
24 |
|
---|
25 | #ifdef __STDC__
|
---|
26 | # ifndef _PTR_T
|
---|
27 | # define _PTR_T
|
---|
28 | typedef void * ptr_t;
|
---|
29 | # endif
|
---|
30 | #else
|
---|
31 | # ifndef _PTR_T
|
---|
32 | # define _PTR_T
|
---|
33 | typedef char * ptr_t;
|
---|
34 | # endif
|
---|
35 | #endif
|
---|
36 |
|
---|
37 | #ifdef PARAMS
|
---|
38 | # undef PARAMS
|
---|
39 | #endif
|
---|
40 | #if PROTOTYPES
|
---|
41 | # define PARAMS(x) x
|
---|
42 | #else
|
---|
43 | # define PARAMS(x) ()
|
---|
44 | #endif
|
---|
45 |
|
---|
46 | /* Number of bits in an unsigned char. */
|
---|
47 | #ifndef CHARBITS
|
---|
48 | #define CHARBITS 8
|
---|
49 | #endif
|
---|
50 |
|
---|
51 | /* First integer value that is greater than any character code. */
|
---|
52 | #define NOTCHAR (1 << CHARBITS)
|
---|
53 |
|
---|
54 | /* INTBITS need not be exact, just a lower bound. */
|
---|
55 | #ifndef INTBITS
|
---|
56 | #define INTBITS (CHARBITS * sizeof (int))
|
---|
57 | #endif
|
---|
58 |
|
---|
59 | /* Number of ints required to hold a bit for every character. */
|
---|
60 | #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
|
---|
61 |
|
---|
62 | /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
|
---|
63 | typedef int charclass[CHARCLASS_INTS];
|
---|
64 |
|
---|
65 | /* The regexp is parsed into an array of tokens in postfix form. Some tokens
|
---|
66 | are operators and others are terminal symbols. Most (but not all) of these
|
---|
67 | codes are returned by the lexical analyzer. */
|
---|
68 |
|
---|
69 | typedef enum
|
---|
70 | {
|
---|
71 | END = -1, /* END is a terminal symbol that matches the
|
---|
72 | end of input; any value of END or less in
|
---|
73 | the parse tree is such a symbol. Accepting
|
---|
74 | states of the DFA are those that would have
|
---|
75 | a transition on END. */
|
---|
76 |
|
---|
77 | /* Ordinary character values are terminal symbols that match themselves. */
|
---|
78 |
|
---|
79 | EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
|
---|
80 | the empty string. */
|
---|
81 |
|
---|
82 | BACKREF, /* BACKREF is generated by \<digit>; it
|
---|
83 | it not completely handled. If the scanner
|
---|
84 | detects a transition on backref, it returns
|
---|
85 | a kind of "semi-success" indicating that
|
---|
86 | the match will have to be verified with
|
---|
87 | a backtracking matcher. */
|
---|
88 |
|
---|
89 | BEGLINE, /* BEGLINE is a terminal symbol that matches
|
---|
90 | the empty string if it is at the beginning
|
---|
91 | of a line. */
|
---|
92 |
|
---|
93 | ENDLINE, /* ENDLINE is a terminal symbol that matches
|
---|
94 | the empty string if it is at the end of
|
---|
95 | a line. */
|
---|
96 |
|
---|
97 | BEGWORD, /* BEGWORD is a terminal symbol that matches
|
---|
98 | the empty string if it is at the beginning
|
---|
99 | of a word. */
|
---|
100 |
|
---|
101 | ENDWORD, /* ENDWORD is a terminal symbol that matches
|
---|
102 | the empty string if it is at the end of
|
---|
103 | a word. */
|
---|
104 |
|
---|
105 | LIMWORD, /* LIMWORD is a terminal symbol that matches
|
---|
106 | the empty string if it is at the beginning
|
---|
107 | or the end of a word. */
|
---|
108 |
|
---|
109 | NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
|
---|
110 | matches the empty string if it is not at
|
---|
111 | the beginning or end of a word. */
|
---|
112 |
|
---|
113 | QMARK, /* QMARK is an operator of one argument that
|
---|
114 | matches zero or one occurences of its
|
---|
115 | argument. */
|
---|
116 |
|
---|
117 | STAR, /* STAR is an operator of one argument that
|
---|
118 | matches the Kleene closure (zero or more
|
---|
119 | occurrences) of its argument. */
|
---|
120 |
|
---|
121 | PLUS, /* PLUS is an operator of one argument that
|
---|
122 | matches the positive closure (one or more
|
---|
123 | occurrences) of its argument. */
|
---|
124 |
|
---|
125 | REPMN, /* REPMN is a lexical token corresponding
|
---|
126 | to the {m,n} construct. REPMN never
|
---|
127 | appears in the compiled token vector. */
|
---|
128 |
|
---|
129 | CAT, /* CAT is an operator of two arguments that
|
---|
130 | matches the concatenation of its
|
---|
131 | arguments. CAT is never returned by the
|
---|
132 | lexical analyzer. */
|
---|
133 |
|
---|
134 | OR, /* OR is an operator of two arguments that
|
---|
135 | matches either of its arguments. */
|
---|
136 |
|
---|
137 | ORTOP, /* OR at the toplevel in the parse tree.
|
---|
138 | This is used for a boyer-moore heuristic. */
|
---|
139 |
|
---|
140 | LPAREN, /* LPAREN never appears in the parse tree,
|
---|
141 | it is only a lexeme. */
|
---|
142 |
|
---|
143 | RPAREN, /* RPAREN never appears in the parse tree. */
|
---|
144 |
|
---|
145 | CRANGE, /* CRANGE never appears in the parse tree.
|
---|
146 | It stands for a character range that can
|
---|
147 | match a string of one or more characters.
|
---|
148 | For example, [a-z] can match "ch" in
|
---|
149 | a Spanish locale. */
|
---|
150 |
|
---|
151 | #ifdef MBS_SUPPORT
|
---|
152 | ANYCHAR, /* ANYCHAR is a terminal symbol that matches
|
---|
153 | any multibyte (or single byte) characters.
|
---|
154 | It is used only if MB_CUR_MAX > 1. */
|
---|
155 |
|
---|
156 | MBCSET, /* MBCSET is similar to CSET, but for
|
---|
157 | multibyte characters. */
|
---|
158 | #endif /* MBS_SUPPORT */
|
---|
159 |
|
---|
160 | CSET /* CSET and (and any value greater) is a
|
---|
161 | terminal symbol that matches any of a
|
---|
162 | class of characters. */
|
---|
163 | } token;
|
---|
164 |
|
---|
165 | /* Sets are stored in an array in the compiled dfa; the index of the
|
---|
166 | array corresponding to a given set token is given by SET_INDEX(t). */
|
---|
167 | #define SET_INDEX(t) ((t) - CSET)
|
---|
168 |
|
---|
169 | /* Sometimes characters can only be matched depending on the surrounding
|
---|
170 | context. Such context decisions depend on what the previous character
|
---|
171 | was, and the value of the current (lookahead) character. Context
|
---|
172 | dependent constraints are encoded as 8 bit integers. Each bit that
|
---|
173 | is set indicates that the constraint succeeds in the corresponding
|
---|
174 | context.
|
---|
175 |
|
---|
176 | bit 7 - previous and current are newlines
|
---|
177 | bit 6 - previous was newline, current isn't
|
---|
178 | bit 5 - previous wasn't newline, current is
|
---|
179 | bit 4 - neither previous nor current is a newline
|
---|
180 | bit 3 - previous and current are word-constituents
|
---|
181 | bit 2 - previous was word-constituent, current isn't
|
---|
182 | bit 1 - previous wasn't word-constituent, current is
|
---|
183 | bit 0 - neither previous nor current is word-constituent
|
---|
184 |
|
---|
185 | Word-constituent characters are those that satisfy isalnum().
|
---|
186 |
|
---|
187 | The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint
|
---|
188 | succeeds in a particular context. Prevn is true if the previous character
|
---|
189 | was a newline, currn is true if the lookahead character is a newline.
|
---|
190 | Prevl and currl similarly depend upon whether the previous and current
|
---|
191 | characters are word-constituent letters. */
|
---|
192 | #define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
|
---|
193 | ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4))
|
---|
194 | #define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
|
---|
195 | ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0)))
|
---|
196 | #define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
|
---|
197 | (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
|
---|
198 | && MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
|
---|
199 |
|
---|
200 | /* The following macros give information about what a constraint depends on. */
|
---|
201 | #define PREV_NEWLINE_DEPENDENT(constraint) \
|
---|
202 | (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
|
---|
203 | #define PREV_LETTER_DEPENDENT(constraint) \
|
---|
204 | (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
|
---|
205 |
|
---|
206 | /* Tokens that match the empty string subject to some constraint actually
|
---|
207 | work by applying that constraint to determine what may follow them,
|
---|
208 | taking into account what has gone before. The following values are
|
---|
209 | the constraints corresponding to the special tokens previously defined. */
|
---|
210 | #define NO_CONSTRAINT 0xff
|
---|
211 | #define BEGLINE_CONSTRAINT 0xcf
|
---|
212 | #define ENDLINE_CONSTRAINT 0xaf
|
---|
213 | #define BEGWORD_CONSTRAINT 0xf2
|
---|
214 | #define ENDWORD_CONSTRAINT 0xf4
|
---|
215 | #define LIMWORD_CONSTRAINT 0xf6
|
---|
216 | #define NOTLIMWORD_CONSTRAINT 0xf9
|
---|
217 |
|
---|
218 | /* States of the recognizer correspond to sets of positions in the parse
|
---|
219 | tree, together with the constraints under which they may be matched.
|
---|
220 | So a position is encoded as an index into the parse tree together with
|
---|
221 | a constraint. */
|
---|
222 | typedef struct
|
---|
223 | {
|
---|
224 | unsigned index; /* Index into the parse array. */
|
---|
225 | unsigned constraint; /* Constraint for matching this position. */
|
---|
226 | } position;
|
---|
227 |
|
---|
228 | /* Sets of positions are stored as arrays. */
|
---|
229 | typedef struct
|
---|
230 | {
|
---|
231 | position *elems; /* Elements of this position set. */
|
---|
232 | int nelem; /* Number of elements in this set. */
|
---|
233 | } position_set;
|
---|
234 |
|
---|
235 | /* A state of the dfa consists of a set of positions, some flags,
|
---|
236 | and the token value of the lowest-numbered position of the state that
|
---|
237 | contains an END token. */
|
---|
238 | typedef struct
|
---|
239 | {
|
---|
240 | int hash; /* Hash of the positions of this state. */
|
---|
241 | position_set elems; /* Positions this state could match. */
|
---|
242 | char newline; /* True if previous state matched newline. */
|
---|
243 | char letter; /* True if previous state matched a letter. */
|
---|
244 | char backref; /* True if this state matches a \<digit>. */
|
---|
245 | unsigned char constraint; /* Constraint for this state to accept. */
|
---|
246 | int first_end; /* Token value of the first END in elems. */
|
---|
247 | #ifdef MBS_SUPPORT
|
---|
248 | position_set mbps; /* Positions which can match multibyte
|
---|
249 | characters. e.g. period.
|
---|
250 | These staff are used only if
|
---|
251 | MB_CUR_MAX > 1. */
|
---|
252 | #endif
|
---|
253 | } dfa_state;
|
---|
254 |
|
---|
255 | /* Element of a list of strings, at least one of which is known to
|
---|
256 | appear in any R.E. matching the DFA. */
|
---|
257 | struct dfamust
|
---|
258 | {
|
---|
259 | int exact;
|
---|
260 | char *must;
|
---|
261 | struct dfamust *next;
|
---|
262 | };
|
---|
263 |
|
---|
264 | #ifdef MBS_SUPPORT
|
---|
265 | /* A bracket operator.
|
---|
266 | e.g. [a-c], [[:alpha:]], etc. */
|
---|
267 | struct mb_char_classes
|
---|
268 | {
|
---|
269 | int invert;
|
---|
270 | wchar_t *chars; /* Normal characters. */
|
---|
271 | int nchars;
|
---|
272 | wctype_t *ch_classes; /* Character classes. */
|
---|
273 | int nch_classes;
|
---|
274 | wchar_t *range_sts; /* Range characters (start of the range). */
|
---|
275 | wchar_t *range_ends; /* Range characters (end of the range). */
|
---|
276 | int nranges;
|
---|
277 | char **equivs; /* Equivalent classes. */
|
---|
278 | int nequivs;
|
---|
279 | char **coll_elems;
|
---|
280 | int ncoll_elems; /* Collating elements. */
|
---|
281 | };
|
---|
282 | #endif
|
---|
283 |
|
---|
284 | /* A compiled regular expression. */
|
---|
285 | struct dfa
|
---|
286 | {
|
---|
287 | /* Stuff built by the scanner. */
|
---|
288 | charclass *charclasses; /* Array of character sets for CSET tokens. */
|
---|
289 | int cindex; /* Index for adding new charclasses. */
|
---|
290 | int calloc; /* Number of charclasses currently allocated. */
|
---|
291 |
|
---|
292 | /* Stuff built by the parser. */
|
---|
293 | token *tokens; /* Postfix parse array. */
|
---|
294 | int tindex; /* Index for adding new tokens. */
|
---|
295 | int talloc; /* Number of tokens currently allocated. */
|
---|
296 | int depth; /* Depth required of an evaluation stack
|
---|
297 | used for depth-first traversal of the
|
---|
298 | parse tree. */
|
---|
299 | int nleaves; /* Number of leaves on the parse tree. */
|
---|
300 | int nregexps; /* Count of parallel regexps being built
|
---|
301 | with dfaparse(). */
|
---|
302 | #ifdef MBS_SUPPORT
|
---|
303 | /* These stuff are used only if MB_CUR_MAX > 1 or multibyte environments. */
|
---|
304 | int nmultibyte_prop;
|
---|
305 | int *multibyte_prop;
|
---|
306 | /* The value of multibyte_prop[i] is defined by following rule.
|
---|
307 | if tokens[i] < NOTCHAR
|
---|
308 | bit 1 : tokens[i] is a single byte character, or the last-byte of
|
---|
309 | a multibyte character.
|
---|
310 | bit 0 : tokens[i] is a single byte character, or the 1st-byte of
|
---|
311 | a multibyte character.
|
---|
312 | if tokens[i] = MBCSET
|
---|
313 | ("the index of mbcsets correspnd to this operator" << 2) + 3
|
---|
314 |
|
---|
315 | e.g.
|
---|
316 | tokens
|
---|
317 | = 'single_byte_a', 'multi_byte_A', single_byte_b'
|
---|
318 | = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
|
---|
319 | multibyte_prop
|
---|
320 | = 3 , 1 , 0 , 2 , 3
|
---|
321 | */
|
---|
322 |
|
---|
323 | /* Array of the bracket expressoin in the DFA. */
|
---|
324 | struct mb_char_classes *mbcsets;
|
---|
325 | int nmbcsets;
|
---|
326 | int mbcsets_alloc;
|
---|
327 | #endif
|
---|
328 |
|
---|
329 | /* Stuff owned by the state builder. */
|
---|
330 | dfa_state *states; /* States of the dfa. */
|
---|
331 | int sindex; /* Index for adding new states. */
|
---|
332 | int salloc; /* Number of states currently allocated. */
|
---|
333 |
|
---|
334 | /* Stuff built by the structure analyzer. */
|
---|
335 | position_set *follows; /* Array of follow sets, indexed by position
|
---|
336 | index. The follow of a position is the set
|
---|
337 | of positions containing characters that
|
---|
338 | could conceivably follow a character
|
---|
339 | matching the given position in a string
|
---|
340 | matching the regexp. Allocated to the
|
---|
341 | maximum possible position index. */
|
---|
342 | int searchflag; /* True if we are supposed to build a searching
|
---|
343 | as opposed to an exact matcher. A searching
|
---|
344 | matcher finds the first and shortest string
|
---|
345 | matching a regexp anywhere in the buffer,
|
---|
346 | whereas an exact matcher finds the longest
|
---|
347 | string matching, but anchored to the
|
---|
348 | beginning of the buffer. */
|
---|
349 |
|
---|
350 | /* Stuff owned by the executor. */
|
---|
351 | int tralloc; /* Number of transition tables that have
|
---|
352 | slots so far. */
|
---|
353 | int trcount; /* Number of transition tables that have
|
---|
354 | actually been built. */
|
---|
355 | int **trans; /* Transition tables for states that can
|
---|
356 | never accept. If the transitions for a
|
---|
357 | state have not yet been computed, or the
|
---|
358 | state could possibly accept, its entry in
|
---|
359 | this table is NULL. */
|
---|
360 | int **realtrans; /* Trans always points to realtrans + 1; this
|
---|
361 | is so trans[-1] can contain NULL. */
|
---|
362 | int **fails; /* Transition tables after failing to accept
|
---|
363 | on a state that potentially could do so. */
|
---|
364 | int *success; /* Table of acceptance conditions used in
|
---|
365 | dfaexec and computed in build_state. */
|
---|
366 | int *newlines; /* Transitions on newlines. The entry for a
|
---|
367 | newline in any transition table is always
|
---|
368 | -1 so we can count lines without wasting
|
---|
369 | too many cycles. The transition for a
|
---|
370 | newline is stored separately and handled
|
---|
371 | as a special case. Newline is also used
|
---|
372 | as a sentinel at the end of the buffer. */
|
---|
373 | struct dfamust *musts; /* List of strings, at least one of which
|
---|
374 | is known to appear in any r.e. matching
|
---|
375 | the dfa. */
|
---|
376 | };
|
---|
377 |
|
---|
378 | /* Some macros for user access to dfa internals. */
|
---|
379 |
|
---|
380 | /* ACCEPTING returns true if s could possibly be an accepting state of r. */
|
---|
381 | #define ACCEPTING(s, r) ((r).states[s].constraint)
|
---|
382 |
|
---|
383 | /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
|
---|
384 | specified context. */
|
---|
385 | #define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \
|
---|
386 | SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, \
|
---|
387 | prevn, currn, prevl, currl)
|
---|
388 |
|
---|
389 | /* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
|
---|
390 | regexps that a given state could accept. Parallel regexps are numbered
|
---|
391 | starting at 1. */
|
---|
392 | #define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end)
|
---|
393 |
|
---|
394 | /* Entry points. */
|
---|
395 |
|
---|
396 | /* dfasyntax() takes three arguments; the first sets the syntax bits described
|
---|
397 | earlier in this file, the second sets the case-folding flag, and the
|
---|
398 | third specifies the line terminator. */
|
---|
399 | extern void dfasyntax PARAMS ((reg_syntax_t, int, unsigned char));
|
---|
400 |
|
---|
401 | /* Compile the given string of the given length into the given struct dfa.
|
---|
402 | Final argument is a flag specifying whether to build a searching or an
|
---|
403 | exact matcher. */
|
---|
404 | extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
|
---|
405 |
|
---|
406 | /* Execute the given struct dfa on the buffer of characters. The
|
---|
407 | first char * points to the beginning, and the second points to the
|
---|
408 | first character after the end of the buffer, which must be a writable
|
---|
409 | place so a sentinel end-of-buffer marker can be stored there. The
|
---|
410 | second-to-last argument is a flag telling whether to allow newlines to
|
---|
411 | be part of a string matching the regexp. The next-to-last argument,
|
---|
412 | if non-NULL, points to a place to increment every time we see a
|
---|
413 | newline. The final argument, if non-NULL, points to a flag that will
|
---|
414 | be set if further examination by a backtracking matcher is needed in
|
---|
415 | order to verify backreferencing; otherwise the flag will be cleared.
|
---|
416 | Returns NULL if no match is found, or a pointer to the first
|
---|
417 | character after the first & shortest matching string in the buffer. */
|
---|
418 | extern char *dfaexec PARAMS ((struct dfa *, char const *, char *, int, int *, int *));
|
---|
419 |
|
---|
420 | /* Free the storage held by the components of a struct dfa. */
|
---|
421 | extern void dfafree PARAMS ((struct dfa *));
|
---|
422 |
|
---|
423 | /* Entry points for people who know what they're doing. */
|
---|
424 |
|
---|
425 | /* Initialize the components of a struct dfa. */
|
---|
426 | extern void dfainit PARAMS ((struct dfa *));
|
---|
427 |
|
---|
428 | /* Incrementally parse a string of given length into a struct dfa. */
|
---|
429 | extern void dfaparse PARAMS ((char const *, size_t, struct dfa *));
|
---|
430 |
|
---|
431 | /* Analyze a parsed regexp; second argument tells whether to build a searching
|
---|
432 | or an exact matcher. */
|
---|
433 | extern void dfaanalyze PARAMS ((struct dfa *, int));
|
---|
434 |
|
---|
435 | /* Compute, for each possible character, the transitions out of a given
|
---|
436 | state, storing them in an array of integers. */
|
---|
437 | extern void dfastate PARAMS ((int, struct dfa *, int []));
|
---|
438 |
|
---|
439 | /* Error handling. */
|
---|
440 |
|
---|
441 | /* dfaerror() is called by the regexp routines whenever an error occurs. It
|
---|
442 | takes a single argument, a NUL-terminated string describing the error.
|
---|
443 | The user must supply a dfaerror. */
|
---|
444 | extern void dfaerror PARAMS ((const char *));
|
---|