1 | /*
|
---|
2 | * re.c - compile regular expressions.
|
---|
3 | */
|
---|
4 |
|
---|
5 | /*
|
---|
6 | * Copyright (C) 1991-2005 the Free Software Foundation, Inc.
|
---|
7 | *
|
---|
8 | * This file is part of GAWK, the GNU implementation of the
|
---|
9 | * AWK Programming Language.
|
---|
10 | *
|
---|
11 | * GAWK is free software; you can redistribute it and/or modify
|
---|
12 | * it under the terms of the GNU General Public License as published by
|
---|
13 | * the Free Software Foundation; either version 2 of the License, or
|
---|
14 | * (at your option) any later version.
|
---|
15 | *
|
---|
16 | * GAWK is distributed in the hope that it will be useful,
|
---|
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | * GNU General Public License for more details.
|
---|
20 | *
|
---|
21 | * You should have received a copy of the GNU General Public License
|
---|
22 | * along with this program; if not, write to the Free Software
|
---|
23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
---|
24 | */
|
---|
25 |
|
---|
26 | #include "awk.h"
|
---|
27 |
|
---|
28 | static reg_syntax_t syn;
|
---|
29 |
|
---|
30 | /* make_regexp --- generate compiled regular expressions */
|
---|
31 |
|
---|
32 | Regexp *
|
---|
33 | make_regexp(const char *s, size_t len, int ignorecase, int dfa)
|
---|
34 | {
|
---|
35 | Regexp *rp;
|
---|
36 | const char *rerr;
|
---|
37 | const char *src = s;
|
---|
38 | char *temp;
|
---|
39 | const char *end = s + len;
|
---|
40 | register char *dest;
|
---|
41 | register int c, c2;
|
---|
42 | static short first = TRUE;
|
---|
43 | static short no_dfa = FALSE;
|
---|
44 | int has_anchor = FALSE;
|
---|
45 |
|
---|
46 | /* The number of bytes in the current multibyte character.
|
---|
47 | It is 0, when the current character is a singlebyte character. */
|
---|
48 | size_t is_multibyte = 0;
|
---|
49 | #ifdef MBS_SUPPORT
|
---|
50 | mbstate_t mbs;
|
---|
51 |
|
---|
52 | if (gawk_mb_cur_max > 1)
|
---|
53 | memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
|
---|
54 | #endif
|
---|
55 |
|
---|
56 | if (first) {
|
---|
57 | first = FALSE;
|
---|
58 | no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* for debugging and testing */
|
---|
59 | }
|
---|
60 |
|
---|
61 | /* Handle escaped characters first. */
|
---|
62 |
|
---|
63 | /*
|
---|
64 | * Build a copy of the string (in dest) with the
|
---|
65 | * escaped characters translated, and generate the regex
|
---|
66 | * from that.
|
---|
67 | */
|
---|
68 | emalloc(dest, char *, len + 2, "make_regexp");
|
---|
69 | temp = dest;
|
---|
70 |
|
---|
71 | while (src < end) {
|
---|
72 | #ifdef MBS_SUPPORT
|
---|
73 | if (gawk_mb_cur_max > 1 && ! is_multibyte) {
|
---|
74 | /* The previous byte is a singlebyte character, or last byte
|
---|
75 | of a multibyte character. We check the next character. */
|
---|
76 | is_multibyte = mbrlen(src, end - src, &mbs);
|
---|
77 | if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
|
---|
78 | || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
|
---|
79 | /* We treat it as a singlebyte character. */
|
---|
80 | is_multibyte = 0;
|
---|
81 | }
|
---|
82 | }
|
---|
83 | #endif
|
---|
84 |
|
---|
85 | /* We skip multibyte character, since it must not be a special
|
---|
86 | character. */
|
---|
87 | if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
|
---|
88 | (*src == '\\')) {
|
---|
89 | c = *++src;
|
---|
90 | switch (c) {
|
---|
91 | case 'a':
|
---|
92 | case 'b':
|
---|
93 | case 'f':
|
---|
94 | case 'n':
|
---|
95 | case 'r':
|
---|
96 | case 't':
|
---|
97 | case 'v':
|
---|
98 | case 'x':
|
---|
99 | case '0':
|
---|
100 | case '1':
|
---|
101 | case '2':
|
---|
102 | case '3':
|
---|
103 | case '4':
|
---|
104 | case '5':
|
---|
105 | case '6':
|
---|
106 | case '7':
|
---|
107 | c2 = parse_escape(&src);
|
---|
108 | if (c2 < 0)
|
---|
109 | cant_happen();
|
---|
110 | /*
|
---|
111 | * Unix awk treats octal (and hex?) chars
|
---|
112 | * literally in re's, so escape regexp
|
---|
113 | * metacharacters.
|
---|
114 | */
|
---|
115 | if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
|
---|
116 | && strchr("()|*+?.^$\\[]", c2) != NULL)
|
---|
117 | *dest++ = '\\';
|
---|
118 | *dest++ = (char) c2;
|
---|
119 | break;
|
---|
120 | case '8':
|
---|
121 | case '9': /* a\9b not valid */
|
---|
122 | *dest++ = c;
|
---|
123 | src++;
|
---|
124 | break;
|
---|
125 | case 'y': /* normally \b */
|
---|
126 | /* gnu regex op */
|
---|
127 | if (! do_traditional) {
|
---|
128 | *dest++ = '\\';
|
---|
129 | *dest++ = 'b';
|
---|
130 | src++;
|
---|
131 | break;
|
---|
132 | }
|
---|
133 | /* else, fall through */
|
---|
134 | default:
|
---|
135 | *dest++ = '\\';
|
---|
136 | *dest++ = (char) c;
|
---|
137 | src++;
|
---|
138 | break;
|
---|
139 | } /* switch */
|
---|
140 | } else {
|
---|
141 | c = *src;
|
---|
142 | if (c == '^' || c == '$')
|
---|
143 | has_anchor = TRUE;
|
---|
144 | *dest++ = *src++; /* not '\\' */
|
---|
145 | }
|
---|
146 | if (gawk_mb_cur_max > 1 && is_multibyte)
|
---|
147 | is_multibyte--;
|
---|
148 | } /* while */
|
---|
149 |
|
---|
150 | *dest = '\0' ; /* Only necessary if we print dest ? */
|
---|
151 | emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
|
---|
152 | memset((char *) rp, 0, sizeof(*rp));
|
---|
153 | rp->pat.allocated = 0; /* regex will allocate the buffer */
|
---|
154 | emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
|
---|
155 |
|
---|
156 | /*
|
---|
157 | * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
|
---|
158 | * was going to turn out to be, I wouldn't have bothered with it.
|
---|
159 | *
|
---|
160 | * In the case where we have a multibyte character set, we have no
|
---|
161 | * choice but to use RE_ICASE, since the casetable is for single-byte
|
---|
162 | * character sets only.
|
---|
163 | *
|
---|
164 | * On the other hand, if we do have a single-byte character set,
|
---|
165 | * using the casetable should give a performance improvement, since
|
---|
166 | * it's computed only once, not each time a regex is compiled. We
|
---|
167 | * also think it's probably better for portability. See the
|
---|
168 | * discussion by the definition of casetable[] in eval.c.
|
---|
169 | */
|
---|
170 |
|
---|
171 | if (ignorecase) {
|
---|
172 | if (gawk_mb_cur_max > 1) {
|
---|
173 | syn |= RE_ICASE;
|
---|
174 | rp->pat.translate = NULL;
|
---|
175 | } else {
|
---|
176 | syn &= ~RE_ICASE;
|
---|
177 | rp->pat.translate = (char *) casetable;
|
---|
178 | }
|
---|
179 | } else {
|
---|
180 | rp->pat.translate = NULL;
|
---|
181 | syn &= ~RE_ICASE;
|
---|
182 | }
|
---|
183 |
|
---|
184 | dfasyntax(syn | (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
|
---|
185 | re_set_syntax(syn);
|
---|
186 |
|
---|
187 | len = dest - temp;
|
---|
188 | if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
|
---|
189 | fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */
|
---|
190 |
|
---|
191 | /* gack. this must be done *after* re_compile_pattern */
|
---|
192 | rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
|
---|
193 | if (dfa && ! no_dfa) {
|
---|
194 | dfacomp(temp, len, &(rp->dfareg), TRUE);
|
---|
195 | rp->dfa = TRUE;
|
---|
196 | } else
|
---|
197 | rp->dfa = FALSE;
|
---|
198 | rp->has_anchor = has_anchor;
|
---|
199 |
|
---|
200 | free(temp);
|
---|
201 | return rp;
|
---|
202 | }
|
---|
203 |
|
---|
204 | /* research --- do a regexp search. use dfa if possible */
|
---|
205 |
|
---|
206 | int
|
---|
207 | research(Regexp *rp, register char *str, int start,
|
---|
208 | register size_t len, int flags)
|
---|
209 | {
|
---|
210 | const char *ret = str;
|
---|
211 | int try_backref;
|
---|
212 | int need_start;
|
---|
213 | int no_bol;
|
---|
214 | int res;
|
---|
215 |
|
---|
216 | need_start = ((flags & RE_NEED_START) != 0);
|
---|
217 | no_bol = ((flags & RE_NO_BOL) != 0);
|
---|
218 |
|
---|
219 | if (no_bol)
|
---|
220 | rp->pat.not_bol = 1;
|
---|
221 |
|
---|
222 | /*
|
---|
223 | * Always do dfa search if can; if it fails, then even if
|
---|
224 | * need_start is true, we won't bother with the regex search.
|
---|
225 | *
|
---|
226 | * The dfa matcher doesn't have a no_bol flag, so don't bother
|
---|
227 | * trying it in that case.
|
---|
228 | */
|
---|
229 | if (rp->dfa && ! no_bol) {
|
---|
230 | char save;
|
---|
231 | int count = 0;
|
---|
232 | /*
|
---|
233 | * dfa likes to stick a '\n' right after the matched
|
---|
234 | * text. So we just save and restore the character.
|
---|
235 | */
|
---|
236 | save = str[start+len];
|
---|
237 | ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
|
---|
238 | &count, &try_backref);
|
---|
239 | str[start+len] = save;
|
---|
240 | }
|
---|
241 |
|
---|
242 | if (ret) {
|
---|
243 | if (need_start || rp->dfa == FALSE || try_backref) {
|
---|
244 | /*
|
---|
245 | * Passing NULL as last arg speeds up search for cases
|
---|
246 | * where we don't need the start/end info.
|
---|
247 | */
|
---|
248 | res = re_search(&(rp->pat), str, start+len,
|
---|
249 | start, len, need_start ? &(rp->regs) : NULL);
|
---|
250 | } else
|
---|
251 | res = 1;
|
---|
252 | } else
|
---|
253 | res = -1;
|
---|
254 |
|
---|
255 | rp->pat.not_bol = 0;
|
---|
256 | return res;
|
---|
257 | }
|
---|
258 |
|
---|
259 | /* refree --- free up the dynamic memory used by a compiled regexp */
|
---|
260 |
|
---|
261 | void
|
---|
262 | refree(Regexp *rp)
|
---|
263 | {
|
---|
264 | /*
|
---|
265 | * This isn't malloced, don't let regfree free it.
|
---|
266 | * (This is strictly necessary only for the old
|
---|
267 | * version of regex, but it's a good idea to keep it
|
---|
268 | * here in case regex internals change in the future.)
|
---|
269 | */
|
---|
270 | rp->pat.translate = NULL;
|
---|
271 |
|
---|
272 | regfree(& rp->pat);
|
---|
273 | if (rp->regs.start)
|
---|
274 | free(rp->regs.start);
|
---|
275 | if (rp->regs.end)
|
---|
276 | free(rp->regs.end);
|
---|
277 | if (rp->dfa)
|
---|
278 | dfafree(&(rp->dfareg));
|
---|
279 | free(rp);
|
---|
280 | }
|
---|
281 |
|
---|
282 | /* dfaerror --- print an error message for the dfa routines */
|
---|
283 |
|
---|
284 | void
|
---|
285 | dfaerror(const char *s)
|
---|
286 | {
|
---|
287 | fatal("%s", s);
|
---|
288 | }
|
---|
289 |
|
---|
290 | /* re_update --- recompile a dynamic regexp */
|
---|
291 |
|
---|
292 | Regexp *
|
---|
293 | re_update(NODE *t)
|
---|
294 | {
|
---|
295 | NODE *t1;
|
---|
296 |
|
---|
297 | if ((t->re_flags & CASE) == IGNORECASE) {
|
---|
298 | if ((t->re_flags & CONST) != 0) {
|
---|
299 | assert(t->type == Node_regex);
|
---|
300 | return t->re_reg;
|
---|
301 | }
|
---|
302 | t1 = force_string(tree_eval(t->re_exp));
|
---|
303 | if (t->re_text != NULL) {
|
---|
304 | if (cmp_nodes(t->re_text, t1) == 0) {
|
---|
305 | free_temp(t1);
|
---|
306 | return t->re_reg;
|
---|
307 | }
|
---|
308 | unref(t->re_text);
|
---|
309 | }
|
---|
310 | t->re_text = dupnode(t1);
|
---|
311 | free_temp(t1);
|
---|
312 | }
|
---|
313 | if (t->re_reg != NULL)
|
---|
314 | refree(t->re_reg);
|
---|
315 | if (t->re_cnt > 0)
|
---|
316 | t->re_cnt++;
|
---|
317 | if (t->re_cnt > 10)
|
---|
318 | t->re_cnt = 0;
|
---|
319 | if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
|
---|
320 | t1 = force_string(tree_eval(t->re_exp));
|
---|
321 | unref(t->re_text);
|
---|
322 | t->re_text = dupnode(t1);
|
---|
323 | free_temp(t1);
|
---|
324 | }
|
---|
325 | t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
|
---|
326 | IGNORECASE, t->re_cnt);
|
---|
327 | t->re_flags &= ~CASE;
|
---|
328 | t->re_flags |= IGNORECASE;
|
---|
329 | return t->re_reg;
|
---|
330 | }
|
---|
331 |
|
---|
332 | /* resetup --- choose what kind of regexps we match */
|
---|
333 |
|
---|
334 | void
|
---|
335 | resetup()
|
---|
336 | {
|
---|
337 | if (do_posix)
|
---|
338 | syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
|
---|
339 | else if (do_traditional)
|
---|
340 | syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
|
---|
341 | else
|
---|
342 | syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
|
---|
343 |
|
---|
344 | /*
|
---|
345 | * Interval expressions are off by default, since it's likely to
|
---|
346 | * break too many old programs to have them on.
|
---|
347 | */
|
---|
348 | if (do_intervals)
|
---|
349 | syn |= RE_INTERVALS;
|
---|
350 |
|
---|
351 | (void) re_set_syntax(syn);
|
---|
352 | dfasyntax(syn, FALSE, '\n');
|
---|
353 | }
|
---|
354 |
|
---|
355 | /* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
|
---|
356 |
|
---|
357 | int
|
---|
358 | avoid_dfa(NODE *re, char *str, size_t len)
|
---|
359 | {
|
---|
360 | char *end;
|
---|
361 |
|
---|
362 | if (! re->re_reg->has_anchor)
|
---|
363 | return FALSE;
|
---|
364 |
|
---|
365 | for (end = str + len; str < end; str++)
|
---|
366 | if (*str == '\n')
|
---|
367 | return TRUE;
|
---|
368 |
|
---|
369 | return FALSE;
|
---|
370 | }
|
---|
371 |
|
---|
372 | /* reisstring --- return TRUE if the RE match is a simple string match */
|
---|
373 |
|
---|
374 | int
|
---|
375 | reisstring(const char *text, size_t len, Regexp *re, const char *buf)
|
---|
376 | {
|
---|
377 | static char metas[] = ".*+(){}[]|?^$\\";
|
---|
378 | int i;
|
---|
379 | int res;
|
---|
380 | const char *matched;
|
---|
381 |
|
---|
382 | /* simple checking for has meta characters in re */
|
---|
383 | for (i = 0; i < len; i++) {
|
---|
384 | if (strchr(metas, text[i]) != NULL) {
|
---|
385 | return FALSE; /* give up early, can't be string match */
|
---|
386 | }
|
---|
387 | }
|
---|
388 |
|
---|
389 | /* make accessable to gdb */
|
---|
390 | matched = &buf[RESTART(re, buf)];
|
---|
391 |
|
---|
392 | res = (memcmp(text, matched, len) == 0);
|
---|
393 |
|
---|
394 | return res;
|
---|
395 | }
|
---|
396 |
|
---|
397 | /* remaybelong --- return TRUE if the RE contains * ? | + */
|
---|
398 |
|
---|
399 | int
|
---|
400 | remaybelong(const char *text, size_t len)
|
---|
401 | {
|
---|
402 | while (len--) {
|
---|
403 | if (strchr("*+|?", *text++) != NULL) {
|
---|
404 | return TRUE;
|
---|
405 | }
|
---|
406 | }
|
---|
407 |
|
---|
408 | return FALSE;
|
---|
409 | }
|
---|
410 |
|
---|
411 | /* reflags2str --- make a regex flags value readable */
|
---|
412 |
|
---|
413 | const char *
|
---|
414 | reflags2str(int flagval)
|
---|
415 | {
|
---|
416 | static const struct flagtab values[] = {
|
---|
417 | { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
|
---|
418 | { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
|
---|
419 | { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
|
---|
420 | { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
|
---|
421 | { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
|
---|
422 | { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
|
---|
423 | { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
|
---|
424 | { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
|
---|
425 | { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
|
---|
426 | { RE_INTERVALS, "RE_INTERVALS" },
|
---|
427 | { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
|
---|
428 | { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
|
---|
429 | { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
|
---|
430 | { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
|
---|
431 | { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
|
---|
432 | { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
|
---|
433 | { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
|
---|
434 | { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
|
---|
435 | { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
|
---|
436 | { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
|
---|
437 | { RE_DEBUG, "RE_DEBUG" },
|
---|
438 | { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
|
---|
439 | { RE_ICASE, "RE_ICASE" },
|
---|
440 | { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
|
---|
441 | { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
|
---|
442 | { RE_NO_SUB, "RE_NO_SUB" },
|
---|
443 | { 0, NULL },
|
---|
444 | };
|
---|
445 |
|
---|
446 | return genflags2str(flagval, values);
|
---|
447 | }
|
---|