[3076] | 1 | /*
|
---|
| 2 | * re.c - compile regular expressions.
|
---|
| 3 | */
|
---|
| 4 |
|
---|
| 5 | /*
|
---|
| 6 | * Copyright (C) 1991-2005 the Free Software Foundation, Inc.
|
---|
| 7 | *
|
---|
| 8 | * This file is part of GAWK, the GNU implementation of the
|
---|
| 9 | * AWK Programming Language.
|
---|
| 10 | *
|
---|
| 11 | * GAWK is free software; you can redistribute it and/or modify
|
---|
| 12 | * it under the terms of the GNU General Public License as published by
|
---|
| 13 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 14 | * (at your option) any later version.
|
---|
| 15 | *
|
---|
| 16 | * GAWK is distributed in the hope that it will be useful,
|
---|
| 17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 19 | * GNU General Public License for more details.
|
---|
| 20 | *
|
---|
| 21 | * You should have received a copy of the GNU General Public License
|
---|
| 22 | * along with this program; if not, write to the Free Software
|
---|
| 23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
---|
| 24 | */
|
---|
| 25 |
|
---|
| 26 | #include "awk.h"
|
---|
| 27 |
|
---|
| 28 | static reg_syntax_t syn;
|
---|
| 29 |
|
---|
| 30 | /* make_regexp --- generate compiled regular expressions */
|
---|
| 31 |
|
---|
| 32 | Regexp *
|
---|
| 33 | make_regexp(const char *s, size_t len, int ignorecase, int dfa)
|
---|
| 34 | {
|
---|
| 35 | Regexp *rp;
|
---|
| 36 | const char *rerr;
|
---|
| 37 | const char *src = s;
|
---|
| 38 | char *temp;
|
---|
| 39 | const char *end = s + len;
|
---|
| 40 | register char *dest;
|
---|
| 41 | register int c, c2;
|
---|
| 42 | static short first = TRUE;
|
---|
| 43 | static short no_dfa = FALSE;
|
---|
| 44 | int has_anchor = FALSE;
|
---|
| 45 |
|
---|
| 46 | /* The number of bytes in the current multibyte character.
|
---|
| 47 | It is 0, when the current character is a singlebyte character. */
|
---|
| 48 | size_t is_multibyte = 0;
|
---|
| 49 | #ifdef MBS_SUPPORT
|
---|
| 50 | mbstate_t mbs;
|
---|
| 51 |
|
---|
| 52 | if (gawk_mb_cur_max > 1)
|
---|
| 53 | memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
|
---|
| 54 | #endif
|
---|
| 55 |
|
---|
| 56 | if (first) {
|
---|
| 57 | first = FALSE;
|
---|
| 58 | no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* for debugging and testing */
|
---|
| 59 | }
|
---|
| 60 |
|
---|
| 61 | /* Handle escaped characters first. */
|
---|
| 62 |
|
---|
| 63 | /*
|
---|
| 64 | * Build a copy of the string (in dest) with the
|
---|
| 65 | * escaped characters translated, and generate the regex
|
---|
| 66 | * from that.
|
---|
| 67 | */
|
---|
| 68 | emalloc(dest, char *, len + 2, "make_regexp");
|
---|
| 69 | temp = dest;
|
---|
| 70 |
|
---|
| 71 | while (src < end) {
|
---|
| 72 | #ifdef MBS_SUPPORT
|
---|
| 73 | if (gawk_mb_cur_max > 1 && ! is_multibyte) {
|
---|
| 74 | /* The previous byte is a singlebyte character, or last byte
|
---|
| 75 | of a multibyte character. We check the next character. */
|
---|
| 76 | is_multibyte = mbrlen(src, end - src, &mbs);
|
---|
| 77 | if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
|
---|
| 78 | || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
|
---|
| 79 | /* We treat it as a singlebyte character. */
|
---|
| 80 | is_multibyte = 0;
|
---|
| 81 | }
|
---|
| 82 | }
|
---|
| 83 | #endif
|
---|
| 84 |
|
---|
| 85 | /* We skip multibyte character, since it must not be a special
|
---|
| 86 | character. */
|
---|
| 87 | if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
|
---|
| 88 | (*src == '\\')) {
|
---|
| 89 | c = *++src;
|
---|
| 90 | switch (c) {
|
---|
| 91 | case 'a':
|
---|
| 92 | case 'b':
|
---|
| 93 | case 'f':
|
---|
| 94 | case 'n':
|
---|
| 95 | case 'r':
|
---|
| 96 | case 't':
|
---|
| 97 | case 'v':
|
---|
| 98 | case 'x':
|
---|
| 99 | case '0':
|
---|
| 100 | case '1':
|
---|
| 101 | case '2':
|
---|
| 102 | case '3':
|
---|
| 103 | case '4':
|
---|
| 104 | case '5':
|
---|
| 105 | case '6':
|
---|
| 106 | case '7':
|
---|
| 107 | c2 = parse_escape(&src);
|
---|
| 108 | if (c2 < 0)
|
---|
| 109 | cant_happen();
|
---|
| 110 | /*
|
---|
| 111 | * Unix awk treats octal (and hex?) chars
|
---|
| 112 | * literally in re's, so escape regexp
|
---|
| 113 | * metacharacters.
|
---|
| 114 | */
|
---|
| 115 | if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
|
---|
| 116 | && strchr("()|*+?.^$\\[]", c2) != NULL)
|
---|
| 117 | *dest++ = '\\';
|
---|
| 118 | *dest++ = (char) c2;
|
---|
| 119 | break;
|
---|
| 120 | case '8':
|
---|
| 121 | case '9': /* a\9b not valid */
|
---|
| 122 | *dest++ = c;
|
---|
| 123 | src++;
|
---|
| 124 | break;
|
---|
| 125 | case 'y': /* normally \b */
|
---|
| 126 | /* gnu regex op */
|
---|
| 127 | if (! do_traditional) {
|
---|
| 128 | *dest++ = '\\';
|
---|
| 129 | *dest++ = 'b';
|
---|
| 130 | src++;
|
---|
| 131 | break;
|
---|
| 132 | }
|
---|
| 133 | /* else, fall through */
|
---|
| 134 | default:
|
---|
| 135 | *dest++ = '\\';
|
---|
| 136 | *dest++ = (char) c;
|
---|
| 137 | src++;
|
---|
| 138 | break;
|
---|
| 139 | } /* switch */
|
---|
| 140 | } else {
|
---|
| 141 | c = *src;
|
---|
| 142 | if (c == '^' || c == '$')
|
---|
| 143 | has_anchor = TRUE;
|
---|
| 144 | *dest++ = *src++; /* not '\\' */
|
---|
| 145 | }
|
---|
| 146 | if (gawk_mb_cur_max > 1 && is_multibyte)
|
---|
| 147 | is_multibyte--;
|
---|
| 148 | } /* while */
|
---|
| 149 |
|
---|
| 150 | *dest = '\0' ; /* Only necessary if we print dest ? */
|
---|
| 151 | emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
|
---|
| 152 | memset((char *) rp, 0, sizeof(*rp));
|
---|
| 153 | rp->pat.allocated = 0; /* regex will allocate the buffer */
|
---|
| 154 | emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
|
---|
| 155 |
|
---|
| 156 | /*
|
---|
| 157 | * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
|
---|
| 158 | * was going to turn out to be, I wouldn't have bothered with it.
|
---|
| 159 | *
|
---|
| 160 | * In the case where we have a multibyte character set, we have no
|
---|
| 161 | * choice but to use RE_ICASE, since the casetable is for single-byte
|
---|
| 162 | * character sets only.
|
---|
| 163 | *
|
---|
| 164 | * On the other hand, if we do have a single-byte character set,
|
---|
| 165 | * using the casetable should give a performance improvement, since
|
---|
| 166 | * it's computed only once, not each time a regex is compiled. We
|
---|
| 167 | * also think it's probably better for portability. See the
|
---|
| 168 | * discussion by the definition of casetable[] in eval.c.
|
---|
| 169 | */
|
---|
| 170 |
|
---|
| 171 | if (ignorecase) {
|
---|
| 172 | if (gawk_mb_cur_max > 1) {
|
---|
| 173 | syn |= RE_ICASE;
|
---|
| 174 | rp->pat.translate = NULL;
|
---|
| 175 | } else {
|
---|
| 176 | syn &= ~RE_ICASE;
|
---|
| 177 | rp->pat.translate = (char *) casetable;
|
---|
| 178 | }
|
---|
| 179 | } else {
|
---|
| 180 | rp->pat.translate = NULL;
|
---|
| 181 | syn &= ~RE_ICASE;
|
---|
| 182 | }
|
---|
| 183 |
|
---|
| 184 | dfasyntax(syn | (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
|
---|
| 185 | re_set_syntax(syn);
|
---|
| 186 |
|
---|
| 187 | len = dest - temp;
|
---|
| 188 | if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
|
---|
| 189 | fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */
|
---|
| 190 |
|
---|
| 191 | /* gack. this must be done *after* re_compile_pattern */
|
---|
| 192 | rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
|
---|
| 193 | if (dfa && ! no_dfa) {
|
---|
| 194 | dfacomp(temp, len, &(rp->dfareg), TRUE);
|
---|
| 195 | rp->dfa = TRUE;
|
---|
| 196 | } else
|
---|
| 197 | rp->dfa = FALSE;
|
---|
| 198 | rp->has_anchor = has_anchor;
|
---|
| 199 |
|
---|
| 200 | free(temp);
|
---|
| 201 | return rp;
|
---|
| 202 | }
|
---|
| 203 |
|
---|
| 204 | /* research --- do a regexp search. use dfa if possible */
|
---|
| 205 |
|
---|
| 206 | int
|
---|
| 207 | research(Regexp *rp, register char *str, int start,
|
---|
| 208 | register size_t len, int flags)
|
---|
| 209 | {
|
---|
| 210 | const char *ret = str;
|
---|
| 211 | int try_backref;
|
---|
| 212 | int need_start;
|
---|
| 213 | int no_bol;
|
---|
| 214 | int res;
|
---|
| 215 |
|
---|
| 216 | need_start = ((flags & RE_NEED_START) != 0);
|
---|
| 217 | no_bol = ((flags & RE_NO_BOL) != 0);
|
---|
| 218 |
|
---|
| 219 | if (no_bol)
|
---|
| 220 | rp->pat.not_bol = 1;
|
---|
| 221 |
|
---|
| 222 | /*
|
---|
| 223 | * Always do dfa search if can; if it fails, then even if
|
---|
| 224 | * need_start is true, we won't bother with the regex search.
|
---|
| 225 | *
|
---|
| 226 | * The dfa matcher doesn't have a no_bol flag, so don't bother
|
---|
| 227 | * trying it in that case.
|
---|
| 228 | */
|
---|
| 229 | if (rp->dfa && ! no_bol) {
|
---|
| 230 | char save;
|
---|
| 231 | int count = 0;
|
---|
| 232 | /*
|
---|
| 233 | * dfa likes to stick a '\n' right after the matched
|
---|
| 234 | * text. So we just save and restore the character.
|
---|
| 235 | */
|
---|
| 236 | save = str[start+len];
|
---|
| 237 | ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
|
---|
| 238 | &count, &try_backref);
|
---|
| 239 | str[start+len] = save;
|
---|
| 240 | }
|
---|
| 241 |
|
---|
| 242 | if (ret) {
|
---|
| 243 | if (need_start || rp->dfa == FALSE || try_backref) {
|
---|
| 244 | /*
|
---|
| 245 | * Passing NULL as last arg speeds up search for cases
|
---|
| 246 | * where we don't need the start/end info.
|
---|
| 247 | */
|
---|
| 248 | res = re_search(&(rp->pat), str, start+len,
|
---|
| 249 | start, len, need_start ? &(rp->regs) : NULL);
|
---|
| 250 | } else
|
---|
| 251 | res = 1;
|
---|
| 252 | } else
|
---|
| 253 | res = -1;
|
---|
| 254 |
|
---|
| 255 | rp->pat.not_bol = 0;
|
---|
| 256 | return res;
|
---|
| 257 | }
|
---|
| 258 |
|
---|
| 259 | /* refree --- free up the dynamic memory used by a compiled regexp */
|
---|
| 260 |
|
---|
| 261 | void
|
---|
| 262 | refree(Regexp *rp)
|
---|
| 263 | {
|
---|
| 264 | /*
|
---|
| 265 | * This isn't malloced, don't let regfree free it.
|
---|
| 266 | * (This is strictly necessary only for the old
|
---|
| 267 | * version of regex, but it's a good idea to keep it
|
---|
| 268 | * here in case regex internals change in the future.)
|
---|
| 269 | */
|
---|
| 270 | rp->pat.translate = NULL;
|
---|
| 271 |
|
---|
| 272 | regfree(& rp->pat);
|
---|
| 273 | if (rp->regs.start)
|
---|
| 274 | free(rp->regs.start);
|
---|
| 275 | if (rp->regs.end)
|
---|
| 276 | free(rp->regs.end);
|
---|
| 277 | if (rp->dfa)
|
---|
| 278 | dfafree(&(rp->dfareg));
|
---|
| 279 | free(rp);
|
---|
| 280 | }
|
---|
| 281 |
|
---|
| 282 | /* dfaerror --- print an error message for the dfa routines */
|
---|
| 283 |
|
---|
| 284 | void
|
---|
| 285 | dfaerror(const char *s)
|
---|
| 286 | {
|
---|
| 287 | fatal("%s", s);
|
---|
| 288 | }
|
---|
| 289 |
|
---|
| 290 | /* re_update --- recompile a dynamic regexp */
|
---|
| 291 |
|
---|
| 292 | Regexp *
|
---|
| 293 | re_update(NODE *t)
|
---|
| 294 | {
|
---|
| 295 | NODE *t1;
|
---|
| 296 |
|
---|
| 297 | if ((t->re_flags & CASE) == IGNORECASE) {
|
---|
| 298 | if ((t->re_flags & CONST) != 0) {
|
---|
| 299 | assert(t->type == Node_regex);
|
---|
| 300 | return t->re_reg;
|
---|
| 301 | }
|
---|
| 302 | t1 = force_string(tree_eval(t->re_exp));
|
---|
| 303 | if (t->re_text != NULL) {
|
---|
| 304 | if (cmp_nodes(t->re_text, t1) == 0) {
|
---|
| 305 | free_temp(t1);
|
---|
| 306 | return t->re_reg;
|
---|
| 307 | }
|
---|
| 308 | unref(t->re_text);
|
---|
| 309 | }
|
---|
| 310 | t->re_text = dupnode(t1);
|
---|
| 311 | free_temp(t1);
|
---|
| 312 | }
|
---|
| 313 | if (t->re_reg != NULL)
|
---|
| 314 | refree(t->re_reg);
|
---|
| 315 | if (t->re_cnt > 0)
|
---|
| 316 | t->re_cnt++;
|
---|
| 317 | if (t->re_cnt > 10)
|
---|
| 318 | t->re_cnt = 0;
|
---|
| 319 | if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
|
---|
| 320 | t1 = force_string(tree_eval(t->re_exp));
|
---|
| 321 | unref(t->re_text);
|
---|
| 322 | t->re_text = dupnode(t1);
|
---|
| 323 | free_temp(t1);
|
---|
| 324 | }
|
---|
| 325 | t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
|
---|
| 326 | IGNORECASE, t->re_cnt);
|
---|
| 327 | t->re_flags &= ~CASE;
|
---|
| 328 | t->re_flags |= IGNORECASE;
|
---|
| 329 | return t->re_reg;
|
---|
| 330 | }
|
---|
| 331 |
|
---|
| 332 | /* resetup --- choose what kind of regexps we match */
|
---|
| 333 |
|
---|
| 334 | void
|
---|
| 335 | resetup()
|
---|
| 336 | {
|
---|
| 337 | if (do_posix)
|
---|
| 338 | syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
|
---|
| 339 | else if (do_traditional)
|
---|
| 340 | syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
|
---|
| 341 | else
|
---|
| 342 | syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
|
---|
| 343 |
|
---|
| 344 | /*
|
---|
| 345 | * Interval expressions are off by default, since it's likely to
|
---|
| 346 | * break too many old programs to have them on.
|
---|
| 347 | */
|
---|
| 348 | if (do_intervals)
|
---|
| 349 | syn |= RE_INTERVALS;
|
---|
| 350 |
|
---|
| 351 | (void) re_set_syntax(syn);
|
---|
| 352 | dfasyntax(syn, FALSE, '\n');
|
---|
| 353 | }
|
---|
| 354 |
|
---|
| 355 | /* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
|
---|
| 356 |
|
---|
| 357 | int
|
---|
| 358 | avoid_dfa(NODE *re, char *str, size_t len)
|
---|
| 359 | {
|
---|
| 360 | char *end;
|
---|
| 361 |
|
---|
| 362 | if (! re->re_reg->has_anchor)
|
---|
| 363 | return FALSE;
|
---|
| 364 |
|
---|
| 365 | for (end = str + len; str < end; str++)
|
---|
| 366 | if (*str == '\n')
|
---|
| 367 | return TRUE;
|
---|
| 368 |
|
---|
| 369 | return FALSE;
|
---|
| 370 | }
|
---|
| 371 |
|
---|
| 372 | /* reisstring --- return TRUE if the RE match is a simple string match */
|
---|
| 373 |
|
---|
| 374 | int
|
---|
| 375 | reisstring(const char *text, size_t len, Regexp *re, const char *buf)
|
---|
| 376 | {
|
---|
| 377 | static char metas[] = ".*+(){}[]|?^$\\";
|
---|
| 378 | int i;
|
---|
| 379 | int res;
|
---|
| 380 | const char *matched;
|
---|
| 381 |
|
---|
| 382 | /* simple checking for has meta characters in re */
|
---|
| 383 | for (i = 0; i < len; i++) {
|
---|
| 384 | if (strchr(metas, text[i]) != NULL) {
|
---|
| 385 | return FALSE; /* give up early, can't be string match */
|
---|
| 386 | }
|
---|
| 387 | }
|
---|
| 388 |
|
---|
| 389 | /* make accessable to gdb */
|
---|
| 390 | matched = &buf[RESTART(re, buf)];
|
---|
| 391 |
|
---|
| 392 | res = (memcmp(text, matched, len) == 0);
|
---|
| 393 |
|
---|
| 394 | return res;
|
---|
| 395 | }
|
---|
| 396 |
|
---|
| 397 | /* remaybelong --- return TRUE if the RE contains * ? | + */
|
---|
| 398 |
|
---|
| 399 | int
|
---|
| 400 | remaybelong(const char *text, size_t len)
|
---|
| 401 | {
|
---|
| 402 | while (len--) {
|
---|
| 403 | if (strchr("*+|?", *text++) != NULL) {
|
---|
| 404 | return TRUE;
|
---|
| 405 | }
|
---|
| 406 | }
|
---|
| 407 |
|
---|
| 408 | return FALSE;
|
---|
| 409 | }
|
---|
| 410 |
|
---|
| 411 | /* reflags2str --- make a regex flags value readable */
|
---|
| 412 |
|
---|
| 413 | const char *
|
---|
| 414 | reflags2str(int flagval)
|
---|
| 415 | {
|
---|
| 416 | static const struct flagtab values[] = {
|
---|
| 417 | { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
|
---|
| 418 | { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
|
---|
| 419 | { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
|
---|
| 420 | { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
|
---|
| 421 | { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
|
---|
| 422 | { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
|
---|
| 423 | { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
|
---|
| 424 | { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
|
---|
| 425 | { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
|
---|
| 426 | { RE_INTERVALS, "RE_INTERVALS" },
|
---|
| 427 | { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
|
---|
| 428 | { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
|
---|
| 429 | { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
|
---|
| 430 | { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
|
---|
| 431 | { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
|
---|
| 432 | { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
|
---|
| 433 | { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
|
---|
| 434 | { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
|
---|
| 435 | { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
|
---|
| 436 | { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
|
---|
| 437 | { RE_DEBUG, "RE_DEBUG" },
|
---|
| 438 | { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
|
---|
| 439 | { RE_ICASE, "RE_ICASE" },
|
---|
| 440 | { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
|
---|
| 441 | { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
|
---|
| 442 | { RE_NO_SUB, "RE_NO_SUB" },
|
---|
| 443 | { 0, NULL },
|
---|
| 444 | };
|
---|
| 445 |
|
---|
| 446 | return genflags2str(flagval, values);
|
---|
| 447 | }
|
---|