source: trunk/essentials/sys-apps/gawk/re.c@ 3871

Last change on this file since 3871 was 3076, checked in by bird, 19 years ago

gawk 3.1.5

File size: 11.0 KB
Line 
1/*
2 * re.c - compile regular expressions.
3 */
4
5/*
6 * Copyright (C) 1991-2005 the Free Software Foundation, Inc.
7 *
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
10 *
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24 */
25
26#include "awk.h"
27
28static reg_syntax_t syn;
29
30/* make_regexp --- generate compiled regular expressions */
31
32Regexp *
33make_regexp(const char *s, size_t len, int ignorecase, int dfa)
34{
35 Regexp *rp;
36 const char *rerr;
37 const char *src = s;
38 char *temp;
39 const char *end = s + len;
40 register char *dest;
41 register int c, c2;
42 static short first = TRUE;
43 static short no_dfa = FALSE;
44 int has_anchor = FALSE;
45
46 /* The number of bytes in the current multibyte character.
47 It is 0, when the current character is a singlebyte character. */
48 size_t is_multibyte = 0;
49#ifdef MBS_SUPPORT
50 mbstate_t mbs;
51
52 if (gawk_mb_cur_max > 1)
53 memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
54#endif
55
56 if (first) {
57 first = FALSE;
58 no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* for debugging and testing */
59 }
60
61 /* Handle escaped characters first. */
62
63 /*
64 * Build a copy of the string (in dest) with the
65 * escaped characters translated, and generate the regex
66 * from that.
67 */
68 emalloc(dest, char *, len + 2, "make_regexp");
69 temp = dest;
70
71 while (src < end) {
72#ifdef MBS_SUPPORT
73 if (gawk_mb_cur_max > 1 && ! is_multibyte) {
74 /* The previous byte is a singlebyte character, or last byte
75 of a multibyte character. We check the next character. */
76 is_multibyte = mbrlen(src, end - src, &mbs);
77 if ((is_multibyte == 1) || (is_multibyte == (size_t) -1)
78 || (is_multibyte == (size_t) -2 || (is_multibyte == 0))) {
79 /* We treat it as a singlebyte character. */
80 is_multibyte = 0;
81 }
82 }
83#endif
84
85 /* We skip multibyte character, since it must not be a special
86 character. */
87 if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
88 (*src == '\\')) {
89 c = *++src;
90 switch (c) {
91 case 'a':
92 case 'b':
93 case 'f':
94 case 'n':
95 case 'r':
96 case 't':
97 case 'v':
98 case 'x':
99 case '0':
100 case '1':
101 case '2':
102 case '3':
103 case '4':
104 case '5':
105 case '6':
106 case '7':
107 c2 = parse_escape(&src);
108 if (c2 < 0)
109 cant_happen();
110 /*
111 * Unix awk treats octal (and hex?) chars
112 * literally in re's, so escape regexp
113 * metacharacters.
114 */
115 if (do_traditional && ! do_posix && (ISDIGIT(c) || c == 'x')
116 && strchr("()|*+?.^$\\[]", c2) != NULL)
117 *dest++ = '\\';
118 *dest++ = (char) c2;
119 break;
120 case '8':
121 case '9': /* a\9b not valid */
122 *dest++ = c;
123 src++;
124 break;
125 case 'y': /* normally \b */
126 /* gnu regex op */
127 if (! do_traditional) {
128 *dest++ = '\\';
129 *dest++ = 'b';
130 src++;
131 break;
132 }
133 /* else, fall through */
134 default:
135 *dest++ = '\\';
136 *dest++ = (char) c;
137 src++;
138 break;
139 } /* switch */
140 } else {
141 c = *src;
142 if (c == '^' || c == '$')
143 has_anchor = TRUE;
144 *dest++ = *src++; /* not '\\' */
145 }
146 if (gawk_mb_cur_max > 1 && is_multibyte)
147 is_multibyte--;
148 } /* while */
149
150 *dest = '\0' ; /* Only necessary if we print dest ? */
151 emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
152 memset((char *) rp, 0, sizeof(*rp));
153 rp->pat.allocated = 0; /* regex will allocate the buffer */
154 emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
155
156 /*
157 * Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
158 * was going to turn out to be, I wouldn't have bothered with it.
159 *
160 * In the case where we have a multibyte character set, we have no
161 * choice but to use RE_ICASE, since the casetable is for single-byte
162 * character sets only.
163 *
164 * On the other hand, if we do have a single-byte character set,
165 * using the casetable should give a performance improvement, since
166 * it's computed only once, not each time a regex is compiled. We
167 * also think it's probably better for portability. See the
168 * discussion by the definition of casetable[] in eval.c.
169 */
170
171 if (ignorecase) {
172 if (gawk_mb_cur_max > 1) {
173 syn |= RE_ICASE;
174 rp->pat.translate = NULL;
175 } else {
176 syn &= ~RE_ICASE;
177 rp->pat.translate = (char *) casetable;
178 }
179 } else {
180 rp->pat.translate = NULL;
181 syn &= ~RE_ICASE;
182 }
183
184 dfasyntax(syn | (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
185 re_set_syntax(syn);
186
187 len = dest - temp;
188 if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
189 fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */
190
191 /* gack. this must be done *after* re_compile_pattern */
192 rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
193 if (dfa && ! no_dfa) {
194 dfacomp(temp, len, &(rp->dfareg), TRUE);
195 rp->dfa = TRUE;
196 } else
197 rp->dfa = FALSE;
198 rp->has_anchor = has_anchor;
199
200 free(temp);
201 return rp;
202}
203
204/* research --- do a regexp search. use dfa if possible */
205
206int
207research(Regexp *rp, register char *str, int start,
208 register size_t len, int flags)
209{
210 const char *ret = str;
211 int try_backref;
212 int need_start;
213 int no_bol;
214 int res;
215
216 need_start = ((flags & RE_NEED_START) != 0);
217 no_bol = ((flags & RE_NO_BOL) != 0);
218
219 if (no_bol)
220 rp->pat.not_bol = 1;
221
222 /*
223 * Always do dfa search if can; if it fails, then even if
224 * need_start is true, we won't bother with the regex search.
225 *
226 * The dfa matcher doesn't have a no_bol flag, so don't bother
227 * trying it in that case.
228 */
229 if (rp->dfa && ! no_bol) {
230 char save;
231 int count = 0;
232 /*
233 * dfa likes to stick a '\n' right after the matched
234 * text. So we just save and restore the character.
235 */
236 save = str[start+len];
237 ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
238 &count, &try_backref);
239 str[start+len] = save;
240 }
241
242 if (ret) {
243 if (need_start || rp->dfa == FALSE || try_backref) {
244 /*
245 * Passing NULL as last arg speeds up search for cases
246 * where we don't need the start/end info.
247 */
248 res = re_search(&(rp->pat), str, start+len,
249 start, len, need_start ? &(rp->regs) : NULL);
250 } else
251 res = 1;
252 } else
253 res = -1;
254
255 rp->pat.not_bol = 0;
256 return res;
257}
258
259/* refree --- free up the dynamic memory used by a compiled regexp */
260
261void
262refree(Regexp *rp)
263{
264 /*
265 * This isn't malloced, don't let regfree free it.
266 * (This is strictly necessary only for the old
267 * version of regex, but it's a good idea to keep it
268 * here in case regex internals change in the future.)
269 */
270 rp->pat.translate = NULL;
271
272 regfree(& rp->pat);
273 if (rp->regs.start)
274 free(rp->regs.start);
275 if (rp->regs.end)
276 free(rp->regs.end);
277 if (rp->dfa)
278 dfafree(&(rp->dfareg));
279 free(rp);
280}
281
282/* dfaerror --- print an error message for the dfa routines */
283
284void
285dfaerror(const char *s)
286{
287 fatal("%s", s);
288}
289
290/* re_update --- recompile a dynamic regexp */
291
292Regexp *
293re_update(NODE *t)
294{
295 NODE *t1;
296
297 if ((t->re_flags & CASE) == IGNORECASE) {
298 if ((t->re_flags & CONST) != 0) {
299 assert(t->type == Node_regex);
300 return t->re_reg;
301 }
302 t1 = force_string(tree_eval(t->re_exp));
303 if (t->re_text != NULL) {
304 if (cmp_nodes(t->re_text, t1) == 0) {
305 free_temp(t1);
306 return t->re_reg;
307 }
308 unref(t->re_text);
309 }
310 t->re_text = dupnode(t1);
311 free_temp(t1);
312 }
313 if (t->re_reg != NULL)
314 refree(t->re_reg);
315 if (t->re_cnt > 0)
316 t->re_cnt++;
317 if (t->re_cnt > 10)
318 t->re_cnt = 0;
319 if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
320 t1 = force_string(tree_eval(t->re_exp));
321 unref(t->re_text);
322 t->re_text = dupnode(t1);
323 free_temp(t1);
324 }
325 t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
326 IGNORECASE, t->re_cnt);
327 t->re_flags &= ~CASE;
328 t->re_flags |= IGNORECASE;
329 return t->re_reg;
330}
331
332/* resetup --- choose what kind of regexps we match */
333
334void
335resetup()
336{
337 if (do_posix)
338 syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
339 else if (do_traditional)
340 syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
341 else
342 syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
343
344 /*
345 * Interval expressions are off by default, since it's likely to
346 * break too many old programs to have them on.
347 */
348 if (do_intervals)
349 syn |= RE_INTERVALS;
350
351 (void) re_set_syntax(syn);
352 dfasyntax(syn, FALSE, '\n');
353}
354
355/* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
356
357int
358avoid_dfa(NODE *re, char *str, size_t len)
359{
360 char *end;
361
362 if (! re->re_reg->has_anchor)
363 return FALSE;
364
365 for (end = str + len; str < end; str++)
366 if (*str == '\n')
367 return TRUE;
368
369 return FALSE;
370}
371
372/* reisstring --- return TRUE if the RE match is a simple string match */
373
374int
375reisstring(const char *text, size_t len, Regexp *re, const char *buf)
376{
377 static char metas[] = ".*+(){}[]|?^$\\";
378 int i;
379 int res;
380 const char *matched;
381
382 /* simple checking for has meta characters in re */
383 for (i = 0; i < len; i++) {
384 if (strchr(metas, text[i]) != NULL) {
385 return FALSE; /* give up early, can't be string match */
386 }
387 }
388
389 /* make accessable to gdb */
390 matched = &buf[RESTART(re, buf)];
391
392 res = (memcmp(text, matched, len) == 0);
393
394 return res;
395}
396
397/* remaybelong --- return TRUE if the RE contains * ? | + */
398
399int
400remaybelong(const char *text, size_t len)
401{
402 while (len--) {
403 if (strchr("*+|?", *text++) != NULL) {
404 return TRUE;
405 }
406 }
407
408 return FALSE;
409}
410
411/* reflags2str --- make a regex flags value readable */
412
413const char *
414reflags2str(int flagval)
415{
416 static const struct flagtab values[] = {
417 { RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
418 { RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
419 { RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
420 { RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
421 { RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
422 { RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
423 { RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
424 { RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
425 { RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
426 { RE_INTERVALS, "RE_INTERVALS" },
427 { RE_LIMITED_OPS, "RE_LIMITED_OPS" },
428 { RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
429 { RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
430 { RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
431 { RE_NO_BK_REFS, "RE_NO_BK_REFS" },
432 { RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
433 { RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
434 { RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
435 { RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
436 { RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
437 { RE_DEBUG, "RE_DEBUG" },
438 { RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
439 { RE_ICASE, "RE_ICASE" },
440 { RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
441 { RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
442 { RE_NO_SUB, "RE_NO_SUB" },
443 { 0, NULL },
444 };
445
446 return genflags2str(flagval, values);
447}
Note: See TracBrowser for help on using the repository browser.