Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

re.c@ 3840

Visit:

Last change on this file since 3840 was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 11.0 KB

Line
1	/*
2	* re.c - compile regular expressions.
3	*/
4
5	/*
6	* Copyright (C) 1991-2005 the Free Software Foundation, Inc.
7	*
8	* This file is part of GAWK, the GNU implementation of the
9	* AWK Programming Language.
10	*
11	* GAWK is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* GAWK is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
24	*/
25
26	#include "awk.h"
27
28	static reg_syntax_t syn;
29
30	/* make_regexp --- generate compiled regular expressions */
31
32	Regexp *
33	make_regexp(const char *s, size_t len, int ignorecase, int dfa)
34	{
35	Regexp *rp;
36	const char *rerr;
37	const char *src = s;
38	char *temp;
39	const char *end = s + len;
40	register char *dest;
41	register int c, c2;
42	static short first = TRUE;
43	static short no_dfa = FALSE;
44	int has_anchor = FALSE;
45
46	/* The number of bytes in the current multibyte character.
47	It is 0, when the current character is a singlebyte character. */
48	size_t is_multibyte = 0;
49	#ifdef MBS_SUPPORT
50	mbstate_t mbs;
51
52	if (gawk_mb_cur_max > 1)
53	memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
54	#endif
55
56	if (first) {
57	first = FALSE;
58	no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* for debugging and testing */
59	}
60
61	/* Handle escaped characters first. */
62
63	/*
64	* Build a copy of the string (in dest) with the
65	* escaped characters translated, and generate the regex
66	* from that.
67	*/
68	emalloc(dest, char *, len + 2, "make_regexp");
69	temp = dest;
70
71	while (src < end) {
72	#ifdef MBS_SUPPORT
73	if (gawk_mb_cur_max > 1 && ! is_multibyte) {
74	/* The previous byte is a singlebyte character, or last byte
75	of a multibyte character. We check the next character. */
76	is_multibyte = mbrlen(src, end - src, &mbs);
77	if ((is_multibyte == 1) \|\| (is_multibyte == (size_t) -1)
78	\|\| (is_multibyte == (size_t) -2 \|\| (is_multibyte == 0))) {
79	/* We treat it as a singlebyte character. */
80	is_multibyte = 0;
81	}
82	}
83	#endif
84
85	/* We skip multibyte character, since it must not be a special
86	character. */
87	if ((gawk_mb_cur_max == 1 \|\| ! is_multibyte) &&
88	(*src == '\\')) {
89	c = *++src;
90	switch (c) {
91	case 'a':
92	case 'b':
93	case 'f':
94	case 'n':
95	case 'r':
96	case 't':
97	case 'v':
98	case 'x':
99	case '0':
100	case '1':
101	case '2':
102	case '3':
103	case '4':
104	case '5':
105	case '6':
106	case '7':
107	c2 = parse_escape(&src);
108	if (c2 < 0)
109	cant_happen();
110	/*
111	* Unix awk treats octal (and hex?) chars
112	* literally in re's, so escape regexp
113	* metacharacters.
114	*/
115	if (do_traditional && ! do_posix && (ISDIGIT(c) \|\| c == 'x')
116	&& strchr("()\|*+?.^$\\[]", c2) != NULL)
117	*dest++ = '\\';
118	*dest++ = (char) c2;
119	break;
120	case '8':
121	case '9': /* a\9b not valid */
122	*dest++ = c;
123	src++;
124	break;
125	case 'y': /* normally \b */
126	/* gnu regex op */
127	if (! do_traditional) {
128	*dest++ = '\\';
129	*dest++ = 'b';
130	src++;
131	break;
132	}
133	/* else, fall through */
134	default:
135	*dest++ = '\\';
136	*dest++ = (char) c;
137	src++;
138	break;
139	} /* switch */
140	} else {
141	c = *src;
142	if (c == '^' \|\| c == '$')
143	has_anchor = TRUE;
144	dest++ = src++; /* not '\\' */
145	}
146	if (gawk_mb_cur_max > 1 && is_multibyte)
147	is_multibyte--;
148	} /* while */
149
150	dest = '\0' ; / Only necessary if we print dest ? */
151	emalloc(rp, Regexp , sizeof(rp), "make_regexp");
152	memset((char ) rp, 0, sizeof(rp));
153	rp->pat.allocated = 0; /* regex will allocate the buffer */
154	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
155
156	/*
157	* Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
158	* was going to turn out to be, I wouldn't have bothered with it.
159	*
160	* In the case where we have a multibyte character set, we have no
161	* choice but to use RE_ICASE, since the casetable is for single-byte
162	* character sets only.
163	*
164	* On the other hand, if we do have a single-byte character set,
165	* using the casetable should give a performance improvement, since
166	* it's computed only once, not each time a regex is compiled. We
167	* also think it's probably better for portability. See the
168	* discussion by the definition of casetable[] in eval.c.
169	*/
170
171	if (ignorecase) {
172	if (gawk_mb_cur_max > 1) {
173	syn \|= RE_ICASE;
174	rp->pat.translate = NULL;
175	} else {
176	syn &= ~RE_ICASE;
177	rp->pat.translate = (char *) casetable;
178	}
179	} else {
180	rp->pat.translate = NULL;
181	syn &= ~RE_ICASE;
182	}
183
184	dfasyntax(syn \| (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
185	re_set_syntax(syn);
186
187	len = dest - temp;
188	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
189	fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */
190
191	/* gack. this must be done after re_compile_pattern */
192	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
193	if (dfa && ! no_dfa) {
194	dfacomp(temp, len, &(rp->dfareg), TRUE);
195	rp->dfa = TRUE;
196	} else
197	rp->dfa = FALSE;
198	rp->has_anchor = has_anchor;
199
200	free(temp);
201	return rp;
202	}
203
204	/* research --- do a regexp search. use dfa if possible */
205
206	int
207	research(Regexp rp, register char str, int start,
208	register size_t len, int flags)
209	{
210	const char *ret = str;
211	int try_backref;
212	int need_start;
213	int no_bol;
214	int res;
215
216	need_start = ((flags & RE_NEED_START) != 0);
217	no_bol = ((flags & RE_NO_BOL) != 0);
218
219	if (no_bol)
220	rp->pat.not_bol = 1;
221
222	/*
223	* Always do dfa search if can; if it fails, then even if
224	* need_start is true, we won't bother with the regex search.
225	*
226	* The dfa matcher doesn't have a no_bol flag, so don't bother
227	* trying it in that case.
228	*/
229	if (rp->dfa && ! no_bol) {
230	char save;
231	int count = 0;
232	/*
233	* dfa likes to stick a '\n' right after the matched
234	* text. So we just save and restore the character.
235	*/
236	save = str[start+len];
237	ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
238	&count, &try_backref);
239	str[start+len] = save;
240	}
241
242	if (ret) {
243	if (need_start \|\| rp->dfa == FALSE \|\| try_backref) {
244	/*
245	* Passing NULL as last arg speeds up search for cases
246	* where we don't need the start/end info.
247	*/
248	res = re_search(&(rp->pat), str, start+len,
249	start, len, need_start ? &(rp->regs) : NULL);
250	} else
251	res = 1;
252	} else
253	res = -1;
254
255	rp->pat.not_bol = 0;
256	return res;
257	}
258
259	/* refree --- free up the dynamic memory used by a compiled regexp */
260
261	void
262	refree(Regexp *rp)
263	{
264	/*
265	* This isn't malloced, don't let regfree free it.
266	* (This is strictly necessary only for the old
267	* version of regex, but it's a good idea to keep it
268	* here in case regex internals change in the future.)
269	*/
270	rp->pat.translate = NULL;
271
272	regfree(& rp->pat);
273	if (rp->regs.start)
274	free(rp->regs.start);
275	if (rp->regs.end)
276	free(rp->regs.end);
277	if (rp->dfa)
278	dfafree(&(rp->dfareg));
279	free(rp);
280	}
281
282	/* dfaerror --- print an error message for the dfa routines */
283
284	void
285	dfaerror(const char *s)
286	{
287	fatal("%s", s);
288	}
289
290	/* re_update --- recompile a dynamic regexp */
291
292	Regexp *
293	re_update(NODE *t)
294	{
295	NODE *t1;
296
297	if ((t->re_flags & CASE) == IGNORECASE) {
298	if ((t->re_flags & CONST) != 0) {
299	assert(t->type == Node_regex);
300	return t->re_reg;
301	}
302	t1 = force_string(tree_eval(t->re_exp));
303	if (t->re_text != NULL) {
304	if (cmp_nodes(t->re_text, t1) == 0) {
305	free_temp(t1);
306	return t->re_reg;
307	}
308	unref(t->re_text);
309	}
310	t->re_text = dupnode(t1);
311	free_temp(t1);
312	}
313	if (t->re_reg != NULL)
314	refree(t->re_reg);
315	if (t->re_cnt > 0)
316	t->re_cnt++;
317	if (t->re_cnt > 10)
318	t->re_cnt = 0;
319	if (t->re_text == NULL \|\| (t->re_flags & CASE) != IGNORECASE) {
320	t1 = force_string(tree_eval(t->re_exp));
321	unref(t->re_text);
322	t->re_text = dupnode(t1);
323	free_temp(t1);
324	}
325	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
326	IGNORECASE, t->re_cnt);
327	t->re_flags &= ~CASE;
328	t->re_flags \|= IGNORECASE;
329	return t->re_reg;
330	}
331
332	/* resetup --- choose what kind of regexps we match */
333
334	void
335	resetup()
336	{
337	if (do_posix)
338	syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
339	else if (do_traditional)
340	syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
341	else
342	syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
343
344	/*
345	* Interval expressions are off by default, since it's likely to
346	* break too many old programs to have them on.
347	*/
348	if (do_intervals)
349	syn \|= RE_INTERVALS;
350
351	(void) re_set_syntax(syn);
352	dfasyntax(syn, FALSE, '\n');
353	}
354
355	/* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
356
357	int
358	avoid_dfa(NODE re, char str, size_t len)
359	{
360	char *end;
361
362	if (! re->re_reg->has_anchor)
363	return FALSE;
364
365	for (end = str + len; str < end; str++)
366	if (*str == '\n')
367	return TRUE;
368
369	return FALSE;
370	}
371
372	/* reisstring --- return TRUE if the RE match is a simple string match */
373
374	int
375	reisstring(const char text, size_t len, Regexp re, const char *buf)
376	{
377	static char metas[] = ".*+(){}[]\|?^$\\";
378	int i;
379	int res;
380	const char *matched;
381
382	/* simple checking for has meta characters in re */
383	for (i = 0; i < len; i++) {
384	if (strchr(metas, text[i]) != NULL) {
385	return FALSE; /* give up early, can't be string match */
386	}
387	}
388
389	/* make accessable to gdb */
390	matched = &buf[RESTART(re, buf)];
391
392	res = (memcmp(text, matched, len) == 0);
393
394	return res;
395	}
396
397	/* remaybelong --- return TRUE if the RE contains * ? \| + */
398
399	int
400	remaybelong(const char *text, size_t len)
401	{
402	while (len--) {
403	if (strchr("+\|?", text++) != NULL) {
404	return TRUE;
405	}
406	}
407
408	return FALSE;
409	}
410
411	/* reflags2str --- make a regex flags value readable */
412
413	const char *
414	reflags2str(int flagval)
415	{
416	static const struct flagtab values[] = {
417	{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
418	{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
419	{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
420	{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
421	{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
422	{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
423	{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
424	{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
425	{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
426	{ RE_INTERVALS, "RE_INTERVALS" },
427	{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
428	{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
429	{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
430	{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
431	{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
432	{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
433	{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
434	{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
435	{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
436	{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
437	{ RE_DEBUG, "RE_DEBUG" },
438	{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
439	{ RE_ICASE, "RE_ICASE" },
440	{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
441	{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
442	{ RE_NO_SUB, "RE_NO_SUB" },
443	{ 0, NULL },
444	};
445
446	return genflags2str(flagval, values);
447	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/re.c@ 3840

Download in other formats: