Context Navigation

re.c

Visit:

Last change on this file was 3076, checked in by bird, 18 years ago
gawk 3.1.5
File size: 11.0 KB

Rev	Line
[3076]	1	/*
	2	* re.c - compile regular expressions.
	3	*/
	4
	5	/*
	6	* Copyright (C) 1991-2005 the Free Software Foundation, Inc.
	7	*
	8	* This file is part of GAWK, the GNU implementation of the
	9	* AWK Programming Language.
	10	*
	11	* GAWK is free software; you can redistribute it and/or modify
	12	* it under the terms of the GNU General Public License as published by
	13	* the Free Software Foundation; either version 2 of the License, or
	14	* (at your option) any later version.
	15	*
	16	* GAWK is distributed in the hope that it will be useful,
	17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	* GNU General Public License for more details.
	20	*
	21	* You should have received a copy of the GNU General Public License
	22	* along with this program; if not, write to the Free Software
	23	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
	24	*/
	25
	26	#include "awk.h"
	27
	28	static reg_syntax_t syn;
	29
	30	/* make_regexp --- generate compiled regular expressions */
	31
	32	Regexp *
	33	make_regexp(const char *s, size_t len, int ignorecase, int dfa)
	34	{
	35	Regexp *rp;
	36	const char *rerr;
	37	const char *src = s;
	38	char *temp;
	39	const char *end = s + len;
	40	register char *dest;
	41	register int c, c2;
	42	static short first = TRUE;
	43	static short no_dfa = FALSE;
	44	int has_anchor = FALSE;
	45
	46	/* The number of bytes in the current multibyte character.
	47	It is 0, when the current character is a singlebyte character. */
	48	size_t is_multibyte = 0;
	49	#ifdef MBS_SUPPORT
	50	mbstate_t mbs;
	51
	52	if (gawk_mb_cur_max > 1)
	53	memset(&mbs, 0, sizeof(mbstate_t)); /* Initialize. */
	54	#endif
	55
	56	if (first) {
	57	first = FALSE;
	58	no_dfa = (getenv("GAWK_NO_DFA") != NULL); /* for debugging and testing */
	59	}
	60
	61	/* Handle escaped characters first. */
	62
	63	/*
	64	* Build a copy of the string (in dest) with the
	65	* escaped characters translated, and generate the regex
	66	* from that.
	67	*/
	68	emalloc(dest, char *, len + 2, "make_regexp");
	69	temp = dest;
	70
	71	while (src < end) {
	72	#ifdef MBS_SUPPORT
	73	if (gawk_mb_cur_max > 1 && ! is_multibyte) {
	74	/* The previous byte is a singlebyte character, or last byte
	75	of a multibyte character. We check the next character. */
	76	is_multibyte = mbrlen(src, end - src, &mbs);
	77	if ((is_multibyte == 1) \|\| (is_multibyte == (size_t) -1)
	78	\|\| (is_multibyte == (size_t) -2 \|\| (is_multibyte == 0))) {
	79	/* We treat it as a singlebyte character. */
	80	is_multibyte = 0;
	81	}
	82	}
	83	#endif
	84
	85	/* We skip multibyte character, since it must not be a special
	86	character. */
	87	if ((gawk_mb_cur_max == 1 \|\| ! is_multibyte) &&
	88	(*src == '\\')) {
	89	c = *++src;
	90	switch (c) {
	91	case 'a':
	92	case 'b':
	93	case 'f':
	94	case 'n':
	95	case 'r':
	96	case 't':
	97	case 'v':
	98	case 'x':
	99	case '0':
	100	case '1':
	101	case '2':
	102	case '3':
	103	case '4':
	104	case '5':
	105	case '6':
	106	case '7':
	107	c2 = parse_escape(&src);
	108	if (c2 < 0)
	109	cant_happen();
	110	/*
	111	* Unix awk treats octal (and hex?) chars
	112	* literally in re's, so escape regexp
	113	* metacharacters.
	114	*/
	115	if (do_traditional && ! do_posix && (ISDIGIT(c) \|\| c == 'x')
	116	&& strchr("()\|*+?.^$\\[]", c2) != NULL)
	117	*dest++ = '\\';
	118	*dest++ = (char) c2;
	119	break;
	120	case '8':
	121	case '9': /* a\9b not valid */
	122	*dest++ = c;
	123	src++;
	124	break;
	125	case 'y': /* normally \b */
	126	/* gnu regex op */
	127	if (! do_traditional) {
	128	*dest++ = '\\';
	129	*dest++ = 'b';
	130	src++;
	131	break;
	132	}
	133	/* else, fall through */
	134	default:
	135	*dest++ = '\\';
	136	*dest++ = (char) c;
	137	src++;
	138	break;
	139	} /* switch */
	140	} else {
	141	c = *src;
	142	if (c == '^' \|\| c == '$')
	143	has_anchor = TRUE;
	144	dest++ = src++; /* not '\\' */
	145	}
	146	if (gawk_mb_cur_max > 1 && is_multibyte)
	147	is_multibyte--;
	148	} /* while */
	149
	150	dest = '\0' ; / Only necessary if we print dest ? */
	151	emalloc(rp, Regexp , sizeof(rp), "make_regexp");
	152	memset((char ) rp, 0, sizeof(rp));
	153	rp->pat.allocated = 0; /* regex will allocate the buffer */
	154	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
	155
	156	/*
	157	* Lo these many years ago, had I known what a P.I.T.A. IGNORECASE
	158	* was going to turn out to be, I wouldn't have bothered with it.
	159	*
	160	* In the case where we have a multibyte character set, we have no
	161	* choice but to use RE_ICASE, since the casetable is for single-byte
	162	* character sets only.
	163	*
	164	* On the other hand, if we do have a single-byte character set,
	165	* using the casetable should give a performance improvement, since
	166	* it's computed only once, not each time a regex is compiled. We
	167	* also think it's probably better for portability. See the
	168	* discussion by the definition of casetable[] in eval.c.
	169	*/
	170
	171	if (ignorecase) {
	172	if (gawk_mb_cur_max > 1) {
	173	syn \|= RE_ICASE;
	174	rp->pat.translate = NULL;
	175	} else {
	176	syn &= ~RE_ICASE;
	177	rp->pat.translate = (char *) casetable;
	178	}
	179	} else {
	180	rp->pat.translate = NULL;
	181	syn &= ~RE_ICASE;
	182	}
	183
	184	dfasyntax(syn \| (ignorecase ? RE_ICASE : 0), ignorecase ? TRUE : FALSE, '\n');
	185	re_set_syntax(syn);
	186
	187	len = dest - temp;
	188	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
	189	fatal("%s: /%s/", rerr, temp); /* rerr already gettextized inside regex routines */
	190
	191	/* gack. this must be done after re_compile_pattern */
	192	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */
	193	if (dfa && ! no_dfa) {
	194	dfacomp(temp, len, &(rp->dfareg), TRUE);
	195	rp->dfa = TRUE;
	196	} else
	197	rp->dfa = FALSE;
	198	rp->has_anchor = has_anchor;
	199
	200	free(temp);
	201	return rp;
	202	}
	203
	204	/* research --- do a regexp search. use dfa if possible */
	205
	206	int
	207	research(Regexp rp, register char str, int start,
	208	register size_t len, int flags)
	209	{
	210	const char *ret = str;
	211	int try_backref;
	212	int need_start;
	213	int no_bol;
	214	int res;
	215
	216	need_start = ((flags & RE_NEED_START) != 0);
	217	no_bol = ((flags & RE_NO_BOL) != 0);
	218
	219	if (no_bol)
	220	rp->pat.not_bol = 1;
	221
	222	/*
	223	* Always do dfa search if can; if it fails, then even if
	224	* need_start is true, we won't bother with the regex search.
	225	*
	226	* The dfa matcher doesn't have a no_bol flag, so don't bother
	227	* trying it in that case.
	228	*/
	229	if (rp->dfa && ! no_bol) {
	230	char save;
	231	int count = 0;
	232	/*
	233	* dfa likes to stick a '\n' right after the matched
	234	* text. So we just save and restore the character.
	235	*/
	236	save = str[start+len];
	237	ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
	238	&count, &try_backref);
	239	str[start+len] = save;
	240	}
	241
	242	if (ret) {
	243	if (need_start \|\| rp->dfa == FALSE \|\| try_backref) {
	244	/*
	245	* Passing NULL as last arg speeds up search for cases
	246	* where we don't need the start/end info.
	247	*/
	248	res = re_search(&(rp->pat), str, start+len,
	249	start, len, need_start ? &(rp->regs) : NULL);
	250	} else
	251	res = 1;
	252	} else
	253	res = -1;
	254
	255	rp->pat.not_bol = 0;
	256	return res;
	257	}
	258
	259	/* refree --- free up the dynamic memory used by a compiled regexp */
	260
	261	void
	262	refree(Regexp *rp)
	263	{
	264	/*
	265	* This isn't malloced, don't let regfree free it.
	266	* (This is strictly necessary only for the old
	267	* version of regex, but it's a good idea to keep it
	268	* here in case regex internals change in the future.)
	269	*/
	270	rp->pat.translate = NULL;
	271
	272	regfree(& rp->pat);
	273	if (rp->regs.start)
	274	free(rp->regs.start);
	275	if (rp->regs.end)
	276	free(rp->regs.end);
	277	if (rp->dfa)
	278	dfafree(&(rp->dfareg));
	279	free(rp);
	280	}
	281
	282	/* dfaerror --- print an error message for the dfa routines */
	283
	284	void
	285	dfaerror(const char *s)
	286	{
	287	fatal("%s", s);
	288	}
	289
	290	/* re_update --- recompile a dynamic regexp */
	291
	292	Regexp *
	293	re_update(NODE *t)
	294	{
	295	NODE *t1;
	296
	297	if ((t->re_flags & CASE) == IGNORECASE) {
	298	if ((t->re_flags & CONST) != 0) {
	299	assert(t->type == Node_regex);
	300	return t->re_reg;
	301	}
	302	t1 = force_string(tree_eval(t->re_exp));
	303	if (t->re_text != NULL) {
	304	if (cmp_nodes(t->re_text, t1) == 0) {
	305	free_temp(t1);
	306	return t->re_reg;
	307	}
	308	unref(t->re_text);
	309	}
	310	t->re_text = dupnode(t1);
	311	free_temp(t1);
	312	}
	313	if (t->re_reg != NULL)
	314	refree(t->re_reg);
	315	if (t->re_cnt > 0)
	316	t->re_cnt++;
	317	if (t->re_cnt > 10)
	318	t->re_cnt = 0;
	319	if (t->re_text == NULL \|\| (t->re_flags & CASE) != IGNORECASE) {
	320	t1 = force_string(tree_eval(t->re_exp));
	321	unref(t->re_text);
	322	t->re_text = dupnode(t1);
	323	free_temp(t1);
	324	}
	325	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
	326	IGNORECASE, t->re_cnt);
	327	t->re_flags &= ~CASE;
	328	t->re_flags \|= IGNORECASE;
	329	return t->re_reg;
	330	}
	331
	332	/* resetup --- choose what kind of regexps we match */
	333
	334	void
	335	resetup()
	336	{
	337	if (do_posix)
	338	syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */
	339	else if (do_traditional)
	340	syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */
	341	else
	342	syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */
	343
	344	/*
	345	* Interval expressions are off by default, since it's likely to
	346	* break too many old programs to have them on.
	347	*/
	348	if (do_intervals)
	349	syn \|= RE_INTERVALS;
	350
	351	(void) re_set_syntax(syn);
	352	dfasyntax(syn, FALSE, '\n');
	353	}
	354
	355	/* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */
	356
	357	int
	358	avoid_dfa(NODE re, char str, size_t len)
	359	{
	360	char *end;
	361
	362	if (! re->re_reg->has_anchor)
	363	return FALSE;
	364
	365	for (end = str + len; str < end; str++)
	366	if (*str == '\n')
	367	return TRUE;
	368
	369	return FALSE;
	370	}
	371
	372	/* reisstring --- return TRUE if the RE match is a simple string match */
	373
	374	int
	375	reisstring(const char text, size_t len, Regexp re, const char *buf)
	376	{
	377	static char metas[] = ".*+(){}[]\|?^$\\";
	378	int i;
	379	int res;
	380	const char *matched;
	381
	382	/* simple checking for has meta characters in re */
	383	for (i = 0; i < len; i++) {
	384	if (strchr(metas, text[i]) != NULL) {
	385	return FALSE; /* give up early, can't be string match */
	386	}
	387	}
	388
	389	/* make accessable to gdb */
	390	matched = &buf[RESTART(re, buf)];
	391
	392	res = (memcmp(text, matched, len) == 0);
	393
	394	return res;
	395	}
	396
	397	/* remaybelong --- return TRUE if the RE contains * ? \| + */
	398
	399	int
	400	remaybelong(const char *text, size_t len)
	401	{
	402	while (len--) {
	403	if (strchr("+\|?", text++) != NULL) {
	404	return TRUE;
	405	}
	406	}
	407
	408	return FALSE;
	409	}
	410
	411	/* reflags2str --- make a regex flags value readable */
	412
	413	const char *
	414	reflags2str(int flagval)
	415	{
	416	static const struct flagtab values[] = {
	417	{ RE_BACKSLASH_ESCAPE_IN_LISTS, "RE_BACKSLASH_ESCAPE_IN_LISTS" },
	418	{ RE_BK_PLUS_QM, "RE_BK_PLUS_QM" },
	419	{ RE_CHAR_CLASSES, "RE_CHAR_CLASSES" },
	420	{ RE_CONTEXT_INDEP_ANCHORS, "RE_CONTEXT_INDEP_ANCHORS" },
	421	{ RE_CONTEXT_INDEP_OPS, "RE_CONTEXT_INDEP_OPS" },
	422	{ RE_CONTEXT_INVALID_OPS, "RE_CONTEXT_INVALID_OPS" },
	423	{ RE_DOT_NEWLINE, "RE_DOT_NEWLINE" },
	424	{ RE_DOT_NOT_NULL, "RE_DOT_NOT_NULL" },
	425	{ RE_HAT_LISTS_NOT_NEWLINE, "RE_HAT_LISTS_NOT_NEWLINE" },
	426	{ RE_INTERVALS, "RE_INTERVALS" },
	427	{ RE_LIMITED_OPS, "RE_LIMITED_OPS" },
	428	{ RE_NEWLINE_ALT, "RE_NEWLINE_ALT" },
	429	{ RE_NO_BK_BRACES, "RE_NO_BK_BRACES" },
	430	{ RE_NO_BK_PARENS, "RE_NO_BK_PARENS" },
	431	{ RE_NO_BK_REFS, "RE_NO_BK_REFS" },
	432	{ RE_NO_BK_VBAR, "RE_NO_BK_VBAR" },
	433	{ RE_NO_EMPTY_RANGES, "RE_NO_EMPTY_RANGES" },
	434	{ RE_UNMATCHED_RIGHT_PAREN_ORD, "RE_UNMATCHED_RIGHT_PAREN_ORD" },
	435	{ RE_NO_POSIX_BACKTRACKING, "RE_NO_POSIX_BACKTRACKING" },
	436	{ RE_NO_GNU_OPS, "RE_NO_GNU_OPS" },
	437	{ RE_DEBUG, "RE_DEBUG" },
	438	{ RE_INVALID_INTERVAL_ORD, "RE_INVALID_INTERVAL_ORD" },
	439	{ RE_ICASE, "RE_ICASE" },
	440	{ RE_CARET_ANCHORS_HERE, "RE_CARET_ANCHORS_HERE" },
	441	{ RE_CONTEXT_INVALID_DUP, "RE_CONTEXT_INVALID_DUP" },
	442	{ RE_NO_SUB, "RE_NO_SUB" },
	443	{ 0, NULL },
	444	};
	445
	446	return genflags2str(flagval, values);
	447	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/gawk/3.1.5/re.c

Download in other formats: