Context Navigation

regexp.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 10 months ago
src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge ^{/vendor/sed/4.1.5}/vendor/sed/current .)
File size: 10.0 KB

Line
1	/* GNU SED, a batch stream editor.
2	Copyright (C) 1999-2022 Free Software Foundation, Inc.
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; either version 3, or (at your option)
7	any later version.
8
9	This program is distributed in the hope that it will be useful,
10	but WITHOUT ANY WARRANTY; without even the implied warranty of
11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License
15	along with this program; If not, see <https://www.gnu.org/licenses/>. */
16
17	#include "sed.h"
18
19	#include <ctype.h>
20	#include <limits.h>
21	#include <string.h>
22	#include <stdio.h>
23	#include <stdlib.h>
24
25	#include "xalloc.h"
26
/* N_() expands to gettext_noop() when that macro is available and to its
   bare argument otherwise.  */
#ifdef gettext_noop
# define N_(String) gettext_noop(String)
#else
# define N_(String) (String)
#endif

extern bool use_extended_syntax_p;

/* Both diagnostics are stored back to back in a single array, separated
   by an embedded NUL, so each one can be addressed as an offset into
   `errors'.  */
static const char errors[] =
  "no previous regular expression\0"
  "cannot specify modifiers on empty regexp";

/* First message: starts at the beginning of the table.  */
#define NO_REGEX (errors)
/* Second message: starts right after the first message and its NUL
   (sizeof of the literal counts the terminator).  */
#define BAD_MODIF (NO_REGEX + sizeof(N_("no previous regular expression")))
41
42
/* Report MESG as an error by handing it to panic().
   MESG is passed as an argument to a fixed "%s" format, so any '%'
   characters it contains are printed literally.
   NOTE(review): presumably this is the error hook expected by the dfa
   module and panic() does not return -- confirm against dfa.h/utils.c.  */
void
dfaerror (char const *mesg)
{
  panic ("%s", mesg);
}
48
/* Handle a warning MESG: when the POSIXLY_CORRECT environment variable
   is set the warning is ignored, otherwise it is escalated to
   dfaerror().  */
void
dfawarn (char const *mesg)
{
  if (getenv ("POSIXLY_CORRECT") != NULL)
    return;

  dfaerror (mesg);
}
55
56
57	static void
58	compile_regex_1 (struct regex *new_regex, int needed_sub)
59	{
60	const char *error;
61	int syntax = ((extended_regexp_flags & REG_EXTENDED)
62	? RE_SYNTAX_POSIX_EXTENDED
63	: RE_SYNTAX_POSIX_BASIC);
64
65	syntax &= ~RE_DOT_NOT_NULL;
66	syntax \|= RE_NO_POSIX_BACKTRACKING;
67
68	switch (posixicity)
69	{
70	case POSIXLY_EXTENDED:
71	syntax &= ~RE_UNMATCHED_RIGHT_PAREN_ORD;
72	break;
73	case POSIXLY_CORRECT:
74	syntax \|= RE_UNMATCHED_RIGHT_PAREN_ORD;
75	break;
76	case POSIXLY_BASIC:
77	syntax \|= RE_UNMATCHED_RIGHT_PAREN_ORD \| RE_NO_GNU_OPS;
78	if (!(extended_regexp_flags & REG_EXTENDED))
79	syntax \|= RE_LIMITED_OPS;
80	break;
81	}
82
83	if (new_regex->flags & REG_ICASE)
84	syntax \|= RE_ICASE;
85	else
86	new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
87	syntax \|= needed_sub ? 0 : RE_NO_SUB;
88
89	/* If REG_NEWLINE is set, newlines are treated differently. */
90	if (new_regex->flags & REG_NEWLINE)
91	{
92	/* REG_NEWLINE implies neither . nor [^...] match newline. */
93	syntax &= ~RE_DOT_NEWLINE;
94	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
95	}
96
97	re_set_syntax (syntax);
98	error = re_compile_pattern (new_regex->re, new_regex->sz,
99	&new_regex->pattern);
100	new_regex->pattern.newline_anchor =
101	buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0;
102
103	new_regex->pattern.translate = NULL;
104	#ifndef RE_ICASE
105	if (new_regex->flags & REG_ICASE)
106	{
107	static char translate[1 << (sizeof (char) * 8)];
108	int i;
109	for (i = 0; i < sizeof (translate) / sizeof (char); i++)
110	translate[i] = tolower (i);
111
112	new_regex->pattern.translate = translate;
113	}
114	#endif
115
116	if (error)
117	bad_prog (error);
118
119	/* Just to be sure, I mark this as not POSIXLY_CORRECT behavior */
120	if (needed_sub
121	&& new_regex->pattern.re_nsub < needed_sub - 1
122	&& posixicity == POSIXLY_EXTENDED)
123	{
124	char buf[200];
125	sprintf (buf, _("invalid reference \\%d on `s' command's RHS"),
126	needed_sub - 1);
127	bad_prog (buf);
128	}
129
130	int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL;
131	new_regex->dfa = dfaalloc ();
132	dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts);
133	dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1);
134
135	/* The patterns which consist of only ^ or $ often appear in
136	substitution, but regex and dfa are not good at them, as regex does
137	not build fastmap, and as all in buffer must be scanned for $. So
138	we mark them to handle manually. */
139	if (new_regex->sz == 1)
140	{
141	if (new_regex->re[0] == '^')
142	new_regex->begline = true;
143	if (new_regex->re[0] == '$')
144	new_regex->endline = true;
145	}
146	}
147
148	struct regex *
149	compile_regex (struct buffer *b, int flags, int needed_sub)
150	{
151	struct regex *new_regex;
152	size_t re_len;
153
154	/* // matches the last RE */
155	if (size_buffer (b) == 0)
156	{
157	if (flags > 0)
158	bad_prog (_(BAD_MODIF));
159	return NULL;
160	}
161
162	re_len = size_buffer (b);
163	new_regex = xzalloc (sizeof (struct regex) + re_len - 1);
164	new_regex->flags = flags;
165	memcpy (new_regex->re, get_buffer (b), re_len);
166
167	/* GNU regex does not process \t & co. */
168	new_regex->sz = normalize_text (new_regex->re, re_len, TEXT_REGEX);
169
170	compile_regex_1 (new_regex, needed_sub);
171	return new_regex;
172	}
173
174	int
175	match_regex (struct regex regex, char buf, size_t buflen,
176	size_t buf_start_offset, struct re_registers *regarray,
177	int regsize)
178	{
179	int ret;
180	static struct regex *regex_last;
181
182	/* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */
183
184	/* Keep track of the last regexp matched. */
185	if (!regex)
186	{
187	regex = regex_last;
188	if (!regex_last)
189	bad_prog (_(NO_REGEX));
190	}
191	else
192	regex_last = regex;
193
194	/* gnulib's re_search uses signed-int as length */
195	if (buflen >= INT_MAX)
196	panic (_("regex input buffer length larger than INT_MAX"));
197
198	if (regex->pattern.no_sub && regsize)
199	{
200	/* Re-compiling an existing regex, free the previously allocated
201	structures. */
202	if (regex->dfa)
203	{
204	dfafree (regex->dfa);
205	free (regex->dfa);
206	regex->dfa = NULL;
207	}
208	regfree (&regex->pattern);
209
210	compile_regex_1 (regex, regsize);
211	}
212
213	regex->pattern.regs_allocated = REGS_REALLOCATE;
214
215	/* Optimized handling for '^' and '$' patterns */
216	if (regex->begline \|\| regex->endline)
217	{
218	size_t offset;
219
220	if (regex->endline)
221	{
222	const char *p = NULL;
223
224	if (regex->flags & REG_NEWLINE)
225	p = memchr (buf + buf_start_offset, buffer_delimiter,
226	buflen - buf_start_offset);
227
228	offset = p ? p - buf : buflen;
229	}
230	else if (buf_start_offset == 0)
231	/* begline anchor, starting at beginning of the buffer. */
232	offset = 0;
233	else if (!(regex->flags & REG_NEWLINE))
234	/* begline anchor, starting in the middle of the text buffer,
235	and multiline regex is not specified - will never match.
236	Example: seq 2 \| sed 'N;s/^/X/g' */
237	return 0;
238	else if (buf[buf_start_offset - 1] == buffer_delimiter)
239	/* begline anchor, starting in the middle of the text buffer,
240	with multiline match, and the current character
241	is the line delimiter - start here.
242	Example: seq 2 \| sed 'N;s/^/X/mg' */
243	offset = buf_start_offset;
244	else
245	{
246	/* begline anchor, starting in the middle of the search buffer,
247	all previous optimizions didn't work: search
248	for the next line delimiter character in the buffer,
249	and start from there if found. */
250	const char *p = memchr (buf + buf_start_offset, buffer_delimiter,
251	buflen - buf_start_offset);
252
253	if (p == NULL)
254	return 0;
255
256	offset = p - buf + 1;
257	}
258
259	if (regsize)
260	{
261	size_t i;
262
263	if (!regarray->start)
264	{
265	regarray->start = XCALLOC (1, regoff_t);
266	regarray->end = XCALLOC (1, regoff_t);
267	regarray->num_regs = 1;
268	}
269
270	regarray->start[0] = offset;
271	regarray->end[0] = offset;
272
273	for (i = 1 ; i < regarray->num_regs; ++i)
274	regarray->start[i] = regarray->end[i] = -1;
275	}
276
277	return 1;
278	}
279
280	if (buf_start_offset == 0)
281	{
282	struct dfa *superset = dfasuperset (regex->dfa);
283
284	if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL))
285	return 0;
286
287	if ((!regsize && (regex->flags & REG_NEWLINE))
288	\|\| (!superset && dfaisfast (regex->dfa)))
289	{
290	bool backref = false;
291
292	if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref))
293	return 0;
294
295	if (!regsize && (regex->flags & REG_NEWLINE) && !backref)
296	return 1;
297	}
298	}
299
300	/* If the buffer delimiter is not newline character, we cannot use
301	newline_anchor flag of regex. So do it line-by-line, and add offset
302	value to results. */
303	if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n')
304	{
305	const char beg, end;
306	const char *start;
307
308	beg = buf;
309
310	if (buf_start_offset > 0)
311	{
312	const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset);
313
314	if (eol != NULL)
315	beg = eol + 1;
316	}
317
318	start = buf + buf_start_offset;
319
320	for (;;)
321	{
322	end = memchr (beg, buffer_delimiter, buf + buflen - beg);
323
324	if (end == NULL)
325	end = buf + buflen;
326
327	ret = re_search (&regex->pattern, beg, end - beg,
328	start - beg, end - start,
329	regsize ? regarray : NULL);
330
331	if (ret > -1)
332	{
333	size_t i;
334
335	ret += beg - buf;
336
337	if (regsize)
338	{
339	for (i = 0; i < regarray->num_regs; ++i)
340	{
341	if (regarray->start[i] > -1)
342	regarray->start[i] += beg - buf;
343	if (regarray->end[i] > -1)
344	regarray->end[i] += beg - buf;
345	}
346	}
347
348	break;
349	}
350
351	if (end == buf + buflen)
352	break;
353
354	beg = start = end + 1;
355	}
356	}
357	else
358	ret = re_search (&regex->pattern, buf, buflen, buf_start_offset,
359	buflen - buf_start_offset,
360	regsize ? regarray : NULL);
361
362	return (ret > -1);
363	}
364
365
#ifdef lint
/* Free everything owned by REGEX -- its DFA (if any), its compiled
   pattern buffer -- and then REGEX itself.  Only compiled when `lint'
   is defined.  */
void
release_regex (struct regex *regex)
{
  struct dfa *compiled_dfa = regex->dfa;

  if (compiled_dfa != NULL)
    {
      dfafree (compiled_dfa);
      free (compiled_dfa);
      regex->dfa = NULL;
    }

  regfree (&regex->pattern);
  free (regex);
}
#endif /* lint */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/sed/sed/regexp.c@ 3613

Download in other formats: