1 | /* GNU SED, a batch stream editor.
|
---|
2 | Copyright (C) 1999-2022 Free Software Foundation, Inc.
|
---|
3 |
|
---|
4 | This program is free software; you can redistribute it and/or modify
|
---|
5 | it under the terms of the GNU General Public License as published by
|
---|
6 | the Free Software Foundation; either version 3, or (at your option)
|
---|
7 | any later version.
|
---|
8 |
|
---|
9 | This program is distributed in the hope that it will be useful,
|
---|
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
12 | GNU General Public License for more details.
|
---|
13 |
|
---|
14 | You should have received a copy of the GNU General Public License
|
---|
15 | along with this program; If not, see <https://www.gnu.org/licenses/>. */
|
---|
16 |
|
---|
17 | #include "sed.h"
|
---|
18 |
|
---|
19 | #include <ctype.h>
|
---|
20 | #include <limits.h>
|
---|
21 | #include <string.h>
|
---|
22 | #include <stdio.h>
|
---|
23 | #include <stdlib.h>
|
---|
24 |
|
---|
25 | #include "xalloc.h"
|
---|
26 |
|
---|
/* N_() wraps a string literal.  It expands to gettext_noop (String) when
   that macro is defined, and to the parenthesized literal otherwise.  */
#ifndef gettext_noop
# define N_(String) (String)
#else
# define N_(String) gettext_noop(String)
#endif

/* External flag; it is only declared, not defined, in this part of the
   file.  */
extern bool use_extended_syntax_p;
34 |
|
---|
/* The two diagnostics used in this file, packed into one array and
   separated by an embedded NUL.  They are reached by offset through the
   NO_REGEX and BAD_MODIF macros.  */
static const char errors[] =
  "no previous regular expression\0"
  "cannot specify modifiers on empty regexp";

/* First message: it starts at the beginning of the array.  */
#define NO_REGEX (errors)

/* Second message: it starts right after the first message and its NUL
   terminator, i.e. sizeof of the first literal bytes into the array.  */
#define BAD_MODIF (errors + sizeof(N_("no previous regular expression")))
41 |
|
---|
42 |
|
---|
/* Report MESG as a fatal error by handing it to panic ().

   MESG is passed as an argument to a fixed "%s" format, so any '%'
   characters it contains are printed literally instead of being
   interpreted as conversions.
   NOTE(review): presumably this is the error callback expected by the
   DFA matcher -- confirm against dfa.h.  */
void
dfaerror (char const *mesg)
{
  panic ("%s", mesg);
}
48 |
|
---|
/* Handle a warning MESG: it is silently dropped when the environment
   variable POSIXLY_CORRECT is set, and otherwise forwarded to
   dfaerror ().  */
void
dfawarn (char const *mesg)
{
  if (getenv ("POSIXLY_CORRECT") != NULL)
    return;

  dfaerror (mesg);
}
55 |
|
---|
56 |
|
---|
57 | static void
|
---|
58 | compile_regex_1 (struct regex *new_regex, int needed_sub)
|
---|
59 | {
|
---|
60 | const char *error;
|
---|
61 | int syntax = ((extended_regexp_flags & REG_EXTENDED)
|
---|
62 | ? RE_SYNTAX_POSIX_EXTENDED
|
---|
63 | : RE_SYNTAX_POSIX_BASIC);
|
---|
64 |
|
---|
65 | syntax &= ~RE_DOT_NOT_NULL;
|
---|
66 | syntax |= RE_NO_POSIX_BACKTRACKING;
|
---|
67 |
|
---|
68 | switch (posixicity)
|
---|
69 | {
|
---|
70 | case POSIXLY_EXTENDED:
|
---|
71 | syntax &= ~RE_UNMATCHED_RIGHT_PAREN_ORD;
|
---|
72 | break;
|
---|
73 | case POSIXLY_CORRECT:
|
---|
74 | syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD;
|
---|
75 | break;
|
---|
76 | case POSIXLY_BASIC:
|
---|
77 | syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS;
|
---|
78 | if (!(extended_regexp_flags & REG_EXTENDED))
|
---|
79 | syntax |= RE_LIMITED_OPS;
|
---|
80 | break;
|
---|
81 | }
|
---|
82 |
|
---|
83 | if (new_regex->flags & REG_ICASE)
|
---|
84 | syntax |= RE_ICASE;
|
---|
85 | else
|
---|
86 | new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
|
---|
87 | syntax |= needed_sub ? 0 : RE_NO_SUB;
|
---|
88 |
|
---|
89 | /* If REG_NEWLINE is set, newlines are treated differently. */
|
---|
90 | if (new_regex->flags & REG_NEWLINE)
|
---|
91 | {
|
---|
92 | /* REG_NEWLINE implies neither . nor [^...] match newline. */
|
---|
93 | syntax &= ~RE_DOT_NEWLINE;
|
---|
94 | syntax |= RE_HAT_LISTS_NOT_NEWLINE;
|
---|
95 | }
|
---|
96 |
|
---|
97 | re_set_syntax (syntax);
|
---|
98 | error = re_compile_pattern (new_regex->re, new_regex->sz,
|
---|
99 | &new_regex->pattern);
|
---|
100 | new_regex->pattern.newline_anchor =
|
---|
101 | buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0;
|
---|
102 |
|
---|
103 | new_regex->pattern.translate = NULL;
|
---|
104 | #ifndef RE_ICASE
|
---|
105 | if (new_regex->flags & REG_ICASE)
|
---|
106 | {
|
---|
107 | static char translate[1 << (sizeof (char) * 8)];
|
---|
108 | int i;
|
---|
109 | for (i = 0; i < sizeof (translate) / sizeof (char); i++)
|
---|
110 | translate[i] = tolower (i);
|
---|
111 |
|
---|
112 | new_regex->pattern.translate = translate;
|
---|
113 | }
|
---|
114 | #endif
|
---|
115 |
|
---|
116 | if (error)
|
---|
117 | bad_prog (error);
|
---|
118 |
|
---|
119 | /* Just to be sure, I mark this as not POSIXLY_CORRECT behavior */
|
---|
120 | if (needed_sub
|
---|
121 | && new_regex->pattern.re_nsub < needed_sub - 1
|
---|
122 | && posixicity == POSIXLY_EXTENDED)
|
---|
123 | {
|
---|
124 | char buf[200];
|
---|
125 | sprintf (buf, _("invalid reference \\%d on `s' command's RHS"),
|
---|
126 | needed_sub - 1);
|
---|
127 | bad_prog (buf);
|
---|
128 | }
|
---|
129 |
|
---|
130 | int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL;
|
---|
131 | new_regex->dfa = dfaalloc ();
|
---|
132 | dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts);
|
---|
133 | dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1);
|
---|
134 |
|
---|
135 | /* The patterns which consist of only ^ or $ often appear in
|
---|
136 | substitution, but regex and dfa are not good at them, as regex does
|
---|
137 | not build fastmap, and as all in buffer must be scanned for $. So
|
---|
138 | we mark them to handle manually. */
|
---|
139 | if (new_regex->sz == 1)
|
---|
140 | {
|
---|
141 | if (new_regex->re[0] == '^')
|
---|
142 | new_regex->begline = true;
|
---|
143 | if (new_regex->re[0] == '$')
|
---|
144 | new_regex->endline = true;
|
---|
145 | }
|
---|
146 | }
|
---|
147 |
|
---|
148 | struct regex *
|
---|
149 | compile_regex (struct buffer *b, int flags, int needed_sub)
|
---|
150 | {
|
---|
151 | struct regex *new_regex;
|
---|
152 | size_t re_len;
|
---|
153 |
|
---|
154 | /* // matches the last RE */
|
---|
155 | if (size_buffer (b) == 0)
|
---|
156 | {
|
---|
157 | if (flags > 0)
|
---|
158 | bad_prog (_(BAD_MODIF));
|
---|
159 | return NULL;
|
---|
160 | }
|
---|
161 |
|
---|
162 | re_len = size_buffer (b);
|
---|
163 | new_regex = xzalloc (sizeof (struct regex) + re_len - 1);
|
---|
164 | new_regex->flags = flags;
|
---|
165 | memcpy (new_regex->re, get_buffer (b), re_len);
|
---|
166 |
|
---|
167 | /* GNU regex does not process \t & co. */
|
---|
168 | new_regex->sz = normalize_text (new_regex->re, re_len, TEXT_REGEX);
|
---|
169 |
|
---|
170 | compile_regex_1 (new_regex, needed_sub);
|
---|
171 | return new_regex;
|
---|
172 | }
|
---|
173 |
|
---|
174 | int
|
---|
175 | match_regex (struct regex *regex, char *buf, size_t buflen,
|
---|
176 | size_t buf_start_offset, struct re_registers *regarray,
|
---|
177 | int regsize)
|
---|
178 | {
|
---|
179 | int ret;
|
---|
180 | static struct regex *regex_last;
|
---|
181 |
|
---|
182 | /* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */
|
---|
183 |
|
---|
184 | /* Keep track of the last regexp matched. */
|
---|
185 | if (!regex)
|
---|
186 | {
|
---|
187 | regex = regex_last;
|
---|
188 | if (!regex_last)
|
---|
189 | bad_prog (_(NO_REGEX));
|
---|
190 | }
|
---|
191 | else
|
---|
192 | regex_last = regex;
|
---|
193 |
|
---|
194 | /* gnulib's re_search uses signed-int as length */
|
---|
195 | if (buflen >= INT_MAX)
|
---|
196 | panic (_("regex input buffer length larger than INT_MAX"));
|
---|
197 |
|
---|
198 | if (regex->pattern.no_sub && regsize)
|
---|
199 | {
|
---|
200 | /* Re-compiling an existing regex, free the previously allocated
|
---|
201 | structures. */
|
---|
202 | if (regex->dfa)
|
---|
203 | {
|
---|
204 | dfafree (regex->dfa);
|
---|
205 | free (regex->dfa);
|
---|
206 | regex->dfa = NULL;
|
---|
207 | }
|
---|
208 | regfree (®ex->pattern);
|
---|
209 |
|
---|
210 | compile_regex_1 (regex, regsize);
|
---|
211 | }
|
---|
212 |
|
---|
213 | regex->pattern.regs_allocated = REGS_REALLOCATE;
|
---|
214 |
|
---|
215 | /* Optimized handling for '^' and '$' patterns */
|
---|
216 | if (regex->begline || regex->endline)
|
---|
217 | {
|
---|
218 | size_t offset;
|
---|
219 |
|
---|
220 | if (regex->endline)
|
---|
221 | {
|
---|
222 | const char *p = NULL;
|
---|
223 |
|
---|
224 | if (regex->flags & REG_NEWLINE)
|
---|
225 | p = memchr (buf + buf_start_offset, buffer_delimiter,
|
---|
226 | buflen - buf_start_offset);
|
---|
227 |
|
---|
228 | offset = p ? p - buf : buflen;
|
---|
229 | }
|
---|
230 | else if (buf_start_offset == 0)
|
---|
231 | /* begline anchor, starting at beginning of the buffer. */
|
---|
232 | offset = 0;
|
---|
233 | else if (!(regex->flags & REG_NEWLINE))
|
---|
234 | /* begline anchor, starting in the middle of the text buffer,
|
---|
235 | and multiline regex is not specified - will never match.
|
---|
236 | Example: seq 2 | sed 'N;s/^/X/g' */
|
---|
237 | return 0;
|
---|
238 | else if (buf[buf_start_offset - 1] == buffer_delimiter)
|
---|
239 | /* begline anchor, starting in the middle of the text buffer,
|
---|
240 | with multiline match, and the current character
|
---|
241 | is the line delimiter - start here.
|
---|
242 | Example: seq 2 | sed 'N;s/^/X/mg' */
|
---|
243 | offset = buf_start_offset;
|
---|
244 | else
|
---|
245 | {
|
---|
246 | /* begline anchor, starting in the middle of the search buffer,
|
---|
247 | all previous optimizions didn't work: search
|
---|
248 | for the next line delimiter character in the buffer,
|
---|
249 | and start from there if found. */
|
---|
250 | const char *p = memchr (buf + buf_start_offset, buffer_delimiter,
|
---|
251 | buflen - buf_start_offset);
|
---|
252 |
|
---|
253 | if (p == NULL)
|
---|
254 | return 0;
|
---|
255 |
|
---|
256 | offset = p - buf + 1;
|
---|
257 | }
|
---|
258 |
|
---|
259 | if (regsize)
|
---|
260 | {
|
---|
261 | size_t i;
|
---|
262 |
|
---|
263 | if (!regarray->start)
|
---|
264 | {
|
---|
265 | regarray->start = XCALLOC (1, regoff_t);
|
---|
266 | regarray->end = XCALLOC (1, regoff_t);
|
---|
267 | regarray->num_regs = 1;
|
---|
268 | }
|
---|
269 |
|
---|
270 | regarray->start[0] = offset;
|
---|
271 | regarray->end[0] = offset;
|
---|
272 |
|
---|
273 | for (i = 1 ; i < regarray->num_regs; ++i)
|
---|
274 | regarray->start[i] = regarray->end[i] = -1;
|
---|
275 | }
|
---|
276 |
|
---|
277 | return 1;
|
---|
278 | }
|
---|
279 |
|
---|
280 | if (buf_start_offset == 0)
|
---|
281 | {
|
---|
282 | struct dfa *superset = dfasuperset (regex->dfa);
|
---|
283 |
|
---|
284 | if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL))
|
---|
285 | return 0;
|
---|
286 |
|
---|
287 | if ((!regsize && (regex->flags & REG_NEWLINE))
|
---|
288 | || (!superset && dfaisfast (regex->dfa)))
|
---|
289 | {
|
---|
290 | bool backref = false;
|
---|
291 |
|
---|
292 | if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref))
|
---|
293 | return 0;
|
---|
294 |
|
---|
295 | if (!regsize && (regex->flags & REG_NEWLINE) && !backref)
|
---|
296 | return 1;
|
---|
297 | }
|
---|
298 | }
|
---|
299 |
|
---|
300 | /* If the buffer delimiter is not newline character, we cannot use
|
---|
301 | newline_anchor flag of regex. So do it line-by-line, and add offset
|
---|
302 | value to results. */
|
---|
303 | if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n')
|
---|
304 | {
|
---|
305 | const char *beg, *end;
|
---|
306 | const char *start;
|
---|
307 |
|
---|
308 | beg = buf;
|
---|
309 |
|
---|
310 | if (buf_start_offset > 0)
|
---|
311 | {
|
---|
312 | const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset);
|
---|
313 |
|
---|
314 | if (eol != NULL)
|
---|
315 | beg = eol + 1;
|
---|
316 | }
|
---|
317 |
|
---|
318 | start = buf + buf_start_offset;
|
---|
319 |
|
---|
320 | for (;;)
|
---|
321 | {
|
---|
322 | end = memchr (beg, buffer_delimiter, buf + buflen - beg);
|
---|
323 |
|
---|
324 | if (end == NULL)
|
---|
325 | end = buf + buflen;
|
---|
326 |
|
---|
327 | ret = re_search (®ex->pattern, beg, end - beg,
|
---|
328 | start - beg, end - start,
|
---|
329 | regsize ? regarray : NULL);
|
---|
330 |
|
---|
331 | if (ret > -1)
|
---|
332 | {
|
---|
333 | size_t i;
|
---|
334 |
|
---|
335 | ret += beg - buf;
|
---|
336 |
|
---|
337 | if (regsize)
|
---|
338 | {
|
---|
339 | for (i = 0; i < regarray->num_regs; ++i)
|
---|
340 | {
|
---|
341 | if (regarray->start[i] > -1)
|
---|
342 | regarray->start[i] += beg - buf;
|
---|
343 | if (regarray->end[i] > -1)
|
---|
344 | regarray->end[i] += beg - buf;
|
---|
345 | }
|
---|
346 | }
|
---|
347 |
|
---|
348 | break;
|
---|
349 | }
|
---|
350 |
|
---|
351 | if (end == buf + buflen)
|
---|
352 | break;
|
---|
353 |
|
---|
354 | beg = start = end + 1;
|
---|
355 | }
|
---|
356 | }
|
---|
357 | else
|
---|
358 | ret = re_search (®ex->pattern, buf, buflen, buf_start_offset,
|
---|
359 | buflen - buf_start_offset,
|
---|
360 | regsize ? regarray : NULL);
|
---|
361 |
|
---|
362 | return (ret > -1);
|
---|
363 | }
|
---|
364 |
|
---|
365 |
|
---|
#ifdef lint
/* Release REGEX and what it holds: the DFA (if any), the compiled
   pattern, and finally the structure itself.  REGEX must not be used
   after this call.  Only compiled when `lint' is defined.  */
void
release_regex (struct regex *regex)
{
  if (regex->dfa)
    {
      dfafree (regex->dfa);
      free (regex->dfa);
      regex->dfa = NULL;
    }
  regfree (&regex->pattern);
  free (regex);
}
#endif /* lint */