source: trunk/src/sed/sed/regexp.c

Last change on this file was 3613, checked in by bird, 10 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

File size: 10.0 KB
RevLine 
[599]1/* GNU SED, a batch stream editor.
[3613]2 Copyright (C) 1999-2022 Free Software Foundation, Inc.
[599]3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
[3613]6 the Free Software Foundation; either version 3, or (at your option)
[599]7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
[3613]15 along with this program; If not, see <https://www.gnu.org/licenses/>. */
[599]16
17#include "sed.h"
18
19#include <ctype.h>
[3613]20#include <limits.h>
21#include <string.h>
[599]22#include <stdio.h>
[3613]23#include <stdlib.h>
[599]24
[3613]25#include "xalloc.h"
26
/* N_() wraps a string literal in gettext_noop() when that macro is
   available; otherwise it leaves the literal unchanged.  */
#ifndef gettext_noop
# define N_(String) (String)
#else
# define N_(String) gettext_noop(String)
#endif

extern bool use_extended_syntax_p;

/* Two diagnostics packed into a single array, separated by an embedded
   NUL byte.  NO_REGEX addresses the first message; BAD_MODIF addresses
   the second one by skipping the size of the first literal, which
   includes its terminating NUL.  */
static const char errors[] =
  "no previous regular expression\0"
  "cannot specify modifiers on empty regexp";

#define NO_REGEX (errors)
#define BAD_MODIF (NO_REGEX + sizeof(N_("no previous regular expression")))
41
42
/* Error callback for the DFA code: hand MESG to panic(), passing it
   as an argument to a "%s" format rather than as the format itself.  */
void
dfaerror (char const *mesg)
{
  panic ("%s", mesg);
}
48
/* Warning callback for the DFA code.  When the POSIXLY_CORRECT
   environment variable is set the warning is ignored; otherwise it is
   forwarded to dfaerror().  */
void
dfawarn (char const *mesg)
{
  if (getenv ("POSIXLY_CORRECT") != NULL)
    return;

  dfaerror (mesg);
}
55
56
[599]57static void
[3613]58compile_regex_1 (struct regex *new_regex, int needed_sub)
[599]59{
[3613]60 const char *error;
61 int syntax = ((extended_regexp_flags & REG_EXTENDED)
62 ? RE_SYNTAX_POSIX_EXTENDED
63 : RE_SYNTAX_POSIX_BASIC);
[599]64
[3613]65 syntax &= ~RE_DOT_NOT_NULL;
66 syntax |= RE_NO_POSIX_BACKTRACKING;
67
68 switch (posixicity)
[599]69 {
[3613]70 case POSIXLY_EXTENDED:
71 syntax &= ~RE_UNMATCHED_RIGHT_PAREN_ORD;
72 break;
73 case POSIXLY_CORRECT:
74 syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD;
75 break;
76 case POSIXLY_BASIC:
77 syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS;
78 if (!(extended_regexp_flags & REG_EXTENDED))
79 syntax |= RE_LIMITED_OPS;
80 break;
[599]81 }
82
[3613]83 if (new_regex->flags & REG_ICASE)
84 syntax |= RE_ICASE;
85 else
86 new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
[599]87 syntax |= needed_sub ? 0 : RE_NO_SUB;
88
89 /* If REG_NEWLINE is set, newlines are treated differently. */
90 if (new_regex->flags & REG_NEWLINE)
91 {
92 /* REG_NEWLINE implies neither . nor [^...] match newline. */
93 syntax &= ~RE_DOT_NEWLINE;
94 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
95 }
96
97 re_set_syntax (syntax);
98 error = re_compile_pattern (new_regex->re, new_regex->sz,
[3613]99 &new_regex->pattern);
100 new_regex->pattern.newline_anchor =
101 buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0;
[599]102
103 new_regex->pattern.translate = NULL;
104#ifndef RE_ICASE
105 if (new_regex->flags & REG_ICASE)
106 {
[3613]107 static char translate[1 << (sizeof (char) * 8)];
[599]108 int i;
[3613]109 for (i = 0; i < sizeof (translate) / sizeof (char); i++)
110 translate[i] = tolower (i);
[599]111
112 new_regex->pattern.translate = translate;
113 }
114#endif
115
116 if (error)
[3613]117 bad_prog (error);
[599]118
119 /* Just to be sure, I mark this as not POSIXLY_CORRECT behavior */
120 if (needed_sub
121 && new_regex->pattern.re_nsub < needed_sub - 1
122 && posixicity == POSIXLY_EXTENDED)
123 {
124 char buf[200];
[3613]125 sprintf (buf, _("invalid reference \\%d on `s' command's RHS"),
126 needed_sub - 1);
127 bad_prog (buf);
[599]128 }
[3613]129
130 int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL;
131 new_regex->dfa = dfaalloc ();
132 dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts);
133 dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1);
134
135 /* The patterns which consist of only ^ or $ often appear in
136 substitution, but regex and dfa are not good at them, as regex does
137 not build fastmap, and as all in buffer must be scanned for $. So
138 we mark them to handle manually. */
139 if (new_regex->sz == 1)
140 {
141 if (new_regex->re[0] == '^')
142 new_regex->begline = true;
143 if (new_regex->re[0] == '$')
144 new_regex->endline = true;
145 }
[599]146}
147
148struct regex *
[3613]149compile_regex (struct buffer *b, int flags, int needed_sub)
[599]150{
151 struct regex *new_regex;
152 size_t re_len;
153
154 /* // matches the last RE */
[3613]155 if (size_buffer (b) == 0)
[599]156 {
157 if (flags > 0)
[3613]158 bad_prog (_(BAD_MODIF));
[599]159 return NULL;
160 }
161
[3613]162 re_len = size_buffer (b);
163 new_regex = xzalloc (sizeof (struct regex) + re_len - 1);
[599]164 new_regex->flags = flags;
[3613]165 memcpy (new_regex->re, get_buffer (b), re_len);
[599]166
167 /* GNU regex does not process \t & co. */
[3613]168 new_regex->sz = normalize_text (new_regex->re, re_len, TEXT_REGEX);
[599]169
170 compile_regex_1 (new_regex, needed_sub);
171 return new_regex;
172}
173
174int
[3613]175match_regex (struct regex *regex, char *buf, size_t buflen,
176 size_t buf_start_offset, struct re_registers *regarray,
177 int regsize)
[599]178{
179 int ret;
180 static struct regex *regex_last;
181
182 /* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */
183
184 /* Keep track of the last regexp matched. */
185 if (!regex)
186 {
187 regex = regex_last;
188 if (!regex_last)
[3613]189 bad_prog (_(NO_REGEX));
[599]190 }
191 else
192 regex_last = regex;
193
[3613]194 /* gnulib's re_search uses signed-int as length */
195 if (buflen >= INT_MAX)
196 panic (_("regex input buffer length larger than INT_MAX"));
[599]197
198 if (regex->pattern.no_sub && regsize)
[3613]199 {
200 /* Re-compiling an existing regex, free the previously allocated
201 structures. */
202 if (regex->dfa)
203 {
204 dfafree (regex->dfa);
205 free (regex->dfa);
206 regex->dfa = NULL;
207 }
208 regfree (&regex->pattern);
[599]209
[3613]210 compile_regex_1 (regex, regsize);
211 }
212
[599]213 regex->pattern.regs_allocated = REGS_REALLOCATE;
214
[3613]215 /* Optimized handling for '^' and '$' patterns */
216 if (regex->begline || regex->endline)
217 {
218 size_t offset;
[599]219
[3613]220 if (regex->endline)
221 {
222 const char *p = NULL;
223
224 if (regex->flags & REG_NEWLINE)
225 p = memchr (buf + buf_start_offset, buffer_delimiter,
226 buflen - buf_start_offset);
227
228 offset = p ? p - buf : buflen;
229 }
230 else if (buf_start_offset == 0)
231 /* begline anchor, starting at beginning of the buffer. */
232 offset = 0;
233 else if (!(regex->flags & REG_NEWLINE))
234 /* begline anchor, starting in the middle of the text buffer,
235 and multiline regex is not specified - will never match.
236 Example: seq 2 | sed 'N;s/^/X/g' */
237 return 0;
238 else if (buf[buf_start_offset - 1] == buffer_delimiter)
239 /* begline anchor, starting in the middle of the text buffer,
240 with multiline match, and the current character
241 is the line delimiter - start here.
242 Example: seq 2 | sed 'N;s/^/X/mg' */
243 offset = buf_start_offset;
244 else
245 {
246 /* begline anchor, starting in the middle of the search buffer,
247 all previous optimizions didn't work: search
248 for the next line delimiter character in the buffer,
249 and start from there if found. */
250 const char *p = memchr (buf + buf_start_offset, buffer_delimiter,
251 buflen - buf_start_offset);
252
253 if (p == NULL)
254 return 0;
255
256 offset = p - buf + 1;
257 }
258
259 if (regsize)
260 {
261 size_t i;
262
263 if (!regarray->start)
264 {
265 regarray->start = XCALLOC (1, regoff_t);
266 regarray->end = XCALLOC (1, regoff_t);
267 regarray->num_regs = 1;
268 }
269
270 regarray->start[0] = offset;
271 regarray->end[0] = offset;
272
273 for (i = 1 ; i < regarray->num_regs; ++i)
274 regarray->start[i] = regarray->end[i] = -1;
275 }
276
277 return 1;
278 }
279
280 if (buf_start_offset == 0)
281 {
282 struct dfa *superset = dfasuperset (regex->dfa);
283
284 if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL))
285 return 0;
286
287 if ((!regsize && (regex->flags & REG_NEWLINE))
288 || (!superset && dfaisfast (regex->dfa)))
289 {
290 bool backref = false;
291
292 if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref))
293 return 0;
294
295 if (!regsize && (regex->flags & REG_NEWLINE) && !backref)
296 return 1;
297 }
298 }
299
300 /* If the buffer delimiter is not newline character, we cannot use
301 newline_anchor flag of regex. So do it line-by-line, and add offset
302 value to results. */
303 if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n')
304 {
305 const char *beg, *end;
306 const char *start;
307
308 beg = buf;
309
310 if (buf_start_offset > 0)
311 {
312 const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset);
313
314 if (eol != NULL)
315 beg = eol + 1;
316 }
317
318 start = buf + buf_start_offset;
319
320 for (;;)
321 {
322 end = memchr (beg, buffer_delimiter, buf + buflen - beg);
323
324 if (end == NULL)
325 end = buf + buflen;
326
327 ret = re_search (&regex->pattern, beg, end - beg,
328 start - beg, end - start,
329 regsize ? regarray : NULL);
330
331 if (ret > -1)
332 {
333 size_t i;
334
335 ret += beg - buf;
336
337 if (regsize)
338 {
339 for (i = 0; i < regarray->num_regs; ++i)
340 {
341 if (regarray->start[i] > -1)
342 regarray->start[i] += beg - buf;
343 if (regarray->end[i] > -1)
344 regarray->end[i] += beg - buf;
345 }
346 }
347
348 break;
349 }
350
351 if (end == buf + buflen)
352 break;
353
354 beg = start = end + 1;
355 }
356 }
357 else
358 ret = re_search (&regex->pattern, buf, buflen, buf_start_offset,
359 buflen - buf_start_offset,
360 regsize ? regarray : NULL);
361
[599]362 return (ret > -1);
363}
364
365
#ifdef lint
/* Free everything owned by REGEX: its DFA (if any), its compiled
   pattern buffer, and finally the structure itself.  Only built when
   `lint' is defined.  */
void
release_regex (struct regex *regex)
{
  struct dfa *const d = regex->dfa;

  if (d != NULL)
    {
      dfafree (d);
      free (d);
      regex->dfa = NULL;
    }

  regfree (&regex->pattern);
  free (regex);
}
#endif /* lint */
Note: See TracBrowser for help on using the repository browser.