[599] | 1 | /* GNU SED, a batch stream editor.
|
---|
[3613] | 2 | Copyright (C) 1999-2022 Free Software Foundation, Inc.
|
---|
[599] | 3 |
|
---|
| 4 | This program is free software; you can redistribute it and/or modify
|
---|
| 5 | it under the terms of the GNU General Public License as published by
|
---|
[3613] | 6 | the Free Software Foundation; either version 3, or (at your option)
|
---|
[599] | 7 | any later version.
|
---|
| 8 |
|
---|
| 9 | This program is distributed in the hope that it will be useful,
|
---|
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 12 | GNU General Public License for more details.
|
---|
| 13 |
|
---|
| 14 | You should have received a copy of the GNU General Public License
|
---|
[3613] | 15 | along with this program; If not, see <https://www.gnu.org/licenses/>. */
|
---|
[599] | 16 |
|
---|
| 17 | #include "sed.h"
|
---|
| 18 |
|
---|
| 19 | #include <ctype.h>
|
---|
[3613] | 20 | #include <limits.h>
|
---|
| 21 | #include <string.h>
|
---|
[599] | 22 | #include <stdio.h>
|
---|
[3613] | 23 | #include <stdlib.h>
|
---|
[599] | 24 |
|
---|
[3613] | 25 | #include "xalloc.h"
|
---|
| 26 |
|
---|
/* N_ tags a string literal without translating it on the spot: it
   expands to gettext_noop when that macro is available and to the
   bare (parenthesized) literal otherwise.  */
#if defined gettext_noop
# define N_(String) gettext_noop(String)
#else
# define N_(String) (String)
#endif

extern bool use_extended_syntax_p;

/* Two error messages packed into a single array.  The explicit \0
   terminates the first message, so the second one begins at the byte
   that follows it.  */
static const char errors[] =
  "no previous regular expression\0"
  "cannot specify modifiers on empty regexp";

/* First message: located at the very start of the table.  */
#define NO_REGEX (errors)

/* Second message: skip the first message together with its NUL, i.e.
   exactly sizeof of the first literal.  */
#define BAD_MODIF (NO_REGEX + sizeof(N_("no previous regular expression")))
|
---|
| 41 |
|
---|
| 42 |
|
---|
/* Error hook with the name and signature of the dfa module's error
   callback: hand MESG to panic.  MESG is passed as a "%s" argument,
   never as the format itself, so any '%' it contains is printed
   literally.  */
void
dfaerror (char const *mesg)
{
  panic ("%s", mesg);
}
|
---|
| 48 |
|
---|
/* Warning hook with the name and signature of the dfa module's warning
   callback.  A warning is escalated to dfaerror unless the environment
   variable POSIXLY_CORRECT is set, in which case it is dropped
   silently.  */
void
dfawarn (char const *mesg)
{
  if (getenv ("POSIXLY_CORRECT") != NULL)
    return;

  dfaerror (mesg);
}
|
---|
| 55 |
|
---|
| 56 |
|
---|
[599] | 57 | static void
|
---|
[3613] | 58 | compile_regex_1 (struct regex *new_regex, int needed_sub)
|
---|
[599] | 59 | {
|
---|
[3613] | 60 | const char *error;
|
---|
| 61 | int syntax = ((extended_regexp_flags & REG_EXTENDED)
|
---|
| 62 | ? RE_SYNTAX_POSIX_EXTENDED
|
---|
| 63 | : RE_SYNTAX_POSIX_BASIC);
|
---|
[599] | 64 |
|
---|
[3613] | 65 | syntax &= ~RE_DOT_NOT_NULL;
|
---|
| 66 | syntax |= RE_NO_POSIX_BACKTRACKING;
|
---|
| 67 |
|
---|
| 68 | switch (posixicity)
|
---|
[599] | 69 | {
|
---|
[3613] | 70 | case POSIXLY_EXTENDED:
|
---|
| 71 | syntax &= ~RE_UNMATCHED_RIGHT_PAREN_ORD;
|
---|
| 72 | break;
|
---|
| 73 | case POSIXLY_CORRECT:
|
---|
| 74 | syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD;
|
---|
| 75 | break;
|
---|
| 76 | case POSIXLY_BASIC:
|
---|
| 77 | syntax |= RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS;
|
---|
| 78 | if (!(extended_regexp_flags & REG_EXTENDED))
|
---|
| 79 | syntax |= RE_LIMITED_OPS;
|
---|
| 80 | break;
|
---|
[599] | 81 | }
|
---|
| 82 |
|
---|
[3613] | 83 | if (new_regex->flags & REG_ICASE)
|
---|
| 84 | syntax |= RE_ICASE;
|
---|
| 85 | else
|
---|
| 86 | new_regex->pattern.fastmap = malloc (1 << (sizeof (char) * 8));
|
---|
[599] | 87 | syntax |= needed_sub ? 0 : RE_NO_SUB;
|
---|
| 88 |
|
---|
| 89 | /* If REG_NEWLINE is set, newlines are treated differently. */
|
---|
| 90 | if (new_regex->flags & REG_NEWLINE)
|
---|
| 91 | {
|
---|
| 92 | /* REG_NEWLINE implies neither . nor [^...] match newline. */
|
---|
| 93 | syntax &= ~RE_DOT_NEWLINE;
|
---|
| 94 | syntax |= RE_HAT_LISTS_NOT_NEWLINE;
|
---|
| 95 | }
|
---|
| 96 |
|
---|
| 97 | re_set_syntax (syntax);
|
---|
| 98 | error = re_compile_pattern (new_regex->re, new_regex->sz,
|
---|
[3613] | 99 | &new_regex->pattern);
|
---|
| 100 | new_regex->pattern.newline_anchor =
|
---|
| 101 | buffer_delimiter == '\n' && (new_regex->flags & REG_NEWLINE) != 0;
|
---|
[599] | 102 |
|
---|
| 103 | new_regex->pattern.translate = NULL;
|
---|
| 104 | #ifndef RE_ICASE
|
---|
| 105 | if (new_regex->flags & REG_ICASE)
|
---|
| 106 | {
|
---|
[3613] | 107 | static char translate[1 << (sizeof (char) * 8)];
|
---|
[599] | 108 | int i;
|
---|
[3613] | 109 | for (i = 0; i < sizeof (translate) / sizeof (char); i++)
|
---|
| 110 | translate[i] = tolower (i);
|
---|
[599] | 111 |
|
---|
| 112 | new_regex->pattern.translate = translate;
|
---|
| 113 | }
|
---|
| 114 | #endif
|
---|
| 115 |
|
---|
| 116 | if (error)
|
---|
[3613] | 117 | bad_prog (error);
|
---|
[599] | 118 |
|
---|
| 119 | /* Just to be sure, I mark this as not POSIXLY_CORRECT behavior */
|
---|
| 120 | if (needed_sub
|
---|
| 121 | && new_regex->pattern.re_nsub < needed_sub - 1
|
---|
| 122 | && posixicity == POSIXLY_EXTENDED)
|
---|
| 123 | {
|
---|
| 124 | char buf[200];
|
---|
[3613] | 125 | sprintf (buf, _("invalid reference \\%d on `s' command's RHS"),
|
---|
| 126 | needed_sub - 1);
|
---|
| 127 | bad_prog (buf);
|
---|
[599] | 128 | }
|
---|
[3613] | 129 |
|
---|
| 130 | int dfaopts = buffer_delimiter == '\n' ? 0 : DFA_EOL_NUL;
|
---|
| 131 | new_regex->dfa = dfaalloc ();
|
---|
| 132 | dfasyntax (new_regex->dfa, &localeinfo, syntax, dfaopts);
|
---|
| 133 | dfacomp (new_regex->re, new_regex->sz, new_regex->dfa, 1);
|
---|
| 134 |
|
---|
| 135 | /* The patterns which consist of only ^ or $ often appear in
|
---|
| 136 | substitution, but regex and dfa are not good at them, as regex does
|
---|
| 137 | not build fastmap, and as all in buffer must be scanned for $. So
|
---|
| 138 | we mark them to handle manually. */
|
---|
| 139 | if (new_regex->sz == 1)
|
---|
| 140 | {
|
---|
| 141 | if (new_regex->re[0] == '^')
|
---|
| 142 | new_regex->begline = true;
|
---|
| 143 | if (new_regex->re[0] == '$')
|
---|
| 144 | new_regex->endline = true;
|
---|
| 145 | }
|
---|
[599] | 146 | }
|
---|
| 147 |
|
---|
| 148 | struct regex *
|
---|
[3613] | 149 | compile_regex (struct buffer *b, int flags, int needed_sub)
|
---|
[599] | 150 | {
|
---|
| 151 | struct regex *new_regex;
|
---|
| 152 | size_t re_len;
|
---|
| 153 |
|
---|
| 154 | /* // matches the last RE */
|
---|
[3613] | 155 | if (size_buffer (b) == 0)
|
---|
[599] | 156 | {
|
---|
| 157 | if (flags > 0)
|
---|
[3613] | 158 | bad_prog (_(BAD_MODIF));
|
---|
[599] | 159 | return NULL;
|
---|
| 160 | }
|
---|
| 161 |
|
---|
[3613] | 162 | re_len = size_buffer (b);
|
---|
| 163 | new_regex = xzalloc (sizeof (struct regex) + re_len - 1);
|
---|
[599] | 164 | new_regex->flags = flags;
|
---|
[3613] | 165 | memcpy (new_regex->re, get_buffer (b), re_len);
|
---|
[599] | 166 |
|
---|
| 167 | /* GNU regex does not process \t & co. */
|
---|
[3613] | 168 | new_regex->sz = normalize_text (new_regex->re, re_len, TEXT_REGEX);
|
---|
[599] | 169 |
|
---|
| 170 | compile_regex_1 (new_regex, needed_sub);
|
---|
| 171 | return new_regex;
|
---|
| 172 | }
|
---|
| 173 |
|
---|
| 174 | int
|
---|
[3613] | 175 | match_regex (struct regex *regex, char *buf, size_t buflen,
|
---|
| 176 | size_t buf_start_offset, struct re_registers *regarray,
|
---|
| 177 | int regsize)
|
---|
[599] | 178 | {
|
---|
| 179 | int ret;
|
---|
| 180 | static struct regex *regex_last;
|
---|
| 181 |
|
---|
| 182 | /* printf ("Matching from %d/%d\n", buf_start_offset, buflen); */
|
---|
| 183 |
|
---|
| 184 | /* Keep track of the last regexp matched. */
|
---|
| 185 | if (!regex)
|
---|
| 186 | {
|
---|
| 187 | regex = regex_last;
|
---|
| 188 | if (!regex_last)
|
---|
[3613] | 189 | bad_prog (_(NO_REGEX));
|
---|
[599] | 190 | }
|
---|
| 191 | else
|
---|
| 192 | regex_last = regex;
|
---|
| 193 |
|
---|
[3613] | 194 | /* gnulib's re_search uses signed-int as length */
|
---|
| 195 | if (buflen >= INT_MAX)
|
---|
| 196 | panic (_("regex input buffer length larger than INT_MAX"));
|
---|
[599] | 197 |
|
---|
| 198 | if (regex->pattern.no_sub && regsize)
|
---|
[3613] | 199 | {
|
---|
| 200 | /* Re-compiling an existing regex, free the previously allocated
|
---|
| 201 | structures. */
|
---|
| 202 | if (regex->dfa)
|
---|
| 203 | {
|
---|
| 204 | dfafree (regex->dfa);
|
---|
| 205 | free (regex->dfa);
|
---|
| 206 | regex->dfa = NULL;
|
---|
| 207 | }
|
---|
| 208 | regfree (®ex->pattern);
|
---|
[599] | 209 |
|
---|
[3613] | 210 | compile_regex_1 (regex, regsize);
|
---|
| 211 | }
|
---|
| 212 |
|
---|
[599] | 213 | regex->pattern.regs_allocated = REGS_REALLOCATE;
|
---|
| 214 |
|
---|
[3613] | 215 | /* Optimized handling for '^' and '$' patterns */
|
---|
| 216 | if (regex->begline || regex->endline)
|
---|
| 217 | {
|
---|
| 218 | size_t offset;
|
---|
[599] | 219 |
|
---|
[3613] | 220 | if (regex->endline)
|
---|
| 221 | {
|
---|
| 222 | const char *p = NULL;
|
---|
| 223 |
|
---|
| 224 | if (regex->flags & REG_NEWLINE)
|
---|
| 225 | p = memchr (buf + buf_start_offset, buffer_delimiter,
|
---|
| 226 | buflen - buf_start_offset);
|
---|
| 227 |
|
---|
| 228 | offset = p ? p - buf : buflen;
|
---|
| 229 | }
|
---|
| 230 | else if (buf_start_offset == 0)
|
---|
| 231 | /* begline anchor, starting at beginning of the buffer. */
|
---|
| 232 | offset = 0;
|
---|
| 233 | else if (!(regex->flags & REG_NEWLINE))
|
---|
| 234 | /* begline anchor, starting in the middle of the text buffer,
|
---|
| 235 | and multiline regex is not specified - will never match.
|
---|
| 236 | Example: seq 2 | sed 'N;s/^/X/g' */
|
---|
| 237 | return 0;
|
---|
| 238 | else if (buf[buf_start_offset - 1] == buffer_delimiter)
|
---|
| 239 | /* begline anchor, starting in the middle of the text buffer,
|
---|
| 240 | with multiline match, and the current character
|
---|
| 241 | is the line delimiter - start here.
|
---|
| 242 | Example: seq 2 | sed 'N;s/^/X/mg' */
|
---|
| 243 | offset = buf_start_offset;
|
---|
| 244 | else
|
---|
| 245 | {
|
---|
| 246 | /* begline anchor, starting in the middle of the search buffer,
|
---|
| 247 | all previous optimizions didn't work: search
|
---|
| 248 | for the next line delimiter character in the buffer,
|
---|
| 249 | and start from there if found. */
|
---|
| 250 | const char *p = memchr (buf + buf_start_offset, buffer_delimiter,
|
---|
| 251 | buflen - buf_start_offset);
|
---|
| 252 |
|
---|
| 253 | if (p == NULL)
|
---|
| 254 | return 0;
|
---|
| 255 |
|
---|
| 256 | offset = p - buf + 1;
|
---|
| 257 | }
|
---|
| 258 |
|
---|
| 259 | if (regsize)
|
---|
| 260 | {
|
---|
| 261 | size_t i;
|
---|
| 262 |
|
---|
| 263 | if (!regarray->start)
|
---|
| 264 | {
|
---|
| 265 | regarray->start = XCALLOC (1, regoff_t);
|
---|
| 266 | regarray->end = XCALLOC (1, regoff_t);
|
---|
| 267 | regarray->num_regs = 1;
|
---|
| 268 | }
|
---|
| 269 |
|
---|
| 270 | regarray->start[0] = offset;
|
---|
| 271 | regarray->end[0] = offset;
|
---|
| 272 |
|
---|
| 273 | for (i = 1 ; i < regarray->num_regs; ++i)
|
---|
| 274 | regarray->start[i] = regarray->end[i] = -1;
|
---|
| 275 | }
|
---|
| 276 |
|
---|
| 277 | return 1;
|
---|
| 278 | }
|
---|
| 279 |
|
---|
| 280 | if (buf_start_offset == 0)
|
---|
| 281 | {
|
---|
| 282 | struct dfa *superset = dfasuperset (regex->dfa);
|
---|
| 283 |
|
---|
| 284 | if (superset && !dfaexec (superset, buf, buf + buflen, true, NULL, NULL))
|
---|
| 285 | return 0;
|
---|
| 286 |
|
---|
| 287 | if ((!regsize && (regex->flags & REG_NEWLINE))
|
---|
| 288 | || (!superset && dfaisfast (regex->dfa)))
|
---|
| 289 | {
|
---|
| 290 | bool backref = false;
|
---|
| 291 |
|
---|
| 292 | if (!dfaexec (regex->dfa, buf, buf + buflen, true, NULL, &backref))
|
---|
| 293 | return 0;
|
---|
| 294 |
|
---|
| 295 | if (!regsize && (regex->flags & REG_NEWLINE) && !backref)
|
---|
| 296 | return 1;
|
---|
| 297 | }
|
---|
| 298 | }
|
---|
| 299 |
|
---|
| 300 | /* If the buffer delimiter is not newline character, we cannot use
|
---|
| 301 | newline_anchor flag of regex. So do it line-by-line, and add offset
|
---|
| 302 | value to results. */
|
---|
| 303 | if ((regex->flags & REG_NEWLINE) && buffer_delimiter != '\n')
|
---|
| 304 | {
|
---|
| 305 | const char *beg, *end;
|
---|
| 306 | const char *start;
|
---|
| 307 |
|
---|
| 308 | beg = buf;
|
---|
| 309 |
|
---|
| 310 | if (buf_start_offset > 0)
|
---|
| 311 | {
|
---|
| 312 | const char *eol = memrchr (buf, buffer_delimiter, buf_start_offset);
|
---|
| 313 |
|
---|
| 314 | if (eol != NULL)
|
---|
| 315 | beg = eol + 1;
|
---|
| 316 | }
|
---|
| 317 |
|
---|
| 318 | start = buf + buf_start_offset;
|
---|
| 319 |
|
---|
| 320 | for (;;)
|
---|
| 321 | {
|
---|
| 322 | end = memchr (beg, buffer_delimiter, buf + buflen - beg);
|
---|
| 323 |
|
---|
| 324 | if (end == NULL)
|
---|
| 325 | end = buf + buflen;
|
---|
| 326 |
|
---|
| 327 | ret = re_search (®ex->pattern, beg, end - beg,
|
---|
| 328 | start - beg, end - start,
|
---|
| 329 | regsize ? regarray : NULL);
|
---|
| 330 |
|
---|
| 331 | if (ret > -1)
|
---|
| 332 | {
|
---|
| 333 | size_t i;
|
---|
| 334 |
|
---|
| 335 | ret += beg - buf;
|
---|
| 336 |
|
---|
| 337 | if (regsize)
|
---|
| 338 | {
|
---|
| 339 | for (i = 0; i < regarray->num_regs; ++i)
|
---|
| 340 | {
|
---|
| 341 | if (regarray->start[i] > -1)
|
---|
| 342 | regarray->start[i] += beg - buf;
|
---|
| 343 | if (regarray->end[i] > -1)
|
---|
| 344 | regarray->end[i] += beg - buf;
|
---|
| 345 | }
|
---|
| 346 | }
|
---|
| 347 |
|
---|
| 348 | break;
|
---|
| 349 | }
|
---|
| 350 |
|
---|
| 351 | if (end == buf + buflen)
|
---|
| 352 | break;
|
---|
| 353 |
|
---|
| 354 | beg = start = end + 1;
|
---|
| 355 | }
|
---|
| 356 | }
|
---|
| 357 | else
|
---|
| 358 | ret = re_search (®ex->pattern, buf, buflen, buf_start_offset,
|
---|
| 359 | buflen - buf_start_offset,
|
---|
| 360 | regsize ? regarray : NULL);
|
---|
| 361 |
|
---|
[599] | 362 | return (ret > -1);
|
---|
| 363 | }
|
---|
| 364 |
|
---|
| 365 |
|
---|
#ifdef lint
/* Free everything owned by REGEX: its DFA (if any), its compiled
   pattern buffer, and finally the struct regex itself.  REGEX must not
   be used afterwards.  Only compiled when `lint' is defined.  */
void
release_regex (struct regex *regex)
{
  if (regex->dfa)
    {
      /* dfafree releases the DFA's internals; the object itself is
         freed separately.  */
      dfafree (regex->dfa);
      free (regex->dfa);
      regex->dfa = NULL;
    }
  regfree (&regex->pattern);
  free (regex);
}
#endif /* lint */
|
---|