source: vendor/FreeBSD-libc/5.3/regex/regcomp.c

Last change on this file was 1695, checked in by bird, 21 years ago

FreeBSD 5.3 libc sources.

  • Property cvs2svn:cvs-rev set to 1.1.1.2
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 42.7 KB
Line 
1/*-
2 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
3 * Copyright (c) 1992, 1993, 1994
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * Henry Spencer.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
38 */
39
40#if defined(LIBC_SCCS) && !defined(lint)
41static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
42#endif /* LIBC_SCCS and not lint */
43#include <sys/cdefs.h>
44__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.32.2.1 2004/09/08 04:42:46 tjr Exp $");
45
46#include <sys/types.h>
47#include <stdio.h>
48#include <string.h>
49#include <ctype.h>
50#include <limits.h>
51#include <stdlib.h>
52#include <regex.h>
53#include <wchar.h>
54#include <wctype.h>
55
56#include "collate.h"
57
58#include "utils.h"
59#include "regex2.h"
60
61#include "cname.h"
62
63/*
64 * parse structure, passed up and down to avoid global variables and
65 * other clumsinesses
66 */
67struct parse {
68 char *next; /* next character in RE */
69 char *end; /* end of string (-> NUL normally) */
70 int error; /* has an error been seen? */
71 sop *strip; /* malloced strip */
72 sopno ssize; /* malloced strip size (allocated) */
73 sopno slen; /* malloced strip length (used) */
74 int ncsalloc; /* number of csets allocated */
75 struct re_guts *g;
76# define NPAREN 10 /* we need to remember () 1-9 for back refs */
77 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
78 sopno pend[NPAREN]; /* -> ) ([0] unused) */
79};
80
81/* ========= begin header generated by ./mkh ========= */
82#ifdef __cplusplus
83extern "C" {
84#endif
85
86/* === regcomp.c === */
87static void p_ere(struct parse *p, wint_t stop);
88static void p_ere_exp(struct parse *p);
89static void p_str(struct parse *p);
90static void p_bre(struct parse *p, wint_t end1, wint_t end2);
91static int p_simp_re(struct parse *p, int starordinary);
92static int p_count(struct parse *p);
93static void p_bracket(struct parse *p);
94static void p_b_term(struct parse *p, cset *cs);
95static void p_b_cclass(struct parse *p, cset *cs);
96static void p_b_eclass(struct parse *p, cset *cs);
97static wint_t p_b_symbol(struct parse *p);
98static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
99static wint_t othercase(wint_t ch);
100static void bothcases(struct parse *p, wint_t ch);
101static void ordinary(struct parse *p, wint_t ch);
102static void nonnewline(struct parse *p);
103static void repeat(struct parse *p, sopno start, int from, int to);
104static int seterr(struct parse *p, int e);
105static cset *allocset(struct parse *p);
106static void freeset(struct parse *p, cset *cs);
107static void CHadd(struct parse *p, cset *cs, wint_t ch);
108static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
109static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
110static wint_t singleton(cset *cs);
111static sopno dupl(struct parse *p, sopno start, sopno finish);
112static void doemit(struct parse *p, sop op, size_t opnd);
113static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
114static void dofwd(struct parse *p, sopno pos, sop value);
115static void enlarge(struct parse *p, sopno size);
116static void stripsnug(struct parse *p, struct re_guts *g);
117static void findmust(struct parse *p, struct re_guts *g);
118static int altoffset(sop *scan, int offset);
119static void computejumps(struct parse *p, struct re_guts *g);
120static void computematchjumps(struct parse *p, struct re_guts *g);
121static sopno pluscount(struct parse *p, struct re_guts *g);
122static wint_t wgetnext(struct parse *p);
123
124#ifdef __cplusplus
125}
126#endif
127/* ========= end header generated by ./mkh ========= */
128
129static char nuls[10]; /* place to point scanner in event of error */
130
131/*
132 * macros for use with parse structure
133 * BEWARE: these know that the parse structure is named `p' !!!
134 */
135#define PEEK() (*p->next)
136#define PEEK2() (*(p->next+1))
137#define MORE() (p->next < p->end)
138#define MORE2() (p->next+1 < p->end)
139#define SEE(c) (MORE() && PEEK() == (c))
140#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
141#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
142#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
143#define NEXT() (p->next++)
144#define NEXT2() (p->next += 2)
145#define NEXTn(n) (p->next += (n))
146#define GETNEXT() (*p->next++)
147#define WGETNEXT() wgetnext(p)
148#define SETERROR(e) seterr(p, (e))
149#define REQUIRE(co, e) ((co) || SETERROR(e))
150#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
151#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
152#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
153#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
154#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
155#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
156#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
157#define HERE() (p->slen)
158#define THERE() (p->slen - 1)
159#define THERETHERE() (p->slen - 2)
160#define DROP(n) (p->slen -= (n))
161
162#ifndef NDEBUG
163static int never = 0; /* for use in asserts; shuts lint up */
164#else
165#define never 0 /* some <assert.h>s have bugs too */
166#endif
167
168/* Macro used by computejump()/computematchjump() */
169#define MIN(a,b) ((a)<(b)?(a):(b))
170
171/*
172 - regcomp - interface for parser and compilation
173 = extern int regcomp(regex_t *, const char *, int);
174 = #define REG_BASIC 0000
175 = #define REG_EXTENDED 0001
176 = #define REG_ICASE 0002
177 = #define REG_NOSUB 0004
178 = #define REG_NEWLINE 0010
179 = #define REG_NOSPEC 0020
180 = #define REG_PEND 0040
181 = #define REG_DUMP 0200
182 */
183int /* 0 success, otherwise REG_something */
184regcomp(preg, pattern, cflags)
185regex_t * __restrict preg;
186const char * __restrict pattern;
187int cflags;
188{
189 struct parse pa;
190 struct re_guts *g;
191 struct parse *p = &pa;
192 int i;
193 size_t len;
194#ifdef REDEBUG
195# define GOODFLAGS(f) (f)
196#else
197# define GOODFLAGS(f) ((f)&~REG_DUMP)
198#endif
199
200 cflags = GOODFLAGS(cflags);
201 if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
202 return(REG_INVARG);
203
204 if (cflags&REG_PEND) {
205 if (preg->re_endp < pattern)
206 return(REG_INVARG);
207 len = preg->re_endp - pattern;
208 } else
209 len = strlen((char *)pattern);
210
211 /* do the mallocs early so failure handling is easy */
212 g = (struct re_guts *)malloc(sizeof(struct re_guts));
213 if (g == NULL)
214 return(REG_ESPACE);
215 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
216 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
217 p->slen = 0;
218 if (p->strip == NULL) {
219 free((char *)g);
220 return(REG_ESPACE);
221 }
222
223 /* set things up */
224 p->g = g;
225 p->next = (char *)pattern; /* convenience; we do not modify it */
226 p->end = p->next + len;
227 p->error = 0;
228 p->ncsalloc = 0;
229 for (i = 0; i < NPAREN; i++) {
230 p->pbegin[i] = 0;
231 p->pend[i] = 0;
232 }
233 g->sets = NULL;
234 g->ncsets = 0;
235 g->cflags = cflags;
236 g->iflags = 0;
237 g->nbol = 0;
238 g->neol = 0;
239 g->must = NULL;
240 g->moffset = -1;
241 g->charjump = NULL;
242 g->matchjump = NULL;
243 g->mlen = 0;
244 g->nsub = 0;
245 g->backrefs = 0;
246
247 /* do it */
248 EMIT(OEND, 0);
249 g->firststate = THERE();
250 if (cflags&REG_EXTENDED)
251 p_ere(p, OUT);
252 else if (cflags&REG_NOSPEC)
253 p_str(p);
254 else
255 p_bre(p, OUT, OUT);
256 EMIT(OEND, 0);
257 g->laststate = THERE();
258
259 /* tidy up loose ends and fill things in */
260 stripsnug(p, g);
261 findmust(p, g);
262 /* only use Boyer-Moore algorithm if the pattern is bigger
263 * than three characters
264 */
265 if(g->mlen > 3) {
266 computejumps(p, g);
267 computematchjumps(p, g);
268 if(g->matchjump == NULL && g->charjump != NULL) {
269 free(g->charjump);
270 g->charjump = NULL;
271 }
272 }
273 g->nplus = pluscount(p, g);
274 g->magic = MAGIC2;
275 preg->re_nsub = g->nsub;
276 preg->re_g = g;
277 preg->re_magic = MAGIC1;
278#ifndef REDEBUG
279 /* not debugging, so can't rely on the assert() in regexec() */
280 if (g->iflags&BAD)
281 SETERROR(REG_ASSERT);
282#endif
283
284 /* win or lose, we're done */
285 if (p->error != 0) /* lose */
286 regfree(preg);
287 return(p->error);
288}
289
290/*
291 - p_ere - ERE parser top level, concatenation and alternation
292 == static void p_ere(struct parse *p, int stop);
293 */
294static void
295p_ere(p, stop)
296struct parse *p;
297int stop; /* character this ERE should end at */
298{
299 char c;
300 sopno prevback;
301 sopno prevfwd;
302 sopno conc;
303 int first = 1; /* is this the first alternative? */
304
305 for (;;) {
306 /* do a bunch of concatenated expressions */
307 conc = HERE();
308 while (MORE() && (c = PEEK()) != '|' && c != stop)
309 p_ere_exp(p);
310 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
311
312 if (!EAT('|'))
313 break; /* NOTE BREAK OUT */
314
315 if (first) {
316 INSERT(OCH_, conc); /* offset is wrong */
317 prevfwd = conc;
318 prevback = conc;
319 first = 0;
320 }
321 ASTERN(OOR1, prevback);
322 prevback = THERE();
323 AHEAD(prevfwd); /* fix previous offset */
324 prevfwd = HERE();
325 EMIT(OOR2, 0); /* offset is very wrong */
326 }
327
328 if (!first) { /* tail-end fixups */
329 AHEAD(prevfwd);
330 ASTERN(O_CH, prevback);
331 }
332
333 assert(!MORE() || SEE(stop));
334}
335
336/*
337 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
338 == static void p_ere_exp(struct parse *p);
339 */
340static void
341p_ere_exp(p)
342struct parse *p;
343{
344 char c;
345 wint_t wc;
346 sopno pos;
347 int count;
348 int count2;
349 sopno subno;
350 int wascaret = 0;
351
352 assert(MORE()); /* caller should have ensured this */
353 c = GETNEXT();
354
355 pos = HERE();
356 switch (c) {
357 case '(':
358 (void)REQUIRE(MORE(), REG_EPAREN);
359 p->g->nsub++;
360 subno = p->g->nsub;
361 if (subno < NPAREN)
362 p->pbegin[subno] = HERE();
363 EMIT(OLPAREN, subno);
364 if (!SEE(')'))
365 p_ere(p, ')');
366 if (subno < NPAREN) {
367 p->pend[subno] = HERE();
368 assert(p->pend[subno] != 0);
369 }
370 EMIT(ORPAREN, subno);
371 (void)MUSTEAT(')', REG_EPAREN);
372 break;
373#ifndef POSIX_MISTAKE
374 case ')': /* happens only if no current unmatched ( */
375 /*
376 * You may ask, why the ifndef? Because I didn't notice
377 * this until slightly too late for 1003.2, and none of the
378 * other 1003.2 regular-expression reviewers noticed it at
379 * all. So an unmatched ) is legal POSIX, at least until
380 * we can get it fixed.
381 */
382 SETERROR(REG_EPAREN);
383 break;
384#endif
385 case '^':
386 EMIT(OBOL, 0);
387 p->g->iflags |= USEBOL;
388 p->g->nbol++;
389 wascaret = 1;
390 break;
391 case '$':
392 EMIT(OEOL, 0);
393 p->g->iflags |= USEEOL;
394 p->g->neol++;
395 break;
396 case '|':
397 SETERROR(REG_EMPTY);
398 break;
399 case '*':
400 case '+':
401 case '?':
402 SETERROR(REG_BADRPT);
403 break;
404 case '.':
405 if (p->g->cflags&REG_NEWLINE)
406 nonnewline(p);
407 else
408 EMIT(OANY, 0);
409 break;
410 case '[':
411 p_bracket(p);
412 break;
413 case '\\':
414 (void)REQUIRE(MORE(), REG_EESCAPE);
415 wc = WGETNEXT();
416 ordinary(p, wc);
417 break;
418 case '{': /* okay as ordinary except if digit follows */
419 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
420 /* FALLTHROUGH */
421 default:
422 p->next--;
423 wc = WGETNEXT();
424 ordinary(p, wc);
425 break;
426 }
427
428 if (!MORE())
429 return;
430 c = PEEK();
431 /* we call { a repetition if followed by a digit */
432 if (!( c == '*' || c == '+' || c == '?' ||
433 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
434 return; /* no repetition, we're done */
435 NEXT();
436
437 (void)REQUIRE(!wascaret, REG_BADRPT);
438 switch (c) {
439 case '*': /* implemented as +? */
440 /* this case does not require the (y|) trick, noKLUDGE */
441 INSERT(OPLUS_, pos);
442 ASTERN(O_PLUS, pos);
443 INSERT(OQUEST_, pos);
444 ASTERN(O_QUEST, pos);
445 break;
446 case '+':
447 INSERT(OPLUS_, pos);
448 ASTERN(O_PLUS, pos);
449 break;
450 case '?':
451 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
452 INSERT(OCH_, pos); /* offset slightly wrong */
453 ASTERN(OOR1, pos); /* this one's right */
454 AHEAD(pos); /* fix the OCH_ */
455 EMIT(OOR2, 0); /* offset very wrong... */
456 AHEAD(THERE()); /* ...so fix it */
457 ASTERN(O_CH, THERETHERE());
458 break;
459 case '{':
460 count = p_count(p);
461 if (EAT(',')) {
462 if (isdigit((uch)PEEK())) {
463 count2 = p_count(p);
464 (void)REQUIRE(count <= count2, REG_BADBR);
465 } else /* single number with comma */
466 count2 = INFINITY;
467 } else /* just a single number */
468 count2 = count;
469 repeat(p, pos, count, count2);
470 if (!EAT('}')) { /* error heuristics */
471 while (MORE() && PEEK() != '}')
472 NEXT();
473 (void)REQUIRE(MORE(), REG_EBRACE);
474 SETERROR(REG_BADBR);
475 }
476 break;
477 }
478
479 if (!MORE())
480 return;
481 c = PEEK();
482 if (!( c == '*' || c == '+' || c == '?' ||
483 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
484 return;
485 SETERROR(REG_BADRPT);
486}
487
488/*
489 - p_str - string (no metacharacters) "parser"
490 == static void p_str(struct parse *p);
491 */
492static void
493p_str(p)
494struct parse *p;
495{
496 (void)REQUIRE(MORE(), REG_EMPTY);
497 while (MORE())
498 ordinary(p, WGETNEXT());
499}
500
501/*
502 - p_bre - BRE parser top level, anchoring and concatenation
503 == static void p_bre(struct parse *p, int end1, \
504 == int end2);
505 * Giving end1 as OUT essentially eliminates the end1/end2 check.
506 *
507 * This implementation is a bit of a kludge, in that a trailing $ is first
508 * taken as an ordinary character and then revised to be an anchor.
509 * The amount of lookahead needed to avoid this kludge is excessive.
510 */
511static void
512p_bre(p, end1, end2)
513struct parse *p;
514int end1; /* first terminating character */
515int end2; /* second terminating character */
516{
517 sopno start = HERE();
518 int first = 1; /* first subexpression? */
519 int wasdollar = 0;
520
521 if (EAT('^')) {
522 EMIT(OBOL, 0);
523 p->g->iflags |= USEBOL;
524 p->g->nbol++;
525 }
526 while (MORE() && !SEETWO(end1, end2)) {
527 wasdollar = p_simp_re(p, first);
528 first = 0;
529 }
530 if (wasdollar) { /* oops, that was a trailing anchor */
531 DROP(1);
532 EMIT(OEOL, 0);
533 p->g->iflags |= USEEOL;
534 p->g->neol++;
535 }
536
537 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
538}
539
540/*
541 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
542 == static int p_simp_re(struct parse *p, int starordinary);
543 */
544static int /* was the simple RE an unbackslashed $? */
545p_simp_re(p, starordinary)
546struct parse *p;
547int starordinary; /* is a leading * an ordinary character? */
548{
549 int c;
550 int count;
551 int count2;
552 sopno pos;
553 int i;
554 wint_t wc;
555 sopno subno;
556# define BACKSL (1<<CHAR_BIT)
557
558 pos = HERE(); /* repetion op, if any, covers from here */
559
560 assert(MORE()); /* caller should have ensured this */
561 c = GETNEXT();
562 if (c == '\\') {
563 (void)REQUIRE(MORE(), REG_EESCAPE);
564 c = BACKSL | GETNEXT();
565 }
566 switch (c) {
567 case '.':
568 if (p->g->cflags&REG_NEWLINE)
569 nonnewline(p);
570 else
571 EMIT(OANY, 0);
572 break;
573 case '[':
574 p_bracket(p);
575 break;
576 case BACKSL|'{':
577 SETERROR(REG_BADRPT);
578 break;
579 case BACKSL|'(':
580 p->g->nsub++;
581 subno = p->g->nsub;
582 if (subno < NPAREN)
583 p->pbegin[subno] = HERE();
584 EMIT(OLPAREN, subno);
585 /* the MORE here is an error heuristic */
586 if (MORE() && !SEETWO('\\', ')'))
587 p_bre(p, '\\', ')');
588 if (subno < NPAREN) {
589 p->pend[subno] = HERE();
590 assert(p->pend[subno] != 0);
591 }
592 EMIT(ORPAREN, subno);
593 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
594 break;
595 case BACKSL|')': /* should not get here -- must be user */
596 case BACKSL|'}':
597 SETERROR(REG_EPAREN);
598 break;
599 case BACKSL|'1':
600 case BACKSL|'2':
601 case BACKSL|'3':
602 case BACKSL|'4':
603 case BACKSL|'5':
604 case BACKSL|'6':
605 case BACKSL|'7':
606 case BACKSL|'8':
607 case BACKSL|'9':
608 i = (c&~BACKSL) - '0';
609 assert(i < NPAREN);
610 if (p->pend[i] != 0) {
611 assert(i <= p->g->nsub);
612 EMIT(OBACK_, i);
613 assert(p->pbegin[i] != 0);
614 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
615 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
616 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
617 EMIT(O_BACK, i);
618 } else
619 SETERROR(REG_ESUBREG);
620 p->g->backrefs = 1;
621 break;
622 case '*':
623 (void)REQUIRE(starordinary, REG_BADRPT);
624 /* FALLTHROUGH */
625 default:
626 p->next--;
627 wc = WGETNEXT();
628 ordinary(p, wc);
629 break;
630 }
631
632 if (EAT('*')) { /* implemented as +? */
633 /* this case does not require the (y|) trick, noKLUDGE */
634 INSERT(OPLUS_, pos);
635 ASTERN(O_PLUS, pos);
636 INSERT(OQUEST_, pos);
637 ASTERN(O_QUEST, pos);
638 } else if (EATTWO('\\', '{')) {
639 count = p_count(p);
640 if (EAT(',')) {
641 if (MORE() && isdigit((uch)PEEK())) {
642 count2 = p_count(p);
643 (void)REQUIRE(count <= count2, REG_BADBR);
644 } else /* single number with comma */
645 count2 = INFINITY;
646 } else /* just a single number */
647 count2 = count;
648 repeat(p, pos, count, count2);
649 if (!EATTWO('\\', '}')) { /* error heuristics */
650 while (MORE() && !SEETWO('\\', '}'))
651 NEXT();
652 (void)REQUIRE(MORE(), REG_EBRACE);
653 SETERROR(REG_BADBR);
654 }
655 } else if (c == '$') /* $ (but not \$) ends it */
656 return(1);
657
658 return(0);
659}
660
661/*
662 - p_count - parse a repetition count
663 == static int p_count(struct parse *p);
664 */
665static int /* the value */
666p_count(p)
667struct parse *p;
668{
669 int count = 0;
670 int ndigits = 0;
671
672 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
673 count = count*10 + (GETNEXT() - '0');
674 ndigits++;
675 }
676
677 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
678 return(count);
679}
680
681/*
682 - p_bracket - parse a bracketed character list
683 == static void p_bracket(struct parse *p);
684 */
685static void
686p_bracket(p)
687struct parse *p;
688{
689 cset *cs;
690 wint_t ch;
691
692 /* Dept of Truly Sickening Special-Case Kludges */
693 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
694 EMIT(OBOW, 0);
695 NEXTn(6);
696 return;
697 }
698 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
699 EMIT(OEOW, 0);
700 NEXTn(6);
701 return;
702 }
703
704 if ((cs = allocset(p)) == NULL)
705 return;
706
707 if (p->g->cflags&REG_ICASE)
708 cs->icase = 1;
709 if (EAT('^'))
710 cs->invert = 1;
711 if (EAT(']'))
712 CHadd(p, cs, ']');
713 else if (EAT('-'))
714 CHadd(p, cs, '-');
715 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
716 p_b_term(p, cs);
717 if (EAT('-'))
718 CHadd(p, cs, '-');
719 (void)MUSTEAT(']', REG_EBRACK);
720
721 if (p->error != 0) /* don't mess things up further */
722 return;
723
724 if (cs->invert && p->g->cflags&REG_NEWLINE)
725 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
726
727 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
728 ordinary(p, ch);
729 freeset(p, cs);
730 } else
731 EMIT(OANYOF, (int)(cs - p->g->sets));
732}
733
734/*
735 - p_b_term - parse one term of a bracketed character list
736 == static void p_b_term(struct parse *p, cset *cs);
737 */
738static void
739p_b_term(p, cs)
740struct parse *p;
741cset *cs;
742{
743 char c;
744 wint_t start, finish;
745 wint_t i;
746
747 /* classify what we've got */
748 switch ((MORE()) ? PEEK() : '\0') {
749 case '[':
750 c = (MORE2()) ? PEEK2() : '\0';
751 break;
752 case '-':
753 SETERROR(REG_ERANGE);
754 return; /* NOTE RETURN */
755 break;
756 default:
757 c = '\0';
758 break;
759 }
760
761 switch (c) {
762 case ':': /* character class */
763 NEXT2();
764 (void)REQUIRE(MORE(), REG_EBRACK);
765 c = PEEK();
766 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
767 p_b_cclass(p, cs);
768 (void)REQUIRE(MORE(), REG_EBRACK);
769 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
770 break;
771 case '=': /* equivalence class */
772 NEXT2();
773 (void)REQUIRE(MORE(), REG_EBRACK);
774 c = PEEK();
775 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
776 p_b_eclass(p, cs);
777 (void)REQUIRE(MORE(), REG_EBRACK);
778 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
779 break;
780 default: /* symbol, ordinary character, or range */
781 start = p_b_symbol(p);
782 if (SEE('-') && MORE2() && PEEK2() != ']') {
783 /* range */
784 NEXT();
785 if (EAT('-'))
786 finish = '-';
787 else
788 finish = p_b_symbol(p);
789 } else
790 finish = start;
791 if (start == finish)
792 CHadd(p, cs, start);
793 else {
794 if (__collate_load_error) {
795 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
796 CHaddrange(p, cs, start, finish);
797 } else {
798 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
799 for (i = 0; i <= UCHAR_MAX; i++) {
800 if ( __collate_range_cmp(start, i) <= 0
801 && __collate_range_cmp(i, finish) <= 0
802 )
803 CHadd(p, cs, i);
804 }
805 }
806 }
807 break;
808 }
809}
810
811/*
812 - p_b_cclass - parse a character-class name and deal with it
813 == static void p_b_cclass(struct parse *p, cset *cs);
814 */
815static void
816p_b_cclass(p, cs)
817struct parse *p;
818cset *cs;
819{
820 char *sp = p->next;
821 size_t len;
822 wctype_t wct;
823 char clname[16];
824
825 while (MORE() && isalpha((uch)PEEK()))
826 NEXT();
827 len = p->next - sp;
828 if (len >= sizeof(clname) - 1) {
829 SETERROR(REG_ECTYPE);
830 return;
831 }
832 memcpy(clname, sp, len);
833 clname[len] = '\0';
834 if ((wct = wctype(clname)) == 0) {
835 SETERROR(REG_ECTYPE);
836 return;
837 }
838 CHaddtype(p, cs, wct);
839}
840
841/*
842 - p_b_eclass - parse an equivalence-class name and deal with it
843 == static void p_b_eclass(struct parse *p, cset *cs);
844 *
845 * This implementation is incomplete. xxx
846 */
847static void
848p_b_eclass(p, cs)
849struct parse *p;
850cset *cs;
851{
852 wint_t c;
853
854 c = p_b_coll_elem(p, '=');
855 CHadd(p, cs, c);
856}
857
858/*
859 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
860 == static char p_b_symbol(struct parse *p);
861 */
862static wint_t /* value of symbol */
863p_b_symbol(p)
864struct parse *p;
865{
866 wint_t value;
867
868 (void)REQUIRE(MORE(), REG_EBRACK);
869 if (!EATTWO('[', '.'))
870 return(WGETNEXT());
871
872 /* collating symbol */
873 value = p_b_coll_elem(p, '.');
874 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
875 return(value);
876}
877
878/*
879 - p_b_coll_elem - parse a collating-element name and look it up
880 == static char p_b_coll_elem(struct parse *p, int endc);
881 */
882static wint_t /* value of collating element */
883p_b_coll_elem(p, endc)
884struct parse *p;
885wint_t endc; /* name ended by endc,']' */
886{
887 char *sp = p->next;
888 struct cname *cp;
889 int len;
890 mbstate_t mbs;
891 wchar_t wc;
892 size_t clen;
893
894 while (MORE() && !SEETWO(endc, ']'))
895 NEXT();
896 if (!MORE()) {
897 SETERROR(REG_EBRACK);
898 return(0);
899 }
900 len = p->next - sp;
901 for (cp = cnames; cp->name != NULL; cp++)
902 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
903 return(cp->code); /* known name */
904 memset(&mbs, 0, sizeof(mbs));
905 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
906 return (wc); /* single character */
907 else if (clen == (size_t)-1 || clen == (size_t)-2)
908 SETERROR(REG_ILLSEQ);
909 else
910 SETERROR(REG_ECOLLATE); /* neither */
911 return(0);
912}
913
914/*
915 - othercase - return the case counterpart of an alphabetic
916 == static char othercase(int ch);
917 */
918static wint_t /* if no counterpart, return ch */
919othercase(ch)
920wint_t ch;
921{
922 assert(iswalpha(ch));
923 if (iswupper(ch))
924 return(towlower(ch));
925 else if (iswlower(ch))
926 return(towupper(ch));
927 else /* peculiar, but could happen */
928 return(ch);
929}
930
931/*
932 - bothcases - emit a dualcase version of a two-case character
933 == static void bothcases(struct parse *p, int ch);
934 *
935 * Boy, is this implementation ever a kludge...
936 */
937static void
938bothcases(p, ch)
939struct parse *p;
940wint_t ch;
941{
942 char *oldnext = p->next;
943 char *oldend = p->end;
944 char bracket[3 + MB_LEN_MAX];
945 size_t n;
946 mbstate_t mbs;
947
948 assert(othercase(ch) != ch); /* p_bracket() would recurse */
949 p->next = bracket;
950 memset(&mbs, 0, sizeof(mbs));
951 n = wcrtomb(bracket, ch, &mbs);
952 assert(n != (size_t)-1);
953 bracket[n] = ']';
954 bracket[n + 1] = '\0';
955 p->end = bracket+n+1;
956 p_bracket(p);
957 assert(p->next == p->end);
958 p->next = oldnext;
959 p->end = oldend;
960}
961
962/*
963 - ordinary - emit an ordinary character
964 == static void ordinary(struct parse *p, int ch);
965 */
966static void
967ordinary(p, ch)
968struct parse *p;
969wint_t ch;
970{
971 cset *cs;
972
973 if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
974 bothcases(p, ch);
975 else if ((ch & OPDMASK) == ch)
976 EMIT(OCHAR, ch);
977 else {
978 /*
979 * Kludge: character is too big to fit into an OCHAR operand.
980 * Emit a singleton set.
981 */
982 if ((cs = allocset(p)) == NULL)
983 return;
984 CHadd(p, cs, ch);
985 EMIT(OANYOF, (int)(cs - p->g->sets));
986 }
987}
988
989/*
990 - nonnewline - emit REG_NEWLINE version of OANY
991 == static void nonnewline(struct parse *p);
992 *
993 * Boy, is this implementation ever a kludge...
994 */
995static void
996nonnewline(p)
997struct parse *p;
998{
999 char *oldnext = p->next;
1000 char *oldend = p->end;
1001 char bracket[4];
1002
1003 p->next = bracket;
1004 p->end = bracket+3;
1005 bracket[0] = '^';
1006 bracket[1] = '\n';
1007 bracket[2] = ']';
1008 bracket[3] = '\0';
1009 p_bracket(p);
1010 assert(p->next == bracket+3);
1011 p->next = oldnext;
1012 p->end = oldend;
1013}
1014
1015/*
1016 - repeat - generate code for a bounded repetition, recursively if needed
1017 == static void repeat(struct parse *p, sopno start, int from, int to);
1018 */
1019static void
1020repeat(p, start, from, to)
1021struct parse *p;
1022sopno start; /* operand from here to end of strip */
1023int from; /* repeated from this number */
1024int to; /* to this number of times (maybe INFINITY) */
1025{
1026 sopno finish = HERE();
1027# define N 2
1028# define INF 3
1029# define REP(f, t) ((f)*8 + (t))
1030# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1031 sopno copy;
1032
1033 if (p->error != 0) /* head off possible runaway recursion */
1034 return;
1035
1036 assert(from <= to);
1037
1038 switch (REP(MAP(from), MAP(to))) {
1039 case REP(0, 0): /* must be user doing this */
1040 DROP(finish-start); /* drop the operand */
1041 break;
1042 case REP(0, 1): /* as x{1,1}? */
1043 case REP(0, N): /* as x{1,n}? */
1044 case REP(0, INF): /* as x{1,}? */
1045 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1046 INSERT(OCH_, start); /* offset is wrong... */
1047 repeat(p, start+1, 1, to);
1048 ASTERN(OOR1, start);
1049 AHEAD(start); /* ... fix it */
1050 EMIT(OOR2, 0);
1051 AHEAD(THERE());
1052 ASTERN(O_CH, THERETHERE());
1053 break;
1054 case REP(1, 1): /* trivial case */
1055 /* done */
1056 break;
1057 case REP(1, N): /* as x?x{1,n-1} */
1058 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1059 INSERT(OCH_, start);
1060 ASTERN(OOR1, start);
1061 AHEAD(start);
1062 EMIT(OOR2, 0); /* offset very wrong... */
1063 AHEAD(THERE()); /* ...so fix it */
1064 ASTERN(O_CH, THERETHERE());
1065 copy = dupl(p, start+1, finish+1);
1066 assert(copy == finish+4);
1067 repeat(p, copy, 1, to-1);
1068 break;
1069 case REP(1, INF): /* as x+ */
1070 INSERT(OPLUS_, start);
1071 ASTERN(O_PLUS, start);
1072 break;
1073 case REP(N, N): /* as xx{m-1,n-1} */
1074 copy = dupl(p, start, finish);
1075 repeat(p, copy, from-1, to-1);
1076 break;
1077 case REP(N, INF): /* as xx{n-1,INF} */
1078 copy = dupl(p, start, finish);
1079 repeat(p, copy, from-1, to);
1080 break;
1081 default: /* "can't happen" */
1082 SETERROR(REG_ASSERT); /* just in case */
1083 break;
1084 }
1085}
1086
1087/*
1088 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1089 - character from the parse struct, signals a REG_ILLSEQ error if the
1090 - character can't be converted. Returns the number of bytes consumed.
1091 */
1092static wint_t
1093wgetnext(p)
1094struct parse *p;
1095{
1096 mbstate_t mbs;
1097 wchar_t wc;
1098 size_t n;
1099
1100 memset(&mbs, 0, sizeof(mbs));
1101 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1102 if (n == (size_t)-1 || n == (size_t)-2) {
1103 SETERROR(REG_ILLSEQ);
1104 return (0);
1105 }
1106 if (n == 0)
1107 n = 1;
1108 p->next += n;
1109 return (wc);
1110}
1111
1112/*
1113 - seterr - set an error condition
1114 == static int seterr(struct parse *p, int e);
1115 */
1116static int /* useless but makes type checking happy */
1117seterr(p, e)
1118struct parse *p;
1119int e;
1120{
1121 if (p->error == 0) /* keep earliest error condition */
1122 p->error = e;
1123 p->next = nuls; /* try to bring things to a halt */
1124 p->end = nuls;
1125 return(0); /* make the return value well-defined */
1126}
1127
1128/*
1129 - allocset - allocate a set of characters for []
1130 == static cset *allocset(struct parse *p);
1131 */
1132static cset *
1133allocset(p)
1134struct parse *p;
1135{
1136 cset *cs, *ncs;
1137
1138 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1139 if (ncs == NULL) {
1140 SETERROR(REG_ESPACE);
1141 return (NULL);
1142 }
1143 p->g->sets = ncs;
1144 cs = &p->g->sets[p->g->ncsets++];
1145 memset(cs, 0, sizeof(*cs));
1146
1147 return(cs);
1148}
1149
1150/*
1151 - freeset - free a now-unused set
1152 == static void freeset(struct parse *p, cset *cs);
1153 */
1154static void
1155freeset(p, cs)
1156struct parse *p;
1157cset *cs;
1158{
1159 cset *top = &p->g->sets[p->g->ncsets];
1160
1161 free(cs->wides);
1162 free(cs->ranges);
1163 free(cs->types);
1164 memset(cs, 0, sizeof(*cs));
1165 if (cs == top-1) /* recover only the easy case */
1166 p->g->ncsets--;
1167}
1168
1169/*
1170 - singleton - Determine whether a set contains only one character,
1171 - returning it if so, otherwise returning OUT.
1172 */
1173static wint_t
1174singleton(cs)
1175cset *cs;
1176{
1177 wint_t i, s, n;
1178
1179 for (i = n = 0; i < NC; i++)
1180 if (CHIN(cs, i)) {
1181 n++;
1182 s = i;
1183 }
1184 if (n == 1)
1185 return (s);
1186 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1187 cs->icase == 0)
1188 return (cs->wides[0]);
1189 /* Don't bother handling the other cases. */
1190 return (OUT);
1191}
1192
1193/*
1194 - CHadd - add character to character set.
1195 */
1196static void
1197CHadd(p, cs, ch)
1198struct parse *p;
1199cset *cs;
1200wint_t ch;
1201{
1202 wint_t nch, *newwides;
1203 assert(ch >= 0);
1204 if (ch < NC)
1205 cs->bmp[ch >> 3] |= 1 << (ch & 7);
1206 else {
1207 newwides = realloc(cs->wides, (cs->nwides + 1) *
1208 sizeof(*cs->wides));
1209 if (newwides == NULL) {
1210 SETERROR(REG_ESPACE);
1211 return;
1212 }
1213 cs->wides = newwides;
1214 cs->wides[cs->nwides++] = ch;
1215 }
1216 if (cs->icase) {
1217 if ((nch = towlower(ch)) < NC)
1218 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1219 if ((nch = towupper(ch)) < NC)
1220 cs->bmp[nch >> 3] |= 1 << (nch & 7);
1221 }
1222}
1223
1224/*
1225 - CHaddrange - add all characters in the range [min,max] to a character set.
1226 */
1227static void
1228CHaddrange(p, cs, min, max)
1229struct parse *p;
1230cset *cs;
1231wint_t min, max;
1232{
1233 crange *newranges;
1234
1235 for (; min < NC && min <= max; min++)
1236 CHadd(p, cs, min);
1237 if (min >= max)
1238 return;
1239 newranges = realloc(cs->ranges, (cs->nranges + 1) *
1240 sizeof(*cs->ranges));
1241 if (newranges == NULL) {
1242 SETERROR(REG_ESPACE);
1243 return;
1244 }
1245 cs->ranges = newranges;
1246 cs->ranges[cs->nranges].min = min;
1247 cs->ranges[cs->nranges].min = max;
1248 cs->nranges++;
1249}
1250
1251/*
1252 - CHaddtype - add all characters of a certain type to a character set.
1253 */
1254static void
1255CHaddtype(p, cs, wct)
1256struct parse *p;
1257cset *cs;
1258wctype_t wct;
1259{
1260 wint_t i;
1261 wctype_t *newtypes;
1262
1263 for (i = 0; i < NC; i++)
1264 if (iswctype(i, wct))
1265 CHadd(p, cs, i);
1266 newtypes = realloc(cs->types, (cs->ntypes + 1) *
1267 sizeof(*cs->types));
1268 if (newtypes == NULL) {
1269 SETERROR(REG_ESPACE);
1270 return;
1271 }
1272 cs->types = newtypes;
1273 cs->types[cs->ntypes++] = wct;
1274}
1275
1276/*
1277 - dupl - emit a duplicate of a bunch of sops
1278 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1279 */
1280static sopno /* start of duplicate */
1281dupl(p, start, finish)
1282struct parse *p;
1283sopno start; /* from here */
1284sopno finish; /* to this less one */
1285{
1286 sopno ret = HERE();
1287 sopno len = finish - start;
1288
1289 assert(finish >= start);
1290 if (len == 0)
1291 return(ret);
1292 enlarge(p, p->ssize + len); /* this many unexpected additions */
1293 assert(p->ssize >= p->slen + len);
1294 (void) memcpy((char *)(p->strip + p->slen),
1295 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1296 p->slen += len;
1297 return(ret);
1298}
1299
1300/*
1301 - doemit - emit a strip operator
1302 == static void doemit(struct parse *p, sop op, size_t opnd);
1303 *
1304 * It might seem better to implement this as a macro with a function as
1305 * hard-case backup, but it's just too big and messy unless there are
1306 * some changes to the data structures. Maybe later.
1307 */
1308static void
1309doemit(p, op, opnd)
1310struct parse *p;
1311sop op;
1312size_t opnd;
1313{
1314 /* avoid making error situations worse */
1315 if (p->error != 0)
1316 return;
1317
1318 /* deal with oversize operands ("can't happen", more or less) */
1319 assert(opnd < 1<<OPSHIFT);
1320
1321 /* deal with undersized strip */
1322 if (p->slen >= p->ssize)
1323 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1324 assert(p->slen < p->ssize);
1325
1326 /* finally, it's all reduced to the easy case */
1327 p->strip[p->slen++] = SOP(op, opnd);
1328}
1329
1330/*
1331 - doinsert - insert a sop into the strip
1332 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1333 */
1334static void
1335doinsert(p, op, opnd, pos)
1336struct parse *p;
1337sop op;
1338size_t opnd;
1339sopno pos;
1340{
1341 sopno sn;
1342 sop s;
1343 int i;
1344
1345 /* avoid making error situations worse */
1346 if (p->error != 0)
1347 return;
1348
1349 sn = HERE();
1350 EMIT(op, opnd); /* do checks, ensure space */
1351 assert(HERE() == sn+1);
1352 s = p->strip[sn];
1353
1354 /* adjust paren pointers */
1355 assert(pos > 0);
1356 for (i = 1; i < NPAREN; i++) {
1357 if (p->pbegin[i] >= pos) {
1358 p->pbegin[i]++;
1359 }
1360 if (p->pend[i] >= pos) {
1361 p->pend[i]++;
1362 }
1363 }
1364
1365 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1366 (HERE()-pos-1)*sizeof(sop));
1367 p->strip[pos] = s;
1368}
1369
1370/*
1371 - dofwd - complete a forward reference
1372 == static void dofwd(struct parse *p, sopno pos, sop value);
1373 */
1374static void
1375dofwd(p, pos, value)
1376struct parse *p;
1377sopno pos;
1378sop value;
1379{
1380 /* avoid making error situations worse */
1381 if (p->error != 0)
1382 return;
1383
1384 assert(value < 1<<OPSHIFT);
1385 p->strip[pos] = OP(p->strip[pos]) | value;
1386}
1387
1388/*
1389 - enlarge - enlarge the strip
1390 == static void enlarge(struct parse *p, sopno size);
1391 */
1392static void
1393enlarge(p, size)
1394struct parse *p;
1395sopno size;
1396{
1397 sop *sp;
1398
1399 if (p->ssize >= size)
1400 return;
1401
1402 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1403 if (sp == NULL) {
1404 SETERROR(REG_ESPACE);
1405 return;
1406 }
1407 p->strip = sp;
1408 p->ssize = size;
1409}
1410
1411/*
1412 - stripsnug - compact the strip
1413 == static void stripsnug(struct parse *p, struct re_guts *g);
1414 */
1415static void
1416stripsnug(p, g)
1417struct parse *p;
1418struct re_guts *g;
1419{
1420 g->nstates = p->slen;
1421 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1422 if (g->strip == NULL) {
1423 SETERROR(REG_ESPACE);
1424 g->strip = p->strip;
1425 }
1426}
1427
1428/*
1429 - findmust - fill in must and mlen with longest mandatory literal string
1430 == static void findmust(struct parse *p, struct re_guts *g);
1431 *
1432 * This algorithm could do fancy things like analyzing the operands of |
1433 * for common subsequences. Someday. This code is simple and finds most
1434 * of the interesting cases.
1435 *
1436 * Note that must and mlen got initialized during setup.
1437 */
1438static void
1439findmust(p, g)
1440struct parse *p;
1441struct re_guts *g;
1442{
1443 sop *scan;
1444 sop *start;
1445 sop *newstart;
1446 sopno newlen;
1447 sop s;
1448 char *cp;
1449 int offset;
1450 char buf[MB_LEN_MAX];
1451 size_t clen;
1452 mbstate_t mbs;
1453
1454 /* avoid making error situations worse */
1455 if (p->error != 0)
1456 return;
1457
1458 /*
1459 * It's not generally safe to do a ``char'' substring search on
1460 * multibyte character strings, but it's safe for at least
1461 * UTF-8 (see RFC 3629).
1462 */
1463 if (MB_CUR_MAX > 1 &&
1464 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1465 return;
1466
1467 /* find the longest OCHAR sequence in strip */
1468 newlen = 0;
1469 offset = 0;
1470 g->moffset = 0;
1471 scan = g->strip + 1;
1472 do {
1473 s = *scan++;
1474 switch (OP(s)) {
1475 case OCHAR: /* sequence member */
1476 if (newlen == 0) { /* new sequence */
1477 memset(&mbs, 0, sizeof(mbs));
1478 newstart = scan - 1;
1479 }
1480 clen = wcrtomb(buf, OPND(s), &mbs);
1481 if (clen == (size_t)-1)
1482 goto toohard;
1483 newlen += clen;
1484 break;
1485 case OPLUS_: /* things that don't break one */
1486 case OLPAREN:
1487 case ORPAREN:
1488 break;
1489 case OQUEST_: /* things that must be skipped */
1490 case OCH_:
1491 offset = altoffset(scan, offset);
1492 scan--;
1493 do {
1494 scan += OPND(s);
1495 s = *scan;
1496 /* assert() interferes w debug printouts */
1497 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1498 OP(s) != OOR2) {
1499 g->iflags |= BAD;
1500 return;
1501 }
1502 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1503 /* FALLTHROUGH */
1504 case OBOW: /* things that break a sequence */
1505 case OEOW:
1506 case OBOL:
1507 case OEOL:
1508 case O_QUEST:
1509 case O_CH:
1510 case OEND:
1511 if (newlen > g->mlen) { /* ends one */
1512 start = newstart;
1513 g->mlen = newlen;
1514 if (offset > -1) {
1515 g->moffset += offset;
1516 offset = newlen;
1517 } else
1518 g->moffset = offset;
1519 } else {
1520 if (offset > -1)
1521 offset += newlen;
1522 }
1523 newlen = 0;
1524 break;
1525 case OANY:
1526 if (newlen > g->mlen) { /* ends one */
1527 start = newstart;
1528 g->mlen = newlen;
1529 if (offset > -1) {
1530 g->moffset += offset;
1531 offset = newlen;
1532 } else
1533 g->moffset = offset;
1534 } else {
1535 if (offset > -1)
1536 offset += newlen;
1537 }
1538 if (offset > -1)
1539 offset++;
1540 newlen = 0;
1541 break;
1542 case OANYOF: /* may or may not invalidate offset */
1543 /* First, everything as OANY */
1544 if (newlen > g->mlen) { /* ends one */
1545 start = newstart;
1546 g->mlen = newlen;
1547 if (offset > -1) {
1548 g->moffset += offset;
1549 offset = newlen;
1550 } else
1551 g->moffset = offset;
1552 } else {
1553 if (offset > -1)
1554 offset += newlen;
1555 }
1556 if (offset > -1)
1557 offset++;
1558 newlen = 0;
1559 break;
1560 toohard:
1561 default:
1562 /* Anything here makes it impossible or too hard
1563 * to calculate the offset -- so we give up;
1564 * save the last known good offset, in case the
1565 * must sequence doesn't occur later.
1566 */
1567 if (newlen > g->mlen) { /* ends one */
1568 start = newstart;
1569 g->mlen = newlen;
1570 if (offset > -1)
1571 g->moffset += offset;
1572 else
1573 g->moffset = offset;
1574 }
1575 offset = -1;
1576 newlen = 0;
1577 break;
1578 }
1579 } while (OP(s) != OEND);
1580
1581 if (g->mlen == 0) { /* there isn't one */
1582 g->moffset = -1;
1583 return;
1584 }
1585
1586 /* turn it into a character string */
1587 g->must = malloc((size_t)g->mlen + 1);
1588 if (g->must == NULL) { /* argh; just forget it */
1589 g->mlen = 0;
1590 g->moffset = -1;
1591 return;
1592 }
1593 cp = g->must;
1594 scan = start;
1595 memset(&mbs, 0, sizeof(mbs));
1596 while (cp < g->must + g->mlen) {
1597 while (OP(s = *scan++) != OCHAR)
1598 continue;
1599 clen = wcrtomb(cp, OPND(s), &mbs);
1600 assert(clen != (size_t)-1);
1601 cp += clen;
1602 }
1603 assert(cp == g->must + g->mlen);
1604 *cp++ = '\0'; /* just on general principles */
1605}
1606
1607/*
1608 - altoffset - choose biggest offset among multiple choices
1609 == static int altoffset(sop *scan, int offset);
1610 *
1611 * Compute, recursively if necessary, the largest offset among multiple
1612 * re paths.
1613 */
1614static int
1615altoffset(scan, offset)
1616sop *scan;
1617int offset;
1618{
1619 int largest;
1620 int try;
1621 sop s;
1622
1623 /* If we gave up already on offsets, return */
1624 if (offset == -1)
1625 return -1;
1626
1627 largest = 0;
1628 try = 0;
1629 s = *scan++;
1630 while (OP(s) != O_QUEST && OP(s) != O_CH) {
1631 switch (OP(s)) {
1632 case OOR1:
1633 if (try > largest)
1634 largest = try;
1635 try = 0;
1636 break;
1637 case OQUEST_:
1638 case OCH_:
1639 try = altoffset(scan, try);
1640 if (try == -1)
1641 return -1;
1642 scan--;
1643 do {
1644 scan += OPND(s);
1645 s = *scan;
1646 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1647 OP(s) != OOR2)
1648 return -1;
1649 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1650 /* We must skip to the next position, or we'll
1651 * leave altoffset() too early.
1652 */
1653 scan++;
1654 break;
1655 case OANYOF:
1656 case OCHAR:
1657 case OANY:
1658 try++;
1659 case OBOW:
1660 case OEOW:
1661 case OLPAREN:
1662 case ORPAREN:
1663 case OOR2:
1664 break;
1665 default:
1666 try = -1;
1667 break;
1668 }
1669 if (try == -1)
1670 return -1;
1671 s = *scan++;
1672 }
1673
1674 if (try > largest)
1675 largest = try;
1676
1677 return largest+offset;
1678}
1679
1680/*
1681 - computejumps - compute char jumps for BM scan
1682 == static void computejumps(struct parse *p, struct re_guts *g);
1683 *
1684 * This algorithm assumes g->must exists and is has size greater than
1685 * zero. It's based on the algorithm found on Computer Algorithms by
1686 * Sara Baase.
1687 *
1688 * A char jump is the number of characters one needs to jump based on
1689 * the value of the character from the text that was mismatched.
1690 */
1691static void
1692computejumps(p, g)
1693struct parse *p;
1694struct re_guts *g;
1695{
1696 int ch;
1697 int mindex;
1698
1699 /* Avoid making errors worse */
1700 if (p->error != 0)
1701 return;
1702
1703 g->charjump = (int*) malloc((NC + 1) * sizeof(int));
1704 if (g->charjump == NULL) /* Not a fatal error */
1705 return;
1706 /* Adjust for signed chars, if necessary */
1707 g->charjump = &g->charjump[-(CHAR_MIN)];
1708
1709 /* If the character does not exist in the pattern, the jump
1710 * is equal to the number of characters in the pattern.
1711 */
1712 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1713 g->charjump[ch] = g->mlen;
1714
1715 /* If the character does exist, compute the jump that would
1716 * take us to the last character in the pattern equal to it
1717 * (notice that we match right to left, so that last character
1718 * is the first one that would be matched).
1719 */
1720 for (mindex = 0; mindex < g->mlen; mindex++)
1721 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1722}
1723
1724/*
1725 - computematchjumps - compute match jumps for BM scan
1726 == static void computematchjumps(struct parse *p, struct re_guts *g);
1727 *
1728 * This algorithm assumes g->must exists and is has size greater than
1729 * zero. It's based on the algorithm found on Computer Algorithms by
1730 * Sara Baase.
1731 *
1732 * A match jump is the number of characters one needs to advance based
1733 * on the already-matched suffix.
1734 * Notice that all values here are minus (g->mlen-1), because of the way
1735 * the search algorithm works.
1736 */
1737static void
1738computematchjumps(p, g)
1739struct parse *p;
1740struct re_guts *g;
1741{
1742 int mindex; /* General "must" iterator */
1743 int suffix; /* Keeps track of matching suffix */
1744 int ssuffix; /* Keeps track of suffixes' suffix */
1745 int* pmatches; /* pmatches[k] points to the next i
1746 * such that i+1...mlen is a substring
1747 * of k+1...k+mlen-i-1
1748 */
1749
1750 /* Avoid making errors worse */
1751 if (p->error != 0)
1752 return;
1753
1754 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
1755 if (pmatches == NULL) {
1756 g->matchjump = NULL;
1757 return;
1758 }
1759
1760 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
1761 if (g->matchjump == NULL) /* Not a fatal error */
1762 return;
1763
1764 /* Set maximum possible jump for each character in the pattern */
1765 for (mindex = 0; mindex < g->mlen; mindex++)
1766 g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1767
1768 /* Compute pmatches[] */
1769 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1770 mindex--, suffix--) {
1771 pmatches[mindex] = suffix;
1772
1773 /* If a mismatch is found, interrupting the substring,
1774 * compute the matchjump for that position. If no
1775 * mismatch is found, then a text substring mismatched
1776 * against the suffix will also mismatch against the
1777 * substring.
1778 */
1779 while (suffix < g->mlen
1780 && g->must[mindex] != g->must[suffix]) {
1781 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1782 g->mlen - mindex - 1);
1783 suffix = pmatches[suffix];
1784 }
1785 }
1786
1787 /* Compute the matchjump up to the last substring found to jump
1788 * to the beginning of the largest must pattern prefix matching
1789 * it's own suffix.
1790 */
1791 for (mindex = 0; mindex <= suffix; mindex++)
1792 g->matchjump[mindex] = MIN(g->matchjump[mindex],
1793 g->mlen + suffix - mindex);
1794
1795 ssuffix = pmatches[suffix];
1796 while (suffix < g->mlen) {
1797 while (suffix <= ssuffix && suffix < g->mlen) {
1798 g->matchjump[suffix] = MIN(g->matchjump[suffix],
1799 g->mlen + ssuffix - suffix);
1800 suffix++;
1801 }
1802 if (suffix < g->mlen)
1803 ssuffix = pmatches[ssuffix];
1804 }
1805
1806 free(pmatches);
1807}
1808
1809/*
1810 - pluscount - count + nesting
1811 == static sopno pluscount(struct parse *p, struct re_guts *g);
1812 */
1813static sopno /* nesting depth */
1814pluscount(p, g)
1815struct parse *p;
1816struct re_guts *g;
1817{
1818 sop *scan;
1819 sop s;
1820 sopno plusnest = 0;
1821 sopno maxnest = 0;
1822
1823 if (p->error != 0)
1824 return(0); /* there may not be an OEND */
1825
1826 scan = g->strip + 1;
1827 do {
1828 s = *scan++;
1829 switch (OP(s)) {
1830 case OPLUS_:
1831 plusnest++;
1832 break;
1833 case O_PLUS:
1834 if (plusnest > maxnest)
1835 maxnest = plusnest;
1836 plusnest--;
1837 break;
1838 }
1839 } while (OP(s) != OEND);
1840 if (plusnest != 0)
1841 g->iflags |= BAD;
1842 return(maxnest);
1843}
Note: See TracBrowser for help on using the repository browser.