Context Navigation

source: vendor/FreeBSD-libc/5.3/regex/regcomp.c

Visit:

Last change on this file was 1695, checked in by bird, 21 years ago
FreeBSD 5.3 libc sources.
Property cvs2svn:cvs-rev set to `1.1.1.2` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 42.7 KB

Line
1	/*-
2	* Copyright (c) 1992, 1993, 1994 Henry Spencer.
3	* Copyright (c) 1992, 1993, 1994
4	* The Regents of the University of California. All rights reserved.
5	*
6	* This code is derived from software contributed to Berkeley by
7	* Henry Spencer.
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions
11	* are met:
12	* 1. Redistributions of source code must retain the above copyright
13	* notice, this list of conditions and the following disclaimer.
14	* 2. Redistributions in binary form must reproduce the above copyright
15	* notice, this list of conditions and the following disclaimer in the
16	* documentation and/or other materials provided with the distribution.
17	* 3. All advertising materials mentioning features or use of this software
18	* must display the following acknowledgement:
19	* This product includes software developed by the University of
20	* California, Berkeley and its contributors.
21	* 4. Neither the name of the University nor the names of its contributors
22	* may be used to endorse or promote products derived from this software
23	* without specific prior written permission.
24	*
25	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35	* SUCH DAMAGE.
36	*
37	* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
38	*/
39
40	#if defined(LIBC_SCCS) && !defined(lint)
41	static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
42	#endif /* LIBC_SCCS and not lint */
43	#include <sys/cdefs.h>
44	__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.32.2.1 2004/09/08 04:42:46 tjr Exp $");
45
46	#include <sys/types.h>
47	#include <stdio.h>
48	#include <string.h>
49	#include <ctype.h>
50	#include <limits.h>
51	#include <stdlib.h>
52	#include <regex.h>
53	#include <wchar.h>
54	#include <wctype.h>
55
56	#include "collate.h"
57
58	#include "utils.h"
59	#include "regex2.h"
60
61	#include "cname.h"
62
63	/*
64	* parse structure, passed up and down to avoid global variables and
65	* other clumsinesses
66	*/
67	struct parse {
68	char next; / next character in RE */
69	char end; / end of string (-> NUL normally) */
70	int error; /* has an error been seen? */
71	sop strip; / malloced strip */
72	sopno ssize; /* malloced strip size (allocated) */
73	sopno slen; /* malloced strip length (used) */
74	int ncsalloc; /* number of csets allocated */
75	struct re_guts *g;
76	# define NPAREN 10 /* we need to remember () 1-9 for back refs */
77	sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
78	sopno pend[NPAREN]; /* -> ) ([0] unused) */
79	};
80
81	/* ========= begin header generated by ./mkh ========= */
82	#ifdef __cplusplus
83	extern "C" {
84	#endif
85
86	/* === regcomp.c === */
87	static void p_ere(struct parse *p, wint_t stop);
88	static void p_ere_exp(struct parse *p);
89	static void p_str(struct parse *p);
90	static void p_bre(struct parse *p, wint_t end1, wint_t end2);
91	static int p_simp_re(struct parse *p, int starordinary);
92	static int p_count(struct parse *p);
93	static void p_bracket(struct parse *p);
94	static void p_b_term(struct parse p, cset cs);
95	static void p_b_cclass(struct parse p, cset cs);
96	static void p_b_eclass(struct parse p, cset cs);
97	static wint_t p_b_symbol(struct parse *p);
98	static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
99	static wint_t othercase(wint_t ch);
100	static void bothcases(struct parse *p, wint_t ch);
101	static void ordinary(struct parse *p, wint_t ch);
102	static void nonnewline(struct parse *p);
103	static void repeat(struct parse *p, sopno start, int from, int to);
104	static int seterr(struct parse *p, int e);
105	static cset allocset(struct parse p);
106	static void freeset(struct parse p, cset cs);
107	static void CHadd(struct parse p, cset cs, wint_t ch);
108	static void CHaddrange(struct parse p, cset cs, wint_t min, wint_t max);
109	static void CHaddtype(struct parse p, cset cs, wctype_t wct);
110	static wint_t singleton(cset *cs);
111	static sopno dupl(struct parse *p, sopno start, sopno finish);
112	static void doemit(struct parse *p, sop op, size_t opnd);
113	static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
114	static void dofwd(struct parse *p, sopno pos, sop value);
115	static void enlarge(struct parse *p, sopno size);
116	static void stripsnug(struct parse p, struct re_guts g);
117	static void findmust(struct parse p, struct re_guts g);
118	static int altoffset(sop *scan, int offset);
119	static void computejumps(struct parse p, struct re_guts g);
120	static void computematchjumps(struct parse p, struct re_guts g);
121	static sopno pluscount(struct parse p, struct re_guts g);
122	static wint_t wgetnext(struct parse *p);
123
124	#ifdef __cplusplus
125	}
126	#endif
127	/* ========= end header generated by ./mkh ========= */
128
129	static char nuls[10]; /* place to point scanner in event of error */
130
131	/*
132	* macros for use with parse structure
133	* BEWARE: these know that the parse structure is named `p' !!!
134	*/
135	#define PEEK() (*p->next)
136	#define PEEK2() (*(p->next+1))
137	#define MORE() (p->next < p->end)
138	#define MORE2() (p->next+1 < p->end)
139	#define SEE(c) (MORE() && PEEK() == (c))
140	#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
141	#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
142	#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
143	#define NEXT() (p->next++)
144	#define NEXT2() (p->next += 2)
145	#define NEXTn(n) (p->next += (n))
146	#define GETNEXT() (*p->next++)
147	#define WGETNEXT() wgetnext(p)
148	#define SETERROR(e) seterr(p, (e))
149	#define REQUIRE(co, e) ((co) \|\| SETERROR(e))
150	#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
151	#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
152	#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() \|\| PEEK() != (c), e))
153	#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
154	#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
155	#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
156	#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
157	#define HERE() (p->slen)
158	#define THERE() (p->slen - 1)
159	#define THERETHERE() (p->slen - 2)
160	#define DROP(n) (p->slen -= (n))
161
162	#ifndef NDEBUG
163	static int never = 0; /* for use in asserts; shuts lint up */
164	#else
165	#define never 0 /* some <assert.h>s have bugs too */
166	#endif
167
168	/* Macro used by computejump()/computematchjump() */
169	#define MIN(a,b) ((a)<(b)?(a):(b))
170
171	/*
172	- regcomp - interface for parser and compilation
173	= extern int regcomp(regex_t , const char , int);
174	= #define REG_BASIC 0000
175	= #define REG_EXTENDED 0001
176	= #define REG_ICASE 0002
177	= #define REG_NOSUB 0004
178	= #define REG_NEWLINE 0010
179	= #define REG_NOSPEC 0020
180	= #define REG_PEND 0040
181	= #define REG_DUMP 0200
182	*/
183	int /* 0 success, otherwise REG_something */
184	regcomp(preg, pattern, cflags)
185	regex_t * __restrict preg;
186	const char * __restrict pattern;
187	int cflags;
188	{
189	struct parse pa;
190	struct re_guts *g;
191	struct parse *p = &pa;
192	int i;
193	size_t len;
194	#ifdef REDEBUG
195	# define GOODFLAGS(f) (f)
196	#else
197	# define GOODFLAGS(f) ((f)&~REG_DUMP)
198	#endif
199
200	cflags = GOODFLAGS(cflags);
201	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
202	return(REG_INVARG);
203
204	if (cflags&REG_PEND) {
205	if (preg->re_endp < pattern)
206	return(REG_INVARG);
207	len = preg->re_endp - pattern;
208	} else
209	len = strlen((char *)pattern);
210
211	/* do the mallocs early so failure handling is easy */
212	g = (struct re_guts *)malloc(sizeof(struct re_guts));
213	if (g == NULL)
214	return(REG_ESPACE);
215	p->ssize = len/(size_t)2(size_t)3 + (size_t)1; / ugh */
216	p->strip = (sop )malloc(p->ssize sizeof(sop));
217	p->slen = 0;
218	if (p->strip == NULL) {
219	free((char *)g);
220	return(REG_ESPACE);
221	}
222
223	/* set things up */
224	p->g = g;
225	p->next = (char )pattern; / convenience; we do not modify it */
226	p->end = p->next + len;
227	p->error = 0;
228	p->ncsalloc = 0;
229	for (i = 0; i < NPAREN; i++) {
230	p->pbegin[i] = 0;
231	p->pend[i] = 0;
232	}
233	g->sets = NULL;
234	g->ncsets = 0;
235	g->cflags = cflags;
236	g->iflags = 0;
237	g->nbol = 0;
238	g->neol = 0;
239	g->must = NULL;
240	g->moffset = -1;
241	g->charjump = NULL;
242	g->matchjump = NULL;
243	g->mlen = 0;
244	g->nsub = 0;
245	g->backrefs = 0;
246
247	/* do it */
248	EMIT(OEND, 0);
249	g->firststate = THERE();
250	if (cflags&REG_EXTENDED)
251	p_ere(p, OUT);
252	else if (cflags&REG_NOSPEC)
253	p_str(p);
254	else
255	p_bre(p, OUT, OUT);
256	EMIT(OEND, 0);
257	g->laststate = THERE();
258
259	/* tidy up loose ends and fill things in */
260	stripsnug(p, g);
261	findmust(p, g);
262	/* only use Boyer-Moore algorithm if the pattern is bigger
263	* than three characters
264	*/
265	if(g->mlen > 3) {
266	computejumps(p, g);
267	computematchjumps(p, g);
268	if(g->matchjump == NULL && g->charjump != NULL) {
269	free(g->charjump);
270	g->charjump = NULL;
271	}
272	}
273	g->nplus = pluscount(p, g);
274	g->magic = MAGIC2;
275	preg->re_nsub = g->nsub;
276	preg->re_g = g;
277	preg->re_magic = MAGIC1;
278	#ifndef REDEBUG
279	/* not debugging, so can't rely on the assert() in regexec() */
280	if (g->iflags&BAD)
281	SETERROR(REG_ASSERT);
282	#endif
283
284	/* win or lose, we're done */
285	if (p->error != 0) /* lose */
286	regfree(preg);
287	return(p->error);
288	}
289
290	/*
291	- p_ere - ERE parser top level, concatenation and alternation
292	== static void p_ere(struct parse *p, int stop);
293	*/
294	static void
295	p_ere(p, stop)
296	struct parse *p;
297	int stop; /* character this ERE should end at */
298	{
299	char c;
300	sopno prevback;
301	sopno prevfwd;
302	sopno conc;
303	int first = 1; /* is this the first alternative? */
304
305	for (;;) {
306	/* do a bunch of concatenated expressions */
307	conc = HERE();
308	while (MORE() && (c = PEEK()) != '\|' && c != stop)
309	p_ere_exp(p);
310	(void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
311
312	if (!EAT('\|'))
313	break; /* NOTE BREAK OUT */
314
315	if (first) {
316	INSERT(OCH_, conc); /* offset is wrong */
317	prevfwd = conc;
318	prevback = conc;
319	first = 0;
320	}
321	ASTERN(OOR1, prevback);
322	prevback = THERE();
323	AHEAD(prevfwd); /* fix previous offset */
324	prevfwd = HERE();
325	EMIT(OOR2, 0); /* offset is very wrong */
326	}
327
328	if (!first) { /* tail-end fixups */
329	AHEAD(prevfwd);
330	ASTERN(O_CH, prevback);
331	}
332
333	assert(!MORE() \|\| SEE(stop));
334	}
335
336	/*
337	- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
338	== static void p_ere_exp(struct parse *p);
339	*/
340	static void
341	p_ere_exp(p)
342	struct parse *p;
343	{
344	char c;
345	wint_t wc;
346	sopno pos;
347	int count;
348	int count2;
349	sopno subno;
350	int wascaret = 0;
351
352	assert(MORE()); /* caller should have ensured this */
353	c = GETNEXT();
354
355	pos = HERE();
356	switch (c) {
357	case '(':
358	(void)REQUIRE(MORE(), REG_EPAREN);
359	p->g->nsub++;
360	subno = p->g->nsub;
361	if (subno < NPAREN)
362	p->pbegin[subno] = HERE();
363	EMIT(OLPAREN, subno);
364	if (!SEE(')'))
365	p_ere(p, ')');
366	if (subno < NPAREN) {
367	p->pend[subno] = HERE();
368	assert(p->pend[subno] != 0);
369	}
370	EMIT(ORPAREN, subno);
371	(void)MUSTEAT(')', REG_EPAREN);
372	break;
373	#ifndef POSIX_MISTAKE
374	case ')': /* happens only if no current unmatched ( */
375	/*
376	* You may ask, why the ifndef? Because I didn't notice
377	* this until slightly too late for 1003.2, and none of the
378	* other 1003.2 regular-expression reviewers noticed it at
379	* all. So an unmatched ) is legal POSIX, at least until
380	* we can get it fixed.
381	*/
382	SETERROR(REG_EPAREN);
383	break;
384	#endif
385	case '^':
386	EMIT(OBOL, 0);
387	p->g->iflags \|= USEBOL;
388	p->g->nbol++;
389	wascaret = 1;
390	break;
391	case '$':
392	EMIT(OEOL, 0);
393	p->g->iflags \|= USEEOL;
394	p->g->neol++;
395	break;
396	case '\|':
397	SETERROR(REG_EMPTY);
398	break;
399	case '*':
400	case '+':
401	case '?':
402	SETERROR(REG_BADRPT);
403	break;
404	case '.':
405	if (p->g->cflags&REG_NEWLINE)
406	nonnewline(p);
407	else
408	EMIT(OANY, 0);
409	break;
410	case '[':
411	p_bracket(p);
412	break;
413	case '\\':
414	(void)REQUIRE(MORE(), REG_EESCAPE);
415	wc = WGETNEXT();
416	ordinary(p, wc);
417	break;
418	case '{': /* okay as ordinary except if digit follows */
419	(void)REQUIRE(!MORE() \|\| !isdigit((uch)PEEK()), REG_BADRPT);
420	/* FALLTHROUGH */
421	default:
422	p->next--;
423	wc = WGETNEXT();
424	ordinary(p, wc);
425	break;
426	}
427
428	if (!MORE())
429	return;
430	c = PEEK();
431	/* we call { a repetition if followed by a digit */
432	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
433	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
434	return; /* no repetition, we're done */
435	NEXT();
436
437	(void)REQUIRE(!wascaret, REG_BADRPT);
438	switch (c) {
439	case '': / implemented as +? */
440	/* this case does not require the (y\|) trick, noKLUDGE */
441	INSERT(OPLUS_, pos);
442	ASTERN(O_PLUS, pos);
443	INSERT(OQUEST_, pos);
444	ASTERN(O_QUEST, pos);
445	break;
446	case '+':
447	INSERT(OPLUS_, pos);
448	ASTERN(O_PLUS, pos);
449	break;
450	case '?':
451	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
452	INSERT(OCH_, pos); /* offset slightly wrong */
453	ASTERN(OOR1, pos); /* this one's right */
454	AHEAD(pos); /* fix the OCH_ */
455	EMIT(OOR2, 0); /* offset very wrong... */
456	AHEAD(THERE()); /* ...so fix it */
457	ASTERN(O_CH, THERETHERE());
458	break;
459	case '{':
460	count = p_count(p);
461	if (EAT(',')) {
462	if (isdigit((uch)PEEK())) {
463	count2 = p_count(p);
464	(void)REQUIRE(count <= count2, REG_BADBR);
465	} else /* single number with comma */
466	count2 = INFINITY;
467	} else /* just a single number */
468	count2 = count;
469	repeat(p, pos, count, count2);
470	if (!EAT('}')) { /* error heuristics */
471	while (MORE() && PEEK() != '}')
472	NEXT();
473	(void)REQUIRE(MORE(), REG_EBRACE);
474	SETERROR(REG_BADBR);
475	}
476	break;
477	}
478
479	if (!MORE())
480	return;
481	c = PEEK();
482	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
483	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
484	return;
485	SETERROR(REG_BADRPT);
486	}
487
488	/*
489	- p_str - string (no metacharacters) "parser"
490	== static void p_str(struct parse *p);
491	*/
492	static void
493	p_str(p)
494	struct parse *p;
495	{
496	(void)REQUIRE(MORE(), REG_EMPTY);
497	while (MORE())
498	ordinary(p, WGETNEXT());
499	}
500
501	/*
502	- p_bre - BRE parser top level, anchoring and concatenation
503	== static void p_bre(struct parse *p, int end1, \
504	== int end2);
505	* Giving end1 as OUT essentially eliminates the end1/end2 check.
506	*
507	* This implementation is a bit of a kludge, in that a trailing $ is first
508	* taken as an ordinary character and then revised to be an anchor.
509	* The amount of lookahead needed to avoid this kludge is excessive.
510	*/
511	static void
512	p_bre(p, end1, end2)
513	struct parse *p;
514	int end1; /* first terminating character */
515	int end2; /* second terminating character */
516	{
517	sopno start = HERE();
518	int first = 1; /* first subexpression? */
519	int wasdollar = 0;
520
521	if (EAT('^')) {
522	EMIT(OBOL, 0);
523	p->g->iflags \|= USEBOL;
524	p->g->nbol++;
525	}
526	while (MORE() && !SEETWO(end1, end2)) {
527	wasdollar = p_simp_re(p, first);
528	first = 0;
529	}
530	if (wasdollar) { /* oops, that was a trailing anchor */
531	DROP(1);
532	EMIT(OEOL, 0);
533	p->g->iflags \|= USEEOL;
534	p->g->neol++;
535	}
536
537	(void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
538	}
539
540	/*
541	- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
542	== static int p_simp_re(struct parse *p, int starordinary);
543	*/
544	static int /* was the simple RE an unbackslashed $? */
545	p_simp_re(p, starordinary)
546	struct parse *p;
547	int starordinary; /* is a leading * an ordinary character? */
548	{
549	int c;
550	int count;
551	int count2;
552	sopno pos;
553	int i;
554	wint_t wc;
555	sopno subno;
556	# define BACKSL (1<<CHAR_BIT)
557
558	pos = HERE(); /* repetion op, if any, covers from here */
559
560	assert(MORE()); /* caller should have ensured this */
561	c = GETNEXT();
562	if (c == '\\') {
563	(void)REQUIRE(MORE(), REG_EESCAPE);
564	c = BACKSL \| GETNEXT();
565	}
566	switch (c) {
567	case '.':
568	if (p->g->cflags&REG_NEWLINE)
569	nonnewline(p);
570	else
571	EMIT(OANY, 0);
572	break;
573	case '[':
574	p_bracket(p);
575	break;
576	case BACKSL\|'{':
577	SETERROR(REG_BADRPT);
578	break;
579	case BACKSL\|'(':
580	p->g->nsub++;
581	subno = p->g->nsub;
582	if (subno < NPAREN)
583	p->pbegin[subno] = HERE();
584	EMIT(OLPAREN, subno);
585	/* the MORE here is an error heuristic */
586	if (MORE() && !SEETWO('\\', ')'))
587	p_bre(p, '\\', ')');
588	if (subno < NPAREN) {
589	p->pend[subno] = HERE();
590	assert(p->pend[subno] != 0);
591	}
592	EMIT(ORPAREN, subno);
593	(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
594	break;
595	case BACKSL\|')': /* should not get here -- must be user */
596	case BACKSL\|'}':
597	SETERROR(REG_EPAREN);
598	break;
599	case BACKSL\|'1':
600	case BACKSL\|'2':
601	case BACKSL\|'3':
602	case BACKSL\|'4':
603	case BACKSL\|'5':
604	case BACKSL\|'6':
605	case BACKSL\|'7':
606	case BACKSL\|'8':
607	case BACKSL\|'9':
608	i = (c&~BACKSL) - '0';
609	assert(i < NPAREN);
610	if (p->pend[i] != 0) {
611	assert(i <= p->g->nsub);
612	EMIT(OBACK_, i);
613	assert(p->pbegin[i] != 0);
614	assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
615	assert(OP(p->strip[p->pend[i]]) == ORPAREN);
616	(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
617	EMIT(O_BACK, i);
618	} else
619	SETERROR(REG_ESUBREG);
620	p->g->backrefs = 1;
621	break;
622	case '*':
623	(void)REQUIRE(starordinary, REG_BADRPT);
624	/* FALLTHROUGH */
625	default:
626	p->next--;
627	wc = WGETNEXT();
628	ordinary(p, wc);
629	break;
630	}
631
632	if (EAT('')) { / implemented as +? */
633	/* this case does not require the (y\|) trick, noKLUDGE */
634	INSERT(OPLUS_, pos);
635	ASTERN(O_PLUS, pos);
636	INSERT(OQUEST_, pos);
637	ASTERN(O_QUEST, pos);
638	} else if (EATTWO('\\', '{')) {
639	count = p_count(p);
640	if (EAT(',')) {
641	if (MORE() && isdigit((uch)PEEK())) {
642	count2 = p_count(p);
643	(void)REQUIRE(count <= count2, REG_BADBR);
644	} else /* single number with comma */
645	count2 = INFINITY;
646	} else /* just a single number */
647	count2 = count;
648	repeat(p, pos, count, count2);
649	if (!EATTWO('\\', '}')) { /* error heuristics */
650	while (MORE() && !SEETWO('\\', '}'))
651	NEXT();
652	(void)REQUIRE(MORE(), REG_EBRACE);
653	SETERROR(REG_BADBR);
654	}
655	} else if (c == '$') /* $ (but not \$) ends it */
656	return(1);
657
658	return(0);
659	}
660
661	/*
662	- p_count - parse a repetition count
663	== static int p_count(struct parse *p);
664	*/
665	static int /* the value */
666	p_count(p)
667	struct parse *p;
668	{
669	int count = 0;
670	int ndigits = 0;
671
672	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
673	count = count*10 + (GETNEXT() - '0');
674	ndigits++;
675	}
676
677	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
678	return(count);
679	}
680
681	/*
682	- p_bracket - parse a bracketed character list
683	== static void p_bracket(struct parse *p);
684	*/
685	static void
686	p_bracket(p)
687	struct parse *p;
688	{
689	cset *cs;
690	wint_t ch;
691
692	/* Dept of Truly Sickening Special-Case Kludges */
693	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
694	EMIT(OBOW, 0);
695	NEXTn(6);
696	return;
697	}
698	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
699	EMIT(OEOW, 0);
700	NEXTn(6);
701	return;
702	}
703
704	if ((cs = allocset(p)) == NULL)
705	return;
706
707	if (p->g->cflags&REG_ICASE)
708	cs->icase = 1;
709	if (EAT('^'))
710	cs->invert = 1;
711	if (EAT(']'))
712	CHadd(p, cs, ']');
713	else if (EAT('-'))
714	CHadd(p, cs, '-');
715	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
716	p_b_term(p, cs);
717	if (EAT('-'))
718	CHadd(p, cs, '-');
719	(void)MUSTEAT(']', REG_EBRACK);
720
721	if (p->error != 0) /* don't mess things up further */
722	return;
723
724	if (cs->invert && p->g->cflags&REG_NEWLINE)
725	cs->bmp['\n' >> 3] \|= 1 << ('\n' & 7);
726
727	if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
728	ordinary(p, ch);
729	freeset(p, cs);
730	} else
731	EMIT(OANYOF, (int)(cs - p->g->sets));
732	}
733
734	/*
735	- p_b_term - parse one term of a bracketed character list
736	== static void p_b_term(struct parse p, cset cs);
737	*/
738	static void
739	p_b_term(p, cs)
740	struct parse *p;
741	cset *cs;
742	{
743	char c;
744	wint_t start, finish;
745	wint_t i;
746
747	/* classify what we've got */
748	switch ((MORE()) ? PEEK() : '\0') {
749	case '[':
750	c = (MORE2()) ? PEEK2() : '\0';
751	break;
752	case '-':
753	SETERROR(REG_ERANGE);
754	return; /* NOTE RETURN */
755	break;
756	default:
757	c = '\0';
758	break;
759	}
760
761	switch (c) {
762	case ':': /* character class */
763	NEXT2();
764	(void)REQUIRE(MORE(), REG_EBRACK);
765	c = PEEK();
766	(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
767	p_b_cclass(p, cs);
768	(void)REQUIRE(MORE(), REG_EBRACK);
769	(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
770	break;
771	case '=': /* equivalence class */
772	NEXT2();
773	(void)REQUIRE(MORE(), REG_EBRACK);
774	c = PEEK();
775	(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
776	p_b_eclass(p, cs);
777	(void)REQUIRE(MORE(), REG_EBRACK);
778	(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
779	break;
780	default: /* symbol, ordinary character, or range */
781	start = p_b_symbol(p);
782	if (SEE('-') && MORE2() && PEEK2() != ']') {
783	/* range */
784	NEXT();
785	if (EAT('-'))
786	finish = '-';
787	else
788	finish = p_b_symbol(p);
789	} else
790	finish = start;
791	if (start == finish)
792	CHadd(p, cs, start);
793	else {
794	if (__collate_load_error) {
795	(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
796	CHaddrange(p, cs, start, finish);
797	} else {
798	(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
799	for (i = 0; i <= UCHAR_MAX; i++) {
800	if ( __collate_range_cmp(start, i) <= 0
801	&& __collate_range_cmp(i, finish) <= 0
802	)
803	CHadd(p, cs, i);
804	}
805	}
806	}
807	break;
808	}
809	}
810
811	/*
812	- p_b_cclass - parse a character-class name and deal with it
813	== static void p_b_cclass(struct parse p, cset cs);
814	*/
815	static void
816	p_b_cclass(p, cs)
817	struct parse *p;
818	cset *cs;
819	{
820	char *sp = p->next;
821	size_t len;
822	wctype_t wct;
823	char clname[16];
824
825	while (MORE() && isalpha((uch)PEEK()))
826	NEXT();
827	len = p->next - sp;
828	if (len >= sizeof(clname) - 1) {
829	SETERROR(REG_ECTYPE);
830	return;
831	}
832	memcpy(clname, sp, len);
833	clname[len] = '\0';
834	if ((wct = wctype(clname)) == 0) {
835	SETERROR(REG_ECTYPE);
836	return;
837	}
838	CHaddtype(p, cs, wct);
839	}
840
841	/*
842	- p_b_eclass - parse an equivalence-class name and deal with it
843	== static void p_b_eclass(struct parse p, cset cs);
844	*
845	* This implementation is incomplete. xxx
846	*/
847	static void
848	p_b_eclass(p, cs)
849	struct parse *p;
850	cset *cs;
851	{
852	wint_t c;
853
854	c = p_b_coll_elem(p, '=');
855	CHadd(p, cs, c);
856	}
857
858	/*
859	- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
860	== static char p_b_symbol(struct parse *p);
861	*/
862	static wint_t /* value of symbol */
863	p_b_symbol(p)
864	struct parse *p;
865	{
866	wint_t value;
867
868	(void)REQUIRE(MORE(), REG_EBRACK);
869	if (!EATTWO('[', '.'))
870	return(WGETNEXT());
871
872	/* collating symbol */
873	value = p_b_coll_elem(p, '.');
874	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
875	return(value);
876	}
877
878	/*
879	- p_b_coll_elem - parse a collating-element name and look it up
880	== static char p_b_coll_elem(struct parse *p, int endc);
881	*/
882	static wint_t /* value of collating element */
883	p_b_coll_elem(p, endc)
884	struct parse *p;
885	wint_t endc; /* name ended by endc,']' */
886	{
887	char *sp = p->next;
888	struct cname *cp;
889	int len;
890	mbstate_t mbs;
891	wchar_t wc;
892	size_t clen;
893
894	while (MORE() && !SEETWO(endc, ']'))
895	NEXT();
896	if (!MORE()) {
897	SETERROR(REG_EBRACK);
898	return(0);
899	}
900	len = p->next - sp;
901	for (cp = cnames; cp->name != NULL; cp++)
902	if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
903	return(cp->code); /* known name */
904	memset(&mbs, 0, sizeof(mbs));
905	if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
906	return (wc); /* single character */
907	else if (clen == (size_t)-1 \|\| clen == (size_t)-2)
908	SETERROR(REG_ILLSEQ);
909	else
910	SETERROR(REG_ECOLLATE); /* neither */
911	return(0);
912	}
913
914	/*
915	- othercase - return the case counterpart of an alphabetic
916	== static char othercase(int ch);
917	*/
918	static wint_t /* if no counterpart, return ch */
919	othercase(ch)
920	wint_t ch;
921	{
922	assert(iswalpha(ch));
923	if (iswupper(ch))
924	return(towlower(ch));
925	else if (iswlower(ch))
926	return(towupper(ch));
927	else /* peculiar, but could happen */
928	return(ch);
929	}
930
931	/*
932	- bothcases - emit a dualcase version of a two-case character
933	== static void bothcases(struct parse *p, int ch);
934	*
935	* Boy, is this implementation ever a kludge...
936	*/
937	static void
938	bothcases(p, ch)
939	struct parse *p;
940	wint_t ch;
941	{
942	char *oldnext = p->next;
943	char *oldend = p->end;
944	char bracket[3 + MB_LEN_MAX];
945	size_t n;
946	mbstate_t mbs;
947
948	assert(othercase(ch) != ch); /* p_bracket() would recurse */
949	p->next = bracket;
950	memset(&mbs, 0, sizeof(mbs));
951	n = wcrtomb(bracket, ch, &mbs);
952	assert(n != (size_t)-1);
953	bracket[n] = ']';
954	bracket[n + 1] = '\0';
955	p->end = bracket+n+1;
956	p_bracket(p);
957	assert(p->next == p->end);
958	p->next = oldnext;
959	p->end = oldend;
960	}
961
962	/*
963	- ordinary - emit an ordinary character
964	== static void ordinary(struct parse *p, int ch);
965	*/
966	static void
967	ordinary(p, ch)
968	struct parse *p;
969	wint_t ch;
970	{
971	cset *cs;
972
973	if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
974	bothcases(p, ch);
975	else if ((ch & OPDMASK) == ch)
976	EMIT(OCHAR, ch);
977	else {
978	/*
979	* Kludge: character is too big to fit into an OCHAR operand.
980	* Emit a singleton set.
981	*/
982	if ((cs = allocset(p)) == NULL)
983	return;
984	CHadd(p, cs, ch);
985	EMIT(OANYOF, (int)(cs - p->g->sets));
986	}
987	}
988
989	/*
990	- nonnewline - emit REG_NEWLINE version of OANY
991	== static void nonnewline(struct parse *p);
992	*
993	* Boy, is this implementation ever a kludge...
994	*/
995	static void
996	nonnewline(p)
997	struct parse *p;
998	{
999	char *oldnext = p->next;
1000	char *oldend = p->end;
1001	char bracket[4];
1002
1003	p->next = bracket;
1004	p->end = bracket+3;
1005	bracket[0] = '^';
1006	bracket[1] = '\n';
1007	bracket[2] = ']';
1008	bracket[3] = '\0';
1009	p_bracket(p);
1010	assert(p->next == bracket+3);
1011	p->next = oldnext;
1012	p->end = oldend;
1013	}
1014
1015	/*
1016	- repeat - generate code for a bounded repetition, recursively if needed
1017	== static void repeat(struct parse *p, sopno start, int from, int to);
1018	*/
1019	static void
1020	repeat(p, start, from, to)
1021	struct parse *p;
1022	sopno start; /* operand from here to end of strip */
1023	int from; /* repeated from this number */
1024	int to; /* to this number of times (maybe INFINITY) */
1025	{
1026	sopno finish = HERE();
1027	# define N 2
1028	# define INF 3
1029	# define REP(f, t) ((f)*8 + (t))
1030	# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1031	sopno copy;
1032
1033	if (p->error != 0) /* head off possible runaway recursion */
1034	return;
1035
1036	assert(from <= to);
1037
1038	switch (REP(MAP(from), MAP(to))) {
1039	case REP(0, 0): /* must be user doing this */
1040	DROP(finish-start); /* drop the operand */
1041	break;
1042	case REP(0, 1): /* as x{1,1}? */
1043	case REP(0, N): /* as x{1,n}? */
1044	case REP(0, INF): /* as x{1,}? */
1045	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
1046	INSERT(OCH_, start); /* offset is wrong... */
1047	repeat(p, start+1, 1, to);
1048	ASTERN(OOR1, start);
1049	AHEAD(start); /* ... fix it */
1050	EMIT(OOR2, 0);
1051	AHEAD(THERE());
1052	ASTERN(O_CH, THERETHERE());
1053	break;
1054	case REP(1, 1): /* trivial case */
1055	/* done */
1056	break;
1057	case REP(1, N): /* as x?x{1,n-1} */
1058	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
1059	INSERT(OCH_, start);
1060	ASTERN(OOR1, start);
1061	AHEAD(start);
1062	EMIT(OOR2, 0); /* offset very wrong... */
1063	AHEAD(THERE()); /* ...so fix it */
1064	ASTERN(O_CH, THERETHERE());
1065	copy = dupl(p, start+1, finish+1);
1066	assert(copy == finish+4);
1067	repeat(p, copy, 1, to-1);
1068	break;
1069	case REP(1, INF): /* as x+ */
1070	INSERT(OPLUS_, start);
1071	ASTERN(O_PLUS, start);
1072	break;
1073	case REP(N, N): /* as xx{m-1,n-1} */
1074	copy = dupl(p, start, finish);
1075	repeat(p, copy, from-1, to-1);
1076	break;
1077	case REP(N, INF): /* as xx{n-1,INF} */
1078	copy = dupl(p, start, finish);
1079	repeat(p, copy, from-1, to);
1080	break;
1081	default: /* "can't happen" */
1082	SETERROR(REG_ASSERT); /* just in case */
1083	break;
1084	}
1085	}
1086
1087	/*
1088	- wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1089	- character from the parse struct, signals a REG_ILLSEQ error if the
1090	- character can't be converted. Returns the number of bytes consumed.
1091	*/
1092	static wint_t
1093	wgetnext(p)
1094	struct parse *p;
1095	{
1096	mbstate_t mbs;
1097	wchar_t wc;
1098	size_t n;
1099
1100	memset(&mbs, 0, sizeof(mbs));
1101	n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
1102	if (n == (size_t)-1 \|\| n == (size_t)-2) {
1103	SETERROR(REG_ILLSEQ);
1104	return (0);
1105	}
1106	if (n == 0)
1107	n = 1;
1108	p->next += n;
1109	return (wc);
1110	}
1111
1112	/*
1113	- seterr - set an error condition
1114	== static int seterr(struct parse *p, int e);
1115	*/
1116	static int /* useless but makes type checking happy */
1117	seterr(p, e)
1118	struct parse *p;
1119	int e;
1120	{
1121	if (p->error == 0) /* keep earliest error condition */
1122	p->error = e;
1123	p->next = nuls; /* try to bring things to a halt */
1124	p->end = nuls;
1125	return(0); /* make the return value well-defined */
1126	}
1127
1128	/*
1129	- allocset - allocate a set of characters for []
1130	== static cset allocset(struct parse p);
1131	*/
1132	static cset *
1133	allocset(p)
1134	struct parse *p;
1135	{
1136	cset cs, ncs;
1137
1138	ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
1139	if (ncs == NULL) {
1140	SETERROR(REG_ESPACE);
1141	return (NULL);
1142	}
1143	p->g->sets = ncs;
1144	cs = &p->g->sets[p->g->ncsets++];
1145	memset(cs, 0, sizeof(*cs));
1146
1147	return(cs);
1148	}
1149
1150	/*
1151	- freeset - free a now-unused set
1152	== static void freeset(struct parse p, cset cs);
1153	*/
1154	static void
1155	freeset(p, cs)
1156	struct parse *p;
1157	cset *cs;
1158	{
1159	cset *top = &p->g->sets[p->g->ncsets];
1160
1161	free(cs->wides);
1162	free(cs->ranges);
1163	free(cs->types);
1164	memset(cs, 0, sizeof(*cs));
1165	if (cs == top-1) /* recover only the easy case */
1166	p->g->ncsets--;
1167	}
1168
1169	/*
1170	- singleton - Determine whether a set contains only one character,
1171	- returning it if so, otherwise returning OUT.
1172	*/
1173	static wint_t
1174	singleton(cs)
1175	cset *cs;
1176	{
1177	wint_t i, s, n;
1178
1179	for (i = n = 0; i < NC; i++)
1180	if (CHIN(cs, i)) {
1181	n++;
1182	s = i;
1183	}
1184	if (n == 1)
1185	return (s);
1186	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1187	cs->icase == 0)
1188	return (cs->wides[0]);
1189	/* Don't bother handling the other cases. */
1190	return (OUT);
1191	}
1192
1193	/*
1194	- CHadd - add character to character set.
1195	*/
1196	static void
1197	CHadd(p, cs, ch)
1198	struct parse *p;
1199	cset *cs;
1200	wint_t ch;
1201	{
1202	wint_t nch, *newwides;
1203	assert(ch >= 0);
1204	if (ch < NC)
1205	cs->bmp[ch >> 3] \|= 1 << (ch & 7);
1206	else {
1207	newwides = realloc(cs->wides, (cs->nwides + 1) *
1208	sizeof(*cs->wides));
1209	if (newwides == NULL) {
1210	SETERROR(REG_ESPACE);
1211	return;
1212	}
1213	cs->wides = newwides;
1214	cs->wides[cs->nwides++] = ch;
1215	}
1216	if (cs->icase) {
1217	if ((nch = towlower(ch)) < NC)
1218	cs->bmp[nch >> 3] \|= 1 << (nch & 7);
1219	if ((nch = towupper(ch)) < NC)
1220	cs->bmp[nch >> 3] \|= 1 << (nch & 7);
1221	}
1222	}
1223
1224	/*
1225	- CHaddrange - add all characters in the range [min,max] to a character set.
1226	*/
1227	static void
1228	CHaddrange(p, cs, min, max)
1229	struct parse *p;
1230	cset *cs;
1231	wint_t min, max;
1232	{
1233	crange *newranges;
1234
1235	for (; min < NC && min <= max; min++)
1236	CHadd(p, cs, min);
1237	if (min >= max)
1238	return;
1239	newranges = realloc(cs->ranges, (cs->nranges + 1) *
1240	sizeof(*cs->ranges));
1241	if (newranges == NULL) {
1242	SETERROR(REG_ESPACE);
1243	return;
1244	}
1245	cs->ranges = newranges;
1246	cs->ranges[cs->nranges].min = min;
1247	cs->ranges[cs->nranges].min = max;
1248	cs->nranges++;
1249	}
1250
1251	/*
1252	- CHaddtype - add all characters of a certain type to a character set.
1253	*/
1254	static void
1255	CHaddtype(p, cs, wct)
1256	struct parse *p;
1257	cset *cs;
1258	wctype_t wct;
1259	{
1260	wint_t i;
1261	wctype_t *newtypes;
1262
1263	for (i = 0; i < NC; i++)
1264	if (iswctype(i, wct))
1265	CHadd(p, cs, i);
1266	newtypes = realloc(cs->types, (cs->ntypes + 1) *
1267	sizeof(*cs->types));
1268	if (newtypes == NULL) {
1269	SETERROR(REG_ESPACE);
1270	return;
1271	}
1272	cs->types = newtypes;
1273	cs->types[cs->ntypes++] = wct;
1274	}
1275
1276	/*
1277	- dupl - emit a duplicate of a bunch of sops
1278	== static sopno dupl(struct parse *p, sopno start, sopno finish);
1279	*/
1280	static sopno /* start of duplicate */
1281	dupl(p, start, finish)
1282	struct parse *p;
1283	sopno start; /* from here */
1284	sopno finish; /* to this less one */
1285	{
1286	sopno ret = HERE();
1287	sopno len = finish - start;
1288
1289	assert(finish >= start);
1290	if (len == 0)
1291	return(ret);
1292	enlarge(p, p->ssize + len); /* this many unexpected additions */
1293	assert(p->ssize >= p->slen + len);
1294	(void) memcpy((char *)(p->strip + p->slen),
1295	(char )(p->strip + start), (size_t)lensizeof(sop));
1296	p->slen += len;
1297	return(ret);
1298	}
1299
1300	/*
1301	- doemit - emit a strip operator
1302	== static void doemit(struct parse *p, sop op, size_t opnd);
1303	*
1304	* It might seem better to implement this as a macro with a function as
1305	* hard-case backup, but it's just too big and messy unless there are
1306	* some changes to the data structures. Maybe later.
1307	*/
1308	static void
1309	doemit(p, op, opnd)
1310	struct parse *p;
1311	sop op;
1312	size_t opnd;
1313	{
1314	/* avoid making error situations worse */
1315	if (p->error != 0)
1316	return;
1317
1318	/* deal with oversize operands ("can't happen", more or less) */
1319	assert(opnd < 1<<OPSHIFT);
1320
1321	/* deal with undersized strip */
1322	if (p->slen >= p->ssize)
1323	enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1324	assert(p->slen < p->ssize);
1325
1326	/* finally, it's all reduced to the easy case */
1327	p->strip[p->slen++] = SOP(op, opnd);
1328	}
1329
1330	/*
1331	- doinsert - insert a sop into the strip
1332	== static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1333	*/
1334	static void
1335	doinsert(p, op, opnd, pos)
1336	struct parse *p;
1337	sop op;
1338	size_t opnd;
1339	sopno pos;
1340	{
1341	sopno sn;
1342	sop s;
1343	int i;
1344
1345	/* avoid making error situations worse */
1346	if (p->error != 0)
1347	return;
1348
1349	sn = HERE();
1350	EMIT(op, opnd); /* do checks, ensure space */
1351	assert(HERE() == sn+1);
1352	s = p->strip[sn];
1353
1354	/* adjust paren pointers */
1355	assert(pos > 0);
1356	for (i = 1; i < NPAREN; i++) {
1357	if (p->pbegin[i] >= pos) {
1358	p->pbegin[i]++;
1359	}
1360	if (p->pend[i] >= pos) {
1361	p->pend[i]++;
1362	}
1363	}
1364
1365	memmove((char )&p->strip[pos+1], (char )&p->strip[pos],
1366	(HERE()-pos-1)*sizeof(sop));
1367	p->strip[pos] = s;
1368	}
1369
1370	/*
1371	- dofwd - complete a forward reference
1372	== static void dofwd(struct parse *p, sopno pos, sop value);
1373	*/
1374	static void
1375	dofwd(p, pos, value)
1376	struct parse *p;
1377	sopno pos;
1378	sop value;
1379	{
1380	/* avoid making error situations worse */
1381	if (p->error != 0)
1382	return;
1383
1384	assert(value < 1<<OPSHIFT);
1385	p->strip[pos] = OP(p->strip[pos]) \| value;
1386	}
1387
1388	/*
1389	- enlarge - enlarge the strip
1390	== static void enlarge(struct parse *p, sopno size);
1391	*/
1392	static void
1393	enlarge(p, size)
1394	struct parse *p;
1395	sopno size;
1396	{
1397	sop *sp;
1398
1399	if (p->ssize >= size)
1400	return;
1401
1402	sp = (sop )realloc(p->strip, sizesizeof(sop));
1403	if (sp == NULL) {
1404	SETERROR(REG_ESPACE);
1405	return;
1406	}
1407	p->strip = sp;
1408	p->ssize = size;
1409	}
1410
1411	/*
1412	- stripsnug - compact the strip
1413	== static void stripsnug(struct parse p, struct re_guts g);
1414	*/
1415	static void
1416	stripsnug(p, g)
1417	struct parse *p;
1418	struct re_guts *g;
1419	{
1420	g->nstates = p->slen;
1421	g->strip = (sop )realloc((char )p->strip, p->slen * sizeof(sop));
1422	if (g->strip == NULL) {
1423	SETERROR(REG_ESPACE);
1424	g->strip = p->strip;
1425	}
1426	}
1427
1428	/*
1429	- findmust - fill in must and mlen with longest mandatory literal string
1430	== static void findmust(struct parse p, struct re_guts g);
1431	*
1432	* This algorithm could do fancy things like analyzing the operands of \|
1433	* for common subsequences. Someday. This code is simple and finds most
1434	* of the interesting cases.
1435	*
1436	* Note that must and mlen got initialized during setup.
1437	*/
1438	static void
1439	findmust(p, g)
1440	struct parse *p;
1441	struct re_guts *g;
1442	{
1443	sop *scan;
1444	sop *start;
1445	sop *newstart;
1446	sopno newlen;
1447	sop s;
1448	char *cp;
1449	int offset;
1450	char buf[MB_LEN_MAX];
1451	size_t clen;
1452	mbstate_t mbs;
1453
1454	/* avoid making error situations worse */
1455	if (p->error != 0)
1456	return;
1457
1458	/*
1459	* It's not generally safe to do a ``char'' substring search on
1460	* multibyte character strings, but it's safe for at least
1461	* UTF-8 (see RFC 3629).
1462	*/
1463	if (MB_CUR_MAX > 1 &&
1464	strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
1465	return;
1466
1467	/* find the longest OCHAR sequence in strip */
1468	newlen = 0;
1469	offset = 0;
1470	g->moffset = 0;
1471	scan = g->strip + 1;
1472	do {
1473	s = *scan++;
1474	switch (OP(s)) {
1475	case OCHAR: /* sequence member */
1476	if (newlen == 0) { /* new sequence */
1477	memset(&mbs, 0, sizeof(mbs));
1478	newstart = scan - 1;
1479	}
1480	clen = wcrtomb(buf, OPND(s), &mbs);
1481	if (clen == (size_t)-1)
1482	goto toohard;
1483	newlen += clen;
1484	break;
1485	case OPLUS_: /* things that don't break one */
1486	case OLPAREN:
1487	case ORPAREN:
1488	break;
1489	case OQUEST_: /* things that must be skipped */
1490	case OCH_:
1491	offset = altoffset(scan, offset);
1492	scan--;
1493	do {
1494	scan += OPND(s);
1495	s = *scan;
1496	/* assert() interferes w debug printouts */
1497	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1498	OP(s) != OOR2) {
1499	g->iflags \|= BAD;
1500	return;
1501	}
1502	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1503	/* FALLTHROUGH */
1504	case OBOW: /* things that break a sequence */
1505	case OEOW:
1506	case OBOL:
1507	case OEOL:
1508	case O_QUEST:
1509	case O_CH:
1510	case OEND:
1511	if (newlen > g->mlen) { /* ends one */
1512	start = newstart;
1513	g->mlen = newlen;
1514	if (offset > -1) {
1515	g->moffset += offset;
1516	offset = newlen;
1517	} else
1518	g->moffset = offset;
1519	} else {
1520	if (offset > -1)
1521	offset += newlen;
1522	}
1523	newlen = 0;
1524	break;
1525	case OANY:
1526	if (newlen > g->mlen) { /* ends one */
1527	start = newstart;
1528	g->mlen = newlen;
1529	if (offset > -1) {
1530	g->moffset += offset;
1531	offset = newlen;
1532	} else
1533	g->moffset = offset;
1534	} else {
1535	if (offset > -1)
1536	offset += newlen;
1537	}
1538	if (offset > -1)
1539	offset++;
1540	newlen = 0;
1541	break;
1542	case OANYOF: /* may or may not invalidate offset */
1543	/* First, everything as OANY */
1544	if (newlen > g->mlen) { /* ends one */
1545	start = newstart;
1546	g->mlen = newlen;
1547	if (offset > -1) {
1548	g->moffset += offset;
1549	offset = newlen;
1550	} else
1551	g->moffset = offset;
1552	} else {
1553	if (offset > -1)
1554	offset += newlen;
1555	}
1556	if (offset > -1)
1557	offset++;
1558	newlen = 0;
1559	break;
1560	toohard:
1561	default:
1562	/* Anything here makes it impossible or too hard
1563	* to calculate the offset -- so we give up;
1564	* save the last known good offset, in case the
1565	* must sequence doesn't occur later.
1566	*/
1567	if (newlen > g->mlen) { /* ends one */
1568	start = newstart;
1569	g->mlen = newlen;
1570	if (offset > -1)
1571	g->moffset += offset;
1572	else
1573	g->moffset = offset;
1574	}
1575	offset = -1;
1576	newlen = 0;
1577	break;
1578	}
1579	} while (OP(s) != OEND);
1580
1581	if (g->mlen == 0) { /* there isn't one */
1582	g->moffset = -1;
1583	return;
1584	}
1585
1586	/* turn it into a character string */
1587	g->must = malloc((size_t)g->mlen + 1);
1588	if (g->must == NULL) { /* argh; just forget it */
1589	g->mlen = 0;
1590	g->moffset = -1;
1591	return;
1592	}
1593	cp = g->must;
1594	scan = start;
1595	memset(&mbs, 0, sizeof(mbs));
1596	while (cp < g->must + g->mlen) {
1597	while (OP(s = *scan++) != OCHAR)
1598	continue;
1599	clen = wcrtomb(cp, OPND(s), &mbs);
1600	assert(clen != (size_t)-1);
1601	cp += clen;
1602	}
1603	assert(cp == g->must + g->mlen);
1604	cp++ = '\0'; / just on general principles */
1605	}
1606
1607	/*
1608	- altoffset - choose biggest offset among multiple choices
1609	== static int altoffset(sop *scan, int offset);
1610	*
1611	* Compute, recursively if necessary, the largest offset among multiple
1612	* re paths.
1613	*/
1614	static int
1615	altoffset(scan, offset)
1616	sop *scan;
1617	int offset;
1618	{
1619	int largest;
1620	int try;
1621	sop s;
1622
1623	/* If we gave up already on offsets, return */
1624	if (offset == -1)
1625	return -1;
1626
1627	largest = 0;
1628	try = 0;
1629	s = *scan++;
1630	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1631	switch (OP(s)) {
1632	case OOR1:
1633	if (try > largest)
1634	largest = try;
1635	try = 0;
1636	break;
1637	case OQUEST_:
1638	case OCH_:
1639	try = altoffset(scan, try);
1640	if (try == -1)
1641	return -1;
1642	scan--;
1643	do {
1644	scan += OPND(s);
1645	s = *scan;
1646	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1647	OP(s) != OOR2)
1648	return -1;
1649	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1650	/* We must skip to the next position, or we'll
1651	* leave altoffset() too early.
1652	*/
1653	scan++;
1654	break;
1655	case OANYOF:
1656	case OCHAR:
1657	case OANY:
1658	try++;
1659	case OBOW:
1660	case OEOW:
1661	case OLPAREN:
1662	case ORPAREN:
1663	case OOR2:
1664	break;
1665	default:
1666	try = -1;
1667	break;
1668	}
1669	if (try == -1)
1670	return -1;
1671	s = *scan++;
1672	}
1673
1674	if (try > largest)
1675	largest = try;
1676
1677	return largest+offset;
1678	}
1679
1680	/*
1681	- computejumps - compute char jumps for BM scan
1682	== static void computejumps(struct parse p, struct re_guts g);
1683	*
1684	* This algorithm assumes g->must exists and is has size greater than
1685	* zero. It's based on the algorithm found on Computer Algorithms by
1686	* Sara Baase.
1687	*
1688	* A char jump is the number of characters one needs to jump based on
1689	* the value of the character from the text that was mismatched.
1690	*/
1691	static void
1692	computejumps(p, g)
1693	struct parse *p;
1694	struct re_guts *g;
1695	{
1696	int ch;
1697	int mindex;
1698
1699	/* Avoid making errors worse */
1700	if (p->error != 0)
1701	return;
1702
1703	g->charjump = (int) malloc((NC + 1) sizeof(int));
1704	if (g->charjump == NULL) /* Not a fatal error */
1705	return;
1706	/* Adjust for signed chars, if necessary */
1707	g->charjump = &g->charjump[-(CHAR_MIN)];
1708
1709	/* If the character does not exist in the pattern, the jump
1710	* is equal to the number of characters in the pattern.
1711	*/
1712	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1713	g->charjump[ch] = g->mlen;
1714
1715	/* If the character does exist, compute the jump that would
1716	* take us to the last character in the pattern equal to it
1717	* (notice that we match right to left, so that last character
1718	* is the first one that would be matched).
1719	*/
1720	for (mindex = 0; mindex < g->mlen; mindex++)
1721	g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1722	}
1723
1724	/*
1725	- computematchjumps - compute match jumps for BM scan
1726	== static void computematchjumps(struct parse p, struct re_guts g);
1727	*
1728	* This algorithm assumes g->must exists and is has size greater than
1729	* zero. It's based on the algorithm found on Computer Algorithms by
1730	* Sara Baase.
1731	*
1732	* A match jump is the number of characters one needs to advance based
1733	* on the already-matched suffix.
1734	* Notice that all values here are minus (g->mlen-1), because of the way
1735	* the search algorithm works.
1736	*/
1737	static void
1738	computematchjumps(p, g)
1739	struct parse *p;
1740	struct re_guts *g;
1741	{
1742	int mindex; /* General "must" iterator */
1743	int suffix; /* Keeps track of matching suffix */
1744	int ssuffix; /* Keeps track of suffixes' suffix */
1745	int* pmatches; /* pmatches[k] points to the next i
1746	* such that i+1...mlen is a substring
1747	* of k+1...k+mlen-i-1
1748	*/
1749
1750	/* Avoid making errors worse */
1751	if (p->error != 0)
1752	return;
1753
1754	pmatches = (int) malloc(g->mlen sizeof(unsigned int));
1755	if (pmatches == NULL) {
1756	g->matchjump = NULL;
1757	return;
1758	}
1759
1760	g->matchjump = (int) malloc(g->mlen sizeof(unsigned int));
1761	if (g->matchjump == NULL) /* Not a fatal error */
1762	return;
1763
1764	/* Set maximum possible jump for each character in the pattern */
1765	for (mindex = 0; mindex < g->mlen; mindex++)
1766	g->matchjump[mindex] = 2*g->mlen - mindex - 1;
1767
1768	/* Compute pmatches[] */
1769	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
1770	mindex--, suffix--) {
1771	pmatches[mindex] = suffix;
1772
1773	/* If a mismatch is found, interrupting the substring,
1774	* compute the matchjump for that position. If no
1775	* mismatch is found, then a text substring mismatched
1776	* against the suffix will also mismatch against the
1777	* substring.
1778	*/
1779	while (suffix < g->mlen
1780	&& g->must[mindex] != g->must[suffix]) {
1781	g->matchjump[suffix] = MIN(g->matchjump[suffix],
1782	g->mlen - mindex - 1);
1783	suffix = pmatches[suffix];
1784	}
1785	}
1786
1787	/* Compute the matchjump up to the last substring found to jump
1788	* to the beginning of the largest must pattern prefix matching
1789	* it's own suffix.
1790	*/
1791	for (mindex = 0; mindex <= suffix; mindex++)
1792	g->matchjump[mindex] = MIN(g->matchjump[mindex],
1793	g->mlen + suffix - mindex);
1794
1795	ssuffix = pmatches[suffix];
1796	while (suffix < g->mlen) {
1797	while (suffix <= ssuffix && suffix < g->mlen) {
1798	g->matchjump[suffix] = MIN(g->matchjump[suffix],
1799	g->mlen + ssuffix - suffix);
1800	suffix++;
1801	}
1802	if (suffix < g->mlen)
1803	ssuffix = pmatches[ssuffix];
1804	}
1805
1806	free(pmatches);
1807	}
1808
1809	/*
1810	- pluscount - count + nesting
1811	== static sopno pluscount(struct parse p, struct re_guts g);
1812	*/
1813	static sopno /* nesting depth */
1814	pluscount(p, g)
1815	struct parse *p;
1816	struct re_guts *g;
1817	{
1818	sop *scan;
1819	sop s;
1820	sopno plusnest = 0;
1821	sopno maxnest = 0;
1822
1823	if (p->error != 0)
1824	return(0); /* there may not be an OEND */
1825
1826	scan = g->strip + 1;
1827	do {
1828	s = *scan++;
1829	switch (OP(s)) {
1830	case OPLUS_:
1831	plusnest++;
1832	break;
1833	case O_PLUS:
1834	if (plusnest > maxnest)
1835	maxnest = plusnest;
1836	plusnest--;
1837	break;
1838	}
1839	} while (OP(s) != OEND);
1840	if (plusnest != 0)
1841	g->iflags \|= BAD;
1842	return(maxnest);
1843	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: