Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: vendor/FreeBSD-libc/5.1/regex/regcomp.c

Visit:

Last change on this file was 991, checked in by (none), 22 years ago
This commit was manufactured by cvs2svn to create branch 'FREEBSD'.
Property cvs2svn:cvs-rev set to `1.1` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 48.0 KB

Line
1	/*-
2	* Copyright (c) 1992, 1993, 1994 Henry Spencer.
3	* Copyright (c) 1992, 1993, 1994
4	* The Regents of the University of California. All rights reserved.
5	*
6	* This code is derived from software contributed to Berkeley by
7	* Henry Spencer.
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions
11	* are met:
12	* 1. Redistributions of source code must retain the above copyright
13	* notice, this list of conditions and the following disclaimer.
14	* 2. Redistributions in binary form must reproduce the above copyright
15	* notice, this list of conditions and the following disclaimer in the
16	* documentation and/or other materials provided with the distribution.
17	* 3. All advertising materials mentioning features or use of this software
18	* must display the following acknowledgement:
19	* This product includes software developed by the University of
20	* California, Berkeley and its contributors.
21	* 4. Neither the name of the University nor the names of its contributors
22	* may be used to endorse or promote products derived from this software
23	* without specific prior written permission.
24	*
25	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35	* SUCH DAMAGE.
36	*
37	* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
38	*/
39
40	#if defined(LIBC_SCCS) && !defined(lint)
41	static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
42	#endif /* LIBC_SCCS and not lint */
43	#include <sys/cdefs.h>
44	__FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.30 2003/02/16 17:29:10 nectar Exp $");
45
46	#include <sys/types.h>
47	#include <stdio.h>
48	#include <string.h>
49	#include <ctype.h>
50	#include <limits.h>
51	#include <stdlib.h>
52	#include <regex.h>
53
54	#include "collate.h"
55
56	#include "utils.h"
57	#include "regex2.h"
58
59	#include "cclass.h"
60	#include "cname.h"
61
62	/*
63	* parse structure, passed up and down to avoid global variables and
64	* other clumsinesses
65	*/
66	struct parse {
67	char next; / next character in RE */
68	char end; / end of string (-> NUL normally) */
69	int error; /* has an error been seen? */
70	sop strip; / malloced strip */
71	sopno ssize; /* malloced strip size (allocated) */
72	sopno slen; /* malloced strip length (used) */
73	int ncsalloc; /* number of csets allocated */
74	struct re_guts *g;
75	# define NPAREN 10 /* we need to remember () 1-9 for back refs */
76	sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
77	sopno pend[NPAREN]; /* -> ) ([0] unused) */
78	};
79
80	/* ========= begin header generated by ./mkh ========= */
81	#ifdef __cplusplus
82	extern "C" {
83	#endif
84
85	/* === regcomp.c === */
86	static void p_ere(struct parse *p, int stop);
87	static void p_ere_exp(struct parse *p);
88	static void p_str(struct parse *p);
89	static void p_bre(struct parse *p, int end1, int end2);
90	static int p_simp_re(struct parse *p, int starordinary);
91	static int p_count(struct parse *p);
92	static void p_bracket(struct parse *p);
93	static void p_b_term(struct parse p, cset cs);
94	static void p_b_cclass(struct parse p, cset cs);
95	static void p_b_eclass(struct parse p, cset cs);
96	static char p_b_symbol(struct parse *p);
97	static char p_b_coll_elem(struct parse *p, int endc);
98	static char othercase(int ch);
99	static void bothcases(struct parse *p, int ch);
100	static void ordinary(struct parse *p, int ch);
101	static void nonnewline(struct parse *p);
102	static void repeat(struct parse *p, sopno start, int from, int to);
103	static int seterr(struct parse *p, int e);
104	static cset allocset(struct parse p);
105	static void freeset(struct parse p, cset cs);
106	static int freezeset(struct parse p, cset cs);
107	static int firstch(struct parse p, cset cs);
108	static int nch(struct parse p, cset cs);
109	static void mcadd(struct parse p, cset cs, char *cp) __unused;
110	#if used
111	static void mcsub(cset cs, char cp);
112	static int mcin(cset cs, char cp);
113	static char mcfind(cset cs, char *cp);
114	#endif
115	static void mcinvert(struct parse p, cset cs);
116	static void mccase(struct parse p, cset cs);
117	static int isinsets(struct re_guts *g, int c);
118	static int samesets(struct re_guts *g, int c1, int c2);
119	static void categorize(struct parse p, struct re_guts g);
120	static sopno dupl(struct parse *p, sopno start, sopno finish);
121	static void doemit(struct parse *p, sop op, size_t opnd);
122	static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
123	static void dofwd(struct parse *p, sopno pos, sop value);
124	static void enlarge(struct parse *p, sopno size);
125	static void stripsnug(struct parse p, struct re_guts g);
126	static void findmust(struct parse p, struct re_guts g);
127	static int altoffset(sop *scan, int offset, int mccs);
128	static void computejumps(struct parse p, struct re_guts g);
129	static void computematchjumps(struct parse p, struct re_guts g);
130	static sopno pluscount(struct parse p, struct re_guts g);
131
132	#ifdef __cplusplus
133	}
134	#endif
135	/* ========= end header generated by ./mkh ========= */
136
137	static char nuls[10]; /* place to point scanner in event of error */
138
139	/*
140	* macros for use with parse structure
141	* BEWARE: these know that the parse structure is named `p' !!!
142	*/
143	#define PEEK() (*p->next)
144	#define PEEK2() (*(p->next+1))
145	#define MORE() (p->next < p->end)
146	#define MORE2() (p->next+1 < p->end)
147	#define SEE(c) (MORE() && PEEK() == (c))
148	#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
149	#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
150	#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
151	#define NEXT() (p->next++)
152	#define NEXT2() (p->next += 2)
153	#define NEXTn(n) (p->next += (n))
154	#define GETNEXT() (*p->next++)
155	#define SETERROR(e) seterr(p, (e))
156	#define REQUIRE(co, e) ((co) \|\| SETERROR(e))
157	#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
158	#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
159	#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() \|\| PEEK() != (c), e))
160	#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
161	#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
162	#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
163	#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
164	#define HERE() (p->slen)
165	#define THERE() (p->slen - 1)
166	#define THERETHERE() (p->slen - 2)
167	#define DROP(n) (p->slen -= (n))
168
169	#ifndef NDEBUG
170	static int never = 0; /* for use in asserts; shuts lint up */
171	#else
172	#define never 0 /* some <assert.h>s have bugs too */
173	#endif
174
175	/* Macro used by computejump()/computematchjump() */
176	#define MIN(a,b) ((a)<(b)?(a):(b))
177
178	/*
179	- regcomp - interface for parser and compilation
180	= extern int regcomp(regex_t , const char , int);
181	= #define REG_BASIC 0000
182	= #define REG_EXTENDED 0001
183	= #define REG_ICASE 0002
184	= #define REG_NOSUB 0004
185	= #define REG_NEWLINE 0010
186	= #define REG_NOSPEC 0020
187	= #define REG_PEND 0040
188	= #define REG_DUMP 0200
189	*/
190	int /* 0 success, otherwise REG_something */
191	regcomp(preg, pattern, cflags)
192	regex_t * __restrict preg;
193	const char * __restrict pattern;
194	int cflags;
195	{
196	struct parse pa;
197	struct re_guts *g;
198	struct parse *p = &pa;
199	int i;
200	size_t len;
201	#ifdef REDEBUG
202	# define GOODFLAGS(f) (f)
203	#else
204	# define GOODFLAGS(f) ((f)&~REG_DUMP)
205	#endif
206
207	cflags = GOODFLAGS(cflags);
208	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
209	return(REG_INVARG);
210
211	if (cflags&REG_PEND) {
212	if (preg->re_endp < pattern)
213	return(REG_INVARG);
214	len = preg->re_endp - pattern;
215	} else
216	len = strlen((char *)pattern);
217
218	/* do the mallocs early so failure handling is easy */
219	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
220	(NC-1)*sizeof(cat_t));
221	if (g == NULL)
222	return(REG_ESPACE);
223	p->ssize = len/(size_t)2(size_t)3 + (size_t)1; / ugh */
224	p->strip = (sop )malloc(p->ssize sizeof(sop));
225	p->slen = 0;
226	if (p->strip == NULL) {
227	free((char *)g);
228	return(REG_ESPACE);
229	}
230
231	/* set things up */
232	p->g = g;
233	p->next = (char )pattern; / convenience; we do not modify it */
234	p->end = p->next + len;
235	p->error = 0;
236	p->ncsalloc = 0;
237	for (i = 0; i < NPAREN; i++) {
238	p->pbegin[i] = 0;
239	p->pend[i] = 0;
240	}
241	g->csetsize = NC;
242	g->sets = NULL;
243	g->setbits = NULL;
244	g->ncsets = 0;
245	g->cflags = cflags;
246	g->iflags = 0;
247	g->nbol = 0;
248	g->neol = 0;
249	g->must = NULL;
250	g->moffset = -1;
251	g->charjump = NULL;
252	g->matchjump = NULL;
253	g->mlen = 0;
254	g->nsub = 0;
255	g->ncategories = 1; /* category 0 is "everything else" */
256	g->categories = &g->catspace[-(CHAR_MIN)];
257	(void) memset((char )g->catspace, 0, NCsizeof(cat_t));
258	g->backrefs = 0;
259
260	/* do it */
261	EMIT(OEND, 0);
262	g->firststate = THERE();
263	if (cflags&REG_EXTENDED)
264	p_ere(p, OUT);
265	else if (cflags&REG_NOSPEC)
266	p_str(p);
267	else
268	p_bre(p, OUT, OUT);
269	EMIT(OEND, 0);
270	g->laststate = THERE();
271
272	/* tidy up loose ends and fill things in */
273	categorize(p, g);
274	stripsnug(p, g);
275	findmust(p, g);
276	/* only use Boyer-Moore algorithm if the pattern is bigger
277	* than three characters
278	*/
279	if(g->mlen > 3) {
280	computejumps(p, g);
281	computematchjumps(p, g);
282	if(g->matchjump == NULL && g->charjump != NULL) {
283	free(g->charjump);
284	g->charjump = NULL;
285	}
286	}
287	g->nplus = pluscount(p, g);
288	g->magic = MAGIC2;
289	preg->re_nsub = g->nsub;
290	preg->re_g = g;
291	preg->re_magic = MAGIC1;
292	#ifndef REDEBUG
293	/* not debugging, so can't rely on the assert() in regexec() */
294	if (g->iflags&BAD)
295	SETERROR(REG_ASSERT);
296	#endif
297
298	/* win or lose, we're done */
299	if (p->error != 0) /* lose */
300	regfree(preg);
301	return(p->error);
302	}
303
304	/*
305	- p_ere - ERE parser top level, concatenation and alternation
306	== static void p_ere(struct parse *p, int stop);
307	*/
308	static void
309	p_ere(p, stop)
310	struct parse *p;
311	int stop; /* character this ERE should end at */
312	{
313	char c;
314	sopno prevback;
315	sopno prevfwd;
316	sopno conc;
317	int first = 1; /* is this the first alternative? */
318
319	for (;;) {
320	/* do a bunch of concatenated expressions */
321	conc = HERE();
322	while (MORE() && (c = PEEK()) != '\|' && c != stop)
323	p_ere_exp(p);
324	(void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
325
326	if (!EAT('\|'))
327	break; /* NOTE BREAK OUT */
328
329	if (first) {
330	INSERT(OCH_, conc); /* offset is wrong */
331	prevfwd = conc;
332	prevback = conc;
333	first = 0;
334	}
335	ASTERN(OOR1, prevback);
336	prevback = THERE();
337	AHEAD(prevfwd); /* fix previous offset */
338	prevfwd = HERE();
339	EMIT(OOR2, 0); /* offset is very wrong */
340	}
341
342	if (!first) { /* tail-end fixups */
343	AHEAD(prevfwd);
344	ASTERN(O_CH, prevback);
345	}
346
347	assert(!MORE() \|\| SEE(stop));
348	}
349
350	/*
351	- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
352	== static void p_ere_exp(struct parse *p);
353	*/
354	static void
355	p_ere_exp(p)
356	struct parse *p;
357	{
358	char c;
359	sopno pos;
360	int count;
361	int count2;
362	sopno subno;
363	int wascaret = 0;
364
365	assert(MORE()); /* caller should have ensured this */
366	c = GETNEXT();
367
368	pos = HERE();
369	switch (c) {
370	case '(':
371	(void)REQUIRE(MORE(), REG_EPAREN);
372	p->g->nsub++;
373	subno = p->g->nsub;
374	if (subno < NPAREN)
375	p->pbegin[subno] = HERE();
376	EMIT(OLPAREN, subno);
377	if (!SEE(')'))
378	p_ere(p, ')');
379	if (subno < NPAREN) {
380	p->pend[subno] = HERE();
381	assert(p->pend[subno] != 0);
382	}
383	EMIT(ORPAREN, subno);
384	(void)MUSTEAT(')', REG_EPAREN);
385	break;
386	#ifndef POSIX_MISTAKE
387	case ')': /* happens only if no current unmatched ( */
388	/*
389	* You may ask, why the ifndef? Because I didn't notice
390	* this until slightly too late for 1003.2, and none of the
391	* other 1003.2 regular-expression reviewers noticed it at
392	* all. So an unmatched ) is legal POSIX, at least until
393	* we can get it fixed.
394	*/
395	SETERROR(REG_EPAREN);
396	break;
397	#endif
398	case '^':
399	EMIT(OBOL, 0);
400	p->g->iflags \|= USEBOL;
401	p->g->nbol++;
402	wascaret = 1;
403	break;
404	case '$':
405	EMIT(OEOL, 0);
406	p->g->iflags \|= USEEOL;
407	p->g->neol++;
408	break;
409	case '\|':
410	SETERROR(REG_EMPTY);
411	break;
412	case '*':
413	case '+':
414	case '?':
415	SETERROR(REG_BADRPT);
416	break;
417	case '.':
418	if (p->g->cflags&REG_NEWLINE)
419	nonnewline(p);
420	else
421	EMIT(OANY, 0);
422	break;
423	case '[':
424	p_bracket(p);
425	break;
426	case '\\':
427	(void)REQUIRE(MORE(), REG_EESCAPE);
428	c = GETNEXT();
429	ordinary(p, c);
430	break;
431	case '{': /* okay as ordinary except if digit follows */
432	(void)REQUIRE(!MORE() \|\| !isdigit((uch)PEEK()), REG_BADRPT);
433	/* FALLTHROUGH */
434	default:
435	ordinary(p, c);
436	break;
437	}
438
439	if (!MORE())
440	return;
441	c = PEEK();
442	/* we call { a repetition if followed by a digit */
443	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
444	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
445	return; /* no repetition, we're done */
446	NEXT();
447
448	(void)REQUIRE(!wascaret, REG_BADRPT);
449	switch (c) {
450	case '': / implemented as +? */
451	/* this case does not require the (y\|) trick, noKLUDGE */
452	INSERT(OPLUS_, pos);
453	ASTERN(O_PLUS, pos);
454	INSERT(OQUEST_, pos);
455	ASTERN(O_QUEST, pos);
456	break;
457	case '+':
458	INSERT(OPLUS_, pos);
459	ASTERN(O_PLUS, pos);
460	break;
461	case '?':
462	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
463	INSERT(OCH_, pos); /* offset slightly wrong */
464	ASTERN(OOR1, pos); /* this one's right */
465	AHEAD(pos); /* fix the OCH_ */
466	EMIT(OOR2, 0); /* offset very wrong... */
467	AHEAD(THERE()); /* ...so fix it */
468	ASTERN(O_CH, THERETHERE());
469	break;
470	case '{':
471	count = p_count(p);
472	if (EAT(',')) {
473	if (isdigit((uch)PEEK())) {
474	count2 = p_count(p);
475	(void)REQUIRE(count <= count2, REG_BADBR);
476	} else /* single number with comma */
477	count2 = INFINITY;
478	} else /* just a single number */
479	count2 = count;
480	repeat(p, pos, count, count2);
481	if (!EAT('}')) { /* error heuristics */
482	while (MORE() && PEEK() != '}')
483	NEXT();
484	(void)REQUIRE(MORE(), REG_EBRACE);
485	SETERROR(REG_BADBR);
486	}
487	break;
488	}
489
490	if (!MORE())
491	return;
492	c = PEEK();
493	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
494	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
495	return;
496	SETERROR(REG_BADRPT);
497	}
498
499	/*
500	- p_str - string (no metacharacters) "parser"
501	== static void p_str(struct parse *p);
502	*/
503	static void
504	p_str(p)
505	struct parse *p;
506	{
507	(void)REQUIRE(MORE(), REG_EMPTY);
508	while (MORE())
509	ordinary(p, GETNEXT());
510	}
511
512	/*
513	- p_bre - BRE parser top level, anchoring and concatenation
514	== static void p_bre(struct parse *p, int end1, \
515	== int end2);
516	* Giving end1 as OUT essentially eliminates the end1/end2 check.
517	*
518	* This implementation is a bit of a kludge, in that a trailing $ is first
519	* taken as an ordinary character and then revised to be an anchor. The
520	* only undesirable side effect is that '$' gets included as a character
521	* category in such cases. This is fairly harmless; not worth fixing.
522	* The amount of lookahead needed to avoid this kludge is excessive.
523	*/
524	static void
525	p_bre(p, end1, end2)
526	struct parse *p;
527	int end1; /* first terminating character */
528	int end2; /* second terminating character */
529	{
530	sopno start = HERE();
531	int first = 1; /* first subexpression? */
532	int wasdollar = 0;
533
534	if (EAT('^')) {
535	EMIT(OBOL, 0);
536	p->g->iflags \|= USEBOL;
537	p->g->nbol++;
538	}
539	while (MORE() && !SEETWO(end1, end2)) {
540	wasdollar = p_simp_re(p, first);
541	first = 0;
542	}
543	if (wasdollar) { /* oops, that was a trailing anchor */
544	DROP(1);
545	EMIT(OEOL, 0);
546	p->g->iflags \|= USEEOL;
547	p->g->neol++;
548	}
549
550	(void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
551	}
552
553	/*
554	- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
555	== static int p_simp_re(struct parse *p, int starordinary);
556	*/
557	static int /* was the simple RE an unbackslashed $? */
558	p_simp_re(p, starordinary)
559	struct parse *p;
560	int starordinary; /* is a leading * an ordinary character? */
561	{
562	int c;
563	int count;
564	int count2;
565	sopno pos;
566	int i;
567	sopno subno;
568	# define BACKSL (1<<CHAR_BIT)
569
570	pos = HERE(); /* repetion op, if any, covers from here */
571
572	assert(MORE()); /* caller should have ensured this */
573	c = GETNEXT();
574	if (c == '\\') {
575	(void)REQUIRE(MORE(), REG_EESCAPE);
576	c = BACKSL \| GETNEXT();
577	}
578	switch (c) {
579	case '.':
580	if (p->g->cflags&REG_NEWLINE)
581	nonnewline(p);
582	else
583	EMIT(OANY, 0);
584	break;
585	case '[':
586	p_bracket(p);
587	break;
588	case BACKSL\|'{':
589	SETERROR(REG_BADRPT);
590	break;
591	case BACKSL\|'(':
592	p->g->nsub++;
593	subno = p->g->nsub;
594	if (subno < NPAREN)
595	p->pbegin[subno] = HERE();
596	EMIT(OLPAREN, subno);
597	/* the MORE here is an error heuristic */
598	if (MORE() && !SEETWO('\\', ')'))
599	p_bre(p, '\\', ')');
600	if (subno < NPAREN) {
601	p->pend[subno] = HERE();
602	assert(p->pend[subno] != 0);
603	}
604	EMIT(ORPAREN, subno);
605	(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
606	break;
607	case BACKSL\|')': /* should not get here -- must be user */
608	case BACKSL\|'}':
609	SETERROR(REG_EPAREN);
610	break;
611	case BACKSL\|'1':
612	case BACKSL\|'2':
613	case BACKSL\|'3':
614	case BACKSL\|'4':
615	case BACKSL\|'5':
616	case BACKSL\|'6':
617	case BACKSL\|'7':
618	case BACKSL\|'8':
619	case BACKSL\|'9':
620	i = (c&~BACKSL) - '0';
621	assert(i < NPAREN);
622	if (p->pend[i] != 0) {
623	assert(i <= p->g->nsub);
624	EMIT(OBACK_, i);
625	assert(p->pbegin[i] != 0);
626	assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
627	assert(OP(p->strip[p->pend[i]]) == ORPAREN);
628	(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
629	EMIT(O_BACK, i);
630	} else
631	SETERROR(REG_ESUBREG);
632	p->g->backrefs = 1;
633	break;
634	case '*':
635	(void)REQUIRE(starordinary, REG_BADRPT);
636	/* FALLTHROUGH */
637	default:
638	ordinary(p, (char)c);
639	break;
640	}
641
642	if (EAT('')) { / implemented as +? */
643	/* this case does not require the (y\|) trick, noKLUDGE */
644	INSERT(OPLUS_, pos);
645	ASTERN(O_PLUS, pos);
646	INSERT(OQUEST_, pos);
647	ASTERN(O_QUEST, pos);
648	} else if (EATTWO('\\', '{')) {
649	count = p_count(p);
650	if (EAT(',')) {
651	if (MORE() && isdigit((uch)PEEK())) {
652	count2 = p_count(p);
653	(void)REQUIRE(count <= count2, REG_BADBR);
654	} else /* single number with comma */
655	count2 = INFINITY;
656	} else /* just a single number */
657	count2 = count;
658	repeat(p, pos, count, count2);
659	if (!EATTWO('\\', '}')) { /* error heuristics */
660	while (MORE() && !SEETWO('\\', '}'))
661	NEXT();
662	(void)REQUIRE(MORE(), REG_EBRACE);
663	SETERROR(REG_BADBR);
664	}
665	} else if (c == '$') /* $ (but not \$) ends it */
666	return(1);
667
668	return(0);
669	}
670
671	/*
672	- p_count - parse a repetition count
673	== static int p_count(struct parse *p);
674	*/
675	static int /* the value */
676	p_count(p)
677	struct parse *p;
678	{
679	int count = 0;
680	int ndigits = 0;
681
682	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
683	count = count*10 + (GETNEXT() - '0');
684	ndigits++;
685	}
686
687	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
688	return(count);
689	}
690
691	/*
692	- p_bracket - parse a bracketed character list
693	== static void p_bracket(struct parse *p);
694	*
695	* Note a significant property of this code: if the allocset() did SETERROR,
696	* no set operations are done.
697	*/
698	static void
699	p_bracket(p)
700	struct parse *p;
701	{
702	cset *cs = allocset(p);
703	int invert = 0;
704
705	/* Dept of Truly Sickening Special-Case Kludges */
706	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
707	EMIT(OBOW, 0);
708	NEXTn(6);
709	return;
710	}
711	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
712	EMIT(OEOW, 0);
713	NEXTn(6);
714	return;
715	}
716
717	if (EAT('^'))
718	invert++; /* make note to invert set at end */
719	if (EAT(']'))
720	CHadd(cs, ']');
721	else if (EAT('-'))
722	CHadd(cs, '-');
723	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
724	p_b_term(p, cs);
725	if (EAT('-'))
726	CHadd(cs, '-');
727	(void)MUSTEAT(']', REG_EBRACK);
728
729	if (p->error != 0) /* don't mess things up further */
730	return;
731
732	if (p->g->cflags&REG_ICASE) {
733	int i;
734	int ci;
735
736	for (i = p->g->csetsize - 1; i >= 0; i--)
737	if (CHIN(cs, i) && isalpha(i)) {
738	ci = othercase(i);
739	if (ci != i)
740	CHadd(cs, ci);
741	}
742	if (cs->multis != NULL)
743	mccase(p, cs);
744	}
745	if (invert) {
746	int i;
747
748	for (i = p->g->csetsize - 1; i >= 0; i--)
749	if (CHIN(cs, i))
750	CHsub(cs, i);
751	else
752	CHadd(cs, i);
753	if (p->g->cflags&REG_NEWLINE)
754	CHsub(cs, '\n');
755	if (cs->multis != NULL)
756	mcinvert(p, cs);
757	}
758
759	assert(cs->multis == NULL); /* xxx */
760
761	if (nch(p, cs) == 1) { /* optimize singleton sets */
762	ordinary(p, firstch(p, cs));
763	freeset(p, cs);
764	} else
765	EMIT(OANYOF, freezeset(p, cs));
766	}
767
768	/*
769	- p_b_term - parse one term of a bracketed character list
770	== static void p_b_term(struct parse p, cset cs);
771	*/
772	static void
773	p_b_term(p, cs)
774	struct parse *p;
775	cset *cs;
776	{
777	char c;
778	char start, finish;
779	int i;
780
781	/* classify what we've got */
782	switch ((MORE()) ? PEEK() : '\0') {
783	case '[':
784	c = (MORE2()) ? PEEK2() : '\0';
785	break;
786	case '-':
787	SETERROR(REG_ERANGE);
788	return; /* NOTE RETURN */
789	break;
790	default:
791	c = '\0';
792	break;
793	}
794
795	switch (c) {
796	case ':': /* character class */
797	NEXT2();
798	(void)REQUIRE(MORE(), REG_EBRACK);
799	c = PEEK();
800	(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
801	p_b_cclass(p, cs);
802	(void)REQUIRE(MORE(), REG_EBRACK);
803	(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
804	break;
805	case '=': /* equivalence class */
806	NEXT2();
807	(void)REQUIRE(MORE(), REG_EBRACK);
808	c = PEEK();
809	(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
810	p_b_eclass(p, cs);
811	(void)REQUIRE(MORE(), REG_EBRACK);
812	(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
813	break;
814	default: /* symbol, ordinary character, or range */
815	/* xxx revision needed for multichar stuff */
816	start = p_b_symbol(p);
817	if (SEE('-') && MORE2() && PEEK2() != ']') {
818	/* range */
819	NEXT();
820	if (EAT('-'))
821	finish = '-';
822	else
823	finish = p_b_symbol(p);
824	} else
825	finish = start;
826	if (start == finish)
827	CHadd(cs, start);
828	else {
829	if (__collate_load_error) {
830	(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
831	for (i = (uch)start; i <= (uch)finish; i++)
832	CHadd(cs, i);
833	} else {
834	(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
835	for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
836	if ( __collate_range_cmp(start, i) <= 0
837	&& __collate_range_cmp(i, finish) <= 0
838	)
839	CHadd(cs, i);
840	}
841	}
842	}
843	break;
844	}
845	}
846
847	/*
848	- p_b_cclass - parse a character-class name and deal with it
849	== static void p_b_cclass(struct parse p, cset cs);
850	*/
851	static void
852	p_b_cclass(p, cs)
853	struct parse *p;
854	cset *cs;
855	{
856	int c;
857	char *sp = p->next;
858	struct cclass *cp;
859	size_t len;
860
861	while (MORE() && isalpha((uch)PEEK()))
862	NEXT();
863	len = p->next - sp;
864	for (cp = cclasses; cp->name != NULL; cp++)
865	if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
866	break;
867	if (cp->name == NULL) {
868	/* oops, didn't find it */
869	SETERROR(REG_ECTYPE);
870	return;
871	}
872
873	switch (cp->fidx) {
874	case CALNUM:
875	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
876	if (isalnum((uch)c))
877	CHadd(cs, c);
878	break;
879	case CALPHA:
880	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
881	if (isalpha((uch)c))
882	CHadd(cs, c);
883	break;
884	case CBLANK:
885	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
886	if (isblank((uch)c))
887	CHadd(cs, c);
888	break;
889	case CCNTRL:
890	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
891	if (iscntrl((uch)c))
892	CHadd(cs, c);
893	break;
894	case CDIGIT:
895	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
896	if (isdigit((uch)c))
897	CHadd(cs, c);
898	break;
899	case CGRAPH:
900	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
901	if (isgraph((uch)c))
902	CHadd(cs, c);
903	break;
904	case CLOWER:
905	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
906	if (islower((uch)c))
907	CHadd(cs, c);
908	break;
909	case CPRINT:
910	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
911	if (isprint((uch)c))
912	CHadd(cs, c);
913	break;
914	case CPUNCT:
915	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
916	if (ispunct((uch)c))
917	CHadd(cs, c);
918	break;
919	case CSPACE:
920	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
921	if (isspace((uch)c))
922	CHadd(cs, c);
923	break;
924	case CUPPER:
925	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
926	if (isupper((uch)c))
927	CHadd(cs, c);
928	break;
929	case CXDIGIT:
930	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
931	if (isxdigit((uch)c))
932	CHadd(cs, c);
933	break;
934	}
935	#if 0
936	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
937	MCadd(p, cs, u);
938	#endif
939	}
940
941	/*
942	- p_b_eclass - parse an equivalence-class name and deal with it
943	== static void p_b_eclass(struct parse p, cset cs);
944	*
945	* This implementation is incomplete. xxx
946	*/
947	static void
948	p_b_eclass(p, cs)
949	struct parse *p;
950	cset *cs;
951	{
952	char c;
953
954	c = p_b_coll_elem(p, '=');
955	CHadd(cs, c);
956	}
957
958	/*
959	- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
960	== static char p_b_symbol(struct parse *p);
961	*/
962	static char /* value of symbol */
963	p_b_symbol(p)
964	struct parse *p;
965	{
966	char value;
967
968	(void)REQUIRE(MORE(), REG_EBRACK);
969	if (!EATTWO('[', '.'))
970	return(GETNEXT());
971
972	/* collating symbol */
973	value = p_b_coll_elem(p, '.');
974	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
975	return(value);
976	}
977
978	/*
979	- p_b_coll_elem - parse a collating-element name and look it up
980	== static char p_b_coll_elem(struct parse *p, int endc);
981	*/
982	static char /* value of collating element */
983	p_b_coll_elem(p, endc)
984	struct parse *p;
985	int endc; /* name ended by endc,']' */
986	{
987	char *sp = p->next;
988	struct cname *cp;
989	int len;
990
991	while (MORE() && !SEETWO(endc, ']'))
992	NEXT();
993	if (!MORE()) {
994	SETERROR(REG_EBRACK);
995	return(0);
996	}
997	len = p->next - sp;
998	for (cp = cnames; cp->name != NULL; cp++)
999	if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1000	return(cp->code); /* known name */
1001	if (len == 1)
1002	return(sp); / single character */
1003	SETERROR(REG_ECOLLATE); /* neither */
1004	return(0);
1005	}
1006
1007	/*
1008	- othercase - return the case counterpart of an alphabetic
1009	== static char othercase(int ch);
1010	*/
1011	static char /* if no counterpart, return ch */
1012	othercase(ch)
1013	int ch;
1014	{
1015	ch = (uch)ch;
1016	assert(isalpha(ch));
1017	if (isupper(ch))
1018	return(tolower(ch));
1019	else if (islower(ch))
1020	return(toupper(ch));
1021	else /* peculiar, but could happen */
1022	return(ch);
1023	}
1024
1025	/*
1026	- bothcases - emit a dualcase version of a two-case character
1027	== static void bothcases(struct parse *p, int ch);
1028	*
1029	* Boy, is this implementation ever a kludge...
1030	*/
1031	static void
1032	bothcases(p, ch)
1033	struct parse *p;
1034	int ch;
1035	{
1036	char *oldnext = p->next;
1037	char *oldend = p->end;
1038	char bracket[3];
1039
1040	ch = (uch)ch;
1041	assert(othercase(ch) != ch); /* p_bracket() would recurse */
1042	p->next = bracket;
1043	p->end = bracket+2;
1044	bracket[0] = ch;
1045	bracket[1] = ']';
1046	bracket[2] = '\0';
1047	p_bracket(p);
1048	assert(p->next == bracket+2);
1049	p->next = oldnext;
1050	p->end = oldend;
1051	}
1052
1053	/*
1054	- ordinary - emit an ordinary character
1055	== static void ordinary(struct parse *p, int ch);
1056	*/
1057	static void
1058	ordinary(p, ch)
1059	struct parse *p;
1060	int ch;
1061	{
1062	cat_t *cap = p->g->categories;
1063
1064	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1065	bothcases(p, ch);
1066	else {
1067	EMIT(OCHAR, (uch)ch);
1068	if (cap[ch] == 0)
1069	cap[ch] = p->g->ncategories++;
1070	}
1071	}
1072
1073	/*
1074	- nonnewline - emit REG_NEWLINE version of OANY
1075	== static void nonnewline(struct parse *p);
1076	*
1077	* Boy, is this implementation ever a kludge...
1078	*/
1079	static void
1080	nonnewline(p)
1081	struct parse *p;
1082	{
1083	char *oldnext = p->next;
1084	char *oldend = p->end;
1085	char bracket[4];
1086
1087	p->next = bracket;
1088	p->end = bracket+3;
1089	bracket[0] = '^';
1090	bracket[1] = '\n';
1091	bracket[2] = ']';
1092	bracket[3] = '\0';
1093	p_bracket(p);
1094	assert(p->next == bracket+3);
1095	p->next = oldnext;
1096	p->end = oldend;
1097	}
1098
1099	/*
1100	- repeat - generate code for a bounded repetition, recursively if needed
1101	== static void repeat(struct parse *p, sopno start, int from, int to);
1102	*/
1103	static void
1104	repeat(p, start, from, to)
1105	struct parse *p;
1106	sopno start; /* operand from here to end of strip */
1107	int from; /* repeated from this number */
1108	int to; /* to this number of times (maybe INFINITY) */
1109	{
1110	sopno finish = HERE();
1111	# define N 2
1112	# define INF 3
1113	# define REP(f, t) ((f)*8 + (t))
1114	# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1115	sopno copy;
1116
1117	if (p->error != 0) /* head off possible runaway recursion */
1118	return;
1119
1120	assert(from <= to);
1121
1122	switch (REP(MAP(from), MAP(to))) {
1123	case REP(0, 0): /* must be user doing this */
1124	DROP(finish-start); /* drop the operand */
1125	break;
1126	case REP(0, 1): /* as x{1,1}? */
1127	case REP(0, N): /* as x{1,n}? */
1128	case REP(0, INF): /* as x{1,}? */
1129	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
1130	INSERT(OCH_, start); /* offset is wrong... */
1131	repeat(p, start+1, 1, to);
1132	ASTERN(OOR1, start);
1133	AHEAD(start); /* ... fix it */
1134	EMIT(OOR2, 0);
1135	AHEAD(THERE());
1136	ASTERN(O_CH, THERETHERE());
1137	break;
1138	case REP(1, 1): /* trivial case */
1139	/* done */
1140	break;
1141	case REP(1, N): /* as x?x{1,n-1} */
1142	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
1143	INSERT(OCH_, start);
1144	ASTERN(OOR1, start);
1145	AHEAD(start);
1146	EMIT(OOR2, 0); /* offset very wrong... */
1147	AHEAD(THERE()); /* ...so fix it */
1148	ASTERN(O_CH, THERETHERE());
1149	copy = dupl(p, start+1, finish+1);
1150	assert(copy == finish+4);
1151	repeat(p, copy, 1, to-1);
1152	break;
1153	case REP(1, INF): /* as x+ */
1154	INSERT(OPLUS_, start);
1155	ASTERN(O_PLUS, start);
1156	break;
1157	case REP(N, N): /* as xx{m-1,n-1} */
1158	copy = dupl(p, start, finish);
1159	repeat(p, copy, from-1, to-1);
1160	break;
1161	case REP(N, INF): /* as xx{n-1,INF} */
1162	copy = dupl(p, start, finish);
1163	repeat(p, copy, from-1, to);
1164	break;
1165	default: /* "can't happen" */
1166	SETERROR(REG_ASSERT); /* just in case */
1167	break;
1168	}
1169	}
1170
1171	/*
1172	- seterr - set an error condition
1173	== static int seterr(struct parse *p, int e);
1174	*/
1175	static int /* useless but makes type checking happy */
1176	seterr(p, e)
1177	struct parse *p;
1178	int e;
1179	{
1180	if (p->error == 0) /* keep earliest error condition */
1181	p->error = e;
1182	p->next = nuls; /* try to bring things to a halt */
1183	p->end = nuls;
1184	return(0); /* make the return value well-defined */
1185	}
1186
1187	/*
1188	- allocset - allocate a set of characters for []
1189	== static cset allocset(struct parse p);
1190	*/
1191	static cset *
1192	allocset(p)
1193	struct parse *p;
1194	{
1195	int no = p->g->ncsets++;
1196	size_t nc;
1197	size_t nbytes;
1198	cset *cs;
1199	size_t css = (size_t)p->g->csetsize;
1200	int i;
1201
1202	if (no >= p->ncsalloc) { /* need another column of space */
1203	p->ncsalloc += CHAR_BIT;
1204	nc = p->ncsalloc;
1205	assert(nc % CHAR_BIT == 0);
1206	nbytes = nc / CHAR_BIT * css;
1207	if (p->g->sets == NULL)
1208	p->g->sets = (cset )malloc(nc sizeof(cset));
1209	else
1210	p->g->sets = (cset )reallocf((char )p->g->sets,
1211	nc * sizeof(cset));
1212	if (p->g->setbits == NULL)
1213	p->g->setbits = (uch *)malloc(nbytes);
1214	else {
1215	p->g->setbits = (uch )reallocf((char )p->g->setbits,
1216	nbytes);
1217	/* xxx this isn't right if setbits is now NULL */
1218	for (i = 0; i < no; i++)
1219	p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1220	}
1221	if (p->g->sets != NULL && p->g->setbits != NULL)
1222	(void) memset((char *)p->g->setbits + (nbytes - css),
1223	0, css);
1224	else {
1225	no = 0;
1226	SETERROR(REG_ESPACE);
1227	/* caller's responsibility not to do set ops */
1228	}
1229	}
1230
1231	assert(p->g->sets != NULL); /* xxx */
1232	cs = &p->g->sets[no];
1233	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1234	cs->mask = 1 << ((no) % CHAR_BIT);
1235	cs->hash = 0;
1236	cs->smultis = 0;
1237	cs->multis = NULL;
1238
1239	return(cs);
1240	}
1241
1242	/*
1243	- freeset - free a now-unused set
1244	== static void freeset(struct parse p, cset cs);
1245	*/
1246	static void
1247	freeset(p, cs)
1248	struct parse *p;
1249	cset *cs;
1250	{
1251	int i;
1252	cset *top = &p->g->sets[p->g->ncsets];
1253	size_t css = (size_t)p->g->csetsize;
1254
1255	for (i = 0; i < css; i++)
1256	CHsub(cs, i);
1257	if (cs == top-1) /* recover only the easy case */
1258	p->g->ncsets--;
1259	}
1260
1261	/*
1262	- freezeset - final processing on a set of characters
1263	== static int freezeset(struct parse p, cset cs);
1264	*
1265	* The main task here is merging identical sets. This is usually a waste
1266	* of time (although the hash code minimizes the overhead), but can win
1267	* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1268	* is done using addition rather than xor -- all ASCII [aA] sets xor to
1269	* the same value!
1270	*/
1271	static int /* set number */
1272	freezeset(p, cs)
1273	struct parse *p;
1274	cset *cs;
1275	{
1276	short h = cs->hash;
1277	int i;
1278	cset *top = &p->g->sets[p->g->ncsets];
1279	cset *cs2;
1280	size_t css = (size_t)p->g->csetsize;
1281
1282	/* look for an earlier one which is the same */
1283	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1284	if (cs2->hash == h && cs2 != cs) {
1285	/* maybe */
1286	for (i = 0; i < css; i++)
1287	if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1288	break; /* no */
1289	if (i == css)
1290	break; /* yes */
1291	}
1292
1293	if (cs2 < top) { /* found one */
1294	freeset(p, cs);
1295	cs = cs2;
1296	}
1297
1298	return((int)(cs - p->g->sets));
1299	}
1300
1301	/*
1302	- firstch - return first character in a set (which must have at least one)
1303	== static int firstch(struct parse p, cset cs);
1304	*/
1305	static int /* character; there is no "none" value */
1306	firstch(p, cs)
1307	struct parse *p;
1308	cset *cs;
1309	{
1310	int i;
1311	size_t css = (size_t)p->g->csetsize;
1312
1313	for (i = 0; i < css; i++)
1314	if (CHIN(cs, i))
1315	return((char)i);
1316	assert(never);
1317	return(0); /* arbitrary */
1318	}
1319
1320	/*
1321	- nch - number of characters in a set
1322	== static int nch(struct parse p, cset cs);
1323	*/
1324	static int
1325	nch(p, cs)
1326	struct parse *p;
1327	cset *cs;
1328	{
1329	int i;
1330	size_t css = (size_t)p->g->csetsize;
1331	int n = 0;
1332
1333	for (i = 0; i < css; i++)
1334	if (CHIN(cs, i))
1335	n++;
1336	return(n);
1337	}
1338
1339	/*
1340	- mcadd - add a collating element to a cset
1341	== static void mcadd(struct parse p, cset cs, \
1342	== char *cp);
1343	*/
1344	static void
1345	mcadd(p, cs, cp)
1346	struct parse *p;
1347	cset *cs;
1348	char *cp;
1349	{
1350	size_t oldend = cs->smultis;
1351
1352	cs->smultis += strlen(cp) + 1;
1353	if (cs->multis == NULL)
1354	cs->multis = malloc(cs->smultis);
1355	else
1356	cs->multis = reallocf(cs->multis, cs->smultis);
1357	if (cs->multis == NULL) {
1358	SETERROR(REG_ESPACE);
1359	return;
1360	}
1361
1362	(void) strcpy(cs->multis + oldend - 1, cp);
1363	cs->multis[cs->smultis - 1] = '\0';
1364	}
1365
1366	#if used
1367	/*
1368	- mcsub - subtract a collating element from a cset
1369	== static void mcsub(cset cs, char cp);
1370	*/
1371	static void
1372	mcsub(cs, cp)
1373	cset *cs;
1374	char *cp;
1375	{
1376	char *fp = mcfind(cs, cp);
1377	size_t len = strlen(fp);
1378
1379	assert(fp != NULL);
1380	(void) memmove(fp, fp + len + 1,
1381	cs->smultis - (fp + len + 1 - cs->multis));
1382	cs->smultis -= len;
1383
1384	if (cs->smultis == 0) {
1385	free(cs->multis);
1386	cs->multis = NULL;
1387	return;
1388	}
1389
1390	cs->multis = reallocf(cs->multis, cs->smultis);
1391	assert(cs->multis != NULL);
1392	}
1393
1394	/*
1395	- mcin - is a collating element in a cset?
1396	== static int mcin(cset cs, char cp);
1397	*/
1398	static int
1399	mcin(cs, cp)
1400	cset *cs;
1401	char *cp;
1402	{
1403	return(mcfind(cs, cp) != NULL);
1404	}
1405
1406	/*
1407	- mcfind - find a collating element in a cset
1408	== static char mcfind(cset cs, char *cp);
1409	*/
1410	static char *
1411	mcfind(cs, cp)
1412	cset *cs;
1413	char *cp;
1414	{
1415	char *p;
1416
1417	if (cs->multis == NULL)
1418	return(NULL);
1419	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1420	if (strcmp(cp, p) == 0)
1421	return(p);
1422	return(NULL);
1423	}
1424	#endif
1425
1426	/*
1427	- mcinvert - invert the list of collating elements in a cset
1428	== static void mcinvert(struct parse p, cset cs);
1429	*
1430	* This would have to know the set of possibilities. Implementation
1431	* is deferred.
1432	*/
1433	static void
1434	mcinvert(p, cs)
1435	struct parse *p;
1436	cset *cs;
1437	{
1438	assert(cs->multis == NULL); /* xxx */
1439	}
1440
1441	/*
1442	- mccase - add case counterparts of the list of collating elements in a cset
1443	== static void mccase(struct parse p, cset cs);
1444	*
1445	* This would have to know the set of possibilities. Implementation
1446	* is deferred.
1447	*/
1448	static void
1449	mccase(p, cs)
1450	struct parse *p;
1451	cset *cs;
1452	{
1453	assert(cs->multis == NULL); /* xxx */
1454	}
1455
1456	/*
1457	- isinsets - is this character in any sets?
1458	== static int isinsets(struct re_guts *g, int c);
1459	*/
1460	static int /* predicate */
1461	isinsets(g, c)
1462	struct re_guts *g;
1463	int c;
1464	{
1465	uch *col;
1466	int i;
1467	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1468	unsigned uc = (uch)c;
1469
1470	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1471	if (col[uc] != 0)
1472	return(1);
1473	return(0);
1474	}
1475
1476	/*
1477	- samesets - are these two characters in exactly the same sets?
1478	== static int samesets(struct re_guts *g, int c1, int c2);
1479	*/
1480	static int /* predicate */
1481	samesets(g, c1, c2)
1482	struct re_guts *g;
1483	int c1;
1484	int c2;
1485	{
1486	uch *col;
1487	int i;
1488	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1489	unsigned uc1 = (uch)c1;
1490	unsigned uc2 = (uch)c2;
1491
1492	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1493	if (col[uc1] != col[uc2])
1494	return(0);
1495	return(1);
1496	}
1497
1498	/*
1499	- categorize - sort out character categories
1500	== static void categorize(struct parse p, struct re_guts g);
1501	*/
1502	static void
1503	categorize(p, g)
1504	struct parse *p;
1505	struct re_guts *g;
1506	{
1507	cat_t *cats = g->categories;
1508	int c;
1509	int c2;
1510	cat_t cat;
1511
1512	/* avoid making error situations worse */
1513	if (p->error != 0)
1514	return;
1515
1516	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1517	if (cats[c] == 0 && isinsets(g, c)) {
1518	cat = g->ncategories++;
1519	cats[c] = cat;
1520	for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1521	if (cats[c2] == 0 && samesets(g, c, c2))
1522	cats[c2] = cat;
1523	}
1524	}
1525
1526	/*
1527	- dupl - emit a duplicate of a bunch of sops
1528	== static sopno dupl(struct parse *p, sopno start, sopno finish);
1529	*/
1530	static sopno /* start of duplicate */
1531	dupl(p, start, finish)
1532	struct parse *p;
1533	sopno start; /* from here */
1534	sopno finish; /* to this less one */
1535	{
1536	sopno ret = HERE();
1537	sopno len = finish - start;
1538
1539	assert(finish >= start);
1540	if (len == 0)
1541	return(ret);
1542	enlarge(p, p->ssize + len); /* this many unexpected additions */
1543	assert(p->ssize >= p->slen + len);
1544	(void) memcpy((char *)(p->strip + p->slen),
1545	(char )(p->strip + start), (size_t)lensizeof(sop));
1546	p->slen += len;
1547	return(ret);
1548	}
1549
1550	/*
1551	- doemit - emit a strip operator
1552	== static void doemit(struct parse *p, sop op, size_t opnd);
1553	*
1554	* It might seem better to implement this as a macro with a function as
1555	* hard-case backup, but it's just too big and messy unless there are
1556	* some changes to the data structures. Maybe later.
1557	*/
1558	static void
1559	doemit(p, op, opnd)
1560	struct parse *p;
1561	sop op;
1562	size_t opnd;
1563	{
1564	/* avoid making error situations worse */
1565	if (p->error != 0)
1566	return;
1567
1568	/* deal with oversize operands ("can't happen", more or less) */
1569	assert(opnd < 1<<OPSHIFT);
1570
1571	/* deal with undersized strip */
1572	if (p->slen >= p->ssize)
1573	enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1574	assert(p->slen < p->ssize);
1575
1576	/* finally, it's all reduced to the easy case */
1577	p->strip[p->slen++] = SOP(op, opnd);
1578	}
1579
1580	/*
1581	- doinsert - insert a sop into the strip
1582	== static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1583	*/
1584	static void
1585	doinsert(p, op, opnd, pos)
1586	struct parse *p;
1587	sop op;
1588	size_t opnd;
1589	sopno pos;
1590	{
1591	sopno sn;
1592	sop s;
1593	int i;
1594
1595	/* avoid making error situations worse */
1596	if (p->error != 0)
1597	return;
1598
1599	sn = HERE();
1600	EMIT(op, opnd); /* do checks, ensure space */
1601	assert(HERE() == sn+1);
1602	s = p->strip[sn];
1603
1604	/* adjust paren pointers */
1605	assert(pos > 0);
1606	for (i = 1; i < NPAREN; i++) {
1607	if (p->pbegin[i] >= pos) {
1608	p->pbegin[i]++;
1609	}
1610	if (p->pend[i] >= pos) {
1611	p->pend[i]++;
1612	}
1613	}
1614
1615	memmove((char )&p->strip[pos+1], (char )&p->strip[pos],
1616	(HERE()-pos-1)*sizeof(sop));
1617	p->strip[pos] = s;
1618	}
1619
1620	/*
1621	- dofwd - complete a forward reference
1622	== static void dofwd(struct parse *p, sopno pos, sop value);
1623	*/
1624	static void
1625	dofwd(p, pos, value)
1626	struct parse *p;
1627	sopno pos;
1628	sop value;
1629	{
1630	/* avoid making error situations worse */
1631	if (p->error != 0)
1632	return;
1633
1634	assert(value < 1<<OPSHIFT);
1635	p->strip[pos] = OP(p->strip[pos]) \| value;
1636	}
1637
1638	/*
1639	- enlarge - enlarge the strip
1640	== static void enlarge(struct parse *p, sopno size);
1641	*/
1642	static void
1643	enlarge(p, size)
1644	struct parse *p;
1645	sopno size;
1646	{
1647	sop *sp;
1648
1649	if (p->ssize >= size)
1650	return;
1651
1652	sp = (sop )realloc(p->strip, sizesizeof(sop));
1653	if (sp == NULL) {
1654	SETERROR(REG_ESPACE);
1655	return;
1656	}
1657	p->strip = sp;
1658	p->ssize = size;
1659	}
1660
1661	/*
1662	- stripsnug - compact the strip
1663	== static void stripsnug(struct parse p, struct re_guts g);
1664	*/
1665	static void
1666	stripsnug(p, g)
1667	struct parse *p;
1668	struct re_guts *g;
1669	{
1670	g->nstates = p->slen;
1671	g->strip = (sop )realloc((char )p->strip, p->slen * sizeof(sop));
1672	if (g->strip == NULL) {
1673	SETERROR(REG_ESPACE);
1674	g->strip = p->strip;
1675	}
1676	}
1677
1678	/*
1679	- findmust - fill in must and mlen with longest mandatory literal string
1680	== static void findmust(struct parse p, struct re_guts g);
1681	*
1682	* This algorithm could do fancy things like analyzing the operands of \|
1683	* for common subsequences. Someday. This code is simple and finds most
1684	* of the interesting cases.
1685	*
1686	* Note that must and mlen got initialized during setup.
1687	*/
1688	static void
1689	findmust(p, g)
1690	struct parse *p;
1691	struct re_guts *g;
1692	{
1693	sop *scan;
1694	sop *start;
1695	sop *newstart;
1696	sopno newlen;
1697	sop s;
1698	char *cp;
1699	sopno i;
1700	int offset;
1701	int cs, mccs;
1702
1703	/* avoid making error situations worse */
1704	if (p->error != 0)
1705	return;
1706
1707	/* Find out if we can handle OANYOF or not */
1708	mccs = 0;
1709	for (cs = 0; cs < g->ncsets; cs++)
1710	if (g->sets[cs].multis != NULL)
1711	mccs = 1;
1712
1713	/* find the longest OCHAR sequence in strip */
1714	newlen = 0;
1715	offset = 0;
1716	g->moffset = 0;
1717	scan = g->strip + 1;
1718	do {
1719	s = *scan++;
1720	switch (OP(s)) {
1721	case OCHAR: /* sequence member */
1722	if (newlen == 0) /* new sequence */
1723	newstart = scan - 1;
1724	newlen++;
1725	break;
1726	case OPLUS_: /* things that don't break one */
1727	case OLPAREN:
1728	case ORPAREN:
1729	break;
1730	case OQUEST_: /* things that must be skipped */
1731	case OCH_:
1732	offset = altoffset(scan, offset, mccs);
1733	scan--;
1734	do {
1735	scan += OPND(s);
1736	s = *scan;
1737	/* assert() interferes w debug printouts */
1738	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1739	OP(s) != OOR2) {
1740	g->iflags \|= BAD;
1741	return;
1742	}
1743	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1744	/* FALLTHROUGH */
1745	case OBOW: /* things that break a sequence */
1746	case OEOW:
1747	case OBOL:
1748	case OEOL:
1749	case O_QUEST:
1750	case O_CH:
1751	case OEND:
1752	if (newlen > g->mlen) { /* ends one */
1753	start = newstart;
1754	g->mlen = newlen;
1755	if (offset > -1) {
1756	g->moffset += offset;
1757	offset = newlen;
1758	} else
1759	g->moffset = offset;
1760	} else {
1761	if (offset > -1)
1762	offset += newlen;
1763	}
1764	newlen = 0;
1765	break;
1766	case OANY:
1767	if (newlen > g->mlen) { /* ends one */
1768	start = newstart;
1769	g->mlen = newlen;
1770	if (offset > -1) {
1771	g->moffset += offset;
1772	offset = newlen;
1773	} else
1774	g->moffset = offset;
1775	} else {
1776	if (offset > -1)
1777	offset += newlen;
1778	}
1779	if (offset > -1)
1780	offset++;
1781	newlen = 0;
1782	break;
1783	case OANYOF: /* may or may not invalidate offset */
1784	/* First, everything as OANY */
1785	if (newlen > g->mlen) { /* ends one */
1786	start = newstart;
1787	g->mlen = newlen;
1788	if (offset > -1) {
1789	g->moffset += offset;
1790	offset = newlen;
1791	} else
1792	g->moffset = offset;
1793	} else {
1794	if (offset > -1)
1795	offset += newlen;
1796	}
1797	if (offset > -1)
1798	offset++;
1799	newlen = 0;
1800	/* And, now, if we found out we can't deal with
1801	* it, make offset = -1.
1802	*/
1803	if (mccs)
1804	offset = -1;
1805	break;
1806	default:
1807	/* Anything here makes it impossible or too hard
1808	* to calculate the offset -- so we give up;
1809	* save the last known good offset, in case the
1810	* must sequence doesn't occur later.
1811	*/
1812	if (newlen > g->mlen) { /* ends one */
1813	start = newstart;
1814	g->mlen = newlen;
1815	if (offset > -1)
1816	g->moffset += offset;
1817	else
1818	g->moffset = offset;
1819	}
1820	offset = -1;
1821	newlen = 0;
1822	break;
1823	}
1824	} while (OP(s) != OEND);
1825
1826	if (g->mlen == 0) { /* there isn't one */
1827	g->moffset = -1;
1828	return;
1829	}
1830
1831	/* turn it into a character string */
1832	g->must = malloc((size_t)g->mlen + 1);
1833	if (g->must == NULL) { /* argh; just forget it */
1834	g->mlen = 0;
1835	g->moffset = -1;
1836	return;
1837	}
1838	cp = g->must;
1839	scan = start;
1840	for (i = g->mlen; i > 0; i--) {
1841	while (OP(s = *scan++) != OCHAR)
1842	continue;
1843	assert(cp < g->must + g->mlen);
1844	*cp++ = (char)OPND(s);
1845	}
1846	assert(cp == g->must + g->mlen);
1847	cp++ = '\0'; / just on general principles */
1848	}
1849
1850	/*
1851	- altoffset - choose biggest offset among multiple choices
1852	== static int altoffset(sop *scan, int offset, int mccs);
1853	*
1854	* Compute, recursively if necessary, the largest offset among multiple
1855	* re paths.
1856	*/
1857	static int
1858	altoffset(scan, offset, mccs)
1859	sop *scan;
1860	int offset;
1861	int mccs;
1862	{
1863	int largest;
1864	int try;
1865	sop s;
1866
1867	/* If we gave up already on offsets, return */
1868	if (offset == -1)
1869	return -1;
1870
1871	largest = 0;
1872	try = 0;
1873	s = *scan++;
1874	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1875	switch (OP(s)) {
1876	case OOR1:
1877	if (try > largest)
1878	largest = try;
1879	try = 0;
1880	break;
1881	case OQUEST_:
1882	case OCH_:
1883	try = altoffset(scan, try, mccs);
1884	if (try == -1)
1885	return -1;
1886	scan--;
1887	do {
1888	scan += OPND(s);
1889	s = *scan;
1890	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1891	OP(s) != OOR2)
1892	return -1;
1893	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1894	/* We must skip to the next position, or we'll
1895	* leave altoffset() too early.
1896	*/
1897	scan++;
1898	break;
1899	case OANYOF:
1900	if (mccs)
1901	return -1;
1902	case OCHAR:
1903	case OANY:
1904	try++;
1905	case OBOW:
1906	case OEOW:
1907	case OLPAREN:
1908	case ORPAREN:
1909	case OOR2:
1910	break;
1911	default:
1912	try = -1;
1913	break;
1914	}
1915	if (try == -1)
1916	return -1;
1917	s = *scan++;
1918	}
1919
1920	if (try > largest)
1921	largest = try;
1922
1923	return largest+offset;
1924	}
1925
1926	/*
1927	- computejumps - compute char jumps for BM scan
1928	== static void computejumps(struct parse p, struct re_guts g);
1929	*
1930	* This algorithm assumes g->must exists and is has size greater than
1931	* zero. It's based on the algorithm found on Computer Algorithms by
1932	* Sara Baase.
1933	*
1934	* A char jump is the number of characters one needs to jump based on
1935	* the value of the character from the text that was mismatched.
1936	*/
1937	static void
1938	computejumps(p, g)
1939	struct parse *p;
1940	struct re_guts *g;
1941	{
1942	int ch;
1943	int mindex;
1944
1945	/* Avoid making errors worse */
1946	if (p->error != 0)
1947	return;
1948
1949	g->charjump = (int) malloc((NC + 1) sizeof(int));
1950	if (g->charjump == NULL) /* Not a fatal error */
1951	return;
1952	/* Adjust for signed chars, if necessary */
1953	g->charjump = &g->charjump[-(CHAR_MIN)];
1954
1955	/* If the character does not exist in the pattern, the jump
1956	* is equal to the number of characters in the pattern.
1957	*/
1958	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1959	g->charjump[ch] = g->mlen;
1960
1961	/* If the character does exist, compute the jump that would
1962	* take us to the last character in the pattern equal to it
1963	* (notice that we match right to left, so that last character
1964	* is the first one that would be matched).
1965	*/
1966	for (mindex = 0; mindex < g->mlen; mindex++)
1967	g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
1968	}
1969
1970	/*
1971	- computematchjumps - compute match jumps for BM scan
1972	== static void computematchjumps(struct parse p, struct re_guts g);
1973	*
1974	* This algorithm assumes g->must exists and is has size greater than
1975	* zero. It's based on the algorithm found on Computer Algorithms by
1976	* Sara Baase.
1977	*
1978	* A match jump is the number of characters one needs to advance based
1979	* on the already-matched suffix.
1980	* Notice that all values here are minus (g->mlen-1), because of the way
1981	* the search algorithm works.
1982	*/
1983	static void
1984	computematchjumps(p, g)
1985	struct parse *p;
1986	struct re_guts *g;
1987	{
1988	int mindex; /* General "must" iterator */
1989	int suffix; /* Keeps track of matching suffix */
1990	int ssuffix; /* Keeps track of suffixes' suffix */
1991	int* pmatches; /* pmatches[k] points to the next i
1992	* such that i+1...mlen is a substring
1993	* of k+1...k+mlen-i-1
1994	*/
1995
1996	/* Avoid making errors worse */
1997	if (p->error != 0)
1998	return;
1999
2000	pmatches = (int) malloc(g->mlen sizeof(unsigned int));
2001	if (pmatches == NULL) {
2002	g->matchjump = NULL;
2003	return;
2004	}
2005
2006	g->matchjump = (int) malloc(g->mlen sizeof(unsigned int));
2007	if (g->matchjump == NULL) /* Not a fatal error */
2008	return;
2009
2010	/* Set maximum possible jump for each character in the pattern */
2011	for (mindex = 0; mindex < g->mlen; mindex++)
2012	g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2013
2014	/* Compute pmatches[] */
2015	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2016	mindex--, suffix--) {
2017	pmatches[mindex] = suffix;
2018
2019	/* If a mismatch is found, interrupting the substring,
2020	* compute the matchjump for that position. If no
2021	* mismatch is found, then a text substring mismatched
2022	* against the suffix will also mismatch against the
2023	* substring.
2024	*/
2025	while (suffix < g->mlen
2026	&& g->must[mindex] != g->must[suffix]) {
2027	g->matchjump[suffix] = MIN(g->matchjump[suffix],
2028	g->mlen - mindex - 1);
2029	suffix = pmatches[suffix];
2030	}
2031	}
2032
2033	/* Compute the matchjump up to the last substring found to jump
2034	* to the beginning of the largest must pattern prefix matching
2035	* it's own suffix.
2036	*/
2037	for (mindex = 0; mindex <= suffix; mindex++)
2038	g->matchjump[mindex] = MIN(g->matchjump[mindex],
2039	g->mlen + suffix - mindex);
2040
2041	ssuffix = pmatches[suffix];
2042	while (suffix < g->mlen) {
2043	while (suffix <= ssuffix && suffix < g->mlen) {
2044	g->matchjump[suffix] = MIN(g->matchjump[suffix],
2045	g->mlen + ssuffix - suffix);
2046	suffix++;
2047	}
2048	if (suffix < g->mlen)
2049	ssuffix = pmatches[ssuffix];
2050	}
2051
2052	free(pmatches);
2053	}
2054
2055	/*
2056	- pluscount - count + nesting
2057	== static sopno pluscount(struct parse p, struct re_guts g);
2058	*/
2059	static sopno /* nesting depth */
2060	pluscount(p, g)
2061	struct parse *p;
2062	struct re_guts *g;
2063	{
2064	sop *scan;
2065	sop s;
2066	sopno plusnest = 0;
2067	sopno maxnest = 0;
2068
2069	if (p->error != 0)
2070	return(0); /* there may not be an OEND */
2071
2072	scan = g->strip + 1;
2073	do {
2074	s = *scan++;
2075	switch (OP(s)) {
2076	case OPLUS_:
2077	plusnest++;
2078	break;
2079	case O_PLUS:
2080	if (plusnest > maxnest)
2081	maxnest = plusnest;
2082	plusnest--;
2083	break;
2084	}
2085	} while (OP(s) != OEND);
2086	if (plusnest != 0)
2087	g->iflags \|= BAD;
2088	return(maxnest);
2089	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: