source: trunk/grep/lib/regex.c@ 2960

Last change on this file since 2960 was 2557, checked in by bird, 20 years ago

grep 2.5.1a

File size: 243.4 KB
Line 
1/* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details.
16
17 You should have received a copy of the GNU Library General Public
18 License along with the GNU C Library; see the file COPYING.LIB. If not,
19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22/* AIX requires this to be the first thing in the file. */
23#if defined _AIX && !defined REGEX_MALLOC
24 #pragma alloca
25#endif
26
27#undef _GNU_SOURCE
28#define _GNU_SOURCE
29
30#ifdef HAVE_CONFIG_H
31# include <config.h>
32#endif
33
34#ifndef PARAMS
35# if defined __GNUC__ || (defined __STDC__ && __STDC__)
36# define PARAMS(args) args
37# else
38# define PARAMS(args) ()
39# endif /* GCC. */
40#endif /* Not PARAMS. */
41
42#if defined STDC_HEADERS && !defined emacs
43# include <stddef.h>
44#else
45/* We need this for `regex.h', and perhaps for the Emacs include files. */
46# include <sys/types.h>
47#endif
48
49#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
50
51/* For platform which support the ISO C amendement 1 functionality we
52 support user defined character classes. */
53#if defined _LIBC || WIDE_CHAR_SUPPORT
54/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
55# include <wchar.h>
56# include <wctype.h>
57#endif
58
59/* This is for multi byte string support. */
60#ifdef MBS_SUPPORT
61# define CHAR_TYPE wchar_t
62# define US_CHAR_TYPE wchar_t/* unsigned character type */
63# define COMPILED_BUFFER_VAR wc_buffer
64# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
65# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_TYPE)+1)
66# define PUT_CHAR(c) \
67 do { \
68 if (MB_CUR_MAX == 1) \
69 putchar (c); \
70 else \
71 printf ("%C", (wint_t) c); /* Should we use wide stream?? */ \
72 } while (0)
73# define TRUE 1
74# define FALSE 0
75#else
76# define CHAR_TYPE char
77# define US_CHAR_TYPE unsigned char /* unsigned character type */
78# define COMPILED_BUFFER_VAR bufp->buffer
79# define OFFSET_ADDRESS_SIZE 2
80# define PUT_CHAR(c) putchar (c)
81#endif /* MBS_SUPPORT */
82
83#ifdef _LIBC
84/* We have to keep the namespace clean. */
85# define regfree(preg) __regfree (preg)
86# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
87# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
88# define regerror(errcode, preg, errbuf, errbuf_size) \
89 __regerror(errcode, preg, errbuf, errbuf_size)
90# define re_set_registers(bu, re, nu, st, en) \
91 __re_set_registers (bu, re, nu, st, en)
92# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
93 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
94# define re_match(bufp, string, size, pos, regs) \
95 __re_match (bufp, string, size, pos, regs)
96# define re_search(bufp, string, size, startpos, range, regs) \
97 __re_search (bufp, string, size, startpos, range, regs)
98# define re_compile_pattern(pattern, length, bufp) \
99 __re_compile_pattern (pattern, length, bufp)
100# define re_set_syntax(syntax) __re_set_syntax (syntax)
101# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
102 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
103# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
104
105# define btowc __btowc
106
107/* We are also using some library internals. */
108# include <locale/localeinfo.h>
109# include <locale/elem-hash.h>
110# include <langinfo.h>
111# include <locale/coll-lookup.h>
112#endif
113
114/* This is for other GNU distributions with internationalized messages. */
115#if HAVE_LIBINTL_H || defined _LIBC
116# include <libintl.h>
117# ifdef _LIBC
118# undef gettext
119# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
120# endif
121#else
122# define gettext(msgid) (msgid)
123#endif
124
125#ifndef gettext_noop
126/* This define is so xgettext can find the internationalizable
127 strings. */
128# define gettext_noop(String) String
129#endif
130
131/* The `emacs' switch turns on certain matching commands
132 that make sense only in Emacs. */
133#ifdef emacs
134
135# include "lisp.h"
136# include "buffer.h"
137# include "syntax.h"
138
139#else /* not emacs */
140
141/* If we are not linking with Emacs proper,
142 we can't use the relocating allocator
143 even if config.h says that we can. */
144# undef REL_ALLOC
145
146# if defined STDC_HEADERS || defined _LIBC
147# include <stdlib.h>
148# else
149char *malloc ();
150char *realloc ();
151# endif
152
153/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
154 If nothing else has been done, use the method below. */
155# ifdef INHIBIT_STRING_HEADER
156# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
157# if !defined bzero && !defined bcopy
158# undef INHIBIT_STRING_HEADER
159# endif
160# endif
161# endif
162
163/* This is the normal way of making sure we have a bcopy and a bzero.
164 This is used in most programs--a few other programs avoid this
165 by defining INHIBIT_STRING_HEADER. */
166# ifndef INHIBIT_STRING_HEADER
167# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
168# include <string.h>
169# ifndef bzero
170# ifndef _LIBC
171# define bzero(s, n) (memset (s, '\0', n), (s))
172# else
173# define bzero(s, n) __bzero (s, n)
174# endif
175# endif
176# else
177# include <strings.h>
178# ifndef memcmp
179# define memcmp(s1, s2, n) bcmp (s1, s2, n)
180# endif
181# ifndef memcpy
182# define memcpy(d, s, n) (bcopy (s, d, n), (d))
183# endif
184# endif
185# endif
186
187/* Define the syntax stuff for \<, \>, etc. */
188
189/* This must be nonzero for the wordchar and notwordchar pattern
190 commands in re_match_2. */
191# ifndef Sword
192# define Sword 1
193# endif
194
195# ifdef SWITCH_ENUM_BUG
196# define SWITCH_ENUM_CAST(x) ((int)(x))
197# else
198# define SWITCH_ENUM_CAST(x) (x)
199# endif
200
201#endif /* not emacs */
202
203#if defined _LIBC || HAVE_LIMITS_H
204# include <limits.h>
205#endif
206
207#ifndef MB_LEN_MAX
208# define MB_LEN_MAX 1
209#endif
210
211
212/* Get the interface, including the syntax bits. */
213#include <regex.h>
214
215/* isalpha etc. are used for the character classes. */
216#include <ctype.h>
217
218/* Jim Meyering writes:
219
220 "... Some ctype macros are valid only for character codes that
221 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
222 using /bin/cc or gcc but without giving an ansi option). So, all
223 ctype uses should be through macros like ISPRINT... If
224 STDC_HEADERS is defined, then autoconf has verified that the ctype
225 macros don't need to be guarded with references to isascii. ...
226 Defining isascii to 1 should let any compiler worth its salt
227 eliminate the && through constant folding."
228 Solaris defines some of these symbols so we must undefine them first. */
229
230#undef ISASCII
231#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
232# define ISASCII(c) 1
233#else
234# define ISASCII(c) isascii(c)
235#endif
236
237#ifdef isblank
238# define ISBLANK(c) (ISASCII (c) && isblank (c))
239#else
240# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
241#endif
242#ifdef isgraph
243# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
244#else
245# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
246#endif
247
248#undef ISPRINT
249#define ISPRINT(c) (ISASCII (c) && isprint (c))
250#define ISDIGIT(c) (ISASCII (c) && isdigit (c))
251#define ISALNUM(c) (ISASCII (c) && isalnum (c))
252#define ISALPHA(c) (ISASCII (c) && isalpha (c))
253#define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
254#define ISLOWER(c) (ISASCII (c) && islower (c))
255#define ISPUNCT(c) (ISASCII (c) && ispunct (c))
256#define ISSPACE(c) (ISASCII (c) && isspace (c))
257#define ISUPPER(c) (ISASCII (c) && isupper (c))
258#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
259
260#ifdef _tolower
261# define TOLOWER(c) _tolower(c)
262#else
263# define TOLOWER(c) tolower(c)
264#endif
265
266#ifndef NULL
267# define NULL (void *)0
268#endif
269
270/* We remove any previous definition of `SIGN_EXTEND_CHAR',
271 since ours (we hope) works properly with all combinations of
272 machines, compilers, `char' and `unsigned char' argument types.
273 (Per Bothner suggested the basic approach.) */
274#undef SIGN_EXTEND_CHAR
275#if __STDC__
276# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
277#else /* not __STDC__ */
278/* As in Harbison and Steele. */
279# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
280#endif
281
282
283#ifndef emacs
284/* How many characters in the character set. */
285# define CHAR_SET_SIZE 256
286
287# ifdef SYNTAX_TABLE
288
289extern char *re_syntax_table;
290
291# else /* not SYNTAX_TABLE */
292
293static char re_syntax_table[CHAR_SET_SIZE];
294
295static void init_syntax_once PARAMS ((void));
296
297static void
298init_syntax_once ()
299{
300 register int c;
301 static int done = 0;
302
303 if (done)
304 return;
305 bzero (re_syntax_table, sizeof re_syntax_table);
306
307 for (c = 0; c < CHAR_SET_SIZE; ++c)
308 if (ISALNUM (c))
309 re_syntax_table[c] = Sword;
310
311 re_syntax_table['_'] = Sword;
312
313 done = 1;
314}
315
316# endif /* not SYNTAX_TABLE */
317
318# define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
319
320#endif /* emacs */
321
322
323/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
324 use `alloca' instead of `malloc'. This is because using malloc in
325 re_search* or re_match* could cause memory leaks when C-g is used in
326 Emacs; also, malloc is slower and causes storage fragmentation. On
327 the other hand, malloc is more portable, and easier to debug.
328
329 Because we sometimes use alloca, some routines have to be macros,
330 not functions -- `alloca'-allocated space disappears at the end of the
331 function it is called in. */
332
333#ifdef REGEX_MALLOC
334
335# define REGEX_ALLOCATE malloc
336# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
337# define REGEX_FREE free
338
339#else /* not REGEX_MALLOC */
340
341/* Emacs already defines alloca, sometimes. */
342# ifndef alloca
343
344/* Make alloca work the best possible way. */
345# ifdef __GNUC__
346# define alloca __builtin_alloca
347# else /* not __GNUC__ */
348# if HAVE_ALLOCA_H
349# include <alloca.h>
350# endif /* HAVE_ALLOCA_H */
351# endif /* not __GNUC__ */
352
353# endif /* not alloca */
354
355# define REGEX_ALLOCATE alloca
356
357/* Assumes a `char *destination' variable. */
358# define REGEX_REALLOCATE(source, osize, nsize) \
359 (destination = (char *) alloca (nsize), \
360 memcpy (destination, source, osize))
361
362/* No need to do anything to free, after alloca. */
363# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
364
365#endif /* not REGEX_MALLOC */
366
367/* Define how to allocate the failure stack. */
368
369#if defined REL_ALLOC && defined REGEX_MALLOC
370
371# define REGEX_ALLOCATE_STACK(size) \
372 r_alloc (&failure_stack_ptr, (size))
373# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
374 r_re_alloc (&failure_stack_ptr, (nsize))
375# define REGEX_FREE_STACK(ptr) \
376 r_alloc_free (&failure_stack_ptr)
377
378#else /* not using relocating allocator */
379
380# ifdef REGEX_MALLOC
381
382# define REGEX_ALLOCATE_STACK malloc
383# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
384# define REGEX_FREE_STACK free
385
386# else /* not REGEX_MALLOC */
387
388# define REGEX_ALLOCATE_STACK alloca
389
390# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
391 REGEX_REALLOCATE (source, osize, nsize)
392/* No need to explicitly free anything. */
393# define REGEX_FREE_STACK(arg)
394
395# endif /* not REGEX_MALLOC */
396#endif /* not using relocating allocator */
397
398
399/* True if `size1' is non-NULL and PTR is pointing anywhere inside
400 `string1' or just past its end. This works if PTR is NULL, which is
401 a good thing. */
402#define FIRST_STRING_P(ptr) \
403 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
404
405/* (Re)Allocate N items of type T using malloc, or fail. */
406#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
407#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
408#define RETALLOC_IF(addr, n, t) \
409 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
410#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
411
412#define BYTEWIDTH 8 /* In bits. */
413
414#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
415
416#undef MAX
417#undef MIN
418#define MAX(a, b) ((a) > (b) ? (a) : (b))
419#define MIN(a, b) ((a) < (b) ? (a) : (b))
420
421typedef char boolean;
422#define false 0
423#define true 1
424
425static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
426 const char *string1, int size1,
427 const char *string2, int size2,
428 int pos,
429 struct re_registers *regs,
430 int stop));
431
432
433/* These are the command codes that appear in compiled regular
434 expressions. Some opcodes are followed by argument bytes. A
435 command code can specify any interpretation whatsoever for its
436 arguments. Zero bytes may appear in the compiled regular expression. */
437
438typedef enum
439{
440 no_op = 0,
441
442 /* Succeed right away--no more backtracking. */
443 succeed,
444
445 /* Followed by one byte giving n, then by n literal bytes. */
446 exactn,
447
448#ifdef MBS_SUPPORT
449 /* Same as exactn, but contains binary data. */
450 exactn_bin,
451#endif
452
453 /* Matches any (more or less) character. */
454 anychar,
455
456 /* Matches any one char belonging to specified set. First
457 following byte is number of bitmap bytes. Then come bytes
458 for a bitmap saying which chars are in. Bits in each byte
459 are ordered low-bit-first. A character is in the set if its
460 bit is 1. A character too large to have a bit in the map is
461 automatically not in the set. */
462 /* ifdef MBS_SUPPORT, following element is length of character
463 classes, length of collating symbols, length of equivalence
464 classes, length of character ranges, and length of characters.
465 Next, character class element, collating symbols elements,
466 equivalence class elements, range elements, and character
467 elements follow.
468 See regex_compile function. */
469 charset,
470
471 /* Same parameters as charset, but match any character that is
472 not one of those specified. */
473 charset_not,
474
475 /* Start remembering the text that is matched, for storing in a
476 register. Followed by one byte with the register number, in
477 the range 0 to one less than the pattern buffer's re_nsub
478 field. Then followed by one byte with the number of groups
479 inner to this one. (This last has to be part of the
480 start_memory only because we need it in the on_failure_jump
481 of re_match_2.) */
482 start_memory,
483
484 /* Stop remembering the text that is matched and store it in a
485 memory register. Followed by one byte with the register
486 number, in the range 0 to one less than `re_nsub' in the
487 pattern buffer, and one byte with the number of inner groups,
488 just like `start_memory'. (We need the number of inner
489 groups here because we don't have any easy way of finding the
490 corresponding start_memory when we're at a stop_memory.) */
491 stop_memory,
492
493 /* Match a duplicate of something remembered. Followed by one
494 byte containing the register number. */
495 duplicate,
496
497 /* Fail unless at beginning of line. */
498 begline,
499
500 /* Fail unless at end of line. */
501 endline,
502
503 /* Succeeds if at beginning of buffer (if emacs) or at beginning
504 of string to be matched (if not). */
505 begbuf,
506
507 /* Analogously, for end of buffer/string. */
508 endbuf,
509
510 /* Followed by two byte relative address to which to jump. */
511 jump,
512
513 /* Same as jump, but marks the end of an alternative. */
514 jump_past_alt,
515
516 /* Followed by two-byte relative address of place to resume at
517 in case of failure. */
518 /* ifdef MBS_SUPPORT, the size of address is 1. */
519 on_failure_jump,
520
521 /* Like on_failure_jump, but pushes a placeholder instead of the
522 current string position when executed. */
523 on_failure_keep_string_jump,
524
525 /* Throw away latest failure point and then jump to following
526 two-byte relative address. */
527 /* ifdef MBS_SUPPORT, the size of address is 1. */
528 pop_failure_jump,
529
530 /* Change to pop_failure_jump if know won't have to backtrack to
531 match; otherwise change to jump. This is used to jump
532 back to the beginning of a repeat. If what follows this jump
533 clearly won't match what the repeat does, such that we can be
534 sure that there is no use backtracking out of repetitions
535 already matched, then we change it to a pop_failure_jump.
536 Followed by two-byte address. */
537 /* ifdef MBS_SUPPORT, the size of address is 1. */
538 maybe_pop_jump,
539
540 /* Jump to following two-byte address, and push a dummy failure
541 point. This failure point will be thrown away if an attempt
542 is made to use it for a failure. A `+' construct makes this
543 before the first repeat. Also used as an intermediary kind
544 of jump when compiling an alternative. */
545 /* ifdef MBS_SUPPORT, the size of address is 1. */
546 dummy_failure_jump,
547
548 /* Push a dummy failure point and continue. Used at the end of
549 alternatives. */
550 push_dummy_failure,
551
552 /* Followed by two-byte relative address and two-byte number n.
553 After matching N times, jump to the address upon failure. */
554 /* ifdef MBS_SUPPORT, the size of address is 1. */
555 succeed_n,
556
557 /* Followed by two-byte relative address, and two-byte number n.
558 Jump to the address N times, then fail. */
559 /* ifdef MBS_SUPPORT, the size of address is 1. */
560 jump_n,
561
562 /* Set the following two-byte relative address to the
563 subsequent two-byte number. The address *includes* the two
564 bytes of number. */
565 /* ifdef MBS_SUPPORT, the size of address is 1. */
566 set_number_at,
567
568 wordchar, /* Matches any word-constituent character. */
569 notwordchar, /* Matches any char that is not a word-constituent. */
570
571 wordbeg, /* Succeeds if at word beginning. */
572 wordend, /* Succeeds if at word end. */
573
574 wordbound, /* Succeeds if at a word boundary. */
575 notwordbound /* Succeeds if not at a word boundary. */
576
577#ifdef emacs
578 ,before_dot, /* Succeeds if before point. */
579 at_dot, /* Succeeds if at point. */
580 after_dot, /* Succeeds if after point. */
581
582 /* Matches any character whose syntax is specified. Followed by
583 a byte which contains a syntax code, e.g., Sword. */
584 syntaxspec,
585
586 /* Matches any character whose syntax is not that specified. */
587 notsyntaxspec
588#endif /* emacs */
589} re_opcode_t;
590
591
592/* Common operations on the compiled pattern. */
593
594/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
595/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
596
597#ifdef MBS_SUPPORT
598# define STORE_NUMBER(destination, number) \
599 do { \
600 *(destination) = (US_CHAR_TYPE)(number); \
601 } while (0)
602#else
603# define STORE_NUMBER(destination, number) \
604 do { \
605 (destination)[0] = (number) & 0377; \
606 (destination)[1] = (number) >> 8; \
607 } while (0)
608#endif /* MBS_SUPPORT */
609
610/* Same as STORE_NUMBER, except increment DESTINATION to
611 the byte after where the number is stored. Therefore, DESTINATION
612 must be an lvalue. */
613/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
614
615#define STORE_NUMBER_AND_INCR(destination, number) \
616 do { \
617 STORE_NUMBER (destination, number); \
618 (destination) += OFFSET_ADDRESS_SIZE; \
619 } while (0)
620
621/* Put into DESTINATION a number stored in two contiguous bytes starting
622 at SOURCE. */
623/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
624
625#ifdef MBS_SUPPORT
626# define EXTRACT_NUMBER(destination, source) \
627 do { \
628 (destination) = *(source); \
629 } while (0)
630#else
631# define EXTRACT_NUMBER(destination, source) \
632 do { \
633 (destination) = *(source) & 0377; \
634 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
635 } while (0)
636#endif
637
638#ifdef DEBUG
639static void extract_number _RE_ARGS ((int *dest, US_CHAR_TYPE *source));
640static void
641extract_number (dest, source)
642 int *dest;
643 US_CHAR_TYPE *source;
644{
645#ifdef MBS_SUPPORT
646 *dest = *source;
647#else
648 int temp = SIGN_EXTEND_CHAR (*(source + 1));
649 *dest = *source & 0377;
650 *dest += temp << 8;
651#endif
652}
653
654# ifndef EXTRACT_MACROS /* To debug the macros. */
655# undef EXTRACT_NUMBER
656# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
657# endif /* not EXTRACT_MACROS */
658
659#endif /* DEBUG */
660
661/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
662 SOURCE must be an lvalue. */
663
664#define EXTRACT_NUMBER_AND_INCR(destination, source) \
665 do { \
666 EXTRACT_NUMBER (destination, source); \
667 (source) += OFFSET_ADDRESS_SIZE; \
668 } while (0)
669
670#ifdef DEBUG
671static void extract_number_and_incr _RE_ARGS ((int *destination,
672 US_CHAR_TYPE **source));
673static void
674extract_number_and_incr (destination, source)
675 int *destination;
676 US_CHAR_TYPE **source;
677{
678 extract_number (destination, *source);
679 *source += OFFSET_ADDRESS_SIZE;
680}
681
682# ifndef EXTRACT_MACROS
683# undef EXTRACT_NUMBER_AND_INCR
684# define EXTRACT_NUMBER_AND_INCR(dest, src) \
685 extract_number_and_incr (&dest, &src)
686# endif /* not EXTRACT_MACROS */
687
688#endif /* DEBUG */
689
690
691/* If DEBUG is defined, Regex prints many voluminous messages about what
692 it is doing (if the variable `debug' is nonzero). If linked with the
693 main program in `iregex.c', you can enter patterns and strings
694 interactively. And if linked with the main program in `main.c' and
695 the other test files, you can run the already-written tests. */
696
697#ifdef DEBUG
698
699/* We use standard I/O for debugging. */
700# include <stdio.h>
701
702/* It is useful to test things that ``must'' be true when debugging. */
703# include <assert.h>
704
705static int debug;
706
707# define DEBUG_STATEMENT(e) e
708# define DEBUG_PRINT1(x) if (debug) printf (x)
709# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
710# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
711# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
712# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
713 if (debug) print_partial_compiled_pattern (s, e)
714# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
715 if (debug) print_double_string (w, s1, sz1, s2, sz2)
716
717
718/* Print the fastmap in human-readable form. */
719
720void
721print_fastmap (fastmap)
722 char *fastmap;
723{
724 unsigned was_a_range = 0;
725 unsigned i = 0;
726
727 while (i < (1 << BYTEWIDTH))
728 {
729 if (fastmap[i++])
730 {
731 was_a_range = 0;
732 putchar (i - 1);
733 while (i < (1 << BYTEWIDTH) && fastmap[i])
734 {
735 was_a_range = 1;
736 i++;
737 }
738 if (was_a_range)
739 {
740 printf ("-");
741 putchar (i - 1);
742 }
743 }
744 }
745 putchar ('\n');
746}
747
748
749/* Print a compiled pattern string in human-readable form, starting at
750 the START pointer into it and ending just before the pointer END. */
751
752void
753print_partial_compiled_pattern (start, end)
754 US_CHAR_TYPE *start;
755 US_CHAR_TYPE *end;
756{
757 int mcnt, mcnt2;
758 US_CHAR_TYPE *p1;
759 US_CHAR_TYPE *p = start;
760 US_CHAR_TYPE *pend = end;
761
762 if (start == NULL)
763 {
764 printf ("(null)\n");
765 return;
766 }
767
768 /* Loop over pattern commands. */
769 while (p < pend)
770 {
771#ifdef _LIBC
772 printf ("%td:\t", p - start);
773#else
774 printf ("%ld:\t", (long int) (p - start));
775#endif
776
777 switch ((re_opcode_t) *p++)
778 {
779 case no_op:
780 printf ("/no_op");
781 break;
782
783 case exactn:
784 mcnt = *p++;
785 printf ("/exactn/%d", mcnt);
786 do
787 {
788 putchar ('/');
789 PUT_CHAR (*p++);
790 }
791 while (--mcnt);
792 break;
793
794#ifdef MBS_SUPPORT
795 case exactn_bin:
796 mcnt = *p++;
797 printf ("/exactn_bin/%d", mcnt);
798 do
799 {
800 printf("/%lx", (long int) *p++);
801 }
802 while (--mcnt);
803 break;
804#endif /* MBS_SUPPORT */
805
806 case start_memory:
807 mcnt = *p++;
808 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
809 break;
810
811 case stop_memory:
812 mcnt = *p++;
813 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
814 break;
815
816 case duplicate:
817 printf ("/duplicate/%ld", (long int) *p++);
818 break;
819
820 case anychar:
821 printf ("/anychar");
822 break;
823
824 case charset:
825 case charset_not:
826 {
827#ifdef MBS_SUPPORT
828 int i, length;
829 wchar_t *workp = p;
830 printf ("/charset [%s",
831 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
832 p += 5;
833 length = *workp++; /* the length of char_classes */
834 for (i=0 ; i<length ; i++)
835 printf("[:%lx:]", (long int) *p++);
836 length = *workp++; /* the length of collating_symbol */
837 for (i=0 ; i<length ;)
838 {
839 printf("[.");
840 while(*p != 0)
841 PUT_CHAR((i++,*p++));
842 i++,p++;
843 printf(".]");
844 }
845 length = *workp++; /* the length of equivalence_class */
846 for (i=0 ; i<length ;)
847 {
848 printf("[=");
849 while(*p != 0)
850 PUT_CHAR((i++,*p++));
851 i++,p++;
852 printf("=]");
853 }
854 length = *workp++; /* the length of char_range */
855 for (i=0 ; i<length ; i++)
856 {
857 wchar_t range_start = *p++;
858 wchar_t range_end = *p++;
859 if (MB_CUR_MAX == 1)
860 printf("%c-%c", (char) range_start, (char) range_end);
861 else
862 printf("%C-%C", (wint_t) range_start, (wint_t) range_end);
863 }
864 length = *workp++; /* the length of char */
865 for (i=0 ; i<length ; i++)
866 if (MB_CUR_MAX == 1)
867 putchar (*p++);
868 else
869 printf("%C", (wint_t) *p++);
870 putchar (']');
871#else
872 register int c, last = -100;
873 register int in_range = 0;
874
875 printf ("/charset [%s",
876 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
877
878 assert (p + *p < pend);
879
880 for (c = 0; c < 256; c++)
881 if (c / 8 < *p
882 && (p[1 + (c/8)] & (1 << (c % 8))))
883 {
884 /* Are we starting a range? */
885 if (last + 1 == c && ! in_range)
886 {
887 putchar ('-');
888 in_range = 1;
889 }
890 /* Have we broken a range? */
891 else if (last + 1 != c && in_range)
892 {
893 putchar (last);
894 in_range = 0;
895 }
896
897 if (! in_range)
898 putchar (c);
899
900 last = c;
901 }
902
903 if (in_range)
904 putchar (last);
905
906 putchar (']');
907
908 p += 1 + *p;
909#endif /* MBS_SUPPORT */
910 }
911 break;
912
913 case begline:
914 printf ("/begline");
915 break;
916
917 case endline:
918 printf ("/endline");
919 break;
920
921 case on_failure_jump:
922 extract_number_and_incr (&mcnt, &p);
923#ifdef _LIBC
924 printf ("/on_failure_jump to %td", p + mcnt - start);
925#else
926 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
927#endif
928 break;
929
930 case on_failure_keep_string_jump:
931 extract_number_and_incr (&mcnt, &p);
932#ifdef _LIBC
933 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
934#else
935 printf ("/on_failure_keep_string_jump to %ld",
936 (long int) (p + mcnt - start));
937#endif
938 break;
939
940 case dummy_failure_jump:
941 extract_number_and_incr (&mcnt, &p);
942#ifdef _LIBC
943 printf ("/dummy_failure_jump to %td", p + mcnt - start);
944#else
945 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
946#endif
947 break;
948
949 case push_dummy_failure:
950 printf ("/push_dummy_failure");
951 break;
952
953 case maybe_pop_jump:
954 extract_number_and_incr (&mcnt, &p);
955#ifdef _LIBC
956 printf ("/maybe_pop_jump to %td", p + mcnt - start);
957#else
958 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
959#endif
960 break;
961
962 case pop_failure_jump:
963 extract_number_and_incr (&mcnt, &p);
964#ifdef _LIBC
965 printf ("/pop_failure_jump to %td", p + mcnt - start);
966#else
967 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
968#endif
969 break;
970
971 case jump_past_alt:
972 extract_number_and_incr (&mcnt, &p);
973#ifdef _LIBC
974 printf ("/jump_past_alt to %td", p + mcnt - start);
975#else
976 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
977#endif
978 break;
979
980 case jump:
981 extract_number_and_incr (&mcnt, &p);
982#ifdef _LIBC
983 printf ("/jump to %td", p + mcnt - start);
984#else
985 printf ("/jump to %ld", (long int) (p + mcnt - start));
986#endif
987 break;
988
989 case succeed_n:
990 extract_number_and_incr (&mcnt, &p);
991 p1 = p + mcnt;
992 extract_number_and_incr (&mcnt2, &p);
993#ifdef _LIBC
994 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
995#else
996 printf ("/succeed_n to %ld, %d times",
997 (long int) (p1 - start), mcnt2);
998#endif
999 break;
1000
1001 case jump_n:
1002 extract_number_and_incr (&mcnt, &p);
1003 p1 = p + mcnt;
1004 extract_number_and_incr (&mcnt2, &p);
1005 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1006 break;
1007
1008 case set_number_at:
1009 extract_number_and_incr (&mcnt, &p);
1010 p1 = p + mcnt;
1011 extract_number_and_incr (&mcnt2, &p);
1012#ifdef _LIBC
1013 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1014#else
1015 printf ("/set_number_at location %ld to %d",
1016 (long int) (p1 - start), mcnt2);
1017#endif
1018 break;
1019
1020 case wordbound:
1021 printf ("/wordbound");
1022 break;
1023
1024 case notwordbound:
1025 printf ("/notwordbound");
1026 break;
1027
1028 case wordbeg:
1029 printf ("/wordbeg");
1030 break;
1031
1032 case wordend:
1033 printf ("/wordend");
1034 break;
1035
1036# ifdef emacs
1037 case before_dot:
1038 printf ("/before_dot");
1039 break;
1040
1041 case at_dot:
1042 printf ("/at_dot");
1043 break;
1044
1045 case after_dot:
1046 printf ("/after_dot");
1047 break;
1048
1049 case syntaxspec:
1050 printf ("/syntaxspec");
1051 mcnt = *p++;
1052 printf ("/%d", mcnt);
1053 break;
1054
1055 case notsyntaxspec:
1056 printf ("/notsyntaxspec");
1057 mcnt = *p++;
1058 printf ("/%d", mcnt);
1059 break;
1060# endif /* emacs */
1061
1062 case wordchar:
1063 printf ("/wordchar");
1064 break;
1065
1066 case notwordchar:
1067 printf ("/notwordchar");
1068 break;
1069
1070 case begbuf:
1071 printf ("/begbuf");
1072 break;
1073
1074 case endbuf:
1075 printf ("/endbuf");
1076 break;
1077
1078 default:
1079 printf ("?%ld", (long int) *(p-1));
1080 }
1081
1082 putchar ('\n');
1083 }
1084
1085#ifdef _LIBC
1086 printf ("%td:\tend of pattern.\n", p - start);
1087#else
1088 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1089#endif
1090}
1091
1092
1093void
1094print_compiled_pattern (bufp)
1095 struct re_pattern_buffer *bufp;
1096{
1097 US_CHAR_TYPE *buffer = (US_CHAR_TYPE*) bufp->buffer;
1098
1099 print_partial_compiled_pattern (buffer, buffer
1100 + bufp->used / sizeof(US_CHAR_TYPE));
1101 printf ("%ld bytes used/%ld bytes allocated.\n",
1102 bufp->used, bufp->allocated);
1103
1104 if (bufp->fastmap_accurate && bufp->fastmap)
1105 {
1106 printf ("fastmap: ");
1107 print_fastmap (bufp->fastmap);
1108 }
1109
1110#ifdef _LIBC
1111 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1112#else
1113 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1114#endif
1115 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1116 printf ("can_be_null: %d\t", bufp->can_be_null);
1117 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1118 printf ("no_sub: %d\t", bufp->no_sub);
1119 printf ("not_bol: %d\t", bufp->not_bol);
1120 printf ("not_eol: %d\t", bufp->not_eol);
1121 printf ("syntax: %lx\n", bufp->syntax);
1122 /* Perhaps we should print the translate table? */
1123}
1124
1125
1126void
1127print_double_string (where, string1, size1, string2, size2)
1128 const CHAR_TYPE *where;
1129 const CHAR_TYPE *string1;
1130 const CHAR_TYPE *string2;
1131 int size1;
1132 int size2;
1133{
1134 int this_char;
1135
1136 if (where == NULL)
1137 printf ("(null)");
1138 else
1139 {
1140 if (FIRST_STRING_P (where))
1141 {
1142 for (this_char = where - string1; this_char < size1; this_char++)
1143 PUT_CHAR (string1[this_char]);
1144
1145 where = string2;
1146 }
1147
1148 for (this_char = where - string2; this_char < size2; this_char++)
1149 PUT_CHAR (string2[this_char]);
1150 }
1151}
1152
1153void
1154printchar (c)
1155 int c;
1156{
1157 putc (c, stderr);
1158}
1159
1160#else /* not DEBUG */
1161
1162# undef assert
1163# define assert(e)
1164
1165# define DEBUG_STATEMENT(e)
1166# define DEBUG_PRINT1(x)
1167# define DEBUG_PRINT2(x1, x2)
1168# define DEBUG_PRINT3(x1, x2, x3)
1169# define DEBUG_PRINT4(x1, x2, x3, x4)
1170# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1171# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1172
1173#endif /* not DEBUG */
1174
1175
1176#ifdef MBS_SUPPORT
1177/* This convert a multibyte string to a wide character string.
1178 And write their correspondances to offset_buffer(see below)
1179 and write whether each wchar_t is binary data to is_binary.
1180 This assume invalid multibyte sequences as binary data.
1181 We assume offset_buffer and is_binary is already allocated
1182 enough space. */
1183
1184static size_t convert_mbs_to_wcs (CHAR_TYPE *dest, const unsigned char* src,
1185 size_t len, int *offset_buffer,
1186 char *is_binary);
1187static size_t
1188convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
1189 CHAR_TYPE *dest;
1190 const unsigned char* src;
1191 size_t len; /* the length of multibyte string. */
1192
1193 /* It hold correspondances between src(char string) and
1194 dest(wchar_t string) for optimization.
1195 e.g. src = "xxxyzz"
1196 dest = {'X', 'Y', 'Z'}
1197 (each "xxx", "y" and "zz" represent one multibyte character
1198 corresponding to 'X', 'Y' and 'Z'.)
1199 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1200 = {0, 3, 4, 6}
1201 */
1202 int *offset_buffer;
1203 char *is_binary;
1204{
1205 wchar_t *pdest = dest;
1206 const unsigned char *psrc = src;
1207 size_t wc_count = 0;
1208
1209 if (MB_CUR_MAX == 1)
1210 { /* We don't need conversion. */
1211 for ( ; wc_count < len ; ++wc_count)
1212 {
1213 *pdest++ = *psrc++;
1214 is_binary[wc_count] = FALSE;
1215 offset_buffer[wc_count] = wc_count;
1216 }
1217 offset_buffer[wc_count] = wc_count;
1218 }
1219 else
1220 {
1221 /* We need conversion. */
1222 mbstate_t mbs;
1223 int consumed;
1224 size_t mb_remain = len;
1225 size_t mb_count = 0;
1226
1227 /* Initialize the conversion state. */
1228 memset (&mbs, 0, sizeof (mbstate_t));
1229
1230 offset_buffer[0] = 0;
1231 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1232 psrc += consumed)
1233 {
1234 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1235
1236 if (consumed <= 0)
1237 /* failed to convert. maybe src contains binary data.
1238 So we consume 1 byte manualy. */
1239 {
1240 *pdest = *psrc;
1241 consumed = 1;
1242 is_binary[wc_count] = TRUE;
1243 }
1244 else
1245 is_binary[wc_count] = FALSE;
1246 /* In sjis encoding, we use yen sign as escape character in
1247 place of reverse solidus. So we convert 0x5c(yen sign in
1248 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1249 solidus in UCS2). */
1250 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1251 *pdest = (wchar_t) *psrc;
1252
1253 offset_buffer[wc_count + 1] = mb_count += consumed;
1254 }
1255 }
1256
1257 return wc_count;
1258}
1259
1260#endif /* MBS_SUPPORT */
1261
1262/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1263 also be assigned to arbitrarily: each pattern buffer stores its own
1264 syntax, so it can be changed between regex compilations. */
1265/* This has no initializer because initialized variables in Emacs
1266 become read-only after dumping. */
1267reg_syntax_t re_syntax_options;
1268
1269
1270/* Specify the precise syntax of regexps for compilation. This provides
1271 for compatibility for various utilities which historically have
1272 different, incompatible syntaxes.
1273
1274 The argument SYNTAX is a bit mask comprised of the various bits
1275 defined in regex.h. We return the old syntax. */
1276
1277reg_syntax_t
1278re_set_syntax (syntax)
1279 reg_syntax_t syntax;
1280{
1281 reg_syntax_t ret = re_syntax_options;
1282
1283 re_syntax_options = syntax;
1284#ifdef DEBUG
1285 if (syntax & RE_DEBUG)
1286 debug = 1;
1287 else if (debug) /* was on but now is not */
1288 debug = 0;
1289#endif /* DEBUG */
1290 return ret;
1291}
1292#ifdef _LIBC
1293weak_alias (__re_set_syntax, re_set_syntax)
1294#endif
1295
1296
1297/* This table gives an error message for each of the error codes listed
1298 in regex.h. Obviously the order here has to be same as there.
1299 POSIX doesn't require that we do anything for REG_NOERROR,
1300 but why not be nice? */
1301
1302static const char re_error_msgid[] =
1303 {
1304#define REG_NOERROR_IDX 0
1305 gettext_noop ("Success") /* REG_NOERROR */
1306 "\0"
1307#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1308 gettext_noop ("No match") /* REG_NOMATCH */
1309 "\0"
1310#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1311 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1312 "\0"
1313#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1314 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1315 "\0"
1316#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1317 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1318 "\0"
1319#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1320 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1321 "\0"
1322#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1323 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1324 "\0"
1325#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1326 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1327 "\0"
1328#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1329 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1330 "\0"
1331#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1332 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1333 "\0"
1334#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1335 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1336 "\0"
1337#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1338 gettext_noop ("Invalid range end") /* REG_ERANGE */
1339 "\0"
1340#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1341 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1342 "\0"
1343#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1344 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1345 "\0"
1346#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1347 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1348 "\0"
1349#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1350 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1351 "\0"
1352#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1353 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1354 };
1355
1356static const size_t re_error_msgid_idx[] =
1357 {
1358 REG_NOERROR_IDX,
1359 REG_NOMATCH_IDX,
1360 REG_BADPAT_IDX,
1361 REG_ECOLLATE_IDX,
1362 REG_ECTYPE_IDX,
1363 REG_EESCAPE_IDX,
1364 REG_ESUBREG_IDX,
1365 REG_EBRACK_IDX,
1366 REG_EPAREN_IDX,
1367 REG_EBRACE_IDX,
1368 REG_BADBR_IDX,
1369 REG_ERANGE_IDX,
1370 REG_ESPACE_IDX,
1371 REG_BADRPT_IDX,
1372 REG_EEND_IDX,
1373 REG_ESIZE_IDX,
1374 REG_ERPAREN_IDX
1375 };
1376
1377
1378/* Avoiding alloca during matching, to placate r_alloc. */
1379
1380/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1381 searching and matching functions should not call alloca. On some
1382 systems, alloca is implemented in terms of malloc, and if we're
1383 using the relocating allocator routines, then malloc could cause a
1384 relocation, which might (if the strings being searched are in the
1385 ralloc heap) shift the data out from underneath the regexp
1386 routines.
1387
1388 Here's another reason to avoid allocation: Emacs
1389 processes input from X in a signal handler; processing X input may
1390 call malloc; if input arrives while a matching routine is calling
1391 malloc, then we're scrod. But Emacs can't just block input while
1392 calling matching routines; then we don't notice interrupts when
1393 they come in. So, Emacs blocks input around all regexp calls
1394 except the matching calls, which it leaves unprotected, in the
1395 faith that they will not malloc. */
1396
1397/* Normally, this is fine. */
1398#define MATCH_MAY_ALLOCATE
1399
1400/* When using GNU C, we are not REALLY using the C alloca, no matter
1401 what config.h may say. So don't take precautions for it. */
1402#ifdef __GNUC__
1403# undef C_ALLOCA
1404#endif
1405
1406/* The match routines may not allocate if (1) they would do it with malloc
1407 and (2) it's not safe for them to use malloc.
1408 Note that if REL_ALLOC is defined, matching would not use malloc for the
1409 failure stack, but we would still use it for the register vectors;
1410 so REL_ALLOC should not affect this. */
1411#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1412# undef MATCH_MAY_ALLOCATE
1413#endif
1414
1415
1416
1417/* Failure stack declarations and macros; both re_compile_fastmap and
1418 re_match_2 use a failure stack. These have to be macros because of
1419 REGEX_ALLOCATE_STACK. */
1420
1421
1422/* Number of failure points for which to initially allocate space
1423 when matching. If this number is exceeded, we allocate more
1424 space, so it is not a hard limit. */
1425#ifndef INIT_FAILURE_ALLOC
1426# define INIT_FAILURE_ALLOC 5
1427#endif
1428
1429/* Roughly the maximum number of failure points on the stack. Would be
1430 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1431 This is a variable only so users of regex can assign to it; we never
1432 change it ourselves. */
1433
1434#ifdef INT_IS_16BIT
1435
1436# if defined MATCH_MAY_ALLOCATE
1437/* 4400 was enough to cause a crash on Alpha OSF/1,
1438 whose default stack limit is 2mb. */
1439long int re_max_failures = 4000;
1440# else
1441long int re_max_failures = 2000;
1442# endif
1443
1444union fail_stack_elt
1445{
1446 US_CHAR_TYPE *pointer;
1447 long int integer;
1448};
1449
1450typedef union fail_stack_elt fail_stack_elt_t;
1451
1452typedef struct
1453{
1454 fail_stack_elt_t *stack;
1455 unsigned long int size;
1456 unsigned long int avail; /* Offset of next open position. */
1457} fail_stack_type;
1458
1459#else /* not INT_IS_16BIT */
1460
1461# if defined MATCH_MAY_ALLOCATE
1462/* 4400 was enough to cause a crash on Alpha OSF/1,
1463 whose default stack limit is 2mb. */
1464int re_max_failures = 4000;
1465# else
1466int re_max_failures = 2000;
1467# endif
1468
1469union fail_stack_elt
1470{
1471 US_CHAR_TYPE *pointer;
1472 int integer;
1473};
1474
1475typedef union fail_stack_elt fail_stack_elt_t;
1476
1477typedef struct
1478{
1479 fail_stack_elt_t *stack;
1480 unsigned size;
1481 unsigned avail; /* Offset of next open position. */
1482} fail_stack_type;
1483
1484#endif /* INT_IS_16BIT */
1485
1486#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1487#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1488#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1489
1490
1491/* Define macros to initialize and free the failure stack.
1492 Do `return -2' if the alloc fails. */
1493
1494#ifdef MATCH_MAY_ALLOCATE
1495# define INIT_FAIL_STACK() \
1496 do { \
1497 fail_stack.stack = (fail_stack_elt_t *) \
1498 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1499 \
1500 if (fail_stack.stack == NULL) \
1501 return -2; \
1502 \
1503 fail_stack.size = INIT_FAILURE_ALLOC; \
1504 fail_stack.avail = 0; \
1505 } while (0)
1506
1507# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1508#else
1509# define INIT_FAIL_STACK() \
1510 do { \
1511 fail_stack.avail = 0; \
1512 } while (0)
1513
1514# define RESET_FAIL_STACK()
1515#endif
1516
1517
1518/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1519
1520 Return 1 if succeeds, and 0 if either ran out of memory
1521 allocating space for it or it was already too large.
1522
1523 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1524
1525#define DOUBLE_FAIL_STACK(fail_stack) \
1526 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1527 ? 0 \
1528 : ((fail_stack).stack = (fail_stack_elt_t *) \
1529 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1530 (fail_stack).size * sizeof (fail_stack_elt_t), \
1531 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1532 \
1533 (fail_stack).stack == NULL \
1534 ? 0 \
1535 : ((fail_stack).size <<= 1, \
1536 1)))
1537
1538
1539/* Push pointer POINTER on FAIL_STACK.
1540 Return 1 if was able to do so and 0 if ran out of memory allocating
1541 space to do so. */
1542#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1543 ((FAIL_STACK_FULL () \
1544 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1545 ? 0 \
1546 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1547 1))
1548
1549/* Push a pointer value onto the failure stack.
1550 Assumes the variable `fail_stack'. Probably should only
1551 be called from within `PUSH_FAILURE_POINT'. */
1552#define PUSH_FAILURE_POINTER(item) \
1553 fail_stack.stack[fail_stack.avail++].pointer = (US_CHAR_TYPE *) (item)
1554
1555/* This pushes an integer-valued item onto the failure stack.
1556 Assumes the variable `fail_stack'. Probably should only
1557 be called from within `PUSH_FAILURE_POINT'. */
1558#define PUSH_FAILURE_INT(item) \
1559 fail_stack.stack[fail_stack.avail++].integer = (item)
1560
1561/* Push a fail_stack_elt_t value onto the failure stack.
1562 Assumes the variable `fail_stack'. Probably should only
1563 be called from within `PUSH_FAILURE_POINT'. */
1564#define PUSH_FAILURE_ELT(item) \
1565 fail_stack.stack[fail_stack.avail++] = (item)
1566
1567/* These three POP... operations complement the three PUSH... operations.
1568 All assume that `fail_stack' is nonempty. */
1569#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1570#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1571#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1572
1573/* Used to omit pushing failure point id's when we're not debugging. */
1574#ifdef DEBUG
1575# define DEBUG_PUSH PUSH_FAILURE_INT
1576# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1577#else
1578# define DEBUG_PUSH(item)
1579# define DEBUG_POP(item_addr)
1580#endif
1581
1582
1583/* Push the information about the state we will need
1584 if we ever fail back to it.
1585
1586 Requires variables fail_stack, regstart, regend, reg_info, and
1587 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1588 be declared.
1589
1590 Does `return FAILURE_CODE' if runs out of memory. */
1591
1592#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1593 do { \
1594 char *destination; \
1595 /* Must be int, so when we don't save any registers, the arithmetic \
1596 of 0 + -1 isn't done as unsigned. */ \
1597 /* Can't be int, since there is not a shred of a guarantee that int \
1598 is wide enough to hold a value of something to which pointer can \
1599 be assigned */ \
1600 active_reg_t this_reg; \
1601 \
1602 DEBUG_STATEMENT (failure_id++); \
1603 DEBUG_STATEMENT (nfailure_points_pushed++); \
1604 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1605 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1606 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1607 \
1608 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1609 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1610 \
1611 /* Ensure we have enough space allocated for what we will push. */ \
1612 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1613 { \
1614 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1615 return failure_code; \
1616 \
1617 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1618 (fail_stack).size); \
1619 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1620 } \
1621 \
1622 /* Push the info, starting with the registers. */ \
1623 DEBUG_PRINT1 ("\n"); \
1624 \
1625 if (1) \
1626 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1627 this_reg++) \
1628 { \
1629 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1630 DEBUG_STATEMENT (num_regs_pushed++); \
1631 \
1632 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1633 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1634 \
1635 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1636 PUSH_FAILURE_POINTER (regend[this_reg]); \
1637 \
1638 DEBUG_PRINT2 (" info: %p\n ", \
1639 reg_info[this_reg].word.pointer); \
1640 DEBUG_PRINT2 (" match_null=%d", \
1641 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1642 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1643 DEBUG_PRINT2 (" matched_something=%d", \
1644 MATCHED_SOMETHING (reg_info[this_reg])); \
1645 DEBUG_PRINT2 (" ever_matched=%d", \
1646 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1647 DEBUG_PRINT1 ("\n"); \
1648 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1649 } \
1650 \
1651 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1652 PUSH_FAILURE_INT (lowest_active_reg); \
1653 \
1654 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1655 PUSH_FAILURE_INT (highest_active_reg); \
1656 \
1657 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1658 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1659 PUSH_FAILURE_POINTER (pattern_place); \
1660 \
1661 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1662 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1663 size2); \
1664 DEBUG_PRINT1 ("'\n"); \
1665 PUSH_FAILURE_POINTER (string_place); \
1666 \
1667 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1668 DEBUG_PUSH (failure_id); \
1669 } while (0)
1670
1671/* This is the number of items that are pushed and popped on the stack
1672 for each register. */
1673#define NUM_REG_ITEMS 3
1674
1675/* Individual items aside from the registers. */
1676#ifdef DEBUG
1677# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1678#else
1679# define NUM_NONREG_ITEMS 4
1680#endif
1681
1682/* We push at most this many items on the stack. */
1683/* We used to use (num_regs - 1), which is the number of registers
1684 this regexp will save; but that was changed to 5
1685 to avoid stack overflow for a regexp with lots of parens. */
1686#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1687
1688/* We actually push this many items. */
1689#define NUM_FAILURE_ITEMS \
1690 (((0 \
1691 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1692 * NUM_REG_ITEMS) \
1693 + NUM_NONREG_ITEMS)
1694
1695/* How many items can still be added to the stack without overflowing it. */
1696#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1697
1698
1699/* Pops what PUSH_FAIL_STACK pushes.
1700
1701 We restore into the parameters, all of which should be lvalues:
1702 STR -- the saved data position.
1703 PAT -- the saved pattern position.
1704 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1705 REGSTART, REGEND -- arrays of string positions.
1706 REG_INFO -- array of information about each subexpression.
1707
1708 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1709 `pend', `string1', `size1', `string2', and `size2'. */
1710#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1711{ \
1712 DEBUG_STATEMENT (unsigned failure_id;) \
1713 active_reg_t this_reg; \
1714 const US_CHAR_TYPE *string_temp; \
1715 \
1716 assert (!FAIL_STACK_EMPTY ()); \
1717 \
1718 /* Remove failure points and point to how many regs pushed. */ \
1719 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1720 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1721 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1722 \
1723 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1724 \
1725 DEBUG_POP (&failure_id); \
1726 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1727 \
1728 /* If the saved string location is NULL, it came from an \
1729 on_failure_keep_string_jump opcode, and we want to throw away the \
1730 saved NULL, thus retaining our current position in the string. */ \
1731 string_temp = POP_FAILURE_POINTER (); \
1732 if (string_temp != NULL) \
1733 str = (const CHAR_TYPE *) string_temp; \
1734 \
1735 DEBUG_PRINT2 (" Popping string %p: `", str); \
1736 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1737 DEBUG_PRINT1 ("'\n"); \
1738 \
1739 pat = (US_CHAR_TYPE *) POP_FAILURE_POINTER (); \
1740 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1741 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1742 \
1743 /* Restore register info. */ \
1744 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1745 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1746 \
1747 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1748 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1749 \
1750 if (1) \
1751 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1752 { \
1753 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1754 \
1755 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1756 DEBUG_PRINT2 (" info: %p\n", \
1757 reg_info[this_reg].word.pointer); \
1758 \
1759 regend[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER (); \
1760 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1761 \
1762 regstart[this_reg] = (const CHAR_TYPE *) POP_FAILURE_POINTER ();\
1763 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1764 } \
1765 else \
1766 { \
1767 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1768 { \
1769 reg_info[this_reg].word.integer = 0; \
1770 regend[this_reg] = 0; \
1771 regstart[this_reg] = 0; \
1772 } \
1773 highest_active_reg = high_reg; \
1774 } \
1775 \
1776 set_regs_matched_done = 0; \
1777 DEBUG_STATEMENT (nfailure_points_popped++); \
1778} /* POP_FAILURE_POINT */
1779
1780
1781
1782/* Structure for per-register (a.k.a. per-group) information.
1783 Other register information, such as the
1784 starting and ending positions (which are addresses), and the list of
1785 inner groups (which is a bits list) are maintained in separate
1786 variables.
1787
1788 We are making a (strictly speaking) nonportable assumption here: that
1789 the compiler will pack our bit fields into something that fits into
1790 the type of `word', i.e., is something that fits into one item on the
1791 failure stack. */
1792
1793
1794/* Declarations and macros for re_match_2. */
1795
1796typedef union
1797{
1798 fail_stack_elt_t word;
1799 struct
1800 {
1801 /* This field is one if this group can match the empty string,
1802 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1803#define MATCH_NULL_UNSET_VALUE 3
1804 unsigned match_null_string_p : 2;
1805 unsigned is_active : 1;
1806 unsigned matched_something : 1;
1807 unsigned ever_matched_something : 1;
1808 } bits;
1809} register_info_type;
1810
1811#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1812#define IS_ACTIVE(R) ((R).bits.is_active)
1813#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1814#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1815
1816
1817/* Call this when have matched a real character; it sets `matched' flags
1818 for the subexpressions which we are currently inside. Also records
1819 that those subexprs have matched. */
1820#define SET_REGS_MATCHED() \
1821 do \
1822 { \
1823 if (!set_regs_matched_done) \
1824 { \
1825 active_reg_t r; \
1826 set_regs_matched_done = 1; \
1827 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1828 { \
1829 MATCHED_SOMETHING (reg_info[r]) \
1830 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1831 = 1; \
1832 } \
1833 } \
1834 } \
1835 while (0)
1836
1837/* Registers are set to a sentinel when they haven't yet matched. */
1838static CHAR_TYPE reg_unset_dummy;
1839#define REG_UNSET_VALUE (&reg_unset_dummy)
1840#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1841
1842
1843/* Subroutine declarations and macros for regex_compile. */
1844
1845static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1846 reg_syntax_t syntax,
1847 struct re_pattern_buffer *bufp));
1848static void store_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc, int arg));
1849static void store_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
1850 int arg1, int arg2));
1851static void insert_op1 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
1852 int arg, US_CHAR_TYPE *end));
1853static void insert_op2 _RE_ARGS ((re_opcode_t op, US_CHAR_TYPE *loc,
1854 int arg1, int arg2, US_CHAR_TYPE *end));
1855static boolean at_begline_loc_p _RE_ARGS ((const CHAR_TYPE *pattern,
1856 const CHAR_TYPE *p,
1857 reg_syntax_t syntax));
1858static boolean at_endline_loc_p _RE_ARGS ((const CHAR_TYPE *p,
1859 const CHAR_TYPE *pend,
1860 reg_syntax_t syntax));
1861#ifdef MBS_SUPPORT
1862static reg_errcode_t compile_range _RE_ARGS ((CHAR_TYPE range_start,
1863 const CHAR_TYPE **p_ptr,
1864 const CHAR_TYPE *pend,
1865 char *translate,
1866 reg_syntax_t syntax,
1867 US_CHAR_TYPE *b,
1868 CHAR_TYPE *char_set));
1869static void insert_space _RE_ARGS ((int num, CHAR_TYPE *loc, CHAR_TYPE *end));
1870#else
1871static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,
1872 const CHAR_TYPE **p_ptr,
1873 const CHAR_TYPE *pend,
1874 char *translate,
1875 reg_syntax_t syntax,
1876 US_CHAR_TYPE *b));
1877#endif /* MBS_SUPPORT */
1878
1879/* Fetch the next character in the uncompiled pattern---translating it
1880 if necessary. Also cast from a signed character in the constant
1881 string passed to us by the user to an unsigned char that we can use
1882 as an array index (in, e.g., `translate'). */
1883/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1884 because it is impossible to allocate 4GB array for some encodings
1885 which have 4 byte character_set like UCS4. */
1886#ifndef PATFETCH
1887# ifdef MBS_SUPPORT
1888# define PATFETCH(c) \
1889 do {if (p == pend) return REG_EEND; \
1890 c = (US_CHAR_TYPE) *p++; \
1891 if (translate && (c <= 0xff)) c = (US_CHAR_TYPE) translate[c]; \
1892 } while (0)
1893# else
1894# define PATFETCH(c) \
1895 do {if (p == pend) return REG_EEND; \
1896 c = (unsigned char) *p++; \
1897 if (translate) c = (unsigned char) translate[c]; \
1898 } while (0)
1899# endif /* MBS_SUPPORT */
1900#endif
1901
1902/* Fetch the next character in the uncompiled pattern, with no
1903 translation. */
1904#define PATFETCH_RAW(c) \
1905 do {if (p == pend) return REG_EEND; \
1906 c = (US_CHAR_TYPE) *p++; \
1907 } while (0)
1908
1909/* Go backwards one character in the pattern. */
1910#define PATUNFETCH p--
1911
1912
1913/* If `translate' is non-null, return translate[D], else just D. We
1914 cast the subscript to translate because some data is declared as
1915 `char *', to avoid warnings when a string constant is passed. But
1916 when we use a character as a subscript we must make it unsigned. */
1917/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1918 because it is impossible to allocate 4GB array for some encodings
1919 which have 4 byte character_set like UCS4. */
1920#ifndef TRANSLATE
1921# ifdef MBS_SUPPORT
1922# define TRANSLATE(d) \
1923 ((translate && ((US_CHAR_TYPE) (d)) <= 0xff) \
1924 ? (char) translate[(unsigned char) (d)] : (d))
1925#else
1926# define TRANSLATE(d) \
1927 (translate ? (char) translate[(unsigned char) (d)] : (d))
1928# endif /* MBS_SUPPORT */
1929#endif
1930
1931
1932/* Macros for outputting the compiled pattern into `buffer'. */
1933
1934/* If the buffer isn't allocated when it comes in, use this. */
1935#define INIT_BUF_SIZE (32 * sizeof(US_CHAR_TYPE))
1936
1937/* Make sure we have at least N more bytes of space in buffer. */
1938#ifdef MBS_SUPPORT
1939# define GET_BUFFER_SPACE(n) \
1940 while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
1941 + (n)*sizeof(CHAR_TYPE)) > bufp->allocated) \
1942 EXTEND_BUFFER ()
1943#else
1944# define GET_BUFFER_SPACE(n) \
1945 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1946 EXTEND_BUFFER ()
1947#endif /* MBS_SUPPORT */
1948
1949/* Make sure we have one more byte of buffer space and then add C to it. */
1950#define BUF_PUSH(c) \
1951 do { \
1952 GET_BUFFER_SPACE (1); \
1953 *b++ = (US_CHAR_TYPE) (c); \
1954 } while (0)
1955
1956
1957/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1958#define BUF_PUSH_2(c1, c2) \
1959 do { \
1960 GET_BUFFER_SPACE (2); \
1961 *b++ = (US_CHAR_TYPE) (c1); \
1962 *b++ = (US_CHAR_TYPE) (c2); \
1963 } while (0)
1964
1965
1966/* As with BUF_PUSH_2, except for three bytes. */
1967#define BUF_PUSH_3(c1, c2, c3) \
1968 do { \
1969 GET_BUFFER_SPACE (3); \
1970 *b++ = (US_CHAR_TYPE) (c1); \
1971 *b++ = (US_CHAR_TYPE) (c2); \
1972 *b++ = (US_CHAR_TYPE) (c3); \
1973 } while (0)
1974
1975/* Store a jump with opcode OP at LOC to location TO. We store a
1976 relative address offset by the three bytes the jump itself occupies. */
1977#define STORE_JUMP(op, loc, to) \
1978 store_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
1979
1980/* Likewise, for a two-argument jump. */
1981#define STORE_JUMP2(op, loc, to, arg) \
1982 store_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
1983
1984/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1985#define INSERT_JUMP(op, loc, to) \
1986 insert_op1 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
1987
1988/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1989#define INSERT_JUMP2(op, loc, to, arg) \
1990 insert_op2 (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
1991 arg, b)
1992
1993
1994/* This is not an arbitrary limit: the arguments which represent offsets
1995 into the pattern are two bytes long. So if 2^16 bytes turns out to
1996 be too small, many things would have to change. */
1997/* Any other compiler which, like MSC, has allocation limit below 2^16
1998 bytes will have to use approach similar to what was done below for
1999 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
2000 reallocating to 0 bytes. Such thing is not going to work too well.
2001 You have been warned!! */
2002#if defined _MSC_VER && !defined WIN32
2003/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
2004 The REALLOC define eliminates a flurry of conversion warnings,
2005 but is not required. */
2006# define MAX_BUF_SIZE 65500L
2007# define REALLOC(p,s) realloc ((p), (size_t) (s))
2008#else
2009# define MAX_BUF_SIZE (1L << 16)
2010# define REALLOC(p,s) realloc ((p), (s))
2011#endif
2012
2013/* Extend the buffer by twice its current size via realloc and
2014 reset the pointers that pointed into the old block to point to the
2015 correct places in the new one. If extending the buffer results in it
2016 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
2017#if __BOUNDED_POINTERS__
2018# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
2019# define MOVE_BUFFER_POINTER(P) \
2020 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
2021# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
2022 else \
2023 { \
2024 SET_HIGH_BOUND (b); \
2025 SET_HIGH_BOUND (begalt); \
2026 if (fixup_alt_jump) \
2027 SET_HIGH_BOUND (fixup_alt_jump); \
2028 if (laststart) \
2029 SET_HIGH_BOUND (laststart); \
2030 if (pending_exact) \
2031 SET_HIGH_BOUND (pending_exact); \
2032 }
2033#else
2034# define MOVE_BUFFER_POINTER(P) (P) += incr
2035# define ELSE_EXTEND_BUFFER_HIGH_BOUND
2036#endif
2037
2038#ifdef MBS_SUPPORT
2039# define EXTEND_BUFFER() \
2040 do { \
2041 US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
2042 int wchar_count; \
2043 if (bufp->allocated + sizeof(US_CHAR_TYPE) > MAX_BUF_SIZE) \
2044 return REG_ESIZE; \
2045 bufp->allocated <<= 1; \
2046 if (bufp->allocated > MAX_BUF_SIZE) \
2047 bufp->allocated = MAX_BUF_SIZE; \
2048 /* How many characters the new buffer can have? */ \
2049 wchar_count = bufp->allocated / sizeof(US_CHAR_TYPE); \
2050 if (wchar_count == 0) wchar_count = 1; \
2051 /* Truncate the buffer to CHAR_TYPE align. */ \
2052 bufp->allocated = wchar_count * sizeof(US_CHAR_TYPE); \
2053 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, US_CHAR_TYPE); \
2054 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
2055 if (COMPILED_BUFFER_VAR == NULL) \
2056 return REG_ESPACE; \
2057 /* If the buffer moved, move all the pointers into it. */ \
2058 if (old_buffer != COMPILED_BUFFER_VAR) \
2059 { \
2060 int incr = COMPILED_BUFFER_VAR - old_buffer; \
2061 MOVE_BUFFER_POINTER (b); \
2062 MOVE_BUFFER_POINTER (begalt); \
2063 if (fixup_alt_jump) \
2064 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2065 if (laststart) \
2066 MOVE_BUFFER_POINTER (laststart); \
2067 if (pending_exact) \
2068 MOVE_BUFFER_POINTER (pending_exact); \
2069 } \
2070 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2071 } while (0)
2072#else
2073# define EXTEND_BUFFER() \
2074 do { \
2075 US_CHAR_TYPE *old_buffer = COMPILED_BUFFER_VAR; \
2076 if (bufp->allocated == MAX_BUF_SIZE) \
2077 return REG_ESIZE; \
2078 bufp->allocated <<= 1; \
2079 if (bufp->allocated > MAX_BUF_SIZE) \
2080 bufp->allocated = MAX_BUF_SIZE; \
2081 bufp->buffer = (US_CHAR_TYPE *) REALLOC (COMPILED_BUFFER_VAR, \
2082 bufp->allocated); \
2083 if (COMPILED_BUFFER_VAR == NULL) \
2084 return REG_ESPACE; \
2085 /* If the buffer moved, move all the pointers into it. */ \
2086 if (old_buffer != COMPILED_BUFFER_VAR) \
2087 { \
2088 int incr = COMPILED_BUFFER_VAR - old_buffer; \
2089 MOVE_BUFFER_POINTER (b); \
2090 MOVE_BUFFER_POINTER (begalt); \
2091 if (fixup_alt_jump) \
2092 MOVE_BUFFER_POINTER (fixup_alt_jump); \
2093 if (laststart) \
2094 MOVE_BUFFER_POINTER (laststart); \
2095 if (pending_exact) \
2096 MOVE_BUFFER_POINTER (pending_exact); \
2097 } \
2098 ELSE_EXTEND_BUFFER_HIGH_BOUND \
2099 } while (0)
2100#endif /* MBS_SUPPORT */
2101
2102/* Since we have one byte reserved for the register number argument to
2103 {start,stop}_memory, the maximum number of groups we can report
2104 things about is what fits in that byte. */
2105#define MAX_REGNUM 255
2106
2107/* But patterns can have more than `MAX_REGNUM' registers. We just
2108 ignore the excess. */
2109typedef unsigned regnum_t;
2110
2111
2112/* Macros for the compile stack. */
2113
2114/* Since offsets can go either forwards or backwards, this type needs to
2115 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
2116/* int may be not enough when sizeof(int) == 2. */
2117typedef long pattern_offset_t;
2118
2119typedef struct
2120{
2121 pattern_offset_t begalt_offset;
2122 pattern_offset_t fixup_alt_jump;
2123 pattern_offset_t inner_group_offset;
2124 pattern_offset_t laststart_offset;
2125 regnum_t regnum;
2126} compile_stack_elt_t;
2127
2128
2129typedef struct
2130{
2131 compile_stack_elt_t *stack;
2132 unsigned size;
2133 unsigned avail; /* Offset of next open position. */
2134} compile_stack_type;
2135
2136
2137#define INIT_COMPILE_STACK_SIZE 32
2138
2139#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
2140#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
2141
2142/* The next available element. */
2143#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
2144
2145
2146/* Set the bit for character C in a list. */
2147#define SET_LIST_BIT(c) \
2148 (b[((unsigned char) (c)) / BYTEWIDTH] \
2149 |= 1 << (((unsigned char) c) % BYTEWIDTH))
2150
2151
2152/* Get the next unsigned number in the uncompiled pattern. */
2153#define GET_UNSIGNED_NUMBER(num) \
2154 { \
2155 while (p != pend) \
2156 { \
2157 PATFETCH (c); \
2158 if (! ('0' <= c && c <= '9')) \
2159 break; \
2160 if (num <= RE_DUP_MAX) \
2161 { \
2162 if (num < 0) \
2163 num = 0; \
2164 num = num * 10 + c - '0'; \
2165 } \
2166 } \
2167 }
2168
2169#if defined _LIBC || WIDE_CHAR_SUPPORT
2170/* The GNU C library provides support for user-defined character classes
2171 and the functions from ISO C amendement 1. */
2172# ifdef CHARCLASS_NAME_MAX
2173# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
2174# else
2175/* This shouldn't happen but some implementation might still have this
2176 problem. Use a reasonable default value. */
2177# define CHAR_CLASS_MAX_LENGTH 256
2178# endif
2179
2180# ifdef _LIBC
2181# define IS_CHAR_CLASS(string) __wctype (string)
2182# else
2183# define IS_CHAR_CLASS(string) wctype (string)
2184# endif
2185#else
2186# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
2187
2188# define IS_CHAR_CLASS(string) \
2189 (STREQ (string, "alpha") || STREQ (string, "upper") \
2190 || STREQ (string, "lower") || STREQ (string, "digit") \
2191 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
2192 || STREQ (string, "space") || STREQ (string, "print") \
2193 || STREQ (string, "punct") || STREQ (string, "graph") \
2194 || STREQ (string, "cntrl") || STREQ (string, "blank"))
2195#endif
2196
2197
2198#ifndef MATCH_MAY_ALLOCATE
2199
2200/* If we cannot allocate large objects within re_match_2_internal,
2201 we make the fail stack and register vectors global.
2202 The fail stack, we grow to the maximum size when a regexp
2203 is compiled.
2204 The register vectors, we adjust in size each time we
2205 compile a regexp, according to the number of registers it needs. */
2206
2207static fail_stack_type fail_stack;
2208
2209/* Size with which the following vectors are currently allocated.
2210 That is so we can make them bigger as needed,
2211 but never make them smaller. */
2212static int regs_allocated_size;
2213
2214static const char ** regstart, ** regend;
2215static const char ** old_regstart, ** old_regend;
2216static const char **best_regstart, **best_regend;
2217static register_info_type *reg_info;
2218static const char **reg_dummy;
2219static register_info_type *reg_info_dummy;
2220
2221/* Make the register vectors big enough for NUM_REGS registers,
2222 but don't make them smaller. */
2223
2224static
2225regex_grow_registers (num_regs)
2226 int num_regs;
2227{
2228 if (num_regs > regs_allocated_size)
2229 {
2230 RETALLOC_IF (regstart, num_regs, const char *);
2231 RETALLOC_IF (regend, num_regs, const char *);
2232 RETALLOC_IF (old_regstart, num_regs, const char *);
2233 RETALLOC_IF (old_regend, num_regs, const char *);
2234 RETALLOC_IF (best_regstart, num_regs, const char *);
2235 RETALLOC_IF (best_regend, num_regs, const char *);
2236 RETALLOC_IF (reg_info, num_regs, register_info_type);
2237 RETALLOC_IF (reg_dummy, num_regs, const char *);
2238 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
2239
2240 regs_allocated_size = num_regs;
2241 }
2242}
2243
2244#endif /* not MATCH_MAY_ALLOCATE */
2245
2246
2247static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2248 compile_stack,
2249 regnum_t regnum));
2250
2251/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2252 Returns one of error codes defined in `regex.h', or zero for success.
2253
2254 Assumes the `allocated' (and perhaps `buffer') and `translate'
2255 fields are set in BUFP on entry.
2256
2257 If it succeeds, results are put in BUFP (if it returns an error, the
2258 contents of BUFP are undefined):
2259 `buffer' is the compiled pattern;
2260 `syntax' is set to SYNTAX;
2261 `used' is set to the length of the compiled pattern;
2262 `fastmap_accurate' is zero;
2263 `re_nsub' is the number of subexpressions in PATTERN;
2264 `not_bol' and `not_eol' are zero;
2265
2266 The `fastmap' and `newline_anchor' fields are neither
2267 examined nor set. */
2268
2269/* Return, freeing storage we allocated. */
2270#ifdef MBS_SUPPORT
2271# define FREE_STACK_RETURN(value) \
2272 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
2273#else
2274# define FREE_STACK_RETURN(value) \
2275 return (free (compile_stack.stack), value)
2276#endif /* MBS_SUPPORT */
2277
2278static reg_errcode_t
2279#ifdef MBS_SUPPORT
2280regex_compile (cpattern, csize, syntax, bufp)
2281 const char *cpattern;
2282 size_t csize;
2283#else
2284regex_compile (pattern, size, syntax, bufp)
2285 const char *pattern;
2286 size_t size;
2287#endif /* MBS_SUPPORT */
2288 reg_syntax_t syntax;
2289 struct re_pattern_buffer *bufp;
2290{
2291 /* We fetch characters from PATTERN here. Even though PATTERN is
2292 `char *' (i.e., signed), we declare these variables as unsigned, so
2293 they can be reliably used as array indices. */
2294 register US_CHAR_TYPE c, c1;
2295
2296#ifdef MBS_SUPPORT
2297 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2298 CHAR_TYPE *pattern, *COMPILED_BUFFER_VAR;
2299 size_t size;
2300 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
2301 int *mbs_offset = NULL;
2302 /* It hold whether each wchar_t is binary data or not. */
2303 char *is_binary = NULL;
2304 /* A flag whether exactn is handling binary data or not. */
2305 char is_exactn_bin = FALSE;
2306#endif /* MBS_SUPPORT */
2307
2308 /* A random temporary spot in PATTERN. */
2309 const CHAR_TYPE *p1;
2310
2311 /* Points to the end of the buffer, where we should append. */
2312 register US_CHAR_TYPE *b;
2313
2314 /* Keeps track of unclosed groups. */
2315 compile_stack_type compile_stack;
2316
2317 /* Points to the current (ending) position in the pattern. */
2318#ifdef MBS_SUPPORT
2319 const CHAR_TYPE *p;
2320 const CHAR_TYPE *pend;
2321#else
2322 const CHAR_TYPE *p = pattern;
2323 const CHAR_TYPE *pend = pattern + size;
2324#endif /* MBS_SUPPORT */
2325
2326 /* How to translate the characters in the pattern. */
2327 RE_TRANSLATE_TYPE translate = bufp->translate;
2328
2329 /* Address of the count-byte of the most recently inserted `exactn'
2330 command. This makes it possible to tell if a new exact-match
2331 character can be added to that command or if the character requires
2332 a new `exactn' command. */
2333 US_CHAR_TYPE *pending_exact = 0;
2334
2335 /* Address of start of the most recently finished expression.
2336 This tells, e.g., postfix * where to find the start of its
2337 operand. Reset at the beginning of groups and alternatives. */
2338 US_CHAR_TYPE *laststart = 0;
2339
2340 /* Address of beginning of regexp, or inside of last group. */
2341 US_CHAR_TYPE *begalt;
2342
2343 /* Address of the place where a forward jump should go to the end of
2344 the containing expression. Each alternative of an `or' -- except the
2345 last -- ends with a forward jump of this sort. */
2346 US_CHAR_TYPE *fixup_alt_jump = 0;
2347
2348 /* Counts open-groups as they are encountered. Remembered for the
2349 matching close-group on the compile stack, so the same register
2350 number is put in the stop_memory as the start_memory. */
2351 regnum_t regnum = 0;
2352
2353#ifdef MBS_SUPPORT
2354 /* Initialize the wchar_t PATTERN and offset_buffer. */
2355 p = pend = pattern = TALLOC(csize + 1, CHAR_TYPE);
2356 p[csize] = L'\0'; /* sentinel */
2357 mbs_offset = TALLOC(csize + 1, int);
2358 is_binary = TALLOC(csize + 1, char);
2359 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2360 {
2361 if (pattern) free(pattern);
2362 if (mbs_offset) free(mbs_offset);
2363 if (is_binary) free(is_binary);
2364 return REG_ESPACE;
2365 }
2366 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2367 pend = p + size;
2368 if (size < 0)
2369 {
2370 if (pattern) free(pattern);
2371 if (mbs_offset) free(mbs_offset);
2372 if (is_binary) free(is_binary);
2373 return REG_BADPAT;
2374 }
2375#endif
2376
2377#ifdef DEBUG
2378 DEBUG_PRINT1 ("\nCompiling pattern: ");
2379 if (debug)
2380 {
2381 unsigned debug_count;
2382
2383 for (debug_count = 0; debug_count < size; debug_count++)
2384 PUT_CHAR (pattern[debug_count]);
2385 putchar ('\n');
2386 }
2387#endif /* DEBUG */
2388
2389 /* Initialize the compile stack. */
2390 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2391 if (compile_stack.stack == NULL)
2392 {
2393#ifdef MBS_SUPPORT
2394 if (pattern) free(pattern);
2395 if (mbs_offset) free(mbs_offset);
2396 if (is_binary) free(is_binary);
2397#endif
2398 return REG_ESPACE;
2399 }
2400
2401 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2402 compile_stack.avail = 0;
2403
2404 /* Initialize the pattern buffer. */
2405 bufp->syntax = syntax;
2406 bufp->fastmap_accurate = 0;
2407 bufp->not_bol = bufp->not_eol = 0;
2408
2409 /* Set `used' to zero, so that if we return an error, the pattern
2410 printer (for debugging) will think there's no pattern. We reset it
2411 at the end. */
2412 bufp->used = 0;
2413
2414 /* Always count groups, whether or not bufp->no_sub is set. */
2415 bufp->re_nsub = 0;
2416
2417#if !defined emacs && !defined SYNTAX_TABLE
2418 /* Initialize the syntax table. */
2419 init_syntax_once ();
2420#endif
2421
2422 if (bufp->allocated == 0)
2423 {
2424 if (bufp->buffer)
2425 { /* If zero allocated, but buffer is non-null, try to realloc
2426 enough space. This loses if buffer's address is bogus, but
2427 that is the user's responsibility. */
2428#ifdef MBS_SUPPORT
2429 /* Free bufp->buffer and allocate an array for wchar_t pattern
2430 buffer. */
2431 free(bufp->buffer);
2432 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(US_CHAR_TYPE),
2433 US_CHAR_TYPE);
2434#else
2435 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, US_CHAR_TYPE);
2436#endif /* MBS_SUPPORT */
2437 }
2438 else
2439 { /* Caller did not allocate a buffer. Do it for them. */
2440 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(US_CHAR_TYPE),
2441 US_CHAR_TYPE);
2442 }
2443
2444 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2445#ifdef MBS_SUPPORT
2446 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2447#endif /* MBS_SUPPORT */
2448 bufp->allocated = INIT_BUF_SIZE;
2449 }
2450#ifdef MBS_SUPPORT
2451 else
2452 COMPILED_BUFFER_VAR = (US_CHAR_TYPE*) bufp->buffer;
2453#endif
2454
2455 begalt = b = COMPILED_BUFFER_VAR;
2456
2457 /* Loop through the uncompiled pattern until we're at the end. */
2458 while (p != pend)
2459 {
2460 PATFETCH (c);
2461
2462 switch (c)
2463 {
2464 case '^':
2465 {
2466 if ( /* If at start of pattern, it's an operator. */
2467 p == pattern + 1
2468 /* If context independent, it's an operator. */
2469 || syntax & RE_CONTEXT_INDEP_ANCHORS
2470 /* Otherwise, depends on what's come before. */
2471 || at_begline_loc_p (pattern, p, syntax))
2472 BUF_PUSH (begline);
2473 else
2474 goto normal_char;
2475 }
2476 break;
2477
2478
2479 case '$':
2480 {
2481 if ( /* If at end of pattern, it's an operator. */
2482 p == pend
2483 /* If context independent, it's an operator. */
2484 || syntax & RE_CONTEXT_INDEP_ANCHORS
2485 /* Otherwise, depends on what's next. */
2486 || at_endline_loc_p (p, pend, syntax))
2487 BUF_PUSH (endline);
2488 else
2489 goto normal_char;
2490 }
2491 break;
2492
2493
2494 case '+':
2495 case '?':
2496 if ((syntax & RE_BK_PLUS_QM)
2497 || (syntax & RE_LIMITED_OPS))
2498 goto normal_char;
2499 handle_plus:
2500 case '*':
2501 /* If there is no previous pattern... */
2502 if (!laststart)
2503 {
2504 if (syntax & RE_CONTEXT_INVALID_OPS)
2505 FREE_STACK_RETURN (REG_BADRPT);
2506 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2507 goto normal_char;
2508 }
2509
2510 {
2511 /* Are we optimizing this jump? */
2512 boolean keep_string_p = false;
2513
2514 /* 1 means zero (many) matches is allowed. */
2515 char zero_times_ok = 0, many_times_ok = 0;
2516
2517 /* If there is a sequence of repetition chars, collapse it
2518 down to just one (the right one). We can't combine
2519 interval operators with these because of, e.g., `a{2}*',
2520 which should only match an even number of `a's. */
2521
2522 for (;;)
2523 {
2524 zero_times_ok |= c != '+';
2525 many_times_ok |= c != '?';
2526
2527 if (p == pend)
2528 break;
2529
2530 PATFETCH (c);
2531
2532 if (c == '*'
2533 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2534 ;
2535
2536 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2537 {
2538 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2539
2540 PATFETCH (c1);
2541 if (!(c1 == '+' || c1 == '?'))
2542 {
2543 PATUNFETCH;
2544 PATUNFETCH;
2545 break;
2546 }
2547
2548 c = c1;
2549 }
2550 else
2551 {
2552 PATUNFETCH;
2553 break;
2554 }
2555
2556 /* If we get here, we found another repeat character. */
2557 }
2558
2559 /* Star, etc. applied to an empty pattern is equivalent
2560 to an empty pattern. */
2561 if (!laststart)
2562 break;
2563
2564 /* Now we know whether or not zero matches is allowed
2565 and also whether or not two or more matches is allowed. */
2566 if (many_times_ok)
2567 { /* More than one repetition is allowed, so put in at the
2568 end a backward relative jump from `b' to before the next
2569 jump we're going to put in below (which jumps from
2570 laststart to after this jump).
2571
2572 But if we are at the `*' in the exact sequence `.*\n',
2573 insert an unconditional jump backwards to the .,
2574 instead of the beginning of the loop. This way we only
2575 push a failure point once, instead of every time
2576 through the loop. */
2577 assert (p - 1 > pattern);
2578
2579 /* Allocate the space for the jump. */
2580 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2581
2582 /* We know we are not at the first character of the pattern,
2583 because laststart was nonzero. And we've already
2584 incremented `p', by the way, to be the character after
2585 the `*'. Do we have to do something analogous here
2586 for null bytes, because of RE_DOT_NOT_NULL? */
2587 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2588 && zero_times_ok
2589 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2590 && !(syntax & RE_DOT_NEWLINE))
2591 { /* We have .*\n. */
2592 STORE_JUMP (jump, b, laststart);
2593 keep_string_p = true;
2594 }
2595 else
2596 /* Anything else. */
2597 STORE_JUMP (maybe_pop_jump, b, laststart -
2598 (1 + OFFSET_ADDRESS_SIZE));
2599
2600 /* We've added more stuff to the buffer. */
2601 b += 1 + OFFSET_ADDRESS_SIZE;
2602 }
2603
2604 /* On failure, jump from laststart to b + 3, which will be the
2605 end of the buffer after this jump is inserted. */
2606 /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2607 'b + 3'. */
2608 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2609 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2610 : on_failure_jump,
2611 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2612 pending_exact = 0;
2613 b += 1 + OFFSET_ADDRESS_SIZE;
2614
2615 if (!zero_times_ok)
2616 {
2617 /* At least one repetition is required, so insert a
2618 `dummy_failure_jump' before the initial
2619 `on_failure_jump' instruction of the loop. This
2620 effects a skip over that instruction the first time
2621 we hit that loop. */
2622 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2623 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2624 2 + 2 * OFFSET_ADDRESS_SIZE);
2625 b += 1 + OFFSET_ADDRESS_SIZE;
2626 }
2627 }
2628 break;
2629
2630
2631 case '.':
2632 laststart = b;
2633 BUF_PUSH (anychar);
2634 break;
2635
2636
2637 case '[':
2638 {
2639 boolean had_char_class = false;
2640#ifdef MBS_SUPPORT
2641 CHAR_TYPE range_start = 0xffffffff;
2642#else
2643 unsigned int range_start = 0xffffffff;
2644#endif
2645 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2646
2647#ifdef MBS_SUPPORT
2648 /* We assume a charset(_not) structure as a wchar_t array.
2649 charset[0] = (re_opcode_t) charset(_not)
2650 charset[1] = l (= length of char_classes)
2651 charset[2] = m (= length of collating_symbols)
2652 charset[3] = n (= length of equivalence_classes)
2653 charset[4] = o (= length of char_ranges)
2654 charset[5] = p (= length of chars)
2655
2656 charset[6] = char_class (wctype_t)
2657 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2658 ...
2659 charset[l+5] = char_class (wctype_t)
2660
2661 charset[l+6] = collating_symbol (wchar_t)
2662 ...
2663 charset[l+m+5] = collating_symbol (wchar_t)
2664 ifdef _LIBC we use the index if
2665 _NL_COLLATE_SYMB_EXTRAMB instead of
2666 wchar_t string.
2667
2668 charset[l+m+6] = equivalence_classes (wchar_t)
2669 ...
2670 charset[l+m+n+5] = equivalence_classes (wchar_t)
2671 ifdef _LIBC we use the index in
2672 _NL_COLLATE_WEIGHT instead of
2673 wchar_t string.
2674
2675 charset[l+m+n+6] = range_start
2676 charset[l+m+n+7] = range_end
2677 ...
2678 charset[l+m+n+2o+4] = range_start
2679 charset[l+m+n+2o+5] = range_end
2680 ifdef _LIBC we use the value looked up
2681 in _NL_COLLATE_COLLSEQ instead of
2682 wchar_t character.
2683
2684 charset[l+m+n+2o+6] = char
2685 ...
2686 charset[l+m+n+2o+p+5] = char
2687
2688 */
2689
2690 /* We need at least 6 spaces: the opcode, the length of
2691 char_classes, the length of collating_symbols, the length of
2692 equivalence_classes, the length of char_ranges, the length of
2693 chars. */
2694 GET_BUFFER_SPACE (6);
2695
2696 /* Save b as laststart. And We use laststart as the pointer
2697 to the first element of the charset here.
2698 In other words, laststart[i] indicates charset[i]. */
2699 laststart = b;
2700
2701 /* We test `*p == '^' twice, instead of using an if
2702 statement, so we only need one BUF_PUSH. */
2703 BUF_PUSH (*p == '^' ? charset_not : charset);
2704 if (*p == '^')
2705 p++;
2706
2707 /* Push the length of char_classes, the length of
2708 collating_symbols, the length of equivalence_classes, the
2709 length of char_ranges and the length of chars. */
2710 BUF_PUSH_3 (0, 0, 0);
2711 BUF_PUSH_2 (0, 0);
2712
2713 /* Remember the first position in the bracket expression. */
2714 p1 = p;
2715
2716 /* charset_not matches newline according to a syntax bit. */
2717 if ((re_opcode_t) b[-6] == charset_not
2718 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2719 {
2720 BUF_PUSH('\n');
2721 laststart[5]++; /* Update the length of characters */
2722 }
2723
2724 /* Read in characters and ranges, setting map bits. */
2725 for (;;)
2726 {
2727 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2728
2729 PATFETCH (c);
2730
2731 /* \ might escape characters inside [...] and [^...]. */
2732 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2733 {
2734 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2735
2736 PATFETCH (c1);
2737 BUF_PUSH(c1);
2738 laststart[5]++; /* Update the length of chars */
2739 range_start = c1;
2740 continue;
2741 }
2742
2743 /* Could be the end of the bracket expression. If it's
2744 not (i.e., when the bracket expression is `[]' so
2745 far), the ']' character bit gets set way below. */
2746 if (c == ']' && p != p1 + 1)
2747 break;
2748
2749 /* Look ahead to see if it's a range when the last thing
2750 was a character class. */
2751 if (had_char_class && c == '-' && *p != ']')
2752 FREE_STACK_RETURN (REG_ERANGE);
2753
2754 /* Look ahead to see if it's a range when the last thing
2755 was a character: if this is a hyphen not at the
2756 beginning or the end of a list, then it's the range
2757 operator. */
2758 if (c == '-'
2759 && !(p - 2 >= pattern && p[-2] == '[')
2760 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2761 && *p != ']')
2762 {
2763 reg_errcode_t ret;
2764 /* Allocate the space for range_start and range_end. */
2765 GET_BUFFER_SPACE (2);
2766 /* Update the pointer to indicate end of buffer. */
2767 b += 2;
2768 ret = compile_range (range_start, &p, pend, translate,
2769 syntax, b, laststart);
2770 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2771 range_start = 0xffffffff;
2772 }
2773 else if (p[0] == '-' && p[1] != ']')
2774 { /* This handles ranges made up of characters only. */
2775 reg_errcode_t ret;
2776
2777 /* Move past the `-'. */
2778 PATFETCH (c1);
2779 /* Allocate the space for range_start and range_end. */
2780 GET_BUFFER_SPACE (2);
2781 /* Update the pointer to indicate end of buffer. */
2782 b += 2;
2783 ret = compile_range (c, &p, pend, translate, syntax, b,
2784 laststart);
2785 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2786 range_start = 0xffffffff;
2787 }
2788
2789 /* See if we're at the beginning of a possible character
2790 class. */
2791 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2792 { /* Leave room for the null. */
2793 char str[CHAR_CLASS_MAX_LENGTH + 1];
2794
2795 PATFETCH (c);
2796 c1 = 0;
2797
2798 /* If pattern is `[[:'. */
2799 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2800
2801 for (;;)
2802 {
2803 PATFETCH (c);
2804 if ((c == ':' && *p == ']') || p == pend)
2805 break;
2806 if (c1 < CHAR_CLASS_MAX_LENGTH)
2807 str[c1++] = c;
2808 else
2809 /* This is in any case an invalid class name. */
2810 str[0] = '\0';
2811 }
2812 str[c1] = '\0';
2813
2814 /* If isn't a word bracketed by `[:' and `:]':
2815 undo the ending character, the letters, and leave
2816 the leading `:' and `[' (but store them as character). */
2817 if (c == ':' && *p == ']')
2818 {
2819 wctype_t wt;
2820 uintptr_t alignedp;
2821
2822 /* Query the character class as wctype_t. */
2823 wt = IS_CHAR_CLASS (str);
2824 if (wt == 0)
2825 FREE_STACK_RETURN (REG_ECTYPE);
2826
2827 /* Throw away the ] at the end of the character
2828 class. */
2829 PATFETCH (c);
2830
2831 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2832
2833 /* Allocate the space for character class. */
2834 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2835 /* Update the pointer to indicate end of buffer. */
2836 b += CHAR_CLASS_SIZE;
2837 /* Move data which follow character classes
2838 not to violate the data. */
2839 insert_space(CHAR_CLASS_SIZE,
2840 laststart + 6 + laststart[1],
2841 b - 1);
2842 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2843 + __alignof__(wctype_t) - 1)
2844 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2845 /* Store the character class. */
2846 *((wctype_t*)alignedp) = wt;
2847 /* Update length of char_classes */
2848 laststart[1] += CHAR_CLASS_SIZE;
2849
2850 had_char_class = true;
2851 }
2852 else
2853 {
2854 c1++;
2855 while (c1--)
2856 PATUNFETCH;
2857 BUF_PUSH ('[');
2858 BUF_PUSH (':');
2859 laststart[5] += 2; /* Update the length of characters */
2860 range_start = ':';
2861 had_char_class = false;
2862 }
2863 }
2864 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2865 || *p == '.'))
2866 {
2867 CHAR_TYPE str[128]; /* Should be large enough. */
2868 CHAR_TYPE delim = *p; /* '=' or '.' */
2869# ifdef _LIBC
2870 uint32_t nrules =
2871 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2872# endif
2873 PATFETCH (c);
2874 c1 = 0;
2875
2876 /* If pattern is `[[=' or '[[.'. */
2877 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2878
2879 for (;;)
2880 {
2881 PATFETCH (c);
2882 if ((c == delim && *p == ']') || p == pend)
2883 break;
2884 if (c1 < sizeof (str) - 1)
2885 str[c1++] = c;
2886 else
2887 /* This is in any case an invalid class name. */
2888 str[0] = '\0';
2889 }
2890 str[c1] = '\0';
2891
2892 if (c == delim && *p == ']' && str[0] != '\0')
2893 {
2894 unsigned int i, offset;
2895 /* If we have no collation data we use the default
2896 collation in which each character is in a class
2897 by itself. It also means that ASCII is the
2898 character set and therefore we cannot have character
2899 with more than one byte in the multibyte
2900 representation. */
2901
2902 /* If not defined _LIBC, we push the name and
2903 `\0' for the sake of matching performance. */
2904 int datasize = c1 + 1;
2905
2906# ifdef _LIBC
2907 int32_t idx = 0;
2908 if (nrules == 0)
2909# endif
2910 {
2911 if (c1 != 1)
2912 FREE_STACK_RETURN (REG_ECOLLATE);
2913 }
2914# ifdef _LIBC
2915 else
2916 {
2917 const int32_t *table;
2918 const int32_t *weights;
2919 const int32_t *extra;
2920 const int32_t *indirect;
2921 wint_t *cp;
2922
2923 /* This #include defines a local function! */
2924# include <locale/weightwc.h>
2925
2926 if(delim == '=')
2927 {
2928 /* We push the index for equivalence class. */
2929 cp = (wint_t*)str;
2930
2931 table = (const int32_t *)
2932 _NL_CURRENT (LC_COLLATE,
2933 _NL_COLLATE_TABLEWC);
2934 weights = (const int32_t *)
2935 _NL_CURRENT (LC_COLLATE,
2936 _NL_COLLATE_WEIGHTWC);
2937 extra = (const int32_t *)
2938 _NL_CURRENT (LC_COLLATE,
2939 _NL_COLLATE_EXTRAWC);
2940 indirect = (const int32_t *)
2941 _NL_CURRENT (LC_COLLATE,
2942 _NL_COLLATE_INDIRECTWC);
2943
2944 idx = findidx ((const wint_t**)&cp);
2945 if (idx == 0 || cp < (wint_t*) str + c1)
2946 /* This is no valid character. */
2947 FREE_STACK_RETURN (REG_ECOLLATE);
2948
2949 str[0] = (wchar_t)idx;
2950 }
2951 else /* delim == '.' */
2952 {
2953 /* We push collation sequence value
2954 for collating symbol. */
2955 int32_t table_size;
2956 const int32_t *symb_table;
2957 const unsigned char *extra;
2958 int32_t idx;
2959 int32_t elem;
2960 int32_t second;
2961 int32_t hash;
2962 char char_str[c1];
2963
2964 /* We have to convert the name to a single-byte
2965 string. This is possible since the names
2966 consist of ASCII characters and the internal
2967 representation is UCS4. */
2968 for (i = 0; i < c1; ++i)
2969 char_str[i] = str[i];
2970
2971 table_size =
2972 _NL_CURRENT_WORD (LC_COLLATE,
2973 _NL_COLLATE_SYMB_HASH_SIZEMB);
2974 symb_table = (const int32_t *)
2975 _NL_CURRENT (LC_COLLATE,
2976 _NL_COLLATE_SYMB_TABLEMB);
2977 extra = (const unsigned char *)
2978 _NL_CURRENT (LC_COLLATE,
2979 _NL_COLLATE_SYMB_EXTRAMB);
2980
2981 /* Locate the character in the hashing table. */
2982 hash = elem_hash (char_str, c1);
2983
2984 idx = 0;
2985 elem = hash % table_size;
2986 second = hash % (table_size - 2);
2987 while (symb_table[2 * elem] != 0)
2988 {
2989 /* First compare the hashing value. */
2990 if (symb_table[2 * elem] == hash
2991 && c1 == extra[symb_table[2 * elem + 1]]
2992 && memcmp (str,
2993 &extra[symb_table[2 * elem + 1]
2994 + 1], c1) == 0)
2995 {
2996 /* Yep, this is the entry. */
2997 idx = symb_table[2 * elem + 1];
2998 idx += 1 + extra[idx];
2999 break;
3000 }
3001
3002 /* Next entry. */
3003 elem += second;
3004 }
3005
3006 if (symb_table[2 * elem] != 0)
3007 {
3008 /* Compute the index of the byte sequence
3009 in the table. */
3010 idx += 1 + extra[idx];
3011 /* Adjust for the alignment. */
3012 idx = (idx + 3) & ~4;
3013
3014 str[0] = (wchar_t) idx + 4;
3015 }
3016 else if (symb_table[2 * elem] == 0 && c1 == 1)
3017 {
3018 /* No valid character. Match it as a
3019 single byte character. */
3020 had_char_class = false;
3021 BUF_PUSH(str[0]);
3022 /* Update the length of characters */
3023 laststart[5]++;
3024 range_start = str[0];
3025
3026 /* Throw away the ] at the end of the
3027 collating symbol. */
3028 PATFETCH (c);
3029 /* exit from the switch block. */
3030 continue;
3031 }
3032 else
3033 FREE_STACK_RETURN (REG_ECOLLATE);
3034 }
3035 datasize = 1;
3036 }
3037# endif
3038 /* Throw away the ] at the end of the equivalence
3039 class (or collating symbol). */
3040 PATFETCH (c);
3041
3042 /* Allocate the space for the equivalence class
3043 (or collating symbol) (and '\0' if needed). */
3044 GET_BUFFER_SPACE(datasize);
3045 /* Update the pointer to indicate end of buffer. */
3046 b += datasize;
3047
3048 if (delim == '=')
3049 { /* equivalence class */
3050 /* Calculate the offset of char_ranges,
3051 which is next to equivalence_classes. */
3052 offset = laststart[1] + laststart[2]
3053 + laststart[3] +6;
3054 /* Insert space. */
3055 insert_space(datasize, laststart + offset, b - 1);
3056
3057 /* Write the equivalence_class and \0. */
3058 for (i = 0 ; i < datasize ; i++)
3059 laststart[offset + i] = str[i];
3060
3061 /* Update the length of equivalence_classes. */
3062 laststart[3] += datasize;
3063 had_char_class = true;
3064 }
3065 else /* delim == '.' */
3066 { /* collating symbol */
3067 /* Calculate the offset of the equivalence_classes,
3068 which is next to collating_symbols. */
3069 offset = laststart[1] + laststart[2] + 6;
3070 /* Insert space and write the collationg_symbol
3071 and \0. */
3072 insert_space(datasize, laststart + offset, b-1);
3073 for (i = 0 ; i < datasize ; i++)
3074 laststart[offset + i] = str[i];
3075
3076 /* In re_match_2_internal if range_start < -1, we
3077 assume -range_start is the offset of the
3078 collating symbol which is specified as
3079 the character of the range start. So we assign
3080 -(laststart[1] + laststart[2] + 6) to
3081 range_start. */
3082 range_start = -(laststart[1] + laststart[2] + 6);
3083 /* Update the length of collating_symbol. */
3084 laststart[2] += datasize;
3085 had_char_class = false;
3086 }
3087 }
3088 else
3089 {
3090 c1++;
3091 while (c1--)
3092 PATUNFETCH;
3093 BUF_PUSH ('[');
3094 BUF_PUSH (delim);
3095 laststart[5] += 2; /* Update the length of characters */
3096 range_start = delim;
3097 had_char_class = false;
3098 }
3099 }
3100 else
3101 {
3102 had_char_class = false;
3103 BUF_PUSH(c);
3104 laststart[5]++; /* Update the length of characters */
3105 range_start = c;
3106 }
3107 }
3108
3109#else /* not MBS_SUPPORT */
3110 /* Ensure that we have enough space to push a charset: the
3111 opcode, the length count, and the bitset; 34 bytes in all. */
3112 GET_BUFFER_SPACE (34);
3113
3114 laststart = b;
3115
3116 /* We test `*p == '^' twice, instead of using an if
3117 statement, so we only need one BUF_PUSH. */
3118 BUF_PUSH (*p == '^' ? charset_not : charset);
3119 if (*p == '^')
3120 p++;
3121
3122 /* Remember the first position in the bracket expression. */
3123 p1 = p;
3124
3125 /* Push the number of bytes in the bitmap. */
3126 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3127
3128 /* Clear the whole map. */
3129 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3130
3131 /* charset_not matches newline according to a syntax bit. */
3132 if ((re_opcode_t) b[-2] == charset_not
3133 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3134 SET_LIST_BIT ('\n');
3135
3136 /* Read in characters and ranges, setting map bits. */
3137 for (;;)
3138 {
3139 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3140
3141 PATFETCH (c);
3142
3143 /* \ might escape characters inside [...] and [^...]. */
3144 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3145 {
3146 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3147
3148 PATFETCH (c1);
3149 SET_LIST_BIT (c1);
3150 range_start = c1;
3151 continue;
3152 }
3153
3154 /* Could be the end of the bracket expression. If it's
3155 not (i.e., when the bracket expression is `[]' so
3156 far), the ']' character bit gets set way below. */
3157 if (c == ']' && p != p1 + 1)
3158 break;
3159
3160 /* Look ahead to see if it's a range when the last thing
3161 was a character class. */
3162 if (had_char_class && c == '-' && *p != ']')
3163 FREE_STACK_RETURN (REG_ERANGE);
3164
3165 /* Look ahead to see if it's a range when the last thing
3166 was a character: if this is a hyphen not at the
3167 beginning or the end of a list, then it's the range
3168 operator. */
3169 if (c == '-'
3170 && !(p - 2 >= pattern && p[-2] == '[')
3171 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3172 && *p != ']')
3173 {
3174 reg_errcode_t ret
3175 = compile_range (range_start, &p, pend, translate,
3176 syntax, b);
3177 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3178 range_start = 0xffffffff;
3179 }
3180
3181 else if (p[0] == '-' && p[1] != ']')
3182 { /* This handles ranges made up of characters only. */
3183 reg_errcode_t ret;
3184
3185 /* Move past the `-'. */
3186 PATFETCH (c1);
3187
3188 ret = compile_range (c, &p, pend, translate, syntax, b);
3189 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3190 range_start = 0xffffffff;
3191 }
3192
3193 /* See if we're at the beginning of a possible character
3194 class. */
3195
3196 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3197 { /* Leave room for the null. */
3198 char str[CHAR_CLASS_MAX_LENGTH + 1];
3199
3200 PATFETCH (c);
3201 c1 = 0;
3202
3203 /* If pattern is `[[:'. */
3204 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3205
3206 for (;;)
3207 {
3208 PATFETCH (c);
3209 if ((c == ':' && *p == ']') || p == pend)
3210 break;
3211 if (c1 < CHAR_CLASS_MAX_LENGTH)
3212 str[c1++] = c;
3213 else
3214 /* This is in any case an invalid class name. */
3215 str[0] = '\0';
3216 }
3217 str[c1] = '\0';
3218
3219 /* If isn't a word bracketed by `[:' and `:]':
3220 undo the ending character, the letters, and leave
3221 the leading `:' and `[' (but set bits for them). */
3222 if (c == ':' && *p == ']')
3223 {
3224# if defined _LIBC || WIDE_CHAR_SUPPORT
3225 boolean is_lower = STREQ (str, "lower");
3226 boolean is_upper = STREQ (str, "upper");
3227 wctype_t wt;
3228 int ch;
3229
3230 wt = IS_CHAR_CLASS (str);
3231 if (wt == 0)
3232 FREE_STACK_RETURN (REG_ECTYPE);
3233
3234 /* Throw away the ] at the end of the character
3235 class. */
3236 PATFETCH (c);
3237
3238 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3239
3240 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3241 {
3242# ifdef _LIBC
3243 if (__iswctype (__btowc (ch), wt))
3244 SET_LIST_BIT (ch);
3245# else
3246 if (iswctype (btowc (ch), wt))
3247 SET_LIST_BIT (ch);
3248# endif
3249
3250 if (translate && (is_upper || is_lower)
3251 && (ISUPPER (ch) || ISLOWER (ch)))
3252 SET_LIST_BIT (ch);
3253 }
3254
3255 had_char_class = true;
3256# else
3257 int ch;
3258 boolean is_alnum = STREQ (str, "alnum");
3259 boolean is_alpha = STREQ (str, "alpha");
3260 boolean is_blank = STREQ (str, "blank");
3261 boolean is_cntrl = STREQ (str, "cntrl");
3262 boolean is_digit = STREQ (str, "digit");
3263 boolean is_graph = STREQ (str, "graph");
3264 boolean is_lower = STREQ (str, "lower");
3265 boolean is_print = STREQ (str, "print");
3266 boolean is_punct = STREQ (str, "punct");
3267 boolean is_space = STREQ (str, "space");
3268 boolean is_upper = STREQ (str, "upper");
3269 boolean is_xdigit = STREQ (str, "xdigit");
3270
3271 if (!IS_CHAR_CLASS (str))
3272 FREE_STACK_RETURN (REG_ECTYPE);
3273
3274 /* Throw away the ] at the end of the character
3275 class. */
3276 PATFETCH (c);
3277
3278 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3279
3280 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3281 {
3282 /* This was split into 3 if's to
3283 avoid an arbitrary limit in some compiler. */
3284 if ( (is_alnum && ISALNUM (ch))
3285 || (is_alpha && ISALPHA (ch))
3286 || (is_blank && ISBLANK (ch))
3287 || (is_cntrl && ISCNTRL (ch)))
3288 SET_LIST_BIT (ch);
3289 if ( (is_digit && ISDIGIT (ch))
3290 || (is_graph && ISGRAPH (ch))
3291 || (is_lower && ISLOWER (ch))
3292 || (is_print && ISPRINT (ch)))
3293 SET_LIST_BIT (ch);
3294 if ( (is_punct && ISPUNCT (ch))
3295 || (is_space && ISSPACE (ch))
3296 || (is_upper && ISUPPER (ch))
3297 || (is_xdigit && ISXDIGIT (ch)))
3298 SET_LIST_BIT (ch);
3299 if ( translate && (is_upper || is_lower)
3300 && (ISUPPER (ch) || ISLOWER (ch)))
3301 SET_LIST_BIT (ch);
3302 }
3303 had_char_class = true;
3304# endif /* libc || wctype.h */
3305 }
3306 else
3307 {
3308 c1++;
3309 while (c1--)
3310 PATUNFETCH;
3311 SET_LIST_BIT ('[');
3312 SET_LIST_BIT (':');
3313 range_start = ':';
3314 had_char_class = false;
3315 }
3316 }
3317 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3318 {
3319 unsigned char str[MB_LEN_MAX + 1];
3320# ifdef _LIBC
3321 uint32_t nrules =
3322 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3323# endif
3324
3325 PATFETCH (c);
3326 c1 = 0;
3327
3328 /* If pattern is `[[='. */
3329 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3330
3331 for (;;)
3332 {
3333 PATFETCH (c);
3334 if ((c == '=' && *p == ']') || p == pend)
3335 break;
3336 if (c1 < MB_LEN_MAX)
3337 str[c1++] = c;
3338 else
3339 /* This is in any case an invalid class name. */
3340 str[0] = '\0';
3341 }
3342 str[c1] = '\0';
3343
3344 if (c == '=' && *p == ']' && str[0] != '\0')
3345 {
3346 /* If we have no collation data we use the default
3347 collation in which each character is in a class
3348 by itself. It also means that ASCII is the
3349 character set and therefore we cannot have character
3350 with more than one byte in the multibyte
3351 representation. */
3352# ifdef _LIBC
3353 if (nrules == 0)
3354# endif
3355 {
3356 if (c1 != 1)
3357 FREE_STACK_RETURN (REG_ECOLLATE);
3358
3359 /* Throw away the ] at the end of the equivalence
3360 class. */
3361 PATFETCH (c);
3362
3363 /* Set the bit for the character. */
3364 SET_LIST_BIT (str[0]);
3365 }
3366# ifdef _LIBC
3367 else
3368 {
3369 /* Try to match the byte sequence in `str' against
3370 those known to the collate implementation.
3371 First find out whether the bytes in `str' are
3372 actually from exactly one character. */
3373 const int32_t *table;
3374 const unsigned char *weights;
3375 const unsigned char *extra;
3376 const int32_t *indirect;
3377 int32_t idx;
3378 const unsigned char *cp = str;
3379 int ch;
3380
3381 /* This #include defines a local function! */
3382# include <locale/weight.h>
3383
3384 table = (const int32_t *)
3385 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3386 weights = (const unsigned char *)
3387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3388 extra = (const unsigned char *)
3389 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3390 indirect = (const int32_t *)
3391 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3392
3393 idx = findidx (&cp);
3394 if (idx == 0 || cp < str + c1)
3395 /* This is no valid character. */
3396 FREE_STACK_RETURN (REG_ECOLLATE);
3397
3398 /* Throw away the ] at the end of the equivalence
3399 class. */
3400 PATFETCH (c);
3401
3402 /* Now we have to go throught the whole table
3403 and find all characters which have the same
3404 first level weight.
3405
3406 XXX Note that this is not entirely correct.
3407 we would have to match multibyte sequences
3408 but this is not possible with the current
3409 implementation. */
3410 for (ch = 1; ch < 256; ++ch)
3411 /* XXX This test would have to be changed if we
3412 would allow matching multibyte sequences. */
3413 if (table[ch] > 0)
3414 {
3415 int32_t idx2 = table[ch];
3416 size_t len = weights[idx2];
3417
3418 /* Test whether the lenghts match. */
3419 if (weights[idx] == len)
3420 {
3421 /* They do. New compare the bytes of
3422 the weight. */
3423 size_t cnt = 0;
3424
3425 while (cnt < len
3426 && (weights[idx + 1 + cnt]
3427 == weights[idx2 + 1 + cnt]))
3428 ++cnt;
3429
3430 if (cnt == len)
3431 /* They match. Mark the character as
3432 acceptable. */
3433 SET_LIST_BIT (ch);
3434 }
3435 }
3436 }
3437# endif
3438 had_char_class = true;
3439 }
3440 else
3441 {
3442 c1++;
3443 while (c1--)
3444 PATUNFETCH;
3445 SET_LIST_BIT ('[');
3446 SET_LIST_BIT ('=');
3447 range_start = '=';
3448 had_char_class = false;
3449 }
3450 }
3451 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3452 {
3453 unsigned char str[128]; /* Should be large enough. */
3454# ifdef _LIBC
3455 uint32_t nrules =
3456 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3457# endif
3458
3459 PATFETCH (c);
3460 c1 = 0;
3461
3462 /* If pattern is `[[.'. */
3463 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3464
3465 for (;;)
3466 {
3467 PATFETCH (c);
3468 if ((c == '.' && *p == ']') || p == pend)
3469 break;
3470 if (c1 < sizeof (str))
3471 str[c1++] = c;
3472 else
3473 /* This is in any case an invalid class name. */
3474 str[0] = '\0';
3475 }
3476 str[c1] = '\0';
3477
3478 if (c == '.' && *p == ']' && str[0] != '\0')
3479 {
3480 /* If we have no collation data we use the default
3481 collation in which each character is the name
3482 for its own class which contains only the one
3483 character. It also means that ASCII is the
3484 character set and therefore we cannot have character
3485 with more than one byte in the multibyte
3486 representation. */
3487# ifdef _LIBC
3488 if (nrules == 0)
3489# endif
3490 {
3491 if (c1 != 1)
3492 FREE_STACK_RETURN (REG_ECOLLATE);
3493
3494 /* Throw away the ] at the end of the equivalence
3495 class. */
3496 PATFETCH (c);
3497
3498 /* Set the bit for the character. */
3499 SET_LIST_BIT (str[0]);
3500 range_start = ((const unsigned char *) str)[0];
3501 }
3502# ifdef _LIBC
3503 else
3504 {
3505 /* Try to match the byte sequence in `str' against
3506 those known to the collate implementation.
3507 First find out whether the bytes in `str' are
3508 actually from exactly one character. */
3509 int32_t table_size;
3510 const int32_t *symb_table;
3511 const unsigned char *extra;
3512 int32_t idx;
3513 int32_t elem;
3514 int32_t second;
3515 int32_t hash;
3516
3517 table_size =
3518 _NL_CURRENT_WORD (LC_COLLATE,
3519 _NL_COLLATE_SYMB_HASH_SIZEMB);
3520 symb_table = (const int32_t *)
3521 _NL_CURRENT (LC_COLLATE,
3522 _NL_COLLATE_SYMB_TABLEMB);
3523 extra = (const unsigned char *)
3524 _NL_CURRENT (LC_COLLATE,
3525 _NL_COLLATE_SYMB_EXTRAMB);
3526
3527 /* Locate the character in the hashing table. */
3528 hash = elem_hash (str, c1);
3529
3530 idx = 0;
3531 elem = hash % table_size;
3532 second = hash % (table_size - 2);
3533 while (symb_table[2 * elem] != 0)
3534 {
3535 /* First compare the hashing value. */
3536 if (symb_table[2 * elem] == hash
3537 && c1 == extra[symb_table[2 * elem + 1]]
3538 && memcmp (str,
3539 &extra[symb_table[2 * elem + 1]
3540 + 1],
3541 c1) == 0)
3542 {
3543 /* Yep, this is the entry. */
3544 idx = symb_table[2 * elem + 1];
3545 idx += 1 + extra[idx];
3546 break;
3547 }
3548
3549 /* Next entry. */
3550 elem += second;
3551 }
3552
3553 if (symb_table[2 * elem] == 0)
3554 /* This is no valid character. */
3555 FREE_STACK_RETURN (REG_ECOLLATE);
3556
3557 /* Throw away the ] at the end of the equivalence
3558 class. */
3559 PATFETCH (c);
3560
3561 /* Now add the multibyte character(s) we found
3562 to the accept list.
3563
3564 XXX Note that this is not entirely correct.
3565 we would have to match multibyte sequences
3566 but this is not possible with the current
3567 implementation. Also, we have to match
3568 collating symbols, which expand to more than
3569 one file, as a whole and not allow the
3570 individual bytes. */
3571 c1 = extra[idx++];
3572 if (c1 == 1)
3573 range_start = extra[idx];
3574 while (c1-- > 0)
3575 {
3576 SET_LIST_BIT (extra[idx]);
3577 ++idx;
3578 }
3579 }
3580# endif
3581 had_char_class = false;
3582 }
3583 else
3584 {
3585 c1++;
3586 while (c1--)
3587 PATUNFETCH;
3588 SET_LIST_BIT ('[');
3589 SET_LIST_BIT ('.');
3590 range_start = '.';
3591 had_char_class = false;
3592 }
3593 }
3594 else
3595 {
3596 had_char_class = false;
3597 SET_LIST_BIT (c);
3598 range_start = c;
3599 }
3600 }
3601
3602 /* Discard any (non)matching list bytes that are all 0 at the
3603 end of the map. Decrease the map-length byte too. */
3604 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3605 b[-1]--;
3606 b += b[-1];
3607#endif /* MBS_SUPPORT */
3608 }
3609 break;
3610
3611
3612 case '(':
3613 if (syntax & RE_NO_BK_PARENS)
3614 goto handle_open;
3615 else
3616 goto normal_char;
3617
3618
3619 case ')':
3620 if (syntax & RE_NO_BK_PARENS)
3621 goto handle_close;
3622 else
3623 goto normal_char;
3624
3625
3626 case '\n':
3627 if (syntax & RE_NEWLINE_ALT)
3628 goto handle_alt;
3629 else
3630 goto normal_char;
3631
3632
3633 case '|':
3634 if (syntax & RE_NO_BK_VBAR)
3635 goto handle_alt;
3636 else
3637 goto normal_char;
3638
3639
3640 case '{':
3641 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3642 goto handle_interval;
3643 else
3644 goto normal_char;
3645
3646
3647 case '\\':
3648 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3649
3650 /* Do not translate the character after the \, so that we can
3651 distinguish, e.g., \B from \b, even if we normally would
3652 translate, e.g., B to b. */
3653 PATFETCH_RAW (c);
3654
3655 switch (c)
3656 {
3657 case '(':
3658 if (syntax & RE_NO_BK_PARENS)
3659 goto normal_backslash;
3660
3661 handle_open:
3662 bufp->re_nsub++;
3663 regnum++;
3664
3665 if (COMPILE_STACK_FULL)
3666 {
3667 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3668 compile_stack_elt_t);
3669 if (compile_stack.stack == NULL) return REG_ESPACE;
3670
3671 compile_stack.size <<= 1;
3672 }
3673
3674 /* These are the values to restore when we hit end of this
3675 group. They are all relative offsets, so that if the
3676 whole pattern moves because of realloc, they will still
3677 be valid. */
3678 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3679 COMPILE_STACK_TOP.fixup_alt_jump
3680 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3681 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3682 COMPILE_STACK_TOP.regnum = regnum;
3683
3684 /* We will eventually replace the 0 with the number of
3685 groups inner to this one. But do not push a
3686 start_memory for groups beyond the last one we can
3687 represent in the compiled pattern. */
3688 if (regnum <= MAX_REGNUM)
3689 {
3690 COMPILE_STACK_TOP.inner_group_offset = b
3691 - COMPILED_BUFFER_VAR + 2;
3692 BUF_PUSH_3 (start_memory, regnum, 0);
3693 }
3694
3695 compile_stack.avail++;
3696
3697 fixup_alt_jump = 0;
3698 laststart = 0;
3699 begalt = b;
3700 /* If we've reached MAX_REGNUM groups, then this open
3701 won't actually generate any code, so we'll have to
3702 clear pending_exact explicitly. */
3703 pending_exact = 0;
3704 break;
3705
3706
3707 case ')':
3708 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3709
3710 if (COMPILE_STACK_EMPTY)
3711 {
3712 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3713 goto normal_backslash;
3714 else
3715 FREE_STACK_RETURN (REG_ERPAREN);
3716 }
3717
3718 handle_close:
3719 if (fixup_alt_jump)
3720 { /* Push a dummy failure point at the end of the
3721 alternative for a possible future
3722 `pop_failure_jump' to pop. See comments at
3723 `push_dummy_failure' in `re_match_2'. */
3724 BUF_PUSH (push_dummy_failure);
3725
3726 /* We allocated space for this jump when we assigned
3727 to `fixup_alt_jump', in the `handle_alt' case below. */
3728 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3729 }
3730
3731 /* See similar code for backslashed left paren above. */
3732 if (COMPILE_STACK_EMPTY)
3733 {
3734 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3735 goto normal_char;
3736 else
3737 FREE_STACK_RETURN (REG_ERPAREN);
3738 }
3739
3740 /* Since we just checked for an empty stack above, this
3741 ``can't happen''. */
3742 assert (compile_stack.avail != 0);
3743 {
3744 /* We don't just want to restore into `regnum', because
3745 later groups should continue to be numbered higher,
3746 as in `(ab)c(de)' -- the second group is #2. */
3747 regnum_t this_group_regnum;
3748
3749 compile_stack.avail--;
3750 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3751 fixup_alt_jump
3752 = COMPILE_STACK_TOP.fixup_alt_jump
3753 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3754 : 0;
3755 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3756 this_group_regnum = COMPILE_STACK_TOP.regnum;
3757 /* If we've reached MAX_REGNUM groups, then this open
3758 won't actually generate any code, so we'll have to
3759 clear pending_exact explicitly. */
3760 pending_exact = 0;
3761
3762 /* We're at the end of the group, so now we know how many
3763 groups were inside this one. */
3764 if (this_group_regnum <= MAX_REGNUM)
3765 {
3766 US_CHAR_TYPE *inner_group_loc
3767 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3768
3769 *inner_group_loc = regnum - this_group_regnum;
3770 BUF_PUSH_3 (stop_memory, this_group_regnum,
3771 regnum - this_group_regnum);
3772 }
3773 }
3774 break;
3775
3776
3777 case '|': /* `\|'. */
3778 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3779 goto normal_backslash;
3780 handle_alt:
3781 if (syntax & RE_LIMITED_OPS)
3782 goto normal_char;
3783
3784 /* Insert before the previous alternative a jump which
3785 jumps to this alternative if the former fails. */
3786 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3787 INSERT_JUMP (on_failure_jump, begalt,
3788 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3789 pending_exact = 0;
3790 b += 1 + OFFSET_ADDRESS_SIZE;
3791
3792 /* The alternative before this one has a jump after it
3793 which gets executed if it gets matched. Adjust that
3794 jump so it will jump to this alternative's analogous
3795 jump (put in below, which in turn will jump to the next
3796 (if any) alternative's such jump, etc.). The last such
3797 jump jumps to the correct final destination. A picture:
3798 _____ _____
3799 | | | |
3800 | v | v
3801 a | b | c
3802
3803 If we are at `b', then fixup_alt_jump right now points to a
3804 three-byte space after `a'. We'll put in the jump, set
3805 fixup_alt_jump to right after `b', and leave behind three
3806 bytes which we'll fill in when we get to after `c'. */
3807
3808 if (fixup_alt_jump)
3809 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3810
3811 /* Mark and leave space for a jump after this alternative,
3812 to be filled in later either by next alternative or
3813 when know we're at the end of a series of alternatives. */
3814 fixup_alt_jump = b;
3815 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3816 b += 1 + OFFSET_ADDRESS_SIZE;
3817
3818 laststart = 0;
3819 begalt = b;
3820 break;
3821
3822
3823 case '{':
3824 /* If \{ is a literal. */
3825 if (!(syntax & RE_INTERVALS)
3826 /* If we're at `\{' and it's not the open-interval
3827 operator. */
3828 || (syntax & RE_NO_BK_BRACES))
3829 goto normal_backslash;
3830
3831 handle_interval:
3832 {
3833 /* If got here, then the syntax allows intervals. */
3834
3835 /* At least (most) this many matches must be made. */
3836 int lower_bound = -1, upper_bound = -1;
3837
3838 /* Place in the uncompiled pattern (i.e., just after
3839 the '{') to go back to if the interval is invalid. */
3840 const CHAR_TYPE *beg_interval = p;
3841
3842 if (p == pend)
3843 goto invalid_interval;
3844
3845 GET_UNSIGNED_NUMBER (lower_bound);
3846
3847 if (c == ',')
3848 {
3849 GET_UNSIGNED_NUMBER (upper_bound);
3850 if (upper_bound < 0)
3851 upper_bound = RE_DUP_MAX;
3852 }
3853 else
3854 /* Interval such as `{1}' => match exactly once. */
3855 upper_bound = lower_bound;
3856
3857 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3858 goto invalid_interval;
3859
3860 if (!(syntax & RE_NO_BK_BRACES))
3861 {
3862 if (c != '\\' || p == pend)
3863 goto invalid_interval;
3864 PATFETCH (c);
3865 }
3866
3867 if (c != '}')
3868 goto invalid_interval;
3869
3870 /* If it's invalid to have no preceding re. */
3871 if (!laststart)
3872 {
3873 if (syntax & RE_CONTEXT_INVALID_OPS
3874 && !(syntax & RE_INVALID_INTERVAL_ORD))
3875 FREE_STACK_RETURN (REG_BADRPT);
3876 else if (syntax & RE_CONTEXT_INDEP_OPS)
3877 laststart = b;
3878 else
3879 goto unfetch_interval;
3880 }
3881
3882 /* We just parsed a valid interval. */
3883
3884 if (RE_DUP_MAX < upper_bound)
3885 FREE_STACK_RETURN (REG_BADBR);
3886
3887 /* If the upper bound is zero, don't want to succeed at
3888 all; jump from `laststart' to `b + 3', which will be
3889 the end of the buffer after we insert the jump. */
3890 /* ifdef MBS_SUPPORT, 'b + 1 + OFFSET_ADDRESS_SIZE'
3891 instead of 'b + 3'. */
3892 if (upper_bound == 0)
3893 {
3894 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3895 INSERT_JUMP (jump, laststart, b + 1
3896 + OFFSET_ADDRESS_SIZE);
3897 b += 1 + OFFSET_ADDRESS_SIZE;
3898 }
3899
3900 /* Otherwise, we have a nontrivial interval. When
3901 we're all done, the pattern will look like:
3902 set_number_at <jump count> <upper bound>
3903 set_number_at <succeed_n count> <lower bound>
3904 succeed_n <after jump addr> <succeed_n count>
3905 <body of loop>
3906 jump_n <succeed_n addr> <jump count>
3907 (The upper bound and `jump_n' are omitted if
3908 `upper_bound' is 1, though.) */
3909 else
3910 { /* If the upper bound is > 1, we need to insert
3911 more at the end of the loop. */
3912 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3913 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3914
3915 GET_BUFFER_SPACE (nbytes);
3916
3917 /* Initialize lower bound of the `succeed_n', even
3918 though it will be set during matching by its
3919 attendant `set_number_at' (inserted next),
3920 because `re_compile_fastmap' needs to know.
3921 Jump to the `jump_n' we might insert below. */
3922 INSERT_JUMP2 (succeed_n, laststart,
3923 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3924 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3925 , lower_bound);
3926 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3927
3928 /* Code to initialize the lower bound. Insert
3929 before the `succeed_n'. The `5' is the last two
3930 bytes of this `set_number_at', plus 3 bytes of
3931 the following `succeed_n'. */
3932 /* ifdef MBS_SUPPORT, The '1+2*OFFSET_ADDRESS_SIZE'
3933 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3934 of the following `succeed_n'. */
3935 insert_op2 (set_number_at, laststart, 1
3936 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3937 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3938
3939 if (upper_bound > 1)
3940 { /* More than one repetition is allowed, so
3941 append a backward jump to the `succeed_n'
3942 that starts this interval.
3943
3944 When we've reached this during matching,
3945 we'll have matched the interval once, so
3946 jump back only `upper_bound - 1' times. */
3947 STORE_JUMP2 (jump_n, b, laststart
3948 + 2 * OFFSET_ADDRESS_SIZE + 1,
3949 upper_bound - 1);
3950 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3951
3952 /* The location we want to set is the second
3953 parameter of the `jump_n'; that is `b-2' as
3954 an absolute address. `laststart' will be
3955 the `set_number_at' we're about to insert;
3956 `laststart+3' the number to set, the source
3957 for the relative address. But we are
3958 inserting into the middle of the pattern --
3959 so everything is getting moved up by 5.
3960 Conclusion: (b - 2) - (laststart + 3) + 5,
3961 i.e., b - laststart.
3962
3963 We insert this at the beginning of the loop
3964 so that if we fail during matching, we'll
3965 reinitialize the bounds. */
3966 insert_op2 (set_number_at, laststart, b - laststart,
3967 upper_bound - 1, b);
3968 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3969 }
3970 }
3971 pending_exact = 0;
3972 break;
3973
3974 invalid_interval:
3975 if (!(syntax & RE_INVALID_INTERVAL_ORD))
3976 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
3977 unfetch_interval:
3978 /* Match the characters as literals. */
3979 p = beg_interval;
3980 c = '{';
3981 if (syntax & RE_NO_BK_BRACES)
3982 goto normal_char;
3983 else
3984 goto normal_backslash;
3985 }
3986
3987#ifdef emacs
3988 /* There is no way to specify the before_dot and after_dot
3989 operators. rms says this is ok. --karl */
3990 case '=':
3991 BUF_PUSH (at_dot);
3992 break;
3993
3994 case 's':
3995 laststart = b;
3996 PATFETCH (c);
3997 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3998 break;
3999
4000 case 'S':
4001 laststart = b;
4002 PATFETCH (c);
4003 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4004 break;
4005#endif /* emacs */
4006
4007
4008 case 'w':
4009 if (syntax & RE_NO_GNU_OPS)
4010 goto normal_char;
4011 laststart = b;
4012 BUF_PUSH (wordchar);
4013 break;
4014
4015
4016 case 'W':
4017 if (syntax & RE_NO_GNU_OPS)
4018 goto normal_char;
4019 laststart = b;
4020 BUF_PUSH (notwordchar);
4021 break;
4022
4023
4024 case '<':
4025 if (syntax & RE_NO_GNU_OPS)
4026 goto normal_char;
4027 BUF_PUSH (wordbeg);
4028 break;
4029
4030 case '>':
4031 if (syntax & RE_NO_GNU_OPS)
4032 goto normal_char;
4033 BUF_PUSH (wordend);
4034 break;
4035
4036 case 'b':
4037 if (syntax & RE_NO_GNU_OPS)
4038 goto normal_char;
4039 BUF_PUSH (wordbound);
4040 break;
4041
4042 case 'B':
4043 if (syntax & RE_NO_GNU_OPS)
4044 goto normal_char;
4045 BUF_PUSH (notwordbound);
4046 break;
4047
4048 case '`':
4049 if (syntax & RE_NO_GNU_OPS)
4050 goto normal_char;
4051 BUF_PUSH (begbuf);
4052 break;
4053
4054 case '\'':
4055 if (syntax & RE_NO_GNU_OPS)
4056 goto normal_char;
4057 BUF_PUSH (endbuf);
4058 break;
4059
4060 case '1': case '2': case '3': case '4': case '5':
4061 case '6': case '7': case '8': case '9':
4062 if (syntax & RE_NO_BK_REFS)
4063 goto normal_char;
4064
4065 c1 = c - '0';
4066
4067 if (c1 > regnum)
4068 FREE_STACK_RETURN (REG_ESUBREG);
4069
4070 /* Can't back reference to a subexpression if inside of it. */
4071 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4072 goto normal_char;
4073
4074 laststart = b;
4075 BUF_PUSH_2 (duplicate, c1);
4076 break;
4077
4078
4079 case '+':
4080 case '?':
4081 if (syntax & RE_BK_PLUS_QM)
4082 goto handle_plus;
4083 else
4084 goto normal_backslash;
4085
4086 default:
4087 normal_backslash:
4088 /* You might think it would be useful for \ to mean
4089 not to translate; but if we don't translate it
4090 it will never match anything. */
4091 c = TRANSLATE (c);
4092 goto normal_char;
4093 }
4094 break;
4095
4096
4097 default:
4098 /* Expects the character in `c'. */
4099 normal_char:
4100 /* If no exactn currently being built. */
4101 if (!pending_exact
4102#ifdef MBS_SUPPORT
4103 /* If last exactn handle binary(or character) and
4104 new exactn handle character(or binary). */
4105 || is_exactn_bin != is_binary[p - 1 - pattern]
4106#endif /* MBS_SUPPORT */
4107
4108 /* If last exactn not at current position. */
4109 || pending_exact + *pending_exact + 1 != b
4110
4111 /* We have only one byte following the exactn for the count. */
4112 || *pending_exact == (1 << BYTEWIDTH) - 1
4113
4114 /* If followed by a repetition operator. */
4115 || *p == '*' || *p == '^'
4116 || ((syntax & RE_BK_PLUS_QM)
4117 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4118 : (*p == '+' || *p == '?'))
4119 || ((syntax & RE_INTERVALS)
4120 && ((syntax & RE_NO_BK_BRACES)
4121 ? *p == '{'
4122 : (p[0] == '\\' && p[1] == '{'))))
4123 {
4124 /* Start building a new exactn. */
4125
4126 laststart = b;
4127
4128#ifdef MBS_SUPPORT
4129 /* Is this exactn binary data or character? */
4130 is_exactn_bin = is_binary[p - 1 - pattern];
4131 if (is_exactn_bin)
4132 BUF_PUSH_2 (exactn_bin, 0);
4133 else
4134 BUF_PUSH_2 (exactn, 0);
4135#else
4136 BUF_PUSH_2 (exactn, 0);
4137#endif /* MBS_SUPPORT */
4138 pending_exact = b - 1;
4139 }
4140
4141 BUF_PUSH (c);
4142 (*pending_exact)++;
4143 break;
4144 } /* switch (c) */
4145 } /* while p != pend */
4146
4147
4148 /* Through the pattern now. */
4149
4150 if (fixup_alt_jump)
4151 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4152
4153 if (!COMPILE_STACK_EMPTY)
4154 FREE_STACK_RETURN (REG_EPAREN);
4155
4156 /* If we don't want backtracking, force success
4157 the first time we reach the end of the compiled pattern. */
4158 if (syntax & RE_NO_POSIX_BACKTRACKING)
4159 BUF_PUSH (succeed);
4160
4161#ifdef MBS_SUPPORT
4162 free (pattern);
4163 free (mbs_offset);
4164 free (is_binary);
4165#endif
4166 free (compile_stack.stack);
4167
4168 /* We have succeeded; set the length of the buffer. */
4169#ifdef MBS_SUPPORT
4170 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4171#else
4172 bufp->used = b - bufp->buffer;
4173#endif
4174
4175#ifdef DEBUG
4176 if (debug)
4177 {
4178 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4179 print_compiled_pattern (bufp);
4180 }
4181#endif /* DEBUG */
4182
4183#ifndef MATCH_MAY_ALLOCATE
4184 /* Initialize the failure stack to the largest possible stack. This
4185 isn't necessary unless we're trying to avoid calling alloca in
4186 the search and match routines. */
4187 {
4188 int num_regs = bufp->re_nsub + 1;
4189
4190 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4191 is strictly greater than re_max_failures, the largest possible stack
4192 is 2 * re_max_failures failure points. */
4193 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4194 {
4195 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4196
4197# ifdef emacs
4198 if (! fail_stack.stack)
4199 fail_stack.stack
4200 = (fail_stack_elt_t *) xmalloc (fail_stack.size
4201 * sizeof (fail_stack_elt_t));
4202 else
4203 fail_stack.stack
4204 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
4205 (fail_stack.size
4206 * sizeof (fail_stack_elt_t)));
4207# else /* not emacs */
4208 if (! fail_stack.stack)
4209 fail_stack.stack
4210 = (fail_stack_elt_t *) malloc (fail_stack.size
4211 * sizeof (fail_stack_elt_t));
4212 else
4213 fail_stack.stack
4214 = (fail_stack_elt_t *) realloc (fail_stack.stack,
4215 (fail_stack.size
4216 * sizeof (fail_stack_elt_t)));
4217# endif /* not emacs */
4218 }
4219
4220 regex_grow_registers (num_regs);
4221 }
4222#endif /* not MATCH_MAY_ALLOCATE */
4223
4224 return REG_NOERROR;
4225} /* regex_compile */
4226
4227
4228/* Subroutines for `regex_compile'. */
4229
4230/* Store OP at LOC followed by two-byte integer parameter ARG. */
4231/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4232
4233static void
4234store_op1 (op, loc, arg)
4235 re_opcode_t op;
4236 US_CHAR_TYPE *loc;
4237 int arg;
4238{
4239 *loc = (US_CHAR_TYPE) op;
4240 STORE_NUMBER (loc + 1, arg);
4241}
4242
4243
4244/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4245/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4246
4247static void
4248store_op2 (op, loc, arg1, arg2)
4249 re_opcode_t op;
4250 US_CHAR_TYPE *loc;
4251 int arg1, arg2;
4252{
4253 *loc = (US_CHAR_TYPE) op;
4254 STORE_NUMBER (loc + 1, arg1);
4255 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4256}
4257
4258
4259/* Copy the bytes from LOC to END to open up three bytes of space at LOC
4260 for OP followed by two-byte integer parameter ARG. */
4261/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4262
4263static void
4264insert_op1 (op, loc, arg, end)
4265 re_opcode_t op;
4266 US_CHAR_TYPE *loc;
4267 int arg;
4268 US_CHAR_TYPE *end;
4269{
4270 register US_CHAR_TYPE *pfrom = end;
4271 register US_CHAR_TYPE *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4272
4273 while (pfrom != loc)
4274 *--pto = *--pfrom;
4275
4276 store_op1 (op, loc, arg);
4277}
4278
4279
4280/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4281/* ifdef MBS_SUPPORT, integer parameter is 1 wchar_t. */
4282
4283static void
4284insert_op2 (op, loc, arg1, arg2, end)
4285 re_opcode_t op;
4286 US_CHAR_TYPE *loc;
4287 int arg1, arg2;
4288 US_CHAR_TYPE *end;
4289{
4290 register US_CHAR_TYPE *pfrom = end;
4291 register US_CHAR_TYPE *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4292
4293 while (pfrom != loc)
4294 *--pto = *--pfrom;
4295
4296 store_op2 (op, loc, arg1, arg2);
4297}
4298
4299
4300/* P points to just after a ^ in PATTERN. Return true if that ^ comes
4301 after an alternative or a begin-subexpression. We assume there is at
4302 least one character before the ^. */
4303
4304static boolean
4305at_begline_loc_p (pattern, p, syntax)
4306 const CHAR_TYPE *pattern, *p;
4307 reg_syntax_t syntax;
4308{
4309 const CHAR_TYPE *prev = p - 2;
4310 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4311
4312 return
4313 /* After a subexpression? */
4314 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4315 /* After an alternative? */
4316 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4317}
4318
4319
4320/* The dual of at_begline_loc_p. This one is for $. We assume there is
4321 at least one character after the $, i.e., `P < PEND'. */
4322
4323static boolean
4324at_endline_loc_p (p, pend, syntax)
4325 const CHAR_TYPE *p, *pend;
4326 reg_syntax_t syntax;
4327{
4328 const CHAR_TYPE *next = p;
4329 boolean next_backslash = *next == '\\';
4330 const CHAR_TYPE *next_next = p + 1 < pend ? p + 1 : 0;
4331
4332 return
4333 /* Before a subexpression? */
4334 (syntax & RE_NO_BK_PARENS ? *next == ')'
4335 : next_backslash && next_next && *next_next == ')')
4336 /* Before an alternative? */
4337 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4338 : next_backslash && next_next && *next_next == '|');
4339}
4340
4341
4342/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4343 false if it's not. */
4344
4345static boolean
4346group_in_compile_stack (compile_stack, regnum)
4347 compile_stack_type compile_stack;
4348 regnum_t regnum;
4349{
4350 int this_element;
4351
4352 for (this_element = compile_stack.avail - 1;
4353 this_element >= 0;
4354 this_element--)
4355 if (compile_stack.stack[this_element].regnum == regnum)
4356 return true;
4357
4358 return false;
4359}
4360
4361#ifdef MBS_SUPPORT
4362/* This insert space, which size is "num", into the pattern at "loc".
4363 "end" must point the end of the allocated buffer. */
4364static void
4365insert_space (num, loc, end)
4366 int num;
4367 CHAR_TYPE *loc;
4368 CHAR_TYPE *end;
4369{
4370 register CHAR_TYPE *pto = end;
4371 register CHAR_TYPE *pfrom = end - num;
4372
4373 while (pfrom >= loc)
4374 *pto-- = *pfrom--;
4375}
4376#endif /* MBS_SUPPORT */
4377
4378#ifdef MBS_SUPPORT
4379static reg_errcode_t
4380compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
4381 char_set)
4382 CHAR_TYPE range_start_char;
4383 const CHAR_TYPE **p_ptr, *pend;
4384 CHAR_TYPE *char_set, *b;
4385 RE_TRANSLATE_TYPE translate;
4386 reg_syntax_t syntax;
4387{
4388 const CHAR_TYPE *p = *p_ptr;
4389 CHAR_TYPE range_start, range_end;
4390 reg_errcode_t ret;
4391# ifdef _LIBC
4392 uint32_t nrules;
4393 uint32_t start_val, end_val;
4394# endif
4395 if (p == pend)
4396 return REG_ERANGE;
4397
4398# ifdef _LIBC
4399 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4400 if (nrules != 0)
4401 {
4402 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4403 _NL_COLLATE_COLLSEQWC);
4404 const unsigned char *extra = (const unsigned char *)
4405 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4406
4407 if (range_start_char < -1)
4408 {
4409 /* range_start is a collating symbol. */
4410 int32_t *wextra;
4411 /* Retreive the index and get collation sequence value. */
4412 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4413 start_val = wextra[1 + *wextra];
4414 }
4415 else
4416 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4417
4418 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4419
4420 /* Report an error if the range is empty and the syntax prohibits
4421 this. */
4422 ret = ((syntax & RE_NO_EMPTY_RANGES)
4423 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4424
4425 /* Insert space to the end of the char_ranges. */
4426 insert_space(2, b - char_set[5] - 2, b - 1);
4427 *(b - char_set[5] - 2) = (wchar_t)start_val;
4428 *(b - char_set[5] - 1) = (wchar_t)end_val;
4429 char_set[4]++; /* ranges_index */
4430 }
4431 else
4432# endif
4433 {
4434 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4435 range_start_char;
4436 range_end = TRANSLATE (p[0]);
4437 /* Report an error if the range is empty and the syntax prohibits
4438 this. */
4439 ret = ((syntax & RE_NO_EMPTY_RANGES)
4440 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4441
4442 /* Insert space to the end of the char_ranges. */
4443 insert_space(2, b - char_set[5] - 2, b - 1);
4444 *(b - char_set[5] - 2) = range_start;
4445 *(b - char_set[5] - 1) = range_end;
4446 char_set[4]++; /* ranges_index */
4447 }
4448 /* Have to increment the pointer into the pattern string, so the
4449 caller isn't still at the ending character. */
4450 (*p_ptr)++;
4451
4452 return ret;
4453}
4454#else
4455/* Read the ending character of a range (in a bracket expression) from the
4456 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4457 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4458 Then we set the translation of all bits between the starting and
4459 ending characters (inclusive) in the compiled pattern B.
4460
4461 Return an error code.
4462
4463 We use these short variable names so we can use the same macros as
4464 `regex_compile' itself. */
4465
4466static reg_errcode_t
4467compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
4468 unsigned int range_start_char;
4469 const char **p_ptr, *pend;
4470 RE_TRANSLATE_TYPE translate;
4471 reg_syntax_t syntax;
4472 unsigned char *b;
4473{
4474 unsigned this_char;
4475 const char *p = *p_ptr;
4476 reg_errcode_t ret;
4477# if _LIBC
4478 const unsigned char *collseq;
4479 unsigned int start_colseq;
4480 unsigned int end_colseq;
4481# else
4482 unsigned end_char;
4483# endif
4484
4485 if (p == pend)
4486 return REG_ERANGE;
4487
4488 /* Have to increment the pointer into the pattern string, so the
4489 caller isn't still at the ending character. */
4490 (*p_ptr)++;
4491
4492 /* Report an error if the range is empty and the syntax prohibits this. */
4493 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4494
4495# if _LIBC
4496 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4497 _NL_COLLATE_COLLSEQMB);
4498
4499 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4500 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4501 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4502 {
4503 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4504
4505 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4506 {
4507 SET_LIST_BIT (TRANSLATE (this_char));
4508 ret = REG_NOERROR;
4509 }
4510 }
4511# else
4512 /* Here we see why `this_char' has to be larger than an `unsigned
4513 char' -- we would otherwise go into an infinite loop, since all
4514 characters <= 0xff. */
4515 range_start_char = TRANSLATE (range_start_char);
4516 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4517 and some compilers cast it to int implicitly, so following for_loop
4518 may fall to (almost) infinite loop.
4519 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4520 To avoid this, we cast p[0] to unsigned int and truncate it. */
4521 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4522
4523 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4524 {
4525 SET_LIST_BIT (TRANSLATE (this_char));
4526 ret = REG_NOERROR;
4527 }
4528# endif
4529
4530 return ret;
4531}
4532#endif /* MBS_SUPPORT */
4533
4534
4535/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4536 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4537 characters can start a string that matches the pattern. This fastmap
4538 is used by re_search to skip quickly over impossible starting points.
4539
4540 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4541 area as BUFP->fastmap.
4542
4543 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4544 the pattern buffer.
4545
4546 Returns 0 if we succeed, -2 if an internal error. */
4547
4548#ifdef MBS_SUPPORT
4549/* local function for re_compile_fastmap.
4550 truncate wchar_t character to char. */
4551static unsigned char truncate_wchar (CHAR_TYPE c);
4552
4553static unsigned char
4554truncate_wchar (c)
4555 CHAR_TYPE c;
4556{
4557 unsigned char buf[MB_LEN_MAX];
4558 int retval = wctomb(buf, c);
4559 return retval > 0 ? buf[0] : (unsigned char)c;
4560}
4561#endif /* MBS_SUPPORT */
4562
4563int
4564re_compile_fastmap (bufp)
4565 struct re_pattern_buffer *bufp;
4566{
4567 int j, k;
4568#ifdef MATCH_MAY_ALLOCATE
4569 fail_stack_type fail_stack;
4570#endif
4571#ifndef REGEX_MALLOC
4572 char *destination;
4573#endif
4574
4575 register char *fastmap = bufp->fastmap;
4576
4577#ifdef MBS_SUPPORT
4578 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4579 pattern to (char*) in regex_compile. */
4580 US_CHAR_TYPE *pattern = (US_CHAR_TYPE*)bufp->buffer;
4581 register US_CHAR_TYPE *pend = (US_CHAR_TYPE*) (bufp->buffer + bufp->used);
4582#else
4583 US_CHAR_TYPE *pattern = bufp->buffer;
4584 register US_CHAR_TYPE *pend = pattern + bufp->used;
4585#endif /* MBS_SUPPORT */
4586 US_CHAR_TYPE *p = pattern;
4587
4588#ifdef REL_ALLOC
4589 /* This holds the pointer to the failure stack, when
4590 it is allocated relocatably. */
4591 fail_stack_elt_t *failure_stack_ptr;
4592#endif
4593
4594 /* Assume that each path through the pattern can be null until
4595 proven otherwise. We set this false at the bottom of switch
4596 statement, to which we get only if a particular path doesn't
4597 match the empty string. */
4598 boolean path_can_be_null = true;
4599
4600 /* We aren't doing a `succeed_n' to begin with. */
4601 boolean succeed_n_p = false;
4602
4603 assert (fastmap != NULL && p != NULL);
4604
4605 INIT_FAIL_STACK ();
4606 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4607 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4608 bufp->can_be_null = 0;
4609
4610 while (1)
4611 {
4612 if (p == pend || *p == succeed)
4613 {
4614 /* We have reached the (effective) end of pattern. */
4615 if (!FAIL_STACK_EMPTY ())
4616 {
4617 bufp->can_be_null |= path_can_be_null;
4618
4619 /* Reset for next path. */
4620 path_can_be_null = true;
4621
4622 p = fail_stack.stack[--fail_stack.avail].pointer;
4623
4624 continue;
4625 }
4626 else
4627 break;
4628 }
4629
4630 /* We should never be about to go beyond the end of the pattern. */
4631 assert (p < pend);
4632
4633 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4634 {
4635
4636 /* I guess the idea here is to simply not bother with a fastmap
4637 if a backreference is used, since it's too hard to figure out
4638 the fastmap for the corresponding group. Setting
4639 `can_be_null' stops `re_search_2' from using the fastmap, so
4640 that is all we do. */
4641 case duplicate:
4642 bufp->can_be_null = 1;
4643 goto done;
4644
4645
4646 /* Following are the cases which match a character. These end
4647 with `break'. */
4648
4649#ifdef MBS_SUPPORT
4650 case exactn:
4651 fastmap[truncate_wchar(p[1])] = 1;
4652 break;
4653 case exactn_bin:
4654 fastmap[p[1]] = 1;
4655 break;
4656#else
4657 case exactn:
4658 fastmap[p[1]] = 1;
4659 break;
4660#endif /* MBS_SUPPORT */
4661
4662
4663#ifdef MBS_SUPPORT
4664 /* It is hard to distinguish fastmap from (multi byte) characters
4665 which depends on current locale. */
4666 case charset:
4667 case charset_not:
4668 case wordchar:
4669 case notwordchar:
4670 bufp->can_be_null = 1;
4671 goto done;
4672#else
4673 case charset:
4674 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4675 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4676 fastmap[j] = 1;
4677 break;
4678
4679
4680 case charset_not:
4681 /* Chars beyond end of map must be allowed. */
4682 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4683 fastmap[j] = 1;
4684
4685 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4686 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4687 fastmap[j] = 1;
4688 break;
4689
4690
4691 case wordchar:
4692 for (j = 0; j < (1 << BYTEWIDTH); j++)
4693 if (SYNTAX (j) == Sword)
4694 fastmap[j] = 1;
4695 break;
4696
4697
4698 case notwordchar:
4699 for (j = 0; j < (1 << BYTEWIDTH); j++)
4700 if (SYNTAX (j) != Sword)
4701 fastmap[j] = 1;
4702 break;
4703#endif
4704
4705 case anychar:
4706 {
4707 int fastmap_newline = fastmap['\n'];
4708
4709 /* `.' matches anything ... */
4710 for (j = 0; j < (1 << BYTEWIDTH); j++)
4711 fastmap[j] = 1;
4712
4713 /* ... except perhaps newline. */
4714 if (!(bufp->syntax & RE_DOT_NEWLINE))
4715 fastmap['\n'] = fastmap_newline;
4716
4717 /* Return if we have already set `can_be_null'; if we have,
4718 then the fastmap is irrelevant. Something's wrong here. */
4719 else if (bufp->can_be_null)
4720 goto done;
4721
4722 /* Otherwise, have to check alternative paths. */
4723 break;
4724 }
4725
4726#ifdef emacs
4727 case syntaxspec:
4728 k = *p++;
4729 for (j = 0; j < (1 << BYTEWIDTH); j++)
4730 if (SYNTAX (j) == (enum syntaxcode) k)
4731 fastmap[j] = 1;
4732 break;
4733
4734
4735 case notsyntaxspec:
4736 k = *p++;
4737 for (j = 0; j < (1 << BYTEWIDTH); j++)
4738 if (SYNTAX (j) != (enum syntaxcode) k)
4739 fastmap[j] = 1;
4740 break;
4741
4742
4743 /* All cases after this match the empty string. These end with
4744 `continue'. */
4745
4746
4747 case before_dot:
4748 case at_dot:
4749 case after_dot:
4750 continue;
4751#endif /* emacs */
4752
4753
4754 case no_op:
4755 case begline:
4756 case endline:
4757 case begbuf:
4758 case endbuf:
4759 case wordbound:
4760 case notwordbound:
4761 case wordbeg:
4762 case wordend:
4763 case push_dummy_failure:
4764 continue;
4765
4766
4767 case jump_n:
4768 case pop_failure_jump:
4769 case maybe_pop_jump:
4770 case jump:
4771 case jump_past_alt:
4772 case dummy_failure_jump:
4773 EXTRACT_NUMBER_AND_INCR (j, p);
4774 p += j;
4775 if (j > 0)
4776 continue;
4777
4778 /* Jump backward implies we just went through the body of a
4779 loop and matched nothing. Opcode jumped to should be
4780 `on_failure_jump' or `succeed_n'. Just treat it like an
4781 ordinary jump. For a * loop, it has pushed its failure
4782 point already; if so, discard that as redundant. */
4783 if ((re_opcode_t) *p != on_failure_jump
4784 && (re_opcode_t) *p != succeed_n)
4785 continue;
4786
4787 p++;
4788 EXTRACT_NUMBER_AND_INCR (j, p);
4789 p += j;
4790
4791 /* If what's on the stack is where we are now, pop it. */
4792 if (!FAIL_STACK_EMPTY ()
4793 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4794 fail_stack.avail--;
4795
4796 continue;
4797
4798
4799 case on_failure_jump:
4800 case on_failure_keep_string_jump:
4801 handle_on_failure_jump:
4802 EXTRACT_NUMBER_AND_INCR (j, p);
4803
4804 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4805 end of the pattern. We don't want to push such a point,
4806 since when we restore it above, entering the switch will
4807 increment `p' past the end of the pattern. We don't need
4808 to push such a point since we obviously won't find any more
4809 fastmap entries beyond `pend'. Such a pattern can match
4810 the null string, though. */
4811 if (p + j < pend)
4812 {
4813 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4814 {
4815 RESET_FAIL_STACK ();
4816 return -2;
4817 }
4818 }
4819 else
4820 bufp->can_be_null = 1;
4821
4822 if (succeed_n_p)
4823 {
4824 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4825 succeed_n_p = false;
4826 }
4827
4828 continue;
4829
4830
4831 case succeed_n:
4832 /* Get to the number of times to succeed. */
4833 p += OFFSET_ADDRESS_SIZE;
4834
4835 /* Increment p past the n for when k != 0. */
4836 EXTRACT_NUMBER_AND_INCR (k, p);
4837 if (k == 0)
4838 {
4839 p -= 2 * OFFSET_ADDRESS_SIZE;
4840 succeed_n_p = true; /* Spaghetti code alert. */
4841 goto handle_on_failure_jump;
4842 }
4843 continue;
4844
4845
4846 case set_number_at:
4847 p += 2 * OFFSET_ADDRESS_SIZE;
4848 continue;
4849
4850
4851 case start_memory:
4852 case stop_memory:
4853 p += 2;
4854 continue;
4855
4856
4857 default:
4858 abort (); /* We have listed all the cases. */
4859 } /* switch *p++ */
4860
4861 /* Getting here means we have found the possible starting
4862 characters for one path of the pattern -- and that the empty
4863 string does not match. We need not follow this path further.
4864 Instead, look at the next alternative (remembered on the
4865 stack), or quit if no more. The test at the top of the loop
4866 does these things. */
4867 path_can_be_null = false;
4868 p = pend;
4869 } /* while p */
4870
4871 /* Set `can_be_null' for the last path (also the first path, if the
4872 pattern is empty). */
4873 bufp->can_be_null |= path_can_be_null;
4874
4875 done:
4876 RESET_FAIL_STACK ();
4877 return 0;
4878} /* re_compile_fastmap */
4879#ifdef _LIBC
4880weak_alias (__re_compile_fastmap, re_compile_fastmap)
4881#endif
4882
4883
4884/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4885 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4886 this memory for recording register information. STARTS and ENDS
4887 must be allocated using the malloc library routine, and must each
4888 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4889
4890 If NUM_REGS == 0, then subsequent matches should allocate their own
4891 register data.
4892
4893 Unless this function is called, the first search or match using
4894 PATTERN_BUFFER will allocate its own register data, without
4895 freeing the old data. */
4896
4897void
4898re_set_registers (bufp, regs, num_regs, starts, ends)
4899 struct re_pattern_buffer *bufp;
4900 struct re_registers *regs;
4901 unsigned num_regs;
4902 regoff_t *starts, *ends;
4903{
4904 if (num_regs)
4905 {
4906 bufp->regs_allocated = REGS_REALLOCATE;
4907 regs->num_regs = num_regs;
4908 regs->start = starts;
4909 regs->end = ends;
4910 }
4911 else
4912 {
4913 bufp->regs_allocated = REGS_UNALLOCATED;
4914 regs->num_regs = 0;
4915 regs->start = regs->end = (regoff_t *) 0;
4916 }
4917}
4918#ifdef _LIBC
4919weak_alias (__re_set_registers, re_set_registers)
4920#endif
4921
4922
4923/* Searching routines. */
4924
4925/* Like re_search_2, below, but only one string is specified, and
4926 doesn't let you say where to stop matching. */
4927
4928int
4929re_search (bufp, string, size, startpos, range, regs)
4930 struct re_pattern_buffer *bufp;
4931 const char *string;
4932 int size, startpos, range;
4933 struct re_registers *regs;
4934{
4935 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4936 regs, size);
4937}
4938#ifdef _LIBC
4939weak_alias (__re_search, re_search)
4940#endif
4941
4942
4943/* Using the compiled pattern in BUFP->buffer, first tries to match the
4944 virtual concatenation of STRING1 and STRING2, starting first at index
4945 STARTPOS, then at STARTPOS + 1, and so on.
4946
4947 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4948
4949 RANGE is how far to scan while trying to match. RANGE = 0 means try
4950 only at STARTPOS; in general, the last start tried is STARTPOS +
4951 RANGE.
4952
4953 In REGS, return the indices of the virtual concatenation of STRING1
4954 and STRING2 that matched the entire BUFP->buffer and its contained
4955 subexpressions.
4956
4957 Do not consider matching one past the index STOP in the virtual
4958 concatenation of STRING1 and STRING2.
4959
4960 We return either the position in the strings at which the match was
4961 found, -1 if no match, or -2 if error (such as failure
4962 stack overflow). */
4963
4964int
4965re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
4966 struct re_pattern_buffer *bufp;
4967 const char *string1, *string2;
4968 int size1, size2;
4969 int startpos;
4970 int range;
4971 struct re_registers *regs;
4972 int stop;
4973{
4974 int val;
4975 register char *fastmap = bufp->fastmap;
4976 register RE_TRANSLATE_TYPE translate = bufp->translate;
4977 int total_size = size1 + size2;
4978 int endpos = startpos + range;
4979
4980 /* Check for out-of-range STARTPOS. */
4981 if (startpos < 0 || startpos > total_size)
4982 return -1;
4983
4984 /* Fix up RANGE if it might eventually take us outside
4985 the virtual concatenation of STRING1 and STRING2.
4986 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
4987 if (endpos < 0)
4988 range = 0 - startpos;
4989 else if (endpos > total_size)
4990 range = total_size - startpos;
4991
4992 /* If the search isn't to be a backwards one, don't waste time in a
4993 search for a pattern that must be anchored. */
4994 if (bufp->used > 0 && range > 0
4995 && ((re_opcode_t) bufp->buffer[0] == begbuf
4996 /* `begline' is like `begbuf' if it cannot match at newlines. */
4997 || ((re_opcode_t) bufp->buffer[0] == begline
4998 && !bufp->newline_anchor)))
4999 {
5000 if (startpos > 0)
5001 return -1;
5002 else
5003 range = 1;
5004 }
5005
5006#ifdef emacs
5007 /* In a forward search for something that starts with \=.
5008 don't keep searching past point. */
5009 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5010 {
5011 range = PT - startpos;
5012 if (range <= 0)
5013 return -1;
5014 }
5015#endif /* emacs */
5016
5017 /* Update the fastmap now if not correct already. */
5018 if (fastmap && !bufp->fastmap_accurate)
5019 if (re_compile_fastmap (bufp) == -2)
5020 return -2;
5021
5022 /* Loop through the string, looking for a place to start matching. */
5023 for (;;)
5024 {
5025 /* If a fastmap is supplied, skip quickly over characters that
5026 cannot be the start of a match. If the pattern can match the
5027 null string, however, we don't need to skip characters; we want
5028 the first null string. */
5029 if (fastmap && startpos < total_size && !bufp->can_be_null)
5030 {
5031 if (range > 0) /* Searching forwards. */
5032 {
5033 register const char *d;
5034 register int lim = 0;
5035 int irange = range;
5036
5037 if (startpos < size1 && startpos + range >= size1)
5038 lim = range - (size1 - startpos);
5039
5040 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5041
5042 /* Written out as an if-else to avoid testing `translate'
5043 inside the loop. */
5044 if (translate)
5045 while (range > lim
5046 && !fastmap[(unsigned char)
5047 translate[(unsigned char) *d++]])
5048 range--;
5049 else
5050 while (range > lim && !fastmap[(unsigned char) *d++])
5051 range--;
5052
5053 startpos += irange - range;
5054 }
5055 else /* Searching backwards. */
5056 {
5057 register CHAR_TYPE c = (size1 == 0 || startpos >= size1
5058 ? string2[startpos - size1]
5059 : string1[startpos]);
5060
5061 if (!fastmap[(unsigned char) TRANSLATE (c)])
5062 goto advance;
5063 }
5064 }
5065
5066 /* If can't match the null string, and that's all we have left, fail. */
5067 if (range >= 0 && startpos == total_size && fastmap
5068 && !bufp->can_be_null)
5069 return -1;
5070
5071 val = re_match_2_internal (bufp, string1, size1, string2, size2,
5072 startpos, regs, stop);
5073#ifndef REGEX_MALLOC
5074# ifdef C_ALLOCA
5075 alloca (0);
5076# endif
5077#endif
5078
5079 if (val >= 0)
5080 return startpos;
5081
5082 if (val == -2)
5083 return -2;
5084
5085 advance:
5086 if (!range)
5087 break;
5088 else if (range > 0)
5089 {
5090 range--;
5091 startpos++;
5092 }
5093 else
5094 {
5095 range++;
5096 startpos--;
5097 }
5098 }
5099 return -1;
5100} /* re_search_2 */
5101#ifdef _LIBC
5102weak_alias (__re_search_2, re_search_2)
5103#endif
5104
5105
5106#ifdef MBS_SUPPORT
5107/* This converts PTR, a pointer into one of the search wchar_t strings
5108 `string1' and `string2' into an multibyte string offset from the
5109 beginning of that string. We use mbs_offset to optimize.
5110 See convert_mbs_to_wcs. */
5111# define POINTER_TO_OFFSET(ptr) \
5112 (FIRST_STRING_P (ptr) \
5113 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5114 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5115 + csize1)))
5116#else
5117/* This converts PTR, a pointer into one of the search strings `string1'
5118 and `string2' into an offset from the beginning of that string. */
5119# define POINTER_TO_OFFSET(ptr) \
5120 (FIRST_STRING_P (ptr) \
5121 ? ((regoff_t) ((ptr) - string1)) \
5122 : ((regoff_t) ((ptr) - string2 + size1)))
5123#endif /* MBS_SUPPORT */
5124
5125/* Macros for dealing with the split strings in re_match_2. */
5126
5127#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
5128
5129/* Call before fetching a character with *d. This switches over to
5130 string2 if necessary. */
5131#define PREFETCH() \
5132 while (d == dend) \
5133 { \
5134 /* End of string2 => fail. */ \
5135 if (dend == end_match_2) \
5136 goto fail; \
5137 /* End of string1 => advance to string2. */ \
5138 d = string2; \
5139 dend = end_match_2; \
5140 }
5141
5142
5143/* Test if at very beginning or at very end of the virtual concatenation
5144 of `string1' and `string2'. If only one string, it's `string2'. */
5145#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5146#define AT_STRINGS_END(d) ((d) == end2)
5147
5148
5149/* Test if D points to a character which is word-constituent. We have
5150 two special cases to check for: if past the end of string1, look at
5151 the first character in string2; and if before the beginning of
5152 string2, look at the last character in string1. */
5153#ifdef MBS_SUPPORT
5154/* Use internationalized API instead of SYNTAX. */
5155# define WORDCHAR_P(d) \
5156 (iswalnum ((wint_t)((d) == end1 ? *string2 \
5157 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0)
5158#else
5159# define WORDCHAR_P(d) \
5160 (SYNTAX ((d) == end1 ? *string2 \
5161 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
5162 == Sword)
5163#endif /* MBS_SUPPORT */
5164
5165/* Disabled due to a compiler bug -- see comment at case wordbound */
5166#if 0
5167/* Test if the character before D and the one at D differ with respect
5168 to being word-constituent. */
5169#define AT_WORD_BOUNDARY(d) \
5170 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5171 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5172#endif
5173
5174/* Free everything we malloc. */
5175#ifdef MATCH_MAY_ALLOCATE
5176# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
5177# ifdef MBS_SUPPORT
5178# define FREE_VARIABLES() \
5179 do { \
5180 REGEX_FREE_STACK (fail_stack.stack); \
5181 FREE_VAR (regstart); \
5182 FREE_VAR (regend); \
5183 FREE_VAR (old_regstart); \
5184 FREE_VAR (old_regend); \
5185 FREE_VAR (best_regstart); \
5186 FREE_VAR (best_regend); \
5187 FREE_VAR (reg_info); \
5188 FREE_VAR (reg_dummy); \
5189 FREE_VAR (reg_info_dummy); \
5190 FREE_VAR (string1); \
5191 FREE_VAR (string2); \
5192 FREE_VAR (mbs_offset1); \
5193 FREE_VAR (mbs_offset2); \
5194 } while (0)
5195# else /* not MBS_SUPPORT */
5196# define FREE_VARIABLES() \
5197 do { \
5198 REGEX_FREE_STACK (fail_stack.stack); \
5199 FREE_VAR (regstart); \
5200 FREE_VAR (regend); \
5201 FREE_VAR (old_regstart); \
5202 FREE_VAR (old_regend); \
5203 FREE_VAR (best_regstart); \
5204 FREE_VAR (best_regend); \
5205 FREE_VAR (reg_info); \
5206 FREE_VAR (reg_dummy); \
5207 FREE_VAR (reg_info_dummy); \
5208 } while (0)
5209# endif /* MBS_SUPPORT */
5210#else
5211# define FREE_VAR(var) if (var) free (var); var = NULL
5212# ifdef MBS_SUPPORT
5213# define FREE_VARIABLES() \
5214 do { \
5215 FREE_VAR (string1); \
5216 FREE_VAR (string2); \
5217 FREE_VAR (mbs_offset1); \
5218 FREE_VAR (mbs_offset2); \
5219 } while (0)
5220# else
5221# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
5222# endif /* MBS_SUPPORT */
5223#endif /* not MATCH_MAY_ALLOCATE */
5224
5225/* These values must meet several constraints. They must not be valid
5226 register values; since we have a limit of 255 registers (because
5227 we use only one byte in the pattern for the register number), we can
5228 use numbers larger than 255. They must differ by 1, because of
5229 NUM_FAILURE_ITEMS above. And the value for the lowest register must
5230 be larger than the value for the highest register, so we do not try
5231 to actually save any registers when none are active. */
5232#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
5233#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5234
5235
5236/* Matching routines. */
5237
5238#ifndef emacs /* Emacs never uses this. */
5239/* re_match is like re_match_2 except it takes only a single string. */
5240
5241int
5242re_match (bufp, string, size, pos, regs)
5243 struct re_pattern_buffer *bufp;
5244 const char *string;
5245 int size, pos;
5246 struct re_registers *regs;
5247{
5248 int result = re_match_2_internal (bufp, NULL, 0, string, size,
5249 pos, regs, size);
5250# ifndef REGEX_MALLOC
5251# ifdef C_ALLOCA
5252 alloca (0);
5253# endif
5254# endif
5255 return result;
5256}
5257# ifdef _LIBC
5258weak_alias (__re_match, re_match)
5259# endif
5260#endif /* not emacs */
5261
5262static boolean group_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
5263 US_CHAR_TYPE *end,
5264 register_info_type *reg_info));
5265static boolean alt_match_null_string_p _RE_ARGS ((US_CHAR_TYPE *p,
5266 US_CHAR_TYPE *end,
5267 register_info_type *reg_info));
5268static boolean common_op_match_null_string_p _RE_ARGS ((US_CHAR_TYPE **p,
5269 US_CHAR_TYPE *end,
5270 register_info_type *reg_info));
5271static int bcmp_translate _RE_ARGS ((const CHAR_TYPE *s1, const CHAR_TYPE *s2,
5272 int len, char *translate));
5273
5274/* re_match_2 matches the compiled pattern in BUFP against the
5275 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5276 and SIZE2, respectively). We start matching at POS, and stop
5277 matching at STOP.
5278
5279 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
5280 store offsets for the substring each group matched in REGS. See the
5281 documentation for exactly how many groups we fill.
5282
5283 We return -1 if no match, -2 if an internal error (such as the
5284 failure stack overflowing). Otherwise, we return the length of the
5285 matched substring. */
5286
5287int
5288re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
5289 struct re_pattern_buffer *bufp;
5290 const char *string1, *string2;
5291 int size1, size2;
5292 int pos;
5293 struct re_registers *regs;
5294 int stop;
5295{
5296 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
5297 pos, regs, stop);
5298#ifndef REGEX_MALLOC
5299# ifdef C_ALLOCA
5300 alloca (0);
5301# endif
5302#endif
5303 return result;
5304}
5305#ifdef _LIBC
5306weak_alias (__re_match_2, re_match_2)
5307#endif
5308
5309#ifdef MBS_SUPPORT
5310
5311static int count_mbs_length PARAMS ((int *, int));
5312
5313/* This check the substring (from 0, to length) of the multibyte string,
5314 to which offset_buffer correspond. And count how many wchar_t_characters
5315 the substring occupy. We use offset_buffer to optimization.
5316 See convert_mbs_to_wcs. */
5317
5318static int
5319count_mbs_length(offset_buffer, length)
5320 int *offset_buffer;
5321 int length;
5322{
5323 int wcs_size;
5324
5325 /* Check whether the size is valid. */
5326 if (length < 0)
5327 return -1;
5328
5329 if (offset_buffer == NULL)
5330 return 0;
5331
5332 for (wcs_size = 0 ; offset_buffer[wcs_size] != -1 ; wcs_size++)
5333 {
5334 if (offset_buffer[wcs_size] == length)
5335 return wcs_size;
5336 if (offset_buffer[wcs_size] > length)
5337 /* It is a fragment of a wide character. */
5338 return -1;
5339 }
5340
5341 /* We reached at the sentinel. */
5342 return -1;
5343}
5344#endif /* MBS_SUPPORT */
5345
5346/* This is a separate function so that we can force an alloca cleanup
5347 afterwards. */
5348static int
5349#ifdef MBS_SUPPORT
5350re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, regs, stop)
5351 struct re_pattern_buffer *bufp;
5352 const char *cstring1, *cstring2;
5353 int csize1, csize2;
5354#else
5355re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
5356 struct re_pattern_buffer *bufp;
5357 const char *string1, *string2;
5358 int size1, size2;
5359#endif
5360 int pos;
5361 struct re_registers *regs;
5362 int stop;
5363{
5364 /* General temporaries. */
5365 int mcnt;
5366 US_CHAR_TYPE *p1;
5367#ifdef MBS_SUPPORT
5368 /* We need wchar_t* buffers correspond to string1, string2. */
5369 CHAR_TYPE *string1 = NULL, *string2 = NULL;
5370 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5371 int size1 = 0, size2 = 0;
5372 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
5373 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5374 /* They hold whether each wchar_t is binary data or not. */
5375 char *is_binary = NULL;
5376#endif /* MBS_SUPPORT */
5377
5378 /* Just past the end of the corresponding string. */
5379 const CHAR_TYPE *end1, *end2;
5380
5381 /* Pointers into string1 and string2, just past the last characters in
5382 each to consider matching. */
5383 const CHAR_TYPE *end_match_1, *end_match_2;
5384
5385 /* Where we are in the data, and the end of the current string. */
5386 const CHAR_TYPE *d, *dend;
5387
5388 /* Where we are in the pattern, and the end of the pattern. */
5389#ifdef MBS_SUPPORT
5390 US_CHAR_TYPE *pattern, *p;
5391 register US_CHAR_TYPE *pend;
5392#else
5393 US_CHAR_TYPE *p = bufp->buffer;
5394 register US_CHAR_TYPE *pend = p + bufp->used;
5395#endif /* MBS_SUPPORT */
5396
5397 /* Mark the opcode just after a start_memory, so we can test for an
5398 empty subpattern when we get to the stop_memory. */
5399 US_CHAR_TYPE *just_past_start_mem = 0;
5400
5401 /* We use this to map every character in the string. */
5402 RE_TRANSLATE_TYPE translate = bufp->translate;
5403
5404 /* Failure point stack. Each place that can handle a failure further
5405 down the line pushes a failure point on this stack. It consists of
5406 restart, regend, and reg_info for all registers corresponding to
5407 the subexpressions we're currently inside, plus the number of such
5408 registers, and, finally, two char *'s. The first char * is where
5409 to resume scanning the pattern; the second one is where to resume
5410 scanning the strings. If the latter is zero, the failure point is
5411 a ``dummy''; if a failure happens and the failure point is a dummy,
5412 it gets discarded and the next next one is tried. */
5413#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5414 fail_stack_type fail_stack;
5415#endif
5416#ifdef DEBUG
5417 static unsigned failure_id;
5418 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5419#endif
5420
5421#ifdef REL_ALLOC
5422 /* This holds the pointer to the failure stack, when
5423 it is allocated relocatably. */
5424 fail_stack_elt_t *failure_stack_ptr;
5425#endif
5426
5427 /* We fill all the registers internally, independent of what we
5428 return, for use in backreferences. The number here includes
5429 an element for register zero. */
5430 size_t num_regs = bufp->re_nsub + 1;
5431
5432 /* The currently active registers. */
5433 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5434 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5435
5436 /* Information on the contents of registers. These are pointers into
5437 the input strings; they record just what was matched (on this
5438 attempt) by a subexpression part of the pattern, that is, the
5439 regnum-th regstart pointer points to where in the pattern we began
5440 matching and the regnum-th regend points to right after where we
5441 stopped matching the regnum-th subexpression. (The zeroth register
5442 keeps track of what the whole pattern matches.) */
5443#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5444 const CHAR_TYPE **regstart, **regend;
5445#endif
5446
5447 /* If a group that's operated upon by a repetition operator fails to
5448 match anything, then the register for its start will need to be
5449 restored because it will have been set to wherever in the string we
5450 are when we last see its open-group operator. Similarly for a
5451 register's end. */
5452#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5453 const CHAR_TYPE **old_regstart, **old_regend;
5454#endif
5455
5456 /* The is_active field of reg_info helps us keep track of which (possibly
5457 nested) subexpressions we are currently in. The matched_something
5458 field of reg_info[reg_num] helps us tell whether or not we have
5459 matched any of the pattern so far this time through the reg_num-th
5460 subexpression. These two fields get reset each time through any
5461 loop their register is in. */
5462#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5463 register_info_type *reg_info;
5464#endif
5465
5466 /* The following record the register info as found in the above
5467 variables when we find a match better than any we've seen before.
5468 This happens as we backtrack through the failure points, which in
5469 turn happens only if we have not yet matched the entire string. */
5470 unsigned best_regs_set = false;
5471#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5472 const CHAR_TYPE **best_regstart, **best_regend;
5473#endif
5474
5475 /* Logically, this is `best_regend[0]'. But we don't want to have to
5476 allocate space for that if we're not allocating space for anything
5477 else (see below). Also, we never need info about register 0 for
5478 any of the other register vectors, and it seems rather a kludge to
5479 treat `best_regend' differently than the rest. So we keep track of
5480 the end of the best match so far in a separate variable. We
5481 initialize this to NULL so that when we backtrack the first time
5482 and need to test it, it's not garbage. */
5483 const CHAR_TYPE *match_end = NULL;
5484
5485 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5486 int set_regs_matched_done = 0;
5487
5488 /* Used when we pop values we don't care about. */
5489#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5490 const CHAR_TYPE **reg_dummy;
5491 register_info_type *reg_info_dummy;
5492#endif
5493
5494#ifdef DEBUG
5495 /* Counts the total number of registers pushed. */
5496 unsigned num_regs_pushed = 0;
5497#endif
5498
5499 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5500
5501 INIT_FAIL_STACK ();
5502
5503#ifdef MATCH_MAY_ALLOCATE
5504 /* Do not bother to initialize all the register variables if there are
5505 no groups in the pattern, as it takes a fair amount of time. If
5506 there are groups, we include space for register 0 (the whole
5507 pattern), even though we never use it, since it simplifies the
5508 array indexing. We should fix this. */
5509 if (bufp->re_nsub)
5510 {
5511 regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5512 regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5513 old_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5514 old_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5515 best_regstart = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5516 best_regend = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5517 reg_info = REGEX_TALLOC (num_regs, register_info_type);
5518 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_TYPE *);
5519 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
5520
5521 if (!(regstart && regend && old_regstart && old_regend && reg_info
5522 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5523 {
5524 FREE_VARIABLES ();
5525 return -2;
5526 }
5527 }
5528 else
5529 {
5530 /* We must initialize all our variables to NULL, so that
5531 `FREE_VARIABLES' doesn't try to free them. */
5532 regstart = regend = old_regstart = old_regend = best_regstart
5533 = best_regend = reg_dummy = NULL;
5534 reg_info = reg_info_dummy = (register_info_type *) NULL;
5535 }
5536#endif /* MATCH_MAY_ALLOCATE */
5537
5538 /* The starting position is bogus. */
5539#ifdef MBS_SUPPORT
5540 if (pos < 0 || pos > csize1 + csize2)
5541#else
5542 if (pos < 0 || pos > size1 + size2)
5543#endif
5544 {
5545 FREE_VARIABLES ();
5546 return -1;
5547 }
5548
5549#ifdef MBS_SUPPORT
5550 /* Allocate wchar_t array for string1 and string2 and
5551 fill them with converted string. */
5552 if (csize1 != 0)
5553 {
5554 string1 = REGEX_TALLOC (csize1 + 1, CHAR_TYPE);
5555 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5556 is_binary = REGEX_TALLOC (csize1 + 1, char);
5557 if (!string1 || !mbs_offset1 || !is_binary)
5558 {
5559 FREE_VAR (string1);
5560 FREE_VAR (mbs_offset1);
5561 FREE_VAR (is_binary);
5562 return -2;
5563 }
5564 size1 = convert_mbs_to_wcs(string1, cstring1, csize1,
5565 mbs_offset1, is_binary);
5566 string1[size1] = L'\0'; /* for a sentinel */
5567 FREE_VAR (is_binary);
5568 }
5569 if (csize2 != 0)
5570 {
5571 string2 = REGEX_TALLOC (csize2 + 1, CHAR_TYPE);
5572 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5573 is_binary = REGEX_TALLOC (csize2 + 1, char);
5574 if (!string2 || !mbs_offset2 || !is_binary)
5575 {
5576 FREE_VAR (string1);
5577 FREE_VAR (mbs_offset1);
5578 FREE_VAR (string2);
5579 FREE_VAR (mbs_offset2);
5580 FREE_VAR (is_binary);
5581 return -2;
5582 }
5583 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5584 mbs_offset2, is_binary);
5585 string2[size2] = L'\0'; /* for a sentinel */
5586 FREE_VAR (is_binary);
5587 }
5588
5589 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5590 pattern to (char*) in regex_compile. */
5591 p = pattern = (CHAR_TYPE*)bufp->buffer;
5592 pend = (CHAR_TYPE*)(bufp->buffer + bufp->used);
5593
5594#endif /* MBS_SUPPORT */
5595
5596 /* Initialize subexpression text positions to -1 to mark ones that no
5597 start_memory/stop_memory has been seen for. Also initialize the
5598 register information struct. */
5599 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5600 {
5601 regstart[mcnt] = regend[mcnt]
5602 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5603
5604 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5605 IS_ACTIVE (reg_info[mcnt]) = 0;
5606 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5607 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5608 }
5609
5610 /* We move `string1' into `string2' if the latter's empty -- but not if
5611 `string1' is null. */
5612 if (size2 == 0 && string1 != NULL)
5613 {
5614 string2 = string1;
5615 size2 = size1;
5616 string1 = 0;
5617 size1 = 0;
5618 }
5619 end1 = string1 + size1;
5620 end2 = string2 + size2;
5621
5622 /* Compute where to stop matching, within the two strings. */
5623#ifdef MBS_SUPPORT
5624 if (stop <= csize1)
5625 {
5626 mcnt = count_mbs_length(mbs_offset1, stop);
5627 end_match_1 = string1 + mcnt;
5628 end_match_2 = string2;
5629 }
5630 else
5631 {
5632 end_match_1 = end1;
5633 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5634 end_match_2 = string2 + mcnt;
5635 }
5636 if (mcnt < 0)
5637 { /* count_mbs_length return error. */
5638 FREE_VARIABLES ();
5639 return -1;
5640 }
5641#else
5642 if (stop <= size1)
5643 {
5644 end_match_1 = string1 + stop;
5645 end_match_2 = string2;
5646 }
5647 else
5648 {
5649 end_match_1 = end1;
5650 end_match_2 = string2 + stop - size1;
5651 }
5652#endif /* MBS_SUPPORT */
5653
5654 /* `p' scans through the pattern as `d' scans through the data.
5655 `dend' is the end of the input string that `d' points within. `d'
5656 is advanced into the following input string whenever necessary, but
5657 this happens before fetching; therefore, at the beginning of the
5658 loop, `d' can be pointing at the end of a string, but it cannot
5659 equal `string2'. */
5660#ifdef MBS_SUPPORT
5661 if (size1 > 0 && pos <= csize1)
5662 {
5663 mcnt = count_mbs_length(mbs_offset1, pos);
5664 d = string1 + mcnt;
5665 dend = end_match_1;
5666 }
5667 else
5668 {
5669 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5670 d = string2 + mcnt;
5671 dend = end_match_2;
5672 }
5673
5674 if (mcnt < 0)
5675 { /* count_mbs_length return error. */
5676 FREE_VARIABLES ();
5677 return -1;
5678 }
5679#else
5680 if (size1 > 0 && pos <= size1)
5681 {
5682 d = string1 + pos;
5683 dend = end_match_1;
5684 }
5685 else
5686 {
5687 d = string2 + pos - size1;
5688 dend = end_match_2;
5689 }
5690#endif /* MBS_SUPPORT */
5691
5692 DEBUG_PRINT1 ("The compiled pattern is:\n");
5693 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5694 DEBUG_PRINT1 ("The string to match is: `");
5695 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5696 DEBUG_PRINT1 ("'\n");
5697
5698 /* This loops over pattern commands. It exits by returning from the
5699 function if the match is complete, or it drops through if the match
5700 fails at this starting point in the input data. */
5701 for (;;)
5702 {
5703#ifdef _LIBC
5704 DEBUG_PRINT2 ("\n%p: ", p);
5705#else
5706 DEBUG_PRINT2 ("\n0x%x: ", p);
5707#endif
5708
5709 if (p == pend)
5710 { /* End of pattern means we might have succeeded. */
5711 DEBUG_PRINT1 ("end of pattern ... ");
5712
5713 /* If we haven't matched the entire string, and we want the
5714 longest match, try backtracking. */
5715 if (d != end_match_2)
5716 {
5717 /* 1 if this match ends in the same string (string1 or string2)
5718 as the best previous match. */
5719 boolean same_str_p = (FIRST_STRING_P (match_end)
5720 == MATCHING_IN_FIRST_STRING);
5721 /* 1 if this match is the best seen so far. */
5722 boolean best_match_p;
5723
5724 /* AIX compiler got confused when this was combined
5725 with the previous declaration. */
5726 if (same_str_p)
5727 best_match_p = d > match_end;
5728 else
5729 best_match_p = !MATCHING_IN_FIRST_STRING;
5730
5731 DEBUG_PRINT1 ("backtracking.\n");
5732
5733 if (!FAIL_STACK_EMPTY ())
5734 { /* More failure points to try. */
5735
5736 /* If exceeds best match so far, save it. */
5737 if (!best_regs_set || best_match_p)
5738 {
5739 best_regs_set = true;
5740 match_end = d;
5741
5742 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5743
5744 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5745 {
5746 best_regstart[mcnt] = regstart[mcnt];
5747 best_regend[mcnt] = regend[mcnt];
5748 }
5749 }
5750 goto fail;
5751 }
5752
5753 /* If no failure points, don't restore garbage. And if
5754 last match is real best match, don't restore second
5755 best one. */
5756 else if (best_regs_set && !best_match_p)
5757 {
5758 restore_best_regs:
5759 /* Restore best match. It may happen that `dend ==
5760 end_match_1' while the restored d is in string2.
5761 For example, the pattern `x.*y.*z' against the
5762 strings `x-' and `y-z-', if the two strings are
5763 not consecutive in memory. */
5764 DEBUG_PRINT1 ("Restoring best registers.\n");
5765
5766 d = match_end;
5767 dend = ((d >= string1 && d <= end1)
5768 ? end_match_1 : end_match_2);
5769
5770 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5771 {
5772 regstart[mcnt] = best_regstart[mcnt];
5773 regend[mcnt] = best_regend[mcnt];
5774 }
5775 }
5776 } /* d != end_match_2 */
5777
5778 succeed_label:
5779 DEBUG_PRINT1 ("Accepting match.\n");
5780 /* If caller wants register contents data back, do it. */
5781 if (regs && !bufp->no_sub)
5782 {
5783 /* Have the register data arrays been allocated? */
5784 if (bufp->regs_allocated == REGS_UNALLOCATED)
5785 { /* No. So allocate them with malloc. We need one
5786 extra element beyond `num_regs' for the `-1' marker
5787 GNU code uses. */
5788 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5789 regs->start = TALLOC (regs->num_regs, regoff_t);
5790 regs->end = TALLOC (regs->num_regs, regoff_t);
5791 if (regs->start == NULL || regs->end == NULL)
5792 {
5793 FREE_VARIABLES ();
5794 return -2;
5795 }
5796 bufp->regs_allocated = REGS_REALLOCATE;
5797 }
5798 else if (bufp->regs_allocated == REGS_REALLOCATE)
5799 { /* Yes. If we need more elements than were already
5800 allocated, reallocate them. If we need fewer, just
5801 leave it alone. */
5802 if (regs->num_regs < num_regs + 1)
5803 {
5804 regs->num_regs = num_regs + 1;
5805 RETALLOC (regs->start, regs->num_regs, regoff_t);
5806 RETALLOC (regs->end, regs->num_regs, regoff_t);
5807 if (regs->start == NULL || regs->end == NULL)
5808 {
5809 FREE_VARIABLES ();
5810 return -2;
5811 }
5812 }
5813 }
5814 else
5815 {
5816 /* These braces fend off a "empty body in an else-statement"
5817 warning under GCC when assert expands to nothing. */
5818 assert (bufp->regs_allocated == REGS_FIXED);
5819 }
5820
5821 /* Convert the pointer data in `regstart' and `regend' to
5822 indices. Register zero has to be set differently,
5823 since we haven't kept track of any info for it. */
5824 if (regs->num_regs > 0)
5825 {
5826 regs->start[0] = pos;
5827#ifdef MBS_SUPPORT
5828 if (MATCHING_IN_FIRST_STRING)
5829 regs->end[0] = mbs_offset1 != NULL ?
5830 mbs_offset1[d-string1] : 0;
5831 else
5832 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
5833 mbs_offset2[d-string2] : 0);
5834#else
5835 regs->end[0] = (MATCHING_IN_FIRST_STRING
5836 ? ((regoff_t) (d - string1))
5837 : ((regoff_t) (d - string2 + size1)));
5838#endif /* MBS_SUPPORT */
5839 }
5840
5841 /* Go through the first `min (num_regs, regs->num_regs)'
5842 registers, since that is all we initialized. */
5843 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
5844 mcnt++)
5845 {
5846 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
5847 regs->start[mcnt] = regs->end[mcnt] = -1;
5848 else
5849 {
5850 regs->start[mcnt]
5851 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
5852 regs->end[mcnt]
5853 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
5854 }
5855 }
5856
5857 /* If the regs structure we return has more elements than
5858 were in the pattern, set the extra elements to -1. If
5859 we (re)allocated the registers, this is the case,
5860 because we always allocate enough to have at least one
5861 -1 at the end. */
5862 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
5863 regs->start[mcnt] = regs->end[mcnt] = -1;
5864 } /* regs && !bufp->no_sub */
5865
5866 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5867 nfailure_points_pushed, nfailure_points_popped,
5868 nfailure_points_pushed - nfailure_points_popped);
5869 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
5870
5871#ifdef MBS_SUPPORT
5872 if (MATCHING_IN_FIRST_STRING)
5873 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
5874 else
5875 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
5876 csize1;
5877 mcnt -= pos;
5878#else
5879 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5880 ? string1
5881 : string2 - size1);
5882#endif /* MBS_SUPPORT */
5883
5884 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
5885
5886 FREE_VARIABLES ();
5887 return mcnt;
5888 }
5889
5890 /* Otherwise match next pattern command. */
5891 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5892 {
5893 /* Ignore these. Used to ignore the n of succeed_n's which
5894 currently have n == 0. */
5895 case no_op:
5896 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5897 break;
5898
5899 case succeed:
5900 DEBUG_PRINT1 ("EXECUTING succeed.\n");
5901 goto succeed_label;
5902
5903 /* Match the next n pattern characters exactly. The following
5904 byte in the pattern defines n, and the n bytes after that
5905 are the characters to match. */
5906 case exactn:
5907#ifdef MBS_SUPPORT
5908 case exactn_bin:
5909#endif
5910 mcnt = *p++;
5911 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
5912
5913 /* This is written out as an if-else so we don't waste time
5914 testing `translate' inside the loop. */
5915 if (translate)
5916 {
5917 do
5918 {
5919 PREFETCH ();
5920#ifdef MBS_SUPPORT
5921 if (*d <= 0xff)
5922 {
5923 if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
5924 != (US_CHAR_TYPE) *p++)
5925 goto fail;
5926 }
5927 else
5928 {
5929 if (*d++ != (CHAR_TYPE) *p++)
5930 goto fail;
5931 }
5932#else
5933 if ((US_CHAR_TYPE) translate[(unsigned char) *d++]
5934 != (US_CHAR_TYPE) *p++)
5935 goto fail;
5936#endif /* MBS_SUPPORT */
5937 }
5938 while (--mcnt);
5939 }
5940 else
5941 {
5942 do
5943 {
5944 PREFETCH ();
5945 if (*d++ != (CHAR_TYPE) *p++) goto fail;
5946 }
5947 while (--mcnt);
5948 }
5949 SET_REGS_MATCHED ();
5950 break;
5951
5952
5953 /* Match any character except possibly a newline or a null. */
5954 case anychar:
5955 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5956
5957 PREFETCH ();
5958
5959 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
5960 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
5961 goto fail;
5962
5963 SET_REGS_MATCHED ();
5964 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
5965 d++;
5966 break;
5967
5968
5969 case charset:
5970 case charset_not:
5971 {
5972 register US_CHAR_TYPE c;
5973#ifdef MBS_SUPPORT
5974 unsigned int i, char_class_length, coll_symbol_length,
5975 equiv_class_length, ranges_length, chars_length, length;
5976 CHAR_TYPE *workp, *workp2, *charset_top;
5977#define WORK_BUFFER_SIZE 128
5978 CHAR_TYPE str_buf[WORK_BUFFER_SIZE];
5979# ifdef _LIBC
5980 uint32_t nrules;
5981# endif /* _LIBC */
5982#endif /* MBS_SUPPORT */
5983 boolean not = (re_opcode_t) *(p - 1) == charset_not;
5984
5985 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
5986 PREFETCH ();
5987 c = TRANSLATE (*d); /* The character to match. */
5988#ifdef MBS_SUPPORT
5989# ifdef _LIBC
5990 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
5991# endif /* _LIBC */
5992 charset_top = p - 1;
5993 char_class_length = *p++;
5994 coll_symbol_length = *p++;
5995 equiv_class_length = *p++;
5996 ranges_length = *p++;
5997 chars_length = *p++;
5998 /* p points charset[6], so the address of the next instruction
5999 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6000 where l=length of char_classes, m=length of collating_symbol,
6001 n=equivalence_class, o=length of char_range,
6002 p'=length of character. */
6003 workp = p;
6004 /* Update p to indicate the next instruction. */
6005 p += char_class_length + coll_symbol_length+ equiv_class_length +
6006 2*ranges_length + chars_length;
6007
6008 /* match with char_class? */
6009 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6010 {
6011 wctype_t wctype;
6012 uintptr_t alignedp = ((uintptr_t)workp
6013 + __alignof__(wctype_t) - 1)
6014 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6015 wctype = *((wctype_t*)alignedp);
6016 workp += CHAR_CLASS_SIZE;
6017 if (iswctype((wint_t)c, wctype))
6018 goto char_set_matched;
6019 }
6020
6021 /* match with collating_symbol? */
6022# ifdef _LIBC
6023 if (nrules != 0)
6024 {
6025 const unsigned char *extra = (const unsigned char *)
6026 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6027
6028 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6029 workp++)
6030 {
6031 int32_t *wextra;
6032 wextra = (int32_t*)(extra + *workp++);
6033 for (i = 0; i < *wextra; ++i)
6034 if (TRANSLATE(d[i]) != wextra[1 + i])
6035 break;
6036
6037 if (i == *wextra)
6038 {
6039 /* Update d, however d will be incremented at
6040 char_set_matched:, we decrement d here. */
6041 d += i - 1;
6042 goto char_set_matched;
6043 }
6044 }
6045 }
6046 else /* (nrules == 0) */
6047# endif
6048 /* If we can't look up collation data, we use wcscoll
6049 instead. */
6050 {
6051 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6052 {
6053 const CHAR_TYPE *backup_d = d, *backup_dend = dend;
6054 length = wcslen(workp);
6055
6056 /* If wcscoll(the collating symbol, whole string) > 0,
6057 any substring of the string never match with the
6058 collating symbol. */
6059 if (wcscoll(workp, d) > 0)
6060 {
6061 workp += length + 1;
6062 continue;
6063 }
6064
6065 /* First, we compare the collating symbol with
6066 the first character of the string.
6067 If it don't match, we add the next character to
6068 the compare buffer in turn. */
6069 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6070 {
6071 int match;
6072 if (d == dend)
6073 {
6074 if (dend == end_match_2)
6075 break;
6076 d = string2;
6077 dend = end_match_2;
6078 }
6079
6080 /* add next character to the compare buffer. */
6081 str_buf[i] = TRANSLATE(*d);
6082 str_buf[i+1] = '\0';
6083
6084 match = wcscoll(workp, str_buf);
6085 if (match == 0)
6086 goto char_set_matched;
6087
6088 if (match < 0)
6089 /* (str_buf > workp) indicate (str_buf + X > workp),
6090 because for all X (str_buf + X > str_buf).
6091 So we don't need continue this loop. */
6092 break;
6093
6094 /* Otherwise(str_buf < workp),
6095 (str_buf+next_character) may equals (workp).
6096 So we continue this loop. */
6097 }
6098 /* not matched */
6099 d = backup_d;
6100 dend = backup_dend;
6101 workp += length + 1;
6102 }
6103 }
6104 /* match with equivalence_class? */
6105# ifdef _LIBC
6106 if (nrules != 0)
6107 {
6108 const CHAR_TYPE *backup_d = d, *backup_dend = dend;
6109 /* Try to match the equivalence class against
6110 those known to the collate implementation. */
6111 const int32_t *table;
6112 const int32_t *weights;
6113 const int32_t *extra;
6114 const int32_t *indirect;
6115 int32_t idx, idx2;
6116 wint_t *cp;
6117 size_t len;
6118
6119 /* This #include defines a local function! */
6120# include <locale/weightwc.h>
6121
6122 table = (const int32_t *)
6123 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6124 weights = (const wint_t *)
6125 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6126 extra = (const wint_t *)
6127 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6128 indirect = (const int32_t *)
6129 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6130
6131 /* Write 1 collating element to str_buf, and
6132 get its index. */
6133 idx2 = 0;
6134
6135 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6136 {
6137 cp = (wint_t*)str_buf;
6138 if (d == dend)
6139 {
6140 if (dend == end_match_2)
6141 break;
6142 d = string2;
6143 dend = end_match_2;
6144 }
6145 str_buf[i] = TRANSLATE(*(d+i));
6146 str_buf[i+1] = '\0'; /* sentinel */
6147 idx2 = findidx ((const wint_t**)&cp);
6148 }
6149
6150 /* Update d, however d will be incremented at
6151 char_set_matched:, we decrement d here. */
6152 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6153 if (d >= dend)
6154 {
6155 if (dend == end_match_2)
6156 d = dend;
6157 else
6158 {
6159 d = string2;
6160 dend = end_match_2;
6161 }
6162 }
6163
6164 len = weights[idx2];
6165
6166 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6167 workp++)
6168 {
6169 idx = (int32_t)*workp;
6170 /* We already checked idx != 0 in regex_compile. */
6171
6172 if (idx2 != 0 && len == weights[idx])
6173 {
6174 int cnt = 0;
6175 while (cnt < len && (weights[idx + 1 + cnt]
6176 == weights[idx2 + 1 + cnt]))
6177 ++cnt;
6178
6179 if (cnt == len)
6180 goto char_set_matched;
6181 }
6182 }
6183 /* not matched */
6184 d = backup_d;
6185 dend = backup_dend;
6186 }
6187 else /* (nrules == 0) */
6188# endif
6189 /* If we can't look up collation data, we use wcscoll
6190 instead. */
6191 {
6192 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6193 {
6194 const CHAR_TYPE *backup_d = d, *backup_dend = dend;
6195 length = wcslen(workp);
6196
6197 /* If wcscoll(the collating symbol, whole string) > 0,
6198 any substring of the string never match with the
6199 collating symbol. */
6200 if (wcscoll(workp, d) > 0)
6201 {
6202 workp += length + 1;
6203 break;
6204 }
6205
6206 /* First, we compare the equivalence class with
6207 the first character of the string.
6208 If it don't match, we add the next character to
6209 the compare buffer in turn. */
6210 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6211 {
6212 int match;
6213 if (d == dend)
6214 {
6215 if (dend == end_match_2)
6216 break;
6217 d = string2;
6218 dend = end_match_2;
6219 }
6220
6221 /* add next character to the compare buffer. */
6222 str_buf[i] = TRANSLATE(*d);
6223 str_buf[i+1] = '\0';
6224
6225 match = wcscoll(workp, str_buf);
6226
6227 if (match == 0)
6228 goto char_set_matched;
6229
6230 if (match < 0)
6231 /* (str_buf > workp) indicate (str_buf + X > workp),
6232 because for all X (str_buf + X > str_buf).
6233 So we don't need continue this loop. */
6234 break;
6235
6236 /* Otherwise(str_buf < workp),
6237 (str_buf+next_character) may equals (workp).
6238 So we continue this loop. */
6239 }
6240 /* not matched */
6241 d = backup_d;
6242 dend = backup_dend;
6243 workp += length + 1;
6244 }
6245 }
6246
6247 /* match with char_range? */
6248#ifdef _LIBC
6249 if (nrules != 0)
6250 {
6251 uint32_t collseqval;
6252 const char *collseq = (const char *)
6253 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6254
6255 collseqval = collseq_table_lookup (collseq, c);
6256
6257 for (; workp < p - chars_length ;)
6258 {
6259 uint32_t start_val, end_val;
6260
6261 /* We already compute the collation sequence value
6262 of the characters (or collating symbols). */
6263 start_val = (uint32_t) *workp++; /* range_start */
6264 end_val = (uint32_t) *workp++; /* range_end */
6265
6266 if (start_val <= collseqval && collseqval <= end_val)
6267 goto char_set_matched;
6268 }
6269 }
6270 else
6271#endif
6272 {
6273 /* We set range_start_char at str_buf[0], range_end_char
6274 at str_buf[4], and compared char at str_buf[2]. */
6275 str_buf[1] = 0;
6276 str_buf[2] = c;
6277 str_buf[3] = 0;
6278 str_buf[5] = 0;
6279 for (; workp < p - chars_length ;)
6280 {
6281 wchar_t *range_start_char, *range_end_char;
6282
6283 /* match if (range_start_char <= c <= range_end_char). */
6284
6285 /* If range_start(or end) < 0, we assume -range_start(end)
6286 is the offset of the collating symbol which is specified
6287 as the character of the range start(end). */
6288
6289 /* range_start */
6290 if (*workp < 0)
6291 range_start_char = charset_top - (*workp++);
6292 else
6293 {
6294 str_buf[0] = *workp++;
6295 range_start_char = str_buf;
6296 }
6297
6298 /* range_end */
6299 if (*workp < 0)
6300 range_end_char = charset_top - (*workp++);
6301 else
6302 {
6303 str_buf[4] = *workp++;
6304 range_end_char = str_buf + 4;
6305 }
6306
6307 if (wcscoll(range_start_char, str_buf+2) <= 0 &&
6308 wcscoll(str_buf+2, range_end_char) <= 0)
6309
6310 goto char_set_matched;
6311 }
6312 }
6313
6314 /* match with char? */
6315 for (; workp < p ; workp++)
6316 if (c == *workp)
6317 goto char_set_matched;
6318
6319 not = !not;
6320
6321 char_set_matched:
6322 if (not) goto fail;
6323#else
6324 /* Cast to `unsigned' instead of `unsigned char' in case the
6325 bit list is a full 32 bytes long. */
6326 if (c < (unsigned) (*p * BYTEWIDTH)
6327 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6328 not = !not;
6329
6330 p += 1 + *p;
6331
6332 if (!not) goto fail;
6333#undef WORK_BUFFER_SIZE
6334#endif /* MBS_SUPPORT */
6335 SET_REGS_MATCHED ();
6336 d++;
6337 break;
6338 }
6339
6340
6341 /* The beginning of a group is represented by start_memory.
6342 The arguments are the register number in the next byte, and the
6343 number of groups inner to this one in the next. The text
6344 matched within the group is recorded (in the internal
6345 registers data structure) under the register number. */
6346 case start_memory:
6347 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6348 (long int) *p, (long int) p[1]);
6349
6350 /* Find out if this group can match the empty string. */
6351 p1 = p; /* To send to group_match_null_string_p. */
6352
6353 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6354 REG_MATCH_NULL_STRING_P (reg_info[*p])
6355 = group_match_null_string_p (&p1, pend, reg_info);
6356
6357 /* Save the position in the string where we were the last time
6358 we were at this open-group operator in case the group is
6359 operated upon by a repetition operator, e.g., with `(a*)*b'
6360 against `ab'; then we want to ignore where we are now in
6361 the string in case this attempt to match fails. */
6362 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6363 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6364 : regstart[*p];
6365 DEBUG_PRINT2 (" old_regstart: %d\n",
6366 POINTER_TO_OFFSET (old_regstart[*p]));
6367
6368 regstart[*p] = d;
6369 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6370
6371 IS_ACTIVE (reg_info[*p]) = 1;
6372 MATCHED_SOMETHING (reg_info[*p]) = 0;
6373
6374 /* Clear this whenever we change the register activity status. */
6375 set_regs_matched_done = 0;
6376
6377 /* This is the new highest active register. */
6378 highest_active_reg = *p;
6379
6380 /* If nothing was active before, this is the new lowest active
6381 register. */
6382 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6383 lowest_active_reg = *p;
6384
6385 /* Move past the register number and inner group count. */
6386 p += 2;
6387 just_past_start_mem = p;
6388
6389 break;
6390
6391
6392 /* The stop_memory opcode represents the end of a group. Its
6393 arguments are the same as start_memory's: the register
6394 number, and the number of inner groups. */
6395 case stop_memory:
6396 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6397 (long int) *p, (long int) p[1]);
6398
6399 /* We need to save the string position the last time we were at
6400 this close-group operator in case the group is operated
6401 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6402 against `aba'; then we want to ignore where we are now in
6403 the string in case this attempt to match fails. */
6404 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6405 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6406 : regend[*p];
6407 DEBUG_PRINT2 (" old_regend: %d\n",
6408 POINTER_TO_OFFSET (old_regend[*p]));
6409
6410 regend[*p] = d;
6411 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6412
6413 /* This register isn't active anymore. */
6414 IS_ACTIVE (reg_info[*p]) = 0;
6415
6416 /* Clear this whenever we change the register activity status. */
6417 set_regs_matched_done = 0;
6418
6419 /* If this was the only register active, nothing is active
6420 anymore. */
6421 if (lowest_active_reg == highest_active_reg)
6422 {
6423 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6424 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6425 }
6426 else
6427 { /* We must scan for the new highest active register, since
6428 it isn't necessarily one less than now: consider
6429 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6430 new highest active register is 1. */
6431 US_CHAR_TYPE r = *p - 1;
6432 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6433 r--;
6434
6435 /* If we end up at register zero, that means that we saved
6436 the registers as the result of an `on_failure_jump', not
6437 a `start_memory', and we jumped to past the innermost
6438 `stop_memory'. For example, in ((.)*) we save
6439 registers 1 and 2 as a result of the *, but when we pop
6440 back to the second ), we are at the stop_memory 1.
6441 Thus, nothing is active. */
6442 if (r == 0)
6443 {
6444 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6445 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6446 }
6447 else
6448 highest_active_reg = r;
6449 }
6450
6451 /* If just failed to match something this time around with a
6452 group that's operated on by a repetition operator, try to
6453 force exit from the ``loop'', and restore the register
6454 information for this group that we had before trying this
6455 last match. */
6456 if ((!MATCHED_SOMETHING (reg_info[*p])
6457 || just_past_start_mem == p - 1)
6458 && (p + 2) < pend)
6459 {
6460 boolean is_a_jump_n = false;
6461
6462 p1 = p + 2;
6463 mcnt = 0;
6464 switch ((re_opcode_t) *p1++)
6465 {
6466 case jump_n:
6467 is_a_jump_n = true;
6468 case pop_failure_jump:
6469 case maybe_pop_jump:
6470 case jump:
6471 case dummy_failure_jump:
6472 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6473 if (is_a_jump_n)
6474 p1 += OFFSET_ADDRESS_SIZE;
6475 break;
6476
6477 default:
6478 /* do nothing */ ;
6479 }
6480 p1 += mcnt;
6481
6482 /* If the next operation is a jump backwards in the pattern
6483 to an on_failure_jump right before the start_memory
6484 corresponding to this stop_memory, exit from the loop
6485 by forcing a failure after pushing on the stack the
6486 on_failure_jump's jump in the pattern, and d. */
6487 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6488 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6489 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6490 {
6491 /* If this group ever matched anything, then restore
6492 what its registers were before trying this last
6493 failed match, e.g., with `(a*)*b' against `ab' for
6494 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6495 against `aba' for regend[3].
6496
6497 Also restore the registers for inner groups for,
6498 e.g., `((a*)(b*))*' against `aba' (register 3 would
6499 otherwise get trashed). */
6500
6501 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6502 {
6503 unsigned r;
6504
6505 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6506
6507 /* Restore this and inner groups' (if any) registers. */
6508 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6509 r++)
6510 {
6511 regstart[r] = old_regstart[r];
6512
6513 /* xx why this test? */
6514 if (old_regend[r] >= regstart[r])
6515 regend[r] = old_regend[r];
6516 }
6517 }
6518 p1++;
6519 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6520 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6521
6522 goto fail;
6523 }
6524 }
6525
6526 /* Move past the register number and the inner group count. */
6527 p += 2;
6528 break;
6529
6530
6531 /* \<digit> has been turned into a `duplicate' command which is
6532 followed by the numeric value of <digit> as the register number. */
6533 case duplicate:
6534 {
6535 register const CHAR_TYPE *d2, *dend2;
6536 int regno = *p++; /* Get which register to match against. */
6537 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6538
6539 /* Can't back reference a group which we've never matched. */
6540 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6541 goto fail;
6542
6543 /* Where in input to try to start matching. */
6544 d2 = regstart[regno];
6545
6546 /* Where to stop matching; if both the place to start and
6547 the place to stop matching are in the same string, then
6548 set to the place to stop, otherwise, for now have to use
6549 the end of the first string. */
6550
6551 dend2 = ((FIRST_STRING_P (regstart[regno])
6552 == FIRST_STRING_P (regend[regno]))
6553 ? regend[regno] : end_match_1);
6554 for (;;)
6555 {
6556 /* If necessary, advance to next segment in register
6557 contents. */
6558 while (d2 == dend2)
6559 {
6560 if (dend2 == end_match_2) break;
6561 if (dend2 == regend[regno]) break;
6562
6563 /* End of string1 => advance to string2. */
6564 d2 = string2;
6565 dend2 = regend[regno];
6566 }
6567 /* At end of register contents => success */
6568 if (d2 == dend2) break;
6569
6570 /* If necessary, advance to next segment in data. */
6571 PREFETCH ();
6572
6573 /* How many characters left in this segment to match. */
6574 mcnt = dend - d;
6575
6576 /* Want how many consecutive characters we can match in
6577 one shot, so, if necessary, adjust the count. */
6578 if (mcnt > dend2 - d2)
6579 mcnt = dend2 - d2;
6580
6581 /* Compare that many; failure if mismatch, else move
6582 past them. */
6583 if (translate
6584 ? bcmp_translate (d, d2, mcnt, translate)
6585 : memcmp (d, d2, mcnt*sizeof(US_CHAR_TYPE)))
6586 goto fail;
6587 d += mcnt, d2 += mcnt;
6588
6589 /* Do this because we've match some characters. */
6590 SET_REGS_MATCHED ();
6591 }
6592 }
6593 break;
6594
6595
6596 /* begline matches the empty string at the beginning of the string
6597 (unless `not_bol' is set in `bufp'), and, if
6598 `newline_anchor' is set, after newlines. */
6599 case begline:
6600 DEBUG_PRINT1 ("EXECUTING begline.\n");
6601
6602 if (AT_STRINGS_BEG (d))
6603 {
6604 if (!bufp->not_bol) break;
6605 }
6606 else if (d[-1] == '\n' && bufp->newline_anchor)
6607 {
6608 break;
6609 }
6610 /* In all other cases, we fail. */
6611 goto fail;
6612
6613
6614 /* endline is the dual of begline. */
6615 case endline:
6616 DEBUG_PRINT1 ("EXECUTING endline.\n");
6617
6618 if (AT_STRINGS_END (d))
6619 {
6620 if (!bufp->not_eol) break;
6621 }
6622
6623 /* We have to ``prefetch'' the next character. */
6624 else if ((d == end1 ? *string2 : *d) == '\n'
6625 && bufp->newline_anchor)
6626 {
6627 break;
6628 }
6629 goto fail;
6630
6631
6632 /* Match at the very beginning of the data. */
6633 case begbuf:
6634 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6635 if (AT_STRINGS_BEG (d))
6636 break;
6637 goto fail;
6638
6639
6640 /* Match at the very end of the data. */
6641 case endbuf:
6642 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6643 if (AT_STRINGS_END (d))
6644 break;
6645 goto fail;
6646
6647
6648 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6649 pushes NULL as the value for the string on the stack. Then
6650 `pop_failure_point' will keep the current value for the
6651 string, instead of restoring it. To see why, consider
6652 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6653 then the . fails against the \n. But the next thing we want
6654 to do is match the \n against the \n; if we restored the
6655 string value, we would be back at the foo.
6656
6657 Because this is used only in specific cases, we don't need to
6658 check all the things that `on_failure_jump' does, to make
6659 sure the right things get saved on the stack. Hence we don't
6660 share its code. The only reason to push anything on the
6661 stack at all is that otherwise we would have to change
6662 `anychar's code to do something besides goto fail in this
6663 case; that seems worse than this. */
6664 case on_failure_keep_string_jump:
6665 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6666
6667 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6668#ifdef _LIBC
6669 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6670#else
6671 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6672#endif
6673
6674 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6675 break;
6676
6677
6678 /* Uses of on_failure_jump:
6679
6680 Each alternative starts with an on_failure_jump that points
6681 to the beginning of the next alternative. Each alternative
6682 except the last ends with a jump that in effect jumps past
6683 the rest of the alternatives. (They really jump to the
6684 ending jump of the following alternative, because tensioning
6685 these jumps is a hassle.)
6686
6687 Repeats start with an on_failure_jump that points past both
6688 the repetition text and either the following jump or
6689 pop_failure_jump back to this on_failure_jump. */
6690 case on_failure_jump:
6691 on_failure:
6692 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6693
6694 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6695#ifdef _LIBC
6696 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6697#else
6698 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6699#endif
6700
6701 /* If this on_failure_jump comes right before a group (i.e.,
6702 the original * applied to a group), save the information
6703 for that group and all inner ones, so that if we fail back
6704 to this point, the group's information will be correct.
6705 For example, in \(a*\)*\1, we need the preceding group,
6706 and in \(zz\(a*\)b*\)\2, we need the inner group. */
6707
6708 /* We can't use `p' to check ahead because we push
6709 a failure point to `p + mcnt' after we do this. */
6710 p1 = p;
6711
6712 /* We need to skip no_op's before we look for the
6713 start_memory in case this on_failure_jump is happening as
6714 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
6715 against aba. */
6716 while (p1 < pend && (re_opcode_t) *p1 == no_op)
6717 p1++;
6718
6719 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
6720 {
6721 /* We have a new highest active register now. This will
6722 get reset at the start_memory we are about to get to,
6723 but we will have saved all the registers relevant to
6724 this repetition op, as described above. */
6725 highest_active_reg = *(p1 + 1) + *(p1 + 2);
6726 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6727 lowest_active_reg = *(p1 + 1);
6728 }
6729
6730 DEBUG_PRINT1 (":\n");
6731 PUSH_FAILURE_POINT (p + mcnt, d, -2);
6732 break;
6733
6734
6735 /* A smart repeat ends with `maybe_pop_jump'.
6736 We change it to either `pop_failure_jump' or `jump'. */
6737 case maybe_pop_jump:
6738 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6739 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
6740 {
6741 register US_CHAR_TYPE *p2 = p;
6742
6743 /* Compare the beginning of the repeat with what in the
6744 pattern follows its end. If we can establish that there
6745 is nothing that they would both match, i.e., that we
6746 would have to backtrack because of (as in, e.g., `a*a')
6747 then we can change to pop_failure_jump, because we'll
6748 never have to backtrack.
6749
6750 This is not true in the case of alternatives: in
6751 `(a|ab)*' we do need to backtrack to the `ab' alternative
6752 (e.g., if the string was `ab'). But instead of trying to
6753 detect that here, the alternative has put on a dummy
6754 failure point which is what we will end up popping. */
6755
6756 /* Skip over open/close-group commands.
6757 If what follows this loop is a ...+ construct,
6758 look at what begins its body, since we will have to
6759 match at least one of that. */
6760 while (1)
6761 {
6762 if (p2 + 2 < pend
6763 && ((re_opcode_t) *p2 == stop_memory
6764 || (re_opcode_t) *p2 == start_memory))
6765 p2 += 3;
6766 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
6767 && (re_opcode_t) *p2 == dummy_failure_jump)
6768 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
6769 else
6770 break;
6771 }
6772
6773 p1 = p + mcnt;
6774 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
6775 to the `maybe_finalize_jump' of this case. Examine what
6776 follows. */
6777
6778 /* If we're at the end of the pattern, we can change. */
6779 if (p2 == pend)
6780 {
6781 /* Consider what happens when matching ":\(.*\)"
6782 against ":/". I don't really understand this code
6783 yet. */
6784 p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
6785 pop_failure_jump;
6786 DEBUG_PRINT1
6787 (" End of pattern: change to `pop_failure_jump'.\n");
6788 }
6789
6790 else if ((re_opcode_t) *p2 == exactn
6791#ifdef MBS_SUPPORT
6792 || (re_opcode_t) *p2 == exactn_bin
6793#endif
6794 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
6795 {
6796 register US_CHAR_TYPE c
6797 = *p2 == (US_CHAR_TYPE) endline ? '\n' : p2[2];
6798
6799 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
6800#ifdef MBS_SUPPORT
6801 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
6802#endif
6803 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
6804 {
6805 p[-(1+OFFSET_ADDRESS_SIZE)] = (US_CHAR_TYPE)
6806 pop_failure_jump;
6807#ifdef MBS_SUPPORT
6808 if (MB_CUR_MAX != 1)
6809 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
6810 (wint_t) c,
6811 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
6812 else
6813#endif
6814 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
6815 (char) c,
6816 (char) p1[3+OFFSET_ADDRESS_SIZE]);
6817 }
6818
6819#ifndef MBS_SUPPORT
6820 else if ((re_opcode_t) p1[3] == charset
6821 || (re_opcode_t) p1[3] == charset_not)
6822 {
6823 int not = (re_opcode_t) p1[3] == charset_not;
6824
6825 if (c < (unsigned) (p1[4] * BYTEWIDTH)
6826 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6827 not = !not;
6828
6829 /* `not' is equal to 1 if c would match, which means
6830 that we can't change to pop_failure_jump. */
6831 if (!not)
6832 {
6833 p[-3] = (unsigned char) pop_failure_jump;
6834 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6835 }
6836 }
6837#endif /* not MBS_SUPPORT */
6838 }
6839#ifndef MBS_SUPPORT
6840 else if ((re_opcode_t) *p2 == charset)
6841 {
6842 /* We win if the first character of the loop is not part
6843 of the charset. */
6844 if ((re_opcode_t) p1[3] == exactn
6845 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
6846 && (p2[2 + p1[5] / BYTEWIDTH]
6847 & (1 << (p1[5] % BYTEWIDTH)))))
6848 {
6849 p[-3] = (unsigned char) pop_failure_jump;
6850 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6851 }
6852
6853 else if ((re_opcode_t) p1[3] == charset_not)
6854 {
6855 int idx;
6856 /* We win if the charset_not inside the loop
6857 lists every character listed in the charset after. */
6858 for (idx = 0; idx < (int) p2[1]; idx++)
6859 if (! (p2[2 + idx] == 0
6860 || (idx < (int) p1[4]
6861 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
6862 break;
6863
6864 if (idx == p2[1])
6865 {
6866 p[-3] = (unsigned char) pop_failure_jump;
6867 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6868 }
6869 }
6870 else if ((re_opcode_t) p1[3] == charset)
6871 {
6872 int idx;
6873 /* We win if the charset inside the loop
6874 has no overlap with the one after the loop. */
6875 for (idx = 0;
6876 idx < (int) p2[1] && idx < (int) p1[4];
6877 idx++)
6878 if ((p2[2 + idx] & p1[5 + idx]) != 0)
6879 break;
6880
6881 if (idx == p2[1] || idx == p1[4])
6882 {
6883 p[-3] = (unsigned char) pop_failure_jump;
6884 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
6885 }
6886 }
6887 }
6888#endif /* not MBS_SUPPORT */
6889 }
6890 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
6891 if ((re_opcode_t) p[-1] != pop_failure_jump)
6892 {
6893 p[-1] = (US_CHAR_TYPE) jump;
6894 DEBUG_PRINT1 (" Match => jump.\n");
6895 goto unconditional_jump;
6896 }
6897 /* Note fall through. */
6898
6899
6900 /* The end of a simple repeat has a pop_failure_jump back to
6901 its matching on_failure_jump, where the latter will push a
6902 failure point. The pop_failure_jump takes off failure
6903 points put on by this pop_failure_jump's matching
6904 on_failure_jump; we got through the pattern to here from the
6905 matching on_failure_jump, so didn't fail. */
6906 case pop_failure_jump:
6907 {
6908 /* We need to pass separate storage for the lowest and
6909 highest registers, even though we don't care about the
6910 actual values. Otherwise, we will restore only one
6911 register from the stack, since lowest will == highest in
6912 `pop_failure_point'. */
6913 active_reg_t dummy_low_reg, dummy_high_reg;
6914 US_CHAR_TYPE *pdummy = NULL;
6915 const CHAR_TYPE *sdummy = NULL;
6916
6917 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
6918 POP_FAILURE_POINT (sdummy, pdummy,
6919 dummy_low_reg, dummy_high_reg,
6920 reg_dummy, reg_dummy, reg_info_dummy);
6921 }
6922 /* Note fall through. */
6923
6924 unconditional_jump:
6925#ifdef _LIBC
6926 DEBUG_PRINT2 ("\n%p: ", p);
6927#else
6928 DEBUG_PRINT2 ("\n0x%x: ", p);
6929#endif
6930 /* Note fall through. */
6931
6932 /* Unconditionally jump (without popping any failure points). */
6933 case jump:
6934 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
6935 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
6936 p += mcnt; /* Do the jump. */
6937#ifdef _LIBC
6938 DEBUG_PRINT2 ("(to %p).\n", p);
6939#else
6940 DEBUG_PRINT2 ("(to 0x%x).\n", p);
6941#endif
6942 break;
6943
6944
6945 /* We need this opcode so we can detect where alternatives end
6946 in `group_match_null_string_p' et al. */
6947 case jump_past_alt:
6948 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
6949 goto unconditional_jump;
6950
6951
6952 /* Normally, the on_failure_jump pushes a failure point, which
6953 then gets popped at pop_failure_jump. We will end up at
6954 pop_failure_jump, also, and with a pattern of, say, `a+', we
6955 are skipping over the on_failure_jump, so we have to push
6956 something meaningless for pop_failure_jump to pop. */
6957 case dummy_failure_jump:
6958 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
6959 /* It doesn't matter what we push for the string here. What
6960 the code at `fail' tests is the value for the pattern. */
6961 PUSH_FAILURE_POINT (NULL, NULL, -2);
6962 goto unconditional_jump;
6963
6964
6965 /* At the end of an alternative, we need to push a dummy failure
6966 point in case we are followed by a `pop_failure_jump', because
6967 we don't want the failure point for the alternative to be
6968 popped. For example, matching `(a|ab)*' against `aab'
6969 requires that we match the `ab' alternative. */
6970 case push_dummy_failure:
6971 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
6972 /* See comments just above at `dummy_failure_jump' about the
6973 two zeroes. */
6974 PUSH_FAILURE_POINT (NULL, NULL, -2);
6975 break;
6976
6977 /* Have to succeed matching what follows at least n times.
6978 After that, handle like `on_failure_jump'. */
6979 case succeed_n:
6980 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
6981 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
6982
6983 assert (mcnt >= 0);
6984 /* Originally, this is how many times we HAVE to succeed. */
6985 if (mcnt > 0)
6986 {
6987 mcnt--;
6988 p += OFFSET_ADDRESS_SIZE;
6989 STORE_NUMBER_AND_INCR (p, mcnt);
6990#ifdef _LIBC
6991 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
6992 , mcnt);
6993#else
6994 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
6995 , mcnt);
6996#endif
6997 }
6998 else if (mcnt == 0)
6999 {
7000#ifdef _LIBC
7001 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7002 p + OFFSET_ADDRESS_SIZE);
7003#else
7004 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7005 p + OFFSET_ADDRESS_SIZE);
7006#endif /* _LIBC */
7007
7008#ifdef MBS_SUPPORT
7009 p[1] = (US_CHAR_TYPE) no_op;
7010#else
7011 p[2] = (US_CHAR_TYPE) no_op;
7012 p[3] = (US_CHAR_TYPE) no_op;
7013#endif /* MBS_SUPPORT */
7014 goto on_failure;
7015 }
7016 break;
7017
7018 case jump_n:
7019 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7020 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7021
7022 /* Originally, this is how many times we CAN jump. */
7023 if (mcnt)
7024 {
7025 mcnt--;
7026 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7027
7028#ifdef _LIBC
7029 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7030 mcnt);
7031#else
7032 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7033 mcnt);
7034#endif /* _LIBC */
7035 goto unconditional_jump;
7036 }
7037 /* If don't have to jump any more, skip over the rest of command. */
7038 else
7039 p += 2 * OFFSET_ADDRESS_SIZE;
7040 break;
7041
7042 case set_number_at:
7043 {
7044 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7045
7046 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7047 p1 = p + mcnt;
7048 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7049#ifdef _LIBC
7050 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7051#else
7052 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7053#endif
7054 STORE_NUMBER (p1, mcnt);
7055 break;
7056 }
7057
7058#if 0
7059 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7060 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7061 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7062 macro and introducing temporary variables works around the bug. */
7063
7064 case wordbound:
7065 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7066 if (AT_WORD_BOUNDARY (d))
7067 break;
7068 goto fail;
7069
7070 case notwordbound:
7071 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7072 if (AT_WORD_BOUNDARY (d))
7073 goto fail;
7074 break;
7075#else
7076 case wordbound:
7077 {
7078 boolean prevchar, thischar;
7079
7080 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7081 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7082 break;
7083
7084 prevchar = WORDCHAR_P (d - 1);
7085 thischar = WORDCHAR_P (d);
7086 if (prevchar != thischar)
7087 break;
7088 goto fail;
7089 }
7090
7091 case notwordbound:
7092 {
7093 boolean prevchar, thischar;
7094
7095 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7096 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7097 goto fail;
7098
7099 prevchar = WORDCHAR_P (d - 1);
7100 thischar = WORDCHAR_P (d);
7101 if (prevchar != thischar)
7102 goto fail;
7103 break;
7104 }
7105#endif
7106
7107 case wordbeg:
7108 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7109 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7110 break;
7111 goto fail;
7112
7113 case wordend:
7114 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7115 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7116 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
7117 break;
7118 goto fail;
7119
7120#ifdef emacs
7121 case before_dot:
7122 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7123 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7124 goto fail;
7125 break;
7126
7127 case at_dot:
7128 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7129 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7130 goto fail;
7131 break;
7132
7133 case after_dot:
7134 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7135 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7136 goto fail;
7137 break;
7138
7139 case syntaxspec:
7140 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7141 mcnt = *p++;
7142 goto matchsyntax;
7143
7144 case wordchar:
7145 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7146 mcnt = (int) Sword;
7147 matchsyntax:
7148 PREFETCH ();
7149 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7150 d++;
7151 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7152 goto fail;
7153 SET_REGS_MATCHED ();
7154 break;
7155
7156 case notsyntaxspec:
7157 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7158 mcnt = *p++;
7159 goto matchnotsyntax;
7160
7161 case notwordchar:
7162 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7163 mcnt = (int) Sword;
7164 matchnotsyntax:
7165 PREFETCH ();
7166 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7167 d++;
7168 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7169 goto fail;
7170 SET_REGS_MATCHED ();
7171 break;
7172
7173#else /* not emacs */
7174 case wordchar:
7175 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7176 PREFETCH ();
7177 if (!WORDCHAR_P (d))
7178 goto fail;
7179 SET_REGS_MATCHED ();
7180 d++;
7181 break;
7182
7183 case notwordchar:
7184 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7185 PREFETCH ();
7186 if (WORDCHAR_P (d))
7187 goto fail;
7188 SET_REGS_MATCHED ();
7189 d++;
7190 break;
7191#endif /* not emacs */
7192
7193 default:
7194 abort ();
7195 }
7196 continue; /* Successfully executed one pattern command; keep going. */
7197
7198
7199 /* We goto here if a matching operation fails. */
7200 fail:
7201 if (!FAIL_STACK_EMPTY ())
7202 { /* A restart point is known. Restore to that state. */
7203 DEBUG_PRINT1 ("\nFAIL:\n");
7204 POP_FAILURE_POINT (d, p,
7205 lowest_active_reg, highest_active_reg,
7206 regstart, regend, reg_info);
7207
7208 /* If this failure point is a dummy, try the next one. */
7209 if (!p)
7210 goto fail;
7211
7212 /* If we failed to the end of the pattern, don't examine *p. */
7213 assert (p <= pend);
7214 if (p < pend)
7215 {
7216 boolean is_a_jump_n = false;
7217
7218 /* If failed to a backwards jump that's part of a repetition
7219 loop, need to pop this failure point and use the next one. */
7220 switch ((re_opcode_t) *p)
7221 {
7222 case jump_n:
7223 is_a_jump_n = true;
7224 case maybe_pop_jump:
7225 case pop_failure_jump:
7226 case jump:
7227 p1 = p + 1;
7228 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7229 p1 += mcnt;
7230
7231 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7232 || (!is_a_jump_n
7233 && (re_opcode_t) *p1 == on_failure_jump))
7234 goto fail;
7235 break;
7236 default:
7237 /* do nothing */ ;
7238 }
7239 }
7240
7241 if (d >= string1 && d <= end1)
7242 dend = end_match_1;
7243 }
7244 else
7245 break; /* Matching at this starting point really fails. */
7246 } /* for (;;) */
7247
7248 if (best_regs_set)
7249 goto restore_best_regs;
7250
7251 FREE_VARIABLES ();
7252
7253 return -1; /* Failure to match. */
7254} /* re_match_2 */
7255
7256
7257/* Subroutine definitions for re_match_2. */
7258
7259
7260/* We are passed P pointing to a register number after a start_memory.
7261
7262 Return true if the pattern up to the corresponding stop_memory can
7263 match the empty string, and false otherwise.
7264
7265 If we find the matching stop_memory, sets P to point to one past its number.
7266 Otherwise, sets P to an undefined byte less than or equal to END.
7267
7268 We don't handle duplicates properly (yet). */
7269
7270static boolean
7271group_match_null_string_p (p, end, reg_info)
7272 US_CHAR_TYPE **p, *end;
7273 register_info_type *reg_info;
7274{
7275 int mcnt;
7276 /* Point to after the args to the start_memory. */
7277 US_CHAR_TYPE *p1 = *p + 2;
7278
7279 while (p1 < end)
7280 {
7281 /* Skip over opcodes that can match nothing, and return true or
7282 false, as appropriate, when we get to one that can't, or to the
7283 matching stop_memory. */
7284
7285 switch ((re_opcode_t) *p1)
7286 {
7287 /* Could be either a loop or a series of alternatives. */
7288 case on_failure_jump:
7289 p1++;
7290 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7291
7292 /* If the next operation is not a jump backwards in the
7293 pattern. */
7294
7295 if (mcnt >= 0)
7296 {
7297 /* Go through the on_failure_jumps of the alternatives,
7298 seeing if any of the alternatives cannot match nothing.
7299 The last alternative starts with only a jump,
7300 whereas the rest start with on_failure_jump and end
7301 with a jump, e.g., here is the pattern for `a|b|c':
7302
7303 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7304 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7305 /exactn/1/c
7306
7307 So, we have to first go through the first (n-1)
7308 alternatives and then deal with the last one separately. */
7309
7310
7311 /* Deal with the first (n-1) alternatives, which start
7312 with an on_failure_jump (see above) that jumps to right
7313 past a jump_past_alt. */
7314
7315 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7316 jump_past_alt)
7317 {
7318 /* `mcnt' holds how many bytes long the alternative
7319 is, including the ending `jump_past_alt' and
7320 its number. */
7321
7322 if (!alt_match_null_string_p (p1, p1 + mcnt -
7323 (1 + OFFSET_ADDRESS_SIZE),
7324 reg_info))
7325 return false;
7326
7327 /* Move to right after this alternative, including the
7328 jump_past_alt. */
7329 p1 += mcnt;
7330
7331 /* Break if it's the beginning of an n-th alternative
7332 that doesn't begin with an on_failure_jump. */
7333 if ((re_opcode_t) *p1 != on_failure_jump)
7334 break;
7335
7336 /* Still have to check that it's not an n-th
7337 alternative that starts with an on_failure_jump. */
7338 p1++;
7339 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7340 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7341 jump_past_alt)
7342 {
7343 /* Get to the beginning of the n-th alternative. */
7344 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7345 break;
7346 }
7347 }
7348
7349 /* Deal with the last alternative: go back and get number
7350 of the `jump_past_alt' just before it. `mcnt' contains
7351 the length of the alternative. */
7352 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7353
7354 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
7355 return false;
7356
7357 p1 += mcnt; /* Get past the n-th alternative. */
7358 } /* if mcnt > 0 */
7359 break;
7360
7361
7362 case stop_memory:
7363 assert (p1[1] == **p);
7364 *p = p1 + 2;
7365 return true;
7366
7367
7368 default:
7369 if (!common_op_match_null_string_p (&p1, end, reg_info))
7370 return false;
7371 }
7372 } /* while p1 < end */
7373
7374 return false;
7375} /* group_match_null_string_p */
7376
7377
7378/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7379 It expects P to be the first byte of a single alternative and END one
7380 byte past the last. The alternative can contain groups. */
7381
7382static boolean
7383alt_match_null_string_p (p, end, reg_info)
7384 US_CHAR_TYPE *p, *end;
7385 register_info_type *reg_info;
7386{
7387 int mcnt;
7388 US_CHAR_TYPE *p1 = p;
7389
7390 while (p1 < end)
7391 {
7392 /* Skip over opcodes that can match nothing, and break when we get
7393 to one that can't. */
7394
7395 switch ((re_opcode_t) *p1)
7396 {
7397 /* It's a loop. */
7398 case on_failure_jump:
7399 p1++;
7400 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7401 p1 += mcnt;
7402 break;
7403
7404 default:
7405 if (!common_op_match_null_string_p (&p1, end, reg_info))
7406 return false;
7407 }
7408 } /* while p1 < end */
7409
7410 return true;
7411} /* alt_match_null_string_p */
7412
7413
7414/* Deals with the ops common to group_match_null_string_p and
7415 alt_match_null_string_p.
7416
7417 Sets P to one after the op and its arguments, if any. */
7418
7419static boolean
7420common_op_match_null_string_p (p, end, reg_info)
7421 US_CHAR_TYPE **p, *end;
7422 register_info_type *reg_info;
7423{
7424 int mcnt;
7425 boolean ret;
7426 int reg_no;
7427 US_CHAR_TYPE *p1 = *p;
7428
7429 switch ((re_opcode_t) *p1++)
7430 {
7431 case no_op:
7432 case begline:
7433 case endline:
7434 case begbuf:
7435 case endbuf:
7436 case wordbeg:
7437 case wordend:
7438 case wordbound:
7439 case notwordbound:
7440#ifdef emacs
7441 case before_dot:
7442 case at_dot:
7443 case after_dot:
7444#endif
7445 break;
7446
7447 case start_memory:
7448 reg_no = *p1;
7449 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7450 ret = group_match_null_string_p (&p1, end, reg_info);
7451
7452 /* Have to set this here in case we're checking a group which
7453 contains a group and a back reference to it. */
7454
7455 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7456 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7457
7458 if (!ret)
7459 return false;
7460 break;
7461
7462 /* If this is an optimized succeed_n for zero times, make the jump. */
7463 case jump:
7464 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7465 if (mcnt >= 0)
7466 p1 += mcnt;
7467 else
7468 return false;
7469 break;
7470
7471 case succeed_n:
7472 /* Get to the number of times to succeed. */
7473 p1 += OFFSET_ADDRESS_SIZE;
7474 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7475
7476 if (mcnt == 0)
7477 {
7478 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7479 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7480 p1 += mcnt;
7481 }
7482 else
7483 return false;
7484 break;
7485
7486 case duplicate:
7487 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7488 return false;
7489 break;
7490
7491 case set_number_at:
7492 p1 += 2 * OFFSET_ADDRESS_SIZE;
7493
7494 default:
7495 /* All other opcodes mean we cannot match the empty string. */
7496 return false;
7497 }
7498
7499 *p = p1;
7500 return true;
7501} /* common_op_match_null_string_p */
7502
7503
7504/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7505 bytes; nonzero otherwise. */
7506
7507static int
7508bcmp_translate (s1, s2, len, translate)
7509 const CHAR_TYPE *s1, *s2;
7510 register int len;
7511 RE_TRANSLATE_TYPE translate;
7512{
7513 register const US_CHAR_TYPE *p1 = (const US_CHAR_TYPE *) s1;
7514 register const US_CHAR_TYPE *p2 = (const US_CHAR_TYPE *) s2;
7515 while (len)
7516 {
7517#ifdef MBS_SUPPORT
7518 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7519 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7520 return 1;
7521#else
7522 if (translate[*p1++] != translate[*p2++]) return 1;
7523#endif /* MBS_SUPPORT */
7524 len--;
7525 }
7526 return 0;
7527}
7528
7529
7530/* Entry points for GNU code. */
7531
7532/* re_compile_pattern is the GNU regular expression compiler: it
7533 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7534 Returns 0 if the pattern was valid, otherwise an error string.
7535
7536 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7537 are set in BUFP on entry.
7538
7539 We call regex_compile to do the actual compilation. */
7540
7541const char *
7542re_compile_pattern (pattern, length, bufp)
7543 const char *pattern;
7544 size_t length;
7545 struct re_pattern_buffer *bufp;
7546{
7547 reg_errcode_t ret;
7548
7549 /* GNU code is written to assume at least RE_NREGS registers will be set
7550 (and at least one extra will be -1). */
7551 bufp->regs_allocated = REGS_UNALLOCATED;
7552
7553 /* And GNU code determines whether or not to get register information
7554 by passing null for the REGS argument to re_match, etc., not by
7555 setting no_sub. */
7556 bufp->no_sub = 0;
7557
7558 /* Match anchors at newline. */
7559 bufp->newline_anchor = 1;
7560
7561 ret = regex_compile (pattern, length, re_syntax_options, bufp);
7562
7563 if (!ret)
7564 return NULL;
7565 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7566}
7567#ifdef _LIBC
7568weak_alias (__re_compile_pattern, re_compile_pattern)
7569#endif
7570
7571
7572/* Entry points compatible with 4.2 BSD regex library. We don't define
7573 them unless specifically requested. */
7574
7575#if defined _REGEX_RE_COMP || defined _LIBC
7576
7577/* BSD has one and only one pattern buffer. */
7578static struct re_pattern_buffer re_comp_buf;
7579
7580char *
7581#ifdef _LIBC
7582/* Make these definitions weak in libc, so POSIX programs can redefine
7583 these names if they don't use our functions, and still use
7584 regcomp/regexec below without link errors. */
7585weak_function
7586#endif
7587re_comp (s)
7588 const char *s;
7589{
7590 reg_errcode_t ret;
7591
7592 if (!s)
7593 {
7594 if (!re_comp_buf.buffer)
7595 return gettext ("No previous regular expression");
7596 return 0;
7597 }
7598
7599 if (!re_comp_buf.buffer)
7600 {
7601 re_comp_buf.buffer = (unsigned char *) malloc (200);
7602 if (re_comp_buf.buffer == NULL)
7603 return (char *) gettext (re_error_msgid
7604 + re_error_msgid_idx[(int) REG_ESPACE]);
7605 re_comp_buf.allocated = 200;
7606
7607 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7608 if (re_comp_buf.fastmap == NULL)
7609 return (char *) gettext (re_error_msgid
7610 + re_error_msgid_idx[(int) REG_ESPACE]);
7611 }
7612
7613 /* Since `re_exec' always passes NULL for the `regs' argument, we
7614 don't need to initialize the pattern buffer fields which affect it. */
7615
7616 /* Match anchors at newlines. */
7617 re_comp_buf.newline_anchor = 1;
7618
7619 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7620
7621 if (!ret)
7622 return NULL;
7623
7624 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7625 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7626}
7627
7628
7629int
7630#ifdef _LIBC
7631weak_function
7632#endif
7633re_exec (s)
7634 const char *s;
7635{
7636 const int len = strlen (s);
7637 return
7638 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7639}
7640
7641#endif /* _REGEX_RE_COMP */
7642
7643
7644/* POSIX.2 functions. Don't define these for Emacs. */
7645
7646#ifndef emacs
7647
7648/* regcomp takes a regular expression as a string and compiles it.
7649
7650 PREG is a regex_t *. We do not expect any fields to be initialized,
7651 since POSIX says we shouldn't. Thus, we set
7652
7653 `buffer' to the compiled pattern;
7654 `used' to the length of the compiled pattern;
7655 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7656 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7657 RE_SYNTAX_POSIX_BASIC;
7658 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7659 `fastmap' to an allocated space for the fastmap;
7660 `fastmap_accurate' to zero;
7661 `re_nsub' to the number of subexpressions in PATTERN.
7662
7663 PATTERN is the address of the pattern string.
7664
7665 CFLAGS is a series of bits which affect compilation.
7666
7667 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7668 use POSIX basic syntax.
7669
7670 If REG_NEWLINE is set, then . and [^...] don't match newline.
7671 Also, regexec will try a match beginning after every newline.
7672
7673 If REG_ICASE is set, then we considers upper- and lowercase
7674 versions of letters to be equivalent when matching.
7675
7676 If REG_NOSUB is set, then when PREG is passed to regexec, that
7677 routine will report only success or failure, and nothing about the
7678 registers.
7679
7680 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7681 the return codes and their meanings.) */
7682
7683int
7684regcomp (preg, pattern, cflags)
7685 regex_t *preg;
7686 const char *pattern;
7687 int cflags;
7688{
7689 reg_errcode_t ret;
7690 reg_syntax_t syntax
7691 = (cflags & REG_EXTENDED) ?
7692 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7693
7694 /* regex_compile will allocate the space for the compiled pattern. */
7695 preg->buffer = 0;
7696 preg->allocated = 0;
7697 preg->used = 0;
7698
7699 /* Try to allocate space for the fastmap. */
7700 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7701
7702 if (cflags & REG_ICASE)
7703 {
7704 unsigned i;
7705
7706 preg->translate
7707 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
7708 * sizeof (*(RE_TRANSLATE_TYPE)0));
7709 if (preg->translate == NULL)
7710 return (int) REG_ESPACE;
7711
7712 /* Map uppercase characters to corresponding lowercase ones. */
7713 for (i = 0; i < CHAR_SET_SIZE; i++)
7714 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
7715 }
7716 else
7717 preg->translate = NULL;
7718
7719 /* If REG_NEWLINE is set, newlines are treated differently. */
7720 if (cflags & REG_NEWLINE)
7721 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7722 syntax &= ~RE_DOT_NEWLINE;
7723 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7724 /* It also changes the matching behavior. */
7725 preg->newline_anchor = 1;
7726 }
7727 else
7728 preg->newline_anchor = 0;
7729
7730 preg->no_sub = !!(cflags & REG_NOSUB);
7731
7732 /* POSIX says a null character in the pattern terminates it, so we
7733 can use strlen here in compiling the pattern. */
7734 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
7735
7736 /* POSIX doesn't distinguish between an unmatched open-group and an
7737 unmatched close-group: both are REG_EPAREN. */
7738 if (ret == REG_ERPAREN) ret = REG_EPAREN;
7739
7740 if (ret == REG_NOERROR && preg->fastmap)
7741 {
7742 /* Compute the fastmap now, since regexec cannot modify the pattern
7743 buffer. */
7744 if (re_compile_fastmap (preg) == -2)
7745 {
7746 /* Some error occurred while computing the fastmap, just forget
7747 about it. */
7748 free (preg->fastmap);
7749 preg->fastmap = NULL;
7750 }
7751 }
7752
7753 return (int) ret;
7754}
7755#ifdef _LIBC
7756weak_alias (__regcomp, regcomp)
7757#endif
7758
7759
7760/* regexec searches for a given pattern, specified by PREG, in the
7761 string STRING.
7762
7763 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
7764 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
7765 least NMATCH elements, and we set them to the offsets of the
7766 corresponding matched substrings.
7767
7768 EFLAGS specifies `execution flags' which affect matching: if
7769 REG_NOTBOL is set, then ^ does not match at the beginning of the
7770 string; if REG_NOTEOL is set, then $ does not match at the end.
7771
7772 We return 0 if we find a match and REG_NOMATCH if not. */
7773
7774int
7775regexec (preg, string, nmatch, pmatch, eflags)
7776 const regex_t *preg;
7777 const char *string;
7778 size_t nmatch;
7779 regmatch_t pmatch[];
7780 int eflags;
7781{
7782 int ret;
7783 struct re_registers regs;
7784 regex_t private_preg;
7785 int len = strlen (string);
7786 boolean want_reg_info = !preg->no_sub && nmatch > 0;
7787
7788 private_preg = *preg;
7789
7790 private_preg.not_bol = !!(eflags & REG_NOTBOL);
7791 private_preg.not_eol = !!(eflags & REG_NOTEOL);
7792
7793 /* The user has told us exactly how many registers to return
7794 information about, via `nmatch'. We have to pass that on to the
7795 matching routines. */
7796 private_preg.regs_allocated = REGS_FIXED;
7797
7798 if (want_reg_info)
7799 {
7800 regs.num_regs = nmatch;
7801 regs.start = TALLOC (nmatch * 2, regoff_t);
7802 if (regs.start == NULL)
7803 return (int) REG_NOMATCH;
7804 regs.end = regs.start + nmatch;
7805 }
7806
7807 /* Perform the searching operation. */
7808 ret = re_search (&private_preg, string, len,
7809 /* start: */ 0, /* range: */ len,
7810 want_reg_info ? &regs : (struct re_registers *) 0);
7811
7812 /* Copy the register information to the POSIX structure. */
7813 if (want_reg_info)
7814 {
7815 if (ret >= 0)
7816 {
7817 unsigned r;
7818
7819 for (r = 0; r < nmatch; r++)
7820 {
7821 pmatch[r].rm_so = regs.start[r];
7822 pmatch[r].rm_eo = regs.end[r];
7823 }
7824 }
7825
7826 /* If we needed the temporary register info, free the space now. */
7827 free (regs.start);
7828 }
7829
7830 /* We want zero return to mean success, unlike `re_search'. */
7831 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
7832}
7833#ifdef _LIBC
7834weak_alias (__regexec, regexec)
7835#endif
7836
7837
7838/* Returns a message corresponding to an error code, ERRCODE, returned
7839 from either regcomp or regexec. We don't use PREG here. */
7840
7841size_t
7842regerror (errcode, preg, errbuf, errbuf_size)
7843 int errcode;
7844 const regex_t *preg;
7845 char *errbuf;
7846 size_t errbuf_size;
7847{
7848 const char *msg;
7849 size_t msg_size;
7850
7851 if (errcode < 0
7852 || errcode >= (int) (sizeof (re_error_msgid_idx)
7853 / sizeof (re_error_msgid_idx[0])))
7854 /* Only error codes returned by the rest of the code should be passed
7855 to this routine. If we are given anything else, or if other regex
7856 code generates an invalid error code, then the program has a bug.
7857 Dump core so we can fix it. */
7858 abort ();
7859
7860 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
7861
7862 msg_size = strlen (msg) + 1; /* Includes the null. */
7863
7864 if (errbuf_size != 0)
7865 {
7866 if (msg_size > errbuf_size)
7867 {
7868#if defined HAVE_MEMPCPY || defined _LIBC
7869 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
7870#else
7871 memcpy (errbuf, msg, errbuf_size - 1);
7872 errbuf[errbuf_size - 1] = 0;
7873#endif
7874 }
7875 else
7876 memcpy (errbuf, msg, msg_size);
7877 }
7878
7879 return msg_size;
7880}
7881#ifdef _LIBC
7882weak_alias (__regerror, regerror)
7883#endif
7884
7885
7886/* Free dynamically allocated space used by PREG. */
7887
7888void
7889regfree (preg)
7890 regex_t *preg;
7891{
7892 if (preg->buffer != NULL)
7893 free (preg->buffer);
7894 preg->buffer = NULL;
7895
7896 preg->allocated = 0;
7897 preg->used = 0;
7898
7899 if (preg->fastmap != NULL)
7900 free (preg->fastmap);
7901 preg->fastmap = NULL;
7902 preg->fastmap_accurate = 0;
7903
7904 if (preg->translate != NULL)
7905 free (preg->translate);
7906 preg->translate = NULL;
7907}
7908#ifdef _LIBC
7909weak_alias (__regfree, regfree)
7910#endif
7911
7912#endif /* not emacs */
Note: See TracBrowser for help on using the repository browser.