source: vendor/gcc/3.2.2/libiberty/regex.c

Last change on this file was 2, checked in by bird, 22 years ago

Initial revision

  • Property cvs2svn:cvs-rev set to 1.1
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 255.0 KB
Line 
1/* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
6 This file is part of the GNU C Library.
7
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
12
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with the GNU C Library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 02111-1307 USA. */
22
23/* This file has been modified for usage in libiberty. It includes "xregex.h"
24 instead of <regex.h>. The "xregex.h" header file renames all external
25 routines with an "x" prefix so they do not collide with the native regex
26 routines or with other components' regex routines. */
27/* AIX requires this to be the first thing in the file. */
28#if defined _AIX && !defined REGEX_MALLOC
29 #pragma alloca
30#endif
31
32#undef _GNU_SOURCE
33#define _GNU_SOURCE
34
35#ifdef HAVE_CONFIG_H
36# include <config.h>
37#endif
38
39#ifndef PARAMS
40# if defined __GNUC__ || (defined __STDC__ && __STDC__)
41# define PARAMS(args) args
42# else
43# define PARAMS(args) ()
44# endif /* GCC. */
45#endif /* Not PARAMS. */
46
47#ifndef INSIDE_RECURSION
48
49# if defined STDC_HEADERS && !defined emacs
50# include <stddef.h>
51# else
52/* We need this for `regex.h', and perhaps for the Emacs include files. */
53# include <sys/types.h>
54# endif
55
56# define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
57
58/* For platforms which support the ISO C Amendment 1 functionality we
59 support user-defined character classes. */
60# if defined _LIBC || WIDE_CHAR_SUPPORT
61/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62# include <wchar.h>
63# include <wctype.h>
64# endif
65
66# ifdef _LIBC
67/* We have to keep the namespace clean. */
68# define regfree(preg) __regfree (preg)
69# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
71# define regerror(errcode, preg, errbuf, errbuf_size) \
72 __regerror(errcode, preg, errbuf, errbuf_size)
73# define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77# define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79# define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81# define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83# define re_set_syntax(syntax) __re_set_syntax (syntax)
84# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
88# define btowc __btowc
89
90/* We are also using some library internals. */
91# include <locale/localeinfo.h>
92# include <locale/elem-hash.h>
93# include <langinfo.h>
94# include <locale/coll-lookup.h>
95# endif
96
97/* This is for other GNU distributions with internationalized messages. */
98# if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC
99# include <libintl.h>
100# ifdef _LIBC
101# undef gettext
102# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
103# endif
104# else
105# define gettext(msgid) (msgid)
106# endif
107
108# ifndef gettext_noop
109/* This define is so xgettext can find the internationalizable
110 strings. */
111# define gettext_noop(String) String
112# endif
113
114/* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116# ifdef emacs
117
118# include "lisp.h"
119# include "buffer.h"
120# include "syntax.h"
121
122# else /* not emacs */
123
124/* If we are not linking with Emacs proper,
125 we can't use the relocating allocator
126 even if config.h says that we can. */
127# undef REL_ALLOC
128
129# if defined STDC_HEADERS || defined _LIBC
130# include <stdlib.h>
131# else
132char *malloc ();
133char *realloc ();
134# endif
135
136/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
137 If nothing else has been done, use the method below. */
138# ifdef INHIBIT_STRING_HEADER
139# if !(defined HAVE_BZERO && defined HAVE_BCOPY)
140# if !defined bzero && !defined bcopy
141# undef INHIBIT_STRING_HEADER
142# endif
143# endif
144# endif
145
146/* This is the normal way of making sure we have a bcopy and a bzero.
147 This is used in most programs--a few other programs avoid this
148 by defining INHIBIT_STRING_HEADER. */
149# ifndef INHIBIT_STRING_HEADER
150# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
151# include <string.h>
152# ifndef bzero
153# ifndef _LIBC
154# define bzero(s, n) (memset (s, '\0', n), (s))
155# else
156# define bzero(s, n) __bzero (s, n)
157# endif
158# endif
159# else
160# include <strings.h>
161# ifndef memcmp
162# define memcmp(s1, s2, n) bcmp (s1, s2, n)
163# endif
164# ifndef memcpy
165# define memcpy(d, s, n) (bcopy (s, d, n), (d))
166# endif
167# endif
168# endif
169
170/* Define the syntax stuff for \<, \>, etc. */
171
172/* This must be nonzero for the wordchar and notwordchar pattern
173 commands in re_match_2. */
174# ifndef Sword
175# define Sword 1
176# endif
177
178# ifdef SWITCH_ENUM_BUG
179# define SWITCH_ENUM_CAST(x) ((int)(x))
180# else
181# define SWITCH_ENUM_CAST(x) (x)
182# endif
183
184# endif /* not emacs */
185
186# if defined _LIBC || HAVE_LIMITS_H
187# include <limits.h>
188# endif
189
190# ifndef MB_LEN_MAX
191# define MB_LEN_MAX 1
192# endif
193
194
195/* Get the interface, including the syntax bits. */
196# include "xregex.h" /* change for libiberty */
197
198/* isalpha etc. are used for the character classes. */
199# include <ctype.h>
200
201/* Jim Meyering writes:
202
203 "... Some ctype macros are valid only for character codes that
204 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
205 using /bin/cc or gcc but without giving an ansi option). So, all
206 ctype uses should be through macros like ISPRINT... If
207 STDC_HEADERS is defined, then autoconf has verified that the ctype
208 macros don't need to be guarded with references to isascii. ...
209 Defining isascii to 1 should let any compiler worth its salt
210 eliminate the && through constant folding."
211 Solaris defines some of these symbols so we must undefine them first. */
212
213# undef ISASCII
214# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
215# define ISASCII(c) 1
216# else
217# define ISASCII(c) isascii(c)
218# endif
219
220# ifdef isblank
221# define ISBLANK(c) (ISASCII (c) && isblank (c))
222# else
223# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
224# endif
225# ifdef isgraph
226# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
227# else
228# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
229# endif
230
231# undef ISPRINT
232# define ISPRINT(c) (ISASCII (c) && isprint (c))
233# define ISDIGIT(c) (ISASCII (c) && isdigit (c))
234# define ISALNUM(c) (ISASCII (c) && isalnum (c))
235# define ISALPHA(c) (ISASCII (c) && isalpha (c))
236# define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
237# define ISLOWER(c) (ISASCII (c) && islower (c))
238# define ISPUNCT(c) (ISASCII (c) && ispunct (c))
239# define ISSPACE(c) (ISASCII (c) && isspace (c))
240# define ISUPPER(c) (ISASCII (c) && isupper (c))
241# define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
242
243# ifdef _tolower
244# define TOLOWER(c) _tolower(c)
245# else
246# define TOLOWER(c) tolower(c)
247# endif
248
249# ifndef NULL
250# define NULL (void *)0
251# endif
252
253/* We remove any previous definition of `SIGN_EXTEND_CHAR',
254 since ours (we hope) works properly with all combinations of
255 machines, compilers, `char' and `unsigned char' argument types.
256 (Per Bothner suggested the basic approach.) */
257# undef SIGN_EXTEND_CHAR
258# if __STDC__
259# define SIGN_EXTEND_CHAR(c) ((signed char) (c))
260# else /* not __STDC__ */
261/* As in Harbison and Steele. */
262# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
263# endif
264
265
266# ifndef emacs
267/* How many characters in the character set. */
268# define CHAR_SET_SIZE 256
269
270# ifdef SYNTAX_TABLE
271
272extern char *re_syntax_table;
273
274# else /* not SYNTAX_TABLE */
275
276static char re_syntax_table[CHAR_SET_SIZE];
277
278static void init_syntax_once PARAMS ((void));
279
280static void
281init_syntax_once ()
282{
283 register int c;
284 static int done = 0;
285
286 if (done)
287 return;
288 bzero (re_syntax_table, sizeof re_syntax_table);
289
290 for (c = 0; c < CHAR_SET_SIZE; ++c)
291 if (ISALNUM (c))
292 re_syntax_table[c] = Sword;
293
294 re_syntax_table['_'] = Sword;
295
296 done = 1;
297}
298
299# endif /* not SYNTAX_TABLE */
300
301# define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
302
303# endif /* emacs */
304
305
306/* Integer type for pointers. */
307# if !defined _LIBC && !defined HAVE_UINTPTR_T
308typedef unsigned long int uintptr_t;
309# endif
310
311/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
312 use `alloca' instead of `malloc'. This is because using malloc in
313 re_search* or re_match* could cause memory leaks when C-g is used in
314 Emacs; also, malloc is slower and causes storage fragmentation. On
315 the other hand, malloc is more portable, and easier to debug.
316
317 Because we sometimes use alloca, some routines have to be macros,
318 not functions -- `alloca'-allocated space disappears at the end of the
319 function it is called in. */
320
321# ifdef REGEX_MALLOC
322
323# define REGEX_ALLOCATE malloc
324# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
325# define REGEX_FREE free
326
327# else /* not REGEX_MALLOC */
328
329/* Emacs already defines alloca, sometimes. */
330# ifndef alloca
331
332/* Make alloca work the best possible way. */
333# ifdef __GNUC__
334# define alloca __builtin_alloca
335# else /* not __GNUC__ */
336# if HAVE_ALLOCA_H
337# include <alloca.h>
338# endif /* HAVE_ALLOCA_H */
339# endif /* not __GNUC__ */
340
341# endif /* not alloca */
342
343# define REGEX_ALLOCATE alloca
344
345/* Assumes a `char *destination' variable. */
346# define REGEX_REALLOCATE(source, osize, nsize) \
347 (destination = (char *) alloca (nsize), \
348 memcpy (destination, source, osize))
349
350/* No need to do anything to free, after alloca. */
351# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
352
353# endif /* not REGEX_MALLOC */
354
355/* Define how to allocate the failure stack. */
356
357# if defined REL_ALLOC && defined REGEX_MALLOC
358
359# define REGEX_ALLOCATE_STACK(size) \
360 r_alloc (&failure_stack_ptr, (size))
361# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
362 r_re_alloc (&failure_stack_ptr, (nsize))
363# define REGEX_FREE_STACK(ptr) \
364 r_alloc_free (&failure_stack_ptr)
365
366# else /* not using relocating allocator */
367
368# ifdef REGEX_MALLOC
369
370# define REGEX_ALLOCATE_STACK malloc
371# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
372# define REGEX_FREE_STACK free
373
374# else /* not REGEX_MALLOC */
375
376# define REGEX_ALLOCATE_STACK alloca
377
378# define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 REGEX_REALLOCATE (source, osize, nsize)
380/* No need to explicitly free anything. */
381# define REGEX_FREE_STACK(arg)
382
383# endif /* not REGEX_MALLOC */
384# endif /* not using relocating allocator */
385
386
387/* True if `size1' is nonzero and PTR is pointing anywhere inside
388 `string1' or just past its end. This works if PTR is NULL, which is
389 a good thing. */
390# define FIRST_STRING_P(ptr) \
391 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
392
/* (Re)Allocate N items of type T using malloc/realloc.

   TALLOC (n, t)          evaluates to a freshly malloc'ed `t *'.
   RETALLOC (addr, n, t)  reallocs ADDR in place (ADDR must be an lvalue)
                          and evaluates to the new value.  The result of
                          realloc is stored straight back into ADDR, so
                          the caller has to test ADDR afterwards.
   RETALLOC_IF (addr, n, t)
                          statement: RETALLOC if ADDR is already set,
                          otherwise TALLOC into ADDR.  It is wrapped in
                          do { } while (0) so that it behaves as a single
                          statement and cannot capture the `else' of an
                          enclosing `if'.
   REGEX_TALLOC (n, t)    like TALLOC but through REGEX_ALLOCATE.

   None of these check N * sizeof (T) for overflow.  */
# define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
# define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
# define RETALLOC_IF(addr, n, t)					\
  do									\
    {									\
      if (addr)								\
	RETALLOC ((addr), (n), t);					\
      else								\
	(addr) = TALLOC ((n), t);					\
    }									\
  while (0)
# define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
399
400# define BYTEWIDTH 8 /* In bits. */
401
402# define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
403
404# undef MAX
405# undef MIN
406# define MAX(a, b) ((a) > (b) ? (a) : (b))
407# define MIN(a, b) ((a) < (b) ? (a) : (b))
408
409typedef char boolean;
410# define false 0
411# define true 1
412
413static reg_errcode_t byte_regex_compile _RE_ARGS ((const char *pattern, size_t size,
414 reg_syntax_t syntax,
415 struct re_pattern_buffer *bufp));
416
417static int byte_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
418 const char *string1, int size1,
419 const char *string2, int size2,
420 int pos,
421 struct re_registers *regs,
422 int stop));
423static int byte_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
424 const char *string1, int size1,
425 const char *string2, int size2,
426 int startpos, int range,
427 struct re_registers *regs, int stop));
428static int byte_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
429
430#ifdef MBS_SUPPORT
431static reg_errcode_t wcs_regex_compile _RE_ARGS ((const char *pattern, size_t size,
432 reg_syntax_t syntax,
433 struct re_pattern_buffer *bufp));
434
435
436static int wcs_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
437 const char *cstring1, int csize1,
438 const char *cstring2, int csize2,
439 int pos,
440 struct re_registers *regs,
441 int stop,
442 wchar_t *string1, int size1,
443 wchar_t *string2, int size2,
444 int *mbs_offset1, int *mbs_offset2));
445static int wcs_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
446 const char *string1, int size1,
447 const char *string2, int size2,
448 int startpos, int range,
449 struct re_registers *regs, int stop));
450static int wcs_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
451#endif
452
453
454/* These are the command codes that appear in compiled regular
455 expressions. Some opcodes are followed by argument bytes. A
456 command code can specify any interpretation whatsoever for its
457 arguments. Zero bytes may appear in the compiled regular expression. */
458
/* Opcodes of the compiled pattern.  The matcher and the debug printer
   read these values directly out of the pattern buffer (they switch on
   `(re_opcode_t) *p++'), so the numeric value of each enumerator is part
   of the compiled format: do not reorder the list.  Note that
   `exactn_bin' only exists when MBS_SUPPORT is defined, and the last five
   enumerators only when `emacs' is defined, so the values of everything
   after `exactn' depend on the configuration.  */
459typedef enum
460{
461 no_op = 0,
462
463 /* Succeed right away--no more backtracking. */
464 succeed,
465
466 /* Followed by one byte giving n, then by n literal bytes. */
467 exactn,
468
469# ifdef MBS_SUPPORT
470 /* Same as exactn, but contains binary data. */
471 exactn_bin,
472# endif
473
474 /* Matches any (more or less) character. */
475 anychar,
476
477 /* Matches any one char belonging to specified set. First
478 following byte is number of bitmap bytes. Then come bytes
479 for a bitmap saying which chars are in. Bits in each byte
480 are ordered low-bit-first. A character is in the set if its
481 bit is 1. A character too large to have a bit in the map is
482 automatically not in the set. */
483 /* ifdef MBS_SUPPORT, following element is length of character
484 classes, length of collating symbols, length of equivalence
485 classes, length of character ranges, and length of characters.
486 Next, character class element, collating symbols elements,
487 equivalence class elements, range elements, and character
488 elements follow.
489 See regex_compile function. */
490 charset,
491
492 /* Same parameters as charset, but match any character that is
493 not one of those specified. */
494 charset_not,
495
496 /* Start remembering the text that is matched, for storing in a
497 register. Followed by one byte with the register number, in
498 the range 0 to one less than the pattern buffer's re_nsub
499 field. Then followed by one byte with the number of groups
500 inner to this one. (This last has to be part of the
501 start_memory only because we need it in the on_failure_jump
502 of re_match_2.) */
503 start_memory,
504
505 /* Stop remembering the text that is matched and store it in a
506 memory register. Followed by one byte with the register
507 number, in the range 0 to one less than `re_nsub' in the
508 pattern buffer, and one byte with the number of inner groups,
509 just like `start_memory'. (We need the number of inner
510 groups here because we don't have any easy way of finding the
511 corresponding start_memory when we're at a stop_memory.) */
512 stop_memory,
513
514 /* Match a duplicate of something remembered. Followed by one
515 byte containing the register number. */
516 duplicate,
517
518 /* Fail unless at beginning of line. */
519 begline,
520
521 /* Fail unless at end of line. */
522 endline,
523
524 /* Succeeds if at beginning of buffer (if emacs) or at beginning
525 of string to be matched (if not). */
526 begbuf,
527
528 /* Analogously, for end of buffer/string. */
529 endbuf,
530
531 /* Followed by two-byte relative address to which to jump. */
532 jump,
533
534 /* Same as jump, but marks the end of an alternative. */
535 jump_past_alt,
536
537 /* Followed by two-byte relative address of place to resume at
538 in case of failure. */
539 /* ifdef MBS_SUPPORT, the size of address is 1. */
540 on_failure_jump,
541
542 /* Like on_failure_jump, but pushes a placeholder instead of the
543 current string position when executed. */
544 on_failure_keep_string_jump,
545
546 /* Throw away latest failure point and then jump to following
547 two-byte relative address. */
548 /* ifdef MBS_SUPPORT, the size of address is 1. */
549 pop_failure_jump,
550
551 /* Change to pop_failure_jump if we know we won't have to backtrack
552 to match; otherwise change to jump. This is used to jump
553 back to the beginning of a repeat. If what follows this jump
554 clearly won't match what the repeat does, such that we can be
555 sure that there is no use backtracking out of repetitions
556 already matched, then we change it to a pop_failure_jump.
557 Followed by two-byte address. */
558 /* ifdef MBS_SUPPORT, the size of address is 1. */
559 maybe_pop_jump,
560
561 /* Jump to following two-byte address, and push a dummy failure
562 point. This failure point will be thrown away if an attempt
563 is made to use it for a failure. A `+' construct makes this
564 before the first repeat. Also used as an intermediary kind
565 of jump when compiling an alternative. */
566 /* ifdef MBS_SUPPORT, the size of address is 1. */
567 dummy_failure_jump,
568
569 /* Push a dummy failure point and continue. Used at the end of
570 alternatives. */
571 push_dummy_failure,
572
573 /* Followed by two-byte relative address and two-byte number n.
574 After matching N times, jump to the address upon failure. */
575 /* ifdef MBS_SUPPORT, the size of address is 1. */
576 succeed_n,
577
578 /* Followed by two-byte relative address, and two-byte number n.
579 Jump to the address N times, then fail. */
580 /* ifdef MBS_SUPPORT, the size of address is 1. */
581 jump_n,
582
583 /* Set the following two-byte relative address to the
584 subsequent two-byte number. The address *includes* the two
585 bytes of number. */
586 /* ifdef MBS_SUPPORT, the size of address is 1. */
587 set_number_at,
588
589 wordchar, /* Matches any word-constituent character. */
590 notwordchar, /* Matches any char that is not a word-constituent. */
591
592 wordbeg, /* Succeeds if at word beginning. */
593 wordend, /* Succeeds if at word end. */
594
595 wordbound, /* Succeeds if at a word boundary. */
596 notwordbound /* Succeeds if not at a word boundary. */
597
598# ifdef emacs
599 ,before_dot, /* Succeeds if before point. */
600 at_dot, /* Succeeds if at point. */
601 after_dot, /* Succeeds if after point. */
602
603 /* Matches any character whose syntax is specified. Followed by
604 a byte which contains a syntax code, e.g., Sword. */
605 syntaxspec,
606
607 /* Matches any character whose syntax is not that specified. */
608 notsyntaxspec
609# endif /* emacs */
610} re_opcode_t;
611#endif /* not INSIDE_RECURSION */
612
613
614
615#ifdef BYTE
616# define CHAR_T char
617# define UCHAR_T unsigned char
618# define COMPILED_BUFFER_VAR bufp->buffer
619# define OFFSET_ADDRESS_SIZE 2
620# if defined (__STDC__) || defined (ALMOST_STDC) || defined (HAVE_STRINGIZE)
621# define PREFIX(name) byte_##name
622# else
623# define PREFIX(name) byte_/**/name
624# endif
625# define ARG_PREFIX(name) name
626# define PUT_CHAR(c) putchar (c)
627#else
628# ifdef WCHAR
629# define CHAR_T wchar_t
630# define UCHAR_T wchar_t
631# define COMPILED_BUFFER_VAR wc_buffer
632# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
633# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
634# if defined (__STDC__) || defined (ALMOST_STDC) || defined (HAVE_STRINGIZE)
635# define PREFIX(name) wcs_##name
636# define ARG_PREFIX(name) c##name
637# else
638# define PREFIX(name) wcs_/**/name
639# define ARG_PREFIX(name) c/**/name
640# endif
641/* Should we use wide stream?? */
642# define PUT_CHAR(c) printf ("%C", c);
643# define TRUE 1
644# define FALSE 0
645# else
646# ifdef MBS_SUPPORT
647# define WCHAR
648# define INSIDE_RECURSION
649# include "regex.c"
650# undef INSIDE_RECURSION
651# endif
652# define BYTE
653# define INSIDE_RECURSION
654# include "regex.c"
655# undef INSIDE_RECURSION
656# endif
657#endif
658
659#ifdef INSIDE_RECURSION
660/* Common operations on the compiled pattern. */
661
662/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
663/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
664
665# ifdef WCHAR
666# define STORE_NUMBER(destination, number) \
667 do { \
668 *(destination) = (UCHAR_T)(number); \
669 } while (0)
670# else /* BYTE */
671# define STORE_NUMBER(destination, number) \
672 do { \
673 (destination)[0] = (number) & 0377; \
674 (destination)[1] = (number) >> 8; \
675 } while (0)
676# endif /* WCHAR */
677
678/* Same as STORE_NUMBER, except increment DESTINATION to
679 the byte after where the number is stored. Therefore, DESTINATION
680 must be an lvalue. */
681/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
682
683# define STORE_NUMBER_AND_INCR(destination, number) \
684 do { \
685 STORE_NUMBER (destination, number); \
686 (destination) += OFFSET_ADDRESS_SIZE; \
687 } while (0)
688
/* Put into DESTINATION a number stored in the compiled pattern starting
   at SOURCE.  DESTINATION must be an lvalue; SOURCE is evaluated more
   than once in the BYTE variant, so it must have no side effects.

   WCHAR: the number occupies a single element and is copied as is.

   BYTE: the number occupies two contiguous bytes, low byte first; the
   high byte is sign-extended (via SIGN_EXTEND_CHAR) so negative offsets
   round-trip.  The high byte is scaled by multiplying rather than with
   `<< 8', because left-shifting a negative value is undefined
   behaviour in ISO C.  */

# ifdef WCHAR
#  define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source);						\
  } while (0)
# else /* BYTE */
#  define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8);	\
  } while (0)
# endif
705
706# ifdef DEBUG
707static void PREFIX(extract_number) _RE_ARGS ((int *dest, UCHAR_T *source));
708static void
709PREFIX(extract_number) (dest, source)
710 int *dest;
711 UCHAR_T *source;
712{
713# ifdef WCHAR
714 *dest = *source;
715# else /* BYTE */
716 int temp = SIGN_EXTEND_CHAR (*(source + 1));
717 *dest = *source & 0377;
718 *dest += temp << 8;
719# endif
720}
721
722# ifndef EXTRACT_MACROS /* To debug the macros. */
723# undef EXTRACT_NUMBER
724# define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
725# endif /* not EXTRACT_MACROS */
726
727# endif /* DEBUG */
728
729/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
730 SOURCE must be an lvalue. */
731
732# define EXTRACT_NUMBER_AND_INCR(destination, source) \
733 do { \
734 EXTRACT_NUMBER (destination, source); \
735 (source) += OFFSET_ADDRESS_SIZE; \
736 } while (0)
737
738# ifdef DEBUG
739static void PREFIX(extract_number_and_incr) _RE_ARGS ((int *destination,
740 UCHAR_T **source));
741static void
742PREFIX(extract_number_and_incr) (destination, source)
743 int *destination;
744 UCHAR_T **source;
745{
746 PREFIX(extract_number) (destination, *source);
747 *source += OFFSET_ADDRESS_SIZE;
748}
749
750# ifndef EXTRACT_MACROS
751# undef EXTRACT_NUMBER_AND_INCR
752# define EXTRACT_NUMBER_AND_INCR(dest, src) \
753 PREFIX(extract_number_and_incr) (&dest, &src)
754# endif /* not EXTRACT_MACROS */
755
756# endif /* DEBUG */
757
758
759
760
761/* If DEBUG is defined, Regex prints many voluminous messages about what
762 it is doing (if the variable `debug' is nonzero). If linked with the
763 main program in `iregex.c', you can enter patterns and strings
764 interactively. And if linked with the main program in `main.c' and
765 the other test files, you can run the already-written tests. */
766
767# ifdef DEBUG
768
769# ifndef DEFINED_ONCE
770
771/* We use standard I/O for debugging. */
772# include <stdio.h>
773
774/* It is useful to test things that ``must'' be true when debugging. */
775# include <assert.h>
776
/* Run-time switch for the debugging output: every DEBUG_PRINT* macro
   below does nothing unless this is nonzero.  */
777static int debug;
778

/* DEBUG_STATEMENT (e) expands to E itself.
   DEBUG_PRINTn pass their arguments straight to printf when `debug' is
   nonzero; the first argument is used as the format string, so it must
   be a trusted format.
   NOTE(review): each of these expands to a bare `if' without an `else',
   so using one as the body of an outer `if ... else' would attach the
   `else' to the macro's `if'.  Use them only as complete statements.  */
779# define DEBUG_STATEMENT(e) e
780# define DEBUG_PRINT1(x) if (debug) printf (x)
781# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
782# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
783# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
784# endif /* not DEFINED_ONCE */
785

/* Per-character-type dump helpers (they go through PREFIX, so they are
   defined once for each inclusion of the file).  Both are bare `if'
   statements like the macros above.  DEBUG_PRINT_COMPILED_PATTERN does
   not use its first argument P; only S and E are passed on.  */
786# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
787 if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
788# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
789 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
790
791
792/* Print the fastmap in human-readable form. */
793
794# ifndef DEFINED_ONCE
795void
796print_fastmap (fastmap)
797 char *fastmap;
798{
799 unsigned was_a_range = 0;
800 unsigned i = 0;
801
802 while (i < (1 << BYTEWIDTH))
803 {
804 if (fastmap[i++])
805 {
806 was_a_range = 0;
807 putchar (i - 1);
808 while (i < (1 << BYTEWIDTH) && fastmap[i])
809 {
810 was_a_range = 1;
811 i++;
812 }
813 if (was_a_range)
814 {
815 printf ("-");
816 putchar (i - 1);
817 }
818 }
819 }
820 putchar ('\n');
821}
822# endif /* not DEFINED_ONCE */
823
824
825/* Print a compiled pattern string in human-readable form, starting at
826 the START pointer into it and ending just before the pointer END. */
827
828void
829PREFIX(print_partial_compiled_pattern) (start, end)
830 UCHAR_T *start;
831 UCHAR_T *end;
832{
833 int mcnt, mcnt2;
834 UCHAR_T *p1;
835 UCHAR_T *p = start;
836 UCHAR_T *pend = end;
837
838 if (start == NULL)
839 {
840 printf ("(null)\n");
841 return;
842 }
843
844 /* Loop over pattern commands. */
845 while (p < pend)
846 {
847# ifdef _LIBC
848 printf ("%td:\t", p - start);
849# else
850 printf ("%ld:\t", (long int) (p - start));
851# endif
852
853 switch ((re_opcode_t) *p++)
854 {
855 case no_op:
856 printf ("/no_op");
857 break;
858
859 case exactn:
860 mcnt = *p++;
861 printf ("/exactn/%d", mcnt);
862 do
863 {
864 putchar ('/');
865 PUT_CHAR (*p++);
866 }
867 while (--mcnt);
868 break;
869
870# ifdef MBS_SUPPORT
871 case exactn_bin:
872 mcnt = *p++;
873 printf ("/exactn_bin/%d", mcnt);
874 do
875 {
876 printf("/%lx", (long int) *p++);
877 }
878 while (--mcnt);
879 break;
880# endif /* MBS_SUPPORT */
881
882 case start_memory:
883 mcnt = *p++;
884 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
885 break;
886
887 case stop_memory:
888 mcnt = *p++;
889 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
890 break;
891
892 case duplicate:
893 printf ("/duplicate/%ld", (long int) *p++);
894 break;
895
896 case anychar:
897 printf ("/anychar");
898 break;
899
900 case charset:
901 case charset_not:
902 {
903# ifdef WCHAR
904 int i, length;
905 wchar_t *workp = p;
906 printf ("/charset [%s",
907 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
908 p += 5;
909 length = *workp++; /* the length of char_classes */
910 for (i=0 ; i<length ; i++)
911 printf("[:%lx:]", (long int) *p++);
912 length = *workp++; /* the length of collating_symbol */
913 for (i=0 ; i<length ;)
914 {
915 printf("[.");
916 while(*p != 0)
917 PUT_CHAR((i++,*p++));
918 i++,p++;
919 printf(".]");
920 }
921 length = *workp++; /* the length of equivalence_class */
922 for (i=0 ; i<length ;)
923 {
924 printf("[=");
925 while(*p != 0)
926 PUT_CHAR((i++,*p++));
927 i++,p++;
928 printf("=]");
929 }
930 length = *workp++; /* the length of char_range */
931 for (i=0 ; i<length ; i++)
932 {
933 wchar_t range_start = *p++;
934 wchar_t range_end = *p++;
935 printf("%C-%C", range_start, range_end);
936 }
937 length = *workp++; /* the length of char */
938 for (i=0 ; i<length ; i++)
939 printf("%C", *p++);
940 putchar (']');
941# else
942 register int c, last = -100;
943 register int in_range = 0;
944
945 printf ("/charset [%s",
946 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
947
948 assert (p + *p < pend);
949
950 for (c = 0; c < 256; c++)
951 if (c / 8 < *p
952 && (p[1 + (c/8)] & (1 << (c % 8))))
953 {
954 /* Are we starting a range? */
955 if (last + 1 == c && ! in_range)
956 {
957 putchar ('-');
958 in_range = 1;
959 }
960 /* Have we broken a range? */
961 else if (last + 1 != c && in_range)
962 {
963 putchar (last);
964 in_range = 0;
965 }
966
967 if (! in_range)
968 putchar (c);
969
970 last = c;
971 }
972
973 if (in_range)
974 putchar (last);
975
976 putchar (']');
977
978 p += 1 + *p;
979# endif /* WCHAR */
980 }
981 break;
982
983 case begline:
984 printf ("/begline");
985 break;
986
987 case endline:
988 printf ("/endline");
989 break;
990
991 case on_failure_jump:
992 PREFIX(extract_number_and_incr) (&mcnt, &p);
993# ifdef _LIBC
994 printf ("/on_failure_jump to %td", p + mcnt - start);
995# else
996 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
997# endif
998 break;
999
1000 case on_failure_keep_string_jump:
1001 PREFIX(extract_number_and_incr) (&mcnt, &p);
1002# ifdef _LIBC
1003 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
1004# else
1005 printf ("/on_failure_keep_string_jump to %ld",
1006 (long int) (p + mcnt - start));
1007# endif
1008 break;
1009
1010 case dummy_failure_jump:
1011 PREFIX(extract_number_and_incr) (&mcnt, &p);
1012# ifdef _LIBC
1013 printf ("/dummy_failure_jump to %td", p + mcnt - start);
1014# else
1015 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
1016# endif
1017 break;
1018
1019 case push_dummy_failure:
1020 printf ("/push_dummy_failure");
1021 break;
1022
1023 case maybe_pop_jump:
1024 PREFIX(extract_number_and_incr) (&mcnt, &p);
1025# ifdef _LIBC
1026 printf ("/maybe_pop_jump to %td", p + mcnt - start);
1027# else
1028 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
1029# endif
1030 break;
1031
1032 case pop_failure_jump:
1033 PREFIX(extract_number_and_incr) (&mcnt, &p);
1034# ifdef _LIBC
1035 printf ("/pop_failure_jump to %td", p + mcnt - start);
1036# else
1037 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
1038# endif
1039 break;
1040
1041 case jump_past_alt:
1042 PREFIX(extract_number_and_incr) (&mcnt, &p);
1043# ifdef _LIBC
1044 printf ("/jump_past_alt to %td", p + mcnt - start);
1045# else
1046 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1047# endif
1048 break;
1049
1050 case jump:
1051 PREFIX(extract_number_and_incr) (&mcnt, &p);
1052# ifdef _LIBC
1053 printf ("/jump to %td", p + mcnt - start);
1054# else
1055 printf ("/jump to %ld", (long int) (p + mcnt - start));
1056# endif
1057 break;
1058
1059 case succeed_n:
1060 PREFIX(extract_number_and_incr) (&mcnt, &p);
1061 p1 = p + mcnt;
1062 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1063# ifdef _LIBC
1064 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1065# else
1066 printf ("/succeed_n to %ld, %d times",
1067 (long int) (p1 - start), mcnt2);
1068# endif
1069 break;
1070
1071 case jump_n:
1072 PREFIX(extract_number_and_incr) (&mcnt, &p);
1073 p1 = p + mcnt;
1074 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1075 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1076 break;
1077
1078 case set_number_at:
1079 PREFIX(extract_number_and_incr) (&mcnt, &p);
1080 p1 = p + mcnt;
1081 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1082# ifdef _LIBC
1083 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1084# else
1085 printf ("/set_number_at location %ld to %d",
1086 (long int) (p1 - start), mcnt2);
1087# endif
1088 break;
1089
1090 case wordbound:
1091 printf ("/wordbound");
1092 break;
1093
1094 case notwordbound:
1095 printf ("/notwordbound");
1096 break;
1097
1098 case wordbeg:
1099 printf ("/wordbeg");
1100 break;
1101
1102 case wordend:
1103 printf ("/wordend");
1104 break;
1105
1106# ifdef emacs
1107 case before_dot:
1108 printf ("/before_dot");
1109 break;
1110
1111 case at_dot:
1112 printf ("/at_dot");
1113 break;
1114
1115 case after_dot:
1116 printf ("/after_dot");
1117 break;
1118
1119 case syntaxspec:
1120 printf ("/syntaxspec");
1121 mcnt = *p++;
1122 printf ("/%d", mcnt);
1123 break;
1124
1125 case notsyntaxspec:
1126 printf ("/notsyntaxspec");
1127 mcnt = *p++;
1128 printf ("/%d", mcnt);
1129 break;
1130# endif /* emacs */
1131
1132 case wordchar:
1133 printf ("/wordchar");
1134 break;
1135
1136 case notwordchar:
1137 printf ("/notwordchar");
1138 break;
1139
1140 case begbuf:
1141 printf ("/begbuf");
1142 break;
1143
1144 case endbuf:
1145 printf ("/endbuf");
1146 break;
1147
1148 default:
1149 printf ("?%ld", (long int) *(p-1));
1150 }
1151
1152 putchar ('\n');
1153 }
1154
1155# ifdef _LIBC
1156 printf ("%td:\tend of pattern.\n", p - start);
1157# else
1158 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1159# endif
1160}
1161
1162
1163void
1164PREFIX(print_compiled_pattern) (bufp)
1165 struct re_pattern_buffer *bufp;
1166{
1167 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1168
1169 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1170 + bufp->used / sizeof(UCHAR_T));
1171 printf ("%ld bytes used/%ld bytes allocated.\n",
1172 bufp->used, bufp->allocated);
1173
1174 if (bufp->fastmap_accurate && bufp->fastmap)
1175 {
1176 printf ("fastmap: ");
1177 print_fastmap (bufp->fastmap);
1178 }
1179
1180# ifdef _LIBC
1181 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1182# else
1183 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1184# endif
1185 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1186 printf ("can_be_null: %d\t", bufp->can_be_null);
1187 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1188 printf ("no_sub: %d\t", bufp->no_sub);
1189 printf ("not_bol: %d\t", bufp->not_bol);
1190 printf ("not_eol: %d\t", bufp->not_eol);
1191 printf ("syntax: %lx\n", bufp->syntax);
1192 /* Perhaps we should print the translate table? */
1193}
1194
1195
1196void
1197PREFIX(print_double_string) (where, string1, size1, string2, size2)
1198 const CHAR_T *where;
1199 const CHAR_T *string1;
1200 const CHAR_T *string2;
1201 int size1;
1202 int size2;
1203{
1204 int this_char;
1205
1206 if (where == NULL)
1207 printf ("(null)");
1208 else
1209 {
1210 int cnt;
1211
1212 if (FIRST_STRING_P (where))
1213 {
1214 for (this_char = where - string1; this_char < size1; this_char++)
1215 PUT_CHAR (string1[this_char]);
1216
1217 where = string2;
1218 }
1219
1220 cnt = 0;
1221 for (this_char = where - string2; this_char < size2; this_char++)
1222 {
1223 PUT_CHAR (string2[this_char]);
1224 if (++cnt > 100)
1225 {
1226 fputs ("...", stdout);
1227 break;
1228 }
1229 }
1230 }
1231}
1232
1233# ifndef DEFINED_ONCE
/* Write the single character C to stderr.  */
void
printchar (c)
    int c;
{
  fputc (c, stderr);
}
1240# endif
1241
1242# else /* not DEBUG */
1243
/* Non-DEBUG build: `assert' and every DEBUG_* hook below are defined with
   an empty replacement list, so each use of them expands to nothing.
   The first group is guarded by DEFINED_ONCE; the last two macros are
   defined outside that guard.  */
1244# ifndef DEFINED_ONCE
1245# undef assert
1246# define assert(e)
1247
1248# define DEBUG_STATEMENT(e)
1249# define DEBUG_PRINT1(x)
1250# define DEBUG_PRINT2(x1, x2)
1251# define DEBUG_PRINT3(x1, x2, x3)
1252# define DEBUG_PRINT4(x1, x2, x3, x4)
1253# endif /* not DEFINED_ONCE */
1254# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1255# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1256
1257# endif /* not DEBUG */
1258
1259
1260
1261
1262# ifdef WCHAR
1263/* This convert a multibyte string to a wide character string.
1264 And write their correspondances to offset_buffer(see below)
1265 and write whether each wchar_t is binary data to is_binary.
1266 This assume invalid multibyte sequences as binary data.
1267 We assume offset_buffer and is_binary is already allocated
1268 enough space. */
1269
1270static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
1271 size_t len, int *offset_buffer,
1272 char *is_binary);
1273static size_t
1274convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
1275 CHAR_T *dest;
1276 const unsigned char* src;
1277 size_t len; /* the length of multibyte string. */
1278
1279 /* It hold correspondances between src(char string) and
1280 dest(wchar_t string) for optimization.
1281 e.g. src = "xxxyzz"
1282 dest = {'X', 'Y', 'Z'}
1283 (each "xxx", "y" and "zz" represent one multibyte character
1284 corresponding to 'X', 'Y' and 'Z'.)
1285 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1286 = {0, 3, 4, 6}
1287 */
1288 int *offset_buffer;
1289 char *is_binary;
1290{
1291 wchar_t *pdest = dest;
1292 const unsigned char *psrc = src;
1293 size_t wc_count = 0;
1294
1295 mbstate_t mbs;
1296 int i, consumed;
1297 size_t mb_remain = len;
1298 size_t mb_count = 0;
1299
1300 /* Initialize the conversion state. */
1301 memset (&mbs, 0, sizeof (mbstate_t));
1302
1303 offset_buffer[0] = 0;
1304 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1305 psrc += consumed)
1306 {
1307#ifdef _LIBC
1308 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs);
1309#else
1310 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1311#endif
1312
1313 if (consumed <= 0)
1314 /* failed to convert. maybe src contains binary data.
1315 So we consume 1 byte manualy. */
1316 {
1317 *pdest = *psrc;
1318 consumed = 1;
1319 is_binary[wc_count] = TRUE;
1320 }
1321 else
1322 is_binary[wc_count] = FALSE;
1323 /* In sjis encoding, we use yen sign as escape character in
1324 place of reverse solidus. So we convert 0x5c(yen sign in
1325 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1326 solidus in UCS2). */
1327 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1328 *pdest = (wchar_t) *psrc;
1329
1330 offset_buffer[wc_count + 1] = mb_count += consumed;
1331 }
1332
1333 /* Fill remain of the buffer with sentinel. */
1334 for (i = wc_count + 1 ; i <= len ; i++)
1335 offset_buffer[i] = mb_count + 1;
1336
1337 return wc_count;
1338}
1339
1340# endif /* WCHAR */
1341
1342#else /* not INSIDE_RECURSION */
1343
1344/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1345 also be assigned to arbitrarily: each pattern buffer stores its own
1346 syntax, so it can be changed between regex compilations. */
1347/* This has no initializer because initialized variables in Emacs
1348 become read-only after dumping. */
1349reg_syntax_t re_syntax_options;
1350
1351
1352/* Specify the precise syntax of regexps for compilation. This provides
1353 for compatibility for various utilities which historically have
1354 different, incompatible syntaxes.
1355
1356 The argument SYNTAX is a bit mask comprised of the various bits
1357 defined in regex.h. We return the old syntax. */
1358
1359reg_syntax_t
1360re_set_syntax (syntax)
1361 reg_syntax_t syntax;
1362{
1363 reg_syntax_t ret = re_syntax_options;
1364
1365 re_syntax_options = syntax;
1366# ifdef DEBUG
1367 if (syntax & RE_DEBUG)
1368 debug = 1;
1369 else if (debug) /* was on but now is not */
1370 debug = 0;
1371# endif /* DEBUG */
1372 return ret;
1373}
1374# ifdef _LIBC
1375weak_alias (__re_set_syntax, re_set_syntax)
1376# endif
1377
1378
1379/* This table gives an error message for each of the error codes listed
1380 in regex.h. Obviously the order here has to be same as there.
1381 POSIX doesn't require that we do anything for REG_NOERROR,
1382 but why not be nice? */
1383
/* Layout: all messages are concatenated into one char array, each one
   followed by an explicit "\0" separator.  Every REG_xxx_IDX macro is the
   byte offset of its message inside that array, computed as the previous
   offset plus `sizeof' of the previous message literal (which counts that
   literal's terminating NUL).  The literal given to each `sizeof' must
   therefore stay identical to the message text it follows.  */
1384static const char re_error_msgid[] =
1385 {
1386# define REG_NOERROR_IDX 0
1387 gettext_noop ("Success") /* REG_NOERROR */
1388 "\0"
1389# define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1390 gettext_noop ("No match") /* REG_NOMATCH */
1391 "\0"
1392# define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1393 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1394 "\0"
1395# define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1396 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1397 "\0"
1398# define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1399 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1400 "\0"
1401# define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1402 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1403 "\0"
1404# define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1405 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1406 "\0"
1407# define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1408 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1409 "\0"
1410# define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1411 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1412 "\0"
1413# define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1414 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1415 "\0"
1416# define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1417 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1418 "\0"
1419# define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1420 gettext_noop ("Invalid range end") /* REG_ERANGE */
1421 "\0"
1422# define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1423 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1424 "\0"
1425# define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1426 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1427 "\0"
1428# define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1429 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1430 "\0"
1431# define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1432 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1433 "\0"
1434# define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1435 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1436 };
1437
/* Byte offsets of the individual messages inside re_error_msgid, listed
   in the same order as the messages themselves (REG_NOERROR first,
   REG_ERPAREN last).
   NOTE(review): presumably indexed by the numeric error code from
   regex.h -- confirm against the users of this table.  */
1438static const size_t re_error_msgid_idx[] =
1439 {
1440 REG_NOERROR_IDX,
1441 REG_NOMATCH_IDX,
1442 REG_BADPAT_IDX,
1443 REG_ECOLLATE_IDX,
1444 REG_ECTYPE_IDX,
1445 REG_EESCAPE_IDX,
1446 REG_ESUBREG_IDX,
1447 REG_EBRACK_IDX,
1448 REG_EPAREN_IDX,
1449 REG_EBRACE_IDX,
1450 REG_BADBR_IDX,
1451 REG_ERANGE_IDX,
1452 REG_ESPACE_IDX,
1453 REG_BADRPT_IDX,
1454 REG_EEND_IDX,
1455 REG_ESIZE_IDX,
1456 REG_ERPAREN_IDX
1457 };
1458
1459
1460#endif /* INSIDE_RECURSION */
1461
1462#ifndef DEFINED_ONCE
1463/* Avoiding alloca during matching, to placate r_alloc. */
1464
1465/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1466 searching and matching functions should not call alloca. On some
1467 systems, alloca is implemented in terms of malloc, and if we're
1468 using the relocating allocator routines, then malloc could cause a
1469 relocation, which might (if the strings being searched are in the
1470 ralloc heap) shift the data out from underneath the regexp
1471 routines.
1472
1473 Here's another reason to avoid allocation: Emacs
1474 processes input from X in a signal handler; processing X input may
1475 call malloc; if input arrives while a matching routine is calling
1476 malloc, then we're scrod. But Emacs can't just block input while
1477 calling matching routines; then we don't notice interrupts when
1478 they come in. So, Emacs blocks input around all regexp calls
1479 except the matching calls, which it leaves unprotected, in the
1480 faith that they will not malloc. */
1481
1482/* Normally, this is fine. */
1483# define MATCH_MAY_ALLOCATE
1484
1485/* When using GNU C, we are not REALLY using the C alloca, no matter
1486 what config.h may say. So don't take precautions for it. */
1487# ifdef __GNUC__
1488# undef C_ALLOCA
1489# endif
1490
1491/* The match routines may not allocate if (1) they would do it with malloc
1492 and (2) it's not safe for them to use malloc.
1493 Note that if REL_ALLOC is defined, matching would not use malloc for the
1494 failure stack, but we would still use it for the register vectors;
1495 so REL_ALLOC should not affect this. */
1496# if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1497# undef MATCH_MAY_ALLOCATE
1498# endif
1499#endif /* not DEFINED_ONCE */
1500
1501
1502#ifdef INSIDE_RECURSION
1503/* Failure stack declarations and macros; both re_compile_fastmap and
1504 re_match_2 use a failure stack. These have to be macros because of
1505 REGEX_ALLOCATE_STACK. */
1506
1507
1508/* Number of failure points for which to initially allocate space
1509 when matching. If this number is exceeded, we allocate more
1510 space, so it is not a hard limit. */
1511# ifndef INIT_FAILURE_ALLOC
1512# define INIT_FAILURE_ALLOC 5
1513# endif
1514
1515/* Roughly the maximum number of failure points on the stack. Would be
1516 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1517 This is a variable only so users of regex can assign to it; we never
1518 change it ourselves. */
1519
/* The failure-stack limit and types come in two parallel variants.  With
   INT_IS_16BIT defined, re_max_failures and the integer member of a stack
   element are `long int' and the stack bookkeeping is `unsigned long int';
   otherwise they are `int' and `unsigned'.  In both variants the default
   limit is 4000 when MATCH_MAY_ALLOCATE is defined and 2000 when it is
   not.  */
1520# ifdef INT_IS_16BIT
1521
1522# ifndef DEFINED_ONCE
1523# if defined MATCH_MAY_ALLOCATE
1524/* 4400 was enough to cause a crash on Alpha OSF/1,
1525 whose default stack limit is 2mb. */
1526long int re_max_failures = 4000;
1527# else
1528long int re_max_failures = 2000;
1529# endif
1530# endif
1531
/* One slot of the failure stack: holds either a pointer or an integer.  */
1532union PREFIX(fail_stack_elt)
1533{
1534 UCHAR_T *pointer;
1535 long int integer;
1536};
1537
1538typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1539
/* The failure stack itself.  `size' is the capacity in elements and
   `avail' the index of the next free element; the stack is full when the
   two are equal (see FAIL_STACK_FULL).  */
1540typedef struct
1541{
1542 PREFIX(fail_stack_elt_t) *stack;
1543 unsigned long int size;
1544 unsigned long int avail; /* Offset of next open position. */
1545} PREFIX(fail_stack_type);
1546
1547# else /* not INT_IS_16BIT */
1548
1549# ifndef DEFINED_ONCE
1550# if defined MATCH_MAY_ALLOCATE
1551/* 4400 was enough to cause a crash on Alpha OSF/1,
1552 whose default stack limit is 2mb. */
1553int re_max_failures = 4000;
1554# else
1555int re_max_failures = 2000;
1556# endif
1557# endif
1558
/* One slot of the failure stack: holds either a pointer or an integer.  */
1559union PREFIX(fail_stack_elt)
1560{
1561 UCHAR_T *pointer;
1562 int integer;
1563};
1564
1565typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1566
/* The failure stack itself.  `size' is the capacity in elements and
   `avail' the index of the next free element; the stack is full when the
   two are equal (see FAIL_STACK_FULL).  */
1567typedef struct
1568{
1569 PREFIX(fail_stack_elt_t) *stack;
1570 unsigned size;
1571 unsigned avail; /* Offset of next open position. */
1572} PREFIX(fail_stack_type);
1573
1574# endif /* INT_IS_16BIT */
1575
1576# ifndef DEFINED_ONCE
1577# define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1578# define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1579# define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1580# endif
1581
1582
/* INIT_FAIL_STACK prepares the variable `fail_stack' for use and
   RESET_FAIL_STACK releases it again.

   When MATCH_MAY_ALLOCATE is defined, INIT_FAIL_STACK obtains room for
   INIT_FAILURE_ALLOC elements through REGEX_ALLOCATE_STACK and executes
   `return -2' in the function using it if that yields NULL.  Otherwise
   nothing is allocated here: only the fill level is cleared, and
   RESET_FAIL_STACK expands to nothing.  */

# ifndef MATCH_MAY_ALLOCATE

#  define INIT_FAIL_STACK() \
  do \
    { \
      fail_stack.avail = 0; \
    } \
  while (0)

#  define RESET_FAIL_STACK()

# else /* MATCH_MAY_ALLOCATE */

#  define INIT_FAIL_STACK() \
  do \
    { \
      fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
        REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
      \
      if (fail_stack.stack == NULL) \
        return -2; \
      \
      fail_stack.avail = 0; \
      fail_stack.size = INIT_FAILURE_ALLOC; \
    } \
  while (0)

#  define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)

# endif /* MATCH_MAY_ALLOCATE */
1608
1609
1610/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1611
1612 Return 1 if succeeds, and 0 if either ran out of memory
1613 allocating space for it or it was already too large.
1614
1615 REGEX_REALLOCATE_STACK requires `destination' be declared.

   The macro is a single expression.  It refuses to grow only when the
   current size already exceeds re_max_failures * MAX_FAILURE_ITEMS, so a
   stack at or just under that bound is still doubled once more.  On
   success the `size' member is doubled as well; `avail' is not touched.

   NOTE(review): the result of REGEX_REALLOCATE_STACK is stored straight
   into (fail_stack).stack before it is tested for NULL.  If that macro
   behaves like realloc, the address of the old block is lost on failure
   -- confirm against its definition.  */
1616
1617# define DOUBLE_FAIL_STACK(fail_stack) \
1618 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1619 ? 0 \
1620 : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \
1621 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1622 (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \
1623 ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\
1624 \
1625 (fail_stack).stack == NULL \
1626 ? 0 \
1627 : ((fail_stack).size <<= 1, \
1628 1)))
1629
1630
/* Push pointer POINTER on FAIL_STACK.
   Return 1 if was able to do so and 0 if ran out of memory allocating
   space to do so.

   The fullness test is made on the FAIL_STACK argument itself (avail ==
   size), not on whatever variable happens to be called `fail_stack' at
   the point of use, so the macro checks, grows and writes one and the
   same stack.  When the stack is full, DOUBLE_FAIL_STACK is tried first
   and the push is abandoned if that fails.  POINTER is parenthesised so
   that any expression may be passed.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
  (((FAIL_STACK).avail == (FAIL_STACK).size \
    && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
   ? 0 \
   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = (POINTER), \
      1))
1640
1641/* Push a pointer value onto the failure stack.
1642 Assumes the variable `fail_stack'. Probably should only
1643 be called from within `PUSH_FAILURE_POINT'. */
1644# define PUSH_FAILURE_POINTER(item) \
1645 fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)
1646
1647/* This pushes an integer-valued item onto the failure stack.
1648 Assumes the variable `fail_stack'. Probably should only
1649 be called from within `PUSH_FAILURE_POINT'. */
1650# define PUSH_FAILURE_INT(item) \
1651 fail_stack.stack[fail_stack.avail++].integer = (item)
1652
1653/* Push a fail_stack_elt_t value onto the failure stack.
1654 Assumes the variable `fail_stack'. Probably should only
1655 be called from within `PUSH_FAILURE_POINT'. */
1656# define PUSH_FAILURE_ELT(item) \
1657 fail_stack.stack[fail_stack.avail++] = (item)
1658
1659/* These three POP... operations complement the three PUSH... operations.
1660 All assume that `fail_stack' is nonempty. */
1661# define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1662# define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1663# define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1664
1665/* Used to omit pushing failure point id's when we're not debugging. */
1666# ifdef DEBUG
1667# define DEBUG_PUSH PUSH_FAILURE_INT
1668# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1669# else
1670# define DEBUG_PUSH(item)
1671# define DEBUG_POP(item_addr)
1672# endif
1673
1674
1675/* Push the information about the state we will need
1676 if we ever fail back to it.
1677
1678 Requires variables fail_stack, regstart, regend, reg_info, and
1679 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1680 be declared.
1681
1682 Does `return FAILURE_CODE' if runs out of memory.

   The macro also reads lowest_active_reg and highest_active_reg.  It
   first doubles the stack until REMAINING_AVAIL_SLOTS is at least
   NUM_FAILURE_ITEMS, and then pushes, in this order:
     - for every register from lowest_active_reg through
       highest_active_reg: regstart[], regend[] and the reg_info[] word;
     - lowest_active_reg, then highest_active_reg;
     - PATTERN_PLACE, then STRING_PLACE;
     - the failure id, through DEBUG_PUSH (which pushes nothing unless
       DEBUG is defined).
   POP_FAILURE_POINT takes these items off again in the reverse order.  */
1683
1684# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1685 do { \
1686 char *destination; \
1687 /* Must be int, so when we don't save any registers, the arithmetic \
1688 of 0 + -1 isn't done as unsigned. */ \
1689 /* Can't be int, since there is not a shred of a guarantee that int \
1690 is wide enough to hold a value of something to which pointer can \
1691 be assigned */ \
1692 active_reg_t this_reg; \
1693 \
1694 DEBUG_STATEMENT (failure_id++); \
1695 DEBUG_STATEMENT (nfailure_points_pushed++); \
1696 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1697 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1698 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1699 \
1700 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1701 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1702 \
1703 /* Ensure we have enough space allocated for what we will push. */ \
1704 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1705 { \
1706 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1707 return failure_code; \
1708 \
1709 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1710 (fail_stack).size); \
1711 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1712 } \
1713 \
1714 /* Push the info, starting with the registers. */ \
1715 DEBUG_PRINT1 ("\n"); \
1716 \
1717 if (1) \
1718 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1719 this_reg++) \
1720 { \
1721 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1722 DEBUG_STATEMENT (num_regs_pushed++); \
1723 \
1724 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1725 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1726 \
1727 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1728 PUSH_FAILURE_POINTER (regend[this_reg]); \
1729 \
1730 DEBUG_PRINT2 (" info: %p\n ", \
1731 reg_info[this_reg].word.pointer); \
1732 DEBUG_PRINT2 (" match_null=%d", \
1733 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1734 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1735 DEBUG_PRINT2 (" matched_something=%d", \
1736 MATCHED_SOMETHING (reg_info[this_reg])); \
1737 DEBUG_PRINT2 (" ever_matched=%d", \
1738 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1739 DEBUG_PRINT1 ("\n"); \
1740 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1741 } \
1742 \
1743 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1744 PUSH_FAILURE_INT (lowest_active_reg); \
1745 \
1746 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1747 PUSH_FAILURE_INT (highest_active_reg); \
1748 \
1749 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1750 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1751 PUSH_FAILURE_POINTER (pattern_place); \
1752 \
1753 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1754 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1755 size2); \
1756 DEBUG_PRINT1 ("'\n"); \
1757 PUSH_FAILURE_POINTER (string_place); \
1758 \
1759 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1760 DEBUG_PUSH (failure_id); \
1761 } while (0)
1762
1763# ifndef DEFINED_ONCE
1764/* This is the number of items that are pushed and popped on the stack
1765 for each register. */
1766# define NUM_REG_ITEMS 3
1767
1768/* Individual items aside from the registers. */
1769# ifdef DEBUG
1770# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1771# else
1772# define NUM_NONREG_ITEMS 4
1773# endif
1774
1775/* We push at most this many items on the stack. */
1776/* We used to use (num_regs - 1), which is the number of registers
1777 this regexp will save; but that was changed to 5
1778 to avoid stack overflow for a regexp with lots of parens. */
1779# define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1780
1781/* We actually push this many items: NUM_REG_ITEMS for each register
   from lowest_active_reg through highest_active_reg, plus
   NUM_NONREG_ITEMS.  The `0 ? 0 :' condition is the constant 0, so the
   register-count arm is always the one that is evaluated.  */
1782# define NUM_FAILURE_ITEMS \
1783 (((0 \
1784 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1785 * NUM_REG_ITEMS) \
1786 + NUM_NONREG_ITEMS)
1787
1788/* How many items can still be added to the stack without overflowing it. */
1789# define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1790# endif /* not DEFINED_ONCE */
1791
1792
1793/* Pops what PUSH_FAILURE_POINT pushes.
1794
1795 We restore into the parameters, all of which should be lvalues:
1796 STR -- the saved data position.
1797 PAT -- the saved pattern position.
1798 LOW_REG, HIGH_REG -- the lowest and highest active registers.
1799 REGSTART, REGEND -- arrays of string positions.
1800 REG_INFO -- array of information about each subexpression.
1801
1802 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1803 `pend', `string1', `size1', `string2', and `size2'.

   Items come off in the reverse of the push order: the failure id
   (through DEBUG_POP, which pops nothing unless DEBUG is defined), the
   string position, the pattern position, HIGH_REG, LOW_REG, and then for
   each register from HIGH_REG down to LOW_REG its info word, end and
   start.  A popped NULL string position leaves STR unchanged.  At the
   end the variable `set_regs_matched_done' is set to 0.

   The register loop is guarded by `if (1)', so the `else' arm that
   follows it (the only place that uses `highest_active_reg') is never
   executed.

   Unlike PUSH_FAILURE_POINT this expands to a bare braced block, not a
   `do ... while (0)' statement.  */
1804# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1805{ \
1806 DEBUG_STATEMENT (unsigned failure_id;) \
1807 active_reg_t this_reg; \
1808 const UCHAR_T *string_temp; \
1809 \
1810 assert (!FAIL_STACK_EMPTY ()); \
1811 \
1812 /* Remove failure points and point to how many regs pushed. */ \
1813 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1814 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1815 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1816 \
1817 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1818 \
1819 DEBUG_POP (&failure_id); \
1820 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1821 \
1822 /* If the saved string location is NULL, it came from an \
1823 on_failure_keep_string_jump opcode, and we want to throw away the \
1824 saved NULL, thus retaining our current position in the string. */ \
1825 string_temp = POP_FAILURE_POINTER (); \
1826 if (string_temp != NULL) \
1827 str = (const CHAR_T *) string_temp; \
1828 \
1829 DEBUG_PRINT2 (" Popping string %p: `", str); \
1830 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1831 DEBUG_PRINT1 ("'\n"); \
1832 \
1833 pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
1834 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1835 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1836 \
1837 /* Restore register info. */ \
1838 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1839 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1840 \
1841 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1842 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1843 \
1844 if (1) \
1845 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1846 { \
1847 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1848 \
1849 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1850 DEBUG_PRINT2 (" info: %p\n", \
1851 reg_info[this_reg].word.pointer); \
1852 \
1853 regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
1854 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1855 \
1856 regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
1857 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1858 } \
1859 else \
1860 { \
1861 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1862 { \
1863 reg_info[this_reg].word.integer = 0; \
1864 regend[this_reg] = 0; \
1865 regstart[this_reg] = 0; \
1866 } \
1867 highest_active_reg = high_reg; \
1868 } \
1869 \
1870 set_regs_matched_done = 0; \
1871 DEBUG_STATEMENT (nfailure_points_popped++); \
1872} /* POP_FAILURE_POINT */
1873
1874
1875/* Structure for per-register (a.k.a. per-group) information.
1876 Other register information, such as the
1877 starting and ending positions (which are addresses), and the list of
1878 inner groups (which is a bits list) are maintained in separate
1879 variables.
1880
1881 We are making a (strictly speaking) nonportable assumption here: that
1882 the compiler will pack our bit fields into something that fits into
1883 the type of `word', i.e., is something that fits into one item on the
1884 failure stack. */
1885
1886
1887/* Declarations and macros for re_match_2. */
1888
/* Per-register record.  `word' overlays the bit fields with a failure
   stack element, which is how PUSH_FAILURE_POINT and POP_FAILURE_POINT
   save and restore a whole record with a single push or pop of
   reg_info[...].word.  */
1889typedef union
1890{
1891 PREFIX(fail_stack_elt_t) word;
1892 struct
1893 {
1894 /* This field is one if this group can match the empty string,
1895 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1896# define MATCH_NULL_UNSET_VALUE 3
1897 unsigned match_null_string_p : 2;
 /* Accessed through IS_ACTIVE.  */
1898 unsigned is_active : 1;
 /* Accessed through MATCHED_SOMETHING; set by SET_REGS_MATCHED.  */
1899 unsigned matched_something : 1;
 /* Accessed through EVER_MATCHED_SOMETHING; set by SET_REGS_MATCHED.  */
1900 unsigned ever_matched_something : 1;
1901 } bits;
1902} PREFIX(register_info_type);
1903
1904# ifndef DEFINED_ONCE
1905# define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1906# define IS_ACTIVE(R) ((R).bits.is_active)
1907# define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1908# define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1909
1910
/* To be invoked once a real character has been matched.  Unless it has
   already run since `set_regs_matched_done' was last cleared, it marks
   every register from lowest_active_reg through highest_active_reg as
   having matched something, both now (MATCHED_SOMETHING) and ever
   (EVER_MATCHED_SOMETHING), and records that it has done so.  */
# define SET_REGS_MATCHED() \
  do \
    { \
      if (!set_regs_matched_done) \
        { \
          active_reg_t reg_no = lowest_active_reg; \
          set_regs_matched_done = 1; \
          while (reg_no <= highest_active_reg) \
            { \
              EVER_MATCHED_SOMETHING (reg_info[reg_no]) = 1; \
              MATCHED_SOMETHING (reg_info[reg_no]) = 1; \
              reg_no++; \
            } \
        } \
    } \
  while (0)
1930# endif /* not DEFINED_ONCE */
1931
1932/* Registers are set to a sentinel when they haven't yet matched. */
1933static CHAR_T PREFIX(reg_unset_dummy);
1934# define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1935# define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1936
1937/* Subroutine declarations and macros for regex_compile. */
1938static void PREFIX(store_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, int arg));
1939static void PREFIX(store_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1940 int arg1, int arg2));
1941static void PREFIX(insert_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1942 int arg, UCHAR_T *end));
1943static void PREFIX(insert_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1944 int arg1, int arg2, UCHAR_T *end));
1945static boolean PREFIX(at_begline_loc_p) _RE_ARGS ((const CHAR_T *pattern,
1946 const CHAR_T *p,
1947 reg_syntax_t syntax));
1948static boolean PREFIX(at_endline_loc_p) _RE_ARGS ((const CHAR_T *p,
1949 const CHAR_T *pend,
1950 reg_syntax_t syntax));
1951# ifdef WCHAR
1952static reg_errcode_t wcs_compile_range _RE_ARGS ((CHAR_T range_start,
1953 const CHAR_T **p_ptr,
1954 const CHAR_T *pend,
1955 char *translate,
1956 reg_syntax_t syntax,
1957 UCHAR_T *b,
1958 CHAR_T *char_set));
1959static void insert_space _RE_ARGS ((int num, CHAR_T *loc, CHAR_T *end));
1960# else /* BYTE */
1961static reg_errcode_t byte_compile_range _RE_ARGS ((unsigned int range_start,
1962 const char **p_ptr,
1963 const char *pend,
1964 char *translate,
1965 reg_syntax_t syntax,
1966 unsigned char *b));
1967# endif /* WCHAR */
1968
1969/* Fetch the next character in the uncompiled pattern---translating it
1970 if necessary. Also cast from a signed character in the constant
1971 string passed to us by the user to an unsigned char that we can use
1972 as an array index (in, e.g., `translate'). */
1973/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1974 because it is impossible to allocate 4GB array for some encodings
1975 which have 4 byte character_set like UCS4. */
1976# ifndef PATFETCH
1977# ifdef WCHAR
1978# define PATFETCH(c) \
1979 do {if (p == pend) return REG_EEND; \
1980 c = (UCHAR_T) *p++; \
1981 if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \
1982 } while (0)
1983# else /* BYTE */
1984# define PATFETCH(c) \
1985 do {if (p == pend) return REG_EEND; \
1986 c = (unsigned char) *p++; \
1987 if (translate) c = (unsigned char) translate[c]; \
1988 } while (0)
1989# endif /* WCHAR */
1990# endif
1991
1992/* Fetch the next character in the uncompiled pattern, with no
1993 translation. */
1994# define PATFETCH_RAW(c) \
1995 do {if (p == pend) return REG_EEND; \
1996 c = (UCHAR_T) *p++; \
1997 } while (0)
1998
1999/* Go backwards one character in the pattern. */
2000# define PATUNFETCH p--
2001
2002
/* TRANSLATE (CH): yield CH mapped through `translate', or CH unchanged
   when no translate table is installed.

   The subscript is cast to `unsigned char': pattern data may be
   declared `char *' (which avoids warnings when a string constant is
   passed in), but a character used as an array index must not be
   negative.  The looked-up entry is cast to `char'.

   In the WCHAR build only values <= 0xff are looked up; a table for a
   4-byte character set such as UCS4 would need about 4GB.

   CH may be evaluated more than once.  A definition of TRANSLATE
   supplied earlier takes precedence over this one.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(ch)                                                \
  ((!translate || ((UCHAR_T) (ch)) > 0xff)                              \
   ? (ch)                                                               \
   : (char) translate[(unsigned char) (ch)])
#  else /* BYTE */
#   define TRANSLATE(ch)                                                \
  (!translate ? (ch) : (char) translate[(unsigned char) (ch)])
#  endif /* WCHAR */
# endif
2021
2022
2023/* Macros for outputting the compiled pattern into `buffer'. */
2024
/* Size in bytes given to the compiled-pattern buffer when it comes in
   unallocated: room for 32 UCHAR_T elements.  */
# define INIT_BUF_SIZE (sizeof (UCHAR_T) * 32)
2027
/* GET_BUFFER_SPACE (COUNT): keep calling EXTEND_BUFFER until COUNT more
   elements fit between the write cursor `b' and the end of the
   allocation recorded in bufp->allocated.  In the BYTE build an element
   is one byte; in the WCHAR build the request is scaled by
   sizeof (CHAR_T).  Any early return performed by EXTEND_BUFFER leaves
   the enclosing function.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(count)                                       \
    while (bufp->allocated                                              \
           < ((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR     \
              + (count)*sizeof(CHAR_T)))                                \
      EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(count)                                       \
    while (bufp->allocated                                              \
           < (unsigned long) (b - bufp->buffer + (count)))              \
      EXTEND_BUFFER ()
# endif /* WCHAR */
2039
/* BUF_PUSH (VAL): reserve room for one more element, then append VAL
   (converted to UCHAR_T) at `b' and advance `b'.  */
# define BUF_PUSH(val)                                                  \
  do                                                                    \
    {                                                                   \
      GET_BUFFER_SPACE (1);                                             \
      *b++ = (UCHAR_T) (val);                                           \
    }                                                                   \
  while (0)
2046
2047
/* BUF_PUSH_2 (V1, V2): reserve room for two more elements, then append
   V1 followed by V2 (each converted to UCHAR_T), advancing `b' past
   both.  */
# define BUF_PUSH_2(v1, v2)                                             \
  do                                                                    \
    {                                                                   \
      GET_BUFFER_SPACE (2);                                             \
      *b++ = (UCHAR_T) (v1);                                            \
      *b++ = (UCHAR_T) (v2);                                            \
    }                                                                   \
  while (0)
2055
2056
/* BUF_PUSH_3 (V1, V2, V3): the three-element counterpart of BUF_PUSH_2.
   Space for all three is reserved up front, then V1, V2 and V3 are
   appended in that order.  */
# define BUF_PUSH_3(v1, v2, v3)                                         \
  do                                                                    \
    {                                                                   \
      GET_BUFFER_SPACE (3);                                             \
      *b++ = (UCHAR_T) (v1);                                            \
      *b++ = (UCHAR_T) (v2);                                            \
      *b++ = (UCHAR_T) (v3);                                            \
    }                                                                   \
  while (0)
2065
/* Jump emitters.  Each one computes the displacement from AT to DEST,
   less the 1 + OFFSET_ADDRESS_SIZE elements occupied by the jump
   itself, narrows it to `int' and hands it to the matching
   store/insert helper.  */

/* Store at AT a jump with opcode OP that targets DEST.  */
# define STORE_JUMP(op, at, dest)                                       \
  PREFIX(store_op1) (op, at,                                            \
                     (int) ((dest) - (at) - (1 + OFFSET_ADDRESS_SIZE)))

/* Same as STORE_JUMP, for a jump that carries the extra argument ARG.  */
# define STORE_JUMP2(op, at, dest, arg)                                 \
  PREFIX(store_op2) (op, at,                                            \
                     (int) ((dest) - (at) - (1 + OFFSET_ADDRESS_SIZE)), \
                     arg)

/* Inserting counterpart of STORE_JUMP; `b' is passed along as the
   current end of the buffer.  */
# define INSERT_JUMP(op, at, dest)                                      \
  PREFIX(insert_op1) (op, at,                                           \
                      (int) ((dest) - (at) - (1 + OFFSET_ADDRESS_SIZE)), \
                      b)

/* Inserting counterpart of STORE_JUMP2; `b' is passed along as the
   current end of the buffer.  */
# define INSERT_JUMP2(op, at, dest, arg)                                \
  PREFIX(insert_op2) (op, at,                                           \
                      (int) ((dest) - (at) - (1 + OFFSET_ADDRESS_SIZE)), \
                      arg, b)
2083
/* MAX_BUF_SIZE is not an arbitrary limit: the arguments that encode
   offsets into the pattern are two bytes long, so moving beyond 2^16
   bytes would require changes in many places.

   A compiler whose allocation limit lies below 2^16 bytes (as with
   16-bit MSC) has to lower MAX_BUF_SIZE somewhat in the same way as is
   done for MSC below; otherwise a reallocation to 0 bytes may be
   requested, which will not work well.  */
# ifndef DEFINED_ONCE
#  if defined _MSC_VER && !defined WIN32
/* The 16-bit versions of Microsoft C cap malloc at roughly 65512
   bytes.  The cast inside REALLOC is optional; it merely silences a
   number of conversion warnings.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(ptr, nbytes) realloc ((ptr), (size_t) (nbytes))
#  else
#   define MAX_BUF_SIZE 65536L
#   define REALLOC(ptr, nbytes) realloc ((ptr), (nbytes))
#  endif

/* Helpers used by EXTEND_BUFFER after the buffer has been reallocated.

   MOVE_BUFFER_POINTER (PTR) shifts PTR by `incr', a variable that must
   be in scope where the macro is expanded.  With bounded pointers the
   low bound is shifted as well and the high bound is recomputed from
   bufp->allocated.

   ELSE_EXTEND_BUFFER_HIGH_BOUND supplies the `else' branch taken when
   the buffer did not move: with bounded pointers it refreshes the high
   bound of `b', `begalt' and of `fixup_alt_jump', `laststart' and
   `pending_exact' when those are non-null; otherwise it expands to
   nothing.  */
#  if __BOUNDED_POINTERS__
#   define SET_HIGH_BOUND(ptr)                                          \
  (__ptrhigh (ptr) = __ptrlow (ptr) + bufp->allocated)
#   define MOVE_BUFFER_POINTER(ptr)                                     \
  (__ptrlow (ptr) += incr, SET_HIGH_BOUND (ptr), __ptrvalue (ptr) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND                                \
  else                                                                  \
    {                                                                   \
      SET_HIGH_BOUND (b);                                               \
      SET_HIGH_BOUND (begalt);                                          \
      if (fixup_alt_jump)                                               \
        SET_HIGH_BOUND (fixup_alt_jump);                                \
      if (laststart)                                                    \
        SET_HIGH_BOUND (laststart);                                     \
      if (pending_exact)                                                \
        SET_HIGH_BOUND (pending_exact);                                 \
    }
#  else
#   define MOVE_BUFFER_POINTER(ptr) ((ptr) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND
#  endif
# endif /* not DEFINED_ONCE */
2129
/* EXTEND_BUFFER (): grow the compiled-pattern buffer.

   bufp->allocated is doubled and clamped to MAX_BUF_SIZE, the buffer is
   reallocated to that size, and if the block moved, `b', `begalt' and
   (when non-null) `fixup_alt_jump', `laststart' and `pending_exact' are
   rebased onto the new block.

   The enclosing function returns REG_ESIZE when the buffer cannot grow
   any further and REG_ESPACE when the reallocation yields NULL.

   In the WCHAR build the new size is rounded down to a whole number of
   UCHAR_T elements (at least one) and bufp->buffer is kept in step
   with COMPILED_BUFFER_VAR.

   NOTE(review): on a failed reallocation the previous block is only
   remembered in a local of this macro -- confirm whether callers can
   still release it.  */
# ifdef WCHAR
#  define EXTEND_BUFFER()                                               \
  do                                                                    \
    {                                                                   \
      UCHAR_T *prev_buffer = COMPILED_BUFFER_VAR;                       \
      int n_wchars;                                                     \
      if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE)             \
        return REG_ESIZE;                                               \
      bufp->allocated <<= 1;                                            \
      if (bufp->allocated > MAX_BUF_SIZE)                               \
        bufp->allocated = MAX_BUF_SIZE;                                 \
      /* Number of characters the enlarged buffer can hold.  */         \
      n_wchars = bufp->allocated / sizeof(UCHAR_T);                     \
      if (!n_wchars)                                                    \
        n_wchars = 1;                                                   \
      /* Round the byte size down to a multiple of the element size.  */ \
      bufp->allocated = n_wchars * sizeof(UCHAR_T);                     \
      RETALLOC (COMPILED_BUFFER_VAR, n_wchars, UCHAR_T);                \
      bufp->buffer = (char*)COMPILED_BUFFER_VAR;                        \
      if (!COMPILED_BUFFER_VAR)                                         \
        return REG_ESPACE;                                              \
      /* A relocated block invalidates every pointer into it.  */       \
      if (COMPILED_BUFFER_VAR != prev_buffer)                           \
        {                                                               \
          int incr = COMPILED_BUFFER_VAR - prev_buffer;                 \
          MOVE_BUFFER_POINTER (b);                                      \
          MOVE_BUFFER_POINTER (begalt);                                 \
          if (fixup_alt_jump)                                           \
            MOVE_BUFFER_POINTER (fixup_alt_jump);                       \
          if (laststart)                                                \
            MOVE_BUFFER_POINTER (laststart);                            \
          if (pending_exact)                                            \
            MOVE_BUFFER_POINTER (pending_exact);                        \
        }                                                               \
      ELSE_EXTEND_BUFFER_HIGH_BOUND                                     \
    }                                                                   \
  while (0)
# else /* BYTE */
#  define EXTEND_BUFFER()                                               \
  do                                                                    \
    {                                                                   \
      UCHAR_T *prev_buffer = COMPILED_BUFFER_VAR;                       \
      if (bufp->allocated == MAX_BUF_SIZE)                              \
        return REG_ESIZE;                                               \
      bufp->allocated <<= 1;                                            \
      if (bufp->allocated > MAX_BUF_SIZE)                               \
        bufp->allocated = MAX_BUF_SIZE;                                 \
      bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR,          \
                                          bufp->allocated);             \
      if (!COMPILED_BUFFER_VAR)                                         \
        return REG_ESPACE;                                              \
      /* A relocated block invalidates every pointer into it.  */       \
      if (COMPILED_BUFFER_VAR != prev_buffer)                           \
        {                                                               \
          int incr = COMPILED_BUFFER_VAR - prev_buffer;                 \
          MOVE_BUFFER_POINTER (b);                                      \
          MOVE_BUFFER_POINTER (begalt);                                 \
          if (fixup_alt_jump)                                           \
            MOVE_BUFFER_POINTER (fixup_alt_jump);                       \
          if (laststart)                                                \
            MOVE_BUFFER_POINTER (laststart);                            \
          if (pending_exact)                                            \
            MOVE_BUFFER_POINTER (pending_exact);                        \
        }                                                               \
      ELSE_EXTEND_BUFFER_HIGH_BOUND                                     \
    }                                                                   \
  while (0)
# endif /* WCHAR */
2193
# ifndef DEFINED_ONCE
/* {start,stop}_memory keep the register number argument in a single
   byte, so this is the highest group number that can be reported on.  */
#  define MAX_REGNUM 255

/* A pattern may nevertheless contain more than MAX_REGNUM registers;
   the surplus ones are simply ignored.  */
typedef unsigned int regnum_t;


/* Support for the compile stack.  */

/* An offset into the compiled pattern.  Offsets run forwards as well as
   backwards, so the type has to cover -(MAX_BUF_SIZE - 1) through
   MAX_BUF_SIZE - 1; `int' may be too small where sizeof (int) == 2.  */
typedef long int pattern_offset_t;

/* One entry of the compile stack: four pattern offsets plus the
   register number that belongs to the entry.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t inner_group_offset;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of SIZE entries of which the first
   AVAIL are in use.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned int size;
  unsigned int avail;           /* Index of the next free slot.  */
} compile_stack_type;


/* Number of entries the compile stack starts out with.  */
#  define INIT_COMPILE_STACK_SIZE 32

/* Predicates on the variable `compile_stack' of the expanding scope.  */
#  define COMPILE_STACK_EMPTY (!compile_stack.avail)
#  define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next free entry, i.e. the slot at index `avail'.  */
#  define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])

# endif /* not DEFINED_ONCE */
2239
/* SET_LIST_BIT (C): set the bit that stands for character C in the
   bitmap `b' points at.  C is reduced to `unsigned char'; the byte is
   selected by C / BYTEWIDTH and the bit within it by C % BYTEWIDTH.

   Both uses of C are parenthesized before the cast is applied.
   Otherwise an argument such as `x >> 8' would have the cast bind to
   `x' alone in the bit computation, so the byte index and the bit index
   would be derived from different values.

   C is evaluated twice; `b' and BYTEWIDTH come from the expanding
   scope.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c)                                               \
  (b[((unsigned char) (c)) / BYTEWIDTH]                                 \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* DEFINED_ONCE */
2246
/* GET_UNSIGNED_NUMBER (VALUE): read a run of decimal digits from the
   uncompiled pattern and accumulate it in VALUE.

   Characters are fetched with PATFETCH into `c' until the pattern ends
   or a non-digit has been fetched; that non-digit stays in `c'.  A
   negative VALUE is reset to 0 when the first digit is seen, so a VALUE
   that is still negative afterwards means no digit was read.  Once
   VALUE exceeds RE_DUP_MAX further digits are consumed but no longer
   accumulated.

   Expands to a brace block, not a `do ... while (0)' statement.  */
# define GET_UNSIGNED_NUMBER(value)                                     \
  {                                                                     \
    while (p != pend)                                                   \
      {                                                                 \
        PATFETCH (c);                                                   \
        if (!(c >= '0' && c <= '9'))                                    \
          break;                                                        \
        /* Too large already: swallow the digit, add nothing.  */       \
        if ((value) > RE_DUP_MAX)                                       \
          continue;                                                     \
        if ((value) < 0)                                                \
          (value) = 0;                                                  \
        (value) = (value) * 10 + c - '0';                               \
      }                                                                 \
  }
2263
/* CHAR_CLASS_MAX_LENGTH is the longest character class name that is
   accepted; IS_CHAR_CLASS (NAME) is nonzero when NAME names a
   character class.  */
# ifndef DEFINED_ONCE
#  if defined _LIBC || WIDE_CHAR_SUPPORT
/* The GNU C library offers user-defined character classes together
   with the functions of ISO C amendment 1, so names are resolved with
   wctype (__wctype inside the library itself).  */
#   ifdef CHARCLASS_NAME_MAX
#    define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
#   else
/* CHARCLASS_NAME_MAX ought to be defined here, yet some implementations
   may still lack it; settle for a reasonable default.  */
#    define CHAR_CLASS_MAX_LENGTH 256
#   endif

#   ifdef _LIBC
#    define IS_CHAR_CLASS(name) __wctype (name)
#   else
#    define IS_CHAR_CLASS(name) wctype (name)
#   endif
#  else
/* Only the twelve fixed class names below are known; the longest of
   them is `xdigit'.  */
#   define CHAR_CLASS_MAX_LENGTH 6

#   define IS_CHAR_CLASS(name)                                          \
   (STREQ (name, "alnum") || STREQ (name, "alpha")                      \
    || STREQ (name, "blank") || STREQ (name, "cntrl")                   \
    || STREQ (name, "digit") || STREQ (name, "graph")                   \
    || STREQ (name, "lower") || STREQ (name, "print")                   \
    || STREQ (name, "punct") || STREQ (name, "space")                   \
    || STREQ (name, "upper") || STREQ (name, "xdigit"))
#  endif
# endif /* DEFINED_ONCE */
2293
2294
2295# ifndef MATCH_MAY_ALLOCATE
2296
2297/* If we cannot allocate large objects within re_match_2_internal,
2298 we make the fail stack and register vectors global.
2299 The fail stack, we grow to the maximum size when a regexp
2300 is compiled.
2301 The register vectors, we adjust in size each time we
2302 compile a regexp, according to the number of registers it needs. */
2303
2304static PREFIX(fail_stack_type) fail_stack;
2305
2306/* Size with which the following vectors are currently allocated.
2307 That is so we can make them bigger as needed,
2308 but never make them smaller. */
2309# ifdef DEFINED_ONCE
2310static int regs_allocated_size;
2311
2312static const char ** regstart, ** regend;
2313static const char ** old_regstart, ** old_regend;
2314static const char **best_regstart, **best_regend;
2315static const char **reg_dummy;
2316# endif /* DEFINED_ONCE */
2317
2318static PREFIX(register_info_type) *PREFIX(reg_info);
2319static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2320
2321/* Make the register vectors big enough for NUM_REGS registers,
2322 but don't make them smaller. */
2323
2324static void
2325PREFIX(regex_grow_registers) (num_regs)
2326 int num_regs;
2327{
2328 if (num_regs > regs_allocated_size)
2329 {
2330 RETALLOC_IF (regstart, num_regs, const char *);
2331 RETALLOC_IF (regend, num_regs, const char *);
2332 RETALLOC_IF (old_regstart, num_regs, const char *);
2333 RETALLOC_IF (old_regend, num_regs, const char *);
2334 RETALLOC_IF (best_regstart, num_regs, const char *);
2335 RETALLOC_IF (best_regend, num_regs, const char *);
2336 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2337 RETALLOC_IF (reg_dummy, num_regs, const char *);
2338 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2339
2340 regs_allocated_size = num_regs;
2341 }
2342}
2343
2344# endif /* not MATCH_MAY_ALLOCATE */
2345
2346
2347# ifndef DEFINED_ONCE
2348static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2349 compile_stack,
2350 regnum_t regnum));
2351# endif /* not DEFINED_ONCE */
2352
2353/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2354 Returns one of error codes defined in `regex.h', or zero for success.
2355
2356 Assumes the `allocated' (and perhaps `buffer') and `translate'
2357 fields are set in BUFP on entry.
2358
2359 If it succeeds, results are put in BUFP (if it returns an error, the
2360 contents of BUFP are undefined):
2361 `buffer' is the compiled pattern;
2362 `syntax' is set to SYNTAX;
2363 `used' is set to the length of the compiled pattern;
2364 `fastmap_accurate' is zero;
2365 `re_nsub' is the number of subexpressions in PATTERN;
2366 `not_bol' and `not_eol' are zero;
2367
2368 The `fastmap' and `newline_anchor' fields are neither
2369 examined nor set. */
2370
/* FREE_STACK_RETURN (CODE): release the storage acquired during
   compilation and make the enclosing function return CODE.  The compile
   stack is always freed; the WCHAR build frees `pattern', `mbs_offset'
   and `is_binary' as well.  CODE is evaluated after the frees.  */
# ifdef WCHAR
#  define FREE_STACK_RETURN(code)                                       \
  do                                                                    \
    {                                                                   \
      free (pattern);                                                   \
      free (mbs_offset);                                                \
      free (is_binary);                                                 \
      free (compile_stack.stack);                                       \
      return (code);                                                    \
    }                                                                   \
  while (0)
# else
#  define FREE_STACK_RETURN(code)                                       \
  do                                                                    \
    {                                                                   \
      free (compile_stack.stack);                                       \
      return (code);                                                    \
    }                                                                   \
  while (0)
# endif /* WCHAR */
2379
2380static reg_errcode_t
2381PREFIX(regex_compile) (ARG_PREFIX(pattern), ARG_PREFIX(size), syntax, bufp)
2382 const char *ARG_PREFIX(pattern);
2383 size_t ARG_PREFIX(size);
2384 reg_syntax_t syntax;
2385 struct re_pattern_buffer *bufp;
2386{
2387 /* We fetch characters from PATTERN here. Even though PATTERN is
2388 `char *' (i.e., signed), we declare these variables as unsigned, so
2389 they can be reliably used as array indices. */
2390 register UCHAR_T c, c1;
2391
2392#ifdef WCHAR
2393 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2394 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2395 size_t size;
2396 /* offset buffer for optimization. See convert_mbs_to_wc. */
2397 int *mbs_offset = NULL;
2398 /* It hold whether each wchar_t is binary data or not. */
2399 char *is_binary = NULL;
2400 /* A flag whether exactn is handling binary data or not. */
2401 char is_exactn_bin = FALSE;
2402#endif /* WCHAR */
2403
2404 /* A random temporary spot in PATTERN. */
2405 const CHAR_T *p1;
2406
2407 /* Points to the end of the buffer, where we should append. */
2408 register UCHAR_T *b;
2409
2410 /* Keeps track of unclosed groups. */
2411 compile_stack_type compile_stack;
2412
2413 /* Points to the current (ending) position in the pattern. */
2414#ifdef WCHAR
2415 const CHAR_T *p;
2416 const CHAR_T *pend;
2417#else /* BYTE */
2418 const CHAR_T *p = pattern;
2419 const CHAR_T *pend = pattern + size;
2420#endif /* WCHAR */
2421
2422 /* How to translate the characters in the pattern. */
2423 RE_TRANSLATE_TYPE translate = bufp->translate;
2424
2425 /* Address of the count-byte of the most recently inserted `exactn'
2426 command. This makes it possible to tell if a new exact-match
2427 character can be added to that command or if the character requires
2428 a new `exactn' command. */
2429 UCHAR_T *pending_exact = 0;
2430
2431 /* Address of start of the most recently finished expression.
2432 This tells, e.g., postfix * where to find the start of its
2433 operand. Reset at the beginning of groups and alternatives. */
2434 UCHAR_T *laststart = 0;
2435
2436 /* Address of beginning of regexp, or inside of last group. */
2437 UCHAR_T *begalt;
2438
2439 /* Address of the place where a forward jump should go to the end of
2440 the containing expression. Each alternative of an `or' -- except the
2441 last -- ends with a forward jump of this sort. */
2442 UCHAR_T *fixup_alt_jump = 0;
2443
2444 /* Counts open-groups as they are encountered. Remembered for the
2445 matching close-group on the compile stack, so the same register
2446 number is put in the stop_memory as the start_memory. */
2447 regnum_t regnum = 0;
2448
2449#ifdef WCHAR
2450 /* Initialize the wchar_t PATTERN and offset_buffer. */
2451 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2452 mbs_offset = TALLOC(csize + 1, int);
2453 is_binary = TALLOC(csize + 1, char);
2454 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2455 {
2456 free(pattern);
2457 free(mbs_offset);
2458 free(is_binary);
2459 return REG_ESPACE;
2460 }
2461 pattern[csize] = L'\0'; /* sentinel */
2462 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2463 pend = p + size;
2464 if (size < 0)
2465 {
2466 free(pattern);
2467 free(mbs_offset);
2468 free(is_binary);
2469 return REG_BADPAT;
2470 }
2471#endif
2472
2473#ifdef DEBUG
2474 DEBUG_PRINT1 ("\nCompiling pattern: ");
2475 if (debug)
2476 {
2477 unsigned debug_count;
2478
2479 for (debug_count = 0; debug_count < size; debug_count++)
2480 PUT_CHAR (pattern[debug_count]);
2481 putchar ('\n');
2482 }
2483#endif /* DEBUG */
2484
2485 /* Initialize the compile stack. */
2486 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2487 if (compile_stack.stack == NULL)
2488 {
2489#ifdef WCHAR
2490 free(pattern);
2491 free(mbs_offset);
2492 free(is_binary);
2493#endif
2494 return REG_ESPACE;
2495 }
2496
2497 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2498 compile_stack.avail = 0;
2499
2500 /* Initialize the pattern buffer. */
2501 bufp->syntax = syntax;
2502 bufp->fastmap_accurate = 0;
2503 bufp->not_bol = bufp->not_eol = 0;
2504
2505 /* Set `used' to zero, so that if we return an error, the pattern
2506 printer (for debugging) will think there's no pattern. We reset it
2507 at the end. */
2508 bufp->used = 0;
2509
2510 /* Always count groups, whether or not bufp->no_sub is set. */
2511 bufp->re_nsub = 0;
2512
2513#if !defined emacs && !defined SYNTAX_TABLE
2514 /* Initialize the syntax table. */
2515 init_syntax_once ();
2516#endif
2517
2518 if (bufp->allocated == 0)
2519 {
2520 if (bufp->buffer)
2521 { /* If zero allocated, but buffer is non-null, try to realloc
2522 enough space. This loses if buffer's address is bogus, but
2523 that is the user's responsibility. */
2524#ifdef WCHAR
2525 /* Free bufp->buffer and allocate an array for wchar_t pattern
2526 buffer. */
2527 free(bufp->buffer);
2528 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2529 UCHAR_T);
2530#else
2531 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2532#endif /* WCHAR */
2533 }
2534 else
2535 { /* Caller did not allocate a buffer. Do it for them. */
2536 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2537 UCHAR_T);
2538 }
2539
2540 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2541#ifdef WCHAR
2542 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2543#endif /* WCHAR */
2544 bufp->allocated = INIT_BUF_SIZE;
2545 }
2546#ifdef WCHAR
2547 else
2548 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2549#endif
2550
2551 begalt = b = COMPILED_BUFFER_VAR;
2552
2553 /* Loop through the uncompiled pattern until we're at the end. */
2554 while (p != pend)
2555 {
2556 PATFETCH (c);
2557
2558 switch (c)
2559 {
2560 case '^':
2561 {
2562 if ( /* If at start of pattern, it's an operator. */
2563 p == pattern + 1
2564 /* If context independent, it's an operator. */
2565 || syntax & RE_CONTEXT_INDEP_ANCHORS
2566 /* Otherwise, depends on what's come before. */
2567 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2568 BUF_PUSH (begline);
2569 else
2570 goto normal_char;
2571 }
2572 break;
2573
2574
2575 case '$':
2576 {
2577 if ( /* If at end of pattern, it's an operator. */
2578 p == pend
2579 /* If context independent, it's an operator. */
2580 || syntax & RE_CONTEXT_INDEP_ANCHORS
2581 /* Otherwise, depends on what's next. */
2582 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2583 BUF_PUSH (endline);
2584 else
2585 goto normal_char;
2586 }
2587 break;
2588
2589
2590 case '+':
2591 case '?':
2592 if ((syntax & RE_BK_PLUS_QM)
2593 || (syntax & RE_LIMITED_OPS))
2594 goto normal_char;
2595 handle_plus:
2596 case '*':
2597 /* If there is no previous pattern... */
2598 if (!laststart)
2599 {
2600 if (syntax & RE_CONTEXT_INVALID_OPS)
2601 FREE_STACK_RETURN (REG_BADRPT);
2602 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2603 goto normal_char;
2604 }
2605
2606 {
2607 /* Are we optimizing this jump? */
2608 boolean keep_string_p = false;
2609
2610 /* 1 means zero (many) matches is allowed. */
2611 char zero_times_ok = 0, many_times_ok = 0;
2612
2613 /* If there is a sequence of repetition chars, collapse it
2614 down to just one (the right one). We can't combine
2615 interval operators with these because of, e.g., `a{2}*',
2616 which should only match an even number of `a's. */
2617
2618 for (;;)
2619 {
2620 zero_times_ok |= c != '+';
2621 many_times_ok |= c != '?';
2622
2623 if (p == pend)
2624 break;
2625
2626 PATFETCH (c);
2627
2628 if (c == '*'
2629 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2630 ;
2631
2632 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2633 {
2634 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2635
2636 PATFETCH (c1);
2637 if (!(c1 == '+' || c1 == '?'))
2638 {
2639 PATUNFETCH;
2640 PATUNFETCH;
2641 break;
2642 }
2643
2644 c = c1;
2645 }
2646 else
2647 {
2648 PATUNFETCH;
2649 break;
2650 }
2651
2652 /* If we get here, we found another repeat character. */
2653 }
2654
2655 /* Star, etc. applied to an empty pattern is equivalent
2656 to an empty pattern. */
2657 if (!laststart)
2658 break;
2659
2660 /* Now we know whether or not zero matches is allowed
2661 and also whether or not two or more matches is allowed. */
2662 if (many_times_ok)
2663 { /* More than one repetition is allowed, so put in at the
2664 end a backward relative jump from `b' to before the next
2665 jump we're going to put in below (which jumps from
2666 laststart to after this jump).
2667
2668 But if we are at the `*' in the exact sequence `.*\n',
2669 insert an unconditional jump backwards to the .,
2670 instead of the beginning of the loop. This way we only
2671 push a failure point once, instead of every time
2672 through the loop. */
2673 assert (p - 1 > pattern);
2674
2675 /* Allocate the space for the jump. */
2676 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2677
2678 /* We know we are not at the first character of the pattern,
2679 because laststart was nonzero. And we've already
2680 incremented `p', by the way, to be the character after
2681 the `*'. Do we have to do something analogous here
2682 for null bytes, because of RE_DOT_NOT_NULL? */
2683 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2684 && zero_times_ok
2685 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2686 && !(syntax & RE_DOT_NEWLINE))
2687 { /* We have .*\n. */
2688 STORE_JUMP (jump, b, laststart);
2689 keep_string_p = true;
2690 }
2691 else
2692 /* Anything else. */
2693 STORE_JUMP (maybe_pop_jump, b, laststart -
2694 (1 + OFFSET_ADDRESS_SIZE));
2695
2696 /* We've added more stuff to the buffer. */
2697 b += 1 + OFFSET_ADDRESS_SIZE;
2698 }
2699
2700 /* On failure, jump from laststart to b + 3, which will be the
2701 end of the buffer after this jump is inserted. */
2702 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2703 'b + 3'. */
2704 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2705 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2706 : on_failure_jump,
2707 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2708 pending_exact = 0;
2709 b += 1 + OFFSET_ADDRESS_SIZE;
2710
2711 if (!zero_times_ok)
2712 {
2713 /* At least one repetition is required, so insert a
2714 `dummy_failure_jump' before the initial
2715 `on_failure_jump' instruction of the loop. This
2716 effects a skip over that instruction the first time
2717 we hit that loop. */
2718 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2719 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2720 2 + 2 * OFFSET_ADDRESS_SIZE);
2721 b += 1 + OFFSET_ADDRESS_SIZE;
2722 }
2723 }
2724 break;
2725
2726
2727 case '.':
2728 laststart = b;
2729 BUF_PUSH (anychar);
2730 break;
2731
2732
2733 case '[':
2734 {
2735 boolean had_char_class = false;
2736#ifdef WCHAR
2737 CHAR_T range_start = 0xffffffff;
2738#else
2739 unsigned int range_start = 0xffffffff;
2740#endif
2741 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2742
2743#ifdef WCHAR
2744 /* We assume a charset(_not) structure as a wchar_t array.
2745 charset[0] = (re_opcode_t) charset(_not)
2746 charset[1] = l (= length of char_classes)
2747 charset[2] = m (= length of collating_symbols)
2748 charset[3] = n (= length of equivalence_classes)
2749 charset[4] = o (= length of char_ranges)
2750 charset[5] = p (= length of chars)
2751
2752 charset[6] = char_class (wctype_t)
2753 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2754 ...
2755 charset[l+5] = char_class (wctype_t)
2756
2757 charset[l+6] = collating_symbol (wchar_t)
2758 ...
2759 charset[l+m+5] = collating_symbol (wchar_t)
2760 ifdef _LIBC we use the index if
2761 _NL_COLLATE_SYMB_EXTRAMB instead of
2762 wchar_t string.
2763
2764 charset[l+m+6] = equivalence_classes (wchar_t)
2765 ...
2766 charset[l+m+n+5] = equivalence_classes (wchar_t)
2767 ifdef _LIBC we use the index in
2768 _NL_COLLATE_WEIGHT instead of
2769 wchar_t string.
2770
2771 charset[l+m+n+6] = range_start
2772 charset[l+m+n+7] = range_end
2773 ...
2774 charset[l+m+n+2o+4] = range_start
2775 charset[l+m+n+2o+5] = range_end
2776 ifdef _LIBC we use the value looked up
2777 in _NL_COLLATE_COLLSEQ instead of
2778 wchar_t character.
2779
2780 charset[l+m+n+2o+6] = char
2781 ...
2782 charset[l+m+n+2o+p+5] = char
2783
2784 */
2785
2786 /* We need at least 6 spaces: the opcode, the length of
2787 char_classes, the length of collating_symbols, the length of
2788 equivalence_classes, the length of char_ranges, the length of
2789 chars. */
2790 GET_BUFFER_SPACE (6);
2791
2792 /* Save b as laststart. And We use laststart as the pointer
2793 to the first element of the charset here.
2794 In other words, laststart[i] indicates charset[i]. */
2795 laststart = b;
2796
2797 /* We test `*p == '^' twice, instead of using an if
2798 statement, so we only need one BUF_PUSH. */
2799 BUF_PUSH (*p == '^' ? charset_not : charset);
2800 if (*p == '^')
2801 p++;
2802
2803 /* Push the length of char_classes, the length of
2804 collating_symbols, the length of equivalence_classes, the
2805 length of char_ranges and the length of chars. */
2806 BUF_PUSH_3 (0, 0, 0);
2807 BUF_PUSH_2 (0, 0);
2808
2809 /* Remember the first position in the bracket expression. */
2810 p1 = p;
2811
2812 /* charset_not matches newline according to a syntax bit. */
2813 if ((re_opcode_t) b[-6] == charset_not
2814 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2815 {
2816 BUF_PUSH('\n');
2817 laststart[5]++; /* Update the length of characters */
2818 }
2819
2820 /* Read in characters and ranges, setting map bits. */
2821 for (;;)
2822 {
2823 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2824
2825 PATFETCH (c);
2826
2827 /* \ might escape characters inside [...] and [^...]. */
2828 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2829 {
2830 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2831
2832 PATFETCH (c1);
2833 BUF_PUSH(c1);
2834 laststart[5]++; /* Update the length of chars */
2835 range_start = c1;
2836 continue;
2837 }
2838
2839 /* Could be the end of the bracket expression. If it's
2840 not (i.e., when the bracket expression is `[]' so
2841 far), the ']' character bit gets set way below. */
2842 if (c == ']' && p != p1 + 1)
2843 break;
2844
2845 /* Look ahead to see if it's a range when the last thing
2846 was a character class. */
2847 if (had_char_class && c == '-' && *p != ']')
2848 FREE_STACK_RETURN (REG_ERANGE);
2849
2850 /* Look ahead to see if it's a range when the last thing
2851 was a character: if this is a hyphen not at the
2852 beginning or the end of a list, then it's the range
2853 operator. */
2854 if (c == '-'
2855 && !(p - 2 >= pattern && p[-2] == '[')
2856 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2857 && *p != ']')
2858 {
2859 reg_errcode_t ret;
2860 /* Allocate the space for range_start and range_end. */
2861 GET_BUFFER_SPACE (2);
2862 /* Update the pointer to indicate end of buffer. */
2863 b += 2;
2864 ret = wcs_compile_range (range_start, &p, pend, translate,
2865 syntax, b, laststart);
2866 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2867 range_start = 0xffffffff;
2868 }
2869 else if (p[0] == '-' && p[1] != ']')
2870 { /* This handles ranges made up of characters only. */
2871 reg_errcode_t ret;
2872
2873 /* Move past the `-'. */
2874 PATFETCH (c1);
2875 /* Allocate the space for range_start and range_end. */
2876 GET_BUFFER_SPACE (2);
2877 /* Update the pointer to indicate end of buffer. */
2878 b += 2;
2879 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2880 laststart);
2881 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2882 range_start = 0xffffffff;
2883 }
2884
2885 /* See if we're at the beginning of a possible character
2886 class. */
2887 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2888 { /* Leave room for the null. */
2889 char str[CHAR_CLASS_MAX_LENGTH + 1];
2890
2891 PATFETCH (c);
2892 c1 = 0;
2893
2894 /* If pattern is `[[:'. */
2895 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2896
2897 for (;;)
2898 {
2899 PATFETCH (c);
2900 if ((c == ':' && *p == ']') || p == pend)
2901 break;
2902 if (c1 < CHAR_CLASS_MAX_LENGTH)
2903 str[c1++] = c;
2904 else
2905 /* This is in any case an invalid class name. */
2906 str[0] = '\0';
2907 }
2908 str[c1] = '\0';
2909
2910 /* If isn't a word bracketed by `[:' and `:]':
2911 undo the ending character, the letters, and leave
2912 the leading `:' and `[' (but store them as character). */
2913 if (c == ':' && *p == ']')
2914 {
2915 wctype_t wt;
2916 uintptr_t alignedp;
2917
2918 /* Query the character class as wctype_t. */
2919 wt = IS_CHAR_CLASS (str);
2920 if (wt == 0)
2921 FREE_STACK_RETURN (REG_ECTYPE);
2922
2923 /* Throw away the ] at the end of the character
2924 class. */
2925 PATFETCH (c);
2926
2927 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2928
2929 /* Allocate the space for character class. */
2930 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2931 /* Update the pointer to indicate end of buffer. */
2932 b += CHAR_CLASS_SIZE;
2933 /* Move data which follow character classes
2934 not to violate the data. */
2935 insert_space(CHAR_CLASS_SIZE,
2936 laststart + 6 + laststart[1],
2937 b - 1);
2938 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2939 + __alignof__(wctype_t) - 1)
2940 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2941 /* Store the character class. */
2942 *((wctype_t*)alignedp) = wt;
2943 /* Update length of char_classes */
2944 laststart[1] += CHAR_CLASS_SIZE;
2945
2946 had_char_class = true;
2947 }
2948 else
2949 {
2950 c1++;
2951 while (c1--)
2952 PATUNFETCH;
2953 BUF_PUSH ('[');
2954 BUF_PUSH (':');
2955 laststart[5] += 2; /* Update the length of characters */
2956 range_start = ':';
2957 had_char_class = false;
2958 }
2959 }
2960 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2961 || *p == '.'))
2962 {
2963 CHAR_T str[128]; /* Should be large enough. */
2964 CHAR_T delim = *p; /* '=' or '.' */
2965# ifdef _LIBC
2966 uint32_t nrules =
2967 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2968# endif
2969 PATFETCH (c);
2970 c1 = 0;
2971
2972 /* If pattern is `[[=' or '[[.'. */
2973 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2974
2975 for (;;)
2976 {
2977 PATFETCH (c);
2978 if ((c == delim && *p == ']') || p == pend)
2979 break;
2980 if (c1 < sizeof (str) - 1)
2981 str[c1++] = c;
2982 else
2983 /* This is in any case an invalid class name. */
2984 str[0] = '\0';
2985 }
2986 str[c1] = '\0';
2987
2988 if (c == delim && *p == ']' && str[0] != '\0')
2989 {
2990 unsigned int i, offset;
2991 /* If we have no collation data we use the default
2992 collation in which each character is in a class
2993 by itself. It also means that ASCII is the
2994 character set and therefore we cannot have character
2995 with more than one byte in the multibyte
2996 representation. */
2997
2998 /* If not defined _LIBC, we push the name and
2999 `\0' for the sake of matching performance. */
3000 int datasize = c1 + 1;
3001
3002# ifdef _LIBC
3003 int32_t idx = 0;
3004 if (nrules == 0)
3005# endif
3006 {
3007 if (c1 != 1)
3008 FREE_STACK_RETURN (REG_ECOLLATE);
3009 }
3010# ifdef _LIBC
3011 else
3012 {
3013 const int32_t *table;
3014 const int32_t *weights;
3015 const int32_t *extra;
3016 const int32_t *indirect;
3017 wint_t *cp;
3018
3019 /* This #include defines a local function! */
3020# include <locale/weightwc.h>
3021
3022 if(delim == '=')
3023 {
3024 /* We push the index for equivalence class. */
3025 cp = (wint_t*)str;
3026
3027 table = (const int32_t *)
3028 _NL_CURRENT (LC_COLLATE,
3029 _NL_COLLATE_TABLEWC);
3030 weights = (const int32_t *)
3031 _NL_CURRENT (LC_COLLATE,
3032 _NL_COLLATE_WEIGHTWC);
3033 extra = (const int32_t *)
3034 _NL_CURRENT (LC_COLLATE,
3035 _NL_COLLATE_EXTRAWC);
3036 indirect = (const int32_t *)
3037 _NL_CURRENT (LC_COLLATE,
3038 _NL_COLLATE_INDIRECTWC);
3039
3040 idx = findidx ((const wint_t**)&cp);
3041 if (idx == 0 || cp < (wint_t*) str + c1)
3042 /* This is no valid character. */
3043 FREE_STACK_RETURN (REG_ECOLLATE);
3044
3045 str[0] = (wchar_t)idx;
3046 }
3047 else /* delim == '.' */
3048 {
3049 /* We push collation sequence value
3050 for collating symbol. */
3051 int32_t table_size;
3052 const int32_t *symb_table;
3053 const unsigned char *extra;
3054 int32_t idx;
3055 int32_t elem;
3056 int32_t second;
3057 int32_t hash;
3058 char char_str[c1];
3059
3060 /* We have to convert the name to a single-byte
3061 string. This is possible since the names
3062 consist of ASCII characters and the internal
3063 representation is UCS4. */
3064 for (i = 0; i < c1; ++i)
3065 char_str[i] = str[i];
3066
3067 table_size =
3068 _NL_CURRENT_WORD (LC_COLLATE,
3069 _NL_COLLATE_SYMB_HASH_SIZEMB);
3070 symb_table = (const int32_t *)
3071 _NL_CURRENT (LC_COLLATE,
3072 _NL_COLLATE_SYMB_TABLEMB);
3073 extra = (const unsigned char *)
3074 _NL_CURRENT (LC_COLLATE,
3075 _NL_COLLATE_SYMB_EXTRAMB);
3076
3077 /* Locate the character in the hashing table. */
3078 hash = elem_hash (char_str, c1);
3079
3080 idx = 0;
3081 elem = hash % table_size;
3082 second = hash % (table_size - 2);
3083 while (symb_table[2 * elem] != 0)
3084 {
3085 /* First compare the hashing value. */
3086 if (symb_table[2 * elem] == hash
3087 && c1 == extra[symb_table[2 * elem + 1]]
3088 && memcmp (char_str,
3089 &extra[symb_table[2 * elem + 1]
3090 + 1], c1) == 0)
3091 {
3092 /* Yep, this is the entry. */
3093 idx = symb_table[2 * elem + 1];
3094 idx += 1 + extra[idx];
3095 break;
3096 }
3097
3098 /* Next entry. */
3099 elem += second;
3100 }
3101
3102 if (symb_table[2 * elem] != 0)
3103 {
3104 /* Compute the index of the byte sequence
3105 in the table. */
3106 idx += 1 + extra[idx];
3107 /* Adjust for the alignment. */
3108 idx = (idx + 3) & ~3;
3109
3110 str[0] = (wchar_t) idx + 4;
3111 }
3112 else if (symb_table[2 * elem] == 0 && c1 == 1)
3113 {
3114 /* No valid character. Match it as a
3115 single byte character. */
3116 had_char_class = false;
3117 BUF_PUSH(str[0]);
3118 /* Update the length of characters */
3119 laststart[5]++;
3120 range_start = str[0];
3121
3122 /* Throw away the ] at the end of the
3123 collating symbol. */
3124 PATFETCH (c);
3125 /* exit from the switch block. */
3126 continue;
3127 }
3128 else
3129 FREE_STACK_RETURN (REG_ECOLLATE);
3130 }
3131 datasize = 1;
3132 }
3133# endif
3134 /* Throw away the ] at the end of the equivalence
3135 class (or collating symbol). */
3136 PATFETCH (c);
3137
3138 /* Allocate the space for the equivalence class
3139 (or collating symbol) (and '\0' if needed). */
3140 GET_BUFFER_SPACE(datasize);
3141 /* Update the pointer to indicate end of buffer. */
3142 b += datasize;
3143
3144 if (delim == '=')
3145 { /* equivalence class */
3146 /* Calculate the offset of char_ranges,
3147 which is next to equivalence_classes. */
3148 offset = laststart[1] + laststart[2]
3149 + laststart[3] +6;
3150 /* Insert space. */
3151 insert_space(datasize, laststart + offset, b - 1);
3152
3153 /* Write the equivalence_class and \0. */
3154 for (i = 0 ; i < datasize ; i++)
3155 laststart[offset + i] = str[i];
3156
3157 /* Update the length of equivalence_classes. */
3158 laststart[3] += datasize;
3159 had_char_class = true;
3160 }
3161 else /* delim == '.' */
3162 { /* collating symbol */
3163 /* Calculate the offset of the equivalence_classes,
3164 which is next to collating_symbols. */
3165 offset = laststart[1] + laststart[2] + 6;
3166 /* Insert space and write the collating_symbol
3167 and \0. */
3168 insert_space(datasize, laststart + offset, b-1);
3169 for (i = 0 ; i < datasize ; i++)
3170 laststart[offset + i] = str[i];
3171
3172 /* In re_match_2_internal if range_start < -1, we
3173 assume -range_start is the offset of the
3174 collating symbol which is specified as
3175 the character of the range start. So we assign
3176 -(laststart[1] + laststart[2] + 6) to
3177 range_start. */
3178 range_start = -(laststart[1] + laststart[2] + 6);
3179 /* Update the length of collating_symbol. */
3180 laststart[2] += datasize;
3181 had_char_class = false;
3182 }
3183 }
3184 else
3185 {
3186 c1++;
3187 while (c1--)
3188 PATUNFETCH;
3189 BUF_PUSH ('[');
3190 BUF_PUSH (delim);
3191 laststart[5] += 2; /* Update the length of characters */
3192 range_start = delim;
3193 had_char_class = false;
3194 }
3195 }
3196 else
3197 {
3198 had_char_class = false;
3199 BUF_PUSH(c);
3200 laststart[5]++; /* Update the length of characters */
3201 range_start = c;
3202 }
3203 }
3204
3205#else /* BYTE */
3206 /* Ensure that we have enough space to push a charset: the
3207 opcode, the length count, and the bitset; 34 bytes in all. */
3208 GET_BUFFER_SPACE (34);
3209
3210 laststart = b;
3211
3212 /* We test `*p == '^' twice, instead of using an if
3213 statement, so we only need one BUF_PUSH. */
3214 BUF_PUSH (*p == '^' ? charset_not : charset);
3215 if (*p == '^')
3216 p++;
3217
3218 /* Remember the first position in the bracket expression. */
3219 p1 = p;
3220
3221 /* Push the number of bytes in the bitmap. */
3222 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3223
3224 /* Clear the whole map. */
3225 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3226
3227 /* charset_not matches newline according to a syntax bit. */
3228 if ((re_opcode_t) b[-2] == charset_not
3229 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3230 SET_LIST_BIT ('\n');
3231
3232 /* Read in characters and ranges, setting map bits. */
3233 for (;;)
3234 {
3235 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3236
3237 PATFETCH (c);
3238
3239 /* \ might escape characters inside [...] and [^...]. */
3240 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3241 {
3242 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3243
3244 PATFETCH (c1);
3245 SET_LIST_BIT (c1);
3246 range_start = c1;
3247 continue;
3248 }
3249
3250 /* Could be the end of the bracket expression. If it's
3251 not (i.e., when the bracket expression is `[]' so
3252 far), the ']' character bit gets set way below. */
3253 if (c == ']' && p != p1 + 1)
3254 break;
3255
3256 /* Look ahead to see if it's a range when the last thing
3257 was a character class. */
3258 if (had_char_class && c == '-' && *p != ']')
3259 FREE_STACK_RETURN (REG_ERANGE);
3260
3261 /* Look ahead to see if it's a range when the last thing
3262 was a character: if this is a hyphen not at the
3263 beginning or the end of a list, then it's the range
3264 operator. */
3265 if (c == '-'
3266 && !(p - 2 >= pattern && p[-2] == '[')
3267 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3268 && *p != ']')
3269 {
3270 reg_errcode_t ret
3271 = byte_compile_range (range_start, &p, pend, translate,
3272 syntax, b);
3273 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3274 range_start = 0xffffffff;
3275 }
3276
3277 else if (p[0] == '-' && p[1] != ']')
3278 { /* This handles ranges made up of characters only. */
3279 reg_errcode_t ret;
3280
3281 /* Move past the `-'. */
3282 PATFETCH (c1);
3283
3284 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3285 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3286 range_start = 0xffffffff;
3287 }
3288
3289 /* See if we're at the beginning of a possible character
3290 class. */
3291
3292 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3293 { /* Leave room for the null. */
3294 char str[CHAR_CLASS_MAX_LENGTH + 1];
3295
3296 PATFETCH (c);
3297 c1 = 0;
3298
3299 /* If pattern is `[[:'. */
3300 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3301
3302 for (;;)
3303 {
3304 PATFETCH (c);
3305 if ((c == ':' && *p == ']') || p == pend)
3306 break;
3307 if (c1 < CHAR_CLASS_MAX_LENGTH)
3308 str[c1++] = c;
3309 else
3310 /* This is in any case an invalid class name. */
3311 str[0] = '\0';
3312 }
3313 str[c1] = '\0';
3314
3315 /* If it isn't a word bracketed by `[:' and `:]':
3316 undo the ending character, the letters, and leave
3317 the leading `:' and `[' (but set bits for them). */
3318 if (c == ':' && *p == ']')
3319 {
3320# if defined _LIBC || WIDE_CHAR_SUPPORT
3321 boolean is_lower = STREQ (str, "lower");
3322 boolean is_upper = STREQ (str, "upper");
3323 wctype_t wt;
3324 int ch;
3325
3326 wt = IS_CHAR_CLASS (str);
3327 if (wt == 0)
3328 FREE_STACK_RETURN (REG_ECTYPE);
3329
3330 /* Throw away the ] at the end of the character
3331 class. */
3332 PATFETCH (c);
3333
3334 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3335
3336 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3337 {
3338# ifdef _LIBC
3339 if (__iswctype (__btowc (ch), wt))
3340 SET_LIST_BIT (ch);
3341# else
3342 if (iswctype (btowc (ch), wt))
3343 SET_LIST_BIT (ch);
3344# endif
3345
3346 if (translate && (is_upper || is_lower)
3347 && (ISUPPER (ch) || ISLOWER (ch)))
3348 SET_LIST_BIT (ch);
3349 }
3350
3351 had_char_class = true;
3352# else
3353 int ch;
3354 boolean is_alnum = STREQ (str, "alnum");
3355 boolean is_alpha = STREQ (str, "alpha");
3356 boolean is_blank = STREQ (str, "blank");
3357 boolean is_cntrl = STREQ (str, "cntrl");
3358 boolean is_digit = STREQ (str, "digit");
3359 boolean is_graph = STREQ (str, "graph");
3360 boolean is_lower = STREQ (str, "lower");
3361 boolean is_print = STREQ (str, "print");
3362 boolean is_punct = STREQ (str, "punct");
3363 boolean is_space = STREQ (str, "space");
3364 boolean is_upper = STREQ (str, "upper");
3365 boolean is_xdigit = STREQ (str, "xdigit");
3366
3367 if (!IS_CHAR_CLASS (str))
3368 FREE_STACK_RETURN (REG_ECTYPE);
3369
3370 /* Throw away the ] at the end of the character
3371 class. */
3372 PATFETCH (c);
3373
3374 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3375
3376 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3377 {
3378 /* This was split into 3 if's to
3379 avoid an arbitrary limit in some compiler. */
3380 if ( (is_alnum && ISALNUM (ch))
3381 || (is_alpha && ISALPHA (ch))
3382 || (is_blank && ISBLANK (ch))
3383 || (is_cntrl && ISCNTRL (ch)))
3384 SET_LIST_BIT (ch);
3385 if ( (is_digit && ISDIGIT (ch))
3386 || (is_graph && ISGRAPH (ch))
3387 || (is_lower && ISLOWER (ch))
3388 || (is_print && ISPRINT (ch)))
3389 SET_LIST_BIT (ch);
3390 if ( (is_punct && ISPUNCT (ch))
3391 || (is_space && ISSPACE (ch))
3392 || (is_upper && ISUPPER (ch))
3393 || (is_xdigit && ISXDIGIT (ch)))
3394 SET_LIST_BIT (ch);
3395 if ( translate && (is_upper || is_lower)
3396 && (ISUPPER (ch) || ISLOWER (ch)))
3397 SET_LIST_BIT (ch);
3398 }
3399 had_char_class = true;
3400# endif /* libc || wctype.h */
3401 }
3402 else
3403 {
3404 c1++;
3405 while (c1--)
3406 PATUNFETCH;
3407 SET_LIST_BIT ('[');
3408 SET_LIST_BIT (':');
3409 range_start = ':';
3410 had_char_class = false;
3411 }
3412 }
3413 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3414 {
3415 unsigned char str[MB_LEN_MAX + 1];
3416# ifdef _LIBC
3417 uint32_t nrules =
3418 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3419# endif
3420
3421 PATFETCH (c);
3422 c1 = 0;
3423
3424 /* If pattern is `[[='. */
3425 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3426
3427 for (;;)
3428 {
3429 PATFETCH (c);
3430 if ((c == '=' && *p == ']') || p == pend)
3431 break;
3432 if (c1 < MB_LEN_MAX)
3433 str[c1++] = c;
3434 else
3435 /* This is in any case an invalid class name. */
3436 str[0] = '\0';
3437 }
3438 str[c1] = '\0';
3439
3440 if (c == '=' && *p == ']' && str[0] != '\0')
3441 {
3442 /* If we have no collation data we use the default
3443 collation in which each character is in a class
3444 by itself. It also means that ASCII is the
3445 character set and therefore we cannot have character
3446 with more than one byte in the multibyte
3447 representation. */
3448# ifdef _LIBC
3449 if (nrules == 0)
3450# endif
3451 {
3452 if (c1 != 1)
3453 FREE_STACK_RETURN (REG_ECOLLATE);
3454
3455 /* Throw away the ] at the end of the equivalence
3456 class. */
3457 PATFETCH (c);
3458
3459 /* Set the bit for the character. */
3460 SET_LIST_BIT (str[0]);
3461 }
3462# ifdef _LIBC
3463 else
3464 {
3465 /* Try to match the byte sequence in `str' against
3466 those known to the collate implementation.
3467 First find out whether the bytes in `str' are
3468 actually from exactly one character. */
3469 const int32_t *table;
3470 const unsigned char *weights;
3471 const unsigned char *extra;
3472 const int32_t *indirect;
3473 int32_t idx;
3474 const unsigned char *cp = str;
3475 int ch;
3476
3477 /* This #include defines a local function! */
3478# include <locale/weight.h>
3479
3480 table = (const int32_t *)
3481 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3482 weights = (const unsigned char *)
3483 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3484 extra = (const unsigned char *)
3485 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3486 indirect = (const int32_t *)
3487 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3488
3489 idx = findidx (&cp);
3490 if (idx == 0 || cp < str + c1)
3491 /* This is no valid character. */
3492 FREE_STACK_RETURN (REG_ECOLLATE);
3493
3494 /* Throw away the ] at the end of the equivalence
3495 class. */
3496 PATFETCH (c);
3497
3498 /* Now we have to go through the whole table
3499 and find all characters which have the same
3500 first level weight.
3501
3502 XXX Note that this is not entirely correct.
3503 we would have to match multibyte sequences
3504 but this is not possible with the current
3505 implementation. */
3506 for (ch = 1; ch < 256; ++ch)
3507 /* XXX This test would have to be changed if we
3508 would allow matching multibyte sequences. */
3509 if (table[ch] > 0)
3510 {
3511 int32_t idx2 = table[ch];
3512 size_t len = weights[idx2];
3513
3514 /* Test whether the lengths match. */
3515 if (weights[idx] == len)
3516 {
3517 /* They do. Now compare the bytes of
3518 the weight. */
3519 size_t cnt = 0;
3520
3521 while (cnt < len
3522 && (weights[idx + 1 + cnt]
3523 == weights[idx2 + 1 + cnt]))
3524 ++cnt;
3525
3526 if (cnt == len)
3527 /* They match. Mark the character as
3528 acceptable. */
3529 SET_LIST_BIT (ch);
3530 }
3531 }
3532 }
3533# endif
3534 had_char_class = true;
3535 }
3536 else
3537 {
3538 c1++;
3539 while (c1--)
3540 PATUNFETCH;
3541 SET_LIST_BIT ('[');
3542 SET_LIST_BIT ('=');
3543 range_start = '=';
3544 had_char_class = false;
3545 }
3546 }
3547 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3548 {
3549 unsigned char str[128]; /* Should be large enough. */
3550# ifdef _LIBC
3551 uint32_t nrules =
3552 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3553# endif
3554
3555 PATFETCH (c);
3556 c1 = 0;
3557
3558 /* If pattern is `[[.'. */
3559 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3560
3561 for (;;)
3562 {
3563 PATFETCH (c);
3564 if ((c == '.' && *p == ']') || p == pend)
3565 break;
3566 if (c1 < sizeof (str))
3567 str[c1++] = c;
3568 else
3569 /* This is in any case an invalid class name. */
3570 str[0] = '\0';
3571 }
3572 str[c1] = '\0';
3573
3574 if (c == '.' && *p == ']' && str[0] != '\0')
3575 {
3576 /* If we have no collation data we use the default
3577 collation in which each character is the name
3578 for its own class which contains only the one
3579 character. It also means that ASCII is the
3580 character set and therefore we cannot have character
3581 with more than one byte in the multibyte
3582 representation. */
3583# ifdef _LIBC
3584 if (nrules == 0)
3585# endif
3586 {
3587 if (c1 != 1)
3588 FREE_STACK_RETURN (REG_ECOLLATE);
3589
3590 /* Throw away the ] at the end of the collating
3591 symbol. */
3592 PATFETCH (c);
3593
3594 /* Set the bit for the character. */
3595 SET_LIST_BIT (str[0]);
3596 range_start = ((const unsigned char *) str)[0];
3597 }
3598# ifdef _LIBC
3599 else
3600 {
3601 /* Try to match the byte sequence in `str' against
3602 those known to the collate implementation.
3603 First find out whether the bytes in `str' are
3604 actually from exactly one character. */
3605 int32_t table_size;
3606 const int32_t *symb_table;
3607 const unsigned char *extra;
3608 int32_t idx;
3609 int32_t elem;
3610 int32_t second;
3611 int32_t hash;
3612
3613 table_size =
3614 _NL_CURRENT_WORD (LC_COLLATE,
3615 _NL_COLLATE_SYMB_HASH_SIZEMB);
3616 symb_table = (const int32_t *)
3617 _NL_CURRENT (LC_COLLATE,
3618 _NL_COLLATE_SYMB_TABLEMB);
3619 extra = (const unsigned char *)
3620 _NL_CURRENT (LC_COLLATE,
3621 _NL_COLLATE_SYMB_EXTRAMB);
3622
3623 /* Locate the character in the hashing table. */
3624 hash = elem_hash (str, c1);
3625
3626 idx = 0;
3627 elem = hash % table_size;
3628 second = hash % (table_size - 2);
3629 while (symb_table[2 * elem] != 0)
3630 {
3631 /* First compare the hashing value. */
3632 if (symb_table[2 * elem] == hash
3633 && c1 == extra[symb_table[2 * elem + 1]]
3634 && memcmp (str,
3635 &extra[symb_table[2 * elem + 1]
3636 + 1],
3637 c1) == 0)
3638 {
3639 /* Yep, this is the entry. */
3640 idx = symb_table[2 * elem + 1];
3641 idx += 1 + extra[idx];
3642 break;
3643 }
3644
3645 /* Next entry. */
3646 elem += second;
3647 }
3648
3649 if (symb_table[2 * elem] == 0)
3650 /* This is no valid character. */
3651 FREE_STACK_RETURN (REG_ECOLLATE);
3652
3653 /* Throw away the ] at the end of the collating
3654 symbol. */
3655 PATFETCH (c);
3656
3657 /* Now add the multibyte character(s) we found
3658 to the accept list.
3659
3660 XXX Note that this is not entirely correct.
3661 we would have to match multibyte sequences
3662 but this is not possible with the current
3663 implementation. Also, we have to match
3664 collating symbols, which expand to more than
3665 one file, as a whole and not allow the
3666 individual bytes. */
3667 c1 = extra[idx++];
3668 if (c1 == 1)
3669 range_start = extra[idx];
3670 while (c1-- > 0)
3671 {
3672 SET_LIST_BIT (extra[idx]);
3673 ++idx;
3674 }
3675 }
3676# endif
3677 had_char_class = false;
3678 }
3679 else
3680 {
3681 c1++;
3682 while (c1--)
3683 PATUNFETCH;
3684 SET_LIST_BIT ('[');
3685 SET_LIST_BIT ('.');
3686 range_start = '.';
3687 had_char_class = false;
3688 }
3689 }
3690 else
3691 {
3692 had_char_class = false;
3693 SET_LIST_BIT (c);
3694 range_start = c;
3695 }
3696 }
3697
3698 /* Discard any (non)matching list bytes that are all 0 at the
3699 end of the map. Decrease the map-length byte too. */
3700 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3701 b[-1]--;
3702 b += b[-1];
3703#endif /* WCHAR */
3704 }
3705 break;
3706
3707
3708 case '(':
3709 if (syntax & RE_NO_BK_PARENS)
3710 goto handle_open;
3711 else
3712 goto normal_char;
3713
3714
3715 case ')':
3716 if (syntax & RE_NO_BK_PARENS)
3717 goto handle_close;
3718 else
3719 goto normal_char;
3720
3721
3722 case '\n':
3723 if (syntax & RE_NEWLINE_ALT)
3724 goto handle_alt;
3725 else
3726 goto normal_char;
3727
3728
3729 case '|':
3730 if (syntax & RE_NO_BK_VBAR)
3731 goto handle_alt;
3732 else
3733 goto normal_char;
3734
3735
3736 case '{':
3737 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3738 goto handle_interval;
3739 else
3740 goto normal_char;
3741
3742
3743 case '\\':
3744 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3745
3746 /* Do not translate the character after the \, so that we can
3747 distinguish, e.g., \B from \b, even if we normally would
3748 translate, e.g., B to b. */
3749 PATFETCH_RAW (c);
3750
3751 switch (c)
3752 {
3753 case '(':
3754 if (syntax & RE_NO_BK_PARENS)
3755 goto normal_backslash;
3756
3757 handle_open:
3758 bufp->re_nsub++;
3759 regnum++;
3760
3761 if (COMPILE_STACK_FULL)
3762 {
3763 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3764 compile_stack_elt_t);
3765 if (compile_stack.stack == NULL) return REG_ESPACE;
3766
3767 compile_stack.size <<= 1;
3768 }
3769
3770 /* These are the values to restore when we hit end of this
3771 group. They are all relative offsets, so that if the
3772 whole pattern moves because of realloc, they will still
3773 be valid. */
3774 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3775 COMPILE_STACK_TOP.fixup_alt_jump
3776 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3777 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3778 COMPILE_STACK_TOP.regnum = regnum;
3779
3780 /* We will eventually replace the 0 with the number of
3781 groups inner to this one. But do not push a
3782 start_memory for groups beyond the last one we can
3783 represent in the compiled pattern. */
3784 if (regnum <= MAX_REGNUM)
3785 {
3786 COMPILE_STACK_TOP.inner_group_offset = b
3787 - COMPILED_BUFFER_VAR + 2;
3788 BUF_PUSH_3 (start_memory, regnum, 0);
3789 }
3790
3791 compile_stack.avail++;
3792
3793 fixup_alt_jump = 0;
3794 laststart = 0;
3795 begalt = b;
3796 /* If we've reached MAX_REGNUM groups, then this open
3797 won't actually generate any code, so we'll have to
3798 clear pending_exact explicitly. */
3799 pending_exact = 0;
3800 break;
3801
3802
3803 case ')':
3804 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3805
3806 if (COMPILE_STACK_EMPTY)
3807 {
3808 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3809 goto normal_backslash;
3810 else
3811 FREE_STACK_RETURN (REG_ERPAREN);
3812 }
3813
3814 handle_close:
3815 if (fixup_alt_jump)
3816 { /* Push a dummy failure point at the end of the
3817 alternative for a possible future
3818 `pop_failure_jump' to pop. See comments at
3819 `push_dummy_failure' in `re_match_2'. */
3820 BUF_PUSH (push_dummy_failure);
3821
3822 /* We allocated space for this jump when we assigned
3823 to `fixup_alt_jump', in the `handle_alt' case below. */
3824 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3825 }
3826
3827 /* See similar code for backslashed left paren above. */
3828 if (COMPILE_STACK_EMPTY)
3829 {
3830 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3831 goto normal_char;
3832 else
3833 FREE_STACK_RETURN (REG_ERPAREN);
3834 }
3835
3836 /* Since we just checked for an empty stack above, this
3837 ``can't happen''. */
3838 assert (compile_stack.avail != 0);
3839 {
3840 /* We don't just want to restore into `regnum', because
3841 later groups should continue to be numbered higher,
3842 as in `(ab)c(de)' -- the second group is #2. */
3843 regnum_t this_group_regnum;
3844
3845 compile_stack.avail--;
3846 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3847 fixup_alt_jump
3848 = COMPILE_STACK_TOP.fixup_alt_jump
3849 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3850 : 0;
3851 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3852 this_group_regnum = COMPILE_STACK_TOP.regnum;
3853 /* If we've reached MAX_REGNUM groups, then this open
3854 won't actually generate any code, so we'll have to
3855 clear pending_exact explicitly. */
3856 pending_exact = 0;
3857
3858 /* We're at the end of the group, so now we know how many
3859 groups were inside this one. */
3860 if (this_group_regnum <= MAX_REGNUM)
3861 {
3862 UCHAR_T *inner_group_loc
3863 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3864
3865 *inner_group_loc = regnum - this_group_regnum;
3866 BUF_PUSH_3 (stop_memory, this_group_regnum,
3867 regnum - this_group_regnum);
3868 }
3869 }
3870 break;
3871
3872
3873 case '|': /* `\|'. */
3874 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3875 goto normal_backslash;
3876 handle_alt:
3877 if (syntax & RE_LIMITED_OPS)
3878 goto normal_char;
3879
3880 /* Insert before the previous alternative a jump which
3881 jumps to this alternative if the former fails. */
3882 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3883 INSERT_JUMP (on_failure_jump, begalt,
3884 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3885 pending_exact = 0;
3886 b += 1 + OFFSET_ADDRESS_SIZE;
3887
3888 /* The alternative before this one has a jump after it
3889 which gets executed if it gets matched. Adjust that
3890 jump so it will jump to this alternative's analogous
3891 jump (put in below, which in turn will jump to the next
3892 (if any) alternative's such jump, etc.). The last such
3893 jump jumps to the correct final destination. A picture:
3894 _____ _____
3895 | | | |
3896 | v | v
3897 a | b | c
3898
3899 If we are at `b', then fixup_alt_jump right now points to a
3900 three-byte space after `a'. We'll put in the jump, set
3901 fixup_alt_jump to right after `b', and leave behind three
3902 bytes which we'll fill in when we get to after `c'. */
3903
3904 if (fixup_alt_jump)
3905 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3906
3907 /* Mark and leave space for a jump after this alternative,
3908 to be filled in later either by next alternative or
3909 when we know we're at the end of a series of alternatives. */
3910 fixup_alt_jump = b;
3911 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3912 b += 1 + OFFSET_ADDRESS_SIZE;
3913
3914 laststart = 0;
3915 begalt = b;
3916 break;
3917
3918
3919 case '{':
3920 /* If \{ is a literal. */
3921 if (!(syntax & RE_INTERVALS)
3922 /* If we're at `\{' and it's not the open-interval
3923 operator. */
3924 || (syntax & RE_NO_BK_BRACES))
3925 goto normal_backslash;
3926
3927 handle_interval:
3928 {
3929 /* If got here, then the syntax allows intervals. */
3930
3931 /* At least (most) this many matches must be made. */
3932 int lower_bound = -1, upper_bound = -1;
3933
3934 /* Place in the uncompiled pattern (i.e., just after
3935 the '{') to go back to if the interval is invalid. */
3936 const CHAR_T *beg_interval = p;
3937
3938 if (p == pend)
3939 goto invalid_interval;
3940
3941 GET_UNSIGNED_NUMBER (lower_bound);
3942
3943 if (c == ',')
3944 {
3945 GET_UNSIGNED_NUMBER (upper_bound);
3946 if (upper_bound < 0)
3947 upper_bound = RE_DUP_MAX;
3948 }
3949 else
3950 /* Interval such as `{1}' => match exactly once. */
3951 upper_bound = lower_bound;
3952
3953 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3954 goto invalid_interval;
3955
3956 if (!(syntax & RE_NO_BK_BRACES))
3957 {
3958 if (c != '\\' || p == pend)
3959 goto invalid_interval;
3960 PATFETCH (c);
3961 }
3962
3963 if (c != '}')
3964 goto invalid_interval;
3965
3966 /* If it's invalid to have no preceding re. */
3967 if (!laststart)
3968 {
3969 if (syntax & RE_CONTEXT_INVALID_OPS
3970 && !(syntax & RE_INVALID_INTERVAL_ORD))
3971 FREE_STACK_RETURN (REG_BADRPT);
3972 else if (syntax & RE_CONTEXT_INDEP_OPS)
3973 laststart = b;
3974 else
3975 goto unfetch_interval;
3976 }
3977
3978 /* We just parsed a valid interval. */
3979
3980 if (RE_DUP_MAX < upper_bound)
3981 FREE_STACK_RETURN (REG_BADBR);
3982
3983 /* If the upper bound is zero, don't want to succeed at
3984 all; jump from `laststart' to `b + 3', which will be
3985 the end of the buffer after we insert the jump. */
3986 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3987 instead of 'b + 3'. */
3988 if (upper_bound == 0)
3989 {
3990 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3991 INSERT_JUMP (jump, laststart, b + 1
3992 + OFFSET_ADDRESS_SIZE);
3993 b += 1 + OFFSET_ADDRESS_SIZE;
3994 }
3995
3996 /* Otherwise, we have a nontrivial interval. When
3997 we're all done, the pattern will look like:
3998 set_number_at <jump count> <upper bound>
3999 set_number_at <succeed_n count> <lower bound>
4000 succeed_n <after jump addr> <succeed_n count>
4001 <body of loop>
4002 jump_n <succeed_n addr> <jump count>
4003 (The upper bound and `jump_n' are omitted if
4004 `upper_bound' is 1, though.) */
4005 else
4006 { /* If the upper bound is > 1, we need to insert
4007 more at the end of the loop. */
4008 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
4009 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
4010
4011 GET_BUFFER_SPACE (nbytes);
4012
4013 /* Initialize lower bound of the `succeed_n', even
4014 though it will be set during matching by its
4015 attendant `set_number_at' (inserted next),
4016 because `re_compile_fastmap' needs to know.
4017 Jump to the `jump_n' we might insert below. */
4018 INSERT_JUMP2 (succeed_n, laststart,
4019 b + 1 + 2 * OFFSET_ADDRESS_SIZE
4020 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
4021 , lower_bound);
4022 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4023
4024 /* Code to initialize the lower bound. Insert
4025 before the `succeed_n'. The `5' is the last two
4026 bytes of this `set_number_at', plus 3 bytes of
4027 the following `succeed_n'. */
4028 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
4029 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
4030 of the following `succeed_n'. */
4031 PREFIX(insert_op2) (set_number_at, laststart, 1
4032 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
4033 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4034
4035 if (upper_bound > 1)
4036 { /* More than one repetition is allowed, so
4037 append a backward jump to the `succeed_n'
4038 that starts this interval.
4039
4040 When we've reached this during matching,
4041 we'll have matched the interval once, so
4042 jump back only `upper_bound - 1' times. */
4043 STORE_JUMP2 (jump_n, b, laststart
4044 + 2 * OFFSET_ADDRESS_SIZE + 1,
4045 upper_bound - 1);
4046 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4047
4048 /* The location we want to set is the second
4049 parameter of the `jump_n'; that is `b-2' as
4050 an absolute address. `laststart' will be
4051 the `set_number_at' we're about to insert;
4052 `laststart+3' the number to set, the source
4053 for the relative address. But we are
4054 inserting into the middle of the pattern --
4055 so everything is getting moved up by 5.
4056 Conclusion: (b - 2) - (laststart + 3) + 5,
4057 i.e., b - laststart.
4058
4059 We insert this at the beginning of the loop
4060 so that if we fail during matching, we'll
4061 reinitialize the bounds. */
4062 PREFIX(insert_op2) (set_number_at, laststart,
4063 b - laststart,
4064 upper_bound - 1, b);
4065 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4066 }
4067 }
4068 pending_exact = 0;
4069 break;
4070
4071 invalid_interval:
4072 if (!(syntax & RE_INVALID_INTERVAL_ORD))
4073 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
4074 unfetch_interval:
4075 /* Match the characters as literals. */
4076 p = beg_interval;
4077 c = '{';
4078 if (syntax & RE_NO_BK_BRACES)
4079 goto normal_char;
4080 else
4081 goto normal_backslash;
4082 }
4083
4084#ifdef emacs
4085 /* There is no way to specify the before_dot and after_dot
4086 operators. rms says this is ok. --karl */
4087 case '=':
4088 BUF_PUSH (at_dot);
4089 break;
4090
4091 case 's':
4092 laststart = b;
4093 PATFETCH (c);
4094 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4095 break;
4096
4097 case 'S':
4098 laststart = b;
4099 PATFETCH (c);
4100 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4101 break;
4102#endif /* emacs */
4103
4104
4105 case 'w':
4106 if (syntax & RE_NO_GNU_OPS)
4107 goto normal_char;
4108 laststart = b;
4109 BUF_PUSH (wordchar);
4110 break;
4111
4112
4113 case 'W':
4114 if (syntax & RE_NO_GNU_OPS)
4115 goto normal_char;
4116 laststart = b;
4117 BUF_PUSH (notwordchar);
4118 break;
4119
4120
4121 case '<':
4122 if (syntax & RE_NO_GNU_OPS)
4123 goto normal_char;
4124 BUF_PUSH (wordbeg);
4125 break;
4126
4127 case '>':
4128 if (syntax & RE_NO_GNU_OPS)
4129 goto normal_char;
4130 BUF_PUSH (wordend);
4131 break;
4132
4133 case 'b':
4134 if (syntax & RE_NO_GNU_OPS)
4135 goto normal_char;
4136 BUF_PUSH (wordbound);
4137 break;
4138
4139 case 'B':
4140 if (syntax & RE_NO_GNU_OPS)
4141 goto normal_char;
4142 BUF_PUSH (notwordbound);
4143 break;
4144
4145 case '`':
4146 if (syntax & RE_NO_GNU_OPS)
4147 goto normal_char;
4148 BUF_PUSH (begbuf);
4149 break;
4150
4151 case '\'':
4152 if (syntax & RE_NO_GNU_OPS)
4153 goto normal_char;
4154 BUF_PUSH (endbuf);
4155 break;
4156
4157 case '1': case '2': case '3': case '4': case '5':
4158 case '6': case '7': case '8': case '9':
4159 if (syntax & RE_NO_BK_REFS)
4160 goto normal_char;
4161
4162 c1 = c - '0';
4163
4164 if (c1 > regnum)
4165 FREE_STACK_RETURN (REG_ESUBREG);
4166
4167 /* Can't back reference to a subexpression if inside of it. */
4168 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4169 goto normal_char;
4170
4171 laststart = b;
4172 BUF_PUSH_2 (duplicate, c1);
4173 break;
4174
4175
4176 case '+':
4177 case '?':
4178 if (syntax & RE_BK_PLUS_QM)
4179 goto handle_plus;
4180 else
4181 goto normal_backslash;
4182
4183 default:
4184 normal_backslash:
4185 /* You might think it would be useful for \ to mean
4186 not to translate; but if we don't translate it
4187 it will never match anything. */
4188 c = TRANSLATE (c);
4189 goto normal_char;
4190 }
4191 break;
4192
4193
4194 default:
4195 /* Expects the character in `c'. */
4196 normal_char:
4197 /* If no exactn currently being built. */
4198 if (!pending_exact
4199#ifdef WCHAR
4200 /* If last exactn handle binary(or character) and
4201 new exactn handle character(or binary). */
4202 || is_exactn_bin != is_binary[p - 1 - pattern]
4203#endif /* WCHAR */
4204
4205 /* If last exactn not at current position. */
4206 || pending_exact + *pending_exact + 1 != b
4207
4208 /* We have only one byte following the exactn for the count. */
4209 || *pending_exact == (1 << BYTEWIDTH) - 1
4210
4211 /* If followed by a repetition operator. */
4212 || *p == '*' || *p == '^'
4213 || ((syntax & RE_BK_PLUS_QM)
4214 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4215 : (*p == '+' || *p == '?'))
4216 || ((syntax & RE_INTERVALS)
4217 && ((syntax & RE_NO_BK_BRACES)
4218 ? *p == '{'
4219 : (p[0] == '\\' && p[1] == '{'))))
4220 {
4221 /* Start building a new exactn. */
4222
4223 laststart = b;
4224
4225#ifdef WCHAR
4226 /* Is this exactn binary data or character? */
4227 is_exactn_bin = is_binary[p - 1 - pattern];
4228 if (is_exactn_bin)
4229 BUF_PUSH_2 (exactn_bin, 0);
4230 else
4231 BUF_PUSH_2 (exactn, 0);
4232#else
4233 BUF_PUSH_2 (exactn, 0);
4234#endif /* WCHAR */
4235 pending_exact = b - 1;
4236 }
4237
4238 BUF_PUSH (c);
4239 (*pending_exact)++;
4240 break;
4241 } /* switch (c) */
4242 } /* while p != pend */
4243
4244
4245 /* Through the pattern now. */
4246
4247 if (fixup_alt_jump)
4248 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4249
4250 if (!COMPILE_STACK_EMPTY)
4251 FREE_STACK_RETURN (REG_EPAREN);
4252
4253 /* If we don't want backtracking, force success
4254 the first time we reach the end of the compiled pattern. */
4255 if (syntax & RE_NO_POSIX_BACKTRACKING)
4256 BUF_PUSH (succeed);
4257
4258#ifdef WCHAR
4259 free (pattern);
4260 free (mbs_offset);
4261 free (is_binary);
4262#endif
4263 free (compile_stack.stack);
4264
4265 /* We have succeeded; set the length of the buffer. */
4266#ifdef WCHAR
4267 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4268#else
4269 bufp->used = b - bufp->buffer;
4270#endif
4271
4272#ifdef DEBUG
4273 if (debug)
4274 {
4275 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4276 PREFIX(print_compiled_pattern) (bufp);
4277 }
4278#endif /* DEBUG */
4279
4280#ifndef MATCH_MAY_ALLOCATE
4281 /* Initialize the failure stack to the largest possible stack. This
4282 isn't necessary unless we're trying to avoid calling alloca in
4283 the search and match routines. */
4284 {
4285 int num_regs = bufp->re_nsub + 1;
4286
4287 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4288 is strictly greater than re_max_failures, the largest possible stack
4289 is 2 * re_max_failures failure points. */
4290 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4291 {
4292 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4293
4294# ifdef emacs
4295 if (! fail_stack.stack)
4296 fail_stack.stack
4297 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4298 * sizeof (PREFIX(fail_stack_elt_t)));
4299 else
4300 fail_stack.stack
4301 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4302 (fail_stack.size
4303 * sizeof (PREFIX(fail_stack_elt_t))));
4304# else /* not emacs */
4305 if (! fail_stack.stack)
4306 fail_stack.stack
4307 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4308 * sizeof (PREFIX(fail_stack_elt_t)));
4309 else
4310 fail_stack.stack
4311 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4312 (fail_stack.size
4313 * sizeof (PREFIX(fail_stack_elt_t))));
4314# endif /* not emacs */
4315 }
4316
4317 PREFIX(regex_grow_registers) (num_regs);
4318 }
4319#endif /* not MATCH_MAY_ALLOCATE */
4320
4321 return REG_NOERROR;
4322} /* regex_compile */
4323
4324/* Subroutines for `regex_compile'. */
4325
4326/* Store OP at LOC followed by two-byte integer parameter ARG. */
4327/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4328
4329static void
4330PREFIX(store_op1) (op, loc, arg)
4331 re_opcode_t op;
4332 UCHAR_T *loc;
4333 int arg;
4334{
4335 *loc = (UCHAR_T) op;
4336 STORE_NUMBER (loc + 1, arg);
4337}
4338
4339
4340/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4341/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4342
4343static void
4344PREFIX(store_op2) (op, loc, arg1, arg2)
4345 re_opcode_t op;
4346 UCHAR_T *loc;
4347 int arg1, arg2;
4348{
4349 *loc = (UCHAR_T) op;
4350 STORE_NUMBER (loc + 1, arg1);
4351 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4352}
4353
4354
4355/* Copy the bytes from LOC to END to open up three bytes of space at LOC
4356 for OP followed by two-byte integer parameter ARG. */
4357/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4358
4359static void
4360PREFIX(insert_op1) (op, loc, arg, end)
4361 re_opcode_t op;
4362 UCHAR_T *loc;
4363 int arg;
4364 UCHAR_T *end;
4365{
4366 register UCHAR_T *pfrom = end;
4367 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4368
4369 while (pfrom != loc)
4370 *--pto = *--pfrom;
4371
4372 PREFIX(store_op1) (op, loc, arg);
4373}
4374
4375
4376/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4377/* ifdef WCHAR, integer parameter is 1 wchar_t. */
4378
4379static void
4380PREFIX(insert_op2) (op, loc, arg1, arg2, end)
4381 re_opcode_t op;
4382 UCHAR_T *loc;
4383 int arg1, arg2;
4384 UCHAR_T *end;
4385{
4386 register UCHAR_T *pfrom = end;
4387 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4388
4389 while (pfrom != loc)
4390 *--pto = *--pfrom;
4391
4392 PREFIX(store_op2) (op, loc, arg1, arg2);
4393}
4394
4395
4396/* P points to just after a ^ in PATTERN. Return true if that ^ comes
4397 after an alternative or a begin-subexpression. We assume there is at
4398 least one character before the ^. */
4399
4400static boolean
4401PREFIX(at_begline_loc_p) (pattern, p, syntax)
4402 const CHAR_T *pattern, *p;
4403 reg_syntax_t syntax;
4404{
4405 const CHAR_T *prev = p - 2;
4406 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4407
4408 return
4409 /* After a subexpression? */
4410 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4411 /* After an alternative? */
4412 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4413}
4414
4415
4416/* The dual of at_begline_loc_p. This one is for $. We assume there is
4417 at least one character after the $, i.e., `P < PEND'. */
4418
4419static boolean
4420PREFIX(at_endline_loc_p) (p, pend, syntax)
4421 const CHAR_T *p, *pend;
4422 reg_syntax_t syntax;
4423{
4424 const CHAR_T *next = p;
4425 boolean next_backslash = *next == '\\';
4426 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4427
4428 return
4429 /* Before a subexpression? */
4430 (syntax & RE_NO_BK_PARENS ? *next == ')'
4431 : next_backslash && next_next && *next_next == ')')
4432 /* Before an alternative? */
4433 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4434 : next_backslash && next_next && *next_next == '|');
4435}
4436
4437#else /* not INSIDE_RECURSION */
4438
4439/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4440 false if it's not. */
4441
4442static boolean
4443group_in_compile_stack (compile_stack, regnum)
4444 compile_stack_type compile_stack;
4445 regnum_t regnum;
4446{
4447 int this_element;
4448
4449 for (this_element = compile_stack.avail - 1;
4450 this_element >= 0;
4451 this_element--)
4452 if (compile_stack.stack[this_element].regnum == regnum)
4453 return true;
4454
4455 return false;
4456}
4457#endif /* not INSIDE_RECURSION */
4458
4459#ifdef INSIDE_RECURSION
4460
4461#ifdef WCHAR
4462/* This insert space, which size is "num", into the pattern at "loc".
4463 "end" must point the end of the allocated buffer. */
4464static void
4465insert_space (num, loc, end)
4466 int num;
4467 CHAR_T *loc;
4468 CHAR_T *end;
4469{
4470 register CHAR_T *pto = end;
4471 register CHAR_T *pfrom = end - num;
4472
4473 while (pfrom >= loc)
4474 *pto-- = *pfrom--;
4475}
4476#endif /* WCHAR */
4477
4478#ifdef WCHAR
4479static reg_errcode_t
4480wcs_compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
4481 char_set)
4482 CHAR_T range_start_char;
4483 const CHAR_T **p_ptr, *pend;
4484 CHAR_T *char_set, *b;
4485 RE_TRANSLATE_TYPE translate;
4486 reg_syntax_t syntax;
4487{
4488 const CHAR_T *p = *p_ptr;
4489 CHAR_T range_start, range_end;
4490 reg_errcode_t ret;
4491# ifdef _LIBC
4492 uint32_t nrules;
4493 uint32_t start_val, end_val;
4494# endif
4495 if (p == pend)
4496 return REG_ERANGE;
4497
4498# ifdef _LIBC
4499 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4500 if (nrules != 0)
4501 {
4502 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4503 _NL_COLLATE_COLLSEQWC);
4504 const unsigned char *extra = (const unsigned char *)
4505 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4506
4507 if (range_start_char < -1)
4508 {
4509 /* range_start is a collating symbol. */
4510 int32_t *wextra;
4511 /* Retreive the index and get collation sequence value. */
4512 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4513 start_val = wextra[1 + *wextra];
4514 }
4515 else
4516 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4517
4518 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4519
4520 /* Report an error if the range is empty and the syntax prohibits
4521 this. */
4522 ret = ((syntax & RE_NO_EMPTY_RANGES)
4523 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4524
4525 /* Insert space to the end of the char_ranges. */
4526 insert_space(2, b - char_set[5] - 2, b - 1);
4527 *(b - char_set[5] - 2) = (wchar_t)start_val;
4528 *(b - char_set[5] - 1) = (wchar_t)end_val;
4529 char_set[4]++; /* ranges_index */
4530 }
4531 else
4532# endif
4533 {
4534 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4535 range_start_char;
4536 range_end = TRANSLATE (p[0]);
4537 /* Report an error if the range is empty and the syntax prohibits
4538 this. */
4539 ret = ((syntax & RE_NO_EMPTY_RANGES)
4540 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4541
4542 /* Insert space to the end of the char_ranges. */
4543 insert_space(2, b - char_set[5] - 2, b - 1);
4544 *(b - char_set[5] - 2) = range_start;
4545 *(b - char_set[5] - 1) = range_end;
4546 char_set[4]++; /* ranges_index */
4547 }
4548 /* Have to increment the pointer into the pattern string, so the
4549 caller isn't still at the ending character. */
4550 (*p_ptr)++;
4551
4552 return ret;
4553}
4554#else /* BYTE */
4555/* Read the ending character of a range (in a bracket expression) from the
4556 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4557 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4558 Then we set the translation of all bits between the starting and
4559 ending characters (inclusive) in the compiled pattern B.
4560
4561 Return an error code.
4562
4563 We use these short variable names so we can use the same macros as
4564 `regex_compile' itself. */
4565
4566static reg_errcode_t
4567byte_compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
4568 unsigned int range_start_char;
4569 const char **p_ptr, *pend;
4570 RE_TRANSLATE_TYPE translate;
4571 reg_syntax_t syntax;
4572 unsigned char *b;
4573{
4574 unsigned this_char;
4575 const char *p = *p_ptr;
4576 reg_errcode_t ret;
4577# if _LIBC
4578 const unsigned char *collseq;
4579 unsigned int start_colseq;
4580 unsigned int end_colseq;
4581# else
4582 unsigned end_char;
4583# endif
4584
4585 if (p == pend)
4586 return REG_ERANGE;
4587
4588 /* Have to increment the pointer into the pattern string, so the
4589 caller isn't still at the ending character. */
4590 (*p_ptr)++;
4591
4592 /* Report an error if the range is empty and the syntax prohibits this. */
4593 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4594
4595# if _LIBC
4596 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4597 _NL_COLLATE_COLLSEQMB);
4598
4599 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4600 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4601 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4602 {
4603 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4604
4605 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4606 {
4607 SET_LIST_BIT (TRANSLATE (this_char));
4608 ret = REG_NOERROR;
4609 }
4610 }
4611# else
4612 /* Here we see why `this_char' has to be larger than an `unsigned
4613 char' -- we would otherwise go into an infinite loop, since all
4614 characters <= 0xff. */
4615 range_start_char = TRANSLATE (range_start_char);
4616 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4617 and some compilers cast it to int implicitly, so following for_loop
4618 may fall to (almost) infinite loop.
4619 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4620 To avoid this, we cast p[0] to unsigned int and truncate it. */
4621 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4622
4623 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4624 {
4625 SET_LIST_BIT (TRANSLATE (this_char));
4626 ret = REG_NOERROR;
4627 }
4628# endif
4629
4630 return ret;
4631}
4632#endif /* WCHAR */
4633
4634
4635/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4636 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4637 characters can start a string that matches the pattern. This fastmap
4638 is used by re_search to skip quickly over impossible starting points.
4639
4640 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4641 area as BUFP->fastmap.
4642
4643 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4644 the pattern buffer.
4645
4646 Returns 0 if we succeed, -2 if an internal error. */
4647
4648#ifdef WCHAR
4649/* local function for re_compile_fastmap.
4650 truncate wchar_t character to char. */
4651static unsigned char truncate_wchar (CHAR_T c);
4652
4653static unsigned char
4654truncate_wchar (c)
4655 CHAR_T c;
4656{
4657 unsigned char buf[MB_CUR_MAX];
4658 mbstate_t state;
4659 int retval;
4660 memset (&state, '\0', sizeof (state));
4661# ifdef _LIBC
4662 retval = __wcrtomb (buf, c, &state);
4663# else
4664 retval = wcrtomb (buf, c, &state);
4665# endif
4666 return retval > 0 ? buf[0] : (unsigned char) c;
4667}
4668#endif /* WCHAR */
4669
/* Worker behind the public re_compile_fastmap, which calls either the
   wcs_ or the byte_ instance of this function.  Walks the compiled
   pattern in BUFP->buffer, following every alternative remembered on a
   local failure stack, and sets to 1 the BUFP->fastmap entry of each
   byte value found to be able to begin a match.  Also sets
   BUFP->fastmap_accurate and BUFP->can_be_null.

   Returns 0 on success, or -2 if a pattern position could not be pushed
   on the failure stack.  */
4670static int
4671PREFIX(re_compile_fastmap) (bufp)
4672 struct re_pattern_buffer *bufp;
4673{
4674 int j, k;
4675#ifdef MATCH_MAY_ALLOCATE
4676 PREFIX(fail_stack_type) fail_stack;
4677#endif
4678#ifndef REGEX_MALLOC
4679 char *destination;
4680#endif
4681
4682 register char *fastmap = bufp->fastmap;
4683
4684#ifdef WCHAR
4685 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4686 pattern to (char*) in regex_compile. */
4687 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4688 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4689#else /* BYTE */
4690 UCHAR_T *pattern = bufp->buffer;
4691 register UCHAR_T *pend = pattern + bufp->used;
4692#endif /* WCHAR */
4693 UCHAR_T *p = pattern;
4694
4695#ifdef REL_ALLOC
4696 /* This holds the pointer to the failure stack, when
4697 it is allocated relocatably. */
/* NOTE(review): other code in this file spells this type
   PREFIX(fail_stack_elt_t); confirm the unprefixed name resolves when
   REL_ALLOC is defined.  */
4698 fail_stack_elt_t *failure_stack_ptr;
4699#endif
4700
4701 /* Assume that each path through the pattern can be null until
4702 proven otherwise. We set this false at the bottom of switch
4703 statement, to which we get only if a particular path doesn't
4704 match the empty string. */
4705 boolean path_can_be_null = true;
4706
4707 /* We aren't doing a `succeed_n' to begin with. */
4708 boolean succeed_n_p = false;
4709
4710 assert (fastmap != NULL && p != NULL);
4711
4712 INIT_FAIL_STACK ();
4713 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4714 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4715 bufp->can_be_null = 0;
4716
4717 while (1)
4718 {
4719 if (p == pend || *p == succeed)
4720 {
4721 /* We have reached the (effective) end of pattern. */
4722 if (!FAIL_STACK_EMPTY ())
4723 {
4724 bufp->can_be_null |= path_can_be_null;
4725
4726 /* Reset for next path. */
4727 path_can_be_null = true;
4728
/* Pop the most recently saved alternative and resume the walk there.  */
4729 p = fail_stack.stack[--fail_stack.avail].pointer;
4730
4731 continue;
4732 }
4733 else
4734 break;
4735 }
4736
4737 /* We should never be about to go beyond the end of the pattern. */
4738 assert (p < pend);
4739
/* Dispatch on the opcode; P is left pointing at its first operand.  */
4740 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4741 {
4742
4743 /* I guess the idea here is to simply not bother with a fastmap
4744 if a backreference is used, since it's too hard to figure out
4745 the fastmap for the corresponding group. Setting
4746 `can_be_null' stops `re_search_2' from using the fastmap, so
4747 that is all we do. */
4748 case duplicate:
4749 bufp->can_be_null = 1;
4750 goto done;
4751
4752
4753 /* Following are the cases which match a character. These end
4754 with `break'. */
4755
/* For exactn, p[0] is the count slot written right after the opcode, so
   p[1] is the first literal character of the run.  */
4756#ifdef WCHAR
4757 case exactn:
4758 fastmap[truncate_wchar(p[1])] = 1;
4759 break;
4760#else /* BYTE */
4761 case exactn:
4762 fastmap[p[1]] = 1;
4763 break;
4764#endif /* WCHAR */
4765#ifdef MBS_SUPPORT
4766 case exactn_bin:
4767 fastmap[p[1]] = 1;
4768 break;
4769#endif
4770
4771#ifdef WCHAR
4772 /* It is hard to distinguish fastmap from (multi byte) characters
4773 which depends on current locale. */
4774 case charset:
4775 case charset_not:
4776 case wordchar:
4777 case notwordchar:
4778 bufp->can_be_null = 1;
4779 goto done;
4780#else /* BYTE */
/* *p is used as the number of bitmap bytes that follow it; bit J of the
   bitmap lives in byte J / BYTEWIDTH.  Every set bit is a possible
   first character.  */
4781 case charset:
4782 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4783 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4784 fastmap[j] = 1;
4785 break;
4786
4787
4788 case charset_not:
4789 /* Chars beyond end of map must be allowed. */
4790 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4791 fastmap[j] = 1;
4792
/* Within the bitmap, every clear bit is a possible first character.  */
4793 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4794 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4795 fastmap[j] = 1;
4796 break;
4797
4798
4799 case wordchar:
4800 for (j = 0; j < (1 << BYTEWIDTH); j++)
4801 if (SYNTAX (j) == Sword)
4802 fastmap[j] = 1;
4803 break;
4804
4805
4806 case notwordchar:
4807 for (j = 0; j < (1 << BYTEWIDTH); j++)
4808 if (SYNTAX (j) != Sword)
4809 fastmap[j] = 1;
4810 break;
4811#endif /* WCHAR */
4812
4813 case anychar:
4814 {
/* Remember the newline entry so it can be restored if `.' must not
   match newline.  */
4815 int fastmap_newline = fastmap['\n'];
4816
4817 /* `.' matches anything ... */
4818 for (j = 0; j < (1 << BYTEWIDTH); j++)
4819 fastmap[j] = 1;
4820
4821 /* ... except perhaps newline. */
4822 if (!(bufp->syntax & RE_DOT_NEWLINE))
4823 fastmap['\n'] = fastmap_newline;
4824
4825 /* Return if we have already set `can_be_null'; if we have,
4826 then the fastmap is irrelevant. Something's wrong here. */
4827 else if (bufp->can_be_null)
4828 goto done;
4829
4830 /* Otherwise, have to check alternative paths. */
4831 break;
4832 }
4833
4834#ifdef emacs
4835 case syntaxspec:
4836 k = *p++;
4837 for (j = 0; j < (1 << BYTEWIDTH); j++)
4838 if (SYNTAX (j) == (enum syntaxcode) k)
4839 fastmap[j] = 1;
4840 break;
4841
4842
4843 case notsyntaxspec:
4844 k = *p++;
4845 for (j = 0; j < (1 << BYTEWIDTH); j++)
4846 if (SYNTAX (j) != (enum syntaxcode) k)
4847 fastmap[j] = 1;
4848 break;
4849
4850
4851 /* All cases after this match the empty string. These end with
4852 `continue'. */
4853
4854
4855 case before_dot:
4856 case at_dot:
4857 case after_dot:
4858 continue;
4859#endif /* emacs */
4860
4861
4862 case no_op:
4863 case begline:
4864 case endline:
4865 case begbuf:
4866 case endbuf:
4867 case wordbound:
4868 case notwordbound:
4869 case wordbeg:
4870 case wordend:
4871 case push_dummy_failure:
4872 continue;
4873
4874
4875 case jump_n:
4876 case pop_failure_jump:
4877 case maybe_pop_jump:
4878 case jump:
4879 case jump_past_alt:
4880 case dummy_failure_jump:
/* Read the jump displacement into J and follow it; a forward jump needs
   no further attention.  */
4881 EXTRACT_NUMBER_AND_INCR (j, p);
4882 p += j;
4883 if (j > 0)
4884 continue;
4885
4886 /* Jump backward implies we just went through the body of a
4887 loop and matched nothing. Opcode jumped to should be
4888 `on_failure_jump' or `succeed_n'. Just treat it like an
4889 ordinary jump. For a * loop, it has pushed its failure
4890 point already; if so, discard that as redundant. */
4891 if ((re_opcode_t) *p != on_failure_jump
4892 && (re_opcode_t) *p != succeed_n)
4893 continue;
4894
/* Skip the opcode just tested and follow its displacement as well.  */
4895 p++;
4896 EXTRACT_NUMBER_AND_INCR (j, p);
4897 p += j;
4898
4899 /* If what's on the stack is where we are now, pop it. */
4900 if (!FAIL_STACK_EMPTY ()
4901 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4902 fail_stack.avail--;
4903
4904 continue;
4905
4906
4907 case on_failure_jump:
4908 case on_failure_keep_string_jump:
/* Also entered by `goto' from the succeed_n case below.  */
4909 handle_on_failure_jump:
4910 EXTRACT_NUMBER_AND_INCR (j, p);
4911
4912 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4913 end of the pattern. We don't want to push such a point,
4914 since when we restore it above, entering the switch will
4915 increment `p' past the end of the pattern. We don't need
4916 to push such a point since we obviously won't find any more
4917 fastmap entries beyond `pend'. Such a pattern can match
4918 the null string, though. */
4919 if (p + j < pend)
4920 {
4921 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4922 {
4923 RESET_FAIL_STACK ();
4924 return -2;
4925 }
4926 }
4927 else
4928 bufp->can_be_null = 1;
4929
4930 if (succeed_n_p)
4931 {
4932 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4933 succeed_n_p = false;
4934 }
4935
4936 continue;
4937
4938
4939 case succeed_n:
4940 /* Get to the number of times to succeed. */
4941 p += OFFSET_ADDRESS_SIZE;
4942
4943 /* Increment p past the n for when k != 0. */
4944 EXTRACT_NUMBER_AND_INCR (k, p);
4945 if (k == 0)
4946 {
/* Step back to the first operand so the shared on_failure_jump code can
   read the displacement (assumes EXTRACT_NUMBER_AND_INCR advanced P by
   OFFSET_ADDRESS_SIZE -- TODO confirm against the macro).  */
4947 p -= 2 * OFFSET_ADDRESS_SIZE;
4948 succeed_n_p = true; /* Spaghetti code alert. */
4949 goto handle_on_failure_jump;
4950 }
4951 continue;
4952
4953
4954 case set_number_at:
/* Skip both operands; they do not affect which character can come
   first.  */
4955 p += 2 * OFFSET_ADDRESS_SIZE;
4956 continue;
4957
4958
4959 case start_memory:
4960 case stop_memory:
/* Skip the two operand slots that follow a group marker.  */
4961 p += 2;
4962 continue;
4963
4964
4965 default:
4966 abort (); /* We have listed all the cases. */
4967 } /* switch *p++ */
4968
4969 /* Getting here means we have found the possible starting
4970 characters for one path of the pattern -- and that the empty
4971 string does not match. We need not follow this path further.
4972 Instead, look at the next alternative (remembered on the
4973 stack), or quit if no more. The test at the top of the loop
4974 does these things. */
4975 path_can_be_null = false;
4976 p = pend;
4977 } /* while p */
4978
4979 /* Set `can_be_null' for the last path (also the first path, if the
4980 pattern is empty). */
4981 bufp->can_be_null |= path_can_be_null;
4982
/* Common exit for every successful return: reset the failure stack and
   report success.  */
4983 done:
4984 RESET_FAIL_STACK ();
4985 return 0;
4986}
4987
4988#else /* not INSIDE_RECURSION */
4989
/* Public entry point: compute the fastmap for the compiled pattern in
   BUFP.  The work is done by the wide character worker when MBS_SUPPORT
   is compiled in and the current locale is multibyte (MB_CUR_MAX != 1),
   and by the byte worker otherwise; the worker's return value is passed
   through unchanged.  */
int
re_compile_fastmap (bufp)
     struct re_pattern_buffer *bufp;
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_compile_fastmap (bufp);
# endif

  return byte_re_compile_fastmap (bufp);
} /* re_compile_fastmap */
#ifdef _LIBC
weak_alias (__re_compile_fastmap, re_compile_fastmap)
#endif
5004
5005
5006
5007/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
5008 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
5009 this memory for recording register information. STARTS and ENDS
5010 must be allocated using the malloc library routine, and must each
5011 be at least NUM_REGS * sizeof (regoff_t) bytes long.
5012
5013 If NUM_REGS == 0, then subsequent matches should allocate their own
5014 register data.
5015
5016 Unless this function is called, the first search or match using
5017 PATTERN_BUFFER will allocate its own register data, without
5018 freeing the old data. */
5019
5020void
5021re_set_registers (bufp, regs, num_regs, starts, ends)
5022 struct re_pattern_buffer *bufp;
5023 struct re_registers *regs;
5024 unsigned num_regs;
5025 regoff_t *starts, *ends;
5026{
5027 if (num_regs)
5028 {
5029 bufp->regs_allocated = REGS_REALLOCATE;
5030 regs->num_regs = num_regs;
5031 regs->start = starts;
5032 regs->end = ends;
5033 }
5034 else
5035 {
5036 bufp->regs_allocated = REGS_UNALLOCATED;
5037 regs->num_regs = 0;
5038 regs->start = regs->end = (regoff_t *) 0;
5039 }
5040}
5041#ifdef _LIBC
5042weak_alias (__re_set_registers, re_set_registers)
5043#endif
5044
5045
5046/* Searching routines. */
5047
/* Single-string convenience form of re_search_2 (below): search STRING,
   which is SIZE bytes long, with no first string and with the stop
   position fixed at SIZE, so the caller cannot say where to stop
   matching.  The result of re_search_2 is returned as is.  */

int
re_search (bufp, string, size, startpos, range, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, startpos, range;
     struct re_registers *regs;
{
  /* There is no first string, and matching may run to the end of
     STRING.  */
  const char *no_first_string = NULL;
  const int stop = size;

  return re_search_2 (bufp, no_first_string, 0, string, size,
                      startpos, range, regs, stop);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif
5064
5065
/* Search for the compiled pattern in BUFP->buffer within the virtual
   concatenation of STRING1 (SIZE1 bytes) and STRING2 (SIZE2 bytes).

   A match is first attempted at index STARTPOS, then at STARTPOS + 1,
   and so on.  RANGE says how far to scan: RANGE = 0 means try only at
   STARTPOS, and in general the last start tried is STARTPOS + RANGE.

   REGS receives the indices, within the virtual concatenation, of the
   text matched by the whole pattern and by its subexpressions.

   Matching is not considered beyond index STOP of the virtual
   concatenation.

   The return value is the position at which the match was found, -1 if
   there is no match, or -2 on error (such as failure stack overflow).

   This public entry point only selects the worker: the wide character
   one when MBS_SUPPORT is compiled in and the current locale is
   multibyte (MB_CUR_MAX != 1), the byte one otherwise.  */

int
re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int size1, size2;
     int startpos;
     int range;
     struct re_registers *regs;
     int stop;
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
                            range, regs, stop);
# endif

  return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
                           range, regs, stop);
} /* re_search_2 */
#ifdef _LIBC
weak_alias (__re_search_2, re_search_2)
#endif
5109
5110#endif /* not INSIDE_RECURSION */
5111
5112#ifdef INSIDE_RECURSION
5113
/* FREE_VAR (VAR): release VAR if it is non-null, then reset it to NULL.
   REGEX_FREE is used when MATCH_MAY_ALLOCATE is defined, plain `free'
   otherwise.

   The expansion is wrapped in `do ... while (0)' so that it behaves as
   one statement.  Without the wrapper the trailing `VAR = NULL' escaped
   an enclosing unbraced `if' or `else' and ran unconditionally, and
   `if (x) FREE_VAR (v); else ...' did not compile at all.  Always follow
   the invocation with a semicolon.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)							\
  do									\
    {									\
      if (var)								\
        REGEX_FREE (var);						\
      (var) = NULL;							\
    }									\
  while (0)
#else
# define FREE_VAR(var)							\
  do									\
    {									\
      if (var)								\
        free (var);							\
      (var) = NULL;							\
    }									\
  while (0)
#endif
5119
5120#ifdef WCHAR
/* Size threshold used by the wide character search code below: when an
   input string is longer than this, its conversion buffers are obtained
   with TALLOC and released with `free'; otherwise they come from
   REGEX_TALLOC and are released with FREE_VAR.  */
5121# define MAX_ALLOCA_SIZE 2000
5122
/* Release the wide character copies of both input strings together with
   their offset tables.  The macro refers to `size1', `size2',
   `wcs_string1', `wcs_string2', `mbs_offset1' and `mbs_offset2' by name,
   so all of them must be in scope where it is expanded.  For each
   string the release method is chosen by comparing its size with
   MAX_ALLOCA_SIZE, mirroring the way the buffers were allocated.  */
5123# define FREE_WCS_BUFFERS() \
5124 do { \
5125 if (size1 > MAX_ALLOCA_SIZE) \
5126 { \
5127 free (wcs_string1); \
5128 free (mbs_offset1); \
5129 } \
5130 else \
5131 { \
5132 FREE_VAR (wcs_string1); \
5133 FREE_VAR (mbs_offset1); \
5134 } \
5135 if (size2 > MAX_ALLOCA_SIZE) \
5136 { \
5137 free (wcs_string2); \
5138 free (mbs_offset2); \
5139 } \
5140 else \
5141 { \
5142 FREE_VAR (wcs_string2); \
5143 FREE_VAR (mbs_offset2); \
5144 } \
5145 } while (0)
5146
5147#endif
5148
5149
5150static int
5151PREFIX(re_search_2) (bufp, string1, size1, string2, size2, startpos, range,
5152 regs, stop)
5153 struct re_pattern_buffer *bufp;
5154 const char *string1, *string2;
5155 int size1, size2;
5156 int startpos;
5157 int range;
5158 struct re_registers *regs;
5159 int stop;
5160{
5161 int val;
5162 register char *fastmap = bufp->fastmap;
5163 register RE_TRANSLATE_TYPE translate = bufp->translate;
5164 int total_size = size1 + size2;
5165 int endpos = startpos + range;
5166#ifdef WCHAR
5167 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5168 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5169 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5170 int wcs_size1 = 0, wcs_size2 = 0;
5171 /* offset buffer for optimization. See convert_mbs_to_wc. */
5172 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5173 /* They hold whether each wchar_t is binary data or not. */
5174 char *is_binary = NULL;
5175#endif /* WCHAR */
5176
5177 /* Check for out-of-range STARTPOS. */
5178 if (startpos < 0 || startpos > total_size)
5179 return -1;
5180
5181 /* Fix up RANGE if it might eventually take us outside
5182 the virtual concatenation of STRING1 and STRING2.
5183 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5184 if (endpos < 0)
5185 range = 0 - startpos;
5186 else if (endpos > total_size)
5187 range = total_size - startpos;
5188
5189 /* If the search isn't to be a backwards one, don't waste time in a
5190 search for a pattern that must be anchored. */
5191 if (bufp->used > 0 && range > 0
5192 && ((re_opcode_t) bufp->buffer[0] == begbuf
5193 /* `begline' is like `begbuf' if it cannot match at newlines. */
5194 || ((re_opcode_t) bufp->buffer[0] == begline
5195 && !bufp->newline_anchor)))
5196 {
5197 if (startpos > 0)
5198 return -1;
5199 else
5200 range = 1;
5201 }
5202
5203#ifdef emacs
5204 /* In a forward search for something that starts with \=.
5205 don't keep searching past point. */
5206 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5207 {
5208 range = PT - startpos;
5209 if (range <= 0)
5210 return -1;
5211 }
5212#endif /* emacs */
5213
5214 /* Update the fastmap now if not correct already. */
5215 if (fastmap && !bufp->fastmap_accurate)
5216 if (re_compile_fastmap (bufp) == -2)
5217 return -2;
5218
5219#ifdef WCHAR
5220 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5221 fill them with converted string. */
5222 if (size1 != 0)
5223 {
5224 if (size1 > MAX_ALLOCA_SIZE)
5225 {
5226 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5227 mbs_offset1 = TALLOC (size1 + 1, int);
5228 is_binary = TALLOC (size1 + 1, char);
5229 }
5230 else
5231 {
5232 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5233 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5234 is_binary = REGEX_TALLOC (size1 + 1, char);
5235 }
5236 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5237 {
5238 if (size1 > MAX_ALLOCA_SIZE)
5239 {
5240 free (wcs_string1);
5241 free (mbs_offset1);
5242 free (is_binary);
5243 }
5244 else
5245 {
5246 FREE_VAR (wcs_string1);
5247 FREE_VAR (mbs_offset1);
5248 FREE_VAR (is_binary);
5249 }
5250 return -2;
5251 }
5252 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5253 mbs_offset1, is_binary);
5254 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
5255 if (size1 > MAX_ALLOCA_SIZE)
5256 free (is_binary);
5257 else
5258 FREE_VAR (is_binary);
5259 }
5260 if (size2 != 0)
5261 {
5262 if (size2 > MAX_ALLOCA_SIZE)
5263 {
5264 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5265 mbs_offset2 = TALLOC (size2 + 1, int);
5266 is_binary = TALLOC (size2 + 1, char);
5267 }
5268 else
5269 {
5270 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5271 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5272 is_binary = REGEX_TALLOC (size2 + 1, char);
5273 }
5274 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5275 {
5276 FREE_WCS_BUFFERS ();
5277 if (size2 > MAX_ALLOCA_SIZE)
5278 free (is_binary);
5279 else
5280 FREE_VAR (is_binary);
5281 return -2;
5282 }
5283 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5284 mbs_offset2, is_binary);
5285 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5286 if (size2 > MAX_ALLOCA_SIZE)
5287 free (is_binary);
5288 else
5289 FREE_VAR (is_binary);
5290 }
5291#endif /* WCHAR */
5292
5293
5294 /* Loop through the string, looking for a place to start matching. */
5295 for (;;)
5296 {
5297 /* If a fastmap is supplied, skip quickly over characters that
5298 cannot be the start of a match. If the pattern can match the
5299 null string, however, we don't need to skip characters; we want
5300 the first null string. */
5301 if (fastmap && startpos < total_size && !bufp->can_be_null)
5302 {
5303 if (range > 0) /* Searching forwards. */
5304 {
5305 register const char *d;
5306 register int lim = 0;
5307 int irange = range;
5308
5309 if (startpos < size1 && startpos + range >= size1)
5310 lim = range - (size1 - startpos);
5311
5312 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5313
5314 /* Written out as an if-else to avoid testing `translate'
5315 inside the loop. */
5316 if (translate)
5317 while (range > lim
5318 && !fastmap[(unsigned char)
5319 translate[(unsigned char) *d++]])
5320 range--;
5321 else
5322 while (range > lim && !fastmap[(unsigned char) *d++])
5323 range--;
5324
5325 startpos += irange - range;
5326 }
5327 else /* Searching backwards. */
5328 {
5329 register CHAR_T c = (size1 == 0 || startpos >= size1
5330 ? string2[startpos - size1]
5331 : string1[startpos]);
5332
5333 if (!fastmap[(unsigned char) TRANSLATE (c)])
5334 goto advance;
5335 }
5336 }
5337
5338 /* If can't match the null string, and that's all we have left, fail. */
5339 if (range >= 0 && startpos == total_size && fastmap
5340 && !bufp->can_be_null)
5341 {
5342#ifdef WCHAR
5343 FREE_WCS_BUFFERS ();
5344#endif
5345 return -1;
5346 }
5347
5348#ifdef WCHAR
5349 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5350 size2, startpos, regs, stop,
5351 wcs_string1, wcs_size1,
5352 wcs_string2, wcs_size2,
5353 mbs_offset1, mbs_offset2);
5354#else /* BYTE */
5355 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5356 size2, startpos, regs, stop);
5357#endif /* BYTE */
5358
5359#ifndef REGEX_MALLOC
5360# ifdef C_ALLOCA
5361 alloca (0);
5362# endif
5363#endif
5364
5365 if (val >= 0)
5366 {
5367#ifdef WCHAR
5368 FREE_WCS_BUFFERS ();
5369#endif
5370 return startpos;
5371 }
5372
5373 if (val == -2)
5374 {
5375#ifdef WCHAR
5376 FREE_WCS_BUFFERS ();
5377#endif
5378 return -2;
5379 }
5380
5381 advance:
5382 if (!range)
5383 break;
5384 else if (range > 0)
5385 {
5386 range--;
5387 startpos++;
5388 }
5389 else
5390 {
5391 range++;
5392 startpos--;
5393 }
5394 }
5395#ifdef WCHAR
5396 FREE_WCS_BUFFERS ();
5397#endif
5398 return -1;
5399}
5400
#ifdef WCHAR
/* Convert PTR, a pointer into one of the wchar_t search strings
   `string1' or `string2', into an offset into the multibyte text.
   The mbs_offset tables turn this into a single lookup (see
   convert_mbs_to_wcs); a null table contributes 0.  Offsets that
   belong to `string2' are biased by csize1.  */
# define POINTER_TO_OFFSET(ptr)                                           \
  (FIRST_STRING_P (ptr)                                                   \
   ? (regoff_t) (mbs_offset1 != NULL ? mbs_offset1[(ptr) - string1] : 0)  \
   : (regoff_t) ((mbs_offset2 != NULL ? mbs_offset2[(ptr) - string2] : 0) \
                 + csize1))
#else /* BYTE */
/* Convert PTR, a pointer into one of the search strings `string1' or
   `string2', into an offset: the distance from `string1' for a pointer
   into the first string, otherwise the distance from `string2' biased
   by size1.  */
# define POINTER_TO_OFFSET(ptr)                                           \
  (FIRST_STRING_P (ptr)                                                   \
   ? (regoff_t) ((ptr) - string1)                                         \
   : (regoff_t) ((ptr) - string2 + size1))
#endif /* WCHAR */
5419
/* Helpers for walking the two-part (split) subject text in re_match_2.
   They refer to the matcher's local variables by name.  */

/* Nonzero while the current segment end `dend' is the end of the
   matchable part of string1.  */
#define MATCHING_IN_FIRST_STRING  (dend == end_match_1)

/* Invoke before reading a character through *d.  When `d' has reached
   the end of the current segment, either continue at the start of
   string2 or, if string2 is already exhausted, jump to `fail'.  */
#define PREFETCH()                                                      \
  while (d == dend)                                                     \
    {                                                                   \
      if (dend == end_match_2)  /* Ran off string2: give up.  */       \
        goto fail;                                                      \
      d = string2;              /* Ran off string1: go on in string2.  */ \
      dend = end_match_2;                                               \
    }

/* True if D is at the very beginning (resp. very end) of the virtual
   concatenation of `string1' and `string2'.  When there is only one
   string, it is `string2'.  */
#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
#define AT_STRINGS_END(d) ((d) == end2)
5441
5442
/* True if D points at a word-constituent character.  Two positions
   need special care because the subject is split: when D is the end
   of string1 we inspect the first character of string2, and when D is
   one before the start of string2 we inspect the last character of
   string1.  */
#ifdef WCHAR
/* The wide-character build asks the internationalized API (iswalnum)
   rather than the SYNTAX table, and treats `_' as a word character
   too.  */
# define WORDCHAR_P(d)                                                  \
  (iswalnum ((wint_t) ((d) == end1 ? string2[0]                         \
                       : (d) == string2 - 1 ? end1[-1] : (d)[0])) != 0  \
   || ((d) == end1 ? string2[0]                                         \
       : (d) == string2 - 1 ? end1[-1] : (d)[0]) == L'_')
#else /* BYTE */
# define WORDCHAR_P(d)                                                  \
  (SYNTAX ((d) == end1 ? string2[0]                                     \
           : (d) == string2 - 1 ? end1[-1] : (d)[0])                    \
   == Sword)
#endif /* WCHAR */
5460
5461/* Disabled due to a compiler bug -- see comment at case wordbound */
5462#if 0
5463/* Test if the character before D and the one at D differ with respect
5464 to being word-constituent. */
5465#define AT_WORD_BOUNDARY(d) \
5466 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5467 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5468#endif
5469
/* Release everything the matcher allocated.  The expansion depends on
   two settings:
     - MATCH_MAY_ALLOCATE: the failure stack and the register vectors
       are released here;
     - WCHAR: the wide-character copies of the subject and their offset
       tables are released too, unless `cant_free_wcs_buf' is set.  */
#ifdef MATCH_MAY_ALLOCATE
# ifdef WCHAR
#  define FREE_VARIABLES()                                              \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (old_regstart);                                            \
    FREE_VAR (old_regend);                                              \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
    FREE_VAR (reg_info);                                                \
    FREE_VAR (reg_dummy);                                               \
    FREE_VAR (reg_info_dummy);                                          \
    if (!cant_free_wcs_buf)                                             \
      {                                                                 \
        FREE_VAR (string1);                                             \
        FREE_VAR (string2);                                             \
        FREE_VAR (mbs_offset1);                                         \
        FREE_VAR (mbs_offset2);                                         \
      }                                                                 \
  } while (0)
# else /* BYTE */
#  define FREE_VARIABLES()                                              \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (old_regstart);                                            \
    FREE_VAR (old_regend);                                              \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
    FREE_VAR (reg_info);                                                \
    FREE_VAR (reg_dummy);                                               \
    FREE_VAR (reg_info_dummy);                                          \
  } while (0)
# endif /* WCHAR */
#else /* not MATCH_MAY_ALLOCATE */
# ifdef WCHAR
/* Only the wide-character buffers can need releasing here.  */
#  define FREE_VARIABLES()                                              \
  do {                                                                  \
    if (!cant_free_wcs_buf)                                             \
      {                                                                 \
        FREE_VAR (string1);                                             \
        FREE_VAR (string2);                                             \
        FREE_VAR (mbs_offset1);                                         \
        FREE_VAR (mbs_offset2);                                         \
      }                                                                 \
  } while (0)
# else /* BYTE */
/* Nothing to release; the cast to void inhibits a gcc warning.  */
#  define FREE_VARIABLES() ((void)0)
# endif /* WCHAR */
#endif /* not MATCH_MAY_ALLOCATE */
5524
/* Sentinels meaning "no register is active".  Several constraints
   apply:
     - neither may be a valid register number; register numbers occupy
       a single byte of the pattern, which limits us to 255 registers,
       so any value above 255 is safe;
     - the two must differ by exactly 1, because of NUM_FAILURE_ITEMS
       above;
     - the sentinel for the lowest register must exceed the one for the
       highest register, so that no registers are saved when none are
       active.  */
#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5534
5535
5536#else /* not INSIDE_RECURSION */
5537/* Matching routines. */
5538
5539#ifndef emacs /* Emacs never uses this. */
5540/* re_match is like re_match_2 except it takes only a single string. */
5541
int
re_match (bufp, string, size, pos, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, pos;
     struct re_registers *regs;
{
  /* Whatever the internal matcher reports back.  */
  int match_len;

  /* The single string is handed over as the second half of a split
     subject whose first half is empty; matching may run up to SIZE.  */
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX == 1)
    match_len = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                          pos, regs, size);
  else
    /* Null wide-character arguments: the internal matcher sets up its
       own wchar_t buffers.  */
    match_len = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
                                         pos, regs, size,
                                         NULL, 0, NULL, 0, NULL, NULL);
# else
  match_len = byte_re_match_2_internal (bufp, NULL, 0, string, size,
                                        pos, regs, size);
# endif

  /* Force an alloca cleanup now that the matcher has returned.  */
# if !defined REGEX_MALLOC && defined C_ALLOCA
  alloca (0);
# endif
  return match_len;
}
5566# ifdef _LIBC
5567weak_alias (__re_match, re_match)
5568# endif
5569#endif /* not emacs */
5570
5571#endif /* not INSIDE_RECURSION */
5572
5573#ifdef INSIDE_RECURSION
5574static boolean PREFIX(group_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
5575 UCHAR_T *end,
5576 PREFIX(register_info_type) *reg_info));
5577static boolean PREFIX(alt_match_null_string_p) _RE_ARGS ((UCHAR_T *p,
5578 UCHAR_T *end,
5579 PREFIX(register_info_type) *reg_info));
5580static boolean PREFIX(common_op_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
5581 UCHAR_T *end,
5582 PREFIX(register_info_type) *reg_info));
5583static int PREFIX(bcmp_translate) _RE_ARGS ((const CHAR_T *s1, const CHAR_T *s2,
5584 int len, char *translate));
5585#else /* not INSIDE_RECURSION */
5586
/* re_match_2 matches the compiled pattern in BUFP against the
   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
   and SIZE2, respectively).  We start matching at POS, and stop
   matching at STOP.

   If REGS is non-null and the `no_sub' field of BUFP is zero, we
   store offsets for the substring each group matched in REGS.  See the
   documentation for exactly how many groups we fill.

   We return -1 if no match, -2 if an internal error (such as the
   failure stack overflowing).  Otherwise, we return the length of the
   matched substring.  */
5599
int
re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int size1, size2;
     int pos;
     struct re_registers *regs;
     int stop;
{
  /* Whatever the internal matcher reports back.  */
  int match_len;

# ifdef MBS_SUPPORT
  if (MB_CUR_MAX == 1)
    match_len = byte_re_match_2_internal (bufp, string1, size1,
                                          string2, size2,
                                          pos, regs, stop);
  else
    /* Null wide-character arguments: the internal matcher sets up its
       own wchar_t buffers.  */
    match_len = wcs_re_match_2_internal (bufp, string1, size1,
                                         string2, size2,
                                         pos, regs, stop,
                                         NULL, 0, NULL, 0, NULL, NULL);
# else
  match_len = byte_re_match_2_internal (bufp, string1, size1,
                                        string2, size2,
                                        pos, regs, stop);
# endif

  /* Force an alloca cleanup now that the matcher has returned.  */
# if !defined REGEX_MALLOC && defined C_ALLOCA
  alloca (0);
# endif
  return match_len;
}
5627#ifdef _LIBC
5628weak_alias (__re_match_2, re_match_2)
5629#endif
5630
5631#endif /* not INSIDE_RECURSION */
5632
5633#ifdef INSIDE_RECURSION
5634
5635#ifdef WCHAR
5636static int count_mbs_length PARAMS ((int *, int));
5637
/* Examine the first LENGTH bytes of the multibyte string that
   OFFSET_BUFFER describes and return how many wchar_t characters they
   occupy.  OFFSET_BUFFER is used as a lookup table so the text need
   not be rescanned; see convert_mbs_to_wcs.

   The result is the index I with OFFSET_BUFFER[I] == LENGTH.  Returns
   0 when OFFSET_BUFFER is null, and -1 when LENGTH is negative or no
   entry of the table equals LENGTH.  */

static int
count_mbs_length(offset_buffer, length)
     int *offset_buffer;
     int length;
{
  int upper, lower;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Binary search strictly between LOWER and UPPER.  LENGTH is a valid
     upper bound because offset_buffer[i] >= i for all i; the entry at
     LENGTH itself was examined just above.  */
  upper = length;
  lower = 0;

  for (;;)
    {
      /* Computed as lower + (upper - lower) / 2 rather than
         (lower + upper) / 2 so the sum cannot overflow `int' when
         LENGTH is large.  */
      int middle = lower + (upper - lower) / 2;

      /* No untested index is left between the bounds.  */
      if (middle == lower || middle == upper)
        break;

      if (offset_buffer[middle] > length)
        upper = middle;
      else if (offset_buffer[middle] < length)
        lower = middle;
      else
        return middle;
    }

  return -1;
}
5681#endif /* WCHAR */
5682
5683/* This is a separate function so that we can force an alloca cleanup
5684 afterwards. */
5685#ifdef WCHAR
5686static int
5687wcs_re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos,
5688 regs, stop, string1, size1, string2, size2,
5689 mbs_offset1, mbs_offset2)
5690 struct re_pattern_buffer *bufp;
5691 const char *cstring1, *cstring2;
5692 int csize1, csize2;
5693 int pos;
5694 struct re_registers *regs;
5695 int stop;
5696 /* string1 == string2 == NULL means string1/2, size1/2 and
5697 mbs_offset1/2 need seting up in this function. */
5698 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5699 wchar_t *string1, *string2;
5700 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5701 int size1, size2;
5702 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */
5703 int *mbs_offset1, *mbs_offset2;
5704#else /* BYTE */
5705static int
5706byte_re_match_2_internal (bufp, string1, size1,string2, size2, pos,
5707 regs, stop)
5708 struct re_pattern_buffer *bufp;
5709 const char *string1, *string2;
5710 int size1, size2;
5711 int pos;
5712 struct re_registers *regs;
5713 int stop;
5714#endif /* BYTE */
5715{
5716 /* General temporaries. */
5717 int mcnt;
5718 UCHAR_T *p1;
5719#ifdef WCHAR
5720 /* They hold whether each wchar_t is binary data or not. */
5721 char *is_binary = NULL;
5722 /* If true, we can't free string1/2, mbs_offset1/2. */
5723 int cant_free_wcs_buf = 1;
5724#endif /* WCHAR */
5725
5726 /* Just past the end of the corresponding string. */
5727 const CHAR_T *end1, *end2;
5728
5729 /* Pointers into string1 and string2, just past the last characters in
5730 each to consider matching. */
5731 const CHAR_T *end_match_1, *end_match_2;
5732
5733 /* Where we are in the data, and the end of the current string. */
5734 const CHAR_T *d, *dend;
5735
5736 /* Where we are in the pattern, and the end of the pattern. */
5737#ifdef WCHAR
5738 UCHAR_T *pattern, *p;
5739 register UCHAR_T *pend;
5740#else /* BYTE */
5741 UCHAR_T *p = bufp->buffer;
5742 register UCHAR_T *pend = p + bufp->used;
5743#endif /* WCHAR */
5744
5745 /* Mark the opcode just after a start_memory, so we can test for an
5746 empty subpattern when we get to the stop_memory. */
5747 UCHAR_T *just_past_start_mem = 0;
5748
5749 /* We use this to map every character in the string. */
5750 RE_TRANSLATE_TYPE translate = bufp->translate;
5751
5752 /* Failure point stack. Each place that can handle a failure further
5753 down the line pushes a failure point on this stack. It consists of
5754 restart, regend, and reg_info for all registers corresponding to
5755 the subexpressions we're currently inside, plus the number of such
5756 registers, and, finally, two char *'s. The first char * is where
5757 to resume scanning the pattern; the second one is where to resume
5758 scanning the strings. If the latter is zero, the failure point is
5759 a ``dummy''; if a failure happens and the failure point is a dummy,
5760 it gets discarded and the next next one is tried. */
5761#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5762 PREFIX(fail_stack_type) fail_stack;
5763#endif
5764#ifdef DEBUG
5765 static unsigned failure_id;
5766 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5767#endif
5768
5769#ifdef REL_ALLOC
5770 /* This holds the pointer to the failure stack, when
5771 it is allocated relocatably. */
5772 fail_stack_elt_t *failure_stack_ptr;
5773#endif
5774
5775 /* We fill all the registers internally, independent of what we
5776 return, for use in backreferences. The number here includes
5777 an element for register zero. */
5778 size_t num_regs = bufp->re_nsub + 1;
5779
5780 /* The currently active registers. */
5781 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5782 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5783
5784 /* Information on the contents of registers. These are pointers into
5785 the input strings; they record just what was matched (on this
5786 attempt) by a subexpression part of the pattern, that is, the
5787 regnum-th regstart pointer points to where in the pattern we began
5788 matching and the regnum-th regend points to right after where we
5789 stopped matching the regnum-th subexpression. (The zeroth register
5790 keeps track of what the whole pattern matches.) */
5791#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5792 const CHAR_T **regstart, **regend;
5793#endif
5794
5795 /* If a group that's operated upon by a repetition operator fails to
5796 match anything, then the register for its start will need to be
5797 restored because it will have been set to wherever in the string we
5798 are when we last see its open-group operator. Similarly for a
5799 register's end. */
5800#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5801 const CHAR_T **old_regstart, **old_regend;
5802#endif
5803
5804 /* The is_active field of reg_info helps us keep track of which (possibly
5805 nested) subexpressions we are currently in. The matched_something
5806 field of reg_info[reg_num] helps us tell whether or not we have
5807 matched any of the pattern so far this time through the reg_num-th
5808 subexpression. These two fields get reset each time through any
5809 loop their register is in. */
5810#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5811 PREFIX(register_info_type) *reg_info;
5812#endif
5813
5814 /* The following record the register info as found in the above
5815 variables when we find a match better than any we've seen before.
5816 This happens as we backtrack through the failure points, which in
5817 turn happens only if we have not yet matched the entire string. */
5818 unsigned best_regs_set = false;
5819#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5820 const CHAR_T **best_regstart, **best_regend;
5821#endif
5822
5823 /* Logically, this is `best_regend[0]'. But we don't want to have to
5824 allocate space for that if we're not allocating space for anything
5825 else (see below). Also, we never need info about register 0 for
5826 any of the other register vectors, and it seems rather a kludge to
5827 treat `best_regend' differently than the rest. So we keep track of
5828 the end of the best match so far in a separate variable. We
5829 initialize this to NULL so that when we backtrack the first time
5830 and need to test it, it's not garbage. */
5831 const CHAR_T *match_end = NULL;
5832
5833 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5834 int set_regs_matched_done = 0;
5835
5836 /* Used when we pop values we don't care about. */
5837#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5838 const CHAR_T **reg_dummy;
5839 PREFIX(register_info_type) *reg_info_dummy;
5840#endif
5841
5842#ifdef DEBUG
5843 /* Counts the total number of registers pushed. */
5844 unsigned num_regs_pushed = 0;
5845#endif
5846
5847 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5848
5849 INIT_FAIL_STACK ();
5850
5851#ifdef MATCH_MAY_ALLOCATE
5852 /* Do not bother to initialize all the register variables if there are
5853 no groups in the pattern, as it takes a fair amount of time. If
5854 there are groups, we include space for register 0 (the whole
5855 pattern), even though we never use it, since it simplifies the
5856 array indexing. We should fix this. */
5857 if (bufp->re_nsub)
5858 {
5859 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5860 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5861 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5862 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5863 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5864 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5865 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5866 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5867 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5868
5869 if (!(regstart && regend && old_regstart && old_regend && reg_info
5870 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5871 {
5872 FREE_VARIABLES ();
5873 return -2;
5874 }
5875 }
5876 else
5877 {
5878 /* We must initialize all our variables to NULL, so that
5879 `FREE_VARIABLES' doesn't try to free them. */
5880 regstart = regend = old_regstart = old_regend = best_regstart
5881 = best_regend = reg_dummy = NULL;
5882 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5883 }
5884#endif /* MATCH_MAY_ALLOCATE */
5885
5886 /* The starting position is bogus. */
5887#ifdef WCHAR
5888 if (pos < 0 || pos > csize1 + csize2)
5889#else /* BYTE */
5890 if (pos < 0 || pos > size1 + size2)
5891#endif
5892 {
5893 FREE_VARIABLES ();
5894 return -1;
5895 }
5896
5897#ifdef WCHAR
5898 /* Allocate wchar_t array for string1 and string2 and
5899 fill them with converted string. */
5900 if (string1 == NULL && string2 == NULL)
5901 {
5902 /* We need seting up buffers here. */
5903
5904 /* We must free wcs buffers in this function. */
5905 cant_free_wcs_buf = 0;
5906
5907 if (csize1 != 0)
5908 {
5909 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5910 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5911 is_binary = REGEX_TALLOC (csize1 + 1, char);
5912 if (!string1 || !mbs_offset1 || !is_binary)
5913 {
5914 FREE_VAR (string1);
5915 FREE_VAR (mbs_offset1);
5916 FREE_VAR (is_binary);
5917 return -2;
5918 }
5919 }
5920 if (csize2 != 0)
5921 {
5922 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5923 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5924 is_binary = REGEX_TALLOC (csize2 + 1, char);
5925 if (!string2 || !mbs_offset2 || !is_binary)
5926 {
5927 FREE_VAR (string1);
5928 FREE_VAR (mbs_offset1);
5929 FREE_VAR (string2);
5930 FREE_VAR (mbs_offset2);
5931 FREE_VAR (is_binary);
5932 return -2;
5933 }
5934 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5935 mbs_offset2, is_binary);
5936 string2[size2] = L'\0'; /* for a sentinel */
5937 FREE_VAR (is_binary);
5938 }
5939 }
5940
5941 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5942 pattern to (char*) in regex_compile. */
5943 p = pattern = (CHAR_T*)bufp->buffer;
5944 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5945
5946#endif /* WCHAR */
5947
5948 /* Initialize subexpression text positions to -1 to mark ones that no
5949 start_memory/stop_memory has been seen for. Also initialize the
5950 register information struct. */
5951 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5952 {
5953 regstart[mcnt] = regend[mcnt]
5954 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5955
5956 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5957 IS_ACTIVE (reg_info[mcnt]) = 0;
5958 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5959 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5960 }
5961
5962 /* We move `string1' into `string2' if the latter's empty -- but not if
5963 `string1' is null. */
5964 if (size2 == 0 && string1 != NULL)
5965 {
5966 string2 = string1;
5967 size2 = size1;
5968 string1 = 0;
5969 size1 = 0;
5970#ifdef WCHAR
5971 mbs_offset2 = mbs_offset1;
5972 csize2 = csize1;
5973 mbs_offset1 = NULL;
5974 csize1 = 0;
5975#endif
5976 }
5977 end1 = string1 + size1;
5978 end2 = string2 + size2;
5979
5980 /* Compute where to stop matching, within the two strings. */
5981#ifdef WCHAR
5982 if (stop <= csize1)
5983 {
5984 mcnt = count_mbs_length(mbs_offset1, stop);
5985 end_match_1 = string1 + mcnt;
5986 end_match_2 = string2;
5987 }
5988 else
5989 {
5990 if (stop > csize1 + csize2)
5991 stop = csize1 + csize2;
5992 end_match_1 = end1;
5993 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5994 end_match_2 = string2 + mcnt;
5995 }
5996 if (mcnt < 0)
5997 { /* count_mbs_length return error. */
5998 FREE_VARIABLES ();
5999 return -1;
6000 }
6001#else
6002 if (stop <= size1)
6003 {
6004 end_match_1 = string1 + stop;
6005 end_match_2 = string2;
6006 }
6007 else
6008 {
6009 end_match_1 = end1;
6010 end_match_2 = string2 + stop - size1;
6011 }
6012#endif /* WCHAR */
6013
6014 /* `p' scans through the pattern as `d' scans through the data.
6015 `dend' is the end of the input string that `d' points within. `d'
6016 is advanced into the following input string whenever necessary, but
6017 this happens before fetching; therefore, at the beginning of the
6018 loop, `d' can be pointing at the end of a string, but it cannot
6019 equal `string2'. */
6020#ifdef WCHAR
6021 if (size1 > 0 && pos <= csize1)
6022 {
6023 mcnt = count_mbs_length(mbs_offset1, pos);
6024 d = string1 + mcnt;
6025 dend = end_match_1;
6026 }
6027 else
6028 {
6029 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
6030 d = string2 + mcnt;
6031 dend = end_match_2;
6032 }
6033
6034 if (mcnt < 0)
6035 { /* count_mbs_length return error. */
6036 FREE_VARIABLES ();
6037 return -1;
6038 }
6039#else
6040 if (size1 > 0 && pos <= size1)
6041 {
6042 d = string1 + pos;
6043 dend = end_match_1;
6044 }
6045 else
6046 {
6047 d = string2 + pos - size1;
6048 dend = end_match_2;
6049 }
6050#endif /* WCHAR */
6051
6052 DEBUG_PRINT1 ("The compiled pattern is:\n");
6053 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
6054 DEBUG_PRINT1 ("The string to match is: `");
6055 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
6056 DEBUG_PRINT1 ("'\n");
6057
6058 /* This loops over pattern commands. It exits by returning from the
6059 function if the match is complete, or it drops through if the match
6060 fails at this starting point in the input data. */
6061 for (;;)
6062 {
6063#ifdef _LIBC
6064 DEBUG_PRINT2 ("\n%p: ", p);
6065#else
6066 DEBUG_PRINT2 ("\n0x%x: ", p);
6067#endif
6068
6069 if (p == pend)
6070 { /* End of pattern means we might have succeeded. */
6071 DEBUG_PRINT1 ("end of pattern ... ");
6072
6073 /* If we haven't matched the entire string, and we want the
6074 longest match, try backtracking. */
6075 if (d != end_match_2)
6076 {
6077 /* 1 if this match ends in the same string (string1 or string2)
6078 as the best previous match. */
6079 boolean same_str_p = (FIRST_STRING_P (match_end)
6080 == MATCHING_IN_FIRST_STRING);
6081 /* 1 if this match is the best seen so far. */
6082 boolean best_match_p;
6083
6084 /* AIX compiler got confused when this was combined
6085 with the previous declaration. */
6086 if (same_str_p)
6087 best_match_p = d > match_end;
6088 else
6089 best_match_p = !MATCHING_IN_FIRST_STRING;
6090
6091 DEBUG_PRINT1 ("backtracking.\n");
6092
6093 if (!FAIL_STACK_EMPTY ())
6094 { /* More failure points to try. */
6095
6096 /* If exceeds best match so far, save it. */
6097 if (!best_regs_set || best_match_p)
6098 {
6099 best_regs_set = true;
6100 match_end = d;
6101
6102 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
6103
6104 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6105 {
6106 best_regstart[mcnt] = regstart[mcnt];
6107 best_regend[mcnt] = regend[mcnt];
6108 }
6109 }
6110 goto fail;
6111 }
6112
6113 /* If no failure points, don't restore garbage. And if
6114 last match is real best match, don't restore second
6115 best one. */
6116 else if (best_regs_set && !best_match_p)
6117 {
6118 restore_best_regs:
6119 /* Restore best match. It may happen that `dend ==
6120 end_match_1' while the restored d is in string2.
6121 For example, the pattern `x.*y.*z' against the
6122 strings `x-' and `y-z-', if the two strings are
6123 not consecutive in memory. */
6124 DEBUG_PRINT1 ("Restoring best registers.\n");
6125
6126 d = match_end;
6127 dend = ((d >= string1 && d <= end1)
6128 ? end_match_1 : end_match_2);
6129
6130 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6131 {
6132 regstart[mcnt] = best_regstart[mcnt];
6133 regend[mcnt] = best_regend[mcnt];
6134 }
6135 }
6136 } /* d != end_match_2 */
6137
6138 succeed_label:
6139 DEBUG_PRINT1 ("Accepting match.\n");
6140 /* If caller wants register contents data back, do it. */
6141 if (regs && !bufp->no_sub)
6142 {
6143 /* Have the register data arrays been allocated? */
6144 if (bufp->regs_allocated == REGS_UNALLOCATED)
6145 { /* No. So allocate them with malloc. We need one
6146 extra element beyond `num_regs' for the `-1' marker
6147 GNU code uses. */
6148 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
6149 regs->start = TALLOC (regs->num_regs, regoff_t);
6150 regs->end = TALLOC (regs->num_regs, regoff_t);
6151 if (regs->start == NULL || regs->end == NULL)
6152 {
6153 FREE_VARIABLES ();
6154 return -2;
6155 }
6156 bufp->regs_allocated = REGS_REALLOCATE;
6157 }
6158 else if (bufp->regs_allocated == REGS_REALLOCATE)
6159 { /* Yes. If we need more elements than were already
6160 allocated, reallocate them. If we need fewer, just
6161 leave it alone. */
6162 if (regs->num_regs < num_regs + 1)
6163 {
6164 regs->num_regs = num_regs + 1;
6165 RETALLOC (regs->start, regs->num_regs, regoff_t);
6166 RETALLOC (regs->end, regs->num_regs, regoff_t);
6167 if (regs->start == NULL || regs->end == NULL)
6168 {
6169 FREE_VARIABLES ();
6170 return -2;
6171 }
6172 }
6173 }
6174 else
6175 {
6176 /* These braces fend off a "empty body in an else-statement"
6177 warning under GCC when assert expands to nothing. */
6178 assert (bufp->regs_allocated == REGS_FIXED);
6179 }
6180
6181 /* Convert the pointer data in `regstart' and `regend' to
6182 indices. Register zero has to be set differently,
6183 since we haven't kept track of any info for it. */
6184 if (regs->num_regs > 0)
6185 {
6186 regs->start[0] = pos;
6187#ifdef WCHAR
6188 if (MATCHING_IN_FIRST_STRING)
6189 regs->end[0] = mbs_offset1 != NULL ?
6190 mbs_offset1[d-string1] : 0;
6191 else
6192 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6193 mbs_offset2[d-string2] : 0);
6194#else
6195 regs->end[0] = (MATCHING_IN_FIRST_STRING
6196 ? ((regoff_t) (d - string1))
6197 : ((regoff_t) (d - string2 + size1)));
6198#endif /* WCHAR */
6199 }
6200
6201 /* Go through the first `min (num_regs, regs->num_regs)'
6202 registers, since that is all we initialized. */
6203 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6204 mcnt++)
6205 {
6206 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6207 regs->start[mcnt] = regs->end[mcnt] = -1;
6208 else
6209 {
6210 regs->start[mcnt]
6211 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6212 regs->end[mcnt]
6213 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6214 }
6215 }
6216
6217 /* If the regs structure we return has more elements than
6218 were in the pattern, set the extra elements to -1. If
6219 we (re)allocated the registers, this is the case,
6220 because we always allocate enough to have at least one
6221 -1 at the end. */
6222 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6223 regs->start[mcnt] = regs->end[mcnt] = -1;
6224 } /* regs && !bufp->no_sub */
6225
6226 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6227 nfailure_points_pushed, nfailure_points_popped,
6228 nfailure_points_pushed - nfailure_points_popped);
6229 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6230
6231#ifdef WCHAR
6232 if (MATCHING_IN_FIRST_STRING)
6233 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6234 else
6235 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6236 csize1;
6237 mcnt -= pos;
6238#else
6239 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6240 ? string1
6241 : string2 - size1);
6242#endif /* WCHAR */
6243
6244 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6245
6246 FREE_VARIABLES ();
6247 return mcnt;
6248 }
6249
6250 /* Otherwise match next pattern command. */
6251 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6252 {
6253 /* Ignore these. Used to ignore the n of succeed_n's which
6254 currently have n == 0. */
6255 case no_op:
6256 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6257 break;
6258
6259 case succeed:
6260 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6261 goto succeed_label;
6262
6263 /* Match the next n pattern characters exactly. The following
6264 byte in the pattern defines n, and the n bytes after that
6265 are the characters to match. */
6266 case exactn:
6267#ifdef MBS_SUPPORT
6268 case exactn_bin:
6269#endif
6270 mcnt = *p++;
6271 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6272
6273 /* This is written out as an if-else so we don't waste time
6274 testing `translate' inside the loop. */
6275 if (translate)
6276 {
6277 do
6278 {
6279 PREFETCH ();
6280#ifdef WCHAR
6281 if (*d <= 0xff)
6282 {
6283 if ((UCHAR_T) translate[(unsigned char) *d++]
6284 != (UCHAR_T) *p++)
6285 goto fail;
6286 }
6287 else
6288 {
6289 if (*d++ != (CHAR_T) *p++)
6290 goto fail;
6291 }
6292#else
6293 if ((UCHAR_T) translate[(unsigned char) *d++]
6294 != (UCHAR_T) *p++)
6295 goto fail;
6296#endif /* WCHAR */
6297 }
6298 while (--mcnt);
6299 }
6300 else
6301 {
6302 do
6303 {
6304 PREFETCH ();
6305 if (*d++ != (CHAR_T) *p++) goto fail;
6306 }
6307 while (--mcnt);
6308 }
6309 SET_REGS_MATCHED ();
6310 break;
6311
6312
6313 /* Match any character except possibly a newline or a null. */
6314 case anychar:
6315 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6316
6317 PREFETCH ();
6318
6319 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6320 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6321 goto fail;
6322
6323 SET_REGS_MATCHED ();
6324 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6325 d++;
6326 break;
6327
6328
6329 case charset:
6330 case charset_not:
6331 {
6332 register UCHAR_T c;
6333#ifdef WCHAR
6334 unsigned int i, char_class_length, coll_symbol_length,
6335 equiv_class_length, ranges_length, chars_length, length;
6336 CHAR_T *workp, *workp2, *charset_top;
6337#define WORK_BUFFER_SIZE 128
6338 CHAR_T str_buf[WORK_BUFFER_SIZE];
6339# ifdef _LIBC
6340 uint32_t nrules;
6341# endif /* _LIBC */
6342#endif /* WCHAR */
6343 boolean not = (re_opcode_t) *(p - 1) == charset_not;
6344
6345 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
6346 PREFETCH ();
6347 c = TRANSLATE (*d); /* The character to match. */
6348#ifdef WCHAR
6349# ifdef _LIBC
6350 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6351# endif /* _LIBC */
6352 charset_top = p - 1;
6353 char_class_length = *p++;
6354 coll_symbol_length = *p++;
6355 equiv_class_length = *p++;
6356 ranges_length = *p++;
6357 chars_length = *p++;
6358 /* p points charset[6], so the address of the next instruction
6359 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6360 where l=length of char_classes, m=length of collating_symbol,
6361 n=equivalence_class, o=length of char_range,
6362 p'=length of character. */
6363 workp = p;
6364 /* Update p to indicate the next instruction. */
6365 p += char_class_length + coll_symbol_length+ equiv_class_length +
6366 2*ranges_length + chars_length;
6367
6368 /* match with char_class? */
6369 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6370 {
6371 wctype_t wctype;
6372 uintptr_t alignedp = ((uintptr_t)workp
6373 + __alignof__(wctype_t) - 1)
6374 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6375 wctype = *((wctype_t*)alignedp);
6376 workp += CHAR_CLASS_SIZE;
6377# ifdef _LIBC
6378 if (__iswctype((wint_t)c, wctype))
6379 goto char_set_matched;
6380# else
6381 if (iswctype((wint_t)c, wctype))
6382 goto char_set_matched;
6383# endif
6384 }
6385
6386 /* match with collating_symbol? */
6387# ifdef _LIBC
6388 if (nrules != 0)
6389 {
6390 const unsigned char *extra = (const unsigned char *)
6391 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6392
6393 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6394 workp++)
6395 {
6396 int32_t *wextra;
6397 wextra = (int32_t*)(extra + *workp++);
6398 for (i = 0; i < *wextra; ++i)
6399 if (TRANSLATE(d[i]) != wextra[1 + i])
6400 break;
6401
6402 if (i == *wextra)
6403 {
6404 /* Update d, however d will be incremented at
6405 char_set_matched:, we decrement d here. */
6406 d += i - 1;
6407 goto char_set_matched;
6408 }
6409 }
6410 }
6411 else /* (nrules == 0) */
6412# endif
6413 /* If we can't look up collation data, we use wcscoll
6414 instead. */
6415 {
6416 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6417 {
6418 const CHAR_T *backup_d = d, *backup_dend = dend;
6419# ifdef _LIBC
6420 length = __wcslen (workp);
6421# else
6422 length = wcslen (workp);
6423# endif
6424
6425 /* If wcscoll(the collating symbol, whole string) > 0,
6426 no substring of the string can ever match the
6427 collating symbol. */
6428# ifdef _LIBC
6429 if (__wcscoll (workp, d) > 0)
6430# else
6431 if (wcscoll (workp, d) > 0)
6432# endif
6433 {
6434 workp += length + 1;
6435 continue;
6436 }
6437
6438 /* First, we compare the collating symbol with
6439 the first character of the string.
6440 If it doesn't match, we add the next character to
6441 the compare buffer in turn. */
6442 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6443 {
6444 int match;
6445 if (d == dend)
6446 {
6447 if (dend == end_match_2)
6448 break;
6449 d = string2;
6450 dend = end_match_2;
6451 }
6452
6453 /* add next character to the compare buffer. */
6454 str_buf[i] = TRANSLATE(*d);
6455 str_buf[i+1] = '\0';
6456
6457# ifdef _LIBC
6458 match = __wcscoll (workp, str_buf);
6459# else
6460 match = wcscoll (workp, str_buf);
6461# endif
6462 if (match == 0)
6463 goto char_set_matched;
6464
6465 if (match < 0)
6466 /* (str_buf > workp) implies (str_buf + X > workp),
6467 because for all X (str_buf + X > str_buf).
6468 So we don't need to continue this loop. */
6469 break;
6470
6471 /* Otherwise (str_buf < workp),
6472 (str_buf+next_character) may equal (workp).
6473 So we continue this loop. */
6474 }
6475 /* not matched */
6476 d = backup_d;
6477 dend = backup_dend;
6478 workp += length + 1;
6479 }
6480 }
6481 /* match with equivalence_class? */
6482# ifdef _LIBC
6483 if (nrules != 0)
6484 {
6485 const CHAR_T *backup_d = d, *backup_dend = dend;
6486 /* Try to match the equivalence class against
6487 those known to the collate implementation. */
6488 const int32_t *table;
6489 const int32_t *weights;
6490 const int32_t *extra;
6491 const int32_t *indirect;
6492 int32_t idx, idx2;
6493 wint_t *cp;
6494 size_t len;
6495
6496 /* This #include defines a local function! */
6497# include <locale/weightwc.h>
6498
6499 table = (const int32_t *)
6500 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6501 weights = (const wint_t *)
6502 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6503 extra = (const wint_t *)
6504 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6505 indirect = (const int32_t *)
6506 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6507
6508 /* Write 1 collating element to str_buf, and
6509 get its index. */
6510 idx2 = 0;
6511
6512 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6513 {
6514 cp = (wint_t*)str_buf;
6515 if (d == dend)
6516 {
6517 if (dend == end_match_2)
6518 break;
6519 d = string2;
6520 dend = end_match_2;
6521 }
6522 str_buf[i] = TRANSLATE(*(d+i));
6523 str_buf[i+1] = '\0'; /* sentinel */
6524 idx2 = findidx ((const wint_t**)&cp);
6525 }
6526
6527 /* Update d, however d will be incremented at
6528 char_set_matched:, we decrement d here. */
6529 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6530 if (d >= dend)
6531 {
6532 if (dend == end_match_2)
6533 d = dend;
6534 else
6535 {
6536 d = string2;
6537 dend = end_match_2;
6538 }
6539 }
6540
6541 len = weights[idx2];
6542
6543 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6544 workp++)
6545 {
6546 idx = (int32_t)*workp;
6547 /* We already checked idx != 0 in regex_compile. */
6548
6549 if (idx2 != 0 && len == weights[idx])
6550 {
6551 int cnt = 0;
6552 while (cnt < len && (weights[idx + 1 + cnt]
6553 == weights[idx2 + 1 + cnt]))
6554 ++cnt;
6555
6556 if (cnt == len)
6557 goto char_set_matched;
6558 }
6559 }
6560 /* not matched */
6561 d = backup_d;
6562 dend = backup_dend;
6563 }
6564 else /* (nrules == 0) */
6565# endif
6566 /* If we can't look up collation data, we use wcscoll
6567 instead. */
6568 {
6569 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6570 {
6571 const CHAR_T *backup_d = d, *backup_dend = dend;
6572# ifdef _LIBC
6573 length = __wcslen (workp);
6574# else
6575 length = wcslen (workp);
6576# endif
6577
6578 /* If wcscoll(the equivalence class, whole string) > 0,
6579 no substring of the string can ever match the
6580 equivalence class. */
6581# ifdef _LIBC
6582 if (__wcscoll (workp, d) > 0)
6583# else
6584 if (wcscoll (workp, d) > 0)
6585# endif
6586 {
6587 workp += length + 1;
6588 break;
6589 }
6590
6591 /* First, we compare the equivalence class with
6592 the first character of the string.
6593 If it doesn't match, we add the next character to
6594 the compare buffer in turn. */
6595 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6596 {
6597 int match;
6598 if (d == dend)
6599 {
6600 if (dend == end_match_2)
6601 break;
6602 d = string2;
6603 dend = end_match_2;
6604 }
6605
6606 /* add next character to the compare buffer. */
6607 str_buf[i] = TRANSLATE(*d);
6608 str_buf[i+1] = '\0';
6609
6610# ifdef _LIBC
6611 match = __wcscoll (workp, str_buf);
6612# else
6613 match = wcscoll (workp, str_buf);
6614# endif
6615
6616 if (match == 0)
6617 goto char_set_matched;
6618
6619 if (match < 0)
6620 /* (str_buf > workp) implies (str_buf + X > workp),
6621 because for all X (str_buf + X > str_buf).
6622 So we don't need to continue this loop. */
6623 break;
6624
6625 /* Otherwise (str_buf < workp),
6626 (str_buf+next_character) may equal (workp).
6627 So we continue this loop. */
6628 }
6629 /* not matched */
6630 d = backup_d;
6631 dend = backup_dend;
6632 workp += length + 1;
6633 }
6634 }
6635
6636 /* match with char_range? */
6637# ifdef _LIBC
6638 if (nrules != 0)
6639 {
6640 uint32_t collseqval;
6641 const char *collseq = (const char *)
6642 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6643
6644 collseqval = collseq_table_lookup (collseq, c);
6645
6646 for (; workp < p - chars_length ;)
6647 {
6648 uint32_t start_val, end_val;
6649
6650 /* We already compute the collation sequence value
6651 of the characters (or collating symbols). */
6652 start_val = (uint32_t) *workp++; /* range_start */
6653 end_val = (uint32_t) *workp++; /* range_end */
6654
6655 if (start_val <= collseqval && collseqval <= end_val)
6656 goto char_set_matched;
6657 }
6658 }
6659 else
6660# endif
6661 {
6662 /* We set range_start_char at str_buf[0], range_end_char
6663 at str_buf[4], and compared char at str_buf[2]. */
6664 str_buf[1] = 0;
6665 str_buf[2] = c;
6666 str_buf[3] = 0;
6667 str_buf[5] = 0;
6668 for (; workp < p - chars_length ;)
6669 {
6670 wchar_t *range_start_char, *range_end_char;
6671
6672 /* match if (range_start_char <= c <= range_end_char). */
6673
6674 /* If range_start(or end) < 0, we assume -range_start(end)
6675 is the offset of the collating symbol which is specified
6676 as the character of the range start(end). */
6677
6678 /* range_start */
6679 if (*workp < 0)
6680 range_start_char = charset_top - (*workp++);
6681 else
6682 {
6683 str_buf[0] = *workp++;
6684 range_start_char = str_buf;
6685 }
6686
6687 /* range_end */
6688 if (*workp < 0)
6689 range_end_char = charset_top - (*workp++);
6690 else
6691 {
6692 str_buf[4] = *workp++;
6693 range_end_char = str_buf + 4;
6694 }
6695
6696# ifdef _LIBC
6697 if (__wcscoll (range_start_char, str_buf+2) <= 0
6698 && __wcscoll (str_buf+2, range_end_char) <= 0)
6699# else
6700 if (wcscoll (range_start_char, str_buf+2) <= 0
6701 && wcscoll (str_buf+2, range_end_char) <= 0)
6702# endif
6703 goto char_set_matched;
6704 }
6705 }
6706
6707 /* match with char? */
6708 for (; workp < p ; workp++)
6709 if (c == *workp)
6710 goto char_set_matched;
6711
6712 not = !not;
6713
6714 char_set_matched:
6715 if (not) goto fail;
6716#else
6717 /* Cast to `unsigned' instead of `unsigned char' in case the
6718 bit list is a full 32 bytes long. */
6719 if (c < (unsigned) (*p * BYTEWIDTH)
6720 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6721 not = !not;
6722
6723 p += 1 + *p;
6724
6725 if (!not) goto fail;
6726#undef WORK_BUFFER_SIZE
6727#endif /* WCHAR */
6728 SET_REGS_MATCHED ();
6729 d++;
6730 break;
6731 }
6732
6733
6734 /* The beginning of a group is represented by start_memory.
6735 The arguments are the register number in the next byte, and the
6736 number of groups inner to this one in the next. The text
6737 matched within the group is recorded (in the internal
6738 registers data structure) under the register number. */
6739 case start_memory:
6740 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6741 (long int) *p, (long int) p[1]);
6742
6743 /* Find out if this group can match the empty string. */
6744 p1 = p; /* To send to group_match_null_string_p. */
6745
6746 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6747 REG_MATCH_NULL_STRING_P (reg_info[*p])
6748 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6749
6750 /* Save the position in the string where we were the last time
6751 we were at this open-group operator in case the group is
6752 operated upon by a repetition operator, e.g., with `(a*)*b'
6753 against `ab'; then we want to ignore where we are now in
6754 the string in case this attempt to match fails. */
6755 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6756 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6757 : regstart[*p];
6758 DEBUG_PRINT2 (" old_regstart: %d\n",
6759 POINTER_TO_OFFSET (old_regstart[*p]));
6760
6761 regstart[*p] = d;
6762 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6763
6764 IS_ACTIVE (reg_info[*p]) = 1;
6765 MATCHED_SOMETHING (reg_info[*p]) = 0;
6766
6767 /* Clear this whenever we change the register activity status. */
6768 set_regs_matched_done = 0;
6769
6770 /* This is the new highest active register. */
6771 highest_active_reg = *p;
6772
6773 /* If nothing was active before, this is the new lowest active
6774 register. */
6775 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6776 lowest_active_reg = *p;
6777
6778 /* Move past the register number and inner group count. */
6779 p += 2;
6780 just_past_start_mem = p;
6781
6782 break;
6783
6784
6785 /* The stop_memory opcode represents the end of a group. Its
6786 arguments are the same as start_memory's: the register
6787 number, and the number of inner groups. */
6788 case stop_memory:
6789 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6790 (long int) *p, (long int) p[1]);
6791
6792 /* We need to save the string position the last time we were at
6793 this close-group operator in case the group is operated
6794 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6795 against `aba'; then we want to ignore where we are now in
6796 the string in case this attempt to match fails. */
6797 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6798 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6799 : regend[*p];
6800 DEBUG_PRINT2 (" old_regend: %d\n",
6801 POINTER_TO_OFFSET (old_regend[*p]));
6802
6803 regend[*p] = d;
6804 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6805
6806 /* This register isn't active anymore. */
6807 IS_ACTIVE (reg_info[*p]) = 0;
6808
6809 /* Clear this whenever we change the register activity status. */
6810 set_regs_matched_done = 0;
6811
6812 /* If this was the only register active, nothing is active
6813 anymore. */
6814 if (lowest_active_reg == highest_active_reg)
6815 {
6816 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6817 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6818 }
6819 else
6820 { /* We must scan for the new highest active register, since
6821 it isn't necessarily one less than now: consider
6822 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6823 new highest active register is 1. */
6824 UCHAR_T r = *p - 1;
6825 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6826 r--;
6827
6828 /* If we end up at register zero, that means that we saved
6829 the registers as the result of an `on_failure_jump', not
6830 a `start_memory', and we jumped to past the innermost
6831 `stop_memory'. For example, in ((.)*) we save
6832 registers 1 and 2 as a result of the *, but when we pop
6833 back to the second ), we are at the stop_memory 1.
6834 Thus, nothing is active. */
6835 if (r == 0)
6836 {
6837 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6838 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6839 }
6840 else
6841 highest_active_reg = r;
6842 }
6843
6844 /* If just failed to match something this time around with a
6845 group that's operated on by a repetition operator, try to
6846 force exit from the ``loop'', and restore the register
6847 information for this group that we had before trying this
6848 last match. */
6849 if ((!MATCHED_SOMETHING (reg_info[*p])
6850 || just_past_start_mem == p - 1)
6851 && (p + 2) < pend)
6852 {
6853 boolean is_a_jump_n = false;
6854
6855 p1 = p + 2;
6856 mcnt = 0;
6857 switch ((re_opcode_t) *p1++)
6858 {
6859 case jump_n:
6860 is_a_jump_n = true;
6861 case pop_failure_jump:
6862 case maybe_pop_jump:
6863 case jump:
6864 case dummy_failure_jump:
6865 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6866 if (is_a_jump_n)
6867 p1 += OFFSET_ADDRESS_SIZE;
6868 break;
6869
6870 default:
6871 /* do nothing */ ;
6872 }
6873 p1 += mcnt;
6874
6875 /* If the next operation is a jump backwards in the pattern
6876 to an on_failure_jump right before the start_memory
6877 corresponding to this stop_memory, exit from the loop
6878 by forcing a failure after pushing on the stack the
6879 on_failure_jump's jump in the pattern, and d. */
6880 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6881 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6882 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6883 {
6884 /* If this group ever matched anything, then restore
6885 what its registers were before trying this last
6886 failed match, e.g., with `(a*)*b' against `ab' for
6887 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6888 against `aba' for regend[3].
6889
6890 Also restore the registers for inner groups for,
6891 e.g., `((a*)(b*))*' against `aba' (register 3 would
6892 otherwise get trashed). */
6893
6894 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6895 {
6896 unsigned r;
6897
6898 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6899
6900 /* Restore this and inner groups' (if any) registers. */
6901 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6902 r++)
6903 {
6904 regstart[r] = old_regstart[r];
6905
6906 /* xx why this test? */
6907 if (old_regend[r] >= regstart[r])
6908 regend[r] = old_regend[r];
6909 }
6910 }
6911 p1++;
6912 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6913 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6914
6915 goto fail;
6916 }
6917 }
6918
6919 /* Move past the register number and the inner group count. */
6920 p += 2;
6921 break;
6922
6923
6924 /* \<digit> has been turned into a `duplicate' command which is
6925 followed by the numeric value of <digit> as the register number. */
6926 case duplicate:
6927 {
6928 register const CHAR_T *d2, *dend2;
6929 int regno = *p++; /* Get which register to match against. */
6930 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6931
6932 /* Can't back reference a group which we've never matched. */
6933 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6934 goto fail;
6935
6936 /* Where in input to try to start matching. */
6937 d2 = regstart[regno];
6938
6939 /* Where to stop matching; if both the place to start and
6940 the place to stop matching are in the same string, then
6941 set to the place to stop, otherwise, for now have to use
6942 the end of the first string. */
6943
6944 dend2 = ((FIRST_STRING_P (regstart[regno])
6945 == FIRST_STRING_P (regend[regno]))
6946 ? regend[regno] : end_match_1);
6947 for (;;)
6948 {
6949 /* If necessary, advance to next segment in register
6950 contents. */
6951 while (d2 == dend2)
6952 {
6953 if (dend2 == end_match_2) break;
6954 if (dend2 == regend[regno]) break;
6955
6956 /* End of string1 => advance to string2. */
6957 d2 = string2;
6958 dend2 = regend[regno];
6959 }
6960 /* At end of register contents => success */
6961 if (d2 == dend2) break;
6962
6963 /* If necessary, advance to next segment in data. */
6964 PREFETCH ();
6965
6966 /* How many characters left in this segment to match. */
6967 mcnt = dend - d;
6968
6969 /* Want how many consecutive characters we can match in
6970 one shot, so, if necessary, adjust the count. */
6971 if (mcnt > dend2 - d2)
6972 mcnt = dend2 - d2;
6973
6974 /* Compare that many; failure if mismatch, else move
6975 past them. */
6976 if (translate
6977 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6978 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6979 goto fail;
6980 d += mcnt, d2 += mcnt;
6981
6982 /* Do this because we've matched some characters. */
6983 SET_REGS_MATCHED ();
6984 }
6985 }
6986 break;
6987
6988
6989 /* begline matches the empty string at the beginning of the string
6990 (unless `not_bol' is set in `bufp'), and, if
6991 `newline_anchor' is set, after newlines. */
6992 case begline:
6993 DEBUG_PRINT1 ("EXECUTING begline.\n");
6994
6995 if (AT_STRINGS_BEG (d))
6996 {
6997 if (!bufp->not_bol) break;
6998 }
6999 else if (d[-1] == '\n' && bufp->newline_anchor)
7000 {
7001 break;
7002 }
7003 /* In all other cases, we fail. */
7004 goto fail;
7005
7006
7007 /* endline is the dual of begline. */
7008 case endline:
7009 DEBUG_PRINT1 ("EXECUTING endline.\n");
7010
7011 if (AT_STRINGS_END (d))
7012 {
7013 if (!bufp->not_eol) break;
7014 }
7015
7016 /* We have to ``prefetch'' the next character. */
7017 else if ((d == end1 ? *string2 : *d) == '\n'
7018 && bufp->newline_anchor)
7019 {
7020 break;
7021 }
7022 goto fail;
7023
7024
7025 /* Match at the very beginning of the data. */
7026 case begbuf:
7027 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
7028 if (AT_STRINGS_BEG (d))
7029 break;
7030 goto fail;
7031
7032
7033 /* Match at the very end of the data. */
7034 case endbuf:
7035 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
7036 if (AT_STRINGS_END (d))
7037 break;
7038 goto fail;
7039
7040
7041 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
7042 pushes NULL as the value for the string on the stack. Then
7043 `pop_failure_point' will keep the current value for the
7044 string, instead of restoring it. To see why, consider
7045 matching `foo\nbar' against `.*\n'. The .* matches the foo;
7046 then the . fails against the \n. But the next thing we want
7047 to do is match the \n against the \n; if we restored the
7048 string value, we would be back at the foo.
7049
7050 Because this is used only in specific cases, we don't need to
7051 check all the things that `on_failure_jump' does, to make
7052 sure the right things get saved on the stack. Hence we don't
7053 share its code. The only reason to push anything on the
7054 stack at all is that otherwise we would have to change
7055 `anychar's code to do something besides goto fail in this
7056 case; that seems worse than this. */
7057 case on_failure_keep_string_jump:
7058 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
7059
7060 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7061#ifdef _LIBC
7062 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
7063#else
7064 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
7065#endif
7066
7067 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
7068 break;
7069
7070
7071 /* Uses of on_failure_jump:
7072
7073 Each alternative starts with an on_failure_jump that points
7074 to the beginning of the next alternative. Each alternative
7075 except the last ends with a jump that in effect jumps past
7076 the rest of the alternatives. (They really jump to the
7077 ending jump of the following alternative, because tensioning
7078 these jumps is a hassle.)
7079
7080 Repeats start with an on_failure_jump that points past both
7081 the repetition text and either the following jump or
7082 pop_failure_jump back to this on_failure_jump. */
7083 case on_failure_jump:
7084 on_failure:
7085 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
7086
7087 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7088#ifdef _LIBC
7089 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
7090#else
7091 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
7092#endif
7093
7094 /* If this on_failure_jump comes right before a group (i.e.,
7095 the original * applied to a group), save the information
7096 for that group and all inner ones, so that if we fail back
7097 to this point, the group's information will be correct.
7098 For example, in \(a*\)*\1, we need the preceding group,
7099 and in \(zz\(a*\)b*\)\2, we need the inner group. */
7100
7101 /* We can't use `p' to check ahead because we push
7102 a failure point to `p + mcnt' after we do this. */
7103 p1 = p;
7104
7105 /* We need to skip no_op's before we look for the
7106 start_memory in case this on_failure_jump is happening as
7107 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
7108 against aba. */
7109 while (p1 < pend && (re_opcode_t) *p1 == no_op)
7110 p1++;
7111
7112 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
7113 {
7114 /* We have a new highest active register now. This will
7115 get reset at the start_memory we are about to get to,
7116 but we will have saved all the registers relevant to
7117 this repetition op, as described above. */
7118 highest_active_reg = *(p1 + 1) + *(p1 + 2);
7119 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
7120 lowest_active_reg = *(p1 + 1);
7121 }
7122
7123 DEBUG_PRINT1 (":\n");
7124 PUSH_FAILURE_POINT (p + mcnt, d, -2);
7125 break;
7126
7127
7128 /* A smart repeat ends with `maybe_pop_jump'.
7129 We change it to either `pop_failure_jump' or `jump'. */
7130 case maybe_pop_jump:
7131 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7132 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
7133 {
7134 register UCHAR_T *p2 = p;
7135
7136 /* Compare the beginning of the repeat with what in the
7137 pattern follows its end. If we can establish that there
7138 is nothing that they would both match, i.e., that we
7139 would have to backtrack because of (as in, e.g., `a*a')
7140 then we can change to pop_failure_jump, because we'll
7141 never have to backtrack.
7142
7143 This is not true in the case of alternatives: in
7144 `(a|ab)*' we do need to backtrack to the `ab' alternative
7145 (e.g., if the string was `ab'). But instead of trying to
7146 detect that here, the alternative has put on a dummy
7147 failure point which is what we will end up popping. */
7148
7149 /* Skip over open/close-group commands.
7150 If what follows this loop is a ...+ construct,
7151 look at what begins its body, since we will have to
7152 match at least one of that. */
7153 while (1)
7154 {
7155 if (p2 + 2 < pend
7156 && ((re_opcode_t) *p2 == stop_memory
7157 || (re_opcode_t) *p2 == start_memory))
7158 p2 += 3;
7159 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7160 && (re_opcode_t) *p2 == dummy_failure_jump)
7161 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7162 else
7163 break;
7164 }
7165
7166 p1 = p + mcnt;
7167 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7168 to the `maybe_pop_jump' of this case. Examine what
7169 follows. */
7170
7171 /* If we're at the end of the pattern, we can change. */
7172 if (p2 == pend)
7173 {
7174 /* Consider what happens when matching ":\(.*\)"
7175 against ":/". I don't really understand this code
7176 yet. */
7177 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7178 pop_failure_jump;
7179 DEBUG_PRINT1
7180 (" End of pattern: change to `pop_failure_jump'.\n");
7181 }
7182
7183 else if ((re_opcode_t) *p2 == exactn
7184#ifdef MBS_SUPPORT
7185 || (re_opcode_t) *p2 == exactn_bin
7186#endif
7187 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7188 {
7189 register UCHAR_T c
7190 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7191
7192 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7193#ifdef MBS_SUPPORT
7194 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7195#endif
7196 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7197 {
7198 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7199 pop_failure_jump;
7200#ifdef WCHAR
7201 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7202 (wint_t) c,
7203 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7204#else
7205 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7206 (char) c,
7207 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7208#endif
7209 }
7210
7211#ifndef WCHAR
7212 else if ((re_opcode_t) p1[3] == charset
7213 || (re_opcode_t) p1[3] == charset_not)
7214 {
7215 int not = (re_opcode_t) p1[3] == charset_not;
7216
7217 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7218 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7219 not = !not;
7220
7221 /* `not' is equal to 1 if c would match, which means
7222 that we can't change to pop_failure_jump. */
7223 if (!not)
7224 {
7225 p[-3] = (unsigned char) pop_failure_jump;
7226 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7227 }
7228 }
7229#endif /* not WCHAR */
7230 }
7231#ifndef WCHAR
7232 else if ((re_opcode_t) *p2 == charset)
7233 {
7234 /* We win if the first character of the loop is not part
7235 of the charset. */
7236 if ((re_opcode_t) p1[3] == exactn
7237 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7238 && (p2[2 + p1[5] / BYTEWIDTH]
7239 & (1 << (p1[5] % BYTEWIDTH)))))
7240 {
7241 p[-3] = (unsigned char) pop_failure_jump;
7242 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7243 }
7244
7245 else if ((re_opcode_t) p1[3] == charset_not)
7246 {
7247 int idx;
7248 /* We win if the charset_not inside the loop
7249 lists every character listed in the charset after. */
7250 for (idx = 0; idx < (int) p2[1]; idx++)
7251 if (! (p2[2 + idx] == 0
7252 || (idx < (int) p1[4]
7253 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7254 break;
7255
7256 if (idx == p2[1])
7257 {
7258 p[-3] = (unsigned char) pop_failure_jump;
7259 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7260 }
7261 }
7262 else if ((re_opcode_t) p1[3] == charset)
7263 {
7264 int idx;
7265 /* We win if the charset inside the loop
7266 has no overlap with the one after the loop. */
7267 for (idx = 0;
7268 idx < (int) p2[1] && idx < (int) p1[4];
7269 idx++)
7270 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7271 break;
7272
7273 if (idx == p2[1] || idx == p1[4])
7274 {
7275 p[-3] = (unsigned char) pop_failure_jump;
7276 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7277 }
7278 }
7279 }
7280#endif /* not WCHAR */
7281 }
7282 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7283 if ((re_opcode_t) p[-1] != pop_failure_jump)
7284 {
7285 p[-1] = (UCHAR_T) jump;
7286 DEBUG_PRINT1 (" Match => jump.\n");
7287 goto unconditional_jump;
7288 }
7289 /* Note fall through. */
7290
7291
7292 /* The end of a simple repeat has a pop_failure_jump back to
7293 its matching on_failure_jump, where the latter will push a
7294 failure point. The pop_failure_jump takes off failure
7295 points put on by this pop_failure_jump's matching
7296 on_failure_jump; we got through the pattern to here from the
7297 matching on_failure_jump, so didn't fail. */
7298 case pop_failure_jump:
7299 {
7300 /* We need to pass separate storage for the lowest and
7301 highest registers, even though we don't care about the
7302 actual values. Otherwise, we will restore only one
7303 register from the stack, since lowest will == highest in
7304 `pop_failure_point'. */
7305 active_reg_t dummy_low_reg, dummy_high_reg;
7306 UCHAR_T *pdummy = NULL;
7307 const CHAR_T *sdummy = NULL;
7308
7309 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7310 POP_FAILURE_POINT (sdummy, pdummy,
7311 dummy_low_reg, dummy_high_reg,
7312 reg_dummy, reg_dummy, reg_info_dummy);
7313 }
7314 /* Note fall through. */
7315
7316 unconditional_jump:
7317#ifdef _LIBC
7318 DEBUG_PRINT2 ("\n%p: ", p);
7319#else
7320 DEBUG_PRINT2 ("\n0x%x: ", p);
7321#endif
7322 /* Note fall through. */
7323
7324 /* Unconditionally jump (without popping any failure points). */
7325 case jump:
7326 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7327 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7328 p += mcnt; /* Do the jump. */
7329#ifdef _LIBC
7330 DEBUG_PRINT2 ("(to %p).\n", p);
7331#else
7332 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7333#endif
7334 break;
7335
7336
7337 /* We need this opcode so we can detect where alternatives end
7338 in `group_match_null_string_p' et al. */
7339 case jump_past_alt:
7340 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7341 goto unconditional_jump;
7342
7343
7344 /* Normally, the on_failure_jump pushes a failure point, which
7345 then gets popped at pop_failure_jump. We will end up at
7346 pop_failure_jump, also, and with a pattern of, say, `a+', we
7347 are skipping over the on_failure_jump, so we have to push
7348 something meaningless for pop_failure_jump to pop. */
7349 case dummy_failure_jump:
7350 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7351 /* It doesn't matter what we push for the string here. What
7352 the code at `fail' tests is the value for the pattern. */
7353 PUSH_FAILURE_POINT (NULL, NULL, -2);
7354 goto unconditional_jump;
7355
7356
7357 /* At the end of an alternative, we need to push a dummy failure
7358 point in case we are followed by a `pop_failure_jump', because
7359 we don't want the failure point for the alternative to be
7360 popped. For example, matching `(a|ab)*' against `aab'
7361 requires that we match the `ab' alternative. */
7362 case push_dummy_failure:
7363 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7364 /* See comments just above at `dummy_failure_jump' about the
7365 two zeroes. */
7366 PUSH_FAILURE_POINT (NULL, NULL, -2);
7367 break;
7368
7369 /* Have to succeed matching what follows at least n times.
7370 After that, handle like `on_failure_jump'. */
7371 case succeed_n:
7372 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7373 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7374
7375 assert (mcnt >= 0);
7376 /* Originally, this is how many times we HAVE to succeed. */
7377 if (mcnt > 0)
7378 {
7379 mcnt--;
7380 p += OFFSET_ADDRESS_SIZE;
7381 STORE_NUMBER_AND_INCR (p, mcnt);
7382#ifdef _LIBC
7383 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7384 , mcnt);
7385#else
7386 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7387 , mcnt);
7388#endif
7389 }
7390 else if (mcnt == 0)
7391 {
7392#ifdef _LIBC
7393 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7394 p + OFFSET_ADDRESS_SIZE);
7395#else
7396 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7397 p + OFFSET_ADDRESS_SIZE);
7398#endif /* _LIBC */
7399
7400#ifdef WCHAR
7401 p[1] = (UCHAR_T) no_op;
7402#else
7403 p[2] = (UCHAR_T) no_op;
7404 p[3] = (UCHAR_T) no_op;
7405#endif /* WCHAR */
7406 goto on_failure;
7407 }
7408 break;
7409
7410 case jump_n:
7411 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7412 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7413
7414 /* Originally, this is how many times we CAN jump. */
7415 if (mcnt)
7416 {
7417 mcnt--;
7418 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7419
7420#ifdef _LIBC
7421 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7422 mcnt);
7423#else
7424 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7425 mcnt);
7426#endif /* _LIBC */
7427 goto unconditional_jump;
7428 }
7429 /* If don't have to jump any more, skip over the rest of command. */
7430 else
7431 p += 2 * OFFSET_ADDRESS_SIZE;
7432 break;
7433
7434 case set_number_at:
7435 {
7436 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7437
7438 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7439 p1 = p + mcnt;
7440 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7441#ifdef _LIBC
7442 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7443#else
7444 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7445#endif
7446 STORE_NUMBER (p1, mcnt);
7447 break;
7448 }
7449
7450#if 0
7451 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7452 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7453 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7454 macro and introducing temporary variables works around the bug. */
7455
7456 case wordbound:
7457 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7458 if (AT_WORD_BOUNDARY (d))
7459 break;
7460 goto fail;
7461
7462 case notwordbound:
7463 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7464 if (AT_WORD_BOUNDARY (d))
7465 goto fail;
7466 break;
7467#else
7468 case wordbound:
7469 {
7470 boolean prevchar, thischar;
7471
7472 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7473 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7474 break;
7475
7476 prevchar = WORDCHAR_P (d - 1);
7477 thischar = WORDCHAR_P (d);
7478 if (prevchar != thischar)
7479 break;
7480 goto fail;
7481 }
7482
7483 case notwordbound:
7484 {
7485 boolean prevchar, thischar;
7486
7487 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7488 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7489 goto fail;
7490
7491 prevchar = WORDCHAR_P (d - 1);
7492 thischar = WORDCHAR_P (d);
7493 if (prevchar != thischar)
7494 goto fail;
7495 break;
7496 }
7497#endif
7498
7499 case wordbeg:
7500 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7501 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7502 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7503 break;
7504 goto fail;
7505
7506 case wordend:
7507 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7508 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7509 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7510 break;
7511 goto fail;
7512
7513#ifdef emacs
7514 case before_dot:
7515 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7516 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7517 goto fail;
7518 break;
7519
7520 case at_dot:
7521 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7522 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7523 goto fail;
7524 break;
7525
7526 case after_dot:
7527 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7528 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7529 goto fail;
7530 break;
7531
7532 case syntaxspec:
7533 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7534 mcnt = *p++;
7535 goto matchsyntax;
7536
7537 case wordchar:
7538 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7539 mcnt = (int) Sword;
7540 matchsyntax:
7541 PREFETCH ();
7542 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7543 d++;
7544 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7545 goto fail;
7546 SET_REGS_MATCHED ();
7547 break;
7548
7549 case notsyntaxspec:
7550 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7551 mcnt = *p++;
7552 goto matchnotsyntax;
7553
7554 case notwordchar:
7555 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7556 mcnt = (int) Sword;
7557 matchnotsyntax:
7558 PREFETCH ();
7559 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7560 d++;
7561 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7562 goto fail;
7563 SET_REGS_MATCHED ();
7564 break;
7565
7566#else /* not emacs */
7567 case wordchar:
7568 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7569 PREFETCH ();
7570 if (!WORDCHAR_P (d))
7571 goto fail;
7572 SET_REGS_MATCHED ();
7573 d++;
7574 break;
7575
7576 case notwordchar:
7577 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7578 PREFETCH ();
7579 if (WORDCHAR_P (d))
7580 goto fail;
7581 SET_REGS_MATCHED ();
7582 d++;
7583 break;
7584#endif /* not emacs */
7585
7586 default:
7587 abort ();
7588 }
7589 continue; /* Successfully executed one pattern command; keep going. */
7590
7591
7592 /* We goto here if a matching operation fails. */
7593 fail:
7594 if (!FAIL_STACK_EMPTY ())
7595 { /* A restart point is known. Restore to that state. */
7596 DEBUG_PRINT1 ("\nFAIL:\n");
7597 POP_FAILURE_POINT (d, p,
7598 lowest_active_reg, highest_active_reg,
7599 regstart, regend, reg_info);
7600
7601 /* If this failure point is a dummy, try the next one. */
7602 if (!p)
7603 goto fail;
7604
7605 /* If we failed to the end of the pattern, don't examine *p. */
7606 assert (p <= pend);
7607 if (p < pend)
7608 {
7609 boolean is_a_jump_n = false;
7610
7611 /* If failed to a backwards jump that's part of a repetition
7612 loop, need to pop this failure point and use the next one. */
7613 switch ((re_opcode_t) *p)
7614 {
7615 case jump_n:
7616 is_a_jump_n = true;
7617 case maybe_pop_jump:
7618 case pop_failure_jump:
7619 case jump:
7620 p1 = p + 1;
7621 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7622 p1 += mcnt;
7623
7624 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7625 || (!is_a_jump_n
7626 && (re_opcode_t) *p1 == on_failure_jump))
7627 goto fail;
7628 break;
7629 default:
7630 /* do nothing */ ;
7631 }
7632 }
7633
7634 if (d >= string1 && d <= end1)
7635 dend = end_match_1;
7636 }
7637 else
7638 break; /* Matching at this starting point really fails. */
7639 } /* for (;;) */
7640
7641 if (best_regs_set)
7642 goto restore_best_regs;
7643
7644 FREE_VARIABLES ();
7645
7646 return -1; /* Failure to match. */
7647} /* re_match_2 */
7648
7649
7650/* Subroutine definitions for re_match_2. */
7651
7652
7653/* We are passed P pointing to a register number after a start_memory.
7654
7655 Return true if the pattern up to the corresponding stop_memory can
7656 match the empty string, and false otherwise.
7657
7658 If we find the matching stop_memory, sets P to point to one past its number.
7659 Otherwise, sets P to an undefined byte less than or equal to END.
7660
7661 We don't handle duplicates properly (yet). */
7662
7663static boolean
7664PREFIX(group_match_null_string_p) (p, end, reg_info)
7665 UCHAR_T **p, *end;
7666 PREFIX(register_info_type) *reg_info;
7667{
7668 int mcnt;
7669 /* Point to after the args to the start_memory. */
7670 UCHAR_T *p1 = *p + 2;
7671
7672 while (p1 < end)
7673 {
7674 /* Skip over opcodes that can match nothing, and return true or
7675 false, as appropriate, when we get to one that can't, or to the
7676 matching stop_memory. */
7677
7678 switch ((re_opcode_t) *p1)
7679 {
7680 /* Could be either a loop or a series of alternatives. */
7681 case on_failure_jump:
7682 p1++;
7683 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7684
7685 /* If the next operation is not a jump backwards in the
7686 pattern. */
7687
7688 if (mcnt >= 0)
7689 {
7690 /* Go through the on_failure_jumps of the alternatives,
7691 seeing if any of the alternatives cannot match nothing.
7692 The last alternative starts with only a jump,
7693 whereas the rest start with on_failure_jump and end
7694 with a jump, e.g., here is the pattern for `a|b|c':
7695
7696 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7697 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7698 /exactn/1/c
7699
7700 So, we have to first go through the first (n-1)
7701 alternatives and then deal with the last one separately. */
7702
7703
7704 /* Deal with the first (n-1) alternatives, which start
7705 with an on_failure_jump (see above) that jumps to right
7706 past a jump_past_alt. */
7707
7708 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7709 jump_past_alt)
7710 {
7711 /* `mcnt' holds how many bytes long the alternative
7712 is, including the ending `jump_past_alt' and
7713 its number. */
7714
7715 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7716 (1 + OFFSET_ADDRESS_SIZE),
7717 reg_info))
7718 return false;
7719
7720 /* Move to right after this alternative, including the
7721 jump_past_alt. */
7722 p1 += mcnt;
7723
7724 /* Break if it's the beginning of an n-th alternative
7725 that doesn't begin with an on_failure_jump. */
7726 if ((re_opcode_t) *p1 != on_failure_jump)
7727 break;
7728
7729 /* Still have to check that it's not an n-th
7730 alternative that starts with an on_failure_jump. */
7731 p1++;
7732 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7733 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7734 jump_past_alt)
7735 {
7736 /* Get to the beginning of the n-th alternative. */
7737 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7738 break;
7739 }
7740 }
7741
7742 /* Deal with the last alternative: go back and get number
7743 of the `jump_past_alt' just before it. `mcnt' contains
7744 the length of the alternative. */
7745 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7746
7747 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7748 return false;
7749
7750 p1 += mcnt; /* Get past the n-th alternative. */
7751 } /* if mcnt > 0 */
7752 break;
7753
7754
7755 case stop_memory:
7756 assert (p1[1] == **p);
7757 *p = p1 + 2;
7758 return true;
7759
7760
7761 default:
7762 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7763 return false;
7764 }
7765 } /* while p1 < end */
7766
7767 return false;
7768} /* group_match_null_string_p */
7769
7770
7771/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7772 It expects P to be the first byte of a single alternative and END one
7773 byte past the last. The alternative can contain groups. */
7774
7775static boolean
7776PREFIX(alt_match_null_string_p) (p, end, reg_info)
7777 UCHAR_T *p, *end;
7778 PREFIX(register_info_type) *reg_info;
7779{
7780 int mcnt;
7781 UCHAR_T *p1 = p;
7782
7783 while (p1 < end)
7784 {
7785 /* Skip over opcodes that can match nothing, and break when we get
7786 to one that can't. */
7787
7788 switch ((re_opcode_t) *p1)
7789 {
7790 /* It's a loop. */
7791 case on_failure_jump:
7792 p1++;
7793 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7794 p1 += mcnt;
7795 break;
7796
7797 default:
7798 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7799 return false;
7800 }
7801 } /* while p1 < end */
7802
7803 return true;
7804} /* alt_match_null_string_p */
7805
7806
7807/* Deals with the ops common to group_match_null_string_p and
7808 alt_match_null_string_p.
7809
7810 Sets P to one after the op and its arguments, if any. */
7811
7812static boolean
7813PREFIX(common_op_match_null_string_p) (p, end, reg_info)
7814 UCHAR_T **p, *end;
7815 PREFIX(register_info_type) *reg_info;
7816{
7817 int mcnt;
7818 boolean ret;
7819 int reg_no;
7820 UCHAR_T *p1 = *p;
7821
7822 switch ((re_opcode_t) *p1++)
7823 {
7824 case no_op:
7825 case begline:
7826 case endline:
7827 case begbuf:
7828 case endbuf:
7829 case wordbeg:
7830 case wordend:
7831 case wordbound:
7832 case notwordbound:
7833#ifdef emacs
7834 case before_dot:
7835 case at_dot:
7836 case after_dot:
7837#endif
7838 break;
7839
7840 case start_memory:
7841 reg_no = *p1;
7842 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7843 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7844
7845 /* Have to set this here in case we're checking a group which
7846 contains a group and a back reference to it. */
7847
7848 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7849 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7850
7851 if (!ret)
7852 return false;
7853 break;
7854
7855 /* If this is an optimized succeed_n for zero times, make the jump. */
7856 case jump:
7857 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7858 if (mcnt >= 0)
7859 p1 += mcnt;
7860 else
7861 return false;
7862 break;
7863
7864 case succeed_n:
7865 /* Get to the number of times to succeed. */
7866 p1 += OFFSET_ADDRESS_SIZE;
7867 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7868
7869 if (mcnt == 0)
7870 {
7871 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7872 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7873 p1 += mcnt;
7874 }
7875 else
7876 return false;
7877 break;
7878
7879 case duplicate:
7880 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7881 return false;
7882 break;
7883
7884 case set_number_at:
7885 p1 += 2 * OFFSET_ADDRESS_SIZE;
7886
7887 default:
7888 /* All other opcodes mean we cannot match the empty string. */
7889 return false;
7890 }
7891
7892 *p = p1;
7893 return true;
7894} /* common_op_match_null_string_p */
7895
7896
7897/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7898 bytes; nonzero otherwise. */
7899
7900static int
7901PREFIX(bcmp_translate) (s1, s2, len, translate)
7902 const CHAR_T *s1, *s2;
7903 register int len;
7904 RE_TRANSLATE_TYPE translate;
7905{
7906 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7907 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7908 while (len)
7909 {
7910#ifdef WCHAR
7911 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7912 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7913 return 1;
7914#else /* BYTE */
7915 if (translate[*p1++] != translate[*p2++]) return 1;
7916#endif /* WCHAR */
7917 len--;
7918 }
7919 return 0;
7920}
7921
7922
7923
7924#else /* not INSIDE_RECURSION */
7925
7926/* Entry points for GNU code. */
7927
7928/* re_compile_pattern is the GNU regular expression compiler: it
7929 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7930 Returns 0 if the pattern was valid, otherwise an error string.
7931
7932 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7933 are set in BUFP on entry.
7934
7935 We call regex_compile to do the actual compilation. */
7936
7937const char *
7938re_compile_pattern (pattern, length, bufp)
7939 const char *pattern;
7940 size_t length;
7941 struct re_pattern_buffer *bufp;
7942{
7943 reg_errcode_t ret;
7944
7945 /* GNU code is written to assume at least RE_NREGS registers will be set
7946 (and at least one extra will be -1). */
7947 bufp->regs_allocated = REGS_UNALLOCATED;
7948
7949 /* And GNU code determines whether or not to get register information
7950 by passing null for the REGS argument to re_match, etc., not by
7951 setting no_sub. */
7952 bufp->no_sub = 0;
7953
7954 /* Match anchors at newline. */
7955 bufp->newline_anchor = 1;
7956
7957# ifdef MBS_SUPPORT
7958 if (MB_CUR_MAX != 1)
7959 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7960 else
7961# endif
7962 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
7963
7964 if (!ret)
7965 return NULL;
7966 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7967}
7968#ifdef _LIBC
7969weak_alias (__re_compile_pattern, re_compile_pattern)
7970#endif
7971
7972
7973/* Entry points compatible with 4.2 BSD regex library. We don't define
7974 them unless specifically requested. */
7975
7976#if defined _REGEX_RE_COMP || defined _LIBC
7977
7978/* BSD has one and only one pattern buffer. */
7979static struct re_pattern_buffer re_comp_buf;
7980
7981char *
7982#ifdef _LIBC
7983/* Make these definitions weak in libc, so POSIX programs can redefine
7984 these names if they don't use our functions, and still use
7985 regcomp/regexec below without link errors. */
7986weak_function
7987#endif
7988re_comp (s)
7989 const char *s;
7990{
7991 reg_errcode_t ret;
7992
7993 if (!s)
7994 {
7995 if (!re_comp_buf.buffer)
7996 return gettext ("No previous regular expression");
7997 return 0;
7998 }
7999
8000 if (!re_comp_buf.buffer)
8001 {
8002 re_comp_buf.buffer = (unsigned char *) malloc (200);
8003 if (re_comp_buf.buffer == NULL)
8004 return (char *) gettext (re_error_msgid
8005 + re_error_msgid_idx[(int) REG_ESPACE]);
8006 re_comp_buf.allocated = 200;
8007
8008 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
8009 if (re_comp_buf.fastmap == NULL)
8010 return (char *) gettext (re_error_msgid
8011 + re_error_msgid_idx[(int) REG_ESPACE]);
8012 }
8013
8014 /* Since `re_exec' always passes NULL for the `regs' argument, we
8015 don't need to initialize the pattern buffer fields which affect it. */
8016
8017 /* Match anchors at newlines. */
8018 re_comp_buf.newline_anchor = 1;
8019
8020# ifdef MBS_SUPPORT
8021 if (MB_CUR_MAX != 1)
8022 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
8023 else
8024# endif
8025 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
8026
8027 if (!ret)
8028 return NULL;
8029
8030 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
8031 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
8032}
8033
8034
8035int
8036#ifdef _LIBC
8037weak_function
8038#endif
8039re_exec (s)
8040 const char *s;
8041{
8042 const int len = strlen (s);
8043 return
8044 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
8045}
8046
8047#endif /* _REGEX_RE_COMP */
8048
8049
8050/* POSIX.2 functions. Don't define these for Emacs. */
8051
8052#ifndef emacs
8053
8054/* regcomp takes a regular expression as a string and compiles it.
8055
8056 PREG is a regex_t *. We do not expect any fields to be initialized,
8057 since POSIX says we shouldn't. Thus, we set
8058
8059 `buffer' to the compiled pattern;
8060 `used' to the length of the compiled pattern;
8061 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
8062 REG_EXTENDED bit in CFLAGS is set; otherwise, to
8063 RE_SYNTAX_POSIX_BASIC;
8064 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
8065 `fastmap' to an allocated space for the fastmap;
8066 `fastmap_accurate' to zero;
8067 `re_nsub' to the number of subexpressions in PATTERN.
8068
8069 PATTERN is the address of the pattern string.
8070
8071 CFLAGS is a series of bits which affect compilation.
8072
8073 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
8074 use POSIX basic syntax.
8075
8076 If REG_NEWLINE is set, then . and [^...] don't match newline.
8077 Also, regexec will try a match beginning after every newline.
8078
8079 If REG_ICASE is set, then we considers upper- and lowercase
8080 versions of letters to be equivalent when matching.
8081
8082 If REG_NOSUB is set, then when PREG is passed to regexec, that
8083 routine will report only success or failure, and nothing about the
8084 registers.
8085
8086 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
8087 the return codes and their meanings.) */
8088
8089int
8090regcomp (preg, pattern, cflags)
8091 regex_t *preg;
8092 const char *pattern;
8093 int cflags;
8094{
8095 reg_errcode_t ret;
8096 reg_syntax_t syntax
8097 = (cflags & REG_EXTENDED) ?
8098 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
8099
8100 /* regex_compile will allocate the space for the compiled pattern. */
8101 preg->buffer = 0;
8102 preg->allocated = 0;
8103 preg->used = 0;
8104
8105 /* Try to allocate space for the fastmap. */
8106 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
8107
8108 if (cflags & REG_ICASE)
8109 {
8110 unsigned i;
8111
8112 preg->translate
8113 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8114 * sizeof (*(RE_TRANSLATE_TYPE)0));
8115 if (preg->translate == NULL)
8116 return (int) REG_ESPACE;
8117
8118 /* Map uppercase characters to corresponding lowercase ones. */
8119 for (i = 0; i < CHAR_SET_SIZE; i++)
8120 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
8121 }
8122 else
8123 preg->translate = NULL;
8124
8125 /* If REG_NEWLINE is set, newlines are treated differently. */
8126 if (cflags & REG_NEWLINE)
8127 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
8128 syntax &= ~RE_DOT_NEWLINE;
8129 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
8130 /* It also changes the matching behavior. */
8131 preg->newline_anchor = 1;
8132 }
8133 else
8134 preg->newline_anchor = 0;
8135
8136 preg->no_sub = !!(cflags & REG_NOSUB);
8137
8138 /* POSIX says a null character in the pattern terminates it, so we
8139 can use strlen here in compiling the pattern. */
8140# ifdef MBS_SUPPORT
8141 if (MB_CUR_MAX != 1)
8142 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
8143 else
8144# endif
8145 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
8146
8147 /* POSIX doesn't distinguish between an unmatched open-group and an
8148 unmatched close-group: both are REG_EPAREN. */
8149 if (ret == REG_ERPAREN) ret = REG_EPAREN;
8150
8151 if (ret == REG_NOERROR && preg->fastmap)
8152 {
8153 /* Compute the fastmap now, since regexec cannot modify the pattern
8154 buffer. */
8155 if (re_compile_fastmap (preg) == -2)
8156 {
8157 /* Some error occurred while computing the fastmap, just forget
8158 about it. */
8159 free (preg->fastmap);
8160 preg->fastmap = NULL;
8161 }
8162 }
8163
8164 return (int) ret;
8165}
8166#ifdef _LIBC
8167weak_alias (__regcomp, regcomp)
8168#endif
8169
8170
8171/* regexec searches for a given pattern, specified by PREG, in the
8172 string STRING.
8173
8174 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8175 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8176 least NMATCH elements, and we set them to the offsets of the
8177 corresponding matched substrings.
8178
8179 EFLAGS specifies `execution flags' which affect matching: if
8180 REG_NOTBOL is set, then ^ does not match at the beginning of the
8181 string; if REG_NOTEOL is set, then $ does not match at the end.
8182
8183 We return 0 if we find a match and REG_NOMATCH if not. */
8184
8185int
8186regexec (preg, string, nmatch, pmatch, eflags)
8187 const regex_t *preg;
8188 const char *string;
8189 size_t nmatch;
8190 regmatch_t pmatch[];
8191 int eflags;
8192{
8193 int ret;
8194 struct re_registers regs;
8195 regex_t private_preg;
8196 int len = strlen (string);
8197 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8198
8199 private_preg = *preg;
8200
8201 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8202 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8203
8204 /* The user has told us exactly how many registers to return
8205 information about, via `nmatch'. We have to pass that on to the
8206 matching routines. */
8207 private_preg.regs_allocated = REGS_FIXED;
8208
8209 if (want_reg_info)
8210 {
8211 regs.num_regs = nmatch;
8212 regs.start = TALLOC (nmatch * 2, regoff_t);
8213 if (regs.start == NULL)
8214 return (int) REG_NOMATCH;
8215 regs.end = regs.start + nmatch;
8216 }
8217
8218 /* Perform the searching operation. */
8219 ret = re_search (&private_preg, string, len,
8220 /* start: */ 0, /* range: */ len,
8221 want_reg_info ? &regs : (struct re_registers *) 0);
8222
8223 /* Copy the register information to the POSIX structure. */
8224 if (want_reg_info)
8225 {
8226 if (ret >= 0)
8227 {
8228 unsigned r;
8229
8230 for (r = 0; r < nmatch; r++)
8231 {
8232 pmatch[r].rm_so = regs.start[r];
8233 pmatch[r].rm_eo = regs.end[r];
8234 }
8235 }
8236
8237 /* If we needed the temporary register info, free the space now. */
8238 free (regs.start);
8239 }
8240
8241 /* We want zero return to mean success, unlike `re_search'. */
8242 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8243}
8244#ifdef _LIBC
8245weak_alias (__regexec, regexec)
8246#endif
8247
8248
8249/* Returns a message corresponding to an error code, ERRCODE, returned
8250 from either regcomp or regexec. We don't use PREG here. */
8251
8252size_t
8253regerror (errcode, preg, errbuf, errbuf_size)
8254 int errcode;
8255 const regex_t *preg;
8256 char *errbuf;
8257 size_t errbuf_size;
8258{
8259 const char *msg;
8260 size_t msg_size;
8261
8262 if (errcode < 0
8263 || errcode >= (int) (sizeof (re_error_msgid_idx)
8264 / sizeof (re_error_msgid_idx[0])))
8265 /* Only error codes returned by the rest of the code should be passed
8266 to this routine. If we are given anything else, or if other regex
8267 code generates an invalid error code, then the program has a bug.
8268 Dump core so we can fix it. */
8269 abort ();
8270
8271 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
8272
8273 msg_size = strlen (msg) + 1; /* Includes the null. */
8274
8275 if (errbuf_size != 0)
8276 {
8277 if (msg_size > errbuf_size)
8278 {
8279#if defined HAVE_MEMPCPY || defined _LIBC
8280 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
8281#else
8282 memcpy (errbuf, msg, errbuf_size - 1);
8283 errbuf[errbuf_size - 1] = 0;
8284#endif
8285 }
8286 else
8287 memcpy (errbuf, msg, msg_size);
8288 }
8289
8290 return msg_size;
8291}
8292#ifdef _LIBC
8293weak_alias (__regerror, regerror)
8294#endif
8295
8296
8297/* Free dynamically allocated space used by PREG. */
8298
8299void
8300regfree (preg)
8301 regex_t *preg;
8302{
8303 if (preg->buffer != NULL)
8304 free (preg->buffer);
8305 preg->buffer = NULL;
8306
8307 preg->allocated = 0;
8308 preg->used = 0;
8309
8310 if (preg->fastmap != NULL)
8311 free (preg->fastmap);
8312 preg->fastmap = NULL;
8313 preg->fastmap_accurate = 0;
8314
8315 if (preg->translate != NULL)
8316 free (preg->translate);
8317 preg->translate = NULL;
8318}
8319#ifdef _LIBC
8320weak_alias (__regfree, regfree)
8321#endif
8322
8323#endif /* not emacs */
8324
8325#endif /* not INSIDE_RECURSION */
8326
8327
8328
/* Remove the helper macros used by the code above.
   NOTE(review): presumably this lets the file be compiled again with a
   different CHAR_T/PREFIX -- confirm against the top of the file.  */

/* Number storage in the compiled pattern.  */
#undef STORE_NUMBER
#undef STORE_NUMBER_AND_INCR
#undef EXTRACT_NUMBER
#undef EXTRACT_NUMBER_AND_INCR

/* Debugging output.  */
#undef DEBUG_PRINT_COMPILED_PATTERN
#undef DEBUG_PRINT_DOUBLE_STRING

/* Failure stack.  */
#undef INIT_FAIL_STACK
#undef RESET_FAIL_STACK
#undef DOUBLE_FAIL_STACK
#undef PUSH_PATTERN_OP
#undef PUSH_FAILURE_POINTER
#undef PUSH_FAILURE_INT
#undef PUSH_FAILURE_ELT
#undef POP_FAILURE_POINTER
#undef POP_FAILURE_INT
#undef POP_FAILURE_ELT
#undef DEBUG_PUSH
#undef DEBUG_POP
#undef PUSH_FAILURE_POINT
#undef POP_FAILURE_POINT

/* Register bookkeeping.  */
#undef REG_UNSET_VALUE
#undef REG_UNSET

/* Pattern fetching.  */
#undef PATFETCH
#undef PATFETCH_RAW
#undef PATUNFETCH
#undef TRANSLATE

/* Compiled-buffer construction.  */
#undef INIT_BUF_SIZE
#undef GET_BUFFER_SPACE
#undef BUF_PUSH
#undef BUF_PUSH_2
#undef BUF_PUSH_3
#undef STORE_JUMP
#undef STORE_JUMP2
#undef INSERT_JUMP
#undef INSERT_JUMP2
#undef EXTEND_BUFFER
#undef GET_UNSIGNED_NUMBER
#undef FREE_STACK_RETURN

/* Matcher helpers.  */
# undef POINTER_TO_OFFSET
/* NOTE(review): the next name looks like a misspelling of
   MATCHING_IN_FIRST_STRING.  It is kept as it was, and the name with
   the expected spelling is undefined as well; #undef of a name that is
   not defined has no effect.  */
# undef MATCHING_IN_FRST_STRING
# undef MATCHING_IN_FIRST_STRING
# undef PREFETCH
# undef AT_STRINGS_BEG
# undef AT_STRINGS_END
# undef WORDCHAR_P
# undef FREE_VAR
# undef FREE_VARIABLES
# undef NO_HIGHEST_ACTIVE_REG
# undef NO_LOWEST_ACTIVE_REG

/* Character-width selection.  */
# undef CHAR_T
# undef UCHAR_T
# undef COMPILED_BUFFER_VAR
# undef OFFSET_ADDRESS_SIZE
# undef CHAR_CLASS_SIZE
# undef PREFIX
# undef ARG_PREFIX
# undef PUT_CHAR
# undef BYTE
# undef WCHAR

# define DEFINED_ONCE
Note: See TracBrowser for help on using the repository browser.