source: vendor/perl/5.8.8/regcomp.c@ 3234

Last change on this file since 3234 was 3181, checked in by bird, 18 years ago

perl 5.8.8

File size: 144.7 KB
Line 
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16 */
17
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
44/* We *really* need to overwrite these symbols: */
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
48# define Perl_pregfree my_regfree
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
51# define Perl_regnext my_regnext
52# define Perl_save_re_context my_save_re_context
53# define Perl_reginitcolors my_reginitcolors
54
55# define PERL_NO_GET_CONTEXT
56#endif
57
58/*
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83 ****
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
93#define PERL_IN_REGCOMP_C
94#include "perl.h"
95
96#ifndef PERL_IN_XSUB_RE
97# include "INTERN.h"
98#endif
99
100#define REG_COMP_C
101#include "regcomp.h"
102
103#ifdef op
104#undef op
105#endif /* op */
106
107#ifdef MSDOS
108# if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110# pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
114#endif /* MSDOS */
115
116#ifndef STATIC
117#define STATIC static
118#endif
119
120typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
138 I32 utf8;
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
145#define RExC_flags (pRExC_state->flags)
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
148#define RExC_start (pRExC_state->start)
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153#define RExC_emit (pRExC_state->emit)
154#define RExC_emit_start (pRExC_state->emit_start)
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
163#define RExC_utf8 (pRExC_state->utf8)
164
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
168
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
172/*
173 * Flags to be passed up and down.
174 */
175#define WORST 0 /* Worst case. */
176#define HASWIDTH 0x1 /* Known to match non-null strings. */
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
180
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
200 I32 *last_closep;
201 struct regnode_charclass_class *start_class;
202} scan_data_t;
203
204/*
205 * Forward declarations for pregcomp()'s friends.
206 */
207
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
234#define SCF_DO_SUBSTR 0x400
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238#define SCF_WHILEM_VISITED_POS 0x2000
239
240#define UTF (RExC_utf8 != 0)
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
243
244#define OOB_UNICODE 12345678
245#define OOB_NAMEDCLASS -1
246
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
250
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
261
262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
269#define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
283
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
289#define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
308#define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
326#define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
368
369#define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
406
407
408/* Allow for side effects in s */
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
412
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
420/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (int)(node), (int)(len))); \
443 if((node) < 0) { \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461static void clear_re(pTHX_ void *r);
462
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
467STATIC void
468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469{
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
472
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
478 data->flags
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
482 }
483 else {
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
491 data->flags
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
498 {
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
509/* Can match anything (initialization) */
510STATIC void
511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
512{
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
523{
524 int value;
525
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
539{
540 Zero(cl, 1, struct regnode_charclass_class);
541 cl->type = ANYOF;
542 cl_anything(pRExC_state, cl);
543}
544
545STATIC void
546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
547{
548 Zero(cl, 1, struct regnode_charclass_class);
549 cl->type = ANYOF;
550 cl_anything(pRExC_state, cl);
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
560{
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
577
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
596{
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
616 cl_anything(pRExC_state, cl);
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
650}
651
652/*
653 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
654 * These need to be revisited when a newer toolchain becomes available.
655 */
656#if defined(__sparc64__) && defined(__GNUC__)
657# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
658# undef SPARC64_GCC_WORKAROUND
659# define SPARC64_GCC_WORKAROUND 1
660# endif
661#endif
662
663/* REx optimizer. Converts nodes into quickier variants "in place".
664 Finds fixed substrings. */
665
666/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
667 to the position after last scanned or to NULL. */
668
669STATIC I32
670S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
671 /* scanp: Start here (read-write). */
672 /* deltap: Write maxlen-minlen here. */
673 /* last: Stop before this one. */
674{
675 I32 min = 0, pars = 0, code;
676 regnode *scan = *scanp, *next;
677 I32 delta = 0;
678 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
679 int is_inf_internal = 0; /* The studied chunk is infinite */
680 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
681 scan_data_t data_fake;
682 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
683
684 while (scan && OP(scan) != END && scan < last) {
685 /* Peephole optimizer: */
686
687 if (PL_regkind[(U8)OP(scan)] == EXACT) {
688 /* Merge several consecutive EXACTish nodes into one. */
689 regnode *n = regnext(scan);
690 U32 stringok = 1;
691#ifdef DEBUGGING
692 regnode *stop = scan;
693#endif
694
695 next = scan + NODE_SZ_STR(scan);
696 /* Skip NOTHING, merge EXACT*. */
697 while (n &&
698 ( PL_regkind[(U8)OP(n)] == NOTHING ||
699 (stringok && (OP(n) == OP(scan))))
700 && NEXT_OFF(n)
701 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
702 if (OP(n) == TAIL || n > next)
703 stringok = 0;
704 if (PL_regkind[(U8)OP(n)] == NOTHING) {
705 NEXT_OFF(scan) += NEXT_OFF(n);
706 next = n + NODE_STEP_REGNODE;
707#ifdef DEBUGGING
708 if (stringok)
709 stop = n;
710#endif
711 n = regnext(n);
712 }
713 else if (stringok) {
714 const int oldl = STR_LEN(scan);
715 regnode *nnext = regnext(n);
716
717 if (oldl + STR_LEN(n) > U8_MAX)
718 break;
719 NEXT_OFF(scan) += NEXT_OFF(n);
720 STR_LEN(scan) += STR_LEN(n);
721 next = n + NODE_SZ_STR(n);
722 /* Now we can overwrite *n : */
723 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
724#ifdef DEBUGGING
725 stop = next - 1;
726#endif
727 n = nnext;
728 }
729 }
730
731 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
732/*
733 Two problematic code points in Unicode casefolding of EXACT nodes:
734
735 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
736 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
737
738 which casefold to
739
740 Unicode UTF-8
741
742 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
743 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
744
745 This means that in case-insensitive matching (or "loose matching",
746 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
747 length of the above casefolded versions) can match a target string
748 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
749 This would rather mess up the minimum length computation.
750
751 What we'll do is to look for the tail four bytes, and then peek
752 at the preceding two bytes to see whether we need to decrease
753 the minimum length by four (six minus two).
754
755 Thanks to the design of UTF-8, there cannot be false matches:
756 A sequence of valid UTF-8 bytes cannot be a subsequence of
757 another valid sequence of UTF-8 bytes.
758
759*/
760 char *s0 = STRING(scan), *s, *t;
761 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
762 const char * const t0 = "\xcc\x88\xcc\x81";
763 const char * const t1 = t0 + 3;
764
765 for (s = s0 + 2;
766 s < s2 && (t = ninstr(s, s1, t0, t1));
767 s = t + 4) {
768 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
769 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
770 min -= 4;
771 }
772 }
773
774#ifdef DEBUGGING
775 /* Allow dumping */
776 n = scan + NODE_SZ_STR(scan);
777 while (n <= stop) {
778 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
779 OP(n) = OPTIMIZED;
780 NEXT_OFF(n) = 0;
781 }
782 n++;
783 }
784#endif
785 }
786 /* Follow the next-chain of the current node and optimize
787 away all the NOTHINGs from it. */
788 if (OP(scan) != CURLYX) {
789 const int max = (reg_off_by_arg[OP(scan)]
790 ? I32_MAX
791 /* I32 may be smaller than U16 on CRAYs! */
792 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
793 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
794 int noff;
795 regnode *n = scan;
796
797 /* Skip NOTHING and LONGJMP. */
798 while ((n = regnext(n))
799 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
800 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
801 && off + noff < max)
802 off += noff;
803 if (reg_off_by_arg[OP(scan)])
804 ARG(scan) = off;
805 else
806 NEXT_OFF(scan) = off;
807 }
808 /* The principal pseudo-switch. Cannot be a switch, since we
809 look into several different things. */
810 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
811 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
812 next = regnext(scan);
813 code = OP(scan);
814
815 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
816 I32 max1 = 0, min1 = I32_MAX, num = 0;
817 struct regnode_charclass_class accum;
818
819 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
820 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
821 if (flags & SCF_DO_STCLASS)
822 cl_init_zero(pRExC_state, &accum);
823 while (OP(scan) == code) {
824 I32 deltanext, minnext, f = 0, fake;
825 struct regnode_charclass_class this_class;
826
827 num++;
828 data_fake.flags = 0;
829 if (data) {
830 data_fake.whilem_c = data->whilem_c;
831 data_fake.last_closep = data->last_closep;
832 }
833 else
834 data_fake.last_closep = &fake;
835 next = regnext(scan);
836 scan = NEXTOPER(scan);
837 if (code != BRANCH)
838 scan = NEXTOPER(scan);
839 if (flags & SCF_DO_STCLASS) {
840 cl_init(pRExC_state, &this_class);
841 data_fake.start_class = &this_class;
842 f = SCF_DO_STCLASS_AND;
843 }
844 if (flags & SCF_WHILEM_VISITED_POS)
845 f |= SCF_WHILEM_VISITED_POS;
846 /* we suppose the run is continuous, last=next...*/
847 minnext = study_chunk(pRExC_state, &scan, &deltanext,
848 next, &data_fake, f);
849 if (min1 > minnext)
850 min1 = minnext;
851 if (max1 < minnext + deltanext)
852 max1 = minnext + deltanext;
853 if (deltanext == I32_MAX)
854 is_inf = is_inf_internal = 1;
855 scan = next;
856 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
857 pars++;
858 if (data && (data_fake.flags & SF_HAS_EVAL))
859 data->flags |= SF_HAS_EVAL;
860 if (data)
861 data->whilem_c = data_fake.whilem_c;
862 if (flags & SCF_DO_STCLASS)
863 cl_or(pRExC_state, &accum, &this_class);
864 if (code == SUSPEND)
865 break;
866 }
867 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
868 min1 = 0;
869 if (flags & SCF_DO_SUBSTR) {
870 data->pos_min += min1;
871 data->pos_delta += max1 - min1;
872 if (max1 != min1 || is_inf)
873 data->longest = &(data->longest_float);
874 }
875 min += min1;
876 delta += max1 - min1;
877 if (flags & SCF_DO_STCLASS_OR) {
878 cl_or(pRExC_state, data->start_class, &accum);
879 if (min1) {
880 cl_and(data->start_class, &and_with);
881 flags &= ~SCF_DO_STCLASS;
882 }
883 }
884 else if (flags & SCF_DO_STCLASS_AND) {
885 if (min1) {
886 cl_and(data->start_class, &accum);
887 flags &= ~SCF_DO_STCLASS;
888 }
889 else {
890 /* Switch to OR mode: cache the old value of
891 * data->start_class */
892 StructCopy(data->start_class, &and_with,
893 struct regnode_charclass_class);
894 flags &= ~SCF_DO_STCLASS_AND;
895 StructCopy(&accum, data->start_class,
896 struct regnode_charclass_class);
897 flags |= SCF_DO_STCLASS_OR;
898 data->start_class->flags |= ANYOF_EOS;
899 }
900 }
901
902 }
903 else if (code == BRANCHJ) /* single branch is optimized. */
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
907 continue;
908 }
909 else if (OP(scan) == EXACT) {
910 I32 l = STR_LEN(scan);
911 UV uc = *((U8*)STRING(scan));
912 if (UTF) {
913 const U8 * const s = (U8*)STRING(scan);
914 l = utf8_length((U8 *)s, (U8 *)s + l);
915 uc = utf8_to_uvchr((U8 *)s, NULL);
916 }
917 min += l;
918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
924 ? I32_MAX : data->pos_min + data->pos_delta;
925 }
926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
927 {
928 SV * const sv = data->last_found;
929 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
934 }
935 if (UTF)
936 SvUTF8_on(data->last_found);
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
940 }
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
943 int compat = 1;
944
945 if (uc >= 0x100 ||
946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
948 && (!(data->start_class->flags & ANYOF_FOLD)
949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
950 )
951 compat = 0;
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
954 if (compat)
955 ANYOF_BITMAP_SET(data->start_class, uc);
956 data->start_class->flags &= ~ANYOF_EOS;
957 if (uc < 0x100)
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959 }
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
962 if (uc < 0x100)
963 ANYOF_BITMAP_SET(data->start_class, uc);
964 else
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
970 }
971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972 I32 l = STR_LEN(scan);
973 UV uc = *((U8*)STRING(scan));
974
975 /* Search for fixed substrings supports EXACT only. */
976 if (flags & SCF_DO_SUBSTR)
977 scan_commit(pRExC_state, data);
978 if (UTF) {
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
981 uc = utf8_to_uvchr(s, NULL);
982 }
983 min += l;
984 if (data && (flags & SCF_DO_SUBSTR))
985 data->pos_min += l;
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
988 int compat = 1;
989
990 if (uc >= 0x100 ||
991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994 compat = 0;
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
997 if (compat) {
998 ANYOF_BITMAP_SET(data->start_class, uc);
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1003 }
1004 }
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1009 if (uc < 0x100)
1010 ANYOF_BITMAP_SET(data->start_class, uc);
1011 data->start_class->flags &= ~ANYOF_EOS;
1012 }
1013 cl_and(data->start_class, &and_with);
1014 }
1015 flags &= ~SCF_DO_STCLASS;
1016 }
1017 else if (strchr((const char*)PL_varies,OP(scan))) {
1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019 I32 f = flags, pos_before = 0;
1020 regnode *oscan = scan;
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
1023 I32 next_is_eval = 0;
1024
1025 switch (PL_regkind[(U8)OP(scan)]) {
1026 case WHILEM: /* End of (?:...)* . */
1027 scan = NEXTOPER(scan);
1028 goto finish;
1029 case PLUS:
1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031 next = NEXTOPER(scan);
1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033 mincount = 1;
1034 maxcount = REG_INFTY;
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1037 goto do_curly;
1038 }
1039 }
1040 if (flags & SCF_DO_SUBSTR)
1041 data->pos_min++;
1042 min++;
1043 /* Fall through. */
1044 case STAR:
1045 if (flags & SCF_DO_STCLASS) {
1046 mincount = 0;
1047 maxcount = REG_INFTY;
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1050 goto do_curly;
1051 }
1052 is_inf = is_inf_internal = 1;
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056 data->longest = &(data->longest_float);
1057 }
1058 goto optimize_curly_tail;
1059 case CURLY:
1060 mincount = ARG1(scan);
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1065
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 }
1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069 next_is_eval = (OP(scan) == EVAL);
1070 do_curly:
1071 if (flags & SCF_DO_SUBSTR) {
1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073 pos_before = data->pos_min;
1074 }
1075 if (data) {
1076 fl = data->flags;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 if (is_inf)
1079 data->flags |= SF_IS_INF;
1080 }
1081 if (flags & SCF_DO_STCLASS) {
1082 cl_init(pRExC_state, &this_class);
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1087 }
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
1091
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
1096
1097 /* This will finish on WHILEM, setting scan, or on NULL: */
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 mincount == 0
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
1106 cl_or(pRExC_state, data->start_class, &this_class);
1107 }
1108 else if (flags & SCF_DO_STCLASS_AND) {
1109 /* Switch to OR mode: cache the old value of
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1118 }
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
1121 cl_or(pRExC_state, data->start_class, &this_class);
1122 cl_and(data->start_class, &and_with);
1123 }
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1127 }
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1129 scan = next;
1130 if ( /* ? quantifier ok, except for (?{ ... }) */
1131 (next_is_eval || !(mincount == 0 && maxcount == 1))
1132 && (minnext == 0) && (deltanext == 0)
1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134 && maxcount <= REG_INFTY/3 /* Complement check for big count */
1135 && ckWARN(WARN_REGEXP))
1136 {
1137 vWARN(RExC_parse,
1138 "Quantifier unexpected on zero-length expression");
1139 }
1140
1141 min += minnext * mincount;
1142 is_inf_internal |= ((maxcount == REG_INFTY
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
1145 is_inf |= is_inf_internal;
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148 /* Try powerful optimization CURLYX => CURLYN. */
1149 if ( OP(oscan) == CURLYX && data
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155 regnode *nxt1 = nxt;
1156#ifdef DEBUGGING
1157 regnode *nxt2;
1158#endif
1159
1160 /* Skip open. */
1161 nxt = regnext(nxt);
1162 if (!strchr((const char*)PL_simple,OP(nxt))
1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164 && STR_LEN(nxt) == 1))
1165 goto nogo;
1166#ifdef DEBUGGING
1167 nxt2 = nxt;
1168#endif
1169 nxt = regnext(nxt);
1170 if (OP(nxt) != CLOSE)
1171 goto nogo;
1172 /* Now we know that nxt2 is the only contents: */
1173 oscan->flags = (U8)ARG(nxt);
1174 OP(oscan) = CURLYN;
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1176#ifdef DEBUGGING
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183#endif
1184 }
1185 nogo:
1186
1187 /* Try optimization CURLYX => CURLYM. */
1188 if ( OP(oscan) == CURLYX && data
1189 && !(data->flags & SF_HAS_PAR)
1190 && !(data->flags & SF_HAS_EVAL)
1191 && !deltanext /* atom is fixed width */
1192 && minnext != 0 /* CURLYM can't handle zero width */
1193 ) {
1194 /* XXXX How to optimize if data == 0? */
1195 /* Optimize to a simpler form. */
1196 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197 regnode *nxt2;
1198
1199 OP(oscan) = CURLYM;
1200 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1201 && (OP(nxt2) != WHILEM))
1202 nxt = nxt2;
1203 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1204 /* Need to optimize away parenths. */
1205 if (data->flags & SF_IN_PAR) {
1206 /* Set the parenth number. */
1207 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1208
1209 if (OP(nxt) != CLOSE)
1210 FAIL("Panic opt close");
1211 oscan->flags = (U8)ARG(nxt);
1212 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1213 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1214#ifdef DEBUGGING
1215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216 OP(nxt + 1) = OPTIMIZED; /* was count. */
1217 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1219#endif
1220#if 0
1221 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222 regnode *nnxt = regnext(nxt1);
1223
1224 if (nnxt == nxt) {
1225 if (reg_off_by_arg[OP(nxt1)])
1226 ARG_SET(nxt1, nxt2 - nxt1);
1227 else if (nxt2 - nxt1 < U16_MAX)
1228 NEXT_OFF(nxt1) = nxt2 - nxt1;
1229 else
1230 OP(nxt) = NOTHING; /* Cannot beautify */
1231 }
1232 nxt1 = nnxt;
1233 }
1234#endif
1235 /* Optimize again: */
1236 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1237 NULL, 0);
1238 }
1239 else
1240 oscan->flags = 0;
1241 }
1242 else if ((OP(oscan) == CURLYX)
1243 && (flags & SCF_WHILEM_VISITED_POS)
1244 /* See the comment on a similar expression above.
1245 However, this time it not a subexpression
1246 we care about, but the expression itself. */
1247 && (maxcount == REG_INFTY)
1248 && data && ++data->whilem_c < 16) {
1249 /* This stays as CURLYX, we can put the count/of pair. */
1250 /* Find WHILEM (as in regexec.c) */
1251 regnode *nxt = oscan + NEXT_OFF(oscan);
1252
1253 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1254 nxt += ARG(nxt);
1255 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256 | (RExC_whilem_seen << 4)); /* On WHILEM */
1257 }
1258 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1259 pars++;
1260 if (flags & SCF_DO_SUBSTR) {
1261 SV *last_str = Nullsv;
1262 int counted = mincount != 0;
1263
1264 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1265#if defined(SPARC64_GCC_WORKAROUND)
1266 I32 b = 0;
1267 STRLEN l = 0;
1268 const char *s = NULL;
1269 I32 old = 0;
1270
1271 if (pos_before >= data->last_start_min)
1272 b = pos_before;
1273 else
1274 b = data->last_start_min;
1275
1276 l = 0;
1277 s = SvPV_const(data->last_found, l);
1278 old = b - data->last_start_min;
1279
1280#else
1281 I32 b = pos_before >= data->last_start_min
1282 ? pos_before : data->last_start_min;
1283 STRLEN l;
1284 const char *s = SvPV_const(data->last_found, l);
1285 I32 old = b - data->last_start_min;
1286#endif
1287
1288 if (UTF)
1289 old = utf8_hop((U8*)s, old) - (U8*)s;
1290
1291 l -= old;
1292 /* Get the added string: */
1293 last_str = newSVpvn(s + old, l);
1294 if (UTF)
1295 SvUTF8_on(last_str);
1296 if (deltanext == 0 && pos_before == b) {
1297 /* What was added is a constant string */
1298 if (mincount > 1) {
1299 SvGROW(last_str, (mincount * l) + 1);
1300 repeatcpy(SvPVX(last_str) + l,
1301 SvPVX_const(last_str), l, mincount - 1);
1302 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1303 /* Add additional parts. */
1304 SvCUR_set(data->last_found,
1305 SvCUR(data->last_found) - l);
1306 sv_catsv(data->last_found, last_str);
1307 {
1308 SV * sv = data->last_found;
1309 MAGIC *mg =
1310 SvUTF8(sv) && SvMAGICAL(sv) ?
1311 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312 if (mg && mg->mg_len >= 0)
1313 mg->mg_len += CHR_SVLEN(last_str);
1314 }
1315 data->last_end += l * (mincount - 1);
1316 }
1317 } else {
1318 /* start offset must point into the last copy */
1319 data->last_start_min += minnext * (mincount - 1);
1320 data->last_start_max += is_inf ? I32_MAX
1321 : (maxcount - 1) * (minnext + data->pos_delta);
1322 }
1323 }
1324 /* It is counted once already... */
1325 data->pos_min += minnext * (mincount - counted);
1326 data->pos_delta += - counted * deltanext +
1327 (minnext + deltanext) * maxcount - minnext * mincount;
1328 if (mincount != maxcount) {
1329 /* Cannot extend fixed substrings found inside
1330 the group. */
1331 scan_commit(pRExC_state,data);
1332 if (mincount && last_str) {
1333 sv_setsv(data->last_found, last_str);
1334 data->last_end = data->pos_min;
1335 data->last_start_min =
1336 data->pos_min - CHR_SVLEN(last_str);
1337 data->last_start_max = is_inf
1338 ? I32_MAX
1339 : data->pos_min + data->pos_delta
1340 - CHR_SVLEN(last_str);
1341 }
1342 data->longest = &(data->longest_float);
1343 }
1344 SvREFCNT_dec(last_str);
1345 }
1346 if (data && (fl & SF_HAS_EVAL))
1347 data->flags |= SF_HAS_EVAL;
1348 optimize_curly_tail:
1349 if (OP(oscan) != CURLYX) {
1350 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1351 && NEXT_OFF(next))
1352 NEXT_OFF(oscan) += NEXT_OFF(next);
1353 }
1354 continue;
1355 default: /* REF and CLUMP only? */
1356 if (flags & SCF_DO_SUBSTR) {
1357 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1358 data->longest = &(data->longest_float);
1359 }
1360 is_inf = is_inf_internal = 1;
1361 if (flags & SCF_DO_STCLASS_OR)
1362 cl_anything(pRExC_state, data->start_class);
1363 flags &= ~SCF_DO_STCLASS;
1364 break;
1365 }
1366 }
1367 else if (strchr((const char*)PL_simple,OP(scan))) {
1368 int value = 0;
1369
1370 if (flags & SCF_DO_SUBSTR) {
1371 scan_commit(pRExC_state,data);
1372 data->pos_min++;
1373 }
1374 min++;
1375 if (flags & SCF_DO_STCLASS) {
1376 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1377
1378 /* Some of the logic below assumes that switching
1379 locale on will only add false positives. */
1380 switch (PL_regkind[(U8)OP(scan)]) {
1381 case SANY:
1382 default:
1383 do_default:
1384 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1385 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1386 cl_anything(pRExC_state, data->start_class);
1387 break;
1388 case REG_ANY:
1389 if (OP(scan) == SANY)
1390 goto do_default;
1391 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1392 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1393 || (data->start_class->flags & ANYOF_CLASS));
1394 cl_anything(pRExC_state, data->start_class);
1395 }
1396 if (flags & SCF_DO_STCLASS_AND || !value)
1397 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1398 break;
1399 case ANYOF:
1400 if (flags & SCF_DO_STCLASS_AND)
1401 cl_and(data->start_class,
1402 (struct regnode_charclass_class*)scan);
1403 else
1404 cl_or(pRExC_state, data->start_class,
1405 (struct regnode_charclass_class*)scan);
1406 break;
1407 case ALNUM:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1411 for (value = 0; value < 256; value++)
1412 if (!isALNUM(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (isALNUM(value))
1422 ANYOF_BITMAP_SET(data->start_class, value);
1423 }
1424 }
1425 break;
1426 case ALNUML:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1430 }
1431 else {
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1433 data->start_class->flags |= ANYOF_LOCALE;
1434 }
1435 break;
1436 case NALNUM:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1440 for (value = 0; value < 256; value++)
1441 if (isALNUM(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isALNUM(value))
1451 ANYOF_BITMAP_SET(data->start_class, value);
1452 }
1453 }
1454 break;
1455 case NALNUML:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1459 }
1460 else {
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1463 }
1464 break;
1465 case SPACE:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1469 for (value = 0; value < 256; value++)
1470 if (!isSPACE(value))
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1472 }
1473 }
1474 else {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1477 else {
1478 for (value = 0; value < 256; value++)
1479 if (isSPACE(value))
1480 ANYOF_BITMAP_SET(data->start_class, value);
1481 }
1482 }
1483 break;
1484 case SPACEL:
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE)
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1488 }
1489 else {
1490 data->start_class->flags |= ANYOF_LOCALE;
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1492 }
1493 break;
1494 case NSPACE:
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1498 for (value = 0; value < 256; value++)
1499 if (isSPACE(value))
1500 ANYOF_BITMAP_CLEAR(data->start_class, value);
1501 }
1502 }
1503 else {
1504 if (data->start_class->flags & ANYOF_LOCALE)
1505 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1506 else {
1507 for (value = 0; value < 256; value++)
1508 if (!isSPACE(value))
1509 ANYOF_BITMAP_SET(data->start_class, value);
1510 }
1511 }
1512 break;
1513 case NSPACEL:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 if (data->start_class->flags & ANYOF_LOCALE) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 }
1521 }
1522 else {
1523 data->start_class->flags |= ANYOF_LOCALE;
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1525 }
1526 break;
1527 case DIGIT:
1528 if (flags & SCF_DO_STCLASS_AND) {
1529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1530 for (value = 0; value < 256; value++)
1531 if (!isDIGIT(value))
1532 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 }
1534 else {
1535 if (data->start_class->flags & ANYOF_LOCALE)
1536 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1537 else {
1538 for (value = 0; value < 256; value++)
1539 if (isDIGIT(value))
1540 ANYOF_BITMAP_SET(data->start_class, value);
1541 }
1542 }
1543 break;
1544 case NDIGIT:
1545 if (flags & SCF_DO_STCLASS_AND) {
1546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1547 for (value = 0; value < 256; value++)
1548 if (isDIGIT(value))
1549 ANYOF_BITMAP_CLEAR(data->start_class, value);
1550 }
1551 else {
1552 if (data->start_class->flags & ANYOF_LOCALE)
1553 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1554 else {
1555 for (value = 0; value < 256; value++)
1556 if (!isDIGIT(value))
1557 ANYOF_BITMAP_SET(data->start_class, value);
1558 }
1559 }
1560 break;
1561 }
1562 if (flags & SCF_DO_STCLASS_OR)
1563 cl_and(data->start_class, &and_with);
1564 flags &= ~SCF_DO_STCLASS;
1565 }
1566 }
1567 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1568 data->flags |= (OP(scan) == MEOL
1569 ? SF_BEFORE_MEOL
1570 : SF_BEFORE_SEOL);
1571 }
1572 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1573 /* Lookbehind, or need to calculate parens/evals/stclass: */
1574 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1575 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1576 /* Lookahead/lookbehind */
1577 I32 deltanext, minnext, fake = 0;
1578 regnode *nscan;
1579 struct regnode_charclass_class intrnl;
1580 int f = 0;
1581
1582 data_fake.flags = 0;
1583 if (data) {
1584 data_fake.whilem_c = data->whilem_c;
1585 data_fake.last_closep = data->last_closep;
1586 }
1587 else
1588 data_fake.last_closep = &fake;
1589 if ( flags & SCF_DO_STCLASS && !scan->flags
1590 && OP(scan) == IFMATCH ) { /* Lookahead */
1591 cl_init(pRExC_state, &intrnl);
1592 data_fake.start_class = &intrnl;
1593 f |= SCF_DO_STCLASS_AND;
1594 }
1595 if (flags & SCF_WHILEM_VISITED_POS)
1596 f |= SCF_WHILEM_VISITED_POS;
1597 next = regnext(scan);
1598 nscan = NEXTOPER(NEXTOPER(scan));
1599 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1600 if (scan->flags) {
1601 if (deltanext) {
1602 vFAIL("Variable length lookbehind not implemented");
1603 }
1604 else if (minnext > U8_MAX) {
1605 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1606 }
1607 scan->flags = (U8)minnext;
1608 }
1609 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1610 pars++;
1611 if (data && (data_fake.flags & SF_HAS_EVAL))
1612 data->flags |= SF_HAS_EVAL;
1613 if (data)
1614 data->whilem_c = data_fake.whilem_c;
1615 if (f & SCF_DO_STCLASS_AND) {
1616 const int was = (data->start_class->flags & ANYOF_EOS);
1617
1618 cl_and(data->start_class, &intrnl);
1619 if (was)
1620 data->start_class->flags |= ANYOF_EOS;
1621 }
1622 }
1623 else if (OP(scan) == OPEN) {
1624 pars++;
1625 }
1626 else if (OP(scan) == CLOSE) {
1627 if ((I32)ARG(scan) == is_par) {
1628 next = regnext(scan);
1629
1630 if ( next && (OP(next) != WHILEM) && next < last)
1631 is_par = 0; /* Disable optimization */
1632 }
1633 if (data)
1634 *(data->last_closep) = ARG(scan);
1635 }
1636 else if (OP(scan) == EVAL) {
1637 if (data)
1638 data->flags |= SF_HAS_EVAL;
1639 }
1640 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1641 if (flags & SCF_DO_SUBSTR) {
1642 scan_commit(pRExC_state,data);
1643 data->longest = &(data->longest_float);
1644 }
1645 is_inf = is_inf_internal = 1;
1646 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1647 cl_anything(pRExC_state, data->start_class);
1648 flags &= ~SCF_DO_STCLASS;
1649 }
1650 /* Else: zero-length, ignore. */
1651 scan = regnext(scan);
1652 }
1653
1654 finish:
1655 *scanp = scan;
1656 *deltap = is_inf_internal ? I32_MAX : delta;
1657 if (flags & SCF_DO_SUBSTR && is_inf)
1658 data->pos_delta = I32_MAX - data->pos_min;
1659 if (is_par > U8_MAX)
1660 is_par = 0;
1661 if (is_par && pars==1 && data) {
1662 data->flags |= SF_IN_PAR;
1663 data->flags &= ~SF_HAS_PAR;
1664 }
1665 else if (pars && data) {
1666 data->flags |= SF_HAS_PAR;
1667 data->flags &= ~SF_IN_PAR;
1668 }
1669 if (flags & SCF_DO_STCLASS_OR)
1670 cl_and(data->start_class, &and_with);
1671 return min;
1672}
1673
1674STATIC I32
1675S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
1676{
1677 if (RExC_rx->data) {
1678 Renewc(RExC_rx->data,
1679 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1680 char, struct reg_data);
1681 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1682 RExC_rx->data->count += n;
1683 }
1684 else {
1685 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1686 char, struct reg_data);
1687 Newx(RExC_rx->data->what, n, U8);
1688 RExC_rx->data->count = n;
1689 }
1690 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1691 return RExC_rx->data->count - n;
1692}
1693
1694void
1695Perl_reginitcolors(pTHX)
1696{
1697 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1698 if (s) {
1699 char *t = savepv(s);
1700 int i = 0;
1701 PL_colors[0] = t;
1702 while (++i < 6) {
1703 t = strchr(t, '\t');
1704 if (t) {
1705 *t = '\0';
1706 PL_colors[i] = ++t;
1707 }
1708 else
1709 PL_colors[i] = t = (char *)"";
1710 }
1711 } else {
1712 int i = 0;
1713 while (i < 6)
1714 PL_colors[i++] = (char *)"";
1715 }
1716 PL_colorset = 1;
1717}
1718
1719
1720/*
1721 - pregcomp - compile a regular expression into internal code
1722 *
1723 * We can't allocate space until we know how big the compiled form will be,
1724 * but we can't compile it (and thus know how big it is) until we've got a
1725 * place to put the code. So we cheat: we compile it twice, once with code
1726 * generation turned off and size counting turned on, and once "for real".
1727 * This also means that we don't allocate space until we are sure that the
1728 * thing really will compile successfully, and we never have to move the
1729 * code and thus invalidate pointers into it. (Note that it has to be in
1730 * one piece because free() must be able to free it all.) [NB: not true in perl]
1731 *
1732 * Beware that the optimization-preparation code in here knows about some
1733 * of the structure of the compiled regexp. [I'll say.]
1734 */
1735regexp *
1736Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1737{
1738 register regexp *r;
1739 regnode *scan;
1740 regnode *first;
1741 I32 flags;
1742 I32 minlen = 0;
1743 I32 sawplus = 0;
1744 I32 sawopen = 0;
1745 scan_data_t data;
1746 RExC_state_t RExC_state;
1747 RExC_state_t *pRExC_state = &RExC_state;
1748
1749 if (exp == NULL)
1750 FAIL("NULL regexp argument");
1751
1752 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1753
1754 RExC_precomp = exp;
1755 DEBUG_r({
1756 if (!PL_colorset) reginitcolors();
1757 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1758 PL_colors[4],PL_colors[5],PL_colors[0],
1759 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1760 });
1761 RExC_flags = pm->op_pmflags;
1762 RExC_sawback = 0;
1763
1764 RExC_seen = 0;
1765 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1766 RExC_seen_evals = 0;
1767 RExC_extralen = 0;
1768
1769 /* First pass: determine size, legality. */
1770 RExC_parse = exp;
1771 RExC_start = exp;
1772 RExC_end = xend;
1773 RExC_naughty = 0;
1774 RExC_npar = 1;
1775 RExC_size = 0L;
1776 RExC_emit = &PL_regdummy;
1777 RExC_whilem_seen = 0;
1778#if 0 /* REGC() is (currently) a NOP at the first pass.
1779 * Clever compilers notice this and complain. --jhi */
1780 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1781#endif
1782 if (reg(pRExC_state, 0, &flags) == NULL) {
1783 RExC_precomp = Nullch;
1784 return(NULL);
1785 }
1786 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1787
1788 /* Small enough for pointer-storage convention?
1789 If extralen==0, this means that we will not need long jumps. */
1790 if (RExC_size >= 0x10000L && RExC_extralen)
1791 RExC_size += RExC_extralen;
1792 else
1793 RExC_extralen = 0;
1794 if (RExC_whilem_seen > 15)
1795 RExC_whilem_seen = 15;
1796
1797 /* Allocate space and initialize. */
1798 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1799 char, regexp);
1800 if (r == NULL)
1801 FAIL("Regexp out of space");
1802
1803#ifdef DEBUGGING
1804 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1805 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1806#endif
1807 r->refcnt = 1;
1808 r->prelen = xend - exp;
1809 r->precomp = savepvn(RExC_precomp, r->prelen);
1810 r->subbeg = NULL;
1811 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1812 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1813
1814 r->substrs = 0; /* Useful during FAIL. */
1815 r->startp = 0; /* Useful during FAIL. */
1816 r->endp = 0; /* Useful during FAIL. */
1817
1818 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1819 if (r->offsets) {
1820 r->offsets[0] = RExC_size;
1821 }
1822 DEBUG_r(PerlIO_printf(Perl_debug_log,
1823 "%s %"UVuf" bytes for offset annotations.\n",
1824 r->offsets ? "Got" : "Couldn't get",
1825 (UV)((2*RExC_size+1) * sizeof(U32))));
1826
1827 RExC_rx = r;
1828
1829 /* Second pass: emit code. */
1830 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1831 RExC_parse = exp;
1832 RExC_end = xend;
1833 RExC_naughty = 0;
1834 RExC_npar = 1;
1835 RExC_emit_start = r->program;
1836 RExC_emit = r->program;
1837 /* Store the count of eval-groups for security checks: */
1838 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1839 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1840 r->data = 0;
1841 if (reg(pRExC_state, 0, &flags) == NULL)
1842 return(NULL);
1843
1844 /* Dig out information for optimizations. */
1845 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1846 pm->op_pmflags = RExC_flags;
1847 if (UTF)
1848 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1849 r->regstclass = NULL;
1850 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1851 r->reganch |= ROPT_NAUGHTY;
1852 scan = r->program + 1; /* First BRANCH. */
1853
1854 /* XXXX To minimize changes to RE engine we always allocate
1855 3-units-long substrs field. */
1856 Newxz(r->substrs, 1, struct reg_substr_data);
1857
1858 StructCopy(&zero_scan_data, &data, scan_data_t);
1859 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1860 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1861 I32 fake;
1862 STRLEN longest_float_length, longest_fixed_length;
1863 struct regnode_charclass_class ch_class;
1864 int stclass_flag;
1865 I32 last_close = 0;
1866
1867 first = scan;
1868 /* Skip introductions and multiplicators >= 1. */
1869 while ((OP(first) == OPEN && (sawopen = 1)) ||
1870 /* An OR of *one* alternative - should not happen now. */
1871 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1872 (OP(first) == PLUS) ||
1873 (OP(first) == MINMOD) ||
1874 /* An {n,m} with n>0 */
1875 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1876 if (OP(first) == PLUS)
1877 sawplus = 1;
1878 else
1879 first += regarglen[(U8)OP(first)];
1880 first = NEXTOPER(first);
1881 }
1882
1883 /* Starting-point info. */
1884 again:
1885 if (PL_regkind[(U8)OP(first)] == EXACT) {
1886 if (OP(first) == EXACT)
1887 ; /* Empty, get anchored substr later. */
1888 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1889 r->regstclass = first;
1890 }
1891 else if (strchr((const char*)PL_simple,OP(first)))
1892 r->regstclass = first;
1893 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1894 PL_regkind[(U8)OP(first)] == NBOUND)
1895 r->regstclass = first;
1896 else if (PL_regkind[(U8)OP(first)] == BOL) {
1897 r->reganch |= (OP(first) == MBOL
1898 ? ROPT_ANCH_MBOL
1899 : (OP(first) == SBOL
1900 ? ROPT_ANCH_SBOL
1901 : ROPT_ANCH_BOL));
1902 first = NEXTOPER(first);
1903 goto again;
1904 }
1905 else if (OP(first) == GPOS) {
1906 r->reganch |= ROPT_ANCH_GPOS;
1907 first = NEXTOPER(first);
1908 goto again;
1909 }
1910 else if (!sawopen && (OP(first) == STAR &&
1911 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1912 !(r->reganch & ROPT_ANCH) )
1913 {
1914 /* turn .* into ^.* with an implied $*=1 */
1915 const int type =
1916 (OP(NEXTOPER(first)) == REG_ANY)
1917 ? ROPT_ANCH_MBOL
1918 : ROPT_ANCH_SBOL;
1919 r->reganch |= type | ROPT_IMPLICIT;
1920 first = NEXTOPER(first);
1921 goto again;
1922 }
1923 if (sawplus && (!sawopen || !RExC_sawback)
1924 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1925 /* x+ must match at the 1st pos of run of x's */
1926 r->reganch |= ROPT_SKIP;
1927
1928 /* Scan is after the zeroth branch, first is atomic matcher. */
1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1930 (IV)(first - scan + 1)));
1931 /*
1932 * If there's something expensive in the r.e., find the
1933 * longest literal string that must appear and make it the
1934 * regmust. Resolve ties in favor of later strings, since
1935 * the regstart check works with the beginning of the r.e.
1936 * and avoiding duplication strengthens checking. Not a
1937 * strong reason, but sufficient in the absence of others.
1938 * [Now we resolve ties in favor of the earlier string if
1939 * it happens that c_offset_min has been invalidated, since the
1940 * earlier string may buy us something the later one won't.]
1941 */
1942 minlen = 0;
1943
1944 data.longest_fixed = newSVpvn("",0);
1945 data.longest_float = newSVpvn("",0);
1946 data.last_found = newSVpvn("",0);
1947 data.longest = &(data.longest_fixed);
1948 first = scan;
1949 if (!r->regstclass) {
1950 cl_init(pRExC_state, &ch_class);
1951 data.start_class = &ch_class;
1952 stclass_flag = SCF_DO_STCLASS_AND;
1953 } else /* XXXX Check for BOUND? */
1954 stclass_flag = 0;
1955 data.last_closep = &last_close;
1956
1957 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1958 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1959 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1960 && data.last_start_min == 0 && data.last_end > 0
1961 && !RExC_seen_zerolen
1962 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1963 r->reganch |= ROPT_CHECK_ALL;
1964 scan_commit(pRExC_state, &data);
1965 SvREFCNT_dec(data.last_found);
1966
1967 longest_float_length = CHR_SVLEN(data.longest_float);
1968 if (longest_float_length
1969 || (data.flags & SF_FL_BEFORE_EOL
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971 || (RExC_flags & PMf_MULTILINE)))) {
1972 int t;
1973
1974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1975 && data.offset_fixed == data.offset_float_min
1976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977 goto remove_float; /* As in (a)+. */
1978
1979 if (SvUTF8(data.longest_float)) {
1980 r->float_utf8 = data.longest_float;
1981 r->float_substr = Nullsv;
1982 } else {
1983 r->float_substr = data.longest_float;
1984 r->float_utf8 = Nullsv;
1985 }
1986 r->float_min_offset = data.offset_float_min;
1987 r->float_max_offset = data.offset_float_max;
1988 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FL_BEFORE_MEOL)
1990 || (RExC_flags & PMf_MULTILINE)));
1991 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1992 }
1993 else {
1994 remove_float:
1995 r->float_substr = r->float_utf8 = Nullsv;
1996 SvREFCNT_dec(data.longest_float);
1997 longest_float_length = 0;
1998 }
1999
2000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2001 if (longest_fixed_length
2002 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2004 || (RExC_flags & PMf_MULTILINE)))) {
2005 int t;
2006
2007 if (SvUTF8(data.longest_fixed)) {
2008 r->anchored_utf8 = data.longest_fixed;
2009 r->anchored_substr = Nullsv;
2010 } else {
2011 r->anchored_substr = data.longest_fixed;
2012 r->anchored_utf8 = Nullsv;
2013 }
2014 r->anchored_offset = data.offset_fixed;
2015 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2017 || (RExC_flags & PMf_MULTILINE)));
2018 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2019 }
2020 else {
2021 r->anchored_substr = r->anchored_utf8 = Nullsv;
2022 SvREFCNT_dec(data.longest_fixed);
2023 longest_fixed_length = 0;
2024 }
2025 if (r->regstclass
2026 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2027 r->regstclass = NULL;
2028 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029 && stclass_flag
2030 && !(data.start_class->flags & ANYOF_EOS)
2031 && !cl_is_anything(data.start_class))
2032 {
2033 const I32 n = add_data(pRExC_state, 1, "f");
2034
2035 Newx(RExC_rx->data->data[n], 1,
2036 struct regnode_charclass_class);
2037 StructCopy(data.start_class,
2038 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2039 struct regnode_charclass_class);
2040 r->regstclass = (regnode*)RExC_rx->data->data[n];
2041 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2042 PL_regdata = r->data; /* for regprop() */
2043 DEBUG_r({ SV *sv = sv_newmortal();
2044 regprop(sv, (regnode*)data.start_class);
2045 PerlIO_printf(Perl_debug_log,
2046 "synthetic stclass \"%s\".\n",
2047 SvPVX_const(sv));});
2048 }
2049
2050 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2051 if (longest_fixed_length > longest_float_length) {
2052 r->check_substr = r->anchored_substr;
2053 r->check_utf8 = r->anchored_utf8;
2054 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055 if (r->reganch & ROPT_ANCH_SINGLE)
2056 r->reganch |= ROPT_NOSCAN;
2057 }
2058 else {
2059 r->check_substr = r->float_substr;
2060 r->check_utf8 = r->float_utf8;
2061 r->check_offset_min = data.offset_float_min;
2062 r->check_offset_max = data.offset_float_max;
2063 }
2064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065 This should be changed ASAP! */
2066 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2067 r->reganch |= RE_USE_INTUIT;
2068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2069 r->reganch |= RE_INTUIT_TAIL;
2070 }
2071 }
2072 else {
2073 /* Several toplevels. Best we can is to set minlen. */
2074 I32 fake;
2075 struct regnode_charclass_class ch_class;
2076 I32 last_close = 0;
2077
2078 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079 scan = r->program + 1;
2080 cl_init(pRExC_state, &ch_class);
2081 data.start_class = &ch_class;
2082 data.last_closep = &last_close;
2083 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085 = r->float_substr = r->float_utf8 = Nullsv;
2086 if (!(data.start_class->flags & ANYOF_EOS)
2087 && !cl_is_anything(data.start_class))
2088 {
2089 const I32 n = add_data(pRExC_state, 1, "f");
2090
2091 Newx(RExC_rx->data->data[n], 1,
2092 struct regnode_charclass_class);
2093 StructCopy(data.start_class,
2094 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2095 struct regnode_charclass_class);
2096 r->regstclass = (regnode*)RExC_rx->data->data[n];
2097 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2098 DEBUG_r({ SV* sv = sv_newmortal();
2099 regprop(sv, (regnode*)data.start_class);
2100 PerlIO_printf(Perl_debug_log,
2101 "synthetic stclass \"%s\".\n",
2102 SvPVX_const(sv));});
2103 }
2104 }
2105
2106 r->minlen = minlen;
2107 if (RExC_seen & REG_SEEN_GPOS)
2108 r->reganch |= ROPT_GPOS_SEEN;
2109 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2110 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2111 if (RExC_seen & REG_SEEN_EVAL)
2112 r->reganch |= ROPT_EVAL_SEEN;
2113 if (RExC_seen & REG_SEEN_CANY)
2114 r->reganch |= ROPT_CANY_SEEN;
2115 Newxz(r->startp, RExC_npar, I32);
2116 Newxz(r->endp, RExC_npar, I32);
2117 PL_regdata = r->data; /* for regprop() */
2118 DEBUG_r(regdump(r));
2119 return(r);
2120}
2121
2122/*
2123 - reg - regular expression, i.e. main body or parenthesized thing
2124 *
2125 * Caller must absorb opening parenthesis.
2126 *
2127 * Combining parenthesis handling with the base level of regular expression
2128 * is a trifle forced, but the need to tie the tails of the branches to what
2129 * follows makes it hard to avoid.
2130 */
2131STATIC regnode *
2132S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2133 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2134{
2135 register regnode *ret; /* Will be the head of the group. */
2136 register regnode *br;
2137 register regnode *lastbr;
2138 register regnode *ender = 0;
2139 register I32 parno = 0;
2140 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2141
2142 /* for (?g), (?gc), and (?o) warnings; warning
2143 about (?c) will warn about (?g) -- japhy */
2144
2145 I32 wastedflags = 0x00,
2146 wasted_o = 0x01,
2147 wasted_g = 0x02,
2148 wasted_gc = 0x02 | 0x04,
2149 wasted_c = 0x04;
2150
2151 char * parse_start = RExC_parse; /* MJD */
2152 char * const oregcomp_parse = RExC_parse;
2153 char c;
2154
2155 *flagp = 0; /* Tentatively. */
2156
2157
2158 /* Make an OPEN node, if parenthesized. */
2159 if (paren) {
2160 if (*RExC_parse == '?') { /* (?...) */
2161 U32 posflags = 0, negflags = 0;
2162 U32 *flagsp = &posflags;
2163 int logical = 0;
2164 const char * const seqstart = RExC_parse;
2165
2166 RExC_parse++;
2167 paren = *RExC_parse++;
2168 ret = NULL; /* For look-ahead/behind. */
2169 switch (paren) {
2170 case '<': /* (?<...) */
2171 RExC_seen |= REG_SEEN_LOOKBEHIND;
2172 if (*RExC_parse == '!')
2173 paren = ',';
2174 if (*RExC_parse != '=' && *RExC_parse != '!')
2175 goto unknown;
2176 RExC_parse++;
2177 case '=': /* (?=...) */
2178 case '!': /* (?!...) */
2179 RExC_seen_zerolen++;
2180 case ':': /* (?:...) */
2181 case '>': /* (?>...) */
2182 break;
2183 case '$': /* (?$...) */
2184 case '@': /* (?@...) */
2185 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2186 break;
2187 case '#': /* (?#...) */
2188 while (*RExC_parse && *RExC_parse != ')')
2189 RExC_parse++;
2190 if (*RExC_parse != ')')
2191 FAIL("Sequence (?#... not terminated");
2192 nextchar(pRExC_state);
2193 *flagp = TRYAGAIN;
2194 return NULL;
2195 case 'p': /* (?p...) */
2196 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2197 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2198 /* FALL THROUGH*/
2199 case '?': /* (??...) */
2200 logical = 1;
2201 if (*RExC_parse != '{')
2202 goto unknown;
2203 paren = *RExC_parse++;
2204 /* FALL THROUGH */
2205 case '{': /* (?{...}) */
2206 {
2207 I32 count = 1, n = 0;
2208 char c;
2209 char *s = RExC_parse;
2210 SV *sv;
2211 OP_4tree *sop, *rop;
2212
2213 RExC_seen_zerolen++;
2214 RExC_seen |= REG_SEEN_EVAL;
2215 while (count && (c = *RExC_parse)) {
2216 if (c == '\\' && RExC_parse[1])
2217 RExC_parse++;
2218 else if (c == '{')
2219 count++;
2220 else if (c == '}')
2221 count--;
2222 RExC_parse++;
2223 }
2224 if (*RExC_parse != ')')
2225 {
2226 RExC_parse = s;
2227 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228 }
2229 if (!SIZE_ONLY) {
2230 PAD *pad;
2231
2232 if (RExC_parse - 1 - s)
2233 sv = newSVpvn(s, RExC_parse - 1 - s);
2234 else
2235 sv = newSVpvn("", 0);
2236
2237 ENTER;
2238 Perl_save_re_context(aTHX);
2239 rop = sv_compile_2op(sv, &sop, "re", &pad);
2240 sop->op_private |= OPpREFCOUNTED;
2241 /* re_dup will OpREFCNT_inc */
2242 OpREFCNT_set(sop, 1);
2243 LEAVE;
2244
2245 n = add_data(pRExC_state, 3, "nop");
2246 RExC_rx->data->data[n] = (void*)rop;
2247 RExC_rx->data->data[n+1] = (void*)sop;
2248 RExC_rx->data->data[n+2] = (void*)pad;
2249 SvREFCNT_dec(sv);
2250 }
2251 else { /* First pass */
2252 if (PL_reginterp_cnt < ++RExC_seen_evals
2253 && IN_PERL_RUNTIME)
2254 /* No compiled RE interpolated, has runtime
2255 components ===> unsafe. */
2256 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2257 if (PL_tainting && PL_tainted)
2258 FAIL("Eval-group in insecure regular expression");
2259 }
2260
2261 nextchar(pRExC_state);
2262 if (logical) {
2263 ret = reg_node(pRExC_state, LOGICAL);
2264 if (!SIZE_ONLY)
2265 ret->flags = 2;
2266 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2267 /* deal with the length of this later - MJD */
2268 return ret;
2269 }
2270 ret = reganode(pRExC_state, EVAL, n);
2271 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272 Set_Node_Offset(ret, parse_start);
2273 return ret;
2274 }
2275 case '(': /* (?(?{...})...) and (?(?=...)...) */
2276 {
2277 if (RExC_parse[0] == '?') { /* (?(?...)) */
2278 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279 || RExC_parse[1] == '<'
2280 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2281 I32 flag;
2282
2283 ret = reg_node(pRExC_state, LOGICAL);
2284 if (!SIZE_ONLY)
2285 ret->flags = 1;
2286 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2287 goto insert_if;
2288 }
2289 }
2290 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2291 /* (?(1)...) */
2292 parno = atoi(RExC_parse++);
2293
2294 while (isDIGIT(*RExC_parse))
2295 RExC_parse++;
2296 ret = reganode(pRExC_state, GROUPP, parno);
2297
2298 if ((c = *nextchar(pRExC_state)) != ')')
2299 vFAIL("Switch condition not recognized");
2300 insert_if:
2301 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302 br = regbranch(pRExC_state, &flags, 1);
2303 if (br == NULL)
2304 br = reganode(pRExC_state, LONGJMP, 0);
2305 else
2306 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307 c = *nextchar(pRExC_state);
2308 if (flags&HASWIDTH)
2309 *flagp |= HASWIDTH;
2310 if (c == '|') {
2311 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312 regbranch(pRExC_state, &flags, 1);
2313 regtail(pRExC_state, ret, lastbr);
2314 if (flags&HASWIDTH)
2315 *flagp |= HASWIDTH;
2316 c = *nextchar(pRExC_state);
2317 }
2318 else
2319 lastbr = NULL;
2320 if (c != ')')
2321 vFAIL("Switch (?(condition)... contains too many branches");
2322 ender = reg_node(pRExC_state, TAIL);
2323 regtail(pRExC_state, br, ender);
2324 if (lastbr) {
2325 regtail(pRExC_state, lastbr, ender);
2326 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2327 }
2328 else
2329 regtail(pRExC_state, ret, ender);
2330 return ret;
2331 }
2332 else {
2333 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2334 }
2335 }
2336 case 0:
2337 RExC_parse--; /* for vFAIL to print correctly */
2338 vFAIL("Sequence (? incomplete");
2339 break;
2340 default:
2341 --RExC_parse;
2342 parse_flags: /* (?i) */
2343 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2344 /* (?g), (?gc) and (?o) are useless here
2345 and must be globally applied -- japhy */
2346
2347 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350 if (! (wastedflags & wflagbit) ) {
2351 wastedflags |= wflagbit;
2352 vWARN5(
2353 RExC_parse + 1,
2354 "Useless (%s%c) - %suse /%c modifier",
2355 flagsp == &negflags ? "?-" : "?",
2356 *RExC_parse,
2357 flagsp == &negflags ? "don't " : "",
2358 *RExC_parse
2359 );
2360 }
2361 }
2362 }
2363 else if (*RExC_parse == 'c') {
2364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365 if (! (wastedflags & wasted_c) ) {
2366 wastedflags |= wasted_gc;
2367 vWARN3(
2368 RExC_parse + 1,
2369 "Useless (%sc) - %suse /gc modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 flagsp == &negflags ? "don't " : ""
2372 );
2373 }
2374 }
2375 }
2376 else { pmflag(flagsp, *RExC_parse); }
2377
2378 ++RExC_parse;
2379 }
2380 if (*RExC_parse == '-') {
2381 flagsp = &negflags;
2382 wastedflags = 0; /* reset so (?g-c) warns twice */
2383 ++RExC_parse;
2384 goto parse_flags;
2385 }
2386 RExC_flags |= posflags;
2387 RExC_flags &= ~negflags;
2388 if (*RExC_parse == ':') {
2389 RExC_parse++;
2390 paren = ':';
2391 break;
2392 }
2393 unknown:
2394 if (*RExC_parse != ')') {
2395 RExC_parse++;
2396 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2397 }
2398 nextchar(pRExC_state);
2399 *flagp = TRYAGAIN;
2400 return NULL;
2401 }
2402 }
2403 else { /* (...) */
2404 parno = RExC_npar;
2405 RExC_npar++;
2406 ret = reganode(pRExC_state, OPEN, parno);
2407 Set_Node_Length(ret, 1); /* MJD */
2408 Set_Node_Offset(ret, RExC_parse); /* MJD */
2409 open = 1;
2410 }
2411 }
2412 else /* ! paren */
2413 ret = NULL;
2414
2415 /* Pick up the branches, linking them together. */
2416 parse_start = RExC_parse; /* MJD */
2417 br = regbranch(pRExC_state, &flags, 1);
2418 /* branch_len = (paren != 0); */
2419
2420 if (br == NULL)
2421 return(NULL);
2422 if (*RExC_parse == '|') {
2423 if (!SIZE_ONLY && RExC_extralen) {
2424 reginsert(pRExC_state, BRANCHJ, br);
2425 }
2426 else { /* MJD */
2427 reginsert(pRExC_state, BRANCH, br);
2428 Set_Node_Length(br, paren != 0);
2429 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2430 }
2431 have_branch = 1;
2432 if (SIZE_ONLY)
2433 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2434 }
2435 else if (paren == ':') {
2436 *flagp |= flags&SIMPLE;
2437 }
2438 if (open) { /* Starts with OPEN. */
2439 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2440 }
2441 else if (paren != '?') /* Not Conditional */
2442 ret = br;
2443 *flagp |= flags & (SPSTART | HASWIDTH);
2444 lastbr = br;
2445 while (*RExC_parse == '|') {
2446 if (!SIZE_ONLY && RExC_extralen) {
2447 ender = reganode(pRExC_state, LONGJMP,0);
2448 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2449 }
2450 if (SIZE_ONLY)
2451 RExC_extralen += 2; /* Account for LONGJMP. */
2452 nextchar(pRExC_state);
2453 br = regbranch(pRExC_state, &flags, 0);
2454
2455 if (br == NULL)
2456 return(NULL);
2457 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2458 lastbr = br;
2459 if (flags&HASWIDTH)
2460 *flagp |= HASWIDTH;
2461 *flagp |= flags&SPSTART;
2462 }
2463
2464 if (have_branch || paren != ':') {
2465 /* Make a closing node, and hook it on the end. */
2466 switch (paren) {
2467 case ':':
2468 ender = reg_node(pRExC_state, TAIL);
2469 break;
2470 case 1:
2471 ender = reganode(pRExC_state, CLOSE, parno);
2472 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473 Set_Node_Length(ender,1); /* MJD */
2474 break;
2475 case '<':
2476 case ',':
2477 case '=':
2478 case '!':
2479 *flagp &= ~HASWIDTH;
2480 /* FALL THROUGH */
2481 case '>':
2482 ender = reg_node(pRExC_state, SUCCEED);
2483 break;
2484 case 0:
2485 ender = reg_node(pRExC_state, END);
2486 break;
2487 }
2488 regtail(pRExC_state, lastbr, ender);
2489
2490 if (have_branch) {
2491 /* Hook the tails of the branches to the closing node. */
2492 for (br = ret; br != NULL; br = regnext(br)) {
2493 regoptail(pRExC_state, br, ender);
2494 }
2495 }
2496 }
2497
2498 {
2499 const char *p;
2500 static const char parens[] = "=!<,>";
2501
2502 if (paren && (p = strchr(parens, paren))) {
2503 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2504 int flag = (p - parens) > 1;
2505
2506 if (paren == '>')
2507 node = SUSPEND, flag = 0;
2508 reginsert(pRExC_state, node,ret);
2509 Set_Node_Cur_Length(ret);
2510 Set_Node_Offset(ret, parse_start + 1);
2511 ret->flags = flag;
2512 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2513 }
2514 }
2515
2516 /* Check for proper termination. */
2517 if (paren) {
2518 RExC_flags = oregflags;
2519 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520 RExC_parse = oregcomp_parse;
2521 vFAIL("Unmatched (");
2522 }
2523 }
2524 else if (!paren && RExC_parse < RExC_end) {
2525 if (*RExC_parse == ')') {
2526 RExC_parse++;
2527 vFAIL("Unmatched )");
2528 }
2529 else
2530 FAIL("Junk on end of regexp"); /* "Can't happen". */
2531 /* NOTREACHED */
2532 }
2533
2534 return(ret);
2535}
2536
2537/*
2538 - regbranch - one alternative of an | operator
2539 *
2540 * Implements the concatenation operator.
2541 */
2542STATIC regnode *
2543S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2544{
2545 register regnode *ret;
2546 register regnode *chain = NULL;
2547 register regnode *latest;
2548 I32 flags = 0, c = 0;
2549
2550 if (first)
2551 ret = NULL;
2552 else {
2553 if (!SIZE_ONLY && RExC_extralen)
2554 ret = reganode(pRExC_state, BRANCHJ,0);
2555 else {
2556 ret = reg_node(pRExC_state, BRANCH);
2557 Set_Node_Length(ret, 1);
2558 }
2559 }
2560
2561 if (!first && SIZE_ONLY)
2562 RExC_extralen += 1; /* BRANCHJ */
2563
2564 *flagp = WORST; /* Tentatively. */
2565
2566 RExC_parse--;
2567 nextchar(pRExC_state);
2568 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2569 flags &= ~TRYAGAIN;
2570 latest = regpiece(pRExC_state, &flags);
2571 if (latest == NULL) {
2572 if (flags & TRYAGAIN)
2573 continue;
2574 return(NULL);
2575 }
2576 else if (ret == NULL)
2577 ret = latest;
2578 *flagp |= flags&HASWIDTH;
2579 if (chain == NULL) /* First piece. */
2580 *flagp |= flags&SPSTART;
2581 else {
2582 RExC_naughty++;
2583 regtail(pRExC_state, chain, latest);
2584 }
2585 chain = latest;
2586 c++;
2587 }
2588 if (chain == NULL) { /* Loop ran zero times. */
2589 chain = reg_node(pRExC_state, NOTHING);
2590 if (ret == NULL)
2591 ret = chain;
2592 }
2593 if (c == 1) {
2594 *flagp |= flags&SIMPLE;
2595 }
2596
2597 return(ret);
2598}
2599
2600/*
2601 - regpiece - something followed by possible [*+?]
2602 *
2603 * Note that the branching code sequences used for ? and the general cases
2604 * of * and + are somewhat optimized: they use the same NOTHING node as
2605 * both the endmarker for their branch list and the body of the last branch.
2606 * It might seem that this node could be dispensed with entirely, but the
2607 * endmarker role is not redundant.
2608 */
2609STATIC regnode *
2610S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2611{
2612 register regnode *ret;
2613 register char op;
2614 register char *next;
2615 I32 flags;
2616 const char * const origparse = RExC_parse;
2617 char *maxpos;
2618 I32 min;
2619 I32 max = REG_INFTY;
2620 char *parse_start;
2621
2622 ret = regatom(pRExC_state, &flags);
2623 if (ret == NULL) {
2624 if (flags & TRYAGAIN)
2625 *flagp |= TRYAGAIN;
2626 return(NULL);
2627 }
2628
2629 op = *RExC_parse;
2630
2631 if (op == '{' && regcurly(RExC_parse)) {
2632 parse_start = RExC_parse; /* MJD */
2633 next = RExC_parse + 1;
2634 maxpos = Nullch;
2635 while (isDIGIT(*next) || *next == ',') {
2636 if (*next == ',') {
2637 if (maxpos)
2638 break;
2639 else
2640 maxpos = next;
2641 }
2642 next++;
2643 }
2644 if (*next == '}') { /* got one */
2645 if (!maxpos)
2646 maxpos = next;
2647 RExC_parse++;
2648 min = atoi(RExC_parse);
2649 if (*maxpos == ',')
2650 maxpos++;
2651 else
2652 maxpos = RExC_parse;
2653 max = atoi(maxpos);
2654 if (!max && *maxpos != '0')
2655 max = REG_INFTY; /* meaning "infinity" */
2656 else if (max >= REG_INFTY)
2657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2658 RExC_parse = next;
2659 nextchar(pRExC_state);
2660
2661 do_curly:
2662 if ((flags&SIMPLE)) {
2663 RExC_naughty += 2 + RExC_naughty / 2;
2664 reginsert(pRExC_state, CURLY, ret);
2665 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666 Set_Node_Cur_Length(ret);
2667 }
2668 else {
2669 regnode *w = reg_node(pRExC_state, WHILEM);
2670
2671 w->flags = 0;
2672 regtail(pRExC_state, ret, w);
2673 if (!SIZE_ONLY && RExC_extralen) {
2674 reginsert(pRExC_state, LONGJMP,ret);
2675 reginsert(pRExC_state, NOTHING,ret);
2676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2677 }
2678 reginsert(pRExC_state, CURLYX,ret);
2679 /* MJD hk */
2680 Set_Node_Offset(ret, parse_start+1);
2681 Set_Node_Length(ret,
2682 op == '{' ? (RExC_parse - parse_start) : 1);
2683
2684 if (!SIZE_ONLY && RExC_extralen)
2685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2686 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2687 if (SIZE_ONLY)
2688 RExC_whilem_seen++, RExC_extralen += 3;
2689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2690 }
2691 ret->flags = 0;
2692
2693 if (min > 0)
2694 *flagp = WORST;
2695 if (max > 0)
2696 *flagp |= HASWIDTH;
2697 if (max && max < min)
2698 vFAIL("Can't do {n,m} with n > m");
2699 if (!SIZE_ONLY) {
2700 ARG1_SET(ret, (U16)min);
2701 ARG2_SET(ret, (U16)max);
2702 }
2703
2704 goto nest_check;
2705 }
2706 }
2707
2708 if (!ISMULT1(op)) {
2709 *flagp = flags;
2710 return(ret);
2711 }
2712
2713#if 0 /* Now runtime fix should be reliable. */
2714
2715 /* if this is reinstated, don't forget to put this back into perldiag:
2716
2717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718
2719 (F) The part of the regexp subject to either the * or + quantifier
2720 could match an empty string. The {#} shows in the regular
2721 expression about where the problem was discovered.
2722
2723 */
2724
2725 if (!(flags&HASWIDTH) && op != '?')
2726 vFAIL("Regexp *+ operand could be empty");
2727#endif
2728
2729 parse_start = RExC_parse;
2730 nextchar(pRExC_state);
2731
2732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2733
2734 if (op == '*' && (flags&SIMPLE)) {
2735 reginsert(pRExC_state, STAR, ret);
2736 ret->flags = 0;
2737 RExC_naughty += 4;
2738 }
2739 else if (op == '*') {
2740 min = 0;
2741 goto do_curly;
2742 }
2743 else if (op == '+' && (flags&SIMPLE)) {
2744 reginsert(pRExC_state, PLUS, ret);
2745 ret->flags = 0;
2746 RExC_naughty += 3;
2747 }
2748 else if (op == '+') {
2749 min = 1;
2750 goto do_curly;
2751 }
2752 else if (op == '?') {
2753 min = 0; max = 1;
2754 goto do_curly;
2755 }
2756 nest_check:
2757 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
2758 vWARN3(RExC_parse,
2759 "%.*s matches null string many times",
2760 RExC_parse - origparse,
2761 origparse);
2762 }
2763
2764 if (*RExC_parse == '?') {
2765 nextchar(pRExC_state);
2766 reginsert(pRExC_state, MINMOD, ret);
2767 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2768 }
2769 if (ISMULT2(RExC_parse)) {
2770 RExC_parse++;
2771 vFAIL("Nested quantifiers");
2772 }
2773
2774 return(ret);
2775}
2776
2777/*
2778 - regatom - the lowest level
2779 *
2780 * Optimization: gobbles an entire sequence of ordinary characters so that
2781 * it can turn them into a single node, which is smaller to store and
2782 * faster to run. Backslashed characters are exceptions, each becoming a
2783 * separate node; the code is simpler that way and it's not worth fixing.
2784 *
2785 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2786STATIC regnode *
2787S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2788{
2789 register regnode *ret = 0;
2790 I32 flags;
2791 char *parse_start = RExC_parse;
2792
2793 *flagp = WORST; /* Tentatively. */
2794
2795tryagain:
2796 switch (*RExC_parse) {
2797 case '^':
2798 RExC_seen_zerolen++;
2799 nextchar(pRExC_state);
2800 if (RExC_flags & PMf_MULTILINE)
2801 ret = reg_node(pRExC_state, MBOL);
2802 else if (RExC_flags & PMf_SINGLELINE)
2803 ret = reg_node(pRExC_state, SBOL);
2804 else
2805 ret = reg_node(pRExC_state, BOL);
2806 Set_Node_Length(ret, 1); /* MJD */
2807 break;
2808 case '$':
2809 nextchar(pRExC_state);
2810 if (*RExC_parse)
2811 RExC_seen_zerolen++;
2812 if (RExC_flags & PMf_MULTILINE)
2813 ret = reg_node(pRExC_state, MEOL);
2814 else if (RExC_flags & PMf_SINGLELINE)
2815 ret = reg_node(pRExC_state, SEOL);
2816 else
2817 ret = reg_node(pRExC_state, EOL);
2818 Set_Node_Length(ret, 1); /* MJD */
2819 break;
2820 case '.':
2821 nextchar(pRExC_state);
2822 if (RExC_flags & PMf_SINGLELINE)
2823 ret = reg_node(pRExC_state, SANY);
2824 else
2825 ret = reg_node(pRExC_state, REG_ANY);
2826 *flagp |= HASWIDTH|SIMPLE;
2827 RExC_naughty++;
2828 Set_Node_Length(ret, 1); /* MJD */
2829 break;
2830 case '[':
2831 {
2832 char *oregcomp_parse = ++RExC_parse;
2833 ret = regclass(pRExC_state);
2834 if (*RExC_parse != ']') {
2835 RExC_parse = oregcomp_parse;
2836 vFAIL("Unmatched [");
2837 }
2838 nextchar(pRExC_state);
2839 *flagp |= HASWIDTH|SIMPLE;
2840 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2841 break;
2842 }
2843 case '(':
2844 nextchar(pRExC_state);
2845 ret = reg(pRExC_state, 1, &flags);
2846 if (ret == NULL) {
2847 if (flags & TRYAGAIN) {
2848 if (RExC_parse == RExC_end) {
2849 /* Make parent create an empty node if needed. */
2850 *flagp |= TRYAGAIN;
2851 return(NULL);
2852 }
2853 goto tryagain;
2854 }
2855 return(NULL);
2856 }
2857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2858 break;
2859 case '|':
2860 case ')':
2861 if (flags & TRYAGAIN) {
2862 *flagp |= TRYAGAIN;
2863 return NULL;
2864 }
2865 vFAIL("Internal urp");
2866 /* Supposed to be caught earlier. */
2867 break;
2868 case '{':
2869 if (!regcurly(RExC_parse)) {
2870 RExC_parse++;
2871 goto defchar;
2872 }
2873 /* FALL THROUGH */
2874 case '?':
2875 case '+':
2876 case '*':
2877 RExC_parse++;
2878 vFAIL("Quantifier follows nothing");
2879 break;
2880 case '\\':
2881 switch (*++RExC_parse) {
2882 case 'A':
2883 RExC_seen_zerolen++;
2884 ret = reg_node(pRExC_state, SBOL);
2885 *flagp |= SIMPLE;
2886 nextchar(pRExC_state);
2887 Set_Node_Length(ret, 2); /* MJD */
2888 break;
2889 case 'G':
2890 ret = reg_node(pRExC_state, GPOS);
2891 RExC_seen |= REG_SEEN_GPOS;
2892 *flagp |= SIMPLE;
2893 nextchar(pRExC_state);
2894 Set_Node_Length(ret, 2); /* MJD */
2895 break;
2896 case 'Z':
2897 ret = reg_node(pRExC_state, SEOL);
2898 *flagp |= SIMPLE;
2899 RExC_seen_zerolen++; /* Do not optimize RE away */
2900 nextchar(pRExC_state);
2901 break;
2902 case 'z':
2903 ret = reg_node(pRExC_state, EOS);
2904 *flagp |= SIMPLE;
2905 RExC_seen_zerolen++; /* Do not optimize RE away */
2906 nextchar(pRExC_state);
2907 Set_Node_Length(ret, 2); /* MJD */
2908 break;
2909 case 'C':
2910 ret = reg_node(pRExC_state, CANY);
2911 RExC_seen |= REG_SEEN_CANY;
2912 *flagp |= HASWIDTH|SIMPLE;
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2915 break;
2916 case 'X':
2917 ret = reg_node(pRExC_state, CLUMP);
2918 *flagp |= HASWIDTH;
2919 nextchar(pRExC_state);
2920 Set_Node_Length(ret, 2); /* MJD */
2921 break;
2922 case 'w':
2923 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2924 *flagp |= HASWIDTH|SIMPLE;
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2927 break;
2928 case 'W':
2929 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2930 *flagp |= HASWIDTH|SIMPLE;
2931 nextchar(pRExC_state);
2932 Set_Node_Length(ret, 2); /* MJD */
2933 break;
2934 case 'b':
2935 RExC_seen_zerolen++;
2936 RExC_seen |= REG_SEEN_LOOKBEHIND;
2937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2938 *flagp |= SIMPLE;
2939 nextchar(pRExC_state);
2940 Set_Node_Length(ret, 2); /* MJD */
2941 break;
2942 case 'B':
2943 RExC_seen_zerolen++;
2944 RExC_seen |= REG_SEEN_LOOKBEHIND;
2945 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2946 *flagp |= SIMPLE;
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2949 break;
2950 case 's':
2951 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2952 *flagp |= HASWIDTH|SIMPLE;
2953 nextchar(pRExC_state);
2954 Set_Node_Length(ret, 2); /* MJD */
2955 break;
2956 case 'S':
2957 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2958 *flagp |= HASWIDTH|SIMPLE;
2959 nextchar(pRExC_state);
2960 Set_Node_Length(ret, 2); /* MJD */
2961 break;
2962 case 'd':
2963 ret = reg_node(pRExC_state, DIGIT);
2964 *flagp |= HASWIDTH|SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2967 break;
2968 case 'D':
2969 ret = reg_node(pRExC_state, NDIGIT);
2970 *flagp |= HASWIDTH|SIMPLE;
2971 nextchar(pRExC_state);
2972 Set_Node_Length(ret, 2); /* MJD */
2973 break;
2974 case 'p':
2975 case 'P':
2976 {
2977 char* oldregxend = RExC_end;
2978 char* parse_start = RExC_parse - 2;
2979
2980 if (RExC_parse[1] == '{') {
2981 /* a lovely hack--pretend we saw [\pX] instead */
2982 RExC_end = strchr(RExC_parse, '}');
2983 if (!RExC_end) {
2984 U8 c = (U8)*RExC_parse;
2985 RExC_parse += 2;
2986 RExC_end = oldregxend;
2987 vFAIL2("Missing right brace on \\%c{}", c);
2988 }
2989 RExC_end++;
2990 }
2991 else {
2992 RExC_end = RExC_parse + 2;
2993 if (RExC_end > oldregxend)
2994 RExC_end = oldregxend;
2995 }
2996 RExC_parse--;
2997
2998 ret = regclass(pRExC_state);
2999
3000 RExC_end = oldregxend;
3001 RExC_parse--;
3002
3003 Set_Node_Offset(ret, parse_start + 2);
3004 Set_Node_Cur_Length(ret);
3005 nextchar(pRExC_state);
3006 *flagp |= HASWIDTH|SIMPLE;
3007 }
3008 break;
3009 case 'n':
3010 case 'r':
3011 case 't':
3012 case 'f':
3013 case 'e':
3014 case 'a':
3015 case 'x':
3016 case 'c':
3017 case '0':
3018 goto defchar;
3019 case '1': case '2': case '3': case '4':
3020 case '5': case '6': case '7': case '8': case '9':
3021 {
3022 const I32 num = atoi(RExC_parse);
3023
3024 if (num > 9 && num >= RExC_npar)
3025 goto defchar;
3026 else {
3027 char * parse_start = RExC_parse - 1; /* MJD */
3028 while (isDIGIT(*RExC_parse))
3029 RExC_parse++;
3030
3031 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3032 vFAIL("Reference to nonexistent group");
3033 RExC_sawback = 1;
3034 ret = reganode(pRExC_state,
3035 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3036 num);
3037 *flagp |= HASWIDTH;
3038
3039 /* override incorrect value set in reganode MJD */
3040 Set_Node_Offset(ret, parse_start+1);
3041 Set_Node_Cur_Length(ret); /* MJD */
3042 RExC_parse--;
3043 nextchar(pRExC_state);
3044 }
3045 }
3046 break;
3047 case '\0':
3048 if (RExC_parse >= RExC_end)
3049 FAIL("Trailing \\");
3050 /* FALL THROUGH */
3051 default:
3052 /* Do not generate "unrecognized" warnings here, we fall
3053 back into the quick-grab loop below */
3054 parse_start--;
3055 goto defchar;
3056 }
3057 break;
3058
3059 case '#':
3060 if (RExC_flags & PMf_EXTENDED) {
3061 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3062 if (RExC_parse < RExC_end)
3063 goto tryagain;
3064 }
3065 /* FALL THROUGH */
3066
3067 default: {
3068 register STRLEN len;
3069 register UV ender;
3070 register char *p;
3071 char *oldp, *s;
3072 STRLEN foldlen;
3073 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3074
3075 parse_start = RExC_parse - 1;
3076
3077 RExC_parse++;
3078
3079 defchar:
3080 ender = 0;
3081 ret = reg_node(pRExC_state,
3082 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3083 s = STRING(ret);
3084 for (len = 0, p = RExC_parse - 1;
3085 len < 127 && p < RExC_end;
3086 len++)
3087 {
3088 oldp = p;
3089
3090 if (RExC_flags & PMf_EXTENDED)
3091 p = regwhite(p, RExC_end);
3092 switch (*p) {
3093 case '^':
3094 case '$':
3095 case '.':
3096 case '[':
3097 case '(':
3098 case ')':
3099 case '|':
3100 goto loopdone;
3101 case '\\':
3102 switch (*++p) {
3103 case 'A':
3104 case 'C':
3105 case 'X':
3106 case 'G':
3107 case 'Z':
3108 case 'z':
3109 case 'w':
3110 case 'W':
3111 case 'b':
3112 case 'B':
3113 case 's':
3114 case 'S':
3115 case 'd':
3116 case 'D':
3117 case 'p':
3118 case 'P':
3119 --p;
3120 goto loopdone;
3121 case 'n':
3122 ender = '\n';
3123 p++;
3124 break;
3125 case 'r':
3126 ender = '\r';
3127 p++;
3128 break;
3129 case 't':
3130 ender = '\t';
3131 p++;
3132 break;
3133 case 'f':
3134 ender = '\f';
3135 p++;
3136 break;
3137 case 'e':
3138 ender = ASCII_TO_NATIVE('\033');
3139 p++;
3140 break;
3141 case 'a':
3142 ender = ASCII_TO_NATIVE('\007');
3143 p++;
3144 break;
3145 case 'x':
3146 if (*++p == '{') {
3147 char* const e = strchr(p, '}');
3148
3149 if (!e) {
3150 RExC_parse = p + 1;
3151 vFAIL("Missing right brace on \\x{}");
3152 }
3153 else {
3154 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155 | PERL_SCAN_DISALLOW_PREFIX;
3156 STRLEN numlen = e - p - 1;
3157 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3158 if (ender > 0xff)
3159 RExC_utf8 = 1;
3160 p = e + 1;
3161 }
3162 }
3163 else {
3164 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3165 STRLEN numlen = 2;
3166 ender = grok_hex(p, &numlen, &flags, NULL);
3167 p += numlen;
3168 }
3169 break;
3170 case 'c':
3171 p++;
3172 ender = UCHARAT(p++);
3173 ender = toCTRL(ender);
3174 break;
3175 case '0': case '1': case '2': case '3':case '4':
3176 case '5': case '6': case '7': case '8':case '9':
3177 if (*p == '0' ||
3178 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3179 I32 flags = 0;
3180 STRLEN numlen = 3;
3181 ender = grok_oct(p, &numlen, &flags, NULL);
3182 p += numlen;
3183 }
3184 else {
3185 --p;
3186 goto loopdone;
3187 }
3188 break;
3189 case '\0':
3190 if (p >= RExC_end)
3191 FAIL("Trailing \\");
3192 /* FALL THROUGH */
3193 default:
3194 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
3195 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3196 goto normal_default;
3197 }
3198 break;
3199 default:
3200 normal_default:
3201 if (UTF8_IS_START(*p) && UTF) {
3202 STRLEN numlen;
3203 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3204 &numlen, 0);
3205 p += numlen;
3206 }
3207 else
3208 ender = *p++;
3209 break;
3210 }
3211 if (RExC_flags & PMf_EXTENDED)
3212 p = regwhite(p, RExC_end);
3213 if (UTF && FOLD) {
3214 /* Prime the casefolded buffer. */
3215 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3216 }
3217 if (ISMULT2(p)) { /* Back off on ?+*. */
3218 if (len)
3219 p = oldp;
3220 else if (UTF) {
3221 STRLEN unilen;
3222
3223 if (FOLD) {
3224 /* Emit all the Unicode characters. */
3225 STRLEN numlen;
3226 for (foldbuf = tmpbuf;
3227 foldlen;
3228 foldlen -= numlen) {
3229 ender = utf8_to_uvchr(foldbuf, &numlen);
3230 if (numlen > 0) {
3231 reguni(pRExC_state, ender, s, &unilen);
3232 s += unilen;
3233 len += unilen;
3234 /* In EBCDIC the numlen
3235 * and unilen can differ. */
3236 foldbuf += numlen;
3237 if (numlen >= foldlen)
3238 break;
3239 }
3240 else
3241 break; /* "Can't happen." */
3242 }
3243 }
3244 else {
3245 reguni(pRExC_state, ender, s, &unilen);
3246 if (unilen > 0) {
3247 s += unilen;
3248 len += unilen;
3249 }
3250 }
3251 }
3252 else {
3253 len++;
3254 REGC((char)ender, s++);
3255 }
3256 break;
3257 }
3258 if (UTF) {
3259 STRLEN unilen;
3260
3261 if (FOLD) {
3262 /* Emit all the Unicode characters. */
3263 STRLEN numlen;
3264 for (foldbuf = tmpbuf;
3265 foldlen;
3266 foldlen -= numlen) {
3267 ender = utf8_to_uvchr(foldbuf, &numlen);
3268 if (numlen > 0) {
3269 reguni(pRExC_state, ender, s, &unilen);
3270 len += unilen;
3271 s += unilen;
3272 /* In EBCDIC the numlen
3273 * and unilen can differ. */
3274 foldbuf += numlen;
3275 if (numlen >= foldlen)
3276 break;
3277 }
3278 else
3279 break;
3280 }
3281 }
3282 else {
3283 reguni(pRExC_state, ender, s, &unilen);
3284 if (unilen > 0) {
3285 s += unilen;
3286 len += unilen;
3287 }
3288 }
3289 len--;
3290 }
3291 else
3292 REGC((char)ender, s++);
3293 }
3294 loopdone:
3295 RExC_parse = p - 1;
3296 Set_Node_Cur_Length(ret); /* MJD */
3297 nextchar(pRExC_state);
3298 {
3299 /* len is STRLEN which is unsigned, need to copy to signed */
3300 IV iv = len;
3301 if (iv < 0)
3302 vFAIL("Internal disaster");
3303 }
3304 if (len > 0)
3305 *flagp |= HASWIDTH;
3306 if (len == 1 && UNI_IS_INVARIANT(ender))
3307 *flagp |= SIMPLE;
3308 if (!SIZE_ONLY)
3309 STR_LEN(ret) = len;
3310 if (SIZE_ONLY)
3311 RExC_size += STR_SZ(len);
3312 else
3313 RExC_emit += STR_SZ(len);
3314 }
3315 break;
3316 }
3317
3318 /* If the encoding pragma is in effect recode the text of
3319 * any EXACT-kind nodes. */
3320 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3321 STRLEN oldlen = STR_LEN(ret);
3322 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3323
3324 if (RExC_utf8)
3325 SvUTF8_on(sv);
3326 if (sv_utf8_downgrade(sv, TRUE)) {
3327 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3328 const STRLEN newlen = SvCUR(sv);
3329
3330 if (SvUTF8(sv))
3331 RExC_utf8 = 1;
3332 if (!SIZE_ONLY) {
3333 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3334 (int)oldlen, STRING(ret),
3335 (int)newlen, s));
3336 Copy(s, STRING(ret), newlen, char);
3337 STR_LEN(ret) += newlen - oldlen;
3338 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3339 } else
3340 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3341 }
3342 }
3343
3344 return(ret);
3345}
3346
3347STATIC char *
3348S_regwhite(pTHX_ char *p, const char *e)
3349{
3350 while (p < e) {
3351 if (isSPACE(*p))
3352 ++p;
3353 else if (*p == '#') {
3354 do {
3355 p++;
3356 } while (p < e && *p != '\n');
3357 }
3358 else
3359 break;
3360 }
3361 return p;
3362}
3363
3364/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3365 Character classes ([:foo:]) can also be negated ([:^foo:]).
3366 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3367 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3368 but trigger failures because they are currently unimplemented. */
3369
3370#define POSIXCC_DONE(c) ((c) == ':')
3371#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3372#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3373
3374STATIC I32
3375S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3376{
3377 I32 namedclass = OOB_NAMEDCLASS;
3378
3379 if (value == '[' && RExC_parse + 1 < RExC_end &&
3380 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3381 POSIXCC(UCHARAT(RExC_parse))) {
3382 const char c = UCHARAT(RExC_parse);
3383 char* s = RExC_parse++;
3384
3385 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3386 RExC_parse++;
3387 if (RExC_parse == RExC_end)
3388 /* Grandfather lone [:, [=, [. */
3389 RExC_parse = s;
3390 else {
3391 const char* t = RExC_parse++; /* skip over the c */
3392 const char *posixcc;
3393
3394 assert(*t == c);
3395
3396 if (UCHARAT(RExC_parse) == ']') {
3397 RExC_parse++; /* skip over the ending ] */
3398 posixcc = s + 1;
3399 if (*s == ':') {
3400 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3401 const I32 skip = t - posixcc;
3402
3403 /* Initially switch on the length of the name. */
3404 switch (skip) {
3405 case 4:
3406 if (memEQ(posixcc, "word", 4)) {
3407 /* this is not POSIX, this is the Perl \w */;
3408 namedclass
3409 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3410 }
3411 break;
3412 case 5:
3413 /* Names all of length 5. */
3414 /* alnum alpha ascii blank cntrl digit graph lower
3415 print punct space upper */
3416 /* Offset 4 gives the best switch position. */
3417 switch (posixcc[4]) {
3418 case 'a':
3419 if (memEQ(posixcc, "alph", 4)) {
3420 /* a */
3421 namedclass
3422 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3423 }
3424 break;
3425 case 'e':
3426 if (memEQ(posixcc, "spac", 4)) {
3427 /* e */
3428 namedclass
3429 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3430 }
3431 break;
3432 case 'h':
3433 if (memEQ(posixcc, "grap", 4)) {
3434 /* h */
3435 namedclass
3436 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3437 }
3438 break;
3439 case 'i':
3440 if (memEQ(posixcc, "asci", 4)) {
3441 /* i */
3442 namedclass
3443 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3444 }
3445 break;
3446 case 'k':
3447 if (memEQ(posixcc, "blan", 4)) {
3448 /* k */
3449 namedclass
3450 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3451 }
3452 break;
3453 case 'l':
3454 if (memEQ(posixcc, "cntr", 4)) {
3455 /* l */
3456 namedclass
3457 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3458 }
3459 break;
3460 case 'm':
3461 if (memEQ(posixcc, "alnu", 4)) {
3462 /* m */
3463 namedclass
3464 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3465 }
3466 break;
3467 case 'r':
3468 if (memEQ(posixcc, "lowe", 4)) {
3469 /* r */
3470 namedclass
3471 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3472 }
3473 if (memEQ(posixcc, "uppe", 4)) {
3474 /* r */
3475 namedclass
3476 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3477 }
3478 break;
3479 case 't':
3480 if (memEQ(posixcc, "digi", 4)) {
3481 /* t */
3482 namedclass
3483 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3484 }
3485 if (memEQ(posixcc, "prin", 4)) {
3486 /* t */
3487 namedclass
3488 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3489 }
3490 if (memEQ(posixcc, "punc", 4)) {
3491 /* t */
3492 namedclass
3493 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3494 }
3495 break;
3496 }
3497 break;
3498 case 6:
3499 if (memEQ(posixcc, "xdigit", 6)) {
3500 namedclass
3501 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3502 }
3503 break;
3504 }
3505
3506 if (namedclass == OOB_NAMEDCLASS)
3507 {
3508 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3509 t - s - 1, s + 1);
3510 }
3511 assert (posixcc[skip] == ':');
3512 assert (posixcc[skip+1] == ']');
3513 } else if (!SIZE_ONLY) {
3514 /* [[=foo=]] and [[.foo.]] are still future. */
3515
3516 /* adjust RExC_parse so the warning shows after
3517 the class closes */
3518 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3519 RExC_parse++;
3520 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3521 }
3522 } else {
3523 /* Maternal grandfather:
3524 * "[:" ending in ":" but not in ":]" */
3525 RExC_parse = s;
3526 }
3527 }
3528 }
3529
3530 return namedclass;
3531}
3532
3533STATIC void
3534S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3535{
3536 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3537 const char *s = RExC_parse;
3538 const char c = *s++;
3539
3540 while(*s && isALNUM(*s))
3541 s++;
3542 if (*s && c == *s && s[1] == ']') {
3543 if (ckWARN(WARN_REGEXP))
3544 vWARN3(s+2,
3545 "POSIX syntax [%c %c] belongs inside character classes",
3546 c, c);
3547
3548 /* [[=foo=]] and [[.foo.]] are still future. */
3549 if (POSIXCC_NOTYET(c)) {
3550 /* adjust RExC_parse so the error shows after
3551 the class closes */
3552 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3553 ;
3554 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3555 }
3556 }
3557 }
3558}
3559
3560STATIC regnode *
3561S_regclass(pTHX_ RExC_state_t *pRExC_state)
3562{
3563 register UV value;
3564 register UV nextvalue;
3565 register IV prevvalue = OOB_UNICODE;
3566 register IV range = 0;
3567 register regnode *ret;
3568 STRLEN numlen;
3569 IV namedclass;
3570 char *rangebegin = 0;
3571 bool need_class = 0;
3572 SV *listsv = Nullsv;
3573 register char *e;
3574 UV n;
3575 bool optimize_invert = TRUE;
3576 AV* unicode_alternate = 0;
3577#ifdef EBCDIC
3578 UV literal_endpoint = 0;
3579#endif
3580
3581 ret = reganode(pRExC_state, ANYOF, 0);
3582
3583 if (!SIZE_ONLY)
3584 ANYOF_FLAGS(ret) = 0;
3585
3586 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3587 RExC_naughty++;
3588 RExC_parse++;
3589 if (!SIZE_ONLY)
3590 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3591 }
3592
3593 if (SIZE_ONLY)
3594 RExC_size += ANYOF_SKIP;
3595 else {
3596 RExC_emit += ANYOF_SKIP;
3597 if (FOLD)
3598 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3599 if (LOC)
3600 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3601 ANYOF_BITMAP_ZERO(ret);
3602 listsv = newSVpvn("# comment\n", 10);
3603 }
3604
3605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606
3607 if (!SIZE_ONLY && POSIXCC(nextvalue))
3608 checkposixcc(pRExC_state);
3609
3610 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3611 if (UCHARAT(RExC_parse) == ']')
3612 goto charclassloop;
3613
3614 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3615
3616 charclassloop:
3617
3618 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3619
3620 if (!range)
3621 rangebegin = RExC_parse;
3622 if (UTF) {
3623 value = utf8n_to_uvchr((U8*)RExC_parse,
3624 RExC_end - RExC_parse,
3625 &numlen, 0);
3626 RExC_parse += numlen;
3627 }
3628 else
3629 value = UCHARAT(RExC_parse++);
3630 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3631 if (value == '[' && POSIXCC(nextvalue))
3632 namedclass = regpposixcc(pRExC_state, value);
3633 else if (value == '\\') {
3634 if (UTF) {
3635 value = utf8n_to_uvchr((U8*)RExC_parse,
3636 RExC_end - RExC_parse,
3637 &numlen, 0);
3638 RExC_parse += numlen;
3639 }
3640 else
3641 value = UCHARAT(RExC_parse++);
3642 /* Some compilers cannot handle switching on 64-bit integer
3643 * values, therefore value cannot be an UV. Yes, this will
3644 * be a problem later if we want switch on Unicode.
3645 * A similar issue a little bit later when switching on
3646 * namedclass. --jhi */
3647 switch ((I32)value) {
3648 case 'w': namedclass = ANYOF_ALNUM; break;
3649 case 'W': namedclass = ANYOF_NALNUM; break;
3650 case 's': namedclass = ANYOF_SPACE; break;
3651 case 'S': namedclass = ANYOF_NSPACE; break;
3652 case 'd': namedclass = ANYOF_DIGIT; break;
3653 case 'D': namedclass = ANYOF_NDIGIT; break;
3654 case 'p':
3655 case 'P':
3656 if (RExC_parse >= RExC_end)
3657 vFAIL2("Empty \\%c{}", (U8)value);
3658 if (*RExC_parse == '{') {
3659 const U8 c = (U8)value;
3660 e = strchr(RExC_parse++, '}');
3661 if (!e)
3662 vFAIL2("Missing right brace on \\%c{}", c);
3663 while (isSPACE(UCHARAT(RExC_parse)))
3664 RExC_parse++;
3665 if (e == RExC_parse)
3666 vFAIL2("Empty \\%c{}", c);
3667 n = e - RExC_parse;
3668 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3669 n--;
3670 }
3671 else {
3672 e = RExC_parse;
3673 n = 1;
3674 }
3675 if (!SIZE_ONLY) {
3676 if (UCHARAT(RExC_parse) == '^') {
3677 RExC_parse++;
3678 n--;
3679 value = value == 'p' ? 'P' : 'p'; /* toggle */
3680 while (isSPACE(UCHARAT(RExC_parse))) {
3681 RExC_parse++;
3682 n--;
3683 }
3684 }
3685 if (value == 'p')
3686 Perl_sv_catpvf(aTHX_ listsv,
3687 "+utf8::%.*s\n", (int)n, RExC_parse);
3688 else
3689 Perl_sv_catpvf(aTHX_ listsv,
3690 "!utf8::%.*s\n", (int)n, RExC_parse);
3691 }
3692 RExC_parse = e + 1;
3693 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3694 namedclass = ANYOF_MAX; /* no official name, but it's named */
3695 break;
3696 case 'n': value = '\n'; break;
3697 case 'r': value = '\r'; break;
3698 case 't': value = '\t'; break;
3699 case 'f': value = '\f'; break;
3700 case 'b': value = '\b'; break;
3701 case 'e': value = ASCII_TO_NATIVE('\033');break;
3702 case 'a': value = ASCII_TO_NATIVE('\007');break;
3703 case 'x':
3704 if (*RExC_parse == '{') {
3705 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3706 | PERL_SCAN_DISALLOW_PREFIX;
3707 e = strchr(RExC_parse++, '}');
3708 if (!e)
3709 vFAIL("Missing right brace on \\x{}");
3710
3711 numlen = e - RExC_parse;
3712 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3713 RExC_parse = e + 1;
3714 }
3715 else {
3716 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3717 numlen = 2;
3718 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3719 RExC_parse += numlen;
3720 }
3721 break;
3722 case 'c':
3723 value = UCHARAT(RExC_parse++);
3724 value = toCTRL(value);
3725 break;
3726 case '0': case '1': case '2': case '3': case '4':
3727 case '5': case '6': case '7': case '8': case '9':
3728 {
3729 I32 flags = 0;
3730 numlen = 3;
3731 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3732 RExC_parse += numlen;
3733 break;
3734 }
3735 default:
3736 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
3737 vWARN2(RExC_parse,
3738 "Unrecognized escape \\%c in character class passed through",
3739 (int)value);
3740 break;
3741 }
3742 } /* end of \blah */
3743#ifdef EBCDIC
3744 else
3745 literal_endpoint++;
3746#endif
3747
3748 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3749
3750 if (!SIZE_ONLY && !need_class)
3751 ANYOF_CLASS_ZERO(ret);
3752
3753 need_class = 1;
3754
3755 /* a bad range like a-\d, a-[:digit:] ? */
3756 if (range) {
3757 if (!SIZE_ONLY) {
3758 if (ckWARN(WARN_REGEXP))
3759 vWARN4(RExC_parse,
3760 "False [] range \"%*.*s\"",
3761 RExC_parse - rangebegin,
3762 RExC_parse - rangebegin,
3763 rangebegin);
3764 if (prevvalue < 256) {
3765 ANYOF_BITMAP_SET(ret, prevvalue);
3766 ANYOF_BITMAP_SET(ret, '-');
3767 }
3768 else {
3769 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3770 Perl_sv_catpvf(aTHX_ listsv,
3771 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3772 }
3773 }
3774
3775 range = 0; /* this was not a true range */
3776 }
3777
3778 if (!SIZE_ONLY) {
3779 const char *what = NULL;
3780 char yesno = 0;
3781
3782 if (namedclass > OOB_NAMEDCLASS)
3783 optimize_invert = FALSE;
3784 /* Possible truncation here but in some 64-bit environments
3785 * the compiler gets heartburn about switch on 64-bit values.
3786 * A similar issue a little earlier when switching on value.
3787 * --jhi */
3788 switch ((I32)namedclass) {
3789 case ANYOF_ALNUM:
3790 if (LOC)
3791 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3792 else {
3793 for (value = 0; value < 256; value++)
3794 if (isALNUM(value))
3795 ANYOF_BITMAP_SET(ret, value);
3796 }
3797 yesno = '+';
3798 what = "Word";
3799 break;
3800 case ANYOF_NALNUM:
3801 if (LOC)
3802 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3803 else {
3804 for (value = 0; value < 256; value++)
3805 if (!isALNUM(value))
3806 ANYOF_BITMAP_SET(ret, value);
3807 }
3808 yesno = '!';
3809 what = "Word";
3810 break;
3811 case ANYOF_ALNUMC:
3812 if (LOC)
3813 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3814 else {
3815 for (value = 0; value < 256; value++)
3816 if (isALNUMC(value))
3817 ANYOF_BITMAP_SET(ret, value);
3818 }
3819 yesno = '+';
3820 what = "Alnum";
3821 break;
3822 case ANYOF_NALNUMC:
3823 if (LOC)
3824 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3825 else {
3826 for (value = 0; value < 256; value++)
3827 if (!isALNUMC(value))
3828 ANYOF_BITMAP_SET(ret, value);
3829 }
3830 yesno = '!';
3831 what = "Alnum";
3832 break;
3833 case ANYOF_ALPHA:
3834 if (LOC)
3835 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3836 else {
3837 for (value = 0; value < 256; value++)
3838 if (isALPHA(value))
3839 ANYOF_BITMAP_SET(ret, value);
3840 }
3841 yesno = '+';
3842 what = "Alpha";
3843 break;
3844 case ANYOF_NALPHA:
3845 if (LOC)
3846 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3847 else {
3848 for (value = 0; value < 256; value++)
3849 if (!isALPHA(value))
3850 ANYOF_BITMAP_SET(ret, value);
3851 }
3852 yesno = '!';
3853 what = "Alpha";
3854 break;
3855 case ANYOF_ASCII:
3856 if (LOC)
3857 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3858 else {
3859#ifndef EBCDIC
3860 for (value = 0; value < 128; value++)
3861 ANYOF_BITMAP_SET(ret, value);
3862#else /* EBCDIC */
3863 for (value = 0; value < 256; value++) {
3864 if (isASCII(value))
3865 ANYOF_BITMAP_SET(ret, value);
3866 }
3867#endif /* EBCDIC */
3868 }
3869 yesno = '+';
3870 what = "ASCII";
3871 break;
3872 case ANYOF_NASCII:
3873 if (LOC)
3874 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3875 else {
3876#ifndef EBCDIC
3877 for (value = 128; value < 256; value++)
3878 ANYOF_BITMAP_SET(ret, value);
3879#else /* EBCDIC */
3880 for (value = 0; value < 256; value++) {
3881 if (!isASCII(value))
3882 ANYOF_BITMAP_SET(ret, value);
3883 }
3884#endif /* EBCDIC */
3885 }
3886 yesno = '!';
3887 what = "ASCII";
3888 break;
3889 case ANYOF_BLANK:
3890 if (LOC)
3891 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3892 else {
3893 for (value = 0; value < 256; value++)
3894 if (isBLANK(value))
3895 ANYOF_BITMAP_SET(ret, value);
3896 }
3897 yesno = '+';
3898 what = "Blank";
3899 break;
3900 case ANYOF_NBLANK:
3901 if (LOC)
3902 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3903 else {
3904 for (value = 0; value < 256; value++)
3905 if (!isBLANK(value))
3906 ANYOF_BITMAP_SET(ret, value);
3907 }
3908 yesno = '!';
3909 what = "Blank";
3910 break;
3911 case ANYOF_CNTRL:
3912 if (LOC)
3913 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3914 else {
3915 for (value = 0; value < 256; value++)
3916 if (isCNTRL(value))
3917 ANYOF_BITMAP_SET(ret, value);
3918 }
3919 yesno = '+';
3920 what = "Cntrl";
3921 break;
3922 case ANYOF_NCNTRL:
3923 if (LOC)
3924 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3925 else {
3926 for (value = 0; value < 256; value++)
3927 if (!isCNTRL(value))
3928 ANYOF_BITMAP_SET(ret, value);
3929 }
3930 yesno = '!';
3931 what = "Cntrl";
3932 break;
3933 case ANYOF_DIGIT:
3934 if (LOC)
3935 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3936 else {
3937 /* consecutive digits assumed */
3938 for (value = '0'; value <= '9'; value++)
3939 ANYOF_BITMAP_SET(ret, value);
3940 }
3941 yesno = '+';
3942 what = "Digit";
3943 break;
3944 case ANYOF_NDIGIT:
3945 if (LOC)
3946 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3947 else {
3948 /* consecutive digits assumed */
3949 for (value = 0; value < '0'; value++)
3950 ANYOF_BITMAP_SET(ret, value);
3951 for (value = '9' + 1; value < 256; value++)
3952 ANYOF_BITMAP_SET(ret, value);
3953 }
3954 yesno = '!';
3955 what = "Digit";
3956 break;
3957 case ANYOF_GRAPH:
3958 if (LOC)
3959 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3960 else {
3961 for (value = 0; value < 256; value++)
3962 if (isGRAPH(value))
3963 ANYOF_BITMAP_SET(ret, value);
3964 }
3965 yesno = '+';
3966 what = "Graph";
3967 break;
3968 case ANYOF_NGRAPH:
3969 if (LOC)
3970 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3971 else {
3972 for (value = 0; value < 256; value++)
3973 if (!isGRAPH(value))
3974 ANYOF_BITMAP_SET(ret, value);
3975 }
3976 yesno = '!';
3977 what = "Graph";
3978 break;
3979 case ANYOF_LOWER:
3980 if (LOC)
3981 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3982 else {
3983 for (value = 0; value < 256; value++)
3984 if (isLOWER(value))
3985 ANYOF_BITMAP_SET(ret, value);
3986 }
3987 yesno = '+';
3988 what = "Lower";
3989 break;
3990 case ANYOF_NLOWER:
3991 if (LOC)
3992 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3993 else {
3994 for (value = 0; value < 256; value++)
3995 if (!isLOWER(value))
3996 ANYOF_BITMAP_SET(ret, value);
3997 }
3998 yesno = '!';
3999 what = "Lower";
4000 break;
4001 case ANYOF_PRINT:
4002 if (LOC)
4003 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4004 else {
4005 for (value = 0; value < 256; value++)
4006 if (isPRINT(value))
4007 ANYOF_BITMAP_SET(ret, value);
4008 }
4009 yesno = '+';
4010 what = "Print";
4011 break;
4012 case ANYOF_NPRINT:
4013 if (LOC)
4014 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4015 else {
4016 for (value = 0; value < 256; value++)
4017 if (!isPRINT(value))
4018 ANYOF_BITMAP_SET(ret, value);
4019 }
4020 yesno = '!';
4021 what = "Print";
4022 break;
4023 case ANYOF_PSXSPC:
4024 if (LOC)
4025 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4026 else {
4027 for (value = 0; value < 256; value++)
4028 if (isPSXSPC(value))
4029 ANYOF_BITMAP_SET(ret, value);
4030 }
4031 yesno = '+';
4032 what = "Space";
4033 break;
4034 case ANYOF_NPSXSPC:
4035 if (LOC)
4036 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4037 else {
4038 for (value = 0; value < 256; value++)
4039 if (!isPSXSPC(value))
4040 ANYOF_BITMAP_SET(ret, value);
4041 }
4042 yesno = '!';
4043 what = "Space";
4044 break;
4045 case ANYOF_PUNCT:
4046 if (LOC)
4047 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4048 else {
4049 for (value = 0; value < 256; value++)
4050 if (isPUNCT(value))
4051 ANYOF_BITMAP_SET(ret, value);
4052 }
4053 yesno = '+';
4054 what = "Punct";
4055 break;
4056 case ANYOF_NPUNCT:
4057 if (LOC)
4058 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4059 else {
4060 for (value = 0; value < 256; value++)
4061 if (!isPUNCT(value))
4062 ANYOF_BITMAP_SET(ret, value);
4063 }
4064 yesno = '!';
4065 what = "Punct";
4066 break;
4067 case ANYOF_SPACE:
4068 if (LOC)
4069 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4070 else {
4071 for (value = 0; value < 256; value++)
4072 if (isSPACE(value))
4073 ANYOF_BITMAP_SET(ret, value);
4074 }
4075 yesno = '+';
4076 what = "SpacePerl";
4077 break;
4078 case ANYOF_NSPACE:
4079 if (LOC)
4080 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4081 else {
4082 for (value = 0; value < 256; value++)
4083 if (!isSPACE(value))
4084 ANYOF_BITMAP_SET(ret, value);
4085 }
4086 yesno = '!';
4087 what = "SpacePerl";
4088 break;
4089 case ANYOF_UPPER:
4090 if (LOC)
4091 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4092 else {
4093 for (value = 0; value < 256; value++)
4094 if (isUPPER(value))
4095 ANYOF_BITMAP_SET(ret, value);
4096 }
4097 yesno = '+';
4098 what = "Upper";
4099 break;
4100 case ANYOF_NUPPER:
4101 if (LOC)
4102 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4103 else {
4104 for (value = 0; value < 256; value++)
4105 if (!isUPPER(value))
4106 ANYOF_BITMAP_SET(ret, value);
4107 }
4108 yesno = '!';
4109 what = "Upper";
4110 break;
4111 case ANYOF_XDIGIT:
4112 if (LOC)
4113 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4114 else {
4115 for (value = 0; value < 256; value++)
4116 if (isXDIGIT(value))
4117 ANYOF_BITMAP_SET(ret, value);
4118 }
4119 yesno = '+';
4120 what = "XDigit";
4121 break;
4122 case ANYOF_NXDIGIT:
4123 if (LOC)
4124 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4125 else {
4126 for (value = 0; value < 256; value++)
4127 if (!isXDIGIT(value))
4128 ANYOF_BITMAP_SET(ret, value);
4129 }
4130 yesno = '!';
4131 what = "XDigit";
4132 break;
4133 case ANYOF_MAX:
4134 /* this is to handle \p and \P */
4135 break;
4136 default:
4137 vFAIL("Invalid [::] class");
4138 break;
4139 }
4140 if (what) {
4141 /* Strings such as "+utf8::isWord\n" */
4142 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4143 }
4144 if (LOC)
4145 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4146 continue;
4147 }
4148 } /* end of namedclass \blah */
4149
4150 if (range) {
4151 if (prevvalue > (IV)value) /* b-a */ {
4152 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4153 RExC_parse - rangebegin,
4154 RExC_parse - rangebegin,
4155 rangebegin);
4156 range = 0; /* not a valid range */
4157 }
4158 }
4159 else {
4160 prevvalue = value; /* save the beginning of the range */
4161 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4162 RExC_parse[1] != ']') {
4163 RExC_parse++;
4164
4165 /* a bad range like \w-, [:word:]- ? */
4166 if (namedclass > OOB_NAMEDCLASS) {
4167 if (ckWARN(WARN_REGEXP))
4168 vWARN4(RExC_parse,
4169 "False [] range \"%*.*s\"",
4170 RExC_parse - rangebegin,
4171 RExC_parse - rangebegin,
4172 rangebegin);
4173 if (!SIZE_ONLY)
4174 ANYOF_BITMAP_SET(ret, '-');
4175 } else
4176 range = 1; /* yeah, it's a range! */
4177 continue; /* but do it the next time */
4178 }
4179 }
4180
4181 /* now is the next time */
4182 if (!SIZE_ONLY) {
4183 IV i;
4184
4185 if (prevvalue < 256) {
4186 const IV ceilvalue = value < 256 ? value : 255;
4187
4188#ifdef EBCDIC
4189 /* In EBCDIC [\x89-\x91] should include
4190 * the \x8e but [i-j] should not. */
4191 if (literal_endpoint == 2 &&
4192 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4193 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4194 {
4195 if (isLOWER(prevvalue)) {
4196 for (i = prevvalue; i <= ceilvalue; i++)
4197 if (isLOWER(i))
4198 ANYOF_BITMAP_SET(ret, i);
4199 } else {
4200 for (i = prevvalue; i <= ceilvalue; i++)
4201 if (isUPPER(i))
4202 ANYOF_BITMAP_SET(ret, i);
4203 }
4204 }
4205 else
4206#endif
4207 for (i = prevvalue; i <= ceilvalue; i++)
4208 ANYOF_BITMAP_SET(ret, i);
4209 }
4210 if (value > 255 || UTF) {
4211 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4212 const UV natvalue = NATIVE_TO_UNI(value);
4213
4214 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4215 if (prevnatvalue < natvalue) { /* what about > ? */
4216 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4217 prevnatvalue, natvalue);
4218 }
4219 else if (prevnatvalue == natvalue) {
4220 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4221 if (FOLD) {
4222 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4223 STRLEN foldlen;
4224 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4225
4226 /* If folding and foldable and a single
4227 * character, insert also the folded version
4228 * to the charclass. */
4229 if (f != value) {
4230 if (foldlen == (STRLEN)UNISKIP(f))
4231 Perl_sv_catpvf(aTHX_ listsv,
4232 "%04"UVxf"\n", f);
4233 else {
4234 /* Any multicharacter foldings
4235 * require the following transform:
4236 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4237 * where E folds into "pq" and F folds
4238 * into "rst", all other characters
4239 * fold to single characters. We save
4240 * away these multicharacter foldings,
4241 * to be later saved as part of the
4242 * additional "s" data. */
4243 SV *sv;
4244
4245 if (!unicode_alternate)
4246 unicode_alternate = newAV();
4247 sv = newSVpvn((char*)foldbuf, foldlen);
4248 SvUTF8_on(sv);
4249 av_push(unicode_alternate, sv);
4250 }
4251 }
4252
4253 /* If folding and the value is one of the Greek
4254 * sigmas insert a few more sigmas to make the
4255 * folding rules of the sigmas to work right.
4256 * Note that not all the possible combinations
4257 * are handled here: some of them are handled
4258 * by the standard folding rules, and some of
4259 * them (literal or EXACTF cases) are handled
4260 * during runtime in regexec.c:S_find_byclass(). */
4261 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4262 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4263 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4264 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4265 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4266 }
4267 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4268 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4269 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4270 }
4271 }
4272 }
4273#ifdef EBCDIC
4274 literal_endpoint = 0;
4275#endif
4276 }
4277
4278 range = 0; /* this range (if it was one) is done now */
4279 }
4280
4281 if (need_class) {
4282 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4283 if (SIZE_ONLY)
4284 RExC_size += ANYOF_CLASS_ADD_SKIP;
4285 else
4286 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4287 }
4288
4289 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4290 if (!SIZE_ONLY &&
4291 /* If the only flag is folding (plus possibly inversion). */
4292 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4293 ) {
4294 for (value = 0; value < 256; ++value) {
4295 if (ANYOF_BITMAP_TEST(ret, value)) {
4296 UV fold = PL_fold[value];
4297
4298 if (fold != value)
4299 ANYOF_BITMAP_SET(ret, fold);
4300 }
4301 }
4302 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4303 }
4304
4305 /* optimize inverted simple patterns (e.g. [^a-z]) */
4306 if (!SIZE_ONLY && optimize_invert &&
4307 /* If the only flag is inversion. */
4308 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4309 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4310 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4311 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4312 }
4313
4314 if (!SIZE_ONLY) {
4315 AV *av = newAV();
4316 SV *rv;
4317
4318 /* The 0th element stores the character class description
4319 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4320 * to initialize the appropriate swash (which gets stored in
4321 * the 1st element), and also useful for dumping the regnode.
4322 * The 2nd element stores the multicharacter foldings,
4323 * used later (regexec.c:S_reginclass()). */
4324 av_store(av, 0, listsv);
4325 av_store(av, 1, NULL);
4326 av_store(av, 2, (SV*)unicode_alternate);
4327 rv = newRV_noinc((SV*)av);
4328 n = add_data(pRExC_state, 1, "s");
4329 RExC_rx->data->data[n] = (void*)rv;
4330 ARG_SET(ret, n);
4331 }
4332
4333 return ret;
4334}
4335
4336STATIC char*
4337S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4338{
4339 char* retval = RExC_parse++;
4340
4341 for (;;) {
4342 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4343 RExC_parse[2] == '#') {
4344 while (*RExC_parse != ')') {
4345 if (RExC_parse == RExC_end)
4346 FAIL("Sequence (?#... not terminated");
4347 RExC_parse++;
4348 }
4349 RExC_parse++;
4350 continue;
4351 }
4352 if (RExC_flags & PMf_EXTENDED) {
4353 if (isSPACE(*RExC_parse)) {
4354 RExC_parse++;
4355 continue;
4356 }
4357 else if (*RExC_parse == '#') {
4358 while (RExC_parse < RExC_end)
4359 if (*RExC_parse++ == '\n') break;
4360 continue;
4361 }
4362 }
4363 return retval;
4364 }
4365}
4366
4367/*
4368- reg_node - emit a node
4369*/
4370STATIC regnode * /* Location. */
4371S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4372{
4373 register regnode *ptr;
4374 regnode * const ret = RExC_emit;
4375
4376 if (SIZE_ONLY) {
4377 SIZE_ALIGN(RExC_size);
4378 RExC_size += 1;
4379 return(ret);
4380 }
4381
4382 NODE_ALIGN_FILL(ret);
4383 ptr = ret;
4384 FILL_ADVANCE_NODE(ptr, op);
4385 if (RExC_offsets) { /* MJD */
4386 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4387 "reg_node", __LINE__,
4388 reg_name[op],
4389 RExC_emit - RExC_emit_start > RExC_offsets[0]
4390 ? "Overwriting end of array!\n" : "OK",
4391 RExC_emit - RExC_emit_start,
4392 RExC_parse - RExC_start,
4393 RExC_offsets[0]));
4394 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4395 }
4396
4397 RExC_emit = ptr;
4398
4399 return(ret);
4400}
4401
4402/*
4403- reganode - emit a node with an argument
4404*/
4405STATIC regnode * /* Location. */
4406S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4407{
4408 register regnode *ptr;
4409 regnode * const ret = RExC_emit;
4410
4411 if (SIZE_ONLY) {
4412 SIZE_ALIGN(RExC_size);
4413 RExC_size += 2;
4414 return(ret);
4415 }
4416
4417 NODE_ALIGN_FILL(ret);
4418 ptr = ret;
4419 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4420 if (RExC_offsets) { /* MJD */
4421 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4422 "reganode",
4423 __LINE__,
4424 reg_name[op],
4425 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4426 "Overwriting end of array!\n" : "OK",
4427 RExC_emit - RExC_emit_start,
4428 RExC_parse - RExC_start,
4429 RExC_offsets[0]));
4430 Set_Cur_Node_Offset;
4431 }
4432
4433 RExC_emit = ptr;
4434
4435 return(ret);
4436}
4437
4438/*
4439- reguni - emit (if appropriate) a Unicode character
4440*/
4441STATIC void
4442S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4443{
4444 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4445}
4446
4447/*
4448- reginsert - insert an operator in front of already-emitted operand
4449*
4450* Means relocating the operand.
4451*/
4452STATIC void
4453S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4454{
4455 register regnode *src;
4456 register regnode *dst;
4457 register regnode *place;
4458 const int offset = regarglen[(U8)op];
4459
4460/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4461
4462 if (SIZE_ONLY) {
4463 RExC_size += NODE_STEP_REGNODE + offset;
4464 return;
4465 }
4466
4467 src = RExC_emit;
4468 RExC_emit += NODE_STEP_REGNODE + offset;
4469 dst = RExC_emit;
4470 while (src > opnd) {
4471 StructCopy(--src, --dst, regnode);
4472 if (RExC_offsets) { /* MJD 20010112 */
4473 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4474 "reg_insert",
4475 __LINE__,
4476 reg_name[op],
4477 dst - RExC_emit_start > RExC_offsets[0]
4478 ? "Overwriting end of array!\n" : "OK",
4479 src - RExC_emit_start,
4480 dst - RExC_emit_start,
4481 RExC_offsets[0]));
4482 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4483 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4484 }
4485 }
4486
4487
4488 place = opnd; /* Op node, where operand used to be. */
4489 if (RExC_offsets) { /* MJD */
4490 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4491 "reginsert",
4492 __LINE__,
4493 reg_name[op],
4494 place - RExC_emit_start > RExC_offsets[0]
4495 ? "Overwriting end of array!\n" : "OK",
4496 place - RExC_emit_start,
4497 RExC_parse - RExC_start,
4498 RExC_offsets[0]));
4499 Set_Node_Offset(place, RExC_parse);
4500 Set_Node_Length(place, 1);
4501 }
4502 src = NEXTOPER(place);
4503 FILL_ADVANCE_NODE(place, op);
4504 Zero(src, offset, regnode);
4505}
4506
4507/*
4508- regtail - set the next-pointer at the end of a node chain of p to val.
4509*/
4510STATIC void
4511S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4512{
4513 register regnode *scan;
4514
4515 if (SIZE_ONLY)
4516 return;
4517
4518 /* Find last node. */
4519 scan = p;
4520 for (;;) {
4521 regnode * const temp = regnext(scan);
4522 if (temp == NULL)
4523 break;
4524 scan = temp;
4525 }
4526
4527 if (reg_off_by_arg[OP(scan)]) {
4528 ARG_SET(scan, val - scan);
4529 }
4530 else {
4531 NEXT_OFF(scan) = val - scan;
4532 }
4533}
4534
4535/*
4536- regoptail - regtail on operand of first argument; nop if operandless
4537*/
4538STATIC void
4539S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4540{
4541 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4542 if (p == NULL || SIZE_ONLY)
4543 return;
4544 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4545 regtail(pRExC_state, NEXTOPER(p), val);
4546 }
4547 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4548 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4549 }
4550 else
4551 return;
4552}
4553
4554/*
4555 - regcurly - a little FSA that accepts {\d+,?\d*}
4556 */
4557STATIC I32
4558S_regcurly(pTHX_ register const char *s)
4559{
4560 if (*s++ != '{')
4561 return FALSE;
4562 if (!isDIGIT(*s))
4563 return FALSE;
4564 while (isDIGIT(*s))
4565 s++;
4566 if (*s == ',')
4567 s++;
4568 while (isDIGIT(*s))
4569 s++;
4570 if (*s != '}')
4571 return FALSE;
4572 return TRUE;
4573}
4574
4575/*
4576 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4577 */
4578void
4579Perl_regdump(pTHX_ regexp *r)
4580{
4581#ifdef DEBUGGING
4582 SV *sv = sv_newmortal();
4583
4584 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4585
4586 /* Header fields of interest. */
4587 if (r->anchored_substr)
4588 PerlIO_printf(Perl_debug_log,
4589 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4590 PL_colors[0],
4591 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4592 SvPVX_const(r->anchored_substr),
4593 PL_colors[1],
4594 SvTAIL(r->anchored_substr) ? "$" : "",
4595 (IV)r->anchored_offset);
4596 else if (r->anchored_utf8)
4597 PerlIO_printf(Perl_debug_log,
4598 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4599 PL_colors[0],
4600 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4601 SvPVX_const(r->anchored_utf8),
4602 PL_colors[1],
4603 SvTAIL(r->anchored_utf8) ? "$" : "",
4604 (IV)r->anchored_offset);
4605 if (r->float_substr)
4606 PerlIO_printf(Perl_debug_log,
4607 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4608 PL_colors[0],
4609 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4610 SvPVX_const(r->float_substr),
4611 PL_colors[1],
4612 SvTAIL(r->float_substr) ? "$" : "",
4613 (IV)r->float_min_offset, (UV)r->float_max_offset);
4614 else if (r->float_utf8)
4615 PerlIO_printf(Perl_debug_log,
4616 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4617 PL_colors[0],
4618 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4619 SvPVX_const(r->float_utf8),
4620 PL_colors[1],
4621 SvTAIL(r->float_utf8) ? "$" : "",
4622 (IV)r->float_min_offset, (UV)r->float_max_offset);
4623 if (r->check_substr || r->check_utf8)
4624 PerlIO_printf(Perl_debug_log,
4625 r->check_substr == r->float_substr
4626 && r->check_utf8 == r->float_utf8
4627 ? "(checking floating" : "(checking anchored");
4628 if (r->reganch & ROPT_NOSCAN)
4629 PerlIO_printf(Perl_debug_log, " noscan");
4630 if (r->reganch & ROPT_CHECK_ALL)
4631 PerlIO_printf(Perl_debug_log, " isall");
4632 if (r->check_substr || r->check_utf8)
4633 PerlIO_printf(Perl_debug_log, ") ");
4634
4635 if (r->regstclass) {
4636 regprop(sv, r->regstclass);
4637 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4638 }
4639 if (r->reganch & ROPT_ANCH) {
4640 PerlIO_printf(Perl_debug_log, "anchored");
4641 if (r->reganch & ROPT_ANCH_BOL)
4642 PerlIO_printf(Perl_debug_log, "(BOL)");
4643 if (r->reganch & ROPT_ANCH_MBOL)
4644 PerlIO_printf(Perl_debug_log, "(MBOL)");
4645 if (r->reganch & ROPT_ANCH_SBOL)
4646 PerlIO_printf(Perl_debug_log, "(SBOL)");
4647 if (r->reganch & ROPT_ANCH_GPOS)
4648 PerlIO_printf(Perl_debug_log, "(GPOS)");
4649 PerlIO_putc(Perl_debug_log, ' ');
4650 }
4651 if (r->reganch & ROPT_GPOS_SEEN)
4652 PerlIO_printf(Perl_debug_log, "GPOS ");
4653 if (r->reganch & ROPT_SKIP)
4654 PerlIO_printf(Perl_debug_log, "plus ");
4655 if (r->reganch & ROPT_IMPLICIT)
4656 PerlIO_printf(Perl_debug_log, "implicit ");
4657 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4658 if (r->reganch & ROPT_EVAL_SEEN)
4659 PerlIO_printf(Perl_debug_log, "with eval ");
4660 PerlIO_printf(Perl_debug_log, "\n");
4661 if (r->offsets) {
4662 U32 i;
4663 const U32 len = r->offsets[0];
4664 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4665 for (i = 1; i <= len; i++)
4666 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4667 (UV)r->offsets[i*2-1],
4668 (UV)r->offsets[i*2]);
4669 PerlIO_printf(Perl_debug_log, "\n");
4670 }
4671#endif /* DEBUGGING */
4672}
4673
4674/*
4675- regprop - printable representation of opcode
4676*/
4677void
4678Perl_regprop(pTHX_ SV *sv, regnode *o)
4679{
4680#ifdef DEBUGGING
4681 register int k;
4682
4683 sv_setpvn(sv, "", 0);
4684 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4685 /* It would be nice to FAIL() here, but this may be called from
4686 regexec.c, and it would be hard to supply pRExC_state. */
4687 Perl_croak(aTHX_ "Corrupted regexp opcode");
4688 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4689
4690 k = PL_regkind[(U8)OP(o)];
4691
4692 if (k == EXACT) {
4693 SV * const dsv = sv_2mortal(newSVpvn("", 0));
4694 /* Using is_utf8_string() is a crude hack but it may
4695 * be the best for now since we have no flag "this EXACTish
4696 * node was UTF-8" --jhi */
4697 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4698 const char * const s = do_utf8 ?
4699 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4700 UNI_DISPLAY_REGEX) :
4701 STRING(o);
4702 const int len = do_utf8 ?
4703 strlen(s) :
4704 STR_LEN(o);
4705 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4706 PL_colors[0],
4707 len, s,
4708 PL_colors[1]);
4709 }
4710 else if (k == CURLY) {
4711 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4712 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4713 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4714 }
4715 else if (k == WHILEM && o->flags) /* Ordinal/of */
4716 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4717 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4718 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4719 else if (k == LOGICAL)
4720 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4721 else if (k == ANYOF) {
4722 int i, rangestart = -1;
4723 const U8 flags = ANYOF_FLAGS(o);
4724 const char * const anyofs[] = { /* Should be synchronized with
4725 * ANYOF_ #xdefines in regcomp.h */
4726 "\\w",
4727 "\\W",
4728 "\\s",
4729 "\\S",
4730 "\\d",
4731 "\\D",
4732 "[:alnum:]",
4733 "[:^alnum:]",
4734 "[:alpha:]",
4735 "[:^alpha:]",
4736 "[:ascii:]",
4737 "[:^ascii:]",
4738 "[:ctrl:]",
4739 "[:^ctrl:]",
4740 "[:graph:]",
4741 "[:^graph:]",
4742 "[:lower:]",
4743 "[:^lower:]",
4744 "[:print:]",
4745 "[:^print:]",
4746 "[:punct:]",
4747 "[:^punct:]",
4748 "[:upper:]",
4749 "[:^upper:]",
4750 "[:xdigit:]",
4751 "[:^xdigit:]",
4752 "[:space:]",
4753 "[:^space:]",
4754 "[:blank:]",
4755 "[:^blank:]"
4756 };
4757
4758 if (flags & ANYOF_LOCALE)
4759 sv_catpv(sv, "{loc}");
4760 if (flags & ANYOF_FOLD)
4761 sv_catpv(sv, "{i}");
4762 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4763 if (flags & ANYOF_INVERT)
4764 sv_catpv(sv, "^");
4765 for (i = 0; i <= 256; i++) {
4766 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4767 if (rangestart == -1)
4768 rangestart = i;
4769 } else if (rangestart != -1) {
4770 if (i <= rangestart + 3)
4771 for (; rangestart < i; rangestart++)
4772 put_byte(sv, rangestart);
4773 else {
4774 put_byte(sv, rangestart);
4775 sv_catpv(sv, "-");
4776 put_byte(sv, i - 1);
4777 }
4778 rangestart = -1;
4779 }
4780 }
4781
4782 if (o->flags & ANYOF_CLASS)
4783 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4784 if (ANYOF_CLASS_TEST(o,i))
4785 sv_catpv(sv, anyofs[i]);
4786
4787 if (flags & ANYOF_UNICODE)
4788 sv_catpv(sv, "{unicode}");
4789 else if (flags & ANYOF_UNICODE_ALL)
4790 sv_catpv(sv, "{unicode_all}");
4791
4792 {
4793 SV *lv;
4794 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
4795
4796 if (lv) {
4797 if (sw) {
4798 U8 s[UTF8_MAXBYTES_CASE+1];
4799
4800 for (i = 0; i <= 256; i++) { /* just the first 256 */
4801 uvchr_to_utf8(s, i);
4802
4803 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4804 if (rangestart == -1)
4805 rangestart = i;
4806 } else if (rangestart != -1) {
4807 if (i <= rangestart + 3)
4808 for (; rangestart < i; rangestart++) {
4809 const U8 * const e = uvchr_to_utf8(s,rangestart);
4810 U8 *p;
4811 for(p = s; p < e; p++)
4812 put_byte(sv, *p);
4813 }
4814 else {
4815 const U8 *e = uvchr_to_utf8(s,rangestart);
4816 U8 *p;
4817 for (p = s; p < e; p++)
4818 put_byte(sv, *p);
4819 sv_catpvn(sv, "-", 1);
4820 e = uvchr_to_utf8(s, i-1);
4821 for (p = s; p < e; p++)
4822 put_byte(sv, *p);
4823 }
4824 rangestart = -1;
4825 }
4826 }
4827
4828 sv_catpv(sv, "..."); /* et cetera */
4829 }
4830
4831 {
4832 char *s = savesvpv(lv);
4833 char *origs = s;
4834
4835 while(*s && *s != '\n') s++;
4836
4837 if (*s == '\n') {
4838 const char * const t = ++s;
4839
4840 while (*s) {
4841 if (*s == '\n')
4842 *s = ' ';
4843 s++;
4844 }
4845 if (s[-1] == ' ')
4846 s[-1] = 0;
4847
4848 sv_catpv(sv, t);
4849 }
4850
4851 Safefree(origs);
4852 }
4853 }
4854 }
4855
4856 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4857 }
4858 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4859 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4860#endif /* DEBUGGING */
4861}
4862
4863SV *
4864Perl_re_intuit_string(pTHX_ regexp *prog)
4865{ /* Assume that RE_INTUIT is set */
4866 DEBUG_r(
4867 {
4868 const char * const s = SvPV_nolen_const(prog->check_substr
4869 ? prog->check_substr : prog->check_utf8);
4870
4871 if (!PL_colorset) reginitcolors();
4872 PerlIO_printf(Perl_debug_log,
4873 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4874 PL_colors[4],
4875 prog->check_substr ? "" : "utf8 ",
4876 PL_colors[5],PL_colors[0],
4877 s,
4878 PL_colors[1],
4879 (strlen(s) > 60 ? "..." : ""));
4880 } );
4881
4882 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4883}
4884
4885void
4886Perl_pregfree(pTHX_ struct regexp *r)
4887{
4888#ifdef DEBUGGING
4889 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4890#endif
4891
4892 if (!r || (--r->refcnt > 0))
4893 return;
4894 DEBUG_r({
4895 const char *s = (r->reganch & ROPT_UTF8)
4896 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4897 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4898 const int len = SvCUR(dsv);
4899 if (!PL_colorset)
4900 reginitcolors();
4901 PerlIO_printf(Perl_debug_log,
4902 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4903 PL_colors[4],PL_colors[5],PL_colors[0],
4904 len, len, s,
4905 PL_colors[1],
4906 len > 60 ? "..." : "");
4907 });
4908
4909 /* gcov results gave these as non-null 100% of the time, so there's no
4910 optimisation in checking them before calling Safefree */
4911 Safefree(r->precomp);
4912 Safefree(r->offsets); /* 20010421 MJD */
4913 if (RX_MATCH_COPIED(r))
4914 Safefree(r->subbeg);
4915 if (r->substrs) {
4916 if (r->anchored_substr)
4917 SvREFCNT_dec(r->anchored_substr);
4918 if (r->anchored_utf8)
4919 SvREFCNT_dec(r->anchored_utf8);
4920 if (r->float_substr)
4921 SvREFCNT_dec(r->float_substr);
4922 if (r->float_utf8)
4923 SvREFCNT_dec(r->float_utf8);
4924 Safefree(r->substrs);
4925 }
4926 if (r->data) {
4927 int n = r->data->count;
4928 PAD* new_comppad = NULL;
4929 PAD* old_comppad;
4930 PADOFFSET refcnt;
4931
4932 while (--n >= 0) {
4933 /* If you add a ->what type here, update the comment in regcomp.h */
4934 switch (r->data->what[n]) {
4935 case 's':
4936 SvREFCNT_dec((SV*)r->data->data[n]);
4937 break;
4938 case 'f':
4939 Safefree(r->data->data[n]);
4940 break;
4941 case 'p':
4942 new_comppad = (AV*)r->data->data[n];
4943 break;
4944 case 'o':
4945 if (new_comppad == NULL)
4946 Perl_croak(aTHX_ "panic: pregfree comppad");
4947 PAD_SAVE_LOCAL(old_comppad,
4948 /* Watch out for global destruction's random ordering. */
4949 (SvTYPE(new_comppad) == SVt_PVAV) ?
4950 new_comppad : Null(PAD *)
4951 );
4952 OP_REFCNT_LOCK;
4953 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4954 OP_REFCNT_UNLOCK;
4955 if (!refcnt)
4956 op_free((OP_4tree*)r->data->data[n]);
4957
4958 PAD_RESTORE_LOCAL(old_comppad);
4959 SvREFCNT_dec((SV*)new_comppad);
4960 new_comppad = NULL;
4961 break;
4962 case 'n':
4963 break;
4964 default:
4965 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4966 }
4967 }
4968 Safefree(r->data->what);
4969 Safefree(r->data);
4970 }
4971 Safefree(r->startp);
4972 Safefree(r->endp);
4973 Safefree(r);
4974}
4975
4976/*
4977 - regnext - dig the "next" pointer out of a node
4978 *
4979 * [Note, when REGALIGN is defined there are two places in regmatch()
4980 * that bypass this code for speed.]
4981 */
4982regnode *
4983Perl_regnext(pTHX_ register regnode *p)
4984{
4985 register I32 offset;
4986
4987 if (p == &PL_regdummy)
4988 return(NULL);
4989
4990 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4991 if (offset == 0)
4992 return(NULL);
4993
4994 return(p+offset);
4995}
4996
4997STATIC void
4998S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4999{
5000 va_list args;
5001 STRLEN l1 = strlen(pat1);
5002 STRLEN l2 = strlen(pat2);
5003 char buf[512];
5004 SV *msv;
5005 const char *message;
5006
5007 if (l1 > 510)
5008 l1 = 510;
5009 if (l1 + l2 > 510)
5010 l2 = 510 - l1;
5011 Copy(pat1, buf, l1 , char);
5012 Copy(pat2, buf + l1, l2 , char);
5013 buf[l1 + l2] = '\n';
5014 buf[l1 + l2 + 1] = '\0';
5015#ifdef I_STDARG
5016 /* ANSI variant takes additional second argument */
5017 va_start(args, pat2);
5018#else
5019 va_start(args);
5020#endif
5021 msv = vmess(buf, &args);
5022 va_end(args);
5023 message = SvPV_const(msv,l1);
5024 if (l1 > 512)
5025 l1 = 512;
5026 Copy(message, buf, l1 , char);
5027 buf[l1-1] = '\0'; /* Overwrite \n */
5028 Perl_croak(aTHX_ "%s", buf);
5029}
5030
5031/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5032
5033void
5034Perl_save_re_context(pTHX)
5035{
5036 SAVEI32(PL_reg_flags); /* from regexec.c */
5037 SAVEPPTR(PL_bostr);
5038 SAVEPPTR(PL_reginput); /* String-input pointer. */
5039 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5040 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5041 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5042 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5043 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5044 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5045 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5046 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5047 PL_reg_start_tmp = 0;
5048 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5049 PL_reg_start_tmpl = 0;
5050 SAVEVPTR(PL_regdata);
5051 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5052 SAVEI32(PL_regnarrate); /* from regexec.c */
5053 SAVEVPTR(PL_regprogram); /* from regexec.c */
5054 SAVEINT(PL_regindent); /* from regexec.c */
5055 SAVEVPTR(PL_regcc); /* from regexec.c */
5056 SAVEVPTR(PL_curcop);
5057 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5058 SAVEVPTR(PL_reg_re); /* from regexec.c */
5059 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5060 SAVESPTR(PL_reg_sv); /* from regexec.c */
5061 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5062 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5063 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5064 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5065 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5066 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5067 PL_reg_oldsaved = Nullch;
5068 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5069 PL_reg_oldsavedlen = 0;
5070 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5071 PL_reg_maxiter = 0;
5072 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5073 PL_reg_leftiter = 0;
5074 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5075 PL_reg_poscache = Nullch;
5076 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5077 PL_reg_poscache_size = 0;
5078 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5079 SAVEI32(PL_regnpar); /* () count. */
5080 SAVEI32(PL_regsize); /* from regexec.c */
5081
5082 {
5083 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5084 REGEXP *rx;
5085
5086 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5087 U32 i;
5088 for (i = 1; i <= rx->nparens; i++) {
5089 GV *mgv;
5090 char digits[TYPE_CHARS(long)];
5091 sprintf(digits, "%lu", (long)i);
5092 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5093 save_scalar(mgv);
5094 }
5095 }
5096 }
5097
5098#ifdef DEBUGGING
5099 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5100#endif
5101}
5102
5103static void
5104clear_re(pTHX_ void *r)
5105{
5106 ReREFCNT_dec((regexp *)r);
5107}
5108
5109#ifdef DEBUGGING
5110
5111STATIC void
5112S_put_byte(pTHX_ SV *sv, int c)
5113{
5114 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5115 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5116 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5117 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5118 else
5119 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5120}
5121
5122
5123STATIC regnode *
5124S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5125{
5126 register U8 op = EXACT; /* Arbitrary non-END op. */
5127 register regnode *next;
5128
5129 while (op != END && (!last || node < last)) {
5130 /* While that wasn't END last time... */
5131
5132 NODE_ALIGN(node);
5133 op = OP(node);
5134 if (op == CLOSE)
5135 l--;
5136 next = regnext(node);
5137 /* Where, what. */
5138 if (OP(node) == OPTIMIZED)
5139 goto after_print;
5140 regprop(sv, node);
5141 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5142 (int)(2*l + 1), "", SvPVX_const(sv));
5143 if (next == NULL) /* Next ptr. */
5144 PerlIO_printf(Perl_debug_log, "(0)");
5145 else
5146 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5147 (void)PerlIO_putc(Perl_debug_log, '\n');
5148 after_print:
5149 if (PL_regkind[(U8)op] == BRANCHJ) {
5150 register regnode *nnode = (OP(next) == LONGJMP
5151 ? regnext(next)
5152 : next);
5153 if (last && nnode > last)
5154 nnode = last;
5155 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5156 }
5157 else if (PL_regkind[(U8)op] == BRANCH) {
5158 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5159 }
5160 else if ( op == CURLY) { /* "next" might be very big: optimizer */
5161 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5162 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5163 }
5164 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5165 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5166 next, sv, l + 1);
5167 }
5168 else if ( op == PLUS || op == STAR) {
5169 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5170 }
5171 else if (op == ANYOF) {
5172 /* arglen 1 + class block */
5173 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5174 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5175 node = NEXTOPER(node);
5176 }
5177 else if (PL_regkind[(U8)op] == EXACT) {
5178 /* Literal string, where present. */
5179 node += NODE_SZ_STR(node) - 1;
5180 node = NEXTOPER(node);
5181 }
5182 else {
5183 node = NEXTOPER(node);
5184 node += regarglen[(U8)op];
5185 }
5186 if (op == CURLYX || op == OPEN)
5187 l++;
5188 else if (op == WHILEM)
5189 l--;
5190 }
5191 return node;
5192}
5193
5194#endif /* DEBUGGING */
5195
5196/*
5197 * Local variables:
5198 * c-indentation-style: bsd
5199 * c-basic-offset: 4
5200 * indent-tabs-mode: t
5201 * End:
5202 *
5203 * ex: set ts=8 sts=4 sw=4 noet:
5204 */
Note: See TracBrowser for help on using the repository browser.