Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

pcre.c

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* **** This is an ALTERED VERSION of PCRE **** */
00006 
00007 /*
00008 This is a library of functions to support regular expressions whose syntax
00009 and semantics are as close as possible to those of the Perl 5 language. See
00010 the file Tech.Notes for some information on the internals.
00011 
00012 Written by: Philip Hazel <ph10@cam.ac.uk>
00013 
00014            Copyright (c) 1997-1999 University of Cambridge
00015 
00016 -----------------------------------------------------------------------------
00017 Permission is granted to anyone to use this software for any purpose on any
00018 computer system, and to redistribute it freely, subject to the following
00019 restrictions:
00020 
00021 1. This software is distributed in the hope that it will be useful,
00022    but WITHOUT ANY WARRANTY; without even the implied warranty of
00023    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00024 
00025 2. The origin of this software must not be misrepresented, either by
00026    explicit claim or by omission.
00027 
00028 3. Altered versions must be plainly marked as such, and must not be
00029    misrepresented as being the original software.
00030 
00031 4. If PCRE is embedded in any software that is released under the GNU
00032    General Purpose Licence (GPL), then the terms of that licence shall
00033    supersede any condition above with which it is incompatible.
00034 -----------------------------------------------------------------------------
00035 */
00036 
00037 
00038 #ifdef __cplusplus
00039 extern "C" {
00040 #endif
00041 
00042 
00043 
00044 
00045 /* Define DEBUG to get debugging output on stdout. */
00046 
00047 /* #define DEBUG */
00048 
/* Debug tracing goes through the DPRINTF macro so that call sites need no
conditional compilation of their own (some compilers still reject indented
pre-processor lines). The single macro argument is a complete, parenthesized
printf argument list, for example DPRINTF(("count=%d\n", n)). Unless DEBUG is
defined the macro expands to nothing, so its argument is never evaluated. */

#ifndef DEBUG
#define DPRINTF(p) /*nothing*/
#else
#define DPRINTF(p) printf p
#endif
00058 
00059 /* Include the internals header, which itself includes Standard C headers plus
00060 the external pcre header. */
00061 
#include "pcreinternal.h"
#include <limits.h>
00063 
00064 
00065 /* Allow compilation as C++ source code, should anybody want to do that. */
00066 
00067 #ifdef __cplusplus
00068 #define class pcre_class
00069 #endif
00070 
00071 
00072 /* Number of items on the nested bracket stacks at compile time. This should
00073 not be set greater than 200. */
00074 
00075 #define BRASTACK_SIZE 200
00076 
00077 
/* Lower and upper repeat bounds for the six common repeats. An upper bound of
zero stands for "no limit". NOTE(review): the index order presumably follows
the repeat opcodes (*, *?, +, +?, ?, ??) - confirm against the OP_ values. */

static const char rep_min[] = {
  0, 0,     /* zero or more */
  1, 1,     /* one or more  */
  0, 0      /* zero or one  */
};

static const char rep_max[] = {
  0, 0,     /* unlimited    */
  0, 0,     /* unlimited    */
  1, 1      /* at most one  */
};
00082 
00083 /* Text forms of OP_ values and things, for debugging (not all used) */
00084 
00085 #ifdef DEBUG
static const char *OP_names[] = {
  /* End of pattern, then the simple escape items */
  "End",
  "\\A", "\\B", "\\b", "\\D", "\\d", "\\S",
  "\\s", "\\W", "\\w", "\\Z", "\\z",

  /* Option setting, anchors, any-character, literal string, negated char */
  "Opt", "^", "$", "Any", "chars", "not",

  /* Three families of nine repeat items each. NOTE(review): presumably the
  plain, "not" and character-type repeats, in that order - confirm against
  the OP_ definitions. */
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",

  /* A fourth family with only eight entries */
  "*", "*?", "+", "+?", "?", "??", "{", "{",

  /* Character class and back reference */
  "class", "Ref",

  /* Alternation, group terminators and assertions */
  "Alt", "Ket", "KetRmax", "KetRmin",
  "Assert", "Assert not", "AssertB", "AssertB not",
  "Reverse", "Once", "Cond", "Cref",

  /* Bracket openers */
  "Brazero", "Braminzero", "Bra"
};
00099 #endif
00100 
00101 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
00102 are simple data values; negative values are for special things like \d and so
00103 on. Zero means further processing is needed (for things like \x), or the escape
00104 is invalid. */
00105 
00106 static const short int escapes[] = {
00107     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
00108     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
00109   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
00110     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
00111     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
00112     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
00113   '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
00114     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
00115     0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
00116     0,      0, -ESC_z                                            /* x - z */
00117 };
00118 
00119 /* Definition to allow mutual recursion */
00120 
00121 static BOOL
00122   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
00123     BOOL, int, compile_data *);
00124 
00125 
00126 
00127 /*************************************************
00128 *               Global variables                 *
00129 *************************************************/
00130 
/* PCRE is thread-clean and keeps no global variables in the normal sense.
The one exception is the pair of function pointers below, through which all
memory allocation and release is routed. They start out as the standard
malloc() and free(); the caller can change them, but they are shared between
all threads. */

void *(*vmdpcre_malloc)(size_t) = &malloc;
void  (*vmdpcre_free)(void *) = &free;
00138 
00139 
00140 
00141 
00142 /*************************************************
00143 *             Default character tables           *
00144 *************************************************/
00145 
00146 /* A default set of character tables is included in the PCRE binary. Its source
00147 is built by the maketables auxiliary program, which uses the default C ctypes
00148 functions, and put in the file chartables.c. These tables are used by PCRE
00149 whenever the caller of pcre_compile() does not provide an alternate set of
00150 tables. */
00151 
00152 #include "pcretables.h"
00153 
00154 
00155 
00156 /*************************************************
00157 *          Return version string                 *
00158 *************************************************/
00159 
00160 const char *
00161 vmdpcre_version(void)
00162 {
00163 return PCRE_VERSION;
00164 }
00165 
00166 
00167 
00168 
00169 /*************************************************
00170 *       Return info about a compiled pattern     *
00171 *************************************************/
00172 
00173 /* This function picks potentially useful data out of the private
00174 structure.
00175 
00176 Arguments:
00177   external_re   points to compiled code
00178   optptr        where to pass back the options
00179   first_char    where to pass back the first character,
00180                 or -1 if multiline and all branches start ^,
00181                 or -2 otherwise
00182 
00183 Returns:        number of identifying extraction brackets
00184                 or negative values on error
00185 */
00186 
00187 int
00188 vmdpcre_info(const pcre *external_re, int *optptr, int *first_char)
00189 {
00190 const real_pcre *re = (const real_pcre *)external_re;
00191 if (re == NULL) return PCRE_ERROR_NULL;
00192 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
00193 if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);
00194 if (first_char != NULL)
00195   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
00196      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
00197 return re->top_bracket;
00198 }
00199 
00200 
00201 
00202 
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of characters to stdout in printable form: printable characters
are output as themselves, everything else as \x followed by two hex digits.
When is_subject is set, output stops at the end of the subject even if the
requested length runs further.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int remaining = length;

/* Clamp to what is left of the subject; md is only consulted in this case */

if (is_subject && remaining > md->end_subject - p)
  remaining = md->end_subject - p;

for (; remaining > 0; remaining--)
  {
  int c = *p++;
  if (isprint(c))
    printf("%c", c);
  else
    printf("\\x%02x", c);
  }
}
#endif
00229 
00230 
00231 
00232 
00233 /*************************************************
00234 *            Handle escapes                      *
00235 *************************************************/
00236 
00237 /* This function is called when a \ has been encountered. It either returns a
00238 positive value for a simple escape such as \n, or a negative value which
00239 encodes one of the more complicated things such as \d. On entry, ptr is
00240 pointing at the \. On exit, it is on the final character of the escape
00241 sequence.
00242 
00243 Arguments:
00244   ptrptr     points to the pattern position pointer
00245   errorptr   points to the pointer to the error message
00246   bracount   number of previous extracting brackets
00247   options    the options bits
00248   isclass    TRUE if inside a character class
00249   cd         pointer to char tables block
00250 
00251 Returns:     zero or positive => a data character
00252              negative => a special escape sequence
00253              on error, errorptr is set
00254 */
00255 
00256 static int
00257 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
00258   int options, BOOL isclass, compile_data *cd)
00259 {
00260 const uschar *ptr = *ptrptr;
00261 int c = *(ptr+1) & 255;   /* Ensure > 0 on signed-char systems */
00262 int i;
00263 ++ptr;
00264 
00265 if (c == 0) *errorptr = ERR1;
00266 
00267 /* Digits or letters may have special meaning; all others are literals. */
00268 
00269 else if (c < '0' || c > 'z') {}
00270 
00271 /* Do an initial lookup in a table. A non-zero result is something that can be
00272 returned immediately. Otherwise further processing may be required. */
00273 
00274 else if ((i = escapes[c - '0']) != 0) c = i;
00275 
00276 /* Escapes that need further processing, or are illegal. */
00277 
00278 else
00279   {
00280   const uschar *oldptr;
00281   switch (c)
00282     {
00283     /* The handling of escape sequences consisting of a string of digits
00284     starting with one that is not zero is not straightforward. By experiment,
00285     the way Perl works seems to be as follows:
00286 
00287     Outside a character class, the digits are read as a decimal number. If the
00288     number is less than 10, or if there are that many previous extracting
00289     left brackets, then it is a back reference. Otherwise, up to three octal
00290     digits are read to form an escaped byte. Thus \123 is likely to be octal
00291     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
00292     value is greater than 377, the least significant 8 bits are taken. Inside a
00293     character class, \ followed by a digit is always an octal number. */
00294 
00295     case '1': case '2': case '3': case '4': case '5':
00296     case '6': case '7': case '8': case '9':
00297 
00298     if (!isclass)
00299       {
00300       oldptr = ptr;
00301       c -= '0';
00302       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
00303         c = c * 10 + *(++ptr) - '0';
00304       if (c < 10 || c <= bracount)
00305         {
00306         c = -(ESC_REF + c);
00307         break;
00308         }
00309       ptr = oldptr;      /* Put the pointer back and fall through */
00310       }
00311 
00312     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
00313     generates a binary zero byte and treats the digit as a following literal.
00314     Thus we have to pull back the pointer by one. */
00315 
00316     if ((c = *ptr) >= '8')
00317       {
00318       ptr--;
00319       c = 0;
00320       break;
00321       }
00322 
00323     /* \0 always starts an octal number, but we may drop through to here with a
00324     larger first octal digit */
00325 
00326     case '0':
00327     c -= '0';
00328     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
00329       ptr[1] != '8' && ptr[1] != '9')
00330         c = c * 8 + *(++ptr) - '0';
00331     break;
00332 
00333     /* Special escapes not starting with a digit are straightforward */
00334 
00335     case 'x':
00336     c = 0;
00337     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
00338       {
00339       ptr++;
00340       c = c * 16 + cd->lcc[*ptr] -
00341         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
00342       }
00343     break;
00344 
00345     case 'c':
00346     c = *(++ptr);
00347     if (c == 0)
00348       {
00349       *errorptr = ERR2;
00350       return 0;
00351       }
00352 
00353     /* A letter is upper-cased; then the 0x40 bit is flipped */
00354 
00355     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
00356     c ^= 0x40;
00357     break;
00358 
00359     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
00360     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
00361     for Perl compatibility, it is a literal. This code looks a bit odd, but
00362     there used to be some cases other than the default, and there may be again
00363     in future, so I haven't "optimized" it. */
00364 
00365     default:
00366     if ((options & PCRE_EXTRA) != 0) switch(c)
00367       {
00368       default:
00369       *errorptr = ERR3;
00370       break;
00371       }
00372     break;
00373     }
00374   }
00375 
00376 *ptrptr = ptr;
00377 return c;
00378 }
00379 
00380 
00381 
00382 /*************************************************
00383 *            Check for counted repeat            *
00384 *************************************************/
00385 
00386 /* This function is called when a '{' is encountered in a place where it might
00387 start a quantifier. It looks ahead to see if it really is a quantifier or not.
00388 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
00389 where the ddds are digits.
00390 
00391 Arguments:
00392   p         pointer to the first char after '{'
00393   cd        pointer to char tables block
00394 
00395 Returns:    TRUE or FALSE
00396 */
00397 
00398 static BOOL
00399 is_counted_repeat(const uschar *p, compile_data *cd)
00400 {
00401 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
00402 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
00403 if (*p == '}') return TRUE;
00404 
00405 if (*p++ != ',') return FALSE;
00406 if (*p == '}') return TRUE;
00407 
00408 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
00409 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
00410 return (*p == '}');
00411 }
00412 
00413 
00414 
00415 /*************************************************
00416 *         Read repeat counts                     *
00417 *************************************************/
00418 
00419 /* Read an item of the form {n,m} and return the values. This is called only
00420 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
00421 so the syntax is guaranteed to be correct, but we need to check the values.
00422 
00423 Arguments:
00424   p          pointer to first char after '{'
00425   minp       pointer to int for min
00426   maxp       pointer to int for max
00427              returned as -1 if no max
00428   errorptr   points to pointer to error message
00429   cd         pointer to character tables clock
00430 
00431 Returns:     pointer to '}' on success;
00432              current ptr on error, with errorptr set
00433 */
00434 
00435 static const uschar *
00436 read_repeat_counts(const uschar *p, int *minp, int *maxp,
00437   const char **errorptr, compile_data *cd)
00438 {
00439 int min = 0;
00440 int max = -1;
00441 
00442 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
00443 
00444 if (*p == '}') max = min; else
00445   {
00446   if (*(++p) != '}')
00447     {
00448     max = 0;
00449     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
00450     if (max < min)
00451       {
00452       *errorptr = ERR4;
00453       return p;
00454       }
00455     }
00456   }
00457 
00458 /* Do paranoid checks, then fill in the required variables, and pass back the
00459 pointer to the terminating '}'. */
00460 
00461 if (min > 65535 || max > 65535)
00462   *errorptr = ERR5;
00463 else
00464   {
00465   *minp = min;
00466   *maxp = max;
00467   }
00468 return p;
00469 }
00470 
00471 
00472 
00473 /*************************************************
00474 *        Find the fixed length of a pattern      *
00475 *************************************************/
00476 
00477 /* Scan a pattern and compute the fixed length of subject that will match it,
00478 if the length is fixed. This is needed for dealing with backward assertions.
00479 
00480 Arguments:
00481   code     points to the start of the pattern (the bracket)
00482 
00483 Returns:   the fixed length, or -1 if there is no fixed length
00484 */
00485 
00486 static int
00487 find_fixedlength(uschar *code)
00488 {
00489 int length = -1;
00490 
00491 register int branchlength = 0;
00492 register uschar *cc = code + 3;
00493 
00494 /* Scan along the opcodes for this branch. If we get to the end of the
00495 branch, check the length against that of the other branches. */
00496 
00497 for (;;)
00498   {
00499   int d;
00500   register int op = *cc;
00501   if (op >= OP_BRA) op = OP_BRA;
00502 
00503   switch (op)
00504     {
00505     case OP_BRA:
00506     case OP_ONCE:
00507     case OP_COND:
00508     d = find_fixedlength(cc);
00509     if (d < 0) return -1;
00510     branchlength += d;
00511     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
00512     cc += 3;
00513     break;
00514 
00515     /* Reached end of a branch; if it's a ket it is the end of a nested
00516     call. If it's ALT it is an alternation in a nested call. If it is
00517     END it's the end of the outer call. All can be handled by the same code. */
00518 
00519     case OP_ALT:
00520     case OP_KET:
00521     case OP_KETRMAX:
00522     case OP_KETRMIN:
00523     case OP_END:
00524     if (length < 0) length = branchlength;
00525       else if (length != branchlength) return -1;
00526     if (*cc != OP_ALT) return length;
00527     cc += 3;
00528     branchlength = 0;
00529     break;
00530 
00531     /* Skip over assertive subpatterns */
00532 
00533     case OP_ASSERT:
00534     case OP_ASSERT_NOT:
00535     case OP_ASSERTBACK:
00536     case OP_ASSERTBACK_NOT:
00537     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
00538     cc += 3;
00539     break;
00540 
00541     /* Skip over things that don't match chars */
00542 
00543     case OP_REVERSE:
00544     cc++;
00545 
00546     case OP_CREF:
00547     case OP_OPT:
00548     cc++;
00549     /* Fall through */
00550 
00551     case OP_SOD:
00552     case OP_EOD:
00553     case OP_EODN:
00554     case OP_CIRC:
00555     case OP_DOLL:
00556     case OP_NOT_WORD_BOUNDARY:
00557     case OP_WORD_BOUNDARY:
00558     cc++;
00559     break;
00560 
00561     /* Handle char strings */
00562 
00563     case OP_CHARS:
00564     branchlength += *(++cc);
00565     cc += *cc + 1;
00566     break;
00567 
00568     /* Handle exact repetitions */
00569 
00570     case OP_EXACT:
00571     case OP_TYPEEXACT:
00572     branchlength += (cc[1] << 8) + cc[2];
00573     cc += 4;
00574     break;
00575 
00576     /* Handle single-char matchers */
00577 
00578     case OP_NOT_DIGIT:
00579     case OP_DIGIT:
00580     case OP_NOT_WHITESPACE:
00581     case OP_WHITESPACE:
00582     case OP_NOT_WORDCHAR:
00583     case OP_WORDCHAR:
00584     case OP_ANY:
00585     branchlength++;
00586     cc++;
00587     break;
00588 
00589 
00590     /* Check a class for variable quantification */
00591 
00592     case OP_CLASS:
00593     cc += (*cc == OP_REF)? 2 : 33;
00594 
00595     switch (*cc)
00596       {
00597       case OP_CRSTAR:
00598       case OP_CRMINSTAR:
00599       case OP_CRQUERY:
00600       case OP_CRMINQUERY:
00601       return -1;
00602 
00603       case OP_CRRANGE:
00604       case OP_CRMINRANGE:
00605       if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
00606       branchlength += (cc[1] << 8) + cc[2];
00607       cc += 5;
00608       break;
00609 
00610       default:
00611       branchlength++;
00612       }
00613     break;
00614 
00615     /* Anything else is variable length */
00616 
00617     default:
00618     return -1;
00619     }
00620   }
00621 /* Control never gets here */
00622 }
00623 
00624 
00625 
00626 
00627 /*************************************************
00628 *           Compile one branch                   *
00629 *************************************************/
00630 
00631 /* Scan the pattern, compiling it into the code vector.
00632 
00633 Arguments:
00634   options      the option bits
00635   brackets     points to number of brackets used
00636   code         points to the pointer to the current code point
00637   ptrptr       points to the current pattern pointer
00638   errorptr     points to pointer to error message
00639   optchanged   set to the value of the last OP_OPT item compiled
00640   cd           contains pointers to tables
00641 
00642 Returns:       TRUE on success
00643                FALSE, with *errorptr set on error
00644 */
00645 
00646 static BOOL
00647 compile_branch(int options, int *brackets, uschar **codeptr,
00648   const uschar **ptrptr, const char **errorptr, int *optchanged,
00649   compile_data *cd)
00650 {
00651 int repeat_type, op_type;
00652 int repeat_min, repeat_max;
00653 int bravalue, length;
00654 int greedy_default, greedy_non_default;
00655 register int c;
00656 register uschar *code = *codeptr;
00657 uschar *tempcode;
00658 const uschar *ptr = *ptrptr;
00659 const uschar *tempptr;
00660 uschar *previous = NULL;
00661 uschar class[32];
00662 
00663 /* Set up the default and non-default settings for greediness */
00664 
00665 greedy_default = ((options & PCRE_UNGREEDY) != 0);
00666 greedy_non_default = greedy_default ^ 1;
00667 
00668 /* Switch on next character until the end of the branch */
00669 
00670 for (;; ptr++)
00671   {
00672   BOOL negate_class;
00673   int class_charcount;
00674   int class_lastchar;
00675   int newoptions;
00676   int condref;
00677 
00678   c = *ptr;
00679   if ((options & PCRE_EXTENDED) != 0)
00680     {
00681     if ((cd->ctypes[c] & ctype_space) != 0) continue;
00682     if (c == '#')
00683       {
00684       while ((c = *(++ptr)) != 0 && c != '\n');
00685       continue;
00686       }
00687     }
00688 
00689   switch(c)
00690     {
00691     /* The branch terminates at end of string, |, or ). */
00692 
00693     case 0:
00694     case '|':
00695     case ')':
00696     *codeptr = code;
00697     *ptrptr = ptr;
00698     return TRUE;
00699 
00700     /* Handle single-character metacharacters */
00701 
00702     case '^':
00703     previous = NULL;
00704     *code++ = OP_CIRC;
00705     break;
00706 
00707     case '$':
00708     previous = NULL;
00709     *code++ = OP_DOLL;
00710     break;
00711 
00712     case '.':
00713     previous = code;
00714     *code++ = OP_ANY;
00715     break;
00716 
00717     /* Character classes. These always build a 32-byte bitmap of the permitted
00718     characters, except in the special case where there is only one character.
00719     For negated classes, we build the map as usual, then invert it at the end.
00720     */
00721 
00722     case '[':
00723     previous = code;
00724     *code++ = OP_CLASS;
00725 
00726     /* If the first character is '^', set the negation flag and skip it. */
00727 
00728     if ((c = *(++ptr)) == '^')
00729       {
00730       negate_class = TRUE;
00731       c = *(++ptr);
00732       }
00733     else negate_class = FALSE;
00734 
00735     /* Keep a count of chars so that we can optimize the case of just a single
00736     character. */
00737 
00738     class_charcount = 0;
00739     class_lastchar = -1;
00740 
00741     /* Initialize the 32-char bit map to all zeros. We have to build the
00742     map in a temporary bit of store, in case the class contains only 1
00743     character, because in that case the compiled code doesn't use the
00744     bit map. */
00745 
00746     memset(class, 0, 32 * sizeof(uschar));
00747 
00748     /* Process characters until ] is reached. By writing this as a "do" it
00749     means that an initial ] is taken as a data character. */
00750 
00751     do
00752       {
00753       if (c == 0)
00754         {
00755         *errorptr = ERR6;
00756         goto FAILED;
00757         }
00758 
00759       /* Backslash may introduce a single character, or it may introduce one
00760       of the specials, which just set a flag. Escaped items are checked for
00761       validity in the pre-compiling pass. The sequence \b is a special case.
00762       Inside a class (and only there) it is treated as backspace. Elsewhere
00763       it marks a word boundary. Other escapes have preset maps ready to
00764       or into the one we are building. We assume they have more than one
00765       character in them, so set class_count bigger than one. */
00766 
00767       if (c == '\\')
00768         {
00769         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
00770         if (-c == ESC_b) c = '\b';
00771         else if (c < 0)
00772           {
00773           register const uschar *cbits = cd->cbits;
00774           class_charcount = 10;
00775           switch (-c)
00776             {
00777             case ESC_d:
00778             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
00779             continue;
00780 
00781             case ESC_D:
00782             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
00783             continue;
00784 
00785             case ESC_w:
00786             for (c = 0; c < 32; c++)
00787               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
00788             continue;
00789 
00790             case ESC_W:
00791             for (c = 0; c < 32; c++)
00792               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
00793             continue;
00794 
00795             case ESC_s:
00796             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
00797             continue;
00798 
00799             case ESC_S:
00800             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
00801             continue;
00802 
00803             default:
00804             *errorptr = ERR7;
00805             goto FAILED;
00806             }
00807           }
00808         /* Fall through if single character */
00809         }
00810 
00811       /* A single character may be followed by '-' to form a range. However,
00812       Perl does not permit ']' to be the end of the range. A '-' character
00813       here is treated as a literal. */
00814 
00815       if (ptr[1] == '-' && ptr[2] != ']')
00816         {
00817         int d;
00818         ptr += 2;
00819         d = *ptr;
00820 
00821         if (d == 0)
00822           {
00823           *errorptr = ERR6;
00824           goto FAILED;
00825           }
00826 
00827         /* The second part of a range can be a single-character escape, but
00828         not any of the other escapes. */
00829 
00830         if (d == '\\')
00831           {
00832           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
00833           if (d < 0)
00834             {
00835             if (d == -ESC_b) d = '\b'; else
00836               {
00837               *errorptr = ERR7;
00838               goto FAILED;
00839               }
00840             }
00841           }
00842 
00843         if (d < c)
00844           {
00845           *errorptr = ERR8;
00846           goto FAILED;
00847           }
00848 
00849         for (; c <= d; c++)
00850           {
00851           class[c/8] |= (1 << (c&7));
00852           if ((options & PCRE_CASELESS) != 0)
00853             {
00854             int uc = cd->fcc[c];           /* flip case */
00855             class[uc/8] |= (1 << (uc&7));
00856             }
00857           class_charcount++;                /* in case a one-char range */
00858           class_lastchar = c;
00859           }
00860         continue;   /* Go get the next char in the class */
00861         }
00862 
00863       /* Handle a lone single character - we can get here for a normal
00864       non-escape char, or after \ that introduces a single character. */
00865 
00866       class [c/8] |= (1 << (c&7));
00867       if ((options & PCRE_CASELESS) != 0)
00868         {
00869         c = cd->fcc[c];   /* flip case */
00870         class[c/8] |= (1 << (c&7));
00871         }
00872       class_charcount++;
00873       class_lastchar = c;
00874       }
00875 
00876     /* Loop until ']' reached; the check for end of string happens inside the
00877     loop. This "while" is the end of the "do" above. */
00878 
00879     while ((c = *(++ptr)) != ']');
00880 
00881     /* If class_charcount is 1 and class_lastchar is not negative, we saw
00882     precisely one character. This doesn't need the whole 32-byte bit map.
00883     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
00884     it's negative. */
00885 
00886     if (class_charcount == 1 && class_lastchar >= 0)
00887       {
00888       if (negate_class)
00889         {
00890         code[-1] = OP_NOT;
00891         }
00892       else
00893         {
00894         code[-1] = OP_CHARS;
00895         *code++ = 1;
00896         }
00897       *code++ = class_lastchar;
00898       }
00899 
00900     /* Otherwise, negate the 32-byte map if necessary, and copy it into
00901     the code vector. */
00902 
00903     else
00904       {
00905       if (negate_class)
00906         for (c = 0; c < 32; c++) code[c] = ~class[c];
00907       else
00908         memcpy(code, class, 32);
00909       code += 32;
00910       }
00911     break;
00912 
00913     /* Various kinds of repeat */
00914 
00915     case '{':
00916     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
00917     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
00918     if (*errorptr != NULL) goto FAILED;
00919     goto REPEAT;
00920 
00921     case '*':
00922     repeat_min = 0;
00923     repeat_max = -1;
00924     goto REPEAT;
00925 
00926     case '+':
00927     repeat_min = 1;
00928     repeat_max = -1;
00929     goto REPEAT;
00930 
00931     case '?':
00932     repeat_min = 0;
00933     repeat_max = 1;
00934 
00935     REPEAT:
00936     if (previous == NULL)
00937       {
00938       *errorptr = ERR9;
00939       goto FAILED;
00940       }
00941 
00942     /* If the next character is '?' this is a minimizing repeat, by default,
00943     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
00944     next character. */
00945 
00946     if (ptr[1] == '?')
00947       { repeat_type = greedy_non_default; ptr++; }
00948     else repeat_type = greedy_default;
00949 
00950     /* If the maximum is zero then the minimum must also be zero; Perl allows
00951     this case, so we do too - by simply omitting the item altogether. */
00952 
00953     if (repeat_max == 0) code = previous;
00954 
00955     /* If previous was a string of characters, chop off the last one and use it
00956     as the subject of the repeat. If there was only one character, we can
00957     abolish the previous item altogether. */
00958 
00959     else if (*previous == OP_CHARS)
00960       {
00961       int len = previous[1];
00962       if (len == 1)
00963         {
00964         c = previous[2];
00965         code = previous;
00966         }
00967       else
00968         {
00969         c = previous[len+1];
00970         previous[1]--;
00971         code--;
00972         }
00973       op_type = 0;                 /* Use single-char op codes */
00974       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
00975       }
00976 
00977     /* If previous was a single negated character ([^a] or similar), we use
00978     one of the special opcodes, replacing it. The code is shared with single-
00979     character repeats by adding a suitable offset into repeat_type. */
00980 
00981     else if ((int)*previous == OP_NOT)
00982       {
00983       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
00984       c = previous[1];
00985       code = previous;
00986       goto OUTPUT_SINGLE_REPEAT;
00987       }
00988 
00989     /* If previous was a character type match (\d or similar), abolish it and
00990     create a suitable repeat item. The code is shared with single-character
00991     repeats by adding a suitable offset into repeat_type. */
00992 
00993     else if ((int)*previous < OP_EODN || *previous == OP_ANY)
00994       {
00995       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
00996       c = *previous;
00997       code = previous;
00998 
00999       OUTPUT_SINGLE_REPEAT:
01000       repeat_type += op_type;      /* Combine both values for many cases */
01001 
01002       /* A minimum of zero is handled either as the special case * or ?, or as
01003       an UPTO, with the maximum given. */
01004 
01005       if (repeat_min == 0)
01006         {
01007         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
01008           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
01009         else
01010           {
01011           *code++ = OP_UPTO + repeat_type;
01012           *code++ = repeat_max >> 8;
01013           *code++ = (repeat_max & 255);
01014           }
01015         }
01016 
01017       /* The case {1,} is handled as the special case + */
01018 
01019       else if (repeat_min == 1 && repeat_max == -1)
01020         *code++ = OP_PLUS + repeat_type;
01021 
01022       /* The case {n,n} is just an EXACT, while the general case {n,m} is
01023       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
01024 
01025       else
01026         {
01027         if (repeat_min != 1)
01028           {
01029           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
01030           *code++ = repeat_min >> 8;
01031           *code++ = (repeat_min & 255);
01032           }
01033 
01034         /* If the minimum is 1 and the previous item was a character string,
01035         we either have to put back the item that got cancelled if the string
01036         length was 1, or add the character back onto the end of a longer
01037         string. For a character type nothing need be done; it will just get
01038         put back naturally. Note that the final character is always going to
01039         get added below. */
01040 
01041         else if (*previous == OP_CHARS)
01042           {
01043           if (code == previous) code += 2; else previous[1]++;
01044           }
01045 
01046         /*  For a single negated character we also have to put back the
01047         item that got cancelled. */
01048 
01049         else if (*previous == OP_NOT) code++;
01050 
01051         /* If the maximum is unlimited, insert an OP_STAR. */
01052 
01053         if (repeat_max < 0)
01054           {
01055           *code++ = c;
01056           *code++ = OP_STAR + repeat_type;
01057           }
01058 
01059         /* Else insert an UPTO if the max is greater than the min. */
01060 
01061         else if (repeat_max != repeat_min)
01062           {
01063           *code++ = c;
01064           repeat_max -= repeat_min;
01065           *code++ = OP_UPTO + repeat_type;
01066           *code++ = repeat_max >> 8;
01067           *code++ = (repeat_max & 255);
01068           }
01069         }
01070 
01071       /* The character or character type itself comes last in all cases. */
01072 
01073       *code++ = c;
01074       }
01075 
01076     /* If previous was a character class or a back reference, we put the repeat
01077     stuff after it. */
01078 
01079     else if (*previous == OP_CLASS || *previous == OP_REF)
01080       {
01081       if (repeat_min == 0 && repeat_max == -1)
01082         *code++ = OP_CRSTAR + repeat_type;
01083       else if (repeat_min == 1 && repeat_max == -1)
01084         *code++ = OP_CRPLUS + repeat_type;
01085       else if (repeat_min == 0 && repeat_max == 1)
01086         *code++ = OP_CRQUERY + repeat_type;
01087       else
01088         {
01089         *code++ = OP_CRRANGE + repeat_type;
01090         *code++ = repeat_min >> 8;
01091         *code++ = repeat_min & 255;
01092         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
01093         *code++ = repeat_max >> 8;
01094         *code++ = repeat_max & 255;
01095         }
01096       }
01097 
01098     /* If previous was a bracket group, we may have to replicate it in certain
01099     cases. */
01100 
01101     else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
01102              (int)*previous == OP_COND)
01103       {
01104       register int i;
01105       int ketoffset = 0;
01106       int len = code - previous;
01107       uschar *bralink = NULL;
01108 
01109       /* If the maximum repeat count is unlimited, find the end of the bracket
01110       by scanning through from the start, and compute the offset back to it
01111       from the current code pointer. There may be an OP_OPT setting following
01112       the final KET, so we can't find the end just by going back from the code
01113       pointer. */
01114 
01115       if (repeat_max == -1)
01116         {
01117         register uschar *ket = previous;
01118         do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
01119         ketoffset = code - ket;
01120         }
01121 
01122       /* The case of a zero minimum is special because of the need to stick
01123       OP_BRAZERO in front of it, and because the group appears once in the
01124       data, whereas in other cases it appears the minimum number of times. For
01125       this reason, it is simplest to treat this case separately, as otherwise
01126       the code gets far too messy. There are several special subcases when the
01127       minimum is zero. */
01128 
01129       if (repeat_min == 0)
01130         {
01131         /* If the maximum is also zero, we just omit the group from the output
01132         altogether. */
01133 
01134         if (repeat_max == 0)
01135           {
01136           code = previous;
01137           previous = NULL;
01138           break;
01139           }
01140 
01141         /* If the maximum is 1 or unlimited, we just have to stick in the
01142         BRAZERO and do no more at this point. */
01143 
01144         if (repeat_max <= 1)
01145           {
01146           memmove(previous+1, previous, len);
01147           code++;
01148           *previous++ = OP_BRAZERO + repeat_type;
01149           }
01150 
01151         /* If the maximum is greater than 1 and limited, we have to replicate
01152         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
01153         The first one has to be handled carefully because it's the original
01154         copy, which has to be moved up. The remainder can be handled by code
01155         that is common with the non-zero minimum case below. We just have to
01156         adjust the value of repeat_max, since one less copy is required. */
01157 
01158         else
01159           {
01160           int offset;
01161           memmove(previous+4, previous, len);
01162           code += 4;
01163           *previous++ = OP_BRAZERO + repeat_type;
01164           *previous++ = OP_BRA;
01165 
01166           /* We chain together the bracket offset fields that have to be
01167           filled in later when the ends of the brackets are reached. */
01168 
01169           offset = (bralink == NULL)? 0 : previous - bralink;
01170           bralink = previous;
01171           *previous++ = offset >> 8;
01172           *previous++ = offset & 255;
01173           }
01174 
01175         repeat_max--;
01176         }
01177 
01178       /* If the minimum is greater than zero, replicate the group as many
01179       times as necessary, and adjust the maximum to the number of subsequent
01180       copies that we need. */
01181 
01182       else
01183         {
01184         for (i = 1; i < repeat_min; i++)
01185           {
01186           memcpy(code, previous, len);
01187           code += len;
01188           }
01189         if (repeat_max > 0) repeat_max -= repeat_min;
01190         }
01191 
01192       /* This code is common to both the zero and non-zero minimum cases. If
01193       the maximum is limited, it replicates the group in a nested fashion,
01194       remembering the bracket starts on a stack. In the case of a zero minimum,
01195       the first one was set up above. In all cases the repeat_max now specifies
01196       the number of additional copies needed. */
01197 
01198       if (repeat_max >= 0)
01199         {
01200         for (i = repeat_max - 1; i >= 0; i--)
01201           {
01202           *code++ = OP_BRAZERO + repeat_type;
01203 
01204           /* All but the final copy start a new nesting, maintaining the
01205           chain of brackets outstanding. */
01206 
01207           if (i != 0)
01208             {
01209             int offset;
01210             *code++ = OP_BRA;
01211             offset = (bralink == NULL)? 0 : code - bralink;
01212             bralink = code;
01213             *code++ = offset >> 8;
01214             *code++ = offset & 255;
01215             }
01216 
01217           memcpy(code, previous, len);
01218           code += len;
01219           }
01220 
01221         /* Now chain through the pending brackets, and fill in their length
01222         fields (which are holding the chain links pro tem). */
01223 
01224         while (bralink != NULL)
01225           {
01226           int oldlinkoffset;
01227           int offset = code - bralink + 1;
01228           uschar *bra = code - offset;
01229           oldlinkoffset = (bra[1] << 8) + bra[2];
01230           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
01231           *code++ = OP_KET;
01232           *code++ = bra[1] = offset >> 8;
01233           *code++ = bra[2] = (offset & 255);
01234           }
01235         }
01236 
01237       /* If the maximum is unlimited, set a repeater in the final copy. We
01238       can't just offset backwards from the current code point, because we
01239       don't know if there's been an options resetting after the ket. The
01240       correct offset was computed above. */
01241 
01242       else code[-ketoffset] = OP_KETRMAX + repeat_type;
01243 
01244 
01245 #ifdef NEVER
01246       /* If the minimum is greater than zero, and the maximum is unlimited or
01247       equal to the minimum, the first copy remains where it is, and is
01248       replicated up to the minimum number of times. This case includes the +
01249       repeat, but of course no replication is needed in that case. */
01250 
01251       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))
01252         {
01253         for (i = 1; i < repeat_min; i++)
01254           {
01255           memcpy(code, previous, len);
01256           code += len;
01257           }
01258         }
01259 
01260       /* If the minimum is zero, stick BRAZERO in front of the first copy.
01261       Then, if there is a fixed upper limit, replicated up to that many times,
01262       sticking BRAZERO in front of all the optional ones. */
01263 
01264       else
01265         {
01266         if (repeat_min == 0)
01267           {
01268           memmove(previous+1, previous, len);
01269           code++;
01270           *previous++ = OP_BRAZERO + repeat_type;
01271           }
01272 
01273         for (i = 1; i < repeat_min; i++)
01274           {
01275           memcpy(code, previous, len);
01276           code += len;
01277           }
01278 
01279         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)
01280           {
01281           *code++ = OP_BRAZERO + repeat_type;
01282           memcpy(code, previous, len);
01283           code += len;
01284           }
01285         }
01286 
01287       /* If the maximum is unlimited, set a repeater in the final copy. We
01288       can't just offset backwards from the current code point, because we
01289       don't know if there's been an options resetting after the ket. The
01290       correct offset was computed above. */
01291 
01292       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
01293 #endif
01294 
01295 
01296       }
01297 
01298     /* Else there's some kind of shambles */
01299 
01300     else
01301       {
01302       *errorptr = ERR11;
01303       goto FAILED;
01304       }
01305 
01306     /* In all cases we no longer have a previous item. */
01307 
01308     previous = NULL;
01309     break;
01310 
01311 
01312     /* Start of nested bracket sub-expression, or comment or lookahead or
01313     lookbehind or option setting or condition. First deal with special things
01314     that can come after a bracket; all are introduced by ?, and the appearance
01315     of any of them means that this is not a referencing group. They were
01316     checked for validity in the first pass over the string, so we don't have to
01317     check for syntax errors here.  */
01318 
01319     case '(':
01320     newoptions = options;
01321     condref = -1;
01322 
01323     if (*(++ptr) == '?')
01324       {
01325       int set, unset;
01326       int *optset;
01327 
01328       switch (*(++ptr))
01329         {
01330         case '#':                 /* Comment; skip to ket */
01331         ptr++;
01332         while (*ptr != ')') ptr++;
01333         continue;
01334 
01335         case ':':                 /* Non-extracting bracket */
01336         bravalue = OP_BRA;
01337         ptr++;
01338         break;
01339 
01340         case '(':
01341         bravalue = OP_COND;       /* Conditional group */
01342         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
01343           {
01344           condref = *ptr - '0';
01345           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
01346           ptr++;
01347           }
01348         else ptr--;
01349         break;
01350 
01351         case '=':                 /* Positive lookahead */
01352         bravalue = OP_ASSERT;
01353         ptr++;
01354         break;
01355 
01356         case '!':                 /* Negative lookahead */
01357         bravalue = OP_ASSERT_NOT;
01358         ptr++;
01359         break;
01360 
01361         case '<':                 /* Lookbehinds */
01362         switch (*(++ptr))
01363           {
01364           case '=':               /* Positive lookbehind */
01365           bravalue = OP_ASSERTBACK;
01366           ptr++;
01367           break;
01368 
01369           case '!':               /* Negative lookbehind */
01370           bravalue = OP_ASSERTBACK_NOT;
01371           ptr++;
01372           break;
01373 
01374           default:                /* Syntax error */
01375           *errorptr = ERR24;
01376           goto FAILED;
01377           }
01378         break;
01379 
01380         case '>':                 /* One-time brackets */
01381         bravalue = OP_ONCE;
01382         ptr++;
01383         break;
01384 
01385         default:                  /* Option setting */
01386         set = unset = 0;
01387         optset = &set;
01388 
01389         while (*ptr != ')' && *ptr != ':')
01390           {
01391           switch (*ptr++)
01392             {
01393             case '-': optset = &unset; break;
01394 
01395             case 'i': *optset |= PCRE_CASELESS; break;
01396             case 'm': *optset |= PCRE_MULTILINE; break;
01397             case 's': *optset |= PCRE_DOTALL; break;
01398             case 'x': *optset |= PCRE_EXTENDED; break;
01399             case 'U': *optset |= PCRE_UNGREEDY; break;
01400             case 'X': *optset |= PCRE_EXTRA; break;
01401 
01402             default:
01403             *errorptr = ERR12;
01404             goto FAILED;
01405             }
01406           }
01407 
01408         /* Set up the changed option bits, but don't change anything yet. */
01409 
01410         newoptions = (options | set) & (~unset);
01411 
01412         /* If the options ended with ')' this is not the start of a nested
01413         group with option changes, so the options change at this level. At top
01414         level there is nothing else to be done (the options will in fact have
01415         been set from the start of compiling as a result of the first pass) but
01416         at an inner level we must compile code to change the ims options if
01417         necessary, and pass the new setting back so that it can be put at the
01418         start of any following branches, and when this group ends, a resetting
01419         item can be compiled. */
01420 
01421         if (*ptr == ')')
01422           {
01423           if ((options & PCRE_INGROUP) != 0 &&
01424               (options & PCRE_IMS) != (newoptions & PCRE_IMS))
01425             {
01426             *code++ = OP_OPT;
01427             *code++ = *optchanged = newoptions & PCRE_IMS;
01428             }
01429           options = newoptions;  /* Change options at this level */
01430           previous = NULL;       /* This item can't be repeated */
01431           continue;              /* It is complete */
01432           }
01433 
01434         /* If the options ended with ':' we are heading into a nested group
01435         with possible change of options. Such groups are non-capturing and are
01436         not assertions of any kind. All we need to do is skip over the ':';
01437         the newoptions value is handled below. */
01438 
01439         bravalue = OP_BRA;
01440         ptr++;
01441         }
01442       }
01443 
01444     /* Else we have a referencing group; adjust the opcode. */
01445 
01446     else
01447       {
01448       if (++(*brackets) > EXTRACT_MAX)
01449         {
01450         *errorptr = ERR13;
01451         goto FAILED;
01452         }
01453       bravalue = OP_BRA + *brackets;
01454       }
01455 
01456     /* Process nested bracketed re. Assertions may not be repeated, but other
01457     kinds can be. We copy code into a non-register variable in order to be able
01458     to pass its address because some compilers complain otherwise. Pass in a
01459     new setting for the ims options if they have changed. */
01460 
01461     previous = (bravalue >= OP_ONCE)? code : NULL;
01462     *code = bravalue;
01463     tempcode = code;
01464 
01465     if (!compile_regex(
01466          options | PCRE_INGROUP,       /* Set for all nested groups */
01467          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
01468            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
01469          brackets,                     /* Bracket level */
01470          &tempcode,                    /* Where to put code (updated) */
01471          &ptr,                         /* Input pointer (updated) */
01472          errorptr,                     /* Where to put an error message */
01473          (bravalue == OP_ASSERTBACK ||
01474           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
01475          condref,                      /* Condition reference number */
01476          cd))                          /* Tables block */
01477       goto FAILED;
01478 
01479     /* At the end of compiling, code is still pointing to the start of the
01480     group, while tempcode has been updated to point past the end of the group
01481     and any option resetting that may follow it. The pattern pointer (ptr)
01482     is on the bracket. */
01483 
01484     /* If this is a conditional bracket, check that there are no more than
01485     two branches in the group. */
01486 
01487     if (bravalue == OP_COND)
01488       {
01489       int branchcount = 0;
01490       uschar *tc = code;
01491 
01492       do {
01493          branchcount++;
01494          tc += (tc[1] << 8) | tc[2];
01495          }
01496       while (*tc != OP_KET);
01497 
01498       if (branchcount > 2)
01499         {
01500         *errorptr = ERR27;
01501         goto FAILED;
01502         }
01503       }
01504 
01505     /* Now update the main code pointer to the end of the group. */
01506 
01507     code = tempcode;
01508 
01509     /* Error if hit end of pattern */
01510 
01511     if (*ptr != ')')
01512       {
01513       *errorptr = ERR14;
01514       goto FAILED;
01515       }
01516     break;
01517 
01518     /* Check \ for being a real metacharacter; if not, fall through and handle
01519     it as a data character at the start of a string. Escape items are checked
01520     for validity in the pre-compiling pass. */
01521 
01522     case '\\':
01523     tempptr = ptr;
01524     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
01525 
01526     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
01527     are arranged to be the negation of the corresponding OP_values. For the
01528     back references, the values are ESC_REF plus the reference number. Only
01529     back references and those types that consume a character may be repeated.
01530     We can test for values between ESC_b and ESC_Z for the latter; this may
01531     have to change if any new ones are ever created. */
01532 
01533     if (c < 0)
01534       {
01535       if (-c >= ESC_REF)
01536         {
01537         previous = code;
01538         *code++ = OP_REF;
01539         *code++ = -c - ESC_REF;
01540         }
01541       else
01542         {
01543         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
01544         *code++ = -c;
01545         }
01546       continue;
01547       }
01548 
01549     /* Data character: reset and fall through */
01550 
01551     ptr = tempptr;
01552     c = '\\';
01553 
01554     /* Handle a run of data characters until a metacharacter is encountered.
01555     The first character is guaranteed not to be whitespace or # when the
01556     extended flag is set. */
01557 
01558     NORMAL_CHAR:
01559     default:
01560     previous = code;
01561     *code = OP_CHARS;
01562     code += 2;
01563     length = 0;
01564 
01565     do
01566       {
01567       if ((options & PCRE_EXTENDED) != 0)
01568         {
01569         if ((cd->ctypes[c] & ctype_space) != 0) continue;
01570         if (c == '#')
01571           {
01572           while ((c = *(++ptr)) != 0 && c != '\n');
01573           if (c == 0) break;
01574           continue;
01575           }
01576         }
01577 
01578       /* Backslash may introduce a data char or a metacharacter. Escaped items
01579       are checked for validity in the pre-compiling pass. Stop the string
01580       before a metaitem. */
01581 
01582       if (c == '\\')
01583         {
01584         tempptr = ptr;
01585         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
01586         if (c < 0) { ptr = tempptr; break; }
01587         }
01588 
01589       /* Ordinary character or single-char escape */
01590 
01591       *code++ = c;
01592       length++;
01593       }
01594 
01595     /* This "while" is the end of the "do" above. */
01596 
01597     while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
01598 
01599     /* Compute the length and set it in the data vector, and advance to
01600     the next state. */
01601 
01602     previous[1] = length;
01603     if (length < 255) ptr--;
01604     break;
01605     }
01606   }                   /* end of big loop */
01607 
01608 /* Control never reaches here by falling through, only by a goto for all the
01609 error states. Pass back the position in the pattern so that it can be displayed
01610 to the user for diagnosing the error. */
01611 
01612 FAILED:
01613 *ptrptr = ptr;
01614 return FALSE;
01615 }
01616 
01617 
01618 
01619 
01620 /*************************************************
01621 *     Compile sequence of alternatives           *
01622 *************************************************/
01623 
01624 /* Compiles a bracketed group as a sequence of '|'-separated branches. On entry,
01625 ptr is pointing past the bracket character; on a successful return it points at
01626 the character that ended the last branch (never '|', which is consumed here to
01627 start another branch). The code variable is pointing at the byte into which the
01628 bracket opcode has been stored. If the ims options are changed at the start (for
01629 a (?ims: group) or during any branch, an OP_OPT item is inserted at the start of
01630 every following branch, and the new options go into every later branch compile.
01631 
01632 Arguments:
01633   options     the option bits
01634   optchanged  new ims options to set as if (?ims) were at the start, or -1
01635                for no change
01636   brackets    -> int containing the number of extracting brackets used
01637   codeptr     -> the address of the current code pointer (updated on success)
01638   ptrptr      -> the address of the current pattern pointer (updated on return)
01639   errorptr    -> pointer to error message
01640   lookbehind  TRUE if this is a lookbehind assertion
01641   condref     > 0 for OP_CREF setting at start of conditional group
01642   cd          points to the data block with tables pointers
01643 
01644 Returns:      TRUE on success; FALSE on failure, with *ptrptr at the position reached
01645 */
01646 
01647 static BOOL
01648 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
01649   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
01650   compile_data *cd)
01651 {
01652 const uschar *ptr = *ptrptr;
01653 uschar *code = *codeptr;
01654 uschar *last_branch = code;           /* start of the branch being compiled */
01655 uschar *start_bracket = code;         /* start of the whole group, for the KET */
01656 uschar *reverse_count = NULL;         /* -> count bytes of the latest OP_REVERSE */
01657 int oldoptions = options & PCRE_IMS;  /* ims bits on entry, for the final reset */
01658 
01659 code += 3;   /* step over the bracket opcode and its 2-byte length field */
01660 
01661 /* At the start of a reference-based conditional group, insert the reference
01662 number as an OP_CREF item. The number is stored in a single byte. */
01663 
01664 if (condref > 0)
01665   {
01666   *code++ = OP_CREF;
01667   *code++ = condref;
01668   }
01669 
01670 /* Loop for each alternative branch */
01671 
01672 for (;;)
01673   {
01674   int length;
01675 
01676   /* Handle change of options: emit OP_OPT and fold the new ims bits into options */
01677 
01678   if (optchanged >= 0)
01679     {
01680     *code++ = OP_OPT;
01681     *code++ = optchanged;
01682     options = (options & ~PCRE_IMS) | optchanged;
01683     }
01684 
01685   /* Set up dummy OP_REVERSE if lookbehind assertion; its count is filled in below */
01686 
01687   if (lookbehind)
01688     {
01689     *code++ = OP_REVERSE;
01690     reverse_count = code;
01691     *code++ = 0;
01692     *code++ = 0;
01693     }
01694 
01695   /* Now compile the branch. It may update optchanged as well as code and ptr. */
01696 
01697   if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))
01698     {
01699     *ptrptr = ptr;   /* pass back the pattern position reached */
01700     return FALSE;
01701     }
01702 
01703   /* Fill in the length of the last branch, high-order byte first */
01704 
01705   length = code - last_branch;
01706   last_branch[1] = length >> 8;
01707   last_branch[2] = length & 255;
01708 
01709   /* If lookbehind, check that this branch matches a fixed-length string,
01710   and put the length into the OP_REVERSE item. Temporarily mark the end of
01711   the branch with OP_END. */
01712 
01713   if (lookbehind)
01714     {
01715     *code = OP_END;   /* overwritten below by OP_KET or OP_ALT */
01716     length = find_fixedlength(last_branch);
01717     DPRINTF(("fixed length = %d\n", length));
01718     if (length < 0)   /* a negative result is treated as failure */
01719       {
01720       *errorptr = ERR25;
01721       *ptrptr = ptr;
01722       return FALSE;
01723       }
01724     reverse_count[0] = (length >> 8);
01725     reverse_count[1] = length & 255;
01726     }
01727 
01728   /* Reached end of expression, either ')' or end of pattern. Insert a
01729   terminating ket and the length of the whole bracketed item, and return,
01730   leaving the pointer at the terminating char. If any of the ims options
01731   were changed inside the group, compile a resetting op-code following. */
01732 
01733   if (*ptr != '|')
01734     {
01735     length = code - start_bracket;   /* offset from the KET back to the group start */
01736     *code++ = OP_KET;
01737     *code++ = length >> 8;
01738     *code++ = length & 255;
01739     if (optchanged >= 0)
01740       {
01741       *code++ = OP_OPT;
01742       *code++ = oldoptions;
01743       }
01744     *codeptr = code;
01745     *ptrptr = ptr;
01746     return TRUE;
01747     }
01748 
01749   /* Another branch follows; insert an "or" node and advance the pointer. */
01750 
01751   *code = OP_ALT;
01752   last_branch = code;   /* this ALT's length field is filled in after its branch */
01753   code += 3;
01754   ptr++;
01755   }
01756 /* Control never reaches here */
01757 }
01758 
01759 
01760 
01761 
01762 /*************************************************
01763 *      Find first significant op code            *
01764 *************************************************/
01765 
01766 /* This is called by several functions that scan a compiled expression looking
01767 for a fixed first character, or an anchoring op code etc. It skips over OP_OPT,
01768 OP_CREF, word-boundary items, and whole negative-lookahead and lookbehind groups.
01769 For one application, a change of caseless option is important.
01770 
01771 Arguments:
01772   code       pointer to the first item to examine (visible callers pass group + 3)
01773   options    pointer to external options; not dereferenced when optbit is zero
01774   optbit     the option bit whose changing is significant, or
01775              zero if none are
01776   optstop    TRUE to return on option change, otherwise overwrite *options with
01777                the OP_OPT data byte and continue
01778 
01779 Returns:     pointer to the first significant opcode
01780 */
01781 
01782 static const uschar*
01783 first_significant_code(const uschar *code, int *options, int optbit,
01784   BOOL optstop)
01785 {
01786 for (;;)
01787   {
01788   switch ((int)*code)
01789     {
01790     case OP_OPT:
01791     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01792       {
01793       if (optstop) return code;   /* caller wants to see the OP_OPT itself */
01794       *options = (int)code[1];    /* replaces the whole value, not just optbit */
01795       }
01796     code += 2;                    /* opcode plus one data byte */
01797     break;
01798 
01799     case OP_CREF:
01800     code += 2;                    /* opcode plus one-byte reference number */
01801     break;
01802 
01803     case OP_WORD_BOUNDARY:
01804     case OP_NOT_WORD_BOUNDARY:
01805     code++;
01806     break;
01807 
01808     case OP_ASSERT_NOT:
01809     case OP_ASSERTBACK:
01810     case OP_ASSERTBACK_NOT:
01811     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);  /* hop branch to branch */
01812     code += 3;                    /* past the final opcode and its 2-byte length */
01813     break;
01814 
01815     default:                      /* anything else, including OP_ASSERT, is significant */
01816     return code;
01817     }
01818   }
01819 /* Control never reaches here */
01820 }
01821 
01822 
01823 
01824 
01825 /*************************************************
01826 *          Check for anchored expression         *
01827 *************************************************/
01828 
01829 /* Try to find out if this is an anchored regular expression. Consider each
01830 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
01831 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
01832 it's anchored. However, if this is a multiline pattern, then only OP_SOD
01833 counts, since OP_CIRC can match in the middle.
01834 
01835 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
01836 because that will try the rest of the pattern at all possible matching points,
01837 so there is no point trying them again.
01838 
01839 Arguments:
01840   code       points to start of expression (the bracket)
01841   options    points to the options setting; overwritten by OP_OPT items that change multiline
01842 
01843 Returns:     TRUE if every alternative is anchored, otherwise FALSE
01844 */
01845 
01846 static BOOL
01847 is_anchored(register const uschar *code, int *options)
01848 {
01849 do {
01850    const uschar *scode = first_significant_code(code + 3, options,
01851      PCRE_MULTILINE, FALSE);   /* code + 3 steps over the opcode and its length */
01852    register int op = *scode;
01853    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
01854      { if (!is_anchored(scode, options)) return FALSE; }   /* nested group: recurse */
01855    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
01856             (*options & PCRE_DOTALL) != 0)
01857      { if (scode[1] != OP_ANY) return FALSE; }   /* the repeated type must be OP_ANY */
01858    else if (op != OP_SOD &&
01859            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
01860      return FALSE;             /* neither OP_SOD nor a non-multiline OP_CIRC */
01861    code += (code[1] << 8) + code[2];   /* on to the next alternative, if any */
01862    }
01863 while (*code == OP_ALT);
01864 return TRUE;
01865 }
01866 
01867 
01868 
01869 /*************************************************
01870 *         Check for starting with ^ or .*        *
01871 *************************************************/
01872 
01873 /* This is called to find out if every branch starts with ^ or .* so that
01874 "first char" processing can be done to speed things up in multiline
01875 matching and for non-DOTALL patterns that start with .* (which must start at
01876 the beginning or after \n).
01877 
01878 Argument:  points to start of expression (the bracket)
01879 Returns:   TRUE or FALSE
01880 */
01881 
01882 static BOOL
01883 is_startline(const uschar *code)
01884 {
01885 do {
01886    const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
01887    register int op = *scode;
01888    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
01889      { if (!is_startline(scode)) return FALSE; }
01890    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
01891      { if (scode[1] != OP_ANY) return FALSE; }
01892    else if (op != OP_CIRC) return FALSE;
01893    code += (code[1] << 8) + code[2];
01894    }
01895 while (*code == OP_ALT);
01896 return TRUE;
01897 }
01898 
01899 
01900 
01901 /*************************************************
01902 *          Check for fixed first char            *
01903 *************************************************/
01904 
01905 /* Try to find out if there is a fixed first character. This is called for
01906 unanchored expressions, as it speeds up their processing quite considerably.
01907 Consider each alternative branch. If they all start with the same char, or with
01908 a bracket all of whose alternatives start with the same char (recurse ad lib),
01909 then we return that char, otherwise -1.
01910 
01911 Arguments:
01912   code       points to start of expression (the bracket)
01913   options    pointer to the options (used to check casing changes)
01914 
01915 Returns:     -1 or the fixed first char
01916 */
01917 
01918 static int
01919 find_firstchar(const uschar *code, int *options)
01920 {
01921 register int c = -1;
01922 do {
01923    int d;
01924    const uschar *scode = first_significant_code(code + 3, options,
01925      PCRE_CASELESS, TRUE);
01926    register int op = *scode;
01927 
01928    if (op >= OP_BRA) op = OP_BRA;
01929 
01930    switch(op)
01931      {
01932      default:
01933      return -1;
01934 
01935      case OP_BRA:
01936      case OP_ASSERT:
01937      case OP_ONCE:
01938      case OP_COND:
01939      if ((d = find_firstchar(scode, options)) < 0) return -1;
01940      if (c < 0) c = d; else if (c != d) return -1;
01941      break;
01942 
01943      case OP_EXACT:       /* Fall through */
01944      scode++;
01945 
01946      case OP_CHARS:       /* Fall through */
01947      scode++;
01948 
01949      case OP_PLUS:
01950      case OP_MINPLUS:
01951      if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
01952      break;
01953      }
01954 
01955    code += (code[1] << 8) + code[2];
01956    }
01957 while (*code == OP_ALT);
01958 return c;
01959 }
01960 
01961 
01962 
01963 
01964 
01965 /*************************************************
01966 *        Compile a Regular Expression            *
01967 *************************************************/
01968 
01969 /* This function takes a string and returns a pointer to a block of store
01970 holding a compiled version of the expression.
01971 
01972 Arguments:
01973   pattern      the regular expression
01974   options      various option bits
01975   errorptr     pointer to pointer to error text
01976   erroroffset  ptr offset in pattern where error was detected
01977   tables       pointer to character tables or NULL
01978 
01979 Returns:       pointer to compiled data block, or NULL on error,
01980                with errorptr and erroroffset set
01981 */
01982 
01983 pcre *
01984 vmdpcre_compile(const char *pattern, int options, const char **errorptr,
01985   int *erroroffset, const unsigned char *tables)
01986 {
01987 real_pcre *re;
01988 int length = 3;      /* For initial BRA plus length */
01989 int runlength;
01990 int c, size;
01991 int bracount = 0;
01992 int top_backref = 0;
01993 int branch_extra = 0;
01994 int branch_newextra;
01995 unsigned int brastackptr = 0;
01996 uschar *code;
01997 const uschar *ptr;
01998 compile_data compile_block;
01999 int brastack[BRASTACK_SIZE];
02000 uschar bralenstack[BRASTACK_SIZE];
02001 
02002 #ifdef DEBUG
02003 uschar *code_base, *code_end;
02004 #endif
02005 
02006 /* We can't pass back an error message if errorptr is NULL; I guess the best we
02007 can do is just return NULL. */
02008 
02009 if (errorptr == NULL) return NULL;
02010 *errorptr = NULL;
02011 
02012 /* However, we can give a message for this error */
02013 
02014 if (erroroffset == NULL)
02015   {
02016   *errorptr = ERR16;
02017   return NULL;
02018   }
02019 *erroroffset = 0;
02020 
02021 if ((options & ~PUBLIC_OPTIONS) != 0)
02022   {
02023   *errorptr = ERR17;
02024   return NULL;
02025   }
02026 
02027 /* Set up pointers to the individual character tables */
02028 
02029 if (tables == NULL) tables = pcre_default_tables;
02030 compile_block.lcc = tables + lcc_offset;
02031 compile_block.fcc = tables + fcc_offset;
02032 compile_block.cbits = tables + cbits_offset;
02033 compile_block.ctypes = tables + ctypes_offset;
02034 
02035 /* Reflect pattern for debugging output */
02036 
02037 DPRINTF(("------------------------------------------------------------------\n"));
02038 DPRINTF(("%s\n", pattern));
02039 
02040 /* The first thing to do is to make a pass over the pattern to compute the
02041 amount of store required to hold the compiled code. This does not have to be
02042 perfect as long as errors are overestimates. At the same time we can detect any
02043 internal flag settings. Make an attempt to correct for any counted white space
02044 if an "extended" flag setting appears late in the pattern. We can't be so
02045 clever for #-comments. */
02046 
02047 ptr = (const uschar *)(pattern - 1);
02048 while ((c = *(++ptr)) != 0)
02049   {
02050   int min, max;
02051   int class_charcount;
02052 
02053   if ((options & PCRE_EXTENDED) != 0)
02054     {
02055     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
02056     if (c == '#')
02057       {
02058       while ((c = *(++ptr)) != 0 && c != '\n');
02059       continue;
02060       }
02061     }
02062 
02063   switch(c)
02064     {
02065     /* A backslashed item may be an escaped "normal" character or a
02066     character type. For a "normal" character, put the pointers and
02067     character back so that tests for whitespace etc. in the input
02068     are done correctly. */
02069 
02070     case '\\':
02071       {
02072       const uschar *save_ptr = ptr;
02073       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
02074       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02075       if (c >= 0)
02076         {
02077         ptr = save_ptr;
02078         c = '\\';
02079         goto NORMAL_CHAR;
02080         }
02081       }
02082     length++;
02083 
02084     /* A back reference needs an additional char, plus either one or 5
02085     bytes for a repeat. We also need to keep the value of the highest
02086     back reference. */
02087 
02088     if (c <= -ESC_REF)
02089       {
02090       int refnum = -c - ESC_REF;
02091       if (refnum > top_backref) top_backref = refnum;
02092       length++;   /* For single back reference */
02093       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
02094         {
02095         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
02096         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02097         if ((min == 0 && (max == 1 || max == -1)) ||
02098           (min == 1 && max == -1))
02099             length++;
02100         else length += 5;
02101         if (ptr[1] == '?') ptr++;
02102         }
02103       }
02104     continue;
02105 
02106     case '^':
02107     case '.':
02108     case '$':
02109     case '*':     /* These repeats won't be after brackets; */
02110     case '+':     /* those are handled separately */
02111     case '?':
02112     length++;
02113     continue;
02114 
02115     /* This covers the cases of repeats after a single char, metachar, class,
02116     or back reference. */
02117 
02118     case '{':
02119     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
02120     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
02121     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02122     if ((min == 0 && (max == 1 || max == -1)) ||
02123       (min == 1 && max == -1))
02124         length++;
02125     else
02126       {
02127       length--;   /* Uncount the original char or metachar */
02128       if (min == 1) length++; else if (min > 0) length += 4;
02129       if (max > 0) length += 4; else length += 2;
02130       }
02131     if (ptr[1] == '?') ptr++;
02132     continue;
02133 
02134     /* An alternation contains an offset to the next branch or ket. If any ims
02135     options changed in the previous branch(es), and/or if we are in a
02136     lookbehind assertion, extra space will be needed at the start of the
02137     branch. This is handled by branch_extra. */
02138 
02139     case '|':
02140     length += 3 + branch_extra;
02141     continue;
02142 
02143     /* A character class uses 33 characters. Don't worry about character types
02144     that aren't allowed in classes - they'll get picked up during the compile.
02145     A character class that contains only one character uses 2 or 3 bytes,
02146     depending on whether it is negated or not. Notice this where we can. */
02147 
02148     case '[':
02149     class_charcount = 0;
02150     if (*(++ptr) == '^') ptr++;
02151     do
02152       {
02153       if (*ptr == '\\')
02154         {
02155         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
02156           &compile_block);
02157         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02158         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
02159         }
02160       else class_charcount++;
02161       ptr++;
02162       }
02163     while (*ptr != 0 && *ptr != ']');
02164 
02165     /* Repeats for negated single chars are handled by the general code */
02166 
02167     if (class_charcount == 1) length += 3; else
02168       {
02169       length += 33;
02170 
02171       /* A repeat needs either 1 or 5 bytes. */
02172 
02173       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
02174         {
02175         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
02176         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02177         if ((min == 0 && (max == 1 || max == -1)) ||
02178           (min == 1 && max == -1))
02179             length++;
02180         else length += 5;
02181         if (ptr[1] == '?') ptr++;
02182         }
02183       }
02184     continue;
02185 
02186     /* Brackets may be genuine groups or special things */
02187 
02188     case '(':
02189     branch_newextra = 0;
02190 
02191     /* Handle special forms of bracket, which all start (? */
02192 
02193     if (ptr[1] == '?')
02194       {
02195       int set, unset;
02196       int *optset;
02197 
02198       switch (c = ptr[2])
02199         {
02200         /* Skip over comments entirely */
02201         case '#':
02202         ptr += 3;
02203         while (*ptr != 0 && *ptr != ')') ptr++;
02204         if (*ptr == 0)
02205           {
02206           *errorptr = ERR18;
02207           goto PCRE_ERROR_RETURN;
02208           }
02209         continue;
02210 
02211         /* Non-referencing groups and lookaheads just move the pointer on, and
02212         then behave like a non-special bracket, except that they don't increment
02213         the count of extracting brackets. Ditto for the "once only" bracket,
02214         which is in Perl from version 5.005. */
02215 
02216         case ':':
02217         case '=':
02218         case '!':
02219         case '>':
02220         ptr += 2;
02221         break;
02222 
02223         /* Lookbehinds are in Perl from version 5.005 */
02224 
02225         case '<':
02226         if (ptr[3] == '=' || ptr[3] == '!')
02227           {
02228           ptr += 3;
02229           branch_newextra = 3;
02230           length += 3;         /* For the first branch */
02231           break;
02232           }
02233         *errorptr = ERR24;
02234         goto PCRE_ERROR_RETURN;
02235 
02236         /* Conditionals are in Perl from version 5.005. The bracket must either
02237         be followed by a number (for bracket reference) or by an assertion
02238         group. */
02239 
02240         case '(':
02241         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
02242           {
02243           ptr += 4;
02244           length += 2;
02245           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
02246           if (*ptr != ')')
02247             {
02248             *errorptr = ERR26;
02249             goto PCRE_ERROR_RETURN;
02250             }
02251           }
02252         else   /* An assertion must follow */
02253           {
02254           ptr++;   /* Can treat like ':' as far as spacing is concerned */
02255 
02256           if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
02257             {
02258             ptr += 2;    /* To get right offset in message */
02259             *errorptr = ERR28;
02260             goto PCRE_ERROR_RETURN;
02261             }
02262           }
02263         break;
02264 
02265         /* Else loop checking valid options until ) is met. Anything else is an
02266         error. If we are without any brackets, i.e. at top level, the settings
02267         act as if specified in the options, so massage the options immediately.
02268         This is for backward compatibility with Perl 5.004. */
02269 
02270         default:
02271         set = unset = 0;
02272         optset = &set;
02273         ptr += 2;
02274 
02275         for (;; ptr++)
02276           {
02277           c = *ptr;
02278           switch (c)
02279             {
02280             case 'i':
02281             *optset |= PCRE_CASELESS;
02282             continue;
02283 
02284             case 'm':
02285             *optset |= PCRE_MULTILINE;
02286             continue;
02287 
02288             case 's':
02289             *optset |= PCRE_DOTALL;
02290             continue;
02291 
02292             case 'x':
02293             *optset |= PCRE_EXTENDED;
02294             continue;
02295 
02296             case 'X':
02297             *optset |= PCRE_EXTRA;
02298             continue;
02299 
02300             case 'U':
02301             *optset |= PCRE_UNGREEDY;
02302             continue;
02303 
02304             case '-':
02305             optset = &unset;
02306             continue;
02307 
02308             /* A termination by ')' indicates an options-setting-only item;
02309             this is global at top level; otherwise nothing is done here and
02310             it is handled during the compiling process on a per-bracket-group
02311             basis. */
02312 
02313             case ')':
02314             if (brastackptr == 0)
02315               {
02316               options = (options | set) & (~unset);
02317               set = unset = 0;     /* To save length */
02318               }
02319             /* Fall through */
02320 
02321             /* A termination by ':' indicates the start of a nested group with
02322             the given options set. This is again handled at compile time, but
02323             we must allow for compiled space if any of the ims options are
02324             set. We also have to allow for resetting space at the end of
02325             the group, which is why 4 is added to the length and not just 2.
02326             If there are several changes of options within the same group, this
02327             will lead to an over-estimate on the length, but this shouldn't
02328             matter very much. We also have to allow for resetting options at
02329             the start of any alternations, which we do by setting
02330             branch_newextra to 2. */
02331 
02332             case ':':
02333             if (((set|unset) & PCRE_IMS) != 0)
02334               {
02335               length += 4;
02336               branch_newextra = 2;
02337               }
02338             goto END_OPTIONS;
02339 
02340             /* Unrecognized option character */
02341 
02342             default:
02343             *errorptr = ERR12;
02344             goto PCRE_ERROR_RETURN;
02345             }
02346           }
02347 
02348         /* If we hit a closing bracket, that's it - this is a freestanding
02349         option-setting. We need to ensure that branch_extra is updated if
02350         necessary. The only values branch_newextra can have here are 0 or 2.
02351         If the value is 2, then branch_extra must either be 2 or 5, depending
02352         on whether this is a lookbehind group or not. */
02353 
02354         END_OPTIONS:
02355         if (c == ')')
02356           {
02357           if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
02358             branch_extra += branch_newextra;
02359           continue;
02360           }
02361 
02362         /* If options were terminated by ':' control comes here. Fall through
02363         to handle the group below. */
02364         }
02365       }
02366 
02367     /* Extracting brackets must be counted so we can process escapes in a
02368     Perlish way. */
02369 
02370     else bracount++;
02371 
02372     /* Non-special forms of bracket. Save length for computing whole length
02373     at end if there's a repeat that requires duplication of the group. Also
02374     save the current value of branch_extra, and start the new group with
02375     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
02376     for a lookbehind assertion. */
02377 
02378     if (brastackptr >= sizeof(brastack)/sizeof(int))
02379       {
02380       *errorptr = ERR19;
02381       goto PCRE_ERROR_RETURN;
02382       }
02383 
02384     bralenstack[brastackptr] = branch_extra;
02385     branch_extra = branch_newextra;
02386 
02387     brastack[brastackptr++] = length;
02388     length += 3;
02389     continue;
02390 
02391     /* Handle ket. Look for subsequent max/min; for certain sets of values we
02392     have to replicate this bracket up to that many times. If brastackptr is
02393     0 this is an unmatched bracket which will generate an error, but take care
02394     not to try to access brastack[-1] when computing the length and restoring
02395     the branch_extra value. */
02396 
02397     case ')':
02398     length += 3;
02399       {
02400       int minval = 1;
02401       int maxval = 1;
02402       int duplength;
02403 
02404       if (brastackptr > 0)
02405         {
02406         duplength = length - brastack[--brastackptr];
02407         branch_extra = bralenstack[brastackptr];
02408         }
02409       else duplength = 0;
02410 
02411       /* Leave ptr at the final char; for read_repeat_counts this happens
02412       automatically; for the others we need an increment. */
02413 
02414       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
02415         {
02416         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
02417           &compile_block);
02418         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02419         }
02420       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
02421       else if (c == '+') { maxval = -1; ptr++; }
02422       else if (c == '?') { minval = 0; ptr++; }
02423 
02424       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
02425       group, and if the maximum is greater than zero, we have to replicate
02426       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
02427       bracket set - hence the 7. */
02428 
02429       if (minval == 0)
02430         {
02431         length++;
02432         if (maxval > 0) length += (maxval - 1) * (duplength + 7);
02433         }
02434 
02435       /* When the minimum is greater than zero, we have to replicate up to
02436       minval-1 times, with no additions required in the copies. Then, if
02437       there is a limited maximum we have to replicate up to maxval-1 times
02438       allowing for a BRAZERO item before each optional copy and nesting
02439       brackets for all but one of the optional copies. */
02440 
02441       else
02442         {
02443         length += (minval - 1) * duplength;
02444         if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
02445           length += (maxval - minval) * (duplength + 7) - 6;
02446         }
02447       }
02448     continue;
02449 
02450     /* Non-special character. For a run of such characters the length required
02451     is the number of characters + 2, except that the maximum run length is 255.
02452     We won't get a skipped space or a non-data escape or the start of a #
02453     comment as the first character, so the length can't be zero. */
02454 
02455     NORMAL_CHAR:
02456     default:
02457     length += 2;
02458     runlength = 0;
02459     do
02460       {
02461       if ((options & PCRE_EXTENDED) != 0)
02462         {
02463         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
02464         if (c == '#')
02465           {
02466           while ((c = *(++ptr)) != 0 && c != '\n');
02467           continue;
02468           }
02469         }
02470 
02471       /* Backslash may introduce a data char or a metacharacter; stop the
02472       string before the latter. */
02473 
02474       if (c == '\\')
02475         {
02476         const uschar *saveptr = ptr;
02477         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
02478           &compile_block);
02479         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
02480         if (c < 0) { ptr = saveptr; break; }
02481         }
02482 
02483       /* Ordinary character or single-char escape */
02484 
02485       runlength++;
02486       }
02487 
02488     /* This "while" is the end of the "do" above. */
02489 
02490     while (runlength < 255 &&
02491       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
02492 
02493     ptr--;
02494     length += runlength;
02495     continue;
02496     }
02497   }
02498 
02499 length += 4;    /* For final KET and END */
02500 
02501 if (length > 65539)
02502   {
02503   *errorptr = ERR20;
02504   return NULL;
02505   }
02506 
02507 /* Compute the size of data block needed and get it, either from malloc or
02508 externally provided function. We specify "code[0]" in the offsetof() expression
02509 rather than just "code", because it has been reported that one broken compiler
02510 fails on "code" because it is also an independent variable. It should make no
02511 difference to the value of the offsetof(). */
02512 
02513 size = length + offsetof(real_pcre, code[0]);
02514 re = (real_pcre *)(vmdpcre_malloc)(size);
02515 
02516 if (re == NULL)
02517   {
02518   *errorptr = ERR21;
02519   return NULL;
02520   }
02521 
02522 /* Put in the magic number and the options. */
02523 
02524 re->magic_number = MAGIC_NUMBER;
02525 re->options = options;
02526 re->tables = tables;
02527 
02528 /* Set up a starting, non-extracting bracket, then compile the expression. On
02529 error, *errorptr will be set non-NULL, so we don't need to look at the result
02530 of the function here. */
02531 
02532 ptr = (const uschar *)pattern;
02533 code = re->code;
02534 *code = OP_BRA;
02535 bracount = 0;
02536 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
02537   &compile_block);
02538 re->top_bracket = bracount;
02539 re->top_backref = top_backref;
02540 
02541 /* If not reached end of pattern on success, there's an excess bracket. */
02542 
02543 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
02544 
02545 /* Fill in the terminating state and check for disastrous overflow, but
02546 if debugging, leave the test till after things are printed out. */
02547 
02548 *code++ = OP_END;
02549 
02550 #ifndef DEBUG
02551 if (code - re->code > length) *errorptr = ERR23;
02552 #endif
02553 
02554 /* Give an error if there's back reference to a non-existent capturing
02555 subpattern. */
02556 
02557 if (top_backref > re->top_bracket) *errorptr = ERR15;
02558 
02559 /* Failed to compile */
02560 
02561 if (*errorptr != NULL)
02562   {
02563   (vmdpcre_free)(re);
02564   PCRE_ERROR_RETURN:
02565   *erroroffset = ptr - (const uschar *)pattern;
02566   return NULL;
02567   }
02568 
02569 /* If the anchored option was not passed, set flag if we can determine that the
02570 pattern is anchored by virtue of ^ characters or \A or anything else (such as
02571 starting with .* when DOTALL is set).
02572 
02573 Otherwise, see if we can determine what the first character has to be, because
02574 that speeds up unanchored matches no end. If not, see if we can set the
02575 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
02576 start with ^, and also when all branches start with .* for non-DOTALL matches.
02577 */
02578 
02579 if ((options & PCRE_ANCHORED) == 0)
02580   {
02581   int temp_options = options;
02582   if (is_anchored(re->code, &temp_options))
02583     re->options |= PCRE_ANCHORED;
02584   else
02585     {
02586     int ch = find_firstchar(re->code, &temp_options);
02587     if (ch >= 0)
02588       {
02589       re->first_char = ch;
02590       re->options |= PCRE_FIRSTSET;
02591       }
02592     else if (is_startline(re->code))
02593       re->options |= PCRE_STARTLINE;
02594     }
02595   }
02596 
02597 /* Print out the compiled data for debugging */
02598 
02599 #ifdef DEBUG
02600 
02601 printf("Length = %d top_bracket = %d top_backref = %d\n",
02602   length, re->top_bracket, re->top_backref);
02603 
02604 if (re->options != 0)
02605   {
02606   printf("%s%s%s%s%s%s%s%s\n",
02607     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
02608     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
02609     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
02610     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
02611     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
02612     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
02613     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
02614     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
02615   }
02616 
02617 if ((re->options & PCRE_FIRSTSET) != 0)
02618   {
02619   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
02620     else printf("First char = \\x%02x\n", re->first_char);
02621   }
02622 
02623 code_end = code;
02624 code_base = code = re->code;
02625 
02626 while (code < code_end)
02627   {
02628   int charlength;
02629 
02630   printf("%3d ", code - code_base);
02631 
02632   if (*code >= OP_BRA)
02633     {
02634     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
02635     code += 2;
02636     }
02637 
02638   else switch(*code)
02639     {
02640     case OP_OPT:
02641     printf(" %.2x %s", code[1], OP_names[*code]);
02642     code++;
02643     break;
02644 
02645     case OP_COND:
02646     printf("%3d Cond", (code[1] << 8) + code[2]);
02647     code += 2;
02648     break;
02649 
02650     case OP_CREF:
02651     printf(" %.2d %s", code[1], OP_names[*code]);
02652     code++;
02653     break;
02654 
02655     case OP_CHARS:
02656     charlength = *(++code);
02657     printf("%3d ", charlength);
02658     while (charlength-- > 0)
02659       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
02660     break;
02661 
02662     case OP_KETRMAX:
02663     case OP_KETRMIN:
02664     case OP_ALT:
02665     case OP_KET:
02666     case OP_ASSERT:
02667     case OP_ASSERT_NOT:
02668     case OP_ASSERTBACK:
02669     case OP_ASSERTBACK_NOT:
02670     case OP_ONCE:
02671     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
02672     code += 2;
02673     break;
02674 
02675     case OP_REVERSE:
02676     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
02677     code += 2;
02678     break;
02679 
02680     case OP_STAR:
02681     case OP_MINSTAR:
02682     case OP_PLUS:
02683     case OP_MINPLUS:
02684     case OP_QUERY:
02685     case OP_MINQUERY:
02686     case OP_TYPESTAR:
02687     case OP_TYPEMINSTAR:
02688     case OP_TYPEPLUS:
02689     case OP_TYPEMINPLUS:
02690     case OP_TYPEQUERY:
02691     case OP_TYPEMINQUERY:
02692     if (*code >= OP_TYPESTAR)
02693       printf("    %s", OP_names[code[1]]);
02694     else if (isprint(c = code[1])) printf("    %c", c);
02695       else printf("    \\x%02x", c);
02696     printf("%s", OP_names[*code++]);
02697     break;
02698 
02699     case OP_EXACT:
02700     case OP_UPTO:
02701     case OP_MINUPTO:
02702     if (isprint(c = code[3])) printf("    %c{", c);
02703       else printf("    \\x%02x{", c);
02704     if (*code != OP_EXACT) printf("0,");
02705     printf("%d}", (code[1] << 8) + code[2]);
02706     if (*code == OP_MINUPTO) printf("?");
02707     code += 3;
02708     break;
02709 
02710     case OP_TYPEEXACT:
02711     case OP_TYPEUPTO:
02712     case OP_TYPEMINUPTO:
02713     printf("    %s{", OP_names[code[3]]);
02714     if (*code != OP_TYPEEXACT) printf(",");
02715     printf("%d}", (code[1] << 8) + code[2]);
02716     if (*code == OP_TYPEMINUPTO) printf("?");
02717     code += 3;
02718     break;
02719 
02720     case OP_NOT:
02721     if (isprint(c = *(++code))) printf("    [^%c]", c);
02722       else printf("    [^\\x%02x]", c);
02723     break;
02724 
02725     case OP_NOTSTAR:
02726     case OP_NOTMINSTAR:
02727     case OP_NOTPLUS:
02728     case OP_NOTMINPLUS:
02729     case OP_NOTQUERY:
02730     case OP_NOTMINQUERY:
02731     if (isprint(c = code[1])) printf("    [^%c]", c);
02732       else printf("    [^\\x%02x]", c);
02733     printf("%s", OP_names[*code++]);
02734     break;
02735 
02736     case OP_NOTEXACT:
02737     case OP_NOTUPTO:
02738     case OP_NOTMINUPTO:
02739     if (isprint(c = code[3])) printf("    [^%c]{", c);
02740       else printf("    [^\\x%02x]{", c);
02741     if (*code != OP_NOTEXACT) printf(",");
02742     printf("%d}", (code[1] << 8) + code[2]);
02743     if (*code == OP_NOTMINUPTO) printf("?");
02744     code += 3;
02745     break;
02746 
02747     case OP_REF:
02748     printf("    \\%d", *(++code));
02749     code ++;
02750     goto CLASS_REF_REPEAT;
02751 
02752     case OP_CLASS:
02753       {
02754       int i, min, max;
02755       code++;
02756       printf("    [");
02757 
02758       for (i = 0; i < 256; i++)
02759         {
02760         if ((code[i/8] & (1 << (i&7))) != 0)
02761           {
02762           int j;
02763           for (j = i+1; j < 256; j++)
02764             if ((code[j/8] & (1 << (j&7))) == 0) break;
02765           if (i == '-' || i == ']') printf("\\");
02766           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
02767           if (--j > i)
02768             {
02769             printf("-");
02770             if (j == '-' || j == ']') printf("\\");
02771             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
02772             }
02773           i = j;
02774           }
02775         }
02776       printf("]");
02777       code += 32;
02778 
02779       CLASS_REF_REPEAT:
02780 
02781       switch(*code)
02782         {
02783         case OP_CRSTAR:
02784         case OP_CRMINSTAR:
02785         case OP_CRPLUS:
02786         case OP_CRMINPLUS:
02787         case OP_CRQUERY:
02788         case OP_CRMINQUERY:
02789         printf("%s", OP_names[*code]);
02790         break;
02791 
02792         case OP_CRRANGE:
02793         case OP_CRMINRANGE:
02794         min = (code[1] << 8) + code[2];
02795         max = (code[3] << 8) + code[4];
02796         if (max == 0) printf("{%d,}", min);
02797         else printf("{%d,%d}", min, max);
02798         if (*code == OP_CRMINRANGE) printf("?");
02799         code += 4;
02800         break;
02801 
02802         default:
02803         code--;
02804         }
02805       }
02806     break;
02807 
02808     /* Anything else is just a one-node item */
02809 
02810     default:
02811     printf("    %s", OP_names[*code]);
02812     break;
02813     }
02814 
02815   code++;
02816   printf("\n");
02817   }
02818 printf("------------------------------------------------------------------\n");
02819 
02820 /* This check is done here in the debugging case so that the code that
02821 was compiled can be seen. */
02822 
02823 if (code - re->code > length)
02824   {
02825   *errorptr = ERR23;
02826   (vmdpcre_free)(re);
02827   *erroroffset = ptr - (uschar *)pattern;
02828   return NULL;
02829   }
02830 #endif
02831 
02832 return (pcre *)re;
02833 }
02834 
02835 
02836 
02837 /*************************************************
02838 *          Match a back-reference                *
02839 *************************************************/
02840 
02841 /* If a back reference hasn't been set, the length that is passed is greater
02842 than the number of characters left in the string, so the match fails.
02843 
02844 Arguments:
02845   offset      index into the offset vector
02846   eptr        points into the subject
02847   length      length to be matched
02848   md          points to match data block
02849   ims         the ims flags
02850 
02851 Returns:      TRUE if matched
02852 */
02853 
02854 static BOOL
02855 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
02856   int ims)
02857 {
02858 const uschar *p = md->start_subject + md->offset_vector[offset];
02859 
02860 #ifdef DEBUG
02861 if (eptr >= md->end_subject)
02862   printf("matching subject <null>");
02863 else
02864   {
02865   printf("matching subject ");
02866   pchars(eptr, length, TRUE, md);
02867   }
02868 printf(" against backref ");
02869 pchars(p, length, FALSE, md);
02870 printf("\n");
02871 #endif
02872 
02873 /* Always fail if not enough characters left */
02874 
02875 if (length > md->end_subject - eptr) return FALSE;
02876 
02877 /* Separate the caselesss case for speed */
02878 
02879 if ((ims & PCRE_CASELESS) != 0)
02880   {
02881   while (length-- > 0)
02882     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
02883   }
02884 else
02885   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
02886 
02887 return TRUE;
02888 }
02889 
02890 
02891 
02892 /*************************************************
02893 *         Match from current position            *
02894 *************************************************/
02895 
02896 /* On entry ecode points to the first opcode, and eptr to the first character
02897 in the subject string, while eptrb holds the value of eptr at the start of the
02898 last bracketed group - used for breaking infinite loops matching zero-length
02899 strings.
02900 
02901 Arguments:
02902    eptr        pointer in subject
02903    ecode       position in code
02904    offset_top  current top pointer
02905    md          pointer to "static" info for the match
02906    ims         current /i, /m, and /s options
02907    condassert  TRUE if called to check a condition assertion
02908    eptrb       eptr at start of last bracket
02909 
02910 Returns:       TRUE if matched
02911 */
02912 
02913 static BOOL
02914 match(register const uschar *eptr, register const uschar *ecode,
02915   int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb)
02916 {
02917 int original_ims = ims;   /* Save for resetting on ')' */
02918 
02919 for (;;)
02920   {
02921   int op = (int)*ecode;
02922   int min, max, ctype;
02923   register int i;
02924   register int c;
02925   BOOL minimize = FALSE;
02926 
02927   /* Opening capturing bracket. If there is space in the offset vector, save
02928   the current subject position in the working slot at the top of the vector. We
02929   mustn't change the current values of the data slot, because they may be set
02930   from a previous iteration of this group, and be referred to by a reference
02931   inside the group.
02932 
02933   If the bracket fails to match, we need to restore this value and also the
02934   values of the final offsets, in case they were set by a previous iteration of
02935   the same bracket.
02936 
02937   If there isn't enough space in the offset vector, treat this as if it were a
02938   non-capturing bracket. Don't worry about setting the flag for the error case
02939   here; that is handled in the code for KET. */
02940 
02941   if (op > OP_BRA)
02942     {
02943     int number = op - OP_BRA;
02944     int offset = number << 1;
02945 
02946 #ifdef DEBUG
02947     printf("start bracket %d subject=", number);
02948     pchars(eptr, 16, TRUE, md);
02949     printf("\n");
02950 #endif
02951 
02952     if (offset < md->offset_max)
02953       {
02954       int save_offset1 = md->offset_vector[offset];
02955       int save_offset2 = md->offset_vector[offset+1];
02956       int save_offset3 = md->offset_vector[md->offset_end - number];
02957 
02958       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
02959       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
02960 
02961       do
02962         {
02963         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
02964         ecode += (ecode[1] << 8) + ecode[2];
02965         }
02966       while (*ecode == OP_ALT);
02967 
02968       DPRINTF(("bracket %d failed\n", number));
02969 
02970       md->offset_vector[offset] = save_offset1;
02971       md->offset_vector[offset+1] = save_offset2;
02972       md->offset_vector[md->offset_end - number] = save_offset3;
02973       return FALSE;
02974       }
02975 
02976     /* Insufficient room for saving captured contents */
02977 
02978     else op = OP_BRA;
02979     }
02980 
02981   /* Other types of node can be handled by a switch */
02982 
02983   switch(op)
02984     {
02985     case OP_BRA:     /* Non-capturing bracket: optimized */
02986     DPRINTF(("start bracket 0\n"));
02987     do
02988       {
02989       if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
02990       ecode += (ecode[1] << 8) + ecode[2];
02991       }
02992     while (*ecode == OP_ALT);
02993     DPRINTF(("bracket 0 failed\n"));
02994     return FALSE;
02995 
02996     /* Conditional group: compilation checked that there are no more than
02997     two branches. If the condition is false, skipping the first branch takes us
02998     past the end if there is only one branch, but that's OK because that is
02999     exactly what going to the ket would do. */
03000 
03001     case OP_COND:
03002     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
03003       {
03004       int offset = ecode[4] << 1;    /* Doubled reference number */
03005       return match(eptr,
03006         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
03007           5 : 3 + (ecode[1] << 8) + ecode[2]),
03008         offset_top, md, ims, FALSE, eptr);
03009       }
03010 
03011     /* The condition is an assertion. Call match() to evaluate it - setting
03012     the condassert argument TRUE causes it to stop at the end of an assertion. */
03013 
03014     else
03015       {
03016       if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
03017         {
03018         ecode += 3 + (ecode[4] << 8) + ecode[5];
03019         while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
03020         }
03021       else ecode += (ecode[1] << 8) + ecode[2];
03022       return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
03023       }
03024     /* Control never reaches here */
03025 
03026     /* Skip over conditional reference data if encountered (should not be) */
03027 
03028     case OP_CREF:
03029     ecode += 2;
03030     break;
03031 
03032     /* End of the pattern */
03033 
03034     case OP_END:
03035     md->end_match_ptr = eptr;          /* Record where we ended */
03036     md->end_offset_top = offset_top;   /* and how many extracts were taken */
03037     return TRUE;
03038 
03039     /* Change option settings */
03040 
03041     case OP_OPT:
03042     ims = ecode[1];
03043     ecode += 2;
03044     DPRINTF(("ims set to %02x\n", ims));
03045     break;
03046 
03047     /* Assertion brackets. Check the alternative branches in turn - the
03048     matching won't pass the KET for an assertion. If any one branch matches,
03049     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
03050     start of each branch to move the current point backwards, so the code at
03051     this level is identical to the lookahead case. */
03052 
03053     case OP_ASSERT:
03054     case OP_ASSERTBACK:
03055     do
03056       {
03057       if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
03058       ecode += (ecode[1] << 8) + ecode[2];
03059       }
03060     while (*ecode == OP_ALT);
03061     if (*ecode == OP_KET) return FALSE;
03062 
03063     /* If checking an assertion for a condition, return TRUE. */
03064 
03065     if (condassert) return TRUE;
03066 
03067     /* Continue from after the assertion, updating the offsets high water
03068     mark, since extracts may have been taken during the assertion. */
03069 
03070     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
03071     ecode += 3;
03072     offset_top = md->end_offset_top;
03073     continue;
03074 
03075     /* Negative assertion: all branches must fail to match */
03076 
03077     case OP_ASSERT_NOT:
03078     case OP_ASSERTBACK_NOT:
03079     do
03080       {
03081       if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
03082       ecode += (ecode[1] << 8) + ecode[2];
03083       }
03084     while (*ecode == OP_ALT);
03085 
03086     if (condassert) return TRUE;
03087     ecode += 3;
03088     continue;
03089 
03090     /* Move the subject pointer back. This occurs only at the start of
03091     each branch of a lookbehind assertion. If we are too close to the start to
03092     move back, this match function fails. */
03093 
03094     case OP_REVERSE:
03095     eptr -= (ecode[1] << 8) + ecode[2];
03096     if (eptr < md->start_subject) return FALSE;
03097     ecode += 3;
03098     break;
03099 
03100 
03101     /* "Once" brackets are like assertion brackets except that after a match,
03102     the point in the subject string is not moved back. Thus there can never be
03103     a move back into the brackets. Check the alternative branches in turn - the
03104     matching won't pass the KET for this kind of subpattern. If any one branch
03105     matches, we carry on as at the end of a normal bracket, leaving the subject
03106     pointer. */
03107 
03108     case OP_ONCE:
03109       {
03110       const uschar *prev = ecode;
03111 
03112       do
03113         {
03114         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
03115         ecode += (ecode[1] << 8) + ecode[2];
03116         }
03117       while (*ecode == OP_ALT);
03118 
03119       /* If hit the end of the group (which could be repeated), fail */
03120 
03121       if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
03122 
03123       /* Continue as from after the assertion, updating the offsets high water
03124       mark, since extracts may have been taken. */
03125 
03126       do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
03127 
03128       offset_top = md->end_offset_top;
03129       eptr = md->end_match_ptr;
03130 
03131       /* For a non-repeating ket, just continue at this level. This also
03132       happens for a repeating ket if no characters were matched in the group.
03133       This is the forcible breaking of infinite loops as implemented in Perl
03134       5.005. If there is an options reset, it will get obeyed in the normal
03135       course of events. */
03136 
03137       if (*ecode == OP_KET || eptr == eptrb)
03138         {
03139         ecode += 3;
03140         break;
03141         }
03142 
03143       /* The repeating kets try the rest of the pattern or restart from the
03144       preceding bracket, in the appropriate order. We need to reset any options
03145       that changed within the bracket before re-running it, so check the next
03146       opcode. */
03147 
03148       if (ecode[3] == OP_OPT)
03149         {
03150         ims = (ims & ~PCRE_IMS) | ecode[4];
03151         DPRINTF(("ims set to %02x at group repeat\n", ims));
03152         }
03153 
03154       if (*ecode == OP_KETRMIN)
03155         {
03156         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
03157             match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
03158         }
03159       else  /* OP_KETRMAX */
03160         {
03161         if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
03162             match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
03163         }
03164       }
03165     return FALSE;
03166 
03167     /* An alternation is the end of a branch; scan along to find the end of the
03168     bracketed group and go to there. */
03169 
03170     case OP_ALT:
03171     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
03172     break;
03173 
03174     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
03175     that it may occur zero times. It may repeat infinitely, or not at all -
03176     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
03177     repeat limits are compiled as a number of copies, with the optional ones
03178     preceded by BRAZERO or BRAMINZERO. */
03179 
03180     case OP_BRAZERO:
03181       {
03182       const uschar *next = ecode+1;
03183       if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
03184       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
03185       ecode = next + 3;
03186       }
03187     break;
03188 
03189     case OP_BRAMINZERO:
03190       {
03191       const uschar *next = ecode+1;
03192       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
03193       if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
03194       ecode++;
03195       }
03196     break;
03197 
03198     /* End of a group, repeated or non-repeating. If we are at the end of
03199     an assertion "group", stop matching and return TRUE, but record the
03200     current high water mark for use by positive assertions. Do this also
03201     for the "once" (no-backing-up) groups. */
03202 
03203     case OP_KET:
03204     case OP_KETRMIN:
03205     case OP_KETRMAX:
03206       {
03207       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
03208 
03209       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
03210           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
03211           *prev == OP_ONCE)
03212         {
03213         md->end_match_ptr = eptr;      /* For ONCE */
03214         md->end_offset_top = offset_top;
03215         return TRUE;
03216         }
03217 
03218       /* In all other cases except a conditional group we have to check the
03219       group number back at the start and if necessary complete handling an
03220       extraction by setting the offsets and bumping the high water mark. */
03221 
03222       if (*prev != OP_COND)
03223         {
03224         int number = *prev - OP_BRA;
03225         int offset = number << 1;
03226 
03227         DPRINTF(("end bracket %d\n", number));
03228 
03229         if (number > 0)
03230           {
03231           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
03232             {
03233             md->offset_vector[offset] =
03234               md->offset_vector[md->offset_end - number];
03235             md->offset_vector[offset+1] = eptr - md->start_subject;
03236             if (offset_top <= offset) offset_top = offset + 2;
03237             }
03238           }
03239         }
03240 
03241       /* Reset the value of the ims flags, in case they got changed during
03242       the group. */
03243 
03244       ims = original_ims;
03245       DPRINTF(("ims reset to %02x\n", ims));
03246 
03247       /* For a non-repeating ket, just continue at this level. This also
03248       happens for a repeating ket if no characters were matched in the group.
03249       This is the forcible breaking of infinite loops as implemented in Perl
03250       5.005. If there is an options reset, it will get obeyed in the normal
03251       course of events. */
03252 
03253       if (*ecode == OP_KET || eptr == eptrb)
03254         {
03255         ecode += 3;
03256         break;
03257         }
03258 
03259       /* The repeating kets try the rest of the pattern or restart from the
03260       preceding bracket, in the appropriate order. */
03261 
03262       if (*ecode == OP_KETRMIN)
03263         {
03264         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
03265             match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
03266         }
03267       else  /* OP_KETRMAX */
03268         {
03269         if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
03270             match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
03271         }
03272       }
03273     return FALSE;
03274 
03275     /* Start of subject unless notbol, or after internal newline if multiline */
03276 
03277     case OP_CIRC:
03278     if (md->notbol && eptr == md->start_subject) return FALSE;
03279     if ((ims & PCRE_MULTILINE) != 0)
03280       {
03281       if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
03282       ecode++;
03283       break;
03284       }
03285     /* ... else fall through */
03286 
03287     /* Start of subject assertion */
03288 
03289     case OP_SOD:
03290     if (eptr != md->start_subject) return FALSE;
03291     ecode++;
03292     break;
03293 
03294     /* Assert before internal newline if multiline, or before a terminating
03295     newline unless endonly is set, else end of subject unless noteol is set. */
03296 
03297     case OP_DOLL:
03298     if ((ims & PCRE_MULTILINE) != 0)
03299       {
03300       if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
03301         else { if (md->noteol) return FALSE; }
03302       ecode++;
03303       break;
03304       }
03305     else
03306       {
03307       if (md->noteol) return FALSE;
03308       if (!md->endonly)
03309         {
03310         if (eptr < md->end_subject - 1 ||
03311            (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
03312 
03313         ecode++;
03314         break;
03315         }
03316       }
03317     /* ... else fall through */
03318 
03319     /* End of subject assertion (\z) */
03320 
03321     case OP_EOD:
03322     if (eptr < md->end_subject) return FALSE;
03323     ecode++;
03324     break;
03325 
03326     /* End of subject or ending \n assertion (\Z) */
03327 
03328     case OP_EODN:
03329     if (eptr < md->end_subject - 1 ||
03330        (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
03331     ecode++;
03332     break;
03333 
03334     /* Word boundary assertions */
03335 
03336     case OP_NOT_WORD_BOUNDARY:
03337     case OP_WORD_BOUNDARY:
03338       {
03339       BOOL prev_is_word = (eptr != md->start_subject) &&
03340         ((md->ctypes[eptr[-1]] & ctype_word) != 0);
03341       BOOL cur_is_word = (eptr < md->end_subject) &&
03342         ((md->ctypes[*eptr] & ctype_word) != 0);
03343       if ((*ecode++ == OP_WORD_BOUNDARY)?
03344            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
03345         return FALSE;
03346       }
03347     break;
03348 
03349     /* Match a single character type; inline for speed */
03350 
03351     case OP_ANY:
03352     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
03353       return FALSE;
03354     if (eptr++ >= md->end_subject) return FALSE;
03355     ecode++;
03356     break;
03357 
03358     case OP_NOT_DIGIT:
03359     if (eptr >= md->end_subject ||
03360        (md->ctypes[*eptr++] & ctype_digit) != 0)
03361       return FALSE;
03362     ecode++;
03363     break;
03364 
03365     case OP_DIGIT:
03366     if (eptr >= md->end_subject ||
03367        (md->ctypes[*eptr++] & ctype_digit) == 0)
03368       return FALSE;
03369     ecode++;
03370     break;
03371 
03372     case OP_NOT_WHITESPACE:
03373     if (eptr >= md->end_subject ||
03374        (md->ctypes[*eptr++] & ctype_space) != 0)
03375       return FALSE;
03376     ecode++;
03377     break;
03378 
03379     case OP_WHITESPACE:
03380     if (eptr >= md->end_subject ||
03381        (md->ctypes[*eptr++] & ctype_space) == 0)
03382       return FALSE;
03383     ecode++;
03384     break;
03385 
03386     case OP_NOT_WORDCHAR:
03387     if (eptr >= md->end_subject ||
03388        (md->ctypes[*eptr++] & ctype_word) != 0)
03389       return FALSE;
03390     ecode++;
03391     break;
03392 
03393     case OP_WORDCHAR:
03394     if (eptr >= md->end_subject ||
03395        (md->ctypes[*eptr++] & ctype_word) == 0)
03396       return FALSE;
03397     ecode++;
03398     break;
03399 
03400     /* Match a back reference, possibly repeatedly. Look past the end of the
03401     item to see if there is repeat information following. The code is similar
03402     to that for character classes, but repeated for efficiency. Then obey
03403     similar code to character type repeats - written out again for speed.
03404     However, if the referenced string is the empty string, always treat
03405     it as matched, any number of times (otherwise there could be infinite
03406     loops). */
03407 
03408     case OP_REF:
03409       {
03410       int length;
03411       int offset = ecode[1] << 1;                /* Doubled reference number */
03412       ecode += 2;                                /* Advance past the item */
03413 
03414       /* If the reference is unset, set the length to be longer than the amount
03415       of subject left; this ensures that every attempt at a match fails. We
03416       can't just fail here, because of the possibility of quantifiers with zero
03417       minima. */
03418 
03419       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
03420         md->end_subject - eptr + 1 :
03421         md->offset_vector[offset+1] - md->offset_vector[offset];
03422 
03423       /* Set up for repetition, or handle the non-repeated case */
03424 
03425       switch (*ecode)
03426         {
03427         case OP_CRSTAR:
03428         case OP_CRMINSTAR:
03429         case OP_CRPLUS:
03430         case OP_CRMINPLUS:
03431         case OP_CRQUERY:
03432         case OP_CRMINQUERY:
03433         c = *ecode++ - OP_CRSTAR;
03434         minimize = (c & 1) != 0;
03435         min = rep_min[c];                 /* Pick up values from tables; */
03436         max = rep_max[c];                 /* zero for max => infinity */
03437         if (max == 0) max = INT_MAX;
03438         break;
03439 
03440         case OP_CRRANGE:
03441         case OP_CRMINRANGE:
03442         minimize = (*ecode == OP_CRMINRANGE);
03443         min = (ecode[1] << 8) + ecode[2];
03444         max = (ecode[3] << 8) + ecode[4];
03445         if (max == 0) max = INT_MAX;
03446         ecode += 5;
03447         break;
03448 
03449         default:               /* No repeat follows */
03450         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
03451         eptr += length;
03452         continue;              /* With the main loop */
03453         }
03454 
03455       /* If the length of the reference is zero, just continue with the
03456       main loop. */
03457 
03458       if (length == 0) continue;
03459 
03460       /* First, ensure the minimum number of matches are present. We get back
03461       the length of the reference string explicitly rather than passing the
03462       address of eptr, so that eptr can be a register variable. */
03463 
03464       for (i = 1; i <= min; i++)
03465         {
03466         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
03467         eptr += length;
03468         }
03469 
03470       /* If min = max, continue at the same level without recursion.
03471       They are not both allowed to be zero. */
03472 
03473       if (min == max) continue;
03474 
03475       /* If minimizing, keep trying and advancing the pointer */
03476 
03477       if (minimize)
03478         {
03479         for (i = min;; i++)
03480           {
03481           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03482             return TRUE;
03483           if (i >= max || !match_ref(offset, eptr, length, md, ims))
03484             return FALSE;
03485           eptr += length;
03486           }
03487         /* Control never gets here */
03488         }
03489 
03490       /* If maximizing, find the longest string and work backwards */
03491 
03492       else
03493         {
03494         const uschar *pp = eptr;
03495         for (i = min; i < max; i++)
03496           {
03497           if (!match_ref(offset, eptr, length, md, ims)) break;
03498           eptr += length;
03499           }
03500         while (eptr >= pp)
03501           {
03502           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03503             return TRUE;
03504           eptr -= length;
03505           }
03506         return FALSE;
03507         }
03508       }
03509     /* Control never gets here */
03510 
03511 
03512 
03513     /* Match a character class, possibly repeatedly. Look past the end of the
03514     item to see if there is repeat information following. Then obey similar
03515     code to character type repeats - written out again for speed. */
03516 
03517     case OP_CLASS:
03518       {
03519       const uschar *data = ecode + 1;  /* Save for matching */
03520       ecode += 33;                     /* Advance past the item */
03521 
03522       switch (*ecode)
03523         {
03524         case OP_CRSTAR:
03525         case OP_CRMINSTAR:
03526         case OP_CRPLUS:
03527         case OP_CRMINPLUS:
03528         case OP_CRQUERY:
03529         case OP_CRMINQUERY:
03530         c = *ecode++ - OP_CRSTAR;
03531         minimize = (c & 1) != 0;
03532         min = rep_min[c];                 /* Pick up values from tables; */
03533         max = rep_max[c];                 /* zero for max => infinity */
03534         if (max == 0) max = INT_MAX;
03535         break;
03536 
03537         case OP_CRRANGE:
03538         case OP_CRMINRANGE:
03539         minimize = (*ecode == OP_CRMINRANGE);
03540         min = (ecode[1] << 8) + ecode[2];
03541         max = (ecode[3] << 8) + ecode[4];
03542         if (max == 0) max = INT_MAX;
03543         ecode += 5;
03544         break;
03545 
03546         default:               /* No repeat follows */
03547         min = max = 1;
03548         break;
03549         }
03550 
03551       /* First, ensure the minimum number of matches are present. */
03552 
03553       for (i = 1; i <= min; i++)
03554         {
03555         if (eptr >= md->end_subject) return FALSE;
03556         c = *eptr++;
03557         if ((data[c/8] & (1 << (c&7))) != 0) continue;
03558         return FALSE;
03559         }
03560 
03561       /* If max == min we can continue with the main loop without the
03562       need to recurse. */
03563 
03564       if (min == max) continue;
03565 
03566       /* If minimizing, keep testing the rest of the expression and advancing
03567       the pointer while it matches the class. */
03568 
03569       if (minimize)
03570         {
03571         for (i = min;; i++)
03572           {
03573           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03574             return TRUE;
03575           if (i >= max || eptr >= md->end_subject) return FALSE;
03576           c = *eptr++;
03577           if ((data[c/8] & (1 << (c&7))) != 0) continue;
03578           return FALSE;
03579           }
03580         /* Control never gets here */
03581         }
03582 
03583       /* If maximizing, find the longest possible run, then work backwards. */
03584 
03585       else
03586         {
03587         const uschar *pp = eptr;
03588         for (i = min; i < max; eptr++, i++)
03589           {
03590           if (eptr >= md->end_subject) break;
03591           c = *eptr;
03592           if ((data[c/8] & (1 << (c&7))) != 0) continue;
03593           break;
03594           }
03595 
03596         while (eptr >= pp)
03597           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03598             return TRUE;
03599         return FALSE;
03600         }
03601       }
03602     /* Control never gets here */
03603 
03604     /* Match a run of characters */
03605 
03606     case OP_CHARS:
03607       {
03608       register int length = ecode[1];
03609       ecode += 2;
03610 
03611 #ifdef DEBUG    /* Sigh. Some compilers never learn. */
03612       if (eptr >= md->end_subject)
03613         printf("matching subject <null> against pattern ");
03614       else
03615         {
03616         printf("matching subject ");
03617         pchars(eptr, length, TRUE, md);
03618         printf(" against pattern ");
03619         }
03620       pchars(ecode, length, FALSE, md);
03621       printf("\n");
03622 #endif
03623 
03624       if (length > md->end_subject - eptr) return FALSE;
03625       if ((ims & PCRE_CASELESS) != 0)
03626         {
03627         while (length-- > 0)
03628           if (md->lcc[*ecode++] != md->lcc[*eptr++])
03629             return FALSE;
03630         }
03631       else
03632         {
03633         while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
03634         }
03635       }
03636     break;
03637 
03638     /* Match a single character repeatedly; different opcodes share code. */
03639 
03640     case OP_EXACT:
03641     min = max = (ecode[1] << 8) + ecode[2];
03642     ecode += 3;
03643     goto REPEATCHAR;
03644 
03645     case OP_UPTO:
03646     case OP_MINUPTO:
03647     min = 0;
03648     max = (ecode[1] << 8) + ecode[2];
03649     minimize = *ecode == OP_MINUPTO;
03650     ecode += 3;
03651     goto REPEATCHAR;
03652 
03653     case OP_STAR:
03654     case OP_MINSTAR:
03655     case OP_PLUS:
03656     case OP_MINPLUS:
03657     case OP_QUERY:
03658     case OP_MINQUERY:
03659     c = *ecode++ - OP_STAR;
03660     minimize = (c & 1) != 0;
03661     min = rep_min[c];                 /* Pick up values from tables; */
03662     max = rep_max[c];                 /* zero for max => infinity */
03663     if (max == 0) max = INT_MAX;
03664 
03665     /* Common code for all repeated single-character matches. We can give
03666     up quickly if there are fewer than the minimum number of characters left in
03667     the subject. */
03668 
03669     REPEATCHAR:
03670     if (min > md->end_subject - eptr) return FALSE;
03671     c = *ecode++;
03672 
03673     /* The code is duplicated for the caseless and caseful cases, for speed,
03674     since matching characters is likely to be quite common. First, ensure the
03675     minimum number of matches are present. If min = max, continue at the same
03676     level without recursing. Otherwise, if minimizing, keep trying the rest of
03677     the expression and advancing one matching character if failing, up to the
03678     maximum. Alternatively, if maximizing, find the maximum number of
03679     characters and work backwards. */
03680 
03681     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
03682       max, eptr));
03683 
03684     if ((ims & PCRE_CASELESS) != 0)
03685       {
03686       c = md->lcc[c];
03687       for (i = 1; i <= min; i++)
03688         if (c != md->lcc[*eptr++]) return FALSE;
03689       if (min == max) continue;
03690       if (minimize)
03691         {
03692         for (i = min;; i++)
03693           {
03694           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03695             return TRUE;
03696           if (i >= max || eptr >= md->end_subject ||
03697               c != md->lcc[*eptr++])
03698             return FALSE;
03699           }
03700         /* Control never gets here */
03701         }
03702       else
03703         {
03704         const uschar *pp = eptr;
03705         for (i = min; i < max; i++)
03706           {
03707           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
03708           eptr++;
03709           }
03710         while (eptr >= pp)
03711           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03712             return TRUE;
03713         return FALSE;
03714         }
03715       /* Control never gets here */
03716       }
03717 
03718     /* Caseful comparisons */
03719 
03720     else
03721       {
03722       for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
03723       if (min == max) continue;
03724       if (minimize)
03725         {
03726         for (i = min;; i++)
03727           {
03728           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03729             return TRUE;
03730           if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
03731           }
03732         /* Control never gets here */
03733         }
03734       else
03735         {
03736         const uschar *pp = eptr;
03737         for (i = min; i < max; i++)
03738           {
03739           if (eptr >= md->end_subject || c != *eptr) break;
03740           eptr++;
03741           }
03742         while (eptr >= pp)
03743          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03744            return TRUE;
03745         return FALSE;
03746         }
03747       }
03748     /* Control never gets here */
03749 
03750     /* Match a negated single character */
03751 
03752     case OP_NOT:
03753     if (eptr >= md->end_subject) return FALSE;
03754     ecode++;
03755     if ((ims & PCRE_CASELESS) != 0)
03756       {
03757       if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
03758       }
03759     else
03760       {
03761       if (*ecode++ == *eptr++) return FALSE;
03762       }
03763     break;
03764 
03765     /* Match a negated single character repeatedly. This is almost a repeat of
03766     the code for a repeated single character, but I haven't found a nice way of
03767     commoning these up that doesn't require a test of the positive/negative
03768     option for each character match. Maybe that wouldn't add very much to the
03769     time taken, but character matching *is* what this is all about... */
03770 
03771     case OP_NOTEXACT:
03772     min = max = (ecode[1] << 8) + ecode[2];
03773     ecode += 3;
03774     goto REPEATNOTCHAR;
03775 
03776     case OP_NOTUPTO:
03777     case OP_NOTMINUPTO:
03778     min = 0;
03779     max = (ecode[1] << 8) + ecode[2];
03780     minimize = *ecode == OP_NOTMINUPTO;
03781     ecode += 3;
03782     goto REPEATNOTCHAR;
03783 
03784     case OP_NOTSTAR:
03785     case OP_NOTMINSTAR:
03786     case OP_NOTPLUS:
03787     case OP_NOTMINPLUS:
03788     case OP_NOTQUERY:
03789     case OP_NOTMINQUERY:
03790     c = *ecode++ - OP_NOTSTAR;
03791     minimize = (c & 1) != 0;
03792     min = rep_min[c];                 /* Pick up values from tables; */
03793     max = rep_max[c];                 /* zero for max => infinity */
03794     if (max == 0) max = INT_MAX;
03795 
03796     /* Common code for all repeated single-character matches. We can give
03797     up quickly if there are fewer than the minimum number of characters left in
03798     the subject. */
03799 
03800     REPEATNOTCHAR:
03801     if (min > md->end_subject - eptr) return FALSE;
03802     c = *ecode++;
03803 
03804     /* The code is duplicated for the caseless and caseful cases, for speed,
03805     since matching characters is likely to be quite common. First, ensure the
03806     minimum number of matches are present. If min = max, continue at the same
03807     level without recursing. Otherwise, if minimizing, keep trying the rest of
03808     the expression and advancing one matching character if failing, up to the
03809     maximum. Alternatively, if maximizing, find the maximum number of
03810     characters and work backwards. */
03811 
03812     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
03813       max, eptr));
03814 
03815     if ((ims & PCRE_CASELESS) != 0)
03816       {
03817       c = md->lcc[c];
03818       for (i = 1; i <= min; i++)
03819         if (c == md->lcc[*eptr++]) return FALSE;
03820       if (min == max) continue;
03821       if (minimize)
03822         {
03823         for (i = min;; i++)
03824           {
03825           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03826             return TRUE;
03827           if (i >= max || eptr >= md->end_subject ||
03828               c == md->lcc[*eptr++])
03829             return FALSE;
03830           }
03831         /* Control never gets here */
03832         }
03833       else
03834         {
03835         const uschar *pp = eptr;
03836         for (i = min; i < max; i++)
03837           {
03838           if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
03839           eptr++;
03840           }
03841         while (eptr >= pp)
03842           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03843             return TRUE;
03844         return FALSE;
03845         }
03846       /* Control never gets here */
03847       }
03848 
03849     /* Caseful comparisons */
03850 
03851     else
03852       {
03853       for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
03854       if (min == max) continue;
03855       if (minimize)
03856         {
03857         for (i = min;; i++)
03858           {
03859           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
03860             return TRUE;
03861           if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
03862           }
03863         /* Control never gets here */
03864         }
03865       else
03866         {
03867         const uschar *pp = eptr;
03868         for (i = min; i < max; i++)
03869           {
03870           if (eptr >= md->end_subject || c == *eptr) break;
03871           eptr++;
03872           }
03873         while (eptr >= pp)
03874          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
03875            return TRUE;
03876         return FALSE;
03877         }
03878       }
03879     /* Control never gets here */
03880 
03881     /* Match a single character type repeatedly; several different opcodes
03882     share code. This is very similar to the code for single characters, but we
03883     repeat it in the interests of efficiency. */
03884 
03885     case OP_TYPEEXACT:
03886     min = max = (ecode[1] << 8) + ecode[2];
03887     minimize = TRUE;
03888     ecode += 3;
03889     goto REPEATTYPE;
03890 
03891     case OP_TYPEUPTO:
03892     case OP_TYPEMINUPTO:
03893     min = 0;
03894     max = (ecode[1] << 8) + ecode[2];
03895     minimize = *ecode == OP_TYPEMINUPTO;
03896     ecode += 3;
03897     goto REPEATTYPE;
03898 
03899     case OP_TYPESTAR:
03900     case OP_TYPEMINSTAR:
03901     case OP_TYPEPLUS:
03902     case OP_TYPEMINPLUS:
03903     case OP_TYPEQUERY:
03904     case OP_TYPEMINQUERY:
03905     c = *ecode++ - OP_TYPESTAR;
03906     minimize = (c & 1) != 0;
03907     min = rep_min[c];                 /* Pick up values from tables; */
03908     max = rep_max[c];                 /* zero for max => infinity */
03909     if (max == 0) max = INT_MAX;
03910 
03911     /* Common code for all repeated single character type matches */
03912 
03913     REPEATTYPE:
03914     ctype = *ecode++;      /* Code for the character type */
03915 
03916     /* First, ensure the minimum number of matches are present. Use inline
03917     code for maximizing the speed, and do the type test once at the start
03918     (i.e. keep it out of the loop). Also test that there are at least the
03919     minimum number of characters before we start. */
03920 
03921     if (min > md->end_subject - eptr) return FALSE;
03922     if (min > 0) switch(ctype)
03923       {
03924       case OP_ANY:
03925       if ((ims & PCRE_DOTALL) == 0)
03926         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
03927       else eptr += min;
03928       break;
03929 
03930       case OP_NOT_DIGIT:
03931       for (i = 1; i <= min; i++)
03932         if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
03933       break;
03934 
03935       case OP_DIGIT:
03936       for (i = 1; i <= min; i++)
03937         if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
03938       break;
03939 
03940       case OP_NOT_WHITESPACE:
03941       for (i = 1; i <= min; i++)
03942         if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
03943       break;
03944 
03945       case OP_WHITESPACE:
03946       for (i = 1; i <= min; i++)
03947         if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
03948       break;
03949 
03950       case OP_NOT_WORDCHAR:
03951       for (i = 1; i <= min; i++)
03952         if ((md->ctypes[*eptr++] & ctype_word) != 0)
03953           return FALSE;
03954       break;
03955 
03956       case OP_WORDCHAR:
03957       for (i = 1; i <= min; i++)
03958         if ((md->ctypes[*eptr++] & ctype_word) == 0)
03959           return FALSE;
03960       break;
03961       }
03962 
03963     /* If min = max, continue at the same level without recursing */
03964 
03965     if (min == max) continue;
03966 
03967     /* If minimizing, we have to test the rest of the pattern before each
03968     subsequent match. */
03969 
03970     if (minimize)
03971       {
03972       for (i = min;; i++)
03973         {
03974         if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
03975         if (i >= max || eptr >= md->end_subject) return FALSE;
03976 
03977         c = *eptr++;
03978         switch(ctype)
03979           {
03980           case OP_ANY:
03981           if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
03982           break;
03983 
03984           case OP_NOT_DIGIT:
03985           if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
03986           break;
03987 
03988           case OP_DIGIT:
03989           if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
03990           break;
03991 
03992           case OP_NOT_WHITESPACE:
03993           if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
03994           break;
03995 
03996           case OP_WHITESPACE:
03997           if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
03998           break;
03999 
04000           case OP_NOT_WORDCHAR:
04001           if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
04002           break;
04003 
04004           case OP_WORDCHAR:
04005           if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
04006           break;
04007           }
04008         }
04009       /* Control never gets here */
04010       }
04011 
04012     /* If maximizing it is worth using inline code for speed, doing the type
04013     test once at the start (i.e. keep it out of the loop). */
04014 
04015     else
04016       {
04017       const uschar *pp = eptr;
04018       switch(ctype)
04019         {
04020         case OP_ANY:
04021         if ((ims & PCRE_DOTALL) == 0)
04022           {
04023           for (i = min; i < max; i++)
04024             {
04025             if (eptr >= md->end_subject || *eptr == '\n') break;
04026             eptr++;
04027             }
04028           }
04029         else
04030           {
04031           c = max - min;
04032           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
04033           eptr += c;
04034           }
04035         break;
04036 
04037         case OP_NOT_DIGIT:
04038         for (i = min; i < max; i++)
04039           {
04040           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
04041             break;
04042           eptr++;
04043           }
04044         break;
04045 
04046         case OP_DIGIT:
04047         for (i = min; i < max; i++)
04048           {
04049           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
04050             break;
04051           eptr++;
04052           }
04053         break;
04054 
04055         case OP_NOT_WHITESPACE:
04056         for (i = min; i < max; i++)
04057           {
04058           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
04059             break;
04060           eptr++;
04061           }
04062         break;
04063 
04064         case OP_WHITESPACE:
04065         for (i = min; i < max; i++)
04066           {
04067           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
04068             break;
04069           eptr++;
04070           }
04071         break;
04072 
04073         case OP_NOT_WORDCHAR:
04074         for (i = min; i < max; i++)
04075           {
04076           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
04077             break;
04078           eptr++;
04079           }
04080         break;
04081 
04082         case OP_WORDCHAR:
04083         for (i = min; i < max; i++)
04084           {
04085           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
04086             break;
04087           eptr++;
04088           }
04089         break;
04090         }
04091 
04092       while (eptr >= pp)
04093         if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
04094           return TRUE;
04095       return FALSE;
04096       }
04097     /* Control never gets here */
04098 
04099     /* There's been some horrible disaster. */
04100 
04101     default:
04102     DPRINTF(("Unknown opcode %d\n", *ecode));
04103     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
04104     return FALSE;
04105     }
04106 
04107   /* Do not stick any code in here without much thought; it is assumed
04108   that "continue" in the code above comes out to here to repeat the main
04109   loop. */
04110 
04111   }             /* End of main loop */
04112 /* Control never reaches here */
04113 }
04114 
04115 
04116 
04117 
04118 /*************************************************
04119 *         Execute a Regular Expression           *
04120 *************************************************/
04121 
04122 /* This function applies a compiled re to a subject string and picks out
04123 portions of the string if it matches. Two elements in the vector are set for
04124 each substring: the offsets to the start and end of the substring.
04125 
04126 Arguments:
04127   external_re     points to the compiled expression
04128   external_extra  points to "hints" from pcre_study() or is NULL
04129   subject         points to the subject string
04130   length          length of subject string (may contain binary zeros)
04131   start_offset    where to start in the subject string
04132   options         option bits
04133   offsets         points to a vector of ints to be filled in with offsets
04134   offsetcount     the number of elements in the vector
04135 
04136 Returns:          > 0 => success; value is the number of elements filled in
04137                   = 0 => success, but offsets is not big enough
04138                    -1 => failed to match
04139                  < -1 => some kind of unexpected problem
04140 */
04141 
04142 int
04143 vmdpcre_exec(const pcre *external_re, const pcre_extra *external_extra,
04144   const char *subject, int length, int start_offset, int options, int *offsets,
04145   int offsetcount)
04146 {
04147 int resetcount, ocount;
04148 int first_char = -1;
04149 int ims = 0;
04150 match_data match_block;
04151 const uschar *start_bits = NULL;
04152 const uschar *start_match = (const uschar *)subject + start_offset;
04153 const uschar *end_subject;
04154 const real_pcre *re = (const real_pcre *)external_re;
04155 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
04156 BOOL using_temporary_offsets = FALSE;
04157 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
04158 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
04159 
04160 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
04161 
04162 if (re == NULL || subject == NULL ||
04163    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
04164 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
04165 
04166 match_block.start_subject = (const uschar *)subject;
04167 match_block.end_subject = match_block.start_subject + length;
04168 end_subject = match_block.end_subject;
04169 
04170 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
04171 
04172 match_block.notbol = (options & PCRE_NOTBOL) != 0;
04173 match_block.noteol = (options & PCRE_NOTEOL) != 0;
04174 
04175 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
04176 
04177 match_block.lcc = re->tables + lcc_offset;
04178 match_block.ctypes = re->tables + ctypes_offset;
04179 
04180 /* The ims options can vary during the matching as a result of the presence
04181 of (?ims) items in the pattern. They are kept in a local variable so that
04182 restoring at the exit of a group is easy. */
04183 
04184 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
04185 
04186 /* If the expression has got more back references than the offsets supplied can
04187 hold, we get a temporary bit of working store to use during the matching.
04188 Otherwise, we can use the vector supplied, rounding down its size to a multiple
04189 of 3. */
04190 
04191 ocount = offsetcount - (offsetcount % 3);
04192 
04193 if (re->top_backref > 0 && re->top_backref >= ocount/3)
04194   {
04195   ocount = re->top_backref * 3 + 3;
04196   match_block.offset_vector = (int *)(vmdpcre_malloc)(ocount * sizeof(int));
04197   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
04198   using_temporary_offsets = TRUE;
04199   DPRINTF(("Got memory to hold back references\n"));
04200   }
04201 else match_block.offset_vector = offsets;
04202 
04203 match_block.offset_end = ocount;
04204 match_block.offset_max = (2*ocount)/3;
04205 match_block.offset_overflow = FALSE;
04206 
04207 /* Compute the minimum number of offsets that we need to reset each time. Doing
04208 this makes a huge difference to execution time when there aren't many brackets
04209 in the pattern. */
04210 
04211 resetcount = 2 + re->top_bracket * 2;
04212 if (resetcount > offsetcount) resetcount = ocount;
04213 
04214 /* Reset the working variable associated with each extraction. These should
04215 never be used unless previously set, but they get saved and restored, and so we
04216 initialize them to avoid reading uninitialized locations. */
04217 
04218 if (match_block.offset_vector != NULL)
04219   {
04220   register int *iptr = match_block.offset_vector + ocount;
04221   register int *iend = iptr - resetcount/2 + 1;
04222   while (--iptr >= iend) *iptr = -1;
04223   }
04224 
04225 /* Set up the first character to match, if available. The first_char value is
04226 never set for an anchored regular expression, but the anchoring may be forced
04227 at run time, so we have to test for anchoring. The first char may be unset for
04228 an unanchored pattern, of course. If there's no first char and the pattern was
04229 studied, there may be a bitmap of possible first characters. */
04230 
04231 if (!anchored)
04232   {
04233   if ((re->options & PCRE_FIRSTSET) != 0)
04234     {
04235     first_char = re->first_char;
04236     if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
04237     }
04238   else
04239     if (!startline && extra != NULL &&
04240       (extra->options & PCRE_STUDY_MAPPED) != 0)
04241         start_bits = extra->start_bits;
04242   }
04243 
04244 /* Loop for unanchored matches; for anchored regexs the loop runs just once. */
04245 
04246 do
04247   {
04248   int rc;
04249   register int *iptr = match_block.offset_vector;
04250   register int *iend = iptr + resetcount;
04251 
04252   /* Reset the maximum number of extractions we might see. */
04253 
04254   while (iptr < iend) *iptr++ = -1;
04255 
04256   /* Advance to a unique first char if possible */
04257 
04258   if (first_char >= 0)
04259     {
04260     if ((ims & PCRE_CASELESS) != 0)
04261       while (start_match < end_subject &&
04262              match_block.lcc[*start_match] != first_char)
04263         start_match++;
04264     else
04265       while (start_match < end_subject && *start_match != first_char)
04266         start_match++;
04267     }
04268 
04269   /* Or to just after \n for a multiline match if possible */
04270 
04271   else if (startline)
04272     {
04273     if (start_match > match_block.start_subject)
04274       {
04275       while (start_match < end_subject && start_match[-1] != '\n')
04276         start_match++;
04277       }
04278     }
04279 
04280   /* Or to a non-unique first char */
04281 
04282   else if (start_bits != NULL)
04283     {
04284     while (start_match < end_subject)
04285       {
04286       register int c = *start_match;
04287       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
04288       }
04289     }
04290 
04291 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
04292   printf(">>>> Match against: ");
04293   pchars(start_match, end_subject - start_match, TRUE, &match_block);
04294   printf("\n");
04295 #endif
04296 
04297   /* When a match occurs, substrings will be set for all internal extractions;
04298   we just need to set up the whole thing as substring 0 before returning. If
04299   there were too many extractions, set the return code to zero. In the case
04300   where we had to get some local store to hold offsets for backreferences, copy
04301   those back references that we can. In this case there need not be overflow
04302   if certain parts of the pattern were not used. */
04303 
04304   if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
04305     continue;
04306 
04307   /* Copy the offset information from temporary store if necessary */
04308 
04309   if (using_temporary_offsets)
04310     {
04311     if (offsetcount >= 4)
04312       {
04313       memcpy(offsets + 2, match_block.offset_vector + 2,
04314         (offsetcount - 2) * sizeof(int));
04315       DPRINTF(("Copied offsets from temporary memory\n"));
04316       }
04317     if (match_block.end_offset_top > offsetcount)
04318       match_block.offset_overflow = TRUE;
04319 
04320     DPRINTF(("Freeing temporary memory\n"));
04321     (vmdpcre_free)(match_block.offset_vector);
04322     }
04323 
04324   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
04325 
04326   if (match_block.offset_end < 2) rc = 0; else
04327     {
04328     offsets[0] = start_match - match_block.start_subject;
04329     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
04330     }
04331 
04332   DPRINTF((">>>> returning %d\n", rc));
04333   return rc;
04334   }
04335 
04336 /* This "while" is the end of the "do" above */
04337 
04338 while (!anchored &&
04339        match_block.errorcode == PCRE_ERROR_NOMATCH &&
04340        start_match++ < end_subject);
04341 
04342 if (using_temporary_offsets)
04343   {
04344   DPRINTF(("Freeing temporary memory\n"));
04345   (vmdpcre_free)(match_block.offset_vector);
04346   }
04347 
04348 DPRINTF((">>>> returning %d\n", match_block.errorcode));
04349 
04350 return match_block.errorcode;
04351 }
04352 
04353 #ifdef __cplusplus
04354 }
04355 #endif
04356 
04357 /* End of pcre.c */

Generated on Fri Nov 8 02:45:21 2024 for VMD (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002