source: python/vendor/Python-2.7.6/Modules/_sre.c

Last change on this file was 388, checked in by dmik, 11 years ago

python: Update vendor to 2.7.6.

  • Property svn:eol-style set to native
File size: 115.9 KB
Line 
1/*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5 *
6 * partial history:
7 * 1999-10-24 fl created (based on existing template matcher code)
8 * 2000-03-06 fl first alpha, sort of
9 * 2000-08-01 fl fixes for 1.6b1
10 * 2000-08-07 fl use PyOS_CheckStack() if available
11 * 2000-09-20 fl added expand method
12 * 2001-03-20 fl lots of fixes for 2.1b2
13 * 2001-04-15 fl export copyright as Python attribute, not global
14 * 2001-04-28 fl added __copy__ methods (work in progress)
15 * 2001-05-14 fl fixes for 1.5.2 compatibility
16 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
17 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
18 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
19 * 2001-10-21 fl added sub/subn primitive
20 * 2001-10-24 fl added finditer primitive (for 2.2 only)
21 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
22 * 2002-11-09 fl fixed empty sub/subn return type
23 * 2003-04-18 mvl fully support 4-byte codes
24 * 2003-10-17 gn implemented non recursive scheme
25 *
26 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
27 *
28 * This version of the SRE library can be redistributed under CNRI's
29 * Python 1.6 license. For any other use, please contact Secret Labs
30 * AB (info@pythonware.com).
31 *
32 * Portions of this engine have been developed in cooperation with
33 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
34 * other compatibility work.
35 */
36
37#ifndef SRE_RECURSIVE
38
39static char copyright[] =
40 " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
41
42#define PY_SSIZE_T_CLEAN
43
44#include "Python.h"
45#include "structmember.h" /* offsetof */
46
47#include "sre.h"
48
49#include <ctype.h>
50
51/* name of this module, minus the leading underscore */
52#if !defined(SRE_MODULE)
53#define SRE_MODULE "sre"
54#endif
55
56#define SRE_PY_MODULE "re"
57
58/* defining this one enables tracing */
59#undef VERBOSE
60
61#if PY_VERSION_HEX >= 0x01060000
62#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
63/* defining this enables unicode support (default under 1.6a1 and later) */
64#define HAVE_UNICODE
65#endif
66#endif
67
68/* -------------------------------------------------------------------- */
69/* optional features */
70
71/* enables fast searching */
72#define USE_FAST_SEARCH
73
74/* enables aggressive inlining (always on for Visual C) */
75#undef USE_INLINE
76
77/* enables copy/deepcopy handling (work in progress) */
78#undef USE_BUILTIN_COPY
79
80#if PY_VERSION_HEX < 0x01060000
81#define PyObject_DEL(op) PyMem_DEL((op))
82#endif
83
84/* -------------------------------------------------------------------- */
85
86#if defined(_MSC_VER)
87#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
88#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
89/* fastest possible local call under MSVC */
90#define LOCAL(type) static __inline type __fastcall
91#elif defined(USE_INLINE)
92#define LOCAL(type) static inline type
93#else
94#define LOCAL(type) static type
95#endif
96
97/* error codes */
98#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
99#define SRE_ERROR_STATE -2 /* illegal state */
100#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
101#define SRE_ERROR_MEMORY -9 /* out of memory */
102#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
103
104#if defined(VERBOSE)
105#define TRACE(v) printf v
106#else
107#define TRACE(v)
108#endif
109
110/* -------------------------------------------------------------------- */
111/* search engine state */
112
113/* default character predicates (run sre_chars.py to regenerate tables) */
114
115#define SRE_DIGIT_MASK 1
116#define SRE_SPACE_MASK 2
117#define SRE_LINEBREAK_MASK 4
118#define SRE_ALNUM_MASK 8
119#define SRE_WORD_MASK 16
120
121/* FIXME: this assumes ASCII. create tables in init_sre() instead */
122
123static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1242, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1250, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
12625, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12724, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1280, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
12924, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
130
131static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
13210, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
13327, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
13444, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
13561, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
136108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
137122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
138106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
139120, 121, 122, 123, 124, 125, 126, 127 };
140
141#define SRE_IS_DIGIT(ch)\
142 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
143#define SRE_IS_SPACE(ch)\
144 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
145#define SRE_IS_LINEBREAK(ch)\
146 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
147#define SRE_IS_ALNUM(ch)\
148 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
149#define SRE_IS_WORD(ch)\
150 ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
151
152static unsigned int sre_lower(unsigned int ch)
153{
154 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
155}
156
157/* locale-specific character predicates */
158/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
159 * warnings when c's type supports only numbers < N+1 */
160#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
161#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
162#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
163#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
164#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
165
166static unsigned int sre_lower_locale(unsigned int ch)
167{
168 return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
169}
170
171/* unicode-specific character predicates */
172
173#if defined(HAVE_UNICODE)
174
175#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
176#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
177#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
178#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
179#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
180
181static unsigned int sre_lower_unicode(unsigned int ch)
182{
183 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
184}
185
186#endif
187
188LOCAL(int)
189sre_category(SRE_CODE category, unsigned int ch)
190{
191 switch (category) {
192
193 case SRE_CATEGORY_DIGIT:
194 return SRE_IS_DIGIT(ch);
195 case SRE_CATEGORY_NOT_DIGIT:
196 return !SRE_IS_DIGIT(ch);
197 case SRE_CATEGORY_SPACE:
198 return SRE_IS_SPACE(ch);
199 case SRE_CATEGORY_NOT_SPACE:
200 return !SRE_IS_SPACE(ch);
201 case SRE_CATEGORY_WORD:
202 return SRE_IS_WORD(ch);
203 case SRE_CATEGORY_NOT_WORD:
204 return !SRE_IS_WORD(ch);
205 case SRE_CATEGORY_LINEBREAK:
206 return SRE_IS_LINEBREAK(ch);
207 case SRE_CATEGORY_NOT_LINEBREAK:
208 return !SRE_IS_LINEBREAK(ch);
209
210 case SRE_CATEGORY_LOC_WORD:
211 return SRE_LOC_IS_WORD(ch);
212 case SRE_CATEGORY_LOC_NOT_WORD:
213 return !SRE_LOC_IS_WORD(ch);
214
215#if defined(HAVE_UNICODE)
216 case SRE_CATEGORY_UNI_DIGIT:
217 return SRE_UNI_IS_DIGIT(ch);
218 case SRE_CATEGORY_UNI_NOT_DIGIT:
219 return !SRE_UNI_IS_DIGIT(ch);
220 case SRE_CATEGORY_UNI_SPACE:
221 return SRE_UNI_IS_SPACE(ch);
222 case SRE_CATEGORY_UNI_NOT_SPACE:
223 return !SRE_UNI_IS_SPACE(ch);
224 case SRE_CATEGORY_UNI_WORD:
225 return SRE_UNI_IS_WORD(ch);
226 case SRE_CATEGORY_UNI_NOT_WORD:
227 return !SRE_UNI_IS_WORD(ch);
228 case SRE_CATEGORY_UNI_LINEBREAK:
229 return SRE_UNI_IS_LINEBREAK(ch);
230 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
231 return !SRE_UNI_IS_LINEBREAK(ch);
232#else
233 case SRE_CATEGORY_UNI_DIGIT:
234 return SRE_IS_DIGIT(ch);
235 case SRE_CATEGORY_UNI_NOT_DIGIT:
236 return !SRE_IS_DIGIT(ch);
237 case SRE_CATEGORY_UNI_SPACE:
238 return SRE_IS_SPACE(ch);
239 case SRE_CATEGORY_UNI_NOT_SPACE:
240 return !SRE_IS_SPACE(ch);
241 case SRE_CATEGORY_UNI_WORD:
242 return SRE_LOC_IS_WORD(ch);
243 case SRE_CATEGORY_UNI_NOT_WORD:
244 return !SRE_LOC_IS_WORD(ch);
245 case SRE_CATEGORY_UNI_LINEBREAK:
246 return SRE_IS_LINEBREAK(ch);
247 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
248 return !SRE_IS_LINEBREAK(ch);
249#endif
250 }
251 return 0;
252}
253
254/* helpers */
255
256static void
257data_stack_dealloc(SRE_STATE* state)
258{
259 if (state->data_stack) {
260 PyMem_FREE(state->data_stack);
261 state->data_stack = NULL;
262 }
263 state->data_stack_size = state->data_stack_base = 0;
264}
265
266static int
267data_stack_grow(SRE_STATE* state, Py_ssize_t size)
268{
269 Py_ssize_t minsize, cursize;
270 minsize = state->data_stack_base+size;
271 cursize = state->data_stack_size;
272 if (cursize < minsize) {
273 void* stack;
274 cursize = minsize+minsize/4+1024;
275 TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
276 stack = PyMem_REALLOC(state->data_stack, cursize);
277 if (!stack) {
278 data_stack_dealloc(state);
279 return SRE_ERROR_MEMORY;
280 }
281 state->data_stack = (char *)stack;
282 state->data_stack_size = cursize;
283 }
284 return 0;
285}
286
287/* generate 8-bit version */
288
289#define SRE_CHAR unsigned char
290#define SRE_AT sre_at
291#define SRE_COUNT sre_count
292#define SRE_CHARSET sre_charset
293#define SRE_INFO sre_info
294#define SRE_MATCH sre_match
295#define SRE_MATCH_CONTEXT sre_match_context
296#define SRE_SEARCH sre_search
297#define SRE_LITERAL_TEMPLATE sre_literal_template
298
299#if defined(HAVE_UNICODE)
300
301#define SRE_RECURSIVE
302#include "_sre.c"
303#undef SRE_RECURSIVE
304
305#undef SRE_LITERAL_TEMPLATE
306#undef SRE_SEARCH
307#undef SRE_MATCH
308#undef SRE_MATCH_CONTEXT
309#undef SRE_INFO
310#undef SRE_CHARSET
311#undef SRE_COUNT
312#undef SRE_AT
313#undef SRE_CHAR
314
315/* generate 16-bit unicode version */
316
317#define SRE_CHAR Py_UNICODE
318#define SRE_AT sre_uat
319#define SRE_COUNT sre_ucount
320#define SRE_CHARSET sre_ucharset
321#define SRE_INFO sre_uinfo
322#define SRE_MATCH sre_umatch
323#define SRE_MATCH_CONTEXT sre_umatch_context
324#define SRE_SEARCH sre_usearch
325#define SRE_LITERAL_TEMPLATE sre_uliteral_template
326#endif
327
328#endif /* SRE_RECURSIVE */
329
330/* -------------------------------------------------------------------- */
331/* String matching engine */
332
333/* the following section is compiled twice, with different character
334 settings */
335
336LOCAL(int)
337SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
338{
339 /* check if pointer is at given position */
340
341 Py_ssize_t thisp, thatp;
342
343 switch (at) {
344
345 case SRE_AT_BEGINNING:
346 case SRE_AT_BEGINNING_STRING:
347 return ((void*) ptr == state->beginning);
348
349 case SRE_AT_BEGINNING_LINE:
350 return ((void*) ptr == state->beginning ||
351 SRE_IS_LINEBREAK((int) ptr[-1]));
352
353 case SRE_AT_END:
354 return (((void*) (ptr+1) == state->end &&
355 SRE_IS_LINEBREAK((int) ptr[0])) ||
356 ((void*) ptr == state->end));
357
358 case SRE_AT_END_LINE:
359 return ((void*) ptr == state->end ||
360 SRE_IS_LINEBREAK((int) ptr[0]));
361
362 case SRE_AT_END_STRING:
363 return ((void*) ptr == state->end);
364
365 case SRE_AT_BOUNDARY:
366 if (state->beginning == state->end)
367 return 0;
368 thatp = ((void*) ptr > state->beginning) ?
369 SRE_IS_WORD((int) ptr[-1]) : 0;
370 thisp = ((void*) ptr < state->end) ?
371 SRE_IS_WORD((int) ptr[0]) : 0;
372 return thisp != thatp;
373
374 case SRE_AT_NON_BOUNDARY:
375 if (state->beginning == state->end)
376 return 0;
377 thatp = ((void*) ptr > state->beginning) ?
378 SRE_IS_WORD((int) ptr[-1]) : 0;
379 thisp = ((void*) ptr < state->end) ?
380 SRE_IS_WORD((int) ptr[0]) : 0;
381 return thisp == thatp;
382
383 case SRE_AT_LOC_BOUNDARY:
384 if (state->beginning == state->end)
385 return 0;
386 thatp = ((void*) ptr > state->beginning) ?
387 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
388 thisp = ((void*) ptr < state->end) ?
389 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
390 return thisp != thatp;
391
392 case SRE_AT_LOC_NON_BOUNDARY:
393 if (state->beginning == state->end)
394 return 0;
395 thatp = ((void*) ptr > state->beginning) ?
396 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
397 thisp = ((void*) ptr < state->end) ?
398 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
399 return thisp == thatp;
400
401#if defined(HAVE_UNICODE)
402 case SRE_AT_UNI_BOUNDARY:
403 if (state->beginning == state->end)
404 return 0;
405 thatp = ((void*) ptr > state->beginning) ?
406 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
407 thisp = ((void*) ptr < state->end) ?
408 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
409 return thisp != thatp;
410
411 case SRE_AT_UNI_NON_BOUNDARY:
412 if (state->beginning == state->end)
413 return 0;
414 thatp = ((void*) ptr > state->beginning) ?
415 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
416 thisp = ((void*) ptr < state->end) ?
417 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
418 return thisp == thatp;
419#endif
420
421 }
422
423 return 0;
424}
425
426LOCAL(int)
427SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
428{
429 /* check if character is a member of the given set */
430
431 int ok = 1;
432
433 for (;;) {
434 switch (*set++) {
435
436 case SRE_OP_FAILURE:
437 return !ok;
438
439 case SRE_OP_LITERAL:
440 /* <LITERAL> <code> */
441 if (ch == set[0])
442 return ok;
443 set++;
444 break;
445
446 case SRE_OP_CATEGORY:
447 /* <CATEGORY> <code> */
448 if (sre_category(set[0], (int) ch))
449 return ok;
450 set += 1;
451 break;
452
453 case SRE_OP_CHARSET:
454 if (sizeof(SRE_CODE) == 2) {
455 /* <CHARSET> <bitmap> (16 bits per code word) */
456 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
457 return ok;
458 set += 16;
459 }
460 else {
461 /* <CHARSET> <bitmap> (32 bits per code word) */
462 if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
463 return ok;
464 set += 8;
465 }
466 break;
467
468 case SRE_OP_RANGE:
469 /* <RANGE> <lower> <upper> */
470 if (set[0] <= ch && ch <= set[1])
471 return ok;
472 set += 2;
473 break;
474
475 case SRE_OP_NEGATE:
476 ok = !ok;
477 break;
478
479 case SRE_OP_BIGCHARSET:
480 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
481 {
482 Py_ssize_t count, block;
483 count = *(set++);
484
485 if (sizeof(SRE_CODE) == 2) {
486 block = ((unsigned char*)set)[ch >> 8];
487 set += 128;
488 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
489 return ok;
490 set += count*16;
491 }
492 else {
493 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
494 * warnings when c's type supports only numbers < N+1 */
495 if (!(ch & ~65535))
496 block = ((unsigned char*)set)[ch >> 8];
497 else
498 block = -1;
499 set += 64;
500 if (block >=0 &&
501 (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
502 return ok;
503 set += count*8;
504 }
505 break;
506 }
507
508 default:
509 /* internal error -- there's not much we can do about it
510 here, so let's just pretend it didn't match... */
511 return 0;
512 }
513 }
514}
515
516LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
517
518LOCAL(Py_ssize_t)
519SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
520{
521 SRE_CODE chr;
522 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
523 SRE_CHAR* end = (SRE_CHAR *)state->end;
524 Py_ssize_t i;
525
526 /* adjust end */
527 if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
528 end = ptr + maxcount;
529
530 switch (pattern[0]) {
531
532 case SRE_OP_IN:
533 /* repeated set */
534 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
535 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
536 ptr++;
537 break;
538
539 case SRE_OP_ANY:
540 /* repeated dot wildcard. */
541 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
542 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
543 ptr++;
544 break;
545
546 case SRE_OP_ANY_ALL:
547 /* repeated dot wildcard. skip to the end of the target
548 string, and backtrack from there */
549 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
550 ptr = end;
551 break;
552
553 case SRE_OP_LITERAL:
554 /* repeated literal */
555 chr = pattern[1];
556 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
557 while (ptr < end && (SRE_CODE) *ptr == chr)
558 ptr++;
559 break;
560
561 case SRE_OP_LITERAL_IGNORE:
562 /* repeated literal */
563 chr = pattern[1];
564 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
565 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
566 ptr++;
567 break;
568
569 case SRE_OP_NOT_LITERAL:
570 /* repeated non-literal */
571 chr = pattern[1];
572 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
573 while (ptr < end && (SRE_CODE) *ptr != chr)
574 ptr++;
575 break;
576
577 case SRE_OP_NOT_LITERAL_IGNORE:
578 /* repeated non-literal */
579 chr = pattern[1];
580 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
581 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
582 ptr++;
583 break;
584
585 default:
586 /* repeated single character pattern */
587 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
588 while ((SRE_CHAR*) state->ptr < end) {
589 i = SRE_MATCH(state, pattern);
590 if (i < 0)
591 return i;
592 if (!i)
593 break;
594 }
595 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
596 (SRE_CHAR*) state->ptr - ptr));
597 return (SRE_CHAR*) state->ptr - ptr;
598 }
599
600 TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
601 ptr - (SRE_CHAR*) state->ptr));
602 return ptr - (SRE_CHAR*) state->ptr;
603}
604
605#if 0 /* not used in this release */
606LOCAL(int)
607SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
608{
609 /* check if an SRE_OP_INFO block matches at the current position.
610 returns the number of SRE_CODE objects to skip if successful, 0
611 if no match */
612
613 SRE_CHAR* end = state->end;
614 SRE_CHAR* ptr = state->ptr;
615 Py_ssize_t i;
616
617 /* check minimal length */
618 if (pattern[3] && (end - ptr) < pattern[3])
619 return 0;
620
621 /* check known prefix */
622 if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
623 /* <length> <skip> <prefix data> <overlap data> */
624 for (i = 0; i < pattern[5]; i++)
625 if ((SRE_CODE) ptr[i] != pattern[7 + i])
626 return 0;
627 return pattern[0] + 2 * pattern[6];
628 }
629 return pattern[0];
630}
631#endif
632
633/* The macros below should be used to protect recursive SRE_MATCH()
634 * calls that *failed* and do *not* return immediately (IOW, those
635 * that will backtrack). Explaining:
636 *
637 * - Recursive SRE_MATCH() returned true: that's usually a success
638 * (besides atypical cases like ASSERT_NOT), therefore there's no
639 * reason to restore lastmark;
640 *
641 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
642 * is returning to the caller: If the current SRE_MATCH() is the
643 * top function of the recursion, returning false will be a matching
644 * failure, and it doesn't matter where lastmark is pointing to.
645 * If it's *not* the top function, it will be a recursive SRE_MATCH()
646 * failure by itself, and the calling SRE_MATCH() will have to deal
647 * with the failure by the same rules explained here (it will restore
648 * lastmark by itself if necessary);
649 *
650 * - Recursive SRE_MATCH() returned false, and will continue the
651 * outside 'for' loop: must be protected when breaking, since the next
652 * OP could potentially depend on lastmark;
653 *
654 * - Recursive SRE_MATCH() returned false, and will be called again
655 * inside a local for/while loop: must be protected between each
656 * loop iteration, since the recursive SRE_MATCH() could do anything,
657 * and could potentially depend on lastmark.
658 *
659 * For more information, check the discussion at SF patch #712900.
660 */
661#define LASTMARK_SAVE() \
662 do { \
663 ctx->lastmark = state->lastmark; \
664 ctx->lastindex = state->lastindex; \
665 } while (0)
666#define LASTMARK_RESTORE() \
667 do { \
668 state->lastmark = ctx->lastmark; \
669 state->lastindex = ctx->lastindex; \
670 } while (0)
671
672#define RETURN_ERROR(i) do { return i; } while(0)
673#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
674#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
675
676#define RETURN_ON_ERROR(i) \
677 do { if (i < 0) RETURN_ERROR(i); } while (0)
678#define RETURN_ON_SUCCESS(i) \
679 do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
680#define RETURN_ON_FAILURE(i) \
681 do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
682
683#define SFY(x) #x
684
685#define DATA_STACK_ALLOC(state, type, ptr) \
686do { \
687 alloc_pos = state->data_stack_base; \
688 TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
689 "(%" PY_FORMAT_SIZE_T "d)\n", \
690 SFY(type), alloc_pos, sizeof(type))); \
691 if (sizeof(type) > state->data_stack_size - alloc_pos) { \
692 int j = data_stack_grow(state, sizeof(type)); \
693 if (j < 0) return j; \
694 if (ctx_pos != -1) \
695 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
696 } \
697 ptr = (type*)(state->data_stack+alloc_pos); \
698 state->data_stack_base += sizeof(type); \
699} while (0)
700
701#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
702do { \
703 TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
704 ptr = (type*)(state->data_stack+pos); \
705} while (0)
706
707#define DATA_STACK_PUSH(state, data, size) \
708do { \
709 TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
710 "(%" PY_FORMAT_SIZE_T "d)\n", \
711 data, state->data_stack_base, size)); \
712 if (size > state->data_stack_size - state->data_stack_base) { \
713 int j = data_stack_grow(state, size); \
714 if (j < 0) return j; \
715 if (ctx_pos != -1) \
716 DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
717 } \
718 memcpy(state->data_stack+state->data_stack_base, data, size); \
719 state->data_stack_base += size; \
720} while (0)
721
722#define DATA_STACK_POP(state, data, size, discard) \
723do { \
724 TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
725 "(%" PY_FORMAT_SIZE_T "d)\n", \
726 data, state->data_stack_base-size, size)); \
727 memcpy(data, state->data_stack+state->data_stack_base-size, size); \
728 if (discard) \
729 state->data_stack_base -= size; \
730} while (0)
731
732#define DATA_STACK_POP_DISCARD(state, size) \
733do { \
734 TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
735 "(%" PY_FORMAT_SIZE_T "d)\n", \
736 state->data_stack_base-size, size)); \
737 state->data_stack_base -= size; \
738} while(0)
739
740#define DATA_PUSH(x) \
741 DATA_STACK_PUSH(state, (x), sizeof(*(x)))
742#define DATA_POP(x) \
743 DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
744#define DATA_POP_DISCARD(x) \
745 DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
746#define DATA_ALLOC(t,p) \
747 DATA_STACK_ALLOC(state, t, p)
748#define DATA_LOOKUP_AT(t,p,pos) \
749 DATA_STACK_LOOKUP_AT(state,t,p,pos)
750
751#define MARK_PUSH(lastmark) \
752 do if (lastmark > 0) { \
753 i = lastmark; /* ctx->lastmark may change if reallocated */ \
754 DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
755 } while (0)
756#define MARK_POP(lastmark) \
757 do if (lastmark > 0) { \
758 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
759 } while (0)
760#define MARK_POP_KEEP(lastmark) \
761 do if (lastmark > 0) { \
762 DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
763 } while (0)
764#define MARK_POP_DISCARD(lastmark) \
765 do if (lastmark > 0) { \
766 DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
767 } while (0)
768
769#define JUMP_NONE 0
770#define JUMP_MAX_UNTIL_1 1
771#define JUMP_MAX_UNTIL_2 2
772#define JUMP_MAX_UNTIL_3 3
773#define JUMP_MIN_UNTIL_1 4
774#define JUMP_MIN_UNTIL_2 5
775#define JUMP_MIN_UNTIL_3 6
776#define JUMP_REPEAT 7
777#define JUMP_REPEAT_ONE_1 8
778#define JUMP_REPEAT_ONE_2 9
779#define JUMP_MIN_REPEAT_ONE 10
780#define JUMP_BRANCH 11
781#define JUMP_ASSERT 12
782#define JUMP_ASSERT_NOT 13
783
784#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
785 DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
786 nextctx->last_ctx_pos = ctx_pos; \
787 nextctx->jump = jumpvalue; \
788 nextctx->pattern = nextpattern; \
789 ctx_pos = alloc_pos; \
790 ctx = nextctx; \
791 goto entrance; \
792 jumplabel: \
793 while (0) /* gcc doesn't like labels at end of scopes */ \
794
795typedef struct {
796 Py_ssize_t last_ctx_pos;
797 Py_ssize_t jump;
798 SRE_CHAR* ptr;
799 SRE_CODE* pattern;
800 Py_ssize_t count;
801 Py_ssize_t lastmark;
802 Py_ssize_t lastindex;
803 union {
804 SRE_CODE chr;
805 SRE_REPEAT* rep;
806 } u;
807} SRE_MATCH_CONTEXT;
808
809/* check if string matches the given pattern. returns <0 for
810 error, 0 for failure, and 1 for success */
811LOCAL(Py_ssize_t)
812SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
813{
814 SRE_CHAR* end = (SRE_CHAR *)state->end;
815 Py_ssize_t alloc_pos, ctx_pos = -1;
816 Py_ssize_t i, ret = 0;
817 Py_ssize_t jump;
818 unsigned int sigcount=0;
819
820 SRE_MATCH_CONTEXT* ctx;
821 SRE_MATCH_CONTEXT* nextctx;
822
823 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
824
825 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
826 ctx->last_ctx_pos = -1;
827 ctx->jump = JUMP_NONE;
828 ctx->pattern = pattern;
829 ctx_pos = alloc_pos;
830
831entrance:
832
833 ctx->ptr = (SRE_CHAR *)state->ptr;
834
835 if (ctx->pattern[0] == SRE_OP_INFO) {
836 /* optimization info block */
837 /* <INFO> <1=skip> <2=flags> <3=min> ... */
838 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
839 TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
840 "need %" PY_FORMAT_SIZE_T "d)\n",
841 (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
842 RETURN_FAILURE;
843 }
844 ctx->pattern += ctx->pattern[1] + 1;
845 }
846
847 for (;;) {
848 ++sigcount;
849 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
850 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
851
852 switch (*ctx->pattern++) {
853
854 case SRE_OP_MARK:
855 /* set mark */
856 /* <MARK> <gid> */
857 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
858 ctx->ptr, ctx->pattern[0]));
859 i = ctx->pattern[0];
860 if (i & 1)
861 state->lastindex = i/2 + 1;
862 if (i > state->lastmark) {
863 /* state->lastmark is the highest valid index in the
864 state->mark array. If it is increased by more than 1,
865 the intervening marks must be set to NULL to signal
866 that these marks have not been encountered. */
867 Py_ssize_t j = state->lastmark + 1;
868 while (j < i)
869 state->mark[j++] = NULL;
870 state->lastmark = i;
871 }
872 state->mark[i] = ctx->ptr;
873 ctx->pattern++;
874 break;
875
876 case SRE_OP_LITERAL:
877 /* match literal string */
878 /* <LITERAL> <code> */
879 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
880 ctx->ptr, *ctx->pattern));
881 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
882 RETURN_FAILURE;
883 ctx->pattern++;
884 ctx->ptr++;
885 break;
886
887 case SRE_OP_NOT_LITERAL:
888 /* match anything that is not literal character */
889 /* <NOT_LITERAL> <code> */
890 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
891 ctx->ptr, *ctx->pattern));
892 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
893 RETURN_FAILURE;
894 ctx->pattern++;
895 ctx->ptr++;
896 break;
897
898 case SRE_OP_SUCCESS:
899 /* end of pattern */
900 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
901 state->ptr = ctx->ptr;
902 RETURN_SUCCESS;
903
904 case SRE_OP_AT:
905 /* match at given position */
906 /* <AT> <code> */
907 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
908 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
909 RETURN_FAILURE;
910 ctx->pattern++;
911 break;
912
913 case SRE_OP_CATEGORY:
914 /* match at given category */
915 /* <CATEGORY> <code> */
916 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
917 ctx->ptr, *ctx->pattern));
918 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
919 RETURN_FAILURE;
920 ctx->pattern++;
921 ctx->ptr++;
922 break;
923
924 case SRE_OP_ANY:
925 /* match anything (except a newline) */
926 /* <ANY> */
927 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
928 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
929 RETURN_FAILURE;
930 ctx->ptr++;
931 break;
932
933 case SRE_OP_ANY_ALL:
934 /* match anything */
935 /* <ANY_ALL> */
936 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
937 if (ctx->ptr >= end)
938 RETURN_FAILURE;
939 ctx->ptr++;
940 break;
941
942 case SRE_OP_IN:
943 /* match set member (or non_member) */
944 /* <IN> <skip> <set> */
945 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
946 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
947 RETURN_FAILURE;
948 ctx->pattern += ctx->pattern[0];
949 ctx->ptr++;
950 break;
951
952 case SRE_OP_LITERAL_IGNORE:
953 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
954 ctx->pattern, ctx->ptr, ctx->pattern[0]));
955 if (ctx->ptr >= end ||
956 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
957 RETURN_FAILURE;
958 ctx->pattern++;
959 ctx->ptr++;
960 break;
961
962 case SRE_OP_NOT_LITERAL_IGNORE:
963 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
964 ctx->pattern, ctx->ptr, *ctx->pattern));
965 if (ctx->ptr >= end ||
966 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
967 RETURN_FAILURE;
968 ctx->pattern++;
969 ctx->ptr++;
970 break;
971
972 case SRE_OP_IN_IGNORE:
973 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
974 if (ctx->ptr >= end
975 || !SRE_CHARSET(ctx->pattern+1,
976 (SRE_CODE)state->lower(*ctx->ptr)))
977 RETURN_FAILURE;
978 ctx->pattern += ctx->pattern[0];
979 ctx->ptr++;
980 break;
981
982 case SRE_OP_JUMP:
983 case SRE_OP_INFO:
984 /* jump forward */
985 /* <JUMP> <offset> */
986 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
987 ctx->ptr, ctx->pattern[0]));
988 ctx->pattern += ctx->pattern[0];
989 break;
990
991 case SRE_OP_BRANCH:
992 /* alternation */
993 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
994 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
995 LASTMARK_SAVE();
996 ctx->u.rep = state->repeat;
997 if (ctx->u.rep)
998 MARK_PUSH(ctx->lastmark);
999 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1000 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1001 (ctx->ptr >= end ||
1002 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1003 continue;
1004 if (ctx->pattern[1] == SRE_OP_IN &&
1005 (ctx->ptr >= end ||
1006 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1007 continue;
1008 state->ptr = ctx->ptr;
1009 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1010 if (ret) {
1011 if (ctx->u.rep)
1012 MARK_POP_DISCARD(ctx->lastmark);
1013 RETURN_ON_ERROR(ret);
1014 RETURN_SUCCESS;
1015 }
1016 if (ctx->u.rep)
1017 MARK_POP_KEEP(ctx->lastmark);
1018 LASTMARK_RESTORE();
1019 }
1020 if (ctx->u.rep)
1021 MARK_POP_DISCARD(ctx->lastmark);
1022 RETURN_FAILURE;
1023
1024 case SRE_OP_REPEAT_ONE:
1025 /* match repeated sequence (maximizing regexp) */
1026
1027 /* this operator only works if the repeated item is
1028 exactly one character wide, and we're not already
1029 collecting backtracking points. for other cases,
1030 use the MAX_REPEAT operator */
1031
1032 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1033
1034 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1035 ctx->pattern[1], ctx->pattern[2]));
1036
1037 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1038 RETURN_FAILURE; /* cannot match */
1039
1040 state->ptr = ctx->ptr;
1041
1042 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1043 RETURN_ON_ERROR(ret);
1044 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1045 ctx->count = ret;
1046 ctx->ptr += ctx->count;
1047
1048 /* when we arrive here, count contains the number of
1049 matches, and ctx->ptr points to the tail of the target
1050 string. check if the rest of the pattern matches,
1051 and backtrack if not. */
1052
1053 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1054 RETURN_FAILURE;
1055
1056 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1057 /* tail is empty. we're finished */
1058 state->ptr = ctx->ptr;
1059 RETURN_SUCCESS;
1060 }
1061
1062 LASTMARK_SAVE();
1063
1064 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1065 /* tail starts with a literal. skip positions where
1066 the rest of the pattern cannot possibly match */
1067 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1068 for (;;) {
1069 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1070 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1071 ctx->ptr--;
1072 ctx->count--;
1073 }
1074 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1075 break;
1076 state->ptr = ctx->ptr;
1077 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1078 ctx->pattern+ctx->pattern[0]);
1079 if (ret) {
1080 RETURN_ON_ERROR(ret);
1081 RETURN_SUCCESS;
1082 }
1083
1084 LASTMARK_RESTORE();
1085
1086 ctx->ptr--;
1087 ctx->count--;
1088 }
1089
1090 } else {
1091 /* general case */
1092 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1093 state->ptr = ctx->ptr;
1094 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1095 ctx->pattern+ctx->pattern[0]);
1096 if (ret) {
1097 RETURN_ON_ERROR(ret);
1098 RETURN_SUCCESS;
1099 }
1100 ctx->ptr--;
1101 ctx->count--;
1102 LASTMARK_RESTORE();
1103 }
1104 }
1105 RETURN_FAILURE;
1106
1107 case SRE_OP_MIN_REPEAT_ONE:
1108 /* match repeated sequence (minimizing regexp) */
1109
1110 /* this operator only works if the repeated item is
1111 exactly one character wide, and we're not already
1112 collecting backtracking points. for other cases,
1113 use the MIN_REPEAT operator */
1114
1115 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1116
1117 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1118 ctx->pattern[1], ctx->pattern[2]));
1119
1120 if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
1121 RETURN_FAILURE; /* cannot match */
1122
1123 state->ptr = ctx->ptr;
1124
1125 if (ctx->pattern[1] == 0)
1126 ctx->count = 0;
1127 else {
1128 /* count using pattern min as the maximum */
1129 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1130 RETURN_ON_ERROR(ret);
1131 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1132 if (ret < (Py_ssize_t) ctx->pattern[1])
1133 /* didn't match minimum number of times */
1134 RETURN_FAILURE;
1135 /* advance past minimum matches of repeat */
1136 ctx->count = ret;
1137 ctx->ptr += ctx->count;
1138 }
1139
1140 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1141 /* tail is empty. we're finished */
1142 state->ptr = ctx->ptr;
1143 RETURN_SUCCESS;
1144
1145 } else {
1146 /* general case */
1147 LASTMARK_SAVE();
1148 while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
1149 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1150 state->ptr = ctx->ptr;
1151 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1152 ctx->pattern+ctx->pattern[0]);
1153 if (ret) {
1154 RETURN_ON_ERROR(ret);
1155 RETURN_SUCCESS;
1156 }
1157 state->ptr = ctx->ptr;
1158 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1159 RETURN_ON_ERROR(ret);
1160 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1161 if (ret == 0)
1162 break;
1163 assert(ret == 1);
1164 ctx->ptr++;
1165 ctx->count++;
1166 LASTMARK_RESTORE();
1167 }
1168 }
1169 RETURN_FAILURE;
1170
1171 case SRE_OP_REPEAT:
1172 /* create repeat context. all the hard work is done
1173 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1174 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1175 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1176 ctx->pattern[1], ctx->pattern[2]));
1177
1178 /* install new repeat context */
1179 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1180 if (!ctx->u.rep) {
1181 PyErr_NoMemory();
1182 RETURN_FAILURE;
1183 }
1184 ctx->u.rep->count = -1;
1185 ctx->u.rep->pattern = ctx->pattern;
1186 ctx->u.rep->prev = state->repeat;
1187 ctx->u.rep->last_ptr = NULL;
1188 state->repeat = ctx->u.rep;
1189
1190 state->ptr = ctx->ptr;
1191 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1192 state->repeat = ctx->u.rep->prev;
1193 PyObject_FREE(ctx->u.rep);
1194
1195 if (ret) {
1196 RETURN_ON_ERROR(ret);
1197 RETURN_SUCCESS;
1198 }
1199 RETURN_FAILURE;
1200
1201 case SRE_OP_MAX_UNTIL:
1202 /* maximizing repeat */
1203 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1204
1205 /* FIXME: we probably need to deal with zero-width
1206 matches in here... */
1207
1208 ctx->u.rep = state->repeat;
1209 if (!ctx->u.rep)
1210 RETURN_ERROR(SRE_ERROR_STATE);
1211
1212 state->ptr = ctx->ptr;
1213
1214 ctx->count = ctx->u.rep->count+1;
1215
1216 TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1217 ctx->ptr, ctx->count));
1218
1219 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1220 /* not enough matches */
1221 ctx->u.rep->count = ctx->count;
1222 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1223 ctx->u.rep->pattern+3);
1224 if (ret) {
1225 RETURN_ON_ERROR(ret);
1226 RETURN_SUCCESS;
1227 }
1228 ctx->u.rep->count = ctx->count-1;
1229 state->ptr = ctx->ptr;
1230 RETURN_FAILURE;
1231 }
1232
1233 if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
1234 ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
1235 state->ptr != ctx->u.rep->last_ptr) {
1236 /* we may have enough matches, but if we can
1237 match another item, do so */
1238 ctx->u.rep->count = ctx->count;
1239 LASTMARK_SAVE();
1240 MARK_PUSH(ctx->lastmark);
1241 /* zero-width match protection */
1242 DATA_PUSH(&ctx->u.rep->last_ptr);
1243 ctx->u.rep->last_ptr = state->ptr;
1244 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1245 ctx->u.rep->pattern+3);
1246 DATA_POP(&ctx->u.rep->last_ptr);
1247 if (ret) {
1248 MARK_POP_DISCARD(ctx->lastmark);
1249 RETURN_ON_ERROR(ret);
1250 RETURN_SUCCESS;
1251 }
1252 MARK_POP(ctx->lastmark);
1253 LASTMARK_RESTORE();
1254 ctx->u.rep->count = ctx->count-1;
1255 state->ptr = ctx->ptr;
1256 }
1257
1258 /* cannot match more repeated items here. make sure the
1259 tail matches */
1260 state->repeat = ctx->u.rep->prev;
1261 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1262 RETURN_ON_SUCCESS(ret);
1263 state->repeat = ctx->u.rep;
1264 state->ptr = ctx->ptr;
1265 RETURN_FAILURE;
1266
1267 case SRE_OP_MIN_UNTIL:
1268 /* minimizing repeat */
1269 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1270
1271 ctx->u.rep = state->repeat;
1272 if (!ctx->u.rep)
1273 RETURN_ERROR(SRE_ERROR_STATE);
1274
1275 state->ptr = ctx->ptr;
1276
1277 ctx->count = ctx->u.rep->count+1;
1278
1279 TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
1280 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1281
1282 if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
1283 /* not enough matches */
1284 ctx->u.rep->count = ctx->count;
1285 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1286 ctx->u.rep->pattern+3);
1287 if (ret) {
1288 RETURN_ON_ERROR(ret);
1289 RETURN_SUCCESS;
1290 }
1291 ctx->u.rep->count = ctx->count-1;
1292 state->ptr = ctx->ptr;
1293 RETURN_FAILURE;
1294 }
1295
1296 LASTMARK_SAVE();
1297
1298 /* see if the tail matches */
1299 state->repeat = ctx->u.rep->prev;
1300 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1301 if (ret) {
1302 RETURN_ON_ERROR(ret);
1303 RETURN_SUCCESS;
1304 }
1305
1306 state->repeat = ctx->u.rep;
1307 state->ptr = ctx->ptr;
1308
1309 LASTMARK_RESTORE();
1310
1311 if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
1312 && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
1313 state->ptr == ctx->u.rep->last_ptr)
1314 RETURN_FAILURE;
1315
1316 ctx->u.rep->count = ctx->count;
1317 /* zero-width match protection */
1318 DATA_PUSH(&ctx->u.rep->last_ptr);
1319 ctx->u.rep->last_ptr = state->ptr;
1320 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1321 ctx->u.rep->pattern+3);
1322 DATA_POP(&ctx->u.rep->last_ptr);
1323 if (ret) {
1324 RETURN_ON_ERROR(ret);
1325 RETURN_SUCCESS;
1326 }
1327 ctx->u.rep->count = ctx->count-1;
1328 state->ptr = ctx->ptr;
1329 RETURN_FAILURE;
1330
1331 case SRE_OP_GROUPREF:
1332 /* match backreference */
1333 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1334 ctx->ptr, ctx->pattern[0]));
1335 i = ctx->pattern[0];
1336 {
1337 Py_ssize_t groupref = i+i;
1338 if (groupref >= state->lastmark) {
1339 RETURN_FAILURE;
1340 } else {
1341 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1342 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1343 if (!p || !e || e < p)
1344 RETURN_FAILURE;
1345 while (p < e) {
1346 if (ctx->ptr >= end || *ctx->ptr != *p)
1347 RETURN_FAILURE;
1348 p++; ctx->ptr++;
1349 }
1350 }
1351 }
1352 ctx->pattern++;
1353 break;
1354
1355 case SRE_OP_GROUPREF_IGNORE:
1356 /* match backreference */
1357 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1358 ctx->ptr, ctx->pattern[0]));
1359 i = ctx->pattern[0];
1360 {
1361 Py_ssize_t groupref = i+i;
1362 if (groupref >= state->lastmark) {
1363 RETURN_FAILURE;
1364 } else {
1365 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1366 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1367 if (!p || !e || e < p)
1368 RETURN_FAILURE;
1369 while (p < e) {
1370 if (ctx->ptr >= end ||
1371 state->lower(*ctx->ptr) != state->lower(*p))
1372 RETURN_FAILURE;
1373 p++; ctx->ptr++;
1374 }
1375 }
1376 }
1377 ctx->pattern++;
1378 break;
1379
1380 case SRE_OP_GROUPREF_EXISTS:
1381 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1382 ctx->ptr, ctx->pattern[0]));
1383 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1384 i = ctx->pattern[0];
1385 {
1386 Py_ssize_t groupref = i+i;
1387 if (groupref >= state->lastmark) {
1388 ctx->pattern += ctx->pattern[1];
1389 break;
1390 } else {
1391 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1392 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1393 if (!p || !e || e < p) {
1394 ctx->pattern += ctx->pattern[1];
1395 break;
1396 }
1397 }
1398 }
1399 ctx->pattern += 2;
1400 break;
1401
1402 case SRE_OP_ASSERT:
1403 /* assert subpattern */
1404 /* <ASSERT> <skip> <back> <pattern> */
1405 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1406 ctx->ptr, ctx->pattern[1]));
1407 state->ptr = ctx->ptr - ctx->pattern[1];
1408 if (state->ptr < state->beginning)
1409 RETURN_FAILURE;
1410 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1411 RETURN_ON_FAILURE(ret);
1412 ctx->pattern += ctx->pattern[0];
1413 break;
1414
1415 case SRE_OP_ASSERT_NOT:
1416 /* assert not subpattern */
1417 /* <ASSERT_NOT> <skip> <back> <pattern> */
1418 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1419 ctx->ptr, ctx->pattern[1]));
1420 state->ptr = ctx->ptr - ctx->pattern[1];
1421 if (state->ptr >= state->beginning) {
1422 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1423 if (ret) {
1424 RETURN_ON_ERROR(ret);
1425 RETURN_FAILURE;
1426 }
1427 }
1428 ctx->pattern += ctx->pattern[0];
1429 break;
1430
1431 case SRE_OP_FAILURE:
1432 /* immediate failure */
1433 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1434 RETURN_FAILURE;
1435
1436 default:
1437 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1438 ctx->pattern[-1]));
1439 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1440 }
1441 }
1442
1443exit:
1444 ctx_pos = ctx->last_ctx_pos;
1445 jump = ctx->jump;
1446 DATA_POP_DISCARD(ctx);
1447 if (ctx_pos == -1)
1448 return ret;
1449 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1450
1451 switch (jump) {
1452 case JUMP_MAX_UNTIL_2:
1453 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1454 goto jump_max_until_2;
1455 case JUMP_MAX_UNTIL_3:
1456 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1457 goto jump_max_until_3;
1458 case JUMP_MIN_UNTIL_2:
1459 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1460 goto jump_min_until_2;
1461 case JUMP_MIN_UNTIL_3:
1462 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1463 goto jump_min_until_3;
1464 case JUMP_BRANCH:
1465 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1466 goto jump_branch;
1467 case JUMP_MAX_UNTIL_1:
1468 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1469 goto jump_max_until_1;
1470 case JUMP_MIN_UNTIL_1:
1471 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1472 goto jump_min_until_1;
1473 case JUMP_REPEAT:
1474 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1475 goto jump_repeat;
1476 case JUMP_REPEAT_ONE_1:
1477 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1478 goto jump_repeat_one_1;
1479 case JUMP_REPEAT_ONE_2:
1480 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1481 goto jump_repeat_one_2;
1482 case JUMP_MIN_REPEAT_ONE:
1483 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1484 goto jump_min_repeat_one;
1485 case JUMP_ASSERT:
1486 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1487 goto jump_assert;
1488 case JUMP_ASSERT_NOT:
1489 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1490 goto jump_assert_not;
1491 case JUMP_NONE:
1492 TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
1493 ctx->ptr, ret));
1494 break;
1495 }
1496
1497 return ret; /* should never get here */
1498}
1499
1500LOCAL(Py_ssize_t)
1501SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1502{
1503 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1504 SRE_CHAR* end = (SRE_CHAR *)state->end;
1505 Py_ssize_t status = 0;
1506 Py_ssize_t prefix_len = 0;
1507 Py_ssize_t prefix_skip = 0;
1508 SRE_CODE* prefix = NULL;
1509 SRE_CODE* charset = NULL;
1510 SRE_CODE* overlap = NULL;
1511 int flags = 0;
1512
1513 if (pattern[0] == SRE_OP_INFO) {
1514 /* optimization info block */
1515 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1516
1517 flags = pattern[2];
1518
1519 if (pattern[3] > 1) {
1520 /* adjust end point (but make sure we leave at least one
1521 character in there, so literal search will work) */
1522 end -= pattern[3]-1;
1523 if (end <= ptr)
1524 end = ptr+1;
1525 }
1526
1527 if (flags & SRE_INFO_PREFIX) {
1528 /* pattern starts with a known prefix */
1529 /* <length> <skip> <prefix data> <overlap data> */
1530 prefix_len = pattern[5];
1531 prefix_skip = pattern[6];
1532 prefix = pattern + 7;
1533 overlap = prefix + prefix_len - 1;
1534 } else if (flags & SRE_INFO_CHARSET)
1535 /* pattern starts with a character from a known set */
1536 /* <charset> */
1537 charset = pattern + 5;
1538
1539 pattern += 1 + pattern[1];
1540 }
1541
1542 TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
1543 prefix, prefix_len, prefix_skip));
1544 TRACE(("charset = %p\n", charset));
1545
1546#if defined(USE_FAST_SEARCH)
1547 if (prefix_len > 1) {
1548 /* pattern starts with a known prefix. use the overlap
1549 table to skip forward as fast as we possibly can */
1550 Py_ssize_t i = 0;
1551 end = (SRE_CHAR *)state->end;
1552 while (ptr < end) {
1553 for (;;) {
1554 if ((SRE_CODE) ptr[0] != prefix[i]) {
1555 if (!i)
1556 break;
1557 else
1558 i = overlap[i];
1559 } else {
1560 if (++i == prefix_len) {
1561 /* found a potential match */
1562 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1563 state->start = ptr + 1 - prefix_len;
1564 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1565 if (flags & SRE_INFO_LITERAL)
1566 return 1; /* we got all of it */
1567 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1568 if (status != 0)
1569 return status;
1570 /* close but no cigar -- try again */
1571 i = overlap[i];
1572 }
1573 break;
1574 }
1575 }
1576 ptr++;
1577 }
1578 return 0;
1579 }
1580#endif
1581
1582 if (pattern[0] == SRE_OP_LITERAL) {
1583 /* pattern starts with a literal character. this is used
1584 for short prefixes, and if fast search is disabled */
1585 SRE_CODE chr = pattern[1];
1586 end = (SRE_CHAR *)state->end;
1587 for (;;) {
1588 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1589 ptr++;
1590 if (ptr >= end)
1591 return 0;
1592 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1593 state->start = ptr;
1594 state->ptr = ++ptr;
1595 if (flags & SRE_INFO_LITERAL)
1596 return 1; /* we got all of it */
1597 status = SRE_MATCH(state, pattern + 2);
1598 if (status != 0)
1599 break;
1600 }
1601 } else if (charset) {
1602 /* pattern starts with a character from a known set */
1603 end = (SRE_CHAR *)state->end;
1604 for (;;) {
1605 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1606 ptr++;
1607 if (ptr >= end)
1608 return 0;
1609 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1610 state->start = ptr;
1611 state->ptr = ptr;
1612 status = SRE_MATCH(state, pattern);
1613 if (status != 0)
1614 break;
1615 ptr++;
1616 }
1617 } else
1618 /* general case */
1619 while (ptr <= end) {
1620 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1621 state->start = state->ptr = ptr++;
1622 status = SRE_MATCH(state, pattern);
1623 if (status != 0)
1624 break;
1625 }
1626
1627 return status;
1628}
1629
1630LOCAL(int)
1631SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1632{
1633 /* check if given string is a literal template (i.e. no escapes) */
1634 while (len-- > 0)
1635 if (*ptr++ == '\\')
1636 return 0;
1637 return 1;
1638}
1639
1640#if !defined(SRE_RECURSIVE)
1641
1642/* -------------------------------------------------------------------- */
1643/* factories and destructors */
1644
1645/* see sre.h for object declarations */
1646static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1647static PyObject*pattern_scanner(PatternObject*, PyObject*);
1648
1649static PyObject *
1650sre_codesize(PyObject* self, PyObject *unused)
1651{
1652 return PyInt_FromSize_t(sizeof(SRE_CODE));
1653}
1654
1655static PyObject *
1656sre_getlower(PyObject* self, PyObject* args)
1657{
1658 int character, flags;
1659 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1660 return NULL;
1661 if (flags & SRE_FLAG_LOCALE)
1662 return Py_BuildValue("i", sre_lower_locale(character));
1663 if (flags & SRE_FLAG_UNICODE)
1664#if defined(HAVE_UNICODE)
1665 return Py_BuildValue("i", sre_lower_unicode(character));
1666#else
1667 return Py_BuildValue("i", sre_lower_locale(character));
1668#endif
1669 return Py_BuildValue("i", sre_lower(character));
1670}
1671
1672LOCAL(void)
1673state_reset(SRE_STATE* state)
1674{
1675 /* FIXME: dynamic! */
1676 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1677
1678 state->lastmark = -1;
1679 state->lastindex = -1;
1680
1681 state->repeat = NULL;
1682
1683 data_stack_dealloc(state);
1684}
1685
1686static void*
1687getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1688{
1689 /* given a python object, return a data pointer, a length (in
1690 characters), and a character size. return NULL if the object
1691 is not a string (or not compatible) */
1692
1693 PyBufferProcs *buffer;
1694 Py_ssize_t size, bytes;
1695 int charsize;
1696 void* ptr;
1697
1698#if defined(HAVE_UNICODE)
1699 if (PyUnicode_Check(string)) {
1700 /* unicode strings doesn't always support the buffer interface */
1701 ptr = (void*) PyUnicode_AS_DATA(string);
1702 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1703 size = PyUnicode_GET_SIZE(string);
1704 charsize = sizeof(Py_UNICODE);
1705
1706 } else {
1707#endif
1708
1709 /* get pointer to string buffer */
1710 buffer = Py_TYPE(string)->tp_as_buffer;
1711 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1712 buffer->bf_getsegcount(string, NULL) != 1) {
1713 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1714 return NULL;
1715 }
1716
1717 /* determine buffer size */
1718 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1719 if (bytes < 0) {
1720 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1721 return NULL;
1722 }
1723
1724 /* determine character size */
1725#if PY_VERSION_HEX >= 0x01060000
1726 size = PyObject_Size(string);
1727#else
1728 size = PyObject_Length(string);
1729#endif
1730
1731 if (PyString_Check(string) || bytes == size)
1732 charsize = 1;
1733#if defined(HAVE_UNICODE)
1734 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1735 charsize = sizeof(Py_UNICODE);
1736#endif
1737 else {
1738 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1739 return NULL;
1740 }
1741
1742#if defined(HAVE_UNICODE)
1743 }
1744#endif
1745
1746 *p_length = size;
1747 *p_charsize = charsize;
1748
1749 return ptr;
1750}
1751
1752LOCAL(PyObject*)
1753state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1754 Py_ssize_t start, Py_ssize_t end)
1755{
1756 /* prepare state object */
1757
1758 Py_ssize_t length;
1759 int charsize;
1760 void* ptr;
1761
1762 memset(state, 0, sizeof(SRE_STATE));
1763
1764 state->lastmark = -1;
1765 state->lastindex = -1;
1766
1767 ptr = getstring(string, &length, &charsize);
1768 if (!ptr)
1769 return NULL;
1770
1771 /* adjust boundaries */
1772 if (start < 0)
1773 start = 0;
1774 else if (start > length)
1775 start = length;
1776
1777 if (end < 0)
1778 end = 0;
1779 else if (end > length)
1780 end = length;
1781
1782 state->charsize = charsize;
1783
1784 state->beginning = ptr;
1785
1786 state->start = (void*) ((char*) ptr + start * state->charsize);
1787 state->end = (void*) ((char*) ptr + end * state->charsize);
1788
1789 Py_INCREF(string);
1790 state->string = string;
1791 state->pos = start;
1792 state->endpos = end;
1793
1794 if (pattern->flags & SRE_FLAG_LOCALE)
1795 state->lower = sre_lower_locale;
1796 else if (pattern->flags & SRE_FLAG_UNICODE)
1797#if defined(HAVE_UNICODE)
1798 state->lower = sre_lower_unicode;
1799#else
1800 state->lower = sre_lower_locale;
1801#endif
1802 else
1803 state->lower = sre_lower;
1804
1805 return string;
1806}
1807
1808LOCAL(void)
1809state_fini(SRE_STATE* state)
1810{
1811 Py_XDECREF(state->string);
1812 data_stack_dealloc(state);
1813}
1814
1815/* calculate offset from start of string */
1816#define STATE_OFFSET(state, member)\
1817 (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1818
1819LOCAL(PyObject*)
1820state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1821{
1822 Py_ssize_t i, j;
1823
1824 index = (index - 1) * 2;
1825
1826 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1827 if (empty)
1828 /* want empty string */
1829 i = j = 0;
1830 else {
1831 Py_INCREF(Py_None);
1832 return Py_None;
1833 }
1834 } else {
1835 i = STATE_OFFSET(state, state->mark[index]);
1836 j = STATE_OFFSET(state, state->mark[index+1]);
1837 }
1838
1839 return PySequence_GetSlice(string, i, j);
1840}
1841
1842static void
1843pattern_error(int status)
1844{
1845 switch (status) {
1846 case SRE_ERROR_RECURSION_LIMIT:
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "maximum recursion limit exceeded"
1850 );
1851 break;
1852 case SRE_ERROR_MEMORY:
1853 PyErr_NoMemory();
1854 break;
1855 case SRE_ERROR_INTERRUPTED:
1856 /* An exception has already been raised, so let it fly */
1857 break;
1858 default:
1859 /* other error codes indicate compiler/engine bugs */
1860 PyErr_SetString(
1861 PyExc_RuntimeError,
1862 "internal error in regular expression engine"
1863 );
1864 }
1865}
1866
1867static void
1868pattern_dealloc(PatternObject* self)
1869{
1870 if (self->weakreflist != NULL)
1871 PyObject_ClearWeakRefs((PyObject *) self);
1872 Py_XDECREF(self->pattern);
1873 Py_XDECREF(self->groupindex);
1874 Py_XDECREF(self->indexgroup);
1875 PyObject_DEL(self);
1876}
1877
1878static PyObject*
1879pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1880{
1881 SRE_STATE state;
1882 int status;
1883
1884 PyObject* string;
1885 Py_ssize_t start = 0;
1886 Py_ssize_t end = PY_SSIZE_T_MAX;
1887 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1888 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1889 &string, &start, &end))
1890 return NULL;
1891
1892 string = state_init(&state, self, string, start, end);
1893 if (!string)
1894 return NULL;
1895
1896 state.ptr = state.start;
1897
1898 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1899
1900 if (state.charsize == 1) {
1901 status = sre_match(&state, PatternObject_GetCode(self));
1902 } else {
1903#if defined(HAVE_UNICODE)
1904 status = sre_umatch(&state, PatternObject_GetCode(self));
1905#endif
1906 }
1907
1908 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1909 if (PyErr_Occurred())
1910 return NULL;
1911
1912 state_fini(&state);
1913
1914 return pattern_new_match(self, &state, status);
1915}
1916
1917static PyObject*
1918pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1919{
1920 SRE_STATE state;
1921 int status;
1922
1923 PyObject* string;
1924 Py_ssize_t start = 0;
1925 Py_ssize_t end = PY_SSIZE_T_MAX;
1926 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1927 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1928 &string, &start, &end))
1929 return NULL;
1930
1931 string = state_init(&state, self, string, start, end);
1932 if (!string)
1933 return NULL;
1934
1935 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1936
1937 if (state.charsize == 1) {
1938 status = sre_search(&state, PatternObject_GetCode(self));
1939 } else {
1940#if defined(HAVE_UNICODE)
1941 status = sre_usearch(&state, PatternObject_GetCode(self));
1942#endif
1943 }
1944
1945 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1946
1947 state_fini(&state);
1948
1949 if (PyErr_Occurred())
1950 return NULL;
1951
1952 return pattern_new_match(self, &state, status);
1953}
1954
1955static PyObject*
1956call(char* module, char* function, PyObject* args)
1957{
1958 PyObject* name;
1959 PyObject* mod;
1960 PyObject* func;
1961 PyObject* result;
1962
1963 if (!args)
1964 return NULL;
1965 name = PyString_FromString(module);
1966 if (!name)
1967 return NULL;
1968 mod = PyImport_Import(name);
1969 Py_DECREF(name);
1970 if (!mod)
1971 return NULL;
1972 func = PyObject_GetAttrString(mod, function);
1973 Py_DECREF(mod);
1974 if (!func)
1975 return NULL;
1976 result = PyObject_CallObject(func, args);
1977 Py_DECREF(func);
1978 Py_DECREF(args);
1979 return result;
1980}
1981
1982#ifdef USE_BUILTIN_COPY
1983static int
1984deepcopy(PyObject** object, PyObject* memo)
1985{
1986 PyObject* copy;
1987
1988 copy = call(
1989 "copy", "deepcopy",
1990 PyTuple_Pack(2, *object, memo)
1991 );
1992 if (!copy)
1993 return 0;
1994
1995 Py_DECREF(*object);
1996 *object = copy;
1997
1998 return 1; /* success */
1999}
2000#endif
2001
2002static PyObject*
2003join_list(PyObject* list, PyObject* string)
2004{
2005 /* join list elements */
2006
2007 PyObject* joiner;
2008#if PY_VERSION_HEX >= 0x01060000
2009 PyObject* function;
2010 PyObject* args;
2011#endif
2012 PyObject* result;
2013
2014 joiner = PySequence_GetSlice(string, 0, 0);
2015 if (!joiner)
2016 return NULL;
2017
2018 if (PyList_GET_SIZE(list) == 0) {
2019 Py_DECREF(list);
2020 return joiner;
2021 }
2022
2023#if PY_VERSION_HEX >= 0x01060000
2024 function = PyObject_GetAttrString(joiner, "join");
2025 if (!function) {
2026 Py_DECREF(joiner);
2027 return NULL;
2028 }
2029 args = PyTuple_New(1);
2030 if (!args) {
2031 Py_DECREF(function);
2032 Py_DECREF(joiner);
2033 return NULL;
2034 }
2035 PyTuple_SET_ITEM(args, 0, list);
2036 result = PyObject_CallObject(function, args);
2037 Py_DECREF(args); /* also removes list */
2038 Py_DECREF(function);
2039#else
2040 result = call(
2041 "string", "join",
2042 PyTuple_Pack(2, list, joiner)
2043 );
2044#endif
2045 Py_DECREF(joiner);
2046
2047 return result;
2048}
2049
2050static PyObject*
2051pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2052{
2053 SRE_STATE state;
2054 PyObject* list;
2055 int status;
2056 Py_ssize_t i, b, e;
2057
2058 PyObject* string;
2059 Py_ssize_t start = 0;
2060 Py_ssize_t end = PY_SSIZE_T_MAX;
2061 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2062 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2063 &string, &start, &end))
2064 return NULL;
2065
2066 string = state_init(&state, self, string, start, end);
2067 if (!string)
2068 return NULL;
2069
2070 list = PyList_New(0);
2071 if (!list) {
2072 state_fini(&state);
2073 return NULL;
2074 }
2075
2076 while (state.start <= state.end) {
2077
2078 PyObject* item;
2079
2080 state_reset(&state);
2081
2082 state.ptr = state.start;
2083
2084 if (state.charsize == 1) {
2085 status = sre_search(&state, PatternObject_GetCode(self));
2086 } else {
2087#if defined(HAVE_UNICODE)
2088 status = sre_usearch(&state, PatternObject_GetCode(self));
2089#endif
2090 }
2091
2092 if (PyErr_Occurred())
2093 goto error;
2094
2095 if (status <= 0) {
2096 if (status == 0)
2097 break;
2098 pattern_error(status);
2099 goto error;
2100 }
2101
2102 /* don't bother to build a match object */
2103 switch (self->groups) {
2104 case 0:
2105 b = STATE_OFFSET(&state, state.start);
2106 e = STATE_OFFSET(&state, state.ptr);
2107 item = PySequence_GetSlice(string, b, e);
2108 if (!item)
2109 goto error;
2110 break;
2111 case 1:
2112 item = state_getslice(&state, 1, string, 1);
2113 if (!item)
2114 goto error;
2115 break;
2116 default:
2117 item = PyTuple_New(self->groups);
2118 if (!item)
2119 goto error;
2120 for (i = 0; i < self->groups; i++) {
2121 PyObject* o = state_getslice(&state, i+1, string, 1);
2122 if (!o) {
2123 Py_DECREF(item);
2124 goto error;
2125 }
2126 PyTuple_SET_ITEM(item, i, o);
2127 }
2128 break;
2129 }
2130
2131 status = PyList_Append(list, item);
2132 Py_DECREF(item);
2133 if (status < 0)
2134 goto error;
2135
2136 if (state.ptr == state.start)
2137 state.start = (void*) ((char*) state.ptr + state.charsize);
2138 else
2139 state.start = state.ptr;
2140
2141 }
2142
2143 state_fini(&state);
2144 return list;
2145
2146error:
2147 Py_DECREF(list);
2148 state_fini(&state);
2149 return NULL;
2150
2151}
2152
2153#if PY_VERSION_HEX >= 0x02020000
2154static PyObject*
2155pattern_finditer(PatternObject* pattern, PyObject* args)
2156{
2157 PyObject* scanner;
2158 PyObject* search;
2159 PyObject* iterator;
2160
2161 scanner = pattern_scanner(pattern, args);
2162 if (!scanner)
2163 return NULL;
2164
2165 search = PyObject_GetAttrString(scanner, "search");
2166 Py_DECREF(scanner);
2167 if (!search)
2168 return NULL;
2169
2170 iterator = PyCallIter_New(search, Py_None);
2171 Py_DECREF(search);
2172
2173 return iterator;
2174}
2175#endif
2176
2177static PyObject*
2178pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2179{
2180 SRE_STATE state;
2181 PyObject* list;
2182 PyObject* item;
2183 int status;
2184 Py_ssize_t n;
2185 Py_ssize_t i;
2186 void* last;
2187
2188 PyObject* string;
2189 Py_ssize_t maxsplit = 0;
2190 static char* kwlist[] = { "source", "maxsplit", NULL };
2191 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2192 &string, &maxsplit))
2193 return NULL;
2194
2195 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2196 if (!string)
2197 return NULL;
2198
2199 list = PyList_New(0);
2200 if (!list) {
2201 state_fini(&state);
2202 return NULL;
2203 }
2204
2205 n = 0;
2206 last = state.start;
2207
2208 while (!maxsplit || n < maxsplit) {
2209
2210 state_reset(&state);
2211
2212 state.ptr = state.start;
2213
2214 if (state.charsize == 1) {
2215 status = sre_search(&state, PatternObject_GetCode(self));
2216 } else {
2217#if defined(HAVE_UNICODE)
2218 status = sre_usearch(&state, PatternObject_GetCode(self));
2219#endif
2220 }
2221
2222 if (PyErr_Occurred())
2223 goto error;
2224
2225 if (status <= 0) {
2226 if (status == 0)
2227 break;
2228 pattern_error(status);
2229 goto error;
2230 }
2231
2232 if (state.start == state.ptr) {
2233 if (last == state.end)
2234 break;
2235 /* skip one character */
2236 state.start = (void*) ((char*) state.ptr + state.charsize);
2237 continue;
2238 }
2239
2240 /* get segment before this match */
2241 item = PySequence_GetSlice(
2242 string, STATE_OFFSET(&state, last),
2243 STATE_OFFSET(&state, state.start)
2244 );
2245 if (!item)
2246 goto error;
2247 status = PyList_Append(list, item);
2248 Py_DECREF(item);
2249 if (status < 0)
2250 goto error;
2251
2252 /* add groups (if any) */
2253 for (i = 0; i < self->groups; i++) {
2254 item = state_getslice(&state, i+1, string, 0);
2255 if (!item)
2256 goto error;
2257 status = PyList_Append(list, item);
2258 Py_DECREF(item);
2259 if (status < 0)
2260 goto error;
2261 }
2262
2263 n = n + 1;
2264
2265 last = state.start = state.ptr;
2266
2267 }
2268
2269 /* get segment following last match (even if empty) */
2270 item = PySequence_GetSlice(
2271 string, STATE_OFFSET(&state, last), state.endpos
2272 );
2273 if (!item)
2274 goto error;
2275 status = PyList_Append(list, item);
2276 Py_DECREF(item);
2277 if (status < 0)
2278 goto error;
2279
2280 state_fini(&state);
2281 return list;
2282
2283error:
2284 Py_DECREF(list);
2285 state_fini(&state);
2286 return NULL;
2287
2288}
2289
2290static PyObject*
2291pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2292 Py_ssize_t count, Py_ssize_t subn)
2293{
2294 SRE_STATE state;
2295 PyObject* list;
2296 PyObject* item;
2297 PyObject* filter;
2298 PyObject* args;
2299 PyObject* match;
2300 void* ptr;
2301 int status;
2302 Py_ssize_t n;
2303 Py_ssize_t i, b, e;
2304 int bint;
2305 int filter_is_callable;
2306
2307 if (PyCallable_Check(ptemplate)) {
2308 /* sub/subn takes either a function or a template */
2309 filter = ptemplate;
2310 Py_INCREF(filter);
2311 filter_is_callable = 1;
2312 } else {
2313 /* if not callable, check if it's a literal string */
2314 int literal;
2315 ptr = getstring(ptemplate, &n, &bint);
2316 b = bint;
2317 if (ptr) {
2318 if (b == 1) {
2319 literal = sre_literal_template((unsigned char *)ptr, n);
2320 } else {
2321#if defined(HAVE_UNICODE)
2322 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2323#endif
2324 }
2325 } else {
2326 PyErr_Clear();
2327 literal = 0;
2328 }
2329 if (literal) {
2330 filter = ptemplate;
2331 Py_INCREF(filter);
2332 filter_is_callable = 0;
2333 } else {
2334 /* not a literal; hand it over to the template compiler */
2335 filter = call(
2336 SRE_PY_MODULE, "_subx",
2337 PyTuple_Pack(2, self, ptemplate)
2338 );
2339 if (!filter)
2340 return NULL;
2341 filter_is_callable = PyCallable_Check(filter);
2342 }
2343 }
2344
2345 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2346 if (!string) {
2347 Py_DECREF(filter);
2348 return NULL;
2349 }
2350
2351 list = PyList_New(0);
2352 if (!list) {
2353 Py_DECREF(filter);
2354 state_fini(&state);
2355 return NULL;
2356 }
2357
2358 n = i = 0;
2359
2360 while (!count || n < count) {
2361
2362 state_reset(&state);
2363
2364 state.ptr = state.start;
2365
2366 if (state.charsize == 1) {
2367 status = sre_search(&state, PatternObject_GetCode(self));
2368 } else {
2369#if defined(HAVE_UNICODE)
2370 status = sre_usearch(&state, PatternObject_GetCode(self));
2371#endif
2372 }
2373
2374 if (PyErr_Occurred())
2375 goto error;
2376
2377 if (status <= 0) {
2378 if (status == 0)
2379 break;
2380 pattern_error(status);
2381 goto error;
2382 }
2383
2384 b = STATE_OFFSET(&state, state.start);
2385 e = STATE_OFFSET(&state, state.ptr);
2386
2387 if (i < b) {
2388 /* get segment before this match */
2389 item = PySequence_GetSlice(string, i, b);
2390 if (!item)
2391 goto error;
2392 status = PyList_Append(list, item);
2393 Py_DECREF(item);
2394 if (status < 0)
2395 goto error;
2396
2397 } else if (i == b && i == e && n > 0)
2398 /* ignore empty match on latest position */
2399 goto next;
2400
2401 if (filter_is_callable) {
2402 /* pass match object through filter */
2403 match = pattern_new_match(self, &state, 1);
2404 if (!match)
2405 goto error;
2406 args = PyTuple_Pack(1, match);
2407 if (!args) {
2408 Py_DECREF(match);
2409 goto error;
2410 }
2411 item = PyObject_CallObject(filter, args);
2412 Py_DECREF(args);
2413 Py_DECREF(match);
2414 if (!item)
2415 goto error;
2416 } else {
2417 /* filter is literal string */
2418 item = filter;
2419 Py_INCREF(item);
2420 }
2421
2422 /* add to list */
2423 if (item != Py_None) {
2424 status = PyList_Append(list, item);
2425 Py_DECREF(item);
2426 if (status < 0)
2427 goto error;
2428 }
2429
2430 i = e;
2431 n = n + 1;
2432
2433next:
2434 /* move on */
2435 if (state.ptr == state.start)
2436 state.start = (void*) ((char*) state.ptr + state.charsize);
2437 else
2438 state.start = state.ptr;
2439
2440 }
2441
2442 /* get segment following last match */
2443 if (i < state.endpos) {
2444 item = PySequence_GetSlice(string, i, state.endpos);
2445 if (!item)
2446 goto error;
2447 status = PyList_Append(list, item);
2448 Py_DECREF(item);
2449 if (status < 0)
2450 goto error;
2451 }
2452
2453 state_fini(&state);
2454
2455 Py_DECREF(filter);
2456
2457 /* convert list to single string (also removes list) */
2458 item = join_list(list, string);
2459
2460 if (!item)
2461 return NULL;
2462
2463 if (subn)
2464 return Py_BuildValue("Nn", item, n);
2465
2466 return item;
2467
2468error:
2469 Py_DECREF(list);
2470 state_fini(&state);
2471 Py_DECREF(filter);
2472 return NULL;
2473
2474}
2475
2476static PyObject*
2477pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2478{
2479 PyObject* ptemplate;
2480 PyObject* string;
2481 Py_ssize_t count = 0;
2482 static char* kwlist[] = { "repl", "string", "count", NULL };
2483 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2484 &ptemplate, &string, &count))
2485 return NULL;
2486
2487 return pattern_subx(self, ptemplate, string, count, 0);
2488}
2489
2490static PyObject*
2491pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2492{
2493 PyObject* ptemplate;
2494 PyObject* string;
2495 Py_ssize_t count = 0;
2496 static char* kwlist[] = { "repl", "string", "count", NULL };
2497 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2498 &ptemplate, &string, &count))
2499 return NULL;
2500
2501 return pattern_subx(self, ptemplate, string, count, 1);
2502}
2503
2504static PyObject*
2505pattern_copy(PatternObject* self, PyObject *unused)
2506{
2507#ifdef USE_BUILTIN_COPY
2508 PatternObject* copy;
2509 int offset;
2510
2511 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2512 if (!copy)
2513 return NULL;
2514
2515 offset = offsetof(PatternObject, groups);
2516
2517 Py_XINCREF(self->groupindex);
2518 Py_XINCREF(self->indexgroup);
2519 Py_XINCREF(self->pattern);
2520
2521 memcpy((char*) copy + offset, (char*) self + offset,
2522 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2523 copy->weakreflist = NULL;
2524
2525 return (PyObject*) copy;
2526#else
2527 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2528 return NULL;
2529#endif
2530}
2531
2532static PyObject*
2533pattern_deepcopy(PatternObject* self, PyObject* memo)
2534{
2535#ifdef USE_BUILTIN_COPY
2536 PatternObject* copy;
2537
2538 copy = (PatternObject*) pattern_copy(self);
2539 if (!copy)
2540 return NULL;
2541
2542 if (!deepcopy(&copy->groupindex, memo) ||
2543 !deepcopy(&copy->indexgroup, memo) ||
2544 !deepcopy(&copy->pattern, memo)) {
2545 Py_DECREF(copy);
2546 return NULL;
2547 }
2548
2549#else
2550 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2551 return NULL;
2552#endif
2553}
2554
2555PyDoc_STRVAR(pattern_match_doc,
2556"match(string[, pos[, endpos]]) --> match object or None.\n\
2557 Matches zero or more characters at the beginning of the string");
2558
2559PyDoc_STRVAR(pattern_search_doc,
2560"search(string[, pos[, endpos]]) --> match object or None.\n\
2561 Scan through string looking for a match, and return a corresponding\n\
2562 match object instance. Return None if no position in the string matches.");
2563
2564PyDoc_STRVAR(pattern_split_doc,
2565"split(string[, maxsplit = 0]) --> list.\n\
2566 Split string by the occurrences of pattern.");
2567
2568PyDoc_STRVAR(pattern_findall_doc,
2569"findall(string[, pos[, endpos]]) --> list.\n\
2570 Return a list of all non-overlapping matches of pattern in string.");
2571
2572PyDoc_STRVAR(pattern_finditer_doc,
2573"finditer(string[, pos[, endpos]]) --> iterator.\n\
2574 Return an iterator over all non-overlapping matches for the \n\
2575 RE pattern in string. For each match, the iterator returns a\n\
2576 match object.");
2577
2578PyDoc_STRVAR(pattern_sub_doc,
2579"sub(repl, string[, count = 0]) --> newstring\n\
2580 Return the string obtained by replacing the leftmost non-overlapping\n\
2581 occurrences of pattern in string by the replacement repl.");
2582
2583PyDoc_STRVAR(pattern_subn_doc,
2584"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2585 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2586 the leftmost non-overlapping occurrences of pattern with the\n\
2587 replacement repl.");
2588
2589PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2590
2591static PyMethodDef pattern_methods[] = {
2592 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2593 pattern_match_doc},
2594 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2595 pattern_search_doc},
2596 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2597 pattern_sub_doc},
2598 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2599 pattern_subn_doc},
2600 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2601 pattern_split_doc},
2602 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2603 pattern_findall_doc},
2604#if PY_VERSION_HEX >= 0x02020000
2605 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2606 pattern_finditer_doc},
2607#endif
2608 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2609 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2610 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2611 {NULL, NULL}
2612};
2613
2614#define PAT_OFF(x) offsetof(PatternObject, x)
2615static PyMemberDef pattern_members[] = {
2616 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2617 {"flags", T_INT, PAT_OFF(flags), READONLY},
2618 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2619 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2620 {NULL} /* Sentinel */
2621};
2622
2623statichere PyTypeObject Pattern_Type = {
2624 PyObject_HEAD_INIT(NULL)
2625 0, "_" SRE_MODULE ".SRE_Pattern",
2626 sizeof(PatternObject), sizeof(SRE_CODE),
2627 (destructor)pattern_dealloc, /*tp_dealloc*/
2628 0, /* tp_print */
2629 0, /* tp_getattrn */
2630 0, /* tp_setattr */
2631 0, /* tp_compare */
2632 0, /* tp_repr */
2633 0, /* tp_as_number */
2634 0, /* tp_as_sequence */
2635 0, /* tp_as_mapping */
2636 0, /* tp_hash */
2637 0, /* tp_call */
2638 0, /* tp_str */
2639 0, /* tp_getattro */
2640 0, /* tp_setattro */
2641 0, /* tp_as_buffer */
2642 Py_TPFLAGS_DEFAULT, /* tp_flags */
2643 pattern_doc, /* tp_doc */
2644 0, /* tp_traverse */
2645 0, /* tp_clear */
2646 0, /* tp_richcompare */
2647 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2648 0, /* tp_iter */
2649 0, /* tp_iternext */
2650 pattern_methods, /* tp_methods */
2651 pattern_members, /* tp_members */
2652};
2653
2654static int _validate(PatternObject *self); /* Forward */
2655
2656static PyObject *
2657_compile(PyObject* self_, PyObject* args)
2658{
2659 /* "compile" pattern descriptor to pattern object */
2660
2661 PatternObject* self;
2662 Py_ssize_t i, n;
2663
2664 PyObject* pattern;
2665 int flags = 0;
2666 PyObject* code;
2667 Py_ssize_t groups = 0;
2668 PyObject* groupindex = NULL;
2669 PyObject* indexgroup = NULL;
2670 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2671 &PyList_Type, &code, &groups,
2672 &groupindex, &indexgroup))
2673 return NULL;
2674
2675 n = PyList_GET_SIZE(code);
2676 /* coverity[ampersand_in_size] */
2677 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2678 if (!self)
2679 return NULL;
2680 self->weakreflist = NULL;
2681 self->pattern = NULL;
2682 self->groupindex = NULL;
2683 self->indexgroup = NULL;
2684
2685 self->codesize = n;
2686
2687 for (i = 0; i < n; i++) {
2688 PyObject *o = PyList_GET_ITEM(code, i);
2689 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2690 : PyLong_AsUnsignedLong(o);
2691 if (value == (unsigned long)-1 && PyErr_Occurred()) {
2692 if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
2693 PyErr_SetString(PyExc_OverflowError,
2694 "regular expression code size limit exceeded");
2695 }
2696 break;
2697 }
2698 self->code[i] = (SRE_CODE) value;
2699 if ((unsigned long) self->code[i] != value) {
2700 PyErr_SetString(PyExc_OverflowError,
2701 "regular expression code size limit exceeded");
2702 break;
2703 }
2704 }
2705
2706 if (PyErr_Occurred()) {
2707 Py_DECREF(self);
2708 return NULL;
2709 }
2710
2711 Py_INCREF(pattern);
2712 self->pattern = pattern;
2713
2714 self->flags = flags;
2715
2716 self->groups = groups;
2717
2718 Py_XINCREF(groupindex);
2719 self->groupindex = groupindex;
2720
2721 Py_XINCREF(indexgroup);
2722 self->indexgroup = indexgroup;
2723
2724 self->weakreflist = NULL;
2725
2726 if (!_validate(self)) {
2727 Py_DECREF(self);
2728 return NULL;
2729 }
2730
2731 return (PyObject*) self;
2732}
2733
2734/* -------------------------------------------------------------------- */
2735/* Code validation */
2736
2737/* To learn more about this code, have a look at the _compile() function in
2738 Lib/sre_compile.py. The validation functions below checks the code array
2739 for conformance with the code patterns generated there.
2740
2741 The nice thing about the generated code is that it is position-independent:
2742 all jumps are relative jumps forward. Also, jumps don't cross each other:
2743 the target of a later jump is always earlier than the target of an earlier
2744 jump. IOW, this is okay:
2745
2746 J---------J-------T--------T
2747 \ \_____/ /
2748 \______________________/
2749
2750 but this is not:
2751
2752 J---------J-------T--------T
2753 \_________\_____/ /
2754 \____________/
2755
2756 It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2757 bytes wide (the latter if Python is compiled for "wide" unicode support).
2758*/
2759
2760/* Defining this one enables tracing of the validator */
2761#undef VVERBOSE
2762
2763/* Trace macro for the validator */
2764#if defined(VVERBOSE)
2765#define VTRACE(v) printf v
2766#else
2767#define VTRACE(v) do {} while(0) /* do nothing */
2768#endif
2769
2770/* Report failure */
2771#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2772
2773/* Extract opcode, argument, or skip count from code array */
2774#define GET_OP \
2775 do { \
2776 VTRACE(("%p: ", code)); \
2777 if (code >= end) FAIL; \
2778 op = *code++; \
2779 VTRACE(("%lu (op)\n", (unsigned long)op)); \
2780 } while (0)
2781#define GET_ARG \
2782 do { \
2783 VTRACE(("%p= ", code)); \
2784 if (code >= end) FAIL; \
2785 arg = *code++; \
2786 VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
2787 } while (0)
2788#define GET_SKIP_ADJ(adj) \
2789 do { \
2790 VTRACE(("%p= ", code)); \
2791 if (code >= end) FAIL; \
2792 skip = *code; \
2793 VTRACE(("%lu (skip to %p)\n", \
2794 (unsigned long)skip, code+skip)); \
2795 if (skip-adj > end-code) \
2796 FAIL; \
2797 code++; \
2798 } while (0)
2799#define GET_SKIP GET_SKIP_ADJ(0)
2800
2801static int
2802_validate_charset(SRE_CODE *code, SRE_CODE *end)
2803{
2804 /* Some variables are manipulated by the macros above */
2805 SRE_CODE op;
2806 SRE_CODE arg;
2807 SRE_CODE offset;
2808 int i;
2809
2810 while (code < end) {
2811 GET_OP;
2812 switch (op) {
2813
2814 case SRE_OP_NEGATE:
2815 break;
2816
2817 case SRE_OP_LITERAL:
2818 GET_ARG;
2819 break;
2820
2821 case SRE_OP_RANGE:
2822 GET_ARG;
2823 GET_ARG;
2824 break;
2825
2826 case SRE_OP_CHARSET:
2827 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2828 if (offset > end-code)
2829 FAIL;
2830 code += offset;
2831 break;
2832
2833 case SRE_OP_BIGCHARSET:
2834 GET_ARG; /* Number of blocks */
2835 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2836 if (offset > end-code)
2837 FAIL;
2838 /* Make sure that each byte points to a valid block */
2839 for (i = 0; i < 256; i++) {
2840 if (((unsigned char *)code)[i] >= arg)
2841 FAIL;
2842 }
2843 code += offset;
2844 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2845 if (offset > end-code)
2846 FAIL;
2847 code += offset;
2848 break;
2849
2850 case SRE_OP_CATEGORY:
2851 GET_ARG;
2852 switch (arg) {
2853 case SRE_CATEGORY_DIGIT:
2854 case SRE_CATEGORY_NOT_DIGIT:
2855 case SRE_CATEGORY_SPACE:
2856 case SRE_CATEGORY_NOT_SPACE:
2857 case SRE_CATEGORY_WORD:
2858 case SRE_CATEGORY_NOT_WORD:
2859 case SRE_CATEGORY_LINEBREAK:
2860 case SRE_CATEGORY_NOT_LINEBREAK:
2861 case SRE_CATEGORY_LOC_WORD:
2862 case SRE_CATEGORY_LOC_NOT_WORD:
2863 case SRE_CATEGORY_UNI_DIGIT:
2864 case SRE_CATEGORY_UNI_NOT_DIGIT:
2865 case SRE_CATEGORY_UNI_SPACE:
2866 case SRE_CATEGORY_UNI_NOT_SPACE:
2867 case SRE_CATEGORY_UNI_WORD:
2868 case SRE_CATEGORY_UNI_NOT_WORD:
2869 case SRE_CATEGORY_UNI_LINEBREAK:
2870 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2871 break;
2872 default:
2873 FAIL;
2874 }
2875 break;
2876
2877 default:
2878 FAIL;
2879
2880 }
2881 }
2882
2883 return 1;
2884}
2885
2886static int
2887_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2888{
2889 /* Some variables are manipulated by the macros above */
2890 SRE_CODE op;
2891 SRE_CODE arg;
2892 SRE_CODE skip;
2893
2894 VTRACE(("code=%p, end=%p\n", code, end));
2895
2896 if (code > end)
2897 FAIL;
2898
2899 while (code < end) {
2900 GET_OP;
2901 switch (op) {
2902
2903 case SRE_OP_MARK:
2904 /* We don't check whether marks are properly nested; the
2905 sre_match() code is robust even if they don't, and the worst
2906 you can get is nonsensical match results. */
2907 GET_ARG;
2908 if (arg > 2*groups+1) {
2909 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2910 FAIL;
2911 }
2912 break;
2913
2914 case SRE_OP_LITERAL:
2915 case SRE_OP_NOT_LITERAL:
2916 case SRE_OP_LITERAL_IGNORE:
2917 case SRE_OP_NOT_LITERAL_IGNORE:
2918 GET_ARG;
2919 /* The arg is just a character, nothing to check */
2920 break;
2921
2922 case SRE_OP_SUCCESS:
2923 case SRE_OP_FAILURE:
2924 /* Nothing to check; these normally end the matching process */
2925 break;
2926
2927 case SRE_OP_AT:
2928 GET_ARG;
2929 switch (arg) {
2930 case SRE_AT_BEGINNING:
2931 case SRE_AT_BEGINNING_STRING:
2932 case SRE_AT_BEGINNING_LINE:
2933 case SRE_AT_END:
2934 case SRE_AT_END_LINE:
2935 case SRE_AT_END_STRING:
2936 case SRE_AT_BOUNDARY:
2937 case SRE_AT_NON_BOUNDARY:
2938 case SRE_AT_LOC_BOUNDARY:
2939 case SRE_AT_LOC_NON_BOUNDARY:
2940 case SRE_AT_UNI_BOUNDARY:
2941 case SRE_AT_UNI_NON_BOUNDARY:
2942 break;
2943 default:
2944 FAIL;
2945 }
2946 break;
2947
2948 case SRE_OP_ANY:
2949 case SRE_OP_ANY_ALL:
2950 /* These have no operands */
2951 break;
2952
2953 case SRE_OP_IN:
2954 case SRE_OP_IN_IGNORE:
2955 GET_SKIP;
2956 /* Stop 1 before the end; we check the FAILURE below */
2957 if (!_validate_charset(code, code+skip-2))
2958 FAIL;
2959 if (code[skip-2] != SRE_OP_FAILURE)
2960 FAIL;
2961 code += skip-1;
2962 break;
2963
2964 case SRE_OP_INFO:
2965 {
2966 /* A minimal info field is
2967 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2968 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2969 more follows. */
2970 SRE_CODE flags, i;
2971 SRE_CODE *newcode;
2972 GET_SKIP;
2973 newcode = code+skip-1;
2974 GET_ARG; flags = arg;
2975 GET_ARG; /* min */
2976 GET_ARG; /* max */
2977 /* Check that only valid flags are present */
2978 if ((flags & ~(SRE_INFO_PREFIX |
2979 SRE_INFO_LITERAL |
2980 SRE_INFO_CHARSET)) != 0)
2981 FAIL;
2982 /* PREFIX and CHARSET are mutually exclusive */
2983 if ((flags & SRE_INFO_PREFIX) &&
2984 (flags & SRE_INFO_CHARSET))
2985 FAIL;
2986 /* LITERAL implies PREFIX */
2987 if ((flags & SRE_INFO_LITERAL) &&
2988 !(flags & SRE_INFO_PREFIX))
2989 FAIL;
2990 /* Validate the prefix */
2991 if (flags & SRE_INFO_PREFIX) {
2992 SRE_CODE prefix_len;
2993 GET_ARG; prefix_len = arg;
2994 GET_ARG; /* prefix skip */
2995 /* Here comes the prefix string */
2996 if (prefix_len > newcode-code)
2997 FAIL;
2998 code += prefix_len;
2999 /* And here comes the overlap table */
3000 if (prefix_len > newcode-code)
3001 FAIL;
3002 /* Each overlap value should be < prefix_len */
3003 for (i = 0; i < prefix_len; i++) {
3004 if (code[i] >= prefix_len)
3005 FAIL;
3006 }
3007 code += prefix_len;
3008 }
3009 /* Validate the charset */
3010 if (flags & SRE_INFO_CHARSET) {
3011 if (!_validate_charset(code, newcode-1))
3012 FAIL;
3013 if (newcode[-1] != SRE_OP_FAILURE)
3014 FAIL;
3015 code = newcode;
3016 }
3017 else if (code != newcode) {
3018 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3019 FAIL;
3020 }
3021 }
3022 break;
3023
3024 case SRE_OP_BRANCH:
3025 {
3026 SRE_CODE *target = NULL;
3027 for (;;) {
3028 GET_SKIP;
3029 if (skip == 0)
3030 break;
3031 /* Stop 2 before the end; we check the JUMP below */
3032 if (!_validate_inner(code, code+skip-3, groups))
3033 FAIL;
3034 code += skip-3;
3035 /* Check that it ends with a JUMP, and that each JUMP
3036 has the same target */
3037 GET_OP;
3038 if (op != SRE_OP_JUMP)
3039 FAIL;
3040 GET_SKIP;
3041 if (target == NULL)
3042 target = code+skip-1;
3043 else if (code+skip-1 != target)
3044 FAIL;
3045 }
3046 }
3047 break;
3048
3049 case SRE_OP_REPEAT_ONE:
3050 case SRE_OP_MIN_REPEAT_ONE:
3051 {
3052 SRE_CODE min, max;
3053 GET_SKIP;
3054 GET_ARG; min = arg;
3055 GET_ARG; max = arg;
3056 if (min > max)
3057 FAIL;
3058 if (max > SRE_MAXREPEAT)
3059 FAIL;
3060 if (!_validate_inner(code, code+skip-4, groups))
3061 FAIL;
3062 code += skip-4;
3063 GET_OP;
3064 if (op != SRE_OP_SUCCESS)
3065 FAIL;
3066 }
3067 break;
3068
3069 case SRE_OP_REPEAT:
3070 {
3071 SRE_CODE min, max;
3072 GET_SKIP;
3073 GET_ARG; min = arg;
3074 GET_ARG; max = arg;
3075 if (min > max)
3076 FAIL;
3077 if (max > SRE_MAXREPEAT)
3078 FAIL;
3079 if (!_validate_inner(code, code+skip-3, groups))
3080 FAIL;
3081 code += skip-3;
3082 GET_OP;
3083 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3084 FAIL;
3085 }
3086 break;
3087
3088 case SRE_OP_GROUPREF:
3089 case SRE_OP_GROUPREF_IGNORE:
3090 GET_ARG;
3091 if (arg >= groups)
3092 FAIL;
3093 break;
3094
3095 case SRE_OP_GROUPREF_EXISTS:
3096 /* The regex syntax for this is: '(?(group)then|else)', where
3097 'group' is either an integer group number or a group name,
3098 'then' and 'else' are sub-regexes, and 'else' is optional. */
3099 GET_ARG;
3100 if (arg >= groups)
3101 FAIL;
3102 GET_SKIP_ADJ(1);
3103 code--; /* The skip is relative to the first arg! */
3104 /* There are two possibilities here: if there is both a 'then'
3105 part and an 'else' part, the generated code looks like:
3106
3107 GROUPREF_EXISTS
3108 <group>
3109 <skipyes>
3110 ...then part...
3111 JUMP
3112 <skipno>
3113 (<skipyes> jumps here)
3114 ...else part...
3115 (<skipno> jumps here)
3116
3117 If there is only a 'then' part, it looks like:
3118
3119 GROUPREF_EXISTS
3120 <group>
3121 <skip>
3122 ...then part...
3123 (<skip> jumps here)
3124
3125 There is no direct way to decide which it is, and we don't want
3126 to allow arbitrary jumps anywhere in the code; so we just look
3127 for a JUMP opcode preceding our skip target.
3128 */
3129 if (skip >= 3 && skip-3 < end-code &&
3130 code[skip-3] == SRE_OP_JUMP)
3131 {
3132 VTRACE(("both then and else parts present\n"));
3133 if (!_validate_inner(code+1, code+skip-3, groups))
3134 FAIL;
3135 code += skip-2; /* Position after JUMP, at <skipno> */
3136 GET_SKIP;
3137 if (!_validate_inner(code, code+skip-1, groups))
3138 FAIL;
3139 code += skip-1;
3140 }
3141 else {
3142 VTRACE(("only a then part present\n"));
3143 if (!_validate_inner(code+1, code+skip-1, groups))
3144 FAIL;
3145 code += skip-1;
3146 }
3147 break;
3148
3149 case SRE_OP_ASSERT:
3150 case SRE_OP_ASSERT_NOT:
3151 GET_SKIP;
3152 GET_ARG; /* 0 for lookahead, width for lookbehind */
3153 code--; /* Back up over arg to simplify math below */
3154 if (arg & 0x80000000)
3155 FAIL; /* Width too large */
3156 /* Stop 1 before the end; we check the SUCCESS below */
3157 if (!_validate_inner(code+1, code+skip-2, groups))
3158 FAIL;
3159 code += skip-2;
3160 GET_OP;
3161 if (op != SRE_OP_SUCCESS)
3162 FAIL;
3163 break;
3164
3165 default:
3166 FAIL;
3167
3168 }
3169 }
3170
3171 VTRACE(("okay\n"));
3172 return 1;
3173}
3174
3175static int
3176_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3177{
3178 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3179 FAIL;
3180 if (groups == 0) /* fix for simplejson */
3181 groups = 100; /* 100 groups should always be safe */
3182 return _validate_inner(code, end-1, groups);
3183}
3184
3185static int
3186_validate(PatternObject *self)
3187{
3188 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3189 {
3190 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3191 return 0;
3192 }
3193 else
3194 VTRACE(("Success!\n"));
3195 return 1;
3196}
3197
3198/* -------------------------------------------------------------------- */
3199/* match methods */
3200
3201static void
3202match_dealloc(MatchObject* self)
3203{
3204 Py_XDECREF(self->regs);
3205 Py_XDECREF(self->string);
3206 Py_DECREF(self->pattern);
3207 PyObject_DEL(self);
3208}
3209
3210static PyObject*
3211match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3212{
3213 if (index < 0 || index >= self->groups) {
3214 /* raise IndexError if we were given a bad group number */
3215 PyErr_SetString(
3216 PyExc_IndexError,
3217 "no such group"
3218 );
3219 return NULL;
3220 }
3221
3222 index *= 2;
3223
3224 if (self->string == Py_None || self->mark[index] < 0) {
3225 /* return default value if the string or group is undefined */
3226 Py_INCREF(def);
3227 return def;
3228 }
3229
3230 return PySequence_GetSlice(
3231 self->string, self->mark[index], self->mark[index+1]
3232 );
3233}
3234
3235static Py_ssize_t
3236match_getindex(MatchObject* self, PyObject* index)
3237{
3238 Py_ssize_t i;
3239
3240 if (PyInt_Check(index))
3241 return PyInt_AsSsize_t(index);
3242
3243 i = -1;
3244
3245 if (self->pattern->groupindex) {
3246 index = PyObject_GetItem(self->pattern->groupindex, index);
3247 if (index) {
3248 if (PyInt_Check(index) || PyLong_Check(index))
3249 i = PyInt_AsSsize_t(index);
3250 Py_DECREF(index);
3251 } else
3252 PyErr_Clear();
3253 }
3254
3255 return i;
3256}
3257
3258static PyObject*
3259match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3260{
3261 return match_getslice_by_index(self, match_getindex(self, index), def);
3262}
3263
3264static PyObject*
3265match_expand(MatchObject* self, PyObject* ptemplate)
3266{
3267 /* delegate to Python code */
3268 return call(
3269 SRE_PY_MODULE, "_expand",
3270 PyTuple_Pack(3, self->pattern, self, ptemplate)
3271 );
3272}
3273
3274static PyObject*
3275match_group(MatchObject* self, PyObject* args)
3276{
3277 PyObject* result;
3278 Py_ssize_t i, size;
3279
3280 size = PyTuple_GET_SIZE(args);
3281
3282 switch (size) {
3283 case 0:
3284 result = match_getslice(self, Py_False, Py_None);
3285 break;
3286 case 1:
3287 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3288 break;
3289 default:
3290 /* fetch multiple items */
3291 result = PyTuple_New(size);
3292 if (!result)
3293 return NULL;
3294 for (i = 0; i < size; i++) {
3295 PyObject* item = match_getslice(
3296 self, PyTuple_GET_ITEM(args, i), Py_None
3297 );
3298 if (!item) {
3299 Py_DECREF(result);
3300 return NULL;
3301 }
3302 PyTuple_SET_ITEM(result, i, item);
3303 }
3304 break;
3305 }
3306 return result;
3307}
3308
3309static PyObject*
3310match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3311{
3312 PyObject* result;
3313 Py_ssize_t index;
3314
3315 PyObject* def = Py_None;
3316 static char* kwlist[] = { "default", NULL };
3317 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3318 return NULL;
3319
3320 result = PyTuple_New(self->groups-1);
3321 if (!result)
3322 return NULL;
3323
3324 for (index = 1; index < self->groups; index++) {
3325 PyObject* item;
3326 item = match_getslice_by_index(self, index, def);
3327 if (!item) {
3328 Py_DECREF(result);
3329 return NULL;
3330 }
3331 PyTuple_SET_ITEM(result, index-1, item);
3332 }
3333
3334 return result;
3335}
3336
3337static PyObject*
3338match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3339{
3340 PyObject* result;
3341 PyObject* keys;
3342 Py_ssize_t index;
3343
3344 PyObject* def = Py_None;
3345 static char* kwlist[] = { "default", NULL };
3346 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3347 return NULL;
3348
3349 result = PyDict_New();
3350 if (!result || !self->pattern->groupindex)
3351 return result;
3352
3353 keys = PyMapping_Keys(self->pattern->groupindex);
3354 if (!keys)
3355 goto failed;
3356
3357 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3358 int status;
3359 PyObject* key;
3360 PyObject* value;
3361 key = PyList_GET_ITEM(keys, index);
3362 if (!key)
3363 goto failed;
3364 value = match_getslice(self, key, def);
3365 if (!value) {
3366 Py_DECREF(key);
3367 goto failed;
3368 }
3369 status = PyDict_SetItem(result, key, value);
3370 Py_DECREF(value);
3371 if (status < 0)
3372 goto failed;
3373 }
3374
3375 Py_DECREF(keys);
3376
3377 return result;
3378
3379failed:
3380 Py_XDECREF(keys);
3381 Py_DECREF(result);
3382 return NULL;
3383}
3384
3385static PyObject*
3386match_start(MatchObject* self, PyObject* args)
3387{
3388 Py_ssize_t index;
3389
3390 PyObject* index_ = Py_False; /* zero */
3391 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3392 return NULL;
3393
3394 index = match_getindex(self, index_);
3395
3396 if (index < 0 || index >= self->groups) {
3397 PyErr_SetString(
3398 PyExc_IndexError,
3399 "no such group"
3400 );
3401 return NULL;
3402 }
3403
3404 /* mark is -1 if group is undefined */
3405 return PyInt_FromSsize_t(self->mark[index*2]);
3406}
3407
3408static PyObject*
3409match_end(MatchObject* self, PyObject* args)
3410{
3411 Py_ssize_t index;
3412
3413 PyObject* index_ = Py_False; /* zero */
3414 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3415 return NULL;
3416
3417 index = match_getindex(self, index_);
3418
3419 if (index < 0 || index >= self->groups) {
3420 PyErr_SetString(
3421 PyExc_IndexError,
3422 "no such group"
3423 );
3424 return NULL;
3425 }
3426
3427 /* mark is -1 if group is undefined */
3428 return PyInt_FromSsize_t(self->mark[index*2+1]);
3429}
3430
3431LOCAL(PyObject*)
3432_pair(Py_ssize_t i1, Py_ssize_t i2)
3433{
3434 PyObject* pair;
3435 PyObject* item;
3436
3437 pair = PyTuple_New(2);
3438 if (!pair)
3439 return NULL;
3440
3441 item = PyInt_FromSsize_t(i1);
3442 if (!item)
3443 goto error;
3444 PyTuple_SET_ITEM(pair, 0, item);
3445
3446 item = PyInt_FromSsize_t(i2);
3447 if (!item)
3448 goto error;
3449 PyTuple_SET_ITEM(pair, 1, item);
3450
3451 return pair;
3452
3453 error:
3454 Py_DECREF(pair);
3455 return NULL;
3456}
3457
3458static PyObject*
3459match_span(MatchObject* self, PyObject* args)
3460{
3461 Py_ssize_t index;
3462
3463 PyObject* index_ = Py_False; /* zero */
3464 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3465 return NULL;
3466
3467 index = match_getindex(self, index_);
3468
3469 if (index < 0 || index >= self->groups) {
3470 PyErr_SetString(
3471 PyExc_IndexError,
3472 "no such group"
3473 );
3474 return NULL;
3475 }
3476
3477 /* marks are -1 if group is undefined */
3478 return _pair(self->mark[index*2], self->mark[index*2+1]);
3479}
3480
3481static PyObject*
3482match_regs(MatchObject* self)
3483{
3484 PyObject* regs;
3485 PyObject* item;
3486 Py_ssize_t index;
3487
3488 regs = PyTuple_New(self->groups);
3489 if (!regs)
3490 return NULL;
3491
3492 for (index = 0; index < self->groups; index++) {
3493 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3494 if (!item) {
3495 Py_DECREF(regs);
3496 return NULL;
3497 }
3498 PyTuple_SET_ITEM(regs, index, item);
3499 }
3500
3501 Py_INCREF(regs);
3502 self->regs = regs;
3503
3504 return regs;
3505}
3506
3507static PyObject*
3508match_copy(MatchObject* self, PyObject *unused)
3509{
3510#ifdef USE_BUILTIN_COPY
3511 MatchObject* copy;
3512 Py_ssize_t slots, offset;
3513
3514 slots = 2 * (self->pattern->groups+1);
3515
3516 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3517 if (!copy)
3518 return NULL;
3519
3520 /* this value a constant, but any compiler should be able to
3521 figure that out all by itself */
3522 offset = offsetof(MatchObject, string);
3523
3524 Py_XINCREF(self->pattern);
3525 Py_XINCREF(self->string);
3526 Py_XINCREF(self->regs);
3527
3528 memcpy((char*) copy + offset, (char*) self + offset,
3529 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3530
3531 return (PyObject*) copy;
3532#else
3533 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3534 return NULL;
3535#endif
3536}
3537
3538static PyObject*
3539match_deepcopy(MatchObject* self, PyObject* memo)
3540{
3541#ifdef USE_BUILTIN_COPY
3542 MatchObject* copy;
3543
3544 copy = (MatchObject*) match_copy(self);
3545 if (!copy)
3546 return NULL;
3547
3548 if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3549 !deepcopy(&copy->string, memo) ||
3550 !deepcopy(&copy->regs, memo)) {
3551 Py_DECREF(copy);
3552 return NULL;
3553 }
3554
3555#else
3556 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3557 return NULL;
3558#endif
3559}
3560
3561PyDoc_STRVAR(match_doc,
3562"The result of re.match() and re.search().\n\
3563Match objects always have a boolean value of True.");
3564
3565PyDoc_STRVAR(match_group_doc,
3566"group([group1, ...]) -> str or tuple.\n\
3567 Return subgroup(s) of the match by indices or names.\n\
3568 For 0 returns the entire match.");
3569
3570PyDoc_STRVAR(match_start_doc,
3571"start([group=0]) -> int.\n\
3572 Return index of the start of the substring matched by group.");
3573
3574PyDoc_STRVAR(match_end_doc,
3575"end([group=0]) -> int.\n\
3576 Return index of the end of the substring matched by group.");
3577
3578PyDoc_STRVAR(match_span_doc,
3579"span([group]) -> tuple.\n\
3580 For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
3581
3582PyDoc_STRVAR(match_groups_doc,
3583"groups([default=None]) -> tuple.\n\
3584 Return a tuple containing all the subgroups of the match, from 1.\n\
3585 The default argument is used for groups\n\
3586 that did not participate in the match");
3587
3588PyDoc_STRVAR(match_groupdict_doc,
3589"groupdict([default=None]) -> dict.\n\
3590 Return a dictionary containing all the named subgroups of the match,\n\
3591 keyed by the subgroup name. The default argument is used for groups\n\
3592 that did not participate in the match");
3593
3594PyDoc_STRVAR(match_expand_doc,
3595"expand(template) -> str.\n\
3596 Return the string obtained by doing backslash substitution\n\
3597 on the string template, as done by the sub() method.");
3598
3599static PyMethodDef match_methods[] = {
3600 {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
3601 {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
3602 {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
3603 {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
3604 {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
3605 match_groups_doc},
3606 {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
3607 match_groupdict_doc},
3608 {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
3609 {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3610 {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3611 {NULL, NULL}
3612};
3613
3614static PyObject *
3615match_lastindex_get(MatchObject *self)
3616{
3617 if (self->lastindex >= 0)
3618 return PyInt_FromSsize_t(self->lastindex);
3619 Py_INCREF(Py_None);
3620 return Py_None;
3621}
3622
3623static PyObject *
3624match_lastgroup_get(MatchObject *self)
3625{
3626 if (self->pattern->indexgroup && self->lastindex >= 0) {
3627 PyObject* result = PySequence_GetItem(
3628 self->pattern->indexgroup, self->lastindex
3629 );
3630 if (result)
3631 return result;
3632 PyErr_Clear();
3633 }
3634 Py_INCREF(Py_None);
3635 return Py_None;
3636}
3637
3638static PyObject *
3639match_regs_get(MatchObject *self)
3640{
3641 if (self->regs) {
3642 Py_INCREF(self->regs);
3643 return self->regs;
3644 } else
3645 return match_regs(self);
3646}
3647
3648static PyGetSetDef match_getset[] = {
3649 {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3650 {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3651 {"regs", (getter)match_regs_get, (setter)NULL},
3652 {NULL}
3653};
3654
3655#define MATCH_OFF(x) offsetof(MatchObject, x)
3656static PyMemberDef match_members[] = {
3657 {"string", T_OBJECT, MATCH_OFF(string), READONLY},
3658 {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
3659 {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
3660 {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
3661 {NULL}
3662};
3663
3664
3665/* FIXME: implement setattr("string", None) as a special case (to
3666 detach the associated string, if any */
3667
3668static PyTypeObject Match_Type = {
3669 PyVarObject_HEAD_INIT(NULL, 0)
3670 "_" SRE_MODULE ".SRE_Match",
3671 sizeof(MatchObject), sizeof(Py_ssize_t),
3672 (destructor)match_dealloc, /* tp_dealloc */
3673 0, /* tp_print */
3674 0, /* tp_getattr */
3675 0, /* tp_setattr */
3676 0, /* tp_compare */
3677 0, /* tp_repr */
3678 0, /* tp_as_number */
3679 0, /* tp_as_sequence */
3680 0, /* tp_as_mapping */
3681 0, /* tp_hash */
3682 0, /* tp_call */
3683 0, /* tp_str */
3684 0, /* tp_getattro */
3685 0, /* tp_setattro */
3686 0, /* tp_as_buffer */
3687 Py_TPFLAGS_DEFAULT,
3688 match_doc, /* tp_doc */
3689 0, /* tp_traverse */
3690 0, /* tp_clear */
3691 0, /* tp_richcompare */
3692 0, /* tp_weaklistoffset */
3693 0, /* tp_iter */
3694 0, /* tp_iternext */
3695 match_methods, /* tp_methods */
3696 match_members, /* tp_members */
3697 match_getset, /* tp_getset */
3698};
3699
3700static PyObject*
3701pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3702{
3703 /* create match object (from state object) */
3704
3705 MatchObject* match;
3706 Py_ssize_t i, j;
3707 char* base;
3708 int n;
3709
3710 if (status > 0) {
3711
3712 /* create match object (with room for extra group marks) */
3713 /* coverity[ampersand_in_size] */
3714 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3715 2*(pattern->groups+1));
3716 if (!match)
3717 return NULL;
3718
3719 Py_INCREF(pattern);
3720 match->pattern = pattern;
3721
3722 Py_INCREF(state->string);
3723 match->string = state->string;
3724
3725 match->regs = NULL;
3726 match->groups = pattern->groups+1;
3727
3728 /* fill in group slices */
3729
3730 base = (char*) state->beginning;
3731 n = state->charsize;
3732
3733 match->mark[0] = ((char*) state->start - base) / n;
3734 match->mark[1] = ((char*) state->ptr - base) / n;
3735
3736 for (i = j = 0; i < pattern->groups; i++, j+=2)
3737 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3738 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3739 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3740 } else
3741 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3742
3743 match->pos = state->pos;
3744 match->endpos = state->endpos;
3745
3746 match->lastindex = state->lastindex;
3747
3748 return (PyObject*) match;
3749
3750 } else if (status == 0) {
3751
3752 /* no match */
3753 Py_INCREF(Py_None);
3754 return Py_None;
3755
3756 }
3757
3758 /* internal error */
3759 pattern_error(status);
3760 return NULL;
3761}
3762
3763
3764/* -------------------------------------------------------------------- */
3765/* scanner methods (experimental) */
3766
3767static void
3768scanner_dealloc(ScannerObject* self)
3769{
3770 state_fini(&self->state);
3771 Py_XDECREF(self->pattern);
3772 PyObject_DEL(self);
3773}
3774
3775static PyObject*
3776scanner_match(ScannerObject* self, PyObject *unused)
3777{
3778 SRE_STATE* state = &self->state;
3779 PyObject* match;
3780 int status;
3781
3782 state_reset(state);
3783
3784 state->ptr = state->start;
3785
3786 if (state->charsize == 1) {
3787 status = sre_match(state, PatternObject_GetCode(self->pattern));
3788 } else {
3789#if defined(HAVE_UNICODE)
3790 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3791#endif
3792 }
3793 if (PyErr_Occurred())
3794 return NULL;
3795
3796 match = pattern_new_match((PatternObject*) self->pattern,
3797 state, status);
3798
3799 if (status == 0 || state->ptr == state->start)
3800 state->start = (void*) ((char*) state->ptr + state->charsize);
3801 else
3802 state->start = state->ptr;
3803
3804 return match;
3805}
3806
3807
3808static PyObject*
3809scanner_search(ScannerObject* self, PyObject *unused)
3810{
3811 SRE_STATE* state = &self->state;
3812 PyObject* match;
3813 int status;
3814
3815 state_reset(state);
3816
3817 state->ptr = state->start;
3818
3819 if (state->charsize == 1) {
3820 status = sre_search(state, PatternObject_GetCode(self->pattern));
3821 } else {
3822#if defined(HAVE_UNICODE)
3823 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3824#endif
3825 }
3826 if (PyErr_Occurred())
3827 return NULL;
3828
3829 match = pattern_new_match((PatternObject*) self->pattern,
3830 state, status);
3831
3832 if (status == 0 || state->ptr == state->start)
3833 state->start = (void*) ((char*) state->ptr + state->charsize);
3834 else
3835 state->start = state->ptr;
3836
3837 return match;
3838}
3839
3840static PyMethodDef scanner_methods[] = {
3841 {"match", (PyCFunction) scanner_match, METH_NOARGS},
3842 {"search", (PyCFunction) scanner_search, METH_NOARGS},
3843 {NULL, NULL}
3844};
3845
3846#define SCAN_OFF(x) offsetof(ScannerObject, x)
3847static PyMemberDef scanner_members[] = {
3848 {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
3849 {NULL} /* Sentinel */
3850};
3851
3852statichere PyTypeObject Scanner_Type = {
3853 PyObject_HEAD_INIT(NULL)
3854 0, "_" SRE_MODULE ".SRE_Scanner",
3855 sizeof(ScannerObject), 0,
3856 (destructor)scanner_dealloc, /*tp_dealloc*/
3857 0, /* tp_print */
3858 0, /* tp_getattr */
3859 0, /* tp_setattr */
3860 0, /* tp_reserved */
3861 0, /* tp_repr */
3862 0, /* tp_as_number */
3863 0, /* tp_as_sequence */
3864 0, /* tp_as_mapping */
3865 0, /* tp_hash */
3866 0, /* tp_call */
3867 0, /* tp_str */
3868 0, /* tp_getattro */
3869 0, /* tp_setattro */
3870 0, /* tp_as_buffer */
3871 Py_TPFLAGS_DEFAULT, /* tp_flags */
3872 0, /* tp_doc */
3873 0, /* tp_traverse */
3874 0, /* tp_clear */
3875 0, /* tp_richcompare */
3876 0, /* tp_weaklistoffset */
3877 0, /* tp_iter */
3878 0, /* tp_iternext */
3879 scanner_methods, /* tp_methods */
3880 scanner_members, /* tp_members */
3881 0, /* tp_getset */
3882};
3883
3884static PyObject*
3885pattern_scanner(PatternObject* pattern, PyObject* args)
3886{
3887 /* create search state object */
3888
3889 ScannerObject* self;
3890
3891 PyObject* string;
3892 Py_ssize_t start = 0;
3893 Py_ssize_t end = PY_SSIZE_T_MAX;
3894 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3895 return NULL;
3896
3897 /* create scanner object */
3898 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3899 if (!self)
3900 return NULL;
3901 self->pattern = NULL;
3902
3903 string = state_init(&self->state, pattern, string, start, end);
3904 if (!string) {
3905 Py_DECREF(self);
3906 return NULL;
3907 }
3908
3909 Py_INCREF(pattern);
3910 self->pattern = (PyObject*) pattern;
3911
3912 return (PyObject*) self;
3913}
3914
3915static PyMethodDef _functions[] = {
3916 {"compile", _compile, METH_VARARGS},
3917 {"getcodesize", sre_codesize, METH_NOARGS},
3918 {"getlower", sre_getlower, METH_VARARGS},
3919 {NULL, NULL}
3920};
3921
3922#if PY_VERSION_HEX < 0x02030000
3923DL_EXPORT(void) init_sre(void)
3924#else
3925PyMODINIT_FUNC init_sre(void)
3926#endif
3927{
3928 PyObject* m;
3929 PyObject* d;
3930 PyObject* x;
3931
3932 /* Patch object types */
3933 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3934 PyType_Ready(&Scanner_Type))
3935 return;
3936
3937 m = Py_InitModule("_" SRE_MODULE, _functions);
3938 if (m == NULL)
3939 return;
3940 d = PyModule_GetDict(m);
3941
3942 x = PyInt_FromLong(SRE_MAGIC);
3943 if (x) {
3944 PyDict_SetItemString(d, "MAGIC", x);
3945 Py_DECREF(x);
3946 }
3947
3948 x = PyInt_FromLong(sizeof(SRE_CODE));
3949 if (x) {
3950 PyDict_SetItemString(d, "CODESIZE", x);
3951 Py_DECREF(x);
3952 }
3953
3954 x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
3955 if (x) {
3956 PyDict_SetItemString(d, "MAXREPEAT", x);
3957 Py_DECREF(x);
3958 }
3959
3960 x = PyString_FromString(copyright);
3961 if (x) {
3962 PyDict_SetItemString(d, "copyright", x);
3963 Py_DECREF(x);
3964 }
3965}
3966
3967#endif /* !defined(SRE_RECURSIVE) */
3968
3969/* vim:ts=4:sw=4:et
3970*/
Note: See TracBrowser for help on using the repository browser.