1 | /*
|
---|
2 | * Secret Labs' Regular Expression Engine
|
---|
3 | *
|
---|
4 | * regular expression matching engine
|
---|
5 | *
|
---|
6 | * partial history:
|
---|
7 | * 1999-10-24 fl created (based on existing template matcher code)
|
---|
8 | * 2000-03-06 fl first alpha, sort of
|
---|
9 | * 2000-08-01 fl fixes for 1.6b1
|
---|
10 | * 2000-08-07 fl use PyOS_CheckStack() if available
|
---|
11 | * 2000-09-20 fl added expand method
|
---|
12 | * 2001-03-20 fl lots of fixes for 2.1b2
|
---|
13 | * 2001-04-15 fl export copyright as Python attribute, not global
|
---|
14 | * 2001-04-28 fl added __copy__ methods (work in progress)
|
---|
15 | * 2001-05-14 fl fixes for 1.5.2 compatibility
|
---|
16 | * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
|
---|
17 | * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
|
---|
18 | * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
|
---|
19 | * 2001-10-21 fl added sub/subn primitive
|
---|
20 | * 2001-10-24 fl added finditer primitive (for 2.2 only)
|
---|
21 | * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
|
---|
22 | * 2002-11-09 fl fixed empty sub/subn return type
|
---|
23 | * 2003-04-18 mvl fully support 4-byte codes
|
---|
24 | * 2003-10-17 gn implemented non recursive scheme
|
---|
25 | *
|
---|
26 | * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
---|
27 | *
|
---|
28 | * This version of the SRE library can be redistributed under CNRI's
|
---|
29 | * Python 1.6 license. For any other use, please contact Secret Labs
|
---|
30 | * AB (info@pythonware.com).
|
---|
31 | *
|
---|
32 | * Portions of this engine have been developed in cooperation with
|
---|
33 | * CNRI. Hewlett-Packard provided funding for 1.6 integration and
|
---|
34 | * other compatibility work.
|
---|
35 | */
|
---|
36 |
|
---|
37 | #ifndef SRE_RECURSIVE
|
---|
38 |
|
---|
39 | static char copyright[] =
|
---|
40 | " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
|
---|
41 |
|
---|
42 | #define PY_SSIZE_T_CLEAN
|
---|
43 |
|
---|
44 | #include "Python.h"
|
---|
45 | #include "structmember.h" /* offsetof */
|
---|
46 |
|
---|
47 | #include "sre.h"
|
---|
48 |
|
---|
49 | #include <ctype.h>
|
---|
50 |
|
---|
51 | /* name of this module, minus the leading underscore */
|
---|
52 | #if !defined(SRE_MODULE)
|
---|
53 | #define SRE_MODULE "sre"
|
---|
54 | #endif
|
---|
55 |
|
---|
56 | #define SRE_PY_MODULE "re"
|
---|
57 |
|
---|
58 | /* defining this one enables tracing */
|
---|
59 | #undef VERBOSE
|
---|
60 |
|
---|
61 | #if PY_VERSION_HEX >= 0x01060000
|
---|
62 | #if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
|
---|
63 | /* defining this enables unicode support (default under 1.6a1 and later) */
|
---|
64 | #define HAVE_UNICODE
|
---|
65 | #endif
|
---|
66 | #endif
|
---|
67 |
|
---|
68 | /* -------------------------------------------------------------------- */
|
---|
69 | /* optional features */
|
---|
70 |
|
---|
71 | /* enables fast searching */
|
---|
72 | #define USE_FAST_SEARCH
|
---|
73 |
|
---|
74 | /* enables aggressive inlining (always on for Visual C) */
|
---|
75 | #undef USE_INLINE
|
---|
76 |
|
---|
77 | /* enables copy/deepcopy handling (work in progress) */
|
---|
78 | #undef USE_BUILTIN_COPY
|
---|
79 |
|
---|
80 | #if PY_VERSION_HEX < 0x01060000
|
---|
81 | #define PyObject_DEL(op) PyMem_DEL((op))
|
---|
82 | #endif
|
---|
83 |
|
---|
84 | /* -------------------------------------------------------------------- */
|
---|
85 |
|
---|
86 | #if defined(_MSC_VER)
|
---|
87 | #pragma optimize("agtw", on) /* doesn't seem to make much difference... */
|
---|
88 | #pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
|
---|
89 | /* fastest possible local call under MSVC */
|
---|
90 | #define LOCAL(type) static __inline type __fastcall
|
---|
91 | #elif defined(USE_INLINE)
|
---|
92 | #define LOCAL(type) static inline type
|
---|
93 | #else
|
---|
94 | #define LOCAL(type) static type
|
---|
95 | #endif
|
---|
96 |
|
---|
97 | /* error codes */
|
---|
98 | #define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
|
---|
99 | #define SRE_ERROR_STATE -2 /* illegal state */
|
---|
100 | #define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
|
---|
101 | #define SRE_ERROR_MEMORY -9 /* out of memory */
|
---|
102 | #define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
|
---|
103 |
|
---|
104 | #if defined(VERBOSE)
|
---|
105 | #define TRACE(v) printf v
|
---|
106 | #else
|
---|
107 | #define TRACE(v)
|
---|
108 | #endif
|
---|
109 |
|
---|
110 | /* -------------------------------------------------------------------- */
|
---|
111 | /* search engine state */
|
---|
112 |
|
---|
113 | /* default character predicates (run sre_chars.py to regenerate tables) */
|
---|
114 |
|
---|
115 | #define SRE_DIGIT_MASK 1
|
---|
116 | #define SRE_SPACE_MASK 2
|
---|
117 | #define SRE_LINEBREAK_MASK 4
|
---|
118 | #define SRE_ALNUM_MASK 8
|
---|
119 | #define SRE_WORD_MASK 16
|
---|
120 |
|
---|
121 | /* FIXME: this assumes ASCII. create tables in init_sre() instead */
|
---|
122 |
|
---|
123 | static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
|
---|
124 | 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
|
---|
125 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
|
---|
126 | 25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
---|
127 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
|
---|
128 | 0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
|
---|
129 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
|
---|
130 |
|
---|
131 | static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
|
---|
132 | 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
|
---|
133 | 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
|
---|
134 | 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
|
---|
135 | 61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
|
---|
136 | 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
|
---|
137 | 122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
|
---|
138 | 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
|
---|
139 | 120, 121, 122, 123, 124, 125, 126, 127 };
|
---|
140 |
|
---|
141 | #define SRE_IS_DIGIT(ch)\
|
---|
142 | ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
|
---|
143 | #define SRE_IS_SPACE(ch)\
|
---|
144 | ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
|
---|
145 | #define SRE_IS_LINEBREAK(ch)\
|
---|
146 | ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
|
---|
147 | #define SRE_IS_ALNUM(ch)\
|
---|
148 | ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
|
---|
149 | #define SRE_IS_WORD(ch)\
|
---|
150 | ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
|
---|
151 |
|
---|
152 | static unsigned int sre_lower(unsigned int ch)
|
---|
153 | {
|
---|
154 | return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
|
---|
155 | }
|
---|
156 |
|
---|
157 | /* locale-specific character predicates */
|
---|
158 | /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
|
---|
159 | * warnings when c's type supports only numbers < N+1 */
|
---|
160 | #define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
|
---|
161 | #define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
|
---|
162 | #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
|
---|
163 | #define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
|
---|
164 | #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
|
---|
165 |
|
---|
166 | static unsigned int sre_lower_locale(unsigned int ch)
|
---|
167 | {
|
---|
168 | return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
|
---|
169 | }
|
---|
170 |
|
---|
171 | /* unicode-specific character predicates */
|
---|
172 |
|
---|
173 | #if defined(HAVE_UNICODE)
|
---|
174 |
|
---|
175 | #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
|
---|
176 | #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
|
---|
177 | #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
|
---|
178 | #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
|
---|
179 | #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
|
---|
180 |
|
---|
181 | static unsigned int sre_lower_unicode(unsigned int ch)
|
---|
182 | {
|
---|
183 | return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
|
---|
184 | }
|
---|
185 |
|
---|
186 | #endif
|
---|
187 |
|
---|
188 | LOCAL(int)
|
---|
189 | sre_category(SRE_CODE category, unsigned int ch)
|
---|
190 | {
|
---|
191 | switch (category) {
|
---|
192 |
|
---|
193 | case SRE_CATEGORY_DIGIT:
|
---|
194 | return SRE_IS_DIGIT(ch);
|
---|
195 | case SRE_CATEGORY_NOT_DIGIT:
|
---|
196 | return !SRE_IS_DIGIT(ch);
|
---|
197 | case SRE_CATEGORY_SPACE:
|
---|
198 | return SRE_IS_SPACE(ch);
|
---|
199 | case SRE_CATEGORY_NOT_SPACE:
|
---|
200 | return !SRE_IS_SPACE(ch);
|
---|
201 | case SRE_CATEGORY_WORD:
|
---|
202 | return SRE_IS_WORD(ch);
|
---|
203 | case SRE_CATEGORY_NOT_WORD:
|
---|
204 | return !SRE_IS_WORD(ch);
|
---|
205 | case SRE_CATEGORY_LINEBREAK:
|
---|
206 | return SRE_IS_LINEBREAK(ch);
|
---|
207 | case SRE_CATEGORY_NOT_LINEBREAK:
|
---|
208 | return !SRE_IS_LINEBREAK(ch);
|
---|
209 |
|
---|
210 | case SRE_CATEGORY_LOC_WORD:
|
---|
211 | return SRE_LOC_IS_WORD(ch);
|
---|
212 | case SRE_CATEGORY_LOC_NOT_WORD:
|
---|
213 | return !SRE_LOC_IS_WORD(ch);
|
---|
214 |
|
---|
215 | #if defined(HAVE_UNICODE)
|
---|
216 | case SRE_CATEGORY_UNI_DIGIT:
|
---|
217 | return SRE_UNI_IS_DIGIT(ch);
|
---|
218 | case SRE_CATEGORY_UNI_NOT_DIGIT:
|
---|
219 | return !SRE_UNI_IS_DIGIT(ch);
|
---|
220 | case SRE_CATEGORY_UNI_SPACE:
|
---|
221 | return SRE_UNI_IS_SPACE(ch);
|
---|
222 | case SRE_CATEGORY_UNI_NOT_SPACE:
|
---|
223 | return !SRE_UNI_IS_SPACE(ch);
|
---|
224 | case SRE_CATEGORY_UNI_WORD:
|
---|
225 | return SRE_UNI_IS_WORD(ch);
|
---|
226 | case SRE_CATEGORY_UNI_NOT_WORD:
|
---|
227 | return !SRE_UNI_IS_WORD(ch);
|
---|
228 | case SRE_CATEGORY_UNI_LINEBREAK:
|
---|
229 | return SRE_UNI_IS_LINEBREAK(ch);
|
---|
230 | case SRE_CATEGORY_UNI_NOT_LINEBREAK:
|
---|
231 | return !SRE_UNI_IS_LINEBREAK(ch);
|
---|
232 | #else
|
---|
233 | case SRE_CATEGORY_UNI_DIGIT:
|
---|
234 | return SRE_IS_DIGIT(ch);
|
---|
235 | case SRE_CATEGORY_UNI_NOT_DIGIT:
|
---|
236 | return !SRE_IS_DIGIT(ch);
|
---|
237 | case SRE_CATEGORY_UNI_SPACE:
|
---|
238 | return SRE_IS_SPACE(ch);
|
---|
239 | case SRE_CATEGORY_UNI_NOT_SPACE:
|
---|
240 | return !SRE_IS_SPACE(ch);
|
---|
241 | case SRE_CATEGORY_UNI_WORD:
|
---|
242 | return SRE_LOC_IS_WORD(ch);
|
---|
243 | case SRE_CATEGORY_UNI_NOT_WORD:
|
---|
244 | return !SRE_LOC_IS_WORD(ch);
|
---|
245 | case SRE_CATEGORY_UNI_LINEBREAK:
|
---|
246 | return SRE_IS_LINEBREAK(ch);
|
---|
247 | case SRE_CATEGORY_UNI_NOT_LINEBREAK:
|
---|
248 | return !SRE_IS_LINEBREAK(ch);
|
---|
249 | #endif
|
---|
250 | }
|
---|
251 | return 0;
|
---|
252 | }
|
---|
253 |
|
---|
254 | /* helpers */
|
---|
255 |
|
---|
256 | static void
|
---|
257 | data_stack_dealloc(SRE_STATE* state)
|
---|
258 | {
|
---|
259 | if (state->data_stack) {
|
---|
260 | PyMem_FREE(state->data_stack);
|
---|
261 | state->data_stack = NULL;
|
---|
262 | }
|
---|
263 | state->data_stack_size = state->data_stack_base = 0;
|
---|
264 | }
|
---|
265 |
|
---|
266 | static int
|
---|
267 | data_stack_grow(SRE_STATE* state, Py_ssize_t size)
|
---|
268 | {
|
---|
269 | Py_ssize_t minsize, cursize;
|
---|
270 | minsize = state->data_stack_base+size;
|
---|
271 | cursize = state->data_stack_size;
|
---|
272 | if (cursize < minsize) {
|
---|
273 | void* stack;
|
---|
274 | cursize = minsize+minsize/4+1024;
|
---|
275 | TRACE(("allocate/grow stack %" PY_FORMAT_SIZE_T "d\n", cursize));
|
---|
276 | stack = PyMem_REALLOC(state->data_stack, cursize);
|
---|
277 | if (!stack) {
|
---|
278 | data_stack_dealloc(state);
|
---|
279 | return SRE_ERROR_MEMORY;
|
---|
280 | }
|
---|
281 | state->data_stack = (char *)stack;
|
---|
282 | state->data_stack_size = cursize;
|
---|
283 | }
|
---|
284 | return 0;
|
---|
285 | }
|
---|
286 |
|
---|
287 | /* generate 8-bit version */
|
---|
288 |
|
---|
289 | #define SRE_CHAR unsigned char
|
---|
290 | #define SRE_AT sre_at
|
---|
291 | #define SRE_COUNT sre_count
|
---|
292 | #define SRE_CHARSET sre_charset
|
---|
293 | #define SRE_INFO sre_info
|
---|
294 | #define SRE_MATCH sre_match
|
---|
295 | #define SRE_MATCH_CONTEXT sre_match_context
|
---|
296 | #define SRE_SEARCH sre_search
|
---|
297 | #define SRE_LITERAL_TEMPLATE sre_literal_template
|
---|
298 |
|
---|
299 | #if defined(HAVE_UNICODE)
|
---|
300 |
|
---|
301 | #define SRE_RECURSIVE
|
---|
302 | #include "_sre.c"
|
---|
303 | #undef SRE_RECURSIVE
|
---|
304 |
|
---|
305 | #undef SRE_LITERAL_TEMPLATE
|
---|
306 | #undef SRE_SEARCH
|
---|
307 | #undef SRE_MATCH
|
---|
308 | #undef SRE_MATCH_CONTEXT
|
---|
309 | #undef SRE_INFO
|
---|
310 | #undef SRE_CHARSET
|
---|
311 | #undef SRE_COUNT
|
---|
312 | #undef SRE_AT
|
---|
313 | #undef SRE_CHAR
|
---|
314 |
|
---|
315 | /* generate 16-bit unicode version */
|
---|
316 |
|
---|
317 | #define SRE_CHAR Py_UNICODE
|
---|
318 | #define SRE_AT sre_uat
|
---|
319 | #define SRE_COUNT sre_ucount
|
---|
320 | #define SRE_CHARSET sre_ucharset
|
---|
321 | #define SRE_INFO sre_uinfo
|
---|
322 | #define SRE_MATCH sre_umatch
|
---|
323 | #define SRE_MATCH_CONTEXT sre_umatch_context
|
---|
324 | #define SRE_SEARCH sre_usearch
|
---|
325 | #define SRE_LITERAL_TEMPLATE sre_uliteral_template
|
---|
326 | #endif
|
---|
327 |
|
---|
328 | #endif /* SRE_RECURSIVE */
|
---|
329 |
|
---|
330 | /* -------------------------------------------------------------------- */
|
---|
331 | /* String matching engine */
|
---|
332 |
|
---|
333 | /* the following section is compiled twice, with different character
|
---|
334 | settings */
|
---|
335 |
|
---|
336 | LOCAL(int)
|
---|
337 | SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
|
---|
338 | {
|
---|
339 | /* check if pointer is at given position */
|
---|
340 |
|
---|
341 | Py_ssize_t thisp, thatp;
|
---|
342 |
|
---|
343 | switch (at) {
|
---|
344 |
|
---|
345 | case SRE_AT_BEGINNING:
|
---|
346 | case SRE_AT_BEGINNING_STRING:
|
---|
347 | return ((void*) ptr == state->beginning);
|
---|
348 |
|
---|
349 | case SRE_AT_BEGINNING_LINE:
|
---|
350 | return ((void*) ptr == state->beginning ||
|
---|
351 | SRE_IS_LINEBREAK((int) ptr[-1]));
|
---|
352 |
|
---|
353 | case SRE_AT_END:
|
---|
354 | return (((void*) (ptr+1) == state->end &&
|
---|
355 | SRE_IS_LINEBREAK((int) ptr[0])) ||
|
---|
356 | ((void*) ptr == state->end));
|
---|
357 |
|
---|
358 | case SRE_AT_END_LINE:
|
---|
359 | return ((void*) ptr == state->end ||
|
---|
360 | SRE_IS_LINEBREAK((int) ptr[0]));
|
---|
361 |
|
---|
362 | case SRE_AT_END_STRING:
|
---|
363 | return ((void*) ptr == state->end);
|
---|
364 |
|
---|
365 | case SRE_AT_BOUNDARY:
|
---|
366 | if (state->beginning == state->end)
|
---|
367 | return 0;
|
---|
368 | thatp = ((void*) ptr > state->beginning) ?
|
---|
369 | SRE_IS_WORD((int) ptr[-1]) : 0;
|
---|
370 | thisp = ((void*) ptr < state->end) ?
|
---|
371 | SRE_IS_WORD((int) ptr[0]) : 0;
|
---|
372 | return thisp != thatp;
|
---|
373 |
|
---|
374 | case SRE_AT_NON_BOUNDARY:
|
---|
375 | if (state->beginning == state->end)
|
---|
376 | return 0;
|
---|
377 | thatp = ((void*) ptr > state->beginning) ?
|
---|
378 | SRE_IS_WORD((int) ptr[-1]) : 0;
|
---|
379 | thisp = ((void*) ptr < state->end) ?
|
---|
380 | SRE_IS_WORD((int) ptr[0]) : 0;
|
---|
381 | return thisp == thatp;
|
---|
382 |
|
---|
383 | case SRE_AT_LOC_BOUNDARY:
|
---|
384 | if (state->beginning == state->end)
|
---|
385 | return 0;
|
---|
386 | thatp = ((void*) ptr > state->beginning) ?
|
---|
387 | SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
|
---|
388 | thisp = ((void*) ptr < state->end) ?
|
---|
389 | SRE_LOC_IS_WORD((int) ptr[0]) : 0;
|
---|
390 | return thisp != thatp;
|
---|
391 |
|
---|
392 | case SRE_AT_LOC_NON_BOUNDARY:
|
---|
393 | if (state->beginning == state->end)
|
---|
394 | return 0;
|
---|
395 | thatp = ((void*) ptr > state->beginning) ?
|
---|
396 | SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
|
---|
397 | thisp = ((void*) ptr < state->end) ?
|
---|
398 | SRE_LOC_IS_WORD((int) ptr[0]) : 0;
|
---|
399 | return thisp == thatp;
|
---|
400 |
|
---|
401 | #if defined(HAVE_UNICODE)
|
---|
402 | case SRE_AT_UNI_BOUNDARY:
|
---|
403 | if (state->beginning == state->end)
|
---|
404 | return 0;
|
---|
405 | thatp = ((void*) ptr > state->beginning) ?
|
---|
406 | SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
|
---|
407 | thisp = ((void*) ptr < state->end) ?
|
---|
408 | SRE_UNI_IS_WORD((int) ptr[0]) : 0;
|
---|
409 | return thisp != thatp;
|
---|
410 |
|
---|
411 | case SRE_AT_UNI_NON_BOUNDARY:
|
---|
412 | if (state->beginning == state->end)
|
---|
413 | return 0;
|
---|
414 | thatp = ((void*) ptr > state->beginning) ?
|
---|
415 | SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
|
---|
416 | thisp = ((void*) ptr < state->end) ?
|
---|
417 | SRE_UNI_IS_WORD((int) ptr[0]) : 0;
|
---|
418 | return thisp == thatp;
|
---|
419 | #endif
|
---|
420 |
|
---|
421 | }
|
---|
422 |
|
---|
423 | return 0;
|
---|
424 | }
|
---|
425 |
|
---|
426 | LOCAL(int)
|
---|
427 | SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
|
---|
428 | {
|
---|
429 | /* check if character is a member of the given set */
|
---|
430 |
|
---|
431 | int ok = 1;
|
---|
432 |
|
---|
433 | for (;;) {
|
---|
434 | switch (*set++) {
|
---|
435 |
|
---|
436 | case SRE_OP_FAILURE:
|
---|
437 | return !ok;
|
---|
438 |
|
---|
439 | case SRE_OP_LITERAL:
|
---|
440 | /* <LITERAL> <code> */
|
---|
441 | if (ch == set[0])
|
---|
442 | return ok;
|
---|
443 | set++;
|
---|
444 | break;
|
---|
445 |
|
---|
446 | case SRE_OP_CATEGORY:
|
---|
447 | /* <CATEGORY> <code> */
|
---|
448 | if (sre_category(set[0], (int) ch))
|
---|
449 | return ok;
|
---|
450 | set += 1;
|
---|
451 | break;
|
---|
452 |
|
---|
453 | case SRE_OP_CHARSET:
|
---|
454 | if (sizeof(SRE_CODE) == 2) {
|
---|
455 | /* <CHARSET> <bitmap> (16 bits per code word) */
|
---|
456 | if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
|
---|
457 | return ok;
|
---|
458 | set += 16;
|
---|
459 | }
|
---|
460 | else {
|
---|
461 | /* <CHARSET> <bitmap> (32 bits per code word) */
|
---|
462 | if (ch < 256 && (set[ch >> 5] & (1u << (ch & 31))))
|
---|
463 | return ok;
|
---|
464 | set += 8;
|
---|
465 | }
|
---|
466 | break;
|
---|
467 |
|
---|
468 | case SRE_OP_RANGE:
|
---|
469 | /* <RANGE> <lower> <upper> */
|
---|
470 | if (set[0] <= ch && ch <= set[1])
|
---|
471 | return ok;
|
---|
472 | set += 2;
|
---|
473 | break;
|
---|
474 |
|
---|
475 | case SRE_OP_NEGATE:
|
---|
476 | ok = !ok;
|
---|
477 | break;
|
---|
478 |
|
---|
479 | case SRE_OP_BIGCHARSET:
|
---|
480 | /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
|
---|
481 | {
|
---|
482 | Py_ssize_t count, block;
|
---|
483 | count = *(set++);
|
---|
484 |
|
---|
485 | if (sizeof(SRE_CODE) == 2) {
|
---|
486 | block = ((unsigned char*)set)[ch >> 8];
|
---|
487 | set += 128;
|
---|
488 | if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
|
---|
489 | return ok;
|
---|
490 | set += count*16;
|
---|
491 | }
|
---|
492 | else {
|
---|
493 | /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
|
---|
494 | * warnings when c's type supports only numbers < N+1 */
|
---|
495 | if (!(ch & ~65535))
|
---|
496 | block = ((unsigned char*)set)[ch >> 8];
|
---|
497 | else
|
---|
498 | block = -1;
|
---|
499 | set += 64;
|
---|
500 | if (block >=0 &&
|
---|
501 | (set[block*8 + ((ch & 255)>>5)] & (1u << (ch & 31))))
|
---|
502 | return ok;
|
---|
503 | set += count*8;
|
---|
504 | }
|
---|
505 | break;
|
---|
506 | }
|
---|
507 |
|
---|
508 | default:
|
---|
509 | /* internal error -- there's not much we can do about it
|
---|
510 | here, so let's just pretend it didn't match... */
|
---|
511 | return 0;
|
---|
512 | }
|
---|
513 | }
|
---|
514 | }
|
---|
515 |
|
---|
516 | LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
|
---|
517 |
|
---|
518 | LOCAL(Py_ssize_t)
|
---|
519 | SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
|
---|
520 | {
|
---|
521 | SRE_CODE chr;
|
---|
522 | SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
|
---|
523 | SRE_CHAR* end = (SRE_CHAR *)state->end;
|
---|
524 | Py_ssize_t i;
|
---|
525 |
|
---|
526 | /* adjust end */
|
---|
527 | if (maxcount < end - ptr && maxcount != SRE_MAXREPEAT)
|
---|
528 | end = ptr + maxcount;
|
---|
529 |
|
---|
530 | switch (pattern[0]) {
|
---|
531 |
|
---|
532 | case SRE_OP_IN:
|
---|
533 | /* repeated set */
|
---|
534 | TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
|
---|
535 | while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
|
---|
536 | ptr++;
|
---|
537 | break;
|
---|
538 |
|
---|
539 | case SRE_OP_ANY:
|
---|
540 | /* repeated dot wildcard. */
|
---|
541 | TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
|
---|
542 | while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
|
---|
543 | ptr++;
|
---|
544 | break;
|
---|
545 |
|
---|
546 | case SRE_OP_ANY_ALL:
|
---|
547 | /* repeated dot wildcard. skip to the end of the target
|
---|
548 | string, and backtrack from there */
|
---|
549 | TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
|
---|
550 | ptr = end;
|
---|
551 | break;
|
---|
552 |
|
---|
553 | case SRE_OP_LITERAL:
|
---|
554 | /* repeated literal */
|
---|
555 | chr = pattern[1];
|
---|
556 | TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
|
---|
557 | while (ptr < end && (SRE_CODE) *ptr == chr)
|
---|
558 | ptr++;
|
---|
559 | break;
|
---|
560 |
|
---|
561 | case SRE_OP_LITERAL_IGNORE:
|
---|
562 | /* repeated literal */
|
---|
563 | chr = pattern[1];
|
---|
564 | TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
|
---|
565 | while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
|
---|
566 | ptr++;
|
---|
567 | break;
|
---|
568 |
|
---|
569 | case SRE_OP_NOT_LITERAL:
|
---|
570 | /* repeated non-literal */
|
---|
571 | chr = pattern[1];
|
---|
572 | TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
|
---|
573 | while (ptr < end && (SRE_CODE) *ptr != chr)
|
---|
574 | ptr++;
|
---|
575 | break;
|
---|
576 |
|
---|
577 | case SRE_OP_NOT_LITERAL_IGNORE:
|
---|
578 | /* repeated non-literal */
|
---|
579 | chr = pattern[1];
|
---|
580 | TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
|
---|
581 | while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
|
---|
582 | ptr++;
|
---|
583 | break;
|
---|
584 |
|
---|
585 | default:
|
---|
586 | /* repeated single character pattern */
|
---|
587 | TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
|
---|
588 | while ((SRE_CHAR*) state->ptr < end) {
|
---|
589 | i = SRE_MATCH(state, pattern);
|
---|
590 | if (i < 0)
|
---|
591 | return i;
|
---|
592 | if (!i)
|
---|
593 | break;
|
---|
594 | }
|
---|
595 | TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
|
---|
596 | (SRE_CHAR*) state->ptr - ptr));
|
---|
597 | return (SRE_CHAR*) state->ptr - ptr;
|
---|
598 | }
|
---|
599 |
|
---|
600 | TRACE(("|%p|%p|COUNT %" PY_FORMAT_SIZE_T "d\n", pattern, ptr,
|
---|
601 | ptr - (SRE_CHAR*) state->ptr));
|
---|
602 | return ptr - (SRE_CHAR*) state->ptr;
|
---|
603 | }
|
---|
604 |
|
---|
605 | #if 0 /* not used in this release */
|
---|
606 | LOCAL(int)
|
---|
607 | SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
|
---|
608 | {
|
---|
609 | /* check if an SRE_OP_INFO block matches at the current position.
|
---|
610 | returns the number of SRE_CODE objects to skip if successful, 0
|
---|
611 | if no match */
|
---|
612 |
|
---|
613 | SRE_CHAR* end = state->end;
|
---|
614 | SRE_CHAR* ptr = state->ptr;
|
---|
615 | Py_ssize_t i;
|
---|
616 |
|
---|
617 | /* check minimal length */
|
---|
618 | if (pattern[3] && (end - ptr) < pattern[3])
|
---|
619 | return 0;
|
---|
620 |
|
---|
621 | /* check known prefix */
|
---|
622 | if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
|
---|
623 | /* <length> <skip> <prefix data> <overlap data> */
|
---|
624 | for (i = 0; i < pattern[5]; i++)
|
---|
625 | if ((SRE_CODE) ptr[i] != pattern[7 + i])
|
---|
626 | return 0;
|
---|
627 | return pattern[0] + 2 * pattern[6];
|
---|
628 | }
|
---|
629 | return pattern[0];
|
---|
630 | }
|
---|
631 | #endif
|
---|
632 |
|
---|
633 | /* The macros below should be used to protect recursive SRE_MATCH()
|
---|
634 | * calls that *failed* and do *not* return immediately (IOW, those
|
---|
635 | * that will backtrack). Explaining:
|
---|
636 | *
|
---|
637 | * - Recursive SRE_MATCH() returned true: that's usually a success
|
---|
638 | * (besides atypical cases like ASSERT_NOT), therefore there's no
|
---|
639 | * reason to restore lastmark;
|
---|
640 | *
|
---|
641 | * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
|
---|
642 | * is returning to the caller: If the current SRE_MATCH() is the
|
---|
643 | * top function of the recursion, returning false will be a matching
|
---|
644 | * failure, and it doesn't matter where lastmark is pointing to.
|
---|
645 | * If it's *not* the top function, it will be a recursive SRE_MATCH()
|
---|
646 | * failure by itself, and the calling SRE_MATCH() will have to deal
|
---|
647 | * with the failure by the same rules explained here (it will restore
|
---|
648 | * lastmark by itself if necessary);
|
---|
649 | *
|
---|
650 | * - Recursive SRE_MATCH() returned false, and will continue the
|
---|
651 | * outside 'for' loop: must be protected when breaking, since the next
|
---|
652 | * OP could potentially depend on lastmark;
|
---|
653 | *
|
---|
654 | * - Recursive SRE_MATCH() returned false, and will be called again
|
---|
655 | * inside a local for/while loop: must be protected between each
|
---|
656 | * loop iteration, since the recursive SRE_MATCH() could do anything,
|
---|
657 | * and could potentially depend on lastmark.
|
---|
658 | *
|
---|
659 | * For more information, check the discussion at SF patch #712900.
|
---|
660 | */
|
---|
661 | #define LASTMARK_SAVE() \
|
---|
662 | do { \
|
---|
663 | ctx->lastmark = state->lastmark; \
|
---|
664 | ctx->lastindex = state->lastindex; \
|
---|
665 | } while (0)
|
---|
666 | #define LASTMARK_RESTORE() \
|
---|
667 | do { \
|
---|
668 | state->lastmark = ctx->lastmark; \
|
---|
669 | state->lastindex = ctx->lastindex; \
|
---|
670 | } while (0)
|
---|
671 |
|
---|
672 | #define RETURN_ERROR(i) do { return i; } while(0)
|
---|
673 | #define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
|
---|
674 | #define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
|
---|
675 |
|
---|
676 | #define RETURN_ON_ERROR(i) \
|
---|
677 | do { if (i < 0) RETURN_ERROR(i); } while (0)
|
---|
678 | #define RETURN_ON_SUCCESS(i) \
|
---|
679 | do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
|
---|
680 | #define RETURN_ON_FAILURE(i) \
|
---|
681 | do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
|
---|
682 |
|
---|
683 | #define SFY(x) #x
|
---|
684 |
|
---|
685 | #define DATA_STACK_ALLOC(state, type, ptr) \
|
---|
686 | do { \
|
---|
687 | alloc_pos = state->data_stack_base; \
|
---|
688 | TRACE(("allocating %s in %" PY_FORMAT_SIZE_T "d " \
|
---|
689 | "(%" PY_FORMAT_SIZE_T "d)\n", \
|
---|
690 | SFY(type), alloc_pos, sizeof(type))); \
|
---|
691 | if (sizeof(type) > state->data_stack_size - alloc_pos) { \
|
---|
692 | int j = data_stack_grow(state, sizeof(type)); \
|
---|
693 | if (j < 0) return j; \
|
---|
694 | if (ctx_pos != -1) \
|
---|
695 | DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
|
---|
696 | } \
|
---|
697 | ptr = (type*)(state->data_stack+alloc_pos); \
|
---|
698 | state->data_stack_base += sizeof(type); \
|
---|
699 | } while (0)
|
---|
700 |
|
---|
701 | #define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
|
---|
702 | do { \
|
---|
703 | TRACE(("looking up %s at %" PY_FORMAT_SIZE_T "d\n", SFY(type), pos)); \
|
---|
704 | ptr = (type*)(state->data_stack+pos); \
|
---|
705 | } while (0)
|
---|
706 |
|
---|
707 | #define DATA_STACK_PUSH(state, data, size) \
|
---|
708 | do { \
|
---|
709 | TRACE(("copy data in %p to %" PY_FORMAT_SIZE_T "d " \
|
---|
710 | "(%" PY_FORMAT_SIZE_T "d)\n", \
|
---|
711 | data, state->data_stack_base, size)); \
|
---|
712 | if (size > state->data_stack_size - state->data_stack_base) { \
|
---|
713 | int j = data_stack_grow(state, size); \
|
---|
714 | if (j < 0) return j; \
|
---|
715 | if (ctx_pos != -1) \
|
---|
716 | DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
|
---|
717 | } \
|
---|
718 | memcpy(state->data_stack+state->data_stack_base, data, size); \
|
---|
719 | state->data_stack_base += size; \
|
---|
720 | } while (0)
|
---|
721 |
|
---|
722 | #define DATA_STACK_POP(state, data, size, discard) \
|
---|
723 | do { \
|
---|
724 | TRACE(("copy data to %p from %" PY_FORMAT_SIZE_T "d " \
|
---|
725 | "(%" PY_FORMAT_SIZE_T "d)\n", \
|
---|
726 | data, state->data_stack_base-size, size)); \
|
---|
727 | memcpy(data, state->data_stack+state->data_stack_base-size, size); \
|
---|
728 | if (discard) \
|
---|
729 | state->data_stack_base -= size; \
|
---|
730 | } while (0)
|
---|
731 |
|
---|
732 | #define DATA_STACK_POP_DISCARD(state, size) \
|
---|
733 | do { \
|
---|
734 | TRACE(("discard data from %" PY_FORMAT_SIZE_T "d " \
|
---|
735 | "(%" PY_FORMAT_SIZE_T "d)\n", \
|
---|
736 | state->data_stack_base-size, size)); \
|
---|
737 | state->data_stack_base -= size; \
|
---|
738 | } while(0)
|
---|
739 |
|
---|
740 | #define DATA_PUSH(x) \
|
---|
741 | DATA_STACK_PUSH(state, (x), sizeof(*(x)))
|
---|
742 | #define DATA_POP(x) \
|
---|
743 | DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
|
---|
744 | #define DATA_POP_DISCARD(x) \
|
---|
745 | DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
|
---|
746 | #define DATA_ALLOC(t,p) \
|
---|
747 | DATA_STACK_ALLOC(state, t, p)
|
---|
748 | #define DATA_LOOKUP_AT(t,p,pos) \
|
---|
749 | DATA_STACK_LOOKUP_AT(state,t,p,pos)
|
---|
750 |
|
---|
751 | #define MARK_PUSH(lastmark) \
|
---|
752 | do if (lastmark > 0) { \
|
---|
753 | i = lastmark; /* ctx->lastmark may change if reallocated */ \
|
---|
754 | DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
|
---|
755 | } while (0)
|
---|
756 | #define MARK_POP(lastmark) \
|
---|
757 | do if (lastmark > 0) { \
|
---|
758 | DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
|
---|
759 | } while (0)
|
---|
760 | #define MARK_POP_KEEP(lastmark) \
|
---|
761 | do if (lastmark > 0) { \
|
---|
762 | DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
|
---|
763 | } while (0)
|
---|
764 | #define MARK_POP_DISCARD(lastmark) \
|
---|
765 | do if (lastmark > 0) { \
|
---|
766 | DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
|
---|
767 | } while (0)
|
---|
768 |
|
---|
769 | #define JUMP_NONE 0
|
---|
770 | #define JUMP_MAX_UNTIL_1 1
|
---|
771 | #define JUMP_MAX_UNTIL_2 2
|
---|
772 | #define JUMP_MAX_UNTIL_3 3
|
---|
773 | #define JUMP_MIN_UNTIL_1 4
|
---|
774 | #define JUMP_MIN_UNTIL_2 5
|
---|
775 | #define JUMP_MIN_UNTIL_3 6
|
---|
776 | #define JUMP_REPEAT 7
|
---|
777 | #define JUMP_REPEAT_ONE_1 8
|
---|
778 | #define JUMP_REPEAT_ONE_2 9
|
---|
779 | #define JUMP_MIN_REPEAT_ONE 10
|
---|
780 | #define JUMP_BRANCH 11
|
---|
781 | #define JUMP_ASSERT 12
|
---|
782 | #define JUMP_ASSERT_NOT 13
|
---|
783 |
|
---|
784 | #define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
|
---|
785 | DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
|
---|
786 | nextctx->last_ctx_pos = ctx_pos; \
|
---|
787 | nextctx->jump = jumpvalue; \
|
---|
788 | nextctx->pattern = nextpattern; \
|
---|
789 | ctx_pos = alloc_pos; \
|
---|
790 | ctx = nextctx; \
|
---|
791 | goto entrance; \
|
---|
792 | jumplabel: \
|
---|
793 | while (0) /* gcc doesn't like labels at end of scopes */ \
|
---|
794 |
|
---|
795 | typedef struct {
|
---|
796 | Py_ssize_t last_ctx_pos;
|
---|
797 | Py_ssize_t jump;
|
---|
798 | SRE_CHAR* ptr;
|
---|
799 | SRE_CODE* pattern;
|
---|
800 | Py_ssize_t count;
|
---|
801 | Py_ssize_t lastmark;
|
---|
802 | Py_ssize_t lastindex;
|
---|
803 | union {
|
---|
804 | SRE_CODE chr;
|
---|
805 | SRE_REPEAT* rep;
|
---|
806 | } u;
|
---|
807 | } SRE_MATCH_CONTEXT;
|
---|
808 |
|
---|
809 | /* check if string matches the given pattern. returns <0 for
|
---|
810 | error, 0 for failure, and 1 for success */
|
---|
811 | LOCAL(Py_ssize_t)
|
---|
812 | SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
|
---|
813 | {
|
---|
814 | SRE_CHAR* end = (SRE_CHAR *)state->end;
|
---|
815 | Py_ssize_t alloc_pos, ctx_pos = -1;
|
---|
816 | Py_ssize_t i, ret = 0;
|
---|
817 | Py_ssize_t jump;
|
---|
818 | unsigned int sigcount=0;
|
---|
819 |
|
---|
820 | SRE_MATCH_CONTEXT* ctx;
|
---|
821 | SRE_MATCH_CONTEXT* nextctx;
|
---|
822 |
|
---|
823 | TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
|
---|
824 |
|
---|
825 | DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
|
---|
826 | ctx->last_ctx_pos = -1;
|
---|
827 | ctx->jump = JUMP_NONE;
|
---|
828 | ctx->pattern = pattern;
|
---|
829 | ctx_pos = alloc_pos;
|
---|
830 |
|
---|
831 | entrance:
|
---|
832 |
|
---|
833 | ctx->ptr = (SRE_CHAR *)state->ptr;
|
---|
834 |
|
---|
835 | if (ctx->pattern[0] == SRE_OP_INFO) {
|
---|
836 | /* optimization info block */
|
---|
837 | /* <INFO> <1=skip> <2=flags> <3=min> ... */
|
---|
838 | if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
|
---|
839 | TRACE(("reject (got %" PY_FORMAT_SIZE_T "d chars, "
|
---|
840 | "need %" PY_FORMAT_SIZE_T "d)\n",
|
---|
841 | (end - ctx->ptr), (Py_ssize_t) ctx->pattern[3]));
|
---|
842 | RETURN_FAILURE;
|
---|
843 | }
|
---|
844 | ctx->pattern += ctx->pattern[1] + 1;
|
---|
845 | }
|
---|
846 |
|
---|
847 | for (;;) {
|
---|
848 | ++sigcount;
|
---|
849 | if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
|
---|
850 | RETURN_ERROR(SRE_ERROR_INTERRUPTED);
|
---|
851 |
|
---|
852 | switch (*ctx->pattern++) {
|
---|
853 |
|
---|
854 | case SRE_OP_MARK:
|
---|
855 | /* set mark */
|
---|
856 | /* <MARK> <gid> */
|
---|
857 | TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
|
---|
858 | ctx->ptr, ctx->pattern[0]));
|
---|
859 | i = ctx->pattern[0];
|
---|
860 | if (i & 1)
|
---|
861 | state->lastindex = i/2 + 1;
|
---|
862 | if (i > state->lastmark) {
|
---|
863 | /* state->lastmark is the highest valid index in the
|
---|
864 | state->mark array. If it is increased by more than 1,
|
---|
865 | the intervening marks must be set to NULL to signal
|
---|
866 | that these marks have not been encountered. */
|
---|
867 | Py_ssize_t j = state->lastmark + 1;
|
---|
868 | while (j < i)
|
---|
869 | state->mark[j++] = NULL;
|
---|
870 | state->lastmark = i;
|
---|
871 | }
|
---|
872 | state->mark[i] = ctx->ptr;
|
---|
873 | ctx->pattern++;
|
---|
874 | break;
|
---|
875 |
|
---|
876 | case SRE_OP_LITERAL:
|
---|
877 | /* match literal string */
|
---|
878 | /* <LITERAL> <code> */
|
---|
879 | TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
|
---|
880 | ctx->ptr, *ctx->pattern));
|
---|
881 | if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
|
---|
882 | RETURN_FAILURE;
|
---|
883 | ctx->pattern++;
|
---|
884 | ctx->ptr++;
|
---|
885 | break;
|
---|
886 |
|
---|
887 | case SRE_OP_NOT_LITERAL:
|
---|
888 | /* match anything that is not literal character */
|
---|
889 | /* <NOT_LITERAL> <code> */
|
---|
890 | TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
|
---|
891 | ctx->ptr, *ctx->pattern));
|
---|
892 | if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
|
---|
893 | RETURN_FAILURE;
|
---|
894 | ctx->pattern++;
|
---|
895 | ctx->ptr++;
|
---|
896 | break;
|
---|
897 |
|
---|
898 | case SRE_OP_SUCCESS:
|
---|
899 | /* end of pattern */
|
---|
900 | TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
|
---|
901 | state->ptr = ctx->ptr;
|
---|
902 | RETURN_SUCCESS;
|
---|
903 |
|
---|
904 | case SRE_OP_AT:
|
---|
905 | /* match at given position */
|
---|
906 | /* <AT> <code> */
|
---|
907 | TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
|
---|
908 | if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
|
---|
909 | RETURN_FAILURE;
|
---|
910 | ctx->pattern++;
|
---|
911 | break;
|
---|
912 |
|
---|
913 | case SRE_OP_CATEGORY:
|
---|
914 | /* match at given category */
|
---|
915 | /* <CATEGORY> <code> */
|
---|
916 | TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
|
---|
917 | ctx->ptr, *ctx->pattern));
|
---|
918 | if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
|
---|
919 | RETURN_FAILURE;
|
---|
920 | ctx->pattern++;
|
---|
921 | ctx->ptr++;
|
---|
922 | break;
|
---|
923 |
|
---|
924 | case SRE_OP_ANY:
|
---|
925 | /* match anything (except a newline) */
|
---|
926 | /* <ANY> */
|
---|
927 | TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
|
---|
928 | if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
|
---|
929 | RETURN_FAILURE;
|
---|
930 | ctx->ptr++;
|
---|
931 | break;
|
---|
932 |
|
---|
933 | case SRE_OP_ANY_ALL:
|
---|
934 | /* match anything */
|
---|
935 | /* <ANY_ALL> */
|
---|
936 | TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
|
---|
937 | if (ctx->ptr >= end)
|
---|
938 | RETURN_FAILURE;
|
---|
939 | ctx->ptr++;
|
---|
940 | break;
|
---|
941 |
|
---|
942 | case SRE_OP_IN:
|
---|
943 | /* match set member (or non_member) */
|
---|
944 | /* <IN> <skip> <set> */
|
---|
945 | TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
|
---|
946 | if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
|
---|
947 | RETURN_FAILURE;
|
---|
948 | ctx->pattern += ctx->pattern[0];
|
---|
949 | ctx->ptr++;
|
---|
950 | break;
|
---|
951 |
|
---|
952 | case SRE_OP_LITERAL_IGNORE:
|
---|
953 | TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
|
---|
954 | ctx->pattern, ctx->ptr, ctx->pattern[0]));
|
---|
955 | if (ctx->ptr >= end ||
|
---|
956 | state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
|
---|
957 | RETURN_FAILURE;
|
---|
958 | ctx->pattern++;
|
---|
959 | ctx->ptr++;
|
---|
960 | break;
|
---|
961 |
|
---|
962 | case SRE_OP_NOT_LITERAL_IGNORE:
|
---|
963 | TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
|
---|
964 | ctx->pattern, ctx->ptr, *ctx->pattern));
|
---|
965 | if (ctx->ptr >= end ||
|
---|
966 | state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
|
---|
967 | RETURN_FAILURE;
|
---|
968 | ctx->pattern++;
|
---|
969 | ctx->ptr++;
|
---|
970 | break;
|
---|
971 |
|
---|
972 | case SRE_OP_IN_IGNORE:
|
---|
973 | TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
|
---|
974 | if (ctx->ptr >= end
|
---|
975 | || !SRE_CHARSET(ctx->pattern+1,
|
---|
976 | (SRE_CODE)state->lower(*ctx->ptr)))
|
---|
977 | RETURN_FAILURE;
|
---|
978 | ctx->pattern += ctx->pattern[0];
|
---|
979 | ctx->ptr++;
|
---|
980 | break;
|
---|
981 |
|
---|
982 | case SRE_OP_JUMP:
|
---|
983 | case SRE_OP_INFO:
|
---|
984 | /* jump forward */
|
---|
985 | /* <JUMP> <offset> */
|
---|
986 | TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
|
---|
987 | ctx->ptr, ctx->pattern[0]));
|
---|
988 | ctx->pattern += ctx->pattern[0];
|
---|
989 | break;
|
---|
990 |
|
---|
991 | case SRE_OP_BRANCH:
|
---|
992 | /* alternation */
|
---|
993 | /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
|
---|
994 | TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
|
---|
995 | LASTMARK_SAVE();
|
---|
996 | ctx->u.rep = state->repeat;
|
---|
997 | if (ctx->u.rep)
|
---|
998 | MARK_PUSH(ctx->lastmark);
|
---|
999 | for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
|
---|
1000 | if (ctx->pattern[1] == SRE_OP_LITERAL &&
|
---|
1001 | (ctx->ptr >= end ||
|
---|
1002 | (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
|
---|
1003 | continue;
|
---|
1004 | if (ctx->pattern[1] == SRE_OP_IN &&
|
---|
1005 | (ctx->ptr >= end ||
|
---|
1006 | !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
|
---|
1007 | continue;
|
---|
1008 | state->ptr = ctx->ptr;
|
---|
1009 | DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
|
---|
1010 | if (ret) {
|
---|
1011 | if (ctx->u.rep)
|
---|
1012 | MARK_POP_DISCARD(ctx->lastmark);
|
---|
1013 | RETURN_ON_ERROR(ret);
|
---|
1014 | RETURN_SUCCESS;
|
---|
1015 | }
|
---|
1016 | if (ctx->u.rep)
|
---|
1017 | MARK_POP_KEEP(ctx->lastmark);
|
---|
1018 | LASTMARK_RESTORE();
|
---|
1019 | }
|
---|
1020 | if (ctx->u.rep)
|
---|
1021 | MARK_POP_DISCARD(ctx->lastmark);
|
---|
1022 | RETURN_FAILURE;
|
---|
1023 |
|
---|
1024 | case SRE_OP_REPEAT_ONE:
|
---|
1025 | /* match repeated sequence (maximizing regexp) */
|
---|
1026 |
|
---|
1027 | /* this operator only works if the repeated item is
|
---|
1028 | exactly one character wide, and we're not already
|
---|
1029 | collecting backtracking points. for other cases,
|
---|
1030 | use the MAX_REPEAT operator */
|
---|
1031 |
|
---|
1032 | /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
|
---|
1033 |
|
---|
1034 | TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
|
---|
1035 | ctx->pattern[1], ctx->pattern[2]));
|
---|
1036 |
|
---|
1037 | if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
|
---|
1038 | RETURN_FAILURE; /* cannot match */
|
---|
1039 |
|
---|
1040 | state->ptr = ctx->ptr;
|
---|
1041 |
|
---|
1042 | ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
|
---|
1043 | RETURN_ON_ERROR(ret);
|
---|
1044 | DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
|
---|
1045 | ctx->count = ret;
|
---|
1046 | ctx->ptr += ctx->count;
|
---|
1047 |
|
---|
1048 | /* when we arrive here, count contains the number of
|
---|
1049 | matches, and ctx->ptr points to the tail of the target
|
---|
1050 | string. check if the rest of the pattern matches,
|
---|
1051 | and backtrack if not. */
|
---|
1052 |
|
---|
1053 | if (ctx->count < (Py_ssize_t) ctx->pattern[1])
|
---|
1054 | RETURN_FAILURE;
|
---|
1055 |
|
---|
1056 | if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
|
---|
1057 | /* tail is empty. we're finished */
|
---|
1058 | state->ptr = ctx->ptr;
|
---|
1059 | RETURN_SUCCESS;
|
---|
1060 | }
|
---|
1061 |
|
---|
1062 | LASTMARK_SAVE();
|
---|
1063 |
|
---|
1064 | if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
|
---|
1065 | /* tail starts with a literal. skip positions where
|
---|
1066 | the rest of the pattern cannot possibly match */
|
---|
1067 | ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
|
---|
1068 | for (;;) {
|
---|
1069 | while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
|
---|
1070 | (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
|
---|
1071 | ctx->ptr--;
|
---|
1072 | ctx->count--;
|
---|
1073 | }
|
---|
1074 | if (ctx->count < (Py_ssize_t) ctx->pattern[1])
|
---|
1075 | break;
|
---|
1076 | state->ptr = ctx->ptr;
|
---|
1077 | DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
|
---|
1078 | ctx->pattern+ctx->pattern[0]);
|
---|
1079 | if (ret) {
|
---|
1080 | RETURN_ON_ERROR(ret);
|
---|
1081 | RETURN_SUCCESS;
|
---|
1082 | }
|
---|
1083 |
|
---|
1084 | LASTMARK_RESTORE();
|
---|
1085 |
|
---|
1086 | ctx->ptr--;
|
---|
1087 | ctx->count--;
|
---|
1088 | }
|
---|
1089 |
|
---|
1090 | } else {
|
---|
1091 | /* general case */
|
---|
1092 | while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
|
---|
1093 | state->ptr = ctx->ptr;
|
---|
1094 | DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
|
---|
1095 | ctx->pattern+ctx->pattern[0]);
|
---|
1096 | if (ret) {
|
---|
1097 | RETURN_ON_ERROR(ret);
|
---|
1098 | RETURN_SUCCESS;
|
---|
1099 | }
|
---|
1100 | ctx->ptr--;
|
---|
1101 | ctx->count--;
|
---|
1102 | LASTMARK_RESTORE();
|
---|
1103 | }
|
---|
1104 | }
|
---|
1105 | RETURN_FAILURE;
|
---|
1106 |
|
---|
1107 | case SRE_OP_MIN_REPEAT_ONE:
|
---|
1108 | /* match repeated sequence (minimizing regexp) */
|
---|
1109 |
|
---|
1110 | /* this operator only works if the repeated item is
|
---|
1111 | exactly one character wide, and we're not already
|
---|
1112 | collecting backtracking points. for other cases,
|
---|
1113 | use the MIN_REPEAT operator */
|
---|
1114 |
|
---|
1115 | /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
|
---|
1116 |
|
---|
1117 | TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
|
---|
1118 | ctx->pattern[1], ctx->pattern[2]));
|
---|
1119 |
|
---|
1120 | if ((Py_ssize_t) ctx->pattern[1] > end - ctx->ptr)
|
---|
1121 | RETURN_FAILURE; /* cannot match */
|
---|
1122 |
|
---|
1123 | state->ptr = ctx->ptr;
|
---|
1124 |
|
---|
1125 | if (ctx->pattern[1] == 0)
|
---|
1126 | ctx->count = 0;
|
---|
1127 | else {
|
---|
1128 | /* count using pattern min as the maximum */
|
---|
1129 | ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
|
---|
1130 | RETURN_ON_ERROR(ret);
|
---|
1131 | DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
|
---|
1132 | if (ret < (Py_ssize_t) ctx->pattern[1])
|
---|
1133 | /* didn't match minimum number of times */
|
---|
1134 | RETURN_FAILURE;
|
---|
1135 | /* advance past minimum matches of repeat */
|
---|
1136 | ctx->count = ret;
|
---|
1137 | ctx->ptr += ctx->count;
|
---|
1138 | }
|
---|
1139 |
|
---|
1140 | if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
|
---|
1141 | /* tail is empty. we're finished */
|
---|
1142 | state->ptr = ctx->ptr;
|
---|
1143 | RETURN_SUCCESS;
|
---|
1144 |
|
---|
1145 | } else {
|
---|
1146 | /* general case */
|
---|
1147 | LASTMARK_SAVE();
|
---|
1148 | while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
|
---|
1149 | || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
|
---|
1150 | state->ptr = ctx->ptr;
|
---|
1151 | DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
|
---|
1152 | ctx->pattern+ctx->pattern[0]);
|
---|
1153 | if (ret) {
|
---|
1154 | RETURN_ON_ERROR(ret);
|
---|
1155 | RETURN_SUCCESS;
|
---|
1156 | }
|
---|
1157 | state->ptr = ctx->ptr;
|
---|
1158 | ret = SRE_COUNT(state, ctx->pattern+3, 1);
|
---|
1159 | RETURN_ON_ERROR(ret);
|
---|
1160 | DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
|
---|
1161 | if (ret == 0)
|
---|
1162 | break;
|
---|
1163 | assert(ret == 1);
|
---|
1164 | ctx->ptr++;
|
---|
1165 | ctx->count++;
|
---|
1166 | LASTMARK_RESTORE();
|
---|
1167 | }
|
---|
1168 | }
|
---|
1169 | RETURN_FAILURE;
|
---|
1170 |
|
---|
1171 | case SRE_OP_REPEAT:
|
---|
1172 | /* create repeat context. all the hard work is done
|
---|
1173 | by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
|
---|
1174 | /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
|
---|
1175 | TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
|
---|
1176 | ctx->pattern[1], ctx->pattern[2]));
|
---|
1177 |
|
---|
1178 | /* install new repeat context */
|
---|
1179 | ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
|
---|
1180 | if (!ctx->u.rep) {
|
---|
1181 | PyErr_NoMemory();
|
---|
1182 | RETURN_FAILURE;
|
---|
1183 | }
|
---|
1184 | ctx->u.rep->count = -1;
|
---|
1185 | ctx->u.rep->pattern = ctx->pattern;
|
---|
1186 | ctx->u.rep->prev = state->repeat;
|
---|
1187 | ctx->u.rep->last_ptr = NULL;
|
---|
1188 | state->repeat = ctx->u.rep;
|
---|
1189 |
|
---|
1190 | state->ptr = ctx->ptr;
|
---|
1191 | DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
|
---|
1192 | state->repeat = ctx->u.rep->prev;
|
---|
1193 | PyObject_FREE(ctx->u.rep);
|
---|
1194 |
|
---|
1195 | if (ret) {
|
---|
1196 | RETURN_ON_ERROR(ret);
|
---|
1197 | RETURN_SUCCESS;
|
---|
1198 | }
|
---|
1199 | RETURN_FAILURE;
|
---|
1200 |
|
---|
1201 | case SRE_OP_MAX_UNTIL:
|
---|
1202 | /* maximizing repeat */
|
---|
1203 | /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
|
---|
1204 |
|
---|
1205 | /* FIXME: we probably need to deal with zero-width
|
---|
1206 | matches in here... */
|
---|
1207 |
|
---|
1208 | ctx->u.rep = state->repeat;
|
---|
1209 | if (!ctx->u.rep)
|
---|
1210 | RETURN_ERROR(SRE_ERROR_STATE);
|
---|
1211 |
|
---|
1212 | state->ptr = ctx->ptr;
|
---|
1213 |
|
---|
1214 | ctx->count = ctx->u.rep->count+1;
|
---|
1215 |
|
---|
1216 | TRACE(("|%p|%p|MAX_UNTIL %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
|
---|
1217 | ctx->ptr, ctx->count));
|
---|
1218 |
|
---|
1219 | if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
|
---|
1220 | /* not enough matches */
|
---|
1221 | ctx->u.rep->count = ctx->count;
|
---|
1222 | DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
|
---|
1223 | ctx->u.rep->pattern+3);
|
---|
1224 | if (ret) {
|
---|
1225 | RETURN_ON_ERROR(ret);
|
---|
1226 | RETURN_SUCCESS;
|
---|
1227 | }
|
---|
1228 | ctx->u.rep->count = ctx->count-1;
|
---|
1229 | state->ptr = ctx->ptr;
|
---|
1230 | RETURN_FAILURE;
|
---|
1231 | }
|
---|
1232 |
|
---|
1233 | if ((ctx->count < (Py_ssize_t) ctx->u.rep->pattern[2] ||
|
---|
1234 | ctx->u.rep->pattern[2] == SRE_MAXREPEAT) &&
|
---|
1235 | state->ptr != ctx->u.rep->last_ptr) {
|
---|
1236 | /* we may have enough matches, but if we can
|
---|
1237 | match another item, do so */
|
---|
1238 | ctx->u.rep->count = ctx->count;
|
---|
1239 | LASTMARK_SAVE();
|
---|
1240 | MARK_PUSH(ctx->lastmark);
|
---|
1241 | /* zero-width match protection */
|
---|
1242 | DATA_PUSH(&ctx->u.rep->last_ptr);
|
---|
1243 | ctx->u.rep->last_ptr = state->ptr;
|
---|
1244 | DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
|
---|
1245 | ctx->u.rep->pattern+3);
|
---|
1246 | DATA_POP(&ctx->u.rep->last_ptr);
|
---|
1247 | if (ret) {
|
---|
1248 | MARK_POP_DISCARD(ctx->lastmark);
|
---|
1249 | RETURN_ON_ERROR(ret);
|
---|
1250 | RETURN_SUCCESS;
|
---|
1251 | }
|
---|
1252 | MARK_POP(ctx->lastmark);
|
---|
1253 | LASTMARK_RESTORE();
|
---|
1254 | ctx->u.rep->count = ctx->count-1;
|
---|
1255 | state->ptr = ctx->ptr;
|
---|
1256 | }
|
---|
1257 |
|
---|
1258 | /* cannot match more repeated items here. make sure the
|
---|
1259 | tail matches */
|
---|
1260 | state->repeat = ctx->u.rep->prev;
|
---|
1261 | DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
|
---|
1262 | RETURN_ON_SUCCESS(ret);
|
---|
1263 | state->repeat = ctx->u.rep;
|
---|
1264 | state->ptr = ctx->ptr;
|
---|
1265 | RETURN_FAILURE;
|
---|
1266 |
|
---|
1267 | case SRE_OP_MIN_UNTIL:
|
---|
1268 | /* minimizing repeat */
|
---|
1269 | /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
|
---|
1270 |
|
---|
1271 | ctx->u.rep = state->repeat;
|
---|
1272 | if (!ctx->u.rep)
|
---|
1273 | RETURN_ERROR(SRE_ERROR_STATE);
|
---|
1274 |
|
---|
1275 | state->ptr = ctx->ptr;
|
---|
1276 |
|
---|
1277 | ctx->count = ctx->u.rep->count+1;
|
---|
1278 |
|
---|
1279 | TRACE(("|%p|%p|MIN_UNTIL %" PY_FORMAT_SIZE_T "d %p\n", ctx->pattern,
|
---|
1280 | ctx->ptr, ctx->count, ctx->u.rep->pattern));
|
---|
1281 |
|
---|
1282 | if (ctx->count < (Py_ssize_t) ctx->u.rep->pattern[1]) {
|
---|
1283 | /* not enough matches */
|
---|
1284 | ctx->u.rep->count = ctx->count;
|
---|
1285 | DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
|
---|
1286 | ctx->u.rep->pattern+3);
|
---|
1287 | if (ret) {
|
---|
1288 | RETURN_ON_ERROR(ret);
|
---|
1289 | RETURN_SUCCESS;
|
---|
1290 | }
|
---|
1291 | ctx->u.rep->count = ctx->count-1;
|
---|
1292 | state->ptr = ctx->ptr;
|
---|
1293 | RETURN_FAILURE;
|
---|
1294 | }
|
---|
1295 |
|
---|
1296 | LASTMARK_SAVE();
|
---|
1297 |
|
---|
1298 | /* see if the tail matches */
|
---|
1299 | state->repeat = ctx->u.rep->prev;
|
---|
1300 | DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
|
---|
1301 | if (ret) {
|
---|
1302 | RETURN_ON_ERROR(ret);
|
---|
1303 | RETURN_SUCCESS;
|
---|
1304 | }
|
---|
1305 |
|
---|
1306 | state->repeat = ctx->u.rep;
|
---|
1307 | state->ptr = ctx->ptr;
|
---|
1308 |
|
---|
1309 | LASTMARK_RESTORE();
|
---|
1310 |
|
---|
1311 | if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
|
---|
1312 | && ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
|
---|
1313 | state->ptr == ctx->u.rep->last_ptr)
|
---|
1314 | RETURN_FAILURE;
|
---|
1315 |
|
---|
1316 | ctx->u.rep->count = ctx->count;
|
---|
1317 | /* zero-width match protection */
|
---|
1318 | DATA_PUSH(&ctx->u.rep->last_ptr);
|
---|
1319 | ctx->u.rep->last_ptr = state->ptr;
|
---|
1320 | DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
|
---|
1321 | ctx->u.rep->pattern+3);
|
---|
1322 | DATA_POP(&ctx->u.rep->last_ptr);
|
---|
1323 | if (ret) {
|
---|
1324 | RETURN_ON_ERROR(ret);
|
---|
1325 | RETURN_SUCCESS;
|
---|
1326 | }
|
---|
1327 | ctx->u.rep->count = ctx->count-1;
|
---|
1328 | state->ptr = ctx->ptr;
|
---|
1329 | RETURN_FAILURE;
|
---|
1330 |
|
---|
1331 | case SRE_OP_GROUPREF:
|
---|
1332 | /* match backreference */
|
---|
1333 | TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
|
---|
1334 | ctx->ptr, ctx->pattern[0]));
|
---|
1335 | i = ctx->pattern[0];
|
---|
1336 | {
|
---|
1337 | Py_ssize_t groupref = i+i;
|
---|
1338 | if (groupref >= state->lastmark) {
|
---|
1339 | RETURN_FAILURE;
|
---|
1340 | } else {
|
---|
1341 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
|
---|
1342 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
|
---|
1343 | if (!p || !e || e < p)
|
---|
1344 | RETURN_FAILURE;
|
---|
1345 | while (p < e) {
|
---|
1346 | if (ctx->ptr >= end || *ctx->ptr != *p)
|
---|
1347 | RETURN_FAILURE;
|
---|
1348 | p++; ctx->ptr++;
|
---|
1349 | }
|
---|
1350 | }
|
---|
1351 | }
|
---|
1352 | ctx->pattern++;
|
---|
1353 | break;
|
---|
1354 |
|
---|
1355 | case SRE_OP_GROUPREF_IGNORE:
|
---|
1356 | /* match backreference */
|
---|
1357 | TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
|
---|
1358 | ctx->ptr, ctx->pattern[0]));
|
---|
1359 | i = ctx->pattern[0];
|
---|
1360 | {
|
---|
1361 | Py_ssize_t groupref = i+i;
|
---|
1362 | if (groupref >= state->lastmark) {
|
---|
1363 | RETURN_FAILURE;
|
---|
1364 | } else {
|
---|
1365 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
|
---|
1366 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
|
---|
1367 | if (!p || !e || e < p)
|
---|
1368 | RETURN_FAILURE;
|
---|
1369 | while (p < e) {
|
---|
1370 | if (ctx->ptr >= end ||
|
---|
1371 | state->lower(*ctx->ptr) != state->lower(*p))
|
---|
1372 | RETURN_FAILURE;
|
---|
1373 | p++; ctx->ptr++;
|
---|
1374 | }
|
---|
1375 | }
|
---|
1376 | }
|
---|
1377 | ctx->pattern++;
|
---|
1378 | break;
|
---|
1379 |
|
---|
1380 | case SRE_OP_GROUPREF_EXISTS:
|
---|
1381 | TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
|
---|
1382 | ctx->ptr, ctx->pattern[0]));
|
---|
1383 | /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
|
---|
1384 | i = ctx->pattern[0];
|
---|
1385 | {
|
---|
1386 | Py_ssize_t groupref = i+i;
|
---|
1387 | if (groupref >= state->lastmark) {
|
---|
1388 | ctx->pattern += ctx->pattern[1];
|
---|
1389 | break;
|
---|
1390 | } else {
|
---|
1391 | SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
|
---|
1392 | SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
|
---|
1393 | if (!p || !e || e < p) {
|
---|
1394 | ctx->pattern += ctx->pattern[1];
|
---|
1395 | break;
|
---|
1396 | }
|
---|
1397 | }
|
---|
1398 | }
|
---|
1399 | ctx->pattern += 2;
|
---|
1400 | break;
|
---|
1401 |
|
---|
1402 | case SRE_OP_ASSERT:
|
---|
1403 | /* assert subpattern */
|
---|
1404 | /* <ASSERT> <skip> <back> <pattern> */
|
---|
1405 | TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
|
---|
1406 | ctx->ptr, ctx->pattern[1]));
|
---|
1407 | state->ptr = ctx->ptr - ctx->pattern[1];
|
---|
1408 | if (state->ptr < state->beginning)
|
---|
1409 | RETURN_FAILURE;
|
---|
1410 | DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
|
---|
1411 | RETURN_ON_FAILURE(ret);
|
---|
1412 | ctx->pattern += ctx->pattern[0];
|
---|
1413 | break;
|
---|
1414 |
|
---|
1415 | case SRE_OP_ASSERT_NOT:
|
---|
1416 | /* assert not subpattern */
|
---|
1417 | /* <ASSERT_NOT> <skip> <back> <pattern> */
|
---|
1418 | TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
|
---|
1419 | ctx->ptr, ctx->pattern[1]));
|
---|
1420 | state->ptr = ctx->ptr - ctx->pattern[1];
|
---|
1421 | if (state->ptr >= state->beginning) {
|
---|
1422 | DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
|
---|
1423 | if (ret) {
|
---|
1424 | RETURN_ON_ERROR(ret);
|
---|
1425 | RETURN_FAILURE;
|
---|
1426 | }
|
---|
1427 | }
|
---|
1428 | ctx->pattern += ctx->pattern[0];
|
---|
1429 | break;
|
---|
1430 |
|
---|
1431 | case SRE_OP_FAILURE:
|
---|
1432 | /* immediate failure */
|
---|
1433 | TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
|
---|
1434 | RETURN_FAILURE;
|
---|
1435 |
|
---|
1436 | default:
|
---|
1437 | TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
|
---|
1438 | ctx->pattern[-1]));
|
---|
1439 | RETURN_ERROR(SRE_ERROR_ILLEGAL);
|
---|
1440 | }
|
---|
1441 | }
|
---|
1442 |
|
---|
1443 | exit:
|
---|
1444 | ctx_pos = ctx->last_ctx_pos;
|
---|
1445 | jump = ctx->jump;
|
---|
1446 | DATA_POP_DISCARD(ctx);
|
---|
1447 | if (ctx_pos == -1)
|
---|
1448 | return ret;
|
---|
1449 | DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
|
---|
1450 |
|
---|
1451 | switch (jump) {
|
---|
1452 | case JUMP_MAX_UNTIL_2:
|
---|
1453 | TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
|
---|
1454 | goto jump_max_until_2;
|
---|
1455 | case JUMP_MAX_UNTIL_3:
|
---|
1456 | TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
|
---|
1457 | goto jump_max_until_3;
|
---|
1458 | case JUMP_MIN_UNTIL_2:
|
---|
1459 | TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
|
---|
1460 | goto jump_min_until_2;
|
---|
1461 | case JUMP_MIN_UNTIL_3:
|
---|
1462 | TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
|
---|
1463 | goto jump_min_until_3;
|
---|
1464 | case JUMP_BRANCH:
|
---|
1465 | TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
|
---|
1466 | goto jump_branch;
|
---|
1467 | case JUMP_MAX_UNTIL_1:
|
---|
1468 | TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
|
---|
1469 | goto jump_max_until_1;
|
---|
1470 | case JUMP_MIN_UNTIL_1:
|
---|
1471 | TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
|
---|
1472 | goto jump_min_until_1;
|
---|
1473 | case JUMP_REPEAT:
|
---|
1474 | TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
|
---|
1475 | goto jump_repeat;
|
---|
1476 | case JUMP_REPEAT_ONE_1:
|
---|
1477 | TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
|
---|
1478 | goto jump_repeat_one_1;
|
---|
1479 | case JUMP_REPEAT_ONE_2:
|
---|
1480 | TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
|
---|
1481 | goto jump_repeat_one_2;
|
---|
1482 | case JUMP_MIN_REPEAT_ONE:
|
---|
1483 | TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
|
---|
1484 | goto jump_min_repeat_one;
|
---|
1485 | case JUMP_ASSERT:
|
---|
1486 | TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
|
---|
1487 | goto jump_assert;
|
---|
1488 | case JUMP_ASSERT_NOT:
|
---|
1489 | TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
|
---|
1490 | goto jump_assert_not;
|
---|
1491 | case JUMP_NONE:
|
---|
1492 | TRACE(("|%p|%p|RETURN %" PY_FORMAT_SIZE_T "d\n", ctx->pattern,
|
---|
1493 | ctx->ptr, ret));
|
---|
1494 | break;
|
---|
1495 | }
|
---|
1496 |
|
---|
1497 | return ret; /* should never get here */
|
---|
1498 | }
|
---|
1499 |
|
---|
1500 | LOCAL(Py_ssize_t)
|
---|
1501 | SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
|
---|
1502 | {
|
---|
1503 | SRE_CHAR* ptr = (SRE_CHAR *)state->start;
|
---|
1504 | SRE_CHAR* end = (SRE_CHAR *)state->end;
|
---|
1505 | Py_ssize_t status = 0;
|
---|
1506 | Py_ssize_t prefix_len = 0;
|
---|
1507 | Py_ssize_t prefix_skip = 0;
|
---|
1508 | SRE_CODE* prefix = NULL;
|
---|
1509 | SRE_CODE* charset = NULL;
|
---|
1510 | SRE_CODE* overlap = NULL;
|
---|
1511 | int flags = 0;
|
---|
1512 |
|
---|
1513 | if (pattern[0] == SRE_OP_INFO) {
|
---|
1514 | /* optimization info block */
|
---|
1515 | /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
|
---|
1516 |
|
---|
1517 | flags = pattern[2];
|
---|
1518 |
|
---|
1519 | if (pattern[3] > 1) {
|
---|
1520 | /* adjust end point (but make sure we leave at least one
|
---|
1521 | character in there, so literal search will work) */
|
---|
1522 | end -= pattern[3]-1;
|
---|
1523 | if (end <= ptr)
|
---|
1524 | end = ptr+1;
|
---|
1525 | }
|
---|
1526 |
|
---|
1527 | if (flags & SRE_INFO_PREFIX) {
|
---|
1528 | /* pattern starts with a known prefix */
|
---|
1529 | /* <length> <skip> <prefix data> <overlap data> */
|
---|
1530 | prefix_len = pattern[5];
|
---|
1531 | prefix_skip = pattern[6];
|
---|
1532 | prefix = pattern + 7;
|
---|
1533 | overlap = prefix + prefix_len - 1;
|
---|
1534 | } else if (flags & SRE_INFO_CHARSET)
|
---|
1535 | /* pattern starts with a character from a known set */
|
---|
1536 | /* <charset> */
|
---|
1537 | charset = pattern + 5;
|
---|
1538 |
|
---|
1539 | pattern += 1 + pattern[1];
|
---|
1540 | }
|
---|
1541 |
|
---|
1542 | TRACE(("prefix = %p %" PY_FORMAT_SIZE_T "d %" PY_FORMAT_SIZE_T "d\n",
|
---|
1543 | prefix, prefix_len, prefix_skip));
|
---|
1544 | TRACE(("charset = %p\n", charset));
|
---|
1545 |
|
---|
1546 | #if defined(USE_FAST_SEARCH)
|
---|
1547 | if (prefix_len > 1) {
|
---|
1548 | /* pattern starts with a known prefix. use the overlap
|
---|
1549 | table to skip forward as fast as we possibly can */
|
---|
1550 | Py_ssize_t i = 0;
|
---|
1551 | end = (SRE_CHAR *)state->end;
|
---|
1552 | while (ptr < end) {
|
---|
1553 | for (;;) {
|
---|
1554 | if ((SRE_CODE) ptr[0] != prefix[i]) {
|
---|
1555 | if (!i)
|
---|
1556 | break;
|
---|
1557 | else
|
---|
1558 | i = overlap[i];
|
---|
1559 | } else {
|
---|
1560 | if (++i == prefix_len) {
|
---|
1561 | /* found a potential match */
|
---|
1562 | TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
|
---|
1563 | state->start = ptr + 1 - prefix_len;
|
---|
1564 | state->ptr = ptr + 1 - prefix_len + prefix_skip;
|
---|
1565 | if (flags & SRE_INFO_LITERAL)
|
---|
1566 | return 1; /* we got all of it */
|
---|
1567 | status = SRE_MATCH(state, pattern + 2*prefix_skip);
|
---|
1568 | if (status != 0)
|
---|
1569 | return status;
|
---|
1570 | /* close but no cigar -- try again */
|
---|
1571 | i = overlap[i];
|
---|
1572 | }
|
---|
1573 | break;
|
---|
1574 | }
|
---|
1575 | }
|
---|
1576 | ptr++;
|
---|
1577 | }
|
---|
1578 | return 0;
|
---|
1579 | }
|
---|
1580 | #endif
|
---|
1581 |
|
---|
1582 | if (pattern[0] == SRE_OP_LITERAL) {
|
---|
1583 | /* pattern starts with a literal character. this is used
|
---|
1584 | for short prefixes, and if fast search is disabled */
|
---|
1585 | SRE_CODE chr = pattern[1];
|
---|
1586 | end = (SRE_CHAR *)state->end;
|
---|
1587 | for (;;) {
|
---|
1588 | while (ptr < end && (SRE_CODE) ptr[0] != chr)
|
---|
1589 | ptr++;
|
---|
1590 | if (ptr >= end)
|
---|
1591 | return 0;
|
---|
1592 | TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
---|
1593 | state->start = ptr;
|
---|
1594 | state->ptr = ++ptr;
|
---|
1595 | if (flags & SRE_INFO_LITERAL)
|
---|
1596 | return 1; /* we got all of it */
|
---|
1597 | status = SRE_MATCH(state, pattern + 2);
|
---|
1598 | if (status != 0)
|
---|
1599 | break;
|
---|
1600 | }
|
---|
1601 | } else if (charset) {
|
---|
1602 | /* pattern starts with a character from a known set */
|
---|
1603 | end = (SRE_CHAR *)state->end;
|
---|
1604 | for (;;) {
|
---|
1605 | while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
|
---|
1606 | ptr++;
|
---|
1607 | if (ptr >= end)
|
---|
1608 | return 0;
|
---|
1609 | TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
|
---|
1610 | state->start = ptr;
|
---|
1611 | state->ptr = ptr;
|
---|
1612 | status = SRE_MATCH(state, pattern);
|
---|
1613 | if (status != 0)
|
---|
1614 | break;
|
---|
1615 | ptr++;
|
---|
1616 | }
|
---|
1617 | } else
|
---|
1618 | /* general case */
|
---|
1619 | while (ptr <= end) {
|
---|
1620 | TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
|
---|
1621 | state->start = state->ptr = ptr++;
|
---|
1622 | status = SRE_MATCH(state, pattern);
|
---|
1623 | if (status != 0)
|
---|
1624 | break;
|
---|
1625 | }
|
---|
1626 |
|
---|
1627 | return status;
|
---|
1628 | }
|
---|
1629 |
|
---|
1630 | LOCAL(int)
|
---|
1631 | SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
|
---|
1632 | {
|
---|
1633 | /* check if given string is a literal template (i.e. no escapes) */
|
---|
1634 | while (len-- > 0)
|
---|
1635 | if (*ptr++ == '\\')
|
---|
1636 | return 0;
|
---|
1637 | return 1;
|
---|
1638 | }
|
---|
1639 |
|
---|
1640 | #if !defined(SRE_RECURSIVE)
|
---|
1641 |
|
---|
1642 | /* -------------------------------------------------------------------- */
|
---|
1643 | /* factories and destructors */
|
---|
1644 |
|
---|
1645 | /* see sre.h for object declarations */
|
---|
1646 | static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
|
---|
1647 | static PyObject*pattern_scanner(PatternObject*, PyObject*);
|
---|
1648 |
|
---|
1649 | static PyObject *
|
---|
1650 | sre_codesize(PyObject* self, PyObject *unused)
|
---|
1651 | {
|
---|
1652 | return PyInt_FromSize_t(sizeof(SRE_CODE));
|
---|
1653 | }
|
---|
1654 |
|
---|
1655 | static PyObject *
|
---|
1656 | sre_getlower(PyObject* self, PyObject* args)
|
---|
1657 | {
|
---|
1658 | int character, flags;
|
---|
1659 | if (!PyArg_ParseTuple(args, "ii", &character, &flags))
|
---|
1660 | return NULL;
|
---|
1661 | if (flags & SRE_FLAG_LOCALE)
|
---|
1662 | return Py_BuildValue("i", sre_lower_locale(character));
|
---|
1663 | if (flags & SRE_FLAG_UNICODE)
|
---|
1664 | #if defined(HAVE_UNICODE)
|
---|
1665 | return Py_BuildValue("i", sre_lower_unicode(character));
|
---|
1666 | #else
|
---|
1667 | return Py_BuildValue("i", sre_lower_locale(character));
|
---|
1668 | #endif
|
---|
1669 | return Py_BuildValue("i", sre_lower(character));
|
---|
1670 | }
|
---|
1671 |
|
---|
1672 | LOCAL(void)
|
---|
1673 | state_reset(SRE_STATE* state)
|
---|
1674 | {
|
---|
1675 | /* FIXME: dynamic! */
|
---|
1676 | /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
|
---|
1677 |
|
---|
1678 | state->lastmark = -1;
|
---|
1679 | state->lastindex = -1;
|
---|
1680 |
|
---|
1681 | state->repeat = NULL;
|
---|
1682 |
|
---|
1683 | data_stack_dealloc(state);
|
---|
1684 | }
|
---|
1685 |
|
---|
1686 | static void*
|
---|
1687 | getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
|
---|
1688 | {
|
---|
1689 | /* given a python object, return a data pointer, a length (in
|
---|
1690 | characters), and a character size. return NULL if the object
|
---|
1691 | is not a string (or not compatible) */
|
---|
1692 |
|
---|
1693 | PyBufferProcs *buffer;
|
---|
1694 | Py_ssize_t size, bytes;
|
---|
1695 | int charsize;
|
---|
1696 | void* ptr;
|
---|
1697 |
|
---|
1698 | #if defined(HAVE_UNICODE)
|
---|
1699 | if (PyUnicode_Check(string)) {
|
---|
1700 | /* unicode strings doesn't always support the buffer interface */
|
---|
1701 | ptr = (void*) PyUnicode_AS_DATA(string);
|
---|
1702 | /* bytes = PyUnicode_GET_DATA_SIZE(string); */
|
---|
1703 | size = PyUnicode_GET_SIZE(string);
|
---|
1704 | charsize = sizeof(Py_UNICODE);
|
---|
1705 |
|
---|
1706 | } else {
|
---|
1707 | #endif
|
---|
1708 |
|
---|
1709 | /* get pointer to string buffer */
|
---|
1710 | buffer = Py_TYPE(string)->tp_as_buffer;
|
---|
1711 | if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
|
---|
1712 | buffer->bf_getsegcount(string, NULL) != 1) {
|
---|
1713 | PyErr_SetString(PyExc_TypeError, "expected string or buffer");
|
---|
1714 | return NULL;
|
---|
1715 | }
|
---|
1716 |
|
---|
1717 | /* determine buffer size */
|
---|
1718 | bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
|
---|
1719 | if (bytes < 0) {
|
---|
1720 | PyErr_SetString(PyExc_TypeError, "buffer has negative size");
|
---|
1721 | return NULL;
|
---|
1722 | }
|
---|
1723 |
|
---|
1724 | /* determine character size */
|
---|
1725 | #if PY_VERSION_HEX >= 0x01060000
|
---|
1726 | size = PyObject_Size(string);
|
---|
1727 | #else
|
---|
1728 | size = PyObject_Length(string);
|
---|
1729 | #endif
|
---|
1730 |
|
---|
1731 | if (PyString_Check(string) || bytes == size)
|
---|
1732 | charsize = 1;
|
---|
1733 | #if defined(HAVE_UNICODE)
|
---|
1734 | else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
|
---|
1735 | charsize = sizeof(Py_UNICODE);
|
---|
1736 | #endif
|
---|
1737 | else {
|
---|
1738 | PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
|
---|
1739 | return NULL;
|
---|
1740 | }
|
---|
1741 |
|
---|
1742 | #if defined(HAVE_UNICODE)
|
---|
1743 | }
|
---|
1744 | #endif
|
---|
1745 |
|
---|
1746 | *p_length = size;
|
---|
1747 | *p_charsize = charsize;
|
---|
1748 |
|
---|
1749 | return ptr;
|
---|
1750 | }
|
---|
1751 |
|
---|
1752 | LOCAL(PyObject*)
|
---|
1753 | state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
|
---|
1754 | Py_ssize_t start, Py_ssize_t end)
|
---|
1755 | {
|
---|
1756 | /* prepare state object */
|
---|
1757 |
|
---|
1758 | Py_ssize_t length;
|
---|
1759 | int charsize;
|
---|
1760 | void* ptr;
|
---|
1761 |
|
---|
1762 | memset(state, 0, sizeof(SRE_STATE));
|
---|
1763 |
|
---|
1764 | state->lastmark = -1;
|
---|
1765 | state->lastindex = -1;
|
---|
1766 |
|
---|
1767 | ptr = getstring(string, &length, &charsize);
|
---|
1768 | if (!ptr)
|
---|
1769 | return NULL;
|
---|
1770 |
|
---|
1771 | /* adjust boundaries */
|
---|
1772 | if (start < 0)
|
---|
1773 | start = 0;
|
---|
1774 | else if (start > length)
|
---|
1775 | start = length;
|
---|
1776 |
|
---|
1777 | if (end < 0)
|
---|
1778 | end = 0;
|
---|
1779 | else if (end > length)
|
---|
1780 | end = length;
|
---|
1781 |
|
---|
1782 | state->charsize = charsize;
|
---|
1783 |
|
---|
1784 | state->beginning = ptr;
|
---|
1785 |
|
---|
1786 | state->start = (void*) ((char*) ptr + start * state->charsize);
|
---|
1787 | state->end = (void*) ((char*) ptr + end * state->charsize);
|
---|
1788 |
|
---|
1789 | Py_INCREF(string);
|
---|
1790 | state->string = string;
|
---|
1791 | state->pos = start;
|
---|
1792 | state->endpos = end;
|
---|
1793 |
|
---|
1794 | if (pattern->flags & SRE_FLAG_LOCALE)
|
---|
1795 | state->lower = sre_lower_locale;
|
---|
1796 | else if (pattern->flags & SRE_FLAG_UNICODE)
|
---|
1797 | #if defined(HAVE_UNICODE)
|
---|
1798 | state->lower = sre_lower_unicode;
|
---|
1799 | #else
|
---|
1800 | state->lower = sre_lower_locale;
|
---|
1801 | #endif
|
---|
1802 | else
|
---|
1803 | state->lower = sre_lower;
|
---|
1804 |
|
---|
1805 | return string;
|
---|
1806 | }
|
---|
1807 |
|
---|
1808 | LOCAL(void)
|
---|
1809 | state_fini(SRE_STATE* state)
|
---|
1810 | {
|
---|
1811 | Py_XDECREF(state->string);
|
---|
1812 | data_stack_dealloc(state);
|
---|
1813 | }
|
---|
1814 |
|
---|
1815 | /* calculate offset from start of string */
|
---|
1816 | #define STATE_OFFSET(state, member)\
|
---|
1817 | (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
|
---|
1818 |
|
---|
1819 | LOCAL(PyObject*)
|
---|
1820 | state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
|
---|
1821 | {
|
---|
1822 | Py_ssize_t i, j;
|
---|
1823 |
|
---|
1824 | index = (index - 1) * 2;
|
---|
1825 |
|
---|
1826 | if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
|
---|
1827 | if (empty)
|
---|
1828 | /* want empty string */
|
---|
1829 | i = j = 0;
|
---|
1830 | else {
|
---|
1831 | Py_INCREF(Py_None);
|
---|
1832 | return Py_None;
|
---|
1833 | }
|
---|
1834 | } else {
|
---|
1835 | i = STATE_OFFSET(state, state->mark[index]);
|
---|
1836 | j = STATE_OFFSET(state, state->mark[index+1]);
|
---|
1837 | }
|
---|
1838 |
|
---|
1839 | return PySequence_GetSlice(string, i, j);
|
---|
1840 | }
|
---|
1841 |
|
---|
1842 | static void
|
---|
1843 | pattern_error(int status)
|
---|
1844 | {
|
---|
1845 | switch (status) {
|
---|
1846 | case SRE_ERROR_RECURSION_LIMIT:
|
---|
1847 | PyErr_SetString(
|
---|
1848 | PyExc_RuntimeError,
|
---|
1849 | "maximum recursion limit exceeded"
|
---|
1850 | );
|
---|
1851 | break;
|
---|
1852 | case SRE_ERROR_MEMORY:
|
---|
1853 | PyErr_NoMemory();
|
---|
1854 | break;
|
---|
1855 | case SRE_ERROR_INTERRUPTED:
|
---|
1856 | /* An exception has already been raised, so let it fly */
|
---|
1857 | break;
|
---|
1858 | default:
|
---|
1859 | /* other error codes indicate compiler/engine bugs */
|
---|
1860 | PyErr_SetString(
|
---|
1861 | PyExc_RuntimeError,
|
---|
1862 | "internal error in regular expression engine"
|
---|
1863 | );
|
---|
1864 | }
|
---|
1865 | }
|
---|
1866 |
|
---|
1867 | static void
|
---|
1868 | pattern_dealloc(PatternObject* self)
|
---|
1869 | {
|
---|
1870 | if (self->weakreflist != NULL)
|
---|
1871 | PyObject_ClearWeakRefs((PyObject *) self);
|
---|
1872 | Py_XDECREF(self->pattern);
|
---|
1873 | Py_XDECREF(self->groupindex);
|
---|
1874 | Py_XDECREF(self->indexgroup);
|
---|
1875 | PyObject_DEL(self);
|
---|
1876 | }
|
---|
1877 |
|
---|
1878 | static PyObject*
|
---|
1879 | pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
|
---|
1880 | {
|
---|
1881 | SRE_STATE state;
|
---|
1882 | int status;
|
---|
1883 |
|
---|
1884 | PyObject* string;
|
---|
1885 | Py_ssize_t start = 0;
|
---|
1886 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
1887 | static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
|
---|
1888 | if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
|
---|
1889 | &string, &start, &end))
|
---|
1890 | return NULL;
|
---|
1891 |
|
---|
1892 | string = state_init(&state, self, string, start, end);
|
---|
1893 | if (!string)
|
---|
1894 | return NULL;
|
---|
1895 |
|
---|
1896 | state.ptr = state.start;
|
---|
1897 |
|
---|
1898 | TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
|
---|
1899 |
|
---|
1900 | if (state.charsize == 1) {
|
---|
1901 | status = sre_match(&state, PatternObject_GetCode(self));
|
---|
1902 | } else {
|
---|
1903 | #if defined(HAVE_UNICODE)
|
---|
1904 | status = sre_umatch(&state, PatternObject_GetCode(self));
|
---|
1905 | #endif
|
---|
1906 | }
|
---|
1907 |
|
---|
1908 | TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
|
---|
1909 | if (PyErr_Occurred())
|
---|
1910 | return NULL;
|
---|
1911 |
|
---|
1912 | state_fini(&state);
|
---|
1913 |
|
---|
1914 | return pattern_new_match(self, &state, status);
|
---|
1915 | }
|
---|
1916 |
|
---|
1917 | static PyObject*
|
---|
1918 | pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
|
---|
1919 | {
|
---|
1920 | SRE_STATE state;
|
---|
1921 | int status;
|
---|
1922 |
|
---|
1923 | PyObject* string;
|
---|
1924 | Py_ssize_t start = 0;
|
---|
1925 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
1926 | static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
|
---|
1927 | if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
|
---|
1928 | &string, &start, &end))
|
---|
1929 | return NULL;
|
---|
1930 |
|
---|
1931 | string = state_init(&state, self, string, start, end);
|
---|
1932 | if (!string)
|
---|
1933 | return NULL;
|
---|
1934 |
|
---|
1935 | TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
|
---|
1936 |
|
---|
1937 | if (state.charsize == 1) {
|
---|
1938 | status = sre_search(&state, PatternObject_GetCode(self));
|
---|
1939 | } else {
|
---|
1940 | #if defined(HAVE_UNICODE)
|
---|
1941 | status = sre_usearch(&state, PatternObject_GetCode(self));
|
---|
1942 | #endif
|
---|
1943 | }
|
---|
1944 |
|
---|
1945 | TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
|
---|
1946 |
|
---|
1947 | state_fini(&state);
|
---|
1948 |
|
---|
1949 | if (PyErr_Occurred())
|
---|
1950 | return NULL;
|
---|
1951 |
|
---|
1952 | return pattern_new_match(self, &state, status);
|
---|
1953 | }
|
---|
1954 |
|
---|
1955 | static PyObject*
|
---|
1956 | call(char* module, char* function, PyObject* args)
|
---|
1957 | {
|
---|
1958 | PyObject* name;
|
---|
1959 | PyObject* mod;
|
---|
1960 | PyObject* func;
|
---|
1961 | PyObject* result;
|
---|
1962 |
|
---|
1963 | if (!args)
|
---|
1964 | return NULL;
|
---|
1965 | name = PyString_FromString(module);
|
---|
1966 | if (!name)
|
---|
1967 | return NULL;
|
---|
1968 | mod = PyImport_Import(name);
|
---|
1969 | Py_DECREF(name);
|
---|
1970 | if (!mod)
|
---|
1971 | return NULL;
|
---|
1972 | func = PyObject_GetAttrString(mod, function);
|
---|
1973 | Py_DECREF(mod);
|
---|
1974 | if (!func)
|
---|
1975 | return NULL;
|
---|
1976 | result = PyObject_CallObject(func, args);
|
---|
1977 | Py_DECREF(func);
|
---|
1978 | Py_DECREF(args);
|
---|
1979 | return result;
|
---|
1980 | }
|
---|
1981 |
|
---|
1982 | #ifdef USE_BUILTIN_COPY
|
---|
1983 | static int
|
---|
1984 | deepcopy(PyObject** object, PyObject* memo)
|
---|
1985 | {
|
---|
1986 | PyObject* copy;
|
---|
1987 |
|
---|
1988 | copy = call(
|
---|
1989 | "copy", "deepcopy",
|
---|
1990 | PyTuple_Pack(2, *object, memo)
|
---|
1991 | );
|
---|
1992 | if (!copy)
|
---|
1993 | return 0;
|
---|
1994 |
|
---|
1995 | Py_DECREF(*object);
|
---|
1996 | *object = copy;
|
---|
1997 |
|
---|
1998 | return 1; /* success */
|
---|
1999 | }
|
---|
2000 | #endif
|
---|
2001 |
|
---|
2002 | static PyObject*
|
---|
2003 | join_list(PyObject* list, PyObject* string)
|
---|
2004 | {
|
---|
2005 | /* join list elements */
|
---|
2006 |
|
---|
2007 | PyObject* joiner;
|
---|
2008 | #if PY_VERSION_HEX >= 0x01060000
|
---|
2009 | PyObject* function;
|
---|
2010 | PyObject* args;
|
---|
2011 | #endif
|
---|
2012 | PyObject* result;
|
---|
2013 |
|
---|
2014 | joiner = PySequence_GetSlice(string, 0, 0);
|
---|
2015 | if (!joiner)
|
---|
2016 | return NULL;
|
---|
2017 |
|
---|
2018 | if (PyList_GET_SIZE(list) == 0) {
|
---|
2019 | Py_DECREF(list);
|
---|
2020 | return joiner;
|
---|
2021 | }
|
---|
2022 |
|
---|
2023 | #if PY_VERSION_HEX >= 0x01060000
|
---|
2024 | function = PyObject_GetAttrString(joiner, "join");
|
---|
2025 | if (!function) {
|
---|
2026 | Py_DECREF(joiner);
|
---|
2027 | return NULL;
|
---|
2028 | }
|
---|
2029 | args = PyTuple_New(1);
|
---|
2030 | if (!args) {
|
---|
2031 | Py_DECREF(function);
|
---|
2032 | Py_DECREF(joiner);
|
---|
2033 | return NULL;
|
---|
2034 | }
|
---|
2035 | PyTuple_SET_ITEM(args, 0, list);
|
---|
2036 | result = PyObject_CallObject(function, args);
|
---|
2037 | Py_DECREF(args); /* also removes list */
|
---|
2038 | Py_DECREF(function);
|
---|
2039 | #else
|
---|
2040 | result = call(
|
---|
2041 | "string", "join",
|
---|
2042 | PyTuple_Pack(2, list, joiner)
|
---|
2043 | );
|
---|
2044 | #endif
|
---|
2045 | Py_DECREF(joiner);
|
---|
2046 |
|
---|
2047 | return result;
|
---|
2048 | }
|
---|
2049 |
|
---|
2050 | static PyObject*
|
---|
2051 | pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
---|
2052 | {
|
---|
2053 | SRE_STATE state;
|
---|
2054 | PyObject* list;
|
---|
2055 | int status;
|
---|
2056 | Py_ssize_t i, b, e;
|
---|
2057 |
|
---|
2058 | PyObject* string;
|
---|
2059 | Py_ssize_t start = 0;
|
---|
2060 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
2061 | static char* kwlist[] = { "source", "pos", "endpos", NULL };
|
---|
2062 | if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
|
---|
2063 | &string, &start, &end))
|
---|
2064 | return NULL;
|
---|
2065 |
|
---|
2066 | string = state_init(&state, self, string, start, end);
|
---|
2067 | if (!string)
|
---|
2068 | return NULL;
|
---|
2069 |
|
---|
2070 | list = PyList_New(0);
|
---|
2071 | if (!list) {
|
---|
2072 | state_fini(&state);
|
---|
2073 | return NULL;
|
---|
2074 | }
|
---|
2075 |
|
---|
2076 | while (state.start <= state.end) {
|
---|
2077 |
|
---|
2078 | PyObject* item;
|
---|
2079 |
|
---|
2080 | state_reset(&state);
|
---|
2081 |
|
---|
2082 | state.ptr = state.start;
|
---|
2083 |
|
---|
2084 | if (state.charsize == 1) {
|
---|
2085 | status = sre_search(&state, PatternObject_GetCode(self));
|
---|
2086 | } else {
|
---|
2087 | #if defined(HAVE_UNICODE)
|
---|
2088 | status = sre_usearch(&state, PatternObject_GetCode(self));
|
---|
2089 | #endif
|
---|
2090 | }
|
---|
2091 |
|
---|
2092 | if (PyErr_Occurred())
|
---|
2093 | goto error;
|
---|
2094 |
|
---|
2095 | if (status <= 0) {
|
---|
2096 | if (status == 0)
|
---|
2097 | break;
|
---|
2098 | pattern_error(status);
|
---|
2099 | goto error;
|
---|
2100 | }
|
---|
2101 |
|
---|
2102 | /* don't bother to build a match object */
|
---|
2103 | switch (self->groups) {
|
---|
2104 | case 0:
|
---|
2105 | b = STATE_OFFSET(&state, state.start);
|
---|
2106 | e = STATE_OFFSET(&state, state.ptr);
|
---|
2107 | item = PySequence_GetSlice(string, b, e);
|
---|
2108 | if (!item)
|
---|
2109 | goto error;
|
---|
2110 | break;
|
---|
2111 | case 1:
|
---|
2112 | item = state_getslice(&state, 1, string, 1);
|
---|
2113 | if (!item)
|
---|
2114 | goto error;
|
---|
2115 | break;
|
---|
2116 | default:
|
---|
2117 | item = PyTuple_New(self->groups);
|
---|
2118 | if (!item)
|
---|
2119 | goto error;
|
---|
2120 | for (i = 0; i < self->groups; i++) {
|
---|
2121 | PyObject* o = state_getslice(&state, i+1, string, 1);
|
---|
2122 | if (!o) {
|
---|
2123 | Py_DECREF(item);
|
---|
2124 | goto error;
|
---|
2125 | }
|
---|
2126 | PyTuple_SET_ITEM(item, i, o);
|
---|
2127 | }
|
---|
2128 | break;
|
---|
2129 | }
|
---|
2130 |
|
---|
2131 | status = PyList_Append(list, item);
|
---|
2132 | Py_DECREF(item);
|
---|
2133 | if (status < 0)
|
---|
2134 | goto error;
|
---|
2135 |
|
---|
2136 | if (state.ptr == state.start)
|
---|
2137 | state.start = (void*) ((char*) state.ptr + state.charsize);
|
---|
2138 | else
|
---|
2139 | state.start = state.ptr;
|
---|
2140 |
|
---|
2141 | }
|
---|
2142 |
|
---|
2143 | state_fini(&state);
|
---|
2144 | return list;
|
---|
2145 |
|
---|
2146 | error:
|
---|
2147 | Py_DECREF(list);
|
---|
2148 | state_fini(&state);
|
---|
2149 | return NULL;
|
---|
2150 |
|
---|
2151 | }
|
---|
2152 |
|
---|
2153 | #if PY_VERSION_HEX >= 0x02020000
|
---|
2154 | static PyObject*
|
---|
2155 | pattern_finditer(PatternObject* pattern, PyObject* args)
|
---|
2156 | {
|
---|
2157 | PyObject* scanner;
|
---|
2158 | PyObject* search;
|
---|
2159 | PyObject* iterator;
|
---|
2160 |
|
---|
2161 | scanner = pattern_scanner(pattern, args);
|
---|
2162 | if (!scanner)
|
---|
2163 | return NULL;
|
---|
2164 |
|
---|
2165 | search = PyObject_GetAttrString(scanner, "search");
|
---|
2166 | Py_DECREF(scanner);
|
---|
2167 | if (!search)
|
---|
2168 | return NULL;
|
---|
2169 |
|
---|
2170 | iterator = PyCallIter_New(search, Py_None);
|
---|
2171 | Py_DECREF(search);
|
---|
2172 |
|
---|
2173 | return iterator;
|
---|
2174 | }
|
---|
2175 | #endif
|
---|
2176 |
|
---|
2177 | static PyObject*
|
---|
2178 | pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
---|
2179 | {
|
---|
2180 | SRE_STATE state;
|
---|
2181 | PyObject* list;
|
---|
2182 | PyObject* item;
|
---|
2183 | int status;
|
---|
2184 | Py_ssize_t n;
|
---|
2185 | Py_ssize_t i;
|
---|
2186 | void* last;
|
---|
2187 |
|
---|
2188 | PyObject* string;
|
---|
2189 | Py_ssize_t maxsplit = 0;
|
---|
2190 | static char* kwlist[] = { "source", "maxsplit", NULL };
|
---|
2191 | if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
|
---|
2192 | &string, &maxsplit))
|
---|
2193 | return NULL;
|
---|
2194 |
|
---|
2195 | string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
|
---|
2196 | if (!string)
|
---|
2197 | return NULL;
|
---|
2198 |
|
---|
2199 | list = PyList_New(0);
|
---|
2200 | if (!list) {
|
---|
2201 | state_fini(&state);
|
---|
2202 | return NULL;
|
---|
2203 | }
|
---|
2204 |
|
---|
2205 | n = 0;
|
---|
2206 | last = state.start;
|
---|
2207 |
|
---|
2208 | while (!maxsplit || n < maxsplit) {
|
---|
2209 |
|
---|
2210 | state_reset(&state);
|
---|
2211 |
|
---|
2212 | state.ptr = state.start;
|
---|
2213 |
|
---|
2214 | if (state.charsize == 1) {
|
---|
2215 | status = sre_search(&state, PatternObject_GetCode(self));
|
---|
2216 | } else {
|
---|
2217 | #if defined(HAVE_UNICODE)
|
---|
2218 | status = sre_usearch(&state, PatternObject_GetCode(self));
|
---|
2219 | #endif
|
---|
2220 | }
|
---|
2221 |
|
---|
2222 | if (PyErr_Occurred())
|
---|
2223 | goto error;
|
---|
2224 |
|
---|
2225 | if (status <= 0) {
|
---|
2226 | if (status == 0)
|
---|
2227 | break;
|
---|
2228 | pattern_error(status);
|
---|
2229 | goto error;
|
---|
2230 | }
|
---|
2231 |
|
---|
2232 | if (state.start == state.ptr) {
|
---|
2233 | if (last == state.end)
|
---|
2234 | break;
|
---|
2235 | /* skip one character */
|
---|
2236 | state.start = (void*) ((char*) state.ptr + state.charsize);
|
---|
2237 | continue;
|
---|
2238 | }
|
---|
2239 |
|
---|
2240 | /* get segment before this match */
|
---|
2241 | item = PySequence_GetSlice(
|
---|
2242 | string, STATE_OFFSET(&state, last),
|
---|
2243 | STATE_OFFSET(&state, state.start)
|
---|
2244 | );
|
---|
2245 | if (!item)
|
---|
2246 | goto error;
|
---|
2247 | status = PyList_Append(list, item);
|
---|
2248 | Py_DECREF(item);
|
---|
2249 | if (status < 0)
|
---|
2250 | goto error;
|
---|
2251 |
|
---|
2252 | /* add groups (if any) */
|
---|
2253 | for (i = 0; i < self->groups; i++) {
|
---|
2254 | item = state_getslice(&state, i+1, string, 0);
|
---|
2255 | if (!item)
|
---|
2256 | goto error;
|
---|
2257 | status = PyList_Append(list, item);
|
---|
2258 | Py_DECREF(item);
|
---|
2259 | if (status < 0)
|
---|
2260 | goto error;
|
---|
2261 | }
|
---|
2262 |
|
---|
2263 | n = n + 1;
|
---|
2264 |
|
---|
2265 | last = state.start = state.ptr;
|
---|
2266 |
|
---|
2267 | }
|
---|
2268 |
|
---|
2269 | /* get segment following last match (even if empty) */
|
---|
2270 | item = PySequence_GetSlice(
|
---|
2271 | string, STATE_OFFSET(&state, last), state.endpos
|
---|
2272 | );
|
---|
2273 | if (!item)
|
---|
2274 | goto error;
|
---|
2275 | status = PyList_Append(list, item);
|
---|
2276 | Py_DECREF(item);
|
---|
2277 | if (status < 0)
|
---|
2278 | goto error;
|
---|
2279 |
|
---|
2280 | state_fini(&state);
|
---|
2281 | return list;
|
---|
2282 |
|
---|
2283 | error:
|
---|
2284 | Py_DECREF(list);
|
---|
2285 | state_fini(&state);
|
---|
2286 | return NULL;
|
---|
2287 |
|
---|
2288 | }
|
---|
2289 |
|
---|
2290 | static PyObject*
|
---|
2291 | pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
|
---|
2292 | Py_ssize_t count, Py_ssize_t subn)
|
---|
2293 | {
|
---|
2294 | SRE_STATE state;
|
---|
2295 | PyObject* list;
|
---|
2296 | PyObject* item;
|
---|
2297 | PyObject* filter;
|
---|
2298 | PyObject* args;
|
---|
2299 | PyObject* match;
|
---|
2300 | void* ptr;
|
---|
2301 | int status;
|
---|
2302 | Py_ssize_t n;
|
---|
2303 | Py_ssize_t i, b, e;
|
---|
2304 | int bint;
|
---|
2305 | int filter_is_callable;
|
---|
2306 |
|
---|
2307 | if (PyCallable_Check(ptemplate)) {
|
---|
2308 | /* sub/subn takes either a function or a template */
|
---|
2309 | filter = ptemplate;
|
---|
2310 | Py_INCREF(filter);
|
---|
2311 | filter_is_callable = 1;
|
---|
2312 | } else {
|
---|
2313 | /* if not callable, check if it's a literal string */
|
---|
2314 | int literal;
|
---|
2315 | ptr = getstring(ptemplate, &n, &bint);
|
---|
2316 | b = bint;
|
---|
2317 | if (ptr) {
|
---|
2318 | if (b == 1) {
|
---|
2319 | literal = sre_literal_template((unsigned char *)ptr, n);
|
---|
2320 | } else {
|
---|
2321 | #if defined(HAVE_UNICODE)
|
---|
2322 | literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
|
---|
2323 | #endif
|
---|
2324 | }
|
---|
2325 | } else {
|
---|
2326 | PyErr_Clear();
|
---|
2327 | literal = 0;
|
---|
2328 | }
|
---|
2329 | if (literal) {
|
---|
2330 | filter = ptemplate;
|
---|
2331 | Py_INCREF(filter);
|
---|
2332 | filter_is_callable = 0;
|
---|
2333 | } else {
|
---|
2334 | /* not a literal; hand it over to the template compiler */
|
---|
2335 | filter = call(
|
---|
2336 | SRE_PY_MODULE, "_subx",
|
---|
2337 | PyTuple_Pack(2, self, ptemplate)
|
---|
2338 | );
|
---|
2339 | if (!filter)
|
---|
2340 | return NULL;
|
---|
2341 | filter_is_callable = PyCallable_Check(filter);
|
---|
2342 | }
|
---|
2343 | }
|
---|
2344 |
|
---|
2345 | string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
|
---|
2346 | if (!string) {
|
---|
2347 | Py_DECREF(filter);
|
---|
2348 | return NULL;
|
---|
2349 | }
|
---|
2350 |
|
---|
2351 | list = PyList_New(0);
|
---|
2352 | if (!list) {
|
---|
2353 | Py_DECREF(filter);
|
---|
2354 | state_fini(&state);
|
---|
2355 | return NULL;
|
---|
2356 | }
|
---|
2357 |
|
---|
2358 | n = i = 0;
|
---|
2359 |
|
---|
2360 | while (!count || n < count) {
|
---|
2361 |
|
---|
2362 | state_reset(&state);
|
---|
2363 |
|
---|
2364 | state.ptr = state.start;
|
---|
2365 |
|
---|
2366 | if (state.charsize == 1) {
|
---|
2367 | status = sre_search(&state, PatternObject_GetCode(self));
|
---|
2368 | } else {
|
---|
2369 | #if defined(HAVE_UNICODE)
|
---|
2370 | status = sre_usearch(&state, PatternObject_GetCode(self));
|
---|
2371 | #endif
|
---|
2372 | }
|
---|
2373 |
|
---|
2374 | if (PyErr_Occurred())
|
---|
2375 | goto error;
|
---|
2376 |
|
---|
2377 | if (status <= 0) {
|
---|
2378 | if (status == 0)
|
---|
2379 | break;
|
---|
2380 | pattern_error(status);
|
---|
2381 | goto error;
|
---|
2382 | }
|
---|
2383 |
|
---|
2384 | b = STATE_OFFSET(&state, state.start);
|
---|
2385 | e = STATE_OFFSET(&state, state.ptr);
|
---|
2386 |
|
---|
2387 | if (i < b) {
|
---|
2388 | /* get segment before this match */
|
---|
2389 | item = PySequence_GetSlice(string, i, b);
|
---|
2390 | if (!item)
|
---|
2391 | goto error;
|
---|
2392 | status = PyList_Append(list, item);
|
---|
2393 | Py_DECREF(item);
|
---|
2394 | if (status < 0)
|
---|
2395 | goto error;
|
---|
2396 |
|
---|
2397 | } else if (i == b && i == e && n > 0)
|
---|
2398 | /* ignore empty match on latest position */
|
---|
2399 | goto next;
|
---|
2400 |
|
---|
2401 | if (filter_is_callable) {
|
---|
2402 | /* pass match object through filter */
|
---|
2403 | match = pattern_new_match(self, &state, 1);
|
---|
2404 | if (!match)
|
---|
2405 | goto error;
|
---|
2406 | args = PyTuple_Pack(1, match);
|
---|
2407 | if (!args) {
|
---|
2408 | Py_DECREF(match);
|
---|
2409 | goto error;
|
---|
2410 | }
|
---|
2411 | item = PyObject_CallObject(filter, args);
|
---|
2412 | Py_DECREF(args);
|
---|
2413 | Py_DECREF(match);
|
---|
2414 | if (!item)
|
---|
2415 | goto error;
|
---|
2416 | } else {
|
---|
2417 | /* filter is literal string */
|
---|
2418 | item = filter;
|
---|
2419 | Py_INCREF(item);
|
---|
2420 | }
|
---|
2421 |
|
---|
2422 | /* add to list */
|
---|
2423 | if (item != Py_None) {
|
---|
2424 | status = PyList_Append(list, item);
|
---|
2425 | Py_DECREF(item);
|
---|
2426 | if (status < 0)
|
---|
2427 | goto error;
|
---|
2428 | }
|
---|
2429 |
|
---|
2430 | i = e;
|
---|
2431 | n = n + 1;
|
---|
2432 |
|
---|
2433 | next:
|
---|
2434 | /* move on */
|
---|
2435 | if (state.ptr == state.start)
|
---|
2436 | state.start = (void*) ((char*) state.ptr + state.charsize);
|
---|
2437 | else
|
---|
2438 | state.start = state.ptr;
|
---|
2439 |
|
---|
2440 | }
|
---|
2441 |
|
---|
2442 | /* get segment following last match */
|
---|
2443 | if (i < state.endpos) {
|
---|
2444 | item = PySequence_GetSlice(string, i, state.endpos);
|
---|
2445 | if (!item)
|
---|
2446 | goto error;
|
---|
2447 | status = PyList_Append(list, item);
|
---|
2448 | Py_DECREF(item);
|
---|
2449 | if (status < 0)
|
---|
2450 | goto error;
|
---|
2451 | }
|
---|
2452 |
|
---|
2453 | state_fini(&state);
|
---|
2454 |
|
---|
2455 | Py_DECREF(filter);
|
---|
2456 |
|
---|
2457 | /* convert list to single string (also removes list) */
|
---|
2458 | item = join_list(list, string);
|
---|
2459 |
|
---|
2460 | if (!item)
|
---|
2461 | return NULL;
|
---|
2462 |
|
---|
2463 | if (subn)
|
---|
2464 | return Py_BuildValue("Nn", item, n);
|
---|
2465 |
|
---|
2466 | return item;
|
---|
2467 |
|
---|
2468 | error:
|
---|
2469 | Py_DECREF(list);
|
---|
2470 | state_fini(&state);
|
---|
2471 | Py_DECREF(filter);
|
---|
2472 | return NULL;
|
---|
2473 |
|
---|
2474 | }
|
---|
2475 |
|
---|
2476 | static PyObject*
|
---|
2477 | pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
|
---|
2478 | {
|
---|
2479 | PyObject* ptemplate;
|
---|
2480 | PyObject* string;
|
---|
2481 | Py_ssize_t count = 0;
|
---|
2482 | static char* kwlist[] = { "repl", "string", "count", NULL };
|
---|
2483 | if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
|
---|
2484 | &ptemplate, &string, &count))
|
---|
2485 | return NULL;
|
---|
2486 |
|
---|
2487 | return pattern_subx(self, ptemplate, string, count, 0);
|
---|
2488 | }
|
---|
2489 |
|
---|
2490 | static PyObject*
|
---|
2491 | pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
|
---|
2492 | {
|
---|
2493 | PyObject* ptemplate;
|
---|
2494 | PyObject* string;
|
---|
2495 | Py_ssize_t count = 0;
|
---|
2496 | static char* kwlist[] = { "repl", "string", "count", NULL };
|
---|
2497 | if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
|
---|
2498 | &ptemplate, &string, &count))
|
---|
2499 | return NULL;
|
---|
2500 |
|
---|
2501 | return pattern_subx(self, ptemplate, string, count, 1);
|
---|
2502 | }
|
---|
2503 |
|
---|
2504 | static PyObject*
|
---|
2505 | pattern_copy(PatternObject* self, PyObject *unused)
|
---|
2506 | {
|
---|
2507 | #ifdef USE_BUILTIN_COPY
|
---|
2508 | PatternObject* copy;
|
---|
2509 | int offset;
|
---|
2510 |
|
---|
2511 | copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
|
---|
2512 | if (!copy)
|
---|
2513 | return NULL;
|
---|
2514 |
|
---|
2515 | offset = offsetof(PatternObject, groups);
|
---|
2516 |
|
---|
2517 | Py_XINCREF(self->groupindex);
|
---|
2518 | Py_XINCREF(self->indexgroup);
|
---|
2519 | Py_XINCREF(self->pattern);
|
---|
2520 |
|
---|
2521 | memcpy((char*) copy + offset, (char*) self + offset,
|
---|
2522 | sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
|
---|
2523 | copy->weakreflist = NULL;
|
---|
2524 |
|
---|
2525 | return (PyObject*) copy;
|
---|
2526 | #else
|
---|
2527 | PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
|
---|
2528 | return NULL;
|
---|
2529 | #endif
|
---|
2530 | }
|
---|
2531 |
|
---|
2532 | static PyObject*
|
---|
2533 | pattern_deepcopy(PatternObject* self, PyObject* memo)
|
---|
2534 | {
|
---|
2535 | #ifdef USE_BUILTIN_COPY
|
---|
2536 | PatternObject* copy;
|
---|
2537 |
|
---|
2538 | copy = (PatternObject*) pattern_copy(self);
|
---|
2539 | if (!copy)
|
---|
2540 | return NULL;
|
---|
2541 |
|
---|
2542 | if (!deepcopy(©->groupindex, memo) ||
|
---|
2543 | !deepcopy(©->indexgroup, memo) ||
|
---|
2544 | !deepcopy(©->pattern, memo)) {
|
---|
2545 | Py_DECREF(copy);
|
---|
2546 | return NULL;
|
---|
2547 | }
|
---|
2548 |
|
---|
2549 | #else
|
---|
2550 | PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
|
---|
2551 | return NULL;
|
---|
2552 | #endif
|
---|
2553 | }
|
---|
2554 |
|
---|
2555 | PyDoc_STRVAR(pattern_match_doc,
|
---|
2556 | "match(string[, pos[, endpos]]) --> match object or None.\n\
|
---|
2557 | Matches zero or more characters at the beginning of the string");
|
---|
2558 |
|
---|
2559 | PyDoc_STRVAR(pattern_search_doc,
|
---|
2560 | "search(string[, pos[, endpos]]) --> match object or None.\n\
|
---|
2561 | Scan through string looking for a match, and return a corresponding\n\
|
---|
2562 | match object instance. Return None if no position in the string matches.");
|
---|
2563 |
|
---|
2564 | PyDoc_STRVAR(pattern_split_doc,
|
---|
2565 | "split(string[, maxsplit = 0]) --> list.\n\
|
---|
2566 | Split string by the occurrences of pattern.");
|
---|
2567 |
|
---|
2568 | PyDoc_STRVAR(pattern_findall_doc,
|
---|
2569 | "findall(string[, pos[, endpos]]) --> list.\n\
|
---|
2570 | Return a list of all non-overlapping matches of pattern in string.");
|
---|
2571 |
|
---|
2572 | PyDoc_STRVAR(pattern_finditer_doc,
|
---|
2573 | "finditer(string[, pos[, endpos]]) --> iterator.\n\
|
---|
2574 | Return an iterator over all non-overlapping matches for the \n\
|
---|
2575 | RE pattern in string. For each match, the iterator returns a\n\
|
---|
2576 | match object.");
|
---|
2577 |
|
---|
2578 | PyDoc_STRVAR(pattern_sub_doc,
|
---|
2579 | "sub(repl, string[, count = 0]) --> newstring\n\
|
---|
2580 | Return the string obtained by replacing the leftmost non-overlapping\n\
|
---|
2581 | occurrences of pattern in string by the replacement repl.");
|
---|
2582 |
|
---|
2583 | PyDoc_STRVAR(pattern_subn_doc,
|
---|
2584 | "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
|
---|
2585 | Return the tuple (new_string, number_of_subs_made) found by replacing\n\
|
---|
2586 | the leftmost non-overlapping occurrences of pattern with the\n\
|
---|
2587 | replacement repl.");
|
---|
2588 |
|
---|
2589 | PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
|
---|
2590 |
|
---|
2591 | static PyMethodDef pattern_methods[] = {
|
---|
2592 | {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
|
---|
2593 | pattern_match_doc},
|
---|
2594 | {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
|
---|
2595 | pattern_search_doc},
|
---|
2596 | {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
|
---|
2597 | pattern_sub_doc},
|
---|
2598 | {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
|
---|
2599 | pattern_subn_doc},
|
---|
2600 | {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
|
---|
2601 | pattern_split_doc},
|
---|
2602 | {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
|
---|
2603 | pattern_findall_doc},
|
---|
2604 | #if PY_VERSION_HEX >= 0x02020000
|
---|
2605 | {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
|
---|
2606 | pattern_finditer_doc},
|
---|
2607 | #endif
|
---|
2608 | {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
|
---|
2609 | {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
|
---|
2610 | {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
|
---|
2611 | {NULL, NULL}
|
---|
2612 | };
|
---|
2613 |
|
---|
2614 | #define PAT_OFF(x) offsetof(PatternObject, x)
|
---|
2615 | static PyMemberDef pattern_members[] = {
|
---|
2616 | {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
|
---|
2617 | {"flags", T_INT, PAT_OFF(flags), READONLY},
|
---|
2618 | {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
|
---|
2619 | {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
|
---|
2620 | {NULL} /* Sentinel */
|
---|
2621 | };
|
---|
2622 |
|
---|
2623 | statichere PyTypeObject Pattern_Type = {
|
---|
2624 | PyObject_HEAD_INIT(NULL)
|
---|
2625 | 0, "_" SRE_MODULE ".SRE_Pattern",
|
---|
2626 | sizeof(PatternObject), sizeof(SRE_CODE),
|
---|
2627 | (destructor)pattern_dealloc, /*tp_dealloc*/
|
---|
2628 | 0, /* tp_print */
|
---|
2629 | 0, /* tp_getattrn */
|
---|
2630 | 0, /* tp_setattr */
|
---|
2631 | 0, /* tp_compare */
|
---|
2632 | 0, /* tp_repr */
|
---|
2633 | 0, /* tp_as_number */
|
---|
2634 | 0, /* tp_as_sequence */
|
---|
2635 | 0, /* tp_as_mapping */
|
---|
2636 | 0, /* tp_hash */
|
---|
2637 | 0, /* tp_call */
|
---|
2638 | 0, /* tp_str */
|
---|
2639 | 0, /* tp_getattro */
|
---|
2640 | 0, /* tp_setattro */
|
---|
2641 | 0, /* tp_as_buffer */
|
---|
2642 | Py_TPFLAGS_DEFAULT, /* tp_flags */
|
---|
2643 | pattern_doc, /* tp_doc */
|
---|
2644 | 0, /* tp_traverse */
|
---|
2645 | 0, /* tp_clear */
|
---|
2646 | 0, /* tp_richcompare */
|
---|
2647 | offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
|
---|
2648 | 0, /* tp_iter */
|
---|
2649 | 0, /* tp_iternext */
|
---|
2650 | pattern_methods, /* tp_methods */
|
---|
2651 | pattern_members, /* tp_members */
|
---|
2652 | };
|
---|
2653 |
|
---|
2654 | static int _validate(PatternObject *self); /* Forward */
|
---|
2655 |
|
---|
2656 | static PyObject *
|
---|
2657 | _compile(PyObject* self_, PyObject* args)
|
---|
2658 | {
|
---|
2659 | /* "compile" pattern descriptor to pattern object */
|
---|
2660 |
|
---|
2661 | PatternObject* self;
|
---|
2662 | Py_ssize_t i, n;
|
---|
2663 |
|
---|
2664 | PyObject* pattern;
|
---|
2665 | int flags = 0;
|
---|
2666 | PyObject* code;
|
---|
2667 | Py_ssize_t groups = 0;
|
---|
2668 | PyObject* groupindex = NULL;
|
---|
2669 | PyObject* indexgroup = NULL;
|
---|
2670 | if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
|
---|
2671 | &PyList_Type, &code, &groups,
|
---|
2672 | &groupindex, &indexgroup))
|
---|
2673 | return NULL;
|
---|
2674 |
|
---|
2675 | n = PyList_GET_SIZE(code);
|
---|
2676 | /* coverity[ampersand_in_size] */
|
---|
2677 | self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
|
---|
2678 | if (!self)
|
---|
2679 | return NULL;
|
---|
2680 | self->weakreflist = NULL;
|
---|
2681 | self->pattern = NULL;
|
---|
2682 | self->groupindex = NULL;
|
---|
2683 | self->indexgroup = NULL;
|
---|
2684 |
|
---|
2685 | self->codesize = n;
|
---|
2686 |
|
---|
2687 | for (i = 0; i < n; i++) {
|
---|
2688 | PyObject *o = PyList_GET_ITEM(code, i);
|
---|
2689 | unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
|
---|
2690 | : PyLong_AsUnsignedLong(o);
|
---|
2691 | if (value == (unsigned long)-1 && PyErr_Occurred()) {
|
---|
2692 | if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
|
---|
2693 | PyErr_SetString(PyExc_OverflowError,
|
---|
2694 | "regular expression code size limit exceeded");
|
---|
2695 | }
|
---|
2696 | break;
|
---|
2697 | }
|
---|
2698 | self->code[i] = (SRE_CODE) value;
|
---|
2699 | if ((unsigned long) self->code[i] != value) {
|
---|
2700 | PyErr_SetString(PyExc_OverflowError,
|
---|
2701 | "regular expression code size limit exceeded");
|
---|
2702 | break;
|
---|
2703 | }
|
---|
2704 | }
|
---|
2705 |
|
---|
2706 | if (PyErr_Occurred()) {
|
---|
2707 | Py_DECREF(self);
|
---|
2708 | return NULL;
|
---|
2709 | }
|
---|
2710 |
|
---|
2711 | Py_INCREF(pattern);
|
---|
2712 | self->pattern = pattern;
|
---|
2713 |
|
---|
2714 | self->flags = flags;
|
---|
2715 |
|
---|
2716 | self->groups = groups;
|
---|
2717 |
|
---|
2718 | Py_XINCREF(groupindex);
|
---|
2719 | self->groupindex = groupindex;
|
---|
2720 |
|
---|
2721 | Py_XINCREF(indexgroup);
|
---|
2722 | self->indexgroup = indexgroup;
|
---|
2723 |
|
---|
2724 | self->weakreflist = NULL;
|
---|
2725 |
|
---|
2726 | if (!_validate(self)) {
|
---|
2727 | Py_DECREF(self);
|
---|
2728 | return NULL;
|
---|
2729 | }
|
---|
2730 |
|
---|
2731 | return (PyObject*) self;
|
---|
2732 | }
|
---|
2733 |
|
---|
2734 | /* -------------------------------------------------------------------- */
|
---|
2735 | /* Code validation */
|
---|
2736 |
|
---|
2737 | /* To learn more about this code, have a look at the _compile() function in
|
---|
2738 | Lib/sre_compile.py. The validation functions below checks the code array
|
---|
2739 | for conformance with the code patterns generated there.
|
---|
2740 |
|
---|
2741 | The nice thing about the generated code is that it is position-independent:
|
---|
2742 | all jumps are relative jumps forward. Also, jumps don't cross each other:
|
---|
2743 | the target of a later jump is always earlier than the target of an earlier
|
---|
2744 | jump. IOW, this is okay:
|
---|
2745 |
|
---|
2746 | J---------J-------T--------T
|
---|
2747 | \ \_____/ /
|
---|
2748 | \______________________/
|
---|
2749 |
|
---|
2750 | but this is not:
|
---|
2751 |
|
---|
2752 | J---------J-------T--------T
|
---|
2753 | \_________\_____/ /
|
---|
2754 | \____________/
|
---|
2755 |
|
---|
2756 | It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
|
---|
2757 | bytes wide (the latter if Python is compiled for "wide" unicode support).
|
---|
2758 | */
|
---|
2759 |
|
---|
2760 | /* Defining this one enables tracing of the validator */
|
---|
2761 | #undef VVERBOSE
|
---|
2762 |
|
---|
2763 | /* Trace macro for the validator */
|
---|
2764 | #if defined(VVERBOSE)
|
---|
2765 | #define VTRACE(v) printf v
|
---|
2766 | #else
|
---|
2767 | #define VTRACE(v) do {} while(0) /* do nothing */
|
---|
2768 | #endif
|
---|
2769 |
|
---|
2770 | /* Report failure */
|
---|
2771 | #define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
|
---|
2772 |
|
---|
2773 | /* Extract opcode, argument, or skip count from code array */
|
---|
2774 | #define GET_OP \
|
---|
2775 | do { \
|
---|
2776 | VTRACE(("%p: ", code)); \
|
---|
2777 | if (code >= end) FAIL; \
|
---|
2778 | op = *code++; \
|
---|
2779 | VTRACE(("%lu (op)\n", (unsigned long)op)); \
|
---|
2780 | } while (0)
|
---|
2781 | #define GET_ARG \
|
---|
2782 | do { \
|
---|
2783 | VTRACE(("%p= ", code)); \
|
---|
2784 | if (code >= end) FAIL; \
|
---|
2785 | arg = *code++; \
|
---|
2786 | VTRACE(("%lu (arg)\n", (unsigned long)arg)); \
|
---|
2787 | } while (0)
|
---|
2788 | #define GET_SKIP_ADJ(adj) \
|
---|
2789 | do { \
|
---|
2790 | VTRACE(("%p= ", code)); \
|
---|
2791 | if (code >= end) FAIL; \
|
---|
2792 | skip = *code; \
|
---|
2793 | VTRACE(("%lu (skip to %p)\n", \
|
---|
2794 | (unsigned long)skip, code+skip)); \
|
---|
2795 | if (skip-adj > end-code) \
|
---|
2796 | FAIL; \
|
---|
2797 | code++; \
|
---|
2798 | } while (0)
|
---|
2799 | #define GET_SKIP GET_SKIP_ADJ(0)
|
---|
2800 |
|
---|
2801 | static int
|
---|
2802 | _validate_charset(SRE_CODE *code, SRE_CODE *end)
|
---|
2803 | {
|
---|
2804 | /* Some variables are manipulated by the macros above */
|
---|
2805 | SRE_CODE op;
|
---|
2806 | SRE_CODE arg;
|
---|
2807 | SRE_CODE offset;
|
---|
2808 | int i;
|
---|
2809 |
|
---|
2810 | while (code < end) {
|
---|
2811 | GET_OP;
|
---|
2812 | switch (op) {
|
---|
2813 |
|
---|
2814 | case SRE_OP_NEGATE:
|
---|
2815 | break;
|
---|
2816 |
|
---|
2817 | case SRE_OP_LITERAL:
|
---|
2818 | GET_ARG;
|
---|
2819 | break;
|
---|
2820 |
|
---|
2821 | case SRE_OP_RANGE:
|
---|
2822 | GET_ARG;
|
---|
2823 | GET_ARG;
|
---|
2824 | break;
|
---|
2825 |
|
---|
2826 | case SRE_OP_CHARSET:
|
---|
2827 | offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
|
---|
2828 | if (offset > end-code)
|
---|
2829 | FAIL;
|
---|
2830 | code += offset;
|
---|
2831 | break;
|
---|
2832 |
|
---|
2833 | case SRE_OP_BIGCHARSET:
|
---|
2834 | GET_ARG; /* Number of blocks */
|
---|
2835 | offset = 256/sizeof(SRE_CODE); /* 256-byte table */
|
---|
2836 | if (offset > end-code)
|
---|
2837 | FAIL;
|
---|
2838 | /* Make sure that each byte points to a valid block */
|
---|
2839 | for (i = 0; i < 256; i++) {
|
---|
2840 | if (((unsigned char *)code)[i] >= arg)
|
---|
2841 | FAIL;
|
---|
2842 | }
|
---|
2843 | code += offset;
|
---|
2844 | offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
|
---|
2845 | if (offset > end-code)
|
---|
2846 | FAIL;
|
---|
2847 | code += offset;
|
---|
2848 | break;
|
---|
2849 |
|
---|
2850 | case SRE_OP_CATEGORY:
|
---|
2851 | GET_ARG;
|
---|
2852 | switch (arg) {
|
---|
2853 | case SRE_CATEGORY_DIGIT:
|
---|
2854 | case SRE_CATEGORY_NOT_DIGIT:
|
---|
2855 | case SRE_CATEGORY_SPACE:
|
---|
2856 | case SRE_CATEGORY_NOT_SPACE:
|
---|
2857 | case SRE_CATEGORY_WORD:
|
---|
2858 | case SRE_CATEGORY_NOT_WORD:
|
---|
2859 | case SRE_CATEGORY_LINEBREAK:
|
---|
2860 | case SRE_CATEGORY_NOT_LINEBREAK:
|
---|
2861 | case SRE_CATEGORY_LOC_WORD:
|
---|
2862 | case SRE_CATEGORY_LOC_NOT_WORD:
|
---|
2863 | case SRE_CATEGORY_UNI_DIGIT:
|
---|
2864 | case SRE_CATEGORY_UNI_NOT_DIGIT:
|
---|
2865 | case SRE_CATEGORY_UNI_SPACE:
|
---|
2866 | case SRE_CATEGORY_UNI_NOT_SPACE:
|
---|
2867 | case SRE_CATEGORY_UNI_WORD:
|
---|
2868 | case SRE_CATEGORY_UNI_NOT_WORD:
|
---|
2869 | case SRE_CATEGORY_UNI_LINEBREAK:
|
---|
2870 | case SRE_CATEGORY_UNI_NOT_LINEBREAK:
|
---|
2871 | break;
|
---|
2872 | default:
|
---|
2873 | FAIL;
|
---|
2874 | }
|
---|
2875 | break;
|
---|
2876 |
|
---|
2877 | default:
|
---|
2878 | FAIL;
|
---|
2879 |
|
---|
2880 | }
|
---|
2881 | }
|
---|
2882 |
|
---|
2883 | return 1;
|
---|
2884 | }
|
---|
2885 |
|
---|
2886 | static int
|
---|
2887 | _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
---|
2888 | {
|
---|
2889 | /* Some variables are manipulated by the macros above */
|
---|
2890 | SRE_CODE op;
|
---|
2891 | SRE_CODE arg;
|
---|
2892 | SRE_CODE skip;
|
---|
2893 |
|
---|
2894 | VTRACE(("code=%p, end=%p\n", code, end));
|
---|
2895 |
|
---|
2896 | if (code > end)
|
---|
2897 | FAIL;
|
---|
2898 |
|
---|
2899 | while (code < end) {
|
---|
2900 | GET_OP;
|
---|
2901 | switch (op) {
|
---|
2902 |
|
---|
2903 | case SRE_OP_MARK:
|
---|
2904 | /* We don't check whether marks are properly nested; the
|
---|
2905 | sre_match() code is robust even if they don't, and the worst
|
---|
2906 | you can get is nonsensical match results. */
|
---|
2907 | GET_ARG;
|
---|
2908 | if (arg > 2*groups+1) {
|
---|
2909 | VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
|
---|
2910 | FAIL;
|
---|
2911 | }
|
---|
2912 | break;
|
---|
2913 |
|
---|
2914 | case SRE_OP_LITERAL:
|
---|
2915 | case SRE_OP_NOT_LITERAL:
|
---|
2916 | case SRE_OP_LITERAL_IGNORE:
|
---|
2917 | case SRE_OP_NOT_LITERAL_IGNORE:
|
---|
2918 | GET_ARG;
|
---|
2919 | /* The arg is just a character, nothing to check */
|
---|
2920 | break;
|
---|
2921 |
|
---|
2922 | case SRE_OP_SUCCESS:
|
---|
2923 | case SRE_OP_FAILURE:
|
---|
2924 | /* Nothing to check; these normally end the matching process */
|
---|
2925 | break;
|
---|
2926 |
|
---|
2927 | case SRE_OP_AT:
|
---|
2928 | GET_ARG;
|
---|
2929 | switch (arg) {
|
---|
2930 | case SRE_AT_BEGINNING:
|
---|
2931 | case SRE_AT_BEGINNING_STRING:
|
---|
2932 | case SRE_AT_BEGINNING_LINE:
|
---|
2933 | case SRE_AT_END:
|
---|
2934 | case SRE_AT_END_LINE:
|
---|
2935 | case SRE_AT_END_STRING:
|
---|
2936 | case SRE_AT_BOUNDARY:
|
---|
2937 | case SRE_AT_NON_BOUNDARY:
|
---|
2938 | case SRE_AT_LOC_BOUNDARY:
|
---|
2939 | case SRE_AT_LOC_NON_BOUNDARY:
|
---|
2940 | case SRE_AT_UNI_BOUNDARY:
|
---|
2941 | case SRE_AT_UNI_NON_BOUNDARY:
|
---|
2942 | break;
|
---|
2943 | default:
|
---|
2944 | FAIL;
|
---|
2945 | }
|
---|
2946 | break;
|
---|
2947 |
|
---|
2948 | case SRE_OP_ANY:
|
---|
2949 | case SRE_OP_ANY_ALL:
|
---|
2950 | /* These have no operands */
|
---|
2951 | break;
|
---|
2952 |
|
---|
2953 | case SRE_OP_IN:
|
---|
2954 | case SRE_OP_IN_IGNORE:
|
---|
2955 | GET_SKIP;
|
---|
2956 | /* Stop 1 before the end; we check the FAILURE below */
|
---|
2957 | if (!_validate_charset(code, code+skip-2))
|
---|
2958 | FAIL;
|
---|
2959 | if (code[skip-2] != SRE_OP_FAILURE)
|
---|
2960 | FAIL;
|
---|
2961 | code += skip-1;
|
---|
2962 | break;
|
---|
2963 |
|
---|
2964 | case SRE_OP_INFO:
|
---|
2965 | {
|
---|
2966 | /* A minimal info field is
|
---|
2967 | <INFO> <1=skip> <2=flags> <3=min> <4=max>;
|
---|
2968 | If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
|
---|
2969 | more follows. */
|
---|
2970 | SRE_CODE flags, i;
|
---|
2971 | SRE_CODE *newcode;
|
---|
2972 | GET_SKIP;
|
---|
2973 | newcode = code+skip-1;
|
---|
2974 | GET_ARG; flags = arg;
|
---|
2975 | GET_ARG; /* min */
|
---|
2976 | GET_ARG; /* max */
|
---|
2977 | /* Check that only valid flags are present */
|
---|
2978 | if ((flags & ~(SRE_INFO_PREFIX |
|
---|
2979 | SRE_INFO_LITERAL |
|
---|
2980 | SRE_INFO_CHARSET)) != 0)
|
---|
2981 | FAIL;
|
---|
2982 | /* PREFIX and CHARSET are mutually exclusive */
|
---|
2983 | if ((flags & SRE_INFO_PREFIX) &&
|
---|
2984 | (flags & SRE_INFO_CHARSET))
|
---|
2985 | FAIL;
|
---|
2986 | /* LITERAL implies PREFIX */
|
---|
2987 | if ((flags & SRE_INFO_LITERAL) &&
|
---|
2988 | !(flags & SRE_INFO_PREFIX))
|
---|
2989 | FAIL;
|
---|
2990 | /* Validate the prefix */
|
---|
2991 | if (flags & SRE_INFO_PREFIX) {
|
---|
2992 | SRE_CODE prefix_len;
|
---|
2993 | GET_ARG; prefix_len = arg;
|
---|
2994 | GET_ARG; /* prefix skip */
|
---|
2995 | /* Here comes the prefix string */
|
---|
2996 | if (prefix_len > newcode-code)
|
---|
2997 | FAIL;
|
---|
2998 | code += prefix_len;
|
---|
2999 | /* And here comes the overlap table */
|
---|
3000 | if (prefix_len > newcode-code)
|
---|
3001 | FAIL;
|
---|
3002 | /* Each overlap value should be < prefix_len */
|
---|
3003 | for (i = 0; i < prefix_len; i++) {
|
---|
3004 | if (code[i] >= prefix_len)
|
---|
3005 | FAIL;
|
---|
3006 | }
|
---|
3007 | code += prefix_len;
|
---|
3008 | }
|
---|
3009 | /* Validate the charset */
|
---|
3010 | if (flags & SRE_INFO_CHARSET) {
|
---|
3011 | if (!_validate_charset(code, newcode-1))
|
---|
3012 | FAIL;
|
---|
3013 | if (newcode[-1] != SRE_OP_FAILURE)
|
---|
3014 | FAIL;
|
---|
3015 | code = newcode;
|
---|
3016 | }
|
---|
3017 | else if (code != newcode) {
|
---|
3018 | VTRACE(("code=%p, newcode=%p\n", code, newcode));
|
---|
3019 | FAIL;
|
---|
3020 | }
|
---|
3021 | }
|
---|
3022 | break;
|
---|
3023 |
|
---|
3024 | case SRE_OP_BRANCH:
|
---|
3025 | {
|
---|
3026 | SRE_CODE *target = NULL;
|
---|
3027 | for (;;) {
|
---|
3028 | GET_SKIP;
|
---|
3029 | if (skip == 0)
|
---|
3030 | break;
|
---|
3031 | /* Stop 2 before the end; we check the JUMP below */
|
---|
3032 | if (!_validate_inner(code, code+skip-3, groups))
|
---|
3033 | FAIL;
|
---|
3034 | code += skip-3;
|
---|
3035 | /* Check that it ends with a JUMP, and that each JUMP
|
---|
3036 | has the same target */
|
---|
3037 | GET_OP;
|
---|
3038 | if (op != SRE_OP_JUMP)
|
---|
3039 | FAIL;
|
---|
3040 | GET_SKIP;
|
---|
3041 | if (target == NULL)
|
---|
3042 | target = code+skip-1;
|
---|
3043 | else if (code+skip-1 != target)
|
---|
3044 | FAIL;
|
---|
3045 | }
|
---|
3046 | }
|
---|
3047 | break;
|
---|
3048 |
|
---|
3049 | case SRE_OP_REPEAT_ONE:
|
---|
3050 | case SRE_OP_MIN_REPEAT_ONE:
|
---|
3051 | {
|
---|
3052 | SRE_CODE min, max;
|
---|
3053 | GET_SKIP;
|
---|
3054 | GET_ARG; min = arg;
|
---|
3055 | GET_ARG; max = arg;
|
---|
3056 | if (min > max)
|
---|
3057 | FAIL;
|
---|
3058 | if (max > SRE_MAXREPEAT)
|
---|
3059 | FAIL;
|
---|
3060 | if (!_validate_inner(code, code+skip-4, groups))
|
---|
3061 | FAIL;
|
---|
3062 | code += skip-4;
|
---|
3063 | GET_OP;
|
---|
3064 | if (op != SRE_OP_SUCCESS)
|
---|
3065 | FAIL;
|
---|
3066 | }
|
---|
3067 | break;
|
---|
3068 |
|
---|
3069 | case SRE_OP_REPEAT:
|
---|
3070 | {
|
---|
3071 | SRE_CODE min, max;
|
---|
3072 | GET_SKIP;
|
---|
3073 | GET_ARG; min = arg;
|
---|
3074 | GET_ARG; max = arg;
|
---|
3075 | if (min > max)
|
---|
3076 | FAIL;
|
---|
3077 | if (max > SRE_MAXREPEAT)
|
---|
3078 | FAIL;
|
---|
3079 | if (!_validate_inner(code, code+skip-3, groups))
|
---|
3080 | FAIL;
|
---|
3081 | code += skip-3;
|
---|
3082 | GET_OP;
|
---|
3083 | if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
|
---|
3084 | FAIL;
|
---|
3085 | }
|
---|
3086 | break;
|
---|
3087 |
|
---|
3088 | case SRE_OP_GROUPREF:
|
---|
3089 | case SRE_OP_GROUPREF_IGNORE:
|
---|
3090 | GET_ARG;
|
---|
3091 | if (arg >= groups)
|
---|
3092 | FAIL;
|
---|
3093 | break;
|
---|
3094 |
|
---|
3095 | case SRE_OP_GROUPREF_EXISTS:
|
---|
3096 | /* The regex syntax for this is: '(?(group)then|else)', where
|
---|
3097 | 'group' is either an integer group number or a group name,
|
---|
3098 | 'then' and 'else' are sub-regexes, and 'else' is optional. */
|
---|
3099 | GET_ARG;
|
---|
3100 | if (arg >= groups)
|
---|
3101 | FAIL;
|
---|
3102 | GET_SKIP_ADJ(1);
|
---|
3103 | code--; /* The skip is relative to the first arg! */
|
---|
3104 | /* There are two possibilities here: if there is both a 'then'
|
---|
3105 | part and an 'else' part, the generated code looks like:
|
---|
3106 |
|
---|
3107 | GROUPREF_EXISTS
|
---|
3108 | <group>
|
---|
3109 | <skipyes>
|
---|
3110 | ...then part...
|
---|
3111 | JUMP
|
---|
3112 | <skipno>
|
---|
3113 | (<skipyes> jumps here)
|
---|
3114 | ...else part...
|
---|
3115 | (<skipno> jumps here)
|
---|
3116 |
|
---|
3117 | If there is only a 'then' part, it looks like:
|
---|
3118 |
|
---|
3119 | GROUPREF_EXISTS
|
---|
3120 | <group>
|
---|
3121 | <skip>
|
---|
3122 | ...then part...
|
---|
3123 | (<skip> jumps here)
|
---|
3124 |
|
---|
3125 | There is no direct way to decide which it is, and we don't want
|
---|
3126 | to allow arbitrary jumps anywhere in the code; so we just look
|
---|
3127 | for a JUMP opcode preceding our skip target.
|
---|
3128 | */
|
---|
3129 | if (skip >= 3 && skip-3 < end-code &&
|
---|
3130 | code[skip-3] == SRE_OP_JUMP)
|
---|
3131 | {
|
---|
3132 | VTRACE(("both then and else parts present\n"));
|
---|
3133 | if (!_validate_inner(code+1, code+skip-3, groups))
|
---|
3134 | FAIL;
|
---|
3135 | code += skip-2; /* Position after JUMP, at <skipno> */
|
---|
3136 | GET_SKIP;
|
---|
3137 | if (!_validate_inner(code, code+skip-1, groups))
|
---|
3138 | FAIL;
|
---|
3139 | code += skip-1;
|
---|
3140 | }
|
---|
3141 | else {
|
---|
3142 | VTRACE(("only a then part present\n"));
|
---|
3143 | if (!_validate_inner(code+1, code+skip-1, groups))
|
---|
3144 | FAIL;
|
---|
3145 | code += skip-1;
|
---|
3146 | }
|
---|
3147 | break;
|
---|
3148 |
|
---|
3149 | case SRE_OP_ASSERT:
|
---|
3150 | case SRE_OP_ASSERT_NOT:
|
---|
3151 | GET_SKIP;
|
---|
3152 | GET_ARG; /* 0 for lookahead, width for lookbehind */
|
---|
3153 | code--; /* Back up over arg to simplify math below */
|
---|
3154 | if (arg & 0x80000000)
|
---|
3155 | FAIL; /* Width too large */
|
---|
3156 | /* Stop 1 before the end; we check the SUCCESS below */
|
---|
3157 | if (!_validate_inner(code+1, code+skip-2, groups))
|
---|
3158 | FAIL;
|
---|
3159 | code += skip-2;
|
---|
3160 | GET_OP;
|
---|
3161 | if (op != SRE_OP_SUCCESS)
|
---|
3162 | FAIL;
|
---|
3163 | break;
|
---|
3164 |
|
---|
3165 | default:
|
---|
3166 | FAIL;
|
---|
3167 |
|
---|
3168 | }
|
---|
3169 | }
|
---|
3170 |
|
---|
3171 | VTRACE(("okay\n"));
|
---|
3172 | return 1;
|
---|
3173 | }
|
---|
3174 |
|
---|
3175 | static int
|
---|
3176 | _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
---|
3177 | {
|
---|
3178 | if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
|
---|
3179 | FAIL;
|
---|
3180 | if (groups == 0) /* fix for simplejson */
|
---|
3181 | groups = 100; /* 100 groups should always be safe */
|
---|
3182 | return _validate_inner(code, end-1, groups);
|
---|
3183 | }
|
---|
3184 |
|
---|
3185 | static int
|
---|
3186 | _validate(PatternObject *self)
|
---|
3187 | {
|
---|
3188 | if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
|
---|
3189 | {
|
---|
3190 | PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
|
---|
3191 | return 0;
|
---|
3192 | }
|
---|
3193 | else
|
---|
3194 | VTRACE(("Success!\n"));
|
---|
3195 | return 1;
|
---|
3196 | }
|
---|
3197 |
|
---|
3198 | /* -------------------------------------------------------------------- */
|
---|
3199 | /* match methods */
|
---|
3200 |
|
---|
3201 | static void
|
---|
3202 | match_dealloc(MatchObject* self)
|
---|
3203 | {
|
---|
3204 | Py_XDECREF(self->regs);
|
---|
3205 | Py_XDECREF(self->string);
|
---|
3206 | Py_DECREF(self->pattern);
|
---|
3207 | PyObject_DEL(self);
|
---|
3208 | }
|
---|
3209 |
|
---|
3210 | static PyObject*
|
---|
3211 | match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
|
---|
3212 | {
|
---|
3213 | if (index < 0 || index >= self->groups) {
|
---|
3214 | /* raise IndexError if we were given a bad group number */
|
---|
3215 | PyErr_SetString(
|
---|
3216 | PyExc_IndexError,
|
---|
3217 | "no such group"
|
---|
3218 | );
|
---|
3219 | return NULL;
|
---|
3220 | }
|
---|
3221 |
|
---|
3222 | index *= 2;
|
---|
3223 |
|
---|
3224 | if (self->string == Py_None || self->mark[index] < 0) {
|
---|
3225 | /* return default value if the string or group is undefined */
|
---|
3226 | Py_INCREF(def);
|
---|
3227 | return def;
|
---|
3228 | }
|
---|
3229 |
|
---|
3230 | return PySequence_GetSlice(
|
---|
3231 | self->string, self->mark[index], self->mark[index+1]
|
---|
3232 | );
|
---|
3233 | }
|
---|
3234 |
|
---|
3235 | static Py_ssize_t
|
---|
3236 | match_getindex(MatchObject* self, PyObject* index)
|
---|
3237 | {
|
---|
3238 | Py_ssize_t i;
|
---|
3239 |
|
---|
3240 | if (PyInt_Check(index))
|
---|
3241 | return PyInt_AsSsize_t(index);
|
---|
3242 |
|
---|
3243 | i = -1;
|
---|
3244 |
|
---|
3245 | if (self->pattern->groupindex) {
|
---|
3246 | index = PyObject_GetItem(self->pattern->groupindex, index);
|
---|
3247 | if (index) {
|
---|
3248 | if (PyInt_Check(index) || PyLong_Check(index))
|
---|
3249 | i = PyInt_AsSsize_t(index);
|
---|
3250 | Py_DECREF(index);
|
---|
3251 | } else
|
---|
3252 | PyErr_Clear();
|
---|
3253 | }
|
---|
3254 |
|
---|
3255 | return i;
|
---|
3256 | }
|
---|
3257 |
|
---|
3258 | static PyObject*
|
---|
3259 | match_getslice(MatchObject* self, PyObject* index, PyObject* def)
|
---|
3260 | {
|
---|
3261 | return match_getslice_by_index(self, match_getindex(self, index), def);
|
---|
3262 | }
|
---|
3263 |
|
---|
3264 | static PyObject*
|
---|
3265 | match_expand(MatchObject* self, PyObject* ptemplate)
|
---|
3266 | {
|
---|
3267 | /* delegate to Python code */
|
---|
3268 | return call(
|
---|
3269 | SRE_PY_MODULE, "_expand",
|
---|
3270 | PyTuple_Pack(3, self->pattern, self, ptemplate)
|
---|
3271 | );
|
---|
3272 | }
|
---|
3273 |
|
---|
3274 | static PyObject*
|
---|
3275 | match_group(MatchObject* self, PyObject* args)
|
---|
3276 | {
|
---|
3277 | PyObject* result;
|
---|
3278 | Py_ssize_t i, size;
|
---|
3279 |
|
---|
3280 | size = PyTuple_GET_SIZE(args);
|
---|
3281 |
|
---|
3282 | switch (size) {
|
---|
3283 | case 0:
|
---|
3284 | result = match_getslice(self, Py_False, Py_None);
|
---|
3285 | break;
|
---|
3286 | case 1:
|
---|
3287 | result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
|
---|
3288 | break;
|
---|
3289 | default:
|
---|
3290 | /* fetch multiple items */
|
---|
3291 | result = PyTuple_New(size);
|
---|
3292 | if (!result)
|
---|
3293 | return NULL;
|
---|
3294 | for (i = 0; i < size; i++) {
|
---|
3295 | PyObject* item = match_getslice(
|
---|
3296 | self, PyTuple_GET_ITEM(args, i), Py_None
|
---|
3297 | );
|
---|
3298 | if (!item) {
|
---|
3299 | Py_DECREF(result);
|
---|
3300 | return NULL;
|
---|
3301 | }
|
---|
3302 | PyTuple_SET_ITEM(result, i, item);
|
---|
3303 | }
|
---|
3304 | break;
|
---|
3305 | }
|
---|
3306 | return result;
|
---|
3307 | }
|
---|
3308 |
|
---|
3309 | static PyObject*
|
---|
3310 | match_groups(MatchObject* self, PyObject* args, PyObject* kw)
|
---|
3311 | {
|
---|
3312 | PyObject* result;
|
---|
3313 | Py_ssize_t index;
|
---|
3314 |
|
---|
3315 | PyObject* def = Py_None;
|
---|
3316 | static char* kwlist[] = { "default", NULL };
|
---|
3317 | if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
|
---|
3318 | return NULL;
|
---|
3319 |
|
---|
3320 | result = PyTuple_New(self->groups-1);
|
---|
3321 | if (!result)
|
---|
3322 | return NULL;
|
---|
3323 |
|
---|
3324 | for (index = 1; index < self->groups; index++) {
|
---|
3325 | PyObject* item;
|
---|
3326 | item = match_getslice_by_index(self, index, def);
|
---|
3327 | if (!item) {
|
---|
3328 | Py_DECREF(result);
|
---|
3329 | return NULL;
|
---|
3330 | }
|
---|
3331 | PyTuple_SET_ITEM(result, index-1, item);
|
---|
3332 | }
|
---|
3333 |
|
---|
3334 | return result;
|
---|
3335 | }
|
---|
3336 |
|
---|
3337 | static PyObject*
|
---|
3338 | match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
|
---|
3339 | {
|
---|
3340 | PyObject* result;
|
---|
3341 | PyObject* keys;
|
---|
3342 | Py_ssize_t index;
|
---|
3343 |
|
---|
3344 | PyObject* def = Py_None;
|
---|
3345 | static char* kwlist[] = { "default", NULL };
|
---|
3346 | if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
|
---|
3347 | return NULL;
|
---|
3348 |
|
---|
3349 | result = PyDict_New();
|
---|
3350 | if (!result || !self->pattern->groupindex)
|
---|
3351 | return result;
|
---|
3352 |
|
---|
3353 | keys = PyMapping_Keys(self->pattern->groupindex);
|
---|
3354 | if (!keys)
|
---|
3355 | goto failed;
|
---|
3356 |
|
---|
3357 | for (index = 0; index < PyList_GET_SIZE(keys); index++) {
|
---|
3358 | int status;
|
---|
3359 | PyObject* key;
|
---|
3360 | PyObject* value;
|
---|
3361 | key = PyList_GET_ITEM(keys, index);
|
---|
3362 | if (!key)
|
---|
3363 | goto failed;
|
---|
3364 | value = match_getslice(self, key, def);
|
---|
3365 | if (!value) {
|
---|
3366 | Py_DECREF(key);
|
---|
3367 | goto failed;
|
---|
3368 | }
|
---|
3369 | status = PyDict_SetItem(result, key, value);
|
---|
3370 | Py_DECREF(value);
|
---|
3371 | if (status < 0)
|
---|
3372 | goto failed;
|
---|
3373 | }
|
---|
3374 |
|
---|
3375 | Py_DECREF(keys);
|
---|
3376 |
|
---|
3377 | return result;
|
---|
3378 |
|
---|
3379 | failed:
|
---|
3380 | Py_XDECREF(keys);
|
---|
3381 | Py_DECREF(result);
|
---|
3382 | return NULL;
|
---|
3383 | }
|
---|
3384 |
|
---|
3385 | static PyObject*
|
---|
3386 | match_start(MatchObject* self, PyObject* args)
|
---|
3387 | {
|
---|
3388 | Py_ssize_t index;
|
---|
3389 |
|
---|
3390 | PyObject* index_ = Py_False; /* zero */
|
---|
3391 | if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
|
---|
3392 | return NULL;
|
---|
3393 |
|
---|
3394 | index = match_getindex(self, index_);
|
---|
3395 |
|
---|
3396 | if (index < 0 || index >= self->groups) {
|
---|
3397 | PyErr_SetString(
|
---|
3398 | PyExc_IndexError,
|
---|
3399 | "no such group"
|
---|
3400 | );
|
---|
3401 | return NULL;
|
---|
3402 | }
|
---|
3403 |
|
---|
3404 | /* mark is -1 if group is undefined */
|
---|
3405 | return PyInt_FromSsize_t(self->mark[index*2]);
|
---|
3406 | }
|
---|
3407 |
|
---|
3408 | static PyObject*
|
---|
3409 | match_end(MatchObject* self, PyObject* args)
|
---|
3410 | {
|
---|
3411 | Py_ssize_t index;
|
---|
3412 |
|
---|
3413 | PyObject* index_ = Py_False; /* zero */
|
---|
3414 | if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
|
---|
3415 | return NULL;
|
---|
3416 |
|
---|
3417 | index = match_getindex(self, index_);
|
---|
3418 |
|
---|
3419 | if (index < 0 || index >= self->groups) {
|
---|
3420 | PyErr_SetString(
|
---|
3421 | PyExc_IndexError,
|
---|
3422 | "no such group"
|
---|
3423 | );
|
---|
3424 | return NULL;
|
---|
3425 | }
|
---|
3426 |
|
---|
3427 | /* mark is -1 if group is undefined */
|
---|
3428 | return PyInt_FromSsize_t(self->mark[index*2+1]);
|
---|
3429 | }
|
---|
3430 |
|
---|
3431 | LOCAL(PyObject*)
|
---|
3432 | _pair(Py_ssize_t i1, Py_ssize_t i2)
|
---|
3433 | {
|
---|
3434 | PyObject* pair;
|
---|
3435 | PyObject* item;
|
---|
3436 |
|
---|
3437 | pair = PyTuple_New(2);
|
---|
3438 | if (!pair)
|
---|
3439 | return NULL;
|
---|
3440 |
|
---|
3441 | item = PyInt_FromSsize_t(i1);
|
---|
3442 | if (!item)
|
---|
3443 | goto error;
|
---|
3444 | PyTuple_SET_ITEM(pair, 0, item);
|
---|
3445 |
|
---|
3446 | item = PyInt_FromSsize_t(i2);
|
---|
3447 | if (!item)
|
---|
3448 | goto error;
|
---|
3449 | PyTuple_SET_ITEM(pair, 1, item);
|
---|
3450 |
|
---|
3451 | return pair;
|
---|
3452 |
|
---|
3453 | error:
|
---|
3454 | Py_DECREF(pair);
|
---|
3455 | return NULL;
|
---|
3456 | }
|
---|
3457 |
|
---|
3458 | static PyObject*
|
---|
3459 | match_span(MatchObject* self, PyObject* args)
|
---|
3460 | {
|
---|
3461 | Py_ssize_t index;
|
---|
3462 |
|
---|
3463 | PyObject* index_ = Py_False; /* zero */
|
---|
3464 | if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
|
---|
3465 | return NULL;
|
---|
3466 |
|
---|
3467 | index = match_getindex(self, index_);
|
---|
3468 |
|
---|
3469 | if (index < 0 || index >= self->groups) {
|
---|
3470 | PyErr_SetString(
|
---|
3471 | PyExc_IndexError,
|
---|
3472 | "no such group"
|
---|
3473 | );
|
---|
3474 | return NULL;
|
---|
3475 | }
|
---|
3476 |
|
---|
3477 | /* marks are -1 if group is undefined */
|
---|
3478 | return _pair(self->mark[index*2], self->mark[index*2+1]);
|
---|
3479 | }
|
---|
3480 |
|
---|
3481 | static PyObject*
|
---|
3482 | match_regs(MatchObject* self)
|
---|
3483 | {
|
---|
3484 | PyObject* regs;
|
---|
3485 | PyObject* item;
|
---|
3486 | Py_ssize_t index;
|
---|
3487 |
|
---|
3488 | regs = PyTuple_New(self->groups);
|
---|
3489 | if (!regs)
|
---|
3490 | return NULL;
|
---|
3491 |
|
---|
3492 | for (index = 0; index < self->groups; index++) {
|
---|
3493 | item = _pair(self->mark[index*2], self->mark[index*2+1]);
|
---|
3494 | if (!item) {
|
---|
3495 | Py_DECREF(regs);
|
---|
3496 | return NULL;
|
---|
3497 | }
|
---|
3498 | PyTuple_SET_ITEM(regs, index, item);
|
---|
3499 | }
|
---|
3500 |
|
---|
3501 | Py_INCREF(regs);
|
---|
3502 | self->regs = regs;
|
---|
3503 |
|
---|
3504 | return regs;
|
---|
3505 | }
|
---|
3506 |
|
---|
3507 | static PyObject*
|
---|
3508 | match_copy(MatchObject* self, PyObject *unused)
|
---|
3509 | {
|
---|
3510 | #ifdef USE_BUILTIN_COPY
|
---|
3511 | MatchObject* copy;
|
---|
3512 | Py_ssize_t slots, offset;
|
---|
3513 |
|
---|
3514 | slots = 2 * (self->pattern->groups+1);
|
---|
3515 |
|
---|
3516 | copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
|
---|
3517 | if (!copy)
|
---|
3518 | return NULL;
|
---|
3519 |
|
---|
3520 | /* this value a constant, but any compiler should be able to
|
---|
3521 | figure that out all by itself */
|
---|
3522 | offset = offsetof(MatchObject, string);
|
---|
3523 |
|
---|
3524 | Py_XINCREF(self->pattern);
|
---|
3525 | Py_XINCREF(self->string);
|
---|
3526 | Py_XINCREF(self->regs);
|
---|
3527 |
|
---|
3528 | memcpy((char*) copy + offset, (char*) self + offset,
|
---|
3529 | sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
|
---|
3530 |
|
---|
3531 | return (PyObject*) copy;
|
---|
3532 | #else
|
---|
3533 | PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
|
---|
3534 | return NULL;
|
---|
3535 | #endif
|
---|
3536 | }
|
---|
3537 |
|
---|
3538 | static PyObject*
|
---|
3539 | match_deepcopy(MatchObject* self, PyObject* memo)
|
---|
3540 | {
|
---|
3541 | #ifdef USE_BUILTIN_COPY
|
---|
3542 | MatchObject* copy;
|
---|
3543 |
|
---|
3544 | copy = (MatchObject*) match_copy(self);
|
---|
3545 | if (!copy)
|
---|
3546 | return NULL;
|
---|
3547 |
|
---|
3548 | if (!deepcopy((PyObject**) ©->pattern, memo) ||
|
---|
3549 | !deepcopy(©->string, memo) ||
|
---|
3550 | !deepcopy(©->regs, memo)) {
|
---|
3551 | Py_DECREF(copy);
|
---|
3552 | return NULL;
|
---|
3553 | }
|
---|
3554 |
|
---|
3555 | #else
|
---|
3556 | PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
|
---|
3557 | return NULL;
|
---|
3558 | #endif
|
---|
3559 | }
|
---|
3560 |
|
---|
3561 | PyDoc_STRVAR(match_doc,
|
---|
3562 | "The result of re.match() and re.search().\n\
|
---|
3563 | Match objects always have a boolean value of True.");
|
---|
3564 |
|
---|
3565 | PyDoc_STRVAR(match_group_doc,
|
---|
3566 | "group([group1, ...]) -> str or tuple.\n\
|
---|
3567 | Return subgroup(s) of the match by indices or names.\n\
|
---|
3568 | For 0 returns the entire match.");
|
---|
3569 |
|
---|
3570 | PyDoc_STRVAR(match_start_doc,
|
---|
3571 | "start([group=0]) -> int.\n\
|
---|
3572 | Return index of the start of the substring matched by group.");
|
---|
3573 |
|
---|
3574 | PyDoc_STRVAR(match_end_doc,
|
---|
3575 | "end([group=0]) -> int.\n\
|
---|
3576 | Return index of the end of the substring matched by group.");
|
---|
3577 |
|
---|
3578 | PyDoc_STRVAR(match_span_doc,
|
---|
3579 | "span([group]) -> tuple.\n\
|
---|
3580 | For MatchObject m, return the 2-tuple (m.start(group), m.end(group)).");
|
---|
3581 |
|
---|
3582 | PyDoc_STRVAR(match_groups_doc,
|
---|
3583 | "groups([default=None]) -> tuple.\n\
|
---|
3584 | Return a tuple containing all the subgroups of the match, from 1.\n\
|
---|
3585 | The default argument is used for groups\n\
|
---|
3586 | that did not participate in the match");
|
---|
3587 |
|
---|
3588 | PyDoc_STRVAR(match_groupdict_doc,
|
---|
3589 | "groupdict([default=None]) -> dict.\n\
|
---|
3590 | Return a dictionary containing all the named subgroups of the match,\n\
|
---|
3591 | keyed by the subgroup name. The default argument is used for groups\n\
|
---|
3592 | that did not participate in the match");
|
---|
3593 |
|
---|
3594 | PyDoc_STRVAR(match_expand_doc,
|
---|
3595 | "expand(template) -> str.\n\
|
---|
3596 | Return the string obtained by doing backslash substitution\n\
|
---|
3597 | on the string template, as done by the sub() method.");
|
---|
3598 |
|
---|
3599 | static PyMethodDef match_methods[] = {
|
---|
3600 | {"group", (PyCFunction) match_group, METH_VARARGS, match_group_doc},
|
---|
3601 | {"start", (PyCFunction) match_start, METH_VARARGS, match_start_doc},
|
---|
3602 | {"end", (PyCFunction) match_end, METH_VARARGS, match_end_doc},
|
---|
3603 | {"span", (PyCFunction) match_span, METH_VARARGS, match_span_doc},
|
---|
3604 | {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS,
|
---|
3605 | match_groups_doc},
|
---|
3606 | {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS,
|
---|
3607 | match_groupdict_doc},
|
---|
3608 | {"expand", (PyCFunction) match_expand, METH_O, match_expand_doc},
|
---|
3609 | {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
|
---|
3610 | {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
|
---|
3611 | {NULL, NULL}
|
---|
3612 | };
|
---|
3613 |
|
---|
3614 | static PyObject *
|
---|
3615 | match_lastindex_get(MatchObject *self)
|
---|
3616 | {
|
---|
3617 | if (self->lastindex >= 0)
|
---|
3618 | return PyInt_FromSsize_t(self->lastindex);
|
---|
3619 | Py_INCREF(Py_None);
|
---|
3620 | return Py_None;
|
---|
3621 | }
|
---|
3622 |
|
---|
3623 | static PyObject *
|
---|
3624 | match_lastgroup_get(MatchObject *self)
|
---|
3625 | {
|
---|
3626 | if (self->pattern->indexgroup && self->lastindex >= 0) {
|
---|
3627 | PyObject* result = PySequence_GetItem(
|
---|
3628 | self->pattern->indexgroup, self->lastindex
|
---|
3629 | );
|
---|
3630 | if (result)
|
---|
3631 | return result;
|
---|
3632 | PyErr_Clear();
|
---|
3633 | }
|
---|
3634 | Py_INCREF(Py_None);
|
---|
3635 | return Py_None;
|
---|
3636 | }
|
---|
3637 |
|
---|
3638 | static PyObject *
|
---|
3639 | match_regs_get(MatchObject *self)
|
---|
3640 | {
|
---|
3641 | if (self->regs) {
|
---|
3642 | Py_INCREF(self->regs);
|
---|
3643 | return self->regs;
|
---|
3644 | } else
|
---|
3645 | return match_regs(self);
|
---|
3646 | }
|
---|
3647 |
|
---|
3648 | static PyGetSetDef match_getset[] = {
|
---|
3649 | {"lastindex", (getter)match_lastindex_get, (setter)NULL},
|
---|
3650 | {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
|
---|
3651 | {"regs", (getter)match_regs_get, (setter)NULL},
|
---|
3652 | {NULL}
|
---|
3653 | };
|
---|
3654 |
|
---|
3655 | #define MATCH_OFF(x) offsetof(MatchObject, x)
|
---|
3656 | static PyMemberDef match_members[] = {
|
---|
3657 | {"string", T_OBJECT, MATCH_OFF(string), READONLY},
|
---|
3658 | {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
|
---|
3659 | {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
|
---|
3660 | {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
|
---|
3661 | {NULL}
|
---|
3662 | };
|
---|
3663 |
|
---|
3664 |
|
---|
3665 | /* FIXME: implement setattr("string", None) as a special case (to
|
---|
3666 | detach the associated string, if any */
|
---|
3667 |
|
---|
3668 | static PyTypeObject Match_Type = {
|
---|
3669 | PyVarObject_HEAD_INIT(NULL, 0)
|
---|
3670 | "_" SRE_MODULE ".SRE_Match",
|
---|
3671 | sizeof(MatchObject), sizeof(Py_ssize_t),
|
---|
3672 | (destructor)match_dealloc, /* tp_dealloc */
|
---|
3673 | 0, /* tp_print */
|
---|
3674 | 0, /* tp_getattr */
|
---|
3675 | 0, /* tp_setattr */
|
---|
3676 | 0, /* tp_compare */
|
---|
3677 | 0, /* tp_repr */
|
---|
3678 | 0, /* tp_as_number */
|
---|
3679 | 0, /* tp_as_sequence */
|
---|
3680 | 0, /* tp_as_mapping */
|
---|
3681 | 0, /* tp_hash */
|
---|
3682 | 0, /* tp_call */
|
---|
3683 | 0, /* tp_str */
|
---|
3684 | 0, /* tp_getattro */
|
---|
3685 | 0, /* tp_setattro */
|
---|
3686 | 0, /* tp_as_buffer */
|
---|
3687 | Py_TPFLAGS_DEFAULT,
|
---|
3688 | match_doc, /* tp_doc */
|
---|
3689 | 0, /* tp_traverse */
|
---|
3690 | 0, /* tp_clear */
|
---|
3691 | 0, /* tp_richcompare */
|
---|
3692 | 0, /* tp_weaklistoffset */
|
---|
3693 | 0, /* tp_iter */
|
---|
3694 | 0, /* tp_iternext */
|
---|
3695 | match_methods, /* tp_methods */
|
---|
3696 | match_members, /* tp_members */
|
---|
3697 | match_getset, /* tp_getset */
|
---|
3698 | };
|
---|
3699 |
|
---|
3700 | static PyObject*
|
---|
3701 | pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
|
---|
3702 | {
|
---|
3703 | /* create match object (from state object) */
|
---|
3704 |
|
---|
3705 | MatchObject* match;
|
---|
3706 | Py_ssize_t i, j;
|
---|
3707 | char* base;
|
---|
3708 | int n;
|
---|
3709 |
|
---|
3710 | if (status > 0) {
|
---|
3711 |
|
---|
3712 | /* create match object (with room for extra group marks) */
|
---|
3713 | /* coverity[ampersand_in_size] */
|
---|
3714 | match = PyObject_NEW_VAR(MatchObject, &Match_Type,
|
---|
3715 | 2*(pattern->groups+1));
|
---|
3716 | if (!match)
|
---|
3717 | return NULL;
|
---|
3718 |
|
---|
3719 | Py_INCREF(pattern);
|
---|
3720 | match->pattern = pattern;
|
---|
3721 |
|
---|
3722 | Py_INCREF(state->string);
|
---|
3723 | match->string = state->string;
|
---|
3724 |
|
---|
3725 | match->regs = NULL;
|
---|
3726 | match->groups = pattern->groups+1;
|
---|
3727 |
|
---|
3728 | /* fill in group slices */
|
---|
3729 |
|
---|
3730 | base = (char*) state->beginning;
|
---|
3731 | n = state->charsize;
|
---|
3732 |
|
---|
3733 | match->mark[0] = ((char*) state->start - base) / n;
|
---|
3734 | match->mark[1] = ((char*) state->ptr - base) / n;
|
---|
3735 |
|
---|
3736 | for (i = j = 0; i < pattern->groups; i++, j+=2)
|
---|
3737 | if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
|
---|
3738 | match->mark[j+2] = ((char*) state->mark[j] - base) / n;
|
---|
3739 | match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
|
---|
3740 | } else
|
---|
3741 | match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
|
---|
3742 |
|
---|
3743 | match->pos = state->pos;
|
---|
3744 | match->endpos = state->endpos;
|
---|
3745 |
|
---|
3746 | match->lastindex = state->lastindex;
|
---|
3747 |
|
---|
3748 | return (PyObject*) match;
|
---|
3749 |
|
---|
3750 | } else if (status == 0) {
|
---|
3751 |
|
---|
3752 | /* no match */
|
---|
3753 | Py_INCREF(Py_None);
|
---|
3754 | return Py_None;
|
---|
3755 |
|
---|
3756 | }
|
---|
3757 |
|
---|
3758 | /* internal error */
|
---|
3759 | pattern_error(status);
|
---|
3760 | return NULL;
|
---|
3761 | }
|
---|
3762 |
|
---|
3763 |
|
---|
3764 | /* -------------------------------------------------------------------- */
|
---|
3765 | /* scanner methods (experimental) */
|
---|
3766 |
|
---|
3767 | static void
|
---|
3768 | scanner_dealloc(ScannerObject* self)
|
---|
3769 | {
|
---|
3770 | state_fini(&self->state);
|
---|
3771 | Py_XDECREF(self->pattern);
|
---|
3772 | PyObject_DEL(self);
|
---|
3773 | }
|
---|
3774 |
|
---|
3775 | static PyObject*
|
---|
3776 | scanner_match(ScannerObject* self, PyObject *unused)
|
---|
3777 | {
|
---|
3778 | SRE_STATE* state = &self->state;
|
---|
3779 | PyObject* match;
|
---|
3780 | int status;
|
---|
3781 |
|
---|
3782 | state_reset(state);
|
---|
3783 |
|
---|
3784 | state->ptr = state->start;
|
---|
3785 |
|
---|
3786 | if (state->charsize == 1) {
|
---|
3787 | status = sre_match(state, PatternObject_GetCode(self->pattern));
|
---|
3788 | } else {
|
---|
3789 | #if defined(HAVE_UNICODE)
|
---|
3790 | status = sre_umatch(state, PatternObject_GetCode(self->pattern));
|
---|
3791 | #endif
|
---|
3792 | }
|
---|
3793 | if (PyErr_Occurred())
|
---|
3794 | return NULL;
|
---|
3795 |
|
---|
3796 | match = pattern_new_match((PatternObject*) self->pattern,
|
---|
3797 | state, status);
|
---|
3798 |
|
---|
3799 | if (status == 0 || state->ptr == state->start)
|
---|
3800 | state->start = (void*) ((char*) state->ptr + state->charsize);
|
---|
3801 | else
|
---|
3802 | state->start = state->ptr;
|
---|
3803 |
|
---|
3804 | return match;
|
---|
3805 | }
|
---|
3806 |
|
---|
3807 |
|
---|
3808 | static PyObject*
|
---|
3809 | scanner_search(ScannerObject* self, PyObject *unused)
|
---|
3810 | {
|
---|
3811 | SRE_STATE* state = &self->state;
|
---|
3812 | PyObject* match;
|
---|
3813 | int status;
|
---|
3814 |
|
---|
3815 | state_reset(state);
|
---|
3816 |
|
---|
3817 | state->ptr = state->start;
|
---|
3818 |
|
---|
3819 | if (state->charsize == 1) {
|
---|
3820 | status = sre_search(state, PatternObject_GetCode(self->pattern));
|
---|
3821 | } else {
|
---|
3822 | #if defined(HAVE_UNICODE)
|
---|
3823 | status = sre_usearch(state, PatternObject_GetCode(self->pattern));
|
---|
3824 | #endif
|
---|
3825 | }
|
---|
3826 | if (PyErr_Occurred())
|
---|
3827 | return NULL;
|
---|
3828 |
|
---|
3829 | match = pattern_new_match((PatternObject*) self->pattern,
|
---|
3830 | state, status);
|
---|
3831 |
|
---|
3832 | if (status == 0 || state->ptr == state->start)
|
---|
3833 | state->start = (void*) ((char*) state->ptr + state->charsize);
|
---|
3834 | else
|
---|
3835 | state->start = state->ptr;
|
---|
3836 |
|
---|
3837 | return match;
|
---|
3838 | }
|
---|
3839 |
|
---|
3840 | static PyMethodDef scanner_methods[] = {
|
---|
3841 | {"match", (PyCFunction) scanner_match, METH_NOARGS},
|
---|
3842 | {"search", (PyCFunction) scanner_search, METH_NOARGS},
|
---|
3843 | {NULL, NULL}
|
---|
3844 | };
|
---|
3845 |
|
---|
3846 | #define SCAN_OFF(x) offsetof(ScannerObject, x)
|
---|
3847 | static PyMemberDef scanner_members[] = {
|
---|
3848 | {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
|
---|
3849 | {NULL} /* Sentinel */
|
---|
3850 | };
|
---|
3851 |
|
---|
3852 | statichere PyTypeObject Scanner_Type = {
|
---|
3853 | PyObject_HEAD_INIT(NULL)
|
---|
3854 | 0, "_" SRE_MODULE ".SRE_Scanner",
|
---|
3855 | sizeof(ScannerObject), 0,
|
---|
3856 | (destructor)scanner_dealloc, /*tp_dealloc*/
|
---|
3857 | 0, /* tp_print */
|
---|
3858 | 0, /* tp_getattr */
|
---|
3859 | 0, /* tp_setattr */
|
---|
3860 | 0, /* tp_reserved */
|
---|
3861 | 0, /* tp_repr */
|
---|
3862 | 0, /* tp_as_number */
|
---|
3863 | 0, /* tp_as_sequence */
|
---|
3864 | 0, /* tp_as_mapping */
|
---|
3865 | 0, /* tp_hash */
|
---|
3866 | 0, /* tp_call */
|
---|
3867 | 0, /* tp_str */
|
---|
3868 | 0, /* tp_getattro */
|
---|
3869 | 0, /* tp_setattro */
|
---|
3870 | 0, /* tp_as_buffer */
|
---|
3871 | Py_TPFLAGS_DEFAULT, /* tp_flags */
|
---|
3872 | 0, /* tp_doc */
|
---|
3873 | 0, /* tp_traverse */
|
---|
3874 | 0, /* tp_clear */
|
---|
3875 | 0, /* tp_richcompare */
|
---|
3876 | 0, /* tp_weaklistoffset */
|
---|
3877 | 0, /* tp_iter */
|
---|
3878 | 0, /* tp_iternext */
|
---|
3879 | scanner_methods, /* tp_methods */
|
---|
3880 | scanner_members, /* tp_members */
|
---|
3881 | 0, /* tp_getset */
|
---|
3882 | };
|
---|
3883 |
|
---|
3884 | static PyObject*
|
---|
3885 | pattern_scanner(PatternObject* pattern, PyObject* args)
|
---|
3886 | {
|
---|
3887 | /* create search state object */
|
---|
3888 |
|
---|
3889 | ScannerObject* self;
|
---|
3890 |
|
---|
3891 | PyObject* string;
|
---|
3892 | Py_ssize_t start = 0;
|
---|
3893 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
3894 | if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
|
---|
3895 | return NULL;
|
---|
3896 |
|
---|
3897 | /* create scanner object */
|
---|
3898 | self = PyObject_NEW(ScannerObject, &Scanner_Type);
|
---|
3899 | if (!self)
|
---|
3900 | return NULL;
|
---|
3901 | self->pattern = NULL;
|
---|
3902 |
|
---|
3903 | string = state_init(&self->state, pattern, string, start, end);
|
---|
3904 | if (!string) {
|
---|
3905 | Py_DECREF(self);
|
---|
3906 | return NULL;
|
---|
3907 | }
|
---|
3908 |
|
---|
3909 | Py_INCREF(pattern);
|
---|
3910 | self->pattern = (PyObject*) pattern;
|
---|
3911 |
|
---|
3912 | return (PyObject*) self;
|
---|
3913 | }
|
---|
3914 |
|
---|
3915 | static PyMethodDef _functions[] = {
|
---|
3916 | {"compile", _compile, METH_VARARGS},
|
---|
3917 | {"getcodesize", sre_codesize, METH_NOARGS},
|
---|
3918 | {"getlower", sre_getlower, METH_VARARGS},
|
---|
3919 | {NULL, NULL}
|
---|
3920 | };
|
---|
3921 |
|
---|
3922 | #if PY_VERSION_HEX < 0x02030000
|
---|
3923 | DL_EXPORT(void) init_sre(void)
|
---|
3924 | #else
|
---|
3925 | PyMODINIT_FUNC init_sre(void)
|
---|
3926 | #endif
|
---|
3927 | {
|
---|
3928 | PyObject* m;
|
---|
3929 | PyObject* d;
|
---|
3930 | PyObject* x;
|
---|
3931 |
|
---|
3932 | /* Patch object types */
|
---|
3933 | if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
|
---|
3934 | PyType_Ready(&Scanner_Type))
|
---|
3935 | return;
|
---|
3936 |
|
---|
3937 | m = Py_InitModule("_" SRE_MODULE, _functions);
|
---|
3938 | if (m == NULL)
|
---|
3939 | return;
|
---|
3940 | d = PyModule_GetDict(m);
|
---|
3941 |
|
---|
3942 | x = PyInt_FromLong(SRE_MAGIC);
|
---|
3943 | if (x) {
|
---|
3944 | PyDict_SetItemString(d, "MAGIC", x);
|
---|
3945 | Py_DECREF(x);
|
---|
3946 | }
|
---|
3947 |
|
---|
3948 | x = PyInt_FromLong(sizeof(SRE_CODE));
|
---|
3949 | if (x) {
|
---|
3950 | PyDict_SetItemString(d, "CODESIZE", x);
|
---|
3951 | Py_DECREF(x);
|
---|
3952 | }
|
---|
3953 |
|
---|
3954 | x = PyLong_FromUnsignedLong(SRE_MAXREPEAT);
|
---|
3955 | if (x) {
|
---|
3956 | PyDict_SetItemString(d, "MAXREPEAT", x);
|
---|
3957 | Py_DECREF(x);
|
---|
3958 | }
|
---|
3959 |
|
---|
3960 | x = PyString_FromString(copyright);
|
---|
3961 | if (x) {
|
---|
3962 | PyDict_SetItemString(d, "copyright", x);
|
---|
3963 | Py_DECREF(x);
|
---|
3964 | }
|
---|
3965 | }
|
---|
3966 |
|
---|
3967 | #endif /* !defined(SRE_RECURSIVE) */
|
---|
3968 |
|
---|
3969 | /* vim:ts=4:sw=4:et
|
---|
3970 | */
|
---|