Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

sre_compile.py

Last change on this file was 388, checked in by dmik, 11 years ago
python: Update vendor to 2.7.6.
Property svn:eol-style set to `native`
File size: 16.0 KB

Line
1	#
2	# Secret Labs' Regular Expression Engine
3	#
4	# convert template to internal format
5	#
6	# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7	#
8	# See the sre.py file for information on usage and redistribution.
9	#
10
11	"""Internal support module for sre"""
12
13	import _sre, sys
14	import sre_parse
15	from sre_constants import *
16
17	assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
19	if _sre.CODESIZE == 2:
20	MAXCODE = 65535
21	else:
22	MAXCODE = 0xFFFFFFFFL
23
24	def _identityfunction(x):
25	return x
26
# Opcode groups that _compile handles with a shared code path.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
31
def _compile(code, pattern, flags):
    """Append the compiled form of a parsed (sub)pattern to *code*.

    code    -- list of code words; it is extended in place.
    pattern -- iterable of (op, av) pairs describing the pattern.
    flags   -- bitwise OR of SRE_FLAG_* values.

    Several operators are followed by a "skip" word: a slot is reserved
    with emit(0) and patched with the distance to the end of the
    operator's body once that body has been emitted.

    Raises error for a repeat operator when SRE_FLAG_TEMPLATE is set and
    for a look-behind assertion whose minimum and maximum widths differ.
    Raises ValueError for an operator that is not handled here.
    """
    # bind frequently used names to locals for the main loop
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code)
            emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                # nothing can follow this raise; the statements that used
                # to sit after it were unreachable and have been removed
                raise error("internal: unsupported template operator")
            elif _simple(av) and op is not REPEAT:
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # a truthy av[0] is the group number: bracket the body with
            # MARK words numbered 2*(group-1) and 2*(group-1)+1
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code)
            emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code)
            emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tails = []
            for alternative in av[1]:
                skip = _len(code)
                emit(0)
                _compile(code, alternative, flags)
                emit(OPCODES[JUMP])
                # remember the JUMP operand; it is patched below
                tails.append(_len(code))
                emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            # every alternative's JUMP lands just past the end marker
            for tail in tails:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code)
            emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code)
                emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError("unsupported operand type", op)
177
def _compile_charset(charset, flags, code, fixup=None):
    """Append the code for a character set to *code*.

    charset -- sequence of (op, av) set items; it is passed through
               _optimize_charset before being emitted.
    flags   -- SRE_FLAG_* bits; LOCALE/UNICODE select the category table.
    code    -- list of code words, extended in place.
    fixup   -- callable applied to literal and range bounds; defaults to
               _identityfunction.

    The emitted set is terminated by a FAILURE opcode.  Raises error for
    an unknown set operator.
    """
    if fixup is None:
        fixup = _identityfunction
    append = code.append
    for op, av in _optimize_charset(charset, fixup):
        append(OPCODES[op])
        if op is NEGATE:
            # NEGATE carries no operand
            continue
        if op is LITERAL:
            append(fixup(av))
        elif op is RANGE:
            append(fixup(av[0]))
            append(fixup(av[1]))
        elif op is CHARSET or op is BIGCHARSET:
            # av is already a list of code words
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            append(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    append(OPCODES[FAILURE])
206
207	def _optimize_charset(charset, fixup):
208	# internal: optimize character set
209	out = []
210	outappend = out.append
211	charmap = [0]*256
212	try:
213	for op, av in charset:
214	if op is NEGATE:
215	outappend((op, av))
216	elif op is LITERAL:
217	charmap[fixup(av)] = 1
218	elif op is RANGE:
219	for i in range(fixup(av[0]), fixup(av[1])+1):
220	charmap[i] = 1
221	elif op is CATEGORY:
222	# XXX: could append to charmap tail
223	return charset # cannot compress
224	except IndexError:
225	# character set contains unicode characters
226	return _optimize_unicode(charset, fixup)
227	# compress character map
228	i = p = n = 0
229	runs = []
230	runsappend = runs.append
231	for c in charmap:
232	if c:
233	if n == 0:
234	p = i
235	n = n + 1
236	elif n:
237	runsappend((p, n))
238	n = 0
239	i = i + 1
240	if n:
241	runsappend((p, n))
242	if len(runs) <= 2:
243	# use literal/range
244	for p, n in runs:
245	if n == 1:
246	outappend((LITERAL, p))
247	else:
248	outappend((RANGE, (p, p+n-1)))
249	if len(out) < len(charset):
250	return out
251	else:
252	# use bitmap
253	data = _mk_bitmap(charmap)
254	outappend((CHARSET, data))
255	return out
256	return charset
257
258	def _mk_bitmap(bits):
259	data = []
260	dataappend = data.append
261	if _sre.CODESIZE == 2:
262	start = (1, 0)
263	else:
264	start = (1L, 0L)
265	m, v = start
266	for c in bits:
267	if c:
268	v = v + m
269	m = m + m
270	if m > MAXCODE:
271	dataappend(v)
272	m, v = start
273	return data
274
275	# To represent a big charset, first a bitmap of all characters in the
276	# set is constructed. Then, this bitmap is sliced into chunks of 256
277	# characters, duplicate chunks are eliminated, and each chunk is
278	# given a number. In the compiled expression, the charset is
279	# represented by a 16-bit word sequence, consisting of one word for
280	# the number of different chunks, a sequence of 256 bytes (128 words)
281	# of chunk numbers indexed by their original chunk position, and a
282	# sequence of chunks (16 words each).
283
284	# Compression is normally good: in a typical charset, large ranges of
285	# Unicode will be either completely excluded (e.g. if only cyrillic
286	# letters are to be matched), or completely included (e.g. if large
287	# subranges of Kanji match). These ranges will be represented by
288	# chunks of all one-bits or all zero-bits.
289
# Matching can also be done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
294
295	# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
296	# of the basic multilingual plane; an efficient representation
297	# for all of UTF-16 has not yet been developed. This means,
298	# in particular, that negated charsets cannot be represented as
299	# bigcharsets.
300
def _optimize_unicode(charset, fixup):
    """Try to express *charset* as a single BIGCHARSET item.

    charset -- sequence of (op, av) set items.
    fixup   -- callable applied to literal and range bounds.

    Returns [(BIGCHARSET, data)], where data is the number of distinct
    256-bit chunks, then the 256 chunk indices packed as bytes into code
    words, then the bitmap of each distinct chunk.

    *charset* is returned unchanged when the array module is missing,
    when the set contains a CATEGORY item, when a code point does not
    fit the 65536-entry map, or when the set is negated and
    sys.maxunicode is not 65535.
    """
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        # the 256 map entries whose high byte is i
        chunk = tuple(charmap[i*256:(i+1)*256])
        # identical chunks share the number given to their first occurrence
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            block = block + 1
            data.extend(_mk_bitmap(chunk))
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('B', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]
353
354	def _simple(av):
355	# check if av is a "simple" operator
356	lo, hi = av[2].getwidth()
357	return lo == hi == 1 and av[2][0][0] != SUBPATTERN
358
359	def _compile_info(code, pattern, flags):
360	# internal: compile an info block. in the current version,
361	# this contains min/max pattern width, and an optional literal
362	# prefix or a character map
363	lo, hi = pattern.getwidth()
364	if lo == 0:
365	return # not worth it
366	# look for a literal prefix
367	prefix = []
368	prefixappend = prefix.append
369	prefix_skip = 0
370	charset = [] # not used
371	charsetappend = charset.append
372	if not (flags & SRE_FLAG_IGNORECASE):
373	# look for literal prefix
374	for op, av in pattern.data:
375	if op is LITERAL:
376	if len(prefix) == prefix_skip:
377	prefix_skip = prefix_skip + 1
378	prefixappend(av)
379	elif op is SUBPATTERN and len(av[1]) == 1:
380	op, av = av[1][0]
381	if op is LITERAL:
382	prefixappend(av)
383	else:
384	break
385	else:
386	break
387	# if no prefix, look for charset prefix
388	if not prefix and pattern.data:
389	op, av = pattern.data[0]
390	if op is SUBPATTERN and av[1]:
391	op, av = av[1][0]
392	if op is LITERAL:
393	charsetappend((op, av))
394	elif op is BRANCH:
395	c = []
396	cappend = c.append
397	for p in av[1]:
398	if not p:
399	break
400	op, av = p[0]
401	if op is LITERAL:
402	cappend((op, av))
403	else:
404	break
405	else:
406	charset = c
407	elif op is BRANCH:
408	c = []
409	cappend = c.append
410	for p in av[1]:
411	if not p:
412	break
413	op, av = p[0]
414	if op is LITERAL:
415	cappend((op, av))
416	else:
417	break
418	else:
419	charset = c
420	elif op is IN:
421	charset = av
422	## if prefix:
423	## print "*** PREFIX", prefix, prefix_skip
424	## if charset:
425	## print "*** CHARSET", charset
426	# add an info block
427	emit = code.append
428	emit(OPCODES[INFO])
429	skip = len(code); emit(0)
430	# literal flag
431	mask = 0
432	if prefix:
433	mask = SRE_INFO_PREFIX
434	if len(prefix) == prefix_skip == len(pattern.data):
435	mask = mask + SRE_INFO_LITERAL
436	elif charset:
437	mask = mask + SRE_INFO_CHARSET
438	emit(mask)
439	# pattern length
440	if lo < MAXCODE:
441	emit(lo)
442	else:
443	emit(MAXCODE)
444	prefix = prefix[:MAXCODE]
445	if hi < MAXCODE:
446	emit(hi)
447	else:
448	emit(0)
449	# add literal prefix
450	if prefix:
451	emit(len(prefix)) # length
452	emit(prefix_skip) # skip
453	code.extend(prefix)
454	# generate overlap table
455	table = [-1] + ([0]*len(prefix))
456	for i in xrange(len(prefix)):
457	table[i+1] = table[i]+1
458	while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
459	table[i+1] = table[table[i+1]-1]+1
460	code.extend(table[1:]) # don't store first entry
461	elif charset:
462	_compile_charset(charset, flags, code)
463	code[skip] = len(code) - skip
464
# Types accepted as pattern source: the native string type, plus the
# unicode type when the interpreter provides one.
STRING_TYPES = (type(""),)
try:
    STRING_TYPES = STRING_TYPES + (type(unicode("")),)
except NameError:
    pass
471
def isstring(obj):
    """Return 1 if *obj* is an instance of a type in STRING_TYPES, else 0."""
    if isinstance(obj, STRING_TYPES):
        return 1
    return 0
477
def _code(p, flags):
    """Return the list of code words for parsed pattern *p*.

    p     -- parsed pattern; its p.pattern.flags are OR-ed into *flags*.
    flags -- SRE_FLAG_* bits supplied by the caller.

    The result is the info block (if any), then the compiled pattern
    body, then a final SUCCESS opcode.
    """
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code
492
def compile(p, flags=0):
    """Convert a pattern to the engine's internal format.

    p     -- pattern source string, or an already parsed pattern.  A
             string is parsed with sre_parse.parse first.
    flags -- SRE_FLAG_* bits; combined with the flags recorded on the
             parsed pattern.

    Returns the object produced by _sre.compile.  The original source
    string is passed along when *p* was a string, otherwise None.

    Raises AssertionError when p.pattern.groups exceeds 100.
    """
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k

    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.7.6/Lib/sre_compile.py

Download in other formats: