Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

sre_compile.py@ 382

Last change on this file since 382 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 16.1 KB

Line
1	#
2	# Secret Labs' Regular Expression Engine
3	#
4	# convert template to internal format
5	#
6	# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7	#
8	# See the sre.py file for information on usage and redistribution.
9	#
10
11	"""Internal support module for sre"""
12
13	import _sre, sys
14	import sre_parse
15	from sre_constants import *
16
17	assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
19	if _sre.CODESIZE == 2:
20	MAXCODE = 65535
21	else:
22	MAXCODE = 0xFFFFFFFFL
23
24	def _identityfunction(x):
25	return x
26
27	def set(seq):
28	s = {}
29	for elem in seq:
30	s[elem] = 1
31	return s
32
33	_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
34	_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
35	_SUCCESS_CODES = set([SUCCESS, FAILURE])
36	_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
37
38	def _compile(code, pattern, flags):
39	# internal: compile a (sub)pattern
40	emit = code.append
41	_len = len
42	LITERAL_CODES = _LITERAL_CODES
43	REPEATING_CODES = _REPEATING_CODES
44	SUCCESS_CODES = _SUCCESS_CODES
45	ASSERT_CODES = _ASSERT_CODES
46	for op, av in pattern:
47	if op in LITERAL_CODES:
48	if flags & SRE_FLAG_IGNORECASE:
49	emit(OPCODES[OP_IGNORE[op]])
50	emit(_sre.getlower(av, flags))
51	else:
52	emit(OPCODES[op])
53	emit(av)
54	elif op is IN:
55	if flags & SRE_FLAG_IGNORECASE:
56	emit(OPCODES[OP_IGNORE[op]])
57	def fixup(literal, flags=flags):
58	return _sre.getlower(literal, flags)
59	else:
60	emit(OPCODES[op])
61	fixup = _identityfunction
62	skip = _len(code); emit(0)
63	_compile_charset(av, flags, code, fixup)
64	code[skip] = _len(code) - skip
65	elif op is ANY:
66	if flags & SRE_FLAG_DOTALL:
67	emit(OPCODES[ANY_ALL])
68	else:
69	emit(OPCODES[ANY])
70	elif op in REPEATING_CODES:
71	if flags & SRE_FLAG_TEMPLATE:
72	raise error, "internal: unsupported template operator"
73	emit(OPCODES[REPEAT])
74	skip = _len(code); emit(0)
75	emit(av[0])
76	emit(av[1])
77	_compile(code, av[2], flags)
78	emit(OPCODES[SUCCESS])
79	code[skip] = _len(code) - skip
80	elif _simple(av) and op is not REPEAT:
81	if op is MAX_REPEAT:
82	emit(OPCODES[REPEAT_ONE])
83	else:
84	emit(OPCODES[MIN_REPEAT_ONE])
85	skip = _len(code); emit(0)
86	emit(av[0])
87	emit(av[1])
88	_compile(code, av[2], flags)
89	emit(OPCODES[SUCCESS])
90	code[skip] = _len(code) - skip
91	else:
92	emit(OPCODES[REPEAT])
93	skip = _len(code); emit(0)
94	emit(av[0])
95	emit(av[1])
96	_compile(code, av[2], flags)
97	code[skip] = _len(code) - skip
98	if op is MAX_REPEAT:
99	emit(OPCODES[MAX_UNTIL])
100	else:
101	emit(OPCODES[MIN_UNTIL])
102	elif op is SUBPATTERN:
103	if av[0]:
104	emit(OPCODES[MARK])
105	emit((av[0]-1)*2)
106	# _compile_info(code, av[1], flags)
107	_compile(code, av[1], flags)
108	if av[0]:
109	emit(OPCODES[MARK])
110	emit((av[0]-1)*2+1)
111	elif op in SUCCESS_CODES:
112	emit(OPCODES[op])
113	elif op in ASSERT_CODES:
114	emit(OPCODES[op])
115	skip = _len(code); emit(0)
116	if av[0] >= 0:
117	emit(0) # look ahead
118	else:
119	lo, hi = av[1].getwidth()
120	if lo != hi:
121	raise error, "look-behind requires fixed-width pattern"
122	emit(lo) # look behind
123	_compile(code, av[1], flags)
124	emit(OPCODES[SUCCESS])
125	code[skip] = _len(code) - skip
126	elif op is CALL:
127	emit(OPCODES[op])
128	skip = _len(code); emit(0)
129	_compile(code, av, flags)
130	emit(OPCODES[SUCCESS])
131	code[skip] = _len(code) - skip
132	elif op is AT:
133	emit(OPCODES[op])
134	if flags & SRE_FLAG_MULTILINE:
135	av = AT_MULTILINE.get(av, av)
136	if flags & SRE_FLAG_LOCALE:
137	av = AT_LOCALE.get(av, av)
138	elif flags & SRE_FLAG_UNICODE:
139	av = AT_UNICODE.get(av, av)
140	emit(ATCODES[av])
141	elif op is BRANCH:
142	emit(OPCODES[op])
143	tail = []
144	tailappend = tail.append
145	for av in av[1]:
146	skip = _len(code); emit(0)
147	# _compile_info(code, av, flags)
148	_compile(code, av, flags)
149	emit(OPCODES[JUMP])
150	tailappend(_len(code)); emit(0)
151	code[skip] = _len(code) - skip
152	emit(0) # end of branch
153	for tail in tail:
154	code[tail] = _len(code) - tail
155	elif op is CATEGORY:
156	emit(OPCODES[op])
157	if flags & SRE_FLAG_LOCALE:
158	av = CH_LOCALE[av]
159	elif flags & SRE_FLAG_UNICODE:
160	av = CH_UNICODE[av]
161	emit(CHCODES[av])
162	elif op is GROUPREF:
163	if flags & SRE_FLAG_IGNORECASE:
164	emit(OPCODES[OP_IGNORE[op]])
165	else:
166	emit(OPCODES[op])
167	emit(av-1)
168	elif op is GROUPREF_EXISTS:
169	emit(OPCODES[op])
170	emit(av[0]-1)
171	skipyes = _len(code); emit(0)
172	_compile(code, av[1], flags)
173	if av[2]:
174	emit(OPCODES[JUMP])
175	skipno = _len(code); emit(0)
176	code[skipyes] = _len(code) - skipyes + 1
177	_compile(code, av[2], flags)
178	code[skipno] = _len(code) - skipno
179	else:
180	code[skipyes] = _len(code) - skipyes + 1
181	else:
182	raise ValueError, ("unsupported operand type", op)
183
184	def _compile_charset(charset, flags, code, fixup=None):
185	# compile charset subprogram
186	emit = code.append
187	if fixup is None:
188	fixup = _identityfunction
189	for op, av in _optimize_charset(charset, fixup):
190	emit(OPCODES[op])
191	if op is NEGATE:
192	pass
193	elif op is LITERAL:
194	emit(fixup(av))
195	elif op is RANGE:
196	emit(fixup(av[0]))
197	emit(fixup(av[1]))
198	elif op is CHARSET:
199	code.extend(av)
200	elif op is BIGCHARSET:
201	code.extend(av)
202	elif op is CATEGORY:
203	if flags & SRE_FLAG_LOCALE:
204	emit(CHCODES[CH_LOCALE[av]])
205	elif flags & SRE_FLAG_UNICODE:
206	emit(CHCODES[CH_UNICODE[av]])
207	else:
208	emit(CHCODES[av])
209	else:
210	raise error, "internal: unsupported set operator"
211	emit(OPCODES[FAILURE])
212
213	def _optimize_charset(charset, fixup):
214	# internal: optimize character set
215	out = []
216	outappend = out.append
217	charmap = [0]*256
218	try:
219	for op, av in charset:
220	if op is NEGATE:
221	outappend((op, av))
222	elif op is LITERAL:
223	charmap[fixup(av)] = 1
224	elif op is RANGE:
225	for i in range(fixup(av[0]), fixup(av[1])+1):
226	charmap[i] = 1
227	elif op is CATEGORY:
228	# XXX: could append to charmap tail
229	return charset # cannot compress
230	except IndexError:
231	# character set contains unicode characters
232	return _optimize_unicode(charset, fixup)
233	# compress character map
234	i = p = n = 0
235	runs = []
236	runsappend = runs.append
237	for c in charmap:
238	if c:
239	if n == 0:
240	p = i
241	n = n + 1
242	elif n:
243	runsappend((p, n))
244	n = 0
245	i = i + 1
246	if n:
247	runsappend((p, n))
248	if len(runs) <= 2:
249	# use literal/range
250	for p, n in runs:
251	if n == 1:
252	outappend((LITERAL, p))
253	else:
254	outappend((RANGE, (p, p+n-1)))
255	if len(out) < len(charset):
256	return out
257	else:
258	# use bitmap
259	data = _mk_bitmap(charmap)
260	outappend((CHARSET, data))
261	return out
262	return charset
263
264	def _mk_bitmap(bits):
265	data = []
266	dataappend = data.append
267	if _sre.CODESIZE == 2:
268	start = (1, 0)
269	else:
270	start = (1L, 0L)
271	m, v = start
272	for c in bits:
273	if c:
274	v = v + m
275	m = m + m
276	if m > MAXCODE:
277	dataappend(v)
278	m, v = start
279	return data
280
281	# To represent a big charset, first a bitmap of all characters in the
282	# set is constructed. Then, this bitmap is sliced into chunks of 256
283	# characters, duplicate chunks are eliminated, and each chunk is
284	# given a number. In the compiled expression, the charset is
285	# represented by a 16-bit word sequence, consisting of one word for
286	# the number of different chunks, a sequence of 256 bytes (128 words)
287	# of chunk numbers indexed by their original chunk position, and a
288	# sequence of chunks (16 words each).
289
290	# Compression is normally good: in a typical charset, large ranges of
291	# Unicode will be either completely excluded (e.g. if only cyrillic
292	# letters are to be matched), or completely included (e.g. if large
293	# subranges of Kanji match). These ranges will be represented by
294	# chunks of all one-bits or all zero-bits.
295
296	# Matching can be also done efficiently: the more significant byte of
297	# the Unicode character is an index into the chunk number, and the
298	# less significant byte is a bit index in the chunk (just like the
299	# CHARSET matching).
300
301	# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
302	# of the basic multilingual plane; an efficient representation
303	# for all of UTF-16 has not yet been developed. This means,
304	# in particular, that negated charsets cannot be represented as
305	# bigcharsets.
306
307	def _optimize_unicode(charset, fixup):
308	try:
309	import array
310	except ImportError:
311	return charset
312	charmap = [0]*65536
313	negate = 0
314	try:
315	for op, av in charset:
316	if op is NEGATE:
317	negate = 1
318	elif op is LITERAL:
319	charmap[fixup(av)] = 1
320	elif op is RANGE:
321	for i in xrange(fixup(av[0]), fixup(av[1])+1):
322	charmap[i] = 1
323	elif op is CATEGORY:
324	# XXX: could expand category
325	return charset # cannot compress
326	except IndexError:
327	# non-BMP characters
328	return charset
329	if negate:
330	if sys.maxunicode != 65535:
331	# XXX: negation does not work with big charsets
332	return charset
333	for i in xrange(65536):
334	charmap[i] = not charmap[i]
335	comps = {}
336	mapping = [0]*256
337	block = 0
338	data = []
339	for i in xrange(256):
340	chunk = tuple(charmap[i256:(i+1)256])
341	new = comps.setdefault(chunk, block)
342	mapping[i] = new
343	if new == block:
344	block = block + 1
345	data = data + _mk_bitmap(chunk)
346	header = [block]
347	if _sre.CODESIZE == 2:
348	code = 'H'
349	else:
350	code = 'I'
351	# Convert block indices to byte array of 256 bytes
352	mapping = array.array('b', mapping).tostring()
353	# Convert byte array to word array
354	mapping = array.array(code, mapping)
355	assert mapping.itemsize == _sre.CODESIZE
356	header = header + mapping.tolist()
357	data[0:0] = header
358	return [(BIGCHARSET, data)]
359
360	def _simple(av):
361	# check if av is a "simple" operator
362	lo, hi = av[2].getwidth()
363	if lo == 0 and hi == MAXREPEAT:
364	raise error, "nothing to repeat"
365	return lo == hi == 1 and av[2][0][0] != SUBPATTERN
366
367	def _compile_info(code, pattern, flags):
368	# internal: compile an info block. in the current version,
369	# this contains min/max pattern width, and an optional literal
370	# prefix or a character map
371	lo, hi = pattern.getwidth()
372	if lo == 0:
373	return # not worth it
374	# look for a literal prefix
375	prefix = []
376	prefixappend = prefix.append
377	prefix_skip = 0
378	charset = [] # not used
379	charsetappend = charset.append
380	if not (flags & SRE_FLAG_IGNORECASE):
381	# look for literal prefix
382	for op, av in pattern.data:
383	if op is LITERAL:
384	if len(prefix) == prefix_skip:
385	prefix_skip = prefix_skip + 1
386	prefixappend(av)
387	elif op is SUBPATTERN and len(av[1]) == 1:
388	op, av = av[1][0]
389	if op is LITERAL:
390	prefixappend(av)
391	else:
392	break
393	else:
394	break
395	# if no prefix, look for charset prefix
396	if not prefix and pattern.data:
397	op, av = pattern.data[0]
398	if op is SUBPATTERN and av[1]:
399	op, av = av[1][0]
400	if op is LITERAL:
401	charsetappend((op, av))
402	elif op is BRANCH:
403	c = []
404	cappend = c.append
405	for p in av[1]:
406	if not p:
407	break
408	op, av = p[0]
409	if op is LITERAL:
410	cappend((op, av))
411	else:
412	break
413	else:
414	charset = c
415	elif op is BRANCH:
416	c = []
417	cappend = c.append
418	for p in av[1]:
419	if not p:
420	break
421	op, av = p[0]
422	if op is LITERAL:
423	cappend((op, av))
424	else:
425	break
426	else:
427	charset = c
428	elif op is IN:
429	charset = av
430	## if prefix:
431	## print "*** PREFIX", prefix, prefix_skip
432	## if charset:
433	## print "*** CHARSET", charset
434	# add an info block
435	emit = code.append
436	emit(OPCODES[INFO])
437	skip = len(code); emit(0)
438	# literal flag
439	mask = 0
440	if prefix:
441	mask = SRE_INFO_PREFIX
442	if len(prefix) == prefix_skip == len(pattern.data):
443	mask = mask + SRE_INFO_LITERAL
444	elif charset:
445	mask = mask + SRE_INFO_CHARSET
446	emit(mask)
447	# pattern length
448	if lo < MAXCODE:
449	emit(lo)
450	else:
451	emit(MAXCODE)
452	prefix = prefix[:MAXCODE]
453	if hi < MAXCODE:
454	emit(hi)
455	else:
456	emit(0)
457	# add literal prefix
458	if prefix:
459	emit(len(prefix)) # length
460	emit(prefix_skip) # skip
461	code.extend(prefix)
462	# generate overlap table
463	table = [-1] + ([0]*len(prefix))
464	for i in xrange(len(prefix)):
465	table[i+1] = table[i]+1
466	while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
467	table[i+1] = table[table[i+1]-1]+1
468	code.extend(table[1:]) # don't store first entry
469	elif charset:
470	_compile_charset(charset, flags, code)
471	code[skip] = len(code) - skip
472
473	try:
474	unicode
475	except NameError:
476	STRING_TYPES = (type(""),)
477	else:
478	STRING_TYPES = (type(""), type(unicode("")))
479
480	def isstring(obj):
481	for tp in STRING_TYPES:
482	if isinstance(obj, tp):
483	return 1
484	return 0
485
486	def _code(p, flags):
487
488	flags = p.pattern.flags \| flags
489	code = []
490
491	# compile info block
492	_compile_info(code, p, flags)
493
494	# compile the pattern
495	_compile(code, p.data, flags)
496
497	code.append(OPCODES[SUCCESS])
498
499	return code
500
501	def compile(p, flags=0):
502	# internal: convert pattern list to internal format
503
504	if isstring(p):
505	pattern = p
506	p = sre_parse.parse(p, flags)
507	else:
508	pattern = None
509
510	code = _code(p, flags)
511
512	# print code
513
514	# XXX: <fl> get rid of this limitation!
515	if p.pattern.groups > 100:
516	raise AssertionError(
517	"sorry, but this version only supports 100 named groups"
518	)
519
520	# map in either direction
521	groupindex = p.pattern.groupdict
522	indexgroup = [None] * p.pattern.groups
523	for k, i in groupindex.items():
524	indexgroup[i] = k
525
526	return _sre.compile(
527	pattern, flags \| p.pattern.flags, code,
528	p.pattern.groups-1,
529	groupindex, indexgroup
530	)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/sre_compile.py@ 382

Download in other formats: