Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

sre_compile.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 16.0 KB

Rev	Line
[2]	1	#
	2	# Secret Labs' Regular Expression Engine
	3	#
	4	# convert template to internal format
	5	#
	6	# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
	7	#
	8	# See the sre.py file for information on usage and redistribution.
	9	#
	10
	11	"""Internal support module for sre"""
	12
	13	import _sre, sys
	14	import sre_parse
	15	from sre_constants import *
	16
	17	assert _sre.MAGIC == MAGIC, "SRE module mismatch"
	18
	19	if _sre.CODESIZE == 2:
	20	MAXCODE = 65535
	21	else:
	22	MAXCODE = 0xFFFFFFFFL
	23
	24	def _identityfunction(x):
	25	return x
	26
	27	_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
	28	_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
	29	_SUCCESS_CODES = set([SUCCESS, FAILURE])
	30	_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
	31
	32	def _compile(code, pattern, flags):
	33	# internal: compile a (sub)pattern
	34	emit = code.append
	35	_len = len
	36	LITERAL_CODES = _LITERAL_CODES
	37	REPEATING_CODES = _REPEATING_CODES
	38	SUCCESS_CODES = _SUCCESS_CODES
	39	ASSERT_CODES = _ASSERT_CODES
	40	for op, av in pattern:
	41	if op in LITERAL_CODES:
	42	if flags & SRE_FLAG_IGNORECASE:
	43	emit(OPCODES[OP_IGNORE[op]])
	44	emit(_sre.getlower(av, flags))
	45	else:
	46	emit(OPCODES[op])
	47	emit(av)
	48	elif op is IN:
	49	if flags & SRE_FLAG_IGNORECASE:
	50	emit(OPCODES[OP_IGNORE[op]])
	51	def fixup(literal, flags=flags):
	52	return _sre.getlower(literal, flags)
	53	else:
	54	emit(OPCODES[op])
	55	fixup = _identityfunction
	56	skip = _len(code); emit(0)
	57	_compile_charset(av, flags, code, fixup)
	58	code[skip] = _len(code) - skip
	59	elif op is ANY:
	60	if flags & SRE_FLAG_DOTALL:
	61	emit(OPCODES[ANY_ALL])
	62	else:
	63	emit(OPCODES[ANY])
	64	elif op in REPEATING_CODES:
	65	if flags & SRE_FLAG_TEMPLATE:
	66	raise error, "internal: unsupported template operator"
	67	emit(OPCODES[REPEAT])
	68	skip = _len(code); emit(0)
	69	emit(av[0])
	70	emit(av[1])
	71	_compile(code, av[2], flags)
	72	emit(OPCODES[SUCCESS])
	73	code[skip] = _len(code) - skip
	74	elif _simple(av) and op is not REPEAT:
	75	if op is MAX_REPEAT:
	76	emit(OPCODES[REPEAT_ONE])
	77	else:
	78	emit(OPCODES[MIN_REPEAT_ONE])
	79	skip = _len(code); emit(0)
	80	emit(av[0])
	81	emit(av[1])
	82	_compile(code, av[2], flags)
	83	emit(OPCODES[SUCCESS])
	84	code[skip] = _len(code) - skip
	85	else:
	86	emit(OPCODES[REPEAT])
	87	skip = _len(code); emit(0)
	88	emit(av[0])
	89	emit(av[1])
	90	_compile(code, av[2], flags)
	91	code[skip] = _len(code) - skip
	92	if op is MAX_REPEAT:
	93	emit(OPCODES[MAX_UNTIL])
	94	else:
	95	emit(OPCODES[MIN_UNTIL])
	96	elif op is SUBPATTERN:
	97	if av[0]:
	98	emit(OPCODES[MARK])
	99	emit((av[0]-1)*2)
	100	# _compile_info(code, av[1], flags)
	101	_compile(code, av[1], flags)
	102	if av[0]:
	103	emit(OPCODES[MARK])
	104	emit((av[0]-1)*2+1)
	105	elif op in SUCCESS_CODES:
	106	emit(OPCODES[op])
	107	elif op in ASSERT_CODES:
	108	emit(OPCODES[op])
	109	skip = _len(code); emit(0)
	110	if av[0] >= 0:
	111	emit(0) # look ahead
	112	else:
	113	lo, hi = av[1].getwidth()
	114	if lo != hi:
	115	raise error, "look-behind requires fixed-width pattern"
	116	emit(lo) # look behind
	117	_compile(code, av[1], flags)
	118	emit(OPCODES[SUCCESS])
	119	code[skip] = _len(code) - skip
	120	elif op is CALL:
	121	emit(OPCODES[op])
	122	skip = _len(code); emit(0)
	123	_compile(code, av, flags)
	124	emit(OPCODES[SUCCESS])
	125	code[skip] = _len(code) - skip
	126	elif op is AT:
	127	emit(OPCODES[op])
	128	if flags & SRE_FLAG_MULTILINE:
	129	av = AT_MULTILINE.get(av, av)
	130	if flags & SRE_FLAG_LOCALE:
	131	av = AT_LOCALE.get(av, av)
	132	elif flags & SRE_FLAG_UNICODE:
	133	av = AT_UNICODE.get(av, av)
	134	emit(ATCODES[av])
	135	elif op is BRANCH:
	136	emit(OPCODES[op])
	137	tail = []
	138	tailappend = tail.append
	139	for av in av[1]:
	140	skip = _len(code); emit(0)
	141	# _compile_info(code, av, flags)
	142	_compile(code, av, flags)
	143	emit(OPCODES[JUMP])
	144	tailappend(_len(code)); emit(0)
	145	code[skip] = _len(code) - skip
	146	emit(0) # end of branch
	147	for tail in tail:
	148	code[tail] = _len(code) - tail
	149	elif op is CATEGORY:
	150	emit(OPCODES[op])
	151	if flags & SRE_FLAG_LOCALE:
	152	av = CH_LOCALE[av]
	153	elif flags & SRE_FLAG_UNICODE:
	154	av = CH_UNICODE[av]
	155	emit(CHCODES[av])
	156	elif op is GROUPREF:
	157	if flags & SRE_FLAG_IGNORECASE:
	158	emit(OPCODES[OP_IGNORE[op]])
	159	else:
	160	emit(OPCODES[op])
	161	emit(av-1)
	162	elif op is GROUPREF_EXISTS:
	163	emit(OPCODES[op])
	164	emit(av[0]-1)
	165	skipyes = _len(code); emit(0)
	166	_compile(code, av[1], flags)
	167	if av[2]:
	168	emit(OPCODES[JUMP])
	169	skipno = _len(code); emit(0)
	170	code[skipyes] = _len(code) - skipyes + 1
	171	_compile(code, av[2], flags)
	172	code[skipno] = _len(code) - skipno
	173	else:
	174	code[skipyes] = _len(code) - skipyes + 1
	175	else:
	176	raise ValueError, ("unsupported operand type", op)
	177
	178	def _compile_charset(charset, flags, code, fixup=None):
	179	# compile charset subprogram
	180	emit = code.append
	181	if fixup is None:
	182	fixup = _identityfunction
	183	for op, av in _optimize_charset(charset, fixup):
	184	emit(OPCODES[op])
	185	if op is NEGATE:
	186	pass
	187	elif op is LITERAL:
	188	emit(fixup(av))
	189	elif op is RANGE:
	190	emit(fixup(av[0]))
	191	emit(fixup(av[1]))
	192	elif op is CHARSET:
	193	code.extend(av)
	194	elif op is BIGCHARSET:
	195	code.extend(av)
	196	elif op is CATEGORY:
	197	if flags & SRE_FLAG_LOCALE:
	198	emit(CHCODES[CH_LOCALE[av]])
	199	elif flags & SRE_FLAG_UNICODE:
	200	emit(CHCODES[CH_UNICODE[av]])
	201	else:
	202	emit(CHCODES[av])
	203	else:
	204	raise error, "internal: unsupported set operator"
	205	emit(OPCODES[FAILURE])
	206
	207	def _optimize_charset(charset, fixup):
	208	# internal: optimize character set
	209	out = []
	210	outappend = out.append
	211	charmap = [0]*256
	212	try:
	213	for op, av in charset:
	214	if op is NEGATE:
	215	outappend((op, av))
	216	elif op is LITERAL:
	217	charmap[fixup(av)] = 1
	218	elif op is RANGE:
	219	for i in range(fixup(av[0]), fixup(av[1])+1):
	220	charmap[i] = 1
	221	elif op is CATEGORY:
	222	# XXX: could append to charmap tail
	223	return charset # cannot compress
	224	except IndexError:
	225	# character set contains unicode characters
	226	return _optimize_unicode(charset, fixup)
	227	# compress character map
	228	i = p = n = 0
	229	runs = []
	230	runsappend = runs.append
	231	for c in charmap:
	232	if c:
	233	if n == 0:
	234	p = i
	235	n = n + 1
	236	elif n:
	237	runsappend((p, n))
	238	n = 0
	239	i = i + 1
	240	if n:
	241	runsappend((p, n))
	242	if len(runs) <= 2:
	243	# use literal/range
	244	for p, n in runs:
	245	if n == 1:
	246	outappend((LITERAL, p))
	247	else:
	248	outappend((RANGE, (p, p+n-1)))
	249	if len(out) < len(charset):
	250	return out
	251	else:
	252	# use bitmap
	253	data = _mk_bitmap(charmap)
	254	outappend((CHARSET, data))
	255	return out
	256	return charset
	257
	258	def _mk_bitmap(bits):
	259	data = []
	260	dataappend = data.append
	261	if _sre.CODESIZE == 2:
	262	start = (1, 0)
	263	else:
	264	start = (1L, 0L)
	265	m, v = start
	266	for c in bits:
	267	if c:
	268	v = v + m
	269	m = m + m
	270	if m > MAXCODE:
	271	dataappend(v)
	272	m, v = start
	273	return data
	274
	275	# To represent a big charset, first a bitmap of all characters in the
	276	# set is constructed. Then, this bitmap is sliced into chunks of 256
	277	# characters, duplicate chunks are eliminated, and each chunk is
	278	# given a number. In the compiled expression, the charset is
	279	# represented by a 16-bit word sequence, consisting of one word for
	280	# the number of different chunks, a sequence of 256 bytes (128 words)
	281	# of chunk numbers indexed by their original chunk position, and a
	282	# sequence of chunks (16 words each).
	283
	284	# Compression is normally good: in a typical charset, large ranges of
	285	# Unicode will be either completely excluded (e.g. if only cyrillic
	286	# letters are to be matched), or completely included (e.g. if large
	287	# subranges of Kanji match). These ranges will be represented by
	288	# chunks of all one-bits or all zero-bits.
	289
	290	# Matching can be also done efficiently: the more significant byte of
	291	# the Unicode character is an index into the chunk number, and the
	292	# less significant byte is a bit index in the chunk (just like the
	293	# CHARSET matching).
	294
	295	# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
	296	# of the basic multilingual plane; an efficient representation
	297	# for all of UTF-16 has not yet been developed. This means,
	298	# in particular, that negated charsets cannot be represented as
	299	# bigcharsets.
	300
	301	def _optimize_unicode(charset, fixup):
	302	try:
	303	import array
	304	except ImportError:
	305	return charset
	306	charmap = [0]*65536
	307	negate = 0
	308	try:
	309	for op, av in charset:
	310	if op is NEGATE:
	311	negate = 1
	312	elif op is LITERAL:
	313	charmap[fixup(av)] = 1
	314	elif op is RANGE:
	315	for i in xrange(fixup(av[0]), fixup(av[1])+1):
	316	charmap[i] = 1
	317	elif op is CATEGORY:
	318	# XXX: could expand category
	319	return charset # cannot compress
	320	except IndexError:
	321	# non-BMP characters
	322	return charset
	323	if negate:
	324	if sys.maxunicode != 65535:
	325	# XXX: negation does not work with big charsets
	326	return charset
	327	for i in xrange(65536):
	328	charmap[i] = not charmap[i]
	329	comps = {}
	330	mapping = [0]*256
	331	block = 0
	332	data = []
	333	for i in xrange(256):
	334	chunk = tuple(charmap[i256:(i+1)256])
	335	new = comps.setdefault(chunk, block)
	336	mapping[i] = new
	337	if new == block:
	338	block = block + 1
	339	data = data + _mk_bitmap(chunk)
	340	header = [block]
	341	if _sre.CODESIZE == 2:
	342	code = 'H'
	343	else:
	344	code = 'I'
	345	# Convert block indices to byte array of 256 bytes
[391]	346	mapping = array.array('B', mapping).tostring()
[2]	347	# Convert byte array to word array
	348	mapping = array.array(code, mapping)
	349	assert mapping.itemsize == _sre.CODESIZE
	350	header = header + mapping.tolist()
	351	data[0:0] = header
	352	return [(BIGCHARSET, data)]
	353
	354	def _simple(av):
	355	# check if av is a "simple" operator
	356	lo, hi = av[2].getwidth()
	357	return lo == hi == 1 and av[2][0][0] != SUBPATTERN
	358
	359	def _compile_info(code, pattern, flags):
	360	# internal: compile an info block. in the current version,
	361	# this contains min/max pattern width, and an optional literal
	362	# prefix or a character map
	363	lo, hi = pattern.getwidth()
	364	if lo == 0:
	365	return # not worth it
	366	# look for a literal prefix
	367	prefix = []
	368	prefixappend = prefix.append
	369	prefix_skip = 0
	370	charset = [] # not used
	371	charsetappend = charset.append
	372	if not (flags & SRE_FLAG_IGNORECASE):
	373	# look for literal prefix
	374	for op, av in pattern.data:
	375	if op is LITERAL:
	376	if len(prefix) == prefix_skip:
	377	prefix_skip = prefix_skip + 1
	378	prefixappend(av)
	379	elif op is SUBPATTERN and len(av[1]) == 1:
	380	op, av = av[1][0]
	381	if op is LITERAL:
	382	prefixappend(av)
	383	else:
	384	break
	385	else:
	386	break
	387	# if no prefix, look for charset prefix
	388	if not prefix and pattern.data:
	389	op, av = pattern.data[0]
	390	if op is SUBPATTERN and av[1]:
	391	op, av = av[1][0]
	392	if op is LITERAL:
	393	charsetappend((op, av))
	394	elif op is BRANCH:
	395	c = []
	396	cappend = c.append
	397	for p in av[1]:
	398	if not p:
	399	break
	400	op, av = p[0]
	401	if op is LITERAL:
	402	cappend((op, av))
	403	else:
	404	break
	405	else:
	406	charset = c
	407	elif op is BRANCH:
	408	c = []
	409	cappend = c.append
	410	for p in av[1]:
	411	if not p:
	412	break
	413	op, av = p[0]
	414	if op is LITERAL:
	415	cappend((op, av))
	416	else:
	417	break
	418	else:
	419	charset = c
	420	elif op is IN:
	421	charset = av
	422	## if prefix:
	423	## print "*** PREFIX", prefix, prefix_skip
	424	## if charset:
	425	## print "*** CHARSET", charset
	426	# add an info block
	427	emit = code.append
	428	emit(OPCODES[INFO])
	429	skip = len(code); emit(0)
	430	# literal flag
	431	mask = 0
	432	if prefix:
	433	mask = SRE_INFO_PREFIX
	434	if len(prefix) == prefix_skip == len(pattern.data):
	435	mask = mask + SRE_INFO_LITERAL
	436	elif charset:
	437	mask = mask + SRE_INFO_CHARSET
	438	emit(mask)
	439	# pattern length
	440	if lo < MAXCODE:
	441	emit(lo)
	442	else:
	443	emit(MAXCODE)
	444	prefix = prefix[:MAXCODE]
	445	if hi < MAXCODE:
	446	emit(hi)
	447	else:
	448	emit(0)
	449	# add literal prefix
	450	if prefix:
	451	emit(len(prefix)) # length
	452	emit(prefix_skip) # skip
	453	code.extend(prefix)
	454	# generate overlap table
	455	table = [-1] + ([0]*len(prefix))
	456	for i in xrange(len(prefix)):
	457	table[i+1] = table[i]+1
	458	while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
	459	table[i+1] = table[table[i+1]-1]+1
	460	code.extend(table[1:]) # don't store first entry
	461	elif charset:
	462	_compile_charset(charset, flags, code)
	463	code[skip] = len(code) - skip
	464
	465	try:
	466	unicode
	467	except NameError:
	468	STRING_TYPES = (type(""),)
	469	else:
	470	STRING_TYPES = (type(""), type(unicode("")))
	471
	472	def isstring(obj):
	473	for tp in STRING_TYPES:
	474	if isinstance(obj, tp):
	475	return 1
	476	return 0
	477
	478	def _code(p, flags):
	479
	480	flags = p.pattern.flags \| flags
	481	code = []
	482
	483	# compile info block
	484	_compile_info(code, p, flags)
	485
	486	# compile the pattern
	487	_compile(code, p.data, flags)
	488
	489	code.append(OPCODES[SUCCESS])
	490
	491	return code
	492
	493	def compile(p, flags=0):
	494	# internal: convert pattern list to internal format
	495
	496	if isstring(p):
	497	pattern = p
	498	p = sre_parse.parse(p, flags)
	499	else:
	500	pattern = None
	501
	502	code = _code(p, flags)
	503
	504	# print code
	505
	506	# XXX: <fl> get rid of this limitation!
	507	if p.pattern.groups > 100:
	508	raise AssertionError(
	509	"sorry, but this version only supports 100 named groups"
	510	)
	511
	512	# map in either direction
	513	groupindex = p.pattern.groupdict
	514	indexgroup = [None] * p.pattern.groups
	515	for k, i in groupindex.items():
	516	indexgroup[i] = k
	517
	518	return _sre.compile(
	519	pattern, flags \| p.pattern.flags, code,
	520	p.pattern.groups-1,
	521	groupindex, indexgroup
	522	)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/sre_compile.py

Download in other formats: