Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

sre_parse.py@ 388

Last change on this file since 388 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 26.2 KB

Line
1	#
2	# Secret Labs' Regular Expression Engine
3	#
4	# convert re-style regular expression to sre pattern
5	#
6	# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
7	#
8	# See the sre.py file for information on usage and redistribution.
9	#
10
11	"""Internal support module for sre"""
12
13	# XXX: show string offset and offending character for all errors
14
15	import sys
16
17	from sre_constants import *
18
def set(seq):
    # Minimal stand-in for a set type: return a dict whose keys are the
    # distinct elements of seq (each mapped to 1), so that "x in result"
    # works as a membership test.
    return dict.fromkeys(seq, 1)
24
# Characters that the parser does not treat as plain literals, and the
# subset of them that introduce a repeat operator.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Membership tables used for character classification while parsing.
DIGITS = set("0123456789")

OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")

WHITESPACE = set(" \t\n\r\v\f")
34
# Simple character escapes: each two-character backslash token maps to a
# (LITERAL, ordinal) code for the character it denotes.
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# Escapes that denote a position assertion (AT) or a character category
# (a one-element IN set).  Note that r"\b" is also a key of ESCAPES:
# _escape() looks in CATEGORIES first, _class_escape() in ESCAPES first.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# Flag letters accepted by the "(?...)" inline-flag syntax handled in
# _parse(), mapped to the SRE_FLAG_* bit that gets or'ed into the
# pattern flags.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
70
class Pattern:
    # Parser-wide state for one regular expression: the flag bits, the
    # next free group number, the name -> number table and the list of
    # groups whose closing parenthesis has not been seen yet.

    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1          # number the next opened group will get
        self.groupdict = {}

    def opengroup(self, name=None):
        # Allocate the next group number, optionally binding it to a
        # name, and mark the group as open.  Raises error if the name is
        # already bound to a group.
        number = self.groups
        self.groups = number + 1
        if name is not None:
            previous = self.groupdict.get(name, None)
            if previous is not None:
                message = ("redefinition of group name %s as group %d; "
                           "was group %d" % (repr(name), number, previous))
                raise error(message)
            self.groupdict[name] = number
        self.open.append(number)
        return number

    def closegroup(self, gid):
        # The group's closing parenthesis was reached.
        self.open.remove(gid)

    def checkgroup(self, gid):
        # True if gid has been allocated and is no longer open.
        if gid >= self.groups:
            return False
        return gid not in self.open
93
class SubPattern:
    # a subpattern, in intermediate form
    #
    # Wraps a list of (opcode, argument) pairs ("data") together with the
    # Pattern state object it belongs to, and offers list-like access to
    # that data.
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None   # filled in by the first getwidth() call
    def dump(self, level=0):
        # print the subpattern to standard output, one opcode per line,
        # indenting nested subpatterns by their nesting level.  "nl"
        # records whether the output currently sits at the start of a
        # line, so that a pending line is terminated before recursing.
        nl = 1
        seqtypes = type(()), type([])
        for op, av in self.data:
            print level*" " + op,; nl = 0
            if op == "in":
                # member sublanguage
                print; nl = 1
                for op, a in av:
                    print (level+1)*" " + op, a
            elif op == "branch":
                # alternatives live in av[1]; separate them with "or"
                print; nl = 1
                i = 0
                for a in av[1]:
                    if i > 0:
                        print level*" " + "or"
                    a.dump(level+1); nl = 1
                    i = i + 1
            elif type(av) in seqtypes:
                # mixed argument: recurse into nested subpatterns, print
                # everything else inline
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl: print
                        a.dump(level+1); nl = 1
                    else:
                        print a, ; nl = 0
            else:
                print av, ; nl = 0
            if not nl: print
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        # a slice yields a new SubPattern bound to the same pattern
        # state; a plain index yields the raw (opcode, argument) pair
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        # determine the width (min, max) for this subpattern
        #
        # The result is cached in self.width and both bounds are clamped
        # to sys.maxint.  Opcodes not listed below add nothing to either
        # bound.
        if self.width:
            return self.width
        lo = hi = 0L
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum over all
                # alternatives (note: the loop rebinds "av")
                i = sys.maxint
                j = 0
                for av in av[1]:
                    l, h = av.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                # av is (group, subpattern)
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                # av is (min count, max count, repeated item); long
                # arithmetic keeps the product from overflowing
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in UNITCODES:
                # these opcodes always consume exactly one character
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width
182
class Tokenizer:
    # Splits a string into tokens: a single character, or a backslash
    # together with the character following it.  The lookahead token is
    # kept in self.next (None once the input is exhausted) and
    # self.index is the offset just past that token.

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # advance the lookahead by one token
        text = self.string
        pos = self.index
        if pos >= len(text):
            self.next = None
            return
        token = text[pos]
        if token[0] == "\\":
            # a backslash always takes the following character with it
            try:
                follower = text[pos + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            token = token + follower
        self.index = pos + len(token)
        self.next = token

    def match(self, char, skip=1):
        # return 1 if the lookahead token equals char (consuming it
        # unless skip is false), else 0
        if char != self.next:
            return 0
        if skip:
            self.__next()
        return 1

    def get(self):
        # return the lookahead token and advance
        current = self.next
        self.__next()
        return current

    def tell(self):
        # snapshot of the position, suitable for seek()
        return self.index, self.next

    def seek(self, index):
        # restore a snapshot previously returned by tell()
        self.index, self.next = index
215
def isident(char):
    # true if char may start an identifier: an ASCII letter or "_"
    if "a" <= char <= "z":
        return True
    if "A" <= char <= "Z":
        return True
    return char == "_"
218
def isdigit(char):
    # true if char is an ASCII decimal digit
    return char >= "0" and char <= "9"
221
def isname(name):
    # check that group name is a valid string: an identifier start
    # character (see isident) followed by identifier characters or
    # digits.  Returns True or False.
    #
    # An empty name is reported as invalid rather than indexed, so that
    # callers raise their usual "bad character in group name" error
    # instead of leaking an IndexError for input such as "(?P<>x)".
    if not name or not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True
230
def _class_escape(source, escape):
    # handle escape code inside character class
    #
    # source is the tokenizer (further digits are pulled from it) and
    # escape the two-character backslash token just read.  Returns an
    # (opcode, argument) pair; raises error for a malformed escape.
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # Only character categories (IN codes) can be members of a set.
    # Position assertions such as \A, \B and \Z must not be returned
    # here; they fall through and are treated as plain characters.
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                # report the whole offending escape, including the "x"
                raise error("bogus escape: %s" % repr(escape))
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # 8 and 9 cannot start an octal escape
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
262
def _escape(source, escape, state):
    # Translate an escape token found outside a character class.
    #
    # source is the tokenizer (further digits are pulled from it),
    # escape the two-character backslash token just read, and state the
    # Pattern object used to validate group references.  Returns an
    # (opcode, argument) pair; raises error for anything malformed.
    known = CATEGORIES.get(escape) or ESCAPES.get(escape)
    if known:
        return known
    try:
        lead = escape[1:2]
        if lead == "x":
            # hexadecimal escape: exactly two digits are required
            while len(escape) < 4 and source.next in HEXDIGITS:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif lead == "0":
            # octal escape introduced by a zero
            while len(escape) < 4 and source.next in OCTDIGITS:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif lead in DIGITS:
            # either a three-digit octal escape or a decimal group
            # reference of one or two digits
            if source.next in DIGITS:
                escape = escape + source.get()
                three_octal = (escape[1] in OCTDIGITS and
                               escape[2] in OCTDIGITS and
                               source.next in OCTDIGITS)
                if three_octal:
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            number = int(escape[1:])
            if number >= state.groups:
                # no such group (yet): reported as a bogus escape below
                raise ValueError
            if not state.checkgroup(number):
                raise error("cannot refer to open group")
            return GROUPREF, number
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
306
def _parse_sub(source, state, nested=1):
    # parse an alternation: a|b|c
    #
    # source is the tokenizer and state the shared Pattern object.  When
    # nested is true, the alternation must be followed by ")" or the end
    # of the input, otherwise error is raised.  Returns a SubPattern.

    items = []
    itemsappend = items.append
    sourcematch = source.match
    while 1:
        itemsappend(_parse(source, state))
        if sourcematch("|"):
            continue
        if not nested:
            break
        # peek only: the closing parenthesis is left for the caller
        if not source.next or sourcematch(")", 0):
            break
        else:
            raise error("pattern not properly closed")

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        charset = [item[0] for item in items]
        subpatternappend((IN, charset))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern
365
def _parse_sub_cond(source, state, condgroup):
    # parse the body of a conditional group: "yes" or "yes|no"
    #
    # condgroup is the number of the group being tested.  Returns a
    # SubPattern holding a single GROUPREF_EXISTS code whose "no" branch
    # is None when absent.  Raises error for more than two branches or
    # when the body is not followed by ")" or the end of the input.
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    else:
        item_no = None
    # peek only: the closing parenthesis is left for the caller
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern
379
# Lookup tables for _parse(), built once at import time.
_PATTERNENDERS = set("|)")          # tokens that end a simple pattern
_ASSERTCHARS = set("=!<")           # may follow "(?" in an assertion
_LOOKBEHINDASSERTCHARS = set("=!")  # may follow "(?<"
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
384
def _parse(source, state):
    # parse a simple pattern
    #
    # Consumes tokens from source until "|", ")" or the end of the input
    # and returns them as a SubPattern of (opcode, argument) pairs.
    # state is the shared Pattern object (flags and group bookkeeping).
    # Raises error on malformed input.
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            charsetappend = charset.append
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters; "]" only closes the set once
            # something has been added after the optional "^"
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # trailing "-" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charsetappend(code1)
                        charsetappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("bad character range")
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error("bad character range")
                        charsetappend((RANGE, (lo, hi)))
                    else:
                        raise error("unexpected end of regular expression")
                else:
                    # unwrap a category so the set stays flat
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charsetappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if len(charset) == 1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                min_count, max_count = 0, 1
            elif this == "*":
                min_count, max_count = 0, MAXREPEAT
            elif this == "+":
                min_count, max_count = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is just a literal brace
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                min_count, max_count = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a repeat after all: emit the brace as a
                    # literal and rewind to just after it
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    min_count = int(lo)
                if hi:
                    max_count = int(hi)
                if max_count < min_count:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (min_count, max_count, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (min_count, max_count, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = no group body
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        # an empty name must not reach isname(), which
                        # indexes its first character
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in group name")
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in group name")
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error("unterminated name")
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if not condname:
                        raise error("missing group name")
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error("unknown group name")
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error("bad character in group name")
                else:
                    # flags
                    if source.next not in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # inline flags: only ")" may follow them
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error("unexpected end of pattern")
                    if char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern
668
def parse(str, flags=0, pattern=None):
    # Parse the 're' pattern string str into a SubPattern of
    # (opcode, argument) tuples.
    #
    # flags holds the initial SRE_FLAG_* bits.  pattern is an optional
    # Pattern state object to use; a fresh one is created when omitted.
    # Raises error if anything is left over after the top-level
    # alternation.

    tokens = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    tree = _parse_sub(tokens, pattern, 0)

    leftover = tokens.get()
    if leftover:
        if leftover == ")":
            raise error("unbalanced parenthesis")
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        tree.dump()

    verbose_requested = flags & SRE_FLAG_VERBOSE
    verbose_found = tree.pattern.flags & SRE_FLAG_VERBOSE
    if verbose_found and not verbose_requested:
        # the VERBOSE flag was switched on inside the pattern. to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, tree.pattern.flags)

    return tree
696
def parse_template(source, pattern):
    # Parse the 're' replacement string source into group references and
    # literal text.
    #
    # pattern must offer a "groupindex" mapping from group names to
    # numbers.  Returns (groups, literals): literals has one slot per
    # template part, None where a group goes, and groups lists the
    # (slot index, group number) pairs to fill in.
    tokens = Tokenizer(source)
    next_token = tokens.get
    parts = []
    add_part = parts.append

    def add_text(text, parts=parts, add_part=add_part):
        # merge adjacent literal text into a single LITERAL part
        if parts and parts[-1][0] is LITERAL:
            parts[-1] = LITERAL, parts[-1][1] + text
        else:
            add_part((LITERAL, text))

    # build characters of the same string type as the template
    if type(source[:0]) is type(""):
        makechar = chr
    else:
        makechar = unichr

    while 1:
        this = next_token()
        if this is None:
            break # end of replacement string
        if not (this and this[0] == "\\"):
            add_text(this)
            continue
        # backslash token: group reference or character escape
        kind = this[1:2]
        if kind == "g":
            # \g<name> or \g<number>
            name = ""
            if tokens.match("<"):
                while 1:
                    char = next_token()
                    if char is None:
                        raise error("unterminated group name")
                    if char == ">":
                        break
                    name = name + char
            if not name:
                raise error("bad group name")
            try:
                index = int(name)
                if index < 0:
                    raise error("negative group number")
            except ValueError:
                # not a number, so it has to be a known group name
                if not isname(name):
                    raise error("bad character in group name")
                try:
                    index = pattern.groupindex[name]
                except KeyError:
                    raise IndexError("unknown group name")
            add_part((MARK, index))
        elif kind == "0":
            # octal escape introduced by a zero, up to three digits
            if tokens.next in OCTDIGITS:
                this = this + next_token()
                if tokens.next in OCTDIGITS:
                    this = this + next_token()
            add_text(makechar(int(this[1:], 8) & 0xff))
        elif kind in DIGITS:
            # three octal digits form a character, otherwise this is a
            # one- or two-digit group reference
            isoctal = False
            if tokens.next in DIGITS:
                this = this + next_token()
                if (kind in OCTDIGITS and this[2] in OCTDIGITS and
                    tokens.next in OCTDIGITS):
                    this = this + next_token()
                    isoctal = True
                    add_text(makechar(int(this[1:], 8) & 0xff))
            if not isoctal:
                add_part((MARK, int(this[1:])))
        else:
            # known character escape, or the token kept verbatim
            try:
                this = makechar(ESCAPES[this][1])
            except KeyError:
                pass
            add_text(this)

    # convert template to groups and literals lists
    groups = []
    literals = [None] * len(parts)
    for slot, (code, value) in enumerate(parts):
        if code is MARK:
            # literals[slot] stays None until expansion
            groups.append((slot, value))
        else:
            literals[slot] = value
    return groups, literals
783
def expand_template(template, match):
    # Fill a parsed template, a (groups, literals) pair, with the text
    # of the groups of match and return the joined result.  Raises error
    # if a referenced group did not take part in the match or the
    # lookup raises IndexError.
    fetch = match.group
    empty = match.string[:0]   # empty string of the subject's type
    marks, pieces = template
    pieces = pieces[:]         # never modify the template itself
    try:
        for slot, group in marks:
            text = fetch(group)
            pieces[slot] = text
            if text is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return empty.join(pieces)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/sre_parse.py@ 388

Download in other formats: