1 | """Iterator based sre token scanner
|
---|
2 |
|
---|
3 | """
|
---|
4 |
|
---|
5 | import re
|
---|
6 | import sre_parse
|
---|
7 | import sre_compile
|
---|
8 | import sre_constants
|
---|
9 |
|
---|
10 | from re import VERBOSE, MULTILINE, DOTALL
|
---|
11 | from sre_constants import BRANCH, SUBPATTERN
|
---|
12 |
|
---|
# Names exported by ``from <module> import *``.
__all__ = ['Scanner', 'pattern']
14 |
|
---|
# Default regex flags applied to every token pattern.
FLAGS = (VERBOSE | MULTILINE | DOTALL)


class Scanner(object):
    """Token scanner built from one compound regular expression.

    Every lexicon entry ("token") must provide a ``pattern`` attribute
    holding its regular-expression source and must be callable as
    ``token(match, context)``, returning a ``(value, next_pos)`` pair.
    All token patterns are combined into a single alternation in which
    each token owns one numbered group, so the group that matched
    identifies the token to call.

    NOTE(review): each phrase is parsed with its own parser state, so
    capturing groups inside a phrase presumably clash with the per-token
    group ids -- verify before relying on them.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile ``lexicon`` into a single scanner pattern.

        :param lexicon: iterable of token callables carrying ``pattern``.
        :param flags: regex flags used for every phrase (default ``FLAGS``).
        :raises sre_constants.error: if a phrase is not a valid pattern
            (propagated unchanged from ``sre_parse.parse``).
        """
        import sys  # local import: only needed to pick the sre node layout

        # Flag enums are int subclasses; hand the sre internals a plain int.
        flags = int(flags)

        # Slot 0 is a placeholder so that group ids (which start at 1)
        # index straight into this list.
        self.actions = [None]

        # ``sre_parse.Pattern`` was renamed to ``State`` in Python 3.8.
        state_cls = getattr(sre_parse, 'State', None) or sre_parse.Pattern
        s = state_cls()
        s.flags = flags

        # SUBPATTERN nodes carry (group, add_flags, del_flags, pattern)
        # since Python 3.6; older interpreters use (group, pattern).
        wide_nodes = sys.version_info >= (3, 6)

        # Combine phrases into a compound pattern
        branches = []
        for token in lexicon:
            parsed = sre_parse.parse(token.pattern, flags)
            # Let the parser state allocate the group id; assigning to
            # ``s.groups`` directly fails where it is a read-only property.
            gid = s.opengroup()
            if wide_nodes:
                node = (SUBPATTERN, (gid, 0, 0, parsed))
            else:
                node = (SUBPATTERN, (gid, parsed))
            subpattern = sre_parse.SubPattern(s, [node])
            try:
                s.closegroup(gid, subpattern)
            except TypeError:
                # Older signature: closegroup(gid)
                s.closegroup(gid)
            branches.append(subpattern)
            self.actions.append(token)

        p = sre_parse.SubPattern(s, [(BRANCH, (None, branches))])
        self.scanner = sre_compile.compile(p)

    def iterscan(self, string, idx=0, context=None):
        """Yield ``(value, end_idx)`` for each token matched in ``string``.

        Scanning starts at ``idx`` and stops as soon as no token matches
        at the current position or a match ends where the previous one
        ended (no progress). ``context`` is passed through untouched to
        every token action.

        An action returns ``(value, next_pos)``; when ``next_pos`` is not
        ``None`` and differs from the end of the match, scanning resumes
        from ``next_pos`` and that position is reported as ``end_idx``.
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # Zero-progress match: stop instead of looping forever.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
62 |
|
---|
63 |
|
---|
def pattern(pattern, flags=FLAGS):
    """Return a decorator that tags a function with a regular expression.

    The decorated function is returned unchanged apart from two new
    attributes: ``pattern`` (the source string given here) and ``regex``
    (that string compiled with ``flags``).
    """
    def attach(func):
        # Store the raw source first, then the compiled form.
        setattr(func, 'pattern', pattern)
        setattr(func, 'regex', re.compile(pattern, flags))
        return func

    return attach