1 | "Utility functions used by the btm_matcher module"
|
---|
2 |
|
---|
3 | from . import pytree
|
---|
4 | from .pgen2 import grammar, token
|
---|
5 | from .pygram import pattern_symbols, python_symbols
|
---|
6 |
|
---|
7 | syms = pattern_symbols
|
---|
8 | pysyms = python_symbols
|
---|
9 | tokens = grammar.opmap
|
---|
10 | token_labels = token
|
---|
11 |
|
---|
12 | TYPE_ANY = -1
|
---|
13 | TYPE_ALTERNATIVES = -2
|
---|
14 | TYPE_GROUP = -3
|
---|
15 |
|
---|
16 | class MinNode(object):
|
---|
17 | """This class serves as an intermediate representation of the
|
---|
18 | pattern tree during the conversion to sets of leaf-to-root
|
---|
19 | subpatterns"""
|
---|
20 |
|
---|
21 | def __init__(self, type=None, name=None):
|
---|
22 | self.type = type
|
---|
23 | self.name = name
|
---|
24 | self.children = []
|
---|
25 | self.leaf = False
|
---|
26 | self.parent = None
|
---|
27 | self.alternatives = []
|
---|
28 | self.group = []
|
---|
29 |
|
---|
30 | def __repr__(self):
|
---|
31 | return str(self.type) + ' ' + str(self.name)
|
---|
32 |
|
---|
33 | def leaf_to_root(self):
|
---|
34 | """Internal method. Returns a characteristic path of the
|
---|
35 | pattern tree. This method must be run for all leaves until the
|
---|
36 | linear subpatterns are merged into a single"""
|
---|
37 | node = self
|
---|
38 | subp = []
|
---|
39 | while node:
|
---|
40 | if node.type == TYPE_ALTERNATIVES:
|
---|
41 | node.alternatives.append(subp)
|
---|
42 | if len(node.alternatives) == len(node.children):
|
---|
43 | #last alternative
|
---|
44 | subp = [tuple(node.alternatives)]
|
---|
45 | node.alternatives = []
|
---|
46 | node = node.parent
|
---|
47 | continue
|
---|
48 | else:
|
---|
49 | node = node.parent
|
---|
50 | subp = None
|
---|
51 | break
|
---|
52 |
|
---|
53 | if node.type == TYPE_GROUP:
|
---|
54 | node.group.append(subp)
|
---|
55 | #probably should check the number of leaves
|
---|
56 | if len(node.group) == len(node.children):
|
---|
57 | subp = get_characteristic_subpattern(node.group)
|
---|
58 | node.group = []
|
---|
59 | node = node.parent
|
---|
60 | continue
|
---|
61 | else:
|
---|
62 | node = node.parent
|
---|
63 | subp = None
|
---|
64 | break
|
---|
65 |
|
---|
66 | if node.type == token_labels.NAME and node.name:
|
---|
67 | #in case of type=name, use the name instead
|
---|
68 | subp.append(node.name)
|
---|
69 | else:
|
---|
70 | subp.append(node.type)
|
---|
71 |
|
---|
72 | node = node.parent
|
---|
73 | return subp
|
---|
74 |
|
---|
75 | def get_linear_subpattern(self):
|
---|
76 | """Drives the leaf_to_root method. The reason that
|
---|
77 | leaf_to_root must be run multiple times is because we need to
|
---|
78 | reject 'group' matches; for example the alternative form
|
---|
79 | (a | b c) creates a group [b c] that needs to be matched. Since
|
---|
80 | matching multiple linear patterns overcomes the automaton's
|
---|
81 | capabilities, leaf_to_root merges each group into a single
|
---|
82 | choice based on 'characteristic'ity,
|
---|
83 |
|
---|
84 | i.e. (a|b c) -> (a|b) if b more characteristic than c
|
---|
85 |
|
---|
86 | Returns: The most 'characteristic'(as defined by
|
---|
87 | get_characteristic_subpattern) path for the compiled pattern
|
---|
88 | tree.
|
---|
89 | """
|
---|
90 |
|
---|
91 | for l in self.leaves():
|
---|
92 | subp = l.leaf_to_root()
|
---|
93 | if subp:
|
---|
94 | return subp
|
---|
95 |
|
---|
96 | def leaves(self):
|
---|
97 | "Generator that returns the leaves of the tree"
|
---|
98 | for child in self.children:
|
---|
99 | for x in child.leaves():
|
---|
100 | yield x
|
---|
101 | if not self.children:
|
---|
102 | yield self
|
---|
103 |
|
---|
104 | def reduce_tree(node, parent=None):
|
---|
105 | """
|
---|
106 | Internal function. Reduces a compiled pattern tree to an
|
---|
107 | intermediate representation suitable for feeding the
|
---|
108 | automaton. This also trims off any optional pattern elements(like
|
---|
109 | [a], a*).
|
---|
110 | """
|
---|
111 |
|
---|
112 | new_node = None
|
---|
113 | #switch on the node type
|
---|
114 | if node.type == syms.Matcher:
|
---|
115 | #skip
|
---|
116 | node = node.children[0]
|
---|
117 |
|
---|
118 | if node.type == syms.Alternatives :
|
---|
119 | #2 cases
|
---|
120 | if len(node.children) <= 2:
|
---|
121 | #just a single 'Alternative', skip this node
|
---|
122 | new_node = reduce_tree(node.children[0], parent)
|
---|
123 | else:
|
---|
124 | #real alternatives
|
---|
125 | new_node = MinNode(type=TYPE_ALTERNATIVES)
|
---|
126 | #skip odd children('|' tokens)
|
---|
127 | for child in node.children:
|
---|
128 | if node.children.index(child)%2:
|
---|
129 | continue
|
---|
130 | reduced = reduce_tree(child, new_node)
|
---|
131 | if reduced is not None:
|
---|
132 | new_node.children.append(reduced)
|
---|
133 | elif node.type == syms.Alternative:
|
---|
134 | if len(node.children) > 1:
|
---|
135 |
|
---|
136 | new_node = MinNode(type=TYPE_GROUP)
|
---|
137 | for child in node.children:
|
---|
138 | reduced = reduce_tree(child, new_node)
|
---|
139 | if reduced:
|
---|
140 | new_node.children.append(reduced)
|
---|
141 | if not new_node.children:
|
---|
142 | # delete the group if all of the children were reduced to None
|
---|
143 | new_node = None
|
---|
144 |
|
---|
145 | else:
|
---|
146 | new_node = reduce_tree(node.children[0], parent)
|
---|
147 |
|
---|
148 | elif node.type == syms.Unit:
|
---|
149 | if (isinstance(node.children[0], pytree.Leaf) and
|
---|
150 | node.children[0].value == '('):
|
---|
151 | #skip parentheses
|
---|
152 | return reduce_tree(node.children[1], parent)
|
---|
153 | if ((isinstance(node.children[0], pytree.Leaf) and
|
---|
154 | node.children[0].value == '[')
|
---|
155 | or
|
---|
156 | (len(node.children)>1 and
|
---|
157 | hasattr(node.children[1], "value") and
|
---|
158 | node.children[1].value == '[')):
|
---|
159 | #skip whole unit if its optional
|
---|
160 | return None
|
---|
161 |
|
---|
162 | leaf = True
|
---|
163 | details_node = None
|
---|
164 | alternatives_node = None
|
---|
165 | has_repeater = False
|
---|
166 | repeater_node = None
|
---|
167 | has_variable_name = False
|
---|
168 |
|
---|
169 | for child in node.children:
|
---|
170 | if child.type == syms.Details:
|
---|
171 | leaf = False
|
---|
172 | details_node = child
|
---|
173 | elif child.type == syms.Repeater:
|
---|
174 | has_repeater = True
|
---|
175 | repeater_node = child
|
---|
176 | elif child.type == syms.Alternatives:
|
---|
177 | alternatives_node = child
|
---|
178 | if hasattr(child, 'value') and child.value == '=': # variable name
|
---|
179 | has_variable_name = True
|
---|
180 |
|
---|
181 | #skip variable name
|
---|
182 | if has_variable_name:
|
---|
183 | #skip variable name, '='
|
---|
184 | name_leaf = node.children[2]
|
---|
185 | if hasattr(name_leaf, 'value') and name_leaf.value == '(':
|
---|
186 | # skip parenthesis
|
---|
187 | name_leaf = node.children[3]
|
---|
188 | else:
|
---|
189 | name_leaf = node.children[0]
|
---|
190 |
|
---|
191 | #set node type
|
---|
192 | if name_leaf.type == token_labels.NAME:
|
---|
193 | #(python) non-name or wildcard
|
---|
194 | if name_leaf.value == 'any':
|
---|
195 | new_node = MinNode(type=TYPE_ANY)
|
---|
196 | else:
|
---|
197 | if hasattr(token_labels, name_leaf.value):
|
---|
198 | new_node = MinNode(type=getattr(token_labels, name_leaf.value))
|
---|
199 | else:
|
---|
200 | new_node = MinNode(type=getattr(pysyms, name_leaf.value))
|
---|
201 |
|
---|
202 | elif name_leaf.type == token_labels.STRING:
|
---|
203 | #(python) name or character; remove the apostrophes from
|
---|
204 | #the string value
|
---|
205 | name = name_leaf.value.strip("'")
|
---|
206 | if name in tokens:
|
---|
207 | new_node = MinNode(type=tokens[name])
|
---|
208 | else:
|
---|
209 | new_node = MinNode(type=token_labels.NAME, name=name)
|
---|
210 | elif name_leaf.type == syms.Alternatives:
|
---|
211 | new_node = reduce_tree(alternatives_node, parent)
|
---|
212 |
|
---|
213 | #handle repeaters
|
---|
214 | if has_repeater:
|
---|
215 | if repeater_node.children[0].value == '*':
|
---|
216 | #reduce to None
|
---|
217 | new_node = None
|
---|
218 | elif repeater_node.children[0].value == '+':
|
---|
219 | #reduce to a single occurence i.e. do nothing
|
---|
220 | pass
|
---|
221 | else:
|
---|
222 | #TODO: handle {min, max} repeaters
|
---|
223 | raise NotImplementedError
|
---|
224 | pass
|
---|
225 |
|
---|
226 | #add children
|
---|
227 | if details_node and new_node is not None:
|
---|
228 | for child in details_node.children[1:-1]:
|
---|
229 | #skip '<', '>' markers
|
---|
230 | reduced = reduce_tree(child, new_node)
|
---|
231 | if reduced is not None:
|
---|
232 | new_node.children.append(reduced)
|
---|
233 | if new_node:
|
---|
234 | new_node.parent = parent
|
---|
235 | return new_node
|
---|
236 |
|
---|
237 |
|
---|
238 | def get_characteristic_subpattern(subpatterns):
|
---|
239 | """Picks the most characteristic from a list of linear patterns
|
---|
240 | Current order used is:
|
---|
241 | names > common_names > common_chars
|
---|
242 | """
|
---|
243 | if not isinstance(subpatterns, list):
|
---|
244 | return subpatterns
|
---|
245 | if len(subpatterns)==1:
|
---|
246 | return subpatterns[0]
|
---|
247 |
|
---|
248 | # first pick out the ones containing variable names
|
---|
249 | subpatterns_with_names = []
|
---|
250 | subpatterns_with_common_names = []
|
---|
251 | common_names = ['in', 'for', 'if' , 'not', 'None']
|
---|
252 | subpatterns_with_common_chars = []
|
---|
253 | common_chars = "[]().,:"
|
---|
254 | for subpattern in subpatterns:
|
---|
255 | if any(rec_test(subpattern, lambda x: type(x) is str)):
|
---|
256 | if any(rec_test(subpattern,
|
---|
257 | lambda x: isinstance(x, str) and x in common_chars)):
|
---|
258 | subpatterns_with_common_chars.append(subpattern)
|
---|
259 | elif any(rec_test(subpattern,
|
---|
260 | lambda x: isinstance(x, str) and x in common_names)):
|
---|
261 | subpatterns_with_common_names.append(subpattern)
|
---|
262 |
|
---|
263 | else:
|
---|
264 | subpatterns_with_names.append(subpattern)
|
---|
265 |
|
---|
266 | if subpatterns_with_names:
|
---|
267 | subpatterns = subpatterns_with_names
|
---|
268 | elif subpatterns_with_common_names:
|
---|
269 | subpatterns = subpatterns_with_common_names
|
---|
270 | elif subpatterns_with_common_chars:
|
---|
271 | subpatterns = subpatterns_with_common_chars
|
---|
272 | # of the remaining subpatterns pick out the longest one
|
---|
273 | return max(subpatterns, key=len)
|
---|
274 |
|
---|
275 | def rec_test(sequence, test_func):
|
---|
276 | """Tests test_func on all items of sequence and items of included
|
---|
277 | sub-iterables"""
|
---|
278 | for x in sequence:
|
---|
279 | if isinstance(x, (list, tuple)):
|
---|
280 | for y in rec_test(x, test_func):
|
---|
281 | yield y
|
---|
282 | else:
|
---|
283 | yield test_func(x)
|
---|