1 | #
|
---|
2 | # ElementTree
|
---|
3 | # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
|
---|
4 | #
|
---|
5 | # limited xpath support for element trees
|
---|
6 | #
|
---|
7 | # history:
|
---|
8 | # 2003-05-23 fl created
|
---|
9 | # 2003-05-28 fl added support for // etc
|
---|
10 | # 2003-08-27 fl fixed parsing of periods in element names
|
---|
11 | # 2007-09-10 fl new selection engine
|
---|
12 | # 2007-09-12 fl fixed parent selector
|
---|
13 | # 2007-09-13 fl added iterfind; changed findall to return a list
|
---|
14 | # 2007-11-30 fl added namespaces support
|
---|
15 | # 2009-10-30 fl added child element value filter
|
---|
16 | #
|
---|
17 | # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
|
---|
18 | #
|
---|
19 | # fredrik@pythonware.com
|
---|
20 | # http://www.pythonware.com
|
---|
21 | #
|
---|
22 | # --------------------------------------------------------------------
|
---|
23 | # The ElementTree toolkit is
|
---|
24 | #
|
---|
25 | # Copyright (c) 1999-2009 by Fredrik Lundh
|
---|
26 | #
|
---|
27 | # By obtaining, using, and/or copying this software and/or its
|
---|
28 | # associated documentation, you agree that you have read, understood,
|
---|
29 | # and will comply with the following terms and conditions:
|
---|
30 | #
|
---|
31 | # Permission to use, copy, modify, and distribute this software and
|
---|
32 | # its associated documentation for any purpose and without fee is
|
---|
33 | # hereby granted, provided that the above copyright notice appears in
|
---|
34 | # all copies, and that both that copyright notice and this permission
|
---|
35 | # notice appear in supporting documentation, and that the name of
|
---|
36 | # Secret Labs AB or the author not be used in advertising or publicity
|
---|
37 | # pertaining to distribution of the software without specific, written
|
---|
38 | # prior permission.
|
---|
39 | #
|
---|
40 | # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
|
---|
41 | # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
|
---|
42 | # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
|
---|
43 | # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
|
---|
44 | # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
---|
45 | # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
---|
46 | # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
---|
47 | # OF THIS SOFTWARE.
|
---|
48 | # --------------------------------------------------------------------
|
---|
49 |
|
---|
50 | # Licensed to PSF under a Contributor Agreement.
|
---|
51 | # See http://www.python.org/psf/license for licensing details.
|
---|
52 |
|
---|
53 | ##
|
---|
54 | # Implementation module for XPath support. There's usually no reason
|
---|
55 | # to import this module directly; the <b>ElementTree</b> does this for
|
---|
56 | # you, if needed.
|
---|
57 | ##
|
---|
58 |
|
---|
59 | import re
|
---|
60 |
|
---|
# Tokenizer for the supported XPath subset.  Each match produces a pair:
# group 1 captures an operator or a quoted string literal, group 2 captures
# a tag/name (optionally prefixed with "{uri}").  A run of whitespace
# matches the final alternative, which leaves both groups empty.
#
# The pieces are raw strings so that regex escapes such as "\." and "\s"
# reach the re module untouched instead of being treated as (invalid)
# Python string escapes.
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|"
    r'"[^"]*"|'
    r"::|"
    r"//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )
72 |
|
---|
def xpath_tokenizer(pattern, namespaces=None):
    """Generate (operator, tag) token pairs for an XPath pattern.

    Tokens come from xpath_tokenizer_re.  A tag that contains ":" and does
    not already start with "{" is treated as "prefix:local" and rewritten
    to "{uri}local" using the namespaces mapping.

    Raises SyntaxError when such a prefix cannot be resolved, either
    because namespaces is empty/None or because the prefix is not a key.
    """
    for token in xpath_tokenizer_re.findall(pattern):
        name = token[1]
        # pass through operators, "{uri}tag" names and unprefixed names
        if not name or name[0] == "{" or ":" not in name:
            yield token
            continue
        try:
            prefix, local = name.split(":", 1)
            if not namespaces:
                raise KeyError
            yield token[0], "{%s}%s" % (namespaces[prefix], local)
        except KeyError:
            raise SyntaxError("prefix %r not found in prefix map" % prefix)
86 |
|
---|
def get_parent_map(context):
    """Return the child -> parent mapping stored on context.

    The mapping is built on first use by walking context.root.iter() and
    is then kept in context.parent_map, so later calls return the same
    dictionary.
    """
    cached = context.parent_map
    if cached is not None:
        return cached
    # publish the (still empty) dict first, then fill it in place
    mapping = context.parent_map = {}
    mapping.update(
        (child, parent)
        for parent in context.root.iter()
        for child in parent
    )
    return mapping
95 |
|
---|
def prepare_child(next, token):
    """Build a selector for a plain tag step.

    The returned generator function yields every direct child whose tag
    equals token[1], for each element of the incoming result.
    """
    wanted = token[1]

    def select(context, result):
        for parent in result:
            for child in parent:
                if child.tag != wanted:
                    continue
                yield child

    return select
104 |
|
---|
def prepare_star(next, token):
    """Build a selector for the "*" step.

    The returned generator function yields all direct children of each
    element of the incoming result, in order.
    """
    def select(context, result):
        for parent in result:
            for child in parent:
                yield child

    return select
111 |
|
---|
def prepare_self(next, token):
    """Build a selector for the "." step.

    The returned generator function passes every element of the incoming
    result through unchanged.
    """
    def select(context, result):
        for node in result:
            yield node

    return select
117 |
|
---|
def prepare_descendant(next, token):
    """Build a selector for the "//" step.

    Consumes the following token via next(): "*" selects every descendant,
    a plain tag selects descendants with that tag.  Any other token raises
    SyntaxError("invalid descendant").

    The returned generator function walks elem.iter(tag) for each element
    of the incoming result and skips the element itself.
    """
    following = next()
    operator = following[0]
    if operator == "*":
        tag = "*"
    elif operator:
        raise SyntaxError("invalid descendant")
    else:
        tag = following[1]

    def select(context, result):
        for ancestor in result:
            for node in ancestor.iter(tag):
                if node is ancestor:
                    continue
                yield node

    return select
132 |
|
---|
def prepare_parent(next, token):
    """Build a selector for the ".." step.

    The returned generator function looks each incoming element up in the
    parent map obtained from get_parent_map(context) and yields its
    parent.  Each parent is yielded once; elements with no entry in the
    map are skipped.
    """
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parents = get_parent_map(context)
        emitted = set()
        for node in result:
            if node not in parents:
                continue
            parent = parents[node]
            if parent in emitted:
                continue
            emitted.add(parent)
            yield parent

    return select
145 |
|
---|
def prepare_predicate(next, token):
    """Build a selector for a bracketed predicate.

    Tokens are pulled from next() up to the closing "]".  Whitespace
    tokens (both fields empty) are ignored.  The remaining tokens are
    reduced to a signature string that picks one of the supported forms:

      [@attribute]           element has the attribute
      [@attribute='value']   attribute equals value
      [tag]                  element has a child matching tag
      [tag='value']          a matching child has that complete text
      [index]                1-based position among same-tag siblings
      [last()]               last among same-tag siblings
      [last()-offset]        offset counted back from the last sibling

    Raises SyntaxError for anything else, for a position below 1 and for
    a last() expression that does not count backwards.  StopIteration
    from next() (missing "]") is left for the caller to handle.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    signature = []
    predicate = []
    while 1:
        token = next()
        if token[0] == "]":
            break
        if not token[0] and not token[1]:
            # whitespace between predicate tokens carries no meaning
            continue
        if token[0] and token[0][:1] in "'\"":
            # quoted literal: normalise the marker and strip the quotes
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        def select(context, result):
            for elem in result:
                if elem.get(key) is not None:
                    yield elem
        return select
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                if elem.get(key) == value:
                    yield elem
        return select
    if signature == "-" and not re.match(r"\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        def select(context, result):
            for elem in result:
                if elem.find(tag) is not None:
                    yield elem
        return select
    if signature == "-='" and not re.match(r"\d+$", predicate[0]):
        # [tag='value']
        tag = predicate[0]
        value = predicate[-1]
        def select(context, result):
            for elem in result:
                for e in elem.findall(tag):
                    if "".join(e.itertext()) == value:
                        yield elem
                        break
        return select
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            index = int(predicate[0]) - 1
            if index < 0:
                # [0] would otherwise turn into index -1, i.e. "last"
                raise SyntaxError("XPath position >= 1 expected")
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    # the offset token carries its own sign, e.g. "-1"
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
                if index > -2:
                    # a non-negative offset would index from the front
                    raise SyntaxError(
                        "XPath offset from last() must be negative")
            else:
                index = -1
        def select(context, result):
            parent_map = get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    # no parent known, or fewer siblings than the index
                    pass
        return select
    raise SyntaxError("invalid predicate")
225 |
|
---|
# Dispatch table used when compiling a path: the operator field of a token
# (the empty string for a plain tag name) selects the factory that builds
# the selector for that step.
ops = dict([
    ("", prepare_child),
    ("*", prepare_star),
    (".", prepare_self),
    ("..", prepare_parent),
    ("//", prepare_descendant),
    ("[", prepare_predicate),
])

# Cache of compiled selectors; iterfind fills it and clears it once it
# holds more than 100 entries.
_cache = {}
236 |
|
---|
class _SelectorContext:
    """Per-search state handed to every selector function.

    root is the element the search started from.  parent_map starts out
    as None and is replaced by a child -> parent dictionary the first
    time get_parent_map() is called with this context.
    """

    # class-level default, shadowed per instance once a map is built
    parent_map = None

    def __init__(self, root):
        self.root = root
241 |
|
---|
242 | # --------------------------------------------------------------------
|
---|
243 |
|
---|
244 | ##
|
---|
245 | # Generate all matching objects.
|
---|
246 |
|
---|
def iterfind(elem, path, namespaces=None):
    """Return an iterable over the elements below elem that match path.

    The path is tokenized with xpath_tokenizer, compiled into a list of
    selector functions through the ops table, and the selectors are then
    chained starting from [elem].  Compiled selectors are cached.

    A path ending in "/" is treated as if "*" had been appended.  An
    empty path matches nothing and produces an empty iterator.

    Raises SyntaxError for an absolute path (leading "/") and for a path
    that ends in the middle of a step.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*" # implicit all (FIXME: keep this?)
    # Prefixes are resolved to URIs while compiling, so the prefix map
    # must be part of the cache key; otherwise a selector compiled for one
    # mapping would be reused for another.
    if namespaces:
        cache_key = (path, frozenset(namespaces.items()))
    else:
        cache_key = (path, None)
    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        tokens = iter(xpath_tokenizer(path, namespaces))

        def next_token():
            # builtin next() works on both Python 2.6+ and Python 3
            return next(tokens)

        try:
            token = next_token()
        except StopIteration:
            # no tokens at all: don't leak StopIteration to the caller
            return iter(())
        selector = []
        while 1:
            try:
                selector.append(ops[token[0]](next_token, token))
            except StopIteration:
                # a step factory ran out of tokens mid-step
                raise SyntaxError("invalid path")
            try:
                token = next_token()
                if token[0] == "/":
                    token = next_token()
            except StopIteration:
                break
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result
279 |
|
---|
280 | ##
|
---|
281 | # Find first matching object.
|
---|
282 |
|
---|
def find(elem, path, namespaces=None):
    """Return the first element matching path, or None if there is none.

    Arguments are passed straight to iterfind.
    """
    try:
        # builtin next() instead of the Python 2-only .next() method
        return next(iter(iterfind(elem, path, namespaces)))
    except StopIteration:
        return None
288 |
|
---|
289 | ##
|
---|
290 | # Find all matching objects.
|
---|
291 |
|
---|
def findall(elem, path, namespaces=None):
    """Return a list of all elements matching path.

    Arguments are passed straight to iterfind and its result is
    collected into a list.
    """
    matches = iterfind(elem, path, namespaces)
    return list(matches)
294 |
|
---|
295 | ##
|
---|
296 | # Find text for first matching object.
|
---|
297 |
|
---|
def findtext(elem, path, default=None, namespaces=None):
    """Return the text of the first element matching path.

    Returns default when nothing matches.  When the matching element has
    no text (None or empty), the empty string is returned.
    """
    try:
        # builtin next() instead of the Python 2-only .next() method
        match = next(iter(iterfind(elem, path, namespaces)))
    except StopIteration:
        return default
    return match.text or ""