1 | """
|
---|
2 | SAX driver for the pyexpat C module. This driver works with
|
---|
3 | pyexpat.__version__ == '2.22'.
|
---|
4 | """
|
---|
5 |
|
---|
# Module-level version string (presumably the version of this driver itself;
# the pyexpat version it targets is named in the module docstring).
version = "0.20"
|
---|
7 |
|
---|
8 | from xml.sax._exceptions import *
|
---|
9 | from xml.sax.handler import feature_validation, feature_namespaces
|
---|
10 | from xml.sax.handler import feature_namespace_prefixes
|
---|
11 | from xml.sax.handler import feature_external_ges, feature_external_pes
|
---|
12 | from xml.sax.handler import feature_string_interning
|
---|
13 | from xml.sax.handler import property_xml_string, property_interning_dict
|
---|
14 |
|
---|
# Jython lets xml.parsers.expat be imported without raising ImportError, so
# the platform is checked explicitly before the import is attempted.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

# A missing module and a module lacking ParserCreate are reported the same way.
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
if not hasattr(expat, "ParserCreate"):
    raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Short module-level aliases for the attribute containers used by the
# element callbacks.
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl
|
---|
32 |
|
---|
33 | # If we're using a sufficiently recent version of Python, we can use
|
---|
34 | # weak references to avoid cycles between the parser and content
|
---|
35 | # handler, otherwise we'll just have to pretend.
|
---|
36 | try:
|
---|
37 | import _weakref
|
---|
38 | except ImportError:
|
---|
39 | def _mkproxy(o):
|
---|
40 | return o
|
---|
41 | else:
|
---|
42 | import weakref
|
---|
43 | _mkproxy = weakref.proxy
|
---|
44 | del weakref, _weakref
|
---|
45 |
|
---|
46 | # --- ExpatLocator
|
---|
47 |
|
---|
class ExpatLocator(xmlreader.Locator):
    """Locator handed to the content handler by ExpatParser.

    Only a proxy of the parser (as produced by _mkproxy) is stored, so the
    locator does not add a strong parser <-> content handler reference cycle.
    """

    def __init__(self, parser):
        # `parser` is the ExpatParser whose position is to be reported.
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without an expat parser."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorColumnNumber
        return None

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without an expat parser."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorLineNumber
        return 1

    def getPublicId(self):
        """Return the public id of the source the parser is reading."""
        owner = self._ref
        if owner is not None:
            return owner._source.getPublicId()
        return None

    def getSystemId(self):
        """Return the system id of the source the parser is reading."""
        owner = self._ref
        if owner is not None:
            return owner._source.getSystemId()
        return None
|
---|
80 |
|
---|
81 |
|
---|
82 | # --- ExpatParser
|
---|
83 |
|
---|
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module.

    The reader owns an expat parser object (created by reset()) and forwards
    expat's callbacks to the registered SAX handlers.  It also implements the
    Locator interface itself, which is why feed() can pass ``self`` to
    SAXParseException as the error location.
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Set up reader state; the expat parser is created later by reset().

        namespaceHandling -- initial state of the namespaces feature.
        bufsize -- forwarded unchanged to IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None              # current expat parser, if any
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None    # lexical handler property value
        self._parsing = 0                # true between first feed() and close()
        self._entity_stack = []          # saved (parser, source) of outer entities
        self._external_ges = 1           # external general entities feature
        self._interning = None           # dict passed to expat as `intern`, or None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Tell expat the base URI (the source's system id), if there is one."""
        base = source.getSystemId()
        if base is not None:
            # Only Python 2 has a separate `unicode` type that must be
            # encoded.  `str is bytes` is false on Python 3, so the name
            # `unicode` is never looked up there (it would be a NameError).
            if str is bytes and isinstance(base, unicode):
                base = base.encode('utf-8')
            self._parser.SetBase(base)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install a content handler; rebinds expat callbacks if parsing."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of a SAX feature.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            # These can never be switched on (see setFeature).
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Change the state of a SAX feature.

        Raises SAXNotSupportedException while parsing, or when asked to
        enable a feature this driver cannot provide, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dict if one is already in place.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of a SAX property.

        Raises SAXNotSupportedException when the XML string is requested
        while no expat parser exists, and SAXNotRecognizedException for
        unknown names or when expat lacks GetInputContext.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser is None:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
            if hasattr(self._parser, "GetInputContext"):
                return self._parser.GetInputContext()
            raise SAXNotRecognizedException(
                "This version of expat does not support getting"
                " the XML string")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set a SAX property.

        Raises SAXNotSupportedException for the read-only XML string
        property and SAXNotRecognizedException for unknown names.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal=0):
        """Pass a chunk of the document to expat.

        When no parse is in progress a new expat parser is created and
        startDocument() is reported first.  expat errors are wrapped in a
        SAXParseException and handed to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: final feed, endDocument(), drop the parser."""
        if self._entity_stack:
            # If we are completing an external entity, do nothing here
            return
        self.feed("", isFinal=1)
        self._cont_handler.endDocument()
        self._parsing = 0
        # break cycle created by expat handlers pointing to our methods
        self._parser = None

    def _reset_cont_handler(self):
        """Point expat's PI and character-data callbacks at the content handler."""
        self._parser.ProcessingInstructionHandler = \
            self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Install or clear the expat callbacks that feed the lexical handler."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD needs its arguments reordered, so it goes through us.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire up every callback."""
        if self._namespaces:
            # " " is the namespace separator; the *_ns callbacks rely on it
            # when they split the names expat reports.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern=self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None without an expat parser."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 without an expat parser."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public id of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the current input source."""
        return self._source.getSystemId()

    # event handlers

    def _split_ns_name(self, name):
        """Turn a space-separated element name from expat into a name pair.

        One field means no namespace, two fields are (uri, localname) for a
        default namespace, and with three fields the third one is dropped.
        """
        parts = name.split()
        count = len(parts)
        if count == 1:
            # no namespace
            return (None, name)
        if count == 3:
            return (parts[0], parts[1])
        # default namespace
        return tuple(parts)

    def start_element(self, name, attrs):
        """Report a start tag when namespace processing is off."""
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        """Report an end tag when namespace processing is off."""
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Report a start tag when namespace processing is on.

        The element qname is reported as None; attribute qnames are rebuilt
        from the fields expat supplies.
        """
        pair = self._split_ns_name(name)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                # third field is the prefix: rebuild "prefix:localname"
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Report an end tag when namespace processing is on."""
        self._cont_handler.endElementNS(self._split_ns_name(name), None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        """Forward a namespace declaration as startPrefixMapping()."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        """Forward the end of a namespace scope as endPrefixMapping()."""
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        """Forward a DOCTYPE start to the lexical handler.

        The callback receives sysid before pubid; startDTD() is called with
        pubid first, and has_internal_subset is not passed on.
        """
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        """Forward an unparsed entity declaration (pubid first, base dropped)."""
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        """Forward a notation declaration (pubid first, base dropped)."""
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external general entity with a child expat parser.

        Returns 1 when the entity was skipped (feature switched off) or
        parsed successfully, and 0 when parsing it raised an exception.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Remember the enclosing parser/source while the entity is current.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except Exception:
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            # The entity parser and source are left in place on failure.
            return 0  # FIXME: save error info here?

        (self._parser, self._source) = self._entity_stack.pop()
        return 1

    def skipped_entity_handler(self, name, is_pe):
        """Report a skipped entity to the content handler."""
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
|
---|
404 |
|
---|
405 | # ---
|
---|
406 |
|
---|
def create_parser(*args, **kwargs):
    """Build an ExpatParser, passing every argument through to its constructor."""
    reader = ExpatParser(*args, **kwargs)
    return reader
|
---|
409 |
|
---|
410 | # ---
|
---|
411 |
|
---|
if __name__ == "__main__":
    # Manual smoke test: fetch a sample document over the network and
    # re-serialise its SAX events with XMLGenerator.
    import xml.sax.saxutils
    demo_url = "http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml"
    p = create_parser()
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.parse(demo_url)
|
---|