source: python/trunk/Lib/xml/sax/expatreader.py

Last change on this file was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 14.3 KB
Line 
1"""
2SAX driver for the pyexpat C module. This driver works with
3pyexpat.__version__ == '2.22'.
4"""
5
version = "0.20"

from xml.sax._exceptions import *
from xml.sax.handler import (
    feature_validation,
    feature_namespaces,
    feature_namespace_prefixes,
    feature_external_ges,
    feature_external_pes,
    feature_string_interning,
    property_xml_string,
    property_interning_dict,
)

# Jython must be rejected explicitly: importing xml.parsers.expat there
# does not raise ImportError.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

# The driver is only usable when expat imports and exposes ParserCreate.
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
if not hasattr(expat, "ParserCreate"):
    raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Short module-level names for the two attribute container classes.
AttributesImpl = xmlreader.AttributesImpl
AttributesNSImpl = xmlreader.AttributesNSImpl

# _mkproxy(obj) yields the object stored by ExpatLocator.  When the
# _weakref module is importable it is weakref.proxy, which avoids a
# reference cycle between the parser and the content handler; otherwise
# it is the identity function and the cycle simply exists.
try:
    import _weakref
except ImportError:
    def _mkproxy(o):
        return o
else:
    from weakref import proxy as _mkproxy
    del _weakref
45
46# --- ExpatLocator
47
class ExpatLocator(xmlreader.Locator):
    """Locator handed to content handlers by ExpatParser.

    The owning parser is stored through ``_mkproxy`` (a weak proxy when
    weak references are available) so that giving this locator to the
    content handler does not tie the parser and the handler together in
    a reference cycle.
    """

    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's ErrorColumnNumber, or None with no expat parser."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorColumnNumber
        return None

    def getLineNumber(self):
        """Return expat's ErrorLineNumber, or 1 with no expat parser."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorLineNumber
        return 1

    def getPublicId(self):
        """Return the public id of the owner's current input source."""
        owner = self._ref
        return None if owner is None else owner._source.getPublicId()

    def getSystemId(self):
        """Return the system id of the owner's current input source."""
        owner = self._ref
        return None if owner is None else owner._source.getSystemId()
80
81
82# --- ExpatParser
83
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module.

    The instance is both the incremental parser and its own locator (it is
    passed as the locator when a SAXParseException is built in feed()).

    Instance state:
      _source        -- InputSource currently being read
      _parser        -- current expat parser object, or None
      _namespaces    -- truthy when namespace processing is enabled
      _parsing       -- 1 between the first feed() and close(), else 0
      _entity_stack  -- (parser, source) pairs saved while an external
                        entity is being parsed
      _external_ges  -- truthy when external general entities are read
      _interning     -- dict passed to expat for string interning, or None
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None
        self._parsing = 0
        self._entity_stack = []
        self._external_ges = 1
        self._interning = None

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Pass the source's system id, if any, to expat as the base URI."""
        if source.getSystemId() is not None:
            base = source.getSystemId()
            # Unicode system ids are handed to SetBase() UTF-8 encoded.
            if isinstance(base, unicode):
                base = base.encode('utf-8')
            self._parser.SetBase(base)

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            # Two expat callbacks are bound straight to handler methods,
            # so they must be re-bound to the new handler.
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of SAX feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set SAX feature *name* to *state*.

        Raises SAXNotSupportedException while parsing, or when a feature
        that expat cannot provide is switched on, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dict; only create one if needed.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of SAX property *name*.

        The XML-string property is only available while an expat parser
        exists; otherwise SAXNotSupportedException is raised.  Unknown
        properties raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set SAX property *name* to *value*.

        The XML-string property is read-only (SAXNotSupportedException);
        unknown properties raise SAXNotRecognizedException.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Feed a chunk of the document to expat.

        The first chunk resets the parser and emits startDocument().  An
        expat error is wrapped in a SAXParseException and passed to the
        error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat will check validity of the entire
            # document. When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the document: final feed, endDocument(), release expat.

        Does nothing while an external entity is being parsed, or when
        there is no expat parser to finish (never fed, or already closed).
        """
        if self._entity_stack or self._parser is None:
            # If we are completing an external entity, do nothing here
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
        finally:
            # Runs even when the final feed reports a fatal error, so a
            # failed close() cannot leave the reader stuck in the
            # "parsing" state.
            self._parsing = 0
            # break cycle created by expat handlers pointing to our methods
            self._parser = None

    def _reset_cont_handler(self):
        # These two callbacks bypass this class and call the content
        # handler directly.
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        # Attach the lexical handler's callbacks to expat, or detach them
        # all when no lexical handler is set.
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD takes its arguments in a different order than expat
            # supplies them, hence the adapter method.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and attach all callbacks to it."""
        if self._namespaces:
            # A single space separates URI, local name and prefix in the
            # names expat reports; the *_ns callbacks split on it.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
        # DefaultHandler, DefaultHandlerExpand and NotStandaloneHandler
        # are intentionally left unset.
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        return self._source.getPublicId()

    def getSystemId(self):
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        # Names arrive space separated: "local", "uri local" or
        # "uri local prefix".
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        # The element's own qname is always reported as None.
        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # Reorder to the (name, public id, system id) order of startDTD.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """expat callback: parse an external entity in a child parser.

        Returns 1 when the entity was skipped or parsed, 0 when resolving
        it succeeded but parsing it failed.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        # Swap in a child parser and the entity's source; a non-empty
        # stack also turns the close() issued by the nested parse() into
        # a no-op.
        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except:
            # Deliberately broad, as before: any failure is reported to
            # expat through the return value.
            return 0  # FIXME: save error info here?
        finally:
            # Restore the outer parser and source on failure too, so the
            # outer document is never fed to the entity's parser and
            # close() keeps working afterwards.
            (self._parser, self._source) = self._entity_stack.pop()
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
404
405# ---
406
def create_parser(*args, **kwargs):
    """Build an ExpatParser, forwarding all arguments to its constructor."""
    parser = ExpatParser(*args, **kwargs)
    return parser
409
410# ---
411
if __name__ == "__main__":
    # Manual smoke test: fetch a sample document over HTTP and echo it
    # back as XML through an XMLGenerator content handler.
    import xml.sax.saxutils
    demo_parser = create_parser()
    demo_parser.setContentHandler(xml.sax.saxutils.XMLGenerator())
    demo_parser.setErrorHandler(xml.sax.ErrorHandler())
    demo_parser.parse(
        "http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")
Note: See TracBrowser for help on using the repository browser.