Context Navigation

xmllib.py

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 34.0 KB

Line
"""A parser for XML, using the derived class as static DTD."""

# Author: Sjoerd Mullender.

import re
import string

# Importing this module announces that it is obsolete; the temporary
# ``warnings`` name is removed again so it does not leak into the
# module namespace.
import warnings
warnings.warn("The xmllib module is obsolete. Use xml.sax instead.", DeprecationWarning)
del warnings

# Version string of this module.
version = '0.3'
13
class Error(RuntimeError):
    """Exception raised by the parser for errors it cannot recover from.

    The default ``XMLParser.syntax_error`` also raises it.
    """
    pass
16
# Regular expressions used for parsing

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
# A quoted string may hold any number of characters other than its own
# quote character, hence the ``*`` quantifiers.
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')

amp = re.compile('&')
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

# One attribute: leading white space, a name and an optional value that
# is either a quoted string or a run of "safe" unquoted characters.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!()_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                           '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>'+_QStr+')'
# ``%s`` is filled in with the group name below, so literal percent
# signs inside the character classes are doubled.
_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table mapping every white space character in an attribute
# value to a plain space.  Both arguments must have the same length.
# ``string.maketrans`` is used where it exists; ``str.maketrans`` is the
# fallback on interpreters that no longer provide it.
_maketrans = getattr(string, 'maketrans', None) or str.maketrans
attrtrans = _maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
81
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively. The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).

class XMLParser:
    """Incremental XML parser that dispatches to handler methods.

    Data is supplied with feed() and flushed with close().  Start and
    end tags are dispatched through the ``elements`` mapping (tag name
    to ``(start_method, end_method)``); everything else goes to the
    ``handle_*`` / ``unknown_*`` hooks, which derived classes override.
    """

    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    __accept_unquoted_attributes = 0
    __accept_missing_endtag_name = 0
    __map_case = 0
    __accept_utf8 = 0
    __translate_attribute_references = 1

    # Interface -- initialize and reset this instance
    def __init__(self, **kw):
        """Store the recognised parsing options from *kw* and reset.

        Recognised keys: accept_unquoted_attributes,
        accept_missing_endtag_name, map_case, accept_utf8 and
        translate_attribute_references.  Other keys are ignored.
        """
        self.__fixed = 0
        if 'accept_unquoted_attributes' in kw:
            self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
        if 'accept_missing_endtag_name' in kw:
            self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
        if 'map_case' in kw:
            self.__map_case = kw['map_case']
        if 'accept_utf8' in kw:
            self.__accept_utf8 = kw['accept_utf8']
        if 'translate_attribute_references' in kw:
            self.__translate_attribute_references = kw['translate_attribute_references']
        self.reset()

    def __fixelements(self):
        """Build self.elements from the start_*/end_* methods found."""
        self.__fixed = 1
        self.elements = {}
        self.__fixdict(self.__dict__)
        self.__fixclass(self.__class__)

    def __fixclass(self, kl):
        """Collect handlers from class *kl* and, recursively, its bases."""
        self.__fixdict(kl.__dict__)
        for k in kl.__bases__:
            self.__fixclass(k)

    def __fixdict(self, dict):
        """Register every start_<tag> / end_<tag> name found in *dict*.

        An entry already present in self.elements is not overwritten,
        so names found earlier take precedence.
        """
        for key in dict.keys():
            if key[:6] == 'start_':
                tag = key[6:]
                start, end = self.elements.get(tag, (None, None))
                if start is None:
                    self.elements[tag] = getattr(self, key), end
            elif key[:4] == 'end_':
                tag = key[4:]
                start, end = self.elements.get(tag, (None, None))
                if end is None:
                    self.elements[tag] = start, getattr(self, key)

    # Interface -- reset this instance. Loses all unprocessed data
    def reset(self):
        """Return the parser to its initial state, dropping buffered data."""
        self.rawdata = ''
        self.stack = []
        self.nomoretags = 0
        self.literal = 0
        self.lineno = 1
        self.__at_start = 1
        self.__seen_doctype = None
        self.__seen_starttag = 0
        self.__use_namespaces = 0
        self.__namespaces = {'xml':None} # xml is implicitly declared
        # backward compatibility hack: if elements not overridden,
        # fill it in ourselves
        if self.elements is XMLParser.elements:
            self.__fixelements()

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        """Treat all remaining input as plain data."""
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Enter literal mode; finish_endtag() leaves it again."""
        self.literal = 1

    # Interface -- feed some data to the parser. Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n'). (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append *data* to the buffer and parse as far as possible."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Parse whatever is still buffered as if end of input was reached."""
        self.goahead(1)
        if self.__fixed:
            self.__fixed = 0
            # remove self.elements so that we don't leak
            del self.elements

    # Interface -- translate references
    def translate_references(self, data, all = 1):
        """Return *data* with character and entity references expanded.

        *data* is returned unchanged when the
        translate_attribute_references option is false.  With *all*
        true, named entities are looked up in self.entitydefs and the
        replacement text is scanned again.  Problems are reported
        through syntax_error().
        """
        if not self.__translate_attribute_references:
            return data
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            s = res.start(0)
            res = ref.match(data, s)
            if res is None:
                self.syntax_error("bogus `&'")
                i = s+1
                continue
            i = res.end(0)
            text = res.group(1)
            rescan = 0
            if text[0] == '#':
                if text[1] == 'x':
                    text = chr(int(text[2:], 16))
                else:
                    text = chr(int(text[1:]))
                if data[i - 1] != ';':
                    self.syntax_error("`;' missing after char reference")
                    i = i-1
            elif all:
                if text in self.entitydefs:
                    text = self.entitydefs[text]
                    rescan = 1
                elif data[i - 1] != ';':
                    self.syntax_error("bogus `&'")
                    i = s + 1 # just past the &
                    continue
                else:
                    self.syntax_error("reference to unknown entity `&%s;'" % text)
                    text = '&' + text + ';'
            elif data[i - 1] != ';':
                self.syntax_error("bogus `&'")
                i = s + 1 # just past the &
                continue

            # when we get here, text contains the translated text and i
            # points to the end of the string that is to be replaced
            data = data[:s] + text + data[i:]
            if rescan:
                i = s
            else:
                i = s + len(text)

    # Interface - return a dictionary of all namespaces currently valid
    def getnamespace(self):
        """Merge the namespace dicts of all open elements, innermost last."""
        nsdict = {}
        for t, d, nst in self.stack:
            nsdict.update(d)
        return nsdict

    # Internal -- handle data as far as reasonable. May leave state
    # and data to be processed by a subsequent call. If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume self.rawdata, calling the handlers for what is found.

        Unprocessed input (an incomplete construct) is left in
        self.rawdata unless *end* is true.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                data = rawdata[i:j]
                if self.__at_start and space.match(data) is None:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                if not self.stack and space.match(data) is None:
                    self.syntax_error('data not in content')
                if not self.__accept_utf8 and illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    if version[1:-1] != '1.0':
                        raise Error('only XML version 1.0 supported')
                    if encoding: encoding = encoding[1:-1]
                    if standalone: standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    i = res.end(0)
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0: break
                    self.__seen_doctype = res.group('name')
                    if self.__map_case:
                        self.__seen_doctype = self.__seen_doctype.lower()
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if self.__map_case:
                        name = name.lower()
                    if name in self.entitydefs:
                        # Splice the replacement text into the buffer and
                        # parse it again from the same position.
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
            elif rawdata[i] == ']':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                if n-i < 3:
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise Error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if not self.__accept_utf8 and illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            self.rawdata = rawdata[i+1:]
            return self.goahead(end)
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
                    self.finish_endtag(self.stack[-1][0])

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        """Handle the comment starting at index *i* of self.rawdata.

        Returns the index just past ``-->``, or -1 if the comment is not
        complete yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise Error('unexpected call to handle_comment')
        res = commentclose.search(rawdata, i+4)
        if res is None:
            return -1
        if doubledash.search(rawdata, i+4, res.start(0)):
            self.syntax_error("`--' inside comment")
        if rawdata[res.start(0)-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if not self.__accept_utf8 and \
           illegal.search(rawdata, i+4, res.start(0)):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[i+4: res.start(0)])
        return res.end(0)

    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        """Finish parsing a DOCTYPE whose head was matched by *res*.

        Calls handle_doctype() and returns the index just past the
        closing ``>``, or -1 if the declaration is not complete yet.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        if self.__map_case:
            name = name.lower()
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]         # remove quotes
            pubid = ' '.join(pubid.split()) # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # Internal subset: scan for the matching ']' while keeping
            # track of quoted strings and nested <...> declarations.
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)

    # Internal -- handle CDATA tag, return length or -1 if not terminated
    def parse_cdata(self, i):
        """Handle the CDATA section starting at index *i*.

        Returns the index just past ``]]>``, or -1 if the section is
        not complete yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise Error('unexpected call to parse_cdata')
        res = cdataclose.search(rawdata, i+9)
        if res is None:
            return -1
        if not self.__accept_utf8 and \
           illegal.search(rawdata, i+9, res.start(0)):
            self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
    # Internal -- handle a processing instruction tag
    def parse_proc(self, i):
        """Handle the processing instruction starting at index *i*.

        Returns the index just past ``?>``, or -1 if the instruction is
        not complete yet.  The old ``<?xml:namespace ...?>`` form is
        treated as a namespace declaration.
        """
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise Error('unexpected call to parse_proc')
        k = res.end(0)
        name = res.group(0)
        if self.__map_case:
            name = name.lower()
        if name == 'xml:namespace':
            self.syntax_error('old-fashioned namespace declaration')
            self.__use_namespaces = -1
            # namespace declaration
            # this must come after the <?xml?> declaration (if any)
            # and before the <!DOCTYPE> (if any).
            if self.__seen_doctype or self.__seen_starttag:
                self.syntax_error('xml:namespace declaration too late in document')
            attrdict, namespace, k = self.parse_attributes(name, k, j)
            if namespace:
                self.syntax_error('namespace declaration inside namespace declaration')
            for attrname in attrdict.keys():
                if not attrname in self.__xml_namespace_attributes:
                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
            if not 'ns' in attrdict or not 'prefix' in attrdict:
                self.syntax_error('xml:namespace without required attributes')
                # Nothing can be registered without both attributes;
                # skip the declaration instead of failing on the
                # missing values when syntax_error() does not raise.
                return end.end(0)
            prefix = attrdict.get('prefix')
            if ncname.match(prefix) is None:
                self.syntax_error('xml:namespace illegal prefix value')
                return end.end(0)
            if prefix in self.__namespaces:
                self.syntax_error('xml:namespace prefix not unique')
            self.__namespaces[prefix] = attrdict['ns']
        else:
            if name.lower() == 'xml':
                self.syntax_error('illegal processing instruction target name')
            self.handle_proc(name, rawdata[k:j])
        return end.end(0)

    # Internal -- parse attributes between i and j
    def parse_attributes(self, tag, i, j):
        """Parse the attributes in self.rawdata[i:j].

        Returns ``(attrdict, namespace, i)``: the ordinary attributes,
        the namespace declarations (prefix to URI, '' for the default
        namespace) and the index where parsing stopped.
        """
        rawdata = self.rawdata
        attrdict = {}
        namespace = {}
        while i < j:
            res = attrfind.match(rawdata, i)
            if res is None:
                break
            attrname, attrvalue = res.group('name', 'value')
            if self.__map_case:
                attrname = attrname.lower()
            i = res.end(0)
            if attrvalue is None:
                self.syntax_error("no value specified for attribute `%s'" % attrname)
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            elif not self.__accept_unquoted_attributes:
                self.syntax_error("attribute `%s' value not quoted" % attrname)
            res = xmlns.match(attrname)
            if res is not None:
                # namespace declaration
                ncname = res.group('ncname')
                namespace[ncname or ''] = attrvalue or None
                if not self.__use_namespaces:
                    self.__use_namespaces = len(self.stack)+1
                continue
            if '<' in attrvalue:
                self.syntax_error("`<' illegal in attribute value")
            if attrname in attrdict:
                self.syntax_error("attribute `%s' specified twice" % attrname)
            attrvalue = attrvalue.translate(attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
        return attrdict, namespace, i

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        """Handle the start tag beginning at index *i* of self.rawdata.

        Pushes the element on self.stack, resolves namespaces, applies
        the defaults from self.attributes and calls finish_starttag()
        (and finish_endtag() for an empty-element tag).  Returns the
        index just past ``>``, or -1 if the tag is not complete yet.
        """
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        tag = starttagmatch.match(rawdata, i)
        if tag is None or tag.end(0) != end.end(0):
            self.syntax_error('garbage in starttag')
            return end.end(0)
        nstag = tagname = tag.group('tagname')
        if self.__map_case:
            nstag = tagname = nstag.lower()
        if not self.__seen_starttag and self.__seen_doctype and \
           tagname != self.__seen_doctype:
            self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        k, j = tag.span('attrs')
        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)
        self.stack.append((tagname, nsdict, nstag))
        if self.__use_namespaces:
            res = qname.match(tagname)
        else:
            res = None
        if res is not None:
            prefix, nstag = res.group('prefix', 'local')
            if prefix is None:
                prefix = ''
            ns = None
            # The innermost declaration wins: later stack entries
            # overwrite earlier ones.
            for t, d, nst in self.stack:
                if prefix in d:
                    ns = d[prefix]
            if ns is None and prefix != '':
                ns = self.__namespaces.get(prefix)
            if ns is not None:
                nstag = ns + ' ' + nstag
            elif prefix != '':
                nstag = prefix + ':' + nstag # undo split
            self.stack[-1] = tagname, nsdict, nstag
        # translate namespace of attributes
        attrnamemap = {} # map from new name to old name (used for error reporting)
        for key in attrdict.keys():
            attrnamemap[key] = key
        if self.__use_namespaces:
            nattrdict = {}
            for key, val in attrdict.items():
                okey = key
                res = qname.match(key)
                if res is not None:
                    aprefix, key = res.group('prefix', 'local')
                    if self.__map_case:
                        key = key.lower()
                    if aprefix is not None:
                        ans = None
                        for t, d, nst in self.stack:
                            if aprefix in d:
                                ans = d[aprefix]
                        if ans is None:
                            ans = self.__namespaces.get(aprefix)
                        if ans is not None:
                            key = ans + ' ' + key
                        else:
                            key = aprefix + ':' + key
                nattrdict[key] = val
                attrnamemap[key] = okey
            attrdict = nattrdict
        attributes = self.attributes.get(nstag)
        if attributes is not None:
            for key in attrdict.keys():
                if not key in attributes:
                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
            for key, val in attributes.items():
                if val is not None and not key in attrdict:
                    attrdict[key] = val
        method = self.elements.get(nstag, (None, None))[0]
        self.finish_starttag(nstag, attrdict, method)
        if tag.group('slash') == '/':
            self.finish_endtag(tagname)
        return tag.end(0)

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Handle the end tag beginning at index *i* of self.rawdata.

        Returns the index at which parsing continues, or -1 if the tag
        is not complete yet.
        """
        rawdata = self.rawdata
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        res = tagfind.match(rawdata, i+2)
        if res is None:
            if self.literal:
                self.handle_data(rawdata[i])
                return i+1
            if not self.__accept_missing_endtag_name:
                self.syntax_error('no name specified in end tag')
            # A name-less end tag closes the innermost open element.
            # With nothing open there is no name to borrow; pass an
            # empty name so finish_endtag() reports it instead of this
            # method failing on an empty stack.
            if self.stack:
                tag = self.stack[-1][0]
            else:
                tag = ''
            k = i+2
        else:
            tag = res.group(0)
            if self.__map_case:
                tag = tag.lower()
            if self.literal:
                if not self.stack or tag != self.stack[-1][0]:
                    self.handle_data(rawdata[i])
                    return i+1
            k = res.end(0)
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    def finish_starttag(self, tagname, attrdict, method):
        """Dispatch a start tag to *method*, or to unknown_starttag()."""
        if method is not None:
            self.handle_starttag(tagname, method, attrdict)
        else:
            self.unknown_starttag(tagname, attrdict)

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close element *tag* and every element still open inside it.

        Leaves literal mode.  An empty *tag* closes the innermost open
        element.
        """
        self.literal = 0
        if not tag:
            self.syntax_error('name-less end tag')
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            found = -1
            # Keep the last match so the innermost element named *tag*
            # is the one that gets closed.
            for i in range(len(self.stack)):
                if tag == self.stack[i][0]:
                    found = i
            if found == -1:
                self.syntax_error('unopened end tag')
                return
        while len(self.stack) > found:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1][2])
            nstag = self.stack[-1][2]
            method = self.elements.get(nstag, (None, None))[1]
            if method is not None:
                self.handle_endtag(nstag, method)
            else:
                self.unknown_endtag(nstag)
            if self.__use_namespaces == len(self.stack):
                self.__use_namespaces = 0
            del self.stack[-1]

    # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
        pass

    # Overridable -- handle DOCTYPE
    def handle_doctype(self, tag, pubid, syslit, data):
        pass

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        """Pass the character for reference *name* to handle_data().

        *name* is the text between ``&#`` and ``;``: decimal digits, or
        ``x`` followed by hex digits.  Values that do not parse or lie
        outside 0..255 go to unknown_charref().
        """
        try:
            if name[0] == 'x':
                n = int(name[1:], 16)
            else:
                n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    # The replacement text is spliced back into the input and parsed
    # again, so the values are character references rather than the
    # bare characters.
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, message):
        raise Error('Syntax error at line %d: %s' % (self.lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, name):
        self.syntax_error("reference to unknown entity `&%s;'" % name)
802
803
class TestXMLParser(XMLParser):
    """XMLParser subclass that prints every event, used by test().

    Character data is buffered in self.testdata and printed by flush()
    before any other event is reported.  Every print uses a single
    parenthesised argument so that the output is the same whether
    ``print`` is a statement or a function.
    """

    def __init__(self, **kw):
        self.testdata = ""
        XMLParser.__init__(self, **kw)

    def handle_xml(self, encoding, standalone):
        self.flush()
        print('xml: encoding = %s standalone = %s' % (encoding, standalone))

    def handle_doctype(self, tag, pubid, syslit, data):
        self.flush()
        print('DOCTYPE: %s %s' % (tag, repr(data)))

    def handle_data(self, data):
        # Collect data until its repr is long enough to fill a line.
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: %s' % repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: %s' % repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: %s %s' % (name, repr(data)))

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        # Long comments are shown as their first and last 32 characters.
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: %s' % r)

    def syntax_error(self, message):
        # Report the problem and carry on instead of raising Error.
        print('error at line %d: %s' % (self.lineno, message))

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Tag name, attributes and the closing '>' are printed on
            # one line, separated by single spaces.
            parts = ['start tag: <' + tag]
            for name, value in attrs.items():
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        XMLParser.close(self)
        self.flush()
872
def test(args = None):
    """Parse a file and report what was found; command line driver.

    *args* defaults to sys.argv[1:].  Options: ``-s`` uses the silent
    XMLParser instead of TestXMLParser, ``-t`` feeds the whole file at
    once and prints the elapsed time (otherwise the data is fed one
    character at a time).  The file name defaults to 'test.xml'; '-'
    reads standard input.  Exits with status 1 if the file cannot be
    opened or the parser raises Error.
    """
    import sys, getopt
    from time import time

    if not args:
        args = sys.argv[1:]

    opts, args = getopt.getopt(args, 'st')
    klass = TestXMLParser
    do_time = 0
    for o, a in opts:
        if o == '-s':
            klass = XMLParser
        elif o == '-t':
            do_time = 1

    if args:
        filename = args[0]
    else:
        filename = 'test.xml'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() gives the exception without relying on
            # version-specific ``except`` syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    # Close the file even when reading fails; stdin is left open.
    try:
        data = f.read()
    finally:
        if f is not sys.stdin:
            f.close()

    x = klass()
    t0 = time()
    try:
        if do_time:
            x.feed(data)
            x.close()
        else:
            for c in data:
                x.feed(c)
            x.close()
    except Error:
        msg = sys.exc_info()[1]
        t1 = time()
        print(msg)
        if do_time:
            print('total time: %g' % (t1-t0))
        sys.exit(1)
    t1 = time()
    if do_time:
        print('total time: %g' % (t1-t0))


if __name__ == '__main__':
    test()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Lib/xmllib.py

Download in other formats: