Context Navigation

sgmllib.py

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 17.4 KB

Line
1	"""A parser for SGML, using the derived class as a static DTD."""
2
3	# XXX This only supports those SGML features used by HTML.
4
5	# XXX There should be a way to distinguish between PCDATA (parsed
6	# character data -- the normal case), RCDATA (replaceable character
7	# data -- only char and entity references and end tags are special)
8	# and CDATA (character data -- only end tags are special). RCDATA is
9	# not supported at all.
10
11
12	import markupbase
13	import re
14
15	__all__ = ["SGMLParser", "SGMLParseError"]
16
17	# Regular expressions used for parsing
18
19	interesting = re.compile('[&<]')
20	incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]\|#[0-9])?\|'
21	'<([a-zA-Z][^<>]*\|'
22	'/([a-zA-Z][^<>]*)?\|'
23	'![^<>]*)?')
24
25	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
26	charref = re.compile('&#([0-9]+)[^0-9]')
27
28	starttagopen = re.compile('<[>a-zA-Z]')
29	shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
30	shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9])/([^/])/')
31	piclose = re.compile('>')
32	endbracket = re.compile('[<>]')
33	tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
34	attrfind = re.compile(
35	r'\s([a-zA-Z_][-:.a-zA-Z_0-9])(\s=\s'
36	r'(\'[^\']\'\|"[^"]"\|[][\-a-zA-Z0-9./,:;+%?!&$_#=~\'"@]))?')
37
38
class SGMLParseError(RuntimeError):
    """Error type used to report every kind of parse failure."""
42
43
44	# SGML parser base class -- find tags and call handler functions.
45	# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
46	# The dtd is defined by deriving a class which defines methods
47	# with special names to handle tags: start_foo and end_foo to handle
48	# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
49	# (Tags are converted to lower case for this purpose.) The data
50	# between tags is passed to the parser by calling self.handle_data()
51	# with some data as argument (the data may be split up in arbitrary
52	# chunks). Entity references are passed by calling
53	# self.handle_entityref() with the entity reference as argument.
54
class SGMLParser(markupbase.ParserBase):
    """Incremental SGML/HTML tokenizer that dispatches to handler methods.

    Text is supplied through feed() and finished with close().  For each
    start tag <foo> the parser calls start_foo(attrs) or, failing that,
    do_foo(attrs); for each end tag </foo> it calls end_foo().  Tags with
    no matching method go to the unknown_* hooks.  Character data,
    comments, declarations and processing instructions go to the
    corresponding handle_* methods.
    """

    # Matches an entity or character reference inside an attribute value.
    # Group 1: entity name, group 2: decimal character number,
    # group 3: the optional terminating ';'.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet processed
        self.stack = []         # open tags that have a start_* handler
        self.lasttag = '???'    # name used for the '<>' shorthand
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # parse_comment is not defined in this class; it is
                    # expected to come from the base class.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi returns a length, not an absolute index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration is not defined in this class; it
                    # is expected to come from the base class.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be processed yet for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start(0)])
        return match.end(0) - i

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag
    # (an absolute position in rawdata) or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without a value: the name doubles as the value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    def _convert_ref(self, match):
        # Unconvertible references are put back as they were written.
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without the closing ';' is never converted.
            return '&%s' % match.group(1)

    # Internal -- parse endtag, return the index just past the tag
    # or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_* handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag '</>' closes the innermost open tag.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # The lookup only decides which hook to call; the handler
                # itself is not invoked for an unbalanced end tag.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the last (innermost) occurrence of tag on the stack.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close everything from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            # Single-argument print calls give the same output whether
            # print is a statement or a function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not a decimal number in 0..255.
        """
        try:
            n = int(name)
        except ValueError:
            return None
        if not 0 <= n <= 255:
            return None
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*, may be overridden."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for names missing from the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            # Reuse the value already looked up instead of converting twice.
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
454
455
class TestSGMLParser(SGMLParser):
    """Debugging parser that prints every event it receives.

    Character data is buffered in self.testdata and printed by flush();
    every other event flushes the buffer first.
    """

    def __init__(self, verbose=0):
        # testdata must exist before SGMLParser.__init__ runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*; flush once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, abbreviating a repr longer than 68 chars."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag with its attributes on a single line."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Build the whole line first: the pieces are separated by one
            # space, as the comma-separated print statements emitted them.
            # NOTE(review): a value ending in a newline would have
            # suppressed the following space under the old statements.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
509
510
def test(args=None):
    """Parse a file with SGMLParser or TestSGMLParser.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the file
    to parse ('-' for standard input, default 'test.html').  Exits with
    status 1 when the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids the version-specific 'except X, e' /
            # 'except X as e' spellings.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)
        # Close the file even if reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feed one character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Lib/sgmllib.py

Download in other formats: