Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

sgmllib.py@ 1538

Last change on this file since 1538 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 17.5 KB

Line
1	"""A parser for SGML, using the derived class as a static DTD."""
2
3	# XXX This only supports those SGML features used by HTML.
4
5	# XXX There should be a way to distinguish between PCDATA (parsed
6	# character data -- the normal case), RCDATA (replaceable character
7	# data -- only char and entity references and end tags are special)
8	# and CDATA (character data -- only end tags are special). RCDATA is
9	# not supported at all.
10
11
12	from warnings import warnpy3k
13	warnpy3k("the sgmllib module has been removed in Python 3.0",
14	stacklevel=2)
15	del warnpy3k
16
17	import markupbase
18	import re
19
20	__all__ = ["SGMLParser", "SGMLParseError"]
21
# Regular expressions used for parsing

# Next character that may start markup or a reference.
interesting = re.compile(r'[&<]')
# A prefix of a reference, start tag, end tag or declaration that has not
# been terminated yet; goahead() uses it to decide whether to wait for
# more input.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                           r'/([a-zA-Z][^<>]*)?|'
                           r'![^<>]*)?')

# Both reference patterns also consume the single character that follows
# the name/number; goahead() steps back over it unless it is ';'.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

starttagopen = re.compile(r'<[>a-zA-Z]')
# SGML short tag: <tag/data/
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile(r'>')
endbracket = re.compile(r'[<>]')
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (single-quoted, double-quoted or unquoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
42
43
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
47
48
49	# SGML parser base class -- find tags and call handler functions.
50	# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
51	# The dtd is defined by deriving a class which defines methods
52	# with special names to handle tags: start_foo and end_foo to handle
53	# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
54	# (Tags are converted to lower case for this purpose.) The data
55	# between tags is passed to the parser by calling self.handle_data()
56	# with some data as argument (the data may be split up in arbitrary
57	# chunks). Entity references are passed by calling
58	# self.handle_entityref() with the entity reference as argument.
59
class SGMLParser(markupbase.ParserBase):
    """Event-driven SGML parser that uses the derived class as a static DTD.

    Feed text with feed() and finish with close().  For every start tag
    <foo> the parser looks up start_foo() and then do_foo() on the
    instance; for every end tag it looks up end_foo().  Tags without a
    handler are reported through the unknown_*() hooks.  Text between tags
    is delivered through handle_data(), possibly in several chunks.
    """

    # Matches an entity reference (group 1) or a decimal character
    # reference (group 2) inside an attribute value; group 3 is the
    # optional terminating ';'.  Derived classes may override.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode start tags are ordinary text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # parse_comment() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one trailing character; give
                    # it back unless it was the terminating ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text saved by the most recent parse_starttag(), or None."""
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag
    # or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without "= value": the name doubles as value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    # Called by re.sub() with a match of entity_or_charref; references
    # that cannot be converted are put back as text.
    def _convert_ref(self, match):
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without ';' is left unconverted.
            return '&%s' % match.group(1)

    # Internal -- parse endtag, return the index just past the tag
    # or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_ handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag </> closes the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close every element opened at or after that position.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            # Single-argument print() gives the same output on
            # Python 2 and Python 3.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not a decimal number in 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for names missing from the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
459
460
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints every parse event to stdout.

    Character data is buffered in self.testdata and printed by flush(),
    either when the buffer grows long or just before another event is
    reported.
    """

    def __init__(self, verbose=0):
        # The buffer must exist before SGMLParser.__init__() runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*; flush once its repr() reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('%s %s' % ('data:', repr(data)))

    def handle_comment(self, data):
        """Print the comment, abbreviating a repr() longer than 68 chars."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('%s %s' % ('comment:', r))

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Pieces are joined by single spaces, including one before
            # the closing '>'.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
514
515
def test(args = None):
    """Parse a file and report the events; command-line test driver.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument names the
    input file ('test.html' when absent, '-' for standard input).  The
    data is fed to the parser one character at a time.  If the file
    cannot be opened, a message is printed and the process exits with
    status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print('%s : %s' % (path, msg))
            sys.exit(1)
        # Close the file even when reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/sgmllib.py@ 1538

Download in other formats: