source: trunk/essentials/dev-lang/python/Lib/sgmllib.py

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 17.4 KB
Line 
1"""A parser for SGML, using the derived class as a static DTD."""
2
3# XXX This only supports those SGML features used by HTML.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special). RCDATA is
9# not supported at all.
10
11
12import markupbase
13import re
14
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# The two characters that can start markup or a reference; everything
# before the next one of these is plain data.
interesting = re.compile('[&<]')
# A '&' or '<' followed by something that could still grow into a
# reference, tag, end tag or declaration.  goahead() uses it to decide
# whether the tail of the buffer must wait for more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Entity / character references in data.  Both patterns consume one
# terminating character after the name or number (it need not be ';').
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the SGML shorthand '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# SGML short tag: <tag/data/ -- the opener, and the complete form
# (group 1 is the tag name, group 2 the data).
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# First '<' or '>' after a tag opener; used to delimit the tag text.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value'
# part, group 3 the value (single-quoted, double-quoted or bare).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
37
38
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
42
43
44# SGML parser base class -- find tags and call handler functions.
45# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
46# The dtd is defined by deriving a class which defines methods
47# with special names to handle tags: start_foo and end_foo to handle
48# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
49# (Tags are converted to lower case for this purpose.) The data
50# between tags is passed to the parser by calling self.handle_data()
51# with some data as argument (the data may be split up in arbitrary
52# chunks). Entity references are passed by calling
53# self.handle_entityref() with the entity reference as argument.
54
class SGMLParser(markupbase.ParserBase):
    """Incremental SGML/HTML tokenizer that dispatches to handler methods.

    Text is supplied with feed() and flushed with close().  For a tag
    <foo> the parser looks up start_foo (or do_foo) and end_foo on the
    instance; tags without such methods go to unknown_starttag() and
    unknown_endtag().  Character data is reported through handle_data().
    """

    # Matches a reference inside an attribute value: group 1 is an
    # entity name, group 2 a decimal character number, group 3 the
    # optional terminating ';'.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """

        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # An end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # parse_comment() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminator character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be processed yet for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text of the most recently parsed start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag,
    # or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match: break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without '=value': the name doubles as value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    # (substitution callback for entity_or_charref).  References that
    # cannot be converted are put back as they were written.
    def _convert_ref(self, match):
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without a terminating ';' is left untouched.
            return '&%s' % match.group(1)

    # Internal -- parse endtag, return the index just past the tag,
    # or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_ handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag (</>): close the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag: found = i
        # Close everything from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            # Single-argument print calls: same output as the print
            # statement, and valid syntax with or without it.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when 'name' is not a decimal number in 0..255.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 255:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for the given code point."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for names that are not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            # Reuse the value already looked up instead of converting
            # a second time (an overridden converter may be stateful).
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
454
455
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints every event it receives.

    Character data is buffered in self.testdata and printed by flush(),
    either when its repr grows to 70 characters or before any other
    event is reported.
    """

    def __init__(self, verbose=0):
        # The buffer must exist before SGMLParser.__init__ runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; flush once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment's repr, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag and its attributes on one line."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Build the line in one piece; the items are separated by a
            # single space, exactly as consecutive "print x," statements
            # would emit them.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
509
510
def test(args = None):
    """Parse a file and report what the parser sees.

    'args' defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the file
    to read ('-' for standard input, default 'test.html').  If the file
    cannot be opened the error is printed and the process exits with
    status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() instead of "except IOError, msg" so the
            # clause does not depend on version-specific syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even if read() fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()
545
546
547if __name__ == '__main__':
548 test()
Note: See TracBrowser for help on using the repository browser.