1 | """HTML 2.0 parser.
|
---|
2 |
|
---|
3 | See the HTML 2.0 specification:
|
---|
4 | http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
|
---|
5 | """
|
---|
6 |
|
---|
# Emit the Python 3 removal notice at import time.  stacklevel=2 is passed
# through unchanged so the warning is reported against the importing code,
# and the helper module name is dropped again so that it does not linger in
# this module's namespace.
import warnings
warnings.warnpy3k(
    "the htmllib module has been removed in Python 3.0",
    stacklevel=2,
)
del warnings
11 |
|
---|
12 | import sgmllib
|
---|
13 |
|
---|
14 | from formatter import AS_IS
|
---|
15 |
|
---|
16 | __all__ = ["HTMLParser", "HTMLParseError"]
|
---|
17 |
|
---|
18 |
|
---|
class HTMLParseError(sgmllib.SGMLParseError):
    """Exception signalling that an HTML document could not be parsed.

    It adds nothing to sgmllib.SGMLParseError; it exists so that callers
    can catch HTML-specific parse failures by name.  HTMLParser.error()
    raises it.
    """
21 |
|
---|
22 |
|
---|
class HTMLParser(sgmllib.SGMLParser):
    """Basic HTML parser that drives a formatter object.

    All entity names required by the XHTML 1.0 Recommendation are
    supported, and handlers are defined for every HTML 2.0 element as well
    as many HTML 3.0 and 3.2 elements.

    Handler naming follows the sgmllib convention visible throughout this
    class: start_TAG(attrs) / end_TAG() pairs for container elements and
    do_TAG(attrs) for elements without an end tag.  ``attrs`` is iterated
    as (name, value) pairs.

    Font specifications handed to the formatter are 4-tuples; judging by
    the handlers below the slots are (size, italic, bold, teletype), with
    AS_IS meaning "leave this slot unchanged".
    """

    # Importing inside the class body binds the entity table as the class
    # attribute ``entitydefs``.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Create a parser that sends its output to *formatter*.

        *formatter* is the formatter instance associated with the parser;
        *verbose* is forwarded untouched to sgmllib.SGMLParser.
        """
        # The base initializer runs first; the formatter is attached
        # afterwards, exactly in this order.
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse error by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all HTML-specific document state."""
        sgmllib.SGMLParser.reset(self)
        # Capture buffer: None means "not capturing" (see save_bgn()).
        self.savedata = None
        # Facts collected from the document head.
        self.title = None
        self.base = None
        self.isindex = 0
        # Anchor bookkeeping: the current href and every href seen so far.
        self.anchor = None
        self.anchorlist = []
        # Nesting depth of literal (preformatted) regions.
        self.nofill = 0
        # Stack of [kind, label, counter] entries for open lists.
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    #     shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the capture buffer or the formatter.

        While a save_bgn()/save_end() capture is active the data is
        appended to the buffer.  Otherwise it goes to the formatter, as
        literal data when nofill is set and as flowing data when it is not.
        """
        buffered = self.savedata
        if buffered is not None:
            self.savedata = buffered + data
            return
        if self.nofill:
            self.formatter.add_literal_data(data)
            return
        self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start capturing character data instead of formatting it.

        The captured text is handed back by save_end().  Captures cannot
        be nested: calling save_bgn() again simply restarts with an empty
        buffer.
        """
        self.savedata = ''

    def save_end(self):
        """Stop capturing and return the text gathered since save_bgn().

        Runs of whitespace are collapsed to single spaces unless the
        nofill flag is set, in which case the text is returned verbatim.

        Without a preceding save_bgn() the buffer is None: in fill mode
        the whitespace collapse then fails because None has no split()
        method, and in nofill mode None is returned.
        """
        captured, self.savedata = self.savedata, None
        if self.nofill:
            return captured
        return ' '.join(captured.split())

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Called at the start of an anchor region.

        The arguments carry the values of the like-named attributes of the
        <A> tag.  This default implementation remembers *href* as the
        current anchor and, when it is non-empty, appends it to the
        anchorlist attribute; *name* and *type* are ignored here.
        """
        self.anchor = href
        if href:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Called at the end of an anchor region.

        If the anchor had an href, a textual footnote marker of the form
        "[N]" is emitted, N being the current length of anchorlist, and
        the current anchor is cleared.
        """
        if not self.anchor:
            return
        marker = "[%d]" % len(self.anchorlist)
        self.handle_data(marker)
        self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Called to handle an image.

        This default implementation only passes *alt* on to handle_data();
        *src* and any further positional arguments are ignored.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs):
        """<HTML>: nothing to do."""
        pass

    def end_html(self):
        """</HTML>: nothing to do."""
        pass

    def start_head(self, attrs):
        """<HEAD>: nothing to do."""
        pass

    def end_head(self):
        """</HEAD>: nothing to do."""
        pass

    def start_body(self, attrs):
        """<BODY>: nothing to do."""
        pass

    def end_body(self):
        """</BODY>: nothing to do."""
        pass

    # ------ Head elements

    def start_title(self, attrs):
        """<TITLE>: begin capturing the title text."""
        self.save_bgn()

    def end_title(self):
        """</TITLE>: store the captured text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """<BASE>: record the HREF value in the base attribute."""
        # Every href is assigned in turn, so the last one wins.
        for attrname, value in attrs:
            if attrname == 'href':
                self.base = value

    def do_isindex(self, attrs):
        """<ISINDEX>: set the isindex flag."""
        self.isindex = 1

    def do_link(self, attrs):
        """<LINK>: ignored."""
        pass

    def do_meta(self, attrs):
        """<META>: ignored."""
        pass

    def do_nextid(self, attrs):
        """<NEXTID>: ignored (deprecated element)."""
        pass

    # ------ Body elements

    # --- Headings
    #
    # Every heading level ends the current paragraph with one blank line
    # and pushes a font of its own size, non-italic, bold, non-teletype;
    # the matching end handler ends the paragraph and restores the font.

    def start_h1(self, attrs):
        """<H1>: open a level 1 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        """</H1>: close a level 1 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()

    def start_h2(self, attrs):
        """<H2>: open a level 2 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        """</H2>: close a level 2 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()

    def start_h3(self, attrs):
        """<H3>: open a level 3 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        """</H3>: close a level 3 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()

    def start_h4(self, attrs):
        """<H4>: open a level 4 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        """</H4>: close a level 4 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()

    def start_h5(self, attrs):
        """<H5>: open a level 5 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        """</H5>: close a level 5 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()

    def start_h6(self, attrs):
        """<H6>: open a level 6 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        """</H6>: close a level 6 heading."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """<P>: end the current paragraph with one blank line."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """<PRE>: switch to the teletype font and enter literal mode."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_font((AS_IS, AS_IS, AS_IS, 1))
        # nofill is a depth counter so that nested regions balance.
        self.nofill += 1

    def end_pre(self):
        """</PRE>: restore the font and leave one level of literal mode."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_font()
        # Clamp at zero so a stray end tag cannot drive the depth negative.
        depth = self.nofill - 1
        self.nofill = max(0, depth)

    def start_xmp(self, attrs):
        """<XMP>: behaves like <PRE> and puts the SGML parser in literal mode."""
        self.start_pre(attrs)
        self.setliteral('xmp')  # Tell SGML parser

    def end_xmp(self):
        """</XMP>: same as </PRE>."""
        self.end_pre()

    def start_listing(self, attrs):
        """<LISTING>: behaves like <PRE> and puts the SGML parser in literal mode."""
        self.start_pre(attrs)
        self.setliteral('listing')  # Tell SGML parser

    def end_listing(self):
        """</LISTING>: same as </PRE>."""
        self.end_pre()

    def start_address(self, attrs):
        """<ADDRESS>: end the paragraph (no blank line) and switch to italic."""
        fmt = self.formatter
        fmt.end_paragraph(0)
        fmt.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        """</ADDRESS>: end the paragraph (no blank line) and restore the font."""
        fmt = self.formatter
        fmt.end_paragraph(0)
        fmt.pop_font()

    def start_blockquote(self, attrs):
        """<BLOCKQUOTE>: end the paragraph and push a 'blockquote' margin."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.push_margin('blockquote')

    def end_blockquote(self):
        """</BLOCKQUOTE>: end the paragraph and pop the margin."""
        fmt = self.formatter
        fmt.end_paragraph(1)
        fmt.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """<UL>: open an unordered list labelled with '*'."""
        # A blank line is requested only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """</UL>: close the innermost list, if any, and pop its margin."""
        if self.list_stack:
            self.list_stack.pop()
        # Evaluated after the pop: blank line only when no list remains open.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """<LI>: emit the label for the next item of the innermost list.

        The innermost list's counter is incremented and passed to the
        formatter along with the list's label.  Outside of any list the
        label '*' with counter 0 is used.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            entry = self.list_stack[-1]
            _kind, label, counter = entry
            counter += 1
            # Write the new count back into the shared stack entry.
            entry[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """<OL>: open an ordered list.

        The label format defaults to '1.'.  A TYPE attribute overrides it;
        a one-character value gets a trailing '.' appended.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for attrname, value in attrs:
            if attrname != 'type':
                continue
            label = value + '.' if len(value) == 1 else value
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """</OL>: close the innermost list, if any, and pop its margin."""
        if self.list_stack:
            self.list_stack.pop()
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        """<MENU>: handled exactly like <UL>."""
        self.start_ul(attrs)

    def end_menu(self):
        """</MENU>: handled exactly like </UL>."""
        self.end_ul()

    def start_dir(self, attrs):
        """<DIR>: handled exactly like <UL>."""
        self.start_ul(attrs)

    def end_dir(self):
        """</DIR>: handled exactly like </UL>."""
        self.end_ul()

    def start_dl(self, attrs):
        """<DL>: open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """</DL>: close any open <DD>, then the definition list itself."""
        self.ddpop(1)
        if self.list_stack:
            self.list_stack.pop()

    def do_dt(self, attrs):
        """<DT>: close any open <DD> region."""
        self.ddpop()

    def do_dd(self, attrs):
        """<DD>: close any open <DD> region and start a new indented one."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close an open <DD> region, if there is one.

        *bl* is passed straight to the formatter's end_paragraph().  The
        margin is popped only when the innermost stack entry is a 'dd'.
        """
        self.formatter.end_paragraph(bl)
        stack = self.list_stack
        if stack and stack[-1][0] == 'dd':
            stack.pop()
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs):
        """<CITE>: rendered like <I>."""
        self.start_i(attrs)

    def end_cite(self):
        """</CITE>: rendered like </I>."""
        self.end_i()

    def start_code(self, attrs):
        """<CODE>: rendered like <TT>."""
        self.start_tt(attrs)

    def end_code(self):
        """</CODE>: rendered like </TT>."""
        self.end_tt()

    def start_em(self, attrs):
        """<EM>: rendered like <I>."""
        self.start_i(attrs)

    def end_em(self):
        """</EM>: rendered like </I>."""
        self.end_i()

    def start_kbd(self, attrs):
        """<KBD>: rendered like <TT>."""
        self.start_tt(attrs)

    def end_kbd(self):
        """</KBD>: rendered like </TT>."""
        self.end_tt()

    def start_samp(self, attrs):
        """<SAMP>: rendered like <TT>."""
        self.start_tt(attrs)

    def end_samp(self):
        """</SAMP>: rendered like </TT>."""
        self.end_tt()

    def start_strong(self, attrs):
        """<STRONG>: rendered like <B>."""
        self.start_b(attrs)

    def end_strong(self):
        """</STRONG>: rendered like </B>."""
        self.end_b()

    def start_var(self, attrs):
        """<VAR>: rendered like <I>."""
        self.start_i(attrs)

    def end_var(self):
        """</VAR>: rendered like </I>."""
        self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        """<I>: push a font with the italic slot turned on."""
        italic = (AS_IS, 1, AS_IS, AS_IS)
        self.formatter.push_font(italic)

    def end_i(self):
        """</I>: restore the previous font."""
        self.formatter.pop_font()

    def start_b(self, attrs):
        """<B>: push a font with the bold slot turned on."""
        bold = (AS_IS, AS_IS, 1, AS_IS)
        self.formatter.push_font(bold)

    def end_b(self):
        """</B>: restore the previous font."""
        self.formatter.pop_font()

    def start_tt(self, attrs):
        """<TT>: push a font with the teletype slot turned on."""
        teletype = (AS_IS, AS_IS, AS_IS, 1)
        self.formatter.push_font(teletype)

    def end_tt(self):
        """</TT>: restore the previous font."""
        self.formatter.pop_font()

    def start_a(self, attrs):
        """<A>: collect HREF, NAME and TYPE and call anchor_bgn().

        Every attribute value is stripped of surrounding whitespace; the
        TYPE value is additionally lower-cased.  Attributes that are
        absent are passed as empty strings.
        """
        href = ''
        name = ''
        link_type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                link_type = value.lower()
        self.anchor_bgn(href, name, link_type)

    def end_a(self):
        """</A>: call anchor_end()."""
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        """<BR>: ask the formatter for a line break."""
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        """<HR>: ask the formatter for a horizontal rule."""
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """<IMG>: collect the image attributes and call handle_image().

        Defaults are '(image)' for ALT, 0 for WIDTH and HEIGHT and the
        empty string for ALIGN, ISMAP and SRC.  WIDTH and HEIGHT values
        that are not valid integers are ignored, leaving the default.
        """
        src = ''
        alt = '(image)'
        ismap = ''
        align = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """<PLAINTEXT>: enter <PRE> mode and stop tag recognition."""
        self.start_pre(attrs)
        self.setnomoretags()  # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        """Start tag without a handler: silently ignored."""
        pass

    def unknown_endtag(self, tag):
        """End tag without a handler: silently ignored."""
        pass
449 |
|
---|
450 |
|
---|
def test(args=None):
    """Parse one HTML file and write the formatted text to standard output.

    *args* is an optional list of command-line style arguments; when it is
    empty or None, sys.argv[1:] is used instead.  The list passed in is
    never modified.

    Recognized arguments, in order:

    -s    (optional, must come first) parse silently: a NullFormatter is
          used instead of AbstractFormatter(DumbWriter()).
    FILE  the file to parse; '-' means standard input.  Defaults to
          'test.html'.

    If FILE cannot be opened, "FILE : error" is printed and the function
    exits through sys.exit(1).
    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]

    silent = bool(args) and args[0] == '-s'
    if silent:
        # Slice instead of "del args[0]" so the caller's list stays intact.
        args = args[1:]

    path = args[0] if args else 'test.html'

    if path == '-':
        # Standard input is read but deliberately left open.
        data = sys.stdin.read()
    else:
        try:
            source = open(path, 'r')
        except IOError as msg:
            print("%s : %s" % (path, msg))
            sys.exit(1)
        # Close the file even when reading it fails.
        try:
            data = source.read()
        finally:
            source.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
488 |
|
---|
489 |
|
---|
# Script entry point: run the self-test, which reads its arguments from
# sys.argv when called without any.
if __name__ == "__main__":
    test()