source: python/vendor/Python-2.7.6/Lib/htmllib.py

Last change on this file was 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

  • Property svn:eol-style set to native
File size: 12.6 KB
Line 
1"""HTML 2.0 parser.
2
3See the HTML 2.0 specification:
4http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5"""
6
7from warnings import warnpy3k
8warnpy3k("the htmllib module has been removed in Python 3.0",
9 stacklevel=2)
10del warnpy3k
11
12import sgmllib
13
14from formatter import AS_IS
15
16__all__ = ["HTMLParser", "HTMLParseError"]
17
18
19class HTMLParseError(sgmllib.SGMLParseError):
20 """Error raised when an HTML document can't be parsed."""
21
22
23class HTMLParser(sgmllib.SGMLParser):
24 """This is the basic HTML parser class.
25
26 It supports all entity names required by the XHTML 1.0 Recommendation.
27 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
28 elements.
29
30 """
31
32 from htmlentitydefs import entitydefs
33
34 def __init__(self, formatter, verbose=0):
35 """Creates an instance of the HTMLParser class.
36
37 The formatter parameter is the formatter instance associated with
38 the parser.
39
40 """
41 sgmllib.SGMLParser.__init__(self, verbose)
42 self.formatter = formatter
43
44 def error(self, message):
45 raise HTMLParseError(message)
46
47 def reset(self):
48 sgmllib.SGMLParser.reset(self)
49 self.savedata = None
50 self.isindex = 0
51 self.title = None
52 self.base = None
53 self.anchor = None
54 self.anchorlist = []
55 self.nofill = 0
56 self.list_stack = []
57
58 # ------ Methods used internally; some may be overridden
59
60 # --- Formatter interface, taking care of 'savedata' mode;
61 # shouldn't need to be overridden
62
63 def handle_data(self, data):
64 if self.savedata is not None:
65 self.savedata = self.savedata + data
66 else:
67 if self.nofill:
68 self.formatter.add_literal_data(data)
69 else:
70 self.formatter.add_flowing_data(data)
71
72 # --- Hooks to save data; shouldn't need to be overridden
73
74 def save_bgn(self):
75 """Begins saving character data in a buffer instead of sending it
76 to the formatter object.
77
78 Retrieve the stored data via the save_end() method. Use of the
79 save_bgn() / save_end() pair may not be nested.
80
81 """
82 self.savedata = ''
83
84 def save_end(self):
85 """Ends buffering character data and returns all data saved since
86 the preceding call to the save_bgn() method.
87
88 If the nofill flag is false, whitespace is collapsed to single
89 spaces. A call to this method without a preceding call to the
90 save_bgn() method will raise a TypeError exception.
91
92 """
93 data = self.savedata
94 self.savedata = None
95 if not self.nofill:
96 data = ' '.join(data.split())
97 return data
98
99 # --- Hooks for anchors; should probably be overridden
100
101 def anchor_bgn(self, href, name, type):
102 """This method is called at the start of an anchor region.
103
104 The arguments correspond to the attributes of the <A> tag with
105 the same names. The default implementation maintains a list of
106 hyperlinks (defined by the HREF attribute for <A> tags) within
107 the document. The list of hyperlinks is available as the data
108 attribute anchorlist.
109
110 """
111 self.anchor = href
112 if self.anchor:
113 self.anchorlist.append(href)
114
115 def anchor_end(self):
116 """This method is called at the end of an anchor region.
117
118 The default implementation adds a textual footnote marker using an
119 index into the list of hyperlinks created by the anchor_bgn()method.
120
121 """
122 if self.anchor:
123 self.handle_data("[%d]" % len(self.anchorlist))
124 self.anchor = None
125
126 # --- Hook for images; should probably be overridden
127
128 def handle_image(self, src, alt, *args):
129 """This method is called to handle images.
130
131 The default implementation simply passes the alt value to the
132 handle_data() method.
133
134 """
135 self.handle_data(alt)
136
137 # --------- Top level elememts
138
139 def start_html(self, attrs): pass
140 def end_html(self): pass
141
142 def start_head(self, attrs): pass
143 def end_head(self): pass
144
145 def start_body(self, attrs): pass
146 def end_body(self): pass
147
148 # ------ Head elements
149
150 def start_title(self, attrs):
151 self.save_bgn()
152
153 def end_title(self):
154 self.title = self.save_end()
155
156 def do_base(self, attrs):
157 for a, v in attrs:
158 if a == 'href':
159 self.base = v
160
161 def do_isindex(self, attrs):
162 self.isindex = 1
163
164 def do_link(self, attrs):
165 pass
166
167 def do_meta(self, attrs):
168 pass
169
170 def do_nextid(self, attrs): # Deprecated
171 pass
172
173 # ------ Body elements
174
175 # --- Headings
176
177 def start_h1(self, attrs):
178 self.formatter.end_paragraph(1)
179 self.formatter.push_font(('h1', 0, 1, 0))
180
181 def end_h1(self):
182 self.formatter.end_paragraph(1)
183 self.formatter.pop_font()
184
185 def start_h2(self, attrs):
186 self.formatter.end_paragraph(1)
187 self.formatter.push_font(('h2', 0, 1, 0))
188
189 def end_h2(self):
190 self.formatter.end_paragraph(1)
191 self.formatter.pop_font()
192
193 def start_h3(self, attrs):
194 self.formatter.end_paragraph(1)
195 self.formatter.push_font(('h3', 0, 1, 0))
196
197 def end_h3(self):
198 self.formatter.end_paragraph(1)
199 self.formatter.pop_font()
200
201 def start_h4(self, attrs):
202 self.formatter.end_paragraph(1)
203 self.formatter.push_font(('h4', 0, 1, 0))
204
205 def end_h4(self):
206 self.formatter.end_paragraph(1)
207 self.formatter.pop_font()
208
209 def start_h5(self, attrs):
210 self.formatter.end_paragraph(1)
211 self.formatter.push_font(('h5', 0, 1, 0))
212
213 def end_h5(self):
214 self.formatter.end_paragraph(1)
215 self.formatter.pop_font()
216
217 def start_h6(self, attrs):
218 self.formatter.end_paragraph(1)
219 self.formatter.push_font(('h6', 0, 1, 0))
220
221 def end_h6(self):
222 self.formatter.end_paragraph(1)
223 self.formatter.pop_font()
224
225 # --- Block Structuring Elements
226
227 def do_p(self, attrs):
228 self.formatter.end_paragraph(1)
229
230 def start_pre(self, attrs):
231 self.formatter.end_paragraph(1)
232 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
233 self.nofill = self.nofill + 1
234
235 def end_pre(self):
236 self.formatter.end_paragraph(1)
237 self.formatter.pop_font()
238 self.nofill = max(0, self.nofill - 1)
239
240 def start_xmp(self, attrs):
241 self.start_pre(attrs)
242 self.setliteral('xmp') # Tell SGML parser
243
244 def end_xmp(self):
245 self.end_pre()
246
247 def start_listing(self, attrs):
248 self.start_pre(attrs)
249 self.setliteral('listing') # Tell SGML parser
250
251 def end_listing(self):
252 self.end_pre()
253
254 def start_address(self, attrs):
255 self.formatter.end_paragraph(0)
256 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
257
258 def end_address(self):
259 self.formatter.end_paragraph(0)
260 self.formatter.pop_font()
261
262 def start_blockquote(self, attrs):
263 self.formatter.end_paragraph(1)
264 self.formatter.push_margin('blockquote')
265
266 def end_blockquote(self):
267 self.formatter.end_paragraph(1)
268 self.formatter.pop_margin()
269
270 # --- List Elements
271
272 def start_ul(self, attrs):
273 self.formatter.end_paragraph(not self.list_stack)
274 self.formatter.push_margin('ul')
275 self.list_stack.append(['ul', '*', 0])
276
277 def end_ul(self):
278 if self.list_stack: del self.list_stack[-1]
279 self.formatter.end_paragraph(not self.list_stack)
280 self.formatter.pop_margin()
281
282 def do_li(self, attrs):
283 self.formatter.end_paragraph(0)
284 if self.list_stack:
285 [dummy, label, counter] = top = self.list_stack[-1]
286 top[2] = counter = counter+1
287 else:
288 label, counter = '*', 0
289 self.formatter.add_label_data(label, counter)
290
291 def start_ol(self, attrs):
292 self.formatter.end_paragraph(not self.list_stack)
293 self.formatter.push_margin('ol')
294 label = '1.'
295 for a, v in attrs:
296 if a == 'type':
297 if len(v) == 1: v = v + '.'
298 label = v
299 self.list_stack.append(['ol', label, 0])
300
301 def end_ol(self):
302 if self.list_stack: del self.list_stack[-1]
303 self.formatter.end_paragraph(not self.list_stack)
304 self.formatter.pop_margin()
305
306 def start_menu(self, attrs):
307 self.start_ul(attrs)
308
309 def end_menu(self):
310 self.end_ul()
311
312 def start_dir(self, attrs):
313 self.start_ul(attrs)
314
315 def end_dir(self):
316 self.end_ul()
317
318 def start_dl(self, attrs):
319 self.formatter.end_paragraph(1)
320 self.list_stack.append(['dl', '', 0])
321
322 def end_dl(self):
323 self.ddpop(1)
324 if self.list_stack: del self.list_stack[-1]
325
326 def do_dt(self, attrs):
327 self.ddpop()
328
329 def do_dd(self, attrs):
330 self.ddpop()
331 self.formatter.push_margin('dd')
332 self.list_stack.append(['dd', '', 0])
333
334 def ddpop(self, bl=0):
335 self.formatter.end_paragraph(bl)
336 if self.list_stack:
337 if self.list_stack[-1][0] == 'dd':
338 del self.list_stack[-1]
339 self.formatter.pop_margin()
340
341 # --- Phrase Markup
342
343 # Idiomatic Elements
344
345 def start_cite(self, attrs): self.start_i(attrs)
346 def end_cite(self): self.end_i()
347
348 def start_code(self, attrs): self.start_tt(attrs)
349 def end_code(self): self.end_tt()
350
351 def start_em(self, attrs): self.start_i(attrs)
352 def end_em(self): self.end_i()
353
354 def start_kbd(self, attrs): self.start_tt(attrs)
355 def end_kbd(self): self.end_tt()
356
357 def start_samp(self, attrs): self.start_tt(attrs)
358 def end_samp(self): self.end_tt()
359
360 def start_strong(self, attrs): self.start_b(attrs)
361 def end_strong(self): self.end_b()
362
363 def start_var(self, attrs): self.start_i(attrs)
364 def end_var(self): self.end_i()
365
366 # Typographic Elements
367
368 def start_i(self, attrs):
369 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
370 def end_i(self):
371 self.formatter.pop_font()
372
373 def start_b(self, attrs):
374 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
375 def end_b(self):
376 self.formatter.pop_font()
377
378 def start_tt(self, attrs):
379 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
380 def end_tt(self):
381 self.formatter.pop_font()
382
383 def start_a(self, attrs):
384 href = ''
385 name = ''
386 type = ''
387 for attrname, value in attrs:
388 value = value.strip()
389 if attrname == 'href':
390 href = value
391 if attrname == 'name':
392 name = value
393 if attrname == 'type':
394 type = value.lower()
395 self.anchor_bgn(href, name, type)
396
397 def end_a(self):
398 self.anchor_end()
399
400 # --- Line Break
401
402 def do_br(self, attrs):
403 self.formatter.add_line_break()
404
405 # --- Horizontal Rule
406
407 def do_hr(self, attrs):
408 self.formatter.add_hor_rule()
409
410 # --- Image
411
412 def do_img(self, attrs):
413 align = ''
414 alt = '(image)'
415 ismap = ''
416 src = ''
417 width = 0
418 height = 0
419 for attrname, value in attrs:
420 if attrname == 'align':
421 align = value
422 if attrname == 'alt':
423 alt = value
424 if attrname == 'ismap':
425 ismap = value
426 if attrname == 'src':
427 src = value
428 if attrname == 'width':
429 try: width = int(value)
430 except ValueError: pass
431 if attrname == 'height':
432 try: height = int(value)
433 except ValueError: pass
434 self.handle_image(src, alt, ismap, align, width, height)
435
436 # --- Really Old Unofficial Deprecated Stuff
437
438 def do_plaintext(self, attrs):
439 self.start_pre(attrs)
440 self.setnomoretags() # Tell SGML parser
441
442 # --- Unhandled tags
443
444 def unknown_starttag(self, tag, attrs):
445 pass
446
447 def unknown_endtag(self, tag):
448 pass
449
450
def test(args = None):
    """Parse an HTML file and feed it through a formatter (CLI driver).

    args is a sequence of command-line style arguments; when it is empty
    or None, sys.argv[1:] is used instead.  A leading '-s' selects silent
    mode, in which the document goes to a formatter.NullFormatter rather
    than to an AbstractFormatter wrapping a DumbWriter.  The next argument
    names the input file ('test.html' when absent); '-' means standard
    input.

    If the file cannot be opened, "<file> : <error>" is written to
    standard output and the process exits with status 1.

    The sequence passed in as args is never modified.
    """
    import sys, formatter

    if not args:
        args = sys.argv[1:]

    silent = bool(args) and args[0] == '-s'
    if silent:
        # Slice instead of 'del args[0]' so the caller's list is untouched.
        args = args[1:]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            # Same text the statement 'print path, ":", msg' produced.
            sys.stdout.write("%s : %s\n" % (path, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # Close the file even when read() fails.
            f.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(fmt)
    p.feed(data)
    p.close()


if __name__ == '__main__':
    test()
Note: See TracBrowser for help on using the repository browser.