source: python/trunk/Lib/HTMLParser.py@ 394

Last change on this file since 394 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 16.6 KB
Line 
1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import markupbase
12import re
13
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that regex escapes such as
# "\s" reach the re module verbatim instead of relying on Python passing
# unknown string escapes through unchanged.

# Characters that interrupt ordinary character data.
interesting_normal = re.compile(r'[&<]')
# Start of something that may turn out to be an entity or char reference.
incomplete = re.compile(r'&[a-zA-Z#]')

# A named entity reference followed by one terminating character
# (the terminator is part of the match; group 1 is the name).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# A decimal or hexadecimal character reference plus one terminating character.
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
# Tag name (group 1) plus any whitespace / stray slashes that follow it.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'[a-zA-Z][^\t\n\r\f />\x00]*')

# One attribute: group 1 is the name, group 2 the whole "= value" part
# (absent for valueless attributes), group 3 the value including quotes.
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
52
53
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- human-readable description of the problem
        lineno -- line number taken from position[0], or None
        offset -- column offset taken from position[1], or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        # Hand the constructor arguments to the base class so that
        # self.args is populated; copy/pickle rebuild an exception by
        # calling type(self)(*self.args), which fails with empty args
        # because 'msg' is a required parameter.
        Exception.__init__(self, msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # offset is stored 0-based; report a 1-based column
            result = result + ", column %d" % (self.offset + 1)
        return result
70
71
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Start tags with these names switch the parser into CDATA mode
    # (see parse_starttag / set_cdata_mode).
    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for 'message' at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None until one
    # has been parsed completely).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of 'elem' is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of self.rawdata as possible.

        Dispatches to the parse_*() methods and handle_*() callbacks and
        stores whatever could not be processed back into self.rawdata.
        A true 'end' forces incomplete constructs to be flushed as data.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # in CDATA mode, wait for the closing tag to show up
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a lone '<' followed by something that is not markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # the construct is incomplete
                    if not end:
                        break
                    # at EOF: emit the unfinished construct as plain data,
                    # up to and including the next '>', else up to the
                    # next '<', else just the single '<'
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the terminating character
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # the terminator was not ';', so it is not part
                        # of the reference: leave it in the buffer
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: #bail by consuming &#
                        # Offsets are relative to i, not to the start of
                        # the buffer; absolute 0/2 would report the wrong
                        # text and rewind the cursor whenever i > 0.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at index i.

        Returns the index just past the construct, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse a '<!...>' or '</...>' bogus comment starting at index i.

        The text between the two-character opener and the next '>' is
        passed to handle_comment() unless 'report' is false.  Returns the
        index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' processing instruction at index i.

        Passes the text between '<?' and '>' to handle_pi().  Returns the
        index just past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index i and dispatch it to a handler.

        Returns the index just past the tag, or a negative value if the
        tag is not complete yet.  The tag source is kept for
        get_starttag_text().
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without '= value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # drop the surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over:
            # report the whole tag as character data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at i, or -1 if incomplete.

        Calls self.error() for a '/' that is followed by something other
        than '>' or the end of the buffer.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index i and call handle_endtag().

        Returns the index just past the tag, or -1 if no '>' has been
        seen yet.  In CDATA mode, end tags other than the one closing the
        current CDATA element are reported as data.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    # Name -> character table, built on first use by unescape().
    entitydefs = None
    def unescape(self, s):
        """Return 's' with character and entity references replaced.

        References that cannot be resolved (unknown entity names or
        numeric values rejected with ValueError) are left in place.
        """
        if '&' not in s:
            return s
        def replaceEntities(match):
            ref = match.groups()[0]
            if ref[0] == "#":
                # numeric reference: decimal, or hexadecimal after x/X
                ref = ref[1:]
                try:
                    if ref[0] in ['x','X']:
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return unichr(c)
                except ValueError:
                    return '&#'+ref+';'
            # Cannot use name2codepoint directly, because HTMLParser supports apos,
            # which is not part of HTML 4
            import htmlentitydefs
            if HTMLParser.entitydefs is None:
                entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                for k, v in htmlentitydefs.name2codepoint.iteritems():
                    entitydefs[k] = unichr(v)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&'+ref+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
Note: See TracBrowser for help on using the repository browser.