1 | """A parser for HTML and XHTML."""
|
---|
2 |
|
---|
3 | # This file is based on sgmllib.py, but the API is slightly different.
|
---|
4 |
|
---|
5 | # XXX There should be a way to distinguish between PCDATA (parsed
|
---|
6 | # character data -- the normal case), RCDATA (replaceable character
|
---|
7 | # data -- only char and entity references and end tags are special)
|
---|
8 | # and CDATA (character data -- only end tags are special).
|
---|
9 |
|
---|
10 |
|
---|
11 | import markupbase
|
---|
12 | import re
|
---|
13 |
|
---|
14 | # Regular expressions used for parsing
|
---|
15 |
|
---|
16 | interesting_normal = re.compile('[&<]')
|
---|
17 | incomplete = re.compile('&[a-zA-Z#]')
|
---|
18 |
|
---|
19 | entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
---|
20 | charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
---|
21 |
|
---|
22 | starttagopen = re.compile('<[a-zA-Z]')
|
---|
23 | piclose = re.compile('>')
|
---|
24 | commentclose = re.compile(r'--\s*>')
|
---|
25 | tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
|
---|
26 | # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
|
---|
27 | # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
|
---|
28 | tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
|
---|
29 |
|
---|
30 | attrfind = re.compile(
|
---|
31 | r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
---|
32 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
|
---|
33 |
|
---|
34 | locatestarttagend = re.compile(r"""
|
---|
35 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
---|
36 | (?:[\s/]* # optional whitespace before attribute name
|
---|
37 | (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
|
---|
38 | (?:\s*=+\s* # value indicator
|
---|
39 | (?:'[^']*' # LITA-enclosed value
|
---|
40 | |"[^"]*" # LIT-enclosed value
|
---|
41 | |(?!['"])[^>\s]* # bare value
|
---|
42 | )
|
---|
43 | )?(?:\s|/(?!>))*
|
---|
44 | )*
|
---|
45 | )?
|
---|
46 | \s* # trailing whitespace
|
---|
47 | """, re.VERBOSE)
|
---|
48 | endendtag = re.compile('>')
|
---|
49 | # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
|
---|
50 | # </ and the tag name, so maybe this should be fixed
|
---|
51 | endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
---|
52 |
|
---|
53 |
|
---|
class HTMLParseError(Exception):
    """Exception raised for all parse errors."""

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag(). The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks). Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument. Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable. May leave state
    # and data to be processed by a subsequent call. If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
            # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        if '&' not in s:
            return s
        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x','X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except ValueError:
                return '&#'+s+';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&'+s+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
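

# A minimal usage sketch: subclass HTMLParser, override the handler methods
# you care about, and feed() it document text. The _DemoParser name and the
# sample markup below are illustrative only, not part of the parser API.
if __name__ == '__main__':
    class _DemoParser(HTMLParser):
        # Echo each event as the parser reports it.
        def handle_starttag(self, tag, attrs):
            print "start tag:", tag, attrs

        def handle_endtag(self, tag):
            print "end tag:  ", tag

        def handle_data(self, data):
            print "data:     ", repr(data)

        def handle_entityref(self, name):
            print "entityref:", name

        def handle_charref(self, name):
            print "charref:  ", name

    _demo = _DemoParser()
    _demo.feed('<p class="intro">Fish &amp; chips for &#163;5</p>')
    _demo.close()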