Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

HTMLParser.py@ 20

Last change on this file since 20 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 13.1 KB

Line
1	"""A parser for HTML and XHTML."""
2
3	# This file is based on sgmllib.py, but the API is slightly different.
4
5	# XXX There should be a way to distinguish between PCDATA (parsed
6	# character data -- the normal case), RCDATA (replaceable character
7	# data -- only char and entity references and end tags are special)
8	# and CDATA (character data -- only end tags are special).
9
10
11	import markupbase
12	import re
13
# Regular expressions used for parsing

# Characters that interrupt plain character data: a reference or a tag.
interesting_normal = re.compile('[&<]')
# Inside CDATA content only "</" (or a lone "<" at the very end of the
# buffer, which may turn out to be the start of "</") is interesting.
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that could still become an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Complete references.  Each pattern also consumes the single character
# that terminates the reference (usually ';'); callers compensate for it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (still including its quotes when quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Complete end tag; group 1 is the tag name.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47
48
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg -- the error message
        lineno -- first item of the position pair, or None
        offset -- second item of the position pair (0-based), or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message followed by whatever position info is known."""
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset is 0-based; report a 1-based column.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
65
66
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements after whose start tag the parser switches to CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for message at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make only end tags interrupt the character data."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Return to normal scanning, where '&' and '<' are special."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of the buffered data as possible.

        Whatever cannot be handled yet (an incomplete construct at the
        end of the buffer) is kept in self.rawdata for the next call.
        If end is true, an incomplete construct is reported through
        self.error() and remaining text is flushed as plain data.
        """
        rawdata = self.rawdata
        startswith = rawdata.startswith  # loop-invariant bound method
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # a '<' that cannot start any construct is plain data
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the end of the buffer: wait for more data
                    break
                if k < 0:
                    # the construct is not complete yet
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the terminating character
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern consumed the terminator; keep it in the
                    # input unless it was the ';' that ends the reference
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # same terminator handling as for char references
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index i.

        Passes the text between '<?' and the next '>' to handle_pi()
        and returns the index just past that '>', or -1 if no '>' has
        been buffered yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag starting at index i.

        Calls handle_starttag() or handle_startendtag() with the
        lower-cased tag name and a list of (name, value) pairs; a value
        is None for an attribute given without '='.  Returns the index
        just past the tag, or a negative number if the tag is not yet
        complete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # drop the surrounding quotes, then expand references
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag beginning at i.

        Returns -1 when the buffered data ends before the tag does.
        Malformed tags are reported through self.error().
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag starting at index i.

        Calls handle_endtag() with the lower-cased tag name, leaves
        CDATA mode and returns the index just past the closing '>'.
        Returns -1 if no '>' has been buffered yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>' as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the tag name and (name, value) list; no-op here."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the tag name of an end tag; no-op here."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text of a '&#...;' reference; no-op here."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the name of an '&name;' reference; no-op here."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data; no-op here."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Overridable comment hook; no-op here."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Overridable declaration hook; no-op here."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and '>'; no-op here."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration through self.error()."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    entitydefs = None

    def unescape(self, s):
        """Return s with entity and character references expanded.

        Unknown entity names and malformed numeric references are left
        in the string unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.groups()[0]
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ['x', 'X']:
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return unichr(c)
                except (ValueError, OverflowError):
                    # not a usable number (e.g. '&#xyz;'): keep the text
                    return '&#' + ref + ';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    # fill the table completely before publishing it
                    defs = {'apos': u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        defs[k] = unichr(v)
                    HTMLParser.entitydefs = defs
                try:
                    return self.entitydefs[ref]
                except KeyError:
                    return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/HTMLParser.py@ 20

Download in other formats: