Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

markupbase.py

Last change on this file was 388, checked in by dmik, 11 years ago
python: Update vendor to 2.7.6.
Property svn:eol-style set to `native`
File size: 14.3 KB

Line
1	"""Shared support for scanning document type declarations in HTML and XHTML.
2
3	This module is used as a foundation for the HTMLParser and sgmllib
4	modules (indirectly, for htmllib as well). It has no documented
5	public API and should not be used directly.
6
7	"""
8
9	import re
10
# Pre-compiled patterns shared by the ParserBase scanning methods.
#
# _declname_match: a name token (letter followed by any number of
#     letters, digits, '-', '_' or '.') plus any trailing whitespace.
# _declstringlit_match: a single- or double-quoted string literal plus
#     any trailing whitespace.
# _commentclose: the '--' ... '>' sequence that terminates a comment.
# _markedsectionclose: the ']' ... ']' ... '>' sequence that terminates
#     a standard marked section (whitespace allowed between the parts).
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Terminator of the MS Office style marked sections (a single ']' before '>').
_msmarkedsectionclose = re.compile(r']\s*>')

# Keep the module namespace free of the helper import.
del re
22
23
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The scanning methods read the input from ``self.rawdata`` (which this
    class never assigns itself) and share one convention: they return the
    index just past the construct they consumed, or a negative value when
    the buffer ends before the construct is complete.

    Problems are reported through ``self.error()``.  If an overriding
    ``error()`` returns instead of raising, the scanning methods give up
    on the current construct and return the "incomplete" value.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; this base implementation always raises
        NotImplementedError, so subclasses have to override it."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j.

        Nothing is updated when the slice is empty (i >= j).
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: count() just found a newline in this range.
            pos = rawdata.rindex("\n", i, j)
            # Offset restarts after the last newline of the slice.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra single characters that parse_declaration() skips inside a
    # declaration; it is reset to '' whenever a doctype is encountered.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        Calls handle_decl() for a doctype declaration and unknown_decl()
        for every other named declaration.  Returns the index just past
        the closing '>', or a negative value if the declaration is
        incomplete (or if error() returned after reporting a problem).
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            # NOTE(review): only one character is examined, so every
            # "<!-..." input returns here and the '--' branch below is
            # never taken from this method; presumably callers dispatch
            # "<!--" to parse_comment() themselves -- confirm before
            # changing this test.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # error() returned: j did not advance, so stop here
                    # instead of spinning on the same character.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1  # see above: avoid looping on the same char
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1  # see above: avoid looping on the same char
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![" marked section starting at rawdata[i].

        The status keyword selects the terminator that is searched for:
        the standard keywords use _markedsectionclose, "if"/"else"/"endif"
        use _msmarkedsectionclose, anything else is reported via error().
        When `report` is true, the text between "<![" and the terminator
        is passed to unknown_decl().  Returns the index just past the
        terminator, or a negative value if the section is incomplete.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i+3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i+3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # error() returned: no terminator is known for this keyword,
            # so there is nothing to search for.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at rawdata[i].

        When `report` is true, the text between "<!--" and the terminator
        is passed to handle_comment().  Returns the index just past the
        terminator, or -1 if no terminator is in the buffer yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset whose content starts at i.

        `declstartpos` is the start of the enclosing declaration; it is
        only used for position/error reporting.  On success the returned
        index is that of the '>' following the closing ']' (the caller
        consumes the '>').  Returns -1 if the subset is incomplete or if
        error() returned after reporting a problem.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1  # error() returned: give up on the subset
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four chars
                    # left, so "<!--" cannot be told apart yet)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # error() returned: there is no _parse_doctype_<name>
                    # method to dispatch to.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1  # error() returned: give up on the subset
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # error() returned: j did not advance, so stop here
                # instead of spinning on the same character.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an element declaration whose name starts at i.

        Returns the index just past the next '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an attlist declaration whose element name starts at i.

        Returns the index just past the closing '>', or a negative value
        if the declaration is incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a notation declaration whose name starts at i.

        Returns the index just past the closing '>', or a negative value
        if the declaration is incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an entity declaration whose body starts at i.

        A leading '%' and the whitespace after it are skipped before the
        entity name is scanned.  Returns the index just past the closing
        '>', or a negative value if the declaration is incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i].

        Returns (lower-cased name, index after the name and its trailing
        whitespace).  Returns (None, -1) when the buffer ends at i or at
        the end of the token (more input could still extend it), and also
        when no name starts at i and error() returned after reporting it.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # error() returned: keep the (name, position) return shape so
            # callers can unpack the result and see the failure.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook called with the text of an unrecognised declaration or
        marked section; the default implementation ignores it."""
        pass

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.7.6/Lib/markupbase.py

Download in other formats: