Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

markupbase.py@ 389

Last change on this file since 389 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 14.0 KB

Line
1	"""Shared support for scanning document type declarations in HTML and XHTML.
2
3	This module is used as a foundation for the HTMLParser and sgmllib
4	modules (indirectly, for htmllib as well). It has no documented
5	public API and should not be used directly.
6
7	"""
8
import re

# Matchers used by ParserBase.  Each one is applied at an explicit offset
# into the raw input buffer, e.g. _declname_match(rawdata, pos).

# A name token (a letter followed by any number of name characters)
# together with any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# A complete single- or double-quoted string literal together with any
# whitespace that follows it.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# End of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# End of a standard marked section: "]", "]", ">" with optional
# whitespace between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# End of an MS Office style marked section: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

del re
22
23
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from self.rawdata and call
    self.handle_decl(), self.handle_comment(), self.unknown_decl() and
    self.error(); apart from unknown_decl() and a placeholder error(),
    those attributes are not defined in this class and must be supplied
    by the subclass.

    The parse_*/_parse_*/_scan_* methods share one convention: they
    return the index at which scanning should continue, or a negative
    value when the construct is not yet complete in the buffer.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this.

        The scanning methods stop and return -1 after calling error(),
        so an override that returns instead of raising is tolerated.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        newlines = rawdata.count("\n", i, j)
        if newlines:
            self.lineno = self.lineno + newlines
            last_newline = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (last_newline + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters parse_declaration() skips over silently.  It is
    # reset to '' on the instance whenever a DOCTYPE declaration is seen.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" construct starting at rawdata[i].

        Comments and marked sections are delegated to parse_comment()
        and parse_marked_section().  For any other declaration the text
        between "<!" and ">" is passed to handle_decl() when the
        declaration name is "doctype" and to unknown_decl() otherwise.

        Returns the index just past the closing ">", or a negative
        value if the declaration is incomplete or error() returned.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  The whole remainder of the
            # buffer is tested: looking at a single character would
            # treat every "<!-..." as incomplete and make the comment
            # branch below unreachable.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        if rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # Only reached when error() returns; j has not moved,
                    # so continuing the loop would never terminate.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting with "<![" at rawdata[i].

        When report is true, the text between "<![" and the closing
        delimiter is passed to unknown_decl().  Returns the index just
        past the closing delimiter, or a negative value if the section
        is incomplete or its status keyword is unknown and error()
        returned.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # Only reached when error() returns; there is no match object
            # to examine for an unknown keyword.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting with "<!--" at rawdata[i].

        When report is true, the text between "<!--" and the closing
        delimiter is passed to handle_comment().  Returns the index just
        past the closing delimiter, or -1 if the comment is not
        terminated within the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset starting at rawdata[i].

        declstartpos is the index where the enclosing declaration began;
        it is only used to update the position before reporting errors.
        Returns the index of the ">" that follows the closing "]" (after
        optional whitespace), or -1 if the subset is incomplete or
        error() returned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # Only reached when error() returns; no handler
                    # method exists for this name.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # Only reached when error() returns; j has not moved, so
                # continuing the loop would never terminate.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative
        value if the declaration is incomplete.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            # name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or a negative
        value if the declaration is incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an ENTITY declaration whose body starts at rawdata[i].

        A leading "%" and the whitespace after it are skipped before the
        entity name is scanned.  Returns the index just past the closing
        ">", or a negative value if the declaration is incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new
    # position, or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at rawdata[i].

        Returns (name, pos): the lower-cased name and the index just
        past the name and any whitespace after it.  Returns (None, -1)
        when the token runs up to the end of the buffer (it may be
        truncated) or when no name is present and error() returned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # Only reached when error() returns; callers unpack a 2-tuple.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration this class does not interpret; no-op here."""
        pass

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/markupbase.py@ 389

Download in other formats: