Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

markupbase.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 14.3 KB

Rev	Line
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well). It has no documented
public API and should not be used directly.

"""

import re

# A declaration name (letter followed by name characters) plus any
# trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# A single- or double-quoted string literal plus any trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# End of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# End of a standard marked section: "]]>" with optional whitespace between.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# End of an MS Office marked section: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read ``self.rawdata`` (the input buffer) and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of those
    three is defined in this class, so subclasses must provide them.

    Scanning methods return the index just past the construct they
    consumed, or -1 when the buffer ends before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; ParserBase must be subclassed."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report malformed input; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset. This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated inside a declaration; the class default
    # is empty and parse_declaration() resets it for doctype declarations.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        Calls handle_decl() for a doctype declaration and unknown_decl()
        for any other declaration type.  Returns the index just past the
        closing '>' or -1 if the declaration is incomplete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        # NOTE(review): the check above already returns -1 whenever
        # rawdata[j] is "-", so this '--' branch cannot be reached from
        # here.  It is kept as-is to preserve existing behavior.
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![..." marked section starting at rawdata[i].

        When *report* is true, the text between "<![" and the closing
        delimiter is passed to unknown_decl().  Returns the index just
        past the closing delimiter, or -1 if it is not in the buffer yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        # Stays None if error() below returns instead of raising, so the
        # section is then treated as incomplete rather than crashing on
        # an unbound local.
        match = None
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at rawdata[i].

        When *report* is true, the comment body is passed to
        handle_comment().  Returns the index just past the closing
        "-->" or -1 if the comment is not terminated in the buffer.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset that starts at rawdata[i].

        *declstartpos* is the index of the enclosing declaration; it is
        only used to update the position before reporting an error.
        Returns the index of the '>' that follows the closing ']', or -1
        if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four characters
                    # left, so "<!--" cannot be told apart from a name yet)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next '>' or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>' or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            # name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>' or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading '%' and the whitespace after it are skipped first.
        Returns the index just past the closing '>' or -1 if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token starting at rawdata[i].

        Returns (lowercased name, index after the name and any trailing
        whitespace).  Returns (None, -1) when the name runs up to the end
        of the buffer, since more name characters may still arrive.  If
        no name starts at *i*, error() is called after updating the
        position from *declstartpos*.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        self.updatepos(declstartpos, i)
        self.error("expected name token at %r"
                   % rawdata[declstartpos:declstartpos+20])
        # Only reached when an overridden error() returns instead of
        # raising; keep the (name, index) contract so callers that unpack
        # the result do not crash.
        return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration of unknown type; the default does nothing."""
        pass

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/markupbase.py

Download in other formats: