Changeset 388 for python/vendor/current/Lib/HTMLParser.py
- Timestamp: Mar 19, 2014, 11:11:30 AM (11 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
python/vendor/current/Lib/HTMLParser.py
r2 r388 15 15 16 16 interesting_normal = re.compile('[&<]') 17 interesting_cdata = re.compile(r'<(/|\Z)')18 17 incomplete = re.compile('&[a-zA-Z#]') 19 18 … … 24 23 piclose = re.compile('>') 25 24 commentclose = re.compile(r'--\s*>') 26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') 25 tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') 26 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state 27 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state 28 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') 29 27 30 attrfind = re.compile( 28 r' \s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'29 r'(\'[^\']*\'|"[^"]*"| [-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')31 r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' 32 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') 30 33 31 34 locatestarttagend = re.compile(r""" 32 35 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 33 (?: \s+ #whitespace before attribute name34 (?: [a-zA-Z_][-.:a-zA-Z0-9_]*# attribute name35 (?:\s*= \s*# value indicator36 (?:[\s/]* # optional whitespace before attribute name 37 (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name 38 (?:\s*=+\s* # value indicator 36 39 (?:'[^']*' # LITA-enclosed value 37 | \"[^\"]*\"# LIT-enclosed value38 | [^'\">\s]+# bare value40 |"[^"]*" # LIT-enclosed value 41 |(?!['"])[^>\s]* # bare value 39 42 ) 40 )? 41 ) 42 ) *43 )?(?:\s|/(?!>))* 44 )* 45 )? 43 46 \s* # trailing whitespace 44 47 """, re.VERBOSE) 45 48 endendtag = re.compile('>') 49 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between 50 # </ and the tag name, so maybe this should be fixed 46 51 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') 47 52 … … 97 102 self.lasttag = '???' 98 103 self.interesting = interesting_normal 104 self.cdata_elem = None 99 105 markupbase.ParserBase.reset(self) 100 106 101 107 def feed(self, data): 102 """Feed data to the parser.108 r"""Feed data to the parser. 
103 109 104 110 Call this as often as you want, with as little or as much text … … 121 127 return self.__starttag_text 122 128 123 def set_cdata_mode(self): 124 self.interesting = interesting_cdata 129 def set_cdata_mode(self, elem): 130 self.cdata_elem = elem.lower() 131 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) 125 132 126 133 def clear_cdata_mode(self): 127 134 self.interesting = interesting_normal 135 self.cdata_elem = None 128 136 129 137 # Internal -- handle data as far as reasonable. May leave state … … 139 147 j = match.start() 140 148 else: 149 if self.cdata_elem: 150 break 141 151 j = n 142 152 if i < j: self.handle_data(rawdata[i:j]) … … 154 164 k = self.parse_pi(i) 155 165 elif startswith("<!", i): 156 k = self.parse_ declaration(i)166 k = self.parse_html_declaration(i) 157 167 elif (i + 1) < n: 158 168 self.handle_data("<") … … 161 171 break 162 172 if k < 0: 163 if end: 164 self.error("EOF in middle of construct") 165 break 173 if not end: 174 break 175 k = rawdata.find('>', i + 1) 176 if k < 0: 177 k = rawdata.find('<', i + 1) 178 if k < 0: 179 k = i + 1 180 else: 181 k += 1 182 self.handle_data(rawdata[i:k]) 166 183 i = self.updatepos(i, k) 167 184 elif startswith("&#", i): … … 176 193 continue 177 194 else: 195 if ";" in rawdata[i:]: #bail by consuming &# 196 self.handle_data(rawdata[0:2]) 197 i = self.updatepos(i, 2) 178 198 break 179 199 elif startswith('&', i): … … 204 224 assert 0, "interesting.search() lied" 205 225 # end while 206 if end and i < n :226 if end and i < n and not self.cdata_elem: 207 227 self.handle_data(rawdata[i:n]) 208 228 i = self.updatepos(i, n) 209 229 self.rawdata = rawdata[i:] 230 231 # Internal -- parse html declarations, return length or -1 if not terminated 232 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state 233 # See also parse_declaration in _markupbase 234 def parse_html_declaration(self, i): 235 rawdata = self.rawdata 236 if rawdata[i:i+2] != '<!': 237 
self.error('unexpected call to parse_html_declaration()') 238 if rawdata[i:i+4] == '<!--': 239 # this case is actually already handled in goahead() 240 return self.parse_comment(i) 241 elif rawdata[i:i+3] == '<![': 242 return self.parse_marked_section(i) 243 elif rawdata[i:i+9].lower() == '<!doctype': 244 # find the closing > 245 gtpos = rawdata.find('>', i+9) 246 if gtpos == -1: 247 return -1 248 self.handle_decl(rawdata[i+2:gtpos]) 249 return gtpos+1 250 else: 251 return self.parse_bogus_comment(i) 252 253 # Internal -- parse bogus comment, return length or -1 if not terminated 254 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state 255 def parse_bogus_comment(self, i, report=1): 256 rawdata = self.rawdata 257 if rawdata[i:i+2] not in ('<!', '</'): 258 self.error('unexpected call to parse_comment()') 259 pos = rawdata.find('>', i+2) 260 if pos == -1: 261 return -1 262 if report: 263 self.handle_comment(rawdata[i+2:pos]) 264 return pos + 1 210 265 211 266 # Internal -- parse processing instr, return end or -1 if not terminated … … 235 290 assert match, 'unexpected call to parse_starttag()' 236 291 k = match.end() 237 self.lasttag = tag = rawdata[i+1:k].lower()292 self.lasttag = tag = match.group(1).lower() 238 293 239 294 while k < endpos: … … 247 302 attrvalue[:1] == '"' == attrvalue[-1:]: 248 303 attrvalue = attrvalue[1:-1] 304 if attrvalue: 249 305 attrvalue = self.unescape(attrvalue) 250 306 attrs.append((attrname.lower(), attrvalue)) … … 260 316 else: 261 317 offset = offset + len(self.__starttag_text) 262 self. 
error("junk characters in start tag: %r"263 % (rawdata[k:endpos][:20],))318 self.handle_data(rawdata[i:endpos]) 319 return endpos 264 320 if end.endswith('/>'): 265 321 # XHTML-style empty tag: <span attr="value" /> … … 268 324 self.handle_starttag(tag, attrs) 269 325 if tag in self.CDATA_CONTENT_ELEMENTS: 270 self.set_cdata_mode( )326 self.set_cdata_mode(tag) 271 327 return endpos 272 328 … … 298 354 # '/' from a '/>' ending 299 355 return -1 300 self.updatepos(i, j) 301 self.error("malformed start tag") 356 if j > i: 357 return j 358 else: 359 return i + 1 302 360 raise AssertionError("we should not get here!") 303 361 … … 309 367 if not match: 310 368 return -1 311 j= match.end()369 gtpos = match.end() 312 370 match = endtagfind.match(rawdata, i) # </ + tag + > 313 371 if not match: 314 self.error("bad end tag: %r" % (rawdata[i:j],)) 315 tag = match.group(1) 316 self.handle_endtag(tag.lower()) 372 if self.cdata_elem is not None: 373 self.handle_data(rawdata[i:gtpos]) 374 return gtpos 375 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state 376 namematch = tagfind_tolerant.match(rawdata, i+2) 377 if not namematch: 378 # w3.org/TR/html5/tokenization.html#end-tag-open-state 379 if rawdata[i:i+3] == '</>': 380 return i+3 381 else: 382 return self.parse_bogus_comment(i) 383 tagname = namematch.group().lower() 384 # consume and ignore other stuff between the name and the > 385 # Note: this is not 100% correct, since we might have things like 386 # </tag attr=">">, but looking for > after tha name should cover 387 # most of the cases and is much simpler 388 gtpos = rawdata.find('>', namematch.end()) 389 self.handle_endtag(tagname) 390 return gtpos+1 391 392 elem = match.group(1).lower() # script or style 393 if self.cdata_elem is not None: 394 if elem != self.cdata_elem: 395 self.handle_data(rawdata[i:gtpos]) 396 return gtpos 397 398 self.handle_endtag(elem) 317 399 self.clear_cdata_mode() 318 return j400 return gtpos 319 401 320 402 # Overridable -- 
finish processing of start+end tag: <tag.../> … … 356 438 357 439 def unknown_decl(self, data): 358 self.error("unknown declaration: %r" % (data,))440 pass 359 441 360 442 # Internal -- helper to remove special character quoting … … 365 447 def replaceEntities(s): 366 448 s = s.groups()[0] 367 if s[0] == "#": 368 s = s[1:] 369 if s[0] in ['x','X']: 370 c = int(s[1:], 16) 371 else: 372 c = int(s) 373 return unichr(c) 449 try: 450 if s[0] == "#": 451 s = s[1:] 452 if s[0] in ['x','X']: 453 c = int(s[1:], 16) 454 else: 455 c = int(s) 456 return unichr(c) 457 except ValueError: 458 return '&#'+s+';' 374 459 else: 375 460 # Cannot use name2codepoint directly, because HTMLParser supports apos,
Note: See TracChangeset for help on using the changeset viewer.