Changeset 391 for python/trunk/Lib/lib2to3/pgen2/tokenize.py
- Timestamp: Mar 19, 2014, 11:31:01 PM (11 years ago)
- Location: python/trunk
- Files: 2 edited
Legend: Unmodified | Added | Removed
python/trunk
- Property svn:mergeinfo set to:
  - /python/vendor/Python-2.7.6 (merged, eligible)
  - /python/vendor/current (merged, eligible)
python/trunk/Lib/lib2to3/pgen2/tokenize.py (r2 -> r391)

@@ -38 +38 @@
     "generate_tokens", "untokenize"]
 del token
+
+try:
+    bytes
+except NameError:
+    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
+    # valid Python 3 code.
+    bytes = str

 def group(*choices): return '(' + '|'.join(choices) + ')'

@@ -230 +237 @@
     toks_append(tokval)

-cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')

 def _get_normal_name(orig_enc):

@@ -254 +261 @@
     It detects the encoding from the presence of a utf-8 bom or an encoding
-    cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    invalid charset, raise a SyntaxError.
+    cookie as specified in pep-0263. If both a bom and a cookie are present, but
+    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
+    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    'utf-8-sig' is returned.

     If no encoding is specified, then the default of 'utf-8' will be returned.

@@ -262 +270 @@
     bom_found = False
     encoding = None
+    default = 'utf-8'
     def read_or_stop():
         try:
             return readline()
         except StopIteration:
-            return b''
+            return bytes()

     def find_cookie(line):

@@ -273 +282 @@
         except UnicodeDecodeError:
             return None
-
-        matches = cookie_re.findall(line_string)
-        if not matches:
+        match = cookie_re.match(line_string)
+        if not match:
             return None
-        encoding = _get_normal_name(matches[0])
+        encoding = _get_normal_name(match.group(1))
         try:
             codec = lookup(encoding)

@@ -288 +296 @@
                 # This behaviour mimics the Python interpreter
                 raise SyntaxError('encoding problem: utf-8')
-            else:
-                # Allow it to be properly encoded and decoded.
-                encoding = 'utf-8-sig'
+            encoding += '-sig'
         return encoding

@@ -297 +303 @@
         bom_found = True
         first = first[3:]
+        default = 'utf-8-sig'
     if not first:
-        return 'utf-8', []
+        return default, []

     encoding = find_cookie(first)

@@ -306 +313 @@
     second = read_or_stop()
     if not second:
-        return 'utf-8', [first]
+        return default, [first]

     encoding = find_cookie(second)
     if encoding:
         return encoding, [first, second]

-    return 'utf-8', [first, second]
+    return default, [first, second]

 def untokenize(iterable):
Note:
See TracChangeset
for help on using the changeset viewer.