# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points:
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
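# Illustrative note (a minimal sketch of what the helpers above produce):
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'   (zero or more of the alternation)
#     maybe('a', 'b')  ->  '(a|b)?'   (optional alternation)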

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
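# Illustrative note (a minimal sketch of how the "tail end" patterns are
# used): they match the remainder of a string literal *after* the opening
# quote has already been consumed, e.g.
#     re.match(Single, "spam'").group()    ->  "spam'"
#     re.match(Double3, 'spam"""').group() ->  'spam"""'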

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
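# Illustrative note (a minimal sketch of how pseudoprog is consumed by
# generate_tokens(): group 1 spans one token, leading whitespace excluded):
#     m = pseudoprog.match("x = 1\n", 0)
#     m.span(1)                          # -> (0, 1), the NAME 'x'
#     m = pseudoprog.match("x = 1\n", m.end(1))
#     m.span(1)                          # -> (2, 3), the OP '='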
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
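
# Illustrative usage sketch (count_names and "example.py" are hypothetical,
# not part of this module): any 5-argument callable can stand in for the
# default printtoken tokeneater, e.g.
#     names = []
#     def count_names(type, token, start, end, line):
#         if type == NAME:
#             names.append(token)
#     tokenize(open("example.py").readline, count_names)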

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 0263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
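
# Illustrative usage sketch (an in-memory buffer; any readline over bytes
# works the same way):
#     import io
#     buf = io.BytesIO("# -*- coding: latin-1 -*-\nx = 1\n")
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', ['# -*- coding: latin-1 -*-\n'])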

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
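
# Illustrative usage sketch (StringIO stands in for any readline-style
# source of text lines):
#     from StringIO import StringIO
#     for toktype, tokstr, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[toktype], repr(tokstr)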

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)