Context Navigation

tokenize.py

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 13.2 KB

Line
1	"""Tokenization help for Python programs.
2
3	generate_tokens(readline) is a generator that breaks a stream of
4	text into Python tokens. It accepts a readline-like method which is called
5	repeatedly to get the next line of input (or "" for EOF). It generates
6	5-tuples with these members:
7
8	the token type (see token.py)
9	the token (a string)
10	the starting (row, column) indices of the token (a 2-tuple of ints)
11	the ending (row, column) indices of the token (a 2-tuple of ints)
12	the original line (string)
13
14	It is designed to match the working of the Python tokenizer exactly, except
15	that it produces COMMENT tokens for comments and gives type OP for all
16	operators
17
18	Older entry points
19	tokenize_loop(readline, tokeneater)
20	tokenize(readline, tokeneater=printtoken)
21	are the same, except instead of generating tokens, tokeneater is a callback
22	function to which the 5 fields described above are passed as 5 arguments,
23	each time a new token is found."""
24
25	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
26	__credits__ = \
27	'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
28
29	import string, re
30	from token import *
31
32	import token
# Re-export every public name of the token module, followed by the names
# this module adds on top of it.  A generator expression is used so that no
# loop variable is left behind in the module namespace.
__all__ = list(name for name in dir(token) if name[0] != '_')
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
# The module object was only needed to list its names.
del token
37
# Two extra token types, numbered directly after the ones defined by
# token.py: COMMENT for comments, and NL for line breaks that do not end a
# logical line (blank/comment-only lines and newlines inside brackets).
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name.update({COMMENT: 'COMMENT', NL: 'NL'})
N_TOKENS += 2
43
def group(*choices):
    """Return a regex group that matches any one of the given patterns."""
    return '(' + '|'.join(choices) + ')'


def any(choices):
    """Return a pattern matching zero or more repetitions of `choices`.

    The name shadows the builtin any(); it is kept because the pattern
    definitions below refer to it by this name.
    """
    return group(choices) + '*'


def maybe(choices):
    """Return a pattern matching `choices` zero or one time."""
    return group(choices) + '?'


Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Leading whitespace, any number of backslash-continued line ends, and an
# optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# The string "tail" patterns all share one shape: a (possibly empty) run of
# ordinary characters, then any number of (escape, ordinary run) pairs, then
# the closing quote.  The runs must be allowed to be empty, otherwise an
# empty string or two adjacent escapes could not be matched.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.  A lone quote is allowed inside as long as it does
# not start the closing triple quote.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: it either closes on this line or
# ends with a backslash-newline and continues on the next one.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
99
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# endprogs maps the opening of a string to the compiled pattern that matches
# the rest of that string up to and including its closing quote.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
# Bare prefix letters map to None so that generate_tokens() can chain
# lookups with `or` until it reaches the actual quote character.
for _prefix in ('r', 'R', 'u', 'U'):
    endprogs[_prefix] = None
del _prefix
113
# Lookup tables of every way a string literal can open: each optional
# u/U and r/R prefix combined with each quote style.  Keys map to themselves;
# the tables are only used for membership tests.
_string_prefixes = ('', 'r', 'R', 'u', 'U', 'ur', 'Ur', 'uR', 'UR')

triple_quoted = {}
for _prefix in _string_prefixes:
    for t in (_prefix + "'''", _prefix + '"""'):
        triple_quoted[t] = t

single_quoted = {}
for _prefix in _string_prefixes:
    for t in (_prefix + "'", _prefix + '"'):
        single_quoted[t] = t

del _prefix, _string_prefixes
128
# Width of a tab stop: when measuring indentation, a tab advances the column
# to the next multiple of this value.
tabsize = 8
130
class TokenError(Exception):
    """Raised by generate_tokens() when input ends inside a multi-line
    string or a multi-line statement."""
132
class StopTokenizing(Exception):
    """Caught (and silenced) by tokenize(); raising it ends tokenizing
    early."""
134
135	def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
136	print "%d,%d-%d,%d:\t%s\t%s" % \
137	(srow, scol, erow, ecol, tok_name[type], repr(token))
138
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the input read through readline, passing each token to
    tokeneater.

    readline must be a callable object which provides the same interface as
    the readline() method of built-in file objects: each call should return
    one line of input as a string.

    tokeneater must also be a callable object.  It is called once for each
    token, with five arguments corresponding to the tuples generated by
    generate_tokens().  It defaults to printtoken.

    A StopTokenizing exception raised during processing ends the run
    quietly.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
156
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater with the fields of every token generated from
    readline, one positional argument per field."""
    for fields in generate_tokens(readline):
        tokeneater(*fields)
161
162
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.
    Only those two elements are used, so the spacing of the result
    generally differs from the original source.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    pieces = []
    indent_stack = []
    at_line_start = False
    for item in iterable:
        kind, text = item[:2]

        # INDENT/DEDENT only adjust the current indentation; they emit
        # nothing themselves.
        if kind == INDENT:
            indent_stack.append(text)
            continue
        if kind == DEDENT:
            indent_stack.pop()
            continue

        # A trailing space keeps adjacent names and numbers from fusing.
        if kind in (NAME, NUMBER):
            text += ' '

        if kind in (NEWLINE, COMMENT, NL):
            at_line_start = True
        elif at_line_start and indent_stack:
            # First token of an indented line: emit the indentation first.
            pieces.append(indent_stack[-1])
            at_line_start = False
        pieces.append(text)
    return ''.join(pieces)
201
202
def generate_tokens(readline):
    """
    Break a stream of Python source text into tokens.

    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any outer indentation level.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued with a backslash, but
                # this line neither closes it nor continues it further.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    # Advance to the next tab stop (explicit floor division).
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == linelen:
                # Only whitespace and no line terminator: end of input.
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    blank_type = COMMENT
                else:
                    blank_type = NL
                yield (blank_type, line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents.pop()
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the
                    # statement, so it is reported as NL, not NEWLINE.
                    if parenlev > 0:
                        newline_type = NL
                    else:
                        newline_type = NEWLINE
                    yield (newline_type, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # falls through to the entry for the quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: report one character as an error
                # token and resume scanning after it.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for _ in indents[1:]:                      # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
341
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Tokenize the named file and make sure it is closed afterwards,
        # even if tokenizing raises.
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            source.close()
    else:
        tokenize(sys.stdin.readline)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Lib/tokenize.py

Download in other formats: