Context Navigation

tokenize.py

Last change on this file was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 15.9 KB

Line
1	"""Tokenization help for Python programs.
2
3	generate_tokens(readline) is a generator that breaks a stream of
4	text into Python tokens. It accepts a readline-like method which is called
5	repeatedly to get the next line of input (or "" for EOF). It generates
6	5-tuples with these members:
7
8	the token type (see token.py)
9	the token (a string)
10	the starting (row, column) indices of the token (a 2-tuple of ints)
11	the ending (row, column) indices of the token (a 2-tuple of ints)
12	the original line (string)
13
14	It is designed to match the working of the Python tokenizer exactly, except
15	that it produces COMMENT tokens for comments and gives type OP for all
16	operators
17
18	Older entry points
19	tokenize_loop(readline, tokeneater)
20	tokenize(readline, tokeneater=printtoken)
21	are the same, except instead of generating tokens, tokeneater is a callback
22	function to which the 5 fields described above are passed as 5 arguments,
23	each time a new token is found."""
24
25	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
26	__credits__ = \
27	'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'
28
29	import string, re
30	from token import *
31
32	import token
33	__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
34	"generate_tokens", "NL", "untokenize"]
35	del x
36	del token
37
# Extend the token table imported from token.py with the two token types
# that only this tokenizer reports: comments, and newlines that do not end
# a logical line.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = NL + 1
43
def group(*choices):
    """Return a regex group that matches any one of *choices*."""
    # Join with a bare '|': an escaped pipe would match a literal '|'
    # character instead of separating the alternatives.
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of the choices."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one occurrence of the choices."""
    return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Leading whitespace, any number of backslash-continued line breaks, and an
# optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Order matters: try the longer literal forms before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)

# The "tail end" patterns are matched starting just after the opening
# quote(s).  Each is a run of ordinary characters, then any number of
# (escape or lone quote, run of ordinary characters) pairs, then the
# closing quote(s).  The runs may be empty, hence the '*' quantifiers.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: ends either at the closing quote
# or at a backslash-newline that continues the string on the next line.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Group 1 of PseudoToken is the token itself, excluding leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
100
# Compiled scanners: whole-token patterns plus the tail-end patterns for
# triple-quoted strings.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps a string opener to the compiled pattern that finds its end.  The
# bare prefix letters map to None so that a lookup chain such as
# "endprogs[a] or endprogs[b]" can skip over prefix characters.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR',
                'b', 'B', 'br', 'bR', 'Br', 'BR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
for _prefix in 'rRuUbB':
    endprogs[_prefix] = None
del _prefix
121
# Every accepted string prefix (including none), in all case combinations.
_PREFIXES = ('', 'r', 'R', 'u', 'U', 'ur', 'Ur', 'uR', 'UR',
             'b', 'B', 'br', 'Br', 'bR', 'BR')

# Lookup tables of string openers; each opener maps to itself so that
# membership tests ("token in triple_quoted") are cheap dictionary lookups.
triple_quoted = dict((p + q * 3, p + q * 3) for p in _PREFIXES for q in "'\"")
single_quoted = dict((p + q, p + q) for p in _PREFIXES for q in "'\"")
142
143	tabsize = 8
144
class TokenError(Exception):
    """Raised by generate_tokens() when input ends inside a multi-line
    string or a multi-line statement."""
146
class StopTokenizing(Exception):
    """Exception that tokenize() catches and silently discards.

    Presumably meant to be raised by a tokeneater callback to stop
    tokenizing early -- nothing in this module raises it itself.
    """
148
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Default tokeneater: print one token as a tab-separated line.

    The output has the form "srow,scol-erow,ecol:<TAB>TYPE<TAB>repr(token)".
    The *line* argument is accepted for signature compatibility and ignored.
    """
    (start_row, start_col), (end_row, end_col) = srow_scol, erow_ecol
    print("%d,%d-%d,%d:\t%s\t%s" %
          (start_row, start_col, end_row, end_col,
           tok_name[type], repr(token)))
154
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize an input stream, handing every token to a callback.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: each call returns one line of input
    as a string.

    tokeneater must be a callable as well.  It is invoked once per token
    with five arguments, corresponding to the 5-tuples produced by
    generate_tokens().  It defaults to printtoken.

    A StopTokenizing exception escaping from the loop ends tokenizing
    quietly; any other exception propagates to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
172
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater(type, string, start, end, line) for each token that
    generate_tokens(readline) produces."""
    for fields in generate_tokens(readline):
        tokeneater(*fields)
177
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Full 5-tuples (type, string, start, end, line) are placed using their
    recorded column positions.  As soon as a 2-tuple (type, string) is seen,
    the rest of the stream is handled by compat(), which can only
    approximate the original spacing.
    """

    def __init__(self):
        self.tokens = []        # output fragments, joined by untokenize()
        self.prev_row = 1       # row on which the previous token ended
        self.prev_col = 0       # column at which the previous token ended

    def add_whitespace(self, start):
        """Pad with spaces from the previous token's end up to *start*.

        start is the (row, col) position at which the next token begins.
        """
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume *iterable* and return the reconstructed source string."""
        # Use a single shared iterator so that compat() resumes exactly
        # where this loop stopped, for lists and generators alike.
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                # Position information is gone; switch to the fallback.
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Emit *token* and every remaining token of *iterable* without
        position information.

        token is the first (type, string) item, already taken from the
        stream; iterable supplies the items that follow it.  Only the first
        two fields of each item are used.
        """
        from itertools import chain

        startline = False
        indents = []
        toks_append = self.tokens.append
        prevstring = False
        # Chain the already-consumed first token in front of the rest so
        # that it goes through the same handling and is actually emitted.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            # A trailing space keeps adjacent names/numbers from merging.
            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a new line: re-emit current indentation.
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
242
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed per token, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
263
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line dedents to a
    column that matches no enclosing indentation level.
    """
    # lnum:      number of lines read so far (1-based once inside the loop)
    # parenlev:  current nesting depth of (), [] and {}
    # continued: set when the previous line ended with a backslash
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr:  text collected so far of a string that spans lines
    # needcont: set for a single-quoted string, whose every line must end
    #           with a backslash-newline to be continued
    contstr, needcont = '', 0
    contline = None             # source lines covered by contstr
    indents = [0]               # stack of active indentation columns

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''                          # treat like EOF from readline
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; emit it whole and fall
                # through to scan the remainder of the line from pos.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A string that needs a backslash continuation hit a line
                # that neither closes it nor ends with backslash-newline.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string; keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # A tab advances to the next multiple of tabsize.
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                # A form feed resets the column count.
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                # Such lines yield COMMENT and/or NL tokens only and never
                # affect the indentation stack.
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so the index below is
                    # always 0 and the token type is always NL.
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the
                    # statement, so it is reported as NL, not NEWLINE.
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    # Everything else is an operator; track bracket depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched: report one character as an error token
                # and resume scanning just after it.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
411
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named on the command line, or of
    # standard input when no file is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.6.5/Lib/tokenize.py

Download in other formats: