Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

tokenize.py

Last change on this file was 388, checked in by dmik, 11 years ago
python: Update vendor to 2.7.6.
Property svn:eol-style set to `native`
File size: 16.2 KB

Line
1	"""Tokenization help for Python programs.
2
3	generate_tokens(readline) is a generator that breaks a stream of
4	text into Python tokens. It accepts a readline-like method which is called
5	repeatedly to get the next line of input (or "" for EOF). It generates
6	5-tuples with these members:
7
8	the token type (see token.py)
9	the token (a string)
10	the starting (row, column) indices of the token (a 2-tuple of ints)
11	the ending (row, column) indices of the token (a 2-tuple of ints)
12	the original line (string)
13
14	It is designed to match the working of the Python tokenizer exactly, except
15	that it produces COMMENT tokens for comments and gives type OP for all
16	operators
17
18	Older entry points
19	tokenize_loop(readline, tokeneater)
20	tokenize(readline, tokeneater=printtoken)
21	are the same, except instead of generating tokens, tokeneater is a callback
22	function to which the 5 fields described above are passed as 5 arguments,
23	each time a new token is found."""
24
25	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
26	__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
27	'Skip Montanaro, Raymond Hettinger')
28
29	import string, re
30	from token import *
31
32	import token
33	__all__ = [x for x in dir(token) if not x.startswith("_")]
34	__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
35	del x
36	del token
37
# tokenize defines two token types of its own on top of those imported from
# token.py.  They take the next two free numbers, are registered in tok_name
# so they can be printed by name, and N_TOKENS is moved past them.
COMMENT = N_TOKENS
NL = COMMENT + 1
tok_name.update({COMMENT: 'COMMENT', NL: 'NL'})
N_TOKENS = NL + 1
43
def group(*choices):
    """Return a regex group that matches any one of the given patterns.

    The pattern strings in *choices* are joined with the regex alternation
    operator and wrapped in a single pair of parentheses.
    """
    # The separator has to be a bare '|'.  An escaped '\|' would match a
    # literal pipe character instead of separating the alternatives.
    return '(' + '|'.join(choices) + ')'
def any(*choices):
    """Return a regex matching zero or more repetitions of the alternatives.

    Accepts any number of pattern strings, exactly like group(); a single
    pattern behaves as before.
    """
    # NOTE: this shadows the builtin any() inside this module; the name is
    # kept because the pattern definitions call it.
    return group(*choices) + '*'
def maybe(*choices):
    """Return a regex matching the alternatives zero or one time.

    Accepts any number of pattern strings, exactly like group(); a single
    pattern behaves as before.
    """
    return group(*choices) + '?'
47
# Basic building blocks for the token patterns.
Comment = r'#[^\r\n]*'       # '#' up to, but not including, the line ending
Name = r'[a-zA-Z_]\w*'       # a letter or underscore, then word characters
Whitespace = r'[ \f\t]*'     # spaces, form feeds and tabs (possibly none)
# What may come before a token: whitespace, any number of backslash-newline
# continuations (each followed by whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
52
# Integer literals.  The optional trailing 'l'/'L' is the long suffix.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
# Octal: either the 0o/0O form, or a leading zero followed by octal digits
# (which also covers a bare "0").  The two forms are alternatives, so they
# must be separated by a real '|'; an escaped '\|' would demand a literal
# pipe character and stop "0" and "017" from matching.
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)

# Floating point literals: digits with a decimal point and an optional
# exponent, or digits followed by a mandatory exponent.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)

# Imaginary literals: an integer or float followed by 'j' or 'J'.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
64
# The four patterns below match the *rest* of a string literal, starting
# just after its opening quote(s) and ending with the closing quote(s).
# Each has the shape  plain* (special plain*)*  closing-quote, where "plain"
# is any character other than the quote or a backslash and "special" is a
# backslash escape (or, for triple-quoted strings, a quote that does not
# start the closing triple).  The '*' quantifiers are essential: without
# them an empty or ordinary multi-character tail cannot match.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/b and r prefixes.
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
# After the opening quote: any run of ordinary characters, then any number
# of (backslash escape + ordinary run) pairs, then the closing quote.  The
# '*' quantifiers are required for empty strings and for strings longer
# than one character to match.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
77
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# The first alternative is the power operator ** (optionally **=); both
# stars must be escaped so they are taken literally.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

# Any single bracket character.  ']' comes first inside the class so that
# it is taken literally.
Bracket = '[][(){}]'
# Line endings and the remaining one-character punctuation tokens.
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)
89
# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# The body is the same as in String, but the literal may end either with
# its closing quote or with a backslash-newline, in which case it carries
# on to the next line.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Extra alternatives for the scanner: a backslash continuation or the end
# of the input (two alternatives, hence the bare '|'), a comment, or the
# opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# What generate_tokens() matches at each position; group 1 is the token.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
100
# Compiled forms of the main patterns.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# endprogs maps the opening of a string literal to the compiled pattern
# that finds the end of that literal.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
# Every prefixed or unprefixed triple-quote opener shares one of two programs.
endprogs.update(dict.fromkeys(
    ("'''",
     "r'''", "u'''", "ur'''", "R'''", "U'''", "uR'''", "Ur'''", "UR'''",
     "b'''", "br'''", "B'''", "bR'''", "Br'''", "BR'''"),
    single3prog))
endprogs.update(dict.fromkeys(
    ('"""',
     'r"""', 'u"""', 'ur"""', 'R"""', 'U"""', 'uR"""', 'Ur"""', 'UR"""',
     'b"""', 'br"""', 'B"""', 'bR"""', 'Br"""', 'BR"""'),
    double3prog))
# Bare prefix letters map to None so that a lookup chain of the form
# "endprogs[a] or endprogs[b] or ..." skips over them to the quote.
endprogs.update(dict.fromkeys(('r', 'R', 'u', 'U', 'b', 'B')))
121
# Every accepted spelling of a triple-quoted string opener, mapped to itself
# so that membership can be checked with "in".
triple_quoted = {}
for t in (
        "'''", '"""',                              # no prefix
        "r'''", 'r"""', "R'''", 'R"""',            # raw
        "u'''", 'u"""', "U'''", 'U"""',            # unicode
        "ur'''", 'ur"""', "Ur'''", 'Ur"""',        # unicode + raw
        "uR'''", 'uR"""', "UR'''", 'UR"""',
        "b'''", 'b"""', "B'''", 'B"""',            # bytes
        "br'''", 'br"""', "Br'''", 'Br"""',        # bytes + raw
        "bR'''", 'bR"""', "BR'''", 'BR"""',
):
    triple_quoted[t] = t
# Every accepted spelling of a single-quoted string opener, mapped to itself
# so that membership can be checked with "in".
single_quoted = {}
for t in (
        "'", '"',                          # no prefix
        "r'", 'r"', "R'", 'R"',            # raw
        "u'", 'u"', "U'", 'U"',            # unicode
        "ur'", 'ur"', "Ur'", 'Ur"',        # unicode + raw
        "uR'", 'uR"', "UR'", 'UR"',
        "b'", 'b"', "B'", 'B"',            # bytes
        "br'", 'br"', "Br'", 'Br"',        # bytes + raw
        "bR'", 'bR"', "BR'", 'BR"',
):
    single_quoted[t] = t
142
# Width of a tab stop when measuring the indentation of a line: a tab moves
# the column to the next multiple of this value.
tabsize = 8
144
class TokenError(Exception):
    """Raised when the input ends in the middle of a multi-line string or
    a multi-line statement."""
146
class StopTokenizing(Exception):
    """Raised to end tokenization early; tokenize() catches it and returns
    normally."""
148
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token to standard output (default tokeneater, for testing).

    The output line has the form
        srow,scol-erow,ecol:<TAB>TYPE-NAME<TAB>repr(token)

    type      -- numeric token type, looked up in tok_name for display
    token     -- the token string
    srow_scol -- (row, column) pair where the token starts
    erow_ecol -- (row, column) pair where the token ends
    line      -- the source line; accepted for the tokeneater interface
                 but not printed
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call, and prints the same text.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
154
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the input obtained from readline, passing each token to
    tokeneater.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: every call returns one line of input
    as a string.

    tokeneater must be a callable as well.  It is invoked once per token
    with five arguments, matching the members of the tuples produced by
    generate_tokens().  It defaults to printtoken.

    If StopTokenizing is raised while tokens are being processed,
    tokenization stops and this function returns normally.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Raised on purpose to end the scan early; not an error.
        return
172
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once for every token generate_tokens(readline) yields.

    The five members of each token tuple are passed as five separate
    positional arguments.  Exceptions raised by tokeneater propagate to the
    caller.
    """
    for tok_type, tok_string, start, end, line in generate_tokens(readline):
        tokeneater(tok_type, tok_string, start, end, line)
177
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Two kinds of input are supported:

    * 5-tuples (type, string, start, end, line): the start/end positions are
      used to reproduce the original spacing.
    * 2-tuples (type, string): no position information is available, so
      tokens are separated by a simple heuristic (see compat()).

    An instance accumulates output in self.tokens and is meant to be used
    for a single untokenize() call.
    """

    def __init__(self):
        self.tokens = []      # output fragments, joined by untokenize()
        self.prev_row = 1     # row/column just past the last token written
        self.prev_col = 0

    def add_whitespace(self, start):
        """Append the filler needed to move from the previous token's end
        to *start*, a (row, col) pair.

        A jump to a later row without an intervening NEWLINE/NL token can
        only come from a backslash continuation, so one backslash-newline
        is emitted per skipped row.  Raises ValueError if *start* lies
        before the end of the previous token.
        """
        row, col = start
        # Explicit check rather than an assert: asserts vanish under -O and
        # bad positions are an input error, not an internal invariant.
        if row < self.prev_row or (row == self.prev_row and
                                   col < self.prev_col):
            raise ValueError("start (%d,%d) precedes previous end (%d,%d)"
                             % (row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume *iterable* and return the reconstructed source string.

        Processing starts in position-aware mode; as soon as a 2-tuple is
        seen, that token and the remainder of the stream are handed to
        compat().
        """
        # Use one shared iterator so that compat() continues where this
        # loop stopped, even when the caller passed a list.
        it = iter(iterable)
        indents = []          # indentation strings of the open blocks
        startline = False     # True when the next token starts a line
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                # Nothing to write; stopping here also avoids emitting a
                # continuation when the source had no trailing newline.
                break
            if tok_type == INDENT:
                # Remember the exact indentation text (tabs included); it
                # is written when the first token of each line is seen.
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                if indents:
                    indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                # Lines starting left of the block's indentation (e.g. a
                # comment at a smaller column) are positioned by
                # add_whitespace() alone.
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Untokenize 2-tuple (type, string) tokens without position data.

        *token* is the first such token and *iterable* supplies the tokens
        that follow it.  Names and numbers get a trailing space, adjacent
        strings are separated by a space, and the current indentation is
        written at the start of each indented line.
        """
        startline = False
        indents = []
        toks_append = self.tokens.append
        prevstring = False
        # Run the first token through the same logic as all the others so
        # that it is written exactly once.
        for stream in ((token,), iterable):
            for tok in stream:
                toknum, tokval = tok[:2]

                if toknum in (NAME, NUMBER):
                    tokval += ' '

                # Insert a space between two consecutive strings
                if toknum == STRING:
                    if prevstring:
                        tokval = ' ' + tokval
                    prevstring = True
                else:
                    prevstring = False

                if toknum == INDENT:
                    indents.append(tokval)
                    continue
                elif toknum == DEDENT:
                    indents.pop()
                    continue
                elif toknum in (NEWLINE, NL):
                    startline = True
                elif startline and indents:
                    toks_append(indents[-1])
                    startline = False
                toks_append(tokval)
242
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element produced by the iterable must be a token sequence with at
    least two elements: a token number and a token value.  If only those
    two elements are supplied, the spacing of the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    # A fresh Untokenizer per call keeps the accumulated state private.
    return Untokenizer().untokenize(iterable)
263
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line dedents to a
    column that matches no enclosing indentation level.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr holds the text of a string literal that spans lines; needcont
    # is set when that literal is single-quoted and therefore only continues
    # through backslash-newline.
    contstr, needcont = '', 0
    contline = None
    indents = [0]                              # stack of indentation columns

    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''                          # treated like EOF
        lnum += 1
        pos, line_len = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; the rest of the line is
                # scanned for further tokens below.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string whose line does not end in a
                # backslash continuation is unterminated.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < line_len:              # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == line_len:
                # Only whitespace and no line ending: end of input.
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Blank line: line[pos] is '\r' or '\n' here, so the
                    # token is always NL.
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < line_len:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    # Empty match (end of input reached): nothing to emit.
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line ending does not end the
                    # statement, so it is reported as NL.
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # chain skips them and selects the quote's pattern.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: report one character and move on.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for _ in indents[1:]:                      # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
420
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named on the command line, or of
    # standard input when no file is given.
    import sys
    if len(sys.argv) > 1:
        # Use a with-block so the file is closed once tokenizing finishes,
        # including when tokenizing raises.
        with open(sys.argv[1]) as source:
            tokenize(source.readline)
    else:
        tokenize(sys.stdin.readline)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/vendor/Python-2.7.6/Lib/tokenize.py

Download in other formats: