source: python/vendor/Python-2.7.6/Lib/tokenize.py

Last change on this file was 388, checked in by dmik, 11 years ago

python: Update vendor to 2.7.6.

  • Property svn:eol-style set to native
File size: 16.2 KB
Line 
1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens. It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF). It generates
65-tuples with these members:
7
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
24
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

import string, re
from token import *

import token
# Re-export every public name from the token module, plus the extra
# entry points and token types defined in this module.
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x       # in Python 2 the list comprehension leaks its loop variable
del token

# Two token types beyond those in token.py: COMMENT for '#' comments and
# NL for newlines that do NOT end a logical line (NEWLINE marks the ones
# that do).  Registered in tok_name so they print symbolically.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
43
def group(*choices):
    """Wrap the alternatives in a single regex group: '(a|b|c)'."""
    return '(%s)' % '|'.join(choices)

def any(*choices):
    """A group of the alternatives, repeated zero or more times.

    NOTE: intentionally shadows the builtin any() within this module.
    """
    return '%s*' % group(*choices)

def maybe(*choices):
    """A group of the alternatives, optionally present."""
    return '%s?' % group(*choices)
47
# Regular-expression source strings for each lexical class.  These are
# assembled with group()/any()/maybe() and compiled below.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Ignorable run: whitespace, backslash-continued lines, optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Python 2 integer literals, including the optional long suffix l/L.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Imaginary first: leftmost-longest would otherwise stop before the 'j'.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either closed by the matching
# quote or ending in a backslash line continuation.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# PseudoToken never fails to match, so generate_tokens can always advance.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
100
# Compile the working patterns once at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Map each possible opening quote (with any legal u/b/r prefix spelling)
# to the compiled pattern that matches the rest of that string.  Bare
# prefix letters map to None so generate_tokens can index by them safely.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ("", "r", "R", "u", "U", "ur", "uR", "Ur", "UR",
                "b", "B", "br", "bR", "Br", "BR"):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
for _prefix in ("r", "R", "u", "U", "b", "B"):
    endprogs[_prefix] = None
del _prefix
121
# Identity sets (as dicts, for fast membership tests) of every legal
# opening-quote spelling: each u/b/r prefix combination with triple or
# single/double quotes.
triple_quoted = {}
single_quoted = {}
_prefixes = ("", "r", "R", "u", "U", "ur", "Ur", "uR", "UR",
             "b", "B", "br", "Br", "bR", "BR")
for t in _prefixes:
    triple_quoted[t + "'''"] = t + "'''"
    triple_quoted[t + '"""'] = t + '"""'
    single_quoted[t + "'"] = t + "'"
    single_quoted[t + '"'] = t + '"'
del _prefixes

# Number of columns a tab advances to the next multiple of.
tabsize = 8
144
class TokenError(Exception):
    """Raised when EOF is hit inside an unterminated string or statement."""
146
class StopTokenizing(Exception):
    """Raised by a tokeneater callback to halt tokenize() early."""
148
149def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
150 srow, scol = srow_scol
151 erow, ecol = erow_ecol
152 print "%d,%d-%d,%d:\t%s\t%s" % \
153 (srow, scol, erow, ecol, tok_name[type], repr(token))
154
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the stream read from *readline*, reporting via *tokeneater*.

    readline must behave like the readline() method of a built-in file
    object: each call returns one line of input, or "" at EOF.

    tokeneater is called once per token with the five values that
    generate_tokens() would yield for that token.  It may raise
    StopTokenizing to end tokenization early; that exception is
    swallowed here.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
172
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to tokeneater."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)
177
class Untokenizer:
    """Rebuilds source text from a token stream (see untokenize())."""

    def __init__(self):
        # Accumulated output fragments; joined at the end.
        self.tokens = []
        # Position (1-based row, 0-based col) just past the last token
        # written, used to re-insert the horizontal whitespace between
        # tokens in full (5-tuple) mode.
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        """Emit spaces to advance from the previous end column to *start*."""
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Reassemble source from *iterable* of token tuples.

        Full 5-tuples reproduce the original spacing exactly; as soon as
        a 2-tuple is seen, the remainder is handed to compat(), which
        produces merely re-tokenizable (not identical) output.
        """
        for t in iterable:
            if len(t) == 2:
                # Degraded input: switch to compat mode for the rest.
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Handle the 2-tuple stream: *token* first, then the rest of
        *iterable* (the same iterator untokenize() was consuming)."""
        startline = False
        indents = []            # stack of INDENT strings currently open
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            # Trailing space keeps adjacent names/numbers from fusing.
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a new line: replay the current indent.
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
242
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element of *iterable* must be a token sequence with at least
    two members, a token number and token value.  If only two-element
    tuples are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
263
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr/contline accumulate an in-progress multi-line string;
    # needcont is set when a continuation backslash is required.
    contstr, needcont = '', 0
    contline = None
    indents = [0]               # stack of indentation columns; 0 always stays

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            # endprog was chosen when the string opened, below.
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted continuation without a trailing backslash:
                # the string is malformed; emit what we have as an error.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    # Advance to the next multiple of tabsize.
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    # Split the comment proper from the trailing newline so
                    # they are reported as separate COMMENT and NL tokens.
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so the tuple index is
                    # always False and this always yields NL (blank line).
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            # pseudoprog always matches (possibly an empty PseudoExtras),
            # so group 1's span tells us what, if anything, was recognized.
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Pick the tail-matcher by whichever character is
                        # the quote (prefix letters map to None in endprogs).
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
420
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the file named on the command line, or stdin by default.
    source = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
    tokenize(source.readline)
Note: See TracBrowser for help on using the repository browser.