source: trunk/essentials/dev-lang/python/Lib/tokenize.py

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 13.2 KB
1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens. It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF). It generates
65-tuples with these members:
7
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
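
# Illustrative usage sketch of the generator described above: any
# readline-style callable works, so an in-memory string wrapped in StringIO
# serves as well as a file object. The snippet below is an example only,
# not part of the module's code.
#
#   from StringIO import StringIO
#   import tokenize
#   for toktype, tokstring, start, end, logical_line in \
#           tokenize.generate_tokens(StringIO("x = 1 + 2\n").readline):
#       print tokenize.tok_name[toktype], repr(tokstring), start, end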

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
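
# For example, these helpers simply build alternation patterns as strings:
#   group('a', 'b')  ->  '(a|b)'
#   any('a', 'b')    ->  '(a|b)*'
#   maybe('a', 'b')  ->  '(a|b)?'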

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
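
# A concrete illustration of the ordering rule above (a sketch using re
# directly): with the longer alternative first the whole operator matches,
# otherwise only its first character would.
#   re.match(group('==', '='), '==').group()  ->  '=='
#   re.match(group('=', '=='), '==').group()  ->  '='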

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
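
# Sketch of how endprogs is consulted below (example values only): once an
# opening quote such as ''' has been consumed, the matching entry is run
# against the rest of the line to find the closing quotes.
#   endprogs["'''"].match("string body''' trailing").group()  ->  "string body'''"
# The entries mapped to None ('r', 'R', 'u', 'U') are bare string prefixes
# with no closing pattern of their own.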

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"' ):
    single_quoted[t] = t

tabsize = 8
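
# Illustrative note: tabsize drives the column arithmetic in
# generate_tokens(); a tab advances the indentation column to the next
# multiple of tabsize, e.g. with tabsize = 8 and Python 2 integer division:
#   column 0 -> (0/8 + 1)*8 == 8
#   column 3 -> (3/8 + 1)*8 == 8
#   column 8 -> (8/8 + 1)*8 == 16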

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
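
# Illustrative sketch of the callback interface above (the file name and
# helper are placeholders, not part of the module): any callable that takes
# the five token fields can serve as the tokeneater.
#
#   names = []
#   def collect_names(toktype, tokstring, start, end, line):
#       if toktype == NAME:
#           names.append(tokstring)
#   tokenize(open('some_module.py').readline, collect_names)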


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
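
# Illustrative sketch with hand-built token pairs (example only):
# untokenize() accepts any iterable of (toknum, tokval) pairs, not just the
# output of generate_tokens().
#
#   untokenize([(NAME, 'print'), (STRING, "'hi'"), (NEWLINE, '\n')])
#   # -> "print 'hi'\n"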


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)