Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

PyParse.py@ 1538

Last change on this file since 1538 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 19.1 KB

Line
1	import re
2	import sys
3
4	# Reason last stmt is continued (or C_NONE if it's not).
5	(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
6	C_STRING_NEXT_LINES, C_BRACKET) = range(5)
7
8	if 0: # for throwaway debugging output
9	def dump(*stuff):
10	sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
11
12	# Find what looks like the start of a popular stmt.
13
14	_synchre = re.compile(r"""
15	^
16	[ \t]*
17	(?: while
18	\| else
19	\| def
20	\| return
21	\| assert
22	\| break
23	\| class
24	\| continue
25	\| elif
26	\| try
27	\| except
28	\| raise
29	\| import
30	\| yield
31	)
32	\b
33	""", re.VERBOSE \| re.MULTILINE).search
34
35	# Match blank line or non-indenting comment line.
36
37	_junkre = re.compile(r"""
38	[ \t]*
39	(?: \# \S .* )?
40	\n
41	""", re.VERBOSE).match
42
43	# Match any flavor of string; the terminating quote is optional
44	# so that we're robust in the face of incomplete program text.
45
46	_match_stringre = re.compile(r"""
47	\""" [^"\\]* (?:
48	(?: \\. \| "(?!"") )
49	[^"\\]*
50	)*
51	(?: \""" )?
52
53	\| " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
54
55	\| ''' [^'\\]* (?:
56	(?: \\. \| '(?!'') )
57	[^'\\]*
58	)*
59	(?: ''' )?
60
61	\| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
62	""", re.VERBOSE \| re.DOTALL).match
63
64	# Match a line that starts with something interesting;
65	# used to find the first item of a bracket structure.
66
67	_itemre = re.compile(r"""
68	[ \t]*
69	[^\s#\\] # if we match, m.end()-1 is the interesting char
70	""", re.VERBOSE).match
71
72	# Match start of stmts that should be followed by a dedent.
73
74	_closere = re.compile(r"""
75	\s*
76	(?: return
77	\| break
78	\| continue
79	\| raise
80	\| pass
81	)
82	\b
83	""", re.VERBOSE).match
84
85	# Chew up non-special chars as quickly as possible. If match is
86	# successful, m.end() less 1 is the index of the last boring char
87	# matched. If match is unsuccessful, the string starts with an
88	# interesting char.
89
90	_chew_ordinaryre = re.compile(r"""
91	[^[\](){}#'"\\]+
92	""", re.VERBOSE).match
93
94	# Build translation table to map uninteresting chars to "x", open
95	# brackets to "(", and close brackets to ")".
96
97	_tran = ['x'] * 256
98	for ch in "({[":
99	_tran[ord(ch)] = '('
100	for ch in ")}]":
101	_tran[ord(ch)] = ')'
102	for ch in "\"'\\\n#":
103	_tran[ord(ch)] = ch
104	_tran = ''.join(_tran)
105	del ch
106
107	try:
108	UnicodeType = type(unicode(""))
109	except NameError:
110	UnicodeType = None
111
112	class Parser:
113
114	def __init__(self, indentwidth, tabwidth):
115	self.indentwidth = indentwidth
116	self.tabwidth = tabwidth
117
118	def set_str(self, str):
119	assert len(str) == 0 or str[-1] == '\n'
120	if type(str) is UnicodeType:
121	# The parse functions have no idea what to do with Unicode, so
122	# replace all Unicode characters with "x". This is "safe"
123	# so long as the only characters germane to parsing the structure
124	# of Python are 7-bit ASCII. It's necessary because Unicode
125	# strings don't have a .translate() method that supports
126	# deletechars.
127	uniphooey = str
128	str = []
129	push = str.append
130	for raw in map(ord, uniphooey):
131	push(raw < 127 and chr(raw) or "x")
132	str = "".join(str)
133	self.str = str
134	self.study_level = 0
135
136	# Return index of a good place to begin parsing, as close to the
137	# end of the string as possible. This will be the start of some
138	# popular stmt like "if" or "def". Return None if none found:
139	# the caller should pass more prior context then, if possible, or
140	# if not (the entire program text up until the point of interest
141	# has already been tried) pass 0 to set_lo.
142	#
143	# This will be reliable iff given a reliable is_char_in_string
144	# function, meaning that when it says "no", it's absolutely
145	# guaranteed that the char is not in a string.
146
147	def find_good_parse_start(self, is_char_in_string=None,
148	_synchre=_synchre):
149	str, pos = self.str, None
150
151	if not is_char_in_string:
152	# no clue -- make the caller pass everything
153	return None
154
155	# Peek back from the end for a good place to start,
156	# but don't try too often; pos will be left None, or
157	# bumped to a legitimate synch point.
158	limit = len(str)
159	for tries in range(5):
160	i = str.rfind(":\n", 0, limit)
161	if i < 0:
162	break
163	i = str.rfind('\n', 0, i) + 1 # start of colon line
164	m = _synchre(str, i, limit)
165	if m and not is_char_in_string(m.start()):
166	pos = m.start()
167	break
168	limit = i
169	if pos is None:
170	# Nothing looks like a block-opener, or stuff does
171	# but is_char_in_string keeps returning true; most likely
172	# we're in or near a giant string, the colorizer hasn't
173	# caught up enough to be helpful, or there simply aren't
174	# any interesting stmts. In any of these cases we're
175	# going to have to parse the whole thing to be sure, so
176	# give it one last try from the start, but stop wasting
177	# time here regardless of the outcome.
178	m = _synchre(str)
179	if m and not is_char_in_string(m.start()):
180	pos = m.start()
181	return pos
182
183	# Peeking back worked; look forward until _synchre no longer
184	# matches.
185	i = pos + 1
186	while 1:
187	m = _synchre(str, i)
188	if m:
189	s, i = m.span()
190	if not is_char_in_string(s):
191	pos = s
192	else:
193	break
194	return pos
195
196	# Throw away the start of the string. Intended to be called with
197	# find_good_parse_start's result.
198
199	def set_lo(self, lo):
200	assert lo == 0 or self.str[lo-1] == '\n'
201	if lo > 0:
202	self.str = self.str[lo:]
203
204	# As quickly as humanly possible <wink>, find the line numbers (0-
205	# based) of the non-continuation lines.
206	# Creates self.{goodlines, continuation}.
207
208	def _study1(self):
209	if self.study_level >= 1:
210	return
211	self.study_level = 1
212
213	# Map all uninteresting characters to "x", all open brackets
214	# to "(", all close brackets to ")", then collapse runs of
215	# uninteresting characters. This can cut the number of chars
216	# by a factor of 10-40, and so greatly speed the following loop.
217	str = self.str
218	str = str.translate(_tran)
219	str = str.replace('xxxxxxxx', 'x')
220	str = str.replace('xxxx', 'x')
221	str = str.replace('xx', 'x')
222	str = str.replace('xx', 'x')
223	str = str.replace('\nx', '\n')
224	# note that replacing x\n with \n would be incorrect, because
225	# x may be preceded by a backslash
226
227	# March over the squashed version of the program, accumulating
228	# the line numbers of non-continued stmts, and determining
229	# whether & why the last stmt is a continuation.
230	continuation = C_NONE
231	level = lno = 0 # level is nesting level; lno is line number
232	self.goodlines = goodlines = [0]
233	push_good = goodlines.append
234	i, n = 0, len(str)
235	while i < n:
236	ch = str[i]
237	i = i+1
238
239	# cases are checked in decreasing order of frequency
240	if ch == 'x':
241	continue
242
243	if ch == '\n':
244	lno = lno + 1
245	if level == 0:
246	push_good(lno)
247	# else we're in an unclosed bracket structure
248	continue
249
250	if ch == '(':
251	level = level + 1
252	continue
253
254	if ch == ')':
255	if level:
256	level = level - 1
257	# else the program is invalid, but we can't complain
258	continue
259
260	if ch == '"' or ch == "'":
261	# consume the string
262	quote = ch
263	if str[i-1:i+2] == quote * 3:
264	quote = quote * 3
265	firstlno = lno
266	w = len(quote) - 1
267	i = i+w
268	while i < n:
269	ch = str[i]
270	i = i+1
271
272	if ch == 'x':
273	continue
274
275	if str[i-1:i+w] == quote:
276	i = i+w
277	break
278
279	if ch == '\n':
280	lno = lno + 1
281	if w == 0:
282	# unterminated single-quoted string
283	if level == 0:
284	push_good(lno)
285	break
286	continue
287
288	if ch == '\\':
289	assert i < n
290	if str[i] == '\n':
291	lno = lno + 1
292	i = i+1
293	continue
294
295	# else comment char or paren inside string
296
297	else:
298	# didn't break out of the loop, so we're still
299	# inside a string
300	if (lno - 1) == firstlno:
301	# before the previous \n in str, we were in the first
302	# line of the string
303	continuation = C_STRING_FIRST_LINE
304	else:
305	continuation = C_STRING_NEXT_LINES
306	continue # with outer loop
307
308	if ch == '#':
309	# consume the comment
310	i = str.find('\n', i)
311	assert i >= 0
312	continue
313
314	assert ch == '\\'
315	assert i < n
316	if str[i] == '\n':
317	lno = lno + 1
318	if i+1 == n:
319	continuation = C_BACKSLASH
320	i = i+1
321
322	# The last stmt may be continued for all 3 reasons.
323	# String continuation takes precedence over bracket
324	# continuation, which beats backslash continuation.
325	if (continuation != C_STRING_FIRST_LINE
326	and continuation != C_STRING_NEXT_LINES and level > 0):
327	continuation = C_BRACKET
328	self.continuation = continuation
329
330	# Push the final line number as a sentinel value, regardless of
331	# whether it's continued.
332	assert (continuation == C_NONE) == (goodlines[-1] == lno)
333	if goodlines[-1] != lno:
334	push_good(lno)
335
336	def get_continuation_type(self):
337	self._study1()
338	return self.continuation
339
340	# study1 was sufficient to determine the continuation status,
341	# but doing more requires looking at every character. study2
342	# does this for the last interesting statement in the block.
343	# Creates:
344	# self.stmt_start, stmt_end
345	# slice indices of last interesting stmt
346	# self.stmt_bracketing
347	# the bracketing structure of the last interesting stmt;
348	# for example, for the statement "say(boo) or die", stmt_bracketing
349	# will be [(0, 0), (3, 1), (8, 0)]. Strings and comments are
350	# treated as brackets, for the matter.
351	# self.lastch
352	# last non-whitespace character before optional trailing
353	# comment
354	# self.lastopenbracketpos
355	# if continuation is C_BRACKET, index of last open bracket
356
357	def _study2(self):
358	if self.study_level >= 2:
359	return
360	self._study1()
361	self.study_level = 2
362
363	# Set p and q to slice indices of last interesting stmt.
364	str, goodlines = self.str, self.goodlines
365	i = len(goodlines) - 1
366	p = len(str) # index of newest line
367	while i:
368	assert p
369	# p is the index of the stmt at line number goodlines[i].
370	# Move p back to the stmt at line number goodlines[i-1].
371	q = p
372	for nothing in range(goodlines[i-1], goodlines[i]):
373	# tricky: sets p to 0 if no preceding newline
374	p = str.rfind('\n', 0, p-1) + 1
375	# The stmt str[p:q] isn't a continuation, but may be blank
376	# or a non-indenting comment line.
377	if _junkre(str, p):
378	i = i-1
379	else:
380	break
381	if i == 0:
382	# nothing but junk!
383	assert p == 0
384	q = p
385	self.stmt_start, self.stmt_end = p, q
386
387	# Analyze this stmt, to find the last open bracket (if any)
388	# and last interesting character (if any).
389	lastch = ""
390	stack = [] # stack of open bracket indices
391	push_stack = stack.append
392	bracketing = [(p, 0)]
393	while p < q:
394	# suck up all except ()[]{}'"#\\
395	m = _chew_ordinaryre(str, p, q)
396	if m:
397	# we skipped at least one boring char
398	newp = m.end()
399	# back up over totally boring whitespace
400	i = newp - 1 # index of last boring char
401	while i >= p and str[i] in " \t\n":
402	i = i-1
403	if i >= p:
404	lastch = str[i]
405	p = newp
406	if p >= q:
407	break
408
409	ch = str[p]
410
411	if ch in "([{":
412	push_stack(p)
413	bracketing.append((p, len(stack)))
414	lastch = ch
415	p = p+1
416	continue
417
418	if ch in ")]}":
419	if stack:
420	del stack[-1]
421	lastch = ch
422	p = p+1
423	bracketing.append((p, len(stack)))
424	continue
425
426	if ch == '"' or ch == "'":
427	# consume string
428	# Note that study1 did this with a Python loop, but
429	# we use a regexp here; the reason is speed in both
430	# cases; the string may be huge, but study1 pre-squashed
431	# strings to a couple of characters per line. study1
432	# also needed to keep track of newlines, and we don't
433	# have to.
434	bracketing.append((p, len(stack)+1))
435	lastch = ch
436	p = _match_stringre(str, p, q).end()
437	bracketing.append((p, len(stack)))
438	continue
439
440	if ch == '#':
441	# consume comment and trailing newline
442	bracketing.append((p, len(stack)+1))
443	p = str.find('\n', p, q) + 1
444	assert p > 0
445	bracketing.append((p, len(stack)))
446	continue
447
448	assert ch == '\\'
449	p = p+1 # beyond backslash
450	assert p < q
451	if str[p] != '\n':
452	# the program is invalid, but can't complain
453	lastch = ch + str[p]
454	p = p+1 # beyond escaped char
455
456	# end while p < q:
457
458	self.lastch = lastch
459	if stack:
460	self.lastopenbracketpos = stack[-1]
461	self.stmt_bracketing = tuple(bracketing)
462
463	# Assuming continuation is C_BRACKET, return the number
464	# of spaces the next line should be indented.
465
466	def compute_bracket_indent(self):
467	self._study2()
468	assert self.continuation == C_BRACKET
469	j = self.lastopenbracketpos
470	str = self.str
471	n = len(str)
472	origi = i = str.rfind('\n', 0, j) + 1
473	j = j+1 # one beyond open bracket
474	# find first list item; set i to start of its line
475	while j < n:
476	m = _itemre(str, j)
477	if m:
478	j = m.end() - 1 # index of first interesting char
479	extra = 0
480	break
481	else:
482	# this line is junk; advance to next line
483	i = j = str.find('\n', j) + 1
484	else:
485	# nothing interesting follows the bracket;
486	# reproduce the bracket line's indentation + a level
487	j = i = origi
488	while str[j] in " \t":
489	j = j+1
490	extra = self.indentwidth
491	return len(str[i:j].expandtabs(self.tabwidth)) + extra
492
493	# Return number of physical lines in last stmt (whether or not
494	# it's an interesting stmt! this is intended to be called when
495	# continuation is C_BACKSLASH).
496
497	def get_num_lines_in_stmt(self):
498	self._study1()
499	goodlines = self.goodlines
500	return goodlines[-1] - goodlines[-2]
501
502	# Assuming continuation is C_BACKSLASH, return the number of spaces
503	# the next line should be indented. Also assuming the new line is
504	# the first one following the initial line of the stmt.
505
506	def compute_backslash_indent(self):
507	self._study2()
508	assert self.continuation == C_BACKSLASH
509	str = self.str
510	i = self.stmt_start
511	while str[i] in " \t":
512	i = i+1
513	startpos = i
514
515	# See whether the initial line starts an assignment stmt; i.e.,
516	# look for an = operator
517	endpos = str.find('\n', startpos) + 1
518	found = level = 0
519	while i < endpos:
520	ch = str[i]
521	if ch in "([{":
522	level = level + 1
523	i = i+1
524	elif ch in ")]}":
525	if level:
526	level = level - 1
527	i = i+1
528	elif ch == '"' or ch == "'":
529	i = _match_stringre(str, i, endpos).end()
530	elif ch == '#':
531	break
532	elif level == 0 and ch == '=' and \
533	(i == 0 or str[i-1] not in "=<>!") and \
534	str[i+1] != '=':
535	found = 1
536	break
537	else:
538	i = i+1
539
540	if found:
541	# found a legit =, but it may be the last interesting
542	# thing on the line
543	i = i+1 # move beyond the =
544	found = re.match(r"\s*\\", str[i:endpos]) is None
545
546	if not found:
547	# oh well ... settle for moving beyond the first chunk
548	# of non-whitespace chars
549	i = startpos
550	while str[i] not in " \t\n":
551	i = i+1
552
553	return len(str[self.stmt_start:i].expandtabs(\
554	self.tabwidth)) + 1
555
556	# Return the leading whitespace on the initial line of the last
557	# interesting stmt.
558
559	def get_base_indent_string(self):
560	self._study2()
561	i, n = self.stmt_start, self.stmt_end
562	j = i
563	str = self.str
564	while j < n and str[j] in " \t":
565	j = j + 1
566	return str[i:j]
567
568	# Did the last interesting stmt open a block?
569
570	def is_block_opener(self):
571	self._study2()
572	return self.lastch == ':'
573
574	# Did the last interesting stmt close a block?
575
576	def is_block_closer(self):
577	self._study2()
578	return _closere(self.str, self.stmt_start) is not None
579
580	# index of last open bracket ({[, or None if none
581	lastopenbracketpos = None
582
583	def get_last_open_bracket_pos(self):
584	self._study2()
585	return self.lastopenbracketpos
586
587	# the structure of the bracketing of the last interesting statement,
588	# in the format defined in _study2, or None if the text didn't contain
589	# anything
590	stmt_bracketing = None
591
592	def get_last_stmt_bracketing(self):
593	self._study2()
594	return self.stmt_bracketing

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/idlelib/PyParse.py@ 1538

Download in other formats: