1 | """
|
---|
2 | Try to detect suspicious constructs, resembling markup
|
---|
3 | that has leaked into the final output.
|
---|
4 |
|
---|
5 | Suspicious lines are reported in a comma-separated-file,
|
---|
6 | ``suspicious.csv``, located in the output directory.
|
---|
7 |
|
---|
8 | The file is utf-8 encoded, and each line contains four fields:
|
---|
9 |
|
---|
10 | * document name (normalized)
|
---|
11 | * line number in the source document
|
---|
12 | * problematic text
|
---|
13 | * complete line showing the problematic text in context
|
---|
14 |
|
---|
15 | It is common to find many false positives. To avoid reporting them
|
---|
16 | again and again, they may be added to the ``ignored.csv`` file
|
---|
17 | (located in the configuration directory). The file has the same
|
---|
18 | format as ``suspicious.csv`` with a few differences:
|
---|
19 |
|
---|
20 | - each line defines a rule; if the rule matches, the issue
|
---|
21 | is ignored.
|
---|
22 | - line number may be empty (that is, nothing between the
|
---|
23 | commas: ",,"). In this case, line numbers are ignored (the
|
---|
24 | rule matches anywhere in the file).
|
---|
25 | - the last field does not have to be a complete line; some
|
---|
26 | surrounding text (never more than a line) is enough for
|
---|
27 | context.
|
---|
28 |
|
---|
29 | Rules are processed sequentially. A rule matches when:
|
---|
30 |
|
---|
31 | * document names are the same
|
---|
32 | * problematic texts are the same
|
---|
33 | * line numbers are close to each other (5 lines up or down)
|
---|
34 | * the rule text is completely contained into the source line
|
---|
35 |
|
---|
36 | The simplest way to create the ignored.csv file is by copying
|
---|
37 | undesired entries from suspicious.csv (possibly trimming the last
|
---|
38 | field.)
|
---|
39 |
|
---|
40 | Copyright 2009 Gabriel A. Genellina
|
---|
41 |
|
---|
42 | """
|
---|
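
# For illustration only (these rows are invented, not taken from the real
# ignore file), two rules in ``susp-ignored.csv`` might look like:
#
#   library/os,,::,may also be raised
#   whatsnew/2.6,312,`,the `octal` function
#
# The first ignores a "::" issue anywhere in library/os whose surrounding
# line contains "may also be raised"; the second applies only within 5 lines
# of line 312.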

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(ur'''
    ::(?=[^=])|             # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+|  # :foo
    `|                      # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:   # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
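
# Illustrative (not exhaustive) examples of fragments the pattern above would
# flag if they leaked into rendered text: "::" followed by further text (but
# not "::="), a role marker such as ":func" in ":func:`len`", a lone backtick,
# and a directive-like ".. note:" (but not "... else:").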


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)
        self.used = False

    def __repr__(self):
        return '{0.docname},,{0.issue},{0.line}'.format(self)
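
    # Illustrative example (invented values): repr(Rule(u'library/os', None,
    # u'::', u'e.g. os.path::')) yields 'library/os,,::,e.g. os.path::', the
    # same comma-separated shape used by the ignore file, minus the line
    # number.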



class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with Levenshtein distance could work,
            # but that means pulling in another library...
            # OK, relax that requirement: just check whether the rule fragment
            # is contained in the document line.
            if rule.line not in line: continue
            # Check the line numbers: if they are "near" each other,
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False
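
    # Illustrative behaviour (invented values, not taken from the real ignore
    # file): a Rule(u'library/os', 1234, u'::', u'status::') would suppress a
    # "::" issue reported for document 'library/os' on any line containing
    # 'status::' whose line number lies within 5 of 1234, because the fragment
    # test is substring containment and the line-number test allows a
    # difference of up to 5.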

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(),'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(),'replace'),
            text.strip().encode(sys.getdefaultencoding(),'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        f = open(self.log_file_name, 'ab')
        writer = csv.writer(f, dialect)
        writer.writerow([self.docname.encode('utf-8'),
                         lineno,
                         issue.encode('utf-8'),
                         text.strip().encode('utf-8')])
        f.close()

    def load_rules(self, filename):
        """Load the database of previously ignored issues.

        A CSV file with exactly the same format as suspicious.csv.
        Fields: document name (normalized), line number, issue, surrounding text.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try: f = open(filename, 'rb')
        except IOError: return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            docname = docname.decode('utf-8')
            if lineno: lineno = int(lineno)
            else: lineno = None
            issue = issue.decode('utf-8')
            text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))

def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno

def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 6, 10):
    ...     print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
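

# The original build wired this checker into the Sphinx toolchain elsewhere;
# the registration hook below is only an illustrative sketch (the ``setup``
# function is an assumption, not part of the original module):
#
#   def setup(app):
#       app.add_builder(CheckSuspiciousMarkupBuilder)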