source: python/trunk/Doc/tools/sphinxext/suspicious.py

Last change on this file was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

1"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained into the source line
35
36The simplest way to create the ignored.csv file is by copying
37undesired entries from suspicious.csv (possibly trimming the last
38field.)
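
For illustration only (the entry below is made up, not copied from the
real ignore file), a rule could look like:

    library/os,,::,for example::

With the line number left empty, this ignores any reported "::" issue in
the ``library/os`` document whose context line contains "for example::".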

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
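# Illustrative examples (not exhaustive): the pattern above flags fragments
# such as ':meth' left over from ":meth:`foo`", a lone backtick, '::' that is
# not followed by '=', and directive-like '.. note:' text -- all signs that
# reST markup has leaked into the rendered output.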


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               #   this rule matches only near that.
                               #   None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)
        self.used = False

    def __repr__(self):
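        # Render in the same comma-separated format used by the ignore file,
        # with the line-number field left empty (",,").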
        return '{0.docname},,{0.issue},{0.line}'.format(self)



class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
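        # This builder keeps no per-document output to compare against,
        # so treat every known document as outdated and re-check them all.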
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with Levenshtein distance could work,
            # but that means bringing in other libraries...
            # OK, relax that requirement: just check whether the rule
            # fragment is contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers. If they are "near" each other,
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(), 'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(), 'replace'),
            text.strip().encode(sys.getdefaultencoding(), 'replace')))
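        # a non-zero status code makes the overall sphinx-build run fail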
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        f = open(self.log_file_name, 'ab')
        writer = csv.writer(f, dialect)
        writer.writerow([self.docname.encode('utf-8'),
                         lineno,
                         issue.encode('utf-8'),
                         text.strip().encode('utf-8')])
        f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try: f = open(filename, 'rb')
        except IOError: return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            docname = docname.decode('utf-8')
            if lineno: lineno = int(lineno)
            else: lineno = None
            issue = issue.decode('utf-8')
            text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # Ignore comments -- too many false positives.
        # (Although doing this can miss some errors: there were two sections
        # "commented out" by mistake in the Python docs that would not have
        # been caught.)
        raise nodes.SkipNode
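

# How this builder gets hooked into the Sphinx build is not shown in this
# file. As a rough sketch only (assuming the module were loaded as a regular
# Sphinx extension), registration would typically look like:
#
#     def setup(app):
#         app.add_builder(CheckSuspiciousMarkupBuilder)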