Changeset 388 for python/vendor/current/Lib/difflib.py
- Timestamp:
- Mar 19, 2014, 11:11:30 AM (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
python/vendor/current/Lib/difflib.py
r2 r388 152 152 """ 153 153 154 def __init__(self, isjunk=None, a='', b='' ):154 def __init__(self, isjunk=None, a='', b='', autojunk=True): 155 155 """Construct a SequenceMatcher. 156 156 … … 170 170 default, an empty string. The elements of b must be hashable. See 171 171 also .set_seqs() and .set_seq2(). 172 173 Optional arg autojunk should be set to False to disable the 174 "automatic junk heuristic" that treats popular elements as junk 175 (see module documentation for more information). 172 176 """ 173 177 … … 208 212 # isbpopular 209 213 # for x in b, isbpopular(x) is true iff b is reasonably long 210 # (at least 200 elements) and x accounts for more than 1% of 211 # its elements. DOES NOT WORK for x in a! 214 # (at least 200 elements) and x accounts for more than 1 + 1% of 215 # its elements (when autojunk is enabled). 216 # DOES NOT WORK for x in a! 212 217 213 218 self.isjunk = isjunk 214 219 self.a = self.b = None 220 self.autojunk = autojunk 215 221 self.set_seqs(a, b) 216 222 … … 289 295 # also creates the fast isbjunk function ... 290 296 # b2j also does not contain entries for "popular" elements, meaning 291 # elements that account for more than 1 % of the total elements, and297 # elements that account for more than 1 + 1% of the total elements, and 292 298 # when the sequence is reasonably large (>= 200 elements); this can 293 299 # be viewed as an adaptive notion of semi-junk, and yields an enormous … … 310 316 # from the start. 
311 317 b = self.b 318 self.b2j = b2j = {} 319 320 for i, elt in enumerate(b): 321 indices = b2j.setdefault(elt, []) 322 indices.append(i) 323 324 # Purge junk elements 325 junk = set() 326 isjunk = self.isjunk 327 if isjunk: 328 for elt in list(b2j.keys()): # using list() since b2j is modified 329 if isjunk(elt): 330 junk.add(elt) 331 del b2j[elt] 332 333 # Purge popular elements that are not junk 334 popular = set() 312 335 n = len(b) 313 self.b2j = b2j = {} 314 populardict = {} 315 for i, elt in enumerate(b): 316 if elt in b2j: 317 indices = b2j[elt] 318 if n >= 200 and len(indices) * 100 > n: 319 populardict[elt] = 1 320 del indices[:] 321 else: 322 indices.append(i) 323 else: 324 b2j[elt] = [i] 325 326 # Purge leftover indices for popular elements. 327 for elt in populardict: 328 del b2j[elt] 329 330 # Now b2j.keys() contains elements uniquely, and especially when 331 # the sequence is a string, that's usually a good deal smaller 332 # than len(string). The difference is the number of isjunk calls 333 # saved. 334 isjunk = self.isjunk 335 junkdict = {} 336 if isjunk: 337 for d in populardict, b2j: 338 for elt in d.keys(): 339 if isjunk(elt): 340 junkdict[elt] = 1 341 del d[elt] 342 343 # Now for x in b, isjunk(x) == x in junkdict, but the 344 # latter is much faster. Note too that while there may be a 345 # lot of junk in the sequence, the number of *unique* junk 346 # elements is probably small. So the memory burden of keeping 347 # this dict alive is likely trivial compared to the size of b2j. 348 self.isbjunk = junkdict.__contains__ 349 self.isbpopular = populardict.__contains__ 336 if self.autojunk and n >= 200: 337 ntest = n // 100 + 1 338 for elt, idxs in list(b2j.items()): 339 if len(idxs) > ntest: 340 popular.add(elt) 341 del b2j[elt] 342 343 # Now for x in b, isjunk(x) == x in junk, but the latter is much faster. 
344 # Since the number of *unique* junk elements is probably small, the 345 # memory burden of keeping this set alive is likely trivial compared to 346 # the size of b2j. 347 self.isbjunk = junk.__contains__ 348 self.isbpopular = popular.__contains__ 350 349 351 350 def find_longest_match(self, alo, ahi, blo, bhi): … … 588 587 """ Isolate change clusters by eliminating ranges with no changes. 589 588 590 Return a generator of groups with up to n lines of context.589 Return a generator of groups with up to n lines of context. 591 590 Each group is in the same format as returned by get_opcodes(). 592 591 … … 1142 1141 1143 1142 1143 ######################################################################## 1144 ### Unified Diff 1145 ######################################################################## 1146 1147 def _format_range_unified(start, stop): 1148 'Convert range to the "ed" format' 1149 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1150 beginning = start + 1 # lines start numbering with one 1151 length = stop - start 1152 if length == 1: 1153 return '{}'.format(beginning) 1154 if not length: 1155 beginning -= 1 # empty ranges begin at line just before the range 1156 return '{},{}'.format(beginning, length) 1157 1144 1158 def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', 1145 1159 tofiledate='', n=3, lineterm='\n'): … … 1162 1176 The unidiff format normally has a header for filenames and modification 1163 1177 times. Any or all of these may be specified using strings for 1164 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification1165 times are normally expressed in the format returned by time.ctime().1178 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1179 The modification times are normally expressed in the ISO 8601 format. 1166 1180 1167 1181 Example: … … 1169 1183 >>> for line in unified_diff('one two three four'.split(), 1170 1184 ... 
'zero one tree four'.split(), 'Original', 'Current', 1171 ... ' Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003',1185 ... '2005-01-26 23:30:50', '2010-04-02 10:20:52', 1172 1186 ... lineterm=''): 1173 ... print line 1174 --- Original Sat Jan 26 23:30:50 19911175 +++ Current Fri Jun 06 10:20:52 20031187 ... print line # doctest: +NORMALIZE_WHITESPACE 1188 --- Original 2005-01-26 23:30:50 1189 +++ Current 2010-04-02 10:20:52 1176 1190 @@ -1,4 +1,4 @@ 1177 1191 +zero … … 1186 1200 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1187 1201 if not started: 1188 yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm)1189 yield '+++ %s %s%s' % (tofile, tofiledate, lineterm)1190 1202 started = True 1191 i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4] 1192 yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm) 1203 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1204 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1205 yield '--- {}{}{}'.format(fromfile, fromdate, lineterm) 1206 yield '+++ {}{}{}'.format(tofile, todate, lineterm) 1207 1208 first, last = group[0], group[-1] 1209 file1_range = _format_range_unified(first[1], last[2]) 1210 file2_range = _format_range_unified(first[3], last[4]) 1211 yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm) 1212 1193 1213 for tag, i1, i2, j1, j2 in group: 1194 1214 if tag == 'equal': … … 1196 1216 yield ' ' + line 1197 1217 continue 1198 if tag == 'replace' or tag == 'delete':1218 if tag in ('replace', 'delete'): 1199 1219 for line in a[i1:i2]: 1200 1220 yield '-' + line 1201 if tag == 'replace' or tag == 'insert':1221 if tag in ('replace', 'insert'): 1202 1222 for line in b[j1:j2]: 1203 1223 yield '+' + line 1224 1225 1226 ######################################################################## 1227 ### Context Diff 1228 ######################################################################## 1229 1230 def _format_range_context(start, 
stop): 1231 'Convert range to the "ed" format' 1232 # Per the diff spec at http://www.unix.org/single_unix_specification/ 1233 beginning = start + 1 # lines start numbering with one 1234 length = stop - start 1235 if not length: 1236 beginning -= 1 # empty ranges begin at line just before the range 1237 if length <= 1: 1238 return '{}'.format(beginning) 1239 return '{},{}'.format(beginning, beginning + length - 1) 1204 1240 1205 1241 # See http://www.unix.org/single_unix_specification/ … … 1225 1261 modification times. Any or all of these may be specified using 1226 1262 strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. 1227 The modification times are normally expressed in the format returned1228 by time.ctime().If not specified, the strings default to blanks.1263 The modification times are normally expressed in the ISO 8601 format. 1264 If not specified, the strings default to blanks. 1229 1265 1230 1266 Example: 1231 1267 1232 1268 >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1), 1233 ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current', 1234 ... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')), 1235 *** Original Sat Jan 26 23:30:50 1991 1236 --- Current Fri Jun 06 10:22:46 2003 1269 ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current')), 1270 *** Original 1271 --- Current 1237 1272 *************** 1238 1273 *** 1,4 **** … … 1248 1283 """ 1249 1284 1285 prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') 1250 1286 started = False 1251 prefixmap = {'insert':'+ ', 'delete':'- ', 'replace':'! 
', 'equal':' '}1252 1287 for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): 1253 1288 if not started: 1254 yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm)1255 yield '--- %s %s%s' % (tofile, tofiledate, lineterm)1256 1289 started = True 1257 1258 yield '***************%s' % (lineterm,) 1259 if group[-1][2] - group[0][1] >= 2: 1260 yield '*** %d,%d ****%s' % (group[0][1]+1, group[-1][2], lineterm) 1261 else: 1262 yield '*** %d ****%s' % (group[-1][2], lineterm) 1263 visiblechanges = [e for e in group if e[0] in ('replace', 'delete')] 1264 if visiblechanges: 1290 fromdate = '\t{}'.format(fromfiledate) if fromfiledate else '' 1291 todate = '\t{}'.format(tofiledate) if tofiledate else '' 1292 yield '*** {}{}{}'.format(fromfile, fromdate, lineterm) 1293 yield '--- {}{}{}'.format(tofile, todate, lineterm) 1294 1295 first, last = group[0], group[-1] 1296 yield '***************' + lineterm 1297 1298 file1_range = _format_range_context(first[1], last[2]) 1299 yield '*** {} ****{}'.format(file1_range, lineterm) 1300 1301 if any(tag in ('replace', 'delete') for tag, _, _, _, _ in group): 1265 1302 for tag, i1, i2, _, _ in group: 1266 1303 if tag != 'insert': 1267 1304 for line in a[i1:i2]: 1268 yield prefixmap[tag] + line 1269 1270 if group[-1][4] - group[0][3] >= 2: 1271 yield '--- %d,%d ----%s' % (group[0][3]+1, group[-1][4], lineterm) 1272 else: 1273 yield '--- %d ----%s' % (group[-1][4], lineterm) 1274 visiblechanges = [e for e in group if e[0] in ('replace', 'insert')] 1275 if visiblechanges: 1305 yield prefix[tag] + line 1306 1307 file2_range = _format_range_context(first[3], last[4]) 1308 yield '--- {} ----{}'.format(file2_range, lineterm) 1309 1310 if any(tag in ('replace', 'insert') for tag, _, _, _, _ in group): 1276 1311 for tag, _, _, j1, j2 in group: 1277 1312 if tag != 'delete': 1278 1313 for line in b[j1:j2]: 1279 yield prefix map[tag] + line1314 yield prefix[tag] + line 1280 1315 1281 1316 def ndiff(a, b, linejunk=None, 
charjunk=IS_CHARACTER_JUNK): … … 1327 1362 charjunk -- passed on to ndiff (see ndiff documentation) 1328 1363 1329 This function returns an i nterator which returns a tuple:1364 This function returns an iterator which returns a tuple: 1330 1365 (from line tuple, to line tuple, boolean flag) 1331 1366 … … 1713 1748 # expand tabs into spaces 1714 1749 line = line.expandtabs(self._tabsize) 1715 # re lace spaces from expanded tabs back into tab characters1750 # replace spaces from expanded tabs back into tab characters 1716 1751 # (we'll replace them with markup after we do differencing) 1717 1752 line = line.replace(' ','\t') … … 1929 1964 1930 1965 # change tabs to spaces before it gets more difficult after we insert 1931 # mark kup1966 # markup 1932 1967 fromlines,tolines = self._tab_newline_replace(fromlines,tolines) 1933 1968
Note:
See TracChangeset
for help on using the changeset viewer.