source: vendor/python/2.5/Doc/lib/libdifflib.tex

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 25.9 KB
Line 
1\section{\module{difflib} ---
2 Helpers for computing deltas}
3
4\declaremodule{standard}{difflib}
5\modulesynopsis{Helpers for computing differences between objects.}
6\moduleauthor{Tim Peters}{tim_one@users.sourceforge.net}
7\sectionauthor{Tim Peters}{tim_one@users.sourceforge.net}
8% LaTeXification by Fred L. Drake, Jr. <fdrake@acm.org>.
9
10\versionadded{2.1}
11
12
13\begin{classdesc*}{SequenceMatcher}
14 This is a flexible class for comparing pairs of sequences of any
15 type, so long as the sequence elements are hashable. The basic
16 algorithm predates, and is a little fancier than, an algorithm
17 published in the late 1980's by Ratcliff and Obershelp under the
18 hyperbolic name ``gestalt pattern matching.'' The idea is to find
19 the longest contiguous matching subsequence that contains no
20 ``junk'' elements (the Ratcliff and Obershelp algorithm doesn't
21 address junk). The same idea is then applied recursively to the
22 pieces of the sequences to the left and to the right of the matching
23 subsequence. This does not yield minimal edit sequences, but does
24 tend to yield matches that ``look right'' to people.
25
26 \strong{Timing:} The basic Ratcliff-Obershelp algorithm is cubic
27 time in the worst case and quadratic time in the expected case.
28 \class{SequenceMatcher} is quadratic time for the worst case and has
29 expected-case behavior dependent in a complicated way on how many
30 elements the sequences have in common; best case time is linear.
31\end{classdesc*}
32
33\begin{classdesc*}{Differ}
34 This is a class for comparing sequences of lines of text, and
35 producing human-readable differences or deltas. \class{Differ} uses
36 \class{SequenceMatcher} both to compare sequences of lines, and to
37 compare sequences of characters within similar (near-matching)
38 lines.
39
40 Each line of a \class{Differ} delta begins with a two-letter code:
41
42\begin{tableii}{l|l}{code}{Code}{Meaning}
43 \lineii{'- '}{line unique to sequence 1}
44 \lineii{'+ '}{line unique to sequence 2}
45 \lineii{' '}{line common to both sequences}
46 \lineii{'? '}{line not present in either input sequence}
47\end{tableii}
48
49 Lines beginning with `\code{?~}' attempt to guide the eye to
50 intraline differences, and were not present in either input
51 sequence. These lines can be confusing if the sequences contain tab
52 characters.
53\end{classdesc*}
54
55\begin{classdesc*}{HtmlDiff}
56
57 This class can be used to create an HTML table (or a complete HTML file
58 containing the table) showing a side by side, line by line comparison
59 of text with inter-line and intra-line change highlights. The table can
60 be generated in either full or contextual difference mode.
61
62 The constructor for this class is:
63
64 \begin{funcdesc}{__init__}{\optional{tabsize}\optional{,
65 wrapcolumn}\optional{, linejunk}\optional{, charjunk}}
66
67 Initializes instance of \class{HtmlDiff}.
68
69 \var{tabsize} is an optional keyword argument to specify tab stop spacing
70 and defaults to \code{8}.
71
72 \var{wrapcolumn} is an optional keyword to specify column number where
73 lines are broken and wrapped, defaults to \code{None} where lines are not
74 wrapped.
75
76 \var{linejunk} and \var{charjunk} are optional keyword arguments passed
77 into \code{ndiff()} (used by \class{HtmlDiff} to generate the
78 side by side HTML differences). See \code{ndiff()} documentation for
79 argument default values and descriptions.
80
81 \end{funcdesc}
82
83 The following methods are public:
84
85 \begin{funcdesc}{make_file}{fromlines, tolines
86 \optional{, fromdesc}\optional{, todesc}\optional{, context}\optional{,
87 numlines}}
88 Compares \var{fromlines} and \var{tolines} (lists of strings) and returns
89 a string which is a complete HTML file containing a table showing line by
90 line differences with inter-line and intra-line changes highlighted.
91
92 \var{fromdesc} and \var{todesc} are optional keyword arguments to specify
93 from/to file column header strings (both default to an empty string).
94
95 \var{context} and \var{numlines} are both optional keyword arguments.
96 Set \var{context} to \code{True} when contextual differences are to be
97 shown, else the default is \code{False} to show the full files.
98 \var{numlines} defaults to \code{5}. When \var{context} is \code{True}
99 \var{numlines} controls the number of context lines which surround the
100 difference highlights. When \var{context} is \code{False} \var{numlines}
101 controls the number of lines which are shown before a difference
102 highlight when using the ``next'' hyperlinks (setting to zero would cause
103 the ``next'' hyperlinks to place the next difference highlight at the top of
104 the browser without any leading context).
105 \end{funcdesc}
106
107 \begin{funcdesc}{make_table}{fromlines, tolines
108 \optional{, fromdesc}\optional{, todesc}\optional{, context}\optional{,
109 numlines}}
110 Compares \var{fromlines} and \var{tolines} (lists of strings) and returns
111 a string which is a complete HTML table showing line by line differences
112 with inter-line and intra-line changes highlighted.
113
114 The arguments for this method are the same as those for the
115 \method{make_file()} method.
116 \end{funcdesc}
117
118 \file{Tools/scripts/diff.py} is a command-line front-end to this class
119 and contains a good example of its use.
120
121 \versionadded{2.4}
122\end{classdesc*}
123
124\begin{funcdesc}{context_diff}{a, b\optional{, fromfile}\optional{,
125 tofile}\optional{, fromfiledate}\optional{, tofiledate}\optional{,
126 n}\optional{, lineterm}}
127 Compare \var{a} and \var{b} (lists of strings); return a
128 delta (a generator generating the delta lines) in context diff
129 format.
130
131 Context diffs are a compact way of showing just the lines that have
132 changed plus a few lines of context. The changes are shown in a
133 before/after style. The number of context lines is set by \var{n}
134 which defaults to three.
135
136 By default, the diff control lines (those with \code{***} or \code{---})
137 are created with a trailing newline. This is helpful so that inputs created
138 from \function{file.readlines()} result in diffs that are suitable for use
139 with \function{file.writelines()} since both the inputs and outputs have
140 trailing newlines.
141
142 For inputs that do not have trailing newlines, set the \var{lineterm}
143 argument to \code{""} so that the output will be uniformly newline free.
144
145 The context diff format normally has a header for filenames and
146 modification times. Any or all of these may be specified using strings for
147 \var{fromfile}, \var{tofile}, \var{fromfiledate}, and \var{tofiledate}.
148 The modification times are normally expressed in the format returned by
149 \function{time.ctime()}. If not specified, the strings default to blanks.
150
151 \file{Tools/scripts/diff.py} is a command-line front-end for this
152 function.
153
154 \versionadded{2.3}
155\end{funcdesc}
156
157\begin{funcdesc}{get_close_matches}{word, possibilities\optional{,
158 n}\optional{, cutoff}}
159 Return a list of the best ``good enough'' matches. \var{word} is a
160 sequence for which close matches are desired (typically a string),
161 and \var{possibilities} is a list of sequences against which to
162 match \var{word} (typically a list of strings).
163
164 Optional argument \var{n} (default \code{3}) is the maximum number
165 of close matches to return; \var{n} must be greater than \code{0}.
166
167 Optional argument \var{cutoff} (default \code{0.6}) is a float in
168 the range [0, 1]. Possibilities that don't score at least that
169 similar to \var{word} are ignored.
170
171 The best (no more than \var{n}) matches among the possibilities are
172 returned in a list, sorted by similarity score, most similar first.
173
174\begin{verbatim}
175>>> get_close_matches('appel', ['ape', 'apple', 'peach', 'puppy'])
176['apple', 'ape']
177>>> import keyword
178>>> get_close_matches('wheel', keyword.kwlist)
179['while']
180>>> get_close_matches('apple', keyword.kwlist)
181[]
182>>> get_close_matches('accept', keyword.kwlist)
183['except']
184\end{verbatim}
185\end{funcdesc}
186
187\begin{funcdesc}{ndiff}{a, b\optional{, linejunk}\optional{, charjunk}}
188 Compare \var{a} and \var{b} (lists of strings); return a
189 \class{Differ}-style delta (a generator generating the delta lines).
190
191 Optional keyword parameters \var{linejunk} and \var{charjunk} are
192 for filter functions (or \code{None}):
193
194 \var{linejunk}: A function that accepts a single string
195 argument, and returns true if the string is junk, or false if not.
196 The default is (\code{None}), starting with Python 2.3. Before then,
197 the default was the module-level function
198 \function{IS_LINE_JUNK()}, which filters out lines without visible
199 characters, except for at most one pound character (\character{\#}).
200 As of Python 2.3, the underlying \class{SequenceMatcher} class
201 does a dynamic analysis of which lines are so frequent as to
202 constitute noise, and this usually works better than the pre-2.3
203 default.
204
205 \var{charjunk}: A function that accepts a character (a string of
206 length 1), and returns true if the character is junk, or false if not.
207 The default is module-level function \function{IS_CHARACTER_JUNK()},
208 which filters out whitespace characters (a blank or tab; note: bad
209 idea to include newline in this!).
210
211 \file{Tools/scripts/ndiff.py} is a command-line front-end to this
212 function.
213
214\begin{verbatim}
215>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
216... 'ore\ntree\nemu\n'.splitlines(1))
217>>> print ''.join(diff),
218- one
219? ^
220+ ore
221? ^
222- two
223- three
224? -
225+ tree
226+ emu
227\end{verbatim}
228\end{funcdesc}
229
230\begin{funcdesc}{restore}{sequence, which}
231 Return one of the two sequences that generated a delta.
232
233 Given a \var{sequence} produced by \method{Differ.compare()} or
234 \function{ndiff()}, extract lines originating from file 1 or 2
235 (parameter \var{which}), stripping off line prefixes.
236
237 Example:
238
239\begin{verbatim}
240>>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
241... 'ore\ntree\nemu\n'.splitlines(1))
242>>> diff = list(diff) # materialize the generated delta into a list
243>>> print ''.join(restore(diff, 1)),
244one
245two
246three
247>>> print ''.join(restore(diff, 2)),
248ore
249tree
250emu
251\end{verbatim}
252
253\end{funcdesc}
254
255\begin{funcdesc}{unified_diff}{a, b\optional{, fromfile}\optional{,
256 tofile}\optional{, fromfiledate}\optional{, tofiledate}\optional{,
257 n}\optional{, lineterm}}
258 Compare \var{a} and \var{b} (lists of strings); return a
259 delta (a generator generating the delta lines) in unified diff
260 format.
261
262 Unified diffs are a compact way of showing just the lines that have
263 changed plus a few lines of context. The changes are shown in an
264 inline style (instead of separate before/after blocks). The number
265 of context lines is set by \var{n} which defaults to three.
266
267 By default, the diff control lines (those with \code{---}, \code{+++},
268 or \code{@@}) are created with a trailing newline. This is helpful so
269 that inputs created from \function{file.readlines()} result in diffs
270 that are suitable for use with \function{file.writelines()} since both
271 the inputs and outputs have trailing newlines.
272
273 For inputs that do not have trailing newlines, set the \var{lineterm}
274 argument to \code{""} so that the output will be uniformly newline free.
275
276 The unified diff format normally has a header for filenames and
277 modification times. Any or all of these may be specified using strings for
278 \var{fromfile}, \var{tofile}, \var{fromfiledate}, and \var{tofiledate}.
279 The modification times are normally expressed in the format returned by
280 \function{time.ctime()}. If not specified, the strings default to blanks.
281
282 \file{Tools/scripts/diff.py} is a command-line front-end for this
283 function.
284
285 \versionadded{2.3}
286\end{funcdesc}
287
288\begin{funcdesc}{IS_LINE_JUNK}{line}
289 Return true for ignorable lines. The line \var{line} is ignorable
290 if \var{line} is blank or contains a single \character{\#},
291 otherwise it is not ignorable. Used as a default for parameter
292 \var{linejunk} in \function{ndiff()} before Python 2.3.
293\end{funcdesc}
294
295
296\begin{funcdesc}{IS_CHARACTER_JUNK}{ch}
297 Return true for ignorable characters. The character \var{ch} is
298 ignorable if \var{ch} is a space or tab, otherwise it is not
299 ignorable. Used as a default for parameter \var{charjunk} in
300 \function{ndiff()}.
301\end{funcdesc}
302
303
304\begin{seealso}
305 \seetitle[http://www.ddj.com/documents/s=1103/ddj8807c/]
306 {Pattern Matching: The Gestalt Approach}{Discussion of a
307 similar algorithm by John W. Ratcliff and D. E. Metzener.
308 This was published in
309 \citetitle[http://www.ddj.com/]{Dr. Dobb's Journal} in
310 July, 1988.}
311\end{seealso}
312
313
314\subsection{SequenceMatcher Objects \label{sequence-matcher}}
315
316The \class{SequenceMatcher} class has this constructor:
317
318\begin{classdesc}{SequenceMatcher}{\optional{isjunk\optional{,
319 a\optional{, b}}}}
320 Optional argument \var{isjunk} must be \code{None} (the default) or
321 a one-argument function that takes a sequence element and returns
322 true if and only if the element is ``junk'' and should be ignored.
323 Passing \code{None} for \var{isjunk} is equivalent to passing
324 \code{lambda x: 0}; in other words, no elements are ignored. For
325 example, pass:
326
327\begin{verbatim}
328lambda x: x in " \t"
329\end{verbatim}
330
331 if you're comparing lines as sequences of characters, and don't want
332 to synch up on blanks or hard tabs.
333
334 The optional arguments \var{a} and \var{b} are sequences to be
335 compared; both default to empty strings. The elements of both
336 sequences must be hashable.
337\end{classdesc}
338
339
340\class{SequenceMatcher} objects have the following methods:
341
342\begin{methoddesc}{set_seqs}{a, b}
343 Set the two sequences to be compared.
344\end{methoddesc}
345
346\class{SequenceMatcher} computes and caches detailed information about
347the second sequence, so if you want to compare one sequence against
348many sequences, use \method{set_seq2()} to set the commonly used
349sequence once and call \method{set_seq1()} repeatedly, once for each
350of the other sequences.
351
352\begin{methoddesc}{set_seq1}{a}
353 Set the first sequence to be compared. The second sequence to be
354 compared is not changed.
355\end{methoddesc}
356
357\begin{methoddesc}{set_seq2}{b}
358 Set the second sequence to be compared. The first sequence to be
359 compared is not changed.
360\end{methoddesc}
361
362\begin{methoddesc}{find_longest_match}{alo, ahi, blo, bhi}
363 Find longest matching block in \code{\var{a}[\var{alo}:\var{ahi}]}
364 and \code{\var{b}[\var{blo}:\var{bhi}]}.
365
366 If \var{isjunk} was omitted or \code{None},
367 \method{find_longest_match()} returns \code{(\var{i}, \var{j},
368 \var{k})} such that \code{\var{a}[\var{i}:\var{i}+\var{k}]} is equal
369 to \code{\var{b}[\var{j}:\var{j}+\var{k}]}, where
370 \code{\var{alo} <= \var{i} <= \var{i}+\var{k} <= \var{ahi}} and
371 \code{\var{blo} <= \var{j} <= \var{j}+\var{k} <= \var{bhi}}.
372 For all \code{(\var{i'}, \var{j'}, \var{k'})} meeting those
373 conditions, the additional conditions
374 \code{\var{k} >= \var{k'}},
375 \code{\var{i} <= \var{i'}},
376 and if \code{\var{i} == \var{i'}}, \code{\var{j} <= \var{j'}}
377 are also met.
378 In other words, of all maximal matching blocks, return one that
379 starts earliest in \var{a}, and of all those maximal matching blocks
380 that start earliest in \var{a}, return the one that starts earliest
381 in \var{b}.
382
383\begin{verbatim}
384>>> s = SequenceMatcher(None, " abcd", "abcd abcd")
385>>> s.find_longest_match(0, 5, 0, 9)
386(0, 4, 5)
387\end{verbatim}
388
389 If \var{isjunk} was provided, first the longest matching block is
390 determined as above, but with the additional restriction that no
391 junk element appears in the block. Then that block is extended as
392 far as possible by matching (only) junk elements on both sides.
393 So the resulting block never matches on junk except as identical
394 junk happens to be adjacent to an interesting match.
395
396 Here's the same example as before, but considering blanks to be junk.
397 That prevents \code{' abcd'} from matching the \code{' abcd'} at the
398 tail end of the second sequence directly. Instead only the
399 \code{'abcd'} can match, and matches the leftmost \code{'abcd'} in
400 the second sequence:
401
402\begin{verbatim}
403>>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
404>>> s.find_longest_match(0, 5, 0, 9)
405(1, 0, 4)
406\end{verbatim}
407
408 If no blocks match, this returns \code{(\var{alo}, \var{blo}, 0)}.
409\end{methoddesc}
410
411\begin{methoddesc}{get_matching_blocks}{}
412 Return list of triples describing matching subsequences.
413 Each triple is of the form \code{(\var{i}, \var{j}, \var{n})}, and
414 means that \code{\var{a}[\var{i}:\var{i}+\var{n}] ==
415 \var{b}[\var{j}:\var{j}+\var{n}]}. The triples are monotonically
416 increasing in \var{i} and \var{j}.
417
418 The last triple is a dummy, and has the value \code{(len(\var{a}),
419 len(\var{b}), 0)}. It is the only triple with \code{\var{n} == 0}.
420 % Explain why a dummy is used!
421
422 If
423 \code{(\var{i}, \var{j}, \var{n})} and
424 \code{(\var{i'}, \var{j'}, \var{n'})} are adjacent triples in the list,
425 and the second is not the last triple in the list, then
426 \code{\var{i}+\var{n} != \var{i'}} or
427 \code{\var{j}+\var{n} != \var{j'}}; in other words, adjacent triples
428 always describe non-adjacent equal blocks.
429 \versionchanged[The guarantee that adjacent triples always describe
430 non-adjacent blocks was implemented]{2.5}
431
432\begin{verbatim}
433>>> s = SequenceMatcher(None, "abxcd", "abcd")
434>>> s.get_matching_blocks()
435[(0, 0, 2), (3, 2, 2), (5, 4, 0)]
436\end{verbatim}
437\end{methoddesc}
438
439\begin{methoddesc}{get_opcodes}{}
440 Return list of 5-tuples describing how to turn \var{a} into \var{b}.
441 Each tuple is of the form \code{(\var{tag}, \var{i1}, \var{i2},
442 \var{j1}, \var{j2})}. The first tuple has \code{\var{i1} ==
443 \var{j1} == 0}, and remaining tuples have \var{i1} equal to the
444 \var{i2} from the preceding tuple, and, likewise, \var{j1} equal to
445 the previous \var{j2}.
446
447 The \var{tag} values are strings, with these meanings:
448
449\begin{tableii}{l|l}{code}{Value}{Meaning}
450 \lineii{'replace'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
451 replaced by \code{\var{b}[\var{j1}:\var{j2}]}.}
452 \lineii{'delete'}{\code{\var{a}[\var{i1}:\var{i2}]} should be
453 deleted. Note that \code{\var{j1} == \var{j2}} in
454 this case.}
455 \lineii{'insert'}{\code{\var{b}[\var{j1}:\var{j2}]} should be
456 inserted at \code{\var{a}[\var{i1}:\var{i1}]}.
457 Note that \code{\var{i1} == \var{i2}} in this
458 case.}
459 \lineii{'equal'}{\code{\var{a}[\var{i1}:\var{i2}] ==
460 \var{b}[\var{j1}:\var{j2}]} (the sub-sequences are
461 equal).}
462\end{tableii}
463
464For example:
465
466\begin{verbatim}
467>>> a = "qabxcd"
468>>> b = "abycdf"
469>>> s = SequenceMatcher(None, a, b)
470>>> for tag, i1, i2, j1, j2 in s.get_opcodes():
471... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
472... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
473 delete a[0:1] (q) b[0:0] ()
474 equal a[1:3] (ab) b[0:2] (ab)
475replace a[3:4] (x) b[2:3] (y)
476 equal a[4:6] (cd) b[3:5] (cd)
477 insert a[6:6] () b[5:6] (f)
478\end{verbatim}
479\end{methoddesc}
480
481\begin{methoddesc}{get_grouped_opcodes}{\optional{n}}
482 Return a generator of groups with up to \var{n} lines of context.
483
484 Starting with the groups returned by \method{get_opcodes()},
485 this method splits out smaller change clusters and eliminates
486 intervening ranges which have no changes.
487
488 The groups are returned in the same format as \method{get_opcodes()}.
489 \versionadded{2.3}
490\end{methoddesc}
491
492\begin{methoddesc}{ratio}{}
493 Return a measure of the sequences' similarity as a float in the
494 range [0, 1].
495
496 Where T is the total number of elements in both sequences, and M is
497 the number of matches, this is 2.0*M / T. Note that this is
498 \code{1.0} if the sequences are identical, and \code{0.0} if they
499 have nothing in common.
500
501 This is expensive to compute if \method{get_matching_blocks()} or
502 \method{get_opcodes()} hasn't already been called, in which case you
503 may want to try \method{quick_ratio()} or
504 \method{real_quick_ratio()} first to get an upper bound.
505\end{methoddesc}
506
507\begin{methoddesc}{quick_ratio}{}
508 Return an upper bound on \method{ratio()} relatively quickly.
509
510 This isn't defined beyond that it is an upper bound on
511 \method{ratio()}, and is faster to compute.
512\end{methoddesc}
513
514\begin{methoddesc}{real_quick_ratio}{}
515 Return an upper bound on \method{ratio()} very quickly.
516
517 This isn't defined beyond that it is an upper bound on
518 \method{ratio()}, and is faster to compute than either
519 \method{ratio()} or \method{quick_ratio()}.
520\end{methoddesc}
521
522The three methods that return the ratio of matching to total characters
523can give different results due to differing levels of approximation,
524although \method{quick_ratio()} and \method{real_quick_ratio()} are always
525at least as large as \method{ratio()}:
526
527\begin{verbatim}
528>>> s = SequenceMatcher(None, "abcd", "bcde")
529>>> s.ratio()
5300.75
531>>> s.quick_ratio()
5320.75
533>>> s.real_quick_ratio()
5341.0
535\end{verbatim}
536
537
538\subsection{SequenceMatcher Examples \label{sequencematcher-examples}}
539
540
541This example compares two strings, considering blanks to be ``junk:''
542
543\begin{verbatim}
544>>> s = SequenceMatcher(lambda x: x == " ",
545... "private Thread currentThread;",
546... "private volatile Thread currentThread;")
547\end{verbatim}
548
549\method{ratio()} returns a float in [0, 1], measuring the similarity
550of the sequences. As a rule of thumb, a \method{ratio()} value over
5510.6 means the sequences are close matches:
552
553\begin{verbatim}
554>>> print round(s.ratio(), 3)
5550.866
556\end{verbatim}
557
558If you're only interested in where the sequences match,
559\method{get_matching_blocks()} is handy:
560
561\begin{verbatim}
562>>> for block in s.get_matching_blocks():
563... print "a[%d] and b[%d] match for %d elements" % block
564a[0] and b[0] match for 8 elements
565a[8] and b[17] match for 6 elements
566a[14] and b[23] match for 15 elements
567a[29] and b[38] match for 0 elements
568\end{verbatim}
569
570Note that the last tuple returned by \method{get_matching_blocks()} is
571always a dummy, \code{(len(\var{a}), len(\var{b}), 0)}, and this is
572the only case in which the last tuple element (number of elements
573matched) is \code{0}.
574
575If you want to know how to change the first sequence into the second,
576use \method{get_opcodes()}:
577
578\begin{verbatim}
579>>> for opcode in s.get_opcodes():
580... print "%6s a[%d:%d] b[%d:%d]" % opcode
581 equal a[0:8] b[0:8]
582insert a[8:8] b[8:17]
583 equal a[8:14] b[17:23]
584 equal a[14:29] b[23:38]
585\end{verbatim}
586
587See also the function \function{get_close_matches()} in this module,
588which shows how simple code building on \class{SequenceMatcher} can be
589used to do useful work.
590
591
592\subsection{Differ Objects \label{differ-objects}}
593
594Note that \class{Differ}-generated deltas make no claim to be
595\strong{minimal} diffs. To the contrary, minimal diffs are often
596counter-intuitive, because they synch up anywhere possible, sometimes on
597accidental matches 100 pages apart. Restricting synch points to
598contiguous matches preserves some notion of locality, at the
599occasional cost of producing a longer diff.
600
601The \class{Differ} class has this constructor:
602
603\begin{classdesc}{Differ}{\optional{linejunk\optional{, charjunk}}}
604 Optional keyword parameters \var{linejunk} and \var{charjunk} are
605 for filter functions (or \code{None}):
606
607 \var{linejunk}: A function that accepts a single string
608 argument, and returns true if the string is junk. The default is
609 \code{None}, meaning that no line is considered junk.
610
611 \var{charjunk}: A function that accepts a single character argument
612 (a string of length 1), and returns true if the character is junk.
613 The default is \code{None}, meaning that no character is
614 considered junk.
615\end{classdesc}
616
617\class{Differ} objects are used (deltas generated) via a single
618method:
619
620\begin{methoddesc}{compare}{a, b}
621 Compare two sequences of lines, and generate the delta (a sequence
622 of lines).
623
624 Each sequence must contain individual single-line strings ending
625 with newlines. Such sequences can be obtained from the
626 \method{readlines()} method of file-like objects. The delta generated
627 also consists of newline-terminated strings, ready to be printed as-is
628 via the \method{writelines()} method of a file-like object.
629\end{methoddesc}
630
631
632\subsection{Differ Example \label{differ-examples}}
633
634This example compares two texts. First we set up the texts, sequences
635of individual single-line strings ending with newlines (such sequences
636can also be obtained from the \method{readlines()} method of file-like
637objects):
638
639\begin{verbatim}
640>>> text1 = ''' 1. Beautiful is better than ugly.
641... 2. Explicit is better than implicit.
642... 3. Simple is better than complex.
643... 4. Complex is better than complicated.
644... '''.splitlines(1)
645>>> len(text1)
6464
647>>> text1[0][-1]
648'\n'
649>>> text2 = ''' 1. Beautiful is better than ugly.
650... 3. Simple is better than complex.
651... 4. Complicated is better than complex.
652... 5. Flat is better than nested.
653... '''.splitlines(1)
654\end{verbatim}
655
656Next we instantiate a Differ object:
657
658\begin{verbatim}
659>>> d = Differ()
660\end{verbatim}
661
662Note that when instantiating a \class{Differ} object we may pass
663functions to filter out line and character ``junk.'' See the
664\class{Differ} constructor for details.
665
666Finally, we compare the two:
667
668\begin{verbatim}
669>>> result = list(d.compare(text1, text2))
670\end{verbatim}
671
672\code{result} is a list of strings, so let's pretty-print it:
673
674\begin{verbatim}
675>>> from pprint import pprint
676>>> pprint(result)
677[' 1. Beautiful is better than ugly.\n',
678 '- 2. Explicit is better than implicit.\n',
679 '- 3. Simple is better than complex.\n',
680 '+ 3. Simple is better than complex.\n',
681 '? ++ \n',
682 '- 4. Complex is better than complicated.\n',
683 '? ^ ---- ^ \n',
684 '+ 4. Complicated is better than complex.\n',
685 '? ++++ ^ ^ \n',
686 '+ 5. Flat is better than nested.\n']
687\end{verbatim}
688
689As a single multi-line string it looks like this:
690
691\begin{verbatim}
692>>> import sys
693>>> sys.stdout.writelines(result)
694 1. Beautiful is better than ugly.
695- 2. Explicit is better than implicit.
696- 3. Simple is better than complex.
697+ 3. Simple is better than complex.
698? ++
699- 4. Complex is better than complicated.
700? ^ ---- ^
701+ 4. Complicated is better than complex.
702? ++++ ^ ^
703+ 5. Flat is better than nested.
704\end{verbatim}
Note: See TracBrowser for help on using the repository browser.