Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

analyze.c@ 2799

Visit:

Last change on this file since 2799 was 2556, checked in by bird, 20 years ago
diffutils 2.8.1
File size: 29.6 KB

Line
1	/* Analyze file differences for GNU DIFF.
2
3	Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
4	Free Software Foundation, Inc.
5
6	This file is part of GNU DIFF.
7
8	GNU DIFF is free software; you can redistribute it and/or modify
9	it under the terms of the GNU General Public License as published by
10	the Free Software Foundation; either version 2, or (at your option)
11	any later version.
12
13	GNU DIFF is distributed in the hope that it will be useful,
14	but WITHOUT ANY WARRANTY; without even the implied warranty of
15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	GNU General Public License for more details.
17
18	You should have received a copy of the GNU General Public License
19	along with this program; see the file COPYING.
20	If not, write to the Free Software Foundation,
21	59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22
23	/* The basic algorithm is described in:
24	"An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25	Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26	see especially section 4.2, which describes the variation used below.
27	Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28	heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29	at the price of producing suboptimal output for large inputs with
30	many differences.
31
32	The basic algorithm was independently discovered as described in:
33	"Algorithms for Approximate String Matching", E. Ukkonen,
34	Information and Control Vol. 64, 1985, pp. 100-118. */
35
36	#include "diff.h"
37	#include <cmpbuf.h>
38	#include <error.h>
39	#include <regex.h>
40	#include <xalloc.h>
41
42	static lin xvec, yvec; /* Vectors being compared. */
43	static lin fdiag; / Vector, indexed by diagonal, containing
44	1 + the X coordinate of the point furthest
45	along the given diagonal in the forward
46	search of the edit matrix. */
47	static lin bdiag; / Vector, indexed by diagonal, containing
48	the X coordinate of the point furthest
49	along the given diagonal in the backward
50	search of the edit matrix. */
51	static lin too_expensive; /* Edit scripts longer than this are too
52	expensive to compute. */
53
54	#define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */
55
56	struct partition
57	{
58	lin xmid, ymid; /* Midpoints of this partition. */
59	bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */
60	bool hi_minimal; /* Likewise for high half. */
61	};
62
63	/* Find the midpoint of the shortest edit script for a specified
64	portion of the two files.
65
66	Scan from the beginnings of the files, and simultaneously from the ends,
67	doing a breadth-first search through the space of edit-sequence.
68	When the two searches meet, we have found the midpoint of the shortest
69	edit sequence.
70
71	If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72	of expense. Otherwise, if the search is too expensive, use
73	heuristics to stop the search and report a suboptimal answer.
74
75	Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
76	XMID - YMID equals the number of inserted lines minus the number
77	of deleted lines (counting only lines before the midpoint).
78	Return the approximate edit cost; this is the total number of
79	lines inserted or deleted (counting only lines before the midpoint),
80	unless a heuristic is used to terminate the search prematurely.
81
82	Set PART->lo_minimal to true iff the minimal edit script for the
83	left half of the partition is known; similarly for PART->hi_minimal.
84
85	This function assumes that the first lines of the specified portions
86	of the two files do not match, and likewise that the last lines do not
87	match. The caller must trim matching lines from the beginning and end
88	of the portions it is going to specify.
89
90	If we return the "wrong" partitions,
91	the worst this can do is cause suboptimal diff output.
92	It cannot cause incorrect diff output. */
93
94	static lin
95	diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
96	struct partition *part)
97	{
98	lin const fd = fdiag; / Give the compiler a chance. */
99	lin const bd = bdiag; / Additional help for the compiler. */
100	lin const const xv = xvec; / Still more help for the compiler. */
101	lin const const yv = yvec; / And more and more . . . */
102	lin const dmin = xoff - ylim; /* Minimum valid diagonal. */
103	lin const dmax = xlim - yoff; /* Maximum valid diagonal. */
104	lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */
105	lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
106	lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */
107	lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */
108	lin c; /* Cost. */
109	bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
110	diagonal with respect to the northwest. */
111
112	fd[fmid] = xoff;
113	bd[bmid] = xlim;
114
115	for (c = 1;; ++c)
116	{
117	lin d; /* Active diagonal. */
118	bool big_snake = 0;
119
120	/* Extend the top-down search by an edit step in each diagonal. */
121	fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
122	fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
123	for (d = fmax; d >= fmin; d -= 2)
124	{
125	lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
126
127	if (tlo >= thi)
128	x = tlo + 1;
129	else
130	x = thi;
131	oldx = x;
132	y = x - d;
133	while (x < xlim && y < ylim && xv[x] == yv[y])
134	++x, ++y;
135	if (x - oldx > SNAKE_LIMIT)
136	big_snake = 1;
137	fd[d] = x;
138	if (odd && bmin <= d && d <= bmax && bd[d] <= x)
139	{
140	part->xmid = x;
141	part->ymid = y;
142	part->lo_minimal = part->hi_minimal = 1;
143	return 2 * c - 1;
144	}
145	}
146
147	/* Similarly extend the bottom-up search. */
148	bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
149	bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
150	for (d = bmax; d >= bmin; d -= 2)
151	{
152	lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
153
154	if (tlo < thi)
155	x = tlo;
156	else
157	x = thi - 1;
158	oldx = x;
159	y = x - d;
160	while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
161	--x, --y;
162	if (oldx - x > SNAKE_LIMIT)
163	big_snake = 1;
164	bd[d] = x;
165	if (!odd && fmin <= d && d <= fmax && x <= fd[d])
166	{
167	part->xmid = x;
168	part->ymid = y;
169	part->lo_minimal = part->hi_minimal = 1;
170	return 2 * c;
171	}
172	}
173
174	if (find_minimal)
175	continue;
176
177	/* Heuristic: check occasionally for a diagonal that has made
178	lots of progress compared with the edit distance.
179	If we have any such, find the one that has made the most
180	progress and return it as if it had succeeded.
181
182	With this heuristic, for files with a constant small density
183	of changes, the algorithm is linear in the file size. */
184
185	if (200 < c && big_snake && speed_large_files)
186	{
187	lin best;
188
189	best = 0;
190	for (d = fmax; d >= fmin; d -= 2)
191	{
192	lin dd = d - fmid;
193	lin x = fd[d];
194	lin y = x - d;
195	lin v = (x - xoff) * 2 - dd;
196	if (v > 12 * (c + (dd < 0 ? -dd : dd)))
197	{
198	if (v > best
199	&& xoff + SNAKE_LIMIT <= x && x < xlim
200	&& yoff + SNAKE_LIMIT <= y && y < ylim)
201	{
202	/* We have a good enough best diagonal;
203	now insist that it end with a significant snake. */
204	int k;
205
206	for (k = 1; xv[x - k] == yv[y - k]; k++)
207	if (k == SNAKE_LIMIT)
208	{
209	best = v;
210	part->xmid = x;
211	part->ymid = y;
212	break;
213	}
214	}
215	}
216	}
217	if (best > 0)
218	{
219	part->lo_minimal = 1;
220	part->hi_minimal = 0;
221	return 2 * c - 1;
222	}
223
224	best = 0;
225	for (d = bmax; d >= bmin; d -= 2)
226	{
227	lin dd = d - bmid;
228	lin x = bd[d];
229	lin y = x - d;
230	lin v = (xlim - x) * 2 + dd;
231	if (v > 12 * (c + (dd < 0 ? -dd : dd)))
232	{
233	if (v > best
234	&& xoff < x && x <= xlim - SNAKE_LIMIT
235	&& yoff < y && y <= ylim - SNAKE_LIMIT)
236	{
237	/* We have a good enough best diagonal;
238	now insist that it end with a significant snake. */
239	int k;
240
241	for (k = 0; xv[x + k] == yv[y + k]; k++)
242	if (k == SNAKE_LIMIT - 1)
243	{
244	best = v;
245	part->xmid = x;
246	part->ymid = y;
247	break;
248	}
249	}
250	}
251	}
252	if (best > 0)
253	{
254	part->lo_minimal = 0;
255	part->hi_minimal = 1;
256	return 2 * c - 1;
257	}
258	}
259
260	/* Heuristic: if we've gone well beyond the call of duty,
261	give up and report halfway between our best results so far. */
262	if (c >= too_expensive)
263	{
264	lin fxybest, fxbest;
265	lin bxybest, bxbest;
266
267	fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */
268
269	/* Find forward diagonal that maximizes X + Y. */
270	fxybest = -1;
271	for (d = fmax; d >= fmin; d -= 2)
272	{
273	lin x = MIN (fd[d], xlim);
274	lin y = x - d;
275	if (ylim < y)
276	x = ylim + d, y = ylim;
277	if (fxybest < x + y)
278	{
279	fxybest = x + y;
280	fxbest = x;
281	}
282	}
283
284	/* Find backward diagonal that minimizes X + Y. */
285	bxybest = LIN_MAX;
286	for (d = bmax; d >= bmin; d -= 2)
287	{
288	lin x = MAX (xoff, bd[d]);
289	lin y = x - d;
290	if (y < yoff)
291	x = yoff + d, y = yoff;
292	if (x + y < bxybest)
293	{
294	bxybest = x + y;
295	bxbest = x;
296	}
297	}
298
299	/* Use the better of the two diagonals. */
300	if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
301	{
302	part->xmid = fxbest;
303	part->ymid = fxybest - fxbest;
304	part->lo_minimal = 1;
305	part->hi_minimal = 0;
306	}
307	else
308	{
309	part->xmid = bxbest;
310	part->ymid = bxybest - bxbest;
311	part->lo_minimal = 0;
312	part->hi_minimal = 1;
313	}
314	return 2 * c - 1;
315	}
316	}
317	}
318
319
320	/* Compare in detail contiguous subsequences of the two files
321	which are known, as a whole, to match each other.
322
323	The results are recorded in the vectors files[N].changed, by
324	storing 1 in the element for each line that is an insertion or deletion.
325
326	The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
327
328	Note that XLIM, YLIM are exclusive bounds.
329	All line numbers are origin-0 and discarded lines are not counted.
330
331	If FIND_MINIMAL, find a minimal difference no matter how
332	expensive it is. */
333
334	static void
335	compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
336	{
337	lin * const xv = xvec; /* Help the compiler. */
338	lin * const yv = yvec;
339
340	/* Slide down the bottom initial diagonal. */
341	while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
342	++xoff, ++yoff;
343	/* Slide up the top initial diagonal. */
344	while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
345	--xlim, --ylim;
346
347	/* Handle simple cases. */
348	if (xoff == xlim)
349	while (yoff < ylim)
350	files[1].changed[files[1].realindexes[yoff++]] = 1;
351	else if (yoff == ylim)
352	while (xoff < xlim)
353	files[0].changed[files[0].realindexes[xoff++]] = 1;
354	else
355	{
356	lin c;
357	struct partition part;
358
359	/* Find a point of correspondence in the middle of the files. */
360
361	c = diag (xoff, xlim, yoff, ylim, find_minimal, &part);
362
363	if (c == 1)
364	{
365	/* This should be impossible, because it implies that
366	one of the two subsequences is empty,
367	and that case was handled above without calling `diag'.
368	Let's verify that this is true. */
369	abort ();
370	#if 0
371	/* The two subsequences differ by a single insert or delete;
372	record it and we are done. */
373	if (part.xmid - part.ymid < xoff - yoff)
374	files[1].changed[files[1].realindexes[part.ymid - 1]] = 1;
375	else
376	files[0].changed[files[0].realindexes[part.xmid]] = 1;
377	#endif
378	}
379	else
380	{
381	/* Use the partitions to split this problem into subproblems. */
382	compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
383	compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
384	}
385	}
386	}
387
388
389	/* Discard lines from one file that have no matches in the other file.
390
391	A line which is discarded will not be considered by the actual
392	comparison algorithm; it will be as if that line were not in the file.
393	The file's `realindexes' table maps virtual line numbers
394	(which don't count the discarded lines) into real line numbers;
395	this is how the actual comparison algorithm produces results
396	that are comprehensible when the discarded lines are counted.
397
398	When we discard a line, we also mark it as a deletion or insertion
399	so that it will be printed in the output. */
400
401	static void
402	discard_confusing_lines (struct file_data filevec[])
403	{
404	int f;
405	lin i;
406	char *discarded[2];
407	lin *equiv_count[2];
408	lin *p;
409
410	/* Allocate our results. */
411	p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
412	* (2 * sizeof *p));
413	for (f = 0; f < 2; f++)
414	{
415	filevec[f].undiscarded = p; p += filevec[f].buffered_lines;
416	filevec[f].realindexes = p; p += filevec[f].buffered_lines;
417	}
418
419	/* Set up equiv_count[F][I] as the number of lines in file F
420	that fall in equivalence class I. */
421
422	p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
423	equiv_count[0] = p;
424	equiv_count[1] = p + filevec[0].equiv_max;
425
426	for (i = 0; i < filevec[0].buffered_lines; ++i)
427	++equiv_count[0][filevec[0].equivs[i]];
428	for (i = 0; i < filevec[1].buffered_lines; ++i)
429	++equiv_count[1][filevec[1].equivs[i]];
430
431	/* Set up tables of which lines are going to be discarded. */
432
433	discarded[0] = zalloc (filevec[0].buffered_lines
434	+ filevec[1].buffered_lines);
435	discarded[1] = discarded[0] + filevec[0].buffered_lines;
436
437	/* Mark to be discarded each line that matches no line of the other file.
438	If a line matches many lines, mark it as provisionally discardable. */
439
440	for (f = 0; f < 2; f++)
441	{
442	size_t end = filevec[f].buffered_lines;
443	char *discards = discarded[f];
444	lin *counts = equiv_count[1 - f];
445	lin *equivs = filevec[f].equivs;
446	size_t many = 5;
447	size_t tem = end / 64;
448
449	/* Multiply MANY by approximate square root of number of lines.
450	That is the threshold for provisionally discardable lines. */
451	while ((tem = tem >> 2) > 0)
452	many *= 2;
453
454	for (i = 0; i < end; i++)
455	{
456	lin nmatch;
457	if (equivs[i] == 0)
458	continue;
459	nmatch = counts[equivs[i]];
460	if (nmatch == 0)
461	discards[i] = 1;
462	else if (nmatch > many)
463	discards[i] = 2;
464	}
465	}
466
467	/* Don't really discard the provisional lines except when they occur
468	in a run of discardables, with nonprovisionals at the beginning
469	and end. */
470
471	for (f = 0; f < 2; f++)
472	{
473	lin end = filevec[f].buffered_lines;
474	register char *discards = discarded[f];
475
476	for (i = 0; i < end; i++)
477	{
478	/* Cancel provisional discards not in middle of run of discards. */
479	if (discards[i] == 2)
480	discards[i] = 0;
481	else if (discards[i] != 0)
482	{
483	/* We have found a nonprovisional discard. */
484	register lin j;
485	lin length;
486	lin provisional = 0;
487
488	/* Find end of this run of discardable lines.
489	Count how many are provisionally discardable. */
490	for (j = i; j < end; j++)
491	{
492	if (discards[j] == 0)
493	break;
494	if (discards[j] == 2)
495	++provisional;
496	}
497
498	/* Cancel provisional discards at end, and shrink the run. */
499	while (j > i && discards[j - 1] == 2)
500	discards[--j] = 0, --provisional;
501
502	/* Now we have the length of a run of discardable lines
503	whose first and last are not provisional. */
504	length = j - i;
505
506	/* If 1/4 of the lines in the run are provisional,
507	cancel discarding of all provisional lines in the run. */
508	if (provisional * 4 > length)
509	{
510	while (j > i)
511	if (discards[--j] == 2)
512	discards[j] = 0;
513	}
514	else
515	{
516	register lin consec;
517	lin minimum = 1;
518	lin tem = length >> 2;
519
520	/* MINIMUM is approximate square root of LENGTH/4.
521	A subrun of two or more provisionals can stand
522	when LENGTH is at least 16.
523	A subrun of 4 or more can stand when LENGTH >= 64. */
524	while (0 < (tem >>= 2))
525	minimum <<= 1;
526	minimum++;
527
528	/* Cancel any subrun of MINIMUM or more provisionals
529	within the larger run. */
530	for (j = 0, consec = 0; j < length; j++)
531	if (discards[i + j] != 2)
532	consec = 0;
533	else if (minimum == ++consec)
534	/* Back up to start of subrun, to cancel it all. */
535	j -= consec;
536	else if (minimum < consec)
537	discards[i + j] = 0;
538
539	/* Scan from beginning of run
540	until we find 3 or more nonprovisionals in a row
541	or until the first nonprovisional at least 8 lines in.
542	Until that point, cancel any provisionals. */
543	for (j = 0, consec = 0; j < length; j++)
544	{
545	if (j >= 8 && discards[i + j] == 1)
546	break;
547	if (discards[i + j] == 2)
548	consec = 0, discards[i + j] = 0;
549	else if (discards[i + j] == 0)
550	consec = 0;
551	else
552	consec++;
553	if (consec == 3)
554	break;
555	}
556
557	/* I advances to the last line of the run. */
558	i += length - 1;
559
560	/* Same thing, from end. */
561	for (j = 0, consec = 0; j < length; j++)
562	{
563	if (j >= 8 && discards[i - j] == 1)
564	break;
565	if (discards[i - j] == 2)
566	consec = 0, discards[i - j] = 0;
567	else if (discards[i - j] == 0)
568	consec = 0;
569	else
570	consec++;
571	if (consec == 3)
572	break;
573	}
574	}
575	}
576	}
577	}
578
579	/* Actually discard the lines. */
580	for (f = 0; f < 2; f++)
581	{
582	char *discards = discarded[f];
583	lin end = filevec[f].buffered_lines;
584	lin j = 0;
585	for (i = 0; i < end; ++i)
586	if (minimal \|\| discards[i] == 0)
587	{
588	filevec[f].undiscarded[j] = filevec[f].equivs[i];
589	filevec[f].realindexes[j++] = i;
590	}
591	else
592	filevec[f].changed[i] = 1;
593	filevec[f].nondiscarded_lines = j;
594	}
595
596	free (discarded[0]);
597	free (equiv_count[0]);
598	}
599
600
601	/* Adjust inserts/deletes of identical lines to join changes
602	as much as possible.
603
604	We do something when a run of changed lines include a
605	line at one end and have an excluded, identical line at the other.
606	We are free to choose which identical line is included.
607	`compareseq' usually chooses the one at the beginning,
608	but usually it is cleaner to consider the following identical line
609	to be the "change". */
610
611	static void
612	shift_boundaries (struct file_data filevec[])
613	{
614	int f;
615
616	for (f = 0; f < 2; f++)
617	{
618	bool *changed = filevec[f].changed;
619	bool const *other_changed = filevec[1 - f].changed;
620	lin const *equivs = filevec[f].equivs;
621	lin i = 0;
622	lin j = 0;
623	lin i_end = filevec[f].buffered_lines;
624
625	while (1)
626	{
627	lin runlength, start, corresponding;
628
629	/* Scan forwards to find beginning of another run of changes.
630	Also keep track of the corresponding point in the other file. */
631
632	while (i < i_end && !changed[i])
633	{
634	while (other_changed[j++])
635	continue;
636	i++;
637	}
638
639	if (i == i_end)
640	break;
641
642	start = i;
643
644	/* Find the end of this run of changes. */
645
646	while (changed[++i])
647	continue;
648	while (other_changed[j])
649	j++;
650
651	do
652	{
653	/* Record the length of this run of changes, so that
654	we can later determine whether the run has grown. */
655	runlength = i - start;
656
657	/* Move the changed region back, so long as the
658	previous unchanged line matches the last changed one.
659	This merges with previous changed regions. */
660
661	while (start && equivs[start - 1] == equivs[i - 1])
662	{
663	changed[--start] = 1;
664	changed[--i] = 0;
665	while (changed[start - 1])
666	start--;
667	while (other_changed[--j])
668	continue;
669	}
670
671	/* Set CORRESPONDING to the end of the changed run, at the last
672	point where it corresponds to a changed run in the other file.
673	CORRESPONDING == I_END means no such point has been found. */
674	corresponding = other_changed[j - 1] ? i : i_end;
675
676	/* Move the changed region forward, so long as the
677	first changed line matches the following unchanged one.
678	This merges with following changed regions.
679	Do this second, so that if there are no merges,
680	the changed region is moved forward as far as possible. */
681
682	while (i != i_end && equivs[start] == equivs[i])
683	{
684	changed[start++] = 0;
685	changed[i++] = 1;
686	while (changed[i])
687	i++;
688	while (other_changed[++j])
689	corresponding = i;
690	}
691	}
692	while (runlength != i - start);
693
694	/* If possible, move the fully-merged run of changes
695	back to a corresponding run in the other file. */
696
697	while (corresponding < i)
698	{
699	changed[--start] = 1;
700	changed[--i] = 0;
701	while (other_changed[--j])
702	continue;
703	}
704	}
705	}
706	}
707
708
709	/* Cons an additional entry onto the front of an edit script OLD.
710	LINE0 and LINE1 are the first affected lines in the two files (origin 0).
711	DELETED is the number of lines deleted here from file 0.
712	INSERTED is the number of lines inserted here in file 1.
713
714	If DELETED is 0 then LINE0 is the number of the line before
715	which the insertion was done; vice versa for INSERTED and LINE1. */
716
717	static struct change *
718	add_change (lin line0, lin line1, lin deleted, lin inserted,
719	struct change *old)
720	{
721	struct change new = xmalloc (sizeof new);
722
723	new->line0 = line0;
724	new->line1 = line1;
725	new->inserted = inserted;
726	new->deleted = deleted;
727	new->link = old;
728	return new;
729	}
730
731	/* Scan the tables of which lines are inserted and deleted,
732	producing an edit script in reverse order. */
733
734	static struct change *
735	build_reverse_script (struct file_data const filevec[])
736	{
737	struct change *script = 0;
738	bool *changed0 = filevec[0].changed;
739	bool *changed1 = filevec[1].changed;
740	lin len0 = filevec[0].buffered_lines;
741	lin len1 = filevec[1].buffered_lines;
742
743	/* Note that changedN[len0] does exist, and is 0. */
744
745	lin i0 = 0, i1 = 0;
746
747	while (i0 < len0 \|\| i1 < len1)
748	{
749	if (changed0[i0] \| changed1[i1])
750	{
751	lin line0 = i0, line1 = i1;
752
753	/* Find # lines changed here in each file. */
754	while (changed0[i0]) ++i0;
755	while (changed1[i1]) ++i1;
756
757	/* Record this change. */
758	script = add_change (line0, line1, i0 - line0, i1 - line1, script);
759	}
760
761	/* We have reached lines in the two files that match each other. */
762	i0++, i1++;
763	}
764
765	return script;
766	}
767
768	/* Scan the tables of which lines are inserted and deleted,
769	producing an edit script in forward order. */
770
771	static struct change *
772	build_script (struct file_data const filevec[])
773	{
774	struct change *script = 0;
775	bool *changed0 = filevec[0].changed;
776	bool *changed1 = filevec[1].changed;
777	lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
778
779	/* Note that changedN[-1] does exist, and is 0. */
780
781	while (i0 >= 0 \|\| i1 >= 0)
782	{
783	if (changed0[i0 - 1] \| changed1[i1 - 1])
784	{
785	lin line0 = i0, line1 = i1;
786
787	/* Find # lines changed here in each file. */
788	while (changed0[i0 - 1]) --i0;
789	while (changed1[i1 - 1]) --i1;
790
791	/* Record this change. */
792	script = add_change (i0, i1, line0 - i0, line1 - i1, script);
793	}
794
795	/* We have reached lines in the two files that match each other. */
796	i0--, i1--;
797	}
798
799	return script;
800	}
801
802
803	/* If CHANGES, briefly report that two files differed.
804	Return 2 if trouble, CHANGES otherwise. */
805	static int
806	briefly_report (int changes, struct file_data const filevec[])
807	{
808	if (changes)
809	{
810	char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
811	char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
812
813	if (brief)
814	message ("Files %s and %s differ\n", label0, label1);
815	else
816	{
817	message ("Binary files %s and %s differ\n", label0, label1);
818	changes = 2;
819	}
820	}
821
822	return changes;
823	}
824
825	/* Report the differences of two files. */
826	int
827	diff_2_files (struct comparison *cmp)
828	{
829	lin diags;
830	int f;
831	struct change e, p;
832	struct change *script;
833	int changes;
834
835
836	/* If we have detected that either file is binary,
837	compare the two files as binary. This can happen
838	only when the first chunk is read.
839	Also, --brief without any --ignore-* options means
840	we can speed things up by treating the files as binary. */
841
842	if (read_files (cmp->file, files_can_be_treated_as_binary))
843	{
844	/* Files with different lengths must be different. */
845	if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
846	&& (cmp->file[0].desc < 0 \|\| S_ISREG (cmp->file[0].stat.st_mode))
847	&& (cmp->file[1].desc < 0 \|\| S_ISREG (cmp->file[1].stat.st_mode)))
848	changes = 1;
849
850	/* Standard input equals itself. */
851	else if (cmp->file[0].desc == cmp->file[1].desc)
852	changes = 0;
853
854	else
855	/* Scan both files, a buffer at a time, looking for a difference. */
856	{
857	/* Allocate same-sized buffers for both files. */
858	size_t lcm_max = PTRDIFF_MAX - 1;
859	size_t buffer_size =
860	buffer_lcm (sizeof (word),
861	buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
862	STAT_BLOCKSIZE (cmp->file[1].stat),
863	lcm_max),
864	lcm_max);
865	for (f = 0; f < 2; f++)
866	cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
867
868	for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
869	{
870	/* Read a buffer's worth from both files. */
871	for (f = 0; f < 2; f++)
872	if (0 <= cmp->file[f].desc)
873	file_block_read (&cmp->file[f],
874	buffer_size - cmp->file[f].buffered);
875
876	/* If the buffers differ, the files differ. */
877	if (cmp->file[0].buffered != cmp->file[1].buffered
878	\|\| memcmp (cmp->file[0].buffer,
879	cmp->file[1].buffer,
880	cmp->file[0].buffered))
881	{
882	changes = 1;
883	break;
884	}
885
886	/* If we reach end of file, the files are the same. */
887	if (cmp->file[0].buffered != buffer_size)
888	{
889	changes = 0;
890	break;
891	}
892	}
893	}
894
895	changes = briefly_report (changes, cmp->file);
896	}
897	else
898	{
899	/* Allocate vectors for the results of comparison:
900	a flag for each line of each file, saying whether that line
901	is an insertion or deletion.
902	Allocate an extra element, always 0, at each end of each vector. */
903
904	size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
905	bool flag_space = zalloc (s sizeof *flag_space);
906	cmp->file[0].changed = flag_space + 1;
907	cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
908
909	/* Some lines are obviously insertions or deletions
910	because they don't match anything. Detect them now, and
911	avoid even thinking about them in the main comparison algorithm. */
912
913	discard_confusing_lines (cmp->file);
914
915	/* Now do the main comparison algorithm, considering just the
916	undiscarded lines. */
917
918	xvec = cmp->file[0].undiscarded;
919	yvec = cmp->file[1].undiscarded;
920	diags = (cmp->file[0].nondiscarded_lines
921	+ cmp->file[1].nondiscarded_lines + 3);
922	fdiag = xmalloc (diags * (2 * sizeof *fdiag));
923	bdiag = fdiag + diags;
924	fdiag += cmp->file[1].nondiscarded_lines + 1;
925	bdiag += cmp->file[1].nondiscarded_lines + 1;
926
927	/* Set TOO_EXPENSIVE to be approximate square root of input size,
928	bounded below by 256. */
929	too_expensive = 1;
930	for (; diags != 0; diags >>= 2)
931	too_expensive <<= 1;
932	too_expensive = MAX (256, too_expensive);
933
934	files[0] = cmp->file[0];
935	files[1] = cmp->file[1];
936
937	compareseq (0, cmp->file[0].nondiscarded_lines,
938	0, cmp->file[1].nondiscarded_lines, minimal);
939
940	free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
941
942	/* Modify the results slightly to make them prettier
943	in cases where that can validly be done. */
944
945	shift_boundaries (cmp->file);
946
947	/* Get the results of comparison in the form of a chain
948	of `struct change's -- an edit script. */
949
950	if (output_style == OUTPUT_ED)
951	script = build_reverse_script (cmp->file);
952	else
953	script = build_script (cmp->file);
954
955	/* Set CHANGES if we had any diffs.
956	If some changes are ignored, we must scan the script to decide. */
957	if (ignore_blank_lines \|\| ignore_regexp.fastmap)
958	{
959	struct change *next = script;
960	changes = 0;
961
962	while (next && changes == 0)
963	{
964	struct change this, end;
965	lin first0, last0, first1, last1;
966
967	/* Find a set of changes that belong together. */
968	this = next;
969	end = find_change (next);
970
971	/* Disconnect them from the rest of the changes, making them
972	a hunk, and remember the rest for next iteration. */
973	next = end->link;
974	end->link = 0;
975
976	/* Determine whether this hunk is really a difference. */
977	if (analyze_hunk (this, &first0, &last0, &first1, &last1))
978	changes = 1;
979
980	/* Reconnect the script so it will all be freed properly. */
981	end->link = next;
982	}
983	}
984	else
985	changes = (script != 0);
986
987	if (brief)
988	changes = briefly_report (changes, cmp->file);
989	else
990	{
991	if (changes \| !no_diff_means_no_output)
992	{
993	/* Record info for starting up output,
994	to be used if and when we have some output to print. */
995	setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
996	file_label[1] ? file_label[1] : cmp->file[1].name,
997	cmp->parent != 0);
998
999	switch (output_style)
1000	{
1001	case OUTPUT_CONTEXT:
1002	print_context_script (script, 0);
1003	break;
1004
1005	case OUTPUT_UNIFIED:
1006	print_context_script (script, 1);
1007	break;
1008
1009	case OUTPUT_ED:
1010	print_ed_script (script);
1011	break;
1012
1013	case OUTPUT_FORWARD_ED:
1014	pr_forward_ed_script (script);
1015	break;
1016
1017	case OUTPUT_RCS:
1018	print_rcs_script (script);
1019	break;
1020
1021	case OUTPUT_NORMAL:
1022	print_normal_script (script);
1023	break;
1024
1025	case OUTPUT_IFDEF:
1026	print_ifdef_script (script);
1027	break;
1028
1029	case OUTPUT_SDIFF:
1030	print_sdiff_script (script);
1031	break;
1032
1033	default:
1034	abort ();
1035	}
1036
1037	finish_output ();
1038	}
1039	}
1040
1041	free (cmp->file[0].undiscarded);
1042
1043	free (flag_space);
1044
1045	for (f = 0; f < 2; f++)
1046	{
1047	free (cmp->file[f].equivs);
1048	free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1049	}
1050
1051	for (e = script; e; e = p)
1052	{
1053	p = e->link;
1054	free (e);
1055	}
1056
1057	if (! ROBUST_OUTPUT_STYLE (output_style))
1058	for (f = 0; f < 2; ++f)
1059	if (cmp->file[f].missing_newline)
1060	{
1061	error (0, 0, "%s: %s\n",
1062	file_label[f] ? file_label[f] : cmp->file[f].name,
1063	_("No newline at end of file"));
1064	changes = 2;
1065	}
1066	}
1067
1068	if (cmp->file[0].buffer != cmp->file[1].buffer)
1069	free (cmp->file[0].buffer);
1070	free (cmp->file[1].buffer);
1071
1072	return changes;
1073	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/diffutils/src/analyze.c@ 2799

Download in other formats: