source: trunk/diffutils/src/io.c@ 2561

Last change on this file since 2561 was 2556, checked in by bird, 20 years ago

diffutils 2.8.1

File size: 23.6 KB
Line 
1/* File I/O for GNU DIFF.
2
3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
4 Free Software Foundation, Inc.
5
6 This file is part of GNU DIFF.
7
8 GNU DIFF is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU DIFF is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING.
20 If not, write to the Free Software Foundation,
21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22
23#include "diff.h"
24#include <cmpbuf.h>
25#include <regex.h>
26#include <setmode.h>
27#include <xalloc.h>
28
29/* Rotate an unsigned value to the left. */
30#define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
31
32/* Given a hash value and a new character, return a new hash value. */
33#define HASH(h, c) ((c) + ROL (h, 7))
34
35
36/* The type of a hash value. */
37typedef size_t hash_value;
38verify (hash_value_is_unsigned, ! TYPE_SIGNED (hash_value));
39
40/* Lines are put into equivalence classes of lines that match in lines_differ.
41 Each equivalence class is represented by one of these structures,
42 but only while the classes are being computed.
43 Afterward, each class is represented by a number. */
44struct equivclass
45{
46 lin next; /* Next item in this bucket. */
47 hash_value hash; /* Hash of lines in this class. */
48 char const *line; /* A line that fits this class. */
49 size_t length; /* That line's length, not counting its newline. */
50};
51
52/* Hash-table: array of buckets, each being a chain of equivalence classes.
53 buckets[-1] is reserved for incomplete lines. */
54static lin *buckets;
55
56/* Number of buckets in the hash table array, not counting buckets[-1]. */
57static size_t nbuckets;
58
59/* Array in which the equivalence classes are allocated.
60 The bucket-chains go through the elements in this array.
61 The number of an equivalence class is its index in this array. */
62static struct equivclass *equivs;
63
64/* Index of first free element in the array `equivs'. */
65static lin equivs_index;
66
67/* Number of elements allocated in the array `equivs'. */
68static lin equivs_alloc;
69
70
71/* Read a block of data into a file buffer, checking for EOF and error. */
72
73void
74file_block_read (struct file_data *current, size_t size)
75{
76 if (size && ! current->eof)
77 {
78 size_t s = block_read (current->desc,
79 FILE_BUFFER (current) + current->buffered, size);
80 if (s == SIZE_MAX)
81 pfatal_with_name (current->name);
82 current->buffered += s;
83 current->eof = s < size;
84 }
85}
86
87
88/* Check for binary files and compare them for exact identity. */
89
90/* Return 1 if BUF contains a non text character.
91 SIZE is the number of characters in BUF. */
92
93#define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
94
95/* Get ready to read the current file.
96 Return nonzero if SKIP_TEST is zero,
97 and if it appears to be a binary file. */
98
99static bool
100sip (struct file_data *current, bool skip_test)
101{
102 /* If we have a nonexistent file at this stage, treat it as empty. */
103 if (current->desc < 0)
104 {
105 /* Leave room for a sentinel. */
106 current->bufsize = sizeof (word);
107 current->buffer = xmalloc (current->bufsize);
108 }
109 else
110 {
111 current->bufsize = buffer_lcm (sizeof (word),
112 STAT_BLOCKSIZE (current->stat),
113 PTRDIFF_MAX - 2 * sizeof (word));
114 current->buffer = xmalloc (current->bufsize);
115
116 if (! skip_test)
117 {
118 /* Check first part of file to see if it's a binary file. */
119
120 bool was_binary = set_binary_mode (current->desc, 1);
121 off_t buffered;
122 file_block_read (current, current->bufsize);
123 buffered = current->buffered;
124
125 if (! was_binary)
126 {
127 /* Revert to text mode and seek back to the beginning to
128 reread the file. Use relative seek, since file
129 descriptors like stdin might not start at offset
130 zero. */
131
132 if (lseek (current->desc, - buffered, SEEK_CUR) == -1)
133 pfatal_with_name (current->name);
134 set_binary_mode (current->desc, 0);
135 current->buffered = 0;
136 current->eof = 0;
137 }
138
139 return binary_file_p (current->buffer, buffered);
140 }
141 }
142
143 current->buffered = 0;
144 current->eof = 0;
145 return 0;
146}
147
148/* Slurp the rest of the current file completely into memory. */
149
150static void
151slurp (struct file_data *current)
152{
153 size_t cc;
154
155 if (current->desc < 0)
156 {
157 /* The file is nonexistent. */
158 return;
159 }
160
161 if (S_ISREG (current->stat.st_mode))
162 {
163 /* It's a regular file; slurp in the rest all at once. */
164
165 /* Get the size out of the stat block.
166 Allocate just enough room for appended newline plus word sentinel,
167 plus word-alignment since we want the buffer word-aligned. */
168 size_t file_size = current->stat.st_size;
169 cc = file_size + 2 * sizeof (word) - file_size % sizeof (word);
170 if (file_size != current->stat.st_size || cc < file_size
171 || PTRDIFF_MAX <= cc)
172 xalloc_die ();
173
174 if (current->bufsize < cc)
175 {
176 current->bufsize = cc;
177 current->buffer = xrealloc (current->buffer, cc);
178 }
179
180 /* Try to read at least 1 more byte than the size indicates, to
181 detect whether the file is growing. This is a nicety for
182 users who run 'diff' on files while they are changing. */
183
184 if (current->buffered <= file_size)
185 {
186 file_block_read (current, file_size + 1 - current->buffered);
187 if (current->buffered <= file_size)
188 return;
189 }
190 }
191
192 /* It's not a regular file, or it's a growing regular file; read it,
193 growing the buffer as needed. */
194
195 file_block_read (current, current->bufsize - current->buffered);
196
197 if (current->buffered)
198 {
199 while (current->buffered == current->bufsize)
200 {
201 if (PTRDIFF_MAX / 2 - sizeof (word) < current->bufsize)
202 xalloc_die ();
203 current->bufsize *= 2;
204 current->buffer = xrealloc (current->buffer, current->bufsize);
205 file_block_read (current, current->bufsize - current->buffered);
206 }
207
208 /* Allocate just enough room for appended newline plus word
209 sentinel, plus word-alignment. */
210 cc = current->buffered + 2 * sizeof (word);
211 current->bufsize = cc - cc % sizeof (word);
212 current->buffer = xrealloc (current->buffer, current->bufsize);
213 }
214}
215
216
217/* Split the file into lines, simultaneously computing the equivalence
218 class for each line. */
219
220static void
221find_and_hash_each_line (struct file_data *current)
222{
223 hash_value h;
224 unsigned char const *p = (unsigned char const *) current->prefix_end;
225 unsigned char c;
226 lin i, *bucket;
227 size_t length;
228
229 /* Cache often-used quantities in local variables to help the compiler. */
230 char const **linbuf = current->linbuf;
231 lin alloc_lines = current->alloc_lines;
232 lin line = 0;
233 lin linbuf_base = current->linbuf_base;
234 lin *cureqs = xmalloc (alloc_lines * sizeof *cureqs);
235 struct equivclass *eqs = equivs;
236 lin eqs_index = equivs_index;
237 lin eqs_alloc = equivs_alloc;
238 char const *suffix_begin = current->suffix_begin;
239 char const *bufend = FILE_BUFFER (current) + current->buffered;
240 bool diff_length_compare_anyway =
241 ignore_white_space != IGNORE_NO_WHITE_SPACE;
242 bool same_length_diff_contents_compare_anyway =
243 diff_length_compare_anyway | ignore_case;
244
245 while ((char const *) p < suffix_begin)
246 {
247 char const *ip = (char const *) p;
248
249 h = 0;
250
251 /* Hash this line until we find a newline. */
252 if (ignore_case)
253 switch (ignore_white_space)
254 {
255 case IGNORE_ALL_SPACE:
256 while ((c = *p++) != '\n')
257 if (! ISSPACE (c))
258 h = HASH (h, TOLOWER (c));
259 break;
260
261 case IGNORE_SPACE_CHANGE:
262 while ((c = *p++) != '\n')
263 {
264 if (ISSPACE (c))
265 {
266 do
267 if ((c = *p++) == '\n')
268 goto hashing_done;
269 while (ISSPACE (c));
270
271 h = HASH (h, ' ');
272 }
273
274 /* C is now the first non-space. */
275 h = HASH (h, TOLOWER (c));
276 }
277 break;
278
279 case IGNORE_TAB_EXPANSION:
280 {
281 size_t column = 0;
282 while ((c = *p++) != '\n')
283 {
284 int repetitions = 1;
285
286 switch (c)
287 {
288 case '\b':
289 column -= 0 < column;
290 break;
291
292 case '\t':
293 c = ' ';
294 repetitions = TAB_WIDTH - column % TAB_WIDTH;
295 column += repetitions;
296 break;
297
298 case '\r':
299 column = 0;
300 break;
301
302 default:
303 c = TOLOWER (c);
304 column++;
305 break;
306 }
307
308 do
309 h = HASH (h, c);
310 while (--repetitions != 0);
311 }
312 }
313 break;
314
315 default:
316 while ((c = *p++) != '\n')
317 h = HASH (h, TOLOWER (c));
318 break;
319 }
320 else
321 switch (ignore_white_space)
322 {
323 case IGNORE_ALL_SPACE:
324 while ((c = *p++) != '\n')
325 if (! ISSPACE (c))
326 h = HASH (h, c);
327 break;
328
329 case IGNORE_SPACE_CHANGE:
330 while ((c = *p++) != '\n')
331 {
332 if (ISSPACE (c))
333 {
334 do
335 if ((c = *p++) == '\n')
336 goto hashing_done;
337 while (ISSPACE (c));
338
339 h = HASH (h, ' ');
340 }
341
342 /* C is now the first non-space. */
343 h = HASH (h, c);
344 }
345 break;
346
347 case IGNORE_TAB_EXPANSION:
348 {
349 size_t column = 0;
350 while ((c = *p++) != '\n')
351 {
352 int repetitions = 1;
353
354 switch (c)
355 {
356 case '\b':
357 column -= 0 < column;
358 break;
359
360 case '\t':
361 c = ' ';
362 repetitions = TAB_WIDTH - column % TAB_WIDTH;
363 column += repetitions;
364 break;
365
366 case '\r':
367 column = 0;
368 break;
369
370 default:
371 column++;
372 break;
373 }
374
375 do
376 h = HASH (h, c);
377 while (--repetitions != 0);
378 }
379 }
380 break;
381
382 default:
383 while ((c = *p++) != '\n')
384 h = HASH (h, c);
385 break;
386 }
387
388 hashing_done:;
389
390 bucket = &buckets[h % nbuckets];
391 length = (char const *) p - ip - 1;
392
393 if ((char const *) p == bufend
394 && current->missing_newline
395 && ROBUST_OUTPUT_STYLE (output_style))
396 {
397 /* This line is incomplete. If this is significant,
398 put the line into buckets[-1]. */
399 if (ignore_white_space < IGNORE_SPACE_CHANGE)
400 bucket = &buckets[-1];
401
402 /* Omit the inserted newline when computing linbuf later. */
403 p--;
404 bufend = suffix_begin = (char const *) p;
405 }
406
407 for (i = *bucket; ; i = eqs[i].next)
408 if (!i)
409 {
410 /* Create a new equivalence class in this bucket. */
411 i = eqs_index++;
412 if (i == eqs_alloc)
413 {
414 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
415 xalloc_die ();
416 eqs_alloc *= 2;
417 eqs = xrealloc (eqs, eqs_alloc * sizeof *eqs);
418 }
419 eqs[i].next = *bucket;
420 eqs[i].hash = h;
421 eqs[i].line = ip;
422 eqs[i].length = length;
423 *bucket = i;
424 break;
425 }
426 else if (eqs[i].hash == h)
427 {
428 char const *eqline = eqs[i].line;
429
430 /* Reuse existing class if lines_differ reports the lines
431 equal. */
432 if (eqs[i].length == length)
433 {
434 /* Reuse existing equivalence class if the lines are identical.
435 This detects the common case of exact identity
436 faster than lines_differ would. */
437 if (memcmp (eqline, ip, length) == 0)
438 break;
439 if (!same_length_diff_contents_compare_anyway)
440 continue;
441 }
442 else if (!diff_length_compare_anyway)
443 continue;
444
445 if (! lines_differ (eqline, ip))
446 break;
447 }
448
449 /* Maybe increase the size of the line table. */
450 if (line == alloc_lines)
451 {
452 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
453 if (PTRDIFF_MAX / 3 <= alloc_lines
454 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
455 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
456 xalloc_die ();
457 alloc_lines = 2 * alloc_lines - linbuf_base;
458 cureqs = xrealloc (cureqs, alloc_lines * sizeof *cureqs);
459 linbuf += linbuf_base;
460 linbuf = xrealloc (linbuf,
461 (alloc_lines - linbuf_base) * sizeof *linbuf);
462 linbuf -= linbuf_base;
463 }
464 linbuf[line] = ip;
465 cureqs[line] = i;
466 ++line;
467 }
468
469 current->buffered_lines = line;
470
471 for (i = 0; ; i++)
472 {
473 /* Record the line start for lines in the suffix that we care about.
474 Record one more line start than lines,
475 so that we can compute the length of any buffered line. */
476 if (line == alloc_lines)
477 {
478 /* Double (alloc_lines - linbuf_base) by adding to alloc_lines. */
479 if (PTRDIFF_MAX / 3 <= alloc_lines
480 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
481 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
482 xalloc_die ();
483 alloc_lines = 2 * alloc_lines - linbuf_base;
484 linbuf += linbuf_base;
485 linbuf = xrealloc (linbuf,
486 (alloc_lines - linbuf_base) * sizeof *linbuf);
487 linbuf -= linbuf_base;
488 }
489 linbuf[line] = (char const *) p;
490
491 if ((char const *) p == bufend)
492 break;
493
494 if (context <= i && no_diff_means_no_output)
495 break;
496
497 line++;
498
499 while (*p++ != '\n')
500 continue;
501 }
502
503 /* Done with cache in local variables. */
504 current->linbuf = linbuf;
505 current->valid_lines = line;
506 current->alloc_lines = alloc_lines;
507 current->equivs = cureqs;
508 equivs = eqs;
509 equivs_alloc = eqs_alloc;
510 equivs_index = eqs_index;
511}
512
513
514/* Prepare the text. Make sure the text end is initialized.
515 Make sure text ends in a newline,
516 but remember that we had to add one.
517 Strip trailing CRs, if that was requested. */
518
519static void
520prepare_text (struct file_data *current)
521{
522 size_t buffered = current->buffered;
523 char *p = FILE_BUFFER (current);
524 char *dst;
525
526 if (buffered == 0 || p[buffered - 1] == '\n')
527 current->missing_newline = 0;
528 else
529 {
530 p[buffered++] = '\n';
531 current->missing_newline = 1;
532 }
533
534 if (!p)
535 return;
536
537 /* Don't use uninitialized storage when planting or using sentinels. */
538 memset (p + buffered, 0, sizeof (word));
539
540 if (strip_trailing_cr && (dst = memchr (p, '\r', buffered)))
541 {
542 char const *src = dst;
543 char const *srclim = p + buffered;
544
545 do
546 dst += ! ((*dst = *src++) == '\r' && *src == '\n');
547 while (src < srclim);
548
549 buffered -= src - dst;
550 }
551
552 current->buffered = buffered;
553}
554
555
556/* We have found N lines in a buffer of size S; guess the
557 proportionate number of lines that will be found in a buffer of
558 size T. However, do not guess a number of lines so large that the
559 resulting line table might cause overflow in size calculations. */
560static lin
561guess_lines (lin n, size_t s, size_t t)
562{
563 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
564 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
565 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
566}
567
568/* Given a vector of two file_data objects, find the identical
569 prefixes and suffixes of each object. */
570
571static void
572find_identical_ends (struct file_data filevec[])
573{
574 word *w0, *w1;
575 char *p0, *p1, *buffer0, *buffer1;
576 char const *end0, *beg0;
577 char const **linbuf0, **linbuf1;
578 lin i, lines;
579 size_t n0, n1;
580 lin alloc_lines0, alloc_lines1;
581 lin buffered_prefix, prefix_count, prefix_mask;
582 lin middle_guess, suffix_guess;
583
584 slurp (&filevec[0]);
585 prepare_text (&filevec[0]);
586 if (filevec[0].desc != filevec[1].desc)
587 {
588 slurp (&filevec[1]);
589 prepare_text (&filevec[1]);
590 }
591 else
592 {
593 filevec[1].buffer = filevec[0].buffer;
594 filevec[1].bufsize = filevec[0].bufsize;
595 filevec[1].buffered = filevec[0].buffered;
596 filevec[1].missing_newline = filevec[0].missing_newline;
597 }
598
599 /* Find identical prefix. */
600
601 w0 = filevec[0].buffer;
602 w1 = filevec[1].buffer;
603 p0 = buffer0 = (char *) w0;
604 p1 = buffer1 = (char *) w1;
605 n0 = filevec[0].buffered;
606 n1 = filevec[1].buffered;
607
608 if (p0 == p1)
609 /* The buffers are the same; sentinels won't work. */
610 p0 = p1 += n1;
611 else
612 {
613 /* Insert end sentinels, in this case characters that are guaranteed
614 to make the equality test false, and thus terminate the loop. */
615
616 if (n0 < n1)
617 p0[n0] = ~p1[n0];
618 else
619 p1[n1] = ~p0[n1];
620
621 /* Loop until first mismatch, or to the sentinel characters. */
622
623 /* Compare a word at a time for speed. */
624 while (*w0 == *w1)
625 w0++, w1++;
626
627 /* Do the last few bytes of comparison a byte at a time. */
628 p0 = (char *) w0;
629 p1 = (char *) w1;
630 while (*p0 == *p1)
631 p0++, p1++;
632
633 /* Don't mistakenly count missing newline as part of prefix. */
634 if (ROBUST_OUTPUT_STYLE (output_style)
635 && ((buffer0 + n0 - filevec[0].missing_newline < p0)
636 !=
637 (buffer1 + n1 - filevec[1].missing_newline < p1)))
638 p0--, p1--;
639 }
640
641 /* Now P0 and P1 point at the first nonmatching characters. */
642
643 /* Skip back to last line-beginning in the prefix,
644 and then discard up to HORIZON_LINES lines from the prefix. */
645 i = horizon_lines;
646 while (p0 != buffer0 && (p0[-1] != '\n' || i--))
647 p0--, p1--;
648
649 /* Record the prefix. */
650 filevec[0].prefix_end = p0;
651 filevec[1].prefix_end = p1;
652
653 /* Find identical suffix. */
654
655 /* P0 and P1 point beyond the last chars not yet compared. */
656 p0 = buffer0 + n0;
657 p1 = buffer1 + n1;
658
659 if (! ROBUST_OUTPUT_STYLE (output_style)
660 || filevec[0].missing_newline == filevec[1].missing_newline)
661 {
662 end0 = p0; /* Addr of last char in file 0. */
663
664 /* Get value of P0 at which we should stop scanning backward:
665 this is when either P0 or P1 points just past the last char
666 of the identical prefix. */
667 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
668
669 /* Scan back until chars don't match or we reach that point. */
670 for (; p0 != beg0; p0--, p1--)
671 if (*p0 != *p1)
672 {
673 /* Point at the first char of the matching suffix. */
674 beg0 = p0;
675 break;
676 }
677
678 /* Are we at a line-beginning in both files? If not, add the rest of
679 this line to the main body. Discard up to HORIZON_LINES lines from
680 the identical suffix. Also, discard one extra line,
681 because shift_boundaries may need it. */
682 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
683 &&
684 (buffer1 == p1 || p1[-1] == '\n'));
685 while (i-- && p0 != end0)
686 while (*p0++ != '\n')
687 continue;
688
689 p1 += p0 - beg0;
690 }
691
692 /* Record the suffix. */
693 filevec[0].suffix_begin = p0;
694 filevec[1].suffix_begin = p1;
695
696 /* Calculate number of lines of prefix to save.
697
698 prefix_count == 0 means save the whole prefix;
699 we need this for options like -D that output the whole file,
700 or for enormous contexts (to avoid worrying about arithmetic overflow).
701 We also need it for options like -F that output some preceding line;
702 at least we will need to find the last few lines,
703 but since we don't know how many, it's easiest to find them all.
704
705 Otherwise, prefix_count != 0. Save just prefix_count lines at start
706 of the line buffer; they'll be moved to the proper location later.
707 Handle 1 more line than the context says (because we count 1 too many),
708 rounded up to the next power of 2 to speed index computation. */
709
710 if (no_diff_means_no_output && ! function_regexp.fastmap
711 && context < LIN_MAX / 4 && context < n0)
712 {
713 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
714 suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
715 for (prefix_count = 1; prefix_count <= context; prefix_count *= 2)
716 continue;
717 alloc_lines0 = (prefix_count + middle_guess
718 + MIN (context, suffix_guess));
719 }
720 else
721 {
722 prefix_count = 0;
723 alloc_lines0 = guess_lines (0, 0, n0);
724 }
725
726 prefix_mask = prefix_count - 1;
727 lines = 0;
728 linbuf0 = xmalloc (alloc_lines0 * sizeof *linbuf0);
729 p0 = buffer0;
730
731 /* If the prefix is needed, find the prefix lines. */
732 if (! (no_diff_means_no_output
733 && filevec[0].prefix_end == p0
734 && filevec[1].prefix_end == p1))
735 {
736 end0 = filevec[0].prefix_end;
737 while (p0 != end0)
738 {
739 lin l = lines++ & prefix_mask;
740 if (l == alloc_lines0)
741 {
742 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
743 xalloc_die ();
744 alloc_lines0 *= 2;
745 linbuf0 = xrealloc (linbuf0, alloc_lines0 * sizeof *linbuf0);
746 }
747 linbuf0[l] = p0;
748 while (*p0++ != '\n')
749 continue;
750 }
751 }
752 buffered_prefix = prefix_count && context < lines ? context : lines;
753
754 /* Allocate line buffer 1. */
755
756 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
757 suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
758 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
759 if (alloc_lines1 < buffered_prefix
760 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
761 xalloc_die ();
762 linbuf1 = xmalloc (alloc_lines1 * sizeof *linbuf1);
763
764 if (buffered_prefix != lines)
765 {
766 /* Rotate prefix lines to proper location. */
767 for (i = 0; i < buffered_prefix; i++)
768 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
769 for (i = 0; i < buffered_prefix; i++)
770 linbuf0[i] = linbuf1[i];
771 }
772
773 /* Initialize line buffer 1 from line buffer 0. */
774 for (i = 0; i < buffered_prefix; i++)
775 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
776
777 /* Record the line buffer, adjusted so that
778 linbuf[0] points at the first differing line. */
779 filevec[0].linbuf = linbuf0 + buffered_prefix;
780 filevec[1].linbuf = linbuf1 + buffered_prefix;
781 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
782 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
783 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
784 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
785}
786
787
788/* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
789 than 2**k. This table is derived from Chris K. Caldwell's list
790 <http://www.utm.edu/research/primes/lists/2small/>. */
791
792static unsigned char const prime_offset[] =
793{
794 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
795 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
796 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
797 55, 93, 1, 57, 25
798};
799
800/* Verify that this host's size_t is not too wide for the above table. */
801
802verify (enough_prime_offsets,
803 sizeof (size_t) * CHAR_BIT <= sizeof prime_offset);
804
805/* Given a vector of two file_data objects, read the file associated
806 with each one, and build the table of equivalence classes.
807 Return nonzero if either file appears to be a binary file.
808 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
809
810bool
811read_files (struct file_data filevec[], bool pretend_binary)
812{
813 int i;
814 bool skip_test = text | pretend_binary;
815 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
816
817 if (filevec[0].desc != filevec[1].desc)
818 appears_binary |= sip (&filevec[1], skip_test | appears_binary);
819 else
820 {
821 filevec[1].buffer = filevec[0].buffer;
822 filevec[1].bufsize = filevec[0].bufsize;
823 filevec[1].buffered = filevec[0].buffered;
824 }
825 if (appears_binary)
826 {
827 set_binary_mode (filevec[0].desc, 1);
828 set_binary_mode (filevec[1].desc, 1);
829 return 1;
830 }
831
832 find_identical_ends (filevec);
833
834 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
835 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
836 xalloc_die ();
837 equivs = xmalloc (equivs_alloc * sizeof *equivs);
838 /* Equivalence class 0 is permanently safe for lines that were not
839 hashed. Real equivalence classes start at 1. */
840 equivs_index = 1;
841
842 /* Allocate (one plus) a prime number of hash buckets. Use a prime
843 number between 1/3 and 2/3 of the value of equiv_allocs,
844 approximately. */
845 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
846 continue;
847 nbuckets = ((size_t) 1 << i) - prime_offset[i];
848 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
849 xalloc_die ();
850 buckets = zalloc ((nbuckets + 1) * sizeof *buckets);
851 buckets++;
852
853 for (i = 0; i < 2; i++)
854 find_and_hash_each_line (&filevec[i]);
855
856 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
857
858 free (equivs);
859 free (buckets - 1);
860
861 return 0;
862}
Note: See TracBrowser for help on using the repository browser.