source: trunk/texinfo/util/texindex.c@ 2618

Last change on this file since 2618 was 2617, checked in by bird, 19 years ago

GNU Texinfo 4.8

File size: 42.3 KB
Line 
1/* texindex -- sort TeX index dribble output into an actual index.
2 $Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp $
3
4 Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
5 2002, 2003, 2004 Free Software Foundation, Inc.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
20
21#include "system.h"
22#include <getopt.h>
23
24static char *program_name = "texindex";
25
26#if defined (emacs)
27# include "../src/config.h"
28/* Some s/os.h files redefine these. */
29# undef read
30# undef close
31# undef write
32# undef open
33#endif
34
35#if !defined (HAVE_MEMSET)
36#undef memset
37#define memset(ptr, ignore, count) bzero (ptr, count)
38#endif
39
40char *mktemp (char *);
41
42#if !defined (SEEK_SET)
43# define SEEK_SET 0
44# define SEEK_CUR 1
45# define SEEK_END 2
46#endif /* !SEEK_SET */
47
48struct linebuffer;
49
50/* When sorting in core, this structure describes one line
51 and the position and length of its first keyfield. */
52struct lineinfo
53{
54 char *text; /* The actual text of the line. */
55 union {
56 char *text; /* The start of the key (for textual comparison). */
57 long number; /* The numeric value (for numeric comparison). */
58 } key;
59 long keylen; /* Length of KEY field. */
60};
61
62/* This structure describes a field to use as a sort key. */
63struct keyfield
64{
65 int startwords; /* Number of words to skip. */
66 int startchars; /* Number of additional chars to skip. */
67 int endwords; /* Number of words to ignore at end. */
68 int endchars; /* Ditto for characters of last word. */
69 char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
70 char fold_case; /* Non-zero means case doesn't matter. */
71 char reverse; /* Non-zero means compare in reverse order. */
72 char numeric; /* Non-zeros means field is ASCII numeric. */
73 char positional; /* Sort according to file position. */
74 char braced; /* Count balanced-braced groupings as fields. */
75};
76
77/* Vector of keyfields to use. */
78struct keyfield keyfields[3];
79
80/* Number of keyfields stored in that vector. */
81int num_keyfields = 3;
82
83/* Vector of input file names, terminated with a null pointer. */
84char **infiles;
85
86/* Vector of corresponding output file names, or NULL, meaning default it
87 (add an `s' to the end). */
88char **outfiles;
89
90/* Length of `infiles'. */
91int num_infiles;
92
93/* Pointer to the array of pointers to lines being sorted. */
94char **linearray;
95
96/* The allocated length of `linearray'. */
97long nlines;
98
99/* Directory to use for temporary files. On Unix, it ends with a slash. */
100char *tempdir;
101
102/* Number of last temporary file. */
103int tempcount;
104
105/* Number of last temporary file already deleted.
106 Temporary files are deleted by `flush_tempfiles' in order of creation. */
107int last_deleted_tempcount;
108
109/* During in-core sort, this points to the base of the data block
110 which contains all the lines of data. */
111char *text_base;
112
113/* Initially 0; changed to 1 if we want initials in this index. */
114int need_initials;
115
116/* Remembers the first initial letter seen in this index, so we can
117 determine whether we need initials in the sorted form. */
118char first_initial;
119
120/* Additional command switches .*/
121
122/* Nonzero means do not delete tempfiles -- for debugging. */
123int keep_tempfiles;
124
125/* Forward declarations of functions in this file. */
126void decode_command (int argc, char **argv);
127void sort_in_core (char *infile, int total, char *outfile);
128void sort_offline (char *infile, off_t total, char *outfile);
129char **parsefile (char *filename, char **nextline, char *data, long int size);
130char *find_field (struct keyfield *keyfield, char *str, long int *lengthptr);
131char *find_pos (char *str, int words, int chars, int ignore_blanks);
132long find_value (char *start, long int length);
133char *find_braced_pos (char *str, int words, int chars, int ignore_blanks);
134char *find_braced_end (char *str);
135void writelines (char **linearray, int nlines, FILE *ostream);
136int compare_field (struct keyfield *keyfield, char *start1,
137 long int length1, long int pos1, char *start2,
138 long int length2, long int pos2);
139int compare_full (const void *, const void *);
140long readline (struct linebuffer *linebuffer, FILE *stream);
141int merge_files (char **infiles, int nfiles, char *outfile);
142int merge_direct (char **infiles, int nfiles, char *outfile);
143void pfatal_with_name (const char *name);
144void fatal (const char *format, const char *arg);
145void error (const char *format, const char *arg);
146void *xmalloc (), *xrealloc ();
147char *concat (char *s1, char *s2);
148void flush_tempfiles (int to_count);
149
150
151#define MAX_IN_CORE_SORT 500000
152
153int
154main (int argc, char **argv)
155{
156 int i;
157
158 tempcount = 0;
159 last_deleted_tempcount = 0;
160
161#ifdef HAVE_SETLOCALE
162 /* Set locale via LC_ALL. */
163 setlocale (LC_ALL, "");
164#endif
165
166 /* Set the text message domain. */
167 bindtextdomain (PACKAGE, LOCALEDIR);
168 textdomain (PACKAGE);
169
170 /* In case we write to a redirected stdout that fails. */
171 /* not ready atexit (close_stdout); */
172
173 /* Describe the kind of sorting to do. */
174 /* The first keyfield uses the first braced field and folds case. */
175 keyfields[0].braced = 1;
176 keyfields[0].fold_case = 1;
177 keyfields[0].endwords = -1;
178 keyfields[0].endchars = -1;
179
180 /* The second keyfield uses the second braced field, numerically. */
181 keyfields[1].braced = 1;
182 keyfields[1].numeric = 1;
183 keyfields[1].startwords = 1;
184 keyfields[1].endwords = -1;
185 keyfields[1].endchars = -1;
186
187 /* The third keyfield (which is ignored while discarding duplicates)
188 compares the whole line. */
189 keyfields[2].endwords = -1;
190 keyfields[2].endchars = -1;
191
192 decode_command (argc, argv);
193
194 /* Process input files completely, one by one. */
195
196 for (i = 0; i < num_infiles; i++)
197 {
198 int desc;
199 off_t ptr;
200 char *outfile;
201 struct stat instat;
202
203 desc = open (infiles[i], O_RDONLY, 0);
204 if (desc < 0)
205 pfatal_with_name (infiles[i]);
206
207 if (stat (infiles[i], &instat))
208 pfatal_with_name (infiles[i]);
209 if (S_ISDIR (instat.st_mode))
210 {
211#ifdef EISDIR
212 errno = EISDIR;
213#endif
214 pfatal_with_name (infiles[i]);
215 }
216
217 lseek (desc, (off_t) 0, SEEK_END);
218 ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
219
220 close (desc);
221
222 outfile = outfiles[i];
223 if (!outfile)
224 outfile = concat (infiles[i], "s");
225
226 need_initials = 0;
227 first_initial = '\0';
228
229 if (ptr < MAX_IN_CORE_SORT)
230 /* Sort a small amount of data. */
231 sort_in_core (infiles[i], (int)ptr, outfile);
232 else
233 sort_offline (infiles[i], ptr, outfile);
234 }
235
236 flush_tempfiles (tempcount);
237 xexit (0);
238 return 0; /* Avoid bogus warnings. */
239}
240
241
242typedef struct
243{
244 char *long_name;
245 char *short_name;
246 int *variable_ref;
247 int variable_value;
248 char *arg_name;
249 char *doc_string;
250} TEXINDEX_OPTION;
251
252TEXINDEX_OPTION texindex_options[] = {
253 { "--help", "-h", (int *)NULL, 0, (char *)NULL,
254 N_("display this help and exit") },
255 { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
256 N_("keep temporary files around after processing") },
257 { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
258 N_("do not keep temporary files around after processing (default)") },
259 { "--output", "-o", (int *)NULL, 0, "FILE",
260 N_("send output to FILE") },
261 { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
262 N_("display version information and exit") },
263 { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
264};
265
266void
267usage (int result_value)
268{
269 register int i;
270 FILE *f = result_value ? stderr : stdout;
271
272 fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
273 fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
274 /* Avoid trigraph nonsense. */
275 fprintf (f,
276_("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
277 '?', '?'); /* avoid trigraph in cat-id-tbl.c */
278 fprintf (f, _("\nOptions:\n"));
279
280 for (i = 0; texindex_options[i].long_name; i++)
281 {
282 putc (' ', f);
283
284 if (texindex_options[i].short_name)
285 fprintf (f, "%s, ", texindex_options[i].short_name);
286
287 fprintf (f, "%s %s",
288 texindex_options[i].long_name,
289 texindex_options[i].arg_name
290 ? texindex_options[i].arg_name : "");
291
292 fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
293 }
294 fputs (_("\n\
295Email bug reports to bug-texinfo@gnu.org,\n\
296general questions and discussion to help-texinfo@gnu.org.\n\
297Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
298 fputs ("\n", f);
299
300 xexit (result_value);
301}
302
303/* Decode the command line arguments to set the parameter variables
304 and set up the vector of keyfields and the vector of input files. */
305
306void
307decode_command (int argc, char **argv)
308{
309 int arg_index = 1;
310 char **ip;
311 char **op;
312
313 /* Store default values into parameter variables. */
314
315 tempdir = getenv ("TMPDIR");
316 if (tempdir == NULL)
317 tempdir = getenv ("TEMP");
318 if (tempdir == NULL)
319 tempdir = getenv ("TMP");
320 if (tempdir == NULL)
321 tempdir = DEFAULT_TMPDIR;
322 else
323 tempdir = concat (tempdir, "/");
324
325 keep_tempfiles = 0;
326
327 /* Allocate ARGC input files, which must be enough. */
328
329 infiles = (char **) xmalloc (argc * sizeof (char *));
330 outfiles = (char **) xmalloc (argc * sizeof (char *));
331 ip = infiles;
332 op = outfiles;
333
334 while (arg_index < argc)
335 {
336 char *arg = argv[arg_index++];
337
338 if (*arg == '-')
339 {
340 if (strcmp (arg, "--version") == 0)
341 {
342 printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
343 puts ("");
344 puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
345 printf (_("There is NO warranty. You may redistribute this software\n\
346under the terms of the GNU General Public License.\n\
347For more information about these matters, see the files named COPYING.\n"));
348 xexit (0);
349 }
350 else if ((strcmp (arg, "--keep") == 0) ||
351 (strcmp (arg, "-k") == 0))
352 {
353 keep_tempfiles = 1;
354 }
355 else if ((strcmp (arg, "--help") == 0) ||
356 (strcmp (arg, "-h") == 0))
357 {
358 usage (0);
359 }
360 else if ((strcmp (arg, "--output") == 0) ||
361 (strcmp (arg, "-o") == 0))
362 {
363 if (argv[arg_index] != (char *)NULL)
364 {
365 arg_index++;
366 if (op > outfiles)
367 *(op - 1) = argv[arg_index];
368 }
369 else
370 usage (1);
371 }
372 else
373 usage (1);
374 }
375 else
376 {
377 *ip++ = arg;
378 *op++ = (char *)NULL;
379 }
380 }
381
382 /* Record number of keyfields and terminate list of filenames. */
383 num_infiles = ip - infiles;
384 *ip = (char *)NULL;
385 if (num_infiles == 0)
386 usage (1);
387}
388
389
390/* Return a name for temporary file COUNT. */
391
392static char *
393maketempname (int count)
394{
395 static char *tempbase = NULL;
396 char tempsuffix[10];
397
398 if (!tempbase)
399 {
400 int fd;
401 tempbase = concat (tempdir, "txidxXXXXXX");
402
403 fd = mkstemp (tempbase);
404 if (fd == -1)
405 pfatal_with_name (tempbase);
406 }
407
408 sprintf (tempsuffix, ".%d", count);
409 return concat (tempbase, tempsuffix);
410}
411
412
413/* Delete all temporary files up to TO_COUNT. */
414
415void
416flush_tempfiles (int to_count)
417{
418 if (keep_tempfiles)
419 return;
420 while (last_deleted_tempcount < to_count)
421 unlink (maketempname (++last_deleted_tempcount));
422}
423
424
425
426/* Compare LINE1 and LINE2 according to the specified set of keyfields. */
427
428int
429compare_full (const void *p1, const void *p2)
430{
431 char **line1 = (char **) p1;
432 char **line2 = (char **) p2;
433 int i;
434
435 /* Compare using the first keyfield;
436 if that does not distinguish the lines, try the second keyfield;
437 and so on. */
438
439 for (i = 0; i < num_keyfields; i++)
440 {
441 long length1, length2;
442 char *start1 = find_field (&keyfields[i], *line1, &length1);
443 char *start2 = find_field (&keyfields[i], *line2, &length2);
444 int tem = compare_field (&keyfields[i], start1, length1,
445 *line1 - text_base,
446 start2, length2, *line2 - text_base);
447 if (tem)
448 {
449 if (keyfields[i].reverse)
450 return -tem;
451 return tem;
452 }
453 }
454
455 return 0; /* Lines match exactly. */
456}
457
458/* Compare LINE1 and LINE2, described by structures
459 in which the first keyfield is identified in advance.
460 For positional sorting, assumes that the order of the lines in core
461 reflects their nominal order. */
462int
463compare_prepared (const void *p1, const void *p2)
464{
465 struct lineinfo *line1 = (struct lineinfo *) p1;
466 struct lineinfo *line2 = (struct lineinfo *) p2;
467 int i;
468 int tem;
469 char *text1, *text2;
470
471 /* Compare using the first keyfield, which has been found for us already. */
472 if (keyfields->positional)
473 {
474 if (line1->text - text_base > line2->text - text_base)
475 tem = 1;
476 else
477 tem = -1;
478 }
479 else if (keyfields->numeric)
480 tem = line1->key.number - line2->key.number;
481 else
482 tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
483 line2->key.text, line2->keylen, 0);
484 if (tem)
485 {
486 if (keyfields->reverse)
487 return -tem;
488 return tem;
489 }
490
491 text1 = line1->text;
492 text2 = line2->text;
493
494 /* Compare using the second keyfield;
495 if that does not distinguish the lines, try the third keyfield;
496 and so on. */
497
498 for (i = 1; i < num_keyfields; i++)
499 {
500 long length1, length2;
501 char *start1 = find_field (&keyfields[i], text1, &length1);
502 char *start2 = find_field (&keyfields[i], text2, &length2);
503 int tem = compare_field (&keyfields[i], start1, length1,
504 text1 - text_base,
505 start2, length2, text2 - text_base);
506 if (tem)
507 {
508 if (keyfields[i].reverse)
509 return -tem;
510 return tem;
511 }
512 }
513
514 return 0; /* Lines match exactly. */
515}
516
517/* Like compare_full but more general.
518 You can pass any strings, and you can say how many keyfields to use.
519 POS1 and POS2 should indicate the nominal positional ordering of
520 the two lines in the input. */
521
522int
523compare_general (char *str1, char *str2, long int pos1, long int pos2, int use_keyfields)
524{
525 int i;
526
527 /* Compare using the first keyfield;
528 if that does not distinguish the lines, try the second keyfield;
529 and so on. */
530
531 for (i = 0; i < use_keyfields; i++)
532 {
533 long length1, length2;
534 char *start1 = find_field (&keyfields[i], str1, &length1);
535 char *start2 = find_field (&keyfields[i], str2, &length2);
536 int tem = compare_field (&keyfields[i], start1, length1, pos1,
537 start2, length2, pos2);
538 if (tem)
539 {
540 if (keyfields[i].reverse)
541 return -tem;
542 return tem;
543 }
544 }
545
546 return 0; /* Lines match exactly. */
547}
548
549/* Find the start and length of a field in STR according to KEYFIELD.
550 A pointer to the starting character is returned, and the length
551 is stored into the int that LENGTHPTR points to. */
552
553char *
554find_field (struct keyfield *keyfield, char *str, long int *lengthptr)
555{
556 char *start;
557 char *end;
558 char *(*fun) ();
559
560 if (keyfield->braced)
561 fun = find_braced_pos;
562 else
563 fun = find_pos;
564
565 start = (*fun) (str, keyfield->startwords, keyfield->startchars,
566 keyfield->ignore_blanks);
567 if (keyfield->endwords < 0)
568 {
569 if (keyfield->braced)
570 end = find_braced_end (start);
571 else
572 {
573 end = start;
574 while (*end && *end != '\n')
575 end++;
576 }
577 }
578 else
579 {
580 end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
581 if (end - str < start - str)
582 end = start;
583 }
584 *lengthptr = end - start;
585 return start;
586}
587
588/* Return a pointer to a specified place within STR,
589 skipping (from the beginning) WORDS words and then CHARS chars.
590 If IGNORE_BLANKS is nonzero, we skip all blanks
591 after finding the specified word. */
592
593char *
594find_pos (char *str, int words, int chars, int ignore_blanks)
595{
596 int i;
597 char *p = str;
598
599 for (i = 0; i < words; i++)
600 {
601 char c;
602 /* Find next bunch of nonblanks and skip them. */
603 while ((c = *p) == ' ' || c == '\t')
604 p++;
605 while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
606 p++;
607 if (!*p || *p == '\n')
608 return p;
609 }
610
611 while (*p == ' ' || *p == '\t')
612 p++;
613
614 for (i = 0; i < chars; i++)
615 {
616 if (!*p || *p == '\n')
617 break;
618 p++;
619 }
620 return p;
621}
622
623/* Like find_pos but assumes that each field is surrounded by braces
624 and that braces within fields are balanced. */
625
626char *
627find_braced_pos (char *str, int words, int chars, int ignore_blanks)
628{
629 int i;
630 int bracelevel;
631 char *p = str;
632 char c;
633
634 for (i = 0; i < words; i++)
635 {
636 bracelevel = 1;
637 while ((c = *p++) != '{' && c != '\n' && c)
638 /* Do nothing. */ ;
639 if (c != '{')
640 return p - 1;
641 while (bracelevel)
642 {
643 c = *p++;
644 if (c == '{')
645 bracelevel++;
646 if (c == '}')
647 bracelevel--;
648 if (c == 0 || c == '\n')
649 return p - 1;
650 }
651 }
652
653 while ((c = *p++) != '{' && c != '\n' && c)
654 /* Do nothing. */ ;
655
656 if (c != '{')
657 return p - 1;
658
659 if (ignore_blanks)
660 while ((c = *p) == ' ' || c == '\t')
661 p++;
662
663 for (i = 0; i < chars; i++)
664 {
665 if (!*p || *p == '\n')
666 break;
667 p++;
668 }
669 return p;
670}
671
672/* Find the end of the balanced-brace field which starts at STR.
673 The position returned is just before the closing brace. */
674
675char *
676find_braced_end (char *str)
677{
678 int bracelevel;
679 char *p = str;
680 char c;
681
682 bracelevel = 1;
683 while (bracelevel)
684 {
685 c = *p++;
686 if (c == '{')
687 bracelevel++;
688 if (c == '}')
689 bracelevel--;
690 if (c == 0 || c == '\n')
691 return p - 1;
692 }
693 return p - 1;
694}
695
696long
697find_value (char *start, long int length)
698{
699 while (length != 0L)
700 {
701 if (isdigit (*start))
702 return atol (start);
703 length--;
704 start++;
705 }
706 return 0l;
707}
708
709/* Vector used to translate characters for comparison.
710 This is how we make all alphanumerics follow all else,
711 and ignore case in the first sorting. */
712int char_order[256];
713
714void
715init_char_order (void)
716{
717 int i;
718 for (i = 1; i < 256; i++)
719 char_order[i] = i;
720
721 for (i = '0'; i <= '9'; i++)
722 char_order[i] += 512;
723
724 for (i = 'a'; i <= 'z'; i++)
725 {
726 char_order[i] = 512 + i;
727 char_order[i + 'A' - 'a'] = 512 + i;
728 }
729}
730
731/* Compare two fields (each specified as a start pointer and a character count)
732 according to KEYFIELD.
733 The sign of the value reports the relation between the fields. */
734
735int
736compare_field (struct keyfield *keyfield, char *start1, long int length1,
737 long int pos1, char *start2, long int length2, long int pos2)
738{
739 if (keyfields->positional)
740 {
741 if (pos1 > pos2)
742 return 1;
743 else
744 return -1;
745 }
746 if (keyfield->numeric)
747 {
748 long value = find_value (start1, length1) - find_value (start2, length2);
749 if (value > 0)
750 return 1;
751 if (value < 0)
752 return -1;
753 return 0;
754 }
755 else
756 {
757 char *p1 = start1;
758 char *p2 = start2;
759 char *e1 = start1 + length1;
760 char *e2 = start2 + length2;
761
762 while (1)
763 {
764 int c1, c2;
765
766 if (p1 == e1)
767 c1 = 0;
768 else
769 c1 = *p1++;
770 if (p2 == e2)
771 c2 = 0;
772 else
773 c2 = *p2++;
774
775 if (char_order[c1] != char_order[c2])
776 return char_order[c1] - char_order[c2];
777 if (!c1)
778 break;
779 }
780
781 /* Strings are equal except possibly for case. */
782 p1 = start1;
783 p2 = start2;
784 while (1)
785 {
786 int c1, c2;
787
788 if (p1 == e1)
789 c1 = 0;
790 else
791 c1 = *p1++;
792 if (p2 == e2)
793 c2 = 0;
794 else
795 c2 = *p2++;
796
797 if (c1 != c2)
798 /* Reverse sign here so upper case comes out last. */
799 return c2 - c1;
800 if (!c1)
801 break;
802 }
803
804 return 0;
805 }
806}
807
808
809/* A `struct linebuffer' is a structure which holds a line of text.
810 `readline' reads a line from a stream into a linebuffer
811 and works regardless of the length of the line. */
812
813struct linebuffer
814{
815 long size;
816 char *buffer;
817};
818
819/* Initialize LINEBUFFER for use. */
820
821void
822initbuffer (struct linebuffer *linebuffer)
823{
824 linebuffer->size = 200;
825 linebuffer->buffer = (char *) xmalloc (200);
826}
827
828/* Read a line of text from STREAM into LINEBUFFER.
829 Return the length of the line. */
830
831long
832readline (struct linebuffer *linebuffer, FILE *stream)
833{
834 char *buffer = linebuffer->buffer;
835 char *p = linebuffer->buffer;
836 char *end = p + linebuffer->size;
837
838 while (1)
839 {
840 int c = getc (stream);
841 if (p == end)
842 {
843 buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
844 p += buffer - linebuffer->buffer;
845 end += buffer - linebuffer->buffer;
846 linebuffer->buffer = buffer;
847 }
848 if (c < 0 || c == '\n')
849 {
850 *p = 0;
851 break;
852 }
853 *p++ = c;
854 }
855
856 return p - buffer;
857}
858
859
860/* Sort an input file too big to sort in core. */
861
862void
863sort_offline (char *infile, off_t total, char *outfile)
864{
865 /* More than enough. */
866 int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
867 char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
868 FILE *istream = fopen (infile, "r");
869 int i;
870 struct linebuffer lb;
871 long linelength;
872 int failure = 0;
873
874 initbuffer (&lb);
875
876 /* Read in one line of input data. */
877
878 linelength = readline (&lb, istream);
879
880 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
881 {
882 error (_("%s: not a texinfo index file"), infile);
883 return;
884 }
885
886 /* Split up the input into `ntemps' temporary files, or maybe fewer,
887 and put the new files' names into `tempfiles' */
888
889 for (i = 0; i < ntemps; i++)
890 {
891 char *outname = maketempname (++tempcount);
892 FILE *ostream = fopen (outname, "w");
893 long tempsize = 0;
894
895 if (!ostream)
896 pfatal_with_name (outname);
897 tempfiles[i] = outname;
898
899 /* Copy lines into this temp file as long as it does not make file
900 "too big" or until there are no more lines. */
901
902 while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
903 {
904 tempsize += linelength + 1;
905 fputs (lb.buffer, ostream);
906 putc ('\n', ostream);
907
908 /* Read another line of input data. */
909
910 linelength = readline (&lb, istream);
911 if (!linelength && feof (istream))
912 break;
913
914 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
915 {
916 error (_("%s: not a texinfo index file"), infile);
917 failure = 1;
918 goto fail;
919 }
920 }
921 fclose (ostream);
922 if (feof (istream))
923 break;
924 }
925
926 free (lb.buffer);
927
928fail:
929 /* Record number of temp files we actually needed. */
930
931 ntemps = i;
932
933 /* Sort each tempfile into another tempfile.
934 Delete the first set of tempfiles and put the names of the second
935 into `tempfiles'. */
936
937 for (i = 0; i < ntemps; i++)
938 {
939 char *newtemp = maketempname (++tempcount);
940 sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
941 if (!keep_tempfiles)
942 unlink (tempfiles[i]);
943 tempfiles[i] = newtemp;
944 }
945
946 if (failure)
947 return;
948
949 /* Merge the tempfiles together and indexify. */
950
951 merge_files (tempfiles, ntemps, outfile);
952}
953
954
955/* Sort INFILE, whose size is TOTAL,
956 assuming that is small enough to be done in-core,
957 then indexify it and send the output to OUTFILE (or to stdout). */
958
959void
960sort_in_core (char *infile, int total, char *outfile)
961{
962 char **nextline;
963 char *data = (char *) xmalloc (total + 1);
964 char *file_data;
965 long file_size;
966 int i;
967 FILE *ostream = stdout;
968 struct lineinfo *lineinfo;
969
970 /* Read the contents of the file into the moby array `data'. */
971
972 int desc = open (infile, O_RDONLY, 0);
973
974 if (desc < 0)
975 fatal (_("failure reopening %s"), infile);
976 for (file_size = 0;;)
977 {
978 i = read (desc, data + file_size, total - file_size);
979 if (i <= 0)
980 break;
981 file_size += i;
982 }
983 file_data = data;
984 data[file_size] = 0;
985
986 close (desc);
987
988 if (file_size > 0 && data[0] != '\\' && data[0] != '@')
989 {
990 error (_("%s: not a texinfo index file"), infile);
991 return;
992 }
993
994 init_char_order ();
995
996 /* Sort routines want to know this address. */
997
998 text_base = data;
999
1000 /* Create the array of pointers to lines, with a default size
1001 frequently enough. */
1002
1003 nlines = total / 50;
1004 if (!nlines)
1005 nlines = 2;
1006 linearray = (char **) xmalloc (nlines * sizeof (char *));
1007
1008 /* `nextline' points to the next free slot in this array.
1009 `nlines' is the allocated size. */
1010
1011 nextline = linearray;
1012
1013 /* Parse the input file's data, and make entries for the lines. */
1014
1015 nextline = parsefile (infile, nextline, file_data, file_size);
1016 if (nextline == 0)
1017 {
1018 error (_("%s: not a texinfo index file"), infile);
1019 return;
1020 }
1021
1022 /* Sort the lines. */
1023
1024 /* If we have enough space, find the first keyfield of each line in advance.
1025 Make a `struct lineinfo' for each line, which records the keyfield
1026 as well as the line, and sort them. */
1027
1028 lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
1029
1030 if (lineinfo)
1031 {
1032 struct lineinfo *lp;
1033 char **p;
1034
1035 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1036 {
1037 lp->text = *p;
1038 lp->key.text = find_field (keyfields, *p, &lp->keylen);
1039 if (keyfields->numeric)
1040 lp->key.number = find_value (lp->key.text, lp->keylen);
1041 }
1042
1043 qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
1044 compare_prepared);
1045
1046 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1047 *p = lp->text;
1048
1049 free (lineinfo);
1050 }
1051 else
1052 qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
1053
1054 /* Open the output file. */
1055
1056 if (outfile)
1057 {
1058 ostream = fopen (outfile, "w");
1059 if (!ostream)
1060 pfatal_with_name (outfile);
1061 }
1062
1063 writelines (linearray, nextline - linearray, ostream);
1064 if (outfile)
1065 fclose (ostream);
1066
1067 free (linearray);
1068 free (data);
1069}
1070
1071
1072/* Parse an input string in core into lines.
1073 DATA is the input string, and SIZE is its length.
1074 Data goes in LINEARRAY starting at NEXTLINE.
1075 The value returned is the first entry in LINEARRAY still unused.
1076 Value 0 means input file contents are invalid. */
1077
1078char **
1079parsefile (char *filename, char **nextline, char *data, long int size)
1080{
1081 char *p, *end;
1082 char **line = nextline;
1083
1084 p = data;
1085 end = p + size;
1086 *end = 0;
1087
1088 while (p != end)
1089 {
1090 if (p[0] != '\\' && p[0] != '@')
1091 return 0;
1092
1093 *line = p;
1094
1095 /* Find the first letter of the first field of this line. If it
1096 is different from the first letter of the first field of the
1097 first line, we need initial headers in the output index. */
1098 while (*p && *p != '{')
1099 p++;
1100 if (p == end)
1101 return 0;
1102 p++;
1103 if (first_initial)
1104 {
1105 if (first_initial != toupper (*p))
1106 need_initials = 1;
1107 }
1108 else
1109 first_initial = toupper (*p);
1110
1111 while (*p && *p != '\n')
1112 p++;
1113 if (p != end)
1114 p++;
1115
1116 line++;
1117 if (line == linearray + nlines)
1118 {
1119 char **old = linearray;
1120 linearray = xrealloc (linearray, sizeof (char *) * (nlines *= 4));
1121 line += linearray - old;
1122 }
1123 }
1124
1125 return line;
1126}
1127
1128
1129/* Indexification is a filter applied to the sorted lines
1130 as they are being written to the output file.
1131 Multiple entries for the same name, with different page numbers,
1132 get combined into a single entry with multiple page numbers.
1133 The first braced field, which is used for sorting, is discarded.
1134 However, its first character is examined, folded to lower case,
1135 and if it is different from that in the previous line fed to us
1136 a \initial line is written with one argument, the new initial.
1137
1138 If an entry has four braced fields, then the second and third
1139 constitute primary and secondary names.
1140 In this case, each change of primary name
1141 generates a \primary line which contains only the primary name,
1142 and in between these are \secondary lines which contain
1143 just a secondary name and page numbers. */
1144
1145/* The last primary name we wrote a \primary entry for.
1146 If only one level of indexing is being done, this is the last name seen. */
1147char *lastprimary;
1148/* Length of storage allocated for lastprimary. */
1149int lastprimarylength;
1150
1151/* Similar, for the secondary name. */
1152char *lastsecondary;
1153int lastsecondarylength;
1154
1155/* Zero if we are not in the middle of writing an entry.
1156 One if we have written the beginning of an entry but have not
1157 yet written any page numbers into it.
1158 Greater than one if we have written the beginning of an entry
1159 plus at least one page number. */
1160int pending;
1161
1162/* The initial (for sorting purposes) of the last primary entry written.
1163 When this changes, a \initial {c} line is written */
1164
1165char *lastinitial;
1166
1167int lastinitiallength;
1168
1169/* When we need a string of length 1 for the value of lastinitial,
1170 store it here. */
1171
1172char lastinitial1[2];
1173
1174/* Initialize static storage for writing an index. */
1175
1176void
1177init_index (void)
1178{
1179 pending = 0;
1180 lastinitial = lastinitial1;
1181 lastinitial1[0] = 0;
1182 lastinitial1[1] = 0;
1183 lastinitiallength = 0;
1184 lastprimarylength = 100;
1185 lastprimary = (char *) xmalloc (lastprimarylength + 1);
1186 memset (lastprimary, '\0', lastprimarylength + 1);
1187 lastsecondarylength = 100;
1188 lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1189 memset (lastsecondary, '\0', lastsecondarylength + 1);
1190}
1191
1192/* Indexify. Merge entries for the same name,
1193 insert headers for each initial character, etc. */
1194
1195void
1196indexify (char *line, FILE *ostream)
1197{
1198 char *primary, *secondary, *pagenumber;
1199 int primarylength, secondarylength = 0, pagelength;
1200 int nosecondary;
1201 int initiallength;
1202 char *initial;
1203 char initial1[2];
1204 register char *p;
1205
1206 /* First, analyze the parts of the entry fed to us this time. */
1207
1208 p = find_braced_pos (line, 0, 0, 0);
1209 if (*p == '{')
1210 {
1211 initial = p;
1212 /* Get length of inner pair of braces starting at `p',
1213 including that inner pair of braces. */
1214 initiallength = find_braced_end (p + 1) + 1 - p;
1215 }
1216 else
1217 {
1218 initial = initial1;
1219 initial1[0] = toupper (*p);
1220 initial1[1] = 0;
1221 initiallength = 1;
1222 }
1223
1224 pagenumber = find_braced_pos (line, 1, 0, 0);
1225 pagelength = find_braced_end (pagenumber) - pagenumber;
1226 if (pagelength == 0)
1227 fatal (_("No page number in %s"), line);
1228
1229 primary = find_braced_pos (line, 2, 0, 0);
1230 primarylength = find_braced_end (primary) - primary;
1231
1232 secondary = find_braced_pos (line, 3, 0, 0);
1233 nosecondary = !*secondary;
1234 if (!nosecondary)
1235 secondarylength = find_braced_end (secondary) - secondary;
1236
1237 /* If the primary is different from before, make a new primary entry. */
1238 if (strncmp (primary, lastprimary, primarylength))
1239 {
1240 /* Close off current secondary entry first, if one is open. */
1241 if (pending)
1242 {
1243 fputs ("}\n", ostream);
1244 pending = 0;
1245 }
1246
1247 /* If this primary has a different initial, include an entry for
1248 the initial. */
1249 if (need_initials &&
1250 (initiallength != lastinitiallength ||
1251 strncmp (initial, lastinitial, initiallength)))
1252 {
1253 fprintf (ostream, "\\initial {");
1254 fwrite (initial, 1, initiallength, ostream);
1255 fputs ("}\n", ostream);
1256 if (initial == initial1)
1257 {
1258 lastinitial = lastinitial1;
1259 *lastinitial1 = *initial1;
1260 }
1261 else
1262 {
1263 lastinitial = initial;
1264 }
1265 lastinitiallength = initiallength;
1266 }
1267
1268 /* Make the entry for the primary. */
1269 if (nosecondary)
1270 fputs ("\\entry {", ostream);
1271 else
1272 fputs ("\\primary {", ostream);
1273 fwrite (primary, primarylength, 1, ostream);
1274 if (nosecondary)
1275 {
1276 fputs ("}{", ostream);
1277 pending = 1;
1278 }
1279 else
1280 fputs ("}\n", ostream);
1281
1282 /* Record name of most recent primary. */
1283 if (lastprimarylength < primarylength)
1284 {
1285 lastprimarylength = primarylength + 100;
1286 lastprimary = (char *) xrealloc (lastprimary,
1287 1 + lastprimarylength);
1288 }
1289 strncpy (lastprimary, primary, primarylength);
1290 lastprimary[primarylength] = 0;
1291
1292 /* There is no current secondary within this primary, now. */
1293 lastsecondary[0] = 0;
1294 }
1295
1296 /* Should not have an entry with no subtopic following one with a
1297 subtopic. */
1298
1299 if (nosecondary && *lastsecondary)
1300 error (_("entry %s follows an entry with a secondary name"), line);
1301
1302 /* Start a new secondary entry if necessary. */
1303 if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1304 {
1305 if (pending)
1306 {
1307 fputs ("}\n", ostream);
1308 pending = 0;
1309 }
1310
1311 /* Write the entry for the secondary. */
1312 fputs ("\\secondary {", ostream);
1313 fwrite (secondary, secondarylength, 1, ostream);
1314 fputs ("}{", ostream);
1315 pending = 1;
1316
1317 /* Record name of most recent secondary. */
1318 if (lastsecondarylength < secondarylength)
1319 {
1320 lastsecondarylength = secondarylength + 100;
1321 lastsecondary = (char *) xrealloc (lastsecondary,
1322 1 + lastsecondarylength);
1323 }
1324 strncpy (lastsecondary, secondary, secondarylength);
1325 lastsecondary[secondarylength] = 0;
1326 }
1327
1328 /* Here to add one more page number to the current entry. */
1329 if (pending++ != 1)
1330 fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1331 fwrite (pagenumber, pagelength, 1, ostream);
1332}
1333
1334/* Close out any unfinished output entry. */
1335
1336void
1337finish_index (FILE *ostream)
1338{
1339 if (pending)
1340 fputs ("}\n", ostream);
1341 free (lastprimary);
1342 free (lastsecondary);
1343}
1344
1345
1346/* Copy the lines in the sorted order.
1347 Each line is copied out of the input file it was found in. */
1348
1349void
1350writelines (char **linearray, int nlines, FILE *ostream)
1351{
1352 char **stop_line = linearray + nlines;
1353 char **next_line;
1354
1355 init_index ();
1356
1357 /* Output the text of the lines, and free the buffer space. */
1358
1359 for (next_line = linearray; next_line != stop_line; next_line++)
1360 {
1361 /* If -u was specified, output the line only if distinct from
1362 previous one. */
1363 if (next_line == linearray
1364 /* Compare previous line with this one, using only the
1365 explicitly specd keyfields. */
1366 || compare_general (*(next_line - 1), *next_line, 0L, 0L,
1367 num_keyfields - 1))
1368 {
1369 char *p = *next_line;
1370 char c;
1371
1372 while ((c = *p++) && c != '\n')
1373 /* Do nothing. */ ;
1374 *(p - 1) = 0;
1375 indexify (*next_line, ostream);
1376 }
1377 }
1378
1379 finish_index (ostream);
1380}
1381
1382
1383/* Assume (and optionally verify) that each input file is sorted;
1384 merge them and output the result.
1385 Returns nonzero if any input file fails to be sorted.
1386
1387 This is the high-level interface that can handle an unlimited
1388 number of files. */
1389
1390#define MAX_DIRECT_MERGE 10
1391
1392int
1393merge_files (char **infiles, int nfiles, char *outfile)
1394{
1395 char **tempfiles;
1396 int ntemps;
1397 int i;
1398 int value = 0;
1399 int start_tempcount = tempcount;
1400
1401 if (nfiles <= MAX_DIRECT_MERGE)
1402 return merge_direct (infiles, nfiles, outfile);
1403
1404 /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1405 making a temporary file to hold each group's result. */
1406
1407 ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1408 tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1409 for (i = 0; i < ntemps; i++)
1410 {
1411 int nf = MAX_DIRECT_MERGE;
1412 if (i + 1 == ntemps)
1413 nf = nfiles - i * MAX_DIRECT_MERGE;
1414 tempfiles[i] = maketempname (++tempcount);
1415 value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1416 }
1417
1418 /* All temporary files that existed before are no longer needed
1419 since their contents have been merged into our new tempfiles.
1420 So delete them. */
1421 flush_tempfiles (start_tempcount);
1422
1423 /* Now merge the temporary files we created. */
1424
1425 merge_files (tempfiles, ntemps, outfile);
1426
1427 free (tempfiles);
1428
1429 return value;
1430}
1431
1432
1433/* Assume (and optionally verify) that each input file is sorted;
1434 merge them and output the result.
1435 Returns nonzero if any input file fails to be sorted.
1436
1437 This version of merging will not work if the number of
1438 input files gets too high. Higher level functions
1439 use it only with a bounded number of input files. */
1440
1441int
1442merge_direct (char **infiles, int nfiles, char *outfile)
1443{
1444 struct linebuffer *lb1, *lb2;
1445 struct linebuffer **thisline, **prevline;
1446 FILE **streams;
1447 int i;
1448 int nleft;
1449 int lossage = 0;
1450 int *file_lossage;
1451 struct linebuffer *prev_out = 0;
1452 FILE *ostream = stdout;
1453
1454 if (outfile)
1455 {
1456 ostream = fopen (outfile, "w");
1457 }
1458 if (!ostream)
1459 pfatal_with_name (outfile);
1460
1461 init_index ();
1462
1463 if (nfiles == 0)
1464 {
1465 if (outfile)
1466 fclose (ostream);
1467 return 0;
1468 }
1469
1470 /* For each file, make two line buffers. Also, for each file, there
1471 is an element of `thisline' which points at any time to one of the
1472 file's two buffers, and an element of `prevline' which points to
1473 the other buffer. `thisline' is supposed to point to the next
1474 available line from the file, while `prevline' holds the last file
1475 line used, which is remembered so that we can verify that the file
1476 is properly sorted. */
1477
1478 /* lb1 and lb2 contain one buffer each per file. */
1479 lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1480 lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1481
1482 /* thisline[i] points to the linebuffer holding the next available
1483 line in file i, or is zero if there are no lines left in that file. */
1484 thisline = (struct linebuffer **)
1485 xmalloc (nfiles * sizeof (struct linebuffer *));
1486 /* prevline[i] points to the linebuffer holding the last used line
1487 from file i. This is just for verifying that file i is properly
1488 sorted. */
1489 prevline = (struct linebuffer **)
1490 xmalloc (nfiles * sizeof (struct linebuffer *));
1491 /* streams[i] holds the input stream for file i. */
1492 streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1493 /* file_lossage[i] is nonzero if we already know file i is not
1494 properly sorted. */
1495 file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1496
1497 /* Allocate and initialize all that storage. */
1498
1499 for (i = 0; i < nfiles; i++)
1500 {
1501 initbuffer (&lb1[i]);
1502 initbuffer (&lb2[i]);
1503 thisline[i] = &lb1[i];
1504 prevline[i] = &lb2[i];
1505 file_lossage[i] = 0;
1506 streams[i] = fopen (infiles[i], "r");
1507 if (!streams[i])
1508 pfatal_with_name (infiles[i]);
1509
1510 readline (thisline[i], streams[i]);
1511 }
1512
1513 /* Keep count of number of files not at eof. */
1514 nleft = nfiles;
1515
1516 while (nleft)
1517 {
1518 struct linebuffer *best = 0;
1519 struct linebuffer *exch;
1520 int bestfile = -1;
1521 int i;
1522
1523 /* Look at the next avail line of each file; choose the least one. */
1524
1525 for (i = 0; i < nfiles; i++)
1526 {
1527 if (thisline[i] &&
1528 (!best ||
1529 0 < compare_general (best->buffer, thisline[i]->buffer,
1530 (long) bestfile, (long) i, num_keyfields)))
1531 {
1532 best = thisline[i];
1533 bestfile = i;
1534 }
1535 }
1536
1537 /* Output that line, unless it matches the previous one and we
1538 don't want duplicates. */
1539
1540 if (!(prev_out &&
1541 !compare_general (prev_out->buffer,
1542 best->buffer, 0L, 1L, num_keyfields - 1)))
1543 indexify (best->buffer, ostream);
1544 prev_out = best;
1545
1546 /* Now make the line the previous of its file, and fetch a new
1547 line from that file. */
1548
1549 exch = prevline[bestfile];
1550 prevline[bestfile] = thisline[bestfile];
1551 thisline[bestfile] = exch;
1552
1553 while (1)
1554 {
1555 /* If the file has no more, mark it empty. */
1556
1557 if (feof (streams[bestfile]))
1558 {
1559 thisline[bestfile] = 0;
1560 /* Update the number of files still not empty. */
1561 nleft--;
1562 break;
1563 }
1564 readline (thisline[bestfile], streams[bestfile]);
1565 if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
1566 break;
1567 }
1568 }
1569
1570 finish_index (ostream);
1571
1572 /* Free all storage and close all input streams. */
1573
1574 for (i = 0; i < nfiles; i++)
1575 {
1576 fclose (streams[i]);
1577 free (lb1[i].buffer);
1578 free (lb2[i].buffer);
1579 }
1580 free (file_lossage);
1581 free (lb1);
1582 free (lb2);
1583 free (thisline);
1584 free (prevline);
1585 free (streams);
1586
1587 if (outfile)
1588 fclose (ostream);
1589
1590 return lossage;
1591}
1592
1593
1594/* Print error message and exit. */
1595
1596void
1597fatal (const char *format, const char *arg)
1598{
1599 error (format, arg);
1600 xexit (1);
1601}
1602
1603/* Print error message. FORMAT is printf control string, ARG is arg for it. */
1604void
1605error (const char *format, const char *arg)
1606{
1607 printf ("%s: ", program_name);
1608 printf (format, arg);
1609 if (format[strlen (format) -1] != '\n')
1610 printf ("\n");
1611}
1612
1613void
1614perror_with_name (const char *name)
1615{
1616 fprintf (stderr, "%s: ", program_name);
1617 perror (name);
1618}
1619
1620void
1621pfatal_with_name (const char *name)
1622{
1623 perror_with_name (name);
1624 xexit (1);
1625}
1626
1627
1628
1629/* Return a newly-allocated string concatenating S1 and S2. */
1630
1631char *
1632concat (char *s1, char *s2)
1633{
1634 int len1 = strlen (s1), len2 = strlen (s2);
1635 char *result = (char *) xmalloc (len1 + len2 + 1);
1636
1637 strcpy (result, s1);
1638 strcpy (result + len1, s2);
1639 *(result + len1 + len2) = 0;
1640
1641 return result;
1642}
Note: See TracBrowser for help on using the repository browser.