Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

texindex.c@ 2618

Visit:

Last change on this file since 2618 was 2617, checked in by bird, 19 years ago
GNU Texinfo 4.8
File size: 42.3 KB

Line
1	/* texindex -- sort TeX index dribble output into an actual index.
2	$Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp $
3
4	Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
5	2002, 2003, 2004 Free Software Foundation, Inc.
6
7	This program is free software; you can redistribute it and/or modify
8	it under the terms of the GNU General Public License as published by
9	the Free Software Foundation; either version 2, or (at your option)
10	any later version.
11
12	This program is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	GNU General Public License for more details.
16
17	You should have received a copy of the GNU General Public License
18	along with this program; if not, write to the Free Software
19	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
20
21	#include "system.h"
22	#include <getopt.h>
23
24	static char *program_name = "texindex";
25
26	#if defined (emacs)
27	# include "../src/config.h"
28	/* Some s/os.h files redefine these. */
29	# undef read
30	# undef close
31	# undef write
32	# undef open
33	#endif
34
35	#if !defined (HAVE_MEMSET)
36	#undef memset
37	#define memset(ptr, ignore, count) bzero (ptr, count)
38	#endif
39
40	char mktemp (char );
41
42	#if !defined (SEEK_SET)
43	# define SEEK_SET 0
44	# define SEEK_CUR 1
45	# define SEEK_END 2
46	#endif /* !SEEK_SET */
47
48	struct linebuffer;
49
50	/* When sorting in core, this structure describes one line
51	and the position and length of its first keyfield. */
52	struct lineinfo
53	{
54	char text; / The actual text of the line. */
55	union {
56	char text; / The start of the key (for textual comparison). */
57	long number; /* The numeric value (for numeric comparison). */
58	} key;
59	long keylen; /* Length of KEY field. */
60	};
61
62	/* This structure describes a field to use as a sort key. */
63	struct keyfield
64	{
65	int startwords; /* Number of words to skip. */
66	int startchars; /* Number of additional chars to skip. */
67	int endwords; /* Number of words to ignore at end. */
68	int endchars; /* Ditto for characters of last word. */
69	char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
70	char fold_case; /* Non-zero means case doesn't matter. */
71	char reverse; /* Non-zero means compare in reverse order. */
72	char numeric; /* Non-zeros means field is ASCII numeric. */
73	char positional; /* Sort according to file position. */
74	char braced; /* Count balanced-braced groupings as fields. */
75	};
76
77	/* Vector of keyfields to use. */
78	struct keyfield keyfields[3];
79
80	/* Number of keyfields stored in that vector. */
81	int num_keyfields = 3;
82
83	/* Vector of input file names, terminated with a null pointer. */
84	char **infiles;
85
86	/* Vector of corresponding output file names, or NULL, meaning default it
87	(add an `s' to the end). */
88	char **outfiles;
89
90	/* Length of `infiles'. */
91	int num_infiles;
92
93	/* Pointer to the array of pointers to lines being sorted. */
94	char **linearray;
95
96	/* The allocated length of `linearray'. */
97	long nlines;
98
99	/* Directory to use for temporary files. On Unix, it ends with a slash. */
100	char *tempdir;
101
102	/* Number of last temporary file. */
103	int tempcount;
104
105	/* Number of last temporary file already deleted.
106	Temporary files are deleted by `flush_tempfiles' in order of creation. */
107	int last_deleted_tempcount;
108
109	/* During in-core sort, this points to the base of the data block
110	which contains all the lines of data. */
111	char *text_base;
112
113	/* Initially 0; changed to 1 if we want initials in this index. */
114	int need_initials;
115
116	/* Remembers the first initial letter seen in this index, so we can
117	determine whether we need initials in the sorted form. */
118	char first_initial;
119
120	/* Additional command switches .*/
121
122	/* Nonzero means do not delete tempfiles -- for debugging. */
123	int keep_tempfiles;
124
125	/* Forward declarations of functions in this file. */
126	void decode_command (int argc, char **argv);
127	void sort_in_core (char infile, int total, char outfile);
128	void sort_offline (char infile, off_t total, char outfile);
129	char *parsefile (char filename, char *nextline, char data, long int size);
130	char find_field (struct keyfield keyfield, char str, long int lengthptr);
131	char find_pos (char str, int words, int chars, int ignore_blanks);
132	long find_value (char *start, long int length);
133	char find_braced_pos (char str, int words, int chars, int ignore_blanks);
134	char find_braced_end (char str);
135	void writelines (char *linearray, int nlines, FILE ostream);
136	int compare_field (struct keyfield keyfield, char start1,
137	long int length1, long int pos1, char *start2,
138	long int length2, long int pos2);
139	int compare_full (const void , const void );
140	long readline (struct linebuffer linebuffer, FILE stream);
141	int merge_files (char *infiles, int nfiles, char outfile);
142	int merge_direct (char *infiles, int nfiles, char outfile);
143	void pfatal_with_name (const char *name);
144	void fatal (const char format, const char arg);
145	void error (const char format, const char arg);
146	void xmalloc (), xrealloc ();
147	char concat (char s1, char *s2);
148	void flush_tempfiles (int to_count);
149
150
151	#define MAX_IN_CORE_SORT 500000
152
153	int
154	main (int argc, char **argv)
155	{
156	int i;
157
158	tempcount = 0;
159	last_deleted_tempcount = 0;
160
161	#ifdef HAVE_SETLOCALE
162	/* Set locale via LC_ALL. */
163	setlocale (LC_ALL, "");
164	#endif
165
166	/* Set the text message domain. */
167	bindtextdomain (PACKAGE, LOCALEDIR);
168	textdomain (PACKAGE);
169
170	/* In case we write to a redirected stdout that fails. */
171	/* not ready atexit (close_stdout); */
172
173	/* Describe the kind of sorting to do. */
174	/* The first keyfield uses the first braced field and folds case. */
175	keyfields[0].braced = 1;
176	keyfields[0].fold_case = 1;
177	keyfields[0].endwords = -1;
178	keyfields[0].endchars = -1;
179
180	/* The second keyfield uses the second braced field, numerically. */
181	keyfields[1].braced = 1;
182	keyfields[1].numeric = 1;
183	keyfields[1].startwords = 1;
184	keyfields[1].endwords = -1;
185	keyfields[1].endchars = -1;
186
187	/* The third keyfield (which is ignored while discarding duplicates)
188	compares the whole line. */
189	keyfields[2].endwords = -1;
190	keyfields[2].endchars = -1;
191
192	decode_command (argc, argv);
193
194	/* Process input files completely, one by one. */
195
196	for (i = 0; i < num_infiles; i++)
197	{
198	int desc;
199	off_t ptr;
200	char *outfile;
201	struct stat instat;
202
203	desc = open (infiles[i], O_RDONLY, 0);
204	if (desc < 0)
205	pfatal_with_name (infiles[i]);
206
207	if (stat (infiles[i], &instat))
208	pfatal_with_name (infiles[i]);
209	if (S_ISDIR (instat.st_mode))
210	{
211	#ifdef EISDIR
212	errno = EISDIR;
213	#endif
214	pfatal_with_name (infiles[i]);
215	}
216
217	lseek (desc, (off_t) 0, SEEK_END);
218	ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
219
220	close (desc);
221
222	outfile = outfiles[i];
223	if (!outfile)
224	outfile = concat (infiles[i], "s");
225
226	need_initials = 0;
227	first_initial = '\0';
228
229	if (ptr < MAX_IN_CORE_SORT)
230	/* Sort a small amount of data. */
231	sort_in_core (infiles[i], (int)ptr, outfile);
232	else
233	sort_offline (infiles[i], ptr, outfile);
234	}
235
236	flush_tempfiles (tempcount);
237	xexit (0);
238	return 0; /* Avoid bogus warnings. */
239	}
240
241
242	typedef struct
243	{
244	char *long_name;
245	char *short_name;
246	int *variable_ref;
247	int variable_value;
248	char *arg_name;
249	char *doc_string;
250	} TEXINDEX_OPTION;
251
252	TEXINDEX_OPTION texindex_options[] = {
253	{ "--help", "-h", (int )NULL, 0, (char )NULL,
254	N_("display this help and exit") },
255	{ "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
256	N_("keep temporary files around after processing") },
257	{ "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
258	N_("do not keep temporary files around after processing (default)") },
259	{ "--output", "-o", (int *)NULL, 0, "FILE",
260	N_("send output to FILE") },
261	{ "--version", (char )NULL, (int )NULL, 0, (char *)NULL,
262	N_("display version information and exit") },
263	{ (char )NULL, (char )NULL, (int )NULL, 0, (char )NULL }
264	};
265
266	void
267	usage (int result_value)
268	{
269	register int i;
270	FILE *f = result_value ? stderr : stdout;
271
272	fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
273	fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
274	/* Avoid trigraph nonsense. */
275	fprintf (f,
276	_("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
277	'?', '?'); /* avoid trigraph in cat-id-tbl.c */
278	fprintf (f, _("\nOptions:\n"));
279
280	for (i = 0; texindex_options[i].long_name; i++)
281	{
282	putc (' ', f);
283
284	if (texindex_options[i].short_name)
285	fprintf (f, "%s, ", texindex_options[i].short_name);
286
287	fprintf (f, "%s %s",
288	texindex_options[i].long_name,
289	texindex_options[i].arg_name
290	? texindex_options[i].arg_name : "");
291
292	fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
293	}
294	fputs (_("\n\
295	Email bug reports to bug-texinfo@gnu.org,\n\
296	general questions and discussion to help-texinfo@gnu.org.\n\
297	Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
298	fputs ("\n", f);
299
300	xexit (result_value);
301	}
302
303	/* Decode the command line arguments to set the parameter variables
304	and set up the vector of keyfields and the vector of input files. */
305
306	void
307	decode_command (int argc, char **argv)
308	{
309	int arg_index = 1;
310	char **ip;
311	char **op;
312
313	/* Store default values into parameter variables. */
314
315	tempdir = getenv ("TMPDIR");
316	if (tempdir == NULL)
317	tempdir = getenv ("TEMP");
318	if (tempdir == NULL)
319	tempdir = getenv ("TMP");
320	if (tempdir == NULL)
321	tempdir = DEFAULT_TMPDIR;
322	else
323	tempdir = concat (tempdir, "/");
324
325	keep_tempfiles = 0;
326
327	/* Allocate ARGC input files, which must be enough. */
328
329	infiles = (char *) xmalloc (argc sizeof (char *));
330	outfiles = (char *) xmalloc (argc sizeof (char *));
331	ip = infiles;
332	op = outfiles;
333
334	while (arg_index < argc)
335	{
336	char *arg = argv[arg_index++];
337
338	if (*arg == '-')
339	{
340	if (strcmp (arg, "--version") == 0)
341	{
342	printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
343	puts ("");
344	puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
345	printf (_("There is NO warranty. You may redistribute this software\n\
346	under the terms of the GNU General Public License.\n\
347	For more information about these matters, see the files named COPYING.\n"));
348	xexit (0);
349	}
350	else if ((strcmp (arg, "--keep") == 0) \|\|
351	(strcmp (arg, "-k") == 0))
352	{
353	keep_tempfiles = 1;
354	}
355	else if ((strcmp (arg, "--help") == 0) \|\|
356	(strcmp (arg, "-h") == 0))
357	{
358	usage (0);
359	}
360	else if ((strcmp (arg, "--output") == 0) \|\|
361	(strcmp (arg, "-o") == 0))
362	{
363	if (argv[arg_index] != (char *)NULL)
364	{
365	arg_index++;
366	if (op > outfiles)
367	*(op - 1) = argv[arg_index];
368	}
369	else
370	usage (1);
371	}
372	else
373	usage (1);
374	}
375	else
376	{
377	*ip++ = arg;
378	op++ = (char )NULL;
379	}
380	}
381
382	/* Record number of keyfields and terminate list of filenames. */
383	num_infiles = ip - infiles;
384	ip = (char )NULL;
385	if (num_infiles == 0)
386	usage (1);
387	}
388
389
390	/* Return a name for temporary file COUNT. */
391
392	static char *
393	maketempname (int count)
394	{
395	static char *tempbase = NULL;
396	char tempsuffix[10];
397
398	if (!tempbase)
399	{
400	int fd;
401	tempbase = concat (tempdir, "txidxXXXXXX");
402
403	fd = mkstemp (tempbase);
404	if (fd == -1)
405	pfatal_with_name (tempbase);
406	}
407
408	sprintf (tempsuffix, ".%d", count);
409	return concat (tempbase, tempsuffix);
410	}
411
412
413	/* Delete all temporary files up to TO_COUNT. */
414
415	void
416	flush_tempfiles (int to_count)
417	{
418	if (keep_tempfiles)
419	return;
420	while (last_deleted_tempcount < to_count)
421	unlink (maketempname (++last_deleted_tempcount));
422	}
423
424
425
426	/* Compare LINE1 and LINE2 according to the specified set of keyfields. */
427
428	int
429	compare_full (const void p1, const void p2)
430	{
431	char line1 = (char ) p1;
432	char line2 = (char ) p2;
433	int i;
434
435	/* Compare using the first keyfield;
436	if that does not distinguish the lines, try the second keyfield;
437	and so on. */
438
439	for (i = 0; i < num_keyfields; i++)
440	{
441	long length1, length2;
442	char start1 = find_field (&keyfields[i], line1, &length1);
443	char start2 = find_field (&keyfields[i], line2, &length2);
444	int tem = compare_field (&keyfields[i], start1, length1,
445	*line1 - text_base,
446	start2, length2, *line2 - text_base);
447	if (tem)
448	{
449	if (keyfields[i].reverse)
450	return -tem;
451	return tem;
452	}
453	}
454
455	return 0; /* Lines match exactly. */
456	}
457
458	/* Compare LINE1 and LINE2, described by structures
459	in which the first keyfield is identified in advance.
460	For positional sorting, assumes that the order of the lines in core
461	reflects their nominal order. */
462	int
463	compare_prepared (const void p1, const void p2)
464	{
465	struct lineinfo line1 = (struct lineinfo ) p1;
466	struct lineinfo line2 = (struct lineinfo ) p2;
467	int i;
468	int tem;
469	char text1, text2;
470
471	/* Compare using the first keyfield, which has been found for us already. */
472	if (keyfields->positional)
473	{
474	if (line1->text - text_base > line2->text - text_base)
475	tem = 1;
476	else
477	tem = -1;
478	}
479	else if (keyfields->numeric)
480	tem = line1->key.number - line2->key.number;
481	else
482	tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
483	line2->key.text, line2->keylen, 0);
484	if (tem)
485	{
486	if (keyfields->reverse)
487	return -tem;
488	return tem;
489	}
490
491	text1 = line1->text;
492	text2 = line2->text;
493
494	/* Compare using the second keyfield;
495	if that does not distinguish the lines, try the third keyfield;
496	and so on. */
497
498	for (i = 1; i < num_keyfields; i++)
499	{
500	long length1, length2;
501	char *start1 = find_field (&keyfields[i], text1, &length1);
502	char *start2 = find_field (&keyfields[i], text2, &length2);
503	int tem = compare_field (&keyfields[i], start1, length1,
504	text1 - text_base,
505	start2, length2, text2 - text_base);
506	if (tem)
507	{
508	if (keyfields[i].reverse)
509	return -tem;
510	return tem;
511	}
512	}
513
514	return 0; /* Lines match exactly. */
515	}
516
517	/* Like compare_full but more general.
518	You can pass any strings, and you can say how many keyfields to use.
519	POS1 and POS2 should indicate the nominal positional ordering of
520	the two lines in the input. */
521
522	int
523	compare_general (char str1, char str2, long int pos1, long int pos2, int use_keyfields)
524	{
525	int i;
526
527	/* Compare using the first keyfield;
528	if that does not distinguish the lines, try the second keyfield;
529	and so on. */
530
531	for (i = 0; i < use_keyfields; i++)
532	{
533	long length1, length2;
534	char *start1 = find_field (&keyfields[i], str1, &length1);
535	char *start2 = find_field (&keyfields[i], str2, &length2);
536	int tem = compare_field (&keyfields[i], start1, length1, pos1,
537	start2, length2, pos2);
538	if (tem)
539	{
540	if (keyfields[i].reverse)
541	return -tem;
542	return tem;
543	}
544	}
545
546	return 0; /* Lines match exactly. */
547	}
548
549	/* Find the start and length of a field in STR according to KEYFIELD.
550	A pointer to the starting character is returned, and the length
551	is stored into the int that LENGTHPTR points to. */
552
553	char *
554	find_field (struct keyfield keyfield, char str, long int *lengthptr)
555	{
556	char *start;
557	char *end;
558	char (fun) ();
559
560	if (keyfield->braced)
561	fun = find_braced_pos;
562	else
563	fun = find_pos;
564
565	start = (*fun) (str, keyfield->startwords, keyfield->startchars,
566	keyfield->ignore_blanks);
567	if (keyfield->endwords < 0)
568	{
569	if (keyfield->braced)
570	end = find_braced_end (start);
571	else
572	{
573	end = start;
574	while (end && end != '\n')
575	end++;
576	}
577	}
578	else
579	{
580	end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
581	if (end - str < start - str)
582	end = start;
583	}
584	*lengthptr = end - start;
585	return start;
586	}
587
588	/* Return a pointer to a specified place within STR,
589	skipping (from the beginning) WORDS words and then CHARS chars.
590	If IGNORE_BLANKS is nonzero, we skip all blanks
591	after finding the specified word. */
592
593	char *
594	find_pos (char *str, int words, int chars, int ignore_blanks)
595	{
596	int i;
597	char *p = str;
598
599	for (i = 0; i < words; i++)
600	{
601	char c;
602	/* Find next bunch of nonblanks and skip them. */
603	while ((c = *p) == ' ' \|\| c == '\t')
604	p++;
605	while ((c = *p) && c != '\n' && !(c == ' ' \|\| c == '\t'))
606	p++;
607	if (!p \|\| p == '\n')
608	return p;
609	}
610
611	while (p == ' ' \|\| p == '\t')
612	p++;
613
614	for (i = 0; i < chars; i++)
615	{
616	if (!p \|\| p == '\n')
617	break;
618	p++;
619	}
620	return p;
621	}
622
623	/* Like find_pos but assumes that each field is surrounded by braces
624	and that braces within fields are balanced. */
625
626	char *
627	find_braced_pos (char *str, int words, int chars, int ignore_blanks)
628	{
629	int i;
630	int bracelevel;
631	char *p = str;
632	char c;
633
634	for (i = 0; i < words; i++)
635	{
636	bracelevel = 1;
637	while ((c = *p++) != '{' && c != '\n' && c)
638	/* Do nothing. */ ;
639	if (c != '{')
640	return p - 1;
641	while (bracelevel)
642	{
643	c = *p++;
644	if (c == '{')
645	bracelevel++;
646	if (c == '}')
647	bracelevel--;
648	if (c == 0 \|\| c == '\n')
649	return p - 1;
650	}
651	}
652
653	while ((c = *p++) != '{' && c != '\n' && c)
654	/* Do nothing. */ ;
655
656	if (c != '{')
657	return p - 1;
658
659	if (ignore_blanks)
660	while ((c = *p) == ' ' \|\| c == '\t')
661	p++;
662
663	for (i = 0; i < chars; i++)
664	{
665	if (!p \|\| p == '\n')
666	break;
667	p++;
668	}
669	return p;
670	}
671
672	/* Find the end of the balanced-brace field which starts at STR.
673	The position returned is just before the closing brace. */
674
675	char *
676	find_braced_end (char *str)
677	{
678	int bracelevel;
679	char *p = str;
680	char c;
681
682	bracelevel = 1;
683	while (bracelevel)
684	{
685	c = *p++;
686	if (c == '{')
687	bracelevel++;
688	if (c == '}')
689	bracelevel--;
690	if (c == 0 \|\| c == '\n')
691	return p - 1;
692	}
693	return p - 1;
694	}
695
696	long
697	find_value (char *start, long int length)
698	{
699	while (length != 0L)
700	{
701	if (isdigit (*start))
702	return atol (start);
703	length--;
704	start++;
705	}
706	return 0l;
707	}
708
709	/* Vector used to translate characters for comparison.
710	This is how we make all alphanumerics follow all else,
711	and ignore case in the first sorting. */
712	int char_order[256];
713
714	void
715	init_char_order (void)
716	{
717	int i;
718	for (i = 1; i < 256; i++)
719	char_order[i] = i;
720
721	for (i = '0'; i <= '9'; i++)
722	char_order[i] += 512;
723
724	for (i = 'a'; i <= 'z'; i++)
725	{
726	char_order[i] = 512 + i;
727	char_order[i + 'A' - 'a'] = 512 + i;
728	}
729	}
730
731	/* Compare two fields (each specified as a start pointer and a character count)
732	according to KEYFIELD.
733	The sign of the value reports the relation between the fields. */
734
735	int
736	compare_field (struct keyfield keyfield, char start1, long int length1,
737	long int pos1, char *start2, long int length2, long int pos2)
738	{
739	if (keyfields->positional)
740	{
741	if (pos1 > pos2)
742	return 1;
743	else
744	return -1;
745	}
746	if (keyfield->numeric)
747	{
748	long value = find_value (start1, length1) - find_value (start2, length2);
749	if (value > 0)
750	return 1;
751	if (value < 0)
752	return -1;
753	return 0;
754	}
755	else
756	{
757	char *p1 = start1;
758	char *p2 = start2;
759	char *e1 = start1 + length1;
760	char *e2 = start2 + length2;
761
762	while (1)
763	{
764	int c1, c2;
765
766	if (p1 == e1)
767	c1 = 0;
768	else
769	c1 = *p1++;
770	if (p2 == e2)
771	c2 = 0;
772	else
773	c2 = *p2++;
774
775	if (char_order[c1] != char_order[c2])
776	return char_order[c1] - char_order[c2];
777	if (!c1)
778	break;
779	}
780
781	/* Strings are equal except possibly for case. */
782	p1 = start1;
783	p2 = start2;
784	while (1)
785	{
786	int c1, c2;
787
788	if (p1 == e1)
789	c1 = 0;
790	else
791	c1 = *p1++;
792	if (p2 == e2)
793	c2 = 0;
794	else
795	c2 = *p2++;
796
797	if (c1 != c2)
798	/* Reverse sign here so upper case comes out last. */
799	return c2 - c1;
800	if (!c1)
801	break;
802	}
803
804	return 0;
805	}
806	}
807
808
809	/* A `struct linebuffer' is a structure which holds a line of text.
810	`readline' reads a line from a stream into a linebuffer
811	and works regardless of the length of the line. */
812
813	struct linebuffer
814	{
815	long size;
816	char *buffer;
817	};
818
819	/* Initialize LINEBUFFER for use. */
820
821	void
822	initbuffer (struct linebuffer *linebuffer)
823	{
824	linebuffer->size = 200;
825	linebuffer->buffer = (char *) xmalloc (200);
826	}
827
828	/* Read a line of text from STREAM into LINEBUFFER.
829	Return the length of the line. */
830
831	long
832	readline (struct linebuffer linebuffer, FILE stream)
833	{
834	char *buffer = linebuffer->buffer;
835	char *p = linebuffer->buffer;
836	char *end = p + linebuffer->size;
837
838	while (1)
839	{
840	int c = getc (stream);
841	if (p == end)
842	{
843	buffer = (char ) xrealloc (buffer, linebuffer->size = 2);
844	p += buffer - linebuffer->buffer;
845	end += buffer - linebuffer->buffer;
846	linebuffer->buffer = buffer;
847	}
848	if (c < 0 \|\| c == '\n')
849	{
850	*p = 0;
851	break;
852	}
853	*p++ = c;
854	}
855
856	return p - buffer;
857	}
858
859
860	/* Sort an input file too big to sort in core. */
861
862	void
863	sort_offline (char infile, off_t total, char outfile)
864	{
865	/* More than enough. */
866	int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
867	char tempfiles = (char ) xmalloc (ntemps * sizeof (char *));
868	FILE *istream = fopen (infile, "r");
869	int i;
870	struct linebuffer lb;
871	long linelength;
872	int failure = 0;
873
874	initbuffer (&lb);
875
876	/* Read in one line of input data. */
877
878	linelength = readline (&lb, istream);
879
880	if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
881	{
882	error (_("%s: not a texinfo index file"), infile);
883	return;
884	}
885
886	/* Split up the input into `ntemps' temporary files, or maybe fewer,
887	and put the new files' names into `tempfiles' */
888
889	for (i = 0; i < ntemps; i++)
890	{
891	char *outname = maketempname (++tempcount);
892	FILE *ostream = fopen (outname, "w");
893	long tempsize = 0;
894
895	if (!ostream)
896	pfatal_with_name (outname);
897	tempfiles[i] = outname;
898
899	/* Copy lines into this temp file as long as it does not make file
900	"too big" or until there are no more lines. */
901
902	while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
903	{
904	tempsize += linelength + 1;
905	fputs (lb.buffer, ostream);
906	putc ('\n', ostream);
907
908	/* Read another line of input data. */
909
910	linelength = readline (&lb, istream);
911	if (!linelength && feof (istream))
912	break;
913
914	if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
915	{
916	error (_("%s: not a texinfo index file"), infile);
917	failure = 1;
918	goto fail;
919	}
920	}
921	fclose (ostream);
922	if (feof (istream))
923	break;
924	}
925
926	free (lb.buffer);
927
928	fail:
929	/* Record number of temp files we actually needed. */
930
931	ntemps = i;
932
933	/* Sort each tempfile into another tempfile.
934	Delete the first set of tempfiles and put the names of the second
935	into `tempfiles'. */
936
937	for (i = 0; i < ntemps; i++)
938	{
939	char *newtemp = maketempname (++tempcount);
940	sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
941	if (!keep_tempfiles)
942	unlink (tempfiles[i]);
943	tempfiles[i] = newtemp;
944	}
945
946	if (failure)
947	return;
948
949	/* Merge the tempfiles together and indexify. */
950
951	merge_files (tempfiles, ntemps, outfile);
952	}
953
954
955	/* Sort INFILE, whose size is TOTAL,
956	assuming that is small enough to be done in-core,
957	then indexify it and send the output to OUTFILE (or to stdout). */
958
959	void
960	sort_in_core (char infile, int total, char outfile)
961	{
962	char **nextline;
963	char data = (char ) xmalloc (total + 1);
964	char *file_data;
965	long file_size;
966	int i;
967	FILE *ostream = stdout;
968	struct lineinfo *lineinfo;
969
970	/* Read the contents of the file into the moby array `data'. */
971
972	int desc = open (infile, O_RDONLY, 0);
973
974	if (desc < 0)
975	fatal (_("failure reopening %s"), infile);
976	for (file_size = 0;;)
977	{
978	i = read (desc, data + file_size, total - file_size);
979	if (i <= 0)
980	break;
981	file_size += i;
982	}
983	file_data = data;
984	data[file_size] = 0;
985
986	close (desc);
987
988	if (file_size > 0 && data[0] != '\\' && data[0] != '@')
989	{
990	error (_("%s: not a texinfo index file"), infile);
991	return;
992	}
993
994	init_char_order ();
995
996	/* Sort routines want to know this address. */
997
998	text_base = data;
999
1000	/* Create the array of pointers to lines, with a default size
1001	frequently enough. */
1002
1003	nlines = total / 50;
1004	if (!nlines)
1005	nlines = 2;
1006	linearray = (char *) xmalloc (nlines sizeof (char *));
1007
1008	/* `nextline' points to the next free slot in this array.
1009	`nlines' is the allocated size. */
1010
1011	nextline = linearray;
1012
1013	/* Parse the input file's data, and make entries for the lines. */
1014
1015	nextline = parsefile (infile, nextline, file_data, file_size);
1016	if (nextline == 0)
1017	{
1018	error (_("%s: not a texinfo index file"), infile);
1019	return;
1020	}
1021
1022	/* Sort the lines. */
1023
1024	/* If we have enough space, find the first keyfield of each line in advance.
1025	Make a `struct lineinfo' for each line, which records the keyfield
1026	as well as the line, and sort them. */
1027
1028	lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
1029
1030	if (lineinfo)
1031	{
1032	struct lineinfo *lp;
1033	char **p;
1034
1035	for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1036	{
1037	lp->text = *p;
1038	lp->key.text = find_field (keyfields, *p, &lp->keylen);
1039	if (keyfields->numeric)
1040	lp->key.number = find_value (lp->key.text, lp->keylen);
1041	}
1042
1043	qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
1044	compare_prepared);
1045
1046	for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1047	*p = lp->text;
1048
1049	free (lineinfo);
1050	}
1051	else
1052	qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
1053
1054	/* Open the output file. */
1055
1056	if (outfile)
1057	{
1058	ostream = fopen (outfile, "w");
1059	if (!ostream)
1060	pfatal_with_name (outfile);
1061	}
1062
1063	writelines (linearray, nextline - linearray, ostream);
1064	if (outfile)
1065	fclose (ostream);
1066
1067	free (linearray);
1068	free (data);
1069	}
1070
1071
1072	/* Parse an input string in core into lines.
1073	DATA is the input string, and SIZE is its length.
1074	Data goes in LINEARRAY starting at NEXTLINE.
1075	The value returned is the first entry in LINEARRAY still unused.
1076	Value 0 means input file contents are invalid. */
1077
1078	char **
1079	parsefile (char filename, char nextline, char data, long int size)
1080	{
1081	char p, end;
1082	char **line = nextline;
1083
1084	p = data;
1085	end = p + size;
1086	*end = 0;
1087
1088	while (p != end)
1089	{
1090	if (p[0] != '\\' && p[0] != '@')
1091	return 0;
1092
1093	*line = p;
1094
1095	/* Find the first letter of the first field of this line. If it
1096	is different from the first letter of the first field of the
1097	first line, we need initial headers in the output index. */
1098	while (p && p != '{')
1099	p++;
1100	if (p == end)
1101	return 0;
1102	p++;
1103	if (first_initial)
1104	{
1105	if (first_initial != toupper (*p))
1106	need_initials = 1;
1107	}
1108	else
1109	first_initial = toupper (*p);
1110
1111	while (p && p != '\n')
1112	p++;
1113	if (p != end)
1114	p++;
1115
1116	line++;
1117	if (line == linearray + nlines)
1118	{
1119	char **old = linearray;
1120	linearray = xrealloc (linearray, sizeof (char ) (nlines *= 4));
1121	line += linearray - old;
1122	}
1123	}
1124
1125	return line;
1126	}
1127
1128
1129	/* Indexification is a filter applied to the sorted lines
1130	as they are being written to the output file.
1131	Multiple entries for the same name, with different page numbers,
1132	get combined into a single entry with multiple page numbers.
1133	The first braced field, which is used for sorting, is discarded.
1134	However, its first character is examined, folded to lower case,
1135	and if it is different from that in the previous line fed to us
1136	a \initial line is written with one argument, the new initial.
1137
1138	If an entry has four braced fields, then the second and third
1139	constitute primary and secondary names.
1140	In this case, each change of primary name
1141	generates a \primary line which contains only the primary name,
1142	and in between these are \secondary lines which contain
1143	just a secondary name and page numbers. */
1144
1145	/* The last primary name we wrote a \primary entry for.
1146	If only one level of indexing is being done, this is the last name seen. */
1147	char *lastprimary;
1148	/* Length of storage allocated for lastprimary. */
1149	int lastprimarylength;
1150
1151	/* Similar, for the secondary name. */
1152	char *lastsecondary;
1153	int lastsecondarylength;
1154
1155	/* Zero if we are not in the middle of writing an entry.
1156	One if we have written the beginning of an entry but have not
1157	yet written any page numbers into it.
1158	Greater than one if we have written the beginning of an entry
1159	plus at least one page number. */
1160	int pending;
1161
1162	/* The initial (for sorting purposes) of the last primary entry written.
1163	When this changes, a \initial {c} line is written */
1164
1165	char *lastinitial;
1166
1167	int lastinitiallength;
1168
1169	/* When we need a string of length 1 for the value of lastinitial,
1170	store it here. */
1171
1172	char lastinitial1[2];
1173
1174	/* Initialize static storage for writing an index. */
1175
1176	void
1177	init_index (void)
1178	{
1179	pending = 0;
1180	lastinitial = lastinitial1;
1181	lastinitial1[0] = 0;
1182	lastinitial1[1] = 0;
1183	lastinitiallength = 0;
1184	lastprimarylength = 100;
1185	lastprimary = (char *) xmalloc (lastprimarylength + 1);
1186	memset (lastprimary, '\0', lastprimarylength + 1);
1187	lastsecondarylength = 100;
1188	lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1189	memset (lastsecondary, '\0', lastsecondarylength + 1);
1190	}
1191
1192	/* Indexify. Merge entries for the same name,
1193	insert headers for each initial character, etc. */
1194
1195	void
1196	indexify (char line, FILE ostream)
1197	{
1198	char primary, secondary, *pagenumber;
1199	int primarylength, secondarylength = 0, pagelength;
1200	int nosecondary;
1201	int initiallength;
1202	char *initial;
1203	char initial1[2];
1204	register char *p;
1205
1206	/* First, analyze the parts of the entry fed to us this time. */
1207
1208	p = find_braced_pos (line, 0, 0, 0);
1209	if (*p == '{')
1210	{
1211	initial = p;
1212	/* Get length of inner pair of braces starting at `p',
1213	including that inner pair of braces. */
1214	initiallength = find_braced_end (p + 1) + 1 - p;
1215	}
1216	else
1217	{
1218	initial = initial1;
1219	initial1[0] = toupper (*p);
1220	initial1[1] = 0;
1221	initiallength = 1;
1222	}
1223
1224	pagenumber = find_braced_pos (line, 1, 0, 0);
1225	pagelength = find_braced_end (pagenumber) - pagenumber;
1226	if (pagelength == 0)
1227	fatal (_("No page number in %s"), line);
1228
1229	primary = find_braced_pos (line, 2, 0, 0);
1230	primarylength = find_braced_end (primary) - primary;
1231
1232	secondary = find_braced_pos (line, 3, 0, 0);
1233	nosecondary = !*secondary;
1234	if (!nosecondary)
1235	secondarylength = find_braced_end (secondary) - secondary;
1236
1237	/* If the primary is different from before, make a new primary entry. */
1238	if (strncmp (primary, lastprimary, primarylength))
1239	{
1240	/* Close off current secondary entry first, if one is open. */
1241	if (pending)
1242	{
1243	fputs ("}\n", ostream);
1244	pending = 0;
1245	}
1246
1247	/* If this primary has a different initial, include an entry for
1248	the initial. */
1249	if (need_initials &&
1250	(initiallength != lastinitiallength \|\|
1251	strncmp (initial, lastinitial, initiallength)))
1252	{
1253	fprintf (ostream, "\\initial {");
1254	fwrite (initial, 1, initiallength, ostream);
1255	fputs ("}\n", ostream);
1256	if (initial == initial1)
1257	{
1258	lastinitial = lastinitial1;
1259	lastinitial1 = initial1;
1260	}
1261	else
1262	{
1263	lastinitial = initial;
1264	}
1265	lastinitiallength = initiallength;
1266	}
1267
1268	/* Make the entry for the primary. */
1269	if (nosecondary)
1270	fputs ("\\entry {", ostream);
1271	else
1272	fputs ("\\primary {", ostream);
1273	fwrite (primary, primarylength, 1, ostream);
1274	if (nosecondary)
1275	{
1276	fputs ("}{", ostream);
1277	pending = 1;
1278	}
1279	else
1280	fputs ("}\n", ostream);
1281
1282	/* Record name of most recent primary. */
1283	if (lastprimarylength < primarylength)
1284	{
1285	lastprimarylength = primarylength + 100;
1286	lastprimary = (char *) xrealloc (lastprimary,
1287	1 + lastprimarylength);
1288	}
1289	strncpy (lastprimary, primary, primarylength);
1290	lastprimary[primarylength] = 0;
1291
1292	/* There is no current secondary within this primary, now. */
1293	lastsecondary[0] = 0;
1294	}
1295
1296	/* Should not have an entry with no subtopic following one with a
1297	subtopic. */
1298
1299	if (nosecondary && *lastsecondary)
1300	error (_("entry %s follows an entry with a secondary name"), line);
1301
1302	/* Start a new secondary entry if necessary. */
1303	if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1304	{
1305	if (pending)
1306	{
1307	fputs ("}\n", ostream);
1308	pending = 0;
1309	}
1310
1311	/* Write the entry for the secondary. */
1312	fputs ("\\secondary {", ostream);
1313	fwrite (secondary, secondarylength, 1, ostream);
1314	fputs ("}{", ostream);
1315	pending = 1;
1316
1317	/* Record name of most recent secondary. */
1318	if (lastsecondarylength < secondarylength)
1319	{
1320	lastsecondarylength = secondarylength + 100;
1321	lastsecondary = (char *) xrealloc (lastsecondary,
1322	1 + lastsecondarylength);
1323	}
1324	strncpy (lastsecondary, secondary, secondarylength);
1325	lastsecondary[secondarylength] = 0;
1326	}
1327
1328	/* Here to add one more page number to the current entry. */
1329	if (pending++ != 1)
1330	fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1331	fwrite (pagenumber, pagelength, 1, ostream);
1332	}
1333
1334	/* Close out any unfinished output entry. */
1335
1336	void
1337	finish_index (FILE *ostream)
1338	{
1339	if (pending)
1340	fputs ("}\n", ostream);
1341	free (lastprimary);
1342	free (lastsecondary);
1343	}
1344
1345
1346	/* Copy the lines in the sorted order.
1347	Each line is copied out of the input file it was found in. */
1348
1349	void
1350	writelines (char *linearray, int nlines, FILE ostream)
1351	{
1352	char **stop_line = linearray + nlines;
1353	char **next_line;
1354
1355	init_index ();
1356
1357	/* Output the text of the lines, and free the buffer space. */
1358
1359	for (next_line = linearray; next_line != stop_line; next_line++)
1360	{
1361	/* If -u was specified, output the line only if distinct from
1362	previous one. */
1363	if (next_line == linearray
1364	/* Compare previous line with this one, using only the
1365	explicitly specd keyfields. */
1366	\|\| compare_general ((next_line - 1), next_line, 0L, 0L,
1367	num_keyfields - 1))
1368	{
1369	char p = next_line;
1370	char c;
1371
1372	while ((c = *p++) && c != '\n')
1373	/* Do nothing. */ ;
1374	*(p - 1) = 0;
1375	indexify (*next_line, ostream);
1376	}
1377	}
1378
1379	finish_index (ostream);
1380	}
1381
1382
1383	/* Assume (and optionally verify) that each input file is sorted;
1384	merge them and output the result.
1385	Returns nonzero if any input file fails to be sorted.
1386
1387	This is the high-level interface that can handle an unlimited
1388	number of files. */
1389
1390	#define MAX_DIRECT_MERGE 10
1391
1392	int
1393	merge_files (char *infiles, int nfiles, char outfile)
1394	{
1395	char **tempfiles;
1396	int ntemps;
1397	int i;
1398	int value = 0;
1399	int start_tempcount = tempcount;
1400
1401	if (nfiles <= MAX_DIRECT_MERGE)
1402	return merge_direct (infiles, nfiles, outfile);
1403
1404	/* Merge groups of MAX_DIRECT_MERGE input files at a time,
1405	making a temporary file to hold each group's result. */
1406
1407	ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1408	tempfiles = (char *) xmalloc (ntemps sizeof (char *));
1409	for (i = 0; i < ntemps; i++)
1410	{
1411	int nf = MAX_DIRECT_MERGE;
1412	if (i + 1 == ntemps)
1413	nf = nfiles - i * MAX_DIRECT_MERGE;
1414	tempfiles[i] = maketempname (++tempcount);
1415	value \|= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1416	}
1417
1418	/* All temporary files that existed before are no longer needed
1419	since their contents have been merged into our new tempfiles.
1420	So delete them. */
1421	flush_tempfiles (start_tempcount);
1422
1423	/* Now merge the temporary files we created. */
1424
1425	merge_files (tempfiles, ntemps, outfile);
1426
1427	free (tempfiles);
1428
1429	return value;
1430	}
1431
1432
1433	/* Assume (and optionally verify) that each input file is sorted;
1434	merge them and output the result.
1435	Returns nonzero if any input file fails to be sorted.
1436
1437	This version of merging will not work if the number of
1438	input files gets too high. Higher level functions
1439	use it only with a bounded number of input files. */
1440
1441	int
1442	merge_direct (char *infiles, int nfiles, char outfile)
1443	{
1444	struct linebuffer lb1, lb2;
1445	struct linebuffer thisline, prevline;
1446	FILE **streams;
1447	int i;
1448	int nleft;
1449	int lossage = 0;
1450	int *file_lossage;
1451	struct linebuffer *prev_out = 0;
1452	FILE *ostream = stdout;
1453
1454	if (outfile)
1455	{
1456	ostream = fopen (outfile, "w");
1457	}
1458	if (!ostream)
1459	pfatal_with_name (outfile);
1460
1461	init_index ();
1462
1463	if (nfiles == 0)
1464	{
1465	if (outfile)
1466	fclose (ostream);
1467	return 0;
1468	}
1469
1470	/* For each file, make two line buffers. Also, for each file, there
1471	is an element of `thisline' which points at any time to one of the
1472	file's two buffers, and an element of `prevline' which points to
1473	the other buffer. `thisline' is supposed to point to the next
1474	available line from the file, while `prevline' holds the last file
1475	line used, which is remembered so that we can verify that the file
1476	is properly sorted. */
1477
1478	/* lb1 and lb2 contain one buffer each per file. */
1479	lb1 = (struct linebuffer ) xmalloc (nfiles sizeof (struct linebuffer));
1480	lb2 = (struct linebuffer ) xmalloc (nfiles sizeof (struct linebuffer));
1481
1482	/* thisline[i] points to the linebuffer holding the next available
1483	line in file i, or is zero if there are no lines left in that file. */
1484	thisline = (struct linebuffer **)
1485	xmalloc (nfiles * sizeof (struct linebuffer *));
1486	/* prevline[i] points to the linebuffer holding the last used line
1487	from file i. This is just for verifying that file i is properly
1488	sorted. */
1489	prevline = (struct linebuffer **)
1490	xmalloc (nfiles * sizeof (struct linebuffer *));
1491	/* streams[i] holds the input stream for file i. */
1492	streams = (FILE *) xmalloc (nfiles sizeof (FILE *));
1493	/* file_lossage[i] is nonzero if we already know file i is not
1494	properly sorted. */
1495	file_lossage = (int ) xmalloc (nfiles sizeof (int));
1496
1497	/* Allocate and initialize all that storage. */
1498
1499	for (i = 0; i < nfiles; i++)
1500	{
1501	initbuffer (&lb1[i]);
1502	initbuffer (&lb2[i]);
1503	thisline[i] = &lb1[i];
1504	prevline[i] = &lb2[i];
1505	file_lossage[i] = 0;
1506	streams[i] = fopen (infiles[i], "r");
1507	if (!streams[i])
1508	pfatal_with_name (infiles[i]);
1509
1510	readline (thisline[i], streams[i]);
1511	}
1512
1513	/* Keep count of number of files not at eof. */
1514	nleft = nfiles;
1515
1516	while (nleft)
1517	{
1518	struct linebuffer *best = 0;
1519	struct linebuffer *exch;
1520	int bestfile = -1;
1521	int i;
1522
1523	/* Look at the next avail line of each file; choose the least one. */
1524
1525	for (i = 0; i < nfiles; i++)
1526	{
1527	if (thisline[i] &&
1528	(!best \|\|
1529	0 < compare_general (best->buffer, thisline[i]->buffer,
1530	(long) bestfile, (long) i, num_keyfields)))
1531	{
1532	best = thisline[i];
1533	bestfile = i;
1534	}
1535	}
1536
1537	/* Output that line, unless it matches the previous one and we
1538	don't want duplicates. */
1539
1540	if (!(prev_out &&
1541	!compare_general (prev_out->buffer,
1542	best->buffer, 0L, 1L, num_keyfields - 1)))
1543	indexify (best->buffer, ostream);
1544	prev_out = best;
1545
1546	/* Now make the line the previous of its file, and fetch a new
1547	line from that file. */
1548
1549	exch = prevline[bestfile];
1550	prevline[bestfile] = thisline[bestfile];
1551	thisline[bestfile] = exch;
1552
1553	while (1)
1554	{
1555	/* If the file has no more, mark it empty. */
1556
1557	if (feof (streams[bestfile]))
1558	{
1559	thisline[bestfile] = 0;
1560	/* Update the number of files still not empty. */
1561	nleft--;
1562	break;
1563	}
1564	readline (thisline[bestfile], streams[bestfile]);
1565	if (thisline[bestfile]->buffer[0] \|\| !feof (streams[bestfile]))
1566	break;
1567	}
1568	}
1569
1570	finish_index (ostream);
1571
1572	/* Free all storage and close all input streams. */
1573
1574	for (i = 0; i < nfiles; i++)
1575	{
1576	fclose (streams[i]);
1577	free (lb1[i].buffer);
1578	free (lb2[i].buffer);
1579	}
1580	free (file_lossage);
1581	free (lb1);
1582	free (lb2);
1583	free (thisline);
1584	free (prevline);
1585	free (streams);
1586
1587	if (outfile)
1588	fclose (ostream);
1589
1590	return lossage;
1591	}
1592
1593
1594	/* Print error message and exit. */
1595
1596	void
1597	fatal (const char format, const char arg)
1598	{
1599	error (format, arg);
1600	xexit (1);
1601	}
1602
1603	/* Print error message. FORMAT is printf control string, ARG is arg for it. */
1604	void
1605	error (const char format, const char arg)
1606	{
1607	printf ("%s: ", program_name);
1608	printf (format, arg);
1609	if (format[strlen (format) -1] != '\n')
1610	printf ("\n");
1611	}
1612
1613	void
1614	perror_with_name (const char *name)
1615	{
1616	fprintf (stderr, "%s: ", program_name);
1617	perror (name);
1618	}
1619
1620	void
1621	pfatal_with_name (const char *name)
1622	{
1623	perror_with_name (name);
1624	xexit (1);
1625	}
1626
1627
1628
1629	/* Return a newly-allocated string concatenating S1 and S2. */
1630
1631	char *
1632	concat (char s1, char s2)
1633	{
1634	int len1 = strlen (s1), len2 = strlen (s2);
1635	char result = (char ) xmalloc (len1 + len2 + 1);
1636
1637	strcpy (result, s1);
1638	strcpy (result + len1, s2);
1639	*(result + len1 + len2) = 0;
1640
1641	return result;
1642	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/texinfo/util/texindex.c@ 2618

Download in other formats: