source: trunk/essentials/sys-apps/findutils/locate/code.c

Last change on this file was 3170, checked in by bird, 18 years ago

findutils 4.3.2

File size: 6.6 KB
Line 
1/* code -- bigram- and front-encode filenames for locate
2 Copyright (C) 1994 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
17 USA.
18*/
19
20/* Compress a sorted list.
21 Works with `find' to encode a filename database to save space
22 and search time.
23
24 Usage:
25
26 bigram < file_list > bigrams
27 process-bigrams > most_common_bigrams
28 code most_common_bigrams < file_list > squeezed_list
29
30 Uses `front compression' (see ";login:", March 1983, p. 8).
31 The output begins with the 128 most common bigrams.
32 After that, the output format is, for each line,
33 an offset (from the previous line) differential count byte
34 followed by a (partially bigram-encoded) ASCII remainder.
35 The output lines have no terminating byte; the start of the next line
36 is indicated by its first byte having a value <= 30.
37
38 The encoding of the output bytes is:
39
40 0-28 likeliest differential counts + offset (14) to make nonnegative
41 30 escape code for out-of-range count to follow in next halfword
42 128-255 bigram codes (the 128 most common, as determined by `updatedb')
43 32-127 single character (printable) ASCII remainder
44
45 Written by James A. Woods <jwoods@adobe.com>.
46 Modified by David MacKenzie <djm@gnu.org>. */
47
48#include <config.h>
49#include <stdio.h>
50#include <sys/types.h>
51
52#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
53#include <string.h>
54#else
55#include <strings.h>
56#endif
57
58#ifdef STDC_HEADERS
59#include <stdlib.h>
60#endif
61
62#if ENABLE_NLS
63# include <libintl.h>
64# define _(Text) gettext (Text)
65#else
66# define _(Text) Text
67#define textdomain(Domain)
68#define bindtextdomain(Package, Directory)
69#endif
70#ifdef gettext_noop
71# define N_(String) gettext_noop (String)
72#else
73/* See locate.c for explanation as to why not use (String) */
74# define N_(String) String
75#endif
76
77#include "locatedb.h"
78#include <getline.h>
79#include "closeout.h"
80
81char *xmalloc PARAMS((size_t));
82
83/* The name this program was run with. */
84char *program_name;
85
86/* The 128 most common bigrams in the file list, padded with NULs
87 if there are fewer. */
88static char bigrams[257] = {0};
89
90/* Return the offset of PATTERN in STRING, or -1 if not found. */
91
92static int
93strindex (char *string, char *pattern)
94{
95 register char *s;
96
97 for (s = string; *s != '\0'; s++)
98 /* Fast first char check. */
99 if (*s == *pattern)
100 {
101 register char *p2 = pattern + 1, *s2 = s + 1;
102 while (*p2 != '\0' && *p2 == *s2)
103 p2++, s2++;
104 if (*p2 == '\0')
105 return s2 - strlen (pattern) - string;
106 }
107 return -1;
108}
109
110/* Return the length of the longest common prefix of strings S1 and S2. */
111
112static int
113prefix_length (char *s1, char *s2)
114{
115 register char *start;
116
117 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
118 ;
119 return s1 - start;
120}
121
122extern char *version_string;
123
124static void
125usage (FILE *stream)
126{
127 fprintf (stream, _("\
128Usage: %s [--version | --help]\n\
129or %s most_common_bigrams < file-list > locate-database\n"),
130 program_name, program_name);
131 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
132}
133
134
135int
136main (int argc, char **argv)
137{
138 char *path; /* The current input entry. */
139 char *oldpath; /* The previous input entry. */
140 size_t pathsize, oldpathsize; /* Amounts allocated for them. */
141 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
142 char bigram[3]; /* Bigram to search for in table. */
143 int code; /* Index of `bigram' in bigrams table. */
144 FILE *fp; /* Most common bigrams file. */
145 int line_len; /* Length of input line. */
146
147 program_name = argv[0];
148 atexit (close_stdout);
149
150 bigram[2] = '\0';
151
152 if (argc != 2)
153 {
154 usage(stderr);
155 return 2;
156 }
157
158 if (0 == strcmp(argv[1], "--help"))
159 {
160 usage(stdout);
161 return 0;
162 }
163 else if (0 == strcmp(argv[1], "--version"))
164 {
165 printf (_("GNU findutils version %s\n"), version_string);
166 return 0;
167 }
168
169 fp = fopen (argv[1], "r");
170 if (fp == NULL)
171 {
172 fprintf (stderr, "%s: ", argv[0]);
173 perror (argv[1]);
174 return 1;
175 }
176
177 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */
178 path = xmalloc (pathsize);
179 oldpath = xmalloc (oldpathsize);
180
181 /* Set to anything not starting with a slash, to force the first
182 prefix count to 0. */
183 strcpy (oldpath, " ");
184 oldcount = 0;
185
186 /* Copy the list of most common bigrams to the output,
187 padding with NULs if there are <128 of them. */
188 fgets (bigrams, 257, fp);
189 fwrite (bigrams, 1, 256, stdout);
190 fclose (fp);
191
192 while ((line_len = getline (&path, &pathsize, stdin)) > 0)
193 {
194 char *pp;
195
196 path[line_len - 1] = '\0'; /* Remove newline. */
197
198 /* Squelch unprintable chars in path so as not to botch decoding. */
199 for (pp = path; *pp != '\0'; pp++)
200 {
201 if (!(*pp >= 040 && *pp < 0177))
202 *pp = '?';
203 }
204
205 count = prefix_length (oldpath, path);
206 diffcount = count - oldcount;
207 oldcount = count;
208 /* If the difference is small, it fits in one byte;
209 otherwise, two bytes plus a marker noting that fact. */
210 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
211 {
212 putc (LOCATEDB_OLD_ESCAPE, stdout);
213 putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
214 }
215 else
216 putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
217
218 /* Look for bigrams in the remainder of the path. */
219 for (pp = path + count; *pp != '\0'; pp += 2)
220 {
221 if (pp[1] == '\0')
222 {
223 /* No bigram is possible; only one char is left. */
224 putchar (*pp);
225 break;
226 }
227 bigram[0] = *pp;
228 bigram[1] = pp[1];
229 /* Linear search for specific bigram in string table. */
230 code = strindex (bigrams, bigram);
231 if (code % 2 == 0)
232 putchar ((code / 2) | 0200); /* It's a common bigram. */
233 else
234 fputs (bigram, stdout); /* Write the text as printable ASCII. */
235 }
236
237 {
238 /* Swap path and oldpath and their sizes. */
239 char *tmppath = oldpath;
240 size_t tmppathsize = oldpathsize;
241 oldpath = path;
242 oldpathsize = pathsize;
243 path = tmppath;
244 pathsize = tmppathsize;
245 }
246 }
247
248 free (path);
249 free (oldpath);
250
251 return 0;
252}
Note: See TracBrowser for help on using the repository browser.