1 | /* search.c -- searching large bodies of text.
|
---|
2 | $Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp $
|
---|
3 |
|
---|
4 | Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
|
---|
5 |
|
---|
6 | This program is free software; you can redistribute it and/or modify
|
---|
7 | it under the terms of the GNU General Public License as published by
|
---|
8 | the Free Software Foundation; either version 2, or (at your option)
|
---|
9 | any later version.
|
---|
10 |
|
---|
11 | This program is distributed in the hope that it will be useful,
|
---|
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | GNU General Public License for more details.
|
---|
15 |
|
---|
16 | You should have received a copy of the GNU General Public License
|
---|
17 | along with this program; if not, write to the Free Software
|
---|
18 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
---|
19 |
|
---|
20 | Written by Brian Fox (bfox@ai.mit.edu). */
|
---|
21 |
|
---|
22 | #include "info.h"
|
---|
23 |
|
---|
24 | #include "search.h"
|
---|
25 | #include "nodes.h"
|
---|
26 |
|
---|
27 | /* The search functions take two arguments:
|
---|
28 |
|
---|
29 | 1) a string to search for, and
|
---|
30 |
|
---|
31 | 2) a pointer to a SEARCH_BINDING which contains the buffer, start,
|
---|
32 | and end of the search.
|
---|
33 |
|
---|
34 | They return a long, which is the offset from the start of the buffer
|
---|
35 | at which the match was found. An offset of -1 indicates failure. */
|
---|
36 |
|
---|
37 | /* A function which makes a binding with buffer and bounds. */
|
---|
38 | SEARCH_BINDING *
|
---|
39 | make_binding (char *buffer, long int start, long int end)
|
---|
40 | {
|
---|
41 | SEARCH_BINDING *binding;
|
---|
42 |
|
---|
43 | binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
|
---|
44 | binding->buffer = buffer;
|
---|
45 | binding->start = start;
|
---|
46 | binding->end = end;
|
---|
47 | binding->flags = 0;
|
---|
48 |
|
---|
49 | return (binding);
|
---|
50 | }
|
---|
51 |
|
---|
52 | /* Make a copy of BINDING without duplicating the data. */
|
---|
53 | SEARCH_BINDING *
|
---|
54 | copy_binding (SEARCH_BINDING *binding)
|
---|
55 | {
|
---|
56 | SEARCH_BINDING *copy;
|
---|
57 |
|
---|
58 | copy = make_binding (binding->buffer, binding->start, binding->end);
|
---|
59 | copy->flags = binding->flags;
|
---|
60 | return (copy);
|
---|
61 | }
|
---|
62 |
|
---|
63 | |
---|
64 |
|
---|
65 | /* **************************************************************** */
|
---|
66 | /* */
|
---|
67 | /* The Actual Searching Functions */
|
---|
68 | /* */
|
---|
69 | /* **************************************************************** */
|
---|
70 |
|
---|
71 | /* Search forwards or backwards for the text delimited by BINDING.
|
---|
72 | The search is forwards if BINDING->start is greater than BINDING->end. */
|
---|
73 | long
|
---|
74 | search (char *string, SEARCH_BINDING *binding)
|
---|
75 | {
|
---|
76 | long result;
|
---|
77 |
|
---|
78 | /* If the search is backwards, then search backwards, otherwise forwards. */
|
---|
79 | if (binding->start > binding->end)
|
---|
80 | result = search_backward (string, binding);
|
---|
81 | else
|
---|
82 | result = search_forward (string, binding);
|
---|
83 |
|
---|
84 | return (result);
|
---|
85 | }
|
---|
86 |
|
---|
87 | /* Search forwards for STRING through the text delimited in BINDING. */
|
---|
88 | long
|
---|
89 | search_forward (char *string, SEARCH_BINDING *binding)
|
---|
90 | {
|
---|
91 | register int c, i, len;
|
---|
92 | register char *buff, *end;
|
---|
93 | char *alternate = (char *)NULL;
|
---|
94 |
|
---|
95 | len = strlen (string);
|
---|
96 |
|
---|
97 | /* We match characters in the search buffer against STRING and ALTERNATE.
|
---|
98 | ALTERNATE is a case reversed version of STRING; this is cheaper than
|
---|
99 | case folding each character before comparison. Alternate is only
|
---|
100 | used if the case folding bit is turned on in the passed BINDING. */
|
---|
101 |
|
---|
102 | if (binding->flags & S_FoldCase)
|
---|
103 | {
|
---|
104 | alternate = xstrdup (string);
|
---|
105 |
|
---|
106 | for (i = 0; i < len; i++)
|
---|
107 | {
|
---|
108 | if (islower (alternate[i]))
|
---|
109 | alternate[i] = toupper (alternate[i]);
|
---|
110 | else if (isupper (alternate[i]))
|
---|
111 | alternate[i] = tolower (alternate[i]);
|
---|
112 | }
|
---|
113 | }
|
---|
114 |
|
---|
115 | buff = binding->buffer + binding->start;
|
---|
116 | end = binding->buffer + binding->end + 1;
|
---|
117 |
|
---|
118 | while (buff < (end - len))
|
---|
119 | {
|
---|
120 | for (i = 0; i < len; i++)
|
---|
121 | {
|
---|
122 | c = buff[i];
|
---|
123 |
|
---|
124 | if ((c != string[i]) && (!alternate || c != alternate[i]))
|
---|
125 | break;
|
---|
126 | }
|
---|
127 |
|
---|
128 | if (!string[i])
|
---|
129 | {
|
---|
130 | if (alternate)
|
---|
131 | free (alternate);
|
---|
132 | if (binding->flags & S_SkipDest)
|
---|
133 | buff += len;
|
---|
134 | return ((long) (buff - binding->buffer));
|
---|
135 | }
|
---|
136 |
|
---|
137 | buff++;
|
---|
138 | }
|
---|
139 |
|
---|
140 | if (alternate)
|
---|
141 | free (alternate);
|
---|
142 |
|
---|
143 | return ((long) -1);
|
---|
144 | }
|
---|
145 |
|
---|
146 | /* Search for STRING backwards through the text delimited in BINDING. */
|
---|
147 | long
|
---|
148 | search_backward (char *input_string, SEARCH_BINDING *binding)
|
---|
149 | {
|
---|
150 | register int c, i, len;
|
---|
151 | register char *buff, *end;
|
---|
152 | char *string;
|
---|
153 | char *alternate = (char *)NULL;
|
---|
154 |
|
---|
155 | len = strlen (input_string);
|
---|
156 |
|
---|
157 | /* Reverse the characters in the search string. */
|
---|
158 | string = (char *)xmalloc (1 + len);
|
---|
159 | for (c = 0, i = len - 1; input_string[c]; c++, i--)
|
---|
160 | string[i] = input_string[c];
|
---|
161 |
|
---|
162 | string[c] = '\0';
|
---|
163 |
|
---|
164 | /* We match characters in the search buffer against STRING and ALTERNATE.
|
---|
165 | ALTERNATE is a case reversed version of STRING; this is cheaper than
|
---|
166 | case folding each character before comparison. ALTERNATE is only
|
---|
167 | used if the case folding bit is turned on in the passed BINDING. */
|
---|
168 |
|
---|
169 | if (binding->flags & S_FoldCase)
|
---|
170 | {
|
---|
171 | alternate = xstrdup (string);
|
---|
172 |
|
---|
173 | for (i = 0; i < len; i++)
|
---|
174 | {
|
---|
175 | if (islower (alternate[i]))
|
---|
176 | alternate[i] = toupper (alternate[i]);
|
---|
177 | else if (isupper (alternate[i]))
|
---|
178 | alternate[i] = tolower (alternate[i]);
|
---|
179 | }
|
---|
180 | }
|
---|
181 |
|
---|
182 | buff = binding->buffer + binding->start - 1;
|
---|
183 | end = binding->buffer + binding->end;
|
---|
184 |
|
---|
185 | while (buff > (end + len))
|
---|
186 | {
|
---|
187 | for (i = 0; i < len; i++)
|
---|
188 | {
|
---|
189 | c = *(buff - i);
|
---|
190 |
|
---|
191 | if (c != string[i] && (!alternate || c != alternate[i]))
|
---|
192 | break;
|
---|
193 | }
|
---|
194 |
|
---|
195 | if (!string[i])
|
---|
196 | {
|
---|
197 | free (string);
|
---|
198 | if (alternate)
|
---|
199 | free (alternate);
|
---|
200 |
|
---|
201 | if (binding->flags & S_SkipDest)
|
---|
202 | buff -= len;
|
---|
203 | return ((long) (1 + (buff - binding->buffer)));
|
---|
204 | }
|
---|
205 |
|
---|
206 | buff--;
|
---|
207 | }
|
---|
208 |
|
---|
209 | free (string);
|
---|
210 | if (alternate)
|
---|
211 | free (alternate);
|
---|
212 |
|
---|
213 | return ((long) -1);
|
---|
214 | }
|
---|
215 |
|
---|
216 | /* Find STRING in LINE, returning the offset of the end of the string.
|
---|
217 | Return an offset of -1 if STRING does not appear in LINE. The search
|
---|
218 | is bound by the end of the line (i.e., either NEWLINE or 0). */
|
---|
219 | int
|
---|
220 | string_in_line (char *string, char *line)
|
---|
221 | {
|
---|
222 | register int end;
|
---|
223 | SEARCH_BINDING binding;
|
---|
224 |
|
---|
225 | /* Find the end of the line. */
|
---|
226 | for (end = 0; line[end] && line[end] != '\n'; end++);
|
---|
227 |
|
---|
228 | /* Search for STRING within these confines. */
|
---|
229 | binding.buffer = line;
|
---|
230 | binding.start = 0;
|
---|
231 | binding.end = end;
|
---|
232 | binding.flags = S_FoldCase | S_SkipDest;
|
---|
233 |
|
---|
234 | return (search_forward (string, &binding));
|
---|
235 | }
|
---|
236 |
|
---|
237 | /* Return non-zero if STRING is the first text to appear at BINDING. */
|
---|
238 | int
|
---|
239 | looking_at (char *string, SEARCH_BINDING *binding)
|
---|
240 | {
|
---|
241 | long search_end;
|
---|
242 |
|
---|
243 | search_end = search (string, binding);
|
---|
244 |
|
---|
245 | /* If the string was not found, SEARCH_END is -1. If the string was found,
|
---|
246 | but not right away, SEARCH_END is != binding->start. Otherwise, the
|
---|
247 | string was found at binding->start. */
|
---|
248 | return (search_end == binding->start);
|
---|
249 | }
|
---|
250 | |
---|
251 |
|
---|
252 | /* **************************************************************** */
|
---|
253 | /* */
|
---|
254 | /* Small String Searches */
|
---|
255 | /* */
|
---|
256 | /* **************************************************************** */
|
---|
257 |
|
---|
258 | /* Function names that start with "skip" are passed a string, and return
|
---|
259 | an offset from the start of that string. Function names that start
|
---|
260 | with "find" are passed a SEARCH_BINDING, and return an absolute position
|
---|
261 | marker of the item being searched for. "Find" functions return a value
|
---|
262 | of -1 if the item being looked for couldn't be found. */
|
---|
263 |
|
---|
264 | /* Return the index of the first non-whitespace character in STRING. */
|
---|
265 | int
|
---|
266 | skip_whitespace (char *string)
|
---|
267 | {
|
---|
268 | register int i;
|
---|
269 |
|
---|
270 | for (i = 0; string && whitespace (string[i]); i++);
|
---|
271 | return (i);
|
---|
272 | }
|
---|
273 |
|
---|
274 | /* Return the index of the first non-whitespace or newline character in
|
---|
275 | STRING. */
|
---|
276 | int
|
---|
277 | skip_whitespace_and_newlines (char *string)
|
---|
278 | {
|
---|
279 | register int i;
|
---|
280 |
|
---|
281 | for (i = 0; string && whitespace_or_newline (string[i]); i++);
|
---|
282 | return (i);
|
---|
283 | }
|
---|
284 |
|
---|
285 | /* Return the index of the first whitespace character in STRING. */
|
---|
286 | int
|
---|
287 | skip_non_whitespace (char *string)
|
---|
288 | {
|
---|
289 | register int i;
|
---|
290 |
|
---|
291 | for (i = 0; string && string[i] && !whitespace (string[i]); i++);
|
---|
292 | return (i);
|
---|
293 | }
|
---|
294 |
|
---|
295 | /* Return the index of the first non-node character in STRING. Note that
|
---|
296 | this function contains quite a bit of hair to ignore periods in some
|
---|
297 | special cases. This is because we here at GNU ship some info files which
|
---|
298 | contain nodenames that contain periods. No such nodename can start with
|
---|
299 | a period, or continue with whitespace, newline, or ')' immediately following
|
---|
300 | the period. If second argument NEWLINES_OKAY is non-zero, newlines should
|
---|
301 | be skipped while parsing out the nodename specification. */
|
---|
302 | int
|
---|
303 | skip_node_characters (char *string, int newlines_okay)
|
---|
304 | {
|
---|
305 | register int c, i = 0;
|
---|
306 | int paren_seen = 0;
|
---|
307 | int paren = 0;
|
---|
308 |
|
---|
309 | /* Handle special case. This is when another function has parsed out the
|
---|
310 | filename component of the node name, and we just want to parse out the
|
---|
311 | nodename proper. In that case, a period at the start of the nodename
|
---|
312 | indicates an empty nodename. */
|
---|
313 | if (string && *string == '.')
|
---|
314 | return (0);
|
---|
315 |
|
---|
316 | if (string && *string == '(')
|
---|
317 | {
|
---|
318 | paren++;
|
---|
319 | paren_seen++;
|
---|
320 | i++;
|
---|
321 | }
|
---|
322 |
|
---|
323 | for (; string && (c = string[i]); i++)
|
---|
324 | {
|
---|
325 | if (paren)
|
---|
326 | {
|
---|
327 | if (c == '(')
|
---|
328 | paren++;
|
---|
329 | else if (c == ')')
|
---|
330 | paren--;
|
---|
331 |
|
---|
332 | continue;
|
---|
333 | }
|
---|
334 |
|
---|
335 | /* If the character following the close paren is a space or period,
|
---|
336 | then this node name has no more characters associated with it. */
|
---|
337 | if (c == '\t' ||
|
---|
338 | c == ',' ||
|
---|
339 | c == INFO_TAGSEP ||
|
---|
340 | ((!newlines_okay) && (c == '\n')) ||
|
---|
341 | ((paren_seen && string[i - 1] == ')') &&
|
---|
342 | (c == ' ' || c == '.')) ||
|
---|
343 | (c == '.' &&
|
---|
344 | (
|
---|
345 | #if 0
|
---|
346 | /* This test causes a node name ending in a period, like `This.', not to
|
---|
347 | be found. The trailing . is stripped. This occurs in the jargon
|
---|
348 | file (`I see no X here.' is a node name). */
|
---|
349 | (!string[i + 1]) ||
|
---|
350 | #endif
|
---|
351 | (whitespace_or_newline (string[i + 1])) ||
|
---|
352 | (string[i + 1] == ')'))))
|
---|
353 | break;
|
---|
354 | }
|
---|
355 | return (i);
|
---|
356 | }
|
---|
357 |
|
---|
358 | |
---|
359 |
|
---|
360 | /* **************************************************************** */
|
---|
361 | /* */
|
---|
362 | /* Searching FILE_BUFFER's */
|
---|
363 | /* */
|
---|
364 | /* **************************************************************** */
|
---|
365 |
|
---|
366 | /* Return the absolute position of the first occurence of a node separator in
|
---|
367 | BINDING-buffer. The search starts at BINDING->start. Return -1 if no node
|
---|
368 | separator was found. */
|
---|
369 | long
|
---|
370 | find_node_separator (SEARCH_BINDING *binding)
|
---|
371 | {
|
---|
372 | register long i;
|
---|
373 | char *body;
|
---|
374 |
|
---|
375 | body = binding->buffer;
|
---|
376 |
|
---|
377 | /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are
|
---|
378 | optional, but the DELETE and NEWLINE are not. This separator holds
|
---|
379 | true for all separated elements in an Info file, including the tags
|
---|
380 | table (if present) and the indirect tags table (if present). */
|
---|
381 | for (i = binding->start; i < binding->end - 1; i++)
|
---|
382 | if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
|
---|
383 | (body[i + 2] == '\n' ||
|
---|
384 | (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
|
---|
385 | ((body[i] == INFO_COOKIE) &&
|
---|
386 | (body[i + 1] == '\n' ||
|
---|
387 | (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
|
---|
388 | return (i);
|
---|
389 | return (-1);
|
---|
390 | }
|
---|
391 |
|
---|
392 | /* Return the length of the node separator characters that BODY is
|
---|
393 | currently pointing at. */
|
---|
394 | int
|
---|
395 | skip_node_separator (char *body)
|
---|
396 | {
|
---|
397 | register int i;
|
---|
398 |
|
---|
399 | i = 0;
|
---|
400 |
|
---|
401 | if (body[i] == INFO_FF)
|
---|
402 | i++;
|
---|
403 |
|
---|
404 | if (body[i++] != INFO_COOKIE)
|
---|
405 | return (0);
|
---|
406 |
|
---|
407 | if (body[i] == INFO_FF)
|
---|
408 | i++;
|
---|
409 |
|
---|
410 | if (body[i++] != '\n')
|
---|
411 | return (0);
|
---|
412 |
|
---|
413 | return (i);
|
---|
414 | }
|
---|
415 |
|
---|
416 | /* Return the number of characters from STRING to the start of
|
---|
417 | the next line. */
|
---|
418 | int
|
---|
419 | skip_line (char *string)
|
---|
420 | {
|
---|
421 | register int i;
|
---|
422 |
|
---|
423 | for (i = 0; string && string[i] && string[i] != '\n'; i++);
|
---|
424 |
|
---|
425 | if (string[i] == '\n')
|
---|
426 | i++;
|
---|
427 |
|
---|
428 | return (i);
|
---|
429 | }
|
---|
430 |
|
---|
431 | /* Return the absolute position of the beginning of a tags table in this
|
---|
432 | binding starting the search at binding->start. */
|
---|
433 | long
|
---|
434 | find_tags_table (SEARCH_BINDING *binding)
|
---|
435 | {
|
---|
436 | SEARCH_BINDING tmp_search;
|
---|
437 | long position;
|
---|
438 |
|
---|
439 | tmp_search.buffer = binding->buffer;
|
---|
440 | tmp_search.start = binding->start;
|
---|
441 | tmp_search.end = binding->end;
|
---|
442 | tmp_search.flags = S_FoldCase;
|
---|
443 |
|
---|
444 | while ((position = find_node_separator (&tmp_search)) != -1 )
|
---|
445 | {
|
---|
446 | tmp_search.start = position;
|
---|
447 | tmp_search.start += skip_node_separator (tmp_search.buffer
|
---|
448 | + tmp_search.start);
|
---|
449 |
|
---|
450 | if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
|
---|
451 | return (position);
|
---|
452 | }
|
---|
453 | return (-1);
|
---|
454 | }
|
---|
455 |
|
---|
456 | /* Return the absolute position of the node named NODENAME in BINDING.
|
---|
457 | This is a brute force search, and we wish to avoid it when possible.
|
---|
458 | This function is called when a tag (indirect or otherwise) doesn't
|
---|
459 | really point to the right node. It returns the absolute position of
|
---|
460 | the separator preceding the node. */
|
---|
461 | long
|
---|
462 | find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
|
---|
463 | {
|
---|
464 | long position;
|
---|
465 | int offset, namelen;
|
---|
466 | SEARCH_BINDING tmp_search;
|
---|
467 |
|
---|
468 | namelen = strlen (nodename);
|
---|
469 |
|
---|
470 | tmp_search.buffer = binding->buffer;
|
---|
471 | tmp_search.start = binding->start;
|
---|
472 | tmp_search.end = binding->end;
|
---|
473 | tmp_search.flags = 0;
|
---|
474 |
|
---|
475 | while ((position = find_node_separator (&tmp_search)) != -1)
|
---|
476 | {
|
---|
477 | tmp_search.start = position;
|
---|
478 | tmp_search.start += skip_node_separator
|
---|
479 | (tmp_search.buffer + tmp_search.start);
|
---|
480 |
|
---|
481 | offset = string_in_line
|
---|
482 | (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
|
---|
483 |
|
---|
484 | if (offset == -1)
|
---|
485 | continue;
|
---|
486 |
|
---|
487 | tmp_search.start += offset;
|
---|
488 | tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
|
---|
489 | offset = skip_node_characters
|
---|
490 | (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
|
---|
491 |
|
---|
492 | /* Notice that this is an exact match. You cannot grovel through
|
---|
493 | the buffer with this function looking for random nodes. */
|
---|
494 | if ((offset == namelen) &&
|
---|
495 | (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
|
---|
496 | (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
|
---|
497 | return (position);
|
---|
498 | }
|
---|
499 | return (-1);
|
---|
500 | }
|
---|