source: trunk/grep/src/search.c@ 3020

Last change on this file since 3020 was 2557, checked in by bird, 20 years ago

grep 2.5.1a

File size: 18.6 KB
Line 
1/* search.c - searching subroutines using dfa, kwset and regex for grep.
2 Copyright 1992, 1998, 2000 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
17 02111-1307, USA. */
18
19/* Written August 1992 by Mike Haertel. */
20
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
24#include <sys/types.h>
25#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
26/* We can handle multibyte string. */
27# define MBS_SUPPORT
28# include <wchar.h>
29# include <wctype.h>
30#endif
31
32#include "system.h"
33#include "grep.h"
34#include "regex.h"
35#include "dfa.h"
36#include "kwset.h"
37#include "error.h"
38#include "xalloc.h"
39#ifdef HAVE_LIBPCRE
40# include <pcre.h>
41#endif
42
43#define NCHAR (UCHAR_MAX + 1)
44
45/* For -w, we also consider _ to be word constituent. */
46#define WCHAR(C) (ISALNUM(C) || (C) == '_')
47
48/* DFA compiled regexp. */
49static struct dfa dfa;
50
51/* The Regex compiled patterns. */
52static struct patterns
53{
54 /* Regex compiled regexp. */
55 struct re_pattern_buffer regexbuf;
56 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
57 Q@#%!# library interface in regex.c. */
58} patterns0;
59
60struct patterns *patterns;
61size_t pcount;
62
63/* KWset compiled pattern. For Ecompile and Gcompile, we compile
64 a list of strings, at least one of which is known to occur in
65 any string matching the regexp. */
66static kwset_t kwset;
67
68/* Number of compiled fixed strings known to exactly match the regexp.
69 If kwsexec returns < kwset_exact_matches, then we don't need to
70 call the regexp matcher at all. */
71static int kwset_exact_matches;
72
73#if defined(MBS_SUPPORT)
74static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
75#endif
76static void kwsinit PARAMS ((void));
77static void kwsmusts PARAMS ((void));
78static void Gcompile PARAMS ((char const *, size_t));
79static void Ecompile PARAMS ((char const *, size_t));
80static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
81static void Fcompile PARAMS ((char const *, size_t));
82static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
83static void Pcompile PARAMS ((char const *, size_t ));
84static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
85
86void
87dfaerror (char const *mesg)
88{
89 error (2, 0, mesg);
90}
91
92static void
93kwsinit (void)
94{
95 static char trans[NCHAR];
96 int i;
97
98 if (match_icase)
99 for (i = 0; i < NCHAR; ++i)
100 trans[i] = TOLOWER (i);
101
102 if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
103 error (2, 0, _("memory exhausted"));
104}
105
106/* If the DFA turns out to have some set of fixed strings one of
107 which must occur in the match, then we build a kwset matcher
108 to find those strings, and thus quickly filter out impossible
109 matches. */
110static void
111kwsmusts (void)
112{
113 struct dfamust const *dm;
114 char const *err;
115
116 if (dfa.musts)
117 {
118 kwsinit ();
119 /* First, we compile in the substrings known to be exact
120 matches. The kwset matcher will return the index
121 of the matching string that it chooses. */
122 for (dm = dfa.musts; dm; dm = dm->next)
123 {
124 if (!dm->exact)
125 continue;
126 ++kwset_exact_matches;
127 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
128 error (2, 0, err);
129 }
130 /* Now, we compile the substrings that will require
131 the use of the regexp matcher. */
132 for (dm = dfa.musts; dm; dm = dm->next)
133 {
134 if (dm->exact)
135 continue;
136 if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
137 error (2, 0, err);
138 }
139 if ((err = kwsprep (kwset)) != 0)
140 error (2, 0, err);
141 }
142}
143
144#ifdef MBS_SUPPORT
145/* This function allocate the array which correspond to "buf".
146 Then this check multibyte string and mark on the positions which
147 are not singlebyte character nor the first byte of a multibyte
148 character. Caller must free the array. */
149static char*
150check_multibyte_string(char const *buf, size_t size)
151{
152 char *mb_properties = malloc(size);
153 mbstate_t cur_state;
154 int i;
155 memset(&cur_state, 0, sizeof(mbstate_t));
156 memset(mb_properties, 0, sizeof(char)*size);
157 for (i = 0; i < size ;)
158 {
159 size_t mbclen;
160 mbclen = mbrlen(buf + i, size - i, &cur_state);
161
162 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
163 {
164 /* An invalid sequence, or a truncated multibyte character.
165 We treat it as a singlebyte character. */
166 mbclen = 1;
167 }
168 mb_properties[i] = mbclen;
169 i += mbclen;
170 }
171
172 return mb_properties;
173}
174#endif
175
176static void
177Gcompile (char const *pattern, size_t size)
178{
179 const char *err;
180 char const *sep;
181 size_t total = size;
182 char const *motif = pattern;
183
184 re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
185 dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
186
187 /* For GNU regex compiler we have to pass the patterns separately to detect
188 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
189 GNU regex should have raise a syntax error. The same for backref, where
190 the backref should have been local to each pattern. */
191 do
192 {
193 size_t len;
194 sep = memchr (motif, '\n', total);
195 if (sep)
196 {
197 len = sep - motif;
198 sep++;
199 total -= (len + 1);
200 }
201 else
202 {
203 len = total;
204 total = 0;
205 }
206
207 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
208 if (patterns == NULL)
209 error (2, errno, _("memory exhausted"));
210
211 patterns[pcount] = patterns0;
212
213 if ((err = re_compile_pattern (motif, len,
214 &(patterns[pcount].regexbuf))) != 0)
215 error (2, 0, err);
216 pcount++;
217
218 motif = sep;
219 } while (sep && total != 0);
220
221 /* In the match_words and match_lines cases, we use a different pattern
222 for the DFA matcher that will quickly throw out cases that won't work.
223 Then if DFA succeeds we do some hairy stuff using the regex matcher
224 to decide whether the match should really count. */
225 if (match_words || match_lines)
226 {
227 /* In the whole-word case, we use the pattern:
228 \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
229 In the whole-line case, we use the pattern:
230 ^\(userpattern\)$. */
231
232 static char const line_beg[] = "^\\(";
233 static char const line_end[] = "\\)$";
234 static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
235 static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
236 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
237 size_t i;
238 strcpy (n, match_lines ? line_beg : word_beg);
239 i = strlen (n);
240 memcpy (n + i, pattern, size);
241 i += size;
242 strcpy (n + i, match_lines ? line_end : word_end);
243 i += strlen (n + i);
244 pattern = n;
245 size = i;
246 }
247
248 dfacomp (pattern, size, &dfa, 1);
249 kwsmusts ();
250}
251
252static void
253Ecompile (char const *pattern, size_t size)
254{
255 const char *err;
256 const char *sep;
257 size_t total = size;
258 char const *motif = pattern;
259
260 if (strcmp (matcher, "awk") == 0)
261 {
262 re_set_syntax (RE_SYNTAX_AWK);
263 dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
264 }
265 else
266 {
267 re_set_syntax (RE_SYNTAX_POSIX_EGREP);
268 dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
269 }
270
271 /* For GNU regex compiler we have to pass the patterns separately to detect
272 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
273 GNU regex should have raise a syntax error. The same for backref, where
274 the backref should have been local to each pattern. */
275 do
276 {
277 size_t len;
278 sep = memchr (motif, '\n', total);
279 if (sep)
280 {
281 len = sep - motif;
282 sep++;
283 total -= (len + 1);
284 }
285 else
286 {
287 len = total;
288 total = 0;
289 }
290
291 patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
292 if (patterns == NULL)
293 error (2, errno, _("memory exhausted"));
294 patterns[pcount] = patterns0;
295
296 if ((err = re_compile_pattern (motif, len,
297 &(patterns[pcount].regexbuf))) != 0)
298 error (2, 0, err);
299 pcount++;
300
301 motif = sep;
302 } while (sep && total != 0);
303
304 /* In the match_words and match_lines cases, we use a different pattern
305 for the DFA matcher that will quickly throw out cases that won't work.
306 Then if DFA succeeds we do some hairy stuff using the regex matcher
307 to decide whether the match should really count. */
308 if (match_words || match_lines)
309 {
310 /* In the whole-word case, we use the pattern:
311 (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
312 In the whole-line case, we use the pattern:
313 ^(userpattern)$. */
314
315 static char const line_beg[] = "^(";
316 static char const line_end[] = ")$";
317 static char const word_beg[] = "(^|[^[:alnum:]_])(";
318 static char const word_end[] = ")([^[:alnum:]_]|$)";
319 char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
320 size_t i;
321 strcpy (n, match_lines ? line_beg : word_beg);
322 i = strlen(n);
323 memcpy (n + i, pattern, size);
324 i += size;
325 strcpy (n + i, match_lines ? line_end : word_end);
326 i += strlen (n + i);
327 pattern = n;
328 size = i;
329 }
330
331 dfacomp (pattern, size, &dfa, 1);
332 kwsmusts ();
333}
334
335static size_t
336EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
337{
338 register char const *buflim, *beg, *end;
339 char eol = eolbyte;
340 int backref, start, len;
341 struct kwsmatch kwsm;
342 size_t i;
343#ifdef MBS_SUPPORT
344 char *mb_properties = NULL;
345#endif /* MBS_SUPPORT */
346
347#ifdef MBS_SUPPORT
348 if (MB_CUR_MAX > 1 && kwset)
349 mb_properties = check_multibyte_string(buf, size);
350#endif /* MBS_SUPPORT */
351
352 buflim = buf + size;
353
354 for (beg = end = buf; end < buflim; beg = end)
355 {
356 if (!exact)
357 {
358 if (kwset)
359 {
360 /* Find a possible match using the KWset matcher. */
361 size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
362 if (offset == (size_t) -1)
363 {
364#ifdef MBS_SUPPORT
365 if (MB_CUR_MAX > 1)
366 free(mb_properties);
367#endif
368 return (size_t)-1;
369 }
370 beg += offset;
371 /* Narrow down to the line containing the candidate, and
372 run it through DFA. */
373 end = memchr(beg, eol, buflim - beg);
374 end++;
375#ifdef MBS_SUPPORT
376 if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
377 continue;
378#endif
379 while (beg > buf && beg[-1] != eol)
380 --beg;
381 if (kwsm.index < kwset_exact_matches)
382 goto success;
383 if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
384 continue;
385 }
386 else
387 {
388 /* No good fixed strings; start with DFA. */
389 size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
390 if (offset == (size_t) -1)
391 break;
392 /* Narrow down to the line we've found. */
393 beg += offset;
394 end = memchr (beg, eol, buflim - beg);
395 end++;
396 while (beg > buf && beg[-1] != eol)
397 --beg;
398 }
399 /* Successful, no backreferences encountered! */
400 if (!backref)
401 goto success;
402 }
403 else
404 end = beg + size;
405
406 /* If we've made it to this point, this means DFA has seen
407 a probable match, and we need to run it through Regex. */
408 for (i = 0; i < pcount; i++)
409 {
410 patterns[i].regexbuf.not_eol = 0;
411 if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
412 end - beg - 1, 0,
413 end - beg - 1, &(patterns[i].regs))))
414 {
415 len = patterns[i].regs.end[0] - start;
416 if (exact)
417 {
418 *match_size = len;
419 return start;
420 }
421 if ((!match_lines && !match_words)
422 || (match_lines && len == end - beg - 1))
423 goto success;
424 /* If -w, check if the match aligns with word boundaries.
425 We do this iteratively because:
426 (a) the line may contain more than one occurence of the
427 pattern, and
428 (b) Several alternatives in the pattern might be valid at a
429 given point, and we may need to consider a shorter one to
430 find a word boundary. */
431 if (match_words)
432 while (start >= 0)
433 {
434 if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
435 && (len == end - beg - 1
436 || !WCHAR ((unsigned char) beg[start + len])))
437 goto success;
438 if (len > 0)
439 {
440 /* Try a shorter length anchored at the same place. */
441 --len;
442 patterns[i].regexbuf.not_eol = 1;
443 len = re_match (&(patterns[i].regexbuf), beg,
444 start + len, start,
445 &(patterns[i].regs));
446 }
447 if (len <= 0)
448 {
449 /* Try looking further on. */
450 if (start == end - beg - 1)
451 break;
452 ++start;
453 patterns[i].regexbuf.not_eol = 0;
454 start = re_search (&(patterns[i].regexbuf), beg,
455 end - beg - 1,
456 start, end - beg - 1 - start,
457 &(patterns[i].regs));
458 len = patterns[i].regs.end[0] - start;
459 }
460 }
461 }
462 } /* for Regex patterns. */
463 } /* for (beg = end ..) */
464#ifdef MBS_SUPPORT
465 if (MB_CUR_MAX > 1 && mb_properties)
466 free (mb_properties);
467#endif /* MBS_SUPPORT */
468 return (size_t) -1;
469
470 success:
471#ifdef MBS_SUPPORT
472 if (MB_CUR_MAX > 1 && mb_properties)
473 free (mb_properties);
474#endif /* MBS_SUPPORT */
475 *match_size = end - beg;
476 return beg - buf;
477}
478
479static void
480Fcompile (char const *pattern, size_t size)
481{
482 char const *beg, *lim, *err;
483
484 kwsinit ();
485 beg = pattern;
486 do
487 {
488 for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
489 ;
490 if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
491 error (2, 0, err);
492 if (lim < pattern + size)
493 ++lim;
494 beg = lim;
495 }
496 while (beg < pattern + size);
497
498 if ((err = kwsprep (kwset)) != 0)
499 error (2, 0, err);
500}
501
502static size_t
503Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
504{
505 register char const *beg, *try, *end;
506 register size_t len;
507 char eol = eolbyte;
508 struct kwsmatch kwsmatch;
509#ifdef MBS_SUPPORT
510 char *mb_properties;
511 if (MB_CUR_MAX > 1)
512 mb_properties = check_multibyte_string (buf, size);
513#endif /* MBS_SUPPORT */
514
515 for (beg = buf; beg <= buf + size; ++beg)
516 {
517 size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
518 if (offset == (size_t) -1)
519 {
520#ifdef MBS_SUPPORT
521 if (MB_CUR_MAX > 1)
522 free(mb_properties);
523#endif /* MBS_SUPPORT */
524 return offset;
525 }
526#ifdef MBS_SUPPORT
527 if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
528 continue; /* It is a part of multibyte character. */
529#endif /* MBS_SUPPORT */
530 beg += offset;
531 len = kwsmatch.size[0];
532 if (exact)
533 {
534 *match_size = len;
535#ifdef MBS_SUPPORT
536 if (MB_CUR_MAX > 1)
537 free (mb_properties);
538#endif /* MBS_SUPPORT */
539 return beg - buf;
540 }
541 if (match_lines)
542 {
543 if (beg > buf && beg[-1] != eol)
544 continue;
545 if (beg + len < buf + size && beg[len] != eol)
546 continue;
547 goto success;
548 }
549 else if (match_words)
550 for (try = beg; len; )
551 {
552 if (try > buf && WCHAR((unsigned char) try[-1]))
553 break;
554 if (try + len < buf + size && WCHAR((unsigned char) try[len]))
555 {
556 offset = kwsexec (kwset, beg, --len, &kwsmatch);
557 if (offset == (size_t) -1)
558 {
559#ifdef MBS_SUPPORT
560 if (MB_CUR_MAX > 1)
561 free (mb_properties);
562#endif /* MBS_SUPPORT */
563 return offset;
564 }
565 try = beg + offset;
566 len = kwsmatch.size[0];
567 }
568 else
569 goto success;
570 }
571 else
572 goto success;
573 }
574
575#ifdef MBS_SUPPORT
576 if (MB_CUR_MAX > 1)
577 free (mb_properties);
578#endif /* MBS_SUPPORT */
579 return -1;
580
581 success:
582 end = memchr (beg + len, eol, (buf + size) - (beg + len));
583 end++;
584 while (buf < beg && beg[-1] != eol)
585 --beg;
586 *match_size = end - beg;
587#ifdef MBS_SUPPORT
588 if (MB_CUR_MAX > 1)
589 free (mb_properties);
590#endif /* MBS_SUPPORT */
591 return beg - buf;
592}
593
594#if HAVE_LIBPCRE
595/* Compiled internal form of a Perl regular expression. */
596static pcre *cre;
597
598/* Additional information about the pattern. */
599static pcre_extra *extra;
600#endif
601
602static void
603Pcompile (char const *pattern, size_t size)
604{
605#if !HAVE_LIBPCRE
606 error (2, 0, _("The -P option is not supported"));
607#else
608 int e;
609 char const *ep;
610 char *re = xmalloc (4 * size + 7);
611 int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
612 char const *patlim = pattern + size;
613 char *n = re;
614 char const *p;
615 char const *pnul;
616
617 /* FIXME: Remove this restriction. */
618 if (eolbyte != '\n')
619 error (2, 0, _("The -P and -z options cannot be combined"));
620
621 *n = '\0';
622 if (match_lines)
623 strcpy (n, "^(");
624 if (match_words)
625 strcpy (n, "\\b(");
626 n += strlen (n);
627
628 /* The PCRE interface doesn't allow NUL bytes in the pattern, so
629 replace each NUL byte in the pattern with the four characters
630 "\000", removing a preceding backslash if there are an odd
631 number of backslashes before the NUL.
632
633 FIXME: This method does not work with some multibyte character
634 encodings, notably Shift-JIS, where a multibyte character can end
635 in a backslash byte. */
636 for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
637 {
638 memcpy (n, p, pnul - p);
639 n += pnul - p;
640 for (p = pnul; pattern < p && p[-1] == '\\'; p--)
641 continue;
642 n -= (pnul - p) & 1;
643 strcpy (n, "\\000");
644 n += 4;
645 }
646
647 memcpy (n, p, patlim - p);
648 n += patlim - p;
649 *n = '\0';
650 if (match_words)
651 strcpy (n, ")\\b");
652 if (match_lines)
653 strcpy (n, ")$");
654
655 cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
656 if (!cre)
657 error (2, 0, ep);
658
659 extra = pcre_study (cre, 0, &ep);
660 if (ep)
661 error (2, 0, ep);
662
663 free (re);
664#endif
665}
666
667static size_t
668Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
669{
670#if !HAVE_LIBPCRE
671 abort ();
672 return -1;
673#else
674 /* This array must have at least two elements; everything after that
675 is just for performance improvement in pcre_exec. */
676 int sub[300];
677
678 int e = pcre_exec (cre, extra, buf, size, 0, 0,
679 sub, sizeof sub / sizeof *sub);
680
681 if (e <= 0)
682 {
683 switch (e)
684 {
685 case PCRE_ERROR_NOMATCH:
686 return -1;
687
688 case PCRE_ERROR_NOMEMORY:
689 error (2, 0, _("Memory exhausted"));
690
691 default:
692 abort ();
693 }
694 }
695 else
696 {
697 /* Narrow down to the line we've found. */
698 char const *beg = buf + sub[0];
699 char const *end = buf + sub[1];
700 char const *buflim = buf + size;
701 char eol = eolbyte;
702 if (!exact)
703 {
704 end = memchr (end, eol, buflim - end);
705 end++;
706 while (buf < beg && beg[-1] != eol)
707 --beg;
708 }
709
710 *match_size = end - beg;
711 return beg - buf;
712 }
713#endif
714}
715
716struct matcher const matchers[] = {
717 { "default", Gcompile, EGexecute },
718 { "grep", Gcompile, EGexecute },
719 { "egrep", Ecompile, EGexecute },
720 { "awk", Ecompile, EGexecute },
721 { "fgrep", Fcompile, Fexecute },
722 { "perl", Pcompile, Pexecute },
723 { "", 0, 0 },
724};
Note: See TracBrowser for help on using the repository browser.