1 | #From stolfi@ic.unicamp.br Sun Jan 28 19:02:09 2001
|
---|
2 | #Received: from mail.actcom.co.il [192.114.47.13]
|
---|
3 | # by localhost with POP3 (fetchmail-5.5.0)
|
---|
4 | # for arnold@localhost (single-drop); Sun, 28 Jan 2001 19:02:09 +0200 (IST)
|
---|
5 | #Received: by actcom.co.il (mbox arobbins)
|
---|
6 | # (with Cubic Circle's cucipop (v1.31 1998/05/13) Sun Jan 28 19:03:34 2001)
|
---|
7 | #X-From_: stolfi@ic.unicamp.br Sun Jan 28 18:46:02 2001
|
---|
8 | #Received: from lmail.actcom.co.il by actcom.co.il with ESMTP
|
---|
9 | # (8.9.1a/actcom-0.2) id SAA22932 for <arobbins@actcom.co.il>;
|
---|
10 | # Sun, 28 Jan 2001 18:46:00 +0200 (EET)
|
---|
11 | # (rfc931-sender: lmail.actcom.co.il [192.114.47.13])
|
---|
12 | #Received: from billohost.com (www.billohost.com [209.196.35.10])
|
---|
13 | # by lmail.actcom.co.il (8.9.3/8.9.1) with ESMTP id SAA18523
|
---|
14 | # for <arobbins@actcom.co.il>; Sun, 28 Jan 2001 18:46:35 +0200
|
---|
15 | #Received: from grande.dcc.unicamp.br (grande.dcc.unicamp.br [143.106.7.8])
|
---|
16 | # by billohost.com (8.9.3/8.9.3) with ESMTP id LAA20063
|
---|
17 | # for <arnold@skeeve.com>; Sun, 28 Jan 2001 11:45:54 -0500
|
---|
18 | #Received: from amazonas.dcc.unicamp.br (amazonas.dcc.unicamp.br [143.106.7.11])
|
---|
19 | # by grande.dcc.unicamp.br (8.9.3/8.9.3) with ESMTP id OAA29726;
|
---|
20 | # Sun, 28 Jan 2001 14:45:47 -0200 (EDT)
|
---|
21 | #Received: from coruja.dcc.unicamp.br (coruja.dcc.unicamp.br [143.106.24.80])
|
---|
22 | # by amazonas.dcc.unicamp.br (8.8.5/8.8.5) with ESMTP id OAA06542;
|
---|
23 | # Sun, 28 Jan 2001 14:45:45 -0200 (EDT)
|
---|
24 | #Received: (from stolfi@localhost)
|
---|
25 | # by coruja.dcc.unicamp.br (8.11.0/8.11.0) id f0SGjib16703;
|
---|
26 | # Sun, 28 Jan 2001 14:45:44 -0200 (EDT)
|
---|
27 | #Date: Sun, 28 Jan 2001 14:45:44 -0200 (EDT)
|
---|
28 | #Message-Id: <200101281645.f0SGjib16703@coruja.dcc.unicamp.br>
|
---|
29 | #From: Jorge Stolfi <stolfi@ic.unicamp.br>
|
---|
30 | #To: Michal Jaegermann <michal@ellpspace.math.ualberta.ca>
|
---|
31 | #Cc: Aharon Robbins <arnold@skeeve.com>, oliva@ic.unicamp.br,
|
---|
32 | # celio@ic.unicamp.br, ducatte@ic.unicamp.br, machado@ic.unicamp.br
|
---|
33 | #Subject: Re: a regex.c problem
|
---|
34 | #MIME-Version: 1.0
|
---|
35 | #Content-Transfer-Encoding: 8bit
|
---|
36 | #Content-Type: text/plain; charset=iso-8859-1
|
---|
37 | #In-Reply-To: <20010128090314.A5820@ellpspace.math.ualberta.ca>
|
---|
38 | #References: <200101281207.f0SC7Un08435@skeeve.com>
|
---|
39 | # <20010128090314.A5820@ellpspace.math.ualberta.ca>
|
---|
40 | #Reply-To: stolfi@ic.unicamp.br
|
---|
41 | #Status: RO
|
---|
42 | #
|
---|
43 | #
|
---|
44 | # > [Michal] Are there any other examples of "certain characters"
|
---|
45 | # > which would throw this regex engine off?
|
---|
46 | #
|
---|
47 | #I now tested [anX]*n for X ranging through all characters from \000 to
|
---|
48 | #\377, and got that unexpected result only for the following ones:
|
---|
49 | #
|
---|
50 | # \370 | =F8 | ø | Small o, slash
|
---|
51 | # \371 | =F9 | ù | Small u, grave accent
|
---|
52 | # \372 | =FA | ú | Small u, acute accent
|
---|
53 | # \373 | =FB | û | Small u, circumflex accent
|
---|
54 | # \374 | =FC | ü | Small u, dieresis or umlaut mark
|
---|
55 | # \375 | =FD | ý | Small y, acute accent
|
---|
56 | # \376 | =FE | þ | Small thorn, Icelandic
|
---|
57 | # \377 | =FF | ÿ | Small y, dieresis or umlaut mark
|
---|
58 | #
|
---|
59 | #I have also tried those offending REs from inside emacs (20.7.1), with
|
---|
60 | #query-replace-regexp, and it seems to be working fine. So presumably
|
---|
61 | #the bug lies in gawk itself, or in the RE parsing code, rather than in
|
---|
62 | #the matching engine?
|
---|
63 | #
|
---|
64 | #Could it be an underdimensioned table somewhere?
|
---|
65 | #
|
---|
66 | #Thanks for the help, and all the best
|
---|
67 | #
|
---|
68 | #--stolfi
|
---|
69 | #
|
---|
70 | # ----------------------------------------------------------------------
|
---|
71 | #! /usr/bin/gawk -f
|
---|
72 |
|
---|
# Driver: run the bracket-expression test once for every byte value
# from 0 through 255, in ascending order.
BEGIN {
	c = 0
	while (c < 256) {
		do_test(c)
		c++
	}
}
|
---|
77 |
|
---|
# do_test --- exercise the regex "[an\OOO]*n" for one character code.
#
# char is the numeric character code to embed (as a three-digit octal
# escape) in the bracket expression.  The remaining parameters are
# locals.  Prints the pattern together with the result of replacing
# every match in a fixed sample string by "AN", then a line saying
# whether the sample string matches the pattern at all.
function do_test(char,    bracket_re, subject, replaced)
{
	# Code 92 is reported as an error instead of being tested
	# (92 is ASCII backslash; presumably the resulting bracket
	# expression would not be a usable regex --- confirm).
	if (char == 92) {
		printf "(error for \\%03o)\n", char
		return
	}

	subject = "bananas and ananases in canaan"
	bracket_re = sprintf("[an\\%03o]*n", char)

	# Substitute on a copy so the sample string stays intact for
	# the match test below.
	replaced = subject
	gsub(bracket_re, "AN", replaced)
	printf "%-8s %s\n", bracket_re, replaced

	# ADR: added a plain match test alongside the substitution.
	if (subject ~ bracket_re) {
		printf "\tmatch\n"
	} else {
		printf "\tno-match\n"
	}
}
|
---|
87 |
|
---|
88 | # ----------------------------------------------------------------------
|
---|