1 | # From arnold@f7.net Sun Sep 5 12:30:53 2004
|
---|
2 | # Date: Fri, 3 Sep 2004 00:54:32 -0400 (EDT)
|
---|
3 | # From: William J Poser <wjposer@ldc.upenn.edu>
|
---|
4 | # To: arnold@skeeve.com
|
---|
5 | # Subject: gawk bug
|
---|
6 | # Message-ID: <20040903004347.W80049@lorax.ldc.upenn.edu>
|
---|
7 | #
|
---|
8 | # Here is a revised version of my previous message, modified to describe
|
---|
9 | # the accompanying files.
|
---|
10 | #
|
---|
11 | # IhSplit.awk should replicate every record with exactly one entry in the
|
---|
12 | # IH field, delete records lacking an IH field, and produce as many copies
|
---|
13 | # of records with two or more entries in the IH field as there are entries.
|
---|
14 | # In the latter case, the original IH field should be relabelled OIH and
|
---|
15 | # a new IH field be added at the beginning of the record.
|
---|
16 | #
|
---|
17 | # This has worked properly for many years, since at least 1997. It worked properly with gawk 3.0.5
|
---|
18 | # and possibly later versions. Unfortunately I didn't keep track of exactly what version it
|
---|
19 | # broke on, but it was whatever came with Mandrake Linux 9.0. It continued to fail with version
|
---|
20 | # 3.1.2. However, the problem was eliminated with version 3.1.3 and remains
|
---|
21 | # eliminated in version 3.1.4.
|
---|
22 | #
|
---|
23 | # The problem was that an apparently random subset of records would lose some
|
---|
24 | # or all of their fields. Running the script on the same input always produces
|
---|
25 | # the same output with the same errors.
|
---|
26 | #
|
---|
27 | # The file Input is a subset of a real lexicon that produces errors using
|
---|
28 | # gawk 3.1.2. GoodOutput is the expected output. BadOutput is the erroneous
|
---|
29 | # output. A diff will show that there are actually two errors. One record
|
---|
30 | # has fields stripped as described above. Another is omitted in its entirety.
|
---|
31 | #
|
---|
32 | #
|
---|
33 | # Bill Poser, Linguistics, University of Pennsylvania
|
---|
34 | # http://www.ling.upenn.edu/~wjposer/ billposer@alum.mit.edu
|
---|
35 | # ----------------------------------------------------------------------------
|
---|
36 | #For each record that contains multiple items in its inverse headword (IH)
|
---|
37 | #field, generate a set of new records each containing exactly one item
|
---|
38 | #in the inverse headword field, otherwise copies of the original.
|
---|
39 |
|
---|
# CleanUp: empty the global rec array so the next input record starts
# with no tags left over from the current one.
#
# The extra parameter i is never supplied by callers; listing it in the
# parameter list is the awk way to declare a local variable.  Without it
# the loop below would overwrite the global i that the main rule uses as
# its loop counter.
function CleanUp(    i)
{
    for (i in rec)
        delete rec[i];
}
|
---|
44 |
|
---|
# One-time input setup, performed before the first record is read.
BEGIN {
    # A field starts at each "%", optionally preceded by a newline.
    FS = "\n?%";
    # An empty RS selects paragraph mode: records are separated by blank lines.
    RS = "";
}
|
---|
# Main rule, run once per input record:
#   - a record with no IH tag is dropped;
#   - any other record is printed once per "/"-separated entry of its IH
#     field.  Each copy is preceded by a new "%IH:" line holding that one
#     entry, and the first "%IH:" in the record text is renamed "%OIH:".
{
    # First, create an associative array with the tags as indices.
    # The leading FS creates an initial empty field, so start at field 2.
    # The value is everything after the first ":", so it may itself
    # contain further colons.
    for (i = 2; i <= NF; i++) {
        split($i, f, ":");
        rec[f[1]] = substr($i, index($i, ":") + 1);
    }

    if (!("IH" in rec)) {
        # Empty rec before skipping the record; a bare "next" here would
        # bypass the CleanUp() at the end of the rule and leave this
        # record's tags in rec for the following records.
        CleanUp();
        next;
    }

    # Parse out the inverse headwords.
    items = split(rec["IH"], ihs, "/");

    # Relabel the old IH field (only the first "%IH:" in the record).
    sub(/%IH:/, "%OIH:", $0);

    # Generate a new copy of the record for each inverse headword.
    for (i = 1; i <= items; i++) {
        entries += 1;    # running count of copies written; not read in this rule
        printf("%%IH:%s\n", ihs[i]);
        printf("%s\n\n", $0);
    }

    CleanUp();
}
|
---|