source: trunk/grep/src/dosbuf.c@ 2698

Last change on this file since 2698 was 2557, checked in by bird, 19 years ago

grep 2.5.1a

File size: 5.4 KB
Line 
1/* Messy DOS-specific code for correctly treating binary, Unix text
2 and DOS text files.
3
4 This has several aspects:
5
6 * Guessing the file type (unless the user tells us);
7 * Stripping CR characters from DOS text files (otherwise regex
8 functions won't work correctly);
9 * Reporting correct byte count with -b for any kind of file.
10
11*/
12
/* How the contents of the current input file are to be treated.  */
typedef enum
{
  UNKNOWN,      /* Not decided yet; the type is guessed from the data.  */
  DOS_BINARY,   /* Contains a NUL byte; passed through untouched.  */
  DOS_TEXT,     /* Has CR-LF line ends; CR characters get stripped.  */
  UNIX_TEXT     /* Plain text without CR-LF pairs; passed through.  */
} File_type;
16
/* One entry of the table that maps positions in the CR-stripped
   buffer back to byte offsets in the original file.  */
struct dos_map
{
  /* Position in the buffer passed to the matcher.  */
  off_t pos;

  /* How much to add when reporting a character position.  */
  off_t add;
};
21
22static int dos_report_unix_offset = 0;
23
24static File_type dos_file_type = UNKNOWN;
25static File_type dos_use_file_type = UNKNOWN;
26static off_t dos_stripped_crs = 0;
27static struct dos_map *dos_pos_map;
28static int dos_pos_map_size = 0;
29static int dos_pos_map_used = 0;
30static int inp_map_idx = 0, out_map_idx = 1;
31
32/* Guess DOS file type by looking at its contents. */
33static inline File_type
34guess_type (char *buf, register size_t buflen)
35{
36 int crlf_seen = 0;
37 register char *bp = buf;
38
39 while (buflen--)
40 {
41 /* Treat a file as binary if it has a NUL character. */
42 if (!*bp)
43 return DOS_BINARY;
44
45 /* CR before LF means DOS text file (unless we later see
46 binary characters). */
47 else if (*bp == '\r' && buflen && bp[1] == '\n')
48 crlf_seen = 1;
49
50 bp++;
51 }
52
53 return crlf_seen ? DOS_TEXT : UNIX_TEXT;
54}
55
56/* Convert external DOS file representation to internal.
57 Return the count of characters left in the buffer.
58 Build table to map character positions when reporting byte counts. */
59static inline int
60undossify_input (register char *buf, size_t buflen)
61{
62 int chars_left = 0;
63
64 if (totalcc == 0)
65 {
66 /* New file: forget everything we knew about character
67 position mapping table and file type. */
68 inp_map_idx = 0;
69 out_map_idx = 1;
70 dos_pos_map_used = 0;
71 dos_stripped_crs = 0;
72 dos_file_type = dos_use_file_type;
73 }
74
75 /* Guess if this file is binary, unless we already know that. */
76 if (dos_file_type == UNKNOWN)
77 dos_file_type = guess_type(buf, buflen);
78
79 /* If this file is to be treated as DOS Text, strip the CR characters
80 and maybe build the table for character position mapping on output. */
81 if (dos_file_type == DOS_TEXT)
82 {
83 char *destp = buf;
84
85 while (buflen--)
86 {
87 if (*buf != '\r')
88 {
89 *destp++ = *buf++;
90 chars_left++;
91 }
92 else
93 {
94 buf++;
95 if (out_byte && !dos_report_unix_offset)
96 {
97 dos_stripped_crs++;
98 while (buflen && *buf == '\r')
99 {
100 dos_stripped_crs++;
101 buflen--;
102 buf++;
103 }
104 if (inp_map_idx >= dos_pos_map_size - 1)
105 {
106 dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
107 dos_pos_map =
108 (struct dos_map *)xrealloc((char *)dos_pos_map,
109 dos_pos_map_size *
110 sizeof(struct dos_map));
111 }
112
113 if (!inp_map_idx)
114 {
115 /* Add sentinel entry. */
116 dos_pos_map[inp_map_idx].pos = 0;
117 dos_pos_map[inp_map_idx++].add = 0;
118
119 /* Initialize first real entry. */
120 dos_pos_map[inp_map_idx].add = 0;
121 }
122
123 /* Put the new entry. If the stripped CR characters
124 precede a Newline (the usual case), pretend that
125 they were found *after* the Newline. This makes
126 displayed byte offsets more reasonable in some
127 cases, and fits better the intuitive notion that
128 the line ends *before* the CR, not *after* it. */
129 inp_map_idx++;
130 dos_pos_map[inp_map_idx-1].pos =
131 (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
132 dos_pos_map[inp_map_idx].add = dos_stripped_crs;
133 dos_pos_map_used = inp_map_idx;
134
135 /* The following will be updated on the next pass. */
136 dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
137 }
138 }
139 }
140
141 return chars_left;
142 }
143
144 return buflen;
145}
146
147/* Convert internal byte count into external. */
148static inline off_t
149dossified_pos (off_t byteno)
150{
151 off_t pos_lo;
152 off_t pos_hi;
153
154 if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
155 return byteno;
156
157 /* Optimization: usually the file will be scanned sequentially.
158 So in most cases, this byte position will be found in the
159 table near the previous one, as recorded in `out_map_idx'. */
160 pos_lo = dos_pos_map[out_map_idx-1].pos;
161 pos_hi = dos_pos_map[out_map_idx].pos;
162
163 /* If the initial guess failed, search up or down, as
164 appropriate, beginning with the previous place. */
165 if (byteno >= pos_hi)
166 {
167 out_map_idx++;
168 while (out_map_idx < dos_pos_map_used &&
169 byteno >= dos_pos_map[out_map_idx].pos)
170 out_map_idx++;
171 }
172
173 else if (byteno < pos_lo)
174 {
175 out_map_idx--;
176 while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
177 out_map_idx--;
178 }
179
180 return byteno + dos_pos_map[out_map_idx].add;
181}
Note: See TracBrowser for help on using the repository browser.