/* Messy DOS-specific code for correctly treating binary, Unix text
   and DOS text files.

   This has several aspects:

     * Guessing the file type (unless the user tells us);
     * Stripping CR characters from DOS text files (otherwise regex
       functions won't work correctly);
     * Reporting correct byte count with -b for any kind of file.

*/
/* How an input file is to be treated.  UNKNOWN means the type has not
   been decided yet; undossify_input then asks guess_type to pick one
   of the other three values from the file contents.  Only DOS_TEXT
   files get their CR characters stripped.  */
typedef enum {
  UNKNOWN, DOS_BINARY, DOS_TEXT, UNIX_TEXT
} File_type;
/* One entry of the table that maps byte positions in the CR-stripped
   text back to byte positions in the original DOS file.  */
struct dos_map {
  off_t pos;	/* position in buffer passed to matcher */
  off_t add;	/* how much to add when reporting char position */
};
| 22 | static int dos_report_unix_offset = 0;
|
|---|
| 23 |
|
|---|
| 24 | static File_type dos_file_type = UNKNOWN;
|
|---|
| 25 | static File_type dos_use_file_type = UNKNOWN;
|
|---|
| 26 | static off_t dos_stripped_crs = 0;
|
|---|
| 27 | static struct dos_map *dos_pos_map;
|
|---|
| 28 | static int dos_pos_map_size = 0;
|
|---|
| 29 | static int dos_pos_map_used = 0;
|
|---|
| 30 | static int inp_map_idx = 0, out_map_idx = 1;
|
|---|
| 31 |
|
|---|
| 32 | /* Guess DOS file type by looking at its contents. */
|
|---|
| 33 | static inline File_type
|
|---|
| 34 | guess_type (char *buf, register size_t buflen)
|
|---|
| 35 | {
|
|---|
| 36 | int crlf_seen = 0;
|
|---|
| 37 | register char *bp = buf;
|
|---|
| 38 |
|
|---|
| 39 | while (buflen--)
|
|---|
| 40 | {
|
|---|
| 41 | /* Treat a file as binary if it has a NUL character. */
|
|---|
| 42 | if (!*bp)
|
|---|
| 43 | return DOS_BINARY;
|
|---|
| 44 |
|
|---|
| 45 | /* CR before LF means DOS text file (unless we later see
|
|---|
| 46 | binary characters). */
|
|---|
| 47 | else if (*bp == '\r' && buflen && bp[1] == '\n')
|
|---|
| 48 | crlf_seen = 1;
|
|---|
| 49 |
|
|---|
| 50 | bp++;
|
|---|
| 51 | }
|
|---|
| 52 |
|
|---|
| 53 | return crlf_seen ? DOS_TEXT : UNIX_TEXT;
|
|---|
| 54 | }
|
|---|
| 55 |
|
|---|
| 56 | /* Convert external DOS file representation to internal.
|
|---|
| 57 | Return the count of characters left in the buffer.
|
|---|
| 58 | Build table to map character positions when reporting byte counts. */
|
|---|
| 59 | static inline int
|
|---|
| 60 | undossify_input (register char *buf, size_t buflen)
|
|---|
| 61 | {
|
|---|
| 62 | int chars_left = 0;
|
|---|
| 63 |
|
|---|
| 64 | if (totalcc == 0)
|
|---|
| 65 | {
|
|---|
| 66 | /* New file: forget everything we knew about character
|
|---|
| 67 | position mapping table and file type. */
|
|---|
| 68 | inp_map_idx = 0;
|
|---|
| 69 | out_map_idx = 1;
|
|---|
| 70 | dos_pos_map_used = 0;
|
|---|
| 71 | dos_stripped_crs = 0;
|
|---|
| 72 | dos_file_type = dos_use_file_type;
|
|---|
| 73 | }
|
|---|
| 74 |
|
|---|
| 75 | /* Guess if this file is binary, unless we already know that. */
|
|---|
| 76 | if (dos_file_type == UNKNOWN)
|
|---|
| 77 | dos_file_type = guess_type(buf, buflen);
|
|---|
| 78 |
|
|---|
| 79 | /* If this file is to be treated as DOS Text, strip the CR characters
|
|---|
| 80 | and maybe build the table for character position mapping on output. */
|
|---|
| 81 | if (dos_file_type == DOS_TEXT)
|
|---|
| 82 | {
|
|---|
| 83 | char *destp = buf;
|
|---|
| 84 |
|
|---|
| 85 | while (buflen--)
|
|---|
| 86 | {
|
|---|
| 87 | if (*buf != '\r')
|
|---|
| 88 | {
|
|---|
| 89 | *destp++ = *buf++;
|
|---|
| 90 | chars_left++;
|
|---|
| 91 | }
|
|---|
| 92 | else
|
|---|
| 93 | {
|
|---|
| 94 | buf++;
|
|---|
| 95 | if (out_byte && !dos_report_unix_offset)
|
|---|
| 96 | {
|
|---|
| 97 | dos_stripped_crs++;
|
|---|
| 98 | while (buflen && *buf == '\r')
|
|---|
| 99 | {
|
|---|
| 100 | dos_stripped_crs++;
|
|---|
| 101 | buflen--;
|
|---|
| 102 | buf++;
|
|---|
| 103 | }
|
|---|
| 104 | if (inp_map_idx >= dos_pos_map_size - 1)
|
|---|
| 105 | {
|
|---|
| 106 | dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
|
|---|
| 107 | dos_pos_map =
|
|---|
| 108 | (struct dos_map *)xrealloc((char *)dos_pos_map,
|
|---|
| 109 | dos_pos_map_size *
|
|---|
| 110 | sizeof(struct dos_map));
|
|---|
| 111 | }
|
|---|
| 112 |
|
|---|
| 113 | if (!inp_map_idx)
|
|---|
| 114 | {
|
|---|
| 115 | /* Add sentinel entry. */
|
|---|
| 116 | dos_pos_map[inp_map_idx].pos = 0;
|
|---|
| 117 | dos_pos_map[inp_map_idx++].add = 0;
|
|---|
| 118 |
|
|---|
| 119 | /* Initialize first real entry. */
|
|---|
| 120 | dos_pos_map[inp_map_idx].add = 0;
|
|---|
| 121 | }
|
|---|
| 122 |
|
|---|
| 123 | /* Put the new entry. If the stripped CR characters
|
|---|
| 124 | precede a Newline (the usual case), pretend that
|
|---|
| 125 | they were found *after* the Newline. This makes
|
|---|
| 126 | displayed byte offsets more reasonable in some
|
|---|
| 127 | cases, and fits better the intuitive notion that
|
|---|
| 128 | the line ends *before* the CR, not *after* it. */
|
|---|
| 129 | inp_map_idx++;
|
|---|
| 130 | dos_pos_map[inp_map_idx-1].pos =
|
|---|
| 131 | (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
|
|---|
| 132 | dos_pos_map[inp_map_idx].add = dos_stripped_crs;
|
|---|
| 133 | dos_pos_map_used = inp_map_idx;
|
|---|
| 134 |
|
|---|
| 135 | /* The following will be updated on the next pass. */
|
|---|
| 136 | dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
|
|---|
| 137 | }
|
|---|
| 138 | }
|
|---|
| 139 | }
|
|---|
| 140 |
|
|---|
| 141 | return chars_left;
|
|---|
| 142 | }
|
|---|
| 143 |
|
|---|
| 144 | return buflen;
|
|---|
| 145 | }
|
|---|
| 146 |
|
|---|
| 147 | /* Convert internal byte count into external. */
|
|---|
| 148 | static inline off_t
|
|---|
| 149 | dossified_pos (off_t byteno)
|
|---|
| 150 | {
|
|---|
| 151 | off_t pos_lo;
|
|---|
| 152 | off_t pos_hi;
|
|---|
| 153 |
|
|---|
| 154 | if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
|
|---|
| 155 | return byteno;
|
|---|
| 156 |
|
|---|
| 157 | /* Optimization: usually the file will be scanned sequentially.
|
|---|
| 158 | So in most cases, this byte position will be found in the
|
|---|
| 159 | table near the previous one, as recorded in `out_map_idx'. */
|
|---|
| 160 | pos_lo = dos_pos_map[out_map_idx-1].pos;
|
|---|
| 161 | pos_hi = dos_pos_map[out_map_idx].pos;
|
|---|
| 162 |
|
|---|
| 163 | /* If the initial guess failed, search up or down, as
|
|---|
| 164 | appropriate, beginning with the previous place. */
|
|---|
| 165 | if (byteno >= pos_hi)
|
|---|
| 166 | {
|
|---|
| 167 | out_map_idx++;
|
|---|
| 168 | while (out_map_idx < dos_pos_map_used &&
|
|---|
| 169 | byteno >= dos_pos_map[out_map_idx].pos)
|
|---|
| 170 | out_map_idx++;
|
|---|
| 171 | }
|
|---|
| 172 |
|
|---|
| 173 | else if (byteno < pos_lo)
|
|---|
| 174 | {
|
|---|
| 175 | out_map_idx--;
|
|---|
| 176 | while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
|
|---|
| 177 | out_map_idx--;
|
|---|
| 178 | }
|
|---|
| 179 |
|
|---|
| 180 | return byteno + dos_pos_map[out_map_idx].add;
|
|---|
| 181 | }
|
|---|