/* Messy DOS-specific code for correctly treating binary, Unix text
   and DOS text files.

   This has several aspects:

     * Guessing the file type (unless the user tells us);
     * Stripping CR characters from DOS text files (otherwise regex
       functions won't work correctly);
     * Reporting correct byte count with -b for any kind of file.

*/
|---|
| 12 |  | 
|---|
/* How the contents of the current input file are to be treated.  */
typedef enum
{
  UNKNOWN,      /* Not decided yet; the type is guessed from the data.  */
  DOS_BINARY,   /* Data contains a NUL byte; CRs are left in place.  */
  DOS_TEXT,     /* A CR-LF pair was seen; CRs are stripped on input.  */
  UNIX_TEXT     /* Neither of the above; data is used as is.  */
} File_type;
|---|
| 16 |  | 
|---|
/* One entry of the table used to turn a byte position in the
   CR-stripped text back into a byte position in the original file.  */
struct dos_map
{
  /* Position in the buffer passed to the matcher.  An entry is
     applied to positions that lie below its own `pos' and at or
     above the `pos' of the entry before it.  */
  off_t pos;

  /* How much to add when reporting a character position covered by
     this entry.  */
  off_t add;
};
|---|
| 21 |  | 
|---|
| 22 | static int       dos_report_unix_offset = 0; | 
|---|
| 23 |  | 
|---|
| 24 | static File_type dos_file_type     = UNKNOWN; | 
|---|
| 25 | static File_type dos_use_file_type = UNKNOWN; | 
|---|
| 26 | static off_t     dos_stripped_crs  = 0; | 
|---|
| 27 | static struct dos_map *dos_pos_map; | 
|---|
| 28 | static int       dos_pos_map_size  = 0; | 
|---|
| 29 | static int       dos_pos_map_used  = 0; | 
|---|
| 30 | static int       inp_map_idx = 0, out_map_idx = 1; | 
|---|
| 31 |  | 
|---|
| 32 | /* Guess DOS file type by looking at its contents.  */ | 
|---|
| 33 | static inline File_type | 
|---|
| 34 | guess_type (char *buf, register size_t buflen) | 
|---|
| 35 | { | 
|---|
| 36 | int crlf_seen = 0; | 
|---|
| 37 | register char *bp = buf; | 
|---|
| 38 |  | 
|---|
| 39 | while (buflen--) | 
|---|
| 40 | { | 
|---|
| 41 | /* Treat a file as binary if it has a NUL character.  */ | 
|---|
| 42 | if (!*bp) | 
|---|
| 43 | return DOS_BINARY; | 
|---|
| 44 |  | 
|---|
| 45 | /* CR before LF means DOS text file (unless we later see | 
|---|
| 46 | binary characters).  */ | 
|---|
| 47 | else if (*bp == '\r' && buflen && bp[1] == '\n') | 
|---|
| 48 | crlf_seen = 1; | 
|---|
| 49 |  | 
|---|
| 50 | bp++; | 
|---|
| 51 | } | 
|---|
| 52 |  | 
|---|
| 53 | return crlf_seen ? DOS_TEXT : UNIX_TEXT; | 
|---|
| 54 | } | 
|---|
| 55 |  | 
|---|
| 56 | /* Convert external DOS file representation to internal. | 
|---|
| 57 | Return the count of characters left in the buffer. | 
|---|
| 58 | Build table to map character positions when reporting byte counts.  */ | 
|---|
| 59 | static inline int | 
|---|
| 60 | undossify_input (register char *buf, size_t buflen) | 
|---|
| 61 | { | 
|---|
| 62 | int chars_left = 0; | 
|---|
| 63 |  | 
|---|
| 64 | if (totalcc == 0) | 
|---|
| 65 | { | 
|---|
| 66 | /* New file: forget everything we knew about character | 
|---|
| 67 | position mapping table and file type.  */ | 
|---|
| 68 | inp_map_idx = 0; | 
|---|
| 69 | out_map_idx = 1; | 
|---|
| 70 | dos_pos_map_used = 0; | 
|---|
| 71 | dos_stripped_crs = 0; | 
|---|
| 72 | dos_file_type = dos_use_file_type; | 
|---|
| 73 | } | 
|---|
| 74 |  | 
|---|
| 75 | /* Guess if this file is binary, unless we already know that.  */ | 
|---|
| 76 | if (dos_file_type == UNKNOWN) | 
|---|
| 77 | dos_file_type = guess_type(buf, buflen); | 
|---|
| 78 |  | 
|---|
| 79 | /* If this file is to be treated as DOS Text, strip the CR characters | 
|---|
| 80 | and maybe build the table for character position mapping on output.  */ | 
|---|
| 81 | if (dos_file_type == DOS_TEXT) | 
|---|
| 82 | { | 
|---|
| 83 | char   *destp   = buf; | 
|---|
| 84 |  | 
|---|
| 85 | while (buflen--) | 
|---|
| 86 | { | 
|---|
| 87 | if (*buf != '\r') | 
|---|
| 88 | { | 
|---|
| 89 | *destp++ = *buf++; | 
|---|
| 90 | chars_left++; | 
|---|
| 91 | } | 
|---|
| 92 | else | 
|---|
| 93 | { | 
|---|
| 94 | buf++; | 
|---|
| 95 | if (out_byte && !dos_report_unix_offset) | 
|---|
| 96 | { | 
|---|
| 97 | dos_stripped_crs++; | 
|---|
| 98 | while (buflen && *buf == '\r') | 
|---|
| 99 | { | 
|---|
| 100 | dos_stripped_crs++; | 
|---|
| 101 | buflen--; | 
|---|
| 102 | buf++; | 
|---|
| 103 | } | 
|---|
| 104 | if (inp_map_idx >= dos_pos_map_size - 1) | 
|---|
| 105 | { | 
|---|
| 106 | dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000; | 
|---|
| 107 | dos_pos_map = | 
|---|
| 108 | (struct dos_map *)xrealloc((char *)dos_pos_map, | 
|---|
| 109 | dos_pos_map_size * | 
|---|
| 110 | sizeof(struct dos_map)); | 
|---|
| 111 | } | 
|---|
| 112 |  | 
|---|
| 113 | if (!inp_map_idx) | 
|---|
| 114 | { | 
|---|
| 115 | /* Add sentinel entry.  */ | 
|---|
| 116 | dos_pos_map[inp_map_idx].pos = 0; | 
|---|
| 117 | dos_pos_map[inp_map_idx++].add = 0; | 
|---|
| 118 |  | 
|---|
| 119 | /* Initialize first real entry.  */ | 
|---|
| 120 | dos_pos_map[inp_map_idx].add = 0; | 
|---|
| 121 | } | 
|---|
| 122 |  | 
|---|
| 123 | /* Put the new entry.  If the stripped CR characters | 
|---|
| 124 | precede a Newline (the usual case), pretend that | 
|---|
| 125 | they were found *after* the Newline.  This makes | 
|---|
| 126 | displayed byte offsets more reasonable in some | 
|---|
| 127 | cases, and fits better the intuitive notion that | 
|---|
| 128 | the line ends *before* the CR, not *after* it.  */ | 
|---|
| 129 | inp_map_idx++; | 
|---|
| 130 | dos_pos_map[inp_map_idx-1].pos = | 
|---|
| 131 | (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc; | 
|---|
| 132 | dos_pos_map[inp_map_idx].add = dos_stripped_crs; | 
|---|
| 133 | dos_pos_map_used = inp_map_idx; | 
|---|
| 134 |  | 
|---|
| 135 | /* The following will be updated on the next pass.  */ | 
|---|
| 136 | dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1; | 
|---|
| 137 | } | 
|---|
| 138 | } | 
|---|
| 139 | } | 
|---|
| 140 |  | 
|---|
| 141 | return chars_left; | 
|---|
| 142 | } | 
|---|
| 143 |  | 
|---|
| 144 | return buflen; | 
|---|
| 145 | } | 
|---|
| 146 |  | 
|---|
| 147 | /* Convert internal byte count into external.  */ | 
|---|
| 148 | static inline off_t | 
|---|
| 149 | dossified_pos (off_t byteno) | 
|---|
| 150 | { | 
|---|
| 151 | off_t pos_lo; | 
|---|
| 152 | off_t pos_hi; | 
|---|
| 153 |  | 
|---|
| 154 | if (dos_file_type != DOS_TEXT || dos_report_unix_offset) | 
|---|
| 155 | return byteno; | 
|---|
| 156 |  | 
|---|
| 157 | /* Optimization: usually the file will be scanned sequentially. | 
|---|
| 158 | So in most cases, this byte position will be found in the | 
|---|
| 159 | table near the previous one, as recorded in `out_map_idx'.  */ | 
|---|
| 160 | pos_lo = dos_pos_map[out_map_idx-1].pos; | 
|---|
| 161 | pos_hi = dos_pos_map[out_map_idx].pos; | 
|---|
| 162 |  | 
|---|
| 163 | /* If the initial guess failed, search up or down, as | 
|---|
| 164 | appropriate, beginning with the previous place.  */ | 
|---|
| 165 | if (byteno >= pos_hi) | 
|---|
| 166 | { | 
|---|
| 167 | out_map_idx++; | 
|---|
| 168 | while (out_map_idx < dos_pos_map_used && | 
|---|
| 169 | byteno >= dos_pos_map[out_map_idx].pos) | 
|---|
| 170 | out_map_idx++; | 
|---|
| 171 | } | 
|---|
| 172 |  | 
|---|
| 173 | else if (byteno < pos_lo) | 
|---|
| 174 | { | 
|---|
| 175 | out_map_idx--; | 
|---|
| 176 | while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos) | 
|---|
| 177 | out_map_idx--; | 
|---|
| 178 | } | 
|---|
| 179 |  | 
|---|
| 180 | return byteno + dos_pos_map[out_map_idx].add; | 
|---|
| 181 | } | 
|---|