Changeset 147 for trunk/src/helpers/encodings.c
- Timestamp:
- Mar 16, 2002, 8:53:47 AM (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/helpers/encodings.c
r97 r147 3 3 *@@sourcefile encodings.c: 4 4 * character encoding translations. 5 * 6 * See encCreateCodec for an introduction. 5 7 * 6 8 *@@header "encodings\base.h" … … 31 33 #include "setup.h" // code generation and debugging options 32 34 33 #include "encodings\base.h" // includes all other encodings 35 #include "helpers\standards.h" 36 37 #include "encodings\base.h" 38 #include "encodings\alltables.h" 39 // #include "encodings\collate.h" 34 40 35 41 #pragma hdrstop 36 42 37 typedef struct _ENCODINGTABLE 38 { 39 XWPENCODINGID EncodingID; 40 41 unsigned short cEntries; 42 unsigned short ausEntries[1]; // variable size 43 } ENCODINGTABLE, *PENCODINGTABLE; 44 45 46 /* 47 *@@ encRegisterEncoding: 48 * registers a new proprietary encoding with the engine. 49 * 50 * Before you can translate encodings with this engine, 51 * you have to register them. This makes sure that the 52 * big encoding tables will only be linked to the executable 53 * code if they are explicitly referenced. As a result, you 54 * have to #include "encodings\base.h" and pass a pointer to 55 * one of the global tables in the header files to this 56 * function. 57 * 58 * This returns an encoding handle that can then be used 59 * with the other encoding functions. 60 * 61 * Example: 62 * 63 + #include "encodings\base.h" 64 + #include "encodings\alltables.h" // or a specific table only 65 + 66 + int rc = encRegisterEncoding(&G_iso8859_1, 67 + sizeof(G_iso8859_1) / sizeof(G_iso8859_1[0]), 68 + enc_iso8859_1); // ID to register with 69 */ 70 71 long encRegisterEncoding(PXWPENCODINGMAP pEncodingMap, 72 unsigned long cArrayEntries, // count of array items 73 XWPENCODINGID EncodingID) // enum from encodings\base.h 74 { 75 long lrc = 0; 76 77 unsigned short usHighest = 0; 43 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id) 44 45 /* 46 *@@ G_aEncodings: 47 * list of all encodings supported by this engine 48 * (i.e. 
we have a corresponding codepage in 49 * include\encodings\*.h) together with some 50 * additional information for each encoding, 51 * such as the corresponding OS/2 codepage 52 * number and a descriptive string. 53 * 54 *@@added V [umoeller] 55 */ 56 57 struct 58 { 59 ENCID id; // engine ID (enum) 60 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h 61 unsigned long cEntries; // entries in map (array item count) 62 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none 63 ENCBYTECOUNT bc; 64 const char *pcszDescription; // description 65 } G_aEncodings[] = 66 { 67 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US", 68 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek", 69 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim", 70 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1", 71 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary, 72 // Romania, Poland 73 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic", 74 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)", 75 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese", 76 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic", 77 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew", 78 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French", 79 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt 80 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic", 81 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia 82 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2", 83 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand 84 // ENCODINGENTRY(cp932), 932 or 943?, DOUBLE, "Japanese Windows", 85 // ENCODINGENTRY(cp936), 936 or 946?, DOUBLE, "Chinese", 86 // ENCODINGENTRY(cp949), 951 or 949?, DOUBLE, "Korean", 87 // ENCODINGENTRY(cp950), 947 or 950?, DOUBLE, "Taiwan Big-5", // default in China? 
88 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended", 89 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2", 90 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic", 91 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1", 92 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek", 93 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish", 94 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew", 95 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic", 96 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4", 97 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown", 98 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)", 99 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)", 100 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)", 101 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)", 102 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)", 103 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)", 104 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece 105 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)", 106 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)", 107 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998", 108 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998", 109 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998", 110 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999", 111 112 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2", 113 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8" 114 }; 115 116 /* 117 *@@ FindEntry: 118 * 119 *@@added V0.9.18 (2002-03-08) [umoeller] 120 */ 121 122 static int FindEntry(ENCID id, 123 PXWPENCODINGMAP *ppMap, 124 unsigned long *pcEntries) 125 { 78 126 unsigned long ul; 79 80 // step 1:81 // run through the table and calculate the highest82 // character entry used83 127 for (ul = 
0; 84 ul < cArrayEntries;128 ul < ARRAYITEMCOUNT(G_aEncodings); 85 129 ul++) 86 130 { 87 unsigned short usFrom = pEncodingMap[ul].usFrom; 88 if (usFrom > usHighest) 89 usHighest = usFrom; 131 if (G_aEncodings[ul].id == id) 132 { 133 *ppMap = G_aEncodings[ul].pMap; 134 *pcEntries = G_aEncodings[ul].cEntries; 135 return (1); 136 } 90 137 } 91 138 92 // step 2: allocate encoding table 93 if (usHighest) 139 return (0); 140 } 141 142 /* 143 *@@ encFindIdForCodepage: 144 * returns the ENCID for the given OS/2 145 * codepage, or UNSUPPORTED if there's none. 146 * 147 *@@added V0.9.18 (2002-03-08) [umoeller] 148 */ 149 150 ENCID encFindIdForCodepage(unsigned short usCodepage, 151 const char **ppcszDescription, // out: codepage description; ptr can be NULL 152 ENCBYTECOUNT *pByteCount) 153 { 154 unsigned long ul; 155 for (ul = 0; 156 ul < ARRAYITEMCOUNT(G_aEncodings); 157 ul++) 94 158 { 95 // allocate memory as needed 96 unsigned long cb = sizeof(ENCODINGTABLE) 97 + ( (usHighest - 1) 98 * sizeof(unsigned short) 99 ); 100 101 PENCODINGTABLE pTableNew = (PENCODINGTABLE)malloc(cb); 102 if (pTableNew) 103 { 104 memset(pTableNew, -1, cb); 105 pTableNew->cEntries = usHighest; // array size 106 107 // step 3: fill encoding table 108 // this only has the Unicode target USHORTs; 109 // the source is simply the offset. So to 110 // get Unicode for character 123 in the specific encoding, 111 // do pTableNew->ausEntries[123]. 112 // If you get 0xFFFF, the encoding is undefined. 113 114 for (ul = 0; 115 ul < cArrayEntries; 116 ul++) 159 if (G_aEncodings[ul].usCodepageOS2 == usCodepage) 160 161 { 162 if (ppcszDescription) 163 *ppcszDescription = G_aEncodings[ul].pcszDescription; 164 if (pByteCount) 165 *pByteCount = G_aEncodings[ul].bc; 166 return G_aEncodings[ul].id; 167 } 168 } 169 170 return (UNSUPPORTED); 171 } 172 173 /* 174 *@@ encCreateCodec: 175 * creates a codec that can be used for conversion between 176 * Unicode and codepaged characters (and vice versa). 
177 * 178 * A codec essentially consists of two tables which can 179 * be used for quick index-based lookups in both directions. 180 * This function goes thru the tables provided in 181 * include\encodings\*.h and builds the codec tables 182 * from them. 183 * 184 * This function takes an encoding ID as input. Each 185 * codepage table in include\encodings\*.h has one 186 * of those IDs assigned. Use encFindIdForCodepage 187 * to find the ID for a given OS/2 codepage. 188 * 189 * Use codecs carefully and only when they are really 190 * needed for a specific conversion. Building a codec 191 * is expensive, so you should create a codec once 192 * and reuse it for future conversions. In addition, 193 * create codecs only for the codepages that are 194 * actually used. Each codec will take up 195 * n * sizeof(USHORT) bytes, where n is the highest 196 * Unicode character used in the codepage. 197 * 198 * Remarks: 199 * 200 * -- All codepages share the first 128 characters 201 * (0-0x7F) with ASCII. 202 * 203 * -- Since the first 128 characters (0-0x7F) in 204 * Unicode are equivalent to ASCII also, codecs 205 * are not needed if you process ASCII strings 206 * only. 207 * 208 * -- Since the next 128 characters (0x80-0xFF) in 209 * Unicode are equivalent to ISO/IEC 8859-1 210 * (Latin-1), codecs aren't needed for those 211 * strings either. 212 * 213 * Note that codepoints 0x80-0x9F are undefined 214 * in Latin-1 but used as control sequences in 215 * Unicode. 216 * 217 * -- As far as I know, codepage 1252, which is 218 * used per default under Windows, is equivalent 219 * to Latin 1 except that it also defines 220 * codepoints 0x80-0x9F to certain DTP characters. 221 * 222 * -- From my testing, codepage 1004 (which is 223 * described as "Windows-compatible" in most OS/2 224 * docs) is the same as codepage 1252, except for 225 * character 0xAF. 
226 * 227 * Unfortunately, OS/2 uses codepage 850 on most 228 * systems (and Windows uses OS/2 codepage 1252), 229 * so for conversion between those, codecs are needed. 230 */ 231 232 PCONVERSION encCreateCodec(ENCID id) 233 { 234 PXWPENCODINGMAP pEncodingMap; 235 unsigned long cArrayEntries; 236 237 if (FindEntry(id, 238 &pEncodingMap, 239 &cArrayEntries)) 240 { 241 unsigned short usHighestCP = 0, 242 usHighestUni = 0; 243 unsigned long ul; 244 245 // step 1: 246 // run through the table and calculate the highest 247 // character entry used 248 for (ul = 0; 249 ul < cArrayEntries; 250 ul++) 251 { 252 if (pEncodingMap[ul].usCP > usHighestCP) 253 usHighestCP = pEncodingMap[ul].usCP; 254 if (pEncodingMap[ul].usUni > usHighestUni) 255 usHighestUni = pEncodingMap[ul].usUni; 256 } 257 258 // step 2: allocate encoding table 259 if (usHighestCP && usHighestUni) 260 { 261 PCONVERSION pTableNew; 262 if (pTableNew = NEW(CONVERSION)) 117 263 { 118 PXWPENCODINGMAP pEntry = &pEncodingMap[ul]; 119 pTableNew->ausEntries[pEntry->usFrom] = pEntry->usUni; 264 unsigned long cbEntriesUniFromCP 265 = (usHighestCP + 1) * sizeof(unsigned short); 266 unsigned long cbEntriesCPFromUni 267 = (usHighestUni + 1) * sizeof(unsigned short); 268 269 ZERO(pTableNew); 270 271 pTableNew->usHighestCP = usHighestCP; 272 pTableNew->usHighestUni = usHighestUni; 273 274 if ( (pTableNew->ausEntriesUniFromCP 275 = (unsigned short*)malloc(cbEntriesUniFromCP)) 276 && (pTableNew->ausEntriesCPFromUni 277 = (unsigned short*)malloc(cbEntriesCPFromUni)) 278 ) 279 { 280 // step 3: fill encoding tables 281 282 memset(pTableNew->ausEntriesUniFromCP, 283 0xFF, 284 cbEntriesUniFromCP); 285 memset(pTableNew->ausEntriesCPFromUni, 286 0xFF, 287 cbEntriesCPFromUni); 288 289 for (ul = 0; 290 ul < cArrayEntries; 291 ul++) 292 { 293 PXWPENCODINGMAP pEntry = &pEncodingMap[ul]; 294 295 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni; 296 297 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP; 298 } 299 
300 return (pTableNew); 301 } 302 303 free(pTableNew); 120 304 } 121 122 lrc = (long)pTableNew;123 305 } 124 306 } 125 307 126 return (lrc); 308 return (NULL); 309 } 310 311 /* 312 *@@ encFreeCodec: 313 * frees a codec created with encCreateCodec 314 * and sets the given pointer to NULL. 315 * 316 *@@added V0.9.18 (2002-03-08) [umoeller] 317 */ 318 319 void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec 320 { 321 PCONVERSION pTable; 322 if (pTable = *ppTable) 323 { 324 if (pTable->ausEntriesUniFromCP) 325 free(pTable->ausEntriesUniFromCP); 326 if (pTable->ausEntriesCPFromUni) 327 free(pTable->ausEntriesCPFromUni); 328 free(pTable); 329 *ppTable = NULL; 330 } 331 } 332 333 /* 334 *@@ encChar2Uni: 335 * converts a codepage-specific character 336 * to Unicode, using the given conversion 337 * table from encCreateCodec(). 338 * 339 * Returns 0xFFFF on errors, which is unlikely 340 * with Unicode though. 341 * 342 *@@added V0.9.18 (2002-03-08) [umoeller] 343 */ 344 345 unsigned long encChar2Uni(PCONVERSION pTable, 346 unsigned short c) 347 { 348 if ( (pTable) 349 && (c <= pTable->usHighestCP) 350 ) 351 return (pTable->ausEntriesUniFromCP[c]); 352 353 return (0xFFFF); 354 } 355 356 /* 357 *@@ encUni2Char: 358 * converts a Unicode character to the 359 * codepage specified by the given 360 * conversion table from encCreateCodec(). 361 * 362 * Returns 0xFFFF if the Unicode character 363 * has no codepage equivalent. 
364 * 365 *@@added V0.9.18 (2002-03-08) [umoeller] 366 */ 367 368 unsigned short encUni2Char(PCONVERSION pTable, 369 unsigned long ulUni) 370 { 371 if ( (pTable) 372 && (ulUni <= pTable->usHighestUni) 373 ) 374 return (pTable->ausEntriesCPFromUni[ulUni]); 375 376 return (0xFFFF); 127 377 } 128 378 … … 246 496 } 247 497 498 #if 0 499 500 /* 501 *@@ encCodepageToUTF8: 502 * 503 *@@added V0.9.18 (2002-03-08) [umoeller] 504 */ 505 506 void encCodepageToUTF8(const char **ppch) 507 { 508 509 } 510 511 putwchar(c) 512 { 513 if (c < 0x80) { 514 putchar (c); 515 } 516 else if (c < 0x800) { 517 putchar (0xC0 | c>>6); 518 putchar (0x80 | c & 0x3F); 519 } 520 else if (c < 0x10000) { 521 putchar (0xE0 | c>>12); 522 putchar (0x80 | c>>6 & 0x3F); 523 putchar (0x80 | c & 0x3F); 524 } 525 else if (c < 0x200000) { 526 putchar (0xF0 | c>>18); 527 putchar (0x80 | c>>12 & 0x3F); 528 putchar (0x80 | c>>6 & 0x3F); 529 putchar (0x80 | c & 0x3F); 530 } 531 } 532 533 #endif 534
Note:
See TracChangeset
for help on using the changeset viewer.