source: trunk/src/helpers/encodings.c@ 186

Last change on this file since 186 was 186, checked in by umoeller, 23 years ago

Some Unicode fixes.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 20.5 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding translations.
5 *
6 * See encCreateCodec for an introduction.
7 *
8 * Be warned, compilation of this file takes a long
9 * time because this includes all the complex codepage
10 * tables from include\encodings.
11 *
12 *@@header "encodings\base.h"
13 *@@added V0.9.9 (2001-02-14) [umoeller]
14 */
15
16/*
17 * Copyright (C) 2001-2002 Ulrich Möller.
18 * This file is part of the "XWorkplace helpers" source package.
19 * This is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published
21 * by the Free Software Foundation, in version 2 as it comes in the
22 * "COPYING" file of the XWorkplace main distribution.
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 */
28
29#define OS2EMX_PLAIN_CHAR
30 // this is needed for "os2emx.h"; if this is defined,
31 // emx will define PSZ as _signed_ char, otherwise
32 // as unsigned char
33
34#include <stdlib.h>
35#include <string.h>
36
37#include "setup.h" // code generation and debugging options
38
39#include "helpers\standards.h"
40
41#include "encodings\base.h"
42
43#include "encodings\unicase.h"
44
45#include "encodings\alltables.h" // this takes a very long time
46
47#pragma hdrstop
48
49/*
50 *@@category: Helpers\National Language Support\Encodings
51 * See encodings.c.
52 */
53
54/*
55 *@@ G_aEncodings:
56 * list of all encodings supported by this engine
57 * (i.e. we have a corresponding codepage in
58 * include\encodings\*.h) together with some
59 * additional information for each encoding,
60 * such as the corresponding OS/2 codepage
61 * number and a descriptive string.
62 *
63 *@@added V [umoeller]
64 */
65
66struct
67{
68 ENCID id; // engine ID (enum)
69 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
70 unsigned long cEntries; // entries in map (array item count)
71 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
72 ENCBYTECOUNT bc;
73 const char *pcszDescription; // description
74} G_aEncodings[] =
75 {
76 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
77
78 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
79 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
80 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
81 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
82 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
83 // Romania, Poland
84 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
85 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
86 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
87 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
88 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
89 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
90 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
91 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
92 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
93 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
94 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
95
96 ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
97 ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
98 ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
99 ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
100
101 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
102 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
103 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
104 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
105 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
106 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
107 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
108 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
109 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
110 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
111 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
112 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
113 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
114 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
115 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
116 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
117 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
118 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
119 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
120 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
121 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
122 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
123 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
124
125 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
126 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
127 };
128
/*
 *@@ ENCCASEFOLD:
 *      lookup table for case folding, indexed by
 *      Unicode character. The table is built by
 *      encInitCase and read by encToUpper.
 *
 *      aulFolds is declared with a single item only,
 *      but encInitCase allocates the structure with
 *      additional room behind it, so the array really
 *      is of variable size.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

typedef struct _ENCCASEFOLD
{
    unsigned long   cEntries;       // limit that encToUpper checks a character
                                    // against before indexing aulFolds
    unsigned long   aulFolds[1];    // fold for each character, or 0 if the
                                    // character has none; variable-size array
} ENCCASEFOLD, *PENCCASEFOLD;

// the one global casefold table; set by encInitCase
static PENCCASEFOLD G_pFold = NULL;
142
143/*
144 *@@ encGetTable:
145 *
146 *@@added V0.9.18 (2002-03-08) [umoeller]
147 */
148
149int encGetTable(ENCID id,
150 PXWPENCODINGMAP *ppMap,
151 unsigned long *pcEntries)
152{
153 unsigned long ul;
154 for (ul = 0;
155 ul < ARRAYITEMCOUNT(G_aEncodings);
156 ul++)
157 {
158 if (G_aEncodings[ul].id == id)
159 {
160 *ppMap = G_aEncodings[ul].pMap;
161 *pcEntries = G_aEncodings[ul].cEntries;
162 return (1);
163 }
164 }
165
166 return (0);
167}
168
169/*
170 *@@ encFindIdForCodepage:
171 * returns the ENCID for the given OS/2
172 * codepage, or UNSUPPORTED if there's none.
173 *
174 *@@added V0.9.18 (2002-03-08) [umoeller]
175 */
176
177ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
178 const char **ppcszDescription, // out: codepage description; ptr can be NULL
179 ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
180{
181 unsigned long ul;
182 for (ul = 0;
183 ul < ARRAYITEMCOUNT(G_aEncodings);
184 ul++)
185 {
186 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
187
188 {
189 if (ppcszDescription)
190 *ppcszDescription = G_aEncodings[ul].pcszDescription;
191 if (pByteCount)
192 *pByteCount = G_aEncodings[ul].bc;
193 return G_aEncodings[ul].id;
194 }
195 }
196
197 return (UNSUPPORTED);
198}
199
200/*
201 *@@ encCreateCodec:
202 * creates a codec that can be used for conversion between
203 * Unicode and codepaged characters (and vice versa).
204 *
205 * A codec essentially consists of two tables which can
206 * be used for quick index-based lookups in both directions.
207 * This function goes thru the tables provided in
208 * include\encodings\*.h and builds the codec tables
209 * from them.
210 *
211 * This function takes an encoding ID as input. Each
212 * codepage table in include\encodings\*.h has one
213 * of those IDs assigned. Use encFindIdForCodepage
214 * to find the ID for a given OS/2 codepage.
215 *
216 * Use codecs carefully and only when they are really
217 * needed for a specific conversion. Building a codec
218 * is expensive, so you should create a codec once
219 * and reuse it for future conversions. In addition,
220 * create codecs only for the codepages that are
221 * actually used. Each codec will take up
222 * n * sizeof(USHORT) bytes, where n is the highest
223 * Unicode character used in the codepage.
224 *
225 * Codec remarks:
226 *
227 * -- All codepages share the first 128 characters
228 * (0-0x7F) with ASCII.
229 *
230 * -- Since the first 128 characters (0-0x7F) in
231 * Unicode are equivalent to ASCII also, codecs
232 * are not needed if you process ASCII strings
233 * only.
234 *
235 * -- Since the next 128 characters (0x80-0xFF) in
236 * Unicode are equivalent to ISO/IEC 8859-1
237 * (Latin-1), codecs aren't needed for those
238 * strings either.
239 *
240 * Note that codepoints 0x80-0x9F are undefined
241 * in Latin-1 but used as control sequences in
242 * Unicode.
243 *
244 * -- As far as I know, codepage 1252, which is
245 * used per default under Windows, is equivalent
246 * to Latin 1 except that it also defines
247 * codepoints 0x80-0x9F to certain DTP characters.
248 *
249 * -- From my testing, codepage 1004 (which is
250 * described as "Windows-compatible" in most OS/2
251 * docs) is the same as codepage 1252, except for
252 * character 0xAF.
253 *
254 * Unfortunately, OS/2 uses codepage 850 on most
255 * systems (and Windows uses OS/2 codepage 1252),
256 * so for conversion between those, codecs are needed.
257 *
258 * This works and is presently used in WarpIN.
259 */
260
261PCONVERSION encCreateCodec(ENCID id)
262{
263 PXWPENCODINGMAP pEncodingMap;
264 unsigned long cArrayEntries;
265
266 if (encGetTable(id,
267 &pEncodingMap,
268 &cArrayEntries))
269 {
270 unsigned short usHighestCP = 0,
271 usHighestUni = 0;
272 unsigned long ul;
273
274 // step 1:
275 // run through the table and calculate the highest
276 // character entry used
277 for (ul = 0;
278 ul < cArrayEntries;
279 ul++)
280 {
281 if (pEncodingMap[ul].usCP > usHighestCP)
282 usHighestCP = pEncodingMap[ul].usCP;
283 if (pEncodingMap[ul].usUni > usHighestUni)
284 usHighestUni = pEncodingMap[ul].usUni;
285 }
286
287 // step 2: allocate encoding table
288 if (usHighestCP && usHighestUni)
289 {
290 PCONVERSION pTableNew;
291 if (pTableNew = NEW(CONVERSION))
292 {
293 unsigned long cbEntriesUniFromCP
294 = (usHighestCP + 1) * sizeof(unsigned short);
295 unsigned long cbEntriesCPFromUni
296 = (usHighestUni + 1) * sizeof(unsigned short);
297
298 ZERO(pTableNew);
299
300 pTableNew->usHighestCP = usHighestCP;
301 pTableNew->usHighestUni = usHighestUni;
302
303 if ( (pTableNew->ausEntriesUniFromCP
304 = (unsigned short*)malloc(cbEntriesUniFromCP))
305 && (pTableNew->ausEntriesCPFromUni
306 = (unsigned short*)malloc(cbEntriesCPFromUni))
307 )
308 {
309 // step 3: fill encoding tables
310
311 memset(pTableNew->ausEntriesUniFromCP,
312 0xFF,
313 cbEntriesUniFromCP);
314 memset(pTableNew->ausEntriesCPFromUni,
315 0xFF,
316 cbEntriesCPFromUni);
317
318 for (ul = 0;
319 ul < cArrayEntries;
320 ul++)
321 {
322 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
323
324 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
325
326 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
327 }
328
329 return (pTableNew);
330 }
331
332 free(pTableNew);
333 }
334 }
335 }
336
337 return NULL;
338}
339
340/*
341 *@@ encFreeCodec:
342 * frees a codec created with encFreeConversion
343 * and sets the given pointer to NULL.
344 *
345 * This works and is presently used in WarpIN.
346 *
347 *@@added V0.9.18 (2002-03-08) [umoeller]
348 */
349
350void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
351{
352 PCONVERSION pTable;
353 if (pTable = *ppTable)
354 {
355 if (pTable->ausEntriesUniFromCP)
356 free(pTable->ausEntriesUniFromCP);
357 if (pTable->ausEntriesCPFromUni)
358 free(pTable->ausEntriesCPFromUni);
359 free(pTable);
360 *ppTable = NULL;
361 }
362}
363
364/*
365 *@@ encChar2Uni:
366 * converts a codepage-specific character
367 * to Unicode, using the given conversion
368 * table from encCreateCodec().
369 *
370 * Returns 0xFFFF on errors, which is unlikely
371 * with Unicode though.
372 *
373 * This works and is presently used in WarpIN.
374 *
375 *@@added V0.9.18 (2002-03-08) [umoeller]
376 */
377
378unsigned long encChar2Uni(PCONVERSION pTable,
379 unsigned short c)
380{
381 if ( (pTable)
382 && (c <= pTable->usHighestCP)
383 )
384 return (pTable->ausEntriesUniFromCP[c]);
385
386 return (0xFFFF);
387}
388
389/*
390 *@@ encUni2Char:
391 * converts a Unicode character to the
392 * codepage specified by the given
393 * conversion table from encCreateCodec().
394 *
395 * Returns 0xFFFF if the Unicode character
396 * has no codepage equivalent.
397 *
398 * This works and is presently used in WarpIN.
399 *
400 *@@added V0.9.18 (2002-03-08) [umoeller]
401 */
402
403unsigned short encUni2Char(PCONVERSION pTable,
404 unsigned long ulUni)
405{
406 if ( (pTable)
407 && (ulUni <= pTable->usHighestUni)
408 )
409 return (pTable->ausEntriesCPFromUni[ulUni]);
410
411 return (0xFFFF);
412}
413
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read, in a null-terminated string.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, i.e. to a byte that
 *      cannot start a UTF-8 char or to a
 *      multi-byte sequence whose trailing bytes
 *      are not all of the form 10xxxxxx. In that
 *      case the pointer is advanced anyway: the
 *      offending byte is skipped together with
 *      all bytes following it that have bit 7 set,
 *      so that *ppch ends up on the next ASCII
 *      byte or on the null terminator.
 *
 *      This returns 0 if **ppch points to a
 *      null character. This is the only case
 *      in which *ppch is not advanced.
 *
 *      Besides the sequences of up to four bytes,
 *      this still accepts the five-byte and six-byte
 *      forms of the original UTF-8 definition. There
 *      are no checks for overlong encodings (other
 *      than the 0xc0 and 0xc1 lead bytes) nor for
 *      surrogate values.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // look at the string as unsigned bytes: plain char may
    // be signed, and then all bytes above 0x7F would be
    // sign-extended and fail the range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb,
                        ulCount,        // total bytes in sequence, 0 if illegal
                        ul2;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray trail byte (0x80-0xbf), 0xc0, 0xc1, 0xfe or 0xff
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // trail bytes must be 10xxxxxx; since the null
        // terminator fails this test as well, we never
        // read beyond the end of the string
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return (ulChar);
    }

    // illegal: skip the offending byte and all the
    // following bytes until we find something with
    // bit 7 off (which includes the null terminator)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
538
539/*
540 *@@ CreateCaseFold:
541 * creates a casefold for later use with
542 * encToUpper.
543 *
544 * This only uses one-byte sequences from
545 * the Unicode case folding table (see
546 * include\encodings\unicase.h), so this
547 * cannot be used for expanding characters
548 * at this point.
549 *
550 * Returns 1 (TRUE) on success.
551 *
552 * This works and is presently used in WarpIN.
553 *
554 *@@added V0.9.20 (2002-07-03) [umoeller]
555 */
556
557int encInitCase(void)
558{
559 unsigned long ul,
560 cEntries = 0,
561 cb;
562
563 for (ul = 0;
564 ul < ARRAYITEMCOUNT(G_aCaseFolds);
565 ++ul)
566 {
567 // ignore CASEFL_T (duplicate entries for i chars)
568 // and CASEFL_F (expansions)
569 if ( (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
570 && (G_aCaseFolds[ul].ulLow > cEntries)
571 )
572 cEntries = G_aCaseFolds[ul].ulLow;
573 }
574
575 cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
576 if (G_pFold = (PENCCASEFOLD)malloc(cb))
577 {
578 memset(G_pFold, 0, cb);
579 G_pFold->cEntries = cEntries;
580
581 for (ul = 0;
582 ul < ARRAYITEMCOUNT(G_aCaseFolds);
583 ++ul)
584 {
585 if (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
586 G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
587 }
588
589 return 1;
590 }
591
592 return 0;
593}
594
595/*
596 *@@ encToUpper:
597 * converts the given unicode character to
598 * upper case, if possible, or returns
599 * ulUni back if Unicode doesn't define
600 * an upper-case character for it.
601 *
602 * Special cases:
603 *
604 * -- Returns 0 for 0.
605 *
606 * Preconditions:
607 *
608 * -- You must call encInitCase before
609 * the first call.
610 *
611 * This works and is presently used in WarpIN.
612 *
613 *@@added V0.9.20 (2002-07-03) [umoeller]
614 */
615
616unsigned long encToUpper(unsigned long ulUni)
617{
618 unsigned long ulFold;
619
620 if ( (ulUni < G_pFold->cEntries)
621 && (ulFold = G_pFold->aulFolds[ulUni])
622 )
623 return ulFold;
624
625 return ulUni;
626}
627
/*
 *@@ encicmp:
 *      like stricmp, but for UTF-8 strings.
 *      This decodes both strings character by
 *      character with encDecodeUTF8 and uses
 *      encToUpper for the comparisons.
 *
 *      Like stricmp, this returns:
 *
 *      --  -1 if pcsz1 is less than pcsz2
 *      --  0 if pcsz1 is equal to pcsz2
 *      --  +1 if pcsz1 is greater than pcsz2
 *
 *      However, this does not crash on passing
 *      in NULL strings: a NULL pointer is treated
 *      like an empty string.
 *
 *      Preconditions:
 *
 *      --  You must call encInitCase before
 *          the first call.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

int encicmp(const char *pcsz1,
            const char *pcsz2)
{
    // encDecodeUTF8 dereferences what it is given,
    // so replace NULL strings with empty ones
    const char      *p1 = (pcsz1) ? pcsz1 : "",
                    *p2 = (pcsz2) ? pcsz2 : "";

    unsigned long   ul1, ul2;

    do
    {
        // encDecodeUTF8 returns null for null without
        // advancing the pointer, so this is safe even if
        // one string is shorter than the other
        ul1 = encToUpper(encDecodeUTF8(&p1));
        ul2 = encToUpper(encDecodeUTF8(&p2));

        // this covers the case of only one string having
        // ended as well, since null is below everything else
        if (ul1 < ul2)
            return -1;
        if (ul1 > ul2)
            return +1;

        // both are equal: continue unless both are null
    } while (ul1);

    return 0;
}
689
Note: See TracBrowser for help on using the repository browser.