source: trunk/src/helpers/encodings.c@ 209

Last change on this file since 209 was 209, checked in by umoeller, 23 years ago

Dialog formatter rewrite.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 20.5 KB
Line 
1
/*
 *@@sourcefile encodings.c:
 *      character encoding support. Handles all kinds
 *      of legacy codepages (including most OS/2 codepages)
 *      and Unicode in the form of UTF-8 and translations
 *      between them.
 *
 *      See encCreateCodec for an introduction.
 *
 *      See http://www.ietf.org/rfc/rfc2279.txt for
 *      RFC 2279, which defines UTF-8.
 *
 *      Be warned, compilation of this file takes a long
 *      time because this includes all the complex codepages
 *      from include\encodings.
 *
 *@@header "encodings\base.h"
 *@@added V0.9.9 (2001-02-14) [umoeller]
 */
21
/*
 *      Copyright (C) 2001-2002 Ulrich Möller.
 *      This file is part of the "XWorkplace helpers" source package.
 *      This is free software; you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published
 *      by the Free Software Foundation, in version 2 as it comes in the
 *      "COPYING" file of the XWorkplace main distribution.
 *      This program is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *      GNU General Public License for more details.
 */
34
35#define OS2EMX_PLAIN_CHAR
36 // this is needed for "os2emx.h"; if this is defined,
37 // emx will define PSZ as _signed_ char, otherwise
38 // as unsigned char
39
40#include <stdlib.h>
41#include <string.h>
42
43#include "setup.h" // code generation and debugging options
44
45#include "helpers\standards.h"
46
47#include "encodings\base.h"
48
49#include "encodings\unicase.h"
50
51#include "encodings\alltables.h" // this takes a very long time
52
53#pragma hdrstop
54
55/*
56 *@@category: Helpers\National Language Support\Encodings
57 * See encodings.c.
58 */
59
60/*
61 *@@ G_aEncodings:
62 * list of all encodings supported by this engine
63 * (i.e. we have a corresponding codepage in
64 * include\encodings\*.h) together with some
65 * additional information for each encoding,
66 * such as the corresponding OS/2 codepage
67 * number and a descriptive string.
68 *
69 *@@added V [umoeller]
70 */
71
72struct
73{
74 ENCID id; // engine ID (enum)
75 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
76 unsigned long cEntries; // entries in map (array item count)
77 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
78 ENCBYTECOUNT bc;
79 const char *pcszDescription; // description
80} G_aEncodings[] =
81 {
82 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
83
84 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
85 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
86 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
87 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
88 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
89 // Romania, Poland
90 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
91 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
92 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
93 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
94 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
95 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
96 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
97 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
98 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
99 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
100 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
101
102 ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
103 ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
104 ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
105 ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
106
107 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
108 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
109 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
110 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
111 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
112 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
113 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
114 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
115 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
116 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
117 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
118 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
119 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
120 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
121 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
122 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
123 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
124 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
125 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
126 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
127 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
128 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
129 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
130
131 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
132 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
133 };
134
/*
 *@@ ENCCASEFOLD:
 *      case-folding lookup table. encInitCase allocates
 *      and fills one instance (see G_pFold); encToUpper
 *      reads it.
 *
 *      aulFolds is indexed by Unicode character. An entry
 *      of 0 means "no mapping stored"; encToUpper then
 *      hands the character back unchanged.
 *
 *      aulFolds is declared with a single item only; the
 *      real table is obtained by over-allocating the
 *      structure. encInitCase computes the allocation
 *      size from sizeof(ENCCASEFOLD), so the [1] must
 *      not be changed without adjusting that computation.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

typedef struct _ENCCASEFOLD
{
    unsigned long   cEntries;       // bound that encToUpper checks a character
                                    // against before it indexes aulFolds
    unsigned long   aulFolds[1];    // first item of the over-allocated table
} ENCCASEFOLD, *PENCCASEFOLD;

// the one and only fold table; NULL until encInitCase has succeeded
static PENCCASEFOLD G_pFold = NULL;
148
149/*
150 *@@ encGetTable:
151 *
152 *@@added V0.9.18 (2002-03-08) [umoeller]
153 */
154
155int encGetTable(ENCID id,
156 PXWPENCODINGMAP *ppMap,
157 unsigned long *pcEntries)
158{
159 unsigned long ul;
160 for (ul = 0;
161 ul < ARRAYITEMCOUNT(G_aEncodings);
162 ul++)
163 {
164 if (G_aEncodings[ul].id == id)
165 {
166 *ppMap = G_aEncodings[ul].pMap;
167 *pcEntries = G_aEncodings[ul].cEntries;
168 return 1;
169 }
170 }
171
172 return 0;
173}
174
175/*
176 *@@ encFindIdForCodepage:
177 * returns the ENCID for the given OS/2
178 * codepage, or UNSUPPORTED if there's none.
179 *
180 *@@added V0.9.18 (2002-03-08) [umoeller]
181 */
182
183ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
184 const char **ppcszDescription, // out: codepage description; ptr can be NULL
185 ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
186{
187 unsigned long ul;
188 for (ul = 0;
189 ul < ARRAYITEMCOUNT(G_aEncodings);
190 ul++)
191 {
192 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
193
194 {
195 if (ppcszDescription)
196 *ppcszDescription = G_aEncodings[ul].pcszDescription;
197 if (pByteCount)
198 *pByteCount = G_aEncodings[ul].bc;
199 return G_aEncodings[ul].id;
200 }
201 }
202
203 return UNSUPPORTED;
204}
205
206/*
207 *@@ encCreateCodec:
208 * creates a codec that can be used for conversion between
209 * Unicode and codepaged characters (and vice versa).
210 *
211 * A codec essentially consists of two tables which can
212 * be used for quick index-based lookups in both directions.
213 * This function goes thru the tables provided in
214 * include\encodings\*.h and builds the codec tables
215 * from them.
216 *
217 * This function takes an encoding ID as input. Each
218 * codepage table in include\encodings\*.h has one
219 * of those IDs assigned. Use encFindIdForCodepage
220 * to find the ID for a given OS/2 codepage.
221 *
222 * Use codecs carefully and only when they are really
223 * needed for a specific conversion. Building a codec
224 * is expensive, so you should create a codec once
225 * and reuse it for future conversions. In addition,
226 * create codecs only for the codepages that are
227 * actually used. Each codec will take up
228 * n * sizeof(USHORT) bytes, where n is the highest
229 * Unicode character used in the codepage.
230 *
231 * Codec remarks:
232 *
233 * -- All codepages share the first 128 characters
234 * (0-0x7F) with ASCII.
235 *
236 * -- Since the first 128 characters (0-0x7F) in
237 * Unicode are equivalent to ASCII also, codecs
238 * are not needed if you process ASCII strings
239 * only.
240 *
241 * -- Since the next 128 characters (0x80-0xFF) in
242 * Unicode are equivalent to ISO/IEC 8859-1
243 * (Latin-1), codecs aren't needed for those
244 * strings either.
245 *
246 * Note that codepoints 0x80-0x9F are undefined
247 * in Latin-1 but used as control sequences in
248 * Unicode.
249 *
250 * -- As far as I know, codepage 1252, which is
251 * used per default under Windows, is equivalent
252 * to Latin 1 except that it also defines
253 * codepoints 0x80-0x9F to certain DTP characters.
254 *
255 * -- From my testing, codepage 1004 (which is
256 * described as "Windows-compatible" in most OS/2
257 * docs) is the same as codepage 1252, except for
258 * character 0xAF.
259 *
260 * Unfortunately, OS/2 uses codepage 850 on most
261 * systems (and Windows uses OS/2 codepage 1252),
262 * so for conversion between those, codecs are needed.
263 *
264 * This works and is presently used in WarpIN.
265 */
266
267PCONVERSION encCreateCodec(ENCID id)
268{
269 PXWPENCODINGMAP pEncodingMap;
270 unsigned long cArrayEntries;
271
272 if (encGetTable(id,
273 &pEncodingMap,
274 &cArrayEntries))
275 {
276 unsigned short usHighestCP = 0,
277 usHighestUni = 0;
278 unsigned long ul;
279
280 // step 1:
281 // run through the table and calculate the highest
282 // character entry used
283 for (ul = 0;
284 ul < cArrayEntries;
285 ul++)
286 {
287 if (pEncodingMap[ul].usCP > usHighestCP)
288 usHighestCP = pEncodingMap[ul].usCP;
289 if (pEncodingMap[ul].usUni > usHighestUni)
290 usHighestUni = pEncodingMap[ul].usUni;
291 }
292
293 // step 2: allocate encoding table
294 if (usHighestCP && usHighestUni)
295 {
296 PCONVERSION pTableNew;
297 if (pTableNew = NEW(CONVERSION))
298 {
299 unsigned long cbEntriesUniFromCP
300 = (usHighestCP + 1) * sizeof(unsigned short);
301 unsigned long cbEntriesCPFromUni
302 = (usHighestUni + 1) * sizeof(unsigned short);
303
304 ZERO(pTableNew);
305
306 pTableNew->usHighestCP = usHighestCP;
307 pTableNew->usHighestUni = usHighestUni;
308
309 if ( (pTableNew->ausEntriesUniFromCP
310 = (unsigned short*)malloc(cbEntriesUniFromCP))
311 && (pTableNew->ausEntriesCPFromUni
312 = (unsigned short*)malloc(cbEntriesCPFromUni))
313 )
314 {
315 // step 3: fill encoding tables
316
317 memset(pTableNew->ausEntriesUniFromCP,
318 0xFF,
319 cbEntriesUniFromCP);
320 memset(pTableNew->ausEntriesCPFromUni,
321 0xFF,
322 cbEntriesCPFromUni);
323
324 for (ul = 0;
325 ul < cArrayEntries;
326 ul++)
327 {
328 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
329
330 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
331
332 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
333 }
334
335 return pTableNew;
336 }
337
338 free(pTableNew);
339 }
340 }
341 }
342
343 return NULL;
344}
345
346/*
347 *@@ encFreeCodec:
348 * frees a codec created with encFreeConversion
349 * and sets the given pointer to NULL.
350 *
351 * This works and is presently used in WarpIN.
352 *
353 *@@added V0.9.18 (2002-03-08) [umoeller]
354 */
355
356void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
357{
358 PCONVERSION pTable;
359 if (pTable = *ppTable)
360 {
361 if (pTable->ausEntriesUniFromCP)
362 free(pTable->ausEntriesUniFromCP);
363 if (pTable->ausEntriesCPFromUni)
364 free(pTable->ausEntriesCPFromUni);
365 free(pTable);
366 *ppTable = NULL;
367 }
368}
369
370/*
371 *@@ encChar2Uni:
372 * converts a codepage-specific character
373 * to Unicode, using the given conversion
374 * table from encCreateCodec().
375 *
376 * Returns 0xFFFF on errors, which is unlikely
377 * with Unicode though.
378 *
379 * This works and is presently used in WarpIN.
380 *
381 *@@added V0.9.18 (2002-03-08) [umoeller]
382 */
383
384unsigned long encChar2Uni(PCONVERSION pTable,
385 unsigned short c)
386{
387 if ( (pTable)
388 && (c <= pTable->usHighestCP)
389 )
390 return pTable->ausEntriesUniFromCP[c];
391
392 return 0xFFFF;
393}
394
395/*
396 *@@ encUni2Char:
397 * converts a Unicode character to the
398 * codepage specified by the given
399 * conversion table from encCreateCodec().
400 *
401 * Returns 0xFFFF if the Unicode character
402 * has no codepage equivalent.
403 *
404 * This works and is presently used in WarpIN.
405 *
406 *@@added V0.9.18 (2002-03-08) [umoeller]
407 */
408
409unsigned short encUni2Char(PCONVERSION pTable,
410 unsigned long ulUni)
411{
412 if ( (pTable)
413 && (ulUni <= pTable->usHighestUni)
414 )
415 return pTable->ausEntriesCPFromUni[ulUni];
416
417 return 0xFFFF;
418}
419
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding: a byte that cannot start
 *      a sequence (0x80-0xC1, 0xFE, 0xFF) or a
 *      lead byte that is not followed by the
 *      required number of continuation bytes
 *      (10xxxxxx). In that case the pointer is
 *      advanced past the offending byte and all
 *      directly following bytes that have bit 7
 *      set, so the next call starts at the next
 *      ASCII character or at the terminating null.
 *
 *      This returns 0 if **ppch points to a
 *      null character; the pointer is not
 *      advanced then.
 *
 *      Sequences of up to six bytes are accepted
 *      as laid out in RFC 2279. Overlong forms of
 *      three and more bytes are not rejected.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the bytes as unsigned: with a signed char type,
    // bytes >= 0x80 would be sign-extended and fail the
    // range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long   ulChar = *pb,
                    ulCount = 1,
                    ul2;
    int             fIllegal = 0;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return ulChar;
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte, 0xc0, 0xc1, 0xfe or 0xff
        fIllegal = 1;

    if (!fIllegal)
    {
        // go for the second and more bytes then
        for (ul2 = 1;
             ul2 < ulCount;
             ++ul2)
        {
            unsigned long ulChar2 = pb[ul2];

            // every continuation byte must look like 10xxxxxx;
            // this also stops at the terminating null, so we
            // never read beyond the end of the string
            if ((ulChar2 & 0xc0) != 0x80)
            {
                fIllegal = 1;
                break;
            }

            ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
        }
    }

    if (!fIllegal)
    {
        *ppch += ulCount;
        return ulChar;
    }

    // invalid: skip the offending byte and all the following
    // bytes until we find something with bit 7 off (which
    // includes the terminating null)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return (unsigned long)-1;
}
544
545/*
546 *@@ encInitCase:
547 * creates a casefold for later use with
548 * encToUpper.
549 *
550 * This only uses one-byte sequences from
551 * the Unicode case folding table (see
552 * include\encodings\unicase.h), so this
553 * cannot be used for expanding characters
554 * at this point.
555 *
556 * Returns 1 (TRUE) on success.
557 *
558 * This works and is presently used in WarpIN.
559 *
560 *@@added V0.9.20 (2002-07-03) [umoeller]
561 */
562
563int encInitCase(void)
564{
565 unsigned long ul,
566 cEntries = 0,
567 cb;
568
569 for (ul = 0;
570 ul < ARRAYITEMCOUNT(G_aCaseFolds);
571 ++ul)
572 {
573 // ignore CASEFL_T (duplicate entries for i chars)
574 // and CASEFL_F (expansions)
575 if ( (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
576 && (G_aCaseFolds[ul].ulLow > cEntries)
577 )
578 cEntries = G_aCaseFolds[ul].ulLow;
579 }
580
581 cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
582 if (G_pFold = (PENCCASEFOLD)malloc(cb))
583 {
584 memset(G_pFold, 0, cb);
585 G_pFold->cEntries = cEntries;
586
587 for (ul = 0;
588 ul < ARRAYITEMCOUNT(G_aCaseFolds);
589 ++ul)
590 {
591 if (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
592 G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
593 }
594
595 return 1;
596 }
597
598 return 0;
599}
600
601/*
602 *@@ encToUpper:
603 * converts the given unicode character to
604 * upper case, if possible, or returns
605 * ulUni back if Unicode doesn't define
606 * an upper-case character for it.
607 *
608 * Special cases:
609 *
610 * -- Returns 0 for 0.
611 *
612 * Preconditions:
613 *
614 * -- You must call encInitCase before
615 * the first call.
616 *
617 * This works and is presently used in WarpIN.
618 *
619 *@@added V0.9.20 (2002-07-03) [umoeller]
620 */
621
622unsigned long encToUpper(unsigned long ulUni)
623{
624 unsigned long ulFold;
625
626 if ( (ulUni < G_pFold->cEntries)
627 && (ulFold = G_pFold->aulFolds[ulUni])
628 )
629 return ulFold;
630
631 return ulUni;
632}
633
634/*
635 *@@ encicmp:
636 * like stricmp, but for UTF-8 strings.
637 * This uses encToUpper for the comparisons.
638 *
639 * Like stricmp, this returns:
640 *
641 * -- -1 if pcsz1 is less than pcsz2
642 * -- 0 if pcsz1 is equal to pcsz2
643 * -- +1 if pcsz1 is greater than pcsz2
644 *
645 * However, this does not crash on passing
646 * in NULL strings.
647 *
648 * Preconditions:
649 *
650 * -- You must call encInitCase before
651 * the first call.
652 *
653 * This works and is presently used in WarpIN.
654 *
655 *@@added V0.9.20 (2002-07-03) [umoeller]
656 */
657
658int encicmp(const char *pcsz1,
659 const char *pcsz2)
660{
661 const char *p1 = pcsz1,
662 *p2 = pcsz2;
663
664 unsigned long ul1, ul2;
665
666 do
667 {
668 // encDecodeUTF8 returns null for null, so this is safe
669 ul1 = encToUpper(encDecodeUTF8(&p1));
670 ul2 = encToUpper(encDecodeUTF8(&p2));
671
672 if (ul1 < ul2)
673 return -1;
674 if (ul1 > ul2)
675 return +1;
676
677 // both are equal: check for null bytes then
678 if (!ul1)
679 if (!ul2)
680 return 0;
681 else
682 // ul1 is null, but ul2 isn't:
683 return -1;
684 else
685 if (!ul2)
686 // ul1 is not null, but ul2 is:
687 return +1;
688
689 // both are non-null: continue
690
691 } while (1);
692
693 return 0;
694}
695
Note: See TracBrowser for help on using the repository browser.