source: branches/branch-1-0/src/helpers/encodings.c@ 231

Last change on this file since 231 was 229, checked in by umoeller, 23 years ago

Sources as of 1.0.0.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 21.3 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding support. Handles all kinds
5 * of legacy codepages (including most OS/2 codepages)
6 * and Unicode in the form of UTF-8 and translations
7 * between them.
8 *
9 * See encCreateCodec for an introduction.
10 *
11 * See http://www.ietf.org/rfc/rfc2279.txt for
12 * RFC 2279, which defines UTF-8.
13 *
14 * Be warned, compilation of this file takes a long
15 * time because this includes all the complex codepages
16 * from include\encodings.
17 *
18 *@@header "encodings\base.h"
19 *@@added V0.9.9 (2001-02-14) [umoeller]
20 */
21
22/*
23 * Copyright (C) 2001-2002 Ulrich Möller.
24 * This file is part of the "XWorkplace helpers" source package.
25 * This is free software; you can redistribute it and/or modify
26 * it under the terms of the GNU General Public License as published
27 * by the Free Software Foundation, in version 2 as it comes in the
28 * "COPYING" file of the XWorkplace main distribution.
29 * This program is distributed in the hope that it will be useful,
30 * but WITHOUT ANY WARRANTY; without even the implied warranty of
31 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32 * GNU General Public License for more details.
33 */
34
35#define OS2EMX_PLAIN_CHAR
36 // this is needed for "os2emx.h"; if this is defined,
37 // emx will define PSZ as _signed_ char, otherwise
38 // as unsigned char
39
40#include <stdlib.h>
41#include <string.h>
42
43#include "setup.h" // code generation and debugging options
44
45#include "helpers\standards.h"
46
47#include "encodings\base.h"
48
49#include "encodings\unicase.h"
50
51#include "encodings\alltables.h" // this takes a very long time
52
53#pragma hdrstop
54
55/*
56 *@@category: Helpers\National Language Support\Encodings
57 * See encodings.c.
58 */
59
60/*
61 *@@ G_aEncodings:
62 * list of all encodings supported by this engine
63 * (i.e. we have a corresponding codepage in
64 * include\encodings\*.h) together with some
65 * additional information for each encoding,
66 * such as the corresponding OS/2 codepage
67 * number and a descriptive string.
68 *
69 * For a way too extensive list of codepage
70 * names, see "http://www.iana.org/assignments/character-sets".
71 *
72 *@@added V [umoeller]
73 */
74
75struct
76{
77 ENCID id; // engine ID (enum)
78 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
79 unsigned long cEntries; // entries in map (array item count)
80 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
81 // V1.0.0 (2002-08-21) [umoeller]
82 unsigned short usLatin; // ISO 8859-X correspondance or 0
83 ENCBYTECOUNT bc;
84 const char *pcszDescription; // description
85} G_aEncodings[] =
86 {
87 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
88
89 ENCODINGENTRY(cp437), 437, 0, SINGLE, "DOS Latin US",
90 ENCODINGENTRY(cp737), 737, 0, SINGLE, "DOS Greek",
91 ENCODINGENTRY(cp775), 775, 0, SINGLE, "DOS BaltRim",
92 ENCODINGENTRY(cp850), 850, 0, SINGLE, "DOS Latin 1",
93 ENCODINGENTRY(cp852), 852, 0, SINGLE, "DOS Latin 2", // default in Hungary,
94 // Romania, Poland
95 ENCODINGENTRY(cp855), 855, 0, SINGLE, "DOS Cyrillic",
96 ENCODINGENTRY(cp857), 857, 0, SINGLE, "DOS Latin 5 (Turkish)",
97 ENCODINGENTRY(cp860), 860, 0, SINGLE, "DOS Portuguese",
98 ENCODINGENTRY(cp861), 861, 0, SINGLE, "DOS Icelandic",
99 ENCODINGENTRY(cp862), 862, 0, SINGLE, "DOS Hebrew",
100 ENCODINGENTRY(cp863), 863, 0, SINGLE, "DOS Canadian French",
101 ENCODINGENTRY(cp864), 864, 0, SINGLE, "DOS Arabic", // default in Egypt
102 ENCODINGENTRY(cp865), 865, 0, SINGLE, "DOS Nordic",
103 ENCODINGENTRY(cp866), 866, 0, SINGLE, "DOS Cyrillic Russian", // default in Russia
104 ENCODINGENTRY(cp869), 869, 0, SINGLE, "DOS Greek2",
105 ENCODINGENTRY(cp874), 874, 0, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
106
107 ENCODINGENTRY(cp932), 932 /* or 943?*/ ,
108 0, DOUBLE, "Japanese Windows",
109 ENCODINGENTRY(cp936), 936 /* or 946?*/ ,
110 0, DOUBLE, "Chinese",
111 ENCODINGENTRY(cp949), 951 /* or 949?*/ ,
112 0, DOUBLE, "Korean",
113 ENCODINGENTRY(cp950), 947 /* or 950?*/ ,
114 0, DOUBLE, "Taiwan Big-5", // default in China?
115
116 ENCODINGENTRY(cp1004), 1004, 0, SINGLE, "Windows Extended",
117 ENCODINGENTRY(cp1250), 1250, 0, SINGLE, "Windows Latin 2",
118 ENCODINGENTRY(cp1251), 1251, 0, SINGLE, "Windows Cyrillic",
119 ENCODINGENTRY(cp1252), 1252, 0, SINGLE, "Windows Latin 1",
120 ENCODINGENTRY(cp1253), 1253, 0, SINGLE, "Windows Greek",
121 ENCODINGENTRY(cp1254), 1254, 0, SINGLE, "Windows Turkish",
122 ENCODINGENTRY(cp1255), 1255, 0, SINGLE, "Windows Hebrew",
123 ENCODINGENTRY(cp1256), 1256, 0, SINGLE, "Windows Arabic",
124 ENCODINGENTRY(cp1257), 1257, 0, SINGLE, "Windows Latin-4",
125 ENCODINGENTRY(cp1258), 1258, 0, UNKNOWN, "unknown",
126 ENCODINGENTRY(iso8859_1), 819, 1, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
127 ENCODINGENTRY(iso8859_2), 912, 2, SINGLE, "ISO 8859-2:1999 (Latin-2)",
128 ENCODINGENTRY(iso8859_3), 913, 3, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
129 ENCODINGENTRY(iso8859_4), 914, 4, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
130 ENCODINGENTRY(iso8859_5), 915, 5, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
131 ENCODINGENTRY(iso8859_6), 1089, 6, SINGLE, "ISO 8859-6:1999 (Arabic)",
132 ENCODINGENTRY(iso8859_7), 813, 7, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
133 ENCODINGENTRY(iso8859_8), 916, 8, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
134 ENCODINGENTRY(iso8859_9), 920, 9, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
135 ENCODINGENTRY(iso8859_10), 0, 10, SINGLE, "ISO/IEC 8859-10:1998",
136 ENCODINGENTRY(iso8859_13), 0, 13, SINGLE, "ISO/IEC 8859-13:1998",
137 ENCODINGENTRY(iso8859_14), 0, 14, SINGLE, "ISO/IEC 8859-14:1998",
138 ENCODINGENTRY(iso8859_15), 923, 15, SINGLE, "ISO/IEC 8859-15:1999",
139
140 UNSUPPORTED, NULL, 0, 1200, 0, MULTI_UNICODE, "Unicode UCS-2",
141 UNSUPPORTED, NULL, 0, 1208, 0, MULTI_UNICODE, "Unicode UTF-8"
142 };
143
144/*
145 *@@ ENCCASEFOLD:
146 *
147 *@@added V0.9.20 (2002-07-03) [umoeller]
148 */
149
150typedef struct _ENCCASEFOLD
151{
152 unsigned long cEntries;
153 unsigned long aulFolds[1];
154} ENCCASEFOLD, *PENCCASEFOLD;
155
156STATIC PENCCASEFOLD G_pFold = NULL;
157
158/*
159 *@@ encGetTable:
160 *
161 *@@added V0.9.18 (2002-03-08) [umoeller]
162 */
163
164int encGetTable(ENCID id,
165 PXWPENCODINGMAP *ppMap,
166 unsigned long *pcEntries)
167{
168 unsigned long ul;
169 for (ul = 0;
170 ul < ARRAYITEMCOUNT(G_aEncodings);
171 ul++)
172 {
173 if (G_aEncodings[ul].id == id)
174 {
175 *ppMap = G_aEncodings[ul].pMap;
176 *pcEntries = G_aEncodings[ul].cEntries;
177 return 1;
178 }
179 }
180
181 return 0;
182}
183
184/*
185 *@@ encFindIdForCodepage:
186 * returns the ENCID for the given OS/2
187 * codepage, or UNSUPPORTED if there's none.
188 *
189 *@@added V0.9.18 (2002-03-08) [umoeller]
190 */
191
192ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
193 const char **ppcszDescription, // out: codepage description; ptr can be NULL
194 ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
195{
196 unsigned long ul;
197 for (ul = 0;
198 ul < ARRAYITEMCOUNT(G_aEncodings);
199 ul++)
200 {
201 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
202
203 {
204 if (ppcszDescription)
205 *ppcszDescription = G_aEncodings[ul].pcszDescription;
206 if (pByteCount)
207 *pByteCount = G_aEncodings[ul].bc;
208 return G_aEncodings[ul].id;
209 }
210 }
211
212 return UNSUPPORTED;
213}
214
215/*
216 *@@ encCreateCodec:
217 * creates a codec that can be used for conversion between
218 * Unicode and codepaged characters (and vice versa).
219 *
220 * A codec essentially consists of two tables which can
221 * be used for quick index-based lookups in both directions.
222 * This function goes thru the tables provided in
223 * include\encodings\*.h and builds the codec tables
224 * from them.
225 *
226 * This function takes an encoding ID as input. Each
227 * codepage table in include\encodings\*.h has one
228 * of those IDs assigned. Use encFindIdForCodepage
229 * to find the ID for a given OS/2 codepage.
230 *
231 * Use codecs carefully and only when they are really
232 * needed for a specific conversion. Building a codec
233 * is expensive, so you should create a codec once
234 * and reuse it for future conversions. In addition,
235 * create codecs only for the codepages that are
236 * actually used. Each codec will take up to
237 * n * sizeof(USHORT) bytes, where n is the highest
238 * Unicode character used in the codepage.
239 *
240 * Codec remarks:
241 *
242 * -- All codepages share the first 128 characters
243 * (0-0x7F) with ASCII.
244 *
245 * -- Since the first 128 characters (0-0x7F) in
246 * Unicode are equivalent to ASCII also, codecs
247 * are not needed if you process ASCII strings
248 * only.
249 *
250 * -- Since the next 128 characters (0x80-0xFF) in
251 * Unicode are equivalent to ISO/IEC 8859-1
252 * (Latin-1), codecs aren't needed for those
253 * strings either.
254 *
255 * Note that codepoints 0x80-0x9F are undefined
256 * in Latin-1 but used as control sequences in
257 * Unicode.
258 *
259 * -- As far as I know, codepage 1252, which is
260 * used per default under Windows, is equivalent
261 * to Latin 1 except that it also defines
262 * codepoints 0x80-0x9F to certain DTP characters.
263 *
264 * -- From my testing, codepage 1004 (which is
265 * described as "Windows-compatible" in most OS/2
266 * docs) is the same as codepage 1252, except for
267 * character 0xAF.
268 *
269 * Unfortunately, OS/2 uses codepage 850 on most
270 * systems (and Windows uses OS/2 codepage 1252),
271 * so for conversion between those, codecs are needed.
272 *
273 * This works and is presently used in WarpIN.
274 */
275
276PCONVERSION encCreateCodec(ENCID id)
277{
278 PXWPENCODINGMAP pEncodingMap;
279 unsigned long cArrayEntries;
280
281 if (encGetTable(id,
282 &pEncodingMap,
283 &cArrayEntries))
284 {
285 unsigned short usHighestCP = 0,
286 usHighestUni = 0;
287 unsigned long ul;
288
289 // step 1:
290 // run through the table and calculate the highest
291 // character entry used
292 for (ul = 0;
293 ul < cArrayEntries;
294 ul++)
295 {
296 if (pEncodingMap[ul].usCP > usHighestCP)
297 usHighestCP = pEncodingMap[ul].usCP;
298 if (pEncodingMap[ul].usUni > usHighestUni)
299 usHighestUni = pEncodingMap[ul].usUni;
300 }
301
302 // step 2: allocate encoding table
303 if (usHighestCP && usHighestUni)
304 {
305 PCONVERSION pTableNew;
306 if (pTableNew = NEW(CONVERSION))
307 {
308 unsigned long cbEntriesUniFromCP
309 = (usHighestCP + 1) * sizeof(unsigned short);
310 unsigned long cbEntriesCPFromUni
311 = (usHighestUni + 1) * sizeof(unsigned short);
312
313 ZERO(pTableNew);
314
315 pTableNew->usHighestCP = usHighestCP;
316 pTableNew->usHighestUni = usHighestUni;
317
318 if ( (pTableNew->ausEntriesUniFromCP
319 = (unsigned short*)malloc(cbEntriesUniFromCP))
320 && (pTableNew->ausEntriesCPFromUni
321 = (unsigned short*)malloc(cbEntriesCPFromUni))
322 )
323 {
324 // step 3: fill encoding tables
325
326 memset(pTableNew->ausEntriesUniFromCP,
327 0xFF,
328 cbEntriesUniFromCP);
329 memset(pTableNew->ausEntriesCPFromUni,
330 0xFF,
331 cbEntriesCPFromUni);
332
333 for (ul = 0;
334 ul < cArrayEntries;
335 ul++)
336 {
337 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
338
339 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
340
341 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
342 }
343
344 return pTableNew;
345 }
346
347 free(pTableNew);
348 }
349 }
350 }
351
352 return NULL;
353}
354
355/*
356 *@@ encFreeCodec:
357 * frees a codec created with encCreateCodec
358 * and sets the given pointer to NULL.
359 *
360 * This works and is presently used in WarpIN.
361 *
362 *@@added V0.9.18 (2002-03-08) [umoeller]
363 */
364
365void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
366{
367 PCONVERSION pTable;
368 if (pTable = *ppTable)
369 {
370 if (pTable->ausEntriesUniFromCP)
371 free(pTable->ausEntriesUniFromCP);
372 if (pTable->ausEntriesCPFromUni)
373 free(pTable->ausEntriesCPFromUni);
374 free(pTable);
375 *ppTable = NULL;
376 }
377}
378
379/*
380 *@@ encChar2Uni:
381 * converts a codepage-specific character
382 * to Unicode, using the given conversion
383 * table from encCreateCodec().
384 *
385 * Returns 0xFFFF on errors, which is unlikely
386 * with Unicode though.
387 *
388 * This works and is presently used in WarpIN.
389 *
390 *@@added V0.9.18 (2002-03-08) [umoeller]
391 */
392
393unsigned long encChar2Uni(PCONVERSION pTable,
394 unsigned short c)
395{
396 if ( (pTable)
397 && (c <= pTable->usHighestCP)
398 )
399 return pTable->ausEntriesUniFromCP[c];
400
401 return 0xFFFF;
402}
403
404/*
405 *@@ encUni2Char:
406 * converts a Unicode character to the
407 * codepage specified by the given
408 * conversion table from encCreateCodec().
409 *
410 * Returns 0xFFFF if the Unicode character
411 * has no codepage equivalent.
412 *
413 * This works and is presently used in WarpIN.
414 *
415 *@@added V0.9.18 (2002-03-08) [umoeller]
416 */
417
418unsigned short encUni2Char(PCONVERSION pTable,
419 unsigned long ulUni)
420{
421 if ( (pTable)
422 && (ulUni <= pTable->usHighestUni)
423 )
424 return pTable->ausEntriesCPFromUni[ulUni];
425
426 return 0xFFFF;
427}
428
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read. The string must be null-terminated.
 *
 *      For a valid character, this advances *ppch
 *      by the length of the encoding (one to six
 *      bytes; the five- and six-byte forms of
 *      RFC 2279 are accepted).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, i.e. a byte that cannot
 *      start a character (0x80-0xC1, 0xFE, 0xFF)
 *      or a lead byte that is not followed by the
 *      required number of continuation bytes
 *      (10xxxxxx). In that case *ppch is advanced
 *      by at least one byte and then past all
 *      following bytes which have bit 7 set, so
 *      that it ends up on the next ASCII character
 *      or on the terminating null byte.
 *
 *      This returns 0 if *ppch points to a
 *      null character; the pointer is not
 *      advanced then.
 *
 *      Overlong encodings of three and more bytes
 *      and surrogate values are not rejected.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the bytes as unsigned: plain char may be signed,
    // in which case bytes >= 0x80 would be sign-extended and
    // fail the range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount;        // total bytes in sequence, 0 if illegal
    unsigned long       ul2;

    if (!ulChar)
        // null is null
        return 0;

    // if (ulChar < 0x80): simple, one byte only... use that
    if (ulChar < 0x80)
    {
        (*ppch)++;
        return ulChar;
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte
    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte, 0xc0, 0xc1, 0xfe or 0xff
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // continuation bytes must be 10xxxxxx; this also stops
        // at the terminating null byte so we never read past it
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return ulChar;
    }

    // illegal sequence: skip all the following characters
    // until we find something with bit 7 off (which includes
    // the terminating null byte)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return (unsigned long)-1;
}
553
554/*
555 *@@ encInitCase:
556 * creates a casefold for later use with
557 * encToUpper.
558 *
559 * This only uses one-byte sequences from
560 * the Unicode case folding table (see
561 * include\encodings\unicase.h), so this
562 * cannot be used for expanding characters
563 * at this point.
564 *
565 * Returns 1 (TRUE) on success.
566 *
567 * This works and is presently used in WarpIN.
568 *
569 *@@added V0.9.20 (2002-07-03) [umoeller]
570 */
571
572int encInitCase(void)
573{
574 unsigned long ul,
575 cEntries = 0,
576 cb;
577
578 for (ul = 0;
579 ul < ARRAYITEMCOUNT(G_aCaseFolds);
580 ++ul)
581 {
582 // ignore CASEFL_T (duplicate entries for i chars)
583 // and CASEFL_F (expansions)
584 if ( (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
585 && (G_aCaseFolds[ul].ulLow > cEntries)
586 )
587 cEntries = G_aCaseFolds[ul].ulLow;
588 }
589
590 cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
591 if (G_pFold = (PENCCASEFOLD)malloc(cb))
592 {
593 memset(G_pFold, 0, cb);
594 G_pFold->cEntries = cEntries;
595
596 for (ul = 0;
597 ul < ARRAYITEMCOUNT(G_aCaseFolds);
598 ++ul)
599 {
600 if (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
601 G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
602 }
603
604 return 1;
605 }
606
607 return 0;
608}
609
610/*
611 *@@ encToUpper:
612 * converts the given unicode character to
613 * upper case, if possible, or returns
614 * ulUni back if Unicode doesn't define
615 * an upper-case character for it.
616 *
617 * Special cases:
618 *
619 * -- Returns 0 for 0.
620 *
621 * Preconditions:
622 *
623 * -- You must call encInitCase before
624 * the first call.
625 *
626 * This works and is presently used in WarpIN.
627 *
628 *@@added V0.9.20 (2002-07-03) [umoeller]
629 */
630
631unsigned long encToUpper(unsigned long ulUni)
632{
633 unsigned long ulFold;
634
635 if ( (ulUni < G_pFold->cEntries)
636 && (ulFold = G_pFold->aulFolds[ulUni])
637 )
638 return ulFold;
639
640 return ulUni;
641}
642
/*
 *@@ encicmp:
 *      like stricmp, but for UTF-8 strings.
 *      This decodes both strings character by
 *      character with encDecodeUTF8 and uses
 *      encToUpper for the comparisons.
 *
 *      Like stricmp, this returns:
 *
 *      --  -1 if pcsz1 is less than pcsz2
 *      --  0 if pcsz1 is equal to pcsz2
 *      --  +1 if pcsz1 is greater than pcsz2
 *
 *      However, this does not crash on passing
 *      in NULL strings: a NULL pointer is treated
 *      like an empty string.
 *
 *      Preconditions:
 *
 *      --  You must call encInitCase before
 *          the first call.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

int encicmp(const char *pcsz1,
            const char *pcsz2)
{
    // encDecodeUTF8 dereferences its input, so map NULL to ""
    const char      *p1 = (pcsz1) ? pcsz1 : "",
                    *p2 = (pcsz2) ? pcsz2 : "";
    unsigned long   ul1,
                    ul2;

    for (;;)
    {
        // encDecodeUTF8 returns null for null without advancing,
        // so decoding the shorter string again at its end is safe
        ul1 = encToUpper(encDecodeUTF8(&p1));
        ul2 = encToUpper(encDecodeUTF8(&p2));

        // a string that ends first yields 0 here and thus
        // compares as "less" than any remaining character
        if (ul1 < ul2)
            return -1;
        if (ul1 > ul2)
            return +1;

        // both are equal: if that was the null terminator,
        // both strings have ended at the same time
        if (!ul1)
            return 0;

        // both are non-null: continue
    }
}
704
Note: See TracBrowser for help on using the repository browser.