source: trunk/src/helpers/encodings.c@ 191

Last change on this file since 191 was 191, checked in by umoeller, 23 years ago

Mutex modifications.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 20.7 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding support. Handles all kinds
5 * of legacy codepages (including most OS/2 codepages)
6 * and Unicode in the form of UTF-8 and translations
7 * between them.
8 *
9 * See encCreateCodec for an introduction.
10 *
11 * Be warned, compilation of this file takes a long
12 * time because this includes all the complex codepage
13 * tables from include\encodings.
14 *
15 *@@header "encodings\base.h"
16 *@@added V0.9.9 (2001-02-14) [umoeller]
17 */
18
19/*
20 * Copyright (C) 2001-2002 Ulrich Möller.
21 * This file is part of the "XWorkplace helpers" source package.
22 * This is free software; you can redistribute it and/or modify
23 * it under the terms of the GNU General Public License as published
24 * by the Free Software Foundation, in version 2 as it comes in the
25 * "COPYING" file of the XWorkplace main distribution.
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 */
31
32#define OS2EMX_PLAIN_CHAR
33 // this is needed for "os2emx.h"; if this is defined,
34 // emx will define PSZ as _signed_ char, otherwise
35 // as unsigned char
36
37#include <stdlib.h>
38#include <string.h>
39
40#include "setup.h" // code generation and debugging options
41
42#include "helpers\standards.h"
43
44#include "encodings\base.h"
45
46#include "encodings\unicase.h"
47
48#include "encodings\alltables.h" // this takes a very long time
49
50#pragma hdrstop
51
52/*
53 *@@category: Helpers\National Language Support\Encodings
54 * See encodings.c.
55 */
56
57/*
58 *@@ G_aEncodings:
59 * list of all encodings supported by this engine
60 * (i.e. we have a corresponding codepage in
61 * include\encodings\*.h) together with some
62 * additional information for each encoding,
63 * such as the corresponding OS/2 codepage
64 * number and a descriptive string.
65 *
66 *@@added V [umoeller]
67 */
68
69struct
70{
71 ENCID id; // engine ID (enum)
72 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
73 unsigned long cEntries; // entries in map (array item count)
74 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
75 ENCBYTECOUNT bc;
76 const char *pcszDescription; // description
77} G_aEncodings[] =
78 {
79 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
80
81 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
82 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
83 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
84 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
85 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
86 // Romania, Poland
87 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
88 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
89 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
90 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
91 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
92 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
93 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
94 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
95 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
96 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
97 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
98
99 ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
100 ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
101 ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
102 ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
103
104 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
105 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
106 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
107 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
108 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
109 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
110 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
111 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
112 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
113 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
114 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
115 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
116 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
117 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
118 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
119 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
120 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
121 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
122 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
123 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
124 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
125 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
126 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
127
128 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
129 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
130 };
131
/*
 *@@ ENCCASEFOLD:
 *      variable-sized lookup table used for upper-casing
 *      Unicode characters. aulFolds is indexed by the
 *      Unicode character and holds the replacement
 *      character, or 0 if there is none.
 *
 *      aulFolds is declared with one item only; the
 *      structure is allocated with extra room behind it
 *      by encInitCase, so the array size must stay [1]
 *      for the size calculation there to remain valid.
 *
 *      cEntries is the bound that encToUpper checks a
 *      character against before indexing aulFolds.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

typedef struct _ENCCASEFOLD
{
    unsigned long   cEntries;
    unsigned long   aulFolds[1];
} ENCCASEFOLD;

typedef ENCCASEFOLD *PENCCASEFOLD;

// the one global casefold table; NULL until encInitCase has built it
static PENCCASEFOLD G_pFold = NULL;
145
146/*
147 *@@ encGetTable:
148 *
149 *@@added V0.9.18 (2002-03-08) [umoeller]
150 */
151
152int encGetTable(ENCID id,
153 PXWPENCODINGMAP *ppMap,
154 unsigned long *pcEntries)
155{
156 unsigned long ul;
157 for (ul = 0;
158 ul < ARRAYITEMCOUNT(G_aEncodings);
159 ul++)
160 {
161 if (G_aEncodings[ul].id == id)
162 {
163 *ppMap = G_aEncodings[ul].pMap;
164 *pcEntries = G_aEncodings[ul].cEntries;
165 return (1);
166 }
167 }
168
169 return (0);
170}
171
172/*
173 *@@ encFindIdForCodepage:
174 * returns the ENCID for the given OS/2
175 * codepage, or UNSUPPORTED if there's none.
176 *
177 *@@added V0.9.18 (2002-03-08) [umoeller]
178 */
179
180ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
181 const char **ppcszDescription, // out: codepage description; ptr can be NULL
182 ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
183{
184 unsigned long ul;
185 for (ul = 0;
186 ul < ARRAYITEMCOUNT(G_aEncodings);
187 ul++)
188 {
189 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
190
191 {
192 if (ppcszDescription)
193 *ppcszDescription = G_aEncodings[ul].pcszDescription;
194 if (pByteCount)
195 *pByteCount = G_aEncodings[ul].bc;
196 return G_aEncodings[ul].id;
197 }
198 }
199
200 return (UNSUPPORTED);
201}
202
203/*
204 *@@ encCreateCodec:
205 * creates a codec that can be used for conversion between
206 * Unicode and codepaged characters (and vice versa).
207 *
208 * A codec essentially consists of two tables which can
209 * be used for quick index-based lookups in both directions.
210 * This function goes thru the tables provided in
211 * include\encodings\*.h and builds the codec tables
212 * from them.
213 *
214 * This function takes an encoding ID as input. Each
215 * codepage table in include\encodings\*.h has one
216 * of those IDs assigned. Use encFindIdForCodepage
217 * to find the ID for a given OS/2 codepage.
218 *
219 * Use codecs carefully and only when they are really
220 * needed for a specific conversion. Building a codec
221 * is expensive, so you should create a codec once
222 * and reuse it for future conversions. In addition,
223 * create codecs only for the codepages that are
224 * actually used. Each codec will take up
225 * n * sizeof(USHORT) bytes, where n is the highest
226 * Unicode character used in the codepage.
227 *
228 * Codec remarks:
229 *
230 * -- All codepages share the first 128 characters
231 * (0-0x7F) with ASCII.
232 *
233 * -- Since the first 128 characters (0-0x7F) in
234 * Unicode are equivalent to ASCII also, codecs
235 * are not needed if you process ASCII strings
236 * only.
237 *
238 * -- Since the next 128 characters (0x80-0xFF) in
239 * Unicode are equivalent to ISO/IEC 8859-1
240 * (Latin-1), codecs aren't needed for those
241 * strings either.
242 *
243 * Note that codepoints 0x80-0x9F are undefined
244 * in Latin-1 but used as control sequences in
245 * Unicode.
246 *
247 * -- As far as I know, codepage 1252, which is
248 * used per default under Windows, is equivalent
249 * to Latin 1 except that it also defines
250 * codepoints 0x80-0x9F to certain DTP characters.
251 *
252 * -- From my testing, codepage 1004 (which is
253 * described as "Windows-compatible" in most OS/2
254 * docs) is the same as codepage 1252, except for
255 * character 0xAF.
256 *
257 * Unfortunately, OS/2 uses codepage 850 on most
258 * systems (and Windows uses OS/2 codepage 1252),
259 * so for conversion between those, codecs are needed.
260 *
261 * This works and is presently used in WarpIN.
262 */
263
264PCONVERSION encCreateCodec(ENCID id)
265{
266 PXWPENCODINGMAP pEncodingMap;
267 unsigned long cArrayEntries;
268
269 if (encGetTable(id,
270 &pEncodingMap,
271 &cArrayEntries))
272 {
273 unsigned short usHighestCP = 0,
274 usHighestUni = 0;
275 unsigned long ul;
276
277 // step 1:
278 // run through the table and calculate the highest
279 // character entry used
280 for (ul = 0;
281 ul < cArrayEntries;
282 ul++)
283 {
284 if (pEncodingMap[ul].usCP > usHighestCP)
285 usHighestCP = pEncodingMap[ul].usCP;
286 if (pEncodingMap[ul].usUni > usHighestUni)
287 usHighestUni = pEncodingMap[ul].usUni;
288 }
289
290 // step 2: allocate encoding table
291 if (usHighestCP && usHighestUni)
292 {
293 PCONVERSION pTableNew;
294 if (pTableNew = NEW(CONVERSION))
295 {
296 unsigned long cbEntriesUniFromCP
297 = (usHighestCP + 1) * sizeof(unsigned short);
298 unsigned long cbEntriesCPFromUni
299 = (usHighestUni + 1) * sizeof(unsigned short);
300
301 ZERO(pTableNew);
302
303 pTableNew->usHighestCP = usHighestCP;
304 pTableNew->usHighestUni = usHighestUni;
305
306 if ( (pTableNew->ausEntriesUniFromCP
307 = (unsigned short*)malloc(cbEntriesUniFromCP))
308 && (pTableNew->ausEntriesCPFromUni
309 = (unsigned short*)malloc(cbEntriesCPFromUni))
310 )
311 {
312 // step 3: fill encoding tables
313
314 memset(pTableNew->ausEntriesUniFromCP,
315 0xFF,
316 cbEntriesUniFromCP);
317 memset(pTableNew->ausEntriesCPFromUni,
318 0xFF,
319 cbEntriesCPFromUni);
320
321 for (ul = 0;
322 ul < cArrayEntries;
323 ul++)
324 {
325 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
326
327 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
328
329 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
330 }
331
332 return (pTableNew);
333 }
334
335 free(pTableNew);
336 }
337 }
338 }
339
340 return NULL;
341}
342
343/*
344 *@@ encFreeCodec:
345 * frees a codec created with encFreeConversion
346 * and sets the given pointer to NULL.
347 *
348 * This works and is presently used in WarpIN.
349 *
350 *@@added V0.9.18 (2002-03-08) [umoeller]
351 */
352
353void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
354{
355 PCONVERSION pTable;
356 if (pTable = *ppTable)
357 {
358 if (pTable->ausEntriesUniFromCP)
359 free(pTable->ausEntriesUniFromCP);
360 if (pTable->ausEntriesCPFromUni)
361 free(pTable->ausEntriesCPFromUni);
362 free(pTable);
363 *ppTable = NULL;
364 }
365}
366
367/*
368 *@@ encChar2Uni:
369 * converts a codepage-specific character
370 * to Unicode, using the given conversion
371 * table from encCreateCodec().
372 *
373 * Returns 0xFFFF on errors, which is unlikely
374 * with Unicode though.
375 *
376 * This works and is presently used in WarpIN.
377 *
378 *@@added V0.9.18 (2002-03-08) [umoeller]
379 */
380
381unsigned long encChar2Uni(PCONVERSION pTable,
382 unsigned short c)
383{
384 if ( (pTable)
385 && (c <= pTable->usHighestCP)
386 )
387 return (pTable->ausEntriesUniFromCP[c]);
388
389 return (0xFFFF);
390}
391
392/*
393 *@@ encUni2Char:
394 * converts a Unicode character to the
395 * codepage specified by the given
396 * conversion table from encCreateCodec().
397 *
398 * Returns 0xFFFF if the Unicode character
399 * has no codepage equivalent.
400 *
401 * This works and is presently used in WarpIN.
402 *
403 *@@added V0.9.18 (2002-03-08) [umoeller]
404 */
405
406unsigned short encUni2Char(PCONVERSION pTable,
407 unsigned long ulUni)
408{
409 if ( (pTable)
410 && (ulUni <= pTable->usHighestUni)
411 )
412 return (pTable->ausEntriesCPFromUni[ulUni]);
413
414 return (0xFFFF);
415}
416
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding: a byte that cannot start
 *      a sequence (including the reserved lead
 *      bytes 0xc0 and 0xc1) or a sequence whose
 *      following bytes are not all of the form
 *      10xxxxxx. In that case the pointer is
 *      advanced to the next byte that has bit 7
 *      off, which may be the null terminator.
 *
 *      This returns 0 if **ppch points to a
 *      null character. Only in that case the
 *      pointer is not advanced.
 *
 *      Sequences of up to six bytes are accepted.
 *      This never reads beyond the null terminator
 *      of the string.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the bytes as unsigned: with a signed char type, bytes
    // above 0x7F would be sign-extended and fail the range checks
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       cBytes;     // sequence length, or 0 if illegal
    unsigned long       ul;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        ++(*ppch);
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte
    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        cBytes = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        cBytes = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        cBytes = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        cBytes = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        cBytes = 6;
        ulChar &= 0x01;
    }
    else
        // continuation byte or reserved value as first byte
        cBytes = 0;

    // go for the second and more bytes then
    for (ul = 1;
         ul < cBytes;
         ++ul)
    {
        unsigned long ulNext = pb[ul];

        // every following byte must be 10xxxxxx; this also
        // stops at the null terminator of a truncated sequence
        if ((ulNext & 0xc0) != 0x80)
        {
            cBytes = 0;
            break;
        }

        ulChar = (ulChar << 6) | (ulNext & 0x3f);
    }

    if (cBytes)
    {
        *ppch += cBytes;
        return (ulChar);
    }

    // illegal: skip all the following characters
    // until we find something with bit 7 off
    // (the null terminator has bit 7 off too)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
541
542/*
543 *@@ encInitCase:
544 * creates a casefold for later use with
545 * encToUpper.
546 *
547 * This only uses one-byte sequences from
548 * the Unicode case folding table (see
549 * include\encodings\unicase.h), so this
550 * cannot be used for expanding characters
551 * at this point.
552 *
553 * Returns 1 (TRUE) on success.
554 *
555 * This works and is presently used in WarpIN.
556 *
557 *@@added V0.9.20 (2002-07-03) [umoeller]
558 */
559
560int encInitCase(void)
561{
562 unsigned long ul,
563 cEntries = 0,
564 cb;
565
566 for (ul = 0;
567 ul < ARRAYITEMCOUNT(G_aCaseFolds);
568 ++ul)
569 {
570 // ignore CASEFL_T (duplicate entries for i chars)
571 // and CASEFL_F (expansions)
572 if ( (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
573 && (G_aCaseFolds[ul].ulLow > cEntries)
574 )
575 cEntries = G_aCaseFolds[ul].ulLow;
576 }
577
578 cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
579 if (G_pFold = (PENCCASEFOLD)malloc(cb))
580 {
581 memset(G_pFold, 0, cb);
582 G_pFold->cEntries = cEntries;
583
584 for (ul = 0;
585 ul < ARRAYITEMCOUNT(G_aCaseFolds);
586 ++ul)
587 {
588 if (G_aCaseFolds[ul].fl & (CASEFL_C | CASEFL_S))
589 G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
590 }
591
592 return 1;
593 }
594
595 return 0;
596}
597
598/*
599 *@@ encToUpper:
600 * converts the given unicode character to
601 * upper case, if possible, or returns
602 * ulUni back if Unicode doesn't define
603 * an upper-case character for it.
604 *
605 * Special cases:
606 *
607 * -- Returns 0 for 0.
608 *
609 * Preconditions:
610 *
611 * -- You must call encInitCase before
612 * the first call.
613 *
614 * This works and is presently used in WarpIN.
615 *
616 *@@added V0.9.20 (2002-07-03) [umoeller]
617 */
618
619unsigned long encToUpper(unsigned long ulUni)
620{
621 unsigned long ulFold;
622
623 if ( (ulUni < G_pFold->cEntries)
624 && (ulFold = G_pFold->aulFolds[ulUni])
625 )
626 return ulFold;
627
628 return ulUni;
629}
630
/*
 *@@ encicmp:
 *      like stricmp, but for UTF-8 strings.
 *      This decodes both strings character by
 *      character with encDecodeUTF8 and uses
 *      encToUpper for the comparisons.
 *
 *      Like stricmp, this returns:
 *
 *      --  -1 if pcsz1 is less than pcsz2
 *      --  0 if pcsz1 is equal to pcsz2
 *      --  +1 if pcsz1 is greater than pcsz2
 *
 *      However, this does not crash on passing
 *      in NULL strings: a NULL string is treated
 *      like an empty string.
 *
 *      Preconditions:
 *
 *      --  You must call encInitCase before
 *          the first call.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

int encicmp(const char *pcsz1,
            const char *pcsz2)
{
    // encDecodeUTF8 dereferences the string, so
    // replace NULL pointers with empty strings
    const char      *p1 = (pcsz1) ? pcsz1 : "",
                    *p2 = (pcsz2) ? pcsz2 : "";

    unsigned long   ul1, ul2;

    do
    {
        // encDecodeUTF8 returns null for null, so this is safe
        ul1 = encToUpper(encDecodeUTF8(&p1));
        ul2 = encToUpper(encDecodeUTF8(&p2));

        // this also covers one string ending before the
        // other, since the terminator decodes to 0
        if (ul1 != ul2)
            return (ul1 < ul2) ? -1 : +1;

        // both are equal: continue unless both strings ended
    } while (ul1);

    return 0;
}
692
Note: See TracBrowser for help on using the repository browser.