source: trunk/src/helpers/encodings.c@ 151

Last change on this file since 151 was 147, checked in by umoeller, 23 years ago

Misc updates for Unicode.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 16.6 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding translations.
5 *
6 * See encCreateCodec for an introduction.
7 *
8 *@@header "encodings\base.h"
9 *@@added V0.9.9 (2001-02-14) [umoeller]
10 */
11
12/*
13 * Copyright (C) 2001 Ulrich Möller.
14 * This file is part of the "XWorkplace helpers" source package.
15 * This is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published
17 * by the Free Software Foundation, in version 2 as it comes in the
18 * "COPYING" file of the XWorkplace main distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 */
24
25#define OS2EMX_PLAIN_CHAR
26 // this is needed for "os2emx.h"; if this is defined,
27 // emx will define PSZ as _signed_ char, otherwise
28 // as unsigned char
29
30#include <stdlib.h>
31#include <string.h>
32
33#include "setup.h" // code generation and debugging options
34
35#include "helpers\standards.h"
36
37#include "encodings\base.h"
38#include "encodings\alltables.h"
39// #include "encodings\collate.h"
40
41#pragma hdrstop
42
43#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
44
45/*
46 *@@ G_aEncodings:
47 * list of all encodings supported by this engine
48 * (i.e. we have a corresponding codepage in
49 * include\encodings\*.h) together with some
50 * additional information for each encoding,
51 * such as the corresponding OS/2 codepage
52 * number and a descriptive string.
53 *
54 *@@added V [umoeller]
55 */
56
57struct
58{
59 ENCID id; // engine ID (enum)
60 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
61 unsigned long cEntries; // entries in map (array item count)
62 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
63 ENCBYTECOUNT bc;
64 const char *pcszDescription; // description
65} G_aEncodings[] =
66 {
67 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
68 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
69 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
70 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
71 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
72 // Romania, Poland
73 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
74 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
75 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
76 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
77 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
78 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
79 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
80 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
81 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
82 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
83 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
84 // ENCODINGENTRY(cp932), 932 or 943?, DOUBLE, "Japanese Windows",
85 // ENCODINGENTRY(cp936), 936 or 946?, DOUBLE, "Chinese",
86 // ENCODINGENTRY(cp949), 951 or 949?, DOUBLE, "Korean",
87 // ENCODINGENTRY(cp950), 947 or 950?, DOUBLE, "Taiwan Big-5", // default in China?
88 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
89 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
90 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
91 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
92 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
93 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
94 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
95 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
96 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
97 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
98 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
99 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
100 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
101 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
102 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
103 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
104 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
105 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
106 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
107 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
108 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
109 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
110 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
111
112 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
113 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
114 };
115
116/*
117 *@@ FindEntry:
118 *
119 *@@added V0.9.18 (2002-03-08) [umoeller]
120 */
121
122static int FindEntry(ENCID id,
123 PXWPENCODINGMAP *ppMap,
124 unsigned long *pcEntries)
125{
126 unsigned long ul;
127 for (ul = 0;
128 ul < ARRAYITEMCOUNT(G_aEncodings);
129 ul++)
130 {
131 if (G_aEncodings[ul].id == id)
132 {
133 *ppMap = G_aEncodings[ul].pMap;
134 *pcEntries = G_aEncodings[ul].cEntries;
135 return (1);
136 }
137 }
138
139 return (0);
140}
141
142/*
143 *@@ encFindIdForCodepage:
144 * returns the ENCID for the given OS/2
145 * codepage, or UNSUPPORTED if there's none.
146 *
147 *@@added V0.9.18 (2002-03-08) [umoeller]
148 */
149
150ENCID encFindIdForCodepage(unsigned short usCodepage,
151 const char **ppcszDescription, // out: codepage description; ptr can be NULL
152 ENCBYTECOUNT *pByteCount)
153{
154 unsigned long ul;
155 for (ul = 0;
156 ul < ARRAYITEMCOUNT(G_aEncodings);
157 ul++)
158 {
159 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
160
161 {
162 if (ppcszDescription)
163 *ppcszDescription = G_aEncodings[ul].pcszDescription;
164 if (pByteCount)
165 *pByteCount = G_aEncodings[ul].bc;
166 return G_aEncodings[ul].id;
167 }
168 }
169
170 return (UNSUPPORTED);
171}
172
173/*
174 *@@ encCreateCodec:
175 * creates a codec that can be used for conversion between
176 * Unicode and codepaged characters (and vice versa).
177 *
178 * A codec essentially consists of two tables which can
179 * be used for quick index-based lookups in both directions.
180 * This function goes thru the tables provided in
181 * include\encodings\*.h and builds the codec tables
182 * from them.
183 *
184 * This function takes an encoding ID as input. Each
185 * codepage table in include\encodings\*.h has one
186 * of those IDs assigned. Use encFindIdForCodepage
187 * to find the ID for a given OS/2 codepage.
188 *
189 * Use codecs carefully and only when they are really
190 * needed for a specific conversion. Building a codec
191 * is expensive, so you should create a codec once
192 * and reuse it for future conversions. In addition,
193 * create codecs only for the codepages that are
194 * actually used. Each codec will take up
195 * n * sizeof(USHORT) bytes, where n is the highest
196 * Unicode character used in the codepage.
197 *
198 * Remarks:
199 *
200 * -- All codepages share the first 128 characters
201 * (0-0x7F) with ASCII.
202 *
203 * -- Since the first 128 characters (0-0x7F) in
204 * Unicode are equivalent to ASCII also, codecs
205 * are not needed if you process ASCII strings
206 * only.
207 *
208 * -- Since the next 128 characters (0x80-0xFF) in
209 * Unicode are equivalent to ISO/IEC 8859-1
210 * (Latin-1), codecs aren't needed for those
211 * strings either.
212 *
213 * Note that codepoints 0x80-0x9F are undefined
214 * in Latin-1 but used as control sequences in
215 * Unicode.
216 *
217 * -- As far as I know, codepage 1252, which is
218 * used per default under Windows, is equivalent
219 * to Latin 1 except that it also defines
220 * codepoints 0x80-0x9F to certain DTP characters.
221 *
222 * -- From my testing, codepage 1004 (which is
223 * described as "Windows-compatible" in most OS/2
224 * docs) is the same as codepage 1252, except for
225 * character 0xAF.
226 *
227 * Unfortunately, OS/2 uses codepage 850 on most
228 * systems (and Windows uses OS/2 codepage 1252),
229 * so for conversion between those, codecs are needed.
230 */
231
232PCONVERSION encCreateCodec(ENCID id)
233{
234 PXWPENCODINGMAP pEncodingMap;
235 unsigned long cArrayEntries;
236
237 if (FindEntry(id,
238 &pEncodingMap,
239 &cArrayEntries))
240 {
241 unsigned short usHighestCP = 0,
242 usHighestUni = 0;
243 unsigned long ul;
244
245 // step 1:
246 // run through the table and calculate the highest
247 // character entry used
248 for (ul = 0;
249 ul < cArrayEntries;
250 ul++)
251 {
252 if (pEncodingMap[ul].usCP > usHighestCP)
253 usHighestCP = pEncodingMap[ul].usCP;
254 if (pEncodingMap[ul].usUni > usHighestUni)
255 usHighestUni = pEncodingMap[ul].usUni;
256 }
257
258 // step 2: allocate encoding table
259 if (usHighestCP && usHighestUni)
260 {
261 PCONVERSION pTableNew;
262 if (pTableNew = NEW(CONVERSION))
263 {
264 unsigned long cbEntriesUniFromCP
265 = (usHighestCP + 1) * sizeof(unsigned short);
266 unsigned long cbEntriesCPFromUni
267 = (usHighestUni + 1) * sizeof(unsigned short);
268
269 ZERO(pTableNew);
270
271 pTableNew->usHighestCP = usHighestCP;
272 pTableNew->usHighestUni = usHighestUni;
273
274 if ( (pTableNew->ausEntriesUniFromCP
275 = (unsigned short*)malloc(cbEntriesUniFromCP))
276 && (pTableNew->ausEntriesCPFromUni
277 = (unsigned short*)malloc(cbEntriesCPFromUni))
278 )
279 {
280 // step 3: fill encoding tables
281
282 memset(pTableNew->ausEntriesUniFromCP,
283 0xFF,
284 cbEntriesUniFromCP);
285 memset(pTableNew->ausEntriesCPFromUni,
286 0xFF,
287 cbEntriesCPFromUni);
288
289 for (ul = 0;
290 ul < cArrayEntries;
291 ul++)
292 {
293 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
294
295 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
296
297 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
298 }
299
300 return (pTableNew);
301 }
302
303 free(pTableNew);
304 }
305 }
306 }
307
308 return (NULL);
309}
310
311/*
312 *@@ encFreeCodec:
313 * frees a codec created with encFreeConversion
314 * and sets the given pointer to NULL.
315 *
316 *@@added V0.9.18 (2002-03-08) [umoeller]
317 */
318
319void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
320{
321 PCONVERSION pTable;
322 if (pTable = *ppTable)
323 {
324 if (pTable->ausEntriesUniFromCP)
325 free(pTable->ausEntriesUniFromCP);
326 if (pTable->ausEntriesCPFromUni)
327 free(pTable->ausEntriesCPFromUni);
328 free(pTable);
329 *ppTable = NULL;
330 }
331}
332
333/*
334 *@@ encChar2Uni:
335 * converts a codepage-specific character
336 * to Unicode, using the given conversion
337 * table from encCreateCodec().
338 *
339 * Returns 0xFFFF on errors, which is unlikely
340 * with Unicode though.
341 *
342 *@@added V0.9.18 (2002-03-08) [umoeller]
343 */
344
345unsigned long encChar2Uni(PCONVERSION pTable,
346 unsigned short c)
347{
348 if ( (pTable)
349 && (c <= pTable->usHighestCP)
350 )
351 return (pTable->ausEntriesUniFromCP[c]);
352
353 return (0xFFFF);
354}
355
356/*
357 *@@ encUni2Char:
358 * converts a Unicode character to the
359 * codepage specified by the given
360 * conversion table from encCreateCodec().
361 *
362 * Returns 0xFFFF if the Unicode character
363 * has no codepage equivalent.
364 *
365 *@@added V0.9.18 (2002-03-08) [umoeller]
366 */
367
368unsigned short encUni2Char(PCONVERSION pTable,
369 unsigned long ulUni)
370{
371 if ( (pTable)
372 && (ulUni <= pTable->usHighestUni)
373 )
374 return (pTable->ausEntriesCPFromUni[ulUni]);
375
376 return (0xFFFF);
377}
378
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      For a valid character, *ppch is advanced
 *      by the length of its encoding (one to six
 *      bytes).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, that is, if the first
 *      byte cannot start a UTF-8 char or if one
 *      of the following bytes is not of the
 *      form 10xxxxxx. In that case *ppch is
 *      advanced by at least one byte and then
 *      past all following bytes which have bit 7
 *      set, so it ends up on the next ASCII
 *      character or on the terminating null byte.
 *
 *      This returns 0 if **ppch points to a
 *      null character; *ppch is not advanced then.
 *
 *      Overlong three- to six-byte encodings are
 *      not detected.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // look at the bytes as unsigned: plain char may be signed,
    // and sign-extended lead bytes would fail the range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount,        // length of encoding; 0 if illegal
                        ul2;

    if (!ulChar)
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // continuation byte (0x80-0xbf), 0xc0, 0xc1, 0xfe or 0xff
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // must be 10xxxxxx; this also stops at the null
        // terminator, so we never read past the end of the string
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return (ulChar);
    }

    // illegal: skip all the following characters
    // until we find something with bit 7 off
    // (which includes the null terminator)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
497
#if 0

// NOTE: everything up to the matching #endif is excluded from
// compilation. It presumably is a sketch for the encoding
// direction (producing UTF-8) -- TODO confirm before enabling.

/*
 *@@ encCodepageToUTF8:
 *      currently an empty stub.
 *
 *@@added V0.9.18 (2002-03-08) [umoeller]
 */

void encCodepageToUTF8(const char **ppch)
{

}

// Writes the value c as a UTF-8 byte sequence, one putchar()
// call per byte:
//  --  c < 0x80:       one byte
//  --  c < 0x800:      two bytes
//  --  c < 0x10000:    three bytes
//  --  c < 0x200000:   four bytes
// Larger values produce no output at all.
// Neither the return type nor the type of c is declared
// (old-style definition), and nothing is returned.
putwchar(c)
{
    if (c < 0x80) {
        putchar (c);
    }
    else if (c < 0x800) {
        putchar (0xC0 | c>>6);          // lead byte: 110xxxxx
        putchar (0x80 | c & 0x3F);      // & binds tighter than |
    }
    else if (c < 0x10000) {
        putchar (0xE0 | c>>12);         // lead byte: 1110xxxx
        putchar (0x80 | c>>6 & 0x3F);
        putchar (0x80 | c & 0x3F);
    }
    else if (c < 0x200000) {
        putchar (0xF0 | c>>18);         // lead byte: 11110xxx
        putchar (0x80 | c>>12 & 0x3F);
        putchar (0x80 | c>>6 & 0x3F);
        putchar (0x80 | c & 0x3F);
    }
}

#endif
534
Note: See TracBrowser for help on using the repository browser.