source: trunk/src/helpers/encodings.c@ 169

Last change on this file since 169 was 169, checked in by umoeller, 23 years ago

Removed animouseptr, some more fixes.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding translations.
5 *
6 * See encCreateCodec for an introduction.
7 *
8 * Be warned, compilation of this file takes a long
9 * time because this includes all the complex codepage
10 * tables from include\encodings.
11 *
12 *@@header "encodings\base.h"
13 *@@added V0.9.9 (2001-02-14) [umoeller]
14 */
15
16/*
17 * Copyright (C) 2001 Ulrich Möller.
18 * This file is part of the "XWorkplace helpers" source package.
19 * This is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published
21 * by the Free Software Foundation, in version 2 as it comes in the
22 * "COPYING" file of the XWorkplace main distribution.
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 */
28
29#define OS2EMX_PLAIN_CHAR
30 // this is needed for "os2emx.h"; if this is defined,
31 // emx will define PSZ as _signed_ char, otherwise
32 // as unsigned char
33
34#include <stdlib.h>
35#include <string.h>
36
37#include "setup.h" // code generation and debugging options
38
39#include "helpers\standards.h"
40
41#include "encodings\base.h"
42#include "encodings\alltables.h"
43// #include "encodings\collate.h"
44
45#pragma hdrstop
46
47/*
48 *@@category: Helpers\National Language Support\Encodings
49 * See encodings.c.
50 */
51
52/*
53 *@@ G_aEncodings:
54 * list of all encodings supported by this engine
55 * (i.e. we have a corresponding codepage in
56 * include\encodings\*.h) together with some
57 * additional information for each encoding,
58 * such as the corresponding OS/2 codepage
59 * number and a descriptive string.
60 *
61 *@@added V [umoeller]
62 */
63
64struct
65{
66 ENCID id; // engine ID (enum)
67 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
68 unsigned long cEntries; // entries in map (array item count)
69 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
70 ENCBYTECOUNT bc;
71 const char *pcszDescription; // description
72} G_aEncodings[] =
73 {
74 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
75
76 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
77 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
78 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
79 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
80 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
81 // Romania, Poland
82 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
83 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
84 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
85 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
86 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
87 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
88 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
89 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
90 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
91 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
92 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
93
94 ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
95 ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
96 ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
97 ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
98
99 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
100 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
101 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
102 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
103 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
104 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
105 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
106 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
107 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
108 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
109 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
110 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
111 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
112 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
113 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
114 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
115 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
116 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
117 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
118 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
119 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
120 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
121 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
122
123 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
124 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
125 };
126
127/*
128 *@@ encGetTable:
129 *
130 *@@added V0.9.18 (2002-03-08) [umoeller]
131 */
132
133int encGetTable(ENCID id,
134 PXWPENCODINGMAP *ppMap,
135 unsigned long *pcEntries)
136{
137 unsigned long ul;
138 for (ul = 0;
139 ul < ARRAYITEMCOUNT(G_aEncodings);
140 ul++)
141 {
142 if (G_aEncodings[ul].id == id)
143 {
144 *ppMap = G_aEncodings[ul].pMap;
145 *pcEntries = G_aEncodings[ul].cEntries;
146 return (1);
147 }
148 }
149
150 return (0);
151}
152
153/*
154 *@@ encFindIdForCodepage:
155 * returns the ENCID for the given OS/2
156 * codepage, or UNSUPPORTED if there's none.
157 *
158 *@@added V0.9.18 (2002-03-08) [umoeller]
159 */
160
161ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
162 const char **ppcszDescription, // out: codepage description; ptr can be NULL
163 ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
164{
165 unsigned long ul;
166 for (ul = 0;
167 ul < ARRAYITEMCOUNT(G_aEncodings);
168 ul++)
169 {
170 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
171
172 {
173 if (ppcszDescription)
174 *ppcszDescription = G_aEncodings[ul].pcszDescription;
175 if (pByteCount)
176 *pByteCount = G_aEncodings[ul].bc;
177 return G_aEncodings[ul].id;
178 }
179 }
180
181 return (UNSUPPORTED);
182}
183
184/*
185 *@@ encCreateCodec:
186 * creates a codec that can be used for conversion between
187 * Unicode and codepaged characters (and vice versa).
188 *
189 * A codec essentially consists of two tables which can
190 * be used for quick index-based lookups in both directions.
191 * This function goes thru the tables provided in
192 * include\encodings\*.h and builds the codec tables
193 * from them.
194 *
195 * This function takes an encoding ID as input. Each
196 * codepage table in include\encodings\*.h has one
197 * of those IDs assigned. Use encFindIdForCodepage
198 * to find the ID for a given OS/2 codepage.
199 *
200 * Use codecs carefully and only when they are really
201 * needed for a specific conversion. Building a codec
202 * is expensive, so you should create a codec once
203 * and reuse it for future conversions. In addition,
204 * create codecs only for the codepages that are
205 * actually used. Each codec will take up
206 * n * sizeof(USHORT) bytes, where n is the highest
207 * Unicode character used in the codepage.
208 *
209 * Codec remarks:
210 *
211 * -- All codepages share the first 128 characters
212 * (0-0x7F) with ASCII.
213 *
214 * -- Since the first 128 characters (0-0x7F) in
215 * Unicode are equivalent to ASCII also, codecs
216 * are not needed if you process ASCII strings
217 * only.
218 *
219 * -- Since the next 128 characters (0x80-0xFF) in
220 * Unicode are equivalent to ISO/IEC 8859-1
221 * (Latin-1), codecs aren't needed for those
222 * strings either.
223 *
224 * Note that codepoints 0x80-0x9F are undefined
225 * in Latin-1 but used as control sequences in
226 * Unicode.
227 *
228 * -- As far as I know, codepage 1252, which is
229 * used per default under Windows, is equivalent
230 * to Latin 1 except that it also defines
231 * codepoints 0x80-0x9F to certain DTP characters.
232 *
233 * -- From my testing, codepage 1004 (which is
234 * described as "Windows-compatible" in most OS/2
235 * docs) is the same as codepage 1252, except for
236 * character 0xAF.
237 *
238 * Unfortunately, OS/2 uses codepage 850 on most
239 * systems (and Windows uses OS/2 codepage 1252),
240 * so for conversion between those, codecs are needed.
241 */
242
243PCONVERSION encCreateCodec(ENCID id)
244{
245 PXWPENCODINGMAP pEncodingMap;
246 unsigned long cArrayEntries;
247
248 if (encGetTable(id,
249 &pEncodingMap,
250 &cArrayEntries))
251 {
252 unsigned short usHighestCP = 0,
253 usHighestUni = 0;
254 unsigned long ul;
255
256 // step 1:
257 // run through the table and calculate the highest
258 // character entry used
259 for (ul = 0;
260 ul < cArrayEntries;
261 ul++)
262 {
263 if (pEncodingMap[ul].usCP > usHighestCP)
264 usHighestCP = pEncodingMap[ul].usCP;
265 if (pEncodingMap[ul].usUni > usHighestUni)
266 usHighestUni = pEncodingMap[ul].usUni;
267 }
268
269 // step 2: allocate encoding table
270 if (usHighestCP && usHighestUni)
271 {
272 PCONVERSION pTableNew;
273 if (pTableNew = NEW(CONVERSION))
274 {
275 unsigned long cbEntriesUniFromCP
276 = (usHighestCP + 1) * sizeof(unsigned short);
277 unsigned long cbEntriesCPFromUni
278 = (usHighestUni + 1) * sizeof(unsigned short);
279
280 ZERO(pTableNew);
281
282 pTableNew->usHighestCP = usHighestCP;
283 pTableNew->usHighestUni = usHighestUni;
284
285 if ( (pTableNew->ausEntriesUniFromCP
286 = (unsigned short*)malloc(cbEntriesUniFromCP))
287 && (pTableNew->ausEntriesCPFromUni
288 = (unsigned short*)malloc(cbEntriesCPFromUni))
289 )
290 {
291 // step 3: fill encoding tables
292
293 memset(pTableNew->ausEntriesUniFromCP,
294 0xFF,
295 cbEntriesUniFromCP);
296 memset(pTableNew->ausEntriesCPFromUni,
297 0xFF,
298 cbEntriesCPFromUni);
299
300 for (ul = 0;
301 ul < cArrayEntries;
302 ul++)
303 {
304 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
305
306 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
307
308 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
309 }
310
311 return (pTableNew);
312 }
313
314 free(pTableNew);
315 }
316 }
317 }
318
319 return NULL;
320}
321
322/*
323 *@@ encFreeCodec:
324 * frees a codec created with encFreeConversion
325 * and sets the given pointer to NULL.
326 *
327 *@@added V0.9.18 (2002-03-08) [umoeller]
328 */
329
330void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
331{
332 PCONVERSION pTable;
333 if (pTable = *ppTable)
334 {
335 if (pTable->ausEntriesUniFromCP)
336 free(pTable->ausEntriesUniFromCP);
337 if (pTable->ausEntriesCPFromUni)
338 free(pTable->ausEntriesCPFromUni);
339 free(pTable);
340 *ppTable = NULL;
341 }
342}
343
344/*
345 *@@ encChar2Uni:
346 * converts a codepage-specific character
347 * to Unicode, using the given conversion
348 * table from encCreateCodec().
349 *
350 * Returns 0xFFFF on errors, which is unlikely
351 * with Unicode though.
352 *
353 *@@added V0.9.18 (2002-03-08) [umoeller]
354 */
355
356unsigned long encChar2Uni(PCONVERSION pTable,
357 unsigned short c)
358{
359 if ( (pTable)
360 && (c <= pTable->usHighestCP)
361 )
362 return (pTable->ausEntriesUniFromCP[c]);
363
364 return (0xFFFF);
365}
366
367/*
368 *@@ encUni2Char:
369 * converts a Unicode character to the
370 * codepage specified by the given
371 * conversion table from encCreateCodec().
372 *
373 * Returns 0xFFFF if the Unicode character
374 * has no codepage equivalent.
375 *
376 *@@added V0.9.18 (2002-03-08) [umoeller]
377 */
378
379unsigned short encUni2Char(PCONVERSION pTable,
380 unsigned long ulUni)
381{
382 if ( (pTable)
383 && (ulUni <= pTable->usHighestUni)
384 )
385 return (pTable->ausEntriesCPFromUni[ulUni]);
386
387 return (0xFFFF);
388}
389
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read in a null-terminated string.
 *
 *      For a valid character, this function
 *      advances *ppch past all bytes of that
 *      character (one byte for ASCII, more for
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding: a byte that cannot
 *      start a character (0x80-0xc1, 0xfe, 0xff)
 *      or a lead byte that is not followed by
 *      enough continuation bytes (0x80-0xbf).
 *      In that case *ppch is advanced by one
 *      byte and then past all following bytes
 *      which have bit 7 set, so it ends up on
 *      the next ASCII character or on the
 *      terminating null byte.
 *
 *      This returns 0 if **ppch points to a
 *      null character; *ppch is not advanced
 *      then.
 *
 *      Besides the usual two to four byte forms,
 *      the historical five and six byte forms are
 *      decoded as well. Apart from the lead bytes
 *      0xc0 and 0xc1, overlong encodings and
 *      surrogate values are not rejected.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the string as unsigned bytes: plain char may be
    // signed, and a sign-extended lead byte would fail the
    // range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount,
                        ul2;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // determine the sequence length from the lead byte and
    // strip the length bits from it;
    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte
    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte, 0xc0, 0xc1, 0xfe or 0xff
        ulCount = 0;

    if (ulCount)
    {
        // go for the second and more bytes then
        for (ul2 = 1;
             ul2 < ulCount;
             ++ul2)
        {
            unsigned long ulChar2 = pb[ul2];

            // a continuation byte must look like 10xxxxxx;
            // this also stops at the terminating null byte,
            // so we never read beyond the end of the string
            if ((ulChar2 & 0xc0) != 0x80)
                break;

            ulChar <<= 6;
            ulChar |= ulChar2 & 0x3f;
        }

        if (ul2 == ulCount)
        {
            // all continuation bytes were good
            *ppch += ulCount;
            return (ulChar);
        }
    }

    // invalid encoding: skip the offending byte and all the
    // following bytes until we find something with bit 7 off
    // (which includes the terminating null byte)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
512
513
Note: See TracBrowser for help on using the repository browser.