source: trunk/src/helpers/encodings.c@ 153

Last change on this file since 153 was 153, checked in by umoeller, 23 years ago

Lots of changes from the last three weeks.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding translations.
5 *
6 * See encCreateCodec for an introduction.
7 *
8 *@@header "encodings\base.h"
9 *@@added V0.9.9 (2001-02-14) [umoeller]
10 */
11
12/*
13 * Copyright (C) 2001 Ulrich Möller.
14 * This file is part of the "XWorkplace helpers" source package.
15 * This is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published
17 * by the Free Software Foundation, in version 2 as it comes in the
18 * "COPYING" file of the XWorkplace main distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 */
24
25#define OS2EMX_PLAIN_CHAR
26 // this is needed for "os2emx.h"; if this is defined,
27 // emx will define PSZ as _signed_ char, otherwise
28 // as unsigned char
29
30#include <stdlib.h>
31#include <string.h>
32
33#include "setup.h" // code generation and debugging options
34
35#include "helpers\standards.h"
36
37#include "encodings\base.h"
38#include "encodings\alltables.h"
39// #include "encodings\collate.h"
40
41#pragma hdrstop
42
43/*
44 *@@ G_aEncodings:
45 * list of all encodings supported by this engine
46 * (i.e. we have a corresponding codepage in
47 * include\encodings\*.h) together with some
48 * additional information for each encoding,
49 * such as the corresponding OS/2 codepage
50 * number and a descriptive string.
51 *
52 *@@added V [umoeller]
53 */
54
55struct
56{
57 ENCID id; // engine ID (enum)
58 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
59 unsigned long cEntries; // entries in map (array item count)
60 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
61 ENCBYTECOUNT bc;
62 const char *pcszDescription; // description
63} G_aEncodings[] =
64 {
65 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
66
67 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
68 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
69 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
70 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
71 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
72 // Romania, Poland
73 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
74 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
75 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
76 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
77 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
78 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
79 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
80 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
81 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
82 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
83 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
84
85 ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
86 ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
87 ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
88 ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
89
90 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
91 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
92 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
93 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
94 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
95 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
96 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
97 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
98 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
99 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
100 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
101 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
102 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
103 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
104 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
105 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
106 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
107 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
108 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
109 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
110 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
111 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
112 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
113
114 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
115 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
116 };
117
118/*
119 *@@ FindEntry:
120 *
121 *@@added V0.9.18 (2002-03-08) [umoeller]
122 */
123
124static int FindEntry(ENCID id,
125 PXWPENCODINGMAP *ppMap,
126 unsigned long *pcEntries)
127{
128 unsigned long ul;
129 for (ul = 0;
130 ul < ARRAYITEMCOUNT(G_aEncodings);
131 ul++)
132 {
133 if (G_aEncodings[ul].id == id)
134 {
135 *ppMap = G_aEncodings[ul].pMap;
136 *pcEntries = G_aEncodings[ul].cEntries;
137 return (1);
138 }
139 }
140
141 return (0);
142}
143
144/*
145 *@@ encFindIdForCodepage:
146 * returns the ENCID for the given OS/2
147 * codepage, or UNSUPPORTED if there's none.
148 *
149 *@@added V0.9.18 (2002-03-08) [umoeller]
150 */
151
152ENCID encFindIdForCodepage(unsigned short usCodepage,
153 const char **ppcszDescription, // out: codepage description; ptr can be NULL
154 ENCBYTECOUNT *pByteCount)
155{
156 unsigned long ul;
157 for (ul = 0;
158 ul < ARRAYITEMCOUNT(G_aEncodings);
159 ul++)
160 {
161 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
162
163 {
164 if (ppcszDescription)
165 *ppcszDescription = G_aEncodings[ul].pcszDescription;
166 if (pByteCount)
167 *pByteCount = G_aEncodings[ul].bc;
168 return G_aEncodings[ul].id;
169 }
170 }
171
172 return (UNSUPPORTED);
173}
174
175/*
176 *@@ encCreateCodec:
177 * creates a codec that can be used for conversion between
178 * Unicode and codepaged characters (and vice versa).
179 *
180 * A codec essentially consists of two tables which can
181 * be used for quick index-based lookups in both directions.
182 * This function goes thru the tables provided in
183 * include\encodings\*.h and builds the codec tables
184 * from them.
185 *
186 * This function takes an encoding ID as input. Each
187 * codepage table in include\encodings\*.h has one
188 * of those IDs assigned. Use encFindIdForCodepage
189 * to find the ID for a given OS/2 codepage.
190 *
191 * Use codecs carefully and only when they are really
192 * needed for a specific conversion. Building a codec
193 * is expensive, so you should create a codec once
194 * and reuse it for future conversions. In addition,
195 * create codecs only for the codepages that are
196 * actually used. Each codec will take up
197 * n * sizeof(USHORT) bytes, where n is the highest
198 * Unicode character used in the codepage.
199 *
200 * Remarks:
201 *
202 * -- All codepages share the first 128 characters
203 * (0-0x7F) with ASCII.
204 *
205 * -- Since the first 128 characters (0-0x7F) in
206 * Unicode are equivalent to ASCII also, codecs
207 * are not needed if you process ASCII strings
208 * only.
209 *
210 * -- Since the next 128 characters (0x80-0xFF) in
211 * Unicode are equivalent to ISO/IEC 8859-1
212 * (Latin-1), codecs aren't needed for those
213 * strings either.
214 *
215 * Note that codepoints 0x80-0x9F are undefined
216 * in Latin-1 but used as control sequences in
217 * Unicode.
218 *
219 * -- As far as I know, codepage 1252, which is
220 * used per default under Windows, is equivalent
221 * to Latin 1 except that it also defines
222 * codepoints 0x80-0x9F to certain DTP characters.
223 *
224 * -- From my testing, codepage 1004 (which is
225 * described as "Windows-compatible" in most OS/2
226 * docs) is the same as codepage 1252, except for
227 * character 0xAF.
228 *
229 * Unfortunately, OS/2 uses codepage 850 on most
230 * systems (and Windows uses OS/2 codepage 1252),
231 * so for conversion between those, codecs are needed.
232 */
233
234PCONVERSION encCreateCodec(ENCID id)
235{
236 PXWPENCODINGMAP pEncodingMap;
237 unsigned long cArrayEntries;
238
239 if (FindEntry(id,
240 &pEncodingMap,
241 &cArrayEntries))
242 {
243 unsigned short usHighestCP = 0,
244 usHighestUni = 0;
245 unsigned long ul;
246
247 // step 1:
248 // run through the table and calculate the highest
249 // character entry used
250 for (ul = 0;
251 ul < cArrayEntries;
252 ul++)
253 {
254 if (pEncodingMap[ul].usCP > usHighestCP)
255 usHighestCP = pEncodingMap[ul].usCP;
256 if (pEncodingMap[ul].usUni > usHighestUni)
257 usHighestUni = pEncodingMap[ul].usUni;
258 }
259
260 // step 2: allocate encoding table
261 if (usHighestCP && usHighestUni)
262 {
263 PCONVERSION pTableNew;
264 if (pTableNew = NEW(CONVERSION))
265 {
266 unsigned long cbEntriesUniFromCP
267 = (usHighestCP + 1) * sizeof(unsigned short);
268 unsigned long cbEntriesCPFromUni
269 = (usHighestUni + 1) * sizeof(unsigned short);
270
271 ZERO(pTableNew);
272
273 pTableNew->usHighestCP = usHighestCP;
274 pTableNew->usHighestUni = usHighestUni;
275
276 if ( (pTableNew->ausEntriesUniFromCP
277 = (unsigned short*)malloc(cbEntriesUniFromCP))
278 && (pTableNew->ausEntriesCPFromUni
279 = (unsigned short*)malloc(cbEntriesCPFromUni))
280 )
281 {
282 // step 3: fill encoding tables
283
284 memset(pTableNew->ausEntriesUniFromCP,
285 0xFF,
286 cbEntriesUniFromCP);
287 memset(pTableNew->ausEntriesCPFromUni,
288 0xFF,
289 cbEntriesCPFromUni);
290
291 for (ul = 0;
292 ul < cArrayEntries;
293 ul++)
294 {
295 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
296
297 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
298
299 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
300 }
301
302 return (pTableNew);
303 }
304
305 free(pTableNew);
306 }
307 }
308 }
309
310 return (NULL);
311}
312
313/*
314 *@@ encFreeCodec:
315 * frees a codec created with encFreeConversion
316 * and sets the given pointer to NULL.
317 *
318 *@@added V0.9.18 (2002-03-08) [umoeller]
319 */
320
321void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
322{
323 PCONVERSION pTable;
324 if (pTable = *ppTable)
325 {
326 if (pTable->ausEntriesUniFromCP)
327 free(pTable->ausEntriesUniFromCP);
328 if (pTable->ausEntriesCPFromUni)
329 free(pTable->ausEntriesCPFromUni);
330 free(pTable);
331 *ppTable = NULL;
332 }
333}
334
335/*
336 *@@ encChar2Uni:
337 * converts a codepage-specific character
338 * to Unicode, using the given conversion
339 * table from encCreateCodec().
340 *
341 * Returns 0xFFFF on errors, which is unlikely
342 * with Unicode though.
343 *
344 *@@added V0.9.18 (2002-03-08) [umoeller]
345 */
346
347unsigned long encChar2Uni(PCONVERSION pTable,
348 unsigned short c)
349{
350 if ( (pTable)
351 && (c <= pTable->usHighestCP)
352 )
353 return (pTable->ausEntriesUniFromCP[c]);
354
355 return (0xFFFF);
356}
357
358/*
359 *@@ encUni2Char:
360 * converts a Unicode character to the
361 * codepage specified by the given
362 * conversion table from encCreateCodec().
363 *
364 * Returns 0xFFFF if the Unicode character
365 * has no codepage equivalent.
366 *
367 *@@added V0.9.18 (2002-03-08) [umoeller]
368 */
369
370unsigned short encUni2Char(PCONVERSION pTable,
371 unsigned long ulUni)
372{
373 if ( (pTable)
374 && (ulUni <= pTable->usHighestUni)
375 )
376 return (pTable->ausEntriesCPFromUni[ulUni]);
377
378 return (0xFFFF);
379}
380
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, i.e. a byte that cannot
 *      start a sequence or a sequence whose
 *      following bytes are not continuation
 *      bytes (10xxxxxx). In that case the pointer
 *      is advanced past the offending byte and all
 *      bytes after it that have bit 7 set; it
 *      never moves beyond the terminating null
 *      byte.
 *
 *      This returns 0 if **ppch points to a
 *      null character; the pointer is not
 *      advanced then.
 *
 *      Sequences of up to six bytes are accepted.
 *      Overlong forms (other than the lead bytes
 *      0xc0 and 0xc1) and the range of the decoded
 *      value are not checked.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the bytes as unsigned: plain char may be signed, and
    // a sign-extended byte >= 0x80 would fail the range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount = 0;    // bytes in sequence; 0 means illegal
    unsigned long       ul2;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // the lead byte tells us the length of the sequence
    // and holds the topmost bits of the character;
    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte
    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    // else: continuation byte, 0xc0, 0xc1, 0xfe or 0xff
    // as the first byte; ulCount stays 0

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // each following byte must look like 10xxxxxx;
        // this also stops us at a premature null byte
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        // append the six payload bits
        ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return (ulChar);
    }

    // illegal sequence: skip the first byte and all the
    // following bytes until we find something with bit 7
    // off (which includes the terminating null byte)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
503
504
Note: See TracBrowser for help on using the repository browser.