source: trunk/src/helpers/encodings.c@ 154

Last change on this file since 154 was 154, checked in by umoeller, 23 years ago

Misc changes.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1
2/*
3 *@@sourcefile encodings.c:
4 * character encoding translations.
5 *
6 * See encCreateCodec for an introduction.
7 *
8 * Be warned, compilation of this file takes a long
9 * time because this includes all the complex codepage
10 * tables from include\encodings.
11 *
12 *@@header "encodings\base.h"
13 *@@added V0.9.9 (2001-02-14) [umoeller]
14 */
15
16/*
17 * Copyright (C) 2001 Ulrich Möller.
18 * This file is part of the "XWorkplace helpers" source package.
19 * This is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published
21 * by the Free Software Foundation, in version 2 as it comes in the
22 * "COPYING" file of the XWorkplace main distribution.
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 */
28
29#define OS2EMX_PLAIN_CHAR
30 // this is needed for "os2emx.h"; if this is defined,
31 // emx will define PSZ as _signed_ char, otherwise
32 // as unsigned char
33
34#include <stdlib.h>
35#include <string.h>
36
37#include "setup.h" // code generation and debugging options
38
39#include "helpers\standards.h"
40
41#include "encodings\base.h"
42#include "encodings\alltables.h"
43// #include "encodings\collate.h"
44
45#pragma hdrstop
46
47/*
48 *@@ G_aEncodings:
49 * list of all encodings supported by this engine
50 * (i.e. we have a corresponding codepage in
51 * include\encodings\*.h) together with some
52 * additional information for each encoding,
53 * such as the corresponding OS/2 codepage
54 * number and a descriptive string.
55 *
56 *@@added V [umoeller]
57 */
58
59struct
60{
61 ENCID id; // engine ID (enum)
62 PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
63 unsigned long cEntries; // entries in map (array item count)
64 unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
65 ENCBYTECOUNT bc;
66 const char *pcszDescription; // description
67} G_aEncodings[] =
68 {
69 #define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
70
71 ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
72 ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
73 ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
74 ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
75 ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
76 // Romania, Poland
77 ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
78 ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
79 ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
80 ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
81 ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
82 ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
83 ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
84 ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
85 ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
86 ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
87 ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
88
89 ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
90 ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
91 ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
92 ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
93
94 ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
95 ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
96 ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
97 ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
98 ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
99 ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
100 ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
101 ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
102 ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
103 ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
104 ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
105 ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
106 ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
107 ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
108 ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
109 ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
110 ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
111 ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
112 ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
113 ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
114 ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
115 ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
116 ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
117
118 UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
119 UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
120 };
121
122/*
123 *@@ FindEntry:
124 *
125 *@@added V0.9.18 (2002-03-08) [umoeller]
126 */
127
128static int FindEntry(ENCID id,
129 PXWPENCODINGMAP *ppMap,
130 unsigned long *pcEntries)
131{
132 unsigned long ul;
133 for (ul = 0;
134 ul < ARRAYITEMCOUNT(G_aEncodings);
135 ul++)
136 {
137 if (G_aEncodings[ul].id == id)
138 {
139 *ppMap = G_aEncodings[ul].pMap;
140 *pcEntries = G_aEncodings[ul].cEntries;
141 return (1);
142 }
143 }
144
145 return (0);
146}
147
148/*
149 *@@ encFindIdForCodepage:
150 * returns the ENCID for the given OS/2
151 * codepage, or UNSUPPORTED if there's none.
152 *
153 *@@added V0.9.18 (2002-03-08) [umoeller]
154 */
155
156ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
157 const char **ppcszDescription, // out: codepage description; ptr can be NULL
158 ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE
159{
160 unsigned long ul;
161 for (ul = 0;
162 ul < ARRAYITEMCOUNT(G_aEncodings);
163 ul++)
164 {
165 if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
166
167 {
168 if (ppcszDescription)
169 *ppcszDescription = G_aEncodings[ul].pcszDescription;
170 if (pByteCount)
171 *pByteCount = G_aEncodings[ul].bc;
172 return G_aEncodings[ul].id;
173 }
174 }
175
176 return (UNSUPPORTED);
177}
178
179/*
180 *@@ encCreateCodec:
181 * creates a codec that can be used for conversion between
182 * Unicode and codepaged characters (and vice versa).
183 *
184 * A codec essentially consists of two tables which can
185 * be used for quick index-based lookups in both directions.
186 * This function goes thru the tables provided in
187 * include\encodings\*.h and builds the codec tables
188 * from them.
189 *
190 * This function takes an encoding ID as input. Each
191 * codepage table in include\encodings\*.h has one
192 * of those IDs assigned. Use encFindIdForCodepage
193 * to find the ID for a given OS/2 codepage.
194 *
195 * Use codecs carefully and only when they are really
196 * needed for a specific conversion. Building a codec
197 * is expensive, so you should create a codec once
198 * and reuse it for future conversions. In addition,
199 * create codecs only for the codepages that are
200 * actually used. Each codec will take up
201 * n * sizeof(USHORT) bytes, where n is the highest
202 * Unicode character used in the codepage.
203 *
204 * Codec remarks:
205 *
206 * -- All codepages share the first 128 characters
207 * (0-0x7F) with ASCII.
208 *
209 * -- Since the first 128 characters (0-0x7F) in
210 * Unicode are equivalent to ASCII also, codecs
211 * are not needed if you process ASCII strings
212 * only.
213 *
214 * -- Since the next 128 characters (0x80-0xFF) in
215 * Unicode are equivalent to ISO/IEC 8859-1
216 * (Latin-1), codecs aren't needed for those
217 * strings either.
218 *
219 * Note that codepoints 0x80-0x9F are undefined
220 * in Latin-1 but used as control sequences in
221 * Unicode.
222 *
223 * -- As far as I know, codepage 1252, which is
224 * used per default under Windows, is equivalent
225 * to Latin 1 except that it also defines
226 * codepoints 0x80-0x9F to certain DTP characters.
227 *
228 * -- From my testing, codepage 1004 (which is
229 * described as "Windows-compatible" in most OS/2
230 * docs) is the same as codepage 1252, except for
231 * character 0xAF.
232 *
233 * Unfortunately, OS/2 uses codepage 850 on most
234 * systems (and Windows uses OS/2 codepage 1252),
235 * so for conversion between those, codecs are needed.
236 */
237
238PCONVERSION encCreateCodec(ENCID id)
239{
240 PXWPENCODINGMAP pEncodingMap;
241 unsigned long cArrayEntries;
242
243 if (FindEntry(id,
244 &pEncodingMap,
245 &cArrayEntries))
246 {
247 unsigned short usHighestCP = 0,
248 usHighestUni = 0;
249 unsigned long ul;
250
251 // step 1:
252 // run through the table and calculate the highest
253 // character entry used
254 for (ul = 0;
255 ul < cArrayEntries;
256 ul++)
257 {
258 if (pEncodingMap[ul].usCP > usHighestCP)
259 usHighestCP = pEncodingMap[ul].usCP;
260 if (pEncodingMap[ul].usUni > usHighestUni)
261 usHighestUni = pEncodingMap[ul].usUni;
262 }
263
264 // step 2: allocate encoding table
265 if (usHighestCP && usHighestUni)
266 {
267 PCONVERSION pTableNew;
268 if (pTableNew = NEW(CONVERSION))
269 {
270 unsigned long cbEntriesUniFromCP
271 = (usHighestCP + 1) * sizeof(unsigned short);
272 unsigned long cbEntriesCPFromUni
273 = (usHighestUni + 1) * sizeof(unsigned short);
274
275 ZERO(pTableNew);
276
277 pTableNew->usHighestCP = usHighestCP;
278 pTableNew->usHighestUni = usHighestUni;
279
280 if ( (pTableNew->ausEntriesUniFromCP
281 = (unsigned short*)malloc(cbEntriesUniFromCP))
282 && (pTableNew->ausEntriesCPFromUni
283 = (unsigned short*)malloc(cbEntriesCPFromUni))
284 )
285 {
286 // step 3: fill encoding tables
287
288 memset(pTableNew->ausEntriesUniFromCP,
289 0xFF,
290 cbEntriesUniFromCP);
291 memset(pTableNew->ausEntriesCPFromUni,
292 0xFF,
293 cbEntriesCPFromUni);
294
295 for (ul = 0;
296 ul < cArrayEntries;
297 ul++)
298 {
299 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
300
301 pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
302
303 pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
304 }
305
306 return (pTableNew);
307 }
308
309 free(pTableNew);
310 }
311 }
312 }
313
314 return (NULL);
315}
316
317/*
318 *@@ encFreeCodec:
319 * frees a codec created with encFreeConversion
320 * and sets the given pointer to NULL.
321 *
322 *@@added V0.9.18 (2002-03-08) [umoeller]
323 */
324
325void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
326{
327 PCONVERSION pTable;
328 if (pTable = *ppTable)
329 {
330 if (pTable->ausEntriesUniFromCP)
331 free(pTable->ausEntriesUniFromCP);
332 if (pTable->ausEntriesCPFromUni)
333 free(pTable->ausEntriesCPFromUni);
334 free(pTable);
335 *ppTable = NULL;
336 }
337}
338
339/*
340 *@@ encChar2Uni:
341 * converts a codepage-specific character
342 * to Unicode, using the given conversion
343 * table from encCreateCodec().
344 *
345 * Returns 0xFFFF on errors, which is unlikely
346 * with Unicode though.
347 *
348 *@@added V0.9.18 (2002-03-08) [umoeller]
349 */
350
351unsigned long encChar2Uni(PCONVERSION pTable,
352 unsigned short c)
353{
354 if ( (pTable)
355 && (c <= pTable->usHighestCP)
356 )
357 return (pTable->ausEntriesUniFromCP[c]);
358
359 return (0xFFFF);
360}
361
362/*
363 *@@ encUni2Char:
364 * converts a Unicode character to the
365 * codepage specified by the given
366 * conversion table from encCreateCodec().
367 *
368 * Returns 0xFFFF if the Unicode character
369 * has no codepage equivalent.
370 *
371 *@@added V0.9.18 (2002-03-08) [umoeller]
372 */
373
374unsigned short encUni2Char(PCONVERSION pTable,
375 unsigned long ulUni)
376{
377 if ( (pTable)
378 && (ulUni <= pTable->usHighestUni)
379 )
380 return (pTable->ausEntriesCPFromUni[ulUni]);
381
382 return (0xFFFF);
383}
384
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      Unless **ppch is a null character, this
 *      function will advance *ppch by at least
 *      one byte (or more if the UTF-8 char
 *      initially pointed to introduces a
 *      multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, that is, a byte which
 *      cannot start a sequence (0x80-0xC1, 0xFE,
 *      0xFF) or a sequence in which a following
 *      byte is not of the form 10xxxxxx. In that
 *      case the pointer is advanced to the next
 *      byte with bit 7 off (possibly the null
 *      terminator).
 *
 *      This returns 0 if **ppch points to a
 *      null character; the pointer is not
 *      advanced then.
 *
 *      Sequences of up to six bytes are accepted.
 *      Overlong forms of three and more bytes are
 *      not detected.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read bytes as unsigned: with a signed "char", bytes
    // >= 0x80 would be sign-extended and break the range
    // checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount = 1;
    unsigned long       ul2;
    int                 fIllegal = 0;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte, 0xc0, 0xc1, 0xfe or 0xff
        fIllegal = 1;

    if (!fIllegal)
    {
        // go for the second and more bytes then
        for (ul2 = 1;
             ul2 < ulCount;
             ++ul2)
        {
            unsigned long ulChar2 = pb[ul2];

            // every following byte must be 10xxxxxx; this
            // also stops us at the null terminator
            if ((ulChar2 & 0xc0) != 0x80)
            {
                fIllegal = 1;
                break;
            }

            ulChar <<= 6;
            ulChar |= ulChar2 & 0x3f;
        }
    }

    if (fIllegal)
    {
        // skip all the following characters
        // until we find something with bit 7 off
        // (the null terminator qualifies)
        do
        {
            ++pb;
        } while (*pb & 0x80);

        *ppch = (const char*)pb;

        return ((unsigned long)-1);
    }

    *ppch += ulCount;

    return (ulChar);
}
507
508
Note: See TracBrowser for help on using the repository browser.