Context Navigation

source: trunk/src/helpers/encodings.c@ 209

Visit:

Last change on this file since 209 was 209, checked in by umoeller, 23 years ago
Dialog formatter rewrite.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 20.5 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding support. Handles all kinds
5	* of legacy codepages (including most OS/2 codepages)
6	* and Unicode in the form of UTF-8 and translations
7	* between them.
8	*
9	* See encCreateCodec for an introduction.
10	*
11	* See http://www.ietf.org/rfc/rfc2279.txt for
12	* RFC 2279, which defines UTF-8.
13	*
14	* Be warned, compilation of this file takes a long
15	* time because this includes all the complex codepages
16	* from include\encodings.
17	*
18	*@@header "encodings\base.h"
19	*@@added V0.9.9 (2001-02-14) [umoeller]
20	*/
21
22	/*
23	* Copyright (C) 2001-2002 Ulrich Möller.
24	* This file is part of the "XWorkplace helpers" source package.
25	* This is free software; you can redistribute it and/or modify
26	* it under the terms of the GNU General Public License as published
27	* by the Free Software Foundation, in version 2 as it comes in the
28	* "COPYING" file of the XWorkplace main distribution.
29	* This program is distributed in the hope that it will be useful,
30	* but WITHOUT ANY WARRANTY; without even the implied warranty of
31	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32	* GNU General Public License for more details.
33	*/
34
35	#define OS2EMX_PLAIN_CHAR
36	// this is needed for "os2emx.h"; if this is defined,
37	// emx will define PSZ as _signed_ char, otherwise
38	// as unsigned char
39
40	#include <stdlib.h>
41	#include <string.h>
42
43	#include "setup.h" // code generation and debugging options
44
45	#include "helpers\standards.h"
46
47	#include "encodings\base.h"
48
49	#include "encodings\unicase.h"
50
51	#include "encodings\alltables.h" // this takes a very long time
52
53	#pragma hdrstop
54
55	/*
56	*@@category: Helpers\National Language Support\Encodings
57	* See encodings.c.
58	*/
59
60	/*
61	*@@ G_aEncodings:
62	* list of all encodings supported by this engine
63	* (i.e. we have a corresponding codepage in
64	* include\encodings\*.h) together with some
65	* additional information for each encoding,
66	* such as the corresponding OS/2 codepage
67	* number and a descriptive string.
68	*
69	*@@added V [umoeller]
70	*/
71
72	struct
73	{
74	ENCID id; // engine ID (enum)
75	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
76	unsigned long cEntries; // entries in map (array item count)
77	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
78	ENCBYTECOUNT bc;
79	const char *pcszDescription; // description
80	} G_aEncodings[] =
81	{
82	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
83
84	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
85	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
86	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
87	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
88	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
89	// Romania, Poland
90	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
91	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
92	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
93	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
94	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
95	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
96	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
97	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
98	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
99	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
100	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
101
102	ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
103	ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
104	ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
105	ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
106
107	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
108	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
109	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
110	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
111	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
112	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
113	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
114	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
115	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
116	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
117	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
118	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
119	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
120	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
121	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
122	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
123	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
124	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
125	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
126	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
127	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
128	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
129	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
130
131	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
132	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
133	};
134
/*
 *@@ ENCCASEFOLD:
 *      case folding lookup table as built by encInitCase
 *      and read by encToUpper.
 *
 *      aulFolds is declared with a single item only; the
 *      structure is allocated with extra room behind it so
 *      that the array can be indexed directly with a
 *      Unicode value. A zero item means "no fold known".
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

struct _ENCCASEFOLD
{
    unsigned long   cEntries;       // item count stored by encInitCase
    unsigned long   aulFolds[1];    // really longer, see above
};

typedef struct _ENCCASEFOLD ENCCASEFOLD;
typedef struct _ENCCASEFOLD *PENCCASEFOLD;

// the one global fold table; NULL until encInitCase has built it
static PENCCASEFOLD G_pFold = NULL;
148
149	/*
150	*@@ encGetTable:
151	*
152	*@@added V0.9.18 (2002-03-08) [umoeller]
153	*/
154
155	int encGetTable(ENCID id,
156	PXWPENCODINGMAP *ppMap,
157	unsigned long *pcEntries)
158	{
159	unsigned long ul;
160	for (ul = 0;
161	ul < ARRAYITEMCOUNT(G_aEncodings);
162	ul++)
163	{
164	if (G_aEncodings[ul].id == id)
165	{
166	*ppMap = G_aEncodings[ul].pMap;
167	*pcEntries = G_aEncodings[ul].cEntries;
168	return 1;
169	}
170	}
171
172	return 0;
173	}
174
175	/*
176	*@@ encFindIdForCodepage:
177	* returns the ENCID for the given OS/2
178	* codepage, or UNSUPPORTED if there's none.
179	*
180	*@@added V0.9.18 (2002-03-08) [umoeller]
181	*/
182
183	ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
184	const char **ppcszDescription, // out: codepage description; ptr can be NULL
185	ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
186	{
187	unsigned long ul;
188	for (ul = 0;
189	ul < ARRAYITEMCOUNT(G_aEncodings);
190	ul++)
191	{
192	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
193
194	{
195	if (ppcszDescription)
196	*ppcszDescription = G_aEncodings[ul].pcszDescription;
197	if (pByteCount)
198	*pByteCount = G_aEncodings[ul].bc;
199	return G_aEncodings[ul].id;
200	}
201	}
202
203	return UNSUPPORTED;
204	}
205
206	/*
207	*@@ encCreateCodec:
208	* creates a codec that can be used for conversion between
209	* Unicode and codepaged characters (and vice versa).
210	*
211	* A codec essentially consists of two tables which can
212	* be used for quick index-based lookups in both directions.
213	* This function goes thru the tables provided in
214	* include\encodings\*.h and builds the codec tables
215	* from them.
216	*
217	* This function takes an encoding ID as input. Each
218	* codepage table in include\encodings\*.h has one
219	* of those IDs assigned. Use encFindIdForCodepage
220	* to find the ID for a given OS/2 codepage.
221	*
222	* Use codecs carefully and only when they are really
223	* needed for a specific conversion. Building a codec
224	* is expensive, so you should create a codec once
225	* and reuse it for future conversions. In addition,
226	* create codecs only for the codepages that are
227	* actually used. Each codec will take up
228	* n * sizeof(USHORT) bytes, where n is the highest
229	* Unicode character used in the codepage.
230	*
231	* Codec remarks:
232	*
233	* -- All codepages share the first 128 characters
234	* (0-0x7F) with ASCII.
235	*
236	* -- Since the first 128 characters (0-0x7F) in
237	* Unicode are equivalent to ASCII also, codecs
238	* are not needed if you process ASCII strings
239	* only.
240	*
241	* -- Since the next 128 characters (0x80-0xFF) in
242	* Unicode are equivalent to ISO/IEC 8859-1
243	* (Latin-1), codecs aren't needed for those
244	* strings either.
245	*
246	* Note that codepoints 0x80-0x9F are undefined
247	* in Latin-1 but used as control sequences in
248	* Unicode.
249	*
250	* -- As far as I know, codepage 1252, which is
251	* used per default under Windows, is equivalent
252	* to Latin 1 except that it also defines
253	* codepoints 0x80-0x9F to certain DTP characters.
254	*
255	* -- From my testing, codepage 1004 (which is
256	* described as "Windows-compatible" in most OS/2
257	* docs) is the same as codepage 1252, except for
258	* character 0xAF.
259	*
260	* Unfortunately, OS/2 uses codepage 850 on most
261	* systems (and Windows uses OS/2 codepage 1252),
262	* so for conversion between those, codecs are needed.
263	*
264	* This works and is presently used in WarpIN.
265	*/
266
267	PCONVERSION encCreateCodec(ENCID id)
268	{
269	PXWPENCODINGMAP pEncodingMap;
270	unsigned long cArrayEntries;
271
272	if (encGetTable(id,
273	&pEncodingMap,
274	&cArrayEntries))
275	{
276	unsigned short usHighestCP = 0,
277	usHighestUni = 0;
278	unsigned long ul;
279
280	// step 1:
281	// run through the table and calculate the highest
282	// character entry used
283	for (ul = 0;
284	ul < cArrayEntries;
285	ul++)
286	{
287	if (pEncodingMap[ul].usCP > usHighestCP)
288	usHighestCP = pEncodingMap[ul].usCP;
289	if (pEncodingMap[ul].usUni > usHighestUni)
290	usHighestUni = pEncodingMap[ul].usUni;
291	}
292
293	// step 2: allocate encoding table
294	if (usHighestCP && usHighestUni)
295	{
296	PCONVERSION pTableNew;
297	if (pTableNew = NEW(CONVERSION))
298	{
299	unsigned long cbEntriesUniFromCP
300	= (usHighestCP + 1) * sizeof(unsigned short);
301	unsigned long cbEntriesCPFromUni
302	= (usHighestUni + 1) * sizeof(unsigned short);
303
304	ZERO(pTableNew);
305
306	pTableNew->usHighestCP = usHighestCP;
307	pTableNew->usHighestUni = usHighestUni;
308
309	if ( (pTableNew->ausEntriesUniFromCP
310	= (unsigned short*)malloc(cbEntriesUniFromCP))
311	&& (pTableNew->ausEntriesCPFromUni
312	= (unsigned short*)malloc(cbEntriesCPFromUni))
313	)
314	{
315	// step 3: fill encoding tables
316
317	memset(pTableNew->ausEntriesUniFromCP,
318	0xFF,
319	cbEntriesUniFromCP);
320	memset(pTableNew->ausEntriesCPFromUni,
321	0xFF,
322	cbEntriesCPFromUni);
323
324	for (ul = 0;
325	ul < cArrayEntries;
326	ul++)
327	{
328	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
329
330	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
331
332	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
333	}
334
335	return pTableNew;
336	}
337
338	free(pTableNew);
339	}
340	}
341	}
342
343	return NULL;
344	}
345
346	/*
347	*@@ encFreeCodec:
348	* frees a codec created with encFreeConversion
349	* and sets the given pointer to NULL.
350	*
351	* This works and is presently used in WarpIN.
352	*
353	*@@added V0.9.18 (2002-03-08) [umoeller]
354	*/
355
356	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
357	{
358	PCONVERSION pTable;
359	if (pTable = *ppTable)
360	{
361	if (pTable->ausEntriesUniFromCP)
362	free(pTable->ausEntriesUniFromCP);
363	if (pTable->ausEntriesCPFromUni)
364	free(pTable->ausEntriesCPFromUni);
365	free(pTable);
366	*ppTable = NULL;
367	}
368	}
369
370	/*
371	*@@ encChar2Uni:
372	* converts a codepage-specific character
373	* to Unicode, using the given conversion
374	* table from encCreateCodec().
375	*
376	* Returns 0xFFFF on errors, which is unlikely
377	* with Unicode though.
378	*
379	* This works and is presently used in WarpIN.
380	*
381	*@@added V0.9.18 (2002-03-08) [umoeller]
382	*/
383
384	unsigned long encChar2Uni(PCONVERSION pTable,
385	unsigned short c)
386	{
387	if ( (pTable)
388	&& (c <= pTable->usHighestCP)
389	)
390	return pTable->ausEntriesUniFromCP[c];
391
392	return 0xFFFF;
393	}
394
395	/*
396	*@@ encUni2Char:
397	* converts a Unicode character to the
398	* codepage specified by the given
399	* conversion table from encCreateCodec().
400	*
401	* Returns 0xFFFF if the Unicode character
402	* has no codepage equivalent.
403	*
404	* This works and is presently used in WarpIN.
405	*
406	*@@added V0.9.18 (2002-03-08) [umoeller]
407	*/
408
409	unsigned short encUni2Char(PCONVERSION pTable,
410	unsigned long ulUni)
411	{
412	if ( (pTable)
413	&& (ulUni <= pTable->usHighestUni)
414	)
415	return pTable->ausEntriesCPFromUni[ulUni];
416
417	return 0xFFFF;
418	}
419
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      For a valid character, *ppch is advanced
 *      by the length of its encoding (one to
 *      six bytes, as in RFC 2279).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding: a byte that cannot
 *      start a character (0x80-0xC1, 0xFE, 0xFF)
 *      or a sequence in which a following byte
 *      is not of the form 10xxxxxx. In that case
 *      *ppch is advanced to the next byte that
 *      has bit 7 off (which may be the string's
 *      null terminator), so it always moves by
 *      at least one byte.
 *
 *      This returns 0 if **ppch points to a
 *      null character; *ppch is not advanced
 *      then.
 *
 *      Overlong forms of three and more bytes
 *      and surrogate values are not rejected.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read through unsigned char: with a signed plain
    // char, bytes >= 0x80 would be sign-extended and
    // fail the range tests below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount;
    unsigned long       ul2;

    if (!ulChar)
        // null is null
        return 0;

    // if (ulChar < 0x80): simple, one byte only... use that
    if (ulChar < 0x80)
    {
        (*ppch)++;
        return ulChar;
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // not a valid first byte; 0 means "illegal" below
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // each following byte must be 10xxxxxx; this
        // also stops at a null terminator, so we never
        // read beyond the end of the string
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar <<= 6;
        ulChar |= ulChar2 & 0x3f;
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return ulChar;
    }

    // illegal: skip all the following characters
    // until we find something with bit 7 off
    // (the null terminator has bit 7 off too)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return (unsigned long)-1;
}
544
545	/*
546	*@@ encInitCase:
547	* creates a casefold for later use with
548	* encToUpper.
549	*
550	* This only uses one-byte sequences from
551	* the Unicode case folding table (see
552	* include\encodings\unicase.h), so this
553	* cannot be used for expanding characters
554	* at this point.
555	*
556	* Returns 1 (TRUE) on success.
557	*
558	* This works and is presently used in WarpIN.
559	*
560	*@@added V0.9.20 (2002-07-03) [umoeller]
561	*/
562
563	int encInitCase(void)
564	{
565	unsigned long ul,
566	cEntries = 0,
567	cb;
568
569	for (ul = 0;
570	ul < ARRAYITEMCOUNT(G_aCaseFolds);
571	++ul)
572	{
573	// ignore CASEFL_T (duplicate entries for i chars)
574	// and CASEFL_F (expansions)
575	if ( (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
576	&& (G_aCaseFolds[ul].ulLow > cEntries)
577	)
578	cEntries = G_aCaseFolds[ul].ulLow;
579	}
580
581	cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
582	if (G_pFold = (PENCCASEFOLD)malloc(cb))
583	{
584	memset(G_pFold, 0, cb);
585	G_pFold->cEntries = cEntries;
586
587	for (ul = 0;
588	ul < ARRAYITEMCOUNT(G_aCaseFolds);
589	++ul)
590	{
591	if (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
592	G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
593	}
594
595	return 1;
596	}
597
598	return 0;
599	}
600
601	/*
602	*@@ encToUpper:
603	* converts the given unicode character to
604	* upper case, if possible, or returns
605	* ulUni back if Unicode doesn't define
606	* an upper-case character for it.
607	*
608	* Special cases:
609	*
610	* -- Returns 0 for 0.
611	*
612	* Preconditions:
613	*
614	* -- You must call encInitCase before
615	* the first call.
616	*
617	* This works and is presently used in WarpIN.
618	*
619	*@@added V0.9.20 (2002-07-03) [umoeller]
620	*/
621
622	unsigned long encToUpper(unsigned long ulUni)
623	{
624	unsigned long ulFold;
625
626	if ( (ulUni < G_pFold->cEntries)
627	&& (ulFold = G_pFold->aulFolds[ulUni])
628	)
629	return ulFold;
630
631	return ulUni;
632	}
633
/*
 *@@ encicmp:
 *      like stricmp, but for UTF-8 strings.
 *      This uses encToUpper for the comparisons.
 *
 *      Like stricmp, this returns:
 *
 *      --  -1 if pcsz1 is less than pcsz2
 *      --  0 if pcsz1 is equal to pcsz2
 *      --  +1 if pcsz1 is greater than pcsz2
 *
 *      However, this does not crash on passing
 *      in NULL strings: a NULL pointer is treated
 *      like an empty string.
 *
 *      Preconditions:
 *
 *      --  You must call encInitCase before
 *          the first call.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

int encicmp(const char *pcsz1,
            const char *pcsz2)
{
    // encDecodeUTF8 dereferences the string pointer,
    // so replace NULL with an empty string first
    const char  *p1 = (pcsz1) ? pcsz1 : "",
                *p2 = (pcsz2) ? pcsz2 : "";

    unsigned long ul1, ul2;

    for (;;)
    {
        // encDecodeUTF8 returns null for null without
        // advancing, so this is safe at the string ends
        ul1 = encToUpper(encDecodeUTF8(&p1));
        ul2 = encToUpper(encDecodeUTF8(&p2));

        if (ul1 < ul2)
            return -1;
        if (ul1 > ul2)
            return +1;

        // both are equal here, so if one is null, the
        // other is too and both strings have ended
        if (!ul1)
            return 0;

        // both are non-null: continue
    }
}
695

Note: See TracBrowser for help on using the repository browser.

Download in other formats: