Context Navigation

source: trunk/src/helpers/encodings.c@ 186

Visit:

Last change on this file since 186 was 186, checked in by umoeller, 23 years ago
Some Unicode fixes.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 20.5 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding translations.
5	*
6	* See encCreateCodec for an introduction.
7	*
8	* Be warned, compilation of this file takes a long
9	* time because this includes all the complex codepage
10	* tables from include\encodings.
11	*
12	*@@header "encodings\base.h"
13	*@@added V0.9.9 (2001-02-14) [umoeller]
14	*/
15
16	/*
17	* Copyright (C) 2001-2002 Ulrich Möller.
18	* This file is part of the "XWorkplace helpers" source package.
19	* This is free software; you can redistribute it and/or modify
20	* it under the terms of the GNU General Public License as published
21	* by the Free Software Foundation, in version 2 as it comes in the
22	* "COPYING" file of the XWorkplace main distribution.
23	* This program is distributed in the hope that it will be useful,
24	* but WITHOUT ANY WARRANTY; without even the implied warranty of
25	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26	* GNU General Public License for more details.
27	*/
28
29	#define OS2EMX_PLAIN_CHAR
30	// this is needed for "os2emx.h"; if this is defined,
31	// emx will define PSZ as _signed_ char, otherwise
32	// as unsigned char
33
34	#include <stdlib.h>
35	#include <string.h>
36
37	#include "setup.h" // code generation and debugging options
38
39	#include "helpers\standards.h"
40
41	#include "encodings\base.h"
42
43	#include "encodings\unicase.h"
44
45	#include "encodings\alltables.h" // this takes a very long time
46
47	#pragma hdrstop
48
49	/*
50	*@@category: Helpers\National Language Support\Encodings
51	* See encodings.c.
52	*/
53
54	/*
55	*@@ G_aEncodings:
56	* list of all encodings supported by this engine
57	* (i.e. we have a corresponding codepage in
58	* include\encodings\*.h) together with some
59	* additional information for each encoding,
60	* such as the corresponding OS/2 codepage
61	* number and a descriptive string.
62	*
63	*@@added V [umoeller]
64	*/
65
66	struct
67	{
68	ENCID id; // engine ID (enum)
69	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
70	unsigned long cEntries; // entries in map (array item count)
71	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
72	ENCBYTECOUNT bc;
73	const char *pcszDescription; // description
74	} G_aEncodings[] =
75	{
76	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
77
78	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
79	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
80	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
81	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
82	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
83	// Romania, Poland
84	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
85	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
86	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
87	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
88	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
89	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
90	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
91	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
92	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
93	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
94	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
95
96	ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
97	ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
98	ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
99	ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
100
101	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
102	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
103	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
104	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
105	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
106	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
107	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
108	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
109	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
110	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
111	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
112	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
113	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
114	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
115	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
116	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
117	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
118	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
119	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
120	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
121	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
122	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
123	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
124
125	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
126	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
127	};
128
/*
 *@@ ENCCASEFOLD:
 *      in-memory case folding table. encInitCase allocates
 *      one of these and encToUpper reads it.
 *
 *      aulFolds is declared with a single item but is really
 *      a variable-length tail: encInitCase allocates
 *      sizeof(ENCCASEFOLD) plus room for the additional items,
 *      so the declared item must stay in place for that size
 *      computation to be right. An item of 0 means "no
 *      replacement" as far as encToUpper is concerned.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

struct _ENCCASEFOLD
{
    unsigned long   cEntries;       // bound that encToUpper checks before indexing aulFolds
    unsigned long   aulFolds[1];    // replacement character, indexed by Unicode value
};

typedef struct _ENCCASEFOLD ENCCASEFOLD;
typedef struct _ENCCASEFOLD *PENCCASEFOLD;

// the one global fold table; NULL until encInitCase has built it
static PENCCASEFOLD G_pFold = NULL;
142
143	/*
144	*@@ encGetTable:
145	*
146	*@@added V0.9.18 (2002-03-08) [umoeller]
147	*/
148
149	int encGetTable(ENCID id,
150	PXWPENCODINGMAP *ppMap,
151	unsigned long *pcEntries)
152	{
153	unsigned long ul;
154	for (ul = 0;
155	ul < ARRAYITEMCOUNT(G_aEncodings);
156	ul++)
157	{
158	if (G_aEncodings[ul].id == id)
159	{
160	*ppMap = G_aEncodings[ul].pMap;
161	*pcEntries = G_aEncodings[ul].cEntries;
162	return (1);
163	}
164	}
165
166	return (0);
167	}
168
169	/*
170	*@@ encFindIdForCodepage:
171	* returns the ENCID for the given OS/2
172	* codepage, or UNSUPPORTED if there's none.
173	*
174	*@@added V0.9.18 (2002-03-08) [umoeller]
175	*/
176
177	ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
178	const char **ppcszDescription, // out: codepage description; ptr can be NULL
179	ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
180	{
181	unsigned long ul;
182	for (ul = 0;
183	ul < ARRAYITEMCOUNT(G_aEncodings);
184	ul++)
185	{
186	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
187
188	{
189	if (ppcszDescription)
190	*ppcszDescription = G_aEncodings[ul].pcszDescription;
191	if (pByteCount)
192	*pByteCount = G_aEncodings[ul].bc;
193	return G_aEncodings[ul].id;
194	}
195	}
196
197	return (UNSUPPORTED);
198	}
199
200	/*
201	*@@ encCreateCodec:
202	* creates a codec that can be used for conversion between
203	* Unicode and codepaged characters (and vice versa).
204	*
205	* A codec essentially consists of two tables which can
206	* be used for quick index-based lookups in both directions.
207	* This function goes thru the tables provided in
208	* include\encodings\*.h and builds the codec tables
209	* from them.
210	*
211	* This function takes an encoding ID as input. Each
212	* codepage table in include\encodings\*.h has one
213	* of those IDs assigned. Use encFindIdForCodepage
214	* to find the ID for a given OS/2 codepage.
215	*
216	* Use codecs carefully and only when they are really
217	* needed for a specific conversion. Building a codec
218	* is expensive, so you should create a codec once
219	* and reuse it for future conversions. In addition,
220	* create codecs only for the codepages that are
221	* actually used. Each codec will take up
222	* n * sizeof(USHORT) bytes, where n is the highest
223	* Unicode character used in the codepage.
224	*
225	* Codec remarks:
226	*
227	* -- All codepages share the first 128 characters
228	* (0-0x7F) with ASCII.
229	*
230	* -- Since the first 128 characters (0-0x7F) in
231	* Unicode are equivalent to ASCII also, codecs
232	* are not needed if you process ASCII strings
233	* only.
234	*
235	* -- Since the next 128 characters (0x80-0xFF) in
236	* Unicode are equivalent to ISO/IEC 8859-1
237	* (Latin-1), codecs aren't needed for those
238	* strings either.
239	*
240	* Note that codepoints 0x80-0x9F are undefined
241	* in Latin-1 but used as control sequences in
242	* Unicode.
243	*
244	* -- As far as I know, codepage 1252, which is
245	* used per default under Windows, is equivalent
246	* to Latin 1 except that it also defines
247	* codepoints 0x80-0x9F to certain DTP characters.
248	*
249	* -- From my testing, codepage 1004 (which is
250	* described as "Windows-compatible" in most OS/2
251	* docs) is the same as codepage 1252, except for
252	* character 0xAF.
253	*
254	* Unfortunately, OS/2 uses codepage 850 on most
255	* systems (and Windows uses OS/2 codepage 1252),
256	* so for conversion between those, codecs are needed.
257	*
258	* This works and is presently used in WarpIN.
259	*/
260
261	PCONVERSION encCreateCodec(ENCID id)
262	{
263	PXWPENCODINGMAP pEncodingMap;
264	unsigned long cArrayEntries;
265
266	if (encGetTable(id,
267	&pEncodingMap,
268	&cArrayEntries))
269	{
270	unsigned short usHighestCP = 0,
271	usHighestUni = 0;
272	unsigned long ul;
273
274	// step 1:
275	// run through the table and calculate the highest
276	// character entry used
277	for (ul = 0;
278	ul < cArrayEntries;
279	ul++)
280	{
281	if (pEncodingMap[ul].usCP > usHighestCP)
282	usHighestCP = pEncodingMap[ul].usCP;
283	if (pEncodingMap[ul].usUni > usHighestUni)
284	usHighestUni = pEncodingMap[ul].usUni;
285	}
286
287	// step 2: allocate encoding table
288	if (usHighestCP && usHighestUni)
289	{
290	PCONVERSION pTableNew;
291	if (pTableNew = NEW(CONVERSION))
292	{
293	unsigned long cbEntriesUniFromCP
294	= (usHighestCP + 1) * sizeof(unsigned short);
295	unsigned long cbEntriesCPFromUni
296	= (usHighestUni + 1) * sizeof(unsigned short);
297
298	ZERO(pTableNew);
299
300	pTableNew->usHighestCP = usHighestCP;
301	pTableNew->usHighestUni = usHighestUni;
302
303	if ( (pTableNew->ausEntriesUniFromCP
304	= (unsigned short*)malloc(cbEntriesUniFromCP))
305	&& (pTableNew->ausEntriesCPFromUni
306	= (unsigned short*)malloc(cbEntriesCPFromUni))
307	)
308	{
309	// step 3: fill encoding tables
310
311	memset(pTableNew->ausEntriesUniFromCP,
312	0xFF,
313	cbEntriesUniFromCP);
314	memset(pTableNew->ausEntriesCPFromUni,
315	0xFF,
316	cbEntriesCPFromUni);
317
318	for (ul = 0;
319	ul < cArrayEntries;
320	ul++)
321	{
322	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
323
324	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
325
326	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
327	}
328
329	return (pTableNew);
330	}
331
332	free(pTableNew);
333	}
334	}
335	}
336
337	return NULL;
338	}
339
340	/*
341	*@@ encFreeCodec:
342	* frees a codec created with encFreeConversion
343	* and sets the given pointer to NULL.
344	*
345	* This works and is presently used in WarpIN.
346	*
347	*@@added V0.9.18 (2002-03-08) [umoeller]
348	*/
349
350	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
351	{
352	PCONVERSION pTable;
353	if (pTable = *ppTable)
354	{
355	if (pTable->ausEntriesUniFromCP)
356	free(pTable->ausEntriesUniFromCP);
357	if (pTable->ausEntriesCPFromUni)
358	free(pTable->ausEntriesCPFromUni);
359	free(pTable);
360	*ppTable = NULL;
361	}
362	}
363
364	/*
365	*@@ encChar2Uni:
366	* converts a codepage-specific character
367	* to Unicode, using the given conversion
368	* table from encCreateCodec().
369	*
370	* Returns 0xFFFF on errors, which is unlikely
371	* with Unicode though.
372	*
373	* This works and is presently used in WarpIN.
374	*
375	*@@added V0.9.18 (2002-03-08) [umoeller]
376	*/
377
378	unsigned long encChar2Uni(PCONVERSION pTable,
379	unsigned short c)
380	{
381	if ( (pTable)
382	&& (c <= pTable->usHighestCP)
383	)
384	return (pTable->ausEntriesUniFromCP[c]);
385
386	return (0xFFFF);
387	}
388
389	/*
390	*@@ encUni2Char:
391	* converts a Unicode character to the
392	* codepage specified by the given
393	* conversion table from encCreateCodec().
394	*
395	* Returns 0xFFFF if the Unicode character
396	* has no codepage equivalent.
397	*
398	* This works and is presently used in WarpIN.
399	*
400	*@@added V0.9.18 (2002-03-08) [umoeller]
401	*/
402
403	unsigned short encUni2Char(PCONVERSION pTable,
404	unsigned long ulUni)
405	{
406	if ( (pTable)
407	&& (ulUni <= pTable->usHighestUni)
408	)
409	return (pTable->ausEntriesCPFromUni[ulUni]);
410
411	return (0xFFFF);
412	}
413
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding: a stray continuation
 *      byte, an illegal lead byte, a sequence
 *      that is cut short, or an overlong form.
 *      In that case the pointer is advanced
 *      past the offending lead byte and any
 *      continuation bytes which follow it, so
 *      that the next call starts on a byte that
 *      can begin a character (or on the null
 *      terminator).
 *
 *      This returns 0 if **ppch points to a
 *      null character, or if ppch or *ppch is
 *      NULL. The pointer is not advanced then.
 *
 *      Sequences of up to six bytes are accepted,
 *      as before. Surrogate values are not
 *      filtered out.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // smallest value that legitimately needs a sequence of the
    // given length; anything below is an overlong encoding
    static const unsigned long s_aulMinimum[7] =
        {
            0, 0, 0x80UL, 0x800UL, 0x10000UL, 0x200000UL, 0x4000000UL
        };

    const unsigned char *pb;
    unsigned long       ulChar,
                        ulCount = 1,
                        ul;
    int                 fIllegal = 0;

    if (!ppch || !*ppch)
        return 0;

    // always look at the bytes as unsigned: with plain char being
    // signed, bytes >= 0x80 would sign-extend and defeat the
    // range checks below
    pb = (const unsigned char*)*ppch;

    if (!(ulChar = *pb))
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte, 0xc0, 0xc1, 0xfe or 0xff
        fIllegal = 1;

    // go for the second and more bytes then
    for (ul = 1;
         (!fIllegal) && (ul < ulCount);
         ++ul)
    {
        unsigned long ulChar2 = pb[ul];

        // every following byte must look like 10xxxxxx; this
        // also catches the null terminator, so we never read
        // beyond the end of the string
        if ((ulChar2 & 0xc0) != 0x80)
            fIllegal = 1;
        else
            ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
    }

    if (    (!fIllegal)
         && (ulChar < s_aulMinimum[ulCount])
       )
        // value would have fit into a shorter sequence
        fIllegal = 1;

    if (fIllegal)
    {
        // resynchronize: skip the lead byte and all the
        // continuation bytes that follow it
        ++pb;
        while ((*pb & 0xc0) == 0x80)
            ++pb;

        *ppch = (const char*)pb;
        return ((unsigned long)-1);
    }

    *ppch += ulCount;

    return (ulChar);
}
538
539	/*
540	*@@ CreateCaseFold:
541	* creates a casefold for later use with
542	* encToUpper.
543	*
544	* This only uses one-byte sequences from
545	* the Unicode case folding table (see
546	* include\encodings\unicase.h), so this
547	* cannot be used for expanding characters
548	* at this point.
549	*
550	* Returns 1 (TRUE) on success.
551	*
552	* This works and is presently used in WarpIN.
553	*
554	*@@added V0.9.20 (2002-07-03) [umoeller]
555	*/
556
557	int encInitCase(void)
558	{
559	unsigned long ul,
560	cEntries = 0,
561	cb;
562
563	for (ul = 0;
564	ul < ARRAYITEMCOUNT(G_aCaseFolds);
565	++ul)
566	{
567	// ignore CASEFL_T (duplicate entries for i chars)
568	// and CASEFL_F (expansions)
569	if ( (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
570	&& (G_aCaseFolds[ul].ulLow > cEntries)
571	)
572	cEntries = G_aCaseFolds[ul].ulLow;
573	}
574
575	cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
576	if (G_pFold = (PENCCASEFOLD)malloc(cb))
577	{
578	memset(G_pFold, 0, cb);
579	G_pFold->cEntries = cEntries;
580
581	for (ul = 0;
582	ul < ARRAYITEMCOUNT(G_aCaseFolds);
583	++ul)
584	{
585	if (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
586	G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
587	}
588
589	return 1;
590	}
591
592	return 0;
593	}
594
595	/*
596	*@@ encToUpper:
597	* converts the given unicode character to
598	* upper case, if possible, or returns
599	* ulUni back if Unicode doesn't define
600	* an upper-case character for it.
601	*
602	* Special cases:
603	*
604	* -- Returns 0 for 0.
605	*
606	* Preconditions:
607	*
608	* -- You must call encInitCase before
609	* the first call.
610	*
611	* This works and is presently used in WarpIN.
612	*
613	*@@added V0.9.20 (2002-07-03) [umoeller]
614	*/
615
616	unsigned long encToUpper(unsigned long ulUni)
617	{
618	unsigned long ulFold;
619
620	if ( (ulUni < G_pFold->cEntries)
621	&& (ulFold = G_pFold->aulFolds[ulUni])
622	)
623	return ulFold;
624
625	return ulUni;
626	}
627
/*
 *@@ encicmp:
 *      like stricmp, but for UTF-8 strings.
 *      This uses encToUpper for the comparisons.
 *
 *      Like stricmp, this returns:
 *
 *      --  -1 if pcsz1 is less than pcsz2
 *      --  0 if pcsz1 is equal to pcsz2
 *      --  +1 if pcsz1 is greater than pcsz2
 *
 *      However, this does not crash on passing
 *      in NULL strings: a NULL string is compared
 *      as if it were an empty string.
 *
 *      The strings are compared character by
 *      character, using the values that
 *      encDecodeUTF8 returns after they have been
 *      run through encToUpper.
 *
 *      Preconditions:
 *
 *      --  You must call encInitCase before
 *          the first call.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

int encicmp(const char *pcsz1,
            const char *pcsz2)
{
    // substitute an empty string for NULL here instead of
    // relying on the decoder to cope with a NULL pointer
    const char  *p1 = (pcsz1) ? pcsz1 : "",
                *p2 = (pcsz2) ? pcsz2 : "";

    for (;;)
    {
        unsigned long   ul1 = encToUpper(encDecodeUTF8(&p1)),
                        ul2 = encToUpper(encDecodeUTF8(&p2));

        // a string that ends first delivers 0 here and therefore
        // sorts before the longer one
        if (ul1 < ul2)
            return -1;
        if (ul1 > ul2)
            return +1;

        // both are equal; if that was the terminator,
        // both strings have ended and we're done
        if (!ul1)
            return 0;

        // both are non-null: continue
    }
}
689

Note: See TracBrowser for help on using the repository browser.

Download in other formats: