Context Navigation

source: trunk/src/helpers/encodings.c@ 196

Visit:

Last change on this file since 196 was 192, checked in by umoeller, 23 years ago
misc fixes
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 20.4 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding support. Handles all kinds
5	* of legacy codepages (including most OS/2 codepages)
6	* and Unicode in the form of UTF-8 and translations
7	* between them.
8	*
9	* See encCreateCodec for an introduction.
10	*
11	* Be warned, compilation of this file takes a long
12	* time because this includes all the complex codepages
13	* from include\encodings.
14	*
15	*@@header "encodings\base.h"
16	*@@added V0.9.9 (2001-02-14) [umoeller]
17	*/
18
19	/*
20	* Copyright (C) 2001-2002 Ulrich Möller.
21	* This file is part of the "XWorkplace helpers" source package.
22	* This is free software; you can redistribute it and/or modify
23	* it under the terms of the GNU General Public License as published
24	* by the Free Software Foundation, in version 2 as it comes in the
25	* "COPYING" file of the XWorkplace main distribution.
26	* This program is distributed in the hope that it will be useful,
27	* but WITHOUT ANY WARRANTY; without even the implied warranty of
28	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29	* GNU General Public License for more details.
30	*/
31
32	#define OS2EMX_PLAIN_CHAR
33	// this is needed for "os2emx.h"; if this is defined,
34	// emx will define PSZ as _signed_ char, otherwise
35	// as unsigned char
36
37	#include <stdlib.h>
38	#include <string.h>
39
40	#include "setup.h" // code generation and debugging options
41
42	#include "helpers\standards.h"
43
44	#include "encodings\base.h"
45
46	#include "encodings\unicase.h"
47
48	#include "encodings\alltables.h" // this takes a very long time
49
50	#pragma hdrstop
51
52	/*
53	*@@category: Helpers\National Language Support\Encodings
54	* See encodings.c.
55	*/
56
57	/*
58	*@@ G_aEncodings:
59	* list of all encodings supported by this engine
60	* (i.e. we have a corresponding codepage in
61	* include\encodings\*.h) together with some
62	* additional information for each encoding,
63	* such as the corresponding OS/2 codepage
64	* number and a descriptive string.
65	*
66	*@@added V [umoeller]
67	*/
68
69	struct
70	{
71	ENCID id; // engine ID (enum)
72	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
73	unsigned long cEntries; // entries in map (array item count)
74	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
75	ENCBYTECOUNT bc;
76	const char *pcszDescription; // description
77	} G_aEncodings[] =
78	{
79	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
80
81	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
82	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
83	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
84	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
85	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
86	// Romania, Poland
87	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
88	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
89	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
90	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
91	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
92	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
93	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
94	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
95	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
96	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
97	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
98
99	ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
100	ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
101	ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
102	ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
103
104	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
105	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
106	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
107	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
108	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
109	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
110	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
111	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
112	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
113	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
114	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
115	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
116	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
117	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
118	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
119	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
120	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
121	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
122	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
123	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
124	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
125	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
126	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
127
128	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
129	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
130	};
131
/*
 *@@ ENCCASEFOLD:
 *      lookup table for simple case conversion, built
 *      once by encInitCase and read by encToUpper.
 *
 *      The structure is allocated with extra room behind
 *      it, so aulFolds really has more than the one
 *      declared item. It is indexed by Unicode value; an
 *      item of 0 means "no conversion for this character".
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

struct _ENCCASEFOLD
{
    unsigned long   cEntries;       // range limit for lookups, set by encInitCase
    unsigned long   aulFolds[1];    // first item of the variable-size table
};

typedef struct _ENCCASEFOLD ENCCASEFOLD;
typedef struct _ENCCASEFOLD *PENCCASEFOLD;

// the one global fold table; NULL until encInitCase has run
static PENCCASEFOLD G_pFold = NULL;
145
146	/*
147	*@@ encGetTable:
148	*
149	*@@added V0.9.18 (2002-03-08) [umoeller]
150	*/
151
152	int encGetTable(ENCID id,
153	PXWPENCODINGMAP *ppMap,
154	unsigned long *pcEntries)
155	{
156	unsigned long ul;
157	for (ul = 0;
158	ul < ARRAYITEMCOUNT(G_aEncodings);
159	ul++)
160	{
161	if (G_aEncodings[ul].id == id)
162	{
163	*ppMap = G_aEncodings[ul].pMap;
164	*pcEntries = G_aEncodings[ul].cEntries;
165	return 1;
166	}
167	}
168
169	return 0;
170	}
171
172	/*
173	*@@ encFindIdForCodepage:
174	* returns the ENCID for the given OS/2
175	* codepage, or UNSUPPORTED if there's none.
176	*
177	*@@added V0.9.18 (2002-03-08) [umoeller]
178	*/
179
180	ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
181	const char **ppcszDescription, // out: codepage description; ptr can be NULL
182	ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
183	{
184	unsigned long ul;
185	for (ul = 0;
186	ul < ARRAYITEMCOUNT(G_aEncodings);
187	ul++)
188	{
189	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
190
191	{
192	if (ppcszDescription)
193	*ppcszDescription = G_aEncodings[ul].pcszDescription;
194	if (pByteCount)
195	*pByteCount = G_aEncodings[ul].bc;
196	return G_aEncodings[ul].id;
197	}
198	}
199
200	return UNSUPPORTED;
201	}
202
203	/*
204	*@@ encCreateCodec:
205	* creates a codec that can be used for conversion between
206	* Unicode and codepaged characters (and vice versa).
207	*
208	* A codec essentially consists of two tables which can
209	* be used for quick index-based lookups in both directions.
210	* This function goes thru the tables provided in
211	* include\encodings\*.h and builds the codec tables
212	* from them.
213	*
214	* This function takes an encoding ID as input. Each
215	* codepage table in include\encodings\*.h has one
216	* of those IDs assigned. Use encFindIdForCodepage
217	* to find the ID for a given OS/2 codepage.
218	*
219	* Use codecs carefully and only when they are really
220	* needed for a specific conversion. Building a codec
221	* is expensive, so you should create a codec once
222	* and reuse it for future conversions. In addition,
223	* create codecs only for the codepages that are
224	* actually used. Each codec will take up
225	* n * sizeof(USHORT) bytes, where n is the highest
226	* Unicode character used in the codepage.
227	*
228	* Codec remarks:
229	*
230	* -- All codepages share the first 128 characters
231	* (0-0x7F) with ASCII.
232	*
233	* -- Since the first 128 characters (0-0x7F) in
234	* Unicode are equivalent to ASCII also, codecs
235	* are not needed if you process ASCII strings
236	* only.
237	*
238	* -- Since the next 128 characters (0x80-0xFF) in
239	* Unicode are equivalent to ISO/IEC 8859-1
240	* (Latin-1), codecs aren't needed for those
241	* strings either.
242	*
243	* Note that codepoints 0x80-0x9F are undefined
244	* in Latin-1 but used as control sequences in
245	* Unicode.
246	*
247	* -- As far as I know, codepage 1252, which is
248	* used per default under Windows, is equivalent
249	* to Latin 1 except that it also defines
250	* codepoints 0x80-0x9F to certain DTP characters.
251	*
252	* -- From my testing, codepage 1004 (which is
253	* described as "Windows-compatible" in most OS/2
254	* docs) is the same as codepage 1252, except for
255	* character 0xAF.
256	*
257	* Unfortunately, OS/2 uses codepage 850 on most
258	* systems (and Windows uses OS/2 codepage 1252),
259	* so for conversion between those, codecs are needed.
260	*
261	* This works and is presently used in WarpIN.
262	*/
263
264	PCONVERSION encCreateCodec(ENCID id)
265	{
266	PXWPENCODINGMAP pEncodingMap;
267	unsigned long cArrayEntries;
268
269	if (encGetTable(id,
270	&pEncodingMap,
271	&cArrayEntries))
272	{
273	unsigned short usHighestCP = 0,
274	usHighestUni = 0;
275	unsigned long ul;
276
277	// step 1:
278	// run through the table and calculate the highest
279	// character entry used
280	for (ul = 0;
281	ul < cArrayEntries;
282	ul++)
283	{
284	if (pEncodingMap[ul].usCP > usHighestCP)
285	usHighestCP = pEncodingMap[ul].usCP;
286	if (pEncodingMap[ul].usUni > usHighestUni)
287	usHighestUni = pEncodingMap[ul].usUni;
288	}
289
290	// step 2: allocate encoding table
291	if (usHighestCP && usHighestUni)
292	{
293	PCONVERSION pTableNew;
294	if (pTableNew = NEW(CONVERSION))
295	{
296	unsigned long cbEntriesUniFromCP
297	= (usHighestCP + 1) * sizeof(unsigned short);
298	unsigned long cbEntriesCPFromUni
299	= (usHighestUni + 1) * sizeof(unsigned short);
300
301	ZERO(pTableNew);
302
303	pTableNew->usHighestCP = usHighestCP;
304	pTableNew->usHighestUni = usHighestUni;
305
306	if ( (pTableNew->ausEntriesUniFromCP
307	= (unsigned short*)malloc(cbEntriesUniFromCP))
308	&& (pTableNew->ausEntriesCPFromUni
309	= (unsigned short*)malloc(cbEntriesCPFromUni))
310	)
311	{
312	// step 3: fill encoding tables
313
314	memset(pTableNew->ausEntriesUniFromCP,
315	0xFF,
316	cbEntriesUniFromCP);
317	memset(pTableNew->ausEntriesCPFromUni,
318	0xFF,
319	cbEntriesCPFromUni);
320
321	for (ul = 0;
322	ul < cArrayEntries;
323	ul++)
324	{
325	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
326
327	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
328
329	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
330	}
331
332	return pTableNew;
333	}
334
335	free(pTableNew);
336	}
337	}
338	}
339
340	return NULL;
341	}
342
343	/*
344	*@@ encFreeCodec:
345	* frees a codec created with encFreeConversion
346	* and sets the given pointer to NULL.
347	*
348	* This works and is presently used in WarpIN.
349	*
350	*@@added V0.9.18 (2002-03-08) [umoeller]
351	*/
352
353	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
354	{
355	PCONVERSION pTable;
356	if (pTable = *ppTable)
357	{
358	if (pTable->ausEntriesUniFromCP)
359	free(pTable->ausEntriesUniFromCP);
360	if (pTable->ausEntriesCPFromUni)
361	free(pTable->ausEntriesCPFromUni);
362	free(pTable);
363	*ppTable = NULL;
364	}
365	}
366
367	/*
368	*@@ encChar2Uni:
369	* converts a codepage-specific character
370	* to Unicode, using the given conversion
371	* table from encCreateCodec().
372	*
373	* Returns 0xFFFF on errors, which is unlikely
374	* with Unicode though.
375	*
376	* This works and is presently used in WarpIN.
377	*
378	*@@added V0.9.18 (2002-03-08) [umoeller]
379	*/
380
381	unsigned long encChar2Uni(PCONVERSION pTable,
382	unsigned short c)
383	{
384	if ( (pTable)
385	&& (c <= pTable->usHighestCP)
386	)
387	return pTable->ausEntriesUniFromCP[c];
388
389	return 0xFFFF;
390	}
391
392	/*
393	*@@ encUni2Char:
394	* converts a Unicode character to the
395	* codepage specified by the given
396	* conversion table from encCreateCodec().
397	*
398	* Returns 0xFFFF if the Unicode character
399	* has no codepage equivalent.
400	*
401	* This works and is presently used in WarpIN.
402	*
403	*@@added V0.9.18 (2002-03-08) [umoeller]
404	*/
405
406	unsigned short encUni2Char(PCONVERSION pTable,
407	unsigned long ulUni)
408	{
409	if ( (pTable)
410	&& (ulUni <= pTable->usHighestUni)
411	)
412	return pTable->ausEntriesCPFromUni[ulUni];
413
414	return 0xFFFF;
415	}
416
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 (as an unsigned long, i.e.
 *      all bits set) if *ppch points to an invalid
 *      encoding: a byte that cannot start a
 *      sequence, or a sequence in which a
 *      follow-up byte is missing. In that case
 *      the pointer is advanced past the first
 *      byte and past all bytes with bit 7 set
 *      which follow it, so that it ends up on
 *      the next ASCII or null byte.
 *
 *      This returns 0 if **ppch points to a
 *      null character. The pointer is not
 *      advanced then.
 *
 *      Sequences of up to six bytes are accepted.
 *      Except for the lead bytes 0xc0 and 0xc1,
 *      overlong encodings are not rejected.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the bytes as unsigned so that values above 0x7f
    // are not sign-extended where plain char is signed
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount;        // sequence length; 0 if invalid
    unsigned long       ul2;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return ulChar;
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray follow-up byte or reserved lead byte
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // every follow-up byte must look like 10xxxxxx;
        // this also stops us at a premature null byte
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar <<= 6;
        ulChar |= ulChar2 & 0x3f;
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return ulChar;
    }

    // invalid: skip all the following characters
    // until we find something with bit 7 off
    // (which includes the null terminator)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return (unsigned long)-1;
}
541
542	/*
543	*@@ encInitCase:
544	* creates a casefold for later use with
545	* encToUpper.
546	*
547	* This only uses one-byte sequences from
548	* the Unicode case folding table (see
549	* include\encodings\unicase.h), so this
550	* cannot be used for expanding characters
551	* at this point.
552	*
553	* Returns 1 (TRUE) on success.
554	*
555	* This works and is presently used in WarpIN.
556	*
557	*@@added V0.9.20 (2002-07-03) [umoeller]
558	*/
559
560	int encInitCase(void)
561	{
562	unsigned long ul,
563	cEntries = 0,
564	cb;
565
566	for (ul = 0;
567	ul < ARRAYITEMCOUNT(G_aCaseFolds);
568	++ul)
569	{
570	// ignore CASEFL_T (duplicate entries for i chars)
571	// and CASEFL_F (expansions)
572	if ( (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
573	&& (G_aCaseFolds[ul].ulLow > cEntries)
574	)
575	cEntries = G_aCaseFolds[ul].ulLow;
576	}
577
578	cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
579	if (G_pFold = (PENCCASEFOLD)malloc(cb))
580	{
581	memset(G_pFold, 0, cb);
582	G_pFold->cEntries = cEntries;
583
584	for (ul = 0;
585	ul < ARRAYITEMCOUNT(G_aCaseFolds);
586	++ul)
587	{
588	if (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
589	G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
590	}
591
592	return 1;
593	}
594
595	return 0;
596	}
597
598	/*
599	*@@ encToUpper:
600	* converts the given unicode character to
601	* upper case, if possible, or returns
602	* ulUni back if Unicode doesn't define
603	* an upper-case character for it.
604	*
605	* Special cases:
606	*
607	* -- Returns 0 for 0.
608	*
609	* Preconditions:
610	*
611	* -- You must call encInitCase before
612	* the first call.
613	*
614	* This works and is presently used in WarpIN.
615	*
616	*@@added V0.9.20 (2002-07-03) [umoeller]
617	*/
618
619	unsigned long encToUpper(unsigned long ulUni)
620	{
621	unsigned long ulFold;
622
623	if ( (ulUni < G_pFold->cEntries)
624	&& (ulFold = G_pFold->aulFolds[ulUni])
625	)
626	return ulFold;
627
628	return ulUni;
629	}
630
/*
 *@@ encicmp:
 *      like stricmp, but for UTF-8 strings.
 *      This uses encToUpper for the comparisons.
 *
 *      Like stricmp, this returns:
 *
 *      --  -1 if pcsz1 is less than pcsz2
 *      --  0 if pcsz1 is equal to pcsz2
 *      --  +1 if pcsz1 is greater than pcsz2
 *
 *      However, this does not crash on passing
 *      in NULL strings. A NULL string is treated
 *      like an empty string.
 *
 *      Preconditions:
 *
 *      --  You must call encInitCase before
 *          the first call.
 *
 *      This works and is presently used in WarpIN.
 *
 *@@added V0.9.20 (2002-07-03) [umoeller]
 */

int encicmp(const char *pcsz1,
            const char *pcsz2)
{
    // encDecodeUTF8 dereferences its input, so replace
    // NULL strings with empty ones up front
    const char  *p1 = (pcsz1) ? pcsz1 : "",
                *p2 = (pcsz2) ? pcsz2 : "";

    unsigned long ul1, ul2;

    for (;;)
    {
        // encDecodeUTF8 returns null for null, so this is safe
        ul1 = encToUpper(encDecodeUTF8(&p1));
        ul2 = encToUpper(encDecodeUTF8(&p2));

        if (ul1 < ul2)
            return -1;
        if (ul1 > ul2)
            return +1;

        // both are equal here, so if one is null, the other
        // is too, and both strings have ended
        if (!ul1)
            return 0;

        // both are non-null: continue
    }
}
692

Note: See TracBrowser for help on using the repository browser.

Download in other formats: