Context Navigation

source: trunk/src/helpers/encodings.c@ 151

Visit:

Last change on this file since 151 was 147, checked in by umoeller, 23 years ago
Misc updates for Unicode.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 16.6 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding translations.
5	*
6	* See encCreateCodec for an introduction.
7	*
8	*@@header "encodings\base.h"
9	*@@added V0.9.9 (2001-02-14) [umoeller]
10	*/
11
12	/*
13	* Copyright (C) 2001 Ulrich Möller.
14	* This file is part of the "XWorkplace helpers" source package.
15	* This is free software; you can redistribute it and/or modify
16	* it under the terms of the GNU General Public License as published
17	* by the Free Software Foundation, in version 2 as it comes in the
18	* "COPYING" file of the XWorkplace main distribution.
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*/
24
25	#define OS2EMX_PLAIN_CHAR
26	// this is needed for "os2emx.h"; if this is defined,
27	// emx will define PSZ as _signed_ char, otherwise
28	// as unsigned char
29
30	#include <stdlib.h>
31	#include <string.h>
32
33	#include "setup.h" // code generation and debugging options
34
35	#include "helpers\standards.h"
36
37	#include "encodings\base.h"
38	#include "encodings\alltables.h"
39	// #include "encodings\collate.h"
40
41	#pragma hdrstop
42
43	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
44
45	/*
46	*@@ G_aEncodings:
47	* list of all encodings supported by this engine
48	* (i.e. we have a corresponding codepage in
49	* include\encodings\*.h) together with some
50	* additional information for each encoding,
51	* such as the corresponding OS/2 codepage
52	* number and a descriptive string.
53	*
54	*@@added V [umoeller]
55	*/
56
57	struct
58	{
59	ENCID id; // engine ID (enum)
60	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
61	unsigned long cEntries; // entries in map (array item count)
62	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
63	ENCBYTECOUNT bc;
64	const char *pcszDescription; // description
65	} G_aEncodings[] =
66	{
67	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
68	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
69	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
70	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
71	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
72	// Romania, Poland
73	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
74	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
75	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
76	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
77	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
78	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
79	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
80	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
81	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
82	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
83	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
84	// ENCODINGENTRY(cp932), 932 or 943?, DOUBLE, "Japanese Windows",
85	// ENCODINGENTRY(cp936), 936 or 946?, DOUBLE, "Chinese",
86	// ENCODINGENTRY(cp949), 951 or 949?, DOUBLE, "Korean",
87	// ENCODINGENTRY(cp950), 947 or 950?, DOUBLE, "Taiwan Big-5", // default in China?
88	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
89	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
90	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
91	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
92	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
93	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
94	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
95	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
96	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
97	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
98	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
99	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
100	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
101	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
102	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
103	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
104	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
105	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
106	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
107	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
108	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
109	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
110	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
111
112	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
113	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
114	};
115
116	/*
117	*@@ FindEntry:
118	*
119	*@@added V0.9.18 (2002-03-08) [umoeller]
120	*/
121
122	static int FindEntry(ENCID id,
123	PXWPENCODINGMAP *ppMap,
124	unsigned long *pcEntries)
125	{
126	unsigned long ul;
127	for (ul = 0;
128	ul < ARRAYITEMCOUNT(G_aEncodings);
129	ul++)
130	{
131	if (G_aEncodings[ul].id == id)
132	{
133	*ppMap = G_aEncodings[ul].pMap;
134	*pcEntries = G_aEncodings[ul].cEntries;
135	return (1);
136	}
137	}
138
139	return (0);
140	}
141
142	/*
143	*@@ encFindIdForCodepage:
144	* returns the ENCID for the given OS/2
145	* codepage, or UNSUPPORTED if there's none.
146	*
147	*@@added V0.9.18 (2002-03-08) [umoeller]
148	*/
149
150	ENCID encFindIdForCodepage(unsigned short usCodepage,
151	const char **ppcszDescription, // out: codepage description; ptr can be NULL
152	ENCBYTECOUNT *pByteCount)
153	{
154	unsigned long ul;
155	for (ul = 0;
156	ul < ARRAYITEMCOUNT(G_aEncodings);
157	ul++)
158	{
159	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
160
161	{
162	if (ppcszDescription)
163	*ppcszDescription = G_aEncodings[ul].pcszDescription;
164	if (pByteCount)
165	*pByteCount = G_aEncodings[ul].bc;
166	return G_aEncodings[ul].id;
167	}
168	}
169
170	return (UNSUPPORTED);
171	}
172
173	/*
174	*@@ encCreateCodec:
175	* creates a codec that can be used for conversion between
176	* Unicode and codepaged characters (and vice versa).
177	*
178	* A codec essentially consists of two tables which can
179	* be used for quick index-based lookups in both directions.
180	* This function goes thru the tables provided in
181	* include\encodings\*.h and builds the codec tables
182	* from them.
183	*
184	* This function takes an encoding ID as input. Each
185	* codepage table in include\encodings\*.h has one
186	* of those IDs assigned. Use encFindIdForCodepage
187	* to find the ID for a given OS/2 codepage.
188	*
189	* Use codecs carefully and only when they are really
190	* needed for a specific conversion. Building a codec
191	* is expensive, so you should create a codec once
192	* and reuse it for future conversions. In addition,
193	* create codecs only for the codepages that are
194	* actually used. Each codec will take up
195	* n * sizeof(USHORT) bytes, where n is the highest
196	* Unicode character used in the codepage.
197	*
198	* Remarks:
199	*
200	* -- All codepages share the first 128 characters
201	* (0-0x7F) with ASCII.
202	*
203	* -- Since the first 128 characters (0-0x7F) in
204	* Unicode are equivalent to ASCII also, codecs
205	* are not needed if you process ASCII strings
206	* only.
207	*
208	* -- Since the next 128 characters (0x80-0xFF) in
209	* Unicode are equivalent to ISO/IEC 8859-1
210	* (Latin-1), codecs aren't needed for those
211	* strings either.
212	*
213	* Note that codepoints 0x80-0x9F are undefined
214	* in Latin-1 but used as control sequences in
215	* Unicode.
216	*
217	* -- As far as I know, codepage 1252, which is
218	* used per default under Windows, is equivalent
219	* to Latin 1 except that it also defines
220	* codepoints 0x80-0x9F to certain DTP characters.
221	*
222	* -- From my testing, codepage 1004 (which is
223	* described as "Windows-compatible" in most OS/2
224	* docs) is the same as codepage 1252, except for
225	* character 0xAF.
226	*
227	* Unfortunately, OS/2 uses codepage 850 on most
228	* systems (and Windows uses OS/2 codepage 1252),
229	* so for conversion between those, codecs are needed.
230	*/
231
232	PCONVERSION encCreateCodec(ENCID id)
233	{
234	PXWPENCODINGMAP pEncodingMap;
235	unsigned long cArrayEntries;
236
237	if (FindEntry(id,
238	&pEncodingMap,
239	&cArrayEntries))
240	{
241	unsigned short usHighestCP = 0,
242	usHighestUni = 0;
243	unsigned long ul;
244
245	// step 1:
246	// run through the table and calculate the highest
247	// character entry used
248	for (ul = 0;
249	ul < cArrayEntries;
250	ul++)
251	{
252	if (pEncodingMap[ul].usCP > usHighestCP)
253	usHighestCP = pEncodingMap[ul].usCP;
254	if (pEncodingMap[ul].usUni > usHighestUni)
255	usHighestUni = pEncodingMap[ul].usUni;
256	}
257
258	// step 2: allocate encoding table
259	if (usHighestCP && usHighestUni)
260	{
261	PCONVERSION pTableNew;
262	if (pTableNew = NEW(CONVERSION))
263	{
264	unsigned long cbEntriesUniFromCP
265	= (usHighestCP + 1) * sizeof(unsigned short);
266	unsigned long cbEntriesCPFromUni
267	= (usHighestUni + 1) * sizeof(unsigned short);
268
269	ZERO(pTableNew);
270
271	pTableNew->usHighestCP = usHighestCP;
272	pTableNew->usHighestUni = usHighestUni;
273
274	if ( (pTableNew->ausEntriesUniFromCP
275	= (unsigned short*)malloc(cbEntriesUniFromCP))
276	&& (pTableNew->ausEntriesCPFromUni
277	= (unsigned short*)malloc(cbEntriesCPFromUni))
278	)
279	{
280	// step 3: fill encoding tables
281
282	memset(pTableNew->ausEntriesUniFromCP,
283	0xFF,
284	cbEntriesUniFromCP);
285	memset(pTableNew->ausEntriesCPFromUni,
286	0xFF,
287	cbEntriesCPFromUni);
288
289	for (ul = 0;
290	ul < cArrayEntries;
291	ul++)
292	{
293	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
294
295	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
296
297	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
298	}
299
300	return (pTableNew);
301	}
302
303	free(pTableNew);
304	}
305	}
306	}
307
308	return (NULL);
309	}
310
311	/*
312	*@@ encFreeCodec:
313	* frees a codec created with encFreeConversion
314	* and sets the given pointer to NULL.
315	*
316	*@@added V0.9.18 (2002-03-08) [umoeller]
317	*/
318
319	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
320	{
321	PCONVERSION pTable;
322	if (pTable = *ppTable)
323	{
324	if (pTable->ausEntriesUniFromCP)
325	free(pTable->ausEntriesUniFromCP);
326	if (pTable->ausEntriesCPFromUni)
327	free(pTable->ausEntriesCPFromUni);
328	free(pTable);
329	*ppTable = NULL;
330	}
331	}
332
333	/*
334	*@@ encChar2Uni:
335	* converts a codepage-specific character
336	* to Unicode, using the given conversion
337	* table from encCreateCodec().
338	*
339	* Returns 0xFFFF on errors, which is unlikely
340	* with Unicode though.
341	*
342	*@@added V0.9.18 (2002-03-08) [umoeller]
343	*/
344
345	unsigned long encChar2Uni(PCONVERSION pTable,
346	unsigned short c)
347	{
348	if ( (pTable)
349	&& (c <= pTable->usHighestCP)
350	)
351	return (pTable->ausEntriesUniFromCP[c]);
352
353	return (0xFFFF);
354	}
355
356	/*
357	*@@ encUni2Char:
358	* converts a Unicode character to the
359	* codepage specified by the given
360	* conversion table from encCreateCodec().
361	*
362	* Returns 0xFFFF if the Unicode character
363	* has no codepage equivalent.
364	*
365	*@@added V0.9.18 (2002-03-08) [umoeller]
366	*/
367
368	unsigned short encUni2Char(PCONVERSION pTable,
369	unsigned long ulUni)
370	{
371	if ( (pTable)
372	&& (ulUni <= pTable->usHighestUni)
373	)
374	return (pTable->ausEntriesCPFromUni[ulUni]);
375
376	return (0xFFFF);
377	}
378
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (that is,
 *      (unsigned long)-1) if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, i.e. a byte that cannot
 *      start a sequence or a sequence with a
 *      missing continuation byte. In that case
 *      *ppch is advanced to the next byte that
 *      has bit 7 clear (which may be the
 *      terminating null byte).
 *
 *      This returns 0 if **ppch points to a
 *      null character; *ppch is not advanced then.
 *
 *      Sequences of up to six bytes are accepted.
 *      Overlong encodings of three or more bytes
 *      and surrogate values are not rejected.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // always look at the bytes as unsigned: plain char may be
    // signed, and a sign-extended lead byte would fail the
    // range checks below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount,        // total bytes in sequence, 0 if illegal
                        ul2;

    if (!ulChar)
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte (0x80-0xbf), 0xc0, 0xc1, 0xfe, 0xff
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // continuation bytes must look like 10xxxxxx; this also
        // stops at the terminating null byte of a truncated
        // sequence, so we never read past the end of the string
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar <<= 6;
        ulChar |= ulChar2 & 0x3f;
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return (ulChar);
    }

    // illegal: skip all the following characters
    // until we find something with bit 7 off
    // (the null terminator qualifies)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
497
#if 0

/*
 * Everything in this section is disabled and is not compiled.
 */

/*
 *@@ encCodepageToUTF8:
 *      placeholder only; the body is empty.
 *
 *@@added V0.9.18 (2002-03-08) [umoeller]
 */

void encCodepageToUTF8(const char **ppch)
{

}

/*
 * putwchar:
 *      sketch of a UTF-8 encoder: writes the character c
 *      to stdout via putchar, using one byte for values
 *      below 0x80, two bytes below 0x800, three bytes
 *      below 0x10000 and four bytes below 0x200000.
 *      Larger values produce no output.
 */

int putwchar(int c)
{
    if (c < 0x80)
    {
        putchar(c);
    }
    else if (c < 0x800)
    {
        putchar(0xC0 | (c >> 6));
        putchar(0x80 | (c & 0x3F));
    }
    else if (c < 0x10000)
    {
        putchar(0xE0 | (c >> 12));
        putchar(0x80 | ((c >> 6) & 0x3F));
        putchar(0x80 | (c & 0x3F));
    }
    else if (c < 0x200000)
    {
        putchar(0xF0 | (c >> 18));
        putchar(0x80 | ((c >> 12) & 0x3F));
        putchar(0x80 | ((c >> 6) & 0x3F));
        putchar(0x80 | (c & 0x3F));
    }

    return (c);
}

#endif
534

Note: See TracBrowser for help on using the repository browser.

Download in other formats: