Context Navigation

source: trunk/src/helpers/encodings.c@ 154

Visit:

Last change on this file since 154 was 154, checked in by umoeller, 23 years ago
Misc changes.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 16.3 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding translations.
5	*
6	* See encCreateCodec for an introduction.
7	*
8	* Be warned, compilation of this file takes a long
9	* time because this includes all the complex codepage
10	* tables from include\encodings.
11	*
12	*@@header "encodings\base.h"
13	*@@added V0.9.9 (2001-02-14) [umoeller]
14	*/
15
16	/*
17	* Copyright (C) 2001 Ulrich Möller.
18	* This file is part of the "XWorkplace helpers" source package.
19	* This is free software; you can redistribute it and/or modify
20	* it under the terms of the GNU General Public License as published
21	* by the Free Software Foundation, in version 2 as it comes in the
22	* "COPYING" file of the XWorkplace main distribution.
23	* This program is distributed in the hope that it will be useful,
24	* but WITHOUT ANY WARRANTY; without even the implied warranty of
25	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26	* GNU General Public License for more details.
27	*/
28
29	#define OS2EMX_PLAIN_CHAR
30	// this is needed for "os2emx.h"; if this is defined,
31	// emx will define PSZ as _signed_ char, otherwise
32	// as unsigned char
33
34	#include <stdlib.h>
35	#include <string.h>
36
37	#include "setup.h" // code generation and debugging options
38
39	#include "helpers\standards.h"
40
41	#include "encodings\base.h"
42	#include "encodings\alltables.h"
43	// #include "encodings\collate.h"
44
45	#pragma hdrstop
46
47	/*
48	*@@ G_aEncodings:
49	* list of all encodings supported by this engine
50	* (i.e. we have a corresponding codepage in
51	* include\encodings\*.h) together with some
52	* additional information for each encoding,
53	* such as the corresponding OS/2 codepage
54	* number and a descriptive string.
55	*
56	*@@added V [umoeller]
57	*/
58
59	struct
60	{
61	ENCID id; // engine ID (enum)
62	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
63	unsigned long cEntries; // entries in map (array item count)
64	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
65	ENCBYTECOUNT bc;
66	const char *pcszDescription; // description
67	} G_aEncodings[] =
68	{
69	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
70
71	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
72	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
73	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
74	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
75	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
76	// Romania, Poland
77	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
78	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
79	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
80	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
81	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
82	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
83	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
84	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
85	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
86	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
87	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
88
89	ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
90	ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
91	ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
92	ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
93
94	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
95	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
96	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
97	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
98	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
99	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
100	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
101	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
102	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
103	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
104	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
105	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
106	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
107	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
108	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
109	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
110	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
111	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
112	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
113	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
114	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
115	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
116	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
117
118	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
119	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
120	};
121
122	/*
123	*@@ FindEntry:
124	*
125	*@@added V0.9.18 (2002-03-08) [umoeller]
126	*/
127
128	static int FindEntry(ENCID id,
129	PXWPENCODINGMAP *ppMap,
130	unsigned long *pcEntries)
131	{
132	unsigned long ul;
133	for (ul = 0;
134	ul < ARRAYITEMCOUNT(G_aEncodings);
135	ul++)
136	{
137	if (G_aEncodings[ul].id == id)
138	{
139	*ppMap = G_aEncodings[ul].pMap;
140	*pcEntries = G_aEncodings[ul].cEntries;
141	return (1);
142	}
143	}
144
145	return (0);
146	}
147
148	/*
149	*@@ encFindIdForCodepage:
150	* returns the ENCID for the given OS/2
151	* codepage, or UNSUPPORTED if there's none.
152	*
153	*@@added V0.9.18 (2002-03-08) [umoeller]
154	*/
155
156	ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
157	const char **ppcszDescription, // out: codepage description; ptr can be NULL
158	ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE
159	{
160	unsigned long ul;
161	for (ul = 0;
162	ul < ARRAYITEMCOUNT(G_aEncodings);
163	ul++)
164	{
165	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
166
167	{
168	if (ppcszDescription)
169	*ppcszDescription = G_aEncodings[ul].pcszDescription;
170	if (pByteCount)
171	*pByteCount = G_aEncodings[ul].bc;
172	return G_aEncodings[ul].id;
173	}
174	}
175
176	return (UNSUPPORTED);
177	}
178
179	/*
180	*@@ encCreateCodec:
181	* creates a codec that can be used for conversion between
182	* Unicode and codepaged characters (and vice versa).
183	*
184	* A codec essentially consists of two tables which can
185	* be used for quick index-based lookups in both directions.
186	* This function goes thru the tables provided in
187	* include\encodings\*.h and builds the codec tables
188	* from them.
189	*
190	* This function takes an encoding ID as input. Each
191	* codepage table in include\encodings\*.h has one
192	* of those IDs assigned. Use encFindIdForCodepage
193	* to find the ID for a given OS/2 codepage.
194	*
195	* Use codecs carefully and only when they are really
196	* needed for a specific conversion. Building a codec
197	* is expensive, so you should create a codec once
198	* and reuse it for future conversions. In addition,
199	* create codecs only for the codepages that are
200	* actually used. Each codec will take up
201	* n * sizeof(USHORT) bytes, where n is the highest
202	* Unicode character used in the codepage.
203	*
204	* Codec remarks:
205	*
206	* -- All codepages share the first 128 characters
207	* (0-0x7F) with ASCII.
208	*
209	* -- Since the first 128 characters (0-0x7F) in
210	* Unicode are equivalent to ASCII also, codecs
211	* are not needed if you process ASCII strings
212	* only.
213	*
214	* -- Since the next 128 characters (0x80-0xFF) in
215	* Unicode are equivalent to ISO/IEC 8859-1
216	* (Latin-1), codecs aren't needed for those
217	* strings either.
218	*
219	* Note that codepoints 0x80-0x9F are undefined
220	* in Latin-1 but used as control sequences in
221	* Unicode.
222	*
223	* -- As far as I know, codepage 1252, which is
224	* used per default under Windows, is equivalent
225	* to Latin 1 except that it also defines
226	* codepoints 0x80-0x9F to certain DTP characters.
227	*
228	* -- From my testing, codepage 1004 (which is
229	* described as "Windows-compatible" in most OS/2
230	* docs) is the same as codepage 1252, except for
231	* character 0xAF.
232	*
233	* Unfortunately, OS/2 uses codepage 850 on most
234	* systems (and Windows uses OS/2 codepage 1252),
235	* so for conversion between those, codecs are needed.
236	*/
237
238	PCONVERSION encCreateCodec(ENCID id)
239	{
240	PXWPENCODINGMAP pEncodingMap;
241	unsigned long cArrayEntries;
242
243	if (FindEntry(id,
244	&pEncodingMap,
245	&cArrayEntries))
246	{
247	unsigned short usHighestCP = 0,
248	usHighestUni = 0;
249	unsigned long ul;
250
251	// step 1:
252	// run through the table and calculate the highest
253	// character entry used
254	for (ul = 0;
255	ul < cArrayEntries;
256	ul++)
257	{
258	if (pEncodingMap[ul].usCP > usHighestCP)
259	usHighestCP = pEncodingMap[ul].usCP;
260	if (pEncodingMap[ul].usUni > usHighestUni)
261	usHighestUni = pEncodingMap[ul].usUni;
262	}
263
264	// step 2: allocate encoding table
265	if (usHighestCP && usHighestUni)
266	{
267	PCONVERSION pTableNew;
268	if (pTableNew = NEW(CONVERSION))
269	{
270	unsigned long cbEntriesUniFromCP
271	= (usHighestCP + 1) * sizeof(unsigned short);
272	unsigned long cbEntriesCPFromUni
273	= (usHighestUni + 1) * sizeof(unsigned short);
274
275	ZERO(pTableNew);
276
277	pTableNew->usHighestCP = usHighestCP;
278	pTableNew->usHighestUni = usHighestUni;
279
280	if ( (pTableNew->ausEntriesUniFromCP
281	= (unsigned short*)malloc(cbEntriesUniFromCP))
282	&& (pTableNew->ausEntriesCPFromUni
283	= (unsigned short*)malloc(cbEntriesCPFromUni))
284	)
285	{
286	// step 3: fill encoding tables
287
288	memset(pTableNew->ausEntriesUniFromCP,
289	0xFF,
290	cbEntriesUniFromCP);
291	memset(pTableNew->ausEntriesCPFromUni,
292	0xFF,
293	cbEntriesCPFromUni);
294
295	for (ul = 0;
296	ul < cArrayEntries;
297	ul++)
298	{
299	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
300
301	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
302
303	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
304	}
305
306	return (pTableNew);
307	}
308
309	free(pTableNew);
310	}
311	}
312	}
313
314	return (NULL);
315	}
316
317	/*
318	*@@ encFreeCodec:
319	* frees a codec created with encFreeConversion
320	* and sets the given pointer to NULL.
321	*
322	*@@added V0.9.18 (2002-03-08) [umoeller]
323	*/
324
325	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
326	{
327	PCONVERSION pTable;
328	if (pTable = *ppTable)
329	{
330	if (pTable->ausEntriesUniFromCP)
331	free(pTable->ausEntriesUniFromCP);
332	if (pTable->ausEntriesCPFromUni)
333	free(pTable->ausEntriesCPFromUni);
334	free(pTable);
335	*ppTable = NULL;
336	}
337	}
338
339	/*
340	*@@ encChar2Uni:
341	* converts a codepage-specific character
342	* to Unicode, using the given conversion
343	* table from encCreateCodec().
344	*
345	* Returns 0xFFFF on errors, which is unlikely
346	* with Unicode though.
347	*
348	*@@added V0.9.18 (2002-03-08) [umoeller]
349	*/
350
351	unsigned long encChar2Uni(PCONVERSION pTable,
352	unsigned short c)
353	{
354	if ( (pTable)
355	&& (c <= pTable->usHighestCP)
356	)
357	return (pTable->ausEntriesUniFromCP[c]);
358
359	return (0xFFFF);
360	}
361
362	/*
363	*@@ encUni2Char:
364	* converts a Unicode character to the
365	* codepage specified by the given
366	* conversion table from encCreateCodec().
367	*
368	* Returns 0xFFFF if the Unicode character
369	* has no codepage equivalent.
370	*
371	*@@added V0.9.18 (2002-03-08) [umoeller]
372	*/
373
374	unsigned short encUni2Char(PCONVERSION pTable,
375	unsigned long ulUni)
376	{
377	if ( (pTable)
378	&& (ulUni <= pTable->usHighestUni)
379	)
380	return (pTable->ausEntriesCPFromUni[ulUni]);
381
382	return (0xFFFF);
383	}
384
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 if the character
 *      is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      This function will advance *ppch by at
 *      least one byte (or more if the UTF-8
 *      char initially pointed to introduces
 *      a multi-byte sequence).
 *
 *      This returns -1 (that is, (unsigned long)-1)
 *      if *ppch points to an invalid encoding. In
 *      that case *ppch is advanced to the next byte
 *      that has bit 7 off, which may be the string's
 *      null terminator.
 *
 *      This returns 0 if **ppch points to a
 *      null character. Only in that case is *ppch
 *      not advanced.
 *
 *      Sequences of up to six bytes are accepted.
 *      Lead bytes 0xC0 and 0xC1 are rejected, but
 *      longer overlong forms and surrogate values
 *      are not checked for.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the input as unsigned bytes; plain char may be signed,
    // in which case bytes >= 0x80 would be sign-extended
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount,        // total bytes in sequence, 0 if illegal
                        ul2;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // examine the lead byte: it determines the length of the
    // sequence and contributes the topmost payload bits

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte or reserved lead byte
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // every continuation byte must look like 10xxxxxx;
        // this also stops us at a premature null terminator
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        // append the six payload bits
        ulChar <<= 6;
        ulChar |= ulChar2 & 0x3f;
    }

    if (ulCount)
    {
        // valid sequence: step over all of it
        *ppch += ulCount;
        return (ulChar);
    }

    // illegal: skip all the following characters
    // until we find something with bit 7 off
    // (the null terminator has bit 7 off too)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
507
508

Note: See TracBrowser for help on using the repository browser.

Download in other formats: