Context Navigation

source: trunk/src/helpers/encodings.c@ 158

Visit:

Last change on this file since 158 was 156, checked in by umoeller, 23 years ago
Misc fixes.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 16.4 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding translations.
5	*
6	* See encCreateCodec for an introduction.
7	*
8	* Be warned, compilation of this file takes a long
9	* time because this includes all the complex codepage
10	* tables from include\encodings.
11	*
12	*@@header "encodings\base.h"
13	*@@added V0.9.9 (2001-02-14) [umoeller]
14	*/
15
16	/*
17	* Copyright (C) 2001 Ulrich Möller.
18	* This file is part of the "XWorkplace helpers" source package.
19	* This is free software; you can redistribute it and/or modify
20	* it under the terms of the GNU General Public License as published
21	* by the Free Software Foundation, in version 2 as it comes in the
22	* "COPYING" file of the XWorkplace main distribution.
23	* This program is distributed in the hope that it will be useful,
24	* but WITHOUT ANY WARRANTY; without even the implied warranty of
25	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26	* GNU General Public License for more details.
27	*/
28
29	#define OS2EMX_PLAIN_CHAR
30	// this is needed for "os2emx.h"; if this is defined,
31	// emx will define PSZ as _signed_ char, otherwise
32	// as unsigned char
33
34	#include <stdlib.h>
35	#include <string.h>
36
37	#include "setup.h" // code generation and debugging options
38
39	#include "helpers\standards.h"
40
41	#include "encodings\base.h"
42	#include "encodings\alltables.h"
43	// #include "encodings\collate.h"
44
45	#pragma hdrstop
46
47	/*
48	*@@category: Helpers\National Language Support\Encodings
49	* See encodings.c.
50	*/
51
52	/*
53	*@@ G_aEncodings:
54	* list of all encodings supported by this engine
55	* (i.e. we have a corresponding codepage in
56	* include\encodings\*.h) together with some
57	* additional information for each encoding,
58	* such as the corresponding OS/2 codepage
59	* number and a descriptive string.
60	*
61	*@@added V [umoeller]
62	*/
63
64	struct
65	{
66	ENCID id; // engine ID (enum)
67	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
68	unsigned long cEntries; // entries in map (array item count)
69	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
70	ENCBYTECOUNT bc;
71	const char *pcszDescription; // description
72	} G_aEncodings[] =
73	{
74	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
75
76	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
77	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
78	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
79	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
80	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
81	// Romania, Poland
82	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
83	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
84	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
85	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
86	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
87	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
88	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
89	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
90	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
91	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
92	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
93
94	ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
95	ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
96	ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
97	ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
98
99	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
100	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
101	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
102	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
103	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
104	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
105	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
106	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
107	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
108	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
109	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
110	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
111	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
112	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
113	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
114	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
115	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
116	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
117	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
118	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
119	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
120	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
121	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
122
123	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
124	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
125	};
126
127	/*
128	*@@ encGetTable:
129	*
130	*@@added V0.9.18 (2002-03-08) [umoeller]
131	*/
132
133	int encGetTable(ENCID id,
134	PXWPENCODINGMAP *ppMap,
135	unsigned long *pcEntries)
136	{
137	unsigned long ul;
138	for (ul = 0;
139	ul < ARRAYITEMCOUNT(G_aEncodings);
140	ul++)
141	{
142	if (G_aEncodings[ul].id == id)
143	{
144	*ppMap = G_aEncodings[ul].pMap;
145	*pcEntries = G_aEncodings[ul].cEntries;
146	return (1);
147	}
148	}
149
150	return (0);
151	}
152
153	/*
154	*@@ encFindIdForCodepage:
155	* returns the ENCID for the given OS/2
156	* codepage, or UNSUPPORTED if there's none.
157	*
158	*@@added V0.9.18 (2002-03-08) [umoeller]
159	*/
160
161	ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
162	const char **ppcszDescription, // out: codepage description; ptr can be NULL
163	ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
164	{
165	unsigned long ul;
166	for (ul = 0;
167	ul < ARRAYITEMCOUNT(G_aEncodings);
168	ul++)
169	{
170	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
171
172	{
173	if (ppcszDescription)
174	*ppcszDescription = G_aEncodings[ul].pcszDescription;
175	if (pByteCount)
176	*pByteCount = G_aEncodings[ul].bc;
177	return G_aEncodings[ul].id;
178	}
179	}
180
181	return (UNSUPPORTED);
182	}
183
184	/*
185	*@@ encCreateCodec:
186	* creates a codec that can be used for conversion between
187	* Unicode and codepaged characters (and vice versa).
188	*
189	* A codec essentially consists of two tables which can
190	* be used for quick index-based lookups in both directions.
191	* This function goes thru the tables provided in
192	* include\encodings\*.h and builds the codec tables
193	* from them.
194	*
195	* This function takes an encoding ID as input. Each
196	* codepage table in include\encodings\*.h has one
197	* of those IDs assigned. Use encFindIdForCodepage
198	* to find the ID for a given OS/2 codepage.
199	*
200	* Use codecs carefully and only when they are really
201	* needed for a specific conversion. Building a codec
202	* is expensive, so you should create a codec once
203	* and reuse it for future conversions. In addition,
204	* create codecs only for the codepages that are
205	* actually used. Each codec will take up
206	* n * sizeof(USHORT) bytes, where n is the highest
207	* Unicode character used in the codepage.
208	*
209	* Codec remarks:
210	*
211	* -- All codepages share the first 128 characters
212	* (0-0x7F) with ASCII.
213	*
214	* -- Since the first 128 characters (0-0x7F) in
215	* Unicode are equivalent to ASCII also, codecs
216	* are not needed if you process ASCII strings
217	* only.
218	*
219	* -- Since the next 128 characters (0x80-0xFF) in
220	* Unicode are equivalent to ISO/IEC 8859-1
221	* (Latin-1), codecs aren't needed for those
222	* strings either.
223	*
224	* Note that codepoints 0x80-0x9F are undefined
225	* in Latin-1 but used as control sequences in
226	* Unicode.
227	*
228	* -- As far as I know, codepage 1252, which is
229	* used per default under Windows, is equivalent
230	* to Latin 1 except that it also defines
231	* codepoints 0x80-0x9F to certain DTP characters.
232	*
233	* -- From my testing, codepage 1004 (which is
234	* described as "Windows-compatible" in most OS/2
235	* docs) is the same as codepage 1252, except for
236	* character 0xAF.
237	*
238	* Unfortunately, OS/2 uses codepage 850 on most
239	* systems (and Windows uses OS/2 codepage 1252),
240	* so for conversion between those, codecs are needed.
241	*/
242
243	PCONVERSION encCreateCodec(ENCID id)
244	{
245	PXWPENCODINGMAP pEncodingMap;
246	unsigned long cArrayEntries;
247
248	if (encGetTable(id,
249	&pEncodingMap,
250	&cArrayEntries))
251	{
252	unsigned short usHighestCP = 0,
253	usHighestUni = 0;
254	unsigned long ul;
255
256	// step 1:
257	// run through the table and calculate the highest
258	// character entry used
259	for (ul = 0;
260	ul < cArrayEntries;
261	ul++)
262	{
263	if (pEncodingMap[ul].usCP > usHighestCP)
264	usHighestCP = pEncodingMap[ul].usCP;
265	if (pEncodingMap[ul].usUni > usHighestUni)
266	usHighestUni = pEncodingMap[ul].usUni;
267	}
268
269	// step 2: allocate encoding table
270	if (usHighestCP && usHighestUni)
271	{
272	PCONVERSION pTableNew;
273	if (pTableNew = NEW(CONVERSION))
274	{
275	unsigned long cbEntriesUniFromCP
276	= (usHighestCP + 1) * sizeof(unsigned short);
277	unsigned long cbEntriesCPFromUni
278	= (usHighestUni + 1) * sizeof(unsigned short);
279
280	ZERO(pTableNew);
281
282	pTableNew->usHighestCP = usHighestCP;
283	pTableNew->usHighestUni = usHighestUni;
284
285	if ( (pTableNew->ausEntriesUniFromCP
286	= (unsigned short*)malloc(cbEntriesUniFromCP))
287	&& (pTableNew->ausEntriesCPFromUni
288	= (unsigned short*)malloc(cbEntriesCPFromUni))
289	)
290	{
291	// step 3: fill encoding tables
292
293	memset(pTableNew->ausEntriesUniFromCP,
294	0xFF,
295	cbEntriesUniFromCP);
296	memset(pTableNew->ausEntriesCPFromUni,
297	0xFF,
298	cbEntriesCPFromUni);
299
300	for (ul = 0;
301	ul < cArrayEntries;
302	ul++)
303	{
304	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
305
306	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
307
308	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
309	}
310
311	return (pTableNew);
312	}
313
314	free(pTableNew);
315	}
316	}
317	}
318
319	return (NULL);
320	}
321
322	/*
323	*@@ encFreeCodec:
324	* frees a codec created with encFreeConversion
325	* and sets the given pointer to NULL.
326	*
327	*@@added V0.9.18 (2002-03-08) [umoeller]
328	*/
329
330	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
331	{
332	PCONVERSION pTable;
333	if (pTable = *ppTable)
334	{
335	if (pTable->ausEntriesUniFromCP)
336	free(pTable->ausEntriesUniFromCP);
337	if (pTable->ausEntriesCPFromUni)
338	free(pTable->ausEntriesCPFromUni);
339	free(pTable);
340	*ppTable = NULL;
341	}
342	}
343
344	/*
345	*@@ encChar2Uni:
346	* converts a codepage-specific character
347	* to Unicode, using the given conversion
348	* table from encCreateCodec().
349	*
350	* Returns 0xFFFF on errors, which is unlikely
351	* with Unicode though.
352	*
353	*@@added V0.9.18 (2002-03-08) [umoeller]
354	*/
355
356	unsigned long encChar2Uni(PCONVERSION pTable,
357	unsigned short c)
358	{
359	if ( (pTable)
360	&& (c <= pTable->usHighestCP)
361	)
362	return (pTable->ausEntriesUniFromCP[c]);
363
364	return (0xFFFF);
365	}
366
367	/*
368	*@@ encUni2Char:
369	* converts a Unicode character to the
370	* codepage specified by the given
371	* conversion table from encCreateCodec().
372	*
373	* Returns 0xFFFF if the Unicode character
374	* has no codepage equivalent.
375	*
376	*@@added V0.9.18 (2002-03-08) [umoeller]
377	*/
378
379	unsigned short encUni2Char(PCONVERSION pTable,
380	unsigned long ulUni)
381	{
382	if ( (pTable)
383	&& (ulUni <= pTable->usHighestUni)
384	)
385	return (pTable->ausEntriesCPFromUni[ulUni]);
386
387	return (0xFFFF);
388	}
389
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value, or (unsigned long)-1 if
 *      the character is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      For a valid character, *ppch is advanced
 *      by the length of its encoding (one to six
 *      bytes; the five- and six-byte forms of the
 *      original UTF-8 definition are still accepted).
 *
 *      This returns -1 if *ppch points to an invalid
 *      encoding, that is, a byte that cannot start
 *      a sequence or a sequence whose trailing bytes
 *      are not all of the form 10xxxxxx. In that case
 *      *ppch is advanced past the offending byte and
 *      past all directly following bytes that have
 *      bit 7 set, so that it then points to the next
 *      ASCII byte or to the terminating null byte.
 *
 *      This returns 0 if **ppch points to a
 *      null character; *ppch is not advanced then.
 *
 *      This never reads beyond the terminating null
 *      byte of the string. Overlong encodings and
 *      surrogate values are not rejected.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read the input as unsigned bytes: plain char may be
    // signed, in which case bytes >= 0x80 would sign-extend
    // and never match the bit patterns tested below
    const unsigned char *pb = (const unsigned char*)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       cbSequence,     // 0 means invalid
                        ul;

    if (!ulChar)
        // null is null
        return (0);

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // determine the sequence length from the first byte
    // and keep its payload bits only;
    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte
    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        cbSequence = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        cbSequence = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        cbSequence = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        cbSequence = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        cbSequence = 6;
        ulChar &= 0x01;
    }
    else
        // stray trailing byte (0x80-0xbf), 0xc0, 0xc1, 0xfe or 0xff
        cbSequence = 0;

    // go for the second and more bytes then; each byte is only
    // read after its predecessor was found to be non-null, so
    // a truncated sequence cannot take us past the string end
    for (ul = 1;
         ul < cbSequence;
         ++ul)
    {
        unsigned long ulChar2 = pb[ul];

        if ((ulChar2 & 0xc0) != 0x80)
        {
            // not of the form 10xxxxxx
            cbSequence = 0;
            break;
        }

        ulChar = (ulChar << 6) | (ulChar2 & 0x3f);
    }

    if (cbSequence)
    {
        *ppch += cbSequence;
        return (ulChar);
    }

    // invalid: skip the offending byte and all following
    // bytes until we find something with bit 7 off
    // (which includes the terminating null byte)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char*)pb;

    return ((unsigned long)-1);
}
512
513

Note: See TracBrowser for help on using the repository browser.

Download in other formats: