Context Navigation

source: trunk/src/helpers/encodings.c@ 153

Visit:

Last change on this file since 153 was 153, checked in by umoeller, 23 years ago
Lots of changes from the last three weeks.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Line
1
2	/*
3	*@@sourcefile encodings.c:
4	* character encoding translations.
5	*
6	* See encCreateCodec for an introduction.
7	*
8	*@@header "encodings\base.h"
9	*@@added V0.9.9 (2001-02-14) [umoeller]
10	*/
11
12	/*
13	* Copyright (C) 2001 Ulrich Möller.
14	* This file is part of the "XWorkplace helpers" source package.
15	* This is free software; you can redistribute it and/or modify
16	* it under the terms of the GNU General Public License as published
17	* by the Free Software Foundation, in version 2 as it comes in the
18	* "COPYING" file of the XWorkplace main distribution.
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*/
24
25	#define OS2EMX_PLAIN_CHAR
26	// this is needed for "os2emx.h"; if this is defined,
27	// emx will define PSZ as _signed_ char, otherwise
28	// as unsigned char
29
30	#include <stdlib.h>
31	#include <string.h>
32
33	#include "setup.h" // code generation and debugging options
34
35	#include "helpers\standards.h"
36
37	#include "encodings\base.h"
38	#include "encodings\alltables.h"
39	// #include "encodings\collate.h"
40
41	#pragma hdrstop
42
43	/*
44	*@@ G_aEncodings:
45	* list of all encodings supported by this engine
46	* (i.e. we have a corresponding codepage in
47	* include\encodings\*.h) together with some
48	* additional information for each encoding,
49	* such as the corresponding OS/2 codepage
50	* number and a descriptive string.
51	*
52	*@@added V [umoeller]
53	*/
54
55	struct
56	{
57	ENCID id; // engine ID (enum)
58	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
59	unsigned long cEntries; // entries in map (array item count)
60	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
61	ENCBYTECOUNT bc;
62	const char *pcszDescription; // description
63	} G_aEncodings[] =
64	{
65	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
66
67	ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
68	ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
69	ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
70	ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
71	ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2", // default in Hungary,
72	// Romania, Poland
73	ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
74	ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
75	ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
76	ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
77	ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
78	ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
79	ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic", // default in Egypt
80	ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
81	ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian", // default in Russia
82	ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
83	ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
84
85	ENCODINGENTRY(cp932), 932 /* or 943?*/ , DOUBLE, "Japanese Windows",
86	ENCODINGENTRY(cp936), 936 /* or 946?*/ , DOUBLE, "Chinese",
87	ENCODINGENTRY(cp949), 951 /* or 949?*/ , DOUBLE, "Korean",
88	ENCODINGENTRY(cp950), 947 /* or 950?*/ , DOUBLE, "Taiwan Big-5", // default in China?
89
90	ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
91	ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
92	ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
93	ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
94	ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
95	ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
96	ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
97	ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
98	ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
99	ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
100	ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
101	ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
102	ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
103	ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
104	ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
105	ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
106	ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
107	ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
108	ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
109	ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
110	ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
111	ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
112	ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
113
114	UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
115	UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
116	};
117
118	/*
119	*@@ FindEntry:
120	*
121	*@@added V0.9.18 (2002-03-08) [umoeller]
122	*/
123
124	static int FindEntry(ENCID id,
125	PXWPENCODINGMAP *ppMap,
126	unsigned long *pcEntries)
127	{
128	unsigned long ul;
129	for (ul = 0;
130	ul < ARRAYITEMCOUNT(G_aEncodings);
131	ul++)
132	{
133	if (G_aEncodings[ul].id == id)
134	{
135	*ppMap = G_aEncodings[ul].pMap;
136	*pcEntries = G_aEncodings[ul].cEntries;
137	return (1);
138	}
139	}
140
141	return (0);
142	}
143
144	/*
145	*@@ encFindIdForCodepage:
146	* returns the ENCID for the given OS/2
147	* codepage, or UNSUPPORTED if there's none.
148	*
149	*@@added V0.9.18 (2002-03-08) [umoeller]
150	*/
151
152	ENCID encFindIdForCodepage(unsigned short usCodepage,
153	const char **ppcszDescription, // out: codepage description; ptr can be NULL
154	ENCBYTECOUNT *pByteCount)
155	{
156	unsigned long ul;
157	for (ul = 0;
158	ul < ARRAYITEMCOUNT(G_aEncodings);
159	ul++)
160	{
161	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
162
163	{
164	if (ppcszDescription)
165	*ppcszDescription = G_aEncodings[ul].pcszDescription;
166	if (pByteCount)
167	*pByteCount = G_aEncodings[ul].bc;
168	return G_aEncodings[ul].id;
169	}
170	}
171
172	return (UNSUPPORTED);
173	}
174
175	/*
176	*@@ encCreateCodec:
177	* creates a codec that can be used for conversion between
178	* Unicode and codepaged characters (and vice versa).
179	*
180	* A codec essentially consists of two tables which can
181	* be used for quick index-based lookups in both directions.
182	* This function goes thru the tables provided in
183	* include\encodings\*.h and builds the codec tables
184	* from them.
185	*
186	* This function takes an encoding ID as input. Each
187	* codepage table in include\encodings\*.h has one
188	* of those IDs assigned. Use encFindIdForCodepage
189	* to find the ID for a given OS/2 codepage.
190	*
191	* Use codecs carefully and only when they are really
192	* needed for a specific conversion. Building a codec
193	* is expensive, so you should create a codec once
194	* and reuse it for future conversions. In addition,
195	* create codecs only for the codepages that are
196	* actually used. Each codec will take up
197	* n * sizeof(USHORT) bytes, where n is the highest
198	* Unicode character used in the codepage.
199	*
200	* Remarks:
201	*
202	* -- All codepages share the first 128 characters
203	* (0-0x7F) with ASCII.
204	*
205	* -- Since the first 128 characters (0-0x7F) in
206	* Unicode are equivalent to ASCII also, codecs
207	* are not needed if you process ASCII strings
208	* only.
209	*
210	* -- Since the next 128 characters (0x80-0xFF) in
211	* Unicode are equivalent to ISO/IEC 8859-1
212	* (Latin-1), codecs aren't needed for those
213	* strings either.
214	*
215	* Note that codepoints 0x80-0x9F are undefined
216	* in Latin-1 but used as control sequences in
217	* Unicode.
218	*
219	* -- As far as I know, codepage 1252, which is
220	* used per default under Windows, is equivalent
221	* to Latin 1 except that it also defines
222	* codepoints 0x80-0x9F to certain DTP characters.
223	*
224	* -- From my testing, codepage 1004 (which is
225	* described as "Windows-compatible" in most OS/2
226	* docs) is the same as codepage 1252, except for
227	* character 0xAF.
228	*
229	* Unfortunately, OS/2 uses codepage 850 on most
230	* systems (and Windows uses OS/2 codepage 1252),
231	* so for conversion between those, codecs are needed.
232	*/
233
234	PCONVERSION encCreateCodec(ENCID id)
235	{
236	PXWPENCODINGMAP pEncodingMap;
237	unsigned long cArrayEntries;
238
239	if (FindEntry(id,
240	&pEncodingMap,
241	&cArrayEntries))
242	{
243	unsigned short usHighestCP = 0,
244	usHighestUni = 0;
245	unsigned long ul;
246
247	// step 1:
248	// run through the table and calculate the highest
249	// character entry used
250	for (ul = 0;
251	ul < cArrayEntries;
252	ul++)
253	{
254	if (pEncodingMap[ul].usCP > usHighestCP)
255	usHighestCP = pEncodingMap[ul].usCP;
256	if (pEncodingMap[ul].usUni > usHighestUni)
257	usHighestUni = pEncodingMap[ul].usUni;
258	}
259
260	// step 2: allocate encoding table
261	if (usHighestCP && usHighestUni)
262	{
263	PCONVERSION pTableNew;
264	if (pTableNew = NEW(CONVERSION))
265	{
266	unsigned long cbEntriesUniFromCP
267	= (usHighestCP + 1) * sizeof(unsigned short);
268	unsigned long cbEntriesCPFromUni
269	= (usHighestUni + 1) * sizeof(unsigned short);
270
271	ZERO(pTableNew);
272
273	pTableNew->usHighestCP = usHighestCP;
274	pTableNew->usHighestUni = usHighestUni;
275
276	if ( (pTableNew->ausEntriesUniFromCP
277	= (unsigned short*)malloc(cbEntriesUniFromCP))
278	&& (pTableNew->ausEntriesCPFromUni
279	= (unsigned short*)malloc(cbEntriesCPFromUni))
280	)
281	{
282	// step 3: fill encoding tables
283
284	memset(pTableNew->ausEntriesUniFromCP,
285	0xFF,
286	cbEntriesUniFromCP);
287	memset(pTableNew->ausEntriesCPFromUni,
288	0xFF,
289	cbEntriesCPFromUni);
290
291	for (ul = 0;
292	ul < cArrayEntries;
293	ul++)
294	{
295	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
296
297	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
298
299	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
300	}
301
302	return (pTableNew);
303	}
304
305	free(pTableNew);
306	}
307	}
308	}
309
310	return (NULL);
311	}
312
313	/*
314	*@@ encFreeCodec:
315	* frees a codec created with encFreeConversion
316	* and sets the given pointer to NULL.
317	*
318	*@@added V0.9.18 (2002-03-08) [umoeller]
319	*/
320
321	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
322	{
323	PCONVERSION pTable;
324	if (pTable = *ppTable)
325	{
326	if (pTable->ausEntriesUniFromCP)
327	free(pTable->ausEntriesUniFromCP);
328	if (pTable->ausEntriesCPFromUni)
329	free(pTable->ausEntriesCPFromUni);
330	free(pTable);
331	*ppTable = NULL;
332	}
333	}
334
335	/*
336	*@@ encChar2Uni:
337	* converts a codepage-specific character
338	* to Unicode, using the given conversion
339	* table from encCreateCodec().
340	*
341	* Returns 0xFFFF on errors, which is unlikely
342	* with Unicode though.
343	*
344	*@@added V0.9.18 (2002-03-08) [umoeller]
345	*/
346
347	unsigned long encChar2Uni(PCONVERSION pTable,
348	unsigned short c)
349	{
350	if ( (pTable)
351	&& (c <= pTable->usHighestCP)
352	)
353	return (pTable->ausEntriesUniFromCP[c]);
354
355	return (0xFFFF);
356	}
357
358	/*
359	*@@ encUni2Char:
360	* converts a Unicode character to the
361	* codepage specified by the given
362	* conversion table from encCreateCodec().
363	*
364	* Returns 0xFFFF if the Unicode character
365	* has no codepage equivalent.
366	*
367	*@@added V0.9.18 (2002-03-08) [umoeller]
368	*/
369
370	unsigned short encUni2Char(PCONVERSION pTable,
371	unsigned long ulUni)
372	{
373	if ( (pTable)
374	&& (ulUni <= pTable->usHighestUni)
375	)
376	return (pTable->ausEntriesCPFromUni[ulUni]);
377
378	return (0xFFFF);
379	}
380
/*
 *@@ encDecodeUTF8:
 *      decodes one UTF-8 character and returns
 *      the Unicode value or -1 (cast to unsigned
 *      long) if the character is invalid.
 *
 *      On input, *ppch is assumed to point to
 *      the first byte of the UTF-8 char to be
 *      read.
 *
 *      On success, *ppch is advanced past the
 *      complete sequence (one to six bytes; the
 *      historical five- and six-byte forms are
 *      still accepted).
 *
 *      This returns -1 if *ppch points to an
 *      invalid encoding, that is, an illegal
 *      first byte or a sequence whose trailing
 *      bytes are not all of the form 10xxxxxx.
 *      In that case *ppch is advanced by at
 *      least one byte, up to the next byte that
 *      has bit 7 clear (which may be the
 *      terminating null byte).
 *
 *      This returns 0 if **ppch points to a
 *      null character; *ppch is not advanced
 *      then.
 *
 *@@added V0.9.14 (2001-08-09) [umoeller]
 */

unsigned long encDecodeUTF8(const char **ppch)
{
    // read bytes as unsigned so that values >= 0x80 do not
    // sign-extend on compilers where plain char is signed
    const unsigned char *pb = (const unsigned char *)*ppch;
    unsigned long       ulChar = *pb;
    unsigned long       ulCount;        // sequence length; 0 means "illegal"
    unsigned long       ul2;

    if (!ulChar)
        // null is null
        return 0;

    if (ulChar < 0x80)
    {
        // simple, one byte only... use that
        (*ppch)++;
        return (ulChar);
    }

    // note: 0xc0 and 0xc1 are reserved and
    // cannot appear as the first UTF-8 byte

    if (    (ulChar >= 0xc2)
         && (ulChar < 0xe0)
       )
    {
        // that's two bytes
        ulCount = 2;
        ulChar &= 0x1f;
    }
    else if ((ulChar & 0xf0) == 0xe0)
    {
        // three bytes
        ulCount = 3;
        ulChar &= 0x0f;
    }
    else if ((ulChar & 0xf8) == 0xf0)
    {
        // four bytes
        ulCount = 4;
        ulChar &= 0x07;
    }
    else if ((ulChar & 0xfc) == 0xf8)
    {
        // five bytes
        ulCount = 5;
        ulChar &= 0x03;
    }
    else if ((ulChar & 0xfe) == 0xfc)
    {
        // six bytes
        ulCount = 6;
        ulChar &= 0x01;
    }
    else
        // stray continuation byte, 0xc0, 0xc1, 0xfe or 0xff
        ulCount = 0;

    // go for the second and more bytes then
    for (ul2 = 1;
         ul2 < ulCount;
         ++ul2)
    {
        unsigned long ulChar2 = pb[ul2];

        // every trailing byte must be 10xxxxxx; this also
        // stops at the terminating null byte, so we never
        // read past the end of the string
        if ((ulChar2 & 0xc0) != 0x80)
        {
            ulCount = 0;
            break;
        }

        ulChar <<= 6;
        ulChar |= ulChar2 & 0x3f;
    }

    if (ulCount)
    {
        *ppch += ulCount;
        return (ulChar);
    }

    // illegal sequence: skip all the following characters
    // until we find something with bit 7 off (a null byte
    // has bit 7 off too, so this stops at the terminator)
    do
    {
        ++pb;
    } while (*pb & 0x80);

    *ppch = (const char *)pb;

    return ((unsigned long)-1);
}
503
504

Note: See TracBrowser for help on using the repository browser.

Download in other formats: