Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

encodings.c

Visit:

Last change on this file was 357, checked in by pr, 17 years ago
Add CP1386.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 21.5 KB

Rev	Line
[36]	1
	2	/*
	3	*@@sourcefile encodings.c:
[187]	4	* character encoding support. Handles all kinds
[192]	5	* of legacy codepages (including most OS/2 codepages)
[187]	6	* and Unicode in the form of UTF-8 and translations
	7	* between them.
[36]	8	*
[147]	9	* See encCreateCodec for an introduction.
	10	*
[209]	11	* See http://www.ietf.org/rfc/rfc2279.txt for
	12	* RFC 2279, which defines UTF-8.
	13	*
[154]	14	* Be warned, compilation of this file takes a long
[229]	15	* time because this includes all the complex codepages
[154]	16	* from include\encodings.
	17	*
[36]	18	*@@header "encodings\base.h"
	19	*@@added V0.9.9 (2001-02-14) [umoeller]
	20	*/
	21
	22	/*
[357]	23	* Copyright (C) 2001-2008 Ulrich Möller.
[36]	24	* This file is part of the "XWorkplace helpers" source package.
	25	* This is free software; you can redistribute it and/or modify
	26	* it under the terms of the GNU General Public License as published
	27	* by the Free Software Foundation, in version 2 as it comes in the
	28	* "COPYING" file of the XWorkplace main distribution.
	29	* This program is distributed in the hope that it will be useful,
	30	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	31	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	32	* GNU General Public License for more details.
	33	*/
	34
	35	#define OS2EMX_PLAIN_CHAR
	36	// this is needed for "os2emx.h"; if this is defined,
	37	// emx will define PSZ as _signed_ char, otherwise
	38	// as unsigned char
	39
	40	#include <stdlib.h>
	41	#include <string.h>
	42
	43	#include "setup.h" // code generation and debugging options
	44
[147]	45	#include "helpers\standards.h"
[36]	46
[147]	47	#include "encodings\base.h"
	48
[186]	49	#include "encodings\unicase.h"
	50
	51	#include "encodings\alltables.h" // this takes a very long time
	52
[36]	53	#pragma hdrstop
	54
[147]	55	/*
[155]	56	*@@category: Helpers\National Language Support\Encodings
	57	* See encodings.c.
	58	*/
	59
	60	/*
[147]	61	*@@ G_aEncodings:
	62	* list of all encodings supported by this engine
	63	* (i.e. we have a corresponding codepage in
	64	* include\encodings\*.h) together with some
	65	* additional information for each encoding,
	66	* such as the corresponding OS/2 codepage
	67	* number and a descriptive string.
	68	*
[212]	69	* For a way too extensive list of codepage
	70	* names, see "http://www.iana.org/assignments/character-sets".
	71	*
[147]	72	*@@added V [umoeller]
	73	*/
	74
	75	struct
[36]	76	{
[147]	77	ENCID id; // engine ID (enum)
	78	PXWPENCODINGMAP pMap; // ptr to map from include\encodings\*.h
	79	unsigned long cEntries; // entries in map (array item count)
	80	unsigned short usCodepageOS2; // corresponding OS/2 codepage or 0 if none
[229]	81	// V1.0.0 (2002-08-21) [umoeller]
[212]	82	unsigned short usLatin; // ISO 8859-X correspondance or 0
[147]	83	ENCBYTECOUNT bc;
	84	const char *pcszDescription; // description
	85	} G_aEncodings[] =
	86	{
[153]	87	#define ENCODINGENTRY(id) enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
	88
[212]	89	ENCODINGENTRY(cp437), 437, 0, SINGLE, "DOS Latin US",
	90	ENCODINGENTRY(cp737), 737, 0, SINGLE, "DOS Greek",
	91	ENCODINGENTRY(cp775), 775, 0, SINGLE, "DOS BaltRim",
	92	ENCODINGENTRY(cp850), 850, 0, SINGLE, "DOS Latin 1",
	93	ENCODINGENTRY(cp852), 852, 0, SINGLE, "DOS Latin 2", // default in Hungary,
	94	// Romania, Poland
	95	ENCODINGENTRY(cp855), 855, 0, SINGLE, "DOS Cyrillic",
	96	ENCODINGENTRY(cp857), 857, 0, SINGLE, "DOS Latin 5 (Turkish)",
	97	ENCODINGENTRY(cp860), 860, 0, SINGLE, "DOS Portuguese",
	98	ENCODINGENTRY(cp861), 861, 0, SINGLE, "DOS Icelandic",
	99	ENCODINGENTRY(cp862), 862, 0, SINGLE, "DOS Hebrew",
	100	ENCODINGENTRY(cp863), 863, 0, SINGLE, "DOS Canadian French",
	101	ENCODINGENTRY(cp864), 864, 0, SINGLE, "DOS Arabic", // default in Egypt
	102	ENCODINGENTRY(cp865), 865, 0, SINGLE, "DOS Nordic",
	103	ENCODINGENTRY(cp866), 866, 0, SINGLE, "DOS Cyrillic Russian", // default in Russia
	104	ENCODINGENTRY(cp869), 869, 0, SINGLE, "DOS Greek2",
	105	ENCODINGENTRY(cp874), 874, 0, SINGLE, "DOS Thai (TIS-620)", // default in Thailand
[153]	106
[212]	107	ENCODINGENTRY(cp932), 932 /* or 943?*/ ,
	108	0, DOUBLE, "Japanese Windows",
	109	ENCODINGENTRY(cp936), 936 /* or 946?*/ ,
	110	0, DOUBLE, "Chinese",
[256]	111	ENCODINGENTRY(cp949), 949 /* was 951, fixed V1.0.2 (2003-09-19) [umoeller] */ ,
[212]	112	0, DOUBLE, "Korean",
[256]	113	ENCODINGENTRY(cp950), 950 /* was 947, fixed V1.0.2 (2003-09-19) [umoeller] */ ,
[212]	114	0, DOUBLE, "Taiwan Big-5", // default in China?
[153]	115
[212]	116	ENCODINGENTRY(cp1004), 1004, 0, SINGLE, "Windows Extended",
	117	ENCODINGENTRY(cp1250), 1250, 0, SINGLE, "Windows Latin 2",
	118	ENCODINGENTRY(cp1251), 1251, 0, SINGLE, "Windows Cyrillic",
	119	ENCODINGENTRY(cp1252), 1252, 0, SINGLE, "Windows Latin 1",
	120	ENCODINGENTRY(cp1253), 1253, 0, SINGLE, "Windows Greek",
	121	ENCODINGENTRY(cp1254), 1254, 0, SINGLE, "Windows Turkish",
	122	ENCODINGENTRY(cp1255), 1255, 0, SINGLE, "Windows Hebrew",
	123	ENCODINGENTRY(cp1256), 1256, 0, SINGLE, "Windows Arabic",
	124	ENCODINGENTRY(cp1257), 1257, 0, SINGLE, "Windows Latin-4",
	125	ENCODINGENTRY(cp1258), 1258, 0, UNKNOWN, "unknown",
[357]	126	ENCODINGENTRY(cp1386), 1386, 0, DOUBLE, "Chinese (IBM)", // WarpIN V1.0.16 (2008-02-19) [pr]
[212]	127	ENCODINGENTRY(iso8859_1), 819, 1, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
	128	ENCODINGENTRY(iso8859_2), 912, 2, SINGLE, "ISO 8859-2:1999 (Latin-2)",
	129	ENCODINGENTRY(iso8859_3), 913, 3, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
	130	ENCODINGENTRY(iso8859_4), 914, 4, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
	131	ENCODINGENTRY(iso8859_5), 915, 5, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
	132	ENCODINGENTRY(iso8859_6), 1089, 6, SINGLE, "ISO 8859-6:1999 (Arabic)",
	133	ENCODINGENTRY(iso8859_7), 813, 7, SINGLE, "ISO 8859-7:1987 (Greek)", // default in Greece
	134	ENCODINGENTRY(iso8859_8), 916, 8, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
	135	ENCODINGENTRY(iso8859_9), 920, 9, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
	136	ENCODINGENTRY(iso8859_10), 0, 10, SINGLE, "ISO/IEC 8859-10:1998",
	137	ENCODINGENTRY(iso8859_13), 0, 13, SINGLE, "ISO/IEC 8859-13:1998",
	138	ENCODINGENTRY(iso8859_14), 0, 14, SINGLE, "ISO/IEC 8859-14:1998",
	139	ENCODINGENTRY(iso8859_15), 923, 15, SINGLE, "ISO/IEC 8859-15:1999",
[63]	140
[212]	141	UNSUPPORTED, NULL, 0, 1200, 0, MULTI_UNICODE, "Unicode UCS-2",
	142	UNSUPPORTED, NULL, 0, 1208, 0, MULTI_UNICODE, "Unicode UTF-8"
[147]	143	};
[36]	144
[147]	145	/*
[186]	146	*@@ ENCCASEFOLD:
	147	*
	148	*@@added V0.9.20 (2002-07-03) [umoeller]
	149	*/
	150
	151	typedef struct _ENCCASEFOLD
	152	{
	153	unsigned long cEntries;
	154	unsigned long aulFolds[1];
	155	} ENCCASEFOLD, *PENCCASEFOLD;
	156
[222]	157	STATIC PENCCASEFOLD G_pFold = NULL;
[186]	158
	159	/*
[156]	160	*@@ encGetTable:
[147]	161	*
	162	*@@added V0.9.18 (2002-03-08) [umoeller]
	163	*/
[36]	164
[156]	165	int encGetTable(ENCID id,
	166	PXWPENCODINGMAP *ppMap,
	167	unsigned long *pcEntries)
[147]	168	{
	169	unsigned long ul;
	170	for (ul = 0;
	171	ul < ARRAYITEMCOUNT(G_aEncodings);
	172	ul++)
	173	{
	174	if (G_aEncodings[ul].id == id)
	175	{
	176	*ppMap = G_aEncodings[ul].pMap;
	177	*pcEntries = G_aEncodings[ul].cEntries;
[192]	178	return 1;
[147]	179	}
	180	}
	181
[192]	182	return 0;
[147]	183	}
	184
[36]	185	/*
[147]	186	*@@ encFindIdForCodepage:
	187	* returns the ENCID for the given OS/2
	188	* codepage, or UNSUPPORTED if there's none.
[36]	189	*
[147]	190	*@@added V0.9.18 (2002-03-08) [umoeller]
[258]	191	*@@changed V1.0.2 (2003-09-19) [umoeller]: fixed Korean codepage from 951 to 949
[36]	192	*/
	193
[154]	194	ENCID encFindIdForCodepage(unsigned short usCodepage, // in: codepage to find
[147]	195	const char **ppcszDescription, // out: codepage description; ptr can be NULL
[156]	196	ENCBYTECOUNT *pByteCount) // out: SINGLE or DOUBLE; ptr can be NULL
[36]	197	{
	198	unsigned long ul;
	199	for (ul = 0;
[147]	200	ul < ARRAYITEMCOUNT(G_aEncodings);
[36]	201	ul++)
	202	{
[147]	203	if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
	204
	205	{
	206	if (ppcszDescription)
	207	*ppcszDescription = G_aEncodings[ul].pcszDescription;
	208	if (pByteCount)
	209	*pByteCount = G_aEncodings[ul].bc;
	210	return G_aEncodings[ul].id;
	211	}
[36]	212	}
	213
[192]	214	return UNSUPPORTED;
[147]	215	}
	216
	217	/*
	218	*@@ encCreateCodec:
	219	* creates a codec that can be used for conversion between
	220	* Unicode and codepaged characters (and vice versa).
	221	*
	222	* A codec essentially consists of two tables which can
	223	* be used for quick index-based lookups in both directions.
	224	* This function goes thru the tables provided in
	225	* include\encodings\*.h and builds the codec tables
	226	* from them.
	227	*
	228	* This function takes an encoding ID as input. Each
	229	* codepage table in include\encodings\*.h has one
	230	* of those IDs assigned. Use encFindIdForCodepage
	231	* to find the ID for a given OS/2 codepage.
	232	*
	233	* Use codecs carefully and only when they are really
	234	* needed for a specific conversion. Building a codec
	235	* is expensive, so you should create a codec once
	236	* and reuse it for future conversions. In addition,
	237	* create codecs only for the codepages that are
[229]	238	* actually used. Each codec will take up to
[147]	239	* n * sizeof(USHORT) bytes, where n is the highest
	240	* Unicode character used in the codepage.
	241	*
[154]	242	* Codec remarks:
[147]	243	*
	244	* -- All codepages share the first 128 characters
	245	* (0-0x7F) with ASCII.
	246	*
	247	* -- Since the first 128 characters (0-0x7F) in
	248	* Unicode are equivalent to ASCII also, codecs
	249	* are not needed if you process ASCII strings
	250	* only.
	251	*
	252	* -- Since the next 128 characters (0x80-0xFF) in
	253	* Unicode are equivalent to ISO/IEC 8859-1
	254	* (Latin-1), codecs aren't needed for those
	255	* strings either.
	256	*
	257	* Note that codepoints 0x80-0x9F are undefined
	258	* in Latin-1 but used as control sequences in
	259	* Unicode.
	260	*
	261	* -- As far as I know, codepage 1252, which is
	262	* used per default under Windows, is equivalent
	263	* to Latin 1 except that it also defines
	264	* codepoints 0x80-0x9F to certain DTP characters.
	265	*
	266	* -- From my testing, codepage 1004 (which is
	267	* described as "Windows-compatible" in most OS/2
	268	* docs) is the same as codepage 1252, except for
	269	* character 0xAF.
	270	*
	271	* Unfortunately, OS/2 uses codepage 850 on most
	272	* systems (and Windows uses OS/2 codepage 1252),
	273	* so for conversion between those, codecs are needed.
[186]	274	*
	275	* This works and is presently used in WarpIN.
[147]	276	*/
	277
	278	PCONVERSION encCreateCodec(ENCID id)
	279	{
	280	PXWPENCODINGMAP pEncodingMap;
	281	unsigned long cArrayEntries;
	282
[156]	283	if (encGetTable(id,
	284	&pEncodingMap,
	285	&cArrayEntries))
[36]	286	{
[147]	287	unsigned short usHighestCP = 0,
	288	usHighestUni = 0;
	289	unsigned long ul;
[36]	290
[147]	291	// step 1:
	292	// run through the table and calculate the highest
	293	// character entry used
	294	for (ul = 0;
	295	ul < cArrayEntries;
	296	ul++)
[36]	297	{
[147]	298	if (pEncodingMap[ul].usCP > usHighestCP)
	299	usHighestCP = pEncodingMap[ul].usCP;
	300	if (pEncodingMap[ul].usUni > usHighestUni)
	301	usHighestUni = pEncodingMap[ul].usUni;
	302	}
[36]	303
[147]	304	// step 2: allocate encoding table
	305	if (usHighestCP && usHighestUni)
	306	{
	307	PCONVERSION pTableNew;
	308	if (pTableNew = NEW(CONVERSION))
	309	{
	310	unsigned long cbEntriesUniFromCP
	311	= (usHighestCP + 1) * sizeof(unsigned short);
	312	unsigned long cbEntriesCPFromUni
	313	= (usHighestUni + 1) * sizeof(unsigned short);
[36]	314
[147]	315	ZERO(pTableNew);
	316
	317	pTableNew->usHighestCP = usHighestCP;
	318	pTableNew->usHighestUni = usHighestUni;
	319
	320	if ( (pTableNew->ausEntriesUniFromCP
	321	= (unsigned short*)malloc(cbEntriesUniFromCP))
	322	&& (pTableNew->ausEntriesCPFromUni
	323	= (unsigned short*)malloc(cbEntriesCPFromUni))
	324	)
	325	{
	326	// step 3: fill encoding tables
	327
	328	memset(pTableNew->ausEntriesUniFromCP,
	329	0xFF,
	330	cbEntriesUniFromCP);
	331	memset(pTableNew->ausEntriesCPFromUni,
	332	0xFF,
	333	cbEntriesCPFromUni);
	334
	335	for (ul = 0;
	336	ul < cArrayEntries;
	337	ul++)
	338	{
	339	PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
	340
	341	pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
	342
	343	pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
	344	}
	345
[192]	346	return pTableNew;
[147]	347	}
	348
	349	free(pTableNew);
[36]	350	}
	351	}
	352	}
	353
[169]	354	return NULL;
[36]	355	}
	356
[97]	357	/*
[147]	358	*@@ encFreeCodec:
[229]	359	* frees a codec created with encCreateCodec
[147]	360	* and sets the given pointer to NULL.
	361	*
[186]	362	* This works and is presently used in WarpIN.
	363	*
[147]	364	*@@added V0.9.18 (2002-03-08) [umoeller]
	365	*/
	366
	367	void encFreeCodec(PCONVERSION *ppTable) // in: ptr to codec ptr returned by encCreateCodec
	368	{
	369	PCONVERSION pTable;
	370	if (pTable = *ppTable)
	371	{
	372	if (pTable->ausEntriesUniFromCP)
	373	free(pTable->ausEntriesUniFromCP);
	374	if (pTable->ausEntriesCPFromUni)
	375	free(pTable->ausEntriesCPFromUni);
	376	free(pTable);
	377	*ppTable = NULL;
	378	}
	379	}
	380
	381	/*
	382	*@@ encChar2Uni:
	383	* converts a codepage-specific character
	384	* to Unicode, using the given conversion
	385	* table from encCreateCodec().
	386	*
	387	* Returns 0xFFFF on errors, which is unlikely
	388	* with Unicode though.
	389	*
[186]	390	* This works and is presently used in WarpIN.
	391	*
[147]	392	*@@added V0.9.18 (2002-03-08) [umoeller]
	393	*/
	394
	395	unsigned long encChar2Uni(PCONVERSION pTable,
	396	unsigned short c)
	397	{
	398	if ( (pTable)
	399	&& (c <= pTable->usHighestCP)
	400	)
[192]	401	return pTable->ausEntriesUniFromCP[c];
[147]	402
[192]	403	return 0xFFFF;
[147]	404	}
	405
	406	/*
	407	*@@ encUni2Char:
	408	* converts a Unicode character to the
	409	* codepage specified by the given
	410	* conversion table from encCreateCodec().
	411	*
	412	* Returns 0xFFFF if the Unicode character
	413	* has no codepage equivalent.
	414	*
[186]	415	* This works and is presently used in WarpIN.
	416	*
[147]	417	*@@added V0.9.18 (2002-03-08) [umoeller]
	418	*/
	419
	420	unsigned short encUni2Char(PCONVERSION pTable,
	421	unsigned long ulUni)
	422	{
	423	if ( (pTable)
	424	&& (ulUni <= pTable->usHighestUni)
	425	)
[192]	426	return pTable->ausEntriesCPFromUni[ulUni];
[147]	427
[192]	428	return 0xFFFF;
[147]	429	}
	430
	431	/*
[97]	432	*@@ encDecodeUTF8:
	433	* decodes one UTF-8 character and returns
	434	* the Unicode value or -1 if the character
	435	* is invalid.
	436	*
	437	* On input, *ppch is assumed to point to
	438	* the first byte of the UTF-8 char to be
	439	* read.
	440	*
	441	* This function will advance *ppch by at
	442	* least one byte (or more if the UTF-8
	443	* char initially pointed to introduces
	444	* a multi-byte sequence).
	445	*
	446	* This returns -1 if *ppch points to an
	447	* invalid encoding (in which case the
	448	* pointer is advanced anyway).
	449	*
[229]	450	* This returns 0 if *ppch points to a
[97]	451	* null character.
	452	*
[186]	453	* This works and is presently used in WarpIN.
	454	*
[97]	455	*@@added V0.9.14 (2001-08-09) [umoeller]
	456	*/
[36]	457
[97]	458	unsigned long encDecodeUTF8(const char **ppch)
	459	{
[153]	460	unsigned long ulChar;
[192]	461	unsigned long ulCount;
	462	int fIllegal;
[97]	463
[153]	464	if (!(ulChar = **ppch))
	465	// null is null
[97]	466	return 0;
	467
	468	// if (ulChar < 0x80): simple, one byte only... use that
	469
[153]	470	if (ulChar < 0x80)
[97]	471	{
[153]	472	(*ppch)++;
[192]	473	return ulChar;
[153]	474	}
[192]	475
	476	ulCount = 1;
	477	fIllegal = 0;
	478
	479	// note: 0xc0 and 0xc1 are reserved and
	480	// cannot appear as the first UTF-8 byte
	481
	482	if ( (ulChar >= 0xc2)
	483	&& (ulChar < 0xe0)
	484	)
	485	{
	486	// that's two bytes
	487	ulCount = 2;
	488	ulChar &= 0x1f;
	489	}
	490	else if ((ulChar & 0xf0) == 0xe0)
	491	{
	492	// three bytes
	493	ulCount = 3;
	494	ulChar &= 0x0f;
	495	}
	496	else if ((ulChar & 0xf8) == 0xf0)
	497	{
	498	// four bytes
	499	ulCount = 4;
	500	ulChar &= 0x07;
	501	}
	502	else if ((ulChar & 0xfc) == 0xf8)
	503	{
	504	// five bytes
	505	ulCount = 5;
	506	ulChar &= 0x03;
	507	}
	508	else if ((ulChar & 0xfe) == 0xfc)
	509	{
	510	// six bytes
	511	ulCount = 6;
	512	ulChar &= 0x01;
	513	}
[153]	514	else
[192]	515	++fIllegal;
	516
	517	if (!fIllegal)
[153]	518	{
[192]	519	// go for the second and more bytes then
	520	int ul2;
[97]	521
[192]	522	for (ul2 = 1;
	523	ul2 < ulCount;
	524	++ul2)
[97]	525	{
[192]	526	unsigned long ulChar2 = ((ppch) + ul2);
[97]	527
[192]	528	if (!(ulChar2 & 0xc0)) // != 0x80)
[97]	529	{
[192]	530	++fIllegal;
	531	break;
	532	}
[97]	533
[192]	534	ulChar <<= 6;
	535	ulChar \|= ulChar2 & 0x3f;
[97]	536	}
[192]	537	}
[97]	538
[192]	539	if (fIllegal)
	540	{
	541	// skip all the following characters
	542	// until we find something with bit 7 off
	543	do
[97]	544	{
[192]	545	ulChar = (++(ppch));
	546	if (!ulChar)
	547	break;
	548	} while (ulChar & 0x80);
[97]	549	}
[192]	550	else
	551	*ppch += ulCount;
[97]	552
[192]	553	return ulChar;
[97]	554	}
	555
[186]	556	/*
[191]	557	*@@ encInitCase:
[186]	558	* creates a casefold for later use with
	559	* encToUpper.
	560	*
	561	* This only uses one-byte sequences from
	562	* the Unicode case folding table (see
	563	* include\encodings\unicase.h), so this
	564	* cannot be used for expanding characters
	565	* at this point.
	566	*
	567	* Returns 1 (TRUE) on success.
	568	*
	569	* This works and is presently used in WarpIN.
	570	*
	571	*@@added V0.9.20 (2002-07-03) [umoeller]
	572	*/
[147]	573
[186]	574	int encInitCase(void)
	575	{
	576	unsigned long ul,
	577	cEntries = 0,
	578	cb;
	579
	580	for (ul = 0;
	581	ul < ARRAYITEMCOUNT(G_aCaseFolds);
	582	++ul)
	583	{
	584	// ignore CASEFL_T (duplicate entries for i chars)
	585	// and CASEFL_F (expansions)
	586	if ( (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
	587	&& (G_aCaseFolds[ul].ulLow > cEntries)
	588	)
	589	cEntries = G_aCaseFolds[ul].ulLow;
	590	}
	591
	592	cb = sizeof(ENCCASEFOLD) + cEntries * sizeof(unsigned long);
	593	if (G_pFold = (PENCCASEFOLD)malloc(cb))
	594	{
	595	memset(G_pFold, 0, cb);
	596	G_pFold->cEntries = cEntries;
	597
	598	for (ul = 0;
	599	ul < ARRAYITEMCOUNT(G_aCaseFolds);
	600	++ul)
	601	{
	602	if (G_aCaseFolds[ul].fl & (CASEFL_C \| CASEFL_S))
	603	G_pFold->aulFolds[G_aCaseFolds[ul].ulLow] = G_aCaseFolds[ul].c1;
	604	}
	605
	606	return 1;
	607	}
	608
	609	return 0;
	610	}
	611
	612	/*
	613	*@@ encToUpper:
	614	* converts the given unicode character to
	615	* upper case, if possible, or returns
	616	* ulUni back if Unicode doesn't define
	617	* an upper-case character for it.
	618	*
	619	* Special cases:
	620	*
	621	* -- Returns 0 for 0.
	622	*
	623	* Preconditions:
	624	*
	625	* -- You must call encInitCase before
	626	* the first call.
	627	*
	628	* This works and is presently used in WarpIN.
	629	*
	630	*@@added V0.9.20 (2002-07-03) [umoeller]
	631	*/
	632
	633	unsigned long encToUpper(unsigned long ulUni)
	634	{
	635	unsigned long ulFold;
	636
	637	if ( (ulUni < G_pFold->cEntries)
	638	&& (ulFold = G_pFold->aulFolds[ulUni])
	639	)
	640	return ulFold;
	641
	642	return ulUni;
	643	}
	644
	645	/*
	646	*@@ encicmp:
	647	* like stricmp, but for UTF-8 strings.
	648	* This uses encToUpper for the comparisons.
	649	*
	650	* Like stricmp, this returns:
	651	*
	652	* -- -1 if pcsz1 is less than pcsz2
	653	* -- 0 if pcsz1 is equal to pcsz2
	654	* -- +1 if pcsz1 is greater than pcsz2
	655	*
	656	* However, this does not crash on passing
	657	* in NULL strings.
	658	*
	659	* Preconditions:
	660	*
	661	* -- You must call encInitCase before
	662	* the first call.
	663	*
	664	* This works and is presently used in WarpIN.
	665	*
	666	*@@added V0.9.20 (2002-07-03) [umoeller]
	667	*/
	668
	669	int encicmp(const char *pcsz1,
	670	const char *pcsz2)
	671	{
	672	const char *p1 = pcsz1,
	673	*p2 = pcsz2;
	674
	675	unsigned long ul1, ul2;
	676
	677	do
	678	{
	679	// encDecodeUTF8 returns null for null, so this is safe
	680	ul1 = encToUpper(encDecodeUTF8(&p1));
	681	ul2 = encToUpper(encDecodeUTF8(&p2));
	682
	683	if (ul1 < ul2)
	684	return -1;
	685	if (ul1 > ul2)
	686	return +1;
	687
	688	// both are equal: check for null bytes then
	689	if (!ul1)
	690	if (!ul2)
	691	return 0;
	692	else
	693	// ul1 is null, but ul2 isn't:
	694	return -1;
	695	else
	696	if (!ul2)
	697	// ul1 is not null, but ul2 is:
	698	return +1;
	699
	700	// both are non-null: continue
	701
	702	} while (1);
	703
	704	return 0;
	705	}
	706

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/branch-1-0/src/helpers/encodings.c

Download in other formats: