Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

xmltok.c

Visit:

Last change on this file was 147, checked in by umoeller, 23 years ago
Misc updates for Unicode.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 46.9 KB

Rev	Line
[36]	1	/*
[97]	2	* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
	3	* See the file COPYING for copying permission.
[36]	4	*/
	5
[97]	6	/* #ifdef COMPILED_FROM_DSP
	7	* # include "winconfig.h"
	8	* #else
	9	* # include <config.h>
	10	* #endif
[36]	11	*/
	12
[97]	13	#include <memory.h>
[36]	14
[97]	15	#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
[36]	16
	17	#pragma info(norea, nogen)
	18	// disable "statement unreachable" and "missing break statement"
	19	// this code generates those options HEAVILY
	20
	21	#include "expat\xmltok.h"
	22	#include "expat\nametab.h"
	23
	24	#ifdef XML_DTD
	25	#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
	26	#else
	27	#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
	28	#endif
	29
	30	#define VTABLE1 \
	31	{ PREFIX(prologTok), PREFIX(contentTok), \
	32	PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
	33	{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
	34	PREFIX(sameName), \
	35	PREFIX(nameMatchesAscii), \
	36	PREFIX(nameLength), \
	37	PREFIX(skipS), \
	38	PREFIX(getAtts), \
	39	PREFIX(charRefNumber), \
	40	PREFIX(predefinedEntityName), \
	41	PREFIX(updatePosition), \
	42	PREFIX(isPublicId)
	43
	44	#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
	45
	46	#define UCS2_GET_NAMING(pages, hi, lo) \
	47	(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
	48
	49	/* A 2 byte UTF-8 representation splits the characters 11 bits
	50	* between the bottom 5 and 6 bits of the bytes.
	51	* We need 8 bits to index into pages, 3 bits to add to that index and
	52	* 5 bits to generate the mask. */
	53	#define UTF8_GET_NAMING2(pages, byte) \
	54	(namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
	55	+ ((((byte)[0]) & 3) << 1) \
	56	+ ((((byte)[1]) >> 5) & 1)] \
	57	& (1 << (((byte)[1]) & 0x1F)))
	58
	59	/* A 3 byte UTF-8 representation splits the characters 16 bits
	60	* between the bottom 4, 6 and 6 bits of the bytes.
	61	* We need 8 bits to index into pages, 3 bits to add to that index and
	62	* 5 bits to generate the mask. */
	63	#define UTF8_GET_NAMING3(pages, byte) \
	64	(namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
	65	+ ((((byte)[1]) >> 2) & 0xF)] \
	66	<< 3) \
	67	+ ((((byte)[1]) & 3) << 1) \
	68	+ ((((byte)[2]) >> 5) & 1)] \
	69	& (1 << (((byte)[2]) & 0x1F)))
	70
	71	#define UTF8_GET_NAMING(pages, p, n) \
	72	((n) == 2 \
	73	? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
	74	: ((n) == 3 \
	75	? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
	76	: 0))
	77
	78	#define UTF8_INVALID3(p) \
	79	((*p) == 0xED \
	80	? (((p)[1] & 0x20) != 0) \
	81	: ((*p) == 0xEF \
	82	? ((p)[1] == 0xBF && ((p)[2] == 0xBF \|\| (p)[2] == 0xBE)) \
	83	: 0))
	84
	85	#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
	86
[147]	87	static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
[36]	88	{
	89	return 0;
	90	}
	91
[147]	92	static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
[36]	93	{
	94	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
	95	}
	96
[147]	97	static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
[36]	98	{
	99	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
	100	}
	101
	102	#define utf8_isName4 isNever
	103
[147]	104	static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
[36]	105	{
	106	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
	107	}
	108
[147]	109	static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
[36]	110	{
	111	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
	112	}
	113
	114	#define utf8_isNmstrt4 isNever
	115
	116	#define utf8_isInvalid2 isNever
	117
[147]	118	static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
[36]	119	{
	120	return UTF8_INVALID3((const unsigned char *)p);
	121	}
	122
[147]	123	static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
[36]	124	{
	125	return UTF8_INVALID4((const unsigned char *)p);
	126	}
	127
	128	struct normal_encoding
	129	{
	130	ENCODING enc;
	131	unsigned char type[256];
	132	#ifdef XML_MIN_SIZE
[98]	133	int (* EXPATENTRY byteType) (const ENCODING , const char );
	134	int (* EXPATENTRY isNameMin) (const ENCODING , const char );
	135	int (* EXPATENTRY isNmstrtMin) (const ENCODING , const char );
	136	int (* EXPATENTRY byteToAscii) (const ENCODING , const char );
	137	int (* EXPATENTRY charMatches) (const ENCODING , const char , int);
[36]	138	#endif /* XML_MIN_SIZE */
[98]	139	int (* EXPATENTRY isName2) (const ENCODING , const char );
	140	int (* EXPATENTRY isName3) (const ENCODING , const char );
	141	int (* EXPATENTRY isName4) (const ENCODING , const char );
	142	int (* EXPATENTRY isNmstrt2) (const ENCODING , const char );
	143	int (* EXPATENTRY isNmstrt3) (const ENCODING , const char );
	144	int (* EXPATENTRY isNmstrt4) (const ENCODING , const char );
	145	int (* EXPATENTRY isInvalid2) (const ENCODING , const char );
	146	int (* EXPATENTRY isInvalid3) (const ENCODING , const char );
	147	int (* EXPATENTRY isInvalid4) (const ENCODING , const char );
[36]	148	};
	149
	150	#ifdef XML_MIN_SIZE
	151
	152	#define STANDARD_VTABLE(E) \
	153	E ## byteType, \
	154	E ## isNameMin, \
	155	E ## isNmstrtMin, \
	156	E ## byteToAscii, \
	157	E ## charMatches,
	158
	159	#else
	160
	161	#define STANDARD_VTABLE(E) /* as nothing */
	162
	163	#endif
	164
	165	#define NORMAL_VTABLE(E) \
	166	E ## isName2, \
	167	E ## isName3, \
	168	E ## isName4, \
	169	E ## isNmstrt2, \
	170	E ## isNmstrt3, \
	171	E ## isNmstrt4, \
	172	E ## isInvalid2, \
	173	E ## isInvalid3, \
	174	E ## isInvalid4
	175
	176	static int checkCharRefNumber(int);
	177
	178	#include "expat\xmltok_impl.h"
	179	#include "expat\ascii.h"
	180
	181	#ifdef XML_MIN_SIZE
	182	#define sb_isNameMin isNever
	183	#define sb_isNmstrtMin isNever
	184	#endif
	185
	186	#ifdef XML_MIN_SIZE
	187	#define MINBPC(enc) ((enc)->minBytesPerChar)
	188	#else
	189	/* minimum bytes per character */
	190	#define MINBPC(enc) 1
	191	#endif
	192
	193	#define SB_BYTE_TYPE(enc, p) \
	194	(((struct normal_encoding )(enc))->type[(unsigned char)(p)])
	195
	196	#ifdef XML_MIN_SIZE
[98]	197	static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
[36]	198	{
	199	return SB_BYTE_TYPE(enc, p);
	200	}
	201	#define BYTE_TYPE(enc, p) \
	202	(((const struct normal_encoding *)(enc))->byteType(enc, p))
	203	#else
	204	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
	205	#endif
	206
	207	#ifdef XML_MIN_SIZE
	208	#define BYTE_TO_ASCII(enc, p) \
	209	(((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
[97]	210	static
[98]	211	int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
[36]	212	{
	213	return *p;
	214	}
	215	#else
	216	#define BYTE_TO_ASCII(enc, p) (*(p))
	217	#endif
	218
	219	#define IS_NAME_CHAR(enc, p, n) \
	220	(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
	221	#define IS_NMSTRT_CHAR(enc, p, n) \
	222	(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
	223	#define IS_INVALID_CHAR(enc, p, n) \
	224	(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
	225
	226	#ifdef XML_MIN_SIZE
	227	#define IS_NAME_CHAR_MINBPC(enc, p) \
	228	(((const struct normal_encoding *)(enc))->isNameMin(enc, p))
	229	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
	230	(((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
	231	#else
	232	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
	233	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
	234	#endif
	235
	236	#ifdef XML_MIN_SIZE
	237	#define CHAR_MATCHES(enc, p, c) \
	238	(((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
[97]	239	static
[98]	240	int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
[36]	241	{
	242	return *p == c;
	243	}
	244	#else
	245	/* c is an ASCII character */
	246	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
	247	#endif
	248
	249	#define PREFIX(ident) normal_ ## ident
	250	#include "xmltok_impl.c"
	251
	252	#undef MINBPC
	253	#undef BYTE_TYPE
	254	#undef BYTE_TO_ASCII
	255	#undef CHAR_MATCHES
	256	#undef IS_NAME_CHAR
	257	#undef IS_NAME_CHAR_MINBPC
	258	#undef IS_NMSTRT_CHAR
	259	#undef IS_NMSTRT_CHAR_MINBPC
	260	#undef IS_INVALID_CHAR
	261
	262	enum
	263	{ /* UTF8_cvalN is value of masked first byte of N byte sequence */
	264	UTF8_cval1 = 0x00,
	265	UTF8_cval2 = 0xc0,
	266	UTF8_cval3 = 0xe0,
	267	UTF8_cval4 = 0xf0
	268	};
	269
[98]	270	static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
[147]	271	const char **fromP,
	272	const char *fromLim,
	273	char **toP,
	274	const char *toLim)
[36]	275	{
	276	char *to;
	277	const char *from;
	278
	279	if (fromLim - fromP > toLim - toP)
	280	{
	281	/* Avoid copying partial characters. */
[97]	282	for (fromLim = fromP + (toLim - toP);
	283	fromLim > *fromP;
	284	fromLim--)
[36]	285	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
	286	break;
	287	}
[97]	288	for (to = toP, from = fromP;
	289	from != fromLim;
	290	from++, to++)
[36]	291	to = from;
	292	*fromP = from;
	293	*toP = to;
	294	}
	295
[98]	296	static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
[147]	297	const char **fromP,
	298	const char *fromLim,
	299	unsigned short **toP,
	300	const unsigned short *toLim)
[36]	301	{
	302	unsigned short to = toP;
	303	const char from = fromP;
	304
	305	while (from != fromLim && to != toLim)
	306	{
	307	switch (((struct normal_encoding )enc)->type[(unsigned char)from])
	308	{
	309	case BT_LEAD2:
	310	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
	311	from += 2;
	312	break;
	313	case BT_LEAD3:
	314	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
	315	from += 3;
	316	break;
	317	case BT_LEAD4:
	318	{
	319	unsigned long n;
	320
	321	if (to + 1 == toLim)
	322	break;
	323	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
	324	n -= 0x10000;
	325	to[0] = (unsigned short)((n >> 10) \| 0xD800);
	326	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
	327	to += 2;
	328	from += 4;
	329	}
	330	break;
	331	default:
	332	to++ = from++;
	333	break;
	334	}
	335	}
	336	*fromP = from;
	337	*toP = to;
	338	}
	339
	340	#ifdef XML_NS
	341	static const struct normal_encoding utf8_encoding_ns =
	342	{
	343	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
	344	{
[75]	345	#include "expat\asciitab.h"
	346	#include "expat\utf8tab.h"
[36]	347	},
	348	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	349	};
	350
	351	#endif
	352
	353	static const struct normal_encoding utf8_encoding =
	354	{
	355	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
	356	{
	357	#define BT_COLON BT_NMSTRT
	358	#include "expat\asciitab.h"
	359	#undef BT_COLON
	360	#include "expat\utf8tab.h"
	361	},
	362	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	363	};
	364
	365	#ifdef XML_NS
	366
	367	static const struct normal_encoding internal_utf8_encoding_ns =
	368	{
	369	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
	370	{
[75]	371	#include "expat\iasciitab.h"
	372	#include "expat\utf8tab.h"
[36]	373	},
	374	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	375	};
	376
	377	#endif
	378
	379	static const struct normal_encoding internal_utf8_encoding =
	380	{
	381	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
	382	{
	383	#define BT_COLON BT_NMSTRT
	384	#include "expat\iasciitab.h"
	385	#undef BT_COLON
	386	#include "expat\utf8tab.h"
	387	},
	388	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	389	};
	390
[98]	391	static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
[147]	392	const char **fromP,
	393	const char *fromLim,
	394	char **toP,
	395	const char *toLim)
[36]	396	{
	397	for (;;)
	398	{
	399	unsigned char c;
	400
	401	if (*fromP == fromLim)
	402	break;
	403	c = (unsigned char)**fromP;
	404	if (c & 0x80)
	405	{
	406	if (toLim - *toP < 2)
	407	break;
	408	(toP)++ = ((c >> 6) \| UTF8_cval2);
	409	(toP)++ = ((c & 0x3f) \| 0x80);
	410	(*fromP)++;
	411	}
	412	else
	413	{
	414	if (*toP == toLim)
	415	break;
	416	(toP)++ = (fromP)++;
	417	}
	418	}
	419	}
	420
[98]	421	static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
[147]	422	const char **fromP,
	423	const char *fromLim,
	424	unsigned short **toP,
	425	const unsigned short *toLim)
[36]	426	{
	427	while (fromP != fromLim && toP != toLim)
	428	(toP)++ = (unsigned char)(fromP)++;
	429	}
	430
	431	#ifdef XML_NS
	432
	433	static const struct normal_encoding latin1_encoding_ns =
	434	{
	435	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
	436	{
[75]	437	#include "expat\asciitab.h"
	438	#include "expat\latin1tab.h"
[36]	439	},
	440	STANDARD_VTABLE(sb_)
	441	};
	442
	443	#endif
	444
	445	static const struct normal_encoding latin1_encoding =
	446	{
	447	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
	448	{
	449	#define BT_COLON BT_NMSTRT
	450	#include "expat\asciitab.h"
	451	#undef BT_COLON
	452	#include "expat\latin1tab.h"
	453	},
	454	STANDARD_VTABLE(sb_)
	455	};
	456
[98]	457	static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
[147]	458	const char **fromP,
	459	const char *fromLim,
	460	char **toP,
	461	const char *toLim)
[36]	462	{
	463	while (fromP != fromLim && toP != toLim)
	464	(toP)++ = (fromP)++;
	465	}
	466
	467	#ifdef XML_NS
	468
	469	static const struct normal_encoding ascii_encoding_ns =
	470	{
	471	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
	472	{
[75]	473	#include "expat\asciitab.h"
[36]	474	/* BT_NONXML == 0 */
	475	},
	476	STANDARD_VTABLE(sb_)
	477	};
	478
	479	#endif
	480
	481	static const struct normal_encoding ascii_encoding =
	482	{
	483	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
	484	{
	485	#define BT_COLON BT_NMSTRT
	486	#include "expat\asciitab.h"
	487	#undef BT_COLON
	488	/* BT_NONXML == 0 */
	489	},
	490	STANDARD_VTABLE(sb_)
	491	};
	492
	493	static int unicode_byte_type(char hi, char lo)
	494	{
	495	switch ((unsigned char)hi)
	496	{
	497	case 0xD8:
	498	case 0xD9:
	499	case 0xDA:
	500	case 0xDB:
	501	return BT_LEAD4;
	502	case 0xDC:
	503	case 0xDD:
	504	case 0xDE:
	505	case 0xDF:
	506	return BT_TRAIL;
	507	case 0xFF:
	508	switch ((unsigned char)lo)
	509	{
	510	case 0xFF:
	511	case 0xFE:
	512	return BT_NONXML;
	513	}
	514	break;
	515	}
	516	return BT_NONASCII;
	517	}
	518
	519	#define DEFINE_UTF16_TO_UTF8(E) \
[98]	520	static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
[36]	521	const char *fromP, const char fromLim, \
	522	char *toP, const char toLim) \
	523	{ \
	524	const char *from; \
	525	for (from = *fromP; from != fromLim; from += 2) { \
	526	int plane; \
	527	unsigned char lo2; \
	528	unsigned char lo = GET_LO(from); \
	529	unsigned char hi = GET_HI(from); \
	530	switch (hi) { \
	531	case 0: \
	532	if (lo < 0x80) { \
	533	if (*toP == toLim) { \
	534	*fromP = from; \
	535	return; \
	536	} \
	537	(toP)++ = lo; \
	538	break; \
	539	} \
	540	/* fall through */ \
	541	case 0x1: case 0x2: case 0x3: \
	542	case 0x4: case 0x5: case 0x6: case 0x7: \
	543	if (toLim - *toP < 2) { \
	544	*fromP = from; \
	545	return; \
	546	} \
	547	(toP)++ = ((lo >> 6) \| (hi << 2) \| UTF8_cval2); \
	548	(toP)++ = ((lo & 0x3f) \| 0x80); \
	549	break; \
	550	default: \
	551	if (toLim - *toP < 3) { \
	552	*fromP = from; \
	553	return; \
	554	} \
	555	/* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
	556	(toP)++ = ((hi >> 4) \| UTF8_cval3); \
	557	(toP)++ = (((hi & 0xf) << 2) \| (lo >> 6) \| 0x80); \
	558	(toP)++ = ((lo & 0x3f) \| 0x80); \
	559	break; \
	560	case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
	561	if (toLim - *toP < 4) { \
	562	*fromP = from; \
	563	return; \
	564	} \
	565	plane = (((hi & 0x3) << 2) \| ((lo >> 6) & 0x3)) + 1; \
	566	(toP)++ = ((plane >> 2) \| UTF8_cval4); \
	567	(toP)++ = (((lo >> 2) & 0xF) \| ((plane & 0x3) << 4) \| 0x80); \
	568	from += 2; \
	569	lo2 = GET_LO(from); \
	570	(toP)++ = (((lo & 0x3) << 4) \
	571	\| ((GET_HI(from) & 0x3) << 2) \
	572	\| (lo2 >> 6) \
	573	\| 0x80); \
	574	(toP)++ = ((lo2 & 0x3f) \| 0x80); \
	575	break; \
	576	} \
	577	} \
	578	*fromP = from; \
	579	}
	580
	581	#define DEFINE_UTF16_TO_UTF16(E) \
[98]	582	static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
[36]	583	const char *fromP, const char fromLim, \
	584	unsigned short *toP, const unsigned short toLim) \
	585	{ \
	586	/* Avoid copying first half only of surrogate */ \
	587	if (fromLim - fromP > ((toLim - toP) << 1) \
	588	&& (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
	589	fromLim -= 2; \
	590	for (; fromP != fromLim && toP != toLim; *fromP += 2) \
	591	(toP)++ = (GET_HI(fromP) << 8) \| GET_LO(fromP); \
	592	}
	593
	594	#define SET2(ptr, ch) \
	595	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
	596	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
	597	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
	598
	599	DEFINE_UTF16_TO_UTF8(little2_)
	600	DEFINE_UTF16_TO_UTF16(little2_)
	601
	602	#undef SET2
	603	#undef GET_LO
	604	#undef GET_HI
	605
	606	#define SET2(ptr, ch) \
	607	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
	608	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
	609	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
	610
	611	DEFINE_UTF16_TO_UTF8(big2_)
	612	DEFINE_UTF16_TO_UTF16(big2_)
	613
	614	#undef SET2
	615	#undef GET_LO
	616	#undef GET_HI
	617
	618	#define LITTLE2_BYTE_TYPE(enc, p) \
	619	((p)[1] == 0 \
	620	? ((struct normal_encoding )(enc))->type[(unsigned char)(p)] \
	621	: unicode_byte_type((p)[1], (p)[0]))
	622	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
	623	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
	624	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
	625	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
	626	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
	627	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
	628
	629	#ifdef XML_MIN_SIZE
[98]	630	static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
[97]	631	{
	632	return LITTLE2_BYTE_TYPE(enc, p);
	633	}
[36]	634
[98]	635	static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
[97]	636	{
	637	return LITTLE2_BYTE_TO_ASCII(enc, p);
	638	}
[36]	639
[98]	640	static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
[97]	641	{
	642	return LITTLE2_CHAR_MATCHES(enc, p, c);
	643	}
[36]	644
[98]	645	static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
[97]	646	{
	647	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
	648	}
[36]	649
[98]	650	static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
[97]	651	{
	652	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
	653	}
[36]	654
[97]	655	#undef VTABLE
	656	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
[36]	657
	658	#else /* not XML_MIN_SIZE */
	659
[97]	660	#undef PREFIX
	661	#define PREFIX(ident) little2_ ## ident
	662	#define MINBPC(enc) 2
	663	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
	664	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
	665	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
	666	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
	667	#define IS_NAME_CHAR(enc, p, n) 0
	668	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
	669	#define IS_NMSTRT_CHAR(enc, p, n) (0)
	670	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
[36]	671
[97]	672	#include "xmltok_impl.c"
[36]	673
[97]	674	#undef MINBPC
	675	#undef BYTE_TYPE
	676	#undef BYTE_TO_ASCII
	677	#undef CHAR_MATCHES
	678	#undef IS_NAME_CHAR
	679	#undef IS_NAME_CHAR_MINBPC
	680	#undef IS_NMSTRT_CHAR
	681	#undef IS_NMSTRT_CHAR_MINBPC
	682	#undef IS_INVALID_CHAR
[36]	683
	684	#endif /* not XML_MIN_SIZE */
	685
	686	#ifdef XML_NS
	687
[97]	688	static const struct normal_encoding little2_encoding_ns =
	689	{
	690	{VTABLE, 2, 0,
	691	#if XML_BYTE_ORDER == 12
	692	1
	693	#else
	694	0
	695	#endif
	696	},
[36]	697	{
[97]	698	#include "expat\asciitab.h"
	699	#include "expat\latin1tab.h"
	700	},
	701	STANDARD_VTABLE(little2_)
	702	};
[36]	703
	704	#endif
	705
	706	static const struct normal_encoding little2_encoding =
	707	{
	708	{VTABLE, 2, 0,
	709	#if XML_BYTE_ORDER == 12
	710	1
	711	#else
	712	0
	713	#endif
	714	},
	715	{
	716	#define BT_COLON BT_NMSTRT
	717	#include "expat\asciitab.h"
	718	#undef BT_COLON
	719	#include "expat\latin1tab.h"
	720	},
	721	STANDARD_VTABLE(little2_)
	722	};
	723
	724	#if XML_BYTE_ORDER != 21
	725
	726	#ifdef XML_NS
	727
	728	static const struct normal_encoding internal_little2_encoding_ns =
	729	{
	730	{VTABLE, 2, 0, 1},
	731	{
[75]	732	#include "expat\iasciitab.h"
	733	#include "expat\latin1tab.h"
[36]	734	},
	735	STANDARD_VTABLE(little2_)
	736	};
	737
	738	#endif
	739
	740	static const struct normal_encoding internal_little2_encoding =
	741	{
	742	{VTABLE, 2, 0, 1},
	743	{
	744	#define BT_COLON BT_NMSTRT
	745	#include "expat\iasciitab.h"
	746	#undef BT_COLON
	747	#include "expat\latin1tab.h"
	748	},
	749	STANDARD_VTABLE(little2_)
	750	};
	751
	752	#endif
	753
	754
	755	#define BIG2_BYTE_TYPE(enc, p) \
	756	((p)[0] == 0 \
	757	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
	758	: unicode_byte_type((p)[0], (p)[1]))
	759	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
	760	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
	761	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
	762	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
	763	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
	764	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
	765
	766	#ifdef XML_MIN_SIZE
	767
[98]	768	static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
[97]	769	{
	770	return BIG2_BYTE_TYPE(enc, p);
	771	}
[36]	772
[98]	773	static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
[97]	774	{
	775	return BIG2_BYTE_TO_ASCII(enc, p);
	776	}
[36]	777
[98]	778	static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
[97]	779	{
	780	return BIG2_CHAR_MATCHES(enc, p, c);
	781	}
[36]	782
[98]	783	static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
[97]	784	{
	785	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
	786	}
[36]	787
[98]	788	static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
[97]	789	{
	790	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
	791	}
[36]	792
[97]	793	#undef VTABLE
	794	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
[36]	795
	796	#else /* not XML_MIN_SIZE */
	797
[97]	798	#undef PREFIX
	799	#define PREFIX(ident) big2_ ## ident
	800	#define MINBPC(enc) 2
	801	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
	802	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
	803	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
	804	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
	805	#define IS_NAME_CHAR(enc, p, n) 0
	806	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
	807	#define IS_NMSTRT_CHAR(enc, p, n) (0)
	808	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
[36]	809
[97]	810	#include "xmltok_impl.c"
[36]	811
[97]	812	#undef MINBPC
	813	#undef BYTE_TYPE
	814	#undef BYTE_TO_ASCII
	815	#undef CHAR_MATCHES
	816	#undef IS_NAME_CHAR
	817	#undef IS_NAME_CHAR_MINBPC
	818	#undef IS_NMSTRT_CHAR
	819	#undef IS_NMSTRT_CHAR_MINBPC
	820	#undef IS_INVALID_CHAR
[36]	821
	822	#endif /* not XML_MIN_SIZE */
	823
	824	#ifdef XML_NS
	825
	826	static const struct normal_encoding big2_encoding_ns =
	827	{
	828	{VTABLE, 2, 0,
	829	#if XML_BYTE_ORDER == 21
	830	1
	831	#else
	832	0
	833	#endif
	834	},
	835	{
[75]	836	#include "expat\asciitab.h"
	837	#include "expat\latin1tab.h"
[36]	838	},
	839	STANDARD_VTABLE(big2_)
	840	};
	841
	842	#endif
	843
	844	static const struct normal_encoding big2_encoding =
	845	{
	846	{VTABLE, 2, 0,
	847	#if XML_BYTE_ORDER == 21
	848	1
	849	#else
	850	0
	851	#endif
	852	},
	853	{
	854	#define BT_COLON BT_NMSTRT
	855	#include "expat\asciitab.h"
	856	#undef BT_COLON
	857	#include "expat\latin1tab.h"
	858	},
	859	STANDARD_VTABLE(big2_)
	860	};
	861
	862	#if XML_BYTE_ORDER != 12
	863
	864	#ifdef XML_NS
	865
	866	static const struct normal_encoding internal_big2_encoding_ns =
	867	{
	868	{VTABLE, 2, 0, 1},
	869	{
[75]	870	#include "expat\iasciitab.h"
	871	#include "expat\latin1tab.h"
[36]	872	},
	873	STANDARD_VTABLE(big2_)
	874	};
	875
	876	#endif
	877
	878	static const struct normal_encoding internal_big2_encoding =
	879	{
	880	{VTABLE, 2, 0, 1},
	881	{
	882	#define BT_COLON BT_NMSTRT
	883	#include "expat\iasciitab.h"
	884	#undef BT_COLON
	885	#include "expat\latin1tab.h"
	886	},
	887	STANDARD_VTABLE(big2_)
	888	};
	889
	890	#endif
	891
	892	#undef PREFIX
	893
[97]	894	static int streqci(const char s1, const char s2)
[36]	895	{
	896	for (;;)
	897	{
	898	char c1 = *s1++;
	899	char c2 = *s2++;
	900
	901	if (ASCII_a <= c1 && c1 <= ASCII_z)
	902	c1 += ASCII_A - ASCII_a;
	903	if (ASCII_a <= c2 && c2 <= ASCII_z)
	904	c2 += ASCII_A - ASCII_a;
	905	if (c1 != c2)
	906	return 0;
	907	if (!c1)
	908	break;
	909	}
	910	return 1;
	911	}
	912
[147]	913	static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
	914	const char *ptr,
	915	const char *end,
	916	POSITION * pos)
[36]	917	{
	918	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
	919	}
	920
[147]	921	static int EXPATENTRY toAscii(const ENCODING * enc,
	922	const char *ptr,
	923	const char *end)
[36]	924	{
	925	char buf[1];
	926	char *p = buf;
	927
	928	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
	929	if (p == buf)
	930	return -1;
	931	else
	932	return buf[0];
	933	}
	934
[98]	935	static int EXPATENTRY isSpace(int c)
[36]	936	{
	937	switch (c)
	938	{
	939	case 0x20:
	940	case 0xD:
	941	case 0xA:
	942	case 0x9:
	943	return 1;
	944	}
	945	return 0;
	946	}
	947
	948	/* Return 1 if there's just optional white space
	949	* or there's an S followed by name=val. */
[98]	950	static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
[147]	951	const char *ptr,
	952	const char *end,
	953	const char **namePtr,
	954	const char **nameEndPtr,
	955	const char **valPtr,
	956	const char **nextTokPtr)
[36]	957	{
	958	int c;
	959	char open;
	960
	961	if (ptr == end)
	962	{
	963	*namePtr = 0;
	964	return 1;
	965	}
	966	if (!isSpace(toAscii(enc, ptr, end)))
	967	{
	968	*nextTokPtr = ptr;
	969	return 0;
	970	}
	971	do
	972	{
	973	ptr += enc->minBytesPerChar;
	974	}
	975	while (isSpace(toAscii(enc, ptr, end)));
	976	if (ptr == end)
	977	{
	978	*namePtr = 0;
	979	return 1;
	980	}
	981	*namePtr = ptr;
	982	for (;;)
	983	{
	984	c = toAscii(enc, ptr, end);
	985	if (c == -1)
	986	{
	987	*nextTokPtr = ptr;
	988	return 0;
	989	}
	990	if (c == ASCII_EQUALS)
	991	{
	992	*nameEndPtr = ptr;
	993	break;
	994	}
	995	if (isSpace(c))
	996	{
	997	*nameEndPtr = ptr;
	998	do
	999	{
	1000	ptr += enc->minBytesPerChar;
	1001	}
	1002	while (isSpace(c = toAscii(enc, ptr, end)));
	1003	if (c != ASCII_EQUALS)
	1004	{
	1005	*nextTokPtr = ptr;
	1006	return 0;
	1007	}
	1008	break;
	1009	}
	1010	ptr += enc->minBytesPerChar;
	1011	}
	1012	if (ptr == *namePtr)
	1013	{
	1014	*nextTokPtr = ptr;
	1015	return 0;
	1016	}
	1017	ptr += enc->minBytesPerChar;
	1018	c = toAscii(enc, ptr, end);
	1019	while (isSpace(c))
	1020	{
	1021	ptr += enc->minBytesPerChar;
	1022	c = toAscii(enc, ptr, end);
	1023	}
	1024	if (c != ASCII_QUOT && c != ASCII_APOS)
	1025	{
	1026	*nextTokPtr = ptr;
	1027	return 0;
	1028	}
	1029	open = c;
	1030	ptr += enc->minBytesPerChar;
	1031	*valPtr = ptr;
	1032	for (;; ptr += enc->minBytesPerChar)
	1033	{
	1034	c = toAscii(enc, ptr, end);
	1035	if (c == open)
	1036	break;
	1037	if (!(ASCII_a <= c && c <= ASCII_z)
	1038	&& !(ASCII_A <= c && c <= ASCII_Z)
	1039	&& !(ASCII_0 <= c && c <= ASCII_9)
	1040	&& c != ASCII_PERIOD
	1041	&& c != ASCII_MINUS
	1042	&& c != ASCII_UNDERSCORE)
	1043	{
	1044	*nextTokPtr = ptr;
	1045	return 0;
	1046	}
	1047	}
	1048	*nextTokPtr = ptr + enc->minBytesPerChar;
	1049	return 1;
	1050	}
	1051
	1052	static const char KW_version[] =
	1053	{
	1054	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
	1055	};
	1056
	1057	static const char KW_encoding[] =
	1058	{
	1059	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
	1060	};
	1061
	1062	static const char KW_standalone[] =
	1063	{
	1064	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
	1065	};
	1066
	1067	static const char KW_yes[] =
	1068	{
	1069	ASCII_y, ASCII_e, ASCII_s, '\0'
	1070	};
	1071
	1072	static const char KW_no[] =
	1073	{
	1074	ASCII_n, ASCII_o, '\0'
	1075	};
	1076
[97]	1077	static int doParseXmlDecl(const ENCODING * (encodingFinder) (const ENCODING ,
	1078	const char *,
	1079	const char *),
[36]	1080	int isGeneralTextEntity,
	1081	const ENCODING * enc,
	1082	const char *ptr,
	1083	const char *end,
	1084	const char **badPtr,
	1085	const char **versionPtr,
	1086	const char **versionEndPtr,
	1087	const char **encodingName,
	1088	const ENCODING ** encoding,
	1089	int *standalone)
	1090	{
	1091	const char *val = 0;
	1092	const char *name = 0;
	1093	const char *nameEnd = 0;
	1094
	1095	ptr += 5 * enc->minBytesPerChar;
	1096	end -= 2 * enc->minBytesPerChar;
	1097	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) \|\| !name)
	1098	{
	1099	*badPtr = ptr;
	1100	return 0;
	1101	}
	1102	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
	1103	{
	1104	if (!isGeneralTextEntity)
	1105	{
	1106	*badPtr = name;
	1107	return 0;
	1108	}
	1109	}
	1110	else
	1111	{
	1112	if (versionPtr)
	1113	*versionPtr = val;
	1114	if (versionEndPtr)
	1115	*versionEndPtr = ptr;
	1116	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
	1117	{
	1118	*badPtr = ptr;
	1119	return 0;
	1120	}
	1121	if (!name)
	1122	{
	1123	if (isGeneralTextEntity)
	1124	{
	1125	/* a TextDecl must have an EncodingDecl */
	1126	*badPtr = ptr;
	1127	return 0;
	1128	}
	1129	return 1;
	1130	}
	1131	}
	1132	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
	1133	{
	1134	int c = toAscii(enc, val, end);
	1135
	1136	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
	1137	{
	1138	*badPtr = val;
	1139	return 0;
	1140	}
	1141	if (encodingName)
	1142	*encodingName = val;
	1143	if (encoding)
	1144	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
	1145	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
	1146	{
	1147	*badPtr = ptr;
	1148	return 0;
	1149	}
	1150	if (!name)
	1151	return 1;
	1152	}
	1153	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) \|\| isGeneralTextEntity)
	1154	{
	1155	*badPtr = name;
	1156	return 0;
	1157	}
	1158	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
	1159	{
	1160	if (standalone)
	1161	*standalone = 1;
	1162	}
	1163	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
	1164	{
	1165	if (standalone)
	1166	*standalone = 0;
	1167	}
	1168	else
	1169	{
	1170	*badPtr = val;
	1171	return 0;
	1172	}
	1173	while (isSpace(toAscii(enc, ptr, end)))
	1174	ptr += enc->minBytesPerChar;
	1175	if (ptr != end)
	1176	{
	1177	*badPtr = ptr;
	1178	return 0;
	1179	}
	1180	return 1;
	1181	}
	1182
	1183	static int checkCharRefNumber(int result)
	1184	{
	1185	switch (result >> 8)
	1186	{
	1187	case 0xD8:
	1188	case 0xD9:
	1189	case 0xDA:
	1190	case 0xDB:
	1191	case 0xDC:
	1192	case 0xDD:
	1193	case 0xDE:
	1194	case 0xDF:
	1195	return -1;
	1196	case 0:
	1197	if (latin1_encoding.type[result] == BT_NONXML)
	1198	return -1;
	1199	break;
	1200	case 0xFF:
	1201	if (result == 0xFFFE \|\| result == 0xFFFF)
	1202	return -1;
	1203	break;
	1204	}
	1205	return result;
	1206	}
	1207
	1208	int XmlUtf8Encode(int c, char *buf)
	1209	{
	1210	enum
	1211	{
	1212	/* minN is minimum legal resulting value for N byte sequence */
	1213	min2 = 0x80,
	1214	min3 = 0x800,
	1215	min4 = 0x10000
	1216	};
	1217
	1218	if (c < 0)
	1219	return 0;
	1220	if (c < min2)
	1221	{
	1222	buf[0] = (c \| UTF8_cval1);
	1223	return 1;
	1224	}
	1225	if (c < min3)
	1226	{
	1227	buf[0] = ((c >> 6) \| UTF8_cval2);
	1228	buf[1] = ((c & 0x3f) \| 0x80);
	1229	return 2;
	1230	}
	1231	if (c < min4)
	1232	{
	1233	buf[0] = ((c >> 12) \| UTF8_cval3);
	1234	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
	1235	buf[2] = ((c & 0x3f) \| 0x80);
	1236	return 3;
	1237	}
	1238	if (c < 0x110000)
	1239	{
	1240	buf[0] = ((c >> 18) \| UTF8_cval4);
	1241	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
	1242	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
	1243	buf[3] = ((c & 0x3f) \| 0x80);
	1244	return 4;
	1245	}
	1246	return 0;
	1247	}
	1248
	1249	int XmlUtf16Encode(int charNum, unsigned short *buf)
	1250	{
	1251	if (charNum < 0)
	1252	return 0;
	1253	if (charNum < 0x10000)
	1254	{
	1255	buf[0] = charNum;
	1256	return 1;
	1257	}
	1258	if (charNum < 0x110000)
	1259	{
	1260	charNum -= 0x10000;
	1261	buf[0] = (charNum >> 10) + 0xD800;
	1262	buf[1] = (charNum & 0x3FF) + 0xDC00;
	1263	return 2;
	1264	}
	1265	return 0;
	1266	}
	1267
	1268	struct unknown_encoding
	1269	{
	1270	struct normal_encoding normal;
	1271	int (convert) (void userData, const char *p);
	1272	void *userData;
	1273	unsigned short utf16[256];
	1274	char utf8[256][4];
	1275	};
	1276
[97]	1277	int XmlSizeOfUnknownEncoding(void)
[36]	1278	{
	1279	return sizeof(struct unknown_encoding);
	1280	}
	1281
[98]	1282	static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
[36]	1283	{
	1284	int c = ((const struct unknown_encoding *)enc)
	1285	->convert(((const struct unknown_encoding *)enc)->userData, p);
	1286
	1287	if (c & ~0xFFFF)
	1288	return 0;
	1289	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
	1290	}
	1291
[98]	1292	static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
[36]	1293	{
	1294	int c = ((const struct unknown_encoding *)enc)
	1295	->convert(((const struct unknown_encoding *)enc)->userData, p);
	1296
	1297	if (c & ~0xFFFF)
	1298	return 0;
	1299	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
	1300	}
	1301
[98]	1302	static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
[36]	1303	{
	1304	int c = ((const struct unknown_encoding *)enc)
	1305	->convert(((const struct unknown_encoding *)enc)->userData, p);
	1306
	1307	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
	1308	}
	1309
[98]	1310	static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
[147]	1311	const char **fromP,
	1312	const char *fromLim,
	1313	char **toP,
	1314	const char *toLim)
[36]	1315	{
	1316	char buf[XML_UTF8_ENCODE_MAX];
	1317
	1318	for (;;)
	1319	{
	1320	const char *utf8;
	1321	int n;
	1322
	1323	if (*fromP == fromLim)
	1324	break;
	1325	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
	1326	n = *utf8++;
	1327	if (n == 0)
	1328	{
	1329	int c = ((const struct unknown_encoding *)enc)
	1330	->convert(((const struct unknown_encoding )enc)->userData, fromP);
	1331
	1332	n = XmlUtf8Encode(c, buf);
	1333	if (n > toLim - *toP)
	1334	break;
	1335	utf8 = buf;
	1336	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
	1337	- (BT_LEAD2 - 2);
	1338	}
	1339	else
	1340	{
	1341	if (n > toLim - *toP)
	1342	break;
	1343	(*fromP)++;
	1344	}
	1345	do
	1346	{
	1347	(toP)++ = *utf8++;
	1348	}
	1349	while (--n != 0);
	1350	}
	1351	}
	1352
[98]	1353	static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
[147]	1354	const char **fromP,
	1355	const char *fromLim,
	1356	unsigned short **toP,
	1357	const unsigned short *toLim)
[36]	1358	{
	1359	while (fromP != fromLim && toP != toLim)
	1360	{
	1361	unsigned short c
	1362	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
	1363
	1364	if (c == 0)
	1365	{
	1366	c = (unsigned short)((const struct unknown_encoding *)enc)
	1367	->convert(((const struct unknown_encoding )enc)->userData, fromP);
	1368	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
	1369	- (BT_LEAD2 - 2);
	1370	}
	1371	else
	1372	(*fromP)++;
	1373	(toP)++ = c;
	1374	}
	1375	}
	1376
[97]	1377	/*
	1378	*@@ XmlInitUnknownEncoding:
	1379	*
	1380	*@@changed V0.9.14 (2001-08-09) [umoeller]: couple of performance hacks
	1381	*/
	1382
	1383	ENCODING* XmlInitUnknownEncoding(void *mem,
	1384	int *table,
	1385	int (convert) (void userData, const char *p),
	1386	void *userData)
[36]	1387	{
	1388	int i;
[98]	1389	struct unknown_encoding e = (struct unknown_encoding)mem;
[36]	1390
[97]	1391	// gee, isn't this a regular memcpy?!?
	1392	/* for (i = 0;
	1393	i < (int)sizeof(struct normal_encoding);
	1394	i++)
	1395	((char )mem)[i] = ((char )&latin1_encoding)[i]; */
	1396
	1397	// replaced the above with this V0.9.14 (2001-08-09) [umoeller]
	1398	memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
	1399
[36]	1400	for (i = 0; i < 128; i++)
[97]	1401	if ( latin1_encoding.type[i] != BT_OTHER
	1402	&& latin1_encoding.type[i] != BT_NONXML
	1403	&& table[i] != i
	1404	)
[36]	1405	return 0;
[97]	1406
[36]	1407	for (i = 0; i < 256; i++)
	1408	{
	1409	int c = table[i];
	1410
	1411	if (c == -1)
	1412	{
	1413	e->normal.type[i] = BT_MALFORM;
	1414	/* This shouldn't really get used. */
	1415	e->utf16[i] = 0xFFFF;
	1416	e->utf8[i][0] = 1;
	1417	e->utf8[i][1] = 0;
	1418	}
	1419	else if (c < 0)
	1420	{
	1421	if (c < -4)
	1422	return 0;
	1423	e->normal.type[i] = BT_LEAD2 - (c + 2);
	1424	e->utf8[i][0] = 0;
	1425	e->utf16[i] = 0;
	1426	}
	1427	else if (c < 0x80)
	1428	{
[97]	1429	if ( latin1_encoding.type[c] != BT_OTHER
	1430	&& latin1_encoding.type[c] != BT_NONXML
	1431	&& c != i
	1432	)
[36]	1433	return 0;
	1434	e->normal.type[i] = latin1_encoding.type[c];
	1435	e->utf8[i][0] = 1;
	1436	e->utf8[i][1] = (char)c;
	1437	e->utf16[i] = c == 0 ? 0xFFFF : c;
	1438	}
	1439	else if (checkCharRefNumber(c) < 0)
	1440	{
	1441	e->normal.type[i] = BT_NONXML;
	1442	/* This shouldn't really get used. */
	1443	e->utf16[i] = 0xFFFF;
	1444	e->utf8[i][0] = 1;
	1445	e->utf8[i][1] = 0;
	1446	}
	1447	else
	1448	{
	1449	if (c > 0xFFFF)
	1450	return 0;
	1451	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
	1452	e->normal.type[i] = BT_NMSTRT;
	1453	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
	1454	e->normal.type[i] = BT_NAME;
	1455	else
	1456	e->normal.type[i] = BT_OTHER;
	1457	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
	1458	e->utf16[i] = c;
	1459	}
	1460	}
	1461	e->userData = userData;
	1462	e->convert = convert;
	1463	if (convert)
	1464	{
	1465	e->normal.isName2 = unknown_isName;
	1466	e->normal.isName3 = unknown_isName;
	1467	e->normal.isName4 = unknown_isName;
	1468	e->normal.isNmstrt2 = unknown_isNmstrt;
	1469	e->normal.isNmstrt3 = unknown_isNmstrt;
	1470	e->normal.isNmstrt4 = unknown_isNmstrt;
	1471	e->normal.isInvalid2 = unknown_isInvalid;
	1472	e->normal.isInvalid3 = unknown_isInvalid;
	1473	e->normal.isInvalid4 = unknown_isInvalid;
	1474	}
	1475	e->normal.enc.utf8Convert = unknown_toUtf8;
	1476	e->normal.enc.utf16Convert = unknown_toUtf16;
	1477	return &(e->normal.enc);
	1478	}
	1479
	1480	/* If this enumeration is changed, getEncodingIndex and encodings
	1481	* must also be changed. */
	1482	enum
	1483	{
	1484	UNKNOWN_ENC = -1,
	1485	ISO_8859_1_ENC = 0,
	1486	US_ASCII_ENC,
	1487	UTF_8_ENC,
	1488	UTF_16_ENC,
	1489	UTF_16BE_ENC,
	1490	UTF_16LE_ENC,
	1491	/* must match encodingNames up to here */
	1492	NO_ENC
	1493	};
	1494
	1495	static const char KW_ISO_8859_1[] =
	1496	{
	1497	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
	1498	};
	1499	static const char KW_US_ASCII[] =
	1500	{
	1501	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
	1502	};
	1503	static const char KW_UTF_8[] =
	1504	{
	1505	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
	1506	};
	1507	static const char KW_UTF_16[] =
	1508	{
	1509	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
	1510	};
	1511	static const char KW_UTF_16BE[] =
	1512	{
	1513	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
	1514	};
	1515	static const char KW_UTF_16LE[] =
	1516	{
	1517	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
	1518	};
	1519
	1520	static int getEncodingIndex(const char *name)
	1521	{
	1522	static const char *encodingNames[] =
	1523	{
	1524	KW_ISO_8859_1,
	1525	KW_US_ASCII,
	1526	KW_UTF_8,
	1527	KW_UTF_16,
	1528	KW_UTF_16BE,
	1529	KW_UTF_16LE,
	1530	};
	1531	int i;
	1532
	1533	if (name == 0)
	1534	return NO_ENC;
[97]	1535	for (i = 0;
	1536	i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0]));
	1537	i++)
[36]	1538	if (streqci(name, encodingNames[i]))
	1539	return i;
	1540	return UNKNOWN_ENC;
	1541	}
	1542
	1543	/* For binary compatibility, we store the index of the encoding specified
	1544	* at initialization in the isUtf16 member. */
	1545
	1546	#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
	1547	#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
	1548
	1549	/* This is what detects the encoding.
	1550	* encodingTable maps from encoding indices to encodings;
	1551	* INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
	1552	* state is XML_CONTENT_STATE if we're parsing an external text entity,
	1553	* and XML_PROLOG_STATE otherwise.
	1554	*/
	1555
	1556
[98]	1557	static int EXPATENTRY initScan(const ENCODING ** encodingTable,
[147]	1558	const INIT_ENCODING * enc,
	1559	int state,
	1560	const char *ptr,
	1561	const char *end,
	1562	const char **nextTokPtr)
[36]	1563	{
	1564	const ENCODING **encPtr;
	1565
	1566	if (ptr == end)
	1567	return XML_TOK_NONE;
	1568	encPtr = enc->encPtr;
	1569	if (ptr + 1 == end)
	1570	{
	1571	/* only a single byte available for auto-detection */
	1572	#ifndef XML_DTD /* FIXME */
	1573	/* a well-formed document entity must have more than one byte */
	1574	if (state != XML_CONTENT_STATE)
	1575	return XML_TOK_PARTIAL;
	1576	#endif
	1577	/* so we're parsing an external text entity... */
	1578	/* if UTF-16 was externally specified, then we need at least 2 bytes */
	1579	switch (INIT_ENC_INDEX(enc))
	1580	{
	1581	case UTF_16_ENC:
	1582	case UTF_16LE_ENC:
	1583	case UTF_16BE_ENC:
	1584	return XML_TOK_PARTIAL;
	1585	}
	1586	switch ((unsigned char)*ptr)
	1587	{
	1588	case 0xFE:
	1589	case 0xFF:
	1590	case 0xEF: /* possibly first byte of UTF-8 BOM */
	1591	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1592	&& state == XML_CONTENT_STATE)
	1593	break;
	1594	/* fall through */
	1595	case 0x00:
	1596	case 0x3C:
	1597	return XML_TOK_PARTIAL;
	1598	}
	1599	}
	1600	else
	1601	{
	1602	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1])
	1603	{
	1604	case 0xFEFF:
	1605	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1606	&& state == XML_CONTENT_STATE)
	1607	break;
	1608	*nextTokPtr = ptr + 2;
	1609	*encPtr = encodingTable[UTF_16BE_ENC];
	1610	return XML_TOK_BOM;
	1611	/* 00 3C is handled in the default case */
	1612	case 0x3C00:
	1613	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
	1614	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
	1615	&& state == XML_CONTENT_STATE)
	1616	break;
	1617	*encPtr = encodingTable[UTF_16LE_ENC];
	1618	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1619	case 0xFFFE:
	1620	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1621	&& state == XML_CONTENT_STATE)
	1622	break;
	1623	*nextTokPtr = ptr + 2;
	1624	*encPtr = encodingTable[UTF_16LE_ENC];
	1625	return XML_TOK_BOM;
	1626	case 0xEFBB:
	1627	/* Maybe a UTF-8 BOM (EF BB BF) */
	1628	/* If there's an explicitly specified (external) encoding
	1629	* of ISO-8859-1 or some flavour of UTF-16
	1630	* and this is an external text entity,
	1631	* don't look for the BOM,
	1632	* because it might be a legal data. */
	1633	if (state == XML_CONTENT_STATE)
	1634	{
	1635	int e = INIT_ENC_INDEX(enc);
	1636
	1637	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
	1638	break;
	1639	}
	1640	if (ptr + 2 == end)
	1641	return XML_TOK_PARTIAL;
	1642	if ((unsigned char)ptr[2] == 0xBF)
	1643	{
[97]	1644	*nextTokPtr = ptr + 3;
[36]	1645	*encPtr = encodingTable[UTF_8_ENC];
	1646	return XML_TOK_BOM;
	1647	}
	1648	break;
	1649	default:
	1650	if (ptr[0] == '\0')
	1651	{
	1652	/* 0 isn't a legal data character. Furthermore a document entity can only
	1653	* start with ASCII characters. So the only way this can fail to be big-endian
	1654	* UTF-16 if it it's an external parsed general entity that's labelled as
	1655	* UTF-16LE. */
	1656	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
	1657	break;
	1658	*encPtr = encodingTable[UTF_16BE_ENC];
	1659	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1660	}
	1661	else if (ptr[1] == '\0')
	1662	{
	1663	/* We could recover here in the case:
	1664	* - parsing an external entity
	1665	* - second byte is 0
	1666	* - no externally specified encoding
	1667	* - no encoding declaration
	1668	* by assuming UTF-16LE. But we don't, because this would mean when
	1669	* presented just with a single byte, we couldn't reliably determine
	1670	* whether we needed further bytes. */
	1671	if (state == XML_CONTENT_STATE)
	1672	break;
	1673	*encPtr = encodingTable[UTF_16LE_ENC];
	1674	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1675	}
	1676	break;
	1677	}
	1678	}
	1679	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
	1680	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1681	}
	1682
	1683
	1684	#define NS(x) x
	1685	#define ns(x) x
	1686	#include "xmltok_ns.c"
	1687	#undef NS
	1688	#undef ns
	1689
	1690	#ifdef XML_NS
	1691
	1692	#define NS(x) x ## NS
	1693	#define ns(x) x ## _ns
	1694
	1695	#include "xmltok_ns.c"
	1696
	1697	#undef NS
	1698	#undef ns
	1699
	1700	ENCODING * XmlInitUnknownEncodingNS(void *mem,
	1701	int *table,
[98]	1702	int (* EXPATENTRY convert) (void userData, const char p),
[36]	1703	void *userData)
	1704	{
	1705	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
	1706
	1707	if (enc)
	1708	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
	1709	return enc;
	1710	}
	1711
	1712	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/helpers/xmltok.c

Download in other formats: