Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

xmltok.c

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 40.4 KB

Rev	Line
[2]	1	/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
	2	See the file COPYING for copying permission.
	3	*/
	4
[391]	5	#include <stddef.h>
	6
[2]	7	#ifdef COMPILED_FROM_DSP
	8	#include "winconfig.h"
	9	#elif defined(MACOS_CLASSIC)
	10	#include "macconfig.h"
[391]	11	#elif defined(__amigaos__)
[2]	12	#include "amigaconfig.h"
[391]	13	#elif defined(__WATCOMC__)
	14	#include "watcomconfig.h"
[2]	15	#else
	16	#ifdef HAVE_EXPAT_CONFIG_H
	17	#include <expat_config.h>
	18	#endif
	19	#endif /* ndef COMPILED_FROM_DSP */
	20
	21	#include "expat_external.h"
	22	#include "internal.h"
	23	#include "xmltok.h"
	24	#include "nametab.h"
	25
	26	#ifdef XML_DTD
	27	#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
	28	#else
	29	#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
	30	#endif
	31
	32	#define VTABLE1 \
	33	{ PREFIX(prologTok), PREFIX(contentTok), \
	34	PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
	35	{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
	36	PREFIX(sameName), \
	37	PREFIX(nameMatchesAscii), \
	38	PREFIX(nameLength), \
	39	PREFIX(skipS), \
	40	PREFIX(getAtts), \
	41	PREFIX(charRefNumber), \
	42	PREFIX(predefinedEntityName), \
	43	PREFIX(updatePosition), \
	44	PREFIX(isPublicId)
	45
	46	#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
	47
	48	#define UCS2_GET_NAMING(pages, hi, lo) \
	49	(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
	50
	51	/* A 2 byte UTF-8 representation splits the characters 11 bits between
	52	the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
	53	pages, 3 bits to add to that index and 5 bits to generate the mask.
	54	*/
	55	#define UTF8_GET_NAMING2(pages, byte) \
	56	(namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
	57	+ ((((byte)[0]) & 3) << 1) \
	58	+ ((((byte)[1]) >> 5) & 1)] \
	59	& (1 << (((byte)[1]) & 0x1F)))
	60
	61	/* A 3 byte UTF-8 representation splits the characters 16 bits between
	62	the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
	63	into pages, 3 bits to add to that index and 5 bits to generate the
	64	mask.
	65	*/
	66	#define UTF8_GET_NAMING3(pages, byte) \
	67	(namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
	68	+ ((((byte)[1]) >> 2) & 0xF)] \
	69	<< 3) \
	70	+ ((((byte)[1]) & 3) << 1) \
	71	+ ((((byte)[2]) >> 5) & 1)] \
	72	& (1 << (((byte)[2]) & 0x1F)))
	73
	74	#define UTF8_GET_NAMING(pages, p, n) \
	75	((n) == 2 \
	76	? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
	77	: ((n) == 3 \
	78	? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
	79	: 0))
	80
	81	/* Detection of invalid UTF-8 sequences is based on Table 3.1B
	82	of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
	83	with the additional restriction of not allowing the Unicode
	84	code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
	85	Implementation details:
	86	(A & 0x80) == 0 means A < 0x80
	87	and
	88	(A & 0xC0) == 0xC0 means A > 0xBF
	89	*/
	90
	91	#define UTF8_INVALID2(p) \
	92	((*p) < 0xC2 \|\| ((p)[1] & 0x80) == 0 \|\| ((p)[1] & 0xC0) == 0xC0)
	93
	94	#define UTF8_INVALID3(p) \
	95	(((p)[2] & 0x80) == 0 \
	96	\|\| \
	97	((*p) == 0xEF && (p)[1] == 0xBF \
	98	? \
	99	(p)[2] > 0xBD \
	100	: \
	101	((p)[2] & 0xC0) == 0xC0) \
	102	\|\| \
	103	((*p) == 0xE0 \
	104	? \
	105	(p)[1] < 0xA0 \|\| ((p)[1] & 0xC0) == 0xC0 \
	106	: \
	107	((p)[1] & 0x80) == 0 \
	108	\|\| \
	109	((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
	110
	111	#define UTF8_INVALID4(p) \
	112	(((p)[3] & 0x80) == 0 \|\| ((p)[3] & 0xC0) == 0xC0 \
	113	\|\| \
	114	((p)[2] & 0x80) == 0 \|\| ((p)[2] & 0xC0) == 0xC0 \
	115	\|\| \
	116	((*p) == 0xF0 \
	117	? \
	118	(p)[1] < 0x90 \|\| ((p)[1] & 0xC0) == 0xC0 \
	119	: \
	120	((p)[1] & 0x80) == 0 \
	121	\|\| \
	122	((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
	123
	124	static int PTRFASTCALL
	125	isNever(const ENCODING enc, const char p)
	126	{
	127	return 0;
	128	}
	129
	130	static int PTRFASTCALL
	131	utf8_isName2(const ENCODING enc, const char p)
	132	{
	133	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
	134	}
	135
	136	static int PTRFASTCALL
	137	utf8_isName3(const ENCODING enc, const char p)
	138	{
	139	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
	140	}
	141
	142	#define utf8_isName4 isNever
	143
	144	static int PTRFASTCALL
	145	utf8_isNmstrt2(const ENCODING enc, const char p)
	146	{
	147	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
	148	}
	149
	150	static int PTRFASTCALL
	151	utf8_isNmstrt3(const ENCODING enc, const char p)
	152	{
	153	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
	154	}
	155
	156	#define utf8_isNmstrt4 isNever
	157
	158	static int PTRFASTCALL
	159	utf8_isInvalid2(const ENCODING enc, const char p)
	160	{
	161	return UTF8_INVALID2((const unsigned char *)p);
	162	}
	163
	164	static int PTRFASTCALL
	165	utf8_isInvalid3(const ENCODING enc, const char p)
	166	{
	167	return UTF8_INVALID3((const unsigned char *)p);
	168	}
	169
	170	static int PTRFASTCALL
	171	utf8_isInvalid4(const ENCODING enc, const char p)
	172	{
	173	return UTF8_INVALID4((const unsigned char *)p);
	174	}
	175
	176	struct normal_encoding {
	177	ENCODING enc;
	178	unsigned char type[256];
	179	#ifdef XML_MIN_SIZE
	180	int (PTRFASTCALL byteType)(const ENCODING , const char *);
	181	int (PTRFASTCALL isNameMin)(const ENCODING , const char *);
	182	int (PTRFASTCALL isNmstrtMin)(const ENCODING , const char *);
	183	int (PTRFASTCALL byteToAscii)(const ENCODING , const char *);
	184	int (PTRCALL charMatches)(const ENCODING , const char *, int);
	185	#endif /* XML_MIN_SIZE */
	186	int (PTRFASTCALL isName2)(const ENCODING , const char *);
	187	int (PTRFASTCALL isName3)(const ENCODING , const char *);
	188	int (PTRFASTCALL isName4)(const ENCODING , const char *);
	189	int (PTRFASTCALL isNmstrt2)(const ENCODING , const char *);
	190	int (PTRFASTCALL isNmstrt3)(const ENCODING , const char *);
	191	int (PTRFASTCALL isNmstrt4)(const ENCODING , const char *);
	192	int (PTRFASTCALL isInvalid2)(const ENCODING , const char *);
	193	int (PTRFASTCALL isInvalid3)(const ENCODING , const char *);
	194	int (PTRFASTCALL isInvalid4)(const ENCODING , const char *);
	195	};
	196
	197	#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
	198
	199	#ifdef XML_MIN_SIZE
	200
	201	#define STANDARD_VTABLE(E) \
	202	E ## byteType, \
	203	E ## isNameMin, \
	204	E ## isNmstrtMin, \
	205	E ## byteToAscii, \
	206	E ## charMatches,
	207
	208	#else
	209
	210	#define STANDARD_VTABLE(E) /* as nothing */
	211
	212	#endif
	213
	214	#define NORMAL_VTABLE(E) \
	215	E ## isName2, \
	216	E ## isName3, \
	217	E ## isName4, \
	218	E ## isNmstrt2, \
	219	E ## isNmstrt3, \
	220	E ## isNmstrt4, \
	221	E ## isInvalid2, \
	222	E ## isInvalid3, \
	223	E ## isInvalid4
	224
	225	static int FASTCALL checkCharRefNumber(int);
	226
	227	#include "xmltok_impl.h"
	228	#include "ascii.h"
	229
	230	#ifdef XML_MIN_SIZE
	231	#define sb_isNameMin isNever
	232	#define sb_isNmstrtMin isNever
	233	#endif
	234
	235	#ifdef XML_MIN_SIZE
	236	#define MINBPC(enc) ((enc)->minBytesPerChar)
	237	#else
	238	/* minimum bytes per character */
	239	#define MINBPC(enc) 1
	240	#endif
	241
	242	#define SB_BYTE_TYPE(enc, p) \
	243	(((struct normal_encoding )(enc))->type[(unsigned char)(p)])
	244
	245	#ifdef XML_MIN_SIZE
	246	static int PTRFASTCALL
	247	sb_byteType(const ENCODING enc, const char p)
	248	{
	249	return SB_BYTE_TYPE(enc, p);
	250	}
	251	#define BYTE_TYPE(enc, p) \
	252	(AS_NORMAL_ENCODING(enc)->byteType(enc, p))
	253	#else
	254	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
	255	#endif
	256
	257	#ifdef XML_MIN_SIZE
	258	#define BYTE_TO_ASCII(enc, p) \
	259	(AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
	260	static int PTRFASTCALL
	261	sb_byteToAscii(const ENCODING enc, const char p)
	262	{
	263	return *p;
	264	}
	265	#else
	266	#define BYTE_TO_ASCII(enc, p) (*(p))
	267	#endif
	268
	269	#define IS_NAME_CHAR(enc, p, n) \
	270	(AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
	271	#define IS_NMSTRT_CHAR(enc, p, n) \
	272	(AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
	273	#define IS_INVALID_CHAR(enc, p, n) \
	274	(AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
	275
	276	#ifdef XML_MIN_SIZE
	277	#define IS_NAME_CHAR_MINBPC(enc, p) \
	278	(AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
	279	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
	280	(AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
	281	#else
	282	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
	283	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
	284	#endif
	285
	286	#ifdef XML_MIN_SIZE
	287	#define CHAR_MATCHES(enc, p, c) \
	288	(AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
	289	static int PTRCALL
	290	sb_charMatches(const ENCODING enc, const char p, int c)
	291	{
	292	return *p == c;
	293	}
	294	#else
	295	/* c is an ASCII character */
	296	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
	297	#endif
	298
	299	#define PREFIX(ident) normal_ ## ident
[391]	300	#define XML_TOK_IMPL_C
[2]	301	#include "xmltok_impl.c"
[391]	302	#undef XML_TOK_IMPL_C
[2]	303
	304	#undef MINBPC
	305	#undef BYTE_TYPE
	306	#undef BYTE_TO_ASCII
	307	#undef CHAR_MATCHES
	308	#undef IS_NAME_CHAR
	309	#undef IS_NAME_CHAR_MINBPC
	310	#undef IS_NMSTRT_CHAR
	311	#undef IS_NMSTRT_CHAR_MINBPC
	312	#undef IS_INVALID_CHAR
	313
	314	enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
	315	UTF8_cval1 = 0x00,
	316	UTF8_cval2 = 0xc0,
	317	UTF8_cval3 = 0xe0,
	318	UTF8_cval4 = 0xf0
	319	};
	320
	321	static void PTRCALL
	322	utf8_toUtf8(const ENCODING *enc,
	323	const char *fromP, const char fromLim,
	324	char *toP, const char toLim)
	325	{
	326	char *to;
	327	const char *from;
	328	if (fromLim - fromP > toLim - toP) {
	329	/* Avoid copying partial characters. */
	330	for (fromLim = fromP + (toLim - toP); fromLim > *fromP; fromLim--)
	331	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
	332	break;
	333	}
	334	for (to = toP, from = fromP; from != fromLim; from++, to++)
	335	to = from;
	336	*fromP = from;
	337	*toP = to;
	338	}
	339
	340	static void PTRCALL
	341	utf8_toUtf16(const ENCODING *enc,
	342	const char *fromP, const char fromLim,
	343	unsigned short *toP, const unsigned short toLim)
	344	{
	345	unsigned short to = toP;
	346	const char from = fromP;
	347	while (from != fromLim && to != toLim) {
	348	switch (((struct normal_encoding )enc)->type[(unsigned char)from]) {
	349	case BT_LEAD2:
	350	*to++ = (unsigned short)(((from[0] & 0x1f) << 6) \| (from[1] & 0x3f));
	351	from += 2;
	352	break;
	353	case BT_LEAD3:
	354	*to++ = (unsigned short)(((from[0] & 0xf) << 12)
	355	\| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f));
	356	from += 3;
	357	break;
	358	case BT_LEAD4:
	359	{
	360	unsigned long n;
	361	if (to + 1 == toLim)
	362	goto after;
	363	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12)
	364	\| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
	365	n -= 0x10000;
	366	to[0] = (unsigned short)((n >> 10) \| 0xD800);
	367	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
	368	to += 2;
	369	from += 4;
	370	}
	371	break;
	372	default:
	373	to++ = from++;
	374	break;
	375	}
	376	}
	377	after:
	378	*fromP = from;
	379	*toP = to;
	380	}
	381
	382	#ifdef XML_NS
		/* UTF-8 encoding object compiled only when XML_NS is defined.
		   asciitab.h is included as-is, i.e. without the BT_COLON
		   override used by utf8_encoding below. */
	383	static const struct normal_encoding utf8_encoding_ns = {
	384	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	385	{
	386	#include "asciitab.h"
	387	#include "utf8tab.h"
	388	},
	389	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	390	};
	391	#endif
	392
		/* UTF-8 encoding object.  BT_COLON is defined as BT_NMSTRT while
		   asciitab.h is included, so every table entry spelled BT_COLON
		   there is stored as BT_NMSTRT. */
	393	static const struct normal_encoding utf8_encoding = {
	394	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	395	{
	396	#define BT_COLON BT_NMSTRT
	397	#include "asciitab.h"
	398	#undef BT_COLON
	399	#include "utf8tab.h"
	400	},
	401	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	402	};
	403
	404	#ifdef XML_NS
	405
		/* Same shape as utf8_encoding_ns, but the first half of the
		   type table comes from iasciitab.h instead of asciitab.h. */
	406	static const struct normal_encoding internal_utf8_encoding_ns = {
	407	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	408	{
	409	#include "iasciitab.h"
	410	#include "utf8tab.h"
	411	},
	412	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	413	};
	414
	415	#endif
	416
		/* Same shape as utf8_encoding, but the first half of the type
		   table comes from iasciitab.h instead of asciitab.h. */
	417	static const struct normal_encoding internal_utf8_encoding = {
	418	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	419	{
	420	#define BT_COLON BT_NMSTRT
	421	#include "iasciitab.h"
	422	#undef BT_COLON
	423	#include "utf8tab.h"
	424	},
	425	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	426	};
	427
	428	static void PTRCALL
	429	latin1_toUtf8(const ENCODING *enc,
	430	const char *fromP, const char fromLim,
	431	char *toP, const char toLim)
	432	{
	433	for (;;) {
	434	unsigned char c;
	435	if (*fromP == fromLim)
	436	break;
	437	c = (unsigned char)**fromP;
	438	if (c & 0x80) {
	439	if (toLim - *toP < 2)
	440	break;
	441	(toP)++ = (char)((c >> 6) \| UTF8_cval2);
	442	(toP)++ = (char)((c & 0x3f) \| 0x80);
	443	(*fromP)++;
	444	}
	445	else {
	446	if (*toP == toLim)
	447	break;
	448	(toP)++ = (fromP)++;
	449	}
	450	}
	451	}
	452
	453	static void PTRCALL
	454	latin1_toUtf16(const ENCODING *enc,
	455	const char *fromP, const char fromLim,
	456	unsigned short *toP, const unsigned short toLim)
	457	{
	458	while (fromP != fromLim && toP != toLim)
	459	(toP)++ = (unsigned char)(fromP)++;
	460	}
	461
	462	#ifdef XML_NS
	463
		/* Latin-1 encoding object compiled only when XML_NS is defined.
		   Only STANDARD_VTABLE(sb_) is supplied after the type table;
		   no NORMAL_VTABLE entries are given, so the isName, isNmstrt
		   and isInvalid members are left to default initialization. */
	464	static const struct normal_encoding latin1_encoding_ns = {
	465	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
	466	{
	467	#include "asciitab.h"
	468	#include "latin1tab.h"
	469	},
	470	STANDARD_VTABLE(sb_)
	471	};
	472
	473	#endif
	474
		/* Latin-1 encoding object.  BT_COLON is defined as BT_NMSTRT
		   while asciitab.h is included, so every table entry spelled
		   BT_COLON there is stored as BT_NMSTRT. */
	475	static const struct normal_encoding latin1_encoding = {
	476	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
	477	{
	478	#define BT_COLON BT_NMSTRT
	479	#include "asciitab.h"
	480	#undef BT_COLON
	481	#include "latin1tab.h"
	482	},
	483	STANDARD_VTABLE(sb_)
	484	};
	485
	486	static void PTRCALL
	487	ascii_toUtf8(const ENCODING *enc,
	488	const char *fromP, const char fromLim,
	489	char *toP, const char toLim)
	490	{
	491	while (fromP != fromLim && toP != toLim)
	492	(toP)++ = (fromP)++;
	493	}
	494
	495	#ifdef XML_NS
	496
		/* ASCII encoding object compiled only when XML_NS is defined.
		   Only asciitab.h is included into the type table; the entries
		   it does not cover are left zero, which the comment inside the
		   initializer equates with BT_NONXML.  latin1_toUtf16 is reused
		   as the UTF-16 converter. */
	497	static const struct normal_encoding ascii_encoding_ns = {
	498	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
	499	{
	500	#include "asciitab.h"
	501	/* BT_NONXML == 0 */
	502	},
	503	STANDARD_VTABLE(sb_)
	504	};
	505
	506	#endif
	507
		/* ASCII encoding object.  BT_COLON is defined as BT_NMSTRT while
		   asciitab.h is included, so every table entry spelled BT_COLON
		   there is stored as BT_NMSTRT. */
	508	static const struct normal_encoding ascii_encoding = {
	509	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
	510	{
	511	#define BT_COLON BT_NMSTRT
	512	#include "asciitab.h"
	513	#undef BT_COLON
	514	/* BT_NONXML == 0 */
	515	},
	516	STANDARD_VTABLE(sb_)
	517	};
	518
	519	static int PTRFASTCALL
	520	unicode_byte_type(char hi, char lo)
	521	{
	522	switch ((unsigned char)hi) {
	523	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
	524	return BT_LEAD4;
	525	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
	526	return BT_TRAIL;
	527	case 0xFF:
	528	switch ((unsigned char)lo) {
	529	case 0xFF:
	530	case 0xFE:
	531	return BT_NONXML;
	532	}
	533	break;
	534	}
	535	return BT_NONASCII;
	536	}
	537
	538	#define DEFINE_UTF16_TO_UTF8(E) \
	539	static void PTRCALL \
	540	E ## toUtf8(const ENCODING *enc, \
	541	const char *fromP, const char fromLim, \
	542	char *toP, const char toLim) \
	543	{ \
	544	const char *from; \
	545	for (from = *fromP; from != fromLim; from += 2) { \
	546	int plane; \
	547	unsigned char lo2; \
	548	unsigned char lo = GET_LO(from); \
	549	unsigned char hi = GET_HI(from); \
	550	switch (hi) { \
	551	case 0: \
	552	if (lo < 0x80) { \
	553	if (*toP == toLim) { \
	554	*fromP = from; \
	555	return; \
	556	} \
	557	(toP)++ = lo; \
	558	break; \
	559	} \
	560	/* fall through */ \
	561	case 0x1: case 0x2: case 0x3: \
	562	case 0x4: case 0x5: case 0x6: case 0x7: \
	563	if (toLim - *toP < 2) { \
	564	*fromP = from; \
	565	return; \
	566	} \
	567	(toP)++ = ((lo >> 6) \| (hi << 2) \| UTF8_cval2); \
	568	(toP)++ = ((lo & 0x3f) \| 0x80); \
	569	break; \
	570	default: \
	571	if (toLim - *toP < 3) { \
	572	*fromP = from; \
	573	return; \
	574	} \
	575	/* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
	576	(toP)++ = ((hi >> 4) \| UTF8_cval3); \
	577	(toP)++ = (((hi & 0xf) << 2) \| (lo >> 6) \| 0x80); \
	578	(toP)++ = ((lo & 0x3f) \| 0x80); \
	579	break; \
	580	case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
	581	if (toLim - *toP < 4) { \
	582	*fromP = from; \
	583	return; \
	584	} \
	585	plane = (((hi & 0x3) << 2) \| ((lo >> 6) & 0x3)) + 1; \
	586	(toP)++ = ((plane >> 2) \| UTF8_cval4); \
	587	(toP)++ = (((lo >> 2) & 0xF) \| ((plane & 0x3) << 4) \| 0x80); \
	588	from += 2; \
	589	lo2 = GET_LO(from); \
	590	(toP)++ = (((lo & 0x3) << 4) \
	591	\| ((GET_HI(from) & 0x3) << 2) \
	592	\| (lo2 >> 6) \
	593	\| 0x80); \
	594	(toP)++ = ((lo2 & 0x3f) \| 0x80); \
	595	break; \
	596	} \
	597	} \
	598	*fromP = from; \
	599	}
	600
	601	#define DEFINE_UTF16_TO_UTF16(E) \
	602	static void PTRCALL \
	603	E ## toUtf16(const ENCODING *enc, \
	604	const char *fromP, const char fromLim, \
	605	unsigned short *toP, const unsigned short toLim) \
	606	{ \
	607	/* Avoid copying first half only of surrogate */ \
	608	if (fromLim - fromP > ((toLim - toP) << 1) \
	609	&& (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
	610	fromLim -= 2; \
	611	for (; fromP != fromLim && toP != toLim; *fromP += 2) \
	612	(toP)++ = (GET_HI(fromP) << 8) \| GET_LO(fromP); \
	613	}
	614
	615	#define SET2(ptr, ch) \
	616	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
	617	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
	618	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
	619
	620	DEFINE_UTF16_TO_UTF8(little2_)
	621	DEFINE_UTF16_TO_UTF16(little2_)
	622
	623	#undef SET2
	624	#undef GET_LO
	625	#undef GET_HI
	626
	627	#define SET2(ptr, ch) \
	628	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
	629	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
	630	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
	631
	632	DEFINE_UTF16_TO_UTF8(big2_)
	633	DEFINE_UTF16_TO_UTF16(big2_)
	634
	635	#undef SET2
	636	#undef GET_LO
	637	#undef GET_HI
	638
	639	#define LITTLE2_BYTE_TYPE(enc, p) \
	640	((p)[1] == 0 \
	641	? ((struct normal_encoding )(enc))->type[(unsigned char)(p)] \
	642	: unicode_byte_type((p)[1], (p)[0]))
	643	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
	644	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
	645	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
	646	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
	647	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
	648	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
	649
	650	#ifdef XML_MIN_SIZE
	651
	652	static int PTRFASTCALL
	653	little2_byteType(const ENCODING enc, const char p)
	654	{
	655	return LITTLE2_BYTE_TYPE(enc, p);
	656	}
	657
	658	static int PTRFASTCALL
	659	little2_byteToAscii(const ENCODING enc, const char p)
	660	{
	661	return LITTLE2_BYTE_TO_ASCII(enc, p);
	662	}
	663
	664	static int PTRCALL
	665	little2_charMatches(const ENCODING enc, const char p, int c)
	666	{
	667	return LITTLE2_CHAR_MATCHES(enc, p, c);
	668	}
	669
	670	static int PTRFASTCALL
	671	little2_isNameMin(const ENCODING enc, const char p)
	672	{
	673	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
	674	}
	675
	676	static int PTRFASTCALL
	677	little2_isNmstrtMin(const ENCODING enc, const char p)
	678	{
	679	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
	680	}
	681
	682	#undef VTABLE
	683	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
	684
	685	#else /* not XML_MIN_SIZE */
	686
	687	#undef PREFIX
	688	#define PREFIX(ident) little2_ ## ident
	689	#define MINBPC(enc) 2
	690	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
	691	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
	692	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
	693	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
	694	#define IS_NAME_CHAR(enc, p, n) 0
	695	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
	696	#define IS_NMSTRT_CHAR(enc, p, n) (0)
	697	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
	698
[391]	699	#define XML_TOK_IMPL_C
[2]	700	#include "xmltok_impl.c"
[391]	701	#undef XML_TOK_IMPL_C
[2]	702
	703	#undef MINBPC
	704	#undef BYTE_TYPE
	705	#undef BYTE_TO_ASCII
	706	#undef CHAR_MATCHES
	707	#undef IS_NAME_CHAR
	708	#undef IS_NAME_CHAR_MINBPC
	709	#undef IS_NMSTRT_CHAR
	710	#undef IS_NMSTRT_CHAR_MINBPC
	711	#undef IS_INVALID_CHAR
	712
	713	#endif /* not XML_MIN_SIZE */
	714
	715	#ifdef XML_NS
	716
		/* Little-endian 2-byte encoding object compiled only when XML_NS
		   is defined.  The last ENCODING initializer is 1 when
		   BYTEORDER == 1234 and 0 otherwise. */
	717	static const struct normal_encoding little2_encoding_ns = {
	718	{ VTABLE, 2, 0,
	719	#if BYTEORDER == 1234
	720	1
	721	#else
	722	0
	723	#endif
	724	},
	725	{
	726	#include "asciitab.h"
	727	#include "latin1tab.h"
	728	},
	729	STANDARD_VTABLE(little2_)
	730	};
	731
	732	#endif
	733
		/* Little-endian 2-byte encoding object.  BT_COLON is defined as
		   BT_NMSTRT while asciitab.h is included, so every table entry
		   spelled BT_COLON there is stored as BT_NMSTRT. */
	734	static const struct normal_encoding little2_encoding = {
	735	{ VTABLE, 2, 0,
	736	#if BYTEORDER == 1234
	737	1
	738	#else
	739	0
	740	#endif
	741	},
	742	{
	743	#define BT_COLON BT_NMSTRT
	744	#include "asciitab.h"
	745	#undef BT_COLON
	746	#include "latin1tab.h"
	747	},
	748	STANDARD_VTABLE(little2_)
	749	};
	750
		/* The internal_ variants are omitted when BYTEORDER is 4321. */
	751	#if BYTEORDER != 4321
	752
	753	#ifdef XML_NS
	754
		/* Like little2_encoding_ns, but built from iasciitab.h and with
		   the last ENCODING initializer fixed at 1. */
	755	static const struct normal_encoding internal_little2_encoding_ns = {
	756	{ VTABLE, 2, 0, 1 },
	757	{
	758	#include "iasciitab.h"
	759	#include "latin1tab.h"
	760	},
	761	STANDARD_VTABLE(little2_)
	762	};
	763
	764	#endif
	765
		/* Like little2_encoding, but built from iasciitab.h and with the
		   last ENCODING initializer fixed at 1. */
	766	static const struct normal_encoding internal_little2_encoding = {
	767	{ VTABLE, 2, 0, 1 },
	768	{
	769	#define BT_COLON BT_NMSTRT
	770	#include "iasciitab.h"
	771	#undef BT_COLON
	772	#include "latin1tab.h"
	773	},
	774	STANDARD_VTABLE(little2_)
	775	};
	776
	777	#endif
	778
	779
	780	#define BIG2_BYTE_TYPE(enc, p) \
	781	((p)[0] == 0 \
	782	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
	783	: unicode_byte_type((p)[0], (p)[1]))
	784	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
	785	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
	786	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
	787	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
	788	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
	789	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
	790
	791	#ifdef XML_MIN_SIZE
	792
	793	static int PTRFASTCALL
	794	big2_byteType(const ENCODING enc, const char p)
	795	{
	796	return BIG2_BYTE_TYPE(enc, p);
	797	}
	798
	799	static int PTRFASTCALL
	800	big2_byteToAscii(const ENCODING enc, const char p)
	801	{
	802	return BIG2_BYTE_TO_ASCII(enc, p);
	803	}
	804
	805	static int PTRCALL
	806	big2_charMatches(const ENCODING enc, const char p, int c)
	807	{
	808	return BIG2_CHAR_MATCHES(enc, p, c);
	809	}
	810
	811	static int PTRFASTCALL
	812	big2_isNameMin(const ENCODING enc, const char p)
	813	{
	814	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
	815	}
	816
	817	static int PTRFASTCALL
	818	big2_isNmstrtMin(const ENCODING enc, const char p)
	819	{
	820	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
	821	}
	822
	823	#undef VTABLE
	824	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
	825
	826	#else /* not XML_MIN_SIZE */
	827
	828	#undef PREFIX
	829	#define PREFIX(ident) big2_ ## ident
	830	#define MINBPC(enc) 2
	831	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
	832	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
	833	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
	834	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
	835	#define IS_NAME_CHAR(enc, p, n) 0
	836	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
	837	#define IS_NMSTRT_CHAR(enc, p, n) (0)
	838	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
	839
[391]	840	#define XML_TOK_IMPL_C
[2]	841	#include "xmltok_impl.c"
[391]	842	#undef XML_TOK_IMPL_C
[2]	843
	844	#undef MINBPC
	845	#undef BYTE_TYPE
	846	#undef BYTE_TO_ASCII
	847	#undef CHAR_MATCHES
	848	#undef IS_NAME_CHAR
	849	#undef IS_NAME_CHAR_MINBPC
	850	#undef IS_NMSTRT_CHAR
	851	#undef IS_NMSTRT_CHAR_MINBPC
	852	#undef IS_INVALID_CHAR
	853
	854	#endif /* not XML_MIN_SIZE */
	855
	856	#ifdef XML_NS
	857
		/* Big-endian 2-byte encoding object compiled only when XML_NS is
		   defined.  The last ENCODING initializer is 1 when
		   BYTEORDER == 4321 and 0 otherwise. */
	858	static const struct normal_encoding big2_encoding_ns = {
	859	{ VTABLE, 2, 0,
	860	#if BYTEORDER == 4321
	861	1
	862	#else
	863	0
	864	#endif
	865	},
	866	{
	867	#include "asciitab.h"
	868	#include "latin1tab.h"
	869	},
	870	STANDARD_VTABLE(big2_)
	871	};
	872
	873	#endif
	874
		/* Big-endian 2-byte encoding object.  BT_COLON is defined as
		   BT_NMSTRT while asciitab.h is included, so every table entry
		   spelled BT_COLON there is stored as BT_NMSTRT. */
	875	static const struct normal_encoding big2_encoding = {
	876	{ VTABLE, 2, 0,
	877	#if BYTEORDER == 4321
	878	1
	879	#else
	880	0
	881	#endif
	882	},
	883	{
	884	#define BT_COLON BT_NMSTRT
	885	#include "asciitab.h"
	886	#undef BT_COLON
	887	#include "latin1tab.h"
	888	},
	889	STANDARD_VTABLE(big2_)
	890	};
	891
		/* The internal_ variants are omitted when BYTEORDER is 1234. */
	892	#if BYTEORDER != 1234
	893
	894	#ifdef XML_NS
	895
		/* Like big2_encoding_ns, but built from iasciitab.h and with the
		   last ENCODING initializer fixed at 1. */
	896	static const struct normal_encoding internal_big2_encoding_ns = {
	897	{ VTABLE, 2, 0, 1 },
	898	{
	899	#include "iasciitab.h"
	900	#include "latin1tab.h"
	901	},
	902	STANDARD_VTABLE(big2_)
	903	};
	904
	905	#endif
	906
		/* Like big2_encoding, but built from iasciitab.h and with the
		   last ENCODING initializer fixed at 1. */
	907	static const struct normal_encoding internal_big2_encoding = {
	908	{ VTABLE, 2, 0, 1 },
	909	{
	910	#define BT_COLON BT_NMSTRT
	911	#include "iasciitab.h"
	912	#undef BT_COLON
	913	#include "latin1tab.h"
	914	},
	915	STANDARD_VTABLE(big2_)
	916	};
	917
	918	#endif
	919
	920	#undef PREFIX
	921
	922	static int FASTCALL
	923	streqci(const char s1, const char s2)
	924	{
	925	for (;;) {
	926	char c1 = *s1++;
	927	char c2 = *s2++;
	928	if (ASCII_a <= c1 && c1 <= ASCII_z)
	929	c1 += ASCII_A - ASCII_a;
	930	if (ASCII_a <= c2 && c2 <= ASCII_z)
	931	c2 += ASCII_A - ASCII_a;
	932	if (c1 != c2)
	933	return 0;
	934	if (!c1)
	935	break;
	936	}
	937	return 1;
	938	}
	939
	940	static void PTRCALL
	941	initUpdatePosition(const ENCODING enc, const char ptr,
	942	const char end, POSITION pos)
	943	{
	944	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
	945	}
	946
	947	static int
	948	toAscii(const ENCODING enc, const char ptr, const char *end)
	949	{
	950	char buf[1];
	951	char *p = buf;
	952	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
	953	if (p == buf)
	954	return -1;
	955	else
	956	return buf[0];
	957	}
	958
	959	static int FASTCALL
	960	isSpace(int c)
	961	{
	962	switch (c) {
	963	case 0x20:
	964	case 0xD:
	965	case 0xA:
	966	case 0x9:
	967	return 1;
	968	}
	969	return 0;
	970	}
	971
	972	/* Return 1 if there's just optional white space or there's an S
	973	followed by name=val.
	974	*/
	975	static int
	976	parsePseudoAttribute(const ENCODING *enc,
	977	const char *ptr,
	978	const char *end,
	979	const char **namePtr,
	980	const char **nameEndPtr,
	981	const char **valPtr,
	982	const char **nextTokPtr)
	983	{
	984	int c;
	985	char open;
	986	if (ptr == end) {
	987	*namePtr = NULL;
	988	return 1;
	989	}
	990	if (!isSpace(toAscii(enc, ptr, end))) {
	991	*nextTokPtr = ptr;
	992	return 0;
	993	}
	994	do {
	995	ptr += enc->minBytesPerChar;
	996	} while (isSpace(toAscii(enc, ptr, end)));
	997	if (ptr == end) {
	998	*namePtr = NULL;
	999	return 1;
	1000	}
	1001	*namePtr = ptr;
	1002	for (;;) {
	1003	c = toAscii(enc, ptr, end);
	1004	if (c == -1) {
	1005	*nextTokPtr = ptr;
	1006	return 0;
	1007	}
	1008	if (c == ASCII_EQUALS) {
	1009	*nameEndPtr = ptr;
	1010	break;
	1011	}
	1012	if (isSpace(c)) {
	1013	*nameEndPtr = ptr;
	1014	do {
	1015	ptr += enc->minBytesPerChar;
	1016	} while (isSpace(c = toAscii(enc, ptr, end)));
	1017	if (c != ASCII_EQUALS) {
	1018	*nextTokPtr = ptr;
	1019	return 0;
	1020	}
	1021	break;
	1022	}
	1023	ptr += enc->minBytesPerChar;
	1024	}
	1025	if (ptr == *namePtr) {
	1026	*nextTokPtr = ptr;
	1027	return 0;
	1028	}
	1029	ptr += enc->minBytesPerChar;
	1030	c = toAscii(enc, ptr, end);
	1031	while (isSpace(c)) {
	1032	ptr += enc->minBytesPerChar;
	1033	c = toAscii(enc, ptr, end);
	1034	}
	1035	if (c != ASCII_QUOT && c != ASCII_APOS) {
	1036	*nextTokPtr = ptr;
	1037	return 0;
	1038	}
	1039	open = (char)c;
	1040	ptr += enc->minBytesPerChar;
	1041	*valPtr = ptr;
	1042	for (;; ptr += enc->minBytesPerChar) {
	1043	c = toAscii(enc, ptr, end);
	1044	if (c == open)
	1045	break;
	1046	if (!(ASCII_a <= c && c <= ASCII_z)
	1047	&& !(ASCII_A <= c && c <= ASCII_Z)
	1048	&& !(ASCII_0 <= c && c <= ASCII_9)
	1049	&& c != ASCII_PERIOD
	1050	&& c != ASCII_MINUS
	1051	&& c != ASCII_UNDERSCORE) {
	1052	*nextTokPtr = ptr;
	1053	return 0;
	1054	}
	1055	}
	1056	*nextTokPtr = ptr + enc->minBytesPerChar;
	1057	return 1;
	1058	}
	1059
	1060	static const char KW_version[] = {
	1061	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
	1062	};
	1063
	1064	static const char KW_encoding[] = {
	1065	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
	1066	};
	1067
	1068	static const char KW_standalone[] = {
	1069	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
	1070	ASCII_n, ASCII_e, '\0'
	1071	};
	1072
	1073	static const char KW_yes[] = {
	1074	ASCII_y, ASCII_e, ASCII_s, '\0'
	1075	};
	1076
	1077	static const char KW_no[] = {
	1078	ASCII_n, ASCII_o, '\0'
	1079	};
	1080
	1081	static int
	1082	doParseXmlDecl(const ENCODING (encodingFinder)(const ENCODING *,
	1083	const char *,
	1084	const char *),
	1085	int isGeneralTextEntity,
	1086	const ENCODING *enc,
	1087	const char *ptr,
	1088	const char *end,
	1089	const char **badPtr,
	1090	const char **versionPtr,
	1091	const char **versionEndPtr,
	1092	const char **encodingName,
	1093	const ENCODING **encoding,
	1094	int *standalone)
	1095	{
	1096	const char *val = NULL;
	1097	const char *name = NULL;
	1098	const char *nameEnd = NULL;
	1099	ptr += 5 * enc->minBytesPerChar;
	1100	end -= 2 * enc->minBytesPerChar;
	1101	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
	1102	\|\| !name) {
	1103	*badPtr = ptr;
	1104	return 0;
	1105	}
	1106	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
	1107	if (!isGeneralTextEntity) {
	1108	*badPtr = name;
	1109	return 0;
	1110	}
	1111	}
	1112	else {
	1113	if (versionPtr)
	1114	*versionPtr = val;
	1115	if (versionEndPtr)
	1116	*versionEndPtr = ptr;
	1117	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
	1118	*badPtr = ptr;
	1119	return 0;
	1120	}
	1121	if (!name) {
	1122	if (isGeneralTextEntity) {
	1123	/* a TextDecl must have an EncodingDecl */
	1124	*badPtr = ptr;
	1125	return 0;
	1126	}
	1127	return 1;
	1128	}
	1129	}
	1130	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
	1131	int c = toAscii(enc, val, end);
	1132	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
	1133	*badPtr = val;
	1134	return 0;
	1135	}
	1136	if (encodingName)
	1137	*encodingName = val;
	1138	if (encoding)
	1139	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
	1140	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
	1141	*badPtr = ptr;
	1142	return 0;
	1143	}
	1144	if (!name)
	1145	return 1;
	1146	}
	1147	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
	1148	\|\| isGeneralTextEntity) {
	1149	*badPtr = name;
	1150	return 0;
	1151	}
	1152	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
	1153	if (standalone)
	1154	*standalone = 1;
	1155	}
	1156	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
	1157	if (standalone)
	1158	*standalone = 0;
	1159	}
	1160	else {
	1161	*badPtr = val;
	1162	return 0;
	1163	}
	1164	while (isSpace(toAscii(enc, ptr, end)))
	1165	ptr += enc->minBytesPerChar;
	1166	if (ptr != end) {
	1167	*badPtr = ptr;
	1168	return 0;
	1169	}
	1170	return 1;
	1171	}
	1172
	1173	static int FASTCALL
	1174	checkCharRefNumber(int result)
	1175	{
	1176	switch (result >> 8) {
	1177	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
	1178	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
	1179	return -1;
	1180	case 0:
	1181	if (latin1_encoding.type[result] == BT_NONXML)
	1182	return -1;
	1183	break;
	1184	case 0xFF:
	1185	if (result == 0xFFFE \|\| result == 0xFFFF)
	1186	return -1;
	1187	break;
	1188	}
	1189	return result;
	1190	}
	1191
	1192	int FASTCALL
	1193	XmlUtf8Encode(int c, char *buf)
	1194	{
	1195	enum {
	1196	/* minN is minimum legal resulting value for N byte sequence */
	1197	min2 = 0x80,
	1198	min3 = 0x800,
	1199	min4 = 0x10000
	1200	};
	1201
	1202	if (c < 0)
	1203	return 0;
	1204	if (c < min2) {
	1205	buf[0] = (char)(c \| UTF8_cval1);
	1206	return 1;
	1207	}
	1208	if (c < min3) {
	1209	buf[0] = (char)((c >> 6) \| UTF8_cval2);
	1210	buf[1] = (char)((c & 0x3f) \| 0x80);
	1211	return 2;
	1212	}
	1213	if (c < min4) {
	1214	buf[0] = (char)((c >> 12) \| UTF8_cval3);
	1215	buf[1] = (char)(((c >> 6) & 0x3f) \| 0x80);
	1216	buf[2] = (char)((c & 0x3f) \| 0x80);
	1217	return 3;
	1218	}
	1219	if (c < 0x110000) {
	1220	buf[0] = (char)((c >> 18) \| UTF8_cval4);
	1221	buf[1] = (char)(((c >> 12) & 0x3f) \| 0x80);
	1222	buf[2] = (char)(((c >> 6) & 0x3f) \| 0x80);
	1223	buf[3] = (char)((c & 0x3f) \| 0x80);
	1224	return 4;
	1225	}
	1226	return 0;
	1227	}
	1228
	1229	int FASTCALL
	1230	XmlUtf16Encode(int charNum, unsigned short *buf)
	1231	{
	1232	if (charNum < 0)
	1233	return 0;
	1234	if (charNum < 0x10000) {
	1235	buf[0] = (unsigned short)charNum;
	1236	return 1;
	1237	}
	1238	if (charNum < 0x110000) {
	1239	charNum -= 0x10000;
	1240	buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
	1241	buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
	1242	return 2;
	1243	}
	1244	return 0;
	1245	}
	1246
	1247	struct unknown_encoding {
	1248	struct normal_encoding normal;
	1249	CONVERTER convert;
	1250	void *userData;
	1251	unsigned short utf16[256];
	1252	char utf8[256][4];
	1253	};
	1254
	1255	#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
	1256
	1257	int
	1258	XmlSizeOfUnknownEncoding(void)
	1259	{
	1260	return sizeof(struct unknown_encoding);
	1261	}
	1262
	1263	static int PTRFASTCALL
	1264	unknown_isName(const ENCODING enc, const char p)
	1265	{
	1266	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
	1267	int c = uenc->convert(uenc->userData, p);
	1268	if (c & ~0xFFFF)
	1269	return 0;
	1270	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
	1271	}
	1272
	1273	static int PTRFASTCALL
	1274	unknown_isNmstrt(const ENCODING enc, const char p)
	1275	{
	1276	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
	1277	int c = uenc->convert(uenc->userData, p);
	1278	if (c & ~0xFFFF)
	1279	return 0;
	1280	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
	1281	}
	1282
	1283	static int PTRFASTCALL
	1284	unknown_isInvalid(const ENCODING enc, const char p)
	1285	{
	1286	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
	1287	int c = uenc->convert(uenc->userData, p);
	1288	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
	1289	}
	1290
	1291	static void PTRCALL
	1292	unknown_toUtf8(const ENCODING *enc,
	1293	const char *fromP, const char fromLim,
	1294	char *toP, const char toLim)
	1295	{
	1296	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
	1297	char buf[XML_UTF8_ENCODE_MAX];
	1298	for (;;) {
	1299	const char *utf8;
	1300	int n;
	1301	if (*fromP == fromLim)
	1302	break;
	1303	utf8 = uenc->utf8[(unsigned char)**fromP];
	1304	n = *utf8++;
	1305	if (n == 0) {
	1306	int c = uenc->convert(uenc->userData, *fromP);
	1307	n = XmlUtf8Encode(c, buf);
	1308	if (n > toLim - *toP)
	1309	break;
	1310	utf8 = buf;
	1311	fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*fromP]
	1312	- (BT_LEAD2 - 2));
	1313	}
	1314	else {
	1315	if (n > toLim - *toP)
	1316	break;
	1317	(*fromP)++;
	1318	}
	1319	do {
	1320	(toP)++ = *utf8++;
	1321	} while (--n != 0);
	1322	}
	1323	}
	1324
	1325	static void PTRCALL
	1326	unknown_toUtf16(const ENCODING *enc,
	1327	const char *fromP, const char fromLim,
	1328	unsigned short *toP, const unsigned short toLim)
	1329	{
	1330	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
	1331	while (fromP != fromLim && toP != toLim) {
	1332	unsigned short c = uenc->utf16[(unsigned char)**fromP];
	1333	if (c == 0) {
	1334	c = (unsigned short)
	1335	uenc->convert(uenc->userData, *fromP);
	1336	fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*fromP]
	1337	- (BT_LEAD2 - 2));
	1338	}
	1339	else
	1340	(*fromP)++;
	1341	(toP)++ = c;
	1342	}
	1343	}
	1344
	1345	ENCODING *
	1346	XmlInitUnknownEncoding(void *mem,
	1347	int *table,
[391]	1348	CONVERTER convert,
[2]	1349	void *userData)
	1350	{
	1351	int i;
	1352	struct unknown_encoding e = (struct unknown_encoding )mem;
	1353	for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
	1354	((char )mem)[i] = ((char )&latin1_encoding)[i];
	1355	for (i = 0; i < 128; i++)
	1356	if (latin1_encoding.type[i] != BT_OTHER
	1357	&& latin1_encoding.type[i] != BT_NONXML
	1358	&& table[i] != i)
	1359	return 0;
	1360	for (i = 0; i < 256; i++) {
	1361	int c = table[i];
	1362	if (c == -1) {
	1363	e->normal.type[i] = BT_MALFORM;
	1364	/* This shouldn't really get used. */
	1365	e->utf16[i] = 0xFFFF;
	1366	e->utf8[i][0] = 1;
	1367	e->utf8[i][1] = 0;
	1368	}
	1369	else if (c < 0) {
	1370	if (c < -4)
	1371	return 0;
	1372	e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
	1373	e->utf8[i][0] = 0;
	1374	e->utf16[i] = 0;
	1375	}
	1376	else if (c < 0x80) {
	1377	if (latin1_encoding.type[c] != BT_OTHER
	1378	&& latin1_encoding.type[c] != BT_NONXML
	1379	&& c != i)
	1380	return 0;
	1381	e->normal.type[i] = latin1_encoding.type[c];
	1382	e->utf8[i][0] = 1;
	1383	e->utf8[i][1] = (char)c;
	1384	e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
	1385	}
	1386	else if (checkCharRefNumber(c) < 0) {
	1387	e->normal.type[i] = BT_NONXML;
	1388	/* This shouldn't really get used. */
	1389	e->utf16[i] = 0xFFFF;
	1390	e->utf8[i][0] = 1;
	1391	e->utf8[i][1] = 0;
	1392	}
	1393	else {
	1394	if (c > 0xFFFF)
	1395	return 0;
	1396	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
	1397	e->normal.type[i] = BT_NMSTRT;
	1398	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
	1399	e->normal.type[i] = BT_NAME;
	1400	else
	1401	e->normal.type[i] = BT_OTHER;
	1402	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
	1403	e->utf16[i] = (unsigned short)c;
	1404	}
	1405	}
	1406	e->userData = userData;
	1407	e->convert = convert;
	1408	if (convert) {
	1409	e->normal.isName2 = unknown_isName;
	1410	e->normal.isName3 = unknown_isName;
	1411	e->normal.isName4 = unknown_isName;
	1412	e->normal.isNmstrt2 = unknown_isNmstrt;
	1413	e->normal.isNmstrt3 = unknown_isNmstrt;
	1414	e->normal.isNmstrt4 = unknown_isNmstrt;
	1415	e->normal.isInvalid2 = unknown_isInvalid;
	1416	e->normal.isInvalid3 = unknown_isInvalid;
	1417	e->normal.isInvalid4 = unknown_isInvalid;
	1418	}
	1419	e->normal.enc.utf8Convert = unknown_toUtf8;
	1420	e->normal.enc.utf16Convert = unknown_toUtf16;
	1421	return &(e->normal.enc);
	1422	}
	1423
	1424	/* If this enumeration is changed, getEncodingIndex and encodings
	1425	must also be changed. */
	1426	enum {
	1427	UNKNOWN_ENC = -1,
	1428	ISO_8859_1_ENC = 0,
	1429	US_ASCII_ENC,
	1430	UTF_8_ENC,
	1431	UTF_16_ENC,
	1432	UTF_16BE_ENC,
	1433	UTF_16LE_ENC,
	1434	/* must match encodingNames up to here */
	1435	NO_ENC
	1436	};
	1437
	1438	static const char KW_ISO_8859_1[] = {
	1439	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
	1440	ASCII_MINUS, ASCII_1, '\0'
	1441	};
	1442	static const char KW_US_ASCII[] = {
	1443	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
	1444	'\0'
	1445	};
	1446	static const char KW_UTF_8[] = {
	1447	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
	1448	};
	1449	static const char KW_UTF_16[] = {
	1450	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
	1451	};
	1452	static const char KW_UTF_16BE[] = {
	1453	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
	1454	'\0'
	1455	};
	1456	static const char KW_UTF_16LE[] = {
	1457	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
	1458	'\0'
	1459	};
	1460
	1461	static int FASTCALL
	1462	getEncodingIndex(const char *name)
	1463	{
	1464	static const char * const encodingNames[] = {
	1465	KW_ISO_8859_1,
	1466	KW_US_ASCII,
	1467	KW_UTF_8,
	1468	KW_UTF_16,
	1469	KW_UTF_16BE,
	1470	KW_UTF_16LE,
	1471	};
	1472	int i;
	1473	if (name == NULL)
	1474	return NO_ENC;
	1475	for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
	1476	if (streqci(name, encodingNames[i]))
	1477	return i;
	1478	return UNKNOWN_ENC;
	1479	}
	1480
	1481	/* For binary compatibility, we store the index of the encoding
	1482	specified at initialization in the isUtf16 member.
	1483	*/
	1484
	1485	#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
	1486	#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)i)
	1487
	1488	/* This is what detects the encoding. encodingTable maps from
	1489	encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
	1490	the external (protocol) specified encoding; state is
	1491	XML_CONTENT_STATE if we're parsing an external text entity, and
	1492	XML_PROLOG_STATE otherwise.
	1493	*/
	1494
	1495
	1496	static int
	1497	initScan(const ENCODING * const *encodingTable,
	1498	const INIT_ENCODING *enc,
	1499	int state,
	1500	const char *ptr,
	1501	const char *end,
	1502	const char **nextTokPtr)
	1503	{
	1504	const ENCODING **encPtr;
	1505
	1506	if (ptr == end)
	1507	return XML_TOK_NONE;
	1508	encPtr = enc->encPtr;
	1509	if (ptr + 1 == end) {
	1510	/* only a single byte available for auto-detection */
	1511	#ifndef XML_DTD /* FIXME */
	1512	/* a well-formed document entity must have more than one byte */
	1513	if (state != XML_CONTENT_STATE)
	1514	return XML_TOK_PARTIAL;
	1515	#endif
	1516	/* so we're parsing an external text entity... */
	1517	/* if UTF-16 was externally specified, then we need at least 2 bytes */
	1518	switch (INIT_ENC_INDEX(enc)) {
	1519	case UTF_16_ENC:
	1520	case UTF_16LE_ENC:
	1521	case UTF_16BE_ENC:
	1522	return XML_TOK_PARTIAL;
	1523	}
	1524	switch ((unsigned char)*ptr) {
	1525	case 0xFE:
	1526	case 0xFF:
	1527	case 0xEF: /* possibly first byte of UTF-8 BOM */
	1528	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1529	&& state == XML_CONTENT_STATE)
	1530	break;
	1531	/* fall through */
	1532	case 0x00:
	1533	case 0x3C:
	1534	return XML_TOK_PARTIAL;
	1535	}
	1536	}
	1537	else {
	1538	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1]) {
	1539	case 0xFEFF:
	1540	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1541	&& state == XML_CONTENT_STATE)
	1542	break;
	1543	*nextTokPtr = ptr + 2;
	1544	*encPtr = encodingTable[UTF_16BE_ENC];
	1545	return XML_TOK_BOM;
	1546	/* 00 3C is handled in the default case */
	1547	case 0x3C00:
	1548	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
	1549	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
	1550	&& state == XML_CONTENT_STATE)
	1551	break;
	1552	*encPtr = encodingTable[UTF_16LE_ENC];
	1553	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1554	case 0xFFFE:
	1555	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
	1556	&& state == XML_CONTENT_STATE)
	1557	break;
	1558	*nextTokPtr = ptr + 2;
	1559	*encPtr = encodingTable[UTF_16LE_ENC];
	1560	return XML_TOK_BOM;
	1561	case 0xEFBB:
	1562	/* Maybe a UTF-8 BOM (EF BB BF) */
	1563	/* If there's an explicitly specified (external) encoding
	1564	of ISO-8859-1 or some flavour of UTF-16
	1565	and this is an external text entity,
	1566	don't look for the BOM,
	1567	because it might be a legal data.
	1568	*/
	1569	if (state == XML_CONTENT_STATE) {
	1570	int e = INIT_ENC_INDEX(enc);
	1571	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC
	1572	\|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
	1573	break;
	1574	}
	1575	if (ptr + 2 == end)
	1576	return XML_TOK_PARTIAL;
	1577	if ((unsigned char)ptr[2] == 0xBF) {
	1578	*nextTokPtr = ptr + 3;
	1579	*encPtr = encodingTable[UTF_8_ENC];
	1580	return XML_TOK_BOM;
	1581	}
	1582	break;
	1583	default:
	1584	if (ptr[0] == '\0') {
	1585	/* 0 isn't a legal data character. Furthermore a document
	1586	entity can only start with ASCII characters. So the only
[391]	1587	way this can fail to be big-endian UTF-16 is if it is an
[2]	1588	external parsed general entity that's labelled as
	1589	UTF-16LE.
	1590	*/
	1591	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
	1592	break;
	1593	*encPtr = encodingTable[UTF_16BE_ENC];
	1594	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1595	}
	1596	else if (ptr[1] == '\0') {
	1597	/* We could recover here in the case:
	1598	- parsing an external entity
	1599	- second byte is 0
	1600	- no externally specified encoding
	1601	- no encoding declaration
	1602	by assuming UTF-16LE. But we don't, because this would mean when
	1603	presented just with a single byte, we couldn't reliably determine
	1604	whether we needed further bytes.
	1605	*/
	1606	if (state == XML_CONTENT_STATE)
	1607	break;
	1608	*encPtr = encodingTable[UTF_16LE_ENC];
	1609	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1610	}
	1611	break;
	1612	}
	1613	}
	1614	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
	1615	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
	1616	}
	1617
	1618
	1619	#define NS(x) x
	1620	#define ns(x) x
[391]	1621	#define XML_TOK_NS_C
[2]	1622	#include "xmltok_ns.c"
[391]	1623	#undef XML_TOK_NS_C
[2]	1624	#undef NS
	1625	#undef ns
	1626
	1627	#ifdef XML_NS
	1628
	1629	#define NS(x) x ## NS
	1630	#define ns(x) x ## _ns
	1631
[391]	1632	#define XML_TOK_NS_C
[2]	1633	#include "xmltok_ns.c"
[391]	1634	#undef XML_TOK_NS_C
[2]	1635
	1636	#undef NS
	1637	#undef ns
	1638
	1639	ENCODING *
	1640	XmlInitUnknownEncodingNS(void *mem,
	1641	int *table,
[391]	1642	CONVERTER convert,
[2]	1643	void *userData)
	1644	{
	1645	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
	1646	if (enc)
	1647	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
	1648	return enc;
	1649	}
	1650
	1651	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Modules/expat/xmltok.c

Download in other formats: