Context Navigation

source: trunk/src/helpers/xmltok.c@ 38

Visit:

Last change on this file since 38 was 38, checked in by umoeller, 25 years ago
Updates to XML.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 47.5 KB

Line
1
2	/*
3	*sourcefile xmltok.c
4	* part of the expat implementation. See xmlparse.c.
5	*
6	*/
7
8	/*
9	* Copyright (C) 2001 Ulrich Möller.
10	* Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd.
11	* and Clark Cooper.
12	*
13	* Permission is hereby granted, free of charge, to any person obtaining
14	* a copy of this software and associated documentation files (the
15	* "Software"), to deal in the Software without restriction, including
16	* without limitation the rights to use, copy, modify, merge, publish,
17	* distribute, sublicense, and/or sell copies of the Software, and to
18	* permit persons to whom the Software is furnished to do so, subject to
19	* the following conditions:
20	*
21	* The above copyright notice and this permission notice shall be included
22	* in all copies or substantial portions of the Software.
23	*
24	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27	* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28	* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31	*/
32
33	#include "setup.h"
34
35	#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
36
37	#pragma info(norea, nogen)
38	// disable "statement unreachable" and "missing break statement"
39	// this code generates those options HEAVILY
40
41	#ifdef COMPILED_FROM_DSP
42	#include "winconfig.h"
43	#else
44	// #include <config.h>
45	#endif /* ndef COMPILED_FROM_DSP */
46
47	#include "expat\xmltok.h"
48	#include "expat\nametab.h"
49
/* When XML_DTD is defined, the first brace group of VTABLE1 gains a trailing
 * PREFIX(ignoreSectionTok) entry; otherwise this macro contributes nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-qualified routines, in order.
 * PREFIX is expanded where VTABLE1 is used, so the identifiers produced
 * depend on the PREFIX definition in effect at that point. */
#define VTABLE1                                               \
    {                                                         \
        PREFIX(prologTok),                                    \
        PREFIX(contentTok),                                   \
        PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE     \
    },                                                        \
    {                                                         \
        PREFIX(attributeValueTok),                            \
        PREFIX(entityValueTok)                                \
    },                                                        \
    PREFIX(sameName),                                         \
    PREFIX(nameMatchesAscii),                                 \
    PREFIX(nameLength),                                       \
    PREFIX(skipS),                                            \
    PREFIX(getAtts),                                          \
    PREFIX(charRefNumber),                                    \
    PREFIX(predefinedEntityName),                             \
    PREFIX(updatePosition),                                   \
    PREFIX(isPublicId)

/* VTABLE1 followed by the two PREFIX()-qualified converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
71
/* Tests one bit of namingBitmap for the 16-bit character (hi, lo):
 * pages[hi] * 8 plus the top three bits of lo selects the word, and the
 * low five bits of lo select the bit.  The result is non-zero when set. */
#define UCS2_GET_NAMING(pages, hi, lo) \
    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask.
 * b must point at unsigned bytes. */
#define UTF8_GET_NAMING2(pages, b) \
    (namingBitmap[((pages)[(((b)[0]) >> 2) & 7] << 3) \
                  + ((((b)[0]) & 3) << 1) \
                  + ((((b)[1]) >> 5) & 1)] \
     & (1 << (((b)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask.
 * b must point at unsigned bytes. */
#define UTF8_GET_NAMING3(pages, b) \
    (namingBitmap[((pages)[((((b)[0]) & 0xF) << 4) \
                           + ((((b)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((b)[1]) & 3) << 1) \
                  + ((((b)[2]) >> 5) & 1)] \
     & (1 << (((b)[2]) & 0x1F)))

/* Dispatches on the sequence length n: 2 and 3 byte sequences are looked
 * up in the bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
    ((n) == 2 \
        ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
        : ((n) == 3 \
            ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
            : 0))
103
/* Non-zero when the 3 byte sequence at p (unsigned bytes) must be rejected:
 * a lead byte of 0xED whose second byte has bit 0x20 set (U+D800..U+DFFF,
 * the surrogate range), or the exact sequences EF BF BE / EF BF BF
 * (U+FFFE / U+FFFF).  Every other sequence yields 0. */
#define UTF8_INVALID3(p) \
    ((*(p)) == 0xED \
        ? (((p)[1] & 0x20) != 0) \
        : ((*(p)) == 0xEF \
            ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
            : 0))

/* Non-zero when the 4 byte sequence at p (unsigned bytes) starts with 0xF4
 * and its second byte has either of the bits 0x30 set, i.e. the encoded
 * value lies above U+10FFFF. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
112
113	static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
114	{
115	return 0;
116	}
117
118	static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
119	{
120	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
121	}
122
123	static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
124	{
125	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
126	}
127
128	#define utf8_isName4 isNever
129
130	static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
131	{
132	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
133	}
134
135	static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
136	{
137	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
138	}
139
140	#define utf8_isNmstrt4 isNever
141
142	#define utf8_isInvalid2 isNever
143
144	static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
145	{
146	return UTF8_INVALID3((const unsigned char *)p);
147	}
148
149	static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
150	{
151	return UTF8_INVALID4((const unsigned char *)p);
152	}
153
154	struct normal_encoding
155	{
156	ENCODING enc;
157	unsigned char type[256];
158	#ifdef XML_MIN_SIZE
159	int (* EXPATENTRY byteType) (const ENCODING , const char );
160	int (* EXPATENTRY isNameMin) (const ENCODING , const char );
161	int (* EXPATENTRY isNmstrtMin) (const ENCODING , const char );
162	int (* EXPATENTRY byteToAscii) (const ENCODING , const char );
163	int (* EXPATENTRY charMatches) (const ENCODING , const char , int);
164	#endif /* XML_MIN_SIZE */
165	int (* EXPATENTRY isName2) (const ENCODING , const char );
166	int (* EXPATENTRY isName3) (const ENCODING , const char );
167	int (* EXPATENTRY isName4) (const ENCODING , const char );
168	int (* EXPATENTRY isNmstrt2) (const ENCODING , const char );
169	int (* EXPATENTRY isNmstrt3) (const ENCODING , const char );
170	int (* EXPATENTRY isNmstrt4) (const ENCODING , const char );
171	int (* EXPATENTRY isInvalid2) (const ENCODING , const char );
172	int (* EXPATENTRY isInvalid3) (const ENCODING , const char );
173	int (* EXPATENTRY isInvalid4) (const ENCODING , const char );
174	};
175
/* STANDARD_VTABLE(E): initializers for the XML_MIN_SIZE-only members of
 * struct normal_encoding, built from the identifier prefix E.  Note the
 * trailing comma: the expansion is meant to be followed by further
 * initializers.  Expands to nothing in the regular build. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E) \
    E ## byteType,           \
    E ## isNameMin,          \
    E ## isNmstrtMin,        \
    E ## byteToAscii,        \
    E ## charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* NORMAL_VTABLE(E): initializers for the nine multi-byte classifier slots
 * of struct normal_encoding, built from the identifier prefix E. */
#define NORMAL_VTABLE(E) \
    E ## isName2,        \
    E ## isName3,        \
    E ## isName4,        \
    E ## isNmstrt2,      \
    E ## isNmstrt3,      \
    E ## isNmstrt4,      \
    E ## isInvalid2,     \
    E ## isInvalid3,     \
    E ## isInvalid4
201
202	static int checkCharRefNumber(int);
203
204	#include "expat\xmltok_impl.h"
205	#include "expat\ascii.h"
206
/* In the size-optimised build the single-byte encodings have no
 * "minimum bytes per char" name classifiers of their own. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Byte type of the byte that p points at, read from the type table of the
 * struct normal_encoding behind enc.  The scraped source had dropped both
 * the pointer in the cast and the dereference of p; they are restored. */
#define SB_BYTE_TYPE(enc, p) \
    (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, stored in the byteType slot. */
static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
{
    return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
    (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
232
233	#ifdef XML_MIN_SIZE
234	#define BYTE_TO_ASCII(enc, p) \
235	(((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
236	static int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
237	{
238	return *p;
239	}
240	#else
241	#define BYTE_TO_ASCII(enc, p) (*(p))
242	#endif
243
244	#define IS_NAME_CHAR(enc, p, n) \
245	(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
246	#define IS_NMSTRT_CHAR(enc, p, n) \
247	(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
248	#define IS_INVALID_CHAR(enc, p, n) \
249	(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
250
251	#ifdef XML_MIN_SIZE
252	#define IS_NAME_CHAR_MINBPC(enc, p) \
253	(((const struct normal_encoding *)(enc))->isNameMin(enc, p))
254	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
255	(((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
256	#else
257	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
258	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
259	#endif
260
261	#ifdef XML_MIN_SIZE
262	#define CHAR_MATCHES(enc, p, c) \
263	(((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
264	static int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
265	{
266	return *p == c;
267	}
268	#else
269	/* c is an ASCII character */
270	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
271	#endif
272
273	#define PREFIX(ident) normal_ ## ident
274	#include "xmltok_impl.c"
275
276	#undef MINBPC
277	#undef BYTE_TYPE
278	#undef BYTE_TO_ASCII
279	#undef CHAR_MATCHES
280	#undef IS_NAME_CHAR
281	#undef IS_NAME_CHAR_MINBPC
282	#undef IS_NMSTRT_CHAR
283	#undef IS_NMSTRT_CHAR_MINBPC
284	#undef IS_INVALID_CHAR
285
/* UTF8_cvalN is the marker value OR-ed into the first byte of an N byte
 * UTF-8 sequence (the value of that byte once its payload bits are
 * masked off). */
enum
{
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
};
293
294	static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
295	const char **fromP,
296	const char *fromLim,
297	char **toP,
298	const char *toLim)
299	{
300	char *to;
301	const char *from;
302
303	if (fromLim - fromP > toLim - toP)
304	{
305	/* Avoid copying partial characters. */
306	for (fromLim = fromP + (toLim - toP); fromLim > *fromP; fromLim--)
307	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
308	break;
309	}
310	for (to = toP, from = fromP; from != fromLim; from++, to++)
311	to = from;
312	*fromP = from;
313	*toP = to;
314	}
315
316	static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
317	const char *fromP, const char fromLim,
318	unsigned short *toP, const unsigned short toLim)
319	{
320	unsigned short to = toP;
321	const char from = fromP;
322
323	while (from != fromLim && to != toLim)
324	{
325	switch (((struct normal_encoding )enc)->type[(unsigned char)from])
326	{
327	case BT_LEAD2:
328	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
329	from += 2;
330	break;
331	case BT_LEAD3:
332	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
333	from += 3;
334	break;
335	case BT_LEAD4:
336	{
337	unsigned long n;
338
339	if (to + 1 == toLim)
340	break;
341	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
342	n -= 0x10000;
343	to[0] = (unsigned short)((n >> 10) \| 0xD800);
344	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
345	to += 2;
346	from += 4;
347	}
348	break;
349	default:
350	to++ = from++;
351	break;
352	}
353	}
354	*fromP = from;
355	*toP = to;
356	}
357
358	#ifdef XML_NS
359	static const struct normal_encoding utf8_encoding_ns =
360	{
361	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
362	{
363	#include "asciitab.h"
364	#include "utf8tab.h"
365	},
366	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
367	};
368
369	#endif
370
371	static const struct normal_encoding utf8_encoding =
372	{
373	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
374	{
375	#define BT_COLON BT_NMSTRT
376	#include "expat\asciitab.h"
377	#undef BT_COLON
378	#include "expat\utf8tab.h"
379	},
380	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
381	};
382
383	#ifdef XML_NS
384
385	static const struct normal_encoding internal_utf8_encoding_ns =
386	{
387	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
388	{
389	#include "iasciitab.h"
390	#include "utf8tab.h"
391	},
392	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
393	};
394
395	#endif
396
397	static const struct normal_encoding internal_utf8_encoding =
398	{
399	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
400	{
401	#define BT_COLON BT_NMSTRT
402	#include "expat\iasciitab.h"
403	#undef BT_COLON
404	#include "expat\utf8tab.h"
405	},
406	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
407	};
408
409	static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
410	const char *fromP, const char fromLim,
411	char *toP, const char toLim)
412	{
413	for (;;)
414	{
415	unsigned char c;
416
417	if (*fromP == fromLim)
418	break;
419	c = (unsigned char)**fromP;
420	if (c & 0x80)
421	{
422	if (toLim - *toP < 2)
423	break;
424	(toP)++ = ((c >> 6) \| UTF8_cval2);
425	(toP)++ = ((c & 0x3f) \| 0x80);
426	(*fromP)++;
427	}
428	else
429	{
430	if (*toP == toLim)
431	break;
432	(toP)++ = (fromP)++;
433	}
434	}
435	}
436
437	static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
438	const char *fromP, const char fromLim,
439	unsigned short *toP, const unsigned short toLim)
440	{
441	while (fromP != fromLim && toP != toLim)
442	(toP)++ = (unsigned char)(fromP)++;
443	}
444
445	#ifdef XML_NS
446
447	static const struct normal_encoding latin1_encoding_ns =
448	{
449	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
450	{
451	#include "asciitab.h"
452	#include "latin1tab.h"
453	},
454	STANDARD_VTABLE(sb_)
455	};
456
457	#endif
458
459	static const struct normal_encoding latin1_encoding =
460	{
461	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
462	{
463	#define BT_COLON BT_NMSTRT
464	#include "expat\asciitab.h"
465	#undef BT_COLON
466	#include "expat\latin1tab.h"
467	},
468	STANDARD_VTABLE(sb_)
469	};
470
471	static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
472	const char *fromP, const char fromLim,
473	char *toP, const char toLim)
474	{
475	while (fromP != fromLim && toP != toLim)
476	(toP)++ = (fromP)++;
477	}
478
479	#ifdef XML_NS
480
481	static const struct normal_encoding ascii_encoding_ns =
482	{
483	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
484	{
485	#include "asciitab.h"
486	/* BT_NONXML == 0 */
487	},
488	STANDARD_VTABLE(sb_)
489	};
490
491	#endif
492
493	static const struct normal_encoding ascii_encoding =
494	{
495	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
496	{
497	#define BT_COLON BT_NMSTRT
498	#include "expat\asciitab.h"
499	#undef BT_COLON
500	/* BT_NONXML == 0 */
501	},
502	STANDARD_VTABLE(sb_)
503	};
504
505	static int unicode_byte_type(char hi, char lo)
506	{
507	switch ((unsigned char)hi)
508	{
509	case 0xD8:
510	case 0xD9:
511	case 0xDA:
512	case 0xDB:
513	return BT_LEAD4;
514	case 0xDC:
515	case 0xDD:
516	case 0xDE:
517	case 0xDF:
518	return BT_TRAIL;
519	case 0xFF:
520	switch ((unsigned char)lo)
521	{
522	case 0xFF:
523	case 0xFE:
524	return BT_NONXML;
525	}
526	break;
527	}
528	return BT_NONASCII;
529	}
530
/* DEFINE_UTF16_TO_UTF8(E) defines E##toUtf8, which converts 2-byte units
 * from [*fromP, fromLim) into UTF-8 in [*toP, toLim).  The byte order is
 * supplied by the GET_LO / GET_HI macros in effect where this macro is
 * expanded.
 *
 * The switch on the high byte picks the output length: 1 byte below 0x80,
 * 2 bytes up to 0x7FF, 4 bytes for a unit starting with 0xD8..0xDB (it is
 * combined with the following unit), 3 bytes otherwise.  Whenever the
 * next character does not fit, *fromP is set to the unconverted position
 * and the function returns; *toP is advanced as bytes are written.
 *
 * The pointer declarators and dereferences dropped by the scrape, and the
 * escaped '|' operators, are restored.
 * NOTE(review): as in the original, the 0xD8..0xDB case reads the
 * following unit without checking it against fromLim. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
                                   const char **fromP, const char *fromLim, \
                                   char **toP, const char *toLim) \
{ \
    const char *from; \
    for (from = *fromP; from != fromLim; from += 2) { \
        int plane; \
        unsigned char lo2; \
        unsigned char lo = GET_LO(from); \
        unsigned char hi = GET_HI(from); \
        switch (hi) { \
        case 0: \
            if (lo < 0x80) { \
                if (*toP == toLim) { \
                    *fromP = from; \
                    return; \
                } \
                *(*toP)++ = lo; \
                break; \
            } \
            /* fall through */ \
        case 0x1: case 0x2: case 0x3: \
        case 0x4: case 0x5: case 0x6: case 0x7: \
            if (toLim - *toP < 2) { \
                *fromP = from; \
                return; \
            } \
            *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
            *(*toP)++ = ((lo & 0x3f) | 0x80); \
            break; \
        default: \
            if (toLim - *toP < 3) { \
                *fromP = from; \
                return; \
            } \
            /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
            *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
            *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
            *(*toP)++ = ((lo & 0x3f) | 0x80); \
            break; \
        case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
            if (toLim - *toP < 4) { \
                *fromP = from; \
                return; \
            } \
            plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
            *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
            *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
            from += 2; \
            lo2 = GET_LO(from); \
            *(*toP)++ = (((lo & 0x3) << 4) \
                         | ((GET_HI(from) & 0x3) << 2) \
                         | (lo2 >> 6) \
                         | 0x80); \
            *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
            break; \
        } \
    } \
    *fromP = from; \
}
592
/* DEFINE_UTF16_TO_UTF16(E) defines E##toUtf16, which copies 2-byte units
 * from [*fromP, fromLim) into native unsigned shorts in [*toP, toLim),
 * assembling each unit with the GET_HI / GET_LO macros in effect where
 * this macro is expanded.  Both pointers are advanced.
 *
 * If the input is longer than the output can hold and its last unit has a
 * high byte of 0xD8..0xDF, that last unit is excluded from this call so
 * that the first half of a surrogate pair is not copied on its own.
 *
 * The pointer declarators and dereferences dropped by the scrape, and the
 * escaped '|' operator, are restored. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
                                    const char **fromP, const char *fromLim, \
                                    unsigned short **toP, const unsigned short *toLim) \
{ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
        fromLim -= 2; \
    for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
        *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
605
606	#define SET2(ptr, ch) \
607	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
608	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
609	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
610
611	DEFINE_UTF16_TO_UTF8(little2_)
612	DEFINE_UTF16_TO_UTF16(little2_)
613
614	#undef SET2
615	#undef GET_LO
616	#undef GET_HI
617
618	#define SET2(ptr, ch) \
619	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
620	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
621	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
622
623	DEFINE_UTF16_TO_UTF8(big2_)
624	DEFINE_UTF16_TO_UTF16(big2_)
625
626	#undef SET2
627	#undef GET_LO
628	#undef GET_HI
629
/* Byte-level primitives for little-endian 2-byte characters: p[0] is the
 * low byte, p[1] the high byte.
 *
 * LITTLE2_BYTE_TYPE looks a character with a zero high byte up in the
 * encoding's type table and classifies everything else with
 * unicode_byte_type().  The scrape had dropped the pointer in the cast
 * and the dereference of p (leaving a pointer as array index); both are
 * restored, matching BIG2_BYTE_TYPE. */
#define LITTLE2_BYTE_TYPE(enc, p) \
    ((p)[1] == 0 \
        ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
        : unicode_byte_type((p)[1], (p)[0]))
/* The low byte if the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True if the character equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
640
641	#ifdef XML_MIN_SIZE
642
643	static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
644	{
645	return LITTLE2_BYTE_TYPE(enc, p);
646	}
647
648	static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
649	{
650	return LITTLE2_BYTE_TO_ASCII(enc, p);
651	}
652
653	static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
654	{
655	return LITTLE2_CHAR_MATCHES(enc, p, c);
656	}
657
658	static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
659	{
660	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
661	}
662
663	static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
664	{
665	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
666	}
667
668	#undef VTABLE
669	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
670
671	#else /* not XML_MIN_SIZE */
672
673	#undef PREFIX
674	#define PREFIX(ident) little2_ ## ident
675	#define MINBPC(enc) 2
676	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
677	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
678	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
679	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
680	#define IS_NAME_CHAR(enc, p, n) 0
681	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
682	#define IS_NMSTRT_CHAR(enc, p, n) (0)
683	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
684
685	#include "xmltok_impl.c"
686
687	#undef MINBPC
688	#undef BYTE_TYPE
689	#undef BYTE_TO_ASCII
690	#undef CHAR_MATCHES
691	#undef IS_NAME_CHAR
692	#undef IS_NAME_CHAR_MINBPC
693	#undef IS_NMSTRT_CHAR
694	#undef IS_NMSTRT_CHAR_MINBPC
695	#undef IS_INVALID_CHAR
696
697	#endif /* not XML_MIN_SIZE */
698
699	#ifdef XML_NS
700
701	static const struct normal_encoding little2_encoding_ns =
702	{
703	{VTABLE, 2, 0,
704	#if XML_BYTE_ORDER == 12
705	1
706	#else
707	0
708	#endif
709	},
710	{
711	#include "asciitab.h"
712	#include "latin1tab.h"
713	},
714	STANDARD_VTABLE(little2_)
715	};
716
717	#endif
718
719	static const struct normal_encoding little2_encoding =
720	{
721	{VTABLE, 2, 0,
722	#if XML_BYTE_ORDER == 12
723	1
724	#else
725	0
726	#endif
727	},
728	{
729	#define BT_COLON BT_NMSTRT
730	#include "expat\asciitab.h"
731	#undef BT_COLON
732	#include "expat\latin1tab.h"
733	},
734	STANDARD_VTABLE(little2_)
735	};
736
737	#if XML_BYTE_ORDER != 21
738
739	#ifdef XML_NS
740
741	static const struct normal_encoding internal_little2_encoding_ns =
742	{
743	{VTABLE, 2, 0, 1},
744	{
745	#include "iasciitab.h"
746	#include "latin1tab.h"
747	},
748	STANDARD_VTABLE(little2_)
749	};
750
751	#endif
752
753	static const struct normal_encoding internal_little2_encoding =
754	{
755	{VTABLE, 2, 0, 1},
756	{
757	#define BT_COLON BT_NMSTRT
758	#include "expat\iasciitab.h"
759	#undef BT_COLON
760	#include "expat\latin1tab.h"
761	},
762	STANDARD_VTABLE(little2_)
763	};
764
765	#endif
766
767
/* Byte-level primitives for big-endian 2-byte characters: p[0] is the
 * high byte, p[1] the low byte. */

/* Type-table lookup for characters with a zero high byte,
 * unicode_byte_type() for everything else. */
#define BIG2_BYTE_TYPE(enc, p) \
    ((p)[0] == 0 \
        ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
        : unicode_byte_type((p)[0], (p)[1]))

/* The low byte if the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)

/* True if the character equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)

/* Naming-bitmap lookups keyed by (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
778
779	#ifdef XML_MIN_SIZE
780
781	static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
782	{
783	return BIG2_BYTE_TYPE(enc, p);
784	}
785
786	static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
787	{
788	return BIG2_BYTE_TO_ASCII(enc, p);
789	}
790
791	static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
792	{
793	return BIG2_CHAR_MATCHES(enc, p, c);
794	}
795
796	static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
797	{
798	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
799	}
800
801	static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
802	{
803	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
804	}
805
806	#undef VTABLE
807	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
808
809	#else /* not XML_MIN_SIZE */
810
811	#undef PREFIX
812	#define PREFIX(ident) big2_ ## ident
813	#define MINBPC(enc) 2
814	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
815	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
816	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
817	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
818	#define IS_NAME_CHAR(enc, p, n) 0
819	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
820	#define IS_NMSTRT_CHAR(enc, p, n) (0)
821	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
822
823	#include "xmltok_impl.c"
824
825	#undef MINBPC
826	#undef BYTE_TYPE
827	#undef BYTE_TO_ASCII
828	#undef CHAR_MATCHES
829	#undef IS_NAME_CHAR
830	#undef IS_NAME_CHAR_MINBPC
831	#undef IS_NMSTRT_CHAR
832	#undef IS_NMSTRT_CHAR_MINBPC
833	#undef IS_INVALID_CHAR
834
835	#endif /* not XML_MIN_SIZE */
836
837	#ifdef XML_NS
838
839	static const struct normal_encoding big2_encoding_ns =
840	{
841	{VTABLE, 2, 0,
842	#if XML_BYTE_ORDER == 21
843	1
844	#else
845	0
846	#endif
847	},
848	{
849	#include "asciitab.h"
850	#include "latin1tab.h"
851	},
852	STANDARD_VTABLE(big2_)
853	};
854
855	#endif
856
857	static const struct normal_encoding big2_encoding =
858	{
859	{VTABLE, 2, 0,
860	#if XML_BYTE_ORDER == 21
861	1
862	#else
863	0
864	#endif
865	},
866	{
867	#define BT_COLON BT_NMSTRT
868	#include "expat\asciitab.h"
869	#undef BT_COLON
870	#include "expat\latin1tab.h"
871	},
872	STANDARD_VTABLE(big2_)
873	};
874
875	#if XML_BYTE_ORDER != 12
876
877	#ifdef XML_NS
878
879	static const struct normal_encoding internal_big2_encoding_ns =
880	{
881	{VTABLE, 2, 0, 1},
882	{
883	#include "iasciitab.h"
884	#include "latin1tab.h"
885	},
886	STANDARD_VTABLE(big2_)
887	};
888
889	#endif
890
891	static const struct normal_encoding internal_big2_encoding =
892	{
893	{VTABLE, 2, 0, 1},
894	{
895	#define BT_COLON BT_NMSTRT
896	#include "expat\iasciitab.h"
897	#undef BT_COLON
898	#include "expat\latin1tab.h"
899	},
900	STANDARD_VTABLE(big2_)
901	};
902
903	#endif
904
905	#undef PREFIX
906
907	static
908	int streqci(const char s1, const char s2)
909	{
910	for (;;)
911	{
912	char c1 = *s1++;
913	char c2 = *s2++;
914
915	if (ASCII_a <= c1 && c1 <= ASCII_z)
916	c1 += ASCII_A - ASCII_a;
917	if (ASCII_a <= c2 && c2 <= ASCII_z)
918	c2 += ASCII_A - ASCII_a;
919	if (c1 != c2)
920	return 0;
921	if (!c1)
922	break;
923	}
924	return 1;
925	}
926
927	static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
928	const char *ptr,
929	const char *end,
930	POSITION * pos)
931	{
932	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
933	}
934
935	static int EXPATENTRY toAscii(const ENCODING * enc, const char ptr, const char end)
936	{
937	char buf[1];
938	char *p = buf;
939
940	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
941	if (p == buf)
942	return -1;
943	else
944	return buf[0];
945	}
946
/* Returns 1 if c is space (0x20), carriage return (0xD), line feed (0xA)
 * or tab (0x9); returns 0 for every other value. */
static int isSpace(int c)
{
    if (c == 0x20 || c == 0xD || c == 0xA || c == 0x9)
        return 1;
    return 0;
}
959
960	/* Return 1 if there's just optional white space
961	* or there's an S followed by name=val. */
962	static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
963	const char *ptr,
964	const char *end,
965	const char **namePtr,
966	const char **nameEndPtr,
967	const char **valPtr,
968	const char **nextTokPtr)
969	{
970	int c;
971	char open;
972
973	if (ptr == end)
974	{
975	*namePtr = 0;
976	return 1;
977	}
978	if (!isSpace(toAscii(enc, ptr, end)))
979	{
980	*nextTokPtr = ptr;
981	return 0;
982	}
983	do
984	{
985	ptr += enc->minBytesPerChar;
986	}
987	while (isSpace(toAscii(enc, ptr, end)));
988	if (ptr == end)
989	{
990	*namePtr = 0;
991	return 1;
992	}
993	*namePtr = ptr;
994	for (;;)
995	{
996	c = toAscii(enc, ptr, end);
997	if (c == -1)
998	{
999	*nextTokPtr = ptr;
1000	return 0;
1001	}
1002	if (c == ASCII_EQUALS)
1003	{
1004	*nameEndPtr = ptr;
1005	break;
1006	}
1007	if (isSpace(c))
1008	{
1009	*nameEndPtr = ptr;
1010	do
1011	{
1012	ptr += enc->minBytesPerChar;
1013	}
1014	while (isSpace(c = toAscii(enc, ptr, end)));
1015	if (c != ASCII_EQUALS)
1016	{
1017	*nextTokPtr = ptr;
1018	return 0;
1019	}
1020	break;
1021	}
1022	ptr += enc->minBytesPerChar;
1023	}
1024	if (ptr == *namePtr)
1025	{
1026	*nextTokPtr = ptr;
1027	return 0;
1028	}
1029	ptr += enc->minBytesPerChar;
1030	c = toAscii(enc, ptr, end);
1031	while (isSpace(c))
1032	{
1033	ptr += enc->minBytesPerChar;
1034	c = toAscii(enc, ptr, end);
1035	}
1036	if (c != ASCII_QUOT && c != ASCII_APOS)
1037	{
1038	*nextTokPtr = ptr;
1039	return 0;
1040	}
1041	open = c;
1042	ptr += enc->minBytesPerChar;
1043	*valPtr = ptr;
1044	for (;; ptr += enc->minBytesPerChar)
1045	{
1046	c = toAscii(enc, ptr, end);
1047	if (c == open)
1048	break;
1049	if (!(ASCII_a <= c && c <= ASCII_z)
1050	&& !(ASCII_A <= c && c <= ASCII_Z)
1051	&& !(ASCII_0 <= c && c <= ASCII_9)
1052	&& c != ASCII_PERIOD
1053	&& c != ASCII_MINUS
1054	&& c != ASCII_UNDERSCORE)
1055	{
1056	*nextTokPtr = ptr;
1057	return 0;
1058	}
1059	}
1060	*nextTokPtr = ptr + enc->minBytesPerChar;
1061	return 1;
1062	}
1063
1064	static const char KW_version[] =
1065	{
1066	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1067	};
1068
1069	static const char KW_encoding[] =
1070	{
1071	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1072	};
1073
1074	static const char KW_standalone[] =
1075	{
1076	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1077	};
1078
1079	static const char KW_yes[] =
1080	{
1081	ASCII_y, ASCII_e, ASCII_s, '\0'
1082	};
1083
1084	static const char KW_no[] =
1085	{
1086	ASCII_n, ASCII_o, '\0'
1087	};
1088
1089	static int doParseXmlDecl(const ENCODING* (* EXPATENTRY encodingFinder)(const ENCODING *,
1090	const char *,
1091	const char *),
1092	int isGeneralTextEntity,
1093	const ENCODING * enc,
1094	const char *ptr,
1095	const char *end,
1096	const char **badPtr,
1097	const char **versionPtr,
1098	const char **versionEndPtr,
1099	const char **encodingName,
1100	const ENCODING ** encoding,
1101	int *standalone)
1102	{
1103	const char *val = 0;
1104	const char *name = 0;
1105	const char *nameEnd = 0;
1106
1107	ptr += 5 * enc->minBytesPerChar;
1108	end -= 2 * enc->minBytesPerChar;
1109	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) \|\| !name)
1110	{
1111	*badPtr = ptr;
1112	return 0;
1113	}
1114	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1115	{
1116	if (!isGeneralTextEntity)
1117	{
1118	*badPtr = name;
1119	return 0;
1120	}
1121	}
1122	else
1123	{
1124	if (versionPtr)
1125	*versionPtr = val;
1126	if (versionEndPtr)
1127	*versionEndPtr = ptr;
1128	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1129	{
1130	*badPtr = ptr;
1131	return 0;
1132	}
1133	if (!name)
1134	{
1135	if (isGeneralTextEntity)
1136	{
1137	/* a TextDecl must have an EncodingDecl */
1138	*badPtr = ptr;
1139	return 0;
1140	}
1141	return 1;
1142	}
1143	}
1144	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1145	{
1146	int c = toAscii(enc, val, end);
1147
1148	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1149	{
1150	*badPtr = val;
1151	return 0;
1152	}
1153	if (encodingName)
1154	*encodingName = val;
1155	if (encoding)
1156	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1157	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1158	{
1159	*badPtr = ptr;
1160	return 0;
1161	}
1162	if (!name)
1163	return 1;
1164	}
1165	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) \|\| isGeneralTextEntity)
1166	{
1167	*badPtr = name;
1168	return 0;
1169	}
1170	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1171	{
1172	if (standalone)
1173	*standalone = 1;
1174	}
1175	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1176	{
1177	if (standalone)
1178	*standalone = 0;
1179	}
1180	else
1181	{
1182	*badPtr = val;
1183	return 0;
1184	}
1185	while (isSpace(toAscii(enc, ptr, end)))
1186	ptr += enc->minBytesPerChar;
1187	if (ptr != end)
1188	{
1189	*badPtr = ptr;
1190	return 0;
1191	}
1192	return 1;
1193	}
1194
1195	static int checkCharRefNumber(int result)
1196	{
1197	switch (result >> 8)
1198	{
1199	case 0xD8:
1200	case 0xD9:
1201	case 0xDA:
1202	case 0xDB:
1203	case 0xDC:
1204	case 0xDD:
1205	case 0xDE:
1206	case 0xDF:
1207	return -1;
1208	case 0:
1209	if (latin1_encoding.type[result] == BT_NONXML)
1210	return -1;
1211	break;
1212	case 0xFF:
1213	if (result == 0xFFFE \|\| result == 0xFFFF)
1214	return -1;
1215	break;
1216	}
1217	return result;
1218	}
1219
1220	int XmlUtf8Encode(int c, char *buf)
1221	{
1222	enum
1223	{
1224	/* minN is minimum legal resulting value for N byte sequence */
1225	min2 = 0x80,
1226	min3 = 0x800,
1227	min4 = 0x10000
1228	};
1229
1230	if (c < 0)
1231	return 0;
1232	if (c < min2)
1233	{
1234	buf[0] = (c \| UTF8_cval1);
1235	return 1;
1236	}
1237	if (c < min3)
1238	{
1239	buf[0] = ((c >> 6) \| UTF8_cval2);
1240	buf[1] = ((c & 0x3f) \| 0x80);
1241	return 2;
1242	}
1243	if (c < min4)
1244	{
1245	buf[0] = ((c >> 12) \| UTF8_cval3);
1246	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
1247	buf[2] = ((c & 0x3f) \| 0x80);
1248	return 3;
1249	}
1250	if (c < 0x110000)
1251	{
1252	buf[0] = ((c >> 18) \| UTF8_cval4);
1253	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
1254	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
1255	buf[3] = ((c & 0x3f) \| 0x80);
1256	return 4;
1257	}
1258	return 0;
1259	}
1260
/* Encodes the character number charNum as UTF-16 into buf and returns the
 * number of 16-bit units written: 1 for values below 0x10000, 2 (a
 * surrogate pair) for values below 0x110000.  Returns 0, writing nothing,
 * for negative values and values of 0x110000 or more. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum < 0x10000)
    {
        buf[0] = charNum;
        return 1;
    }

    /* split the 20 bit offset into two 10 bit halves */
    offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
    return 2;
}
1279
1280	struct unknown_encoding
1281	{
1282	struct normal_encoding normal;
1283	int (convert) (void userData, const char *p);
1284	void *userData;
1285	unsigned short utf16[256];
1286	char utf8[256][4];
1287	};
1288
1289	int EXPATENTRY XmlSizeOfUnknownEncoding(void)
1290	{
1291	return sizeof(struct unknown_encoding);
1292	}
1293
1294	static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1295	{
1296	int c = ((const struct unknown_encoding *)enc)
1297	->convert(((const struct unknown_encoding *)enc)->userData, p);
1298
1299	if (c & ~0xFFFF)
1300	return 0;
1301	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1302	}
1303
1304	static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1305	{
1306	int c = ((const struct unknown_encoding *)enc)
1307	->convert(((const struct unknown_encoding *)enc)->userData, p);
1308
1309	if (c & ~0xFFFF)
1310	return 0;
1311	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1312	}
1313
1314	static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1315	{
1316	int c = ((const struct unknown_encoding *)enc)
1317	->convert(((const struct unknown_encoding *)enc)->userData, p);
1318
1319	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1320	}
1321
1322	static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1323	const char **fromP,
1324	const char *fromLim,
1325	char **toP,
1326	const char *toLim)
1327	{
1328	char buf[XML_UTF8_ENCODE_MAX];
1329
1330	for (;;)
1331	{
1332	const char *utf8;
1333	int n;
1334
1335	if (*fromP == fromLim)
1336	break;
1337	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
1338	n = *utf8++;
1339	if (n == 0)
1340	{
1341	int c = ((const struct unknown_encoding *)enc)
1342	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1343
1344	n = XmlUtf8Encode(c, buf);
1345	if (n > toLim - *toP)
1346	break;
1347	utf8 = buf;
1348	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1349	- (BT_LEAD2 - 2);
1350	}
1351	else
1352	{
1353	if (n > toLim - *toP)
1354	break;
1355	(*fromP)++;
1356	}
1357	do
1358	{
1359	(toP)++ = *utf8++;
1360	}
1361	while (--n != 0);
1362	}
1363	}
1364
1365	static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1366	const char **fromP,
1367	const char *fromLim,
1368	unsigned short **toP,
1369	const unsigned short *toLim)
1370	{
1371	while (fromP != fromLim && toP != toLim)
1372	{
1373	unsigned short c
1374	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
1375
1376	if (c == 0)
1377	{
1378	c = (unsigned short)((const struct unknown_encoding *)enc)
1379	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1380	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1381	- (BT_LEAD2 - 2);
1382	}
1383	else
1384	(*fromP)++;
1385	(toP)++ = c;
1386	}
1387	}
1388
1389	ENCODING * XmlInitUnknownEncoding(void *mem,
1390	int *table,
1391	int (convert) (void userData, const char *p),
1392	void *userData)
1393	{
1394	int i;
1395	struct unknown_encoding e = (struct unknown_encoding )mem;
1396	for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1397
1398	((char )mem)[i] = ((char )&latin1_encoding)[i];
1399	for (i = 0; i < 128; i++)
1400	if (latin1_encoding.type[i] != BT_OTHER
1401	&& latin1_encoding.type[i] != BT_NONXML
1402	&& table[i] != i)
1403	return 0;
1404	for (i = 0; i < 256; i++)
1405	{
1406	int c = table[i];
1407
1408	if (c == -1)
1409	{
1410	e->normal.type[i] = BT_MALFORM;
1411	/* This shouldn't really get used. */
1412	e->utf16[i] = 0xFFFF;
1413	e->utf8[i][0] = 1;
1414	e->utf8[i][1] = 0;
1415	}
1416	else if (c < 0)
1417	{
1418	if (c < -4)
1419	return 0;
1420	e->normal.type[i] = BT_LEAD2 - (c + 2);
1421	e->utf8[i][0] = 0;
1422	e->utf16[i] = 0;
1423	}
1424	else if (c < 0x80)
1425	{
1426	if (latin1_encoding.type[c] != BT_OTHER
1427	&& latin1_encoding.type[c] != BT_NONXML
1428	&& c != i)
1429	return 0;
1430	e->normal.type[i] = latin1_encoding.type[c];
1431	e->utf8[i][0] = 1;
1432	e->utf8[i][1] = (char)c;
1433	e->utf16[i] = c == 0 ? 0xFFFF : c;
1434	}
1435	else if (checkCharRefNumber(c) < 0)
1436	{
1437	e->normal.type[i] = BT_NONXML;
1438	/* This shouldn't really get used. */
1439	e->utf16[i] = 0xFFFF;
1440	e->utf8[i][0] = 1;
1441	e->utf8[i][1] = 0;
1442	}
1443	else
1444	{
1445	if (c > 0xFFFF)
1446	return 0;
1447	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1448	e->normal.type[i] = BT_NMSTRT;
1449	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1450	e->normal.type[i] = BT_NAME;
1451	else
1452	e->normal.type[i] = BT_OTHER;
1453	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1454	e->utf16[i] = c;
1455	}
1456	}
1457	e->userData = userData;
1458	e->convert = convert;
1459	if (convert)
1460	{
1461	e->normal.isName2 = unknown_isName;
1462	e->normal.isName3 = unknown_isName;
1463	e->normal.isName4 = unknown_isName;
1464	e->normal.isNmstrt2 = unknown_isNmstrt;
1465	e->normal.isNmstrt3 = unknown_isNmstrt;
1466	e->normal.isNmstrt4 = unknown_isNmstrt;
1467	e->normal.isInvalid2 = unknown_isInvalid;
1468	e->normal.isInvalid3 = unknown_isInvalid;
1469	e->normal.isInvalid4 = unknown_isInvalid;
1470	}
1471	e->normal.enc.utf8Convert = unknown_toUtf8;
1472	e->normal.enc.utf16Convert = unknown_toUtf16;
1473	return &(e->normal.enc);
1474	}
1475
1476	/* If this enumeration is changed, getEncodingIndex and encodings
1477	* must also be changed. */
/* Encoding indices. Values 0 through 5 index the encodingNames table
 * in getEncodingIndex; UNKNOWN_ENC and NO_ENC are the two results of
 * getEncodingIndex that do not name a table entry. */
enum
{
    UNKNOWN_ENC = -1,
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC = 1,
    UTF_8_ENC = 2,
    UTF_16_ENC = 3,
    UTF_16BE_ENC = 4,
    UTF_16LE_ENC = 5,
    /* must match encodingNames up to here */
    NO_ENC = 6
};
1490
1491	static const char KW_ISO_8859_1[] =
1492	{
1493	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1494	};
1495	static const char KW_US_ASCII[] =
1496	{
1497	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1498	};
1499	static const char KW_UTF_8[] =
1500	{
1501	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1502	};
1503	static const char KW_UTF_16[] =
1504	{
1505	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1506	};
1507	static const char KW_UTF_16BE[] =
1508	{
1509	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1510	};
1511	static const char KW_UTF_16LE[] =
1512	{
1513	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1514	};
1515
1516	static int getEncodingIndex(const char *name)
1517	{
1518	static const char *encodingNames[] =
1519	{
1520	KW_ISO_8859_1,
1521	KW_US_ASCII,
1522	KW_UTF_8,
1523	KW_UTF_16,
1524	KW_UTF_16BE,
1525	KW_UTF_16LE,
1526	};
1527	int i;
1528
1529	if (name == 0)
1530	return NO_ENC;
1531	for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1532	if (streqci(name, encodingNames[i]))
1533	return i;
1534	return UNKNOWN_ENC;
1535	}
1536
1537	/* For binary compatibility, we store the index of the encoding specified
1538	* at initialization in the isUtf16 member. */
1539
/* Reads back the encoding index kept in initEnc.isUtf16. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* Stores encoding index i in initEnc.isUtf16. The argument is
 * parenthesized so that an expression argument is converted to char
 * as a whole instead of only its first operand. */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1542
1543	/* This is what detects the encoding.
1544	* encodingTable maps from encoding indices to encodings;
1545	* INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1546	* state is XML_CONTENT_STATE if we're parsing an external text entity,
1547	* and XML_PROLOG_STATE otherwise.
1548	*/
1549
1550
1551	static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1552	const INIT_ENCODING * enc,
1553	int state,
1554	const char *ptr,
1555	const char *end,
1556	const char **nextTokPtr)
1557	{
1558	const ENCODING **encPtr;
1559
1560	if (ptr == end)
1561	return XML_TOK_NONE;
1562	encPtr = enc->encPtr;
1563	if (ptr + 1 == end)
1564	{
1565	/* only a single byte available for auto-detection */
1566	#ifndef XML_DTD /* FIXME */
1567	/* a well-formed document entity must have more than one byte */
1568	if (state != XML_CONTENT_STATE)
1569	return XML_TOK_PARTIAL;
1570	#endif
1571	/* so we're parsing an external text entity... */
1572	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1573	switch (INIT_ENC_INDEX(enc))
1574	{
1575	case UTF_16_ENC:
1576	case UTF_16LE_ENC:
1577	case UTF_16BE_ENC:
1578	return XML_TOK_PARTIAL;
1579	}
1580	switch ((unsigned char)*ptr)
1581	{
1582	case 0xFE:
1583	case 0xFF:
1584	case 0xEF: /* possibly first byte of UTF-8 BOM */
1585	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1586	&& state == XML_CONTENT_STATE)
1587	break;
1588	/* fall through */
1589	case 0x00:
1590	case 0x3C:
1591	return XML_TOK_PARTIAL;
1592	}
1593	}
1594	else
1595	{
1596	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1])
1597	{
1598	case 0xFEFF:
1599	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1600	&& state == XML_CONTENT_STATE)
1601	break;
1602	*nextTokPtr = ptr + 2;
1603	*encPtr = encodingTable[UTF_16BE_ENC];
1604	return XML_TOK_BOM;
1605	/* 00 3C is handled in the default case */
1606	case 0x3C00:
1607	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1608	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1609	&& state == XML_CONTENT_STATE)
1610	break;
1611	*encPtr = encodingTable[UTF_16LE_ENC];
1612	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1613	case 0xFFFE:
1614	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1615	&& state == XML_CONTENT_STATE)
1616	break;
1617	*nextTokPtr = ptr + 2;
1618	*encPtr = encodingTable[UTF_16LE_ENC];
1619	return XML_TOK_BOM;
1620	case 0xEFBB:
1621	/* Maybe a UTF-8 BOM (EF BB BF) */
1622	/* If there's an explicitly specified (external) encoding
1623	* of ISO-8859-1 or some flavour of UTF-16
1624	* and this is an external text entity,
1625	* don't look for the BOM,
1626	* because it might be a legal data. */
1627	if (state == XML_CONTENT_STATE)
1628	{
1629	int e = INIT_ENC_INDEX(enc);
1630
1631	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1632	break;
1633	}
1634	if (ptr + 2 == end)
1635	return XML_TOK_PARTIAL;
1636	if ((unsigned char)ptr[2] == 0xBF)
1637	{
1638	*encPtr = encodingTable[UTF_8_ENC];
1639	return XML_TOK_BOM;
1640	}
1641	break;
1642	default:
1643	if (ptr[0] == '\0')
1644	{
1645	/* 0 isn't a legal data character. Furthermore a document entity can only
1646	* start with ASCII characters. So the only way this can fail to be big-endian
1647	* UTF-16 if it it's an external parsed general entity that's labelled as
1648	* UTF-16LE. */
1649	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1650	break;
1651	*encPtr = encodingTable[UTF_16BE_ENC];
1652	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1653	}
1654	else if (ptr[1] == '\0')
1655	{
1656	/* We could recover here in the case:
1657	* - parsing an external entity
1658	* - second byte is 0
1659	* - no externally specified encoding
1660	* - no encoding declaration
1661	* by assuming UTF-16LE. But we don't, because this would mean when
1662	* presented just with a single byte, we couldn't reliably determine
1663	* whether we needed further bytes. */
1664	if (state == XML_CONTENT_STATE)
1665	break;
1666	*encPtr = encodingTable[UTF_16LE_ENC];
1667	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1668	}
1669	break;
1670	}
1671	}
1672	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1673	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1674	}
1675
1676
1677	#define NS(x) x
1678	#define ns(x) x
1679	#include "xmltok_ns.c"
1680	#undef NS
1681	#undef ns
1682
1683	#ifdef XML_NS
1684
1685	#define NS(x) x ## NS
1686	#define ns(x) x ## _ns
1687
1688	#include "xmltok_ns.c"
1689
1690	#undef NS
1691	#undef ns
1692
1693	ENCODING * XmlInitUnknownEncodingNS(void *mem,
1694	int *table,
1695	int (* EXPATENTRY convert) (void userData, const char p),
1696	void *userData)
1697	{
1698	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1699
1700	if (enc)
1701	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1702	return enc;
1703	}
1704
1705	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: