Context Navigation

source: trunk/src/helpers/xmltok.c@ 132

Visit:

Last change on this file since 132 was 98, checked in by umoeller, 24 years ago
Misc updates.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 46.3 KB

Line
1	/*
2	* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3	* See the file COPYING for copying permission.
4	*/
5
6	/* #ifdef COMPILED_FROM_DSP
7	* # include "winconfig.h"
8	* #else
9	* # include <config.h>
10	* #endif
11	*/
12
13	#include <memory.h>
14
15	#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
16
17	#pragma info(norea, nogen)
18	// disable "statement unreachable" and "missing break statement"
19	// this code generates those options HEAVILY
20
21	#include "expat\xmltok.h"
22	#include "expat\nametab.h"
23
/* Extra scanner slot appended to the first brace group of VTABLE1; it is
 * only emitted when XML_DTD is defined. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading part of the initializer for the ENCODING member of an encoding
 * table.  Every entry is named through PREFIX(), so PREFIX must be defined
 * for the encoding family being instantiated before this is expanded. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the two converter entries of the same family. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
45
/* Test one flag in namingBitmap: pages[hi] selects a group of eight
 * entries, the top 3 bits of lo select the entry within the group and the
 * low 5 bits of lo select the bit.  The mask is built from 1u so that
 * selecting bit 31 does not shift into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up,
 * any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))

/* Non-zero for the 3 byte sequences rejected here: a lead byte 0xED whose
 * second byte has bit 0x20 set, and 0xEF 0xBF followed by 0xBE or 0xBF.
 * p must point at unsigned char. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Non-zero for a 4 byte sequence with lead byte 0xF4 whose second byte has
 * either of the bits 0x30 set.  p must point at unsigned char. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
86
87	static
88	int EXPATENTRY isNever(const ENCODING * enc, const char *p)
89	{
90	return 0;
91	}
92
93	static
94	int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
95	{
96	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
97	}
98
99	static
100	int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
101	{
102	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
103	}
104
105	#define utf8_isName4 isNever
106
107	static
108	int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
109	{
110	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
111	}
112
113	static
114	int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
115	{
116	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
117	}
118
119	#define utf8_isNmstrt4 isNever
120
121	#define utf8_isInvalid2 isNever
122
123	static
124	int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
125	{
126	return UTF8_INVALID3((const unsigned char *)p);
127	}
128
129	static
130	int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
131	{
132	return UTF8_INVALID4((const unsigned char *)p);
133	}
134
135	struct normal_encoding
136	{
137	ENCODING enc;
138	unsigned char type[256];
139	#ifdef XML_MIN_SIZE
140	int (* EXPATENTRY byteType) (const ENCODING , const char );
141	int (* EXPATENTRY isNameMin) (const ENCODING , const char );
142	int (* EXPATENTRY isNmstrtMin) (const ENCODING , const char );
143	int (* EXPATENTRY byteToAscii) (const ENCODING , const char );
144	int (* EXPATENTRY charMatches) (const ENCODING , const char , int);
145	#endif /* XML_MIN_SIZE */
146	int (* EXPATENTRY isName2) (const ENCODING , const char );
147	int (* EXPATENTRY isName3) (const ENCODING , const char );
148	int (* EXPATENTRY isName4) (const ENCODING , const char );
149	int (* EXPATENTRY isNmstrt2) (const ENCODING , const char );
150	int (* EXPATENTRY isNmstrt3) (const ENCODING , const char );
151	int (* EXPATENTRY isNmstrt4) (const ENCODING , const char );
152	int (* EXPATENTRY isInvalid2) (const ENCODING , const char );
153	int (* EXPATENTRY isInvalid3) (const ENCODING , const char );
154	int (* EXPATENTRY isInvalid4) (const ENCODING , const char );
155	};
156
#ifdef XML_MIN_SIZE

/* Initializers for the XML_MIN_SIZE-only callbacks of struct
 * normal_encoding, taken from the function family with name prefix E.
 * The trailing comma lets NORMAL_VTABLE follow directly. */
#define STANDARD_VTABLE(E) \
  E ## byteType, \
  E ## isNameMin, \
  E ## isNmstrtMin, \
  E ## byteToAscii, \
  E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte classifier callbacks of struct
 * normal_encoding, taken from the function family with name prefix E. */
#define NORMAL_VTABLE(E) \
  E ## isName2, \
  E ## isName3, \
  E ## isName4, \
  E ## isNmstrt2, \
  E ## isNmstrt3, \
  E ## isNmstrt4, \
  E ## isInvalid2, \
  E ## isInvalid3, \
  E ## isInvalid4

/* Defined further down in this file; declared here because it is needed
 * before its definition. */
static int checkCharRefNumber(int);
184
185	#include "expat\xmltok_impl.h"
186	#include "expat\ascii.h"
187
188	#ifdef XML_MIN_SIZE
189	#define sb_isNameMin isNever
190	#define sb_isNmstrtMin isNever
191	#endif
192
193	#ifdef XML_MIN_SIZE
194	#define MINBPC(enc) ((enc)->minBytesPerChar)
195	#else
196	/* minimum bytes per character */
197	#define MINBPC(enc) 1
198	#endif
199
200	#define SB_BYTE_TYPE(enc, p) \
201	(((struct normal_encoding )(enc))->type[(unsigned char)(p)])
202
203	#ifdef XML_MIN_SIZE
204	static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
205	{
206	return SB_BYTE_TYPE(enc, p);
207	}
208	#define BYTE_TYPE(enc, p) \
209	(((const struct normal_encoding *)(enc))->byteType(enc, p))
210	#else
211	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
212	#endif
213
214	#ifdef XML_MIN_SIZE
215	#define BYTE_TO_ASCII(enc, p) \
216	(((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
217	static
218	int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
219	{
220	return *p;
221	}
222	#else
223	#define BYTE_TO_ASCII(enc, p) (*(p))
224	#endif
225
226	#define IS_NAME_CHAR(enc, p, n) \
227	(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
228	#define IS_NMSTRT_CHAR(enc, p, n) \
229	(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
230	#define IS_INVALID_CHAR(enc, p, n) \
231	(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
232
233	#ifdef XML_MIN_SIZE
234	#define IS_NAME_CHAR_MINBPC(enc, p) \
235	(((const struct normal_encoding *)(enc))->isNameMin(enc, p))
236	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
237	(((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
238	#else
239	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
240	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
241	#endif
242
243	#ifdef XML_MIN_SIZE
244	#define CHAR_MATCHES(enc, p, c) \
245	(((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
246	static
247	int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
248	{
249	return *p == c;
250	}
251	#else
252	/* c is an ASCII character */
253	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
254	#endif
255
256	#define PREFIX(ident) normal_ ## ident
257	#include "xmltok_impl.c"
258
259	#undef MINBPC
260	#undef BYTE_TYPE
261	#undef BYTE_TO_ASCII
262	#undef CHAR_MATCHES
263	#undef IS_NAME_CHAR
264	#undef IS_NAME_CHAR_MINBPC
265	#undef IS_NMSTRT_CHAR
266	#undef IS_NMSTRT_CHAR_MINBPC
267	#undef IS_INVALID_CHAR
268
enum
{
    /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
};
276
277	static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
278	const char **fromP,
279	const char *fromLim,
280	char **toP,
281	const char *toLim)
282	{
283	char *to;
284	const char *from;
285
286	if (fromLim - fromP > toLim - toP)
287	{
288	/* Avoid copying partial characters. */
289	for (fromLim = fromP + (toLim - toP);
290	fromLim > *fromP;
291	fromLim--)
292	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
293	break;
294	}
295	for (to = toP, from = fromP;
296	from != fromLim;
297	from++, to++)
298	to = from;
299	*fromP = from;
300	*toP = to;
301	}
302
303	static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
304	const char **fromP,
305	const char *fromLim,
306	unsigned short **toP,
307	const unsigned short *toLim)
308	{
309	unsigned short to = toP;
310	const char from = fromP;
311
312	while (from != fromLim && to != toLim)
313	{
314	switch (((struct normal_encoding )enc)->type[(unsigned char)from])
315	{
316	case BT_LEAD2:
317	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
318	from += 2;
319	break;
320	case BT_LEAD3:
321	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
322	from += 3;
323	break;
324	case BT_LEAD4:
325	{
326	unsigned long n;
327
328	if (to + 1 == toLim)
329	break;
330	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
331	n -= 0x10000;
332	to[0] = (unsigned short)((n >> 10) \| 0xD800);
333	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
334	to += 2;
335	from += 4;
336	}
337	break;
338	default:
339	to++ = from++;
340	break;
341	}
342	}
343	*fromP = from;
344	*toP = to;
345	}
346
347	#ifdef XML_NS
348	static const struct normal_encoding utf8_encoding_ns =
349	{
350	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
351	{
352	#include "expat\asciitab.h"
353	#include "expat\utf8tab.h"
354	},
355	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
356	};
357
358	#endif
359
360	static const struct normal_encoding utf8_encoding =
361	{
362	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
363	{
364	#define BT_COLON BT_NMSTRT
365	#include "expat\asciitab.h"
366	#undef BT_COLON
367	#include "expat\utf8tab.h"
368	},
369	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
370	};
371
372	#ifdef XML_NS
373
374	static const struct normal_encoding internal_utf8_encoding_ns =
375	{
376	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
377	{
378	#include "expat\iasciitab.h"
379	#include "expat\utf8tab.h"
380	},
381	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
382	};
383
384	#endif
385
386	static const struct normal_encoding internal_utf8_encoding =
387	{
388	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
389	{
390	#define BT_COLON BT_NMSTRT
391	#include "expat\iasciitab.h"
392	#undef BT_COLON
393	#include "expat\utf8tab.h"
394	},
395	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
396	};
397
398	static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
399	const char **fromP,
400	const char *fromLim,
401	char **toP,
402	const char *toLim)
403	{
404	for (;;)
405	{
406	unsigned char c;
407
408	if (*fromP == fromLim)
409	break;
410	c = (unsigned char)**fromP;
411	if (c & 0x80)
412	{
413	if (toLim - *toP < 2)
414	break;
415	(toP)++ = ((c >> 6) \| UTF8_cval2);
416	(toP)++ = ((c & 0x3f) \| 0x80);
417	(*fromP)++;
418	}
419	else
420	{
421	if (*toP == toLim)
422	break;
423	(toP)++ = (fromP)++;
424	}
425	}
426	}
427
428	static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
429	const char **fromP,
430	const char *fromLim,
431	unsigned short **toP,
432	const unsigned short *toLim)
433	{
434	while (fromP != fromLim && toP != toLim)
435	(toP)++ = (unsigned char)(fromP)++;
436	}
437
438	#ifdef XML_NS
439
440	static const struct normal_encoding latin1_encoding_ns =
441	{
442	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
443	{
444	#include "expat\asciitab.h"
445	#include "expat\latin1tab.h"
446	},
447	STANDARD_VTABLE(sb_)
448	};
449
450	#endif
451
452	static const struct normal_encoding latin1_encoding =
453	{
454	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
455	{
456	#define BT_COLON BT_NMSTRT
457	#include "expat\asciitab.h"
458	#undef BT_COLON
459	#include "expat\latin1tab.h"
460	},
461	STANDARD_VTABLE(sb_)
462	};
463
464	static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
465	const char **fromP,
466	const char *fromLim,
467	char **toP,
468	const char *toLim)
469	{
470	while (fromP != fromLim && toP != toLim)
471	(toP)++ = (fromP)++;
472	}
473
474	#ifdef XML_NS
475
476	static const struct normal_encoding ascii_encoding_ns =
477	{
478	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
479	{
480	#include "expat\asciitab.h"
481	/* BT_NONXML == 0 */
482	},
483	STANDARD_VTABLE(sb_)
484	};
485
486	#endif
487
488	static const struct normal_encoding ascii_encoding =
489	{
490	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
491	{
492	#define BT_COLON BT_NMSTRT
493	#include "expat\asciitab.h"
494	#undef BT_COLON
495	/* BT_NONXML == 0 */
496	},
497	STANDARD_VTABLE(sb_)
498	};
499
500	static int unicode_byte_type(char hi, char lo)
501	{
502	switch ((unsigned char)hi)
503	{
504	case 0xD8:
505	case 0xD9:
506	case 0xDA:
507	case 0xDB:
508	return BT_LEAD4;
509	case 0xDC:
510	case 0xDD:
511	case 0xDE:
512	case 0xDF:
513	return BT_TRAIL;
514	case 0xFF:
515	switch ((unsigned char)lo)
516	{
517	case 0xFF:
518	case 0xFE:
519	return BT_NONXML;
520	}
521	break;
522	}
523	return BT_NONASCII;
524	}
525
/* Define E##toUtf8: convert 16-bit units from [*fromP, fromLim) to UTF-8
 * written to [*toP, toLim).  GET_LO/GET_HI must be defined for the byte
 * order at the point of expansion.  The generated function returns early,
 * with *fromP at the first unconverted unit, when the next character does
 * not fit in the output.  In the 0xD8..0xDB case the following unit is
 * read as the second half of the pair without a bounds check against
 * fromLim. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
                                   const char **fromP, const char *fromLim, \
                                   char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
587
/* Define E##toUtf16: assemble 16-bit units from byte pairs in
 * [*fromP, fromLim) and store them to [*toP, toLim).  GET_LO/GET_HI must
 * be defined for the byte order at the point of expansion.  When the input
 * is longer than the output and its last unit has a high byte in
 * 0xD8..0xDF, that unit is dropped from this call. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
                                    const char **fromP, const char *fromLim, \
                                    unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
600
601	#define SET2(ptr, ch) \
602	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
603	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
604	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
605
606	DEFINE_UTF16_TO_UTF8(little2_)
607	DEFINE_UTF16_TO_UTF16(little2_)
608
609	#undef SET2
610	#undef GET_LO
611	#undef GET_HI
612
613	#define SET2(ptr, ch) \
614	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
615	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
616	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
617
618	DEFINE_UTF16_TO_UTF8(big2_)
619	DEFINE_UTF16_TO_UTF16(big2_)
620
621	#undef SET2
622	#undef GET_LO
623	#undef GET_HI
624
625	#define LITTLE2_BYTE_TYPE(enc, p) \
626	((p)[1] == 0 \
627	? ((struct normal_encoding )(enc))->type[(unsigned char)(p)] \
628	: unicode_byte_type((p)[1], (p)[0]))
629	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
630	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
631	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
632	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
633	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
634	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
635
636	#ifdef XML_MIN_SIZE
637	static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
638	{
639	return LITTLE2_BYTE_TYPE(enc, p);
640	}
641
642	static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
643	{
644	return LITTLE2_BYTE_TO_ASCII(enc, p);
645	}
646
647	static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
648	{
649	return LITTLE2_CHAR_MATCHES(enc, p, c);
650	}
651
652	static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
653	{
654	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
655	}
656
657	static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
658	{
659	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
660	}
661
662	#undef VTABLE
663	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
664
665	#else /* not XML_MIN_SIZE */
666
667	#undef PREFIX
668	#define PREFIX(ident) little2_ ## ident
669	#define MINBPC(enc) 2
670	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
671	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
672	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
673	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
674	#define IS_NAME_CHAR(enc, p, n) 0
675	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
676	#define IS_NMSTRT_CHAR(enc, p, n) (0)
677	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
678
679	#include "xmltok_impl.c"
680
681	#undef MINBPC
682	#undef BYTE_TYPE
683	#undef BYTE_TO_ASCII
684	#undef CHAR_MATCHES
685	#undef IS_NAME_CHAR
686	#undef IS_NAME_CHAR_MINBPC
687	#undef IS_NMSTRT_CHAR
688	#undef IS_NMSTRT_CHAR_MINBPC
689	#undef IS_INVALID_CHAR
690
691	#endif /* not XML_MIN_SIZE */
692
693	#ifdef XML_NS
694
695	static const struct normal_encoding little2_encoding_ns =
696	{
697	{VTABLE, 2, 0,
698	#if XML_BYTE_ORDER == 12
699	1
700	#else
701	0
702	#endif
703	},
704	{
705	#include "expat\asciitab.h"
706	#include "expat\latin1tab.h"
707	},
708	STANDARD_VTABLE(little2_)
709	};
710
711	#endif
712
713	static const struct normal_encoding little2_encoding =
714	{
715	{VTABLE, 2, 0,
716	#if XML_BYTE_ORDER == 12
717	1
718	#else
719	0
720	#endif
721	},
722	{
723	#define BT_COLON BT_NMSTRT
724	#include "expat\asciitab.h"
725	#undef BT_COLON
726	#include "expat\latin1tab.h"
727	},
728	STANDARD_VTABLE(little2_)
729	};
730
731	#if XML_BYTE_ORDER != 21
732
733	#ifdef XML_NS
734
735	static const struct normal_encoding internal_little2_encoding_ns =
736	{
737	{VTABLE, 2, 0, 1},
738	{
739	#include "expat\iasciitab.h"
740	#include "expat\latin1tab.h"
741	},
742	STANDARD_VTABLE(little2_)
743	};
744
745	#endif
746
747	static const struct normal_encoding internal_little2_encoding =
748	{
749	{VTABLE, 2, 0, 1},
750	{
751	#define BT_COLON BT_NMSTRT
752	#include "expat\iasciitab.h"
753	#undef BT_COLON
754	#include "expat\latin1tab.h"
755	},
756	STANDARD_VTABLE(little2_)
757	};
758
759	#endif
760
761
762	#define BIG2_BYTE_TYPE(enc, p) \
763	((p)[0] == 0 \
764	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
765	: unicode_byte_type((p)[0], (p)[1]))
766	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
767	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
768	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
769	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
770	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
771	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
772
773	#ifdef XML_MIN_SIZE
774
775	static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
776	{
777	return BIG2_BYTE_TYPE(enc, p);
778	}
779
780	static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
781	{
782	return BIG2_BYTE_TO_ASCII(enc, p);
783	}
784
785	static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
786	{
787	return BIG2_CHAR_MATCHES(enc, p, c);
788	}
789
790	static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
791	{
792	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
793	}
794
795	static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
796	{
797	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
798	}
799
800	#undef VTABLE
801	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
802
803	#else /* not XML_MIN_SIZE */
804
805	#undef PREFIX
806	#define PREFIX(ident) big2_ ## ident
807	#define MINBPC(enc) 2
808	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
809	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
810	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
811	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
812	#define IS_NAME_CHAR(enc, p, n) 0
813	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
814	#define IS_NMSTRT_CHAR(enc, p, n) (0)
815	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
816
817	#include "xmltok_impl.c"
818
819	#undef MINBPC
820	#undef BYTE_TYPE
821	#undef BYTE_TO_ASCII
822	#undef CHAR_MATCHES
823	#undef IS_NAME_CHAR
824	#undef IS_NAME_CHAR_MINBPC
825	#undef IS_NMSTRT_CHAR
826	#undef IS_NMSTRT_CHAR_MINBPC
827	#undef IS_INVALID_CHAR
828
829	#endif /* not XML_MIN_SIZE */
830
831	#ifdef XML_NS
832
833	static const struct normal_encoding big2_encoding_ns =
834	{
835	{VTABLE, 2, 0,
836	#if XML_BYTE_ORDER == 21
837	1
838	#else
839	0
840	#endif
841	},
842	{
843	#include "expat\asciitab.h"
844	#include "expat\latin1tab.h"
845	},
846	STANDARD_VTABLE(big2_)
847	};
848
849	#endif
850
851	static const struct normal_encoding big2_encoding =
852	{
853	{VTABLE, 2, 0,
854	#if XML_BYTE_ORDER == 21
855	1
856	#else
857	0
858	#endif
859	},
860	{
861	#define BT_COLON BT_NMSTRT
862	#include "expat\asciitab.h"
863	#undef BT_COLON
864	#include "expat\latin1tab.h"
865	},
866	STANDARD_VTABLE(big2_)
867	};
868
869	#if XML_BYTE_ORDER != 12
870
871	#ifdef XML_NS
872
873	static const struct normal_encoding internal_big2_encoding_ns =
874	{
875	{VTABLE, 2, 0, 1},
876	{
877	#include "expat\iasciitab.h"
878	#include "expat\latin1tab.h"
879	},
880	STANDARD_VTABLE(big2_)
881	};
882
883	#endif
884
885	static const struct normal_encoding internal_big2_encoding =
886	{
887	{VTABLE, 2, 0, 1},
888	{
889	#define BT_COLON BT_NMSTRT
890	#include "expat\iasciitab.h"
891	#undef BT_COLON
892	#include "expat\latin1tab.h"
893	},
894	STANDARD_VTABLE(big2_)
895	};
896
897	#endif
898
899	#undef PREFIX
900
901	static int streqci(const char s1, const char s2)
902	{
903	for (;;)
904	{
905	char c1 = *s1++;
906	char c2 = *s2++;
907
908	if (ASCII_a <= c1 && c1 <= ASCII_z)
909	c1 += ASCII_A - ASCII_a;
910	if (ASCII_a <= c2 && c2 <= ASCII_z)
911	c2 += ASCII_A - ASCII_a;
912	if (c1 != c2)
913	return 0;
914	if (!c1)
915	break;
916	}
917	return 1;
918	}
919
920	static void EXPATENTRY initUpdatePosition(const ENCODING * enc, const char *ptr,
921	const char end, POSITION pos)
922	{
923	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
924	}
925
926	static int EXPATENTRY toAscii(const ENCODING * enc, const char ptr, const char end)
927	{
928	char buf[1];
929	char *p = buf;
930
931	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
932	if (p == buf)
933	return -1;
934	else
935	return buf[0];
936	}
937
938	static int EXPATENTRY isSpace(int c)
939	{
940	switch (c)
941	{
942	case 0x20:
943	case 0xD:
944	case 0xA:
945	case 0x9:
946	return 1;
947	}
948	return 0;
949	}
950
951	/* Return 1 if there's just optional white space
952	* or there's an S followed by name=val. */
953	static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
954	const char *ptr,
955	const char *end,
956	const char **namePtr,
957	const char **nameEndPtr,
958	const char **valPtr,
959	const char **nextTokPtr)
960	{
961	int c;
962	char open;
963
964	if (ptr == end)
965	{
966	*namePtr = 0;
967	return 1;
968	}
969	if (!isSpace(toAscii(enc, ptr, end)))
970	{
971	*nextTokPtr = ptr;
972	return 0;
973	}
974	do
975	{
976	ptr += enc->minBytesPerChar;
977	}
978	while (isSpace(toAscii(enc, ptr, end)));
979	if (ptr == end)
980	{
981	*namePtr = 0;
982	return 1;
983	}
984	*namePtr = ptr;
985	for (;;)
986	{
987	c = toAscii(enc, ptr, end);
988	if (c == -1)
989	{
990	*nextTokPtr = ptr;
991	return 0;
992	}
993	if (c == ASCII_EQUALS)
994	{
995	*nameEndPtr = ptr;
996	break;
997	}
998	if (isSpace(c))
999	{
1000	*nameEndPtr = ptr;
1001	do
1002	{
1003	ptr += enc->minBytesPerChar;
1004	}
1005	while (isSpace(c = toAscii(enc, ptr, end)));
1006	if (c != ASCII_EQUALS)
1007	{
1008	*nextTokPtr = ptr;
1009	return 0;
1010	}
1011	break;
1012	}
1013	ptr += enc->minBytesPerChar;
1014	}
1015	if (ptr == *namePtr)
1016	{
1017	*nextTokPtr = ptr;
1018	return 0;
1019	}
1020	ptr += enc->minBytesPerChar;
1021	c = toAscii(enc, ptr, end);
1022	while (isSpace(c))
1023	{
1024	ptr += enc->minBytesPerChar;
1025	c = toAscii(enc, ptr, end);
1026	}
1027	if (c != ASCII_QUOT && c != ASCII_APOS)
1028	{
1029	*nextTokPtr = ptr;
1030	return 0;
1031	}
1032	open = c;
1033	ptr += enc->minBytesPerChar;
1034	*valPtr = ptr;
1035	for (;; ptr += enc->minBytesPerChar)
1036	{
1037	c = toAscii(enc, ptr, end);
1038	if (c == open)
1039	break;
1040	if (!(ASCII_a <= c && c <= ASCII_z)
1041	&& !(ASCII_A <= c && c <= ASCII_Z)
1042	&& !(ASCII_0 <= c && c <= ASCII_9)
1043	&& c != ASCII_PERIOD
1044	&& c != ASCII_MINUS
1045	&& c != ASCII_UNDERSCORE)
1046	{
1047	*nextTokPtr = ptr;
1048	return 0;
1049	}
1050	}
1051	*nextTokPtr = ptr + enc->minBytesPerChar;
1052	return 1;
1053	}
1054
1055	static const char KW_version[] =
1056	{
1057	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1058	};
1059
1060	static const char KW_encoding[] =
1061	{
1062	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1063	};
1064
1065	static const char KW_standalone[] =
1066	{
1067	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1068	};
1069
1070	static const char KW_yes[] =
1071	{
1072	ASCII_y, ASCII_e, ASCII_s, '\0'
1073	};
1074
1075	static const char KW_no[] =
1076	{
1077	ASCII_n, ASCII_o, '\0'
1078	};
1079
1080	static int doParseXmlDecl(const ENCODING * (encodingFinder) (const ENCODING ,
1081	const char *,
1082	const char *),
1083	int isGeneralTextEntity,
1084	const ENCODING * enc,
1085	const char *ptr,
1086	const char *end,
1087	const char **badPtr,
1088	const char **versionPtr,
1089	const char **versionEndPtr,
1090	const char **encodingName,
1091	const ENCODING ** encoding,
1092	int *standalone)
1093	{
1094	const char *val = 0;
1095	const char *name = 0;
1096	const char *nameEnd = 0;
1097
1098	ptr += 5 * enc->minBytesPerChar;
1099	end -= 2 * enc->minBytesPerChar;
1100	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) \|\| !name)
1101	{
1102	*badPtr = ptr;
1103	return 0;
1104	}
1105	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1106	{
1107	if (!isGeneralTextEntity)
1108	{
1109	*badPtr = name;
1110	return 0;
1111	}
1112	}
1113	else
1114	{
1115	if (versionPtr)
1116	*versionPtr = val;
1117	if (versionEndPtr)
1118	*versionEndPtr = ptr;
1119	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1120	{
1121	*badPtr = ptr;
1122	return 0;
1123	}
1124	if (!name)
1125	{
1126	if (isGeneralTextEntity)
1127	{
1128	/* a TextDecl must have an EncodingDecl */
1129	*badPtr = ptr;
1130	return 0;
1131	}
1132	return 1;
1133	}
1134	}
1135	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1136	{
1137	int c = toAscii(enc, val, end);
1138
1139	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1140	{
1141	*badPtr = val;
1142	return 0;
1143	}
1144	if (encodingName)
1145	*encodingName = val;
1146	if (encoding)
1147	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1148	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1149	{
1150	*badPtr = ptr;
1151	return 0;
1152	}
1153	if (!name)
1154	return 1;
1155	}
1156	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) \|\| isGeneralTextEntity)
1157	{
1158	*badPtr = name;
1159	return 0;
1160	}
1161	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1162	{
1163	if (standalone)
1164	*standalone = 1;
1165	}
1166	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1167	{
1168	if (standalone)
1169	*standalone = 0;
1170	}
1171	else
1172	{
1173	*badPtr = val;
1174	return 0;
1175	}
1176	while (isSpace(toAscii(enc, ptr, end)))
1177	ptr += enc->minBytesPerChar;
1178	if (ptr != end)
1179	{
1180	*badPtr = ptr;
1181	return 0;
1182	}
1183	return 1;
1184	}
1185
1186	static int checkCharRefNumber(int result)
1187	{
1188	switch (result >> 8)
1189	{
1190	case 0xD8:
1191	case 0xD9:
1192	case 0xDA:
1193	case 0xDB:
1194	case 0xDC:
1195	case 0xDD:
1196	case 0xDE:
1197	case 0xDF:
1198	return -1;
1199	case 0:
1200	if (latin1_encoding.type[result] == BT_NONXML)
1201	return -1;
1202	break;
1203	case 0xFF:
1204	if (result == 0xFFFE \|\| result == 0xFFFF)
1205	return -1;
1206	break;
1207	}
1208	return result;
1209	}
1210
1211	int XmlUtf8Encode(int c, char *buf)
1212	{
1213	enum
1214	{
1215	/* minN is minimum legal resulting value for N byte sequence */
1216	min2 = 0x80,
1217	min3 = 0x800,
1218	min4 = 0x10000
1219	};
1220
1221	if (c < 0)
1222	return 0;
1223	if (c < min2)
1224	{
1225	buf[0] = (c \| UTF8_cval1);
1226	return 1;
1227	}
1228	if (c < min3)
1229	{
1230	buf[0] = ((c >> 6) \| UTF8_cval2);
1231	buf[1] = ((c & 0x3f) \| 0x80);
1232	return 2;
1233	}
1234	if (c < min4)
1235	{
1236	buf[0] = ((c >> 12) \| UTF8_cval3);
1237	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
1238	buf[2] = ((c & 0x3f) \| 0x80);
1239	return 3;
1240	}
1241	if (c < 0x110000)
1242	{
1243	buf[0] = ((c >> 18) \| UTF8_cval4);
1244	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
1245	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
1246	buf[3] = ((c & 0x3f) \| 0x80);
1247	return 4;
1248	}
1249	return 0;
1250	}
1251
/* Encode the character number charNum as UTF-16 into buf.
 *
 * Returns 1 when a single unit was written (charNum below 0x10000),
 * 2 when a surrogate pair was written (charNum below 0x110000), and 0
 * when charNum is negative or too large; buf is left untouched in that
 * case.  buf must have room for 2 units. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    if (charNum < 0)
        return 0;
    if (charNum < 0x10000)
    {
        buf[0] = (unsigned short)charNum;
        return 1;
    }
    if (charNum < 0x110000)
    {
        /* Split the 20 bits above the BMP into two 10-bit halves. */
        charNum -= 0x10000;
        buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
        buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
        return 2;
    }
    return 0;
}
1270
1271	struct unknown_encoding
1272	{
1273	struct normal_encoding normal;
1274	int (convert) (void userData, const char *p);
1275	void *userData;
1276	unsigned short utf16[256];
1277	char utf8[256][4];
1278	};
1279
1280	int XmlSizeOfUnknownEncoding(void)
1281	{
1282	return sizeof(struct unknown_encoding);
1283	}
1284
1285	static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1286	{
1287	int c = ((const struct unknown_encoding *)enc)
1288	->convert(((const struct unknown_encoding *)enc)->userData, p);
1289
1290	if (c & ~0xFFFF)
1291	return 0;
1292	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1293	}
1294
1295	static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1296	{
1297	int c = ((const struct unknown_encoding *)enc)
1298	->convert(((const struct unknown_encoding *)enc)->userData, p);
1299
1300	if (c & ~0xFFFF)
1301	return 0;
1302	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1303	}
1304
1305	static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1306	{
1307	int c = ((const struct unknown_encoding *)enc)
1308	->convert(((const struct unknown_encoding *)enc)->userData, p);
1309
1310	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1311	}
1312
1313	static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1314	const char **fromP,
1315	const char *fromLim,
1316	char **toP,
1317	const char *toLim)
1318	{
1319	char buf[XML_UTF8_ENCODE_MAX];
1320
1321	for (;;)
1322	{
1323	const char *utf8;
1324	int n;
1325
1326	if (*fromP == fromLim)
1327	break;
1328	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
1329	n = *utf8++;
1330	if (n == 0)
1331	{
1332	int c = ((const struct unknown_encoding *)enc)
1333	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1334
1335	n = XmlUtf8Encode(c, buf);
1336	if (n > toLim - *toP)
1337	break;
1338	utf8 = buf;
1339	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1340	- (BT_LEAD2 - 2);
1341	}
1342	else
1343	{
1344	if (n > toLim - *toP)
1345	break;
1346	(*fromP)++;
1347	}
1348	do
1349	{
1350	(toP)++ = *utf8++;
1351	}
1352	while (--n != 0);
1353	}
1354	}
1355
1356	static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1357	const char **fromP,
1358	const char *fromLim,
1359	unsigned short **toP,
1360	const unsigned short *toLim)
1361	{
1362	while (fromP != fromLim && toP != toLim)
1363	{
1364	unsigned short c
1365	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
1366
1367	if (c == 0)
1368	{
1369	c = (unsigned short)((const struct unknown_encoding *)enc)
1370	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1371	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1372	- (BT_LEAD2 - 2);
1373	}
1374	else
1375	(*fromP)++;
1376	(toP)++ = c;
1377	}
1378	}
1379
1380	/*
1381	*@@ XmlInitUnknownEncoding:
1382	*
1383	*@@changed V0.9.14 (2001-08-09) [umoeller]: couple of performance hacks
1384	*/
1385
1386	ENCODING* XmlInitUnknownEncoding(void *mem,
1387	int *table,
1388	int (convert) (void userData, const char *p),
1389	void *userData)
1390	{
1391	int i;
1392	struct unknown_encoding e = (struct unknown_encoding)mem;
1393
1394	// gee, isn't this a regular memcpy?!?
1395	/* for (i = 0;
1396	i < (int)sizeof(struct normal_encoding);
1397	i++)
1398	((char )mem)[i] = ((char )&latin1_encoding)[i]; */
1399
1400	// replaced the above with this V0.9.14 (2001-08-09) [umoeller]
1401	memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1402
1403	for (i = 0; i < 128; i++)
1404	if ( latin1_encoding.type[i] != BT_OTHER
1405	&& latin1_encoding.type[i] != BT_NONXML
1406	&& table[i] != i
1407	)
1408	return 0;
1409
1410	for (i = 0; i < 256; i++)
1411	{
1412	int c = table[i];
1413
1414	if (c == -1)
1415	{
1416	e->normal.type[i] = BT_MALFORM;
1417	/* This shouldn't really get used. */
1418	e->utf16[i] = 0xFFFF;
1419	e->utf8[i][0] = 1;
1420	e->utf8[i][1] = 0;
1421	}
1422	else if (c < 0)
1423	{
1424	if (c < -4)
1425	return 0;
1426	e->normal.type[i] = BT_LEAD2 - (c + 2);
1427	e->utf8[i][0] = 0;
1428	e->utf16[i] = 0;
1429	}
1430	else if (c < 0x80)
1431	{
1432	if ( latin1_encoding.type[c] != BT_OTHER
1433	&& latin1_encoding.type[c] != BT_NONXML
1434	&& c != i
1435	)
1436	return 0;
1437	e->normal.type[i] = latin1_encoding.type[c];
1438	e->utf8[i][0] = 1;
1439	e->utf8[i][1] = (char)c;
1440	e->utf16[i] = c == 0 ? 0xFFFF : c;
1441	}
1442	else if (checkCharRefNumber(c) < 0)
1443	{
1444	e->normal.type[i] = BT_NONXML;
1445	/* This shouldn't really get used. */
1446	e->utf16[i] = 0xFFFF;
1447	e->utf8[i][0] = 1;
1448	e->utf8[i][1] = 0;
1449	}
1450	else
1451	{
1452	if (c > 0xFFFF)
1453	return 0;
1454	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1455	e->normal.type[i] = BT_NMSTRT;
1456	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1457	e->normal.type[i] = BT_NAME;
1458	else
1459	e->normal.type[i] = BT_OTHER;
1460	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1461	e->utf16[i] = c;
1462	}
1463	}
1464	e->userData = userData;
1465	e->convert = convert;
1466	if (convert)
1467	{
1468	e->normal.isName2 = unknown_isName;
1469	e->normal.isName3 = unknown_isName;
1470	e->normal.isName4 = unknown_isName;
1471	e->normal.isNmstrt2 = unknown_isNmstrt;
1472	e->normal.isNmstrt3 = unknown_isNmstrt;
1473	e->normal.isNmstrt4 = unknown_isNmstrt;
1474	e->normal.isInvalid2 = unknown_isInvalid;
1475	e->normal.isInvalid3 = unknown_isInvalid;
1476	e->normal.isInvalid4 = unknown_isInvalid;
1477	}
1478	e->normal.enc.utf8Convert = unknown_toUtf8;
1479	e->normal.enc.utf16Convert = unknown_toUtf16;
1480	return &(e->normal.enc);
1481	}
1482
/* Indices of the built-in encodings.
 * If this enumeration is changed, getEncodingIndex and encodings
 * must also be changed. */
enum
{
    UNKNOWN_ENC = -1,       /* name given but not recognized */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC,
    UTF_8_ENC,
    UTF_16_ENC,
    UTF_16BE_ENC,
    UTF_16LE_ENC,
    /* must match encodingNames up to here */
    NO_ENC                  /* no encoding name given */
};
1497
1498	static const char KW_ISO_8859_1[] =
1499	{
1500	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1501	};
1502	static const char KW_US_ASCII[] =
1503	{
1504	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1505	};
1506	static const char KW_UTF_8[] =
1507	{
1508	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1509	};
1510	static const char KW_UTF_16[] =
1511	{
1512	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1513	};
1514	static const char KW_UTF_16BE[] =
1515	{
1516	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1517	};
1518	static const char KW_UTF_16LE[] =
1519	{
1520	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1521	};
1522
1523	static int getEncodingIndex(const char *name)
1524	{
1525	static const char *encodingNames[] =
1526	{
1527	KW_ISO_8859_1,
1528	KW_US_ASCII,
1529	KW_UTF_8,
1530	KW_UTF_16,
1531	KW_UTF_16BE,
1532	KW_UTF_16LE,
1533	};
1534	int i;
1535
1536	if (name == 0)
1537	return NO_ENC;
1538	for (i = 0;
1539	i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0]));
1540	i++)
1541	if (streqci(name, encodingNames[i]))
1542	return i;
1543	return UNKNOWN_ENC;
1544	}
1545
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 * INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
 * stores i, converted to char. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1551
1552	/* This is what detects the encoding.
1553	* encodingTable maps from encoding indices to encodings;
1554	* INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1555	* state is XML_CONTENT_STATE if we're parsing an external text entity,
1556	* and XML_PROLOG_STATE otherwise.
1557	*/
1558
1559
1560	static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1561	const INIT_ENCODING * enc,
1562	int state,
1563	const char *ptr,
1564	const char *end,
1565	const char **nextTokPtr)
1566	{
1567	const ENCODING **encPtr;
1568
1569	if (ptr == end)
1570	return XML_TOK_NONE;
1571	encPtr = enc->encPtr;
1572	if (ptr + 1 == end)
1573	{
1574	/* only a single byte available for auto-detection */
1575	#ifndef XML_DTD /* FIXME */
1576	/* a well-formed document entity must have more than one byte */
1577	if (state != XML_CONTENT_STATE)
1578	return XML_TOK_PARTIAL;
1579	#endif
1580	/* so we're parsing an external text entity... */
1581	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1582	switch (INIT_ENC_INDEX(enc))
1583	{
1584	case UTF_16_ENC:
1585	case UTF_16LE_ENC:
1586	case UTF_16BE_ENC:
1587	return XML_TOK_PARTIAL;
1588	}
1589	switch ((unsigned char)*ptr)
1590	{
1591	case 0xFE:
1592	case 0xFF:
1593	case 0xEF: /* possibly first byte of UTF-8 BOM */
1594	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1595	&& state == XML_CONTENT_STATE)
1596	break;
1597	/* fall through */
1598	case 0x00:
1599	case 0x3C:
1600	return XML_TOK_PARTIAL;
1601	}
1602	}
1603	else
1604	{
1605	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1])
1606	{
1607	case 0xFEFF:
1608	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1609	&& state == XML_CONTENT_STATE)
1610	break;
1611	*nextTokPtr = ptr + 2;
1612	*encPtr = encodingTable[UTF_16BE_ENC];
1613	return XML_TOK_BOM;
1614	/* 00 3C is handled in the default case */
1615	case 0x3C00:
1616	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1617	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1618	&& state == XML_CONTENT_STATE)
1619	break;
1620	*encPtr = encodingTable[UTF_16LE_ENC];
1621	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1622	case 0xFFFE:
1623	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1624	&& state == XML_CONTENT_STATE)
1625	break;
1626	*nextTokPtr = ptr + 2;
1627	*encPtr = encodingTable[UTF_16LE_ENC];
1628	return XML_TOK_BOM;
1629	case 0xEFBB:
1630	/* Maybe a UTF-8 BOM (EF BB BF) */
1631	/* If there's an explicitly specified (external) encoding
1632	* of ISO-8859-1 or some flavour of UTF-16
1633	* and this is an external text entity,
1634	* don't look for the BOM,
1635	* because it might be a legal data. */
1636	if (state == XML_CONTENT_STATE)
1637	{
1638	int e = INIT_ENC_INDEX(enc);
1639
1640	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1641	break;
1642	}
1643	if (ptr + 2 == end)
1644	return XML_TOK_PARTIAL;
1645	if ((unsigned char)ptr[2] == 0xBF)
1646	{
1647	*nextTokPtr = ptr + 3;
1648	*encPtr = encodingTable[UTF_8_ENC];
1649	return XML_TOK_BOM;
1650	}
1651	break;
1652	default:
1653	if (ptr[0] == '\0')
1654	{
1655	/* 0 isn't a legal data character. Furthermore a document entity can only
1656	* start with ASCII characters. So the only way this can fail to be big-endian
1657	* UTF-16 if it it's an external parsed general entity that's labelled as
1658	* UTF-16LE. */
1659	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1660	break;
1661	*encPtr = encodingTable[UTF_16BE_ENC];
1662	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1663	}
1664	else if (ptr[1] == '\0')
1665	{
1666	/* We could recover here in the case:
1667	* - parsing an external entity
1668	* - second byte is 0
1669	* - no externally specified encoding
1670	* - no encoding declaration
1671	* by assuming UTF-16LE. But we don't, because this would mean when
1672	* presented just with a single byte, we couldn't reliably determine
1673	* whether we needed further bytes. */
1674	if (state == XML_CONTENT_STATE)
1675	break;
1676	*encPtr = encodingTable[UTF_16LE_ENC];
1677	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1678	}
1679	break;
1680	}
1681	}
1682	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1683	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1684	}
1685
1686
1687	#define NS(x) x
1688	#define ns(x) x
1689	#include "xmltok_ns.c"
1690	#undef NS
1691	#undef ns
1692
#ifdef XML_NS

/* Second inclusion of xmltok_ns.c: NS(x) appends "NS" and ns(x)
 * appends "_ns" to the names built through them in that file. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns

/*
 *@@ XmlInitUnknownEncodingNS:
 *      same as XmlInitUnknownEncoding, to which all arguments are
 *      passed on, except that on success the byte type of the
 *      colon (ASCII_COLON) is set to BT_COLON in the new encoding.
 *
 *      Returns the new encoding, or 0 if XmlInitUnknownEncoding
 *      failed.
 */

ENCODING * XmlInitUnknownEncodingNS(void *mem,
                                    int *table,
                                    int (* EXPATENTRY convert) (void *userData, const char *p),
                                    void *userData)
{
    ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);

    if (enc)
        ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
    return enc;
}

#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: