Context Navigation

source: trunk/src/helpers/xmltok.c@ 97

Visit:

Last change on this file since 97 was 97, checked in by umoeller, 24 years ago
XML updates.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 45.7 KB

Line
1	/*
2	* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3	* See the file COPYING for copying permission.
4	*/
5
6	/* #ifdef COMPILED_FROM_DSP
7	* # include "winconfig.h"
8	* #else
9	* # include <config.h>
10	* #endif
11	*/
12
13	#include <memory.h>
14
15	#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
16
17	#pragma info(norea, nogen)
18	// disable "statement unreachable" and "missing break statement"
19	// this code generates those options HEAVILY
20
21	#include "expat\xmltok.h"
22	#include "expat\nametab.h"
23
/* Extra scanner slot that only exists when DTD support is compiled in. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/*
 * Initializer fragment listing the tokenizer functions of one
 * instantiation; PREFIX() supplies the per-encoding name prefix.
 * It is used as the leading part of the ENCODING initializers in
 * this file.
 */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 plus the two converters carrying the same prefix. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
45
/*
 * Look up the UCS-2 character with high byte hi and low byte lo in
 * namingBitmap. pages[hi] selects a group of eight 32-bit words,
 * the top 3 bits of lo pick the word and the low 5 bits pick the bit.
 */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1 << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1 << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n; any length other than 2 or 3
 * yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))

/*
 * Nonzero for 3-byte sequences that must be rejected: a lead byte 0xED
 * followed by a byte with bit 0x20 set (the encoded surrogate range),
 * and EF BF BE / EF BF BF (U+FFFE, U+FFFF).
 * p must point to unsigned char. The argument is fully parenthesised
 * so that pointer expressions such as buf + 1 expand correctly.
 */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Nonzero for 4-byte sequences led by 0xF4 whose second byte has any of
 * the 0x30 bits set, i.e. values above U+10FFFF. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
86
87	static
88	int isNever(const ENCODING * enc, const char *p)
89	{
90	return 0;
91	}
92
93	static
94	int utf8_isName2(const ENCODING * enc, const char *p)
95	{
96	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
97	}
98
99	static
100	int utf8_isName3(const ENCODING * enc, const char *p)
101	{
102	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
103	}
104
105	#define utf8_isName4 isNever
106
107	static
108	int utf8_isNmstrt2(const ENCODING * enc, const char *p)
109	{
110	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
111	}
112
113	static
114	int utf8_isNmstrt3(const ENCODING * enc, const char *p)
115	{
116	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
117	}
118
119	#define utf8_isNmstrt4 isNever
120
121	#define utf8_isInvalid2 isNever
122
123	static
124	int utf8_isInvalid3(const ENCODING * enc, const char *p)
125	{
126	return UTF8_INVALID3((const unsigned char *)p);
127	}
128
129	static
130	int utf8_isInvalid4(const ENCODING * enc, const char *p)
131	{
132	return UTF8_INVALID4((const unsigned char *)p);
133	}
134
135	struct normal_encoding
136	{
137	ENCODING enc;
138	unsigned char type[256];
139	#ifdef XML_MIN_SIZE
140	int (byteType) (const ENCODING , const char *);
141	int (isNameMin) (const ENCODING , const char *);
142	int (isNmstrtMin) (const ENCODING , const char *);
143	int (byteToAscii) (const ENCODING , const char *);
144	int (charMatches) (const ENCODING , const char *, int);
145	#endif /* XML_MIN_SIZE */
146	int (isName2) (const ENCODING , const char *);
147	int (isName3) (const ENCODING , const char *);
148	int (isName4) (const ENCODING , const char *);
149	int (isNmstrt2) (const ENCODING , const char *);
150	int (isNmstrt3) (const ENCODING , const char *);
151	int (isNmstrt4) (const ENCODING , const char *);
152	int (isInvalid2) (const ENCODING , const char *);
153	int (isInvalid3) (const ENCODING , const char *);
154	int (isInvalid4) (const ENCODING , const char *);
155	};
156
/*
 * Initializer fragments for the function-pointer members of
 * struct normal_encoding; E is the name prefix of the functions.
 * STANDARD_VTABLE ends with a comma so that NORMAL_VTABLE can follow it
 * directly, and expands to nothing when XML_MIN_SIZE is not defined.
 */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
  E ## byteType, \
  E ## isNameMin, \
  E ## isNmstrtMin, \
  E ## byteToAscii, \
  E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

#define NORMAL_VTABLE(E) \
  E ## isName2, \
  E ## isName3, \
  E ## isName4, \
  E ## isNmstrt2, \
  E ## isNmstrt3, \
  E ## isNmstrt4, \
  E ## isInvalid2, \
  E ## isInvalid3, \
  E ## isInvalid4
182
183	static int checkCharRefNumber(int);
184
185	#include "expat\xmltok_impl.h"
186	#include "expat\ascii.h"
187
/* Single-byte encodings have no "minimum size" name characters beyond
 * what the type table covers. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE for the vtable of the small build. */
static int sb_byteType(const ENCODING *enc, const char *p)
{
    return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
  (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
/* For single-byte encodings the byte itself is returned. */
static int sb_byteToAscii(const ENCODING *enc, const char *p)
{
    (void)enc;
    return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
226
/* Multi-byte character classes are dispatched through the vtable; n is
 * the sequence length (2, 3 or 4) and selects the member by name. */
#define IS_NAME_CHAR(enc, p, n) \
  (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
  (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
  (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
  (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
  (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* Nonzero if the byte at p equals c. */
static int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
    (void)enc;
    return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
256
257	#define PREFIX(ident) normal_ ## ident
258	#include "xmltok_impl.c"
259
260	#undef MINBPC
261	#undef BYTE_TYPE
262	#undef BYTE_TO_ASCII
263	#undef CHAR_MATCHES
264	#undef IS_NAME_CHAR
265	#undef IS_NAME_CHAR_MINBPC
266	#undef IS_NMSTRT_CHAR
267	#undef IS_NMSTRT_CHAR_MINBPC
268	#undef IS_INVALID_CHAR
269
enum
{
    /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
};
277
278	static void utf8_toUtf8(const ENCODING * enc,
279	const char **fromP,
280	const char *fromLim,
281	char **toP,
282	const char *toLim)
283	{
284	char *to;
285	const char *from;
286
287	if (fromLim - fromP > toLim - toP)
288	{
289	/* Avoid copying partial characters. */
290	for (fromLim = fromP + (toLim - toP);
291	fromLim > *fromP;
292	fromLim--)
293	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
294	break;
295	}
296	for (to = toP, from = fromP;
297	from != fromLim;
298	from++, to++)
299	to = from;
300	*fromP = from;
301	*toP = to;
302	}
303
304	static void utf8_toUtf16(const ENCODING * enc,
305	const char **fromP,
306	const char *fromLim,
307	unsigned short **toP,
308	const unsigned short *toLim)
309	{
310	unsigned short to = toP;
311	const char from = fromP;
312
313	while (from != fromLim && to != toLim)
314	{
315	switch (((struct normal_encoding )enc)->type[(unsigned char)from])
316	{
317	case BT_LEAD2:
318	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
319	from += 2;
320	break;
321	case BT_LEAD3:
322	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
323	from += 3;
324	break;
325	case BT_LEAD4:
326	{
327	unsigned long n;
328
329	if (to + 1 == toLim)
330	break;
331	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
332	n -= 0x10000;
333	to[0] = (unsigned short)((n >> 10) \| 0xD800);
334	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
335	to += 2;
336	from += 4;
337	}
338	break;
339	default:
340	to++ = from++;
341	break;
342	}
343	}
344	*fromP = from;
345	*toP = to;
346	}
347
348	#ifdef XML_NS
349	static const struct normal_encoding utf8_encoding_ns =
350	{
351	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
352	{
353	#include "expat\asciitab.h"
354	#include "expat\utf8tab.h"
355	},
356	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
357	};
358
359	#endif
360
361	static const struct normal_encoding utf8_encoding =
362	{
363	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
364	{
365	#define BT_COLON BT_NMSTRT
366	#include "expat\asciitab.h"
367	#undef BT_COLON
368	#include "expat\utf8tab.h"
369	},
370	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
371	};
372
373	#ifdef XML_NS
374
375	static const struct normal_encoding internal_utf8_encoding_ns =
376	{
377	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
378	{
379	#include "expat\iasciitab.h"
380	#include "expat\utf8tab.h"
381	},
382	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
383	};
384
385	#endif
386
387	static const struct normal_encoding internal_utf8_encoding =
388	{
389	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
390	{
391	#define BT_COLON BT_NMSTRT
392	#include "expat\iasciitab.h"
393	#undef BT_COLON
394	#include "expat\utf8tab.h"
395	},
396	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
397	};
398
399	static void latin1_toUtf8(const ENCODING * enc,
400	const char **fromP,
401	const char *fromLim,
402	char **toP,
403	const char *toLim)
404	{
405	for (;;)
406	{
407	unsigned char c;
408
409	if (*fromP == fromLim)
410	break;
411	c = (unsigned char)**fromP;
412	if (c & 0x80)
413	{
414	if (toLim - *toP < 2)
415	break;
416	(toP)++ = ((c >> 6) \| UTF8_cval2);
417	(toP)++ = ((c & 0x3f) \| 0x80);
418	(*fromP)++;
419	}
420	else
421	{
422	if (*toP == toLim)
423	break;
424	(toP)++ = (fromP)++;
425	}
426	}
427	}
428
429	static void latin1_toUtf16(const ENCODING * enc,
430	const char **fromP,
431	const char *fromLim,
432	unsigned short **toP,
433	const unsigned short *toLim)
434	{
435	while (fromP != fromLim && toP != toLim)
436	(toP)++ = (unsigned char)(fromP)++;
437	}
438
439	#ifdef XML_NS
440
441	static const struct normal_encoding latin1_encoding_ns =
442	{
443	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
444	{
445	#include "expat\asciitab.h"
446	#include "expat\latin1tab.h"
447	},
448	STANDARD_VTABLE(sb_)
449	};
450
451	#endif
452
453	static const struct normal_encoding latin1_encoding =
454	{
455	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
456	{
457	#define BT_COLON BT_NMSTRT
458	#include "expat\asciitab.h"
459	#undef BT_COLON
460	#include "expat\latin1tab.h"
461	},
462	STANDARD_VTABLE(sb_)
463	};
464
465	static void ascii_toUtf8(const ENCODING * enc,
466	const char **fromP,
467	const char *fromLim,
468	char **toP,
469	const char *toLim)
470	{
471	while (fromP != fromLim && toP != toLim)
472	(toP)++ = (fromP)++;
473	}
474
475	#ifdef XML_NS
476
477	static const struct normal_encoding ascii_encoding_ns =
478	{
479	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
480	{
481	#include "expat\asciitab.h"
482	/* BT_NONXML == 0 */
483	},
484	STANDARD_VTABLE(sb_)
485	};
486
487	#endif
488
489	static const struct normal_encoding ascii_encoding =
490	{
491	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
492	{
493	#define BT_COLON BT_NMSTRT
494	#include "expat\asciitab.h"
495	#undef BT_COLON
496	/* BT_NONXML == 0 */
497	},
498	STANDARD_VTABLE(sb_)
499	};
500
501	static int unicode_byte_type(char hi, char lo)
502	{
503	switch ((unsigned char)hi)
504	{
505	case 0xD8:
506	case 0xD9:
507	case 0xDA:
508	case 0xDB:
509	return BT_LEAD4;
510	case 0xDC:
511	case 0xDD:
512	case 0xDE:
513	case 0xDF:
514	return BT_TRAIL;
515	case 0xFF:
516	switch ((unsigned char)lo)
517	{
518	case 0xFF:
519	case 0xFE:
520	return BT_NONXML;
521	}
522	break;
523	}
524	return BT_NONASCII;
525	}
526
/*
 * Defines E##toUtf8, which converts 16-bit units (read through GET_LO /
 * GET_HI, so the byte order is fixed at the point of expansion) into
 * UTF-8. Before each character the remaining output space is checked;
 * if it is too small, *fromP is set to the current unit and the
 * function returns. *toP is advanced as bytes are written.
 * NOTE(review): the surrogate case reads the following unit without
 * comparing against fromLim -- confirm callers never pass a lone
 * leading surrogate at the end of the input.
 */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
589
/*
 * Defines E##toUtf16, which copies 16-bit units from the byte stream at
 * *fromP into native unsigned shorts at *toP, assembling each unit with
 * GET_HI / GET_LO. If the output is smaller than the input and the last
 * input unit is the first half of a surrogate pair, that unit is left
 * out so that a pair is never split.
 */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
603
604	#define SET2(ptr, ch) \
605	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
606	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
607	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
608
609	DEFINE_UTF16_TO_UTF8(little2_)
610	DEFINE_UTF16_TO_UTF16(little2_)
611
612	#undef SET2
613	#undef GET_LO
614	#undef GET_HI
615
616	#define SET2(ptr, ch) \
617	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
618	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
619	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
620
621	DEFINE_UTF16_TO_UTF8(big2_)
622	DEFINE_UTF16_TO_UTF16(big2_)
623
624	#undef SET2
625	#undef GET_LO
626	#undef GET_HI
627
628	#define LITTLE2_BYTE_TYPE(enc, p) \
629	((p)[1] == 0 \
630	? ((struct normal_encoding )(enc))->type[(unsigned char)(p)] \
631	: unicode_byte_type((p)[1], (p)[0]))
632	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
633	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
634	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
635	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
636	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
637	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
638
639	#ifdef XML_MIN_SIZE
640
641	static
642	int little2_byteType(const ENCODING * enc, const char *p)
643	{
644	return LITTLE2_BYTE_TYPE(enc, p);
645	}
646
647	static int little2_byteToAscii(const ENCODING * enc, const char *p)
648	{
649	return LITTLE2_BYTE_TO_ASCII(enc, p);
650	}
651
652	static int little2_charMatches(const ENCODING * enc, const char *p, int c)
653	{
654	return LITTLE2_CHAR_MATCHES(enc, p, c);
655	}
656
657	static int little2_isNameMin(const ENCODING * enc, const char *p)
658	{
659	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
660	}
661
662	static int little2_isNmstrtMin(const ENCODING * enc, const char *p)
663	{
664	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
665	}
666
667	#undef VTABLE
668	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
669
670	#else /* not XML_MIN_SIZE */
671
672	#undef PREFIX
673	#define PREFIX(ident) little2_ ## ident
674	#define MINBPC(enc) 2
675	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
676	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
677	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
678	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
679	#define IS_NAME_CHAR(enc, p, n) 0
680	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
681	#define IS_NMSTRT_CHAR(enc, p, n) (0)
682	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
683
684	#include "xmltok_impl.c"
685
686	#undef MINBPC
687	#undef BYTE_TYPE
688	#undef BYTE_TO_ASCII
689	#undef CHAR_MATCHES
690	#undef IS_NAME_CHAR
691	#undef IS_NAME_CHAR_MINBPC
692	#undef IS_NMSTRT_CHAR
693	#undef IS_NMSTRT_CHAR_MINBPC
694	#undef IS_INVALID_CHAR
695
696	#endif /* not XML_MIN_SIZE */
697
698	#ifdef XML_NS
699
700	static const struct normal_encoding little2_encoding_ns =
701	{
702	{VTABLE, 2, 0,
703	#if XML_BYTE_ORDER == 12
704	1
705	#else
706	0
707	#endif
708	},
709	{
710	#include "expat\asciitab.h"
711	#include "expat\latin1tab.h"
712	},
713	STANDARD_VTABLE(little2_)
714	};
715
716	#endif
717
718	static const struct normal_encoding little2_encoding =
719	{
720	{VTABLE, 2, 0,
721	#if XML_BYTE_ORDER == 12
722	1
723	#else
724	0
725	#endif
726	},
727	{
728	#define BT_COLON BT_NMSTRT
729	#include "expat\asciitab.h"
730	#undef BT_COLON
731	#include "expat\latin1tab.h"
732	},
733	STANDARD_VTABLE(little2_)
734	};
735
736	#if XML_BYTE_ORDER != 21
737
738	#ifdef XML_NS
739
740	static const struct normal_encoding internal_little2_encoding_ns =
741	{
742	{VTABLE, 2, 0, 1},
743	{
744	#include "expat\iasciitab.h"
745	#include "expat\latin1tab.h"
746	},
747	STANDARD_VTABLE(little2_)
748	};
749
750	#endif
751
752	static const struct normal_encoding internal_little2_encoding =
753	{
754	{VTABLE, 2, 0, 1},
755	{
756	#define BT_COLON BT_NMSTRT
757	#include "expat\iasciitab.h"
758	#undef BT_COLON
759	#include "expat\latin1tab.h"
760	},
761	STANDARD_VTABLE(little2_)
762	};
763
764	#endif
765
766
767	#define BIG2_BYTE_TYPE(enc, p) \
768	((p)[0] == 0 \
769	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
770	: unicode_byte_type((p)[0], (p)[1]))
771	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
772	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
773	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
774	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
775	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
776	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
777
778	#ifdef XML_MIN_SIZE
779
780	static int big2_byteType(const ENCODING * enc, const char *p)
781	{
782	return BIG2_BYTE_TYPE(enc, p);
783	}
784
785	static int big2_byteToAscii(const ENCODING * enc, const char *p)
786	{
787	return BIG2_BYTE_TO_ASCII(enc, p);
788	}
789
790	static int big2_charMatches(const ENCODING * enc, const char *p, int c)
791	{
792	return BIG2_CHAR_MATCHES(enc, p, c);
793	}
794
795	static int big2_isNameMin(const ENCODING * enc, const char *p)
796	{
797	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
798	}
799
800	static int big2_isNmstrtMin(const ENCODING * enc, const char *p)
801	{
802	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
803	}
804
805	#undef VTABLE
806	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
807
808	#else /* not XML_MIN_SIZE */
809
810	#undef PREFIX
811	#define PREFIX(ident) big2_ ## ident
812	#define MINBPC(enc) 2
813	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
814	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
815	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
816	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
817	#define IS_NAME_CHAR(enc, p, n) 0
818	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
819	#define IS_NMSTRT_CHAR(enc, p, n) (0)
820	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
821
822	#include "xmltok_impl.c"
823
824	#undef MINBPC
825	#undef BYTE_TYPE
826	#undef BYTE_TO_ASCII
827	#undef CHAR_MATCHES
828	#undef IS_NAME_CHAR
829	#undef IS_NAME_CHAR_MINBPC
830	#undef IS_NMSTRT_CHAR
831	#undef IS_NMSTRT_CHAR_MINBPC
832	#undef IS_INVALID_CHAR
833
834	#endif /* not XML_MIN_SIZE */
835
836	#ifdef XML_NS
837
838	static const struct normal_encoding big2_encoding_ns =
839	{
840	{VTABLE, 2, 0,
841	#if XML_BYTE_ORDER == 21
842	1
843	#else
844	0
845	#endif
846	},
847	{
848	#include "expat\asciitab.h"
849	#include "expat\latin1tab.h"
850	},
851	STANDARD_VTABLE(big2_)
852	};
853
854	#endif
855
856	static const struct normal_encoding big2_encoding =
857	{
858	{VTABLE, 2, 0,
859	#if XML_BYTE_ORDER == 21
860	1
861	#else
862	0
863	#endif
864	},
865	{
866	#define BT_COLON BT_NMSTRT
867	#include "expat\asciitab.h"
868	#undef BT_COLON
869	#include "expat\latin1tab.h"
870	},
871	STANDARD_VTABLE(big2_)
872	};
873
874	#if XML_BYTE_ORDER != 12
875
876	#ifdef XML_NS
877
878	static const struct normal_encoding internal_big2_encoding_ns =
879	{
880	{VTABLE, 2, 0, 1},
881	{
882	#include "expat\iasciitab.h"
883	#include "expat\latin1tab.h"
884	},
885	STANDARD_VTABLE(big2_)
886	};
887
888	#endif
889
890	static const struct normal_encoding internal_big2_encoding =
891	{
892	{VTABLE, 2, 0, 1},
893	{
894	#define BT_COLON BT_NMSTRT
895	#include "expat\iasciitab.h"
896	#undef BT_COLON
897	#include "expat\latin1tab.h"
898	},
899	STANDARD_VTABLE(big2_)
900	};
901
902	#endif
903
904	#undef PREFIX
905
906	static int streqci(const char s1, const char s2)
907	{
908	for (;;)
909	{
910	char c1 = *s1++;
911	char c2 = *s2++;
912
913	if (ASCII_a <= c1 && c1 <= ASCII_z)
914	c1 += ASCII_A - ASCII_a;
915	if (ASCII_a <= c2 && c2 <= ASCII_z)
916	c2 += ASCII_A - ASCII_a;
917	if (c1 != c2)
918	return 0;
919	if (!c1)
920	break;
921	}
922	return 1;
923	}
924
925	static void initUpdatePosition(const ENCODING * enc, const char *ptr,
926	const char end, POSITION pos)
927	{
928	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
929	}
930
931	static int toAscii(const ENCODING * enc, const char ptr, const char end)
932	{
933	char buf[1];
934	char *p = buf;
935
936	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
937	if (p == buf)
938	return -1;
939	else
940	return buf[0];
941	}
942
/* Return 1 if c is space, carriage return, line feed or tab, else 0. */
static int isSpace(int c)
{
    switch (c)
    {
    case 0x20:
    case 0xD:
    case 0xA:
    case 0x9:
        return 1;
    default:
        return 0;
    }
}
955
956	/* Return 1 if there's just optional white space
957	* or there's an S followed by name=val. */
958	static int parsePseudoAttribute(const ENCODING * enc,
959	const char *ptr,
960	const char *end,
961	const char **namePtr,
962	const char **nameEndPtr,
963	const char **valPtr,
964	const char **nextTokPtr)
965	{
966	int c;
967	char open;
968
969	if (ptr == end)
970	{
971	*namePtr = 0;
972	return 1;
973	}
974	if (!isSpace(toAscii(enc, ptr, end)))
975	{
976	*nextTokPtr = ptr;
977	return 0;
978	}
979	do
980	{
981	ptr += enc->minBytesPerChar;
982	}
983	while (isSpace(toAscii(enc, ptr, end)));
984	if (ptr == end)
985	{
986	*namePtr = 0;
987	return 1;
988	}
989	*namePtr = ptr;
990	for (;;)
991	{
992	c = toAscii(enc, ptr, end);
993	if (c == -1)
994	{
995	*nextTokPtr = ptr;
996	return 0;
997	}
998	if (c == ASCII_EQUALS)
999	{
1000	*nameEndPtr = ptr;
1001	break;
1002	}
1003	if (isSpace(c))
1004	{
1005	*nameEndPtr = ptr;
1006	do
1007	{
1008	ptr += enc->minBytesPerChar;
1009	}
1010	while (isSpace(c = toAscii(enc, ptr, end)));
1011	if (c != ASCII_EQUALS)
1012	{
1013	*nextTokPtr = ptr;
1014	return 0;
1015	}
1016	break;
1017	}
1018	ptr += enc->minBytesPerChar;
1019	}
1020	if (ptr == *namePtr)
1021	{
1022	*nextTokPtr = ptr;
1023	return 0;
1024	}
1025	ptr += enc->minBytesPerChar;
1026	c = toAscii(enc, ptr, end);
1027	while (isSpace(c))
1028	{
1029	ptr += enc->minBytesPerChar;
1030	c = toAscii(enc, ptr, end);
1031	}
1032	if (c != ASCII_QUOT && c != ASCII_APOS)
1033	{
1034	*nextTokPtr = ptr;
1035	return 0;
1036	}
1037	open = c;
1038	ptr += enc->minBytesPerChar;
1039	*valPtr = ptr;
1040	for (;; ptr += enc->minBytesPerChar)
1041	{
1042	c = toAscii(enc, ptr, end);
1043	if (c == open)
1044	break;
1045	if (!(ASCII_a <= c && c <= ASCII_z)
1046	&& !(ASCII_A <= c && c <= ASCII_Z)
1047	&& !(ASCII_0 <= c && c <= ASCII_9)
1048	&& c != ASCII_PERIOD
1049	&& c != ASCII_MINUS
1050	&& c != ASCII_UNDERSCORE)
1051	{
1052	*nextTokPtr = ptr;
1053	return 0;
1054	}
1055	}
1056	*nextTokPtr = ptr + enc->minBytesPerChar;
1057	return 1;
1058	}
1059
1060	static const char KW_version[] =
1061	{
1062	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1063	};
1064
1065	static const char KW_encoding[] =
1066	{
1067	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1068	};
1069
1070	static const char KW_standalone[] =
1071	{
1072	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1073	};
1074
1075	static const char KW_yes[] =
1076	{
1077	ASCII_y, ASCII_e, ASCII_s, '\0'
1078	};
1079
1080	static const char KW_no[] =
1081	{
1082	ASCII_n, ASCII_o, '\0'
1083	};
1084
1085	static int doParseXmlDecl(const ENCODING * (encodingFinder) (const ENCODING ,
1086	const char *,
1087	const char *),
1088	int isGeneralTextEntity,
1089	const ENCODING * enc,
1090	const char *ptr,
1091	const char *end,
1092	const char **badPtr,
1093	const char **versionPtr,
1094	const char **versionEndPtr,
1095	const char **encodingName,
1096	const ENCODING ** encoding,
1097	int *standalone)
1098	{
1099	const char *val = 0;
1100	const char *name = 0;
1101	const char *nameEnd = 0;
1102
1103	ptr += 5 * enc->minBytesPerChar;
1104	end -= 2 * enc->minBytesPerChar;
1105	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) \|\| !name)
1106	{
1107	*badPtr = ptr;
1108	return 0;
1109	}
1110	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1111	{
1112	if (!isGeneralTextEntity)
1113	{
1114	*badPtr = name;
1115	return 0;
1116	}
1117	}
1118	else
1119	{
1120	if (versionPtr)
1121	*versionPtr = val;
1122	if (versionEndPtr)
1123	*versionEndPtr = ptr;
1124	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1125	{
1126	*badPtr = ptr;
1127	return 0;
1128	}
1129	if (!name)
1130	{
1131	if (isGeneralTextEntity)
1132	{
1133	/* a TextDecl must have an EncodingDecl */
1134	*badPtr = ptr;
1135	return 0;
1136	}
1137	return 1;
1138	}
1139	}
1140	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1141	{
1142	int c = toAscii(enc, val, end);
1143
1144	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1145	{
1146	*badPtr = val;
1147	return 0;
1148	}
1149	if (encodingName)
1150	*encodingName = val;
1151	if (encoding)
1152	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1153	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1154	{
1155	*badPtr = ptr;
1156	return 0;
1157	}
1158	if (!name)
1159	return 1;
1160	}
1161	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) \|\| isGeneralTextEntity)
1162	{
1163	*badPtr = name;
1164	return 0;
1165	}
1166	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1167	{
1168	if (standalone)
1169	*standalone = 1;
1170	}
1171	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1172	{
1173	if (standalone)
1174	*standalone = 0;
1175	}
1176	else
1177	{
1178	*badPtr = val;
1179	return 0;
1180	}
1181	while (isSpace(toAscii(enc, ptr, end)))
1182	ptr += enc->minBytesPerChar;
1183	if (ptr != end)
1184	{
1185	*badPtr = ptr;
1186	return 0;
1187	}
1188	return 1;
1189	}
1190
1191	static int checkCharRefNumber(int result)
1192	{
1193	switch (result >> 8)
1194	{
1195	case 0xD8:
1196	case 0xD9:
1197	case 0xDA:
1198	case 0xDB:
1199	case 0xDC:
1200	case 0xDD:
1201	case 0xDE:
1202	case 0xDF:
1203	return -1;
1204	case 0:
1205	if (latin1_encoding.type[result] == BT_NONXML)
1206	return -1;
1207	break;
1208	case 0xFF:
1209	if (result == 0xFFFE \|\| result == 0xFFFF)
1210	return -1;
1211	break;
1212	}
1213	return result;
1214	}
1215
1216	int XmlUtf8Encode(int c, char *buf)
1217	{
1218	enum
1219	{
1220	/* minN is minimum legal resulting value for N byte sequence */
1221	min2 = 0x80,
1222	min3 = 0x800,
1223	min4 = 0x10000
1224	};
1225
1226	if (c < 0)
1227	return 0;
1228	if (c < min2)
1229	{
1230	buf[0] = (c \| UTF8_cval1);
1231	return 1;
1232	}
1233	if (c < min3)
1234	{
1235	buf[0] = ((c >> 6) \| UTF8_cval2);
1236	buf[1] = ((c & 0x3f) \| 0x80);
1237	return 2;
1238	}
1239	if (c < min4)
1240	{
1241	buf[0] = ((c >> 12) \| UTF8_cval3);
1242	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
1243	buf[2] = ((c & 0x3f) \| 0x80);
1244	return 3;
1245	}
1246	if (c < 0x110000)
1247	{
1248	buf[0] = ((c >> 18) \| UTF8_cval4);
1249	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
1250	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
1251	buf[3] = ((c & 0x3f) \| 0x80);
1252	return 4;
1253	}
1254	return 0;
1255	}
1256
/*
 * XmlUtf16Encode:
 *      writes the UTF-16 encoding of code point charNum into buf and
 *      returns the number of 16-bit units written: 1 for values below
 *      0x10000, 2 (a surrogate pair) for values below 0x110000.
 *      Returns 0, writing nothing, for negative or larger values.
 */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    if (charNum < 0)
        return 0;
    if (charNum < 0x10000)
    {
        buf[0] = (unsigned short)charNum;
        return 1;
    }
    if (charNum < 0x110000)
    {
        charNum -= 0x10000;
        buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
        buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
        return 2;
    }
    return 0;
}
1275
1276	struct unknown_encoding
1277	{
1278	struct normal_encoding normal;
1279	int (convert) (void userData, const char *p);
1280	void *userData;
1281	unsigned short utf16[256];
1282	char utf8[256][4];
1283	};
1284
1285	int XmlSizeOfUnknownEncoding(void)
1286	{
1287	return sizeof(struct unknown_encoding);
1288	}
1289
1290	static int unknown_isName(const ENCODING * enc, const char *p)
1291	{
1292	int c = ((const struct unknown_encoding *)enc)
1293	->convert(((const struct unknown_encoding *)enc)->userData, p);
1294
1295	if (c & ~0xFFFF)
1296	return 0;
1297	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1298	}
1299
1300	static int unknown_isNmstrt(const ENCODING * enc, const char *p)
1301	{
1302	int c = ((const struct unknown_encoding *)enc)
1303	->convert(((const struct unknown_encoding *)enc)->userData, p);
1304
1305	if (c & ~0xFFFF)
1306	return 0;
1307	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1308	}
1309
1310	static int unknown_isInvalid(const ENCODING * enc, const char *p)
1311	{
1312	int c = ((const struct unknown_encoding *)enc)
1313	->convert(((const struct unknown_encoding *)enc)->userData, p);
1314
1315	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1316	}
1317
1318	static void unknown_toUtf8(const ENCODING * enc,
1319	const char **fromP,
1320	const char *fromLim,
1321	char **toP,
1322	const char *toLim)
1323	{
1324	char buf[XML_UTF8_ENCODE_MAX];
1325
1326	for (;;)
1327	{
1328	const char *utf8;
1329	int n;
1330
1331	if (*fromP == fromLim)
1332	break;
1333	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
1334	n = *utf8++;
1335	if (n == 0)
1336	{
1337	int c = ((const struct unknown_encoding *)enc)
1338	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1339
1340	n = XmlUtf8Encode(c, buf);
1341	if (n > toLim - *toP)
1342	break;
1343	utf8 = buf;
1344	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1345	- (BT_LEAD2 - 2);
1346	}
1347	else
1348	{
1349	if (n > toLim - *toP)
1350	break;
1351	(*fromP)++;
1352	}
1353	do
1354	{
1355	(toP)++ = *utf8++;
1356	}
1357	while (--n != 0);
1358	}
1359	}
1360
1361	static void unknown_toUtf16(const ENCODING * enc,
1362	const char **fromP,
1363	const char *fromLim,
1364	unsigned short **toP,
1365	const unsigned short *toLim)
1366	{
1367	while (fromP != fromLim && toP != toLim)
1368	{
1369	unsigned short c
1370	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
1371
1372	if (c == 0)
1373	{
1374	c = (unsigned short)((const struct unknown_encoding *)enc)
1375	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1376	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1377	- (BT_LEAD2 - 2);
1378	}
1379	else
1380	(*fromP)++;
1381	(toP)++ = c;
1382	}
1383	}
1384
1385	/*
1386	*@@ XmlInitUnknownEncoding:
1387	*
1388	*@@changed V0.9.14 (2001-08-09) [umoeller]: couple of performance hacks
1389	*/
1390
1391	ENCODING* XmlInitUnknownEncoding(void *mem,
1392	int *table,
1393	int (convert) (void userData, const char *p),
1394	void *userData)
1395	{
1396	int i;
1397	struct unknown_encoding *e = mem;
1398
1399	// gee, isn't this a regular memcpy?!?
1400	/* for (i = 0;
1401	i < (int)sizeof(struct normal_encoding);
1402	i++)
1403	((char )mem)[i] = ((char )&latin1_encoding)[i]; */
1404
1405	// replaced the above with this V0.9.14 (2001-08-09) [umoeller]
1406	memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1407
1408	for (i = 0; i < 128; i++)
1409	if ( latin1_encoding.type[i] != BT_OTHER
1410	&& latin1_encoding.type[i] != BT_NONXML
1411	&& table[i] != i
1412	)
1413	return 0;
1414
1415	for (i = 0; i < 256; i++)
1416	{
1417	int c = table[i];
1418
1419	if (c == -1)
1420	{
1421	e->normal.type[i] = BT_MALFORM;
1422	/* This shouldn't really get used. */
1423	e->utf16[i] = 0xFFFF;
1424	e->utf8[i][0] = 1;
1425	e->utf8[i][1] = 0;
1426	}
1427	else if (c < 0)
1428	{
1429	if (c < -4)
1430	return 0;
1431	e->normal.type[i] = BT_LEAD2 - (c + 2);
1432	e->utf8[i][0] = 0;
1433	e->utf16[i] = 0;
1434	}
1435	else if (c < 0x80)
1436	{
1437	if ( latin1_encoding.type[c] != BT_OTHER
1438	&& latin1_encoding.type[c] != BT_NONXML
1439	&& c != i
1440	)
1441	return 0;
1442	e->normal.type[i] = latin1_encoding.type[c];
1443	e->utf8[i][0] = 1;
1444	e->utf8[i][1] = (char)c;
1445	e->utf16[i] = c == 0 ? 0xFFFF : c;
1446	}
1447	else if (checkCharRefNumber(c) < 0)
1448	{
1449	e->normal.type[i] = BT_NONXML;
1450	/* This shouldn't really get used. */
1451	e->utf16[i] = 0xFFFF;
1452	e->utf8[i][0] = 1;
1453	e->utf8[i][1] = 0;
1454	}
1455	else
1456	{
1457	if (c > 0xFFFF)
1458	return 0;
1459	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1460	e->normal.type[i] = BT_NMSTRT;
1461	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1462	e->normal.type[i] = BT_NAME;
1463	else
1464	e->normal.type[i] = BT_OTHER;
1465	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1466	e->utf16[i] = c;
1467	}
1468	}
1469	e->userData = userData;
1470	e->convert = convert;
1471	if (convert)
1472	{
1473	e->normal.isName2 = unknown_isName;
1474	e->normal.isName3 = unknown_isName;
1475	e->normal.isName4 = unknown_isName;
1476	e->normal.isNmstrt2 = unknown_isNmstrt;
1477	e->normal.isNmstrt3 = unknown_isNmstrt;
1478	e->normal.isNmstrt4 = unknown_isNmstrt;
1479	e->normal.isInvalid2 = unknown_isInvalid;
1480	e->normal.isInvalid3 = unknown_isInvalid;
1481	e->normal.isInvalid4 = unknown_isInvalid;
1482	}
1483	e->normal.enc.utf8Convert = unknown_toUtf8;
1484	e->normal.enc.utf16Convert = unknown_toUtf16;
1485	return &(e->normal.enc);
1486	}
1487
/* Indices of the built-in encodings.
 * If this enumeration is changed, getEncodingIndex and encodings
 * must also be changed. */
enum
{
    UNKNOWN_ENC     = -1,
    ISO_8859_1_ENC  = 0,
    US_ASCII_ENC    = 1,
    UTF_8_ENC       = 2,
    UTF_16_ENC      = 3,
    UTF_16BE_ENC    = 4,
    UTF_16LE_ENC    = 5,
    /* must match encodingNames up to here */
    NO_ENC          = 6
};
1502
1503	static const char KW_ISO_8859_1[] =
1504	{
1505	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1506	};
1507	static const char KW_US_ASCII[] =
1508	{
1509	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1510	};
1511	static const char KW_UTF_8[] =
1512	{
1513	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1514	};
1515	static const char KW_UTF_16[] =
1516	{
1517	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1518	};
1519	static const char KW_UTF_16BE[] =
1520	{
1521	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1522	};
1523	static const char KW_UTF_16LE[] =
1524	{
1525	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1526	};
1527
1528	static int getEncodingIndex(const char *name)
1529	{
1530	static const char *encodingNames[] =
1531	{
1532	KW_ISO_8859_1,
1533	KW_US_ASCII,
1534	KW_UTF_8,
1535	KW_UTF_16,
1536	KW_UTF_16BE,
1537	KW_UTF_16LE,
1538	};
1539	int i;
1540
1541	if (name == 0)
1542	return NO_ENC;
1543	for (i = 0;
1544	i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0]));
1545	i++)
1546	if (streqci(name, encodingNames[i]))
1547	return i;
1548	return UNKNOWN_ENC;
1549	}
1550
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 * INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
 * stores index i (converted to char) and evaluates to the stored value. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1556
1557	/* This is what detects the encoding.
1558	* encodingTable maps from encoding indices to encodings;
1559	* INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1560	* state is XML_CONTENT_STATE if we're parsing an external text entity,
1561	* and XML_PROLOG_STATE otherwise.
1562	*/
1563
1564
1565	static int initScan(const ENCODING ** encodingTable,
1566	const INIT_ENCODING * enc,
1567	int state,
1568	const char *ptr,
1569	const char *end,
1570	const char **nextTokPtr)
1571	{
1572	const ENCODING **encPtr;
1573
1574	if (ptr == end)
1575	return XML_TOK_NONE;
1576	encPtr = enc->encPtr;
1577	if (ptr + 1 == end)
1578	{
1579	/* only a single byte available for auto-detection */
1580	#ifndef XML_DTD /* FIXME */
1581	/* a well-formed document entity must have more than one byte */
1582	if (state != XML_CONTENT_STATE)
1583	return XML_TOK_PARTIAL;
1584	#endif
1585	/* so we're parsing an external text entity... */
1586	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1587	switch (INIT_ENC_INDEX(enc))
1588	{
1589	case UTF_16_ENC:
1590	case UTF_16LE_ENC:
1591	case UTF_16BE_ENC:
1592	return XML_TOK_PARTIAL;
1593	}
1594	switch ((unsigned char)*ptr)
1595	{
1596	case 0xFE:
1597	case 0xFF:
1598	case 0xEF: /* possibly first byte of UTF-8 BOM */
1599	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1600	&& state == XML_CONTENT_STATE)
1601	break;
1602	/* fall through */
1603	case 0x00:
1604	case 0x3C:
1605	return XML_TOK_PARTIAL;
1606	}
1607	}
1608	else
1609	{
1610	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1])
1611	{
1612	case 0xFEFF:
1613	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1614	&& state == XML_CONTENT_STATE)
1615	break;
1616	*nextTokPtr = ptr + 2;
1617	*encPtr = encodingTable[UTF_16BE_ENC];
1618	return XML_TOK_BOM;
1619	/* 00 3C is handled in the default case */
1620	case 0x3C00:
1621	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1622	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1623	&& state == XML_CONTENT_STATE)
1624	break;
1625	*encPtr = encodingTable[UTF_16LE_ENC];
1626	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1627	case 0xFFFE:
1628	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1629	&& state == XML_CONTENT_STATE)
1630	break;
1631	*nextTokPtr = ptr + 2;
1632	*encPtr = encodingTable[UTF_16LE_ENC];
1633	return XML_TOK_BOM;
1634	case 0xEFBB:
1635	/* Maybe a UTF-8 BOM (EF BB BF) */
1636	/* If there's an explicitly specified (external) encoding
1637	* of ISO-8859-1 or some flavour of UTF-16
1638	* and this is an external text entity,
1639	* don't look for the BOM,
1640	* because it might be a legal data. */
1641	if (state == XML_CONTENT_STATE)
1642	{
1643	int e = INIT_ENC_INDEX(enc);
1644
1645	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1646	break;
1647	}
1648	if (ptr + 2 == end)
1649	return XML_TOK_PARTIAL;
1650	if ((unsigned char)ptr[2] == 0xBF)
1651	{
1652	*nextTokPtr = ptr + 3;
1653	*encPtr = encodingTable[UTF_8_ENC];
1654	return XML_TOK_BOM;
1655	}
1656	break;
1657	default:
1658	if (ptr[0] == '\0')
1659	{
1660	/* 0 isn't a legal data character. Furthermore a document entity can only
1661	* start with ASCII characters. So the only way this can fail to be big-endian
1662	* UTF-16 if it it's an external parsed general entity that's labelled as
1663	* UTF-16LE. */
1664	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1665	break;
1666	*encPtr = encodingTable[UTF_16BE_ENC];
1667	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1668	}
1669	else if (ptr[1] == '\0')
1670	{
1671	/* We could recover here in the case:
1672	* - parsing an external entity
1673	* - second byte is 0
1674	* - no externally specified encoding
1675	* - no encoding declaration
1676	* by assuming UTF-16LE. But we don't, because this would mean when
1677	* presented just with a single byte, we couldn't reliably determine
1678	* whether we needed further bytes. */
1679	if (state == XML_CONTENT_STATE)
1680	break;
1681	*encPtr = encodingTable[UTF_16LE_ENC];
1682	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1683	}
1684	break;
1685	}
1686	}
1687	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1688	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1689	}
1690
1691
1692	#define NS(x) x
1693	#define ns(x) x
1694	#include "xmltok_ns.c"
1695	#undef NS
1696	#undef ns
1697
1698	#ifdef XML_NS
1699
1700	#define NS(x) x ## NS
1701	#define ns(x) x ## _ns
1702
1703	#include "xmltok_ns.c"
1704
1705	#undef NS
1706	#undef ns
1707
1708	ENCODING * XmlInitUnknownEncodingNS(void *mem,
1709	int *table,
1710	int (convert) (void userData, const char *p),
1711	void *userData)
1712	{
1713	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1714
1715	if (enc)
1716	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1717	return enc;
1718	}
1719
1720	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: