Context Navigation

source: trunk/src/helpers/xmltok.c@ 90

Visit:

Last change on this file since 90 was 75, checked in by umoeller, 24 years ago
Misc changes.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 47.5 KB

Line
1
2	/*
3	*sourcefile xmltok.c
4	* part of the expat implementation. See xmlparse.c.
5	*
6	*/
7
8	/*
9	* Copyright (C) 2001 Ulrich Möller.
10	* Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd.
11	* and Clark Cooper.
12	*
13	* Permission is hereby granted, free of charge, to any person obtaining
14	* a copy of this software and associated documentation files (the
15	* "Software"), to deal in the Software without restriction, including
16	* without limitation the rights to use, copy, modify, merge, publish,
17	* distribute, sublicense, and/or sell copies of the Software, and to
18	* permit persons to whom the Software is furnished to do so, subject to
19	* the following conditions:
20	*
21	* The above copyright notice and this permission notice shall be included
22	* in all copies or substantial portions of the Software.
23	*
24	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27	* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28	* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31	*/
32
33	#include "setup.h"
34
35	#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
36
37	#pragma info(norea, nogen)
38	// disable "statement unreachable" and "missing break statement"
39	// this code generates those options HEAVILY
40
41	#include "expat\xmltok.h"
42	#include "expat\nametab.h"
43
/* Optional extra entry in the first scanner table: the ignore-section
 * tokenizer is only listed when DTD support (XML_DTD) is compiled in. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-generated tokenizer functions.
 * It is spliced into the ENCODING initializers further down in this file,
 * followed by the two conversion routines. */
#define VTABLE1 \
    { PREFIX(prologTok), PREFIX(contentTok), \
      PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
    { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
    PREFIX(sameName), \
    PREFIX(nameMatchesAscii), \
    PREFIX(nameLength), \
    PREFIX(skipS), \
    PREFIX(getAtts), \
    PREFIX(charRefNumber), \
    PREFIX(predefinedEntityName), \
    PREFIX(updatePosition), \
    PREFIX(isPublicId)

/* VTABLE1 plus the PREFIX()-named UTF-8 / UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
65
/* Look up one bit in namingBitmap for the 16-bit character whose high byte
 * is hi and low byte is lo.  pages[hi] selects a group of eight 32-bit
 * words, the top 3 bits of lo select the word and the low 5 bits the bit. */
#define UCS2_GET_NAMING(pages, hi, lo) \
    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1 << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                           + ((((byte)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((byte)[1]) & 3) << 1) \
                  + ((((byte)[2]) >> 5) & 1)] \
     & (1 << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: only 2- and 3-byte sequences can be
 * name characters, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
    ((n) == 2 \
     ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
     : ((n) == 3 \
        ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
        : 0))

/* Non-zero for 3-byte sequences that must be rejected:
 *   ED with bit 0x20 set in the second byte  -> U+D800..U+DFFF (surrogates)
 *   EF BF BE / EF BF BF                      -> U+FFFE / U+FFFF
 * p must point at unsigned char. */
#define UTF8_INVALID3(p) \
    ((*(p)) == 0xED \
     ? (((p)[1] & 0x20) != 0) \
     : ((*(p)) == 0xEF \
        ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
        : 0))

/* Non-zero for 4-byte sequences starting F4 9x/Ax/Bx, i.e. values above
 * U+10FFFF.  p must point at unsigned char. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
106
107	static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
108	{
109	return 0;
110	}
111
112	static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
113	{
114	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
115	}
116
117	static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
118	{
119	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
120	}
121
122	#define utf8_isName4 isNever
123
124	static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
125	{
126	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
127	}
128
129	static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
130	{
131	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
132	}
133
134	#define utf8_isNmstrt4 isNever
135
136	#define utf8_isInvalid2 isNever
137
138	static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
139	{
140	return UTF8_INVALID3((const unsigned char *)p);
141	}
142
143	static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
144	{
145	return UTF8_INVALID4((const unsigned char *)p);
146	}
147
148	struct normal_encoding
149	{
150	ENCODING enc;
151	unsigned char type[256];
152	#ifdef XML_MIN_SIZE
153	int (* EXPATENTRY byteType) (const ENCODING , const char );
154	int (* EXPATENTRY isNameMin) (const ENCODING , const char );
155	int (* EXPATENTRY isNmstrtMin) (const ENCODING , const char );
156	int (* EXPATENTRY byteToAscii) (const ENCODING , const char );
157	int (* EXPATENTRY charMatches) (const ENCODING , const char , int);
158	#endif /* XML_MIN_SIZE */
159	int (* EXPATENTRY isName2) (const ENCODING , const char );
160	int (* EXPATENTRY isName3) (const ENCODING , const char );
161	int (* EXPATENTRY isName4) (const ENCODING , const char );
162	int (* EXPATENTRY isNmstrt2) (const ENCODING , const char );
163	int (* EXPATENTRY isNmstrt3) (const ENCODING , const char );
164	int (* EXPATENTRY isNmstrt4) (const ENCODING , const char );
165	int (* EXPATENTRY isInvalid2) (const ENCODING , const char );
166	int (* EXPATENTRY isInvalid3) (const ENCODING , const char );
167	int (* EXPATENTRY isInvalid4) (const ENCODING , const char );
168	};
169
/* Initializer fragment for the XML_MIN_SIZE-only members of
 * struct normal_encoding; E is the function-name prefix (e.g. sb_).
 * Note the trailing comma: NORMAL_VTABLE may follow directly. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
    E ## byteType, \
    E ## isNameMin, \
    E ## isNmstrtMin, \
    E ## byteToAscii, \
    E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializer fragment for the nine multi-byte classifier members of
 * struct normal_encoding, in declaration order. */
#define NORMAL_VTABLE(E) \
    E ## isName2, \
    E ## isName3, \
    E ## isName4, \
    E ## isNmstrt2, \
    E ## isNmstrt3, \
    E ## isNmstrt4, \
    E ## isInvalid2, \
    E ## isInvalid3, \
    E ## isInvalid4
195
196	static int checkCharRefNumber(int);
197
198	#include "expat\xmltok_impl.h"
199	#include "expat\ascii.h"
200
201	#ifdef XML_MIN_SIZE
202	#define sb_isNameMin isNever
203	#define sb_isNmstrtMin isNever
204	#endif
205
206	#ifdef XML_MIN_SIZE
207	#define MINBPC(enc) ((enc)->minBytesPerChar)
208	#else
209	/* minimum bytes per character */
210	#define MINBPC(enc) 1
211	#endif
212
213	#define SB_BYTE_TYPE(enc, p) \
214	(((struct normal_encoding )(enc))->type[(unsigned char)(p)])
215
216	#ifdef XML_MIN_SIZE
217	static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
218	{
219	return SB_BYTE_TYPE(enc, p);
220	}
221	#define BYTE_TYPE(enc, p) \
222	(((const struct normal_encoding *)(enc))->byteType(enc, p))
223	#else
224	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
225	#endif
226
227	#ifdef XML_MIN_SIZE
228	#define BYTE_TO_ASCII(enc, p) \
229	(((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
230	static int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
231	{
232	return *p;
233	}
234	#else
235	#define BYTE_TO_ASCII(enc, p) (*(p))
236	#endif
237
238	#define IS_NAME_CHAR(enc, p, n) \
239	(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
240	#define IS_NMSTRT_CHAR(enc, p, n) \
241	(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
242	#define IS_INVALID_CHAR(enc, p, n) \
243	(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
244
245	#ifdef XML_MIN_SIZE
246	#define IS_NAME_CHAR_MINBPC(enc, p) \
247	(((const struct normal_encoding *)(enc))->isNameMin(enc, p))
248	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
249	(((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
250	#else
251	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
252	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
253	#endif
254
255	#ifdef XML_MIN_SIZE
256	#define CHAR_MATCHES(enc, p, c) \
257	(((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
258	static int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
259	{
260	return *p == c;
261	}
262	#else
263	/* c is an ASCII character */
264	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
265	#endif
266
267	#define PREFIX(ident) normal_ ## ident
268	#include "xmltok_impl.c"
269
270	#undef MINBPC
271	#undef BYTE_TYPE
272	#undef BYTE_TO_ASCII
273	#undef CHAR_MATCHES
274	#undef IS_NAME_CHAR
275	#undef IS_NAME_CHAR_MINBPC
276	#undef IS_NMSTRT_CHAR
277	#undef IS_NMSTRT_CHAR_MINBPC
278	#undef IS_INVALID_CHAR
279
enum
{   /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
};
287
288	static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
289	const char **fromP,
290	const char *fromLim,
291	char **toP,
292	const char *toLim)
293	{
294	char *to;
295	const char *from;
296
297	if (fromLim - fromP > toLim - toP)
298	{
299	/* Avoid copying partial characters. */
300	for (fromLim = fromP + (toLim - toP); fromLim > *fromP; fromLim--)
301	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
302	break;
303	}
304	for (to = toP, from = fromP; from != fromLim; from++, to++)
305	to = from;
306	*fromP = from;
307	*toP = to;
308	}
309
310	static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
311	const char *fromP, const char fromLim,
312	unsigned short *toP, const unsigned short toLim)
313	{
314	unsigned short to = toP;
315	const char from = fromP;
316
317	while (from != fromLim && to != toLim)
318	{
319	switch (((struct normal_encoding )enc)->type[(unsigned char)from])
320	{
321	case BT_LEAD2:
322	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
323	from += 2;
324	break;
325	case BT_LEAD3:
326	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
327	from += 3;
328	break;
329	case BT_LEAD4:
330	{
331	unsigned long n;
332
333	if (to + 1 == toLim)
334	break;
335	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
336	n -= 0x10000;
337	to[0] = (unsigned short)((n >> 10) \| 0xD800);
338	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
339	to += 2;
340	from += 4;
341	}
342	break;
343	default:
344	to++ = from++;
345	break;
346	}
347	}
348	*fromP = from;
349	*toP = to;
350	}
351
352	#ifdef XML_NS
353	static const struct normal_encoding utf8_encoding_ns =
354	{
355	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
356	{
357	#include "expat\asciitab.h"
358	#include "expat\utf8tab.h"
359	},
360	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
361	};
362
363	#endif
364
365	static const struct normal_encoding utf8_encoding =
366	{
367	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
368	{
369	#define BT_COLON BT_NMSTRT
370	#include "expat\asciitab.h"
371	#undef BT_COLON
372	#include "expat\utf8tab.h"
373	},
374	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
375	};
376
377	#ifdef XML_NS
378
379	static const struct normal_encoding internal_utf8_encoding_ns =
380	{
381	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
382	{
383	#include "expat\iasciitab.h"
384	#include "expat\utf8tab.h"
385	},
386	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
387	};
388
389	#endif
390
391	static const struct normal_encoding internal_utf8_encoding =
392	{
393	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
394	{
395	#define BT_COLON BT_NMSTRT
396	#include "expat\iasciitab.h"
397	#undef BT_COLON
398	#include "expat\utf8tab.h"
399	},
400	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
401	};
402
403	static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
404	const char *fromP, const char fromLim,
405	char *toP, const char toLim)
406	{
407	for (;;)
408	{
409	unsigned char c;
410
411	if (*fromP == fromLim)
412	break;
413	c = (unsigned char)**fromP;
414	if (c & 0x80)
415	{
416	if (toLim - *toP < 2)
417	break;
418	(toP)++ = ((c >> 6) \| UTF8_cval2);
419	(toP)++ = ((c & 0x3f) \| 0x80);
420	(*fromP)++;
421	}
422	else
423	{
424	if (*toP == toLim)
425	break;
426	(toP)++ = (fromP)++;
427	}
428	}
429	}
430
431	static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
432	const char *fromP, const char fromLim,
433	unsigned short *toP, const unsigned short toLim)
434	{
435	while (fromP != fromLim && toP != toLim)
436	(toP)++ = (unsigned char)(fromP)++;
437	}
438
439	#ifdef XML_NS
440
441	static const struct normal_encoding latin1_encoding_ns =
442	{
443	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
444	{
445	#include "expat\asciitab.h"
446	#include "expat\latin1tab.h"
447	},
448	STANDARD_VTABLE(sb_)
449	};
450
451	#endif
452
453	static const struct normal_encoding latin1_encoding =
454	{
455	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
456	{
457	#define BT_COLON BT_NMSTRT
458	#include "expat\asciitab.h"
459	#undef BT_COLON
460	#include "expat\latin1tab.h"
461	},
462	STANDARD_VTABLE(sb_)
463	};
464
465	static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
466	const char *fromP, const char fromLim,
467	char *toP, const char toLim)
468	{
469	while (fromP != fromLim && toP != toLim)
470	(toP)++ = (fromP)++;
471	}
472
473	#ifdef XML_NS
474
475	static const struct normal_encoding ascii_encoding_ns =
476	{
477	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
478	{
479	#include "expat\asciitab.h"
480	/* BT_NONXML == 0 */
481	},
482	STANDARD_VTABLE(sb_)
483	};
484
485	#endif
486
487	static const struct normal_encoding ascii_encoding =
488	{
489	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
490	{
491	#define BT_COLON BT_NMSTRT
492	#include "expat\asciitab.h"
493	#undef BT_COLON
494	/* BT_NONXML == 0 */
495	},
496	STANDARD_VTABLE(sb_)
497	};
498
499	static int unicode_byte_type(char hi, char lo)
500	{
501	switch ((unsigned char)hi)
502	{
503	case 0xD8:
504	case 0xD9:
505	case 0xDA:
506	case 0xDB:
507	return BT_LEAD4;
508	case 0xDC:
509	case 0xDD:
510	case 0xDE:
511	case 0xDF:
512	return BT_TRAIL;
513	case 0xFF:
514	switch ((unsigned char)lo)
515	{
516	case 0xFF:
517	case 0xFE:
518	return BT_NONXML;
519	}
520	break;
521	}
522	return BT_NONASCII;
523	}
524
/*
 * Generate E##toUtf8: convert 16-bit code units to UTF-8.  Byte order comes
 * from the GET_LO / GET_HI macros in effect where the macro is expanded.
 * The generated function stops, with *fromP set to the current code unit,
 * as soon as the output cannot hold the complete sequence for it.
 */
#define DEFINE_UTF16_TO_UTF8(E) \
static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
                                   const char **fromP, const char *fromLim, \
                                   char **toP, const char *toLim) \
{ \
    const char *from; \
    for (from = *fromP; from != fromLim; from += 2) { \
        int plane; \
        unsigned char lo2; \
        unsigned char lo = GET_LO(from); \
        unsigned char hi = GET_HI(from); \
        switch (hi) { \
        case 0: \
            if (lo < 0x80) { \
                if (*toP == toLim) { \
                    *fromP = from; \
                    return; \
                } \
                *(*toP)++ = lo; \
                break; \
            } \
            /* fall through */ \
        case 0x1: case 0x2: case 0x3: \
        case 0x4: case 0x5: case 0x6: case 0x7: \
            if (toLim - *toP < 2) { \
                *fromP = from; \
                return; \
            } \
            *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
            *(*toP)++ = ((lo & 0x3f) | 0x80); \
            break; \
        default: \
            if (toLim - *toP < 3) { \
                *fromP = from; \
                return; \
            } \
            /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
            *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
            *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
            *(*toP)++ = ((lo & 0x3f) | 0x80); \
            break; \
        case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
            /* high surrogate: consumes the following code unit too */ \
            if (toLim - *toP < 4) { \
                *fromP = from; \
                return; \
            } \
            plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
            *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
            *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
            from += 2; \
            lo2 = GET_LO(from); \
            *(*toP)++ = (((lo & 0x3) << 4) \
                         | ((GET_HI(from) & 0x3) << 2) \
                         | (lo2 >> 6) \
                         | 0x80); \
            *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
            break; \
        } \
    } \
    *fromP = from; \
}
586
/*
 * Generate E##toUtf16: copy 16-bit code units from the byte stream into
 * unsigned short values, assembling each one from GET_HI / GET_LO.
 * *fromP and *toP are advanced in place.
 */
#define DEFINE_UTF16_TO_UTF16(E) \
static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
                                    const char **fromP, const char *fromLim, \
                                    unsigned short **toP, const unsigned short *toLim) \
{ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
        fromLim -= 2; \
    for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
        *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
599
600	#define SET2(ptr, ch) \
601	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
602	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
603	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
604
605	DEFINE_UTF16_TO_UTF8(little2_)
606	DEFINE_UTF16_TO_UTF16(little2_)
607
608	#undef SET2
609	#undef GET_LO
610	#undef GET_HI
611
612	#define SET2(ptr, ch) \
613	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
614	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
615	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
616
617	DEFINE_UTF16_TO_UTF8(big2_)
618	DEFINE_UTF16_TO_UTF16(big2_)
619
620	#undef SET2
621	#undef GET_LO
622	#undef GET_HI
623
624	#define LITTLE2_BYTE_TYPE(enc, p) \
625	((p)[1] == 0 \
626	? ((struct normal_encoding )(enc))->type[(unsigned char)(p)] \
627	: unicode_byte_type((p)[1], (p)[0]))
628	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
629	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
630	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
631	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
632	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
633	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
634
635	#ifdef XML_MIN_SIZE
636
637	static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
638	{
639	return LITTLE2_BYTE_TYPE(enc, p);
640	}
641
642	static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
643	{
644	return LITTLE2_BYTE_TO_ASCII(enc, p);
645	}
646
647	static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
648	{
649	return LITTLE2_CHAR_MATCHES(enc, p, c);
650	}
651
652	static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
653	{
654	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
655	}
656
657	static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
658	{
659	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
660	}
661
662	#undef VTABLE
663	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
664
665	#else /* not XML_MIN_SIZE */
666
667	#undef PREFIX
668	#define PREFIX(ident) little2_ ## ident
669	#define MINBPC(enc) 2
670	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
671	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
672	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
673	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
674	#define IS_NAME_CHAR(enc, p, n) 0
675	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
676	#define IS_NMSTRT_CHAR(enc, p, n) (0)
677	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
678
679	#include "xmltok_impl.c"
680
681	#undef MINBPC
682	#undef BYTE_TYPE
683	#undef BYTE_TO_ASCII
684	#undef CHAR_MATCHES
685	#undef IS_NAME_CHAR
686	#undef IS_NAME_CHAR_MINBPC
687	#undef IS_NMSTRT_CHAR
688	#undef IS_NMSTRT_CHAR_MINBPC
689	#undef IS_INVALID_CHAR
690
691	#endif /* not XML_MIN_SIZE */
692
693	#ifdef XML_NS
694
695	static const struct normal_encoding little2_encoding_ns =
696	{
697	{VTABLE, 2, 0,
698	#if XML_BYTE_ORDER == 12
699	1
700	#else
701	0
702	#endif
703	},
704	{
705	#include "expat\asciitab.h"
706	#include "expat\latin1tab.h"
707	},
708	STANDARD_VTABLE(little2_)
709	};
710
711	#endif
712
713	static const struct normal_encoding little2_encoding =
714	{
715	{VTABLE, 2, 0,
716	#if XML_BYTE_ORDER == 12
717	1
718	#else
719	0
720	#endif
721	},
722	{
723	#define BT_COLON BT_NMSTRT
724	#include "expat\asciitab.h"
725	#undef BT_COLON
726	#include "expat\latin1tab.h"
727	},
728	STANDARD_VTABLE(little2_)
729	};
730
731	#if XML_BYTE_ORDER != 21
732
733	#ifdef XML_NS
734
735	static const struct normal_encoding internal_little2_encoding_ns =
736	{
737	{VTABLE, 2, 0, 1},
738	{
739	#include "expat\iasciitab.h"
740	#include "expat\latin1tab.h"
741	},
742	STANDARD_VTABLE(little2_)
743	};
744
745	#endif
746
747	static const struct normal_encoding internal_little2_encoding =
748	{
749	{VTABLE, 2, 0, 1},
750	{
751	#define BT_COLON BT_NMSTRT
752	#include "expat\iasciitab.h"
753	#undef BT_COLON
754	#include "expat\latin1tab.h"
755	},
756	STANDARD_VTABLE(little2_)
757	};
758
759	#endif
760
761
762	#define BIG2_BYTE_TYPE(enc, p) \
763	((p)[0] == 0 \
764	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
765	: unicode_byte_type((p)[0], (p)[1]))
766	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
767	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
768	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
769	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
770	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
771	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
772
773	#ifdef XML_MIN_SIZE
774
775	static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
776	{
777	return BIG2_BYTE_TYPE(enc, p);
778	}
779
780	static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
781	{
782	return BIG2_BYTE_TO_ASCII(enc, p);
783	}
784
785	static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
786	{
787	return BIG2_CHAR_MATCHES(enc, p, c);
788	}
789
790	static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
791	{
792	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
793	}
794
795	static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
796	{
797	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
798	}
799
800	#undef VTABLE
801	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
802
803	#else /* not XML_MIN_SIZE */
804
805	#undef PREFIX
806	#define PREFIX(ident) big2_ ## ident
807	#define MINBPC(enc) 2
808	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
809	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
810	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
811	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
812	#define IS_NAME_CHAR(enc, p, n) 0
813	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
814	#define IS_NMSTRT_CHAR(enc, p, n) (0)
815	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
816
817	#include "xmltok_impl.c"
818
819	#undef MINBPC
820	#undef BYTE_TYPE
821	#undef BYTE_TO_ASCII
822	#undef CHAR_MATCHES
823	#undef IS_NAME_CHAR
824	#undef IS_NAME_CHAR_MINBPC
825	#undef IS_NMSTRT_CHAR
826	#undef IS_NMSTRT_CHAR_MINBPC
827	#undef IS_INVALID_CHAR
828
829	#endif /* not XML_MIN_SIZE */
830
831	#ifdef XML_NS
832
833	static const struct normal_encoding big2_encoding_ns =
834	{
835	{VTABLE, 2, 0,
836	#if XML_BYTE_ORDER == 21
837	1
838	#else
839	0
840	#endif
841	},
842	{
843	#include "expat\asciitab.h"
844	#include "expat\latin1tab.h"
845	},
846	STANDARD_VTABLE(big2_)
847	};
848
849	#endif
850
851	static const struct normal_encoding big2_encoding =
852	{
853	{VTABLE, 2, 0,
854	#if XML_BYTE_ORDER == 21
855	1
856	#else
857	0
858	#endif
859	},
860	{
861	#define BT_COLON BT_NMSTRT
862	#include "expat\asciitab.h"
863	#undef BT_COLON
864	#include "expat\latin1tab.h"
865	},
866	STANDARD_VTABLE(big2_)
867	};
868
869	#if XML_BYTE_ORDER != 12
870
871	#ifdef XML_NS
872
873	static const struct normal_encoding internal_big2_encoding_ns =
874	{
875	{VTABLE, 2, 0, 1},
876	{
877	#include "expat\iasciitab.h"
878	#include "expat\latin1tab.h"
879	},
880	STANDARD_VTABLE(big2_)
881	};
882
883	#endif
884
885	static const struct normal_encoding internal_big2_encoding =
886	{
887	{VTABLE, 2, 0, 1},
888	{
889	#define BT_COLON BT_NMSTRT
890	#include "expat\iasciitab.h"
891	#undef BT_COLON
892	#include "expat\latin1tab.h"
893	},
894	STANDARD_VTABLE(big2_)
895	};
896
897	#endif
898
899	#undef PREFIX
900
901	static
902	int streqci(const char s1, const char s2)
903	{
904	for (;;)
905	{
906	char c1 = *s1++;
907	char c2 = *s2++;
908
909	if (ASCII_a <= c1 && c1 <= ASCII_z)
910	c1 += ASCII_A - ASCII_a;
911	if (ASCII_a <= c2 && c2 <= ASCII_z)
912	c2 += ASCII_A - ASCII_a;
913	if (c1 != c2)
914	return 0;
915	if (!c1)
916	break;
917	}
918	return 1;
919	}
920
921	static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
922	const char *ptr,
923	const char *end,
924	POSITION * pos)
925	{
926	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
927	}
928
929	static int EXPATENTRY toAscii(const ENCODING * enc, const char ptr, const char end)
930	{
931	char buf[1];
932	char *p = buf;
933
934	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
935	if (p == buf)
936	return -1;
937	else
938	return buf[0];
939	}
940
/*
 * Return 1 if c is space (0x20), carriage return (0x0D), line feed (0x0A)
 * or tab (0x09); return 0 for any other value.
 */
static int isSpace(int c)
{
    switch (c)
    {
        case 0x20:
        case 0xD:
        case 0xA:
        case 0x9:
            return 1;
        default:
            break;
    }
    return 0;
}
953
954	/* Return 1 if there's just optional white space
955	* or there's an S followed by name=val. */
956	static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
957	const char *ptr,
958	const char *end,
959	const char **namePtr,
960	const char **nameEndPtr,
961	const char **valPtr,
962	const char **nextTokPtr)
963	{
964	int c;
965	char open;
966
967	if (ptr == end)
968	{
969	*namePtr = 0;
970	return 1;
971	}
972	if (!isSpace(toAscii(enc, ptr, end)))
973	{
974	*nextTokPtr = ptr;
975	return 0;
976	}
977	do
978	{
979	ptr += enc->minBytesPerChar;
980	}
981	while (isSpace(toAscii(enc, ptr, end)));
982	if (ptr == end)
983	{
984	*namePtr = 0;
985	return 1;
986	}
987	*namePtr = ptr;
988	for (;;)
989	{
990	c = toAscii(enc, ptr, end);
991	if (c == -1)
992	{
993	*nextTokPtr = ptr;
994	return 0;
995	}
996	if (c == ASCII_EQUALS)
997	{
998	*nameEndPtr = ptr;
999	break;
1000	}
1001	if (isSpace(c))
1002	{
1003	*nameEndPtr = ptr;
1004	do
1005	{
1006	ptr += enc->minBytesPerChar;
1007	}
1008	while (isSpace(c = toAscii(enc, ptr, end)));
1009	if (c != ASCII_EQUALS)
1010	{
1011	*nextTokPtr = ptr;
1012	return 0;
1013	}
1014	break;
1015	}
1016	ptr += enc->minBytesPerChar;
1017	}
1018	if (ptr == *namePtr)
1019	{
1020	*nextTokPtr = ptr;
1021	return 0;
1022	}
1023	ptr += enc->minBytesPerChar;
1024	c = toAscii(enc, ptr, end);
1025	while (isSpace(c))
1026	{
1027	ptr += enc->minBytesPerChar;
1028	c = toAscii(enc, ptr, end);
1029	}
1030	if (c != ASCII_QUOT && c != ASCII_APOS)
1031	{
1032	*nextTokPtr = ptr;
1033	return 0;
1034	}
1035	open = c;
1036	ptr += enc->minBytesPerChar;
1037	*valPtr = ptr;
1038	for (;; ptr += enc->minBytesPerChar)
1039	{
1040	c = toAscii(enc, ptr, end);
1041	if (c == open)
1042	break;
1043	if (!(ASCII_a <= c && c <= ASCII_z)
1044	&& !(ASCII_A <= c && c <= ASCII_Z)
1045	&& !(ASCII_0 <= c && c <= ASCII_9)
1046	&& c != ASCII_PERIOD
1047	&& c != ASCII_MINUS
1048	&& c != ASCII_UNDERSCORE)
1049	{
1050	*nextTokPtr = ptr;
1051	return 0;
1052	}
1053	}
1054	*nextTokPtr = ptr + enc->minBytesPerChar;
1055	return 1;
1056	}
1057
1058	static const char KW_version[] =
1059	{
1060	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1061	};
1062
1063	static const char KW_encoding[] =
1064	{
1065	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1066	};
1067
1068	static const char KW_standalone[] =
1069	{
1070	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1071	};
1072
1073	static const char KW_yes[] =
1074	{
1075	ASCII_y, ASCII_e, ASCII_s, '\0'
1076	};
1077
1078	static const char KW_no[] =
1079	{
1080	ASCII_n, ASCII_o, '\0'
1081	};
1082
1083	static int doParseXmlDecl(const ENCODING* (* EXPATENTRY encodingFinder)(const ENCODING *,
1084	const char *,
1085	const char *),
1086	int isGeneralTextEntity,
1087	const ENCODING * enc,
1088	const char *ptr,
1089	const char *end,
1090	const char **badPtr,
1091	const char **versionPtr,
1092	const char **versionEndPtr,
1093	const char **encodingName,
1094	const ENCODING ** encoding,
1095	int *standalone)
1096	{
1097	const char *val = 0;
1098	const char *name = 0;
1099	const char *nameEnd = 0;
1100
1101	ptr += 5 * enc->minBytesPerChar;
1102	end -= 2 * enc->minBytesPerChar;
1103	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) \|\| !name)
1104	{
1105	*badPtr = ptr;
1106	return 0;
1107	}
1108	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1109	{
1110	if (!isGeneralTextEntity)
1111	{
1112	*badPtr = name;
1113	return 0;
1114	}
1115	}
1116	else
1117	{
1118	if (versionPtr)
1119	*versionPtr = val;
1120	if (versionEndPtr)
1121	*versionEndPtr = ptr;
1122	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1123	{
1124	*badPtr = ptr;
1125	return 0;
1126	}
1127	if (!name)
1128	{
1129	if (isGeneralTextEntity)
1130	{
1131	/* a TextDecl must have an EncodingDecl */
1132	*badPtr = ptr;
1133	return 0;
1134	}
1135	return 1;
1136	}
1137	}
1138	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1139	{
1140	int c = toAscii(enc, val, end);
1141
1142	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1143	{
1144	*badPtr = val;
1145	return 0;
1146	}
1147	if (encodingName)
1148	*encodingName = val;
1149	if (encoding)
1150	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1151	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1152	{
1153	*badPtr = ptr;
1154	return 0;
1155	}
1156	if (!name)
1157	return 1;
1158	}
1159	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) \|\| isGeneralTextEntity)
1160	{
1161	*badPtr = name;
1162	return 0;
1163	}
1164	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1165	{
1166	if (standalone)
1167	*standalone = 1;
1168	}
1169	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1170	{
1171	if (standalone)
1172	*standalone = 0;
1173	}
1174	else
1175	{
1176	*badPtr = val;
1177	return 0;
1178	}
1179	while (isSpace(toAscii(enc, ptr, end)))
1180	ptr += enc->minBytesPerChar;
1181	if (ptr != end)
1182	{
1183	*badPtr = ptr;
1184	return 0;
1185	}
1186	return 1;
1187	}
1188
1189	static int checkCharRefNumber(int result)
1190	{
1191	switch (result >> 8)
1192	{
1193	case 0xD8:
1194	case 0xD9:
1195	case 0xDA:
1196	case 0xDB:
1197	case 0xDC:
1198	case 0xDD:
1199	case 0xDE:
1200	case 0xDF:
1201	return -1;
1202	case 0:
1203	if (latin1_encoding.type[result] == BT_NONXML)
1204	return -1;
1205	break;
1206	case 0xFF:
1207	if (result == 0xFFFE \|\| result == 0xFFFF)
1208	return -1;
1209	break;
1210	}
1211	return result;
1212	}
1213
1214	int XmlUtf8Encode(int c, char *buf)
1215	{
1216	enum
1217	{
1218	/* minN is minimum legal resulting value for N byte sequence */
1219	min2 = 0x80,
1220	min3 = 0x800,
1221	min4 = 0x10000
1222	};
1223
1224	if (c < 0)
1225	return 0;
1226	if (c < min2)
1227	{
1228	buf[0] = (c \| UTF8_cval1);
1229	return 1;
1230	}
1231	if (c < min3)
1232	{
1233	buf[0] = ((c >> 6) \| UTF8_cval2);
1234	buf[1] = ((c & 0x3f) \| 0x80);
1235	return 2;
1236	}
1237	if (c < min4)
1238	{
1239	buf[0] = ((c >> 12) \| UTF8_cval3);
1240	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
1241	buf[2] = ((c & 0x3f) \| 0x80);
1242	return 3;
1243	}
1244	if (c < 0x110000)
1245	{
1246	buf[0] = ((c >> 18) \| UTF8_cval4);
1247	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
1248	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
1249	buf[3] = ((c & 0x3f) \| 0x80);
1250	return 4;
1251	}
1252	return 0;
1253	}
1254
/*
 * Encode character number charNum as UTF-16 into buf.
 *
 * buf must have room for up to 2 code units.  Returns 1 for values below
 * 0x10000, 2 (a surrogate pair) for values below 0x110000, and 0 if charNum
 * is negative or too large; buf is untouched when 0 is returned.
 */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    if (charNum < 0)
        return 0;
    if (charNum < 0x10000)
    {
        buf[0] = (unsigned short)charNum;
        return 1;
    }
    if (charNum < 0x110000)
    {
        /* split the 20-bit offset into high and low surrogate */
        charNum -= 0x10000;
        buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
        buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
        return 2;
    }
    return 0;
}
1273
1274	struct unknown_encoding
1275	{
1276	struct normal_encoding normal;
1277	int (convert) (void userData, const char *p);
1278	void *userData;
1279	unsigned short utf16[256];
1280	char utf8[256][4];
1281	};
1282
1283	int EXPATENTRY XmlSizeOfUnknownEncoding(void)
1284	{
1285	return sizeof(struct unknown_encoding);
1286	}
1287
1288	static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1289	{
1290	int c = ((const struct unknown_encoding *)enc)
1291	->convert(((const struct unknown_encoding *)enc)->userData, p);
1292
1293	if (c & ~0xFFFF)
1294	return 0;
1295	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1296	}
1297
1298	static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1299	{
1300	int c = ((const struct unknown_encoding *)enc)
1301	->convert(((const struct unknown_encoding *)enc)->userData, p);
1302
1303	if (c & ~0xFFFF)
1304	return 0;
1305	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1306	}
1307
1308	static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1309	{
1310	int c = ((const struct unknown_encoding *)enc)
1311	->convert(((const struct unknown_encoding *)enc)->userData, p);
1312
1313	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1314	}
1315
1316	static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1317	const char **fromP,
1318	const char *fromLim,
1319	char **toP,
1320	const char *toLim)
1321	{
1322	char buf[XML_UTF8_ENCODE_MAX];
1323
1324	for (;;)
1325	{
1326	const char *utf8;
1327	int n;
1328
1329	if (*fromP == fromLim)
1330	break;
1331	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
1332	n = *utf8++;
1333	if (n == 0)
1334	{
1335	int c = ((const struct unknown_encoding *)enc)
1336	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1337
1338	n = XmlUtf8Encode(c, buf);
1339	if (n > toLim - *toP)
1340	break;
1341	utf8 = buf;
1342	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1343	- (BT_LEAD2 - 2);
1344	}
1345	else
1346	{
1347	if (n > toLim - *toP)
1348	break;
1349	(*fromP)++;
1350	}
1351	do
1352	{
1353	(toP)++ = *utf8++;
1354	}
1355	while (--n != 0);
1356	}
1357	}
1358
1359	static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1360	const char **fromP,
1361	const char *fromLim,
1362	unsigned short **toP,
1363	const unsigned short *toLim)
1364	{
1365	while (fromP != fromLim && toP != toLim)
1366	{
1367	unsigned short c
1368	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
1369
1370	if (c == 0)
1371	{
1372	c = (unsigned short)((const struct unknown_encoding *)enc)
1373	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1374	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1375	- (BT_LEAD2 - 2);
1376	}
1377	else
1378	(*fromP)++;
1379	(toP)++ = c;
1380	}
1381	}
1382
1383	ENCODING * XmlInitUnknownEncoding(void *mem,
1384	int *table,
1385	int (convert) (void userData, const char *p),
1386	void *userData)
1387	{
1388	int i;
1389	struct unknown_encoding e = (struct unknown_encoding )mem;
1390	for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1391
1392	((char )mem)[i] = ((char )&latin1_encoding)[i];
1393	for (i = 0; i < 128; i++)
1394	if (latin1_encoding.type[i] != BT_OTHER
1395	&& latin1_encoding.type[i] != BT_NONXML
1396	&& table[i] != i)
1397	return 0;
1398	for (i = 0; i < 256; i++)
1399	{
1400	int c = table[i];
1401
1402	if (c == -1)
1403	{
1404	e->normal.type[i] = BT_MALFORM;
1405	/* This shouldn't really get used. */
1406	e->utf16[i] = 0xFFFF;
1407	e->utf8[i][0] = 1;
1408	e->utf8[i][1] = 0;
1409	}
1410	else if (c < 0)
1411	{
1412	if (c < -4)
1413	return 0;
1414	e->normal.type[i] = BT_LEAD2 - (c + 2);
1415	e->utf8[i][0] = 0;
1416	e->utf16[i] = 0;
1417	}
1418	else if (c < 0x80)
1419	{
1420	if (latin1_encoding.type[c] != BT_OTHER
1421	&& latin1_encoding.type[c] != BT_NONXML
1422	&& c != i)
1423	return 0;
1424	e->normal.type[i] = latin1_encoding.type[c];
1425	e->utf8[i][0] = 1;
1426	e->utf8[i][1] = (char)c;
1427	e->utf16[i] = c == 0 ? 0xFFFF : c;
1428	}
1429	else if (checkCharRefNumber(c) < 0)
1430	{
1431	e->normal.type[i] = BT_NONXML;
1432	/* This shouldn't really get used. */
1433	e->utf16[i] = 0xFFFF;
1434	e->utf8[i][0] = 1;
1435	e->utf8[i][1] = 0;
1436	}
1437	else
1438	{
1439	if (c > 0xFFFF)
1440	return 0;
1441	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1442	e->normal.type[i] = BT_NMSTRT;
1443	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1444	e->normal.type[i] = BT_NAME;
1445	else
1446	e->normal.type[i] = BT_OTHER;
1447	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1448	e->utf16[i] = c;
1449	}
1450	}
1451	e->userData = userData;
1452	e->convert = convert;
1453	if (convert)
1454	{
1455	e->normal.isName2 = unknown_isName;
1456	e->normal.isName3 = unknown_isName;
1457	e->normal.isName4 = unknown_isName;
1458	e->normal.isNmstrt2 = unknown_isNmstrt;
1459	e->normal.isNmstrt3 = unknown_isNmstrt;
1460	e->normal.isNmstrt4 = unknown_isNmstrt;
1461	e->normal.isInvalid2 = unknown_isInvalid;
1462	e->normal.isInvalid3 = unknown_isInvalid;
1463	e->normal.isInvalid4 = unknown_isInvalid;
1464	}
1465	e->normal.enc.utf8Convert = unknown_toUtf8;
1466	e->normal.enc.utf16Convert = unknown_toUtf16;
1467	return &(e->normal.enc);
1468	}
1469
/* Indices of the built-in encodings.
 * If this enumeration is changed, getEncodingIndex and encodings
 * must also be changed. */
enum
{
    UNKNOWN_ENC = -1,           /* name given, but not recognized */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC,
    UTF_8_ENC,
    UTF_16_ENC,
    UTF_16BE_ENC,
    UTF_16LE_ENC,
    /* must match encodingNames up to here */
    NO_ENC                      /* no encoding name given */
};
1484
1485	static const char KW_ISO_8859_1[] =
1486	{
1487	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1488	};
1489	static const char KW_US_ASCII[] =
1490	{
1491	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1492	};
1493	static const char KW_UTF_8[] =
1494	{
1495	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1496	};
1497	static const char KW_UTF_16[] =
1498	{
1499	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1500	};
1501	static const char KW_UTF_16BE[] =
1502	{
1503	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1504	};
1505	static const char KW_UTF_16LE[] =
1506	{
1507	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1508	};
1509
1510	static int getEncodingIndex(const char *name)
1511	{
1512	static const char *encodingNames[] =
1513	{
1514	KW_ISO_8859_1,
1515	KW_US_ASCII,
1516	KW_UTF_8,
1517	KW_UTF_16,
1518	KW_UTF_16BE,
1519	KW_UTF_16LE,
1520	};
1521	int i;
1522
1523	if (name == 0)
1524	return NO_ENC;
1525	for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1526	if (streqci(name, encodingNames[i]))
1527	return i;
1528	return UNKNOWN_ENC;
1529	}
1530
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 * "enc" is a pointer to a structure with an initEnc.isUtf16 member;
 * the index is narrowed to char when stored. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1536
1537	/* This is what detects the encoding.
1538	* encodingTable maps from encoding indices to encodings;
1539	* INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1540	* state is XML_CONTENT_STATE if we're parsing an external text entity,
1541	* and XML_PROLOG_STATE otherwise.
1542	*/
1543
1544
1545	static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1546	const INIT_ENCODING * enc,
1547	int state,
1548	const char *ptr,
1549	const char *end,
1550	const char **nextTokPtr)
1551	{
1552	const ENCODING **encPtr;
1553
1554	if (ptr == end)
1555	return XML_TOK_NONE;
1556	encPtr = enc->encPtr;
1557	if (ptr + 1 == end)
1558	{
1559	/* only a single byte available for auto-detection */
1560	#ifndef XML_DTD /* FIXME */
1561	/* a well-formed document entity must have more than one byte */
1562	if (state != XML_CONTENT_STATE)
1563	return XML_TOK_PARTIAL;
1564	#endif
1565	/* so we're parsing an external text entity... */
1566	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1567	switch (INIT_ENC_INDEX(enc))
1568	{
1569	case UTF_16_ENC:
1570	case UTF_16LE_ENC:
1571	case UTF_16BE_ENC:
1572	return XML_TOK_PARTIAL;
1573	}
1574	switch ((unsigned char)*ptr)
1575	{
1576	case 0xFE:
1577	case 0xFF:
1578	case 0xEF: /* possibly first byte of UTF-8 BOM */
1579	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1580	&& state == XML_CONTENT_STATE)
1581	break;
1582	/* fall through */
1583	case 0x00:
1584	case 0x3C:
1585	return XML_TOK_PARTIAL;
1586	}
1587	}
1588	else
1589	{
1590	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1])
1591	{
1592	case 0xFEFF:
1593	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1594	&& state == XML_CONTENT_STATE)
1595	break;
1596	*nextTokPtr = ptr + 2;
1597	*encPtr = encodingTable[UTF_16BE_ENC];
1598	return XML_TOK_BOM;
1599	/* 00 3C is handled in the default case */
1600	case 0x3C00:
1601	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1602	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1603	&& state == XML_CONTENT_STATE)
1604	break;
1605	*encPtr = encodingTable[UTF_16LE_ENC];
1606	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1607	case 0xFFFE:
1608	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1609	&& state == XML_CONTENT_STATE)
1610	break;
1611	*nextTokPtr = ptr + 2;
1612	*encPtr = encodingTable[UTF_16LE_ENC];
1613	return XML_TOK_BOM;
1614	case 0xEFBB:
1615	/* Maybe a UTF-8 BOM (EF BB BF) */
1616	/* If there's an explicitly specified (external) encoding
1617	* of ISO-8859-1 or some flavour of UTF-16
1618	* and this is an external text entity,
1619	* don't look for the BOM,
1620	* because it might be a legal data. */
1621	if (state == XML_CONTENT_STATE)
1622	{
1623	int e = INIT_ENC_INDEX(enc);
1624
1625	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1626	break;
1627	}
1628	if (ptr + 2 == end)
1629	return XML_TOK_PARTIAL;
1630	if ((unsigned char)ptr[2] == 0xBF)
1631	{
1632	*encPtr = encodingTable[UTF_8_ENC];
1633	return XML_TOK_BOM;
1634	}
1635	break;
1636	default:
1637	if (ptr[0] == '\0')
1638	{
1639	/* 0 isn't a legal data character. Furthermore a document entity can only
1640	* start with ASCII characters. So the only way this can fail to be big-endian
1641	* UTF-16 if it it's an external parsed general entity that's labelled as
1642	* UTF-16LE. */
1643	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1644	break;
1645	*encPtr = encodingTable[UTF_16BE_ENC];
1646	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1647	}
1648	else if (ptr[1] == '\0')
1649	{
1650	/* We could recover here in the case:
1651	* - parsing an external entity
1652	* - second byte is 0
1653	* - no externally specified encoding
1654	* - no encoding declaration
1655	* by assuming UTF-16LE. But we don't, because this would mean when
1656	* presented just with a single byte, we couldn't reliably determine
1657	* whether we needed further bytes. */
1658	if (state == XML_CONTENT_STATE)
1659	break;
1660	*encPtr = encodingTable[UTF_16LE_ENC];
1661	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1662	}
1663	break;
1664	}
1665	}
1666	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1667	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1668	}
1669
1670
1671	#define NS(x) x
1672	#define ns(x) x
1673	#include "xmltok_ns.c"
1674	#undef NS
1675	#undef ns
1676
1677	#ifdef XML_NS
1678
1679	#define NS(x) x ## NS
1680	#define ns(x) x ## _ns
1681
1682	#include "xmltok_ns.c"
1683
1684	#undef NS
1685	#undef ns
1686
1687	ENCODING * XmlInitUnknownEncodingNS(void *mem,
1688	int *table,
1689	int (* EXPATENTRY convert) (void userData, const char p),
1690	void *userData)
1691	{
1692	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1693
1694	if (enc)
1695	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1696	return enc;
1697	}
1698
1699	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: