Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

xmltok.c@ 10

Last change on this file since 10 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 40.2 KB

Line
1	/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2	See the file COPYING for copying permission.
3	*/
4
5	#ifdef COMPILED_FROM_DSP
6	#include "winconfig.h"
7	#elif defined(MACOS_CLASSIC)
8	#include "macconfig.h"
9	#elif defined(__amigaos4__)
10	#include "amigaconfig.h"
11	#else
12	#ifdef HAVE_EXPAT_CONFIG_H
13	#include <expat_config.h>
14	#endif
15	#endif /* ndef COMPILED_FROM_DSP */
16
17	#include <stddef.h>
18
19	#include "expat_external.h"
20	#include "internal.h"
21	#include "xmltok.h"
22	#include "nametab.h"
23
24	#ifdef XML_DTD
25	#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
26	#else
27	#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
28	#endif
29
30	#define VTABLE1 \
31	{ PREFIX(prologTok), PREFIX(contentTok), \
32	PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
33	{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
34	PREFIX(sameName), \
35	PREFIX(nameMatchesAscii), \
36	PREFIX(nameLength), \
37	PREFIX(skipS), \
38	PREFIX(getAtts), \
39	PREFIX(charRefNumber), \
40	PREFIX(predefinedEntityName), \
41	PREFIX(updatePosition), \
42	PREFIX(isPublicId)
43
44	#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
45
46	#define UCS2_GET_NAMING(pages, hi, lo) \
47	(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
48
49	/* A 2 byte UTF-8 representation splits the characters 11 bits between
50	the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
51	pages, 3 bits to add to that index and 5 bits to generate the mask.
52	*/
53	#define UTF8_GET_NAMING2(pages, byte) \
54	(namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
55	+ ((((byte)[0]) & 3) << 1) \
56	+ ((((byte)[1]) >> 5) & 1)] \
57	& (1 << (((byte)[1]) & 0x1F)))
58
59	/* A 3 byte UTF-8 representation splits the characters 16 bits between
60	the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
61	into pages, 3 bits to add to that index and 5 bits to generate the
62	mask.
63	*/
64	#define UTF8_GET_NAMING3(pages, byte) \
65	(namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
66	+ ((((byte)[1]) >> 2) & 0xF)] \
67	<< 3) \
68	+ ((((byte)[1]) & 3) << 1) \
69	+ ((((byte)[2]) >> 5) & 1)] \
70	& (1 << (((byte)[2]) & 0x1F)))
71
72	#define UTF8_GET_NAMING(pages, p, n) \
73	((n) == 2 \
74	? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
75	: ((n) == 3 \
76	? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
77	: 0))
78
79	/* Detection of invalid UTF-8 sequences is based on Table 3.1B
80	of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
81	with the additional restriction of not allowing the Unicode
82	code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
83	Implementation details:
84	(A & 0x80) == 0 means A < 0x80
85	and
86	(A & 0xC0) == 0xC0 means A > 0xBF
87	*/
88
/* Non-zero when the 2-byte sequence at p (unsigned bytes) is invalid:
   the lead byte is below 0xC2, or the second byte is not a
   continuation byte, i.e. it is < 0x80 or > 0xBF.
*/
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
91
/* Non-zero when the 3-byte sequence at p (unsigned bytes) is invalid.
   The third byte must be a continuation byte, except that after
   EF,BF it must not exceed 0xBD (this rejects U+FFFE and U+FFFF).
   The second byte must be a continuation byte, with a raised lower
   bound of 0xA0 after lead byte 0xE0 and a lowered upper bound of
   0x9F after lead byte 0xED.
*/
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
108
/* Non-zero when the 4-byte sequence at p (unsigned bytes) is invalid.
   The third and fourth bytes must be continuation bytes.  The second
   byte must be a continuation byte, with a raised lower bound of 0x90
   after lead byte 0xF0 and a lowered upper bound of 0x8F after lead
   byte 0xF4.
*/
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
121
122	static int PTRFASTCALL
123	isNever(const ENCODING enc, const char p)
124	{
125	return 0;
126	}
127
128	static int PTRFASTCALL
129	utf8_isName2(const ENCODING enc, const char p)
130	{
131	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
132	}
133
134	static int PTRFASTCALL
135	utf8_isName3(const ENCODING enc, const char p)
136	{
137	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
138	}
139
140	#define utf8_isName4 isNever
141
142	static int PTRFASTCALL
143	utf8_isNmstrt2(const ENCODING enc, const char p)
144	{
145	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
146	}
147
148	static int PTRFASTCALL
149	utf8_isNmstrt3(const ENCODING enc, const char p)
150	{
151	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
152	}
153
154	#define utf8_isNmstrt4 isNever
155
156	static int PTRFASTCALL
157	utf8_isInvalid2(const ENCODING enc, const char p)
158	{
159	return UTF8_INVALID2((const unsigned char *)p);
160	}
161
162	static int PTRFASTCALL
163	utf8_isInvalid3(const ENCODING enc, const char p)
164	{
165	return UTF8_INVALID3((const unsigned char *)p);
166	}
167
168	static int PTRFASTCALL
169	utf8_isInvalid4(const ENCODING enc, const char p)
170	{
171	return UTF8_INVALID4((const unsigned char *)p);
172	}
173
174	struct normal_encoding {
175	ENCODING enc;
176	unsigned char type[256];
177	#ifdef XML_MIN_SIZE
178	int (PTRFASTCALL byteType)(const ENCODING , const char *);
179	int (PTRFASTCALL isNameMin)(const ENCODING , const char *);
180	int (PTRFASTCALL isNmstrtMin)(const ENCODING , const char *);
181	int (PTRFASTCALL byteToAscii)(const ENCODING , const char *);
182	int (PTRCALL charMatches)(const ENCODING , const char *, int);
183	#endif /* XML_MIN_SIZE */
184	int (PTRFASTCALL isName2)(const ENCODING , const char *);
185	int (PTRFASTCALL isName3)(const ENCODING , const char *);
186	int (PTRFASTCALL isName4)(const ENCODING , const char *);
187	int (PTRFASTCALL isNmstrt2)(const ENCODING , const char *);
188	int (PTRFASTCALL isNmstrt3)(const ENCODING , const char *);
189	int (PTRFASTCALL isNmstrt4)(const ENCODING , const char *);
190	int (PTRFASTCALL isInvalid2)(const ENCODING , const char *);
191	int (PTRFASTCALL isInvalid3)(const ENCODING , const char *);
192	int (PTRFASTCALL isInvalid4)(const ENCODING , const char *);
193	};
194
195	#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
196
197	#ifdef XML_MIN_SIZE
198
199	#define STANDARD_VTABLE(E) \
200	E ## byteType, \
201	E ## isNameMin, \
202	E ## isNmstrtMin, \
203	E ## byteToAscii, \
204	E ## charMatches,
205
206	#else
207
208	#define STANDARD_VTABLE(E) /* as nothing */
209
210	#endif
211
212	#define NORMAL_VTABLE(E) \
213	E ## isName2, \
214	E ## isName3, \
215	E ## isName4, \
216	E ## isNmstrt2, \
217	E ## isNmstrt3, \
218	E ## isNmstrt4, \
219	E ## isInvalid2, \
220	E ## isInvalid3, \
221	E ## isInvalid4
222
223	static int FASTCALL checkCharRefNumber(int);
224
225	#include "xmltok_impl.h"
226	#include "ascii.h"
227
228	#ifdef XML_MIN_SIZE
229	#define sb_isNameMin isNever
230	#define sb_isNmstrtMin isNever
231	#endif
232
233	#ifdef XML_MIN_SIZE
234	#define MINBPC(enc) ((enc)->minBytesPerChar)
235	#else
236	/* minimum bytes per character */
237	#define MINBPC(enc) 1
238	#endif
239
/* Byte type of the single byte at p, looked up in the type table of
   the normal_encoding that enc points to.
*/
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
242
243	#ifdef XML_MIN_SIZE
244	static int PTRFASTCALL
245	sb_byteType(const ENCODING enc, const char p)
246	{
247	return SB_BYTE_TYPE(enc, p);
248	}
249	#define BYTE_TYPE(enc, p) \
250	(AS_NORMAL_ENCODING(enc)->byteType(enc, p))
251	#else
252	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
253	#endif
254
255	#ifdef XML_MIN_SIZE
256	#define BYTE_TO_ASCII(enc, p) \
257	(AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
258	static int PTRFASTCALL
259	sb_byteToAscii(const ENCODING enc, const char p)
260	{
261	return *p;
262	}
263	#else
264	#define BYTE_TO_ASCII(enc, p) (*(p))
265	#endif
266
267	#define IS_NAME_CHAR(enc, p, n) \
268	(AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
269	#define IS_NMSTRT_CHAR(enc, p, n) \
270	(AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
271	#define IS_INVALID_CHAR(enc, p, n) \
272	(AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
273
274	#ifdef XML_MIN_SIZE
275	#define IS_NAME_CHAR_MINBPC(enc, p) \
276	(AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
277	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
278	(AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
279	#else
280	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
281	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
282	#endif
283
284	#ifdef XML_MIN_SIZE
285	#define CHAR_MATCHES(enc, p, c) \
286	(AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
287	static int PTRCALL
288	sb_charMatches(const ENCODING enc, const char p, int c)
289	{
290	return *p == c;
291	}
292	#else
293	/* c is an ASCII character */
294	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
295	#endif
296
297	#define PREFIX(ident) normal_ ## ident
298	#include "xmltok_impl.c"
299
300	#undef MINBPC
301	#undef BYTE_TYPE
302	#undef BYTE_TO_ASCII
303	#undef CHAR_MATCHES
304	#undef IS_NAME_CHAR
305	#undef IS_NAME_CHAR_MINBPC
306	#undef IS_NMSTRT_CHAR
307	#undef IS_NMSTRT_CHAR_MINBPC
308	#undef IS_INVALID_CHAR
309
310	enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
311	UTF8_cval1 = 0x00,
312	UTF8_cval2 = 0xc0,
313	UTF8_cval3 = 0xe0,
314	UTF8_cval4 = 0xf0
315	};
316
317	static void PTRCALL
318	utf8_toUtf8(const ENCODING *enc,
319	const char *fromP, const char fromLim,
320	char *toP, const char toLim)
321	{
322	char *to;
323	const char *from;
324	if (fromLim - fromP > toLim - toP) {
325	/* Avoid copying partial characters. */
326	for (fromLim = fromP + (toLim - toP); fromLim > *fromP; fromLim--)
327	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
328	break;
329	}
330	for (to = toP, from = fromP; from != fromLim; from++, to++)
331	to = from;
332	*fromP = from;
333	*toP = to;
334	}
335
336	static void PTRCALL
337	utf8_toUtf16(const ENCODING *enc,
338	const char *fromP, const char fromLim,
339	unsigned short *toP, const unsigned short toLim)
340	{
341	unsigned short to = toP;
342	const char from = fromP;
343	while (from != fromLim && to != toLim) {
344	switch (((struct normal_encoding )enc)->type[(unsigned char)from]) {
345	case BT_LEAD2:
346	*to++ = (unsigned short)(((from[0] & 0x1f) << 6) \| (from[1] & 0x3f));
347	from += 2;
348	break;
349	case BT_LEAD3:
350	*to++ = (unsigned short)(((from[0] & 0xf) << 12)
351	\| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f));
352	from += 3;
353	break;
354	case BT_LEAD4:
355	{
356	unsigned long n;
357	if (to + 1 == toLim)
358	goto after;
359	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12)
360	\| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
361	n -= 0x10000;
362	to[0] = (unsigned short)((n >> 10) \| 0xD800);
363	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
364	to += 2;
365	from += 4;
366	}
367	break;
368	default:
369	to++ = from++;
370	break;
371	}
372	}
373	after:
374	*fromP = from;
375	*toP = to;
376	}
377
378	#ifdef XML_NS
379	static const struct normal_encoding utf8_encoding_ns = {
380	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
381	{
382	#include "asciitab.h"
383	#include "utf8tab.h"
384	},
385	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
386	};
387	#endif
388
389	static const struct normal_encoding utf8_encoding = {
390	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
391	{
392	#define BT_COLON BT_NMSTRT
393	#include "asciitab.h"
394	#undef BT_COLON
395	#include "utf8tab.h"
396	},
397	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
398	};
399
400	#ifdef XML_NS
401
402	static const struct normal_encoding internal_utf8_encoding_ns = {
403	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
404	{
405	#include "iasciitab.h"
406	#include "utf8tab.h"
407	},
408	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
409	};
410
411	#endif
412
413	static const struct normal_encoding internal_utf8_encoding = {
414	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
415	{
416	#define BT_COLON BT_NMSTRT
417	#include "iasciitab.h"
418	#undef BT_COLON
419	#include "utf8tab.h"
420	},
421	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
422	};
423
424	static void PTRCALL
425	latin1_toUtf8(const ENCODING *enc,
426	const char *fromP, const char fromLim,
427	char *toP, const char toLim)
428	{
429	for (;;) {
430	unsigned char c;
431	if (*fromP == fromLim)
432	break;
433	c = (unsigned char)**fromP;
434	if (c & 0x80) {
435	if (toLim - *toP < 2)
436	break;
437	(toP)++ = (char)((c >> 6) \| UTF8_cval2);
438	(toP)++ = (char)((c & 0x3f) \| 0x80);
439	(*fromP)++;
440	}
441	else {
442	if (*toP == toLim)
443	break;
444	(toP)++ = (fromP)++;
445	}
446	}
447	}
448
449	static void PTRCALL
450	latin1_toUtf16(const ENCODING *enc,
451	const char *fromP, const char fromLim,
452	unsigned short *toP, const unsigned short toLim)
453	{
454	while (fromP != fromLim && toP != toLim)
455	(toP)++ = (unsigned char)(fromP)++;
456	}
457
458	#ifdef XML_NS
459
460	static const struct normal_encoding latin1_encoding_ns = {
461	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
462	{
463	#include "asciitab.h"
464	#include "latin1tab.h"
465	},
466	STANDARD_VTABLE(sb_)
467	};
468
469	#endif
470
471	static const struct normal_encoding latin1_encoding = {
472	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
473	{
474	#define BT_COLON BT_NMSTRT
475	#include "asciitab.h"
476	#undef BT_COLON
477	#include "latin1tab.h"
478	},
479	STANDARD_VTABLE(sb_)
480	};
481
482	static void PTRCALL
483	ascii_toUtf8(const ENCODING *enc,
484	const char *fromP, const char fromLim,
485	char *toP, const char toLim)
486	{
487	while (fromP != fromLim && toP != toLim)
488	(toP)++ = (fromP)++;
489	}
490
491	#ifdef XML_NS
492
493	static const struct normal_encoding ascii_encoding_ns = {
494	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
495	{
496	#include "asciitab.h"
497	/* BT_NONXML == 0 */
498	},
499	STANDARD_VTABLE(sb_)
500	};
501
502	#endif
503
504	static const struct normal_encoding ascii_encoding = {
505	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
506	{
507	#define BT_COLON BT_NMSTRT
508	#include "asciitab.h"
509	#undef BT_COLON
510	/* BT_NONXML == 0 */
511	},
512	STANDARD_VTABLE(sb_)
513	};
514
515	static int PTRFASTCALL
516	unicode_byte_type(char hi, char lo)
517	{
518	switch ((unsigned char)hi) {
519	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
520	return BT_LEAD4;
521	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
522	return BT_TRAIL;
523	case 0xFF:
524	switch ((unsigned char)lo) {
525	case 0xFF:
526	case 0xFE:
527	return BT_NONXML;
528	}
529	break;
530	}
531	return BT_NONASCII;
532	}
533
/* Defines E##toUtf8: convert 2-byte units at *fromP (up to fromLim)
   to UTF-8 at *toP (up to toLim).  Byte order comes from the GET_LO /
   GET_HI macros in effect where this macro is instantiated.  Each
   case first checks that the whole UTF-8 sequence fits; if not,
   *fromP is set to the current unit and the function returns.  A high
   byte of 0xD8..0xDB consumes a second unit to produce a 4-byte
   sequence.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
596
/* Defines E##toUtf16: copy 2-byte units at *fromP (up to fromLim)
   into unsigned short units at *toP (up to toLim), assembling each
   value from GET_HI / GET_LO as defined where this macro is
   instantiated.  Both pointers are advanced in place.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
610
611	#define SET2(ptr, ch) \
612	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
613	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
614	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
615
616	DEFINE_UTF16_TO_UTF8(little2_)
617	DEFINE_UTF16_TO_UTF16(little2_)
618
619	#undef SET2
620	#undef GET_LO
621	#undef GET_HI
622
623	#define SET2(ptr, ch) \
624	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
625	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
626	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
627
628	DEFINE_UTF16_TO_UTF8(big2_)
629	DEFINE_UTF16_TO_UTF16(big2_)
630
631	#undef SET2
632	#undef GET_LO
633	#undef GET_HI
634
/* Byte type of the little-endian 2-byte unit at p: when the high byte
   (p[1]) is zero the low byte is looked up in the encoding's type
   table, otherwise the unit is classified by unicode_byte_type.
*/
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
639	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
640	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
641	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
642	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
643	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
644	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
645
646	#ifdef XML_MIN_SIZE
647
648	static int PTRFASTCALL
649	little2_byteType(const ENCODING enc, const char p)
650	{
651	return LITTLE2_BYTE_TYPE(enc, p);
652	}
653
654	static int PTRFASTCALL
655	little2_byteToAscii(const ENCODING enc, const char p)
656	{
657	return LITTLE2_BYTE_TO_ASCII(enc, p);
658	}
659
660	static int PTRCALL
661	little2_charMatches(const ENCODING enc, const char p, int c)
662	{
663	return LITTLE2_CHAR_MATCHES(enc, p, c);
664	}
665
666	static int PTRFASTCALL
667	little2_isNameMin(const ENCODING enc, const char p)
668	{
669	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
670	}
671
672	static int PTRFASTCALL
673	little2_isNmstrtMin(const ENCODING enc, const char p)
674	{
675	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
676	}
677
678	#undef VTABLE
679	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
680
681	#else /* not XML_MIN_SIZE */
682
683	#undef PREFIX
684	#define PREFIX(ident) little2_ ## ident
685	#define MINBPC(enc) 2
686	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
687	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
688	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
689	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
690	#define IS_NAME_CHAR(enc, p, n) 0
691	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
692	#define IS_NMSTRT_CHAR(enc, p, n) (0)
693	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
694
695	#include "xmltok_impl.c"
696
697	#undef MINBPC
698	#undef BYTE_TYPE
699	#undef BYTE_TO_ASCII
700	#undef CHAR_MATCHES
701	#undef IS_NAME_CHAR
702	#undef IS_NAME_CHAR_MINBPC
703	#undef IS_NMSTRT_CHAR
704	#undef IS_NMSTRT_CHAR_MINBPC
705	#undef IS_INVALID_CHAR
706
707	#endif /* not XML_MIN_SIZE */
708
709	#ifdef XML_NS
710
711	static const struct normal_encoding little2_encoding_ns = {
712	{ VTABLE, 2, 0,
713	#if BYTEORDER == 1234
714	1
715	#else
716	0
717	#endif
718	},
719	{
720	#include "asciitab.h"
721	#include "latin1tab.h"
722	},
723	STANDARD_VTABLE(little2_)
724	};
725
726	#endif
727
728	static const struct normal_encoding little2_encoding = {
729	{ VTABLE, 2, 0,
730	#if BYTEORDER == 1234
731	1
732	#else
733	0
734	#endif
735	},
736	{
737	#define BT_COLON BT_NMSTRT
738	#include "asciitab.h"
739	#undef BT_COLON
740	#include "latin1tab.h"
741	},
742	STANDARD_VTABLE(little2_)
743	};
744
745	#if BYTEORDER != 4321
746
747	#ifdef XML_NS
748
749	static const struct normal_encoding internal_little2_encoding_ns = {
750	{ VTABLE, 2, 0, 1 },
751	{
752	#include "iasciitab.h"
753	#include "latin1tab.h"
754	},
755	STANDARD_VTABLE(little2_)
756	};
757
758	#endif
759
760	static const struct normal_encoding internal_little2_encoding = {
761	{ VTABLE, 2, 0, 1 },
762	{
763	#define BT_COLON BT_NMSTRT
764	#include "iasciitab.h"
765	#undef BT_COLON
766	#include "latin1tab.h"
767	},
768	STANDARD_VTABLE(little2_)
769	};
770
771	#endif
772
773
774	#define BIG2_BYTE_TYPE(enc, p) \
775	((p)[0] == 0 \
776	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
777	: unicode_byte_type((p)[0], (p)[1]))
778	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
779	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
780	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
781	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
782	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
783	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
784
785	#ifdef XML_MIN_SIZE
786
787	static int PTRFASTCALL
788	big2_byteType(const ENCODING enc, const char p)
789	{
790	return BIG2_BYTE_TYPE(enc, p);
791	}
792
793	static int PTRFASTCALL
794	big2_byteToAscii(const ENCODING enc, const char p)
795	{
796	return BIG2_BYTE_TO_ASCII(enc, p);
797	}
798
799	static int PTRCALL
800	big2_charMatches(const ENCODING enc, const char p, int c)
801	{
802	return BIG2_CHAR_MATCHES(enc, p, c);
803	}
804
805	static int PTRFASTCALL
806	big2_isNameMin(const ENCODING enc, const char p)
807	{
808	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
809	}
810
811	static int PTRFASTCALL
812	big2_isNmstrtMin(const ENCODING enc, const char p)
813	{
814	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
815	}
816
817	#undef VTABLE
818	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
819
820	#else /* not XML_MIN_SIZE */
821
822	#undef PREFIX
823	#define PREFIX(ident) big2_ ## ident
824	#define MINBPC(enc) 2
825	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
826	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
827	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
828	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
829	#define IS_NAME_CHAR(enc, p, n) 0
830	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
831	#define IS_NMSTRT_CHAR(enc, p, n) (0)
832	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
833
834	#include "xmltok_impl.c"
835
836	#undef MINBPC
837	#undef BYTE_TYPE
838	#undef BYTE_TO_ASCII
839	#undef CHAR_MATCHES
840	#undef IS_NAME_CHAR
841	#undef IS_NAME_CHAR_MINBPC
842	#undef IS_NMSTRT_CHAR
843	#undef IS_NMSTRT_CHAR_MINBPC
844	#undef IS_INVALID_CHAR
845
846	#endif /* not XML_MIN_SIZE */
847
848	#ifdef XML_NS
849
850	static const struct normal_encoding big2_encoding_ns = {
851	{ VTABLE, 2, 0,
852	#if BYTEORDER == 4321
853	1
854	#else
855	0
856	#endif
857	},
858	{
859	#include "asciitab.h"
860	#include "latin1tab.h"
861	},
862	STANDARD_VTABLE(big2_)
863	};
864
865	#endif
866
867	static const struct normal_encoding big2_encoding = {
868	{ VTABLE, 2, 0,
869	#if BYTEORDER == 4321
870	1
871	#else
872	0
873	#endif
874	},
875	{
876	#define BT_COLON BT_NMSTRT
877	#include "asciitab.h"
878	#undef BT_COLON
879	#include "latin1tab.h"
880	},
881	STANDARD_VTABLE(big2_)
882	};
883
884	#if BYTEORDER != 1234
885
886	#ifdef XML_NS
887
888	static const struct normal_encoding internal_big2_encoding_ns = {
889	{ VTABLE, 2, 0, 1 },
890	{
891	#include "iasciitab.h"
892	#include "latin1tab.h"
893	},
894	STANDARD_VTABLE(big2_)
895	};
896
897	#endif
898
899	static const struct normal_encoding internal_big2_encoding = {
900	{ VTABLE, 2, 0, 1 },
901	{
902	#define BT_COLON BT_NMSTRT
903	#include "iasciitab.h"
904	#undef BT_COLON
905	#include "latin1tab.h"
906	},
907	STANDARD_VTABLE(big2_)
908	};
909
910	#endif
911
912	#undef PREFIX
913
914	static int FASTCALL
915	streqci(const char s1, const char s2)
916	{
917	for (;;) {
918	char c1 = *s1++;
919	char c2 = *s2++;
920	if (ASCII_a <= c1 && c1 <= ASCII_z)
921	c1 += ASCII_A - ASCII_a;
922	if (ASCII_a <= c2 && c2 <= ASCII_z)
923	c2 += ASCII_A - ASCII_a;
924	if (c1 != c2)
925	return 0;
926	if (!c1)
927	break;
928	}
929	return 1;
930	}
931
932	static void PTRCALL
933	initUpdatePosition(const ENCODING enc, const char ptr,
934	const char end, POSITION pos)
935	{
936	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
937	}
938
939	static int
940	toAscii(const ENCODING enc, const char ptr, const char *end)
941	{
942	char buf[1];
943	char *p = buf;
944	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
945	if (p == buf)
946	return -1;
947	else
948	return buf[0];
949	}
950
951	static int FASTCALL
952	isSpace(int c)
953	{
954	switch (c) {
955	case 0x20:
956	case 0xD:
957	case 0xA:
958	case 0x9:
959	return 1;
960	}
961	return 0;
962	}
963
964	/* Return 1 if there's just optional white space or there's an S
965	followed by name=val.
966	*/
967	static int
968	parsePseudoAttribute(const ENCODING *enc,
969	const char *ptr,
970	const char *end,
971	const char **namePtr,
972	const char **nameEndPtr,
973	const char **valPtr,
974	const char **nextTokPtr)
975	{
976	int c;
977	char open;
978	if (ptr == end) {
979	*namePtr = NULL;
980	return 1;
981	}
982	if (!isSpace(toAscii(enc, ptr, end))) {
983	*nextTokPtr = ptr;
984	return 0;
985	}
986	do {
987	ptr += enc->minBytesPerChar;
988	} while (isSpace(toAscii(enc, ptr, end)));
989	if (ptr == end) {
990	*namePtr = NULL;
991	return 1;
992	}
993	*namePtr = ptr;
994	for (;;) {
995	c = toAscii(enc, ptr, end);
996	if (c == -1) {
997	*nextTokPtr = ptr;
998	return 0;
999	}
1000	if (c == ASCII_EQUALS) {
1001	*nameEndPtr = ptr;
1002	break;
1003	}
1004	if (isSpace(c)) {
1005	*nameEndPtr = ptr;
1006	do {
1007	ptr += enc->minBytesPerChar;
1008	} while (isSpace(c = toAscii(enc, ptr, end)));
1009	if (c != ASCII_EQUALS) {
1010	*nextTokPtr = ptr;
1011	return 0;
1012	}
1013	break;
1014	}
1015	ptr += enc->minBytesPerChar;
1016	}
1017	if (ptr == *namePtr) {
1018	*nextTokPtr = ptr;
1019	return 0;
1020	}
1021	ptr += enc->minBytesPerChar;
1022	c = toAscii(enc, ptr, end);
1023	while (isSpace(c)) {
1024	ptr += enc->minBytesPerChar;
1025	c = toAscii(enc, ptr, end);
1026	}
1027	if (c != ASCII_QUOT && c != ASCII_APOS) {
1028	*nextTokPtr = ptr;
1029	return 0;
1030	}
1031	open = (char)c;
1032	ptr += enc->minBytesPerChar;
1033	*valPtr = ptr;
1034	for (;; ptr += enc->minBytesPerChar) {
1035	c = toAscii(enc, ptr, end);
1036	if (c == open)
1037	break;
1038	if (!(ASCII_a <= c && c <= ASCII_z)
1039	&& !(ASCII_A <= c && c <= ASCII_Z)
1040	&& !(ASCII_0 <= c && c <= ASCII_9)
1041	&& c != ASCII_PERIOD
1042	&& c != ASCII_MINUS
1043	&& c != ASCII_UNDERSCORE) {
1044	*nextTokPtr = ptr;
1045	return 0;
1046	}
1047	}
1048	*nextTokPtr = ptr + enc->minBytesPerChar;
1049	return 1;
1050	}
1051
1052	static const char KW_version[] = {
1053	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1054	};
1055
1056	static const char KW_encoding[] = {
1057	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1058	};
1059
1060	static const char KW_standalone[] = {
1061	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1062	ASCII_n, ASCII_e, '\0'
1063	};
1064
1065	static const char KW_yes[] = {
1066	ASCII_y, ASCII_e, ASCII_s, '\0'
1067	};
1068
1069	static const char KW_no[] = {
1070	ASCII_n, ASCII_o, '\0'
1071	};
1072
1073	static int
1074	doParseXmlDecl(const ENCODING (encodingFinder)(const ENCODING *,
1075	const char *,
1076	const char *),
1077	int isGeneralTextEntity,
1078	const ENCODING *enc,
1079	const char *ptr,
1080	const char *end,
1081	const char **badPtr,
1082	const char **versionPtr,
1083	const char **versionEndPtr,
1084	const char **encodingName,
1085	const ENCODING **encoding,
1086	int *standalone)
1087	{
1088	const char *val = NULL;
1089	const char *name = NULL;
1090	const char *nameEnd = NULL;
1091	ptr += 5 * enc->minBytesPerChar;
1092	end -= 2 * enc->minBytesPerChar;
1093	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1094	\|\| !name) {
1095	*badPtr = ptr;
1096	return 0;
1097	}
1098	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1099	if (!isGeneralTextEntity) {
1100	*badPtr = name;
1101	return 0;
1102	}
1103	}
1104	else {
1105	if (versionPtr)
1106	*versionPtr = val;
1107	if (versionEndPtr)
1108	*versionEndPtr = ptr;
1109	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1110	*badPtr = ptr;
1111	return 0;
1112	}
1113	if (!name) {
1114	if (isGeneralTextEntity) {
1115	/* a TextDecl must have an EncodingDecl */
1116	*badPtr = ptr;
1117	return 0;
1118	}
1119	return 1;
1120	}
1121	}
1122	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1123	int c = toAscii(enc, val, end);
1124	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1125	*badPtr = val;
1126	return 0;
1127	}
1128	if (encodingName)
1129	*encodingName = val;
1130	if (encoding)
1131	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1132	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1133	*badPtr = ptr;
1134	return 0;
1135	}
1136	if (!name)
1137	return 1;
1138	}
1139	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1140	\|\| isGeneralTextEntity) {
1141	*badPtr = name;
1142	return 0;
1143	}
1144	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1145	if (standalone)
1146	*standalone = 1;
1147	}
1148	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1149	if (standalone)
1150	*standalone = 0;
1151	}
1152	else {
1153	*badPtr = val;
1154	return 0;
1155	}
1156	while (isSpace(toAscii(enc, ptr, end)))
1157	ptr += enc->minBytesPerChar;
1158	if (ptr != end) {
1159	*badPtr = ptr;
1160	return 0;
1161	}
1162	return 1;
1163	}
1164
1165	static int FASTCALL
1166	checkCharRefNumber(int result)
1167	{
1168	switch (result >> 8) {
1169	case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1170	case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1171	return -1;
1172	case 0:
1173	if (latin1_encoding.type[result] == BT_NONXML)
1174	return -1;
1175	break;
1176	case 0xFF:
1177	if (result == 0xFFFE \|\| result == 0xFFFF)
1178	return -1;
1179	break;
1180	}
1181	return result;
1182	}
1183
1184	int FASTCALL
1185	XmlUtf8Encode(int c, char *buf)
1186	{
1187	enum {
1188	/* minN is minimum legal resulting value for N byte sequence */
1189	min2 = 0x80,
1190	min3 = 0x800,
1191	min4 = 0x10000
1192	};
1193
1194	if (c < 0)
1195	return 0;
1196	if (c < min2) {
1197	buf[0] = (char)(c \| UTF8_cval1);
1198	return 1;
1199	}
1200	if (c < min3) {
1201	buf[0] = (char)((c >> 6) \| UTF8_cval2);
1202	buf[1] = (char)((c & 0x3f) \| 0x80);
1203	return 2;
1204	}
1205	if (c < min4) {
1206	buf[0] = (char)((c >> 12) \| UTF8_cval3);
1207	buf[1] = (char)(((c >> 6) & 0x3f) \| 0x80);
1208	buf[2] = (char)((c & 0x3f) \| 0x80);
1209	return 3;
1210	}
1211	if (c < 0x110000) {
1212	buf[0] = (char)((c >> 18) \| UTF8_cval4);
1213	buf[1] = (char)(((c >> 12) & 0x3f) \| 0x80);
1214	buf[2] = (char)(((c >> 6) & 0x3f) \| 0x80);
1215	buf[3] = (char)((c & 0x3f) \| 0x80);
1216	return 4;
1217	}
1218	return 0;
1219	}
1220
1221	int FASTCALL
1222	XmlUtf16Encode(int charNum, unsigned short *buf)
1223	{
1224	if (charNum < 0)
1225	return 0;
1226	if (charNum < 0x10000) {
1227	buf[0] = (unsigned short)charNum;
1228	return 1;
1229	}
1230	if (charNum < 0x110000) {
1231	charNum -= 0x10000;
1232	buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1233	buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1234	return 2;
1235	}
1236	return 0;
1237	}
1238
1239	struct unknown_encoding {
1240	struct normal_encoding normal;
1241	CONVERTER convert;
1242	void *userData;
1243	unsigned short utf16[256];
1244	char utf8[256][4];
1245	};
1246
1247	#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1248
1249	int
1250	XmlSizeOfUnknownEncoding(void)
1251	{
1252	return sizeof(struct unknown_encoding);
1253	}
1254
1255	static int PTRFASTCALL
1256	unknown_isName(const ENCODING enc, const char p)
1257	{
1258	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1259	int c = uenc->convert(uenc->userData, p);
1260	if (c & ~0xFFFF)
1261	return 0;
1262	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1263	}
1264
1265	static int PTRFASTCALL
1266	unknown_isNmstrt(const ENCODING enc, const char p)
1267	{
1268	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1269	int c = uenc->convert(uenc->userData, p);
1270	if (c & ~0xFFFF)
1271	return 0;
1272	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1273	}
1274
1275	static int PTRFASTCALL
1276	unknown_isInvalid(const ENCODING enc, const char p)
1277	{
1278	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1279	int c = uenc->convert(uenc->userData, p);
1280	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1281	}
1282
1283	static void PTRCALL
1284	unknown_toUtf8(const ENCODING *enc,
1285	const char *fromP, const char fromLim,
1286	char *toP, const char toLim)
1287	{
1288	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1289	char buf[XML_UTF8_ENCODE_MAX];
1290	for (;;) {
1291	const char *utf8;
1292	int n;
1293	if (*fromP == fromLim)
1294	break;
1295	utf8 = uenc->utf8[(unsigned char)**fromP];
1296	n = *utf8++;
1297	if (n == 0) {
1298	int c = uenc->convert(uenc->userData, *fromP);
1299	n = XmlUtf8Encode(c, buf);
1300	if (n > toLim - *toP)
1301	break;
1302	utf8 = buf;
1303	fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*fromP]
1304	- (BT_LEAD2 - 2));
1305	}
1306	else {
1307	if (n > toLim - *toP)
1308	break;
1309	(*fromP)++;
1310	}
1311	do {
1312	(toP)++ = *utf8++;
1313	} while (--n != 0);
1314	}
1315	}
1316
1317	static void PTRCALL
1318	unknown_toUtf16(const ENCODING *enc,
1319	const char *fromP, const char fromLim,
1320	unsigned short *toP, const unsigned short toLim)
1321	{
1322	const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1323	while (fromP != fromLim && toP != toLim) {
1324	unsigned short c = uenc->utf16[(unsigned char)**fromP];
1325	if (c == 0) {
1326	c = (unsigned short)
1327	uenc->convert(uenc->userData, *fromP);
1328	fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)*fromP]
1329	- (BT_LEAD2 - 2));
1330	}
1331	else
1332	(*fromP)++;
1333	(toP)++ = c;
1334	}
1335	}
1336
1337	ENCODING *
1338	XmlInitUnknownEncoding(void *mem,
1339	int *table,
1340	CONVERTER convert,
1341	void *userData)
1342	{
1343	int i;
1344	struct unknown_encoding e = (struct unknown_encoding )mem;
1345	for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1346	((char )mem)[i] = ((char )&latin1_encoding)[i];
1347	for (i = 0; i < 128; i++)
1348	if (latin1_encoding.type[i] != BT_OTHER
1349	&& latin1_encoding.type[i] != BT_NONXML
1350	&& table[i] != i)
1351	return 0;
1352	for (i = 0; i < 256; i++) {
1353	int c = table[i];
1354	if (c == -1) {
1355	e->normal.type[i] = BT_MALFORM;
1356	/* This shouldn't really get used. */
1357	e->utf16[i] = 0xFFFF;
1358	e->utf8[i][0] = 1;
1359	e->utf8[i][1] = 0;
1360	}
1361	else if (c < 0) {
1362	if (c < -4)
1363	return 0;
1364	e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1365	e->utf8[i][0] = 0;
1366	e->utf16[i] = 0;
1367	}
1368	else if (c < 0x80) {
1369	if (latin1_encoding.type[c] != BT_OTHER
1370	&& latin1_encoding.type[c] != BT_NONXML
1371	&& c != i)
1372	return 0;
1373	e->normal.type[i] = latin1_encoding.type[c];
1374	e->utf8[i][0] = 1;
1375	e->utf8[i][1] = (char)c;
1376	e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1377	}
1378	else if (checkCharRefNumber(c) < 0) {
1379	e->normal.type[i] = BT_NONXML;
1380	/* This shouldn't really get used. */
1381	e->utf16[i] = 0xFFFF;
1382	e->utf8[i][0] = 1;
1383	e->utf8[i][1] = 0;
1384	}
1385	else {
1386	if (c > 0xFFFF)
1387	return 0;
1388	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1389	e->normal.type[i] = BT_NMSTRT;
1390	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1391	e->normal.type[i] = BT_NAME;
1392	else
1393	e->normal.type[i] = BT_OTHER;
1394	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1395	e->utf16[i] = (unsigned short)c;
1396	}
1397	}
1398	e->userData = userData;
1399	e->convert = convert;
1400	if (convert) {
1401	e->normal.isName2 = unknown_isName;
1402	e->normal.isName3 = unknown_isName;
1403	e->normal.isName4 = unknown_isName;
1404	e->normal.isNmstrt2 = unknown_isNmstrt;
1405	e->normal.isNmstrt3 = unknown_isNmstrt;
1406	e->normal.isNmstrt4 = unknown_isNmstrt;
1407	e->normal.isInvalid2 = unknown_isInvalid;
1408	e->normal.isInvalid3 = unknown_isInvalid;
1409	e->normal.isInvalid4 = unknown_isInvalid;
1410	}
1411	e->normal.enc.utf8Convert = unknown_toUtf8;
1412	e->normal.enc.utf16Convert = unknown_toUtf16;
1413	return &(e->normal.enc);
1414	}
1415
/* Indices of the built-in encodings.

   getEncodingIndex and the encodings tables depend on these values;
   change them together.  Entries 0 through UTF_16LE_ENC are in the
   same order as the encodingNames array in getEncodingIndex.
*/
enum {
  UNKNOWN_ENC    = -1,  /* name not recognised */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6    /* no encoding name was given */
};
1429
1430	static const char KW_ISO_8859_1[] = {
1431	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1432	ASCII_MINUS, ASCII_1, '\0'
1433	};
1434	static const char KW_US_ASCII[] = {
1435	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1436	'\0'
1437	};
1438	static const char KW_UTF_8[] = {
1439	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1440	};
1441	static const char KW_UTF_16[] = {
1442	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1443	};
1444	static const char KW_UTF_16BE[] = {
1445	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1446	'\0'
1447	};
1448	static const char KW_UTF_16LE[] = {
1449	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1450	'\0'
1451	};
1452
1453	static int FASTCALL
1454	getEncodingIndex(const char *name)
1455	{
1456	static const char * const encodingNames[] = {
1457	KW_ISO_8859_1,
1458	KW_US_ASCII,
1459	KW_UTF_8,
1460	KW_UTF_16,
1461	KW_UTF_16BE,
1462	KW_UTF_16LE,
1463	};
1464	int i;
1465	if (name == NULL)
1466	return NO_ENC;
1467	for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1468	if (streqci(name, encodingNames[i]))
1469	return i;
1470	return UNKNOWN_ENC;
1471	}
1472
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int;
   SET_INIT_ENC_INDEX(enc, i) stores `i` narrowed to char.  Both
   arguments are fully parenthesized so any expression may be passed.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1479
1480	/* This is what detects the encoding. encodingTable maps from
1481	encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1482	the external (protocol) specified encoding; state is
1483	XML_CONTENT_STATE if we're parsing an external text entity, and
1484	XML_PROLOG_STATE otherwise.
1485	*/
1486
1487
1488	static int
1489	initScan(const ENCODING * const *encodingTable,
1490	const INIT_ENCODING *enc,
1491	int state,
1492	const char *ptr,
1493	const char *end,
1494	const char **nextTokPtr)
1495	{
1496	const ENCODING **encPtr;
1497
1498	if (ptr == end)
1499	return XML_TOK_NONE;
1500	encPtr = enc->encPtr;
1501	if (ptr + 1 == end) {
1502	/* only a single byte available for auto-detection */
1503	#ifndef XML_DTD /* FIXME */
1504	/* a well-formed document entity must have more than one byte */
1505	if (state != XML_CONTENT_STATE)
1506	return XML_TOK_PARTIAL;
1507	#endif
1508	/* so we're parsing an external text entity... */
1509	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1510	switch (INIT_ENC_INDEX(enc)) {
1511	case UTF_16_ENC:
1512	case UTF_16LE_ENC:
1513	case UTF_16BE_ENC:
1514	return XML_TOK_PARTIAL;
1515	}
1516	switch ((unsigned char)*ptr) {
1517	case 0xFE:
1518	case 0xFF:
1519	case 0xEF: /* possibly first byte of UTF-8 BOM */
1520	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1521	&& state == XML_CONTENT_STATE)
1522	break;
1523	/* fall through */
1524	case 0x00:
1525	case 0x3C:
1526	return XML_TOK_PARTIAL;
1527	}
1528	}
1529	else {
1530	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1]) {
1531	case 0xFEFF:
1532	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1533	&& state == XML_CONTENT_STATE)
1534	break;
1535	*nextTokPtr = ptr + 2;
1536	*encPtr = encodingTable[UTF_16BE_ENC];
1537	return XML_TOK_BOM;
1538	/* 00 3C is handled in the default case */
1539	case 0x3C00:
1540	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1541	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1542	&& state == XML_CONTENT_STATE)
1543	break;
1544	*encPtr = encodingTable[UTF_16LE_ENC];
1545	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1546	case 0xFFFE:
1547	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1548	&& state == XML_CONTENT_STATE)
1549	break;
1550	*nextTokPtr = ptr + 2;
1551	*encPtr = encodingTable[UTF_16LE_ENC];
1552	return XML_TOK_BOM;
1553	case 0xEFBB:
1554	/* Maybe a UTF-8 BOM (EF BB BF) */
1555	/* If there's an explicitly specified (external) encoding
1556	of ISO-8859-1 or some flavour of UTF-16
1557	and this is an external text entity,
1558	don't look for the BOM,
1559	because it might be a legal data.
1560	*/
1561	if (state == XML_CONTENT_STATE) {
1562	int e = INIT_ENC_INDEX(enc);
1563	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC
1564	\|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1565	break;
1566	}
1567	if (ptr + 2 == end)
1568	return XML_TOK_PARTIAL;
1569	if ((unsigned char)ptr[2] == 0xBF) {
1570	*nextTokPtr = ptr + 3;
1571	*encPtr = encodingTable[UTF_8_ENC];
1572	return XML_TOK_BOM;
1573	}
1574	break;
1575	default:
1576	if (ptr[0] == '\0') {
1577	/* 0 isn't a legal data character. Furthermore a document
1578	entity can only start with ASCII characters. So the only
1579	way this can fail to be big-endian UTF-16 if it it's an
1580	external parsed general entity that's labelled as
1581	UTF-16LE.
1582	*/
1583	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1584	break;
1585	*encPtr = encodingTable[UTF_16BE_ENC];
1586	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1587	}
1588	else if (ptr[1] == '\0') {
1589	/* We could recover here in the case:
1590	- parsing an external entity
1591	- second byte is 0
1592	- no externally specified encoding
1593	- no encoding declaration
1594	by assuming UTF-16LE. But we don't, because this would mean when
1595	presented just with a single byte, we couldn't reliably determine
1596	whether we needed further bytes.
1597	*/
1598	if (state == XML_CONTENT_STATE)
1599	break;
1600	*encPtr = encodingTable[UTF_16LE_ENC];
1601	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1602	}
1603	break;
1604	}
1605	}
1606	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1607	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1608	}
1609
1610
1611	#define NS(x) x
1612	#define ns(x) x
1613	#include "xmltok_ns.c"
1614	#undef NS
1615	#undef ns
1616
1617	#ifdef XML_NS
1618
1619	#define NS(x) x ## NS
1620	#define ns(x) x ## _ns
1621
1622	#include "xmltok_ns.c"
1623
1624	#undef NS
1625	#undef ns
1626
1627	ENCODING *
1628	XmlInitUnknownEncodingNS(void *mem,
1629	int *table,
1630	CONVERTER convert,
1631	void *userData)
1632	{
1633	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1634	if (enc)
1635	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1636	return enc;
1637	}
1638
1639	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Modules/expat/xmltok.c@ 10

Download in other formats: