Context Navigation

xmltok.c@ 297

Visit:

Last change on this file since 297 was 147, checked in by umoeller, 23 years ago
Misc updates for Unicode.
Property svn:eol-style set to `CRLF` Property svn:keywords set to `Author Date Id Revision`
File size: 46.9 KB

Line
1	/*
2	* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3	* See the file COPYING for copying permission.
4	*/
5
6	/* #ifdef COMPILED_FROM_DSP
7	* # include "winconfig.h"
8	* #else
9	* # include <config.h>
10	* #endif
11	*/
12
13	#include <memory.h>
14
15	#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
16
17	#pragma info(norea, nogen)
18	// disable "statement unreachable" and "missing break statement"
19	// this code generates those options HEAVILY
20
21	#include "expat\xmltok.h"
22	#include "expat\nametab.h"
23
24	#ifdef XML_DTD
25	#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
26	#else
27	#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
28	#endif
29
30	#define VTABLE1 \
31	{ PREFIX(prologTok), PREFIX(contentTok), \
32	PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
33	{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
34	PREFIX(sameName), \
35	PREFIX(nameMatchesAscii), \
36	PREFIX(nameLength), \
37	PREFIX(skipS), \
38	PREFIX(getAtts), \
39	PREFIX(charRefNumber), \
40	PREFIX(predefinedEntityName), \
41	PREFIX(updatePosition), \
42	PREFIX(isPublicId)
43
44	#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
45
/* Test whether the UCS-2 character with high byte hi and low byte lo has its
 * bit set in namingBitmap. pages[hi] selects a group of eight bitmap words,
 * the top three bits of lo select the word inside that group and the bottom
 * five bits of lo select the bit. The mask is built from 1u so that
 * selecting bit 31 does not shift into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
    (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
48
/* A 2 byte UTF-8 representation splits the character's 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. The mask is built from 1u so that
 * selecting bit 31 does not shift into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))
58
/* A 3 byte UTF-8 representation splits the character's 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. The mask is built from 1u so that
 * selecting bit 31 does not shift into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                           + ((((byte)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((byte)[1]) & 3) << 1) \
                  + ((((byte)[2]) >> 5) & 1)] \
     & (1u << (((byte)[2]) & 0x1F)))
70
71	#define UTF8_GET_NAMING(pages, p, n) \
72	((n) == 2 \
73	? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
74	: ((n) == 3 \
75	? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
76	: 0))
77
/* Non-zero if the 3 byte UTF-8 sequence at p (a pointer to unsigned char)
 * is not allowed: a lead byte 0xED whose second byte has bit 0x20 set
 * encodes a surrogate (U+D800..U+DFFF), and 0xEF 0xBF 0xBE / 0xEF 0xBF 0xBF
 * encode U+FFFE / U+FFFF. */
#define UTF8_INVALID3(p) \
    ((*(p)) == 0xED \
     ? (((p)[1] & 0x20) != 0) \
     : ((*(p)) == 0xEF \
        ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
        : 0))
84
/* Non-zero if the 4 byte UTF-8 sequence at p (a pointer to unsigned char)
 * starts with 0xF4 and has bit 0x10 or 0x20 set in its second byte, i.e.
 * encodes a value above U+10FFFF. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
86
87	static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
88	{
89	return 0;
90	}
91
92	static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
93	{
94	return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
95	}
96
97	static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
98	{
99	return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
100	}
101
102	#define utf8_isName4 isNever
103
104	static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
105	{
106	return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
107	}
108
109	static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
110	{
111	return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
112	}
113
114	#define utf8_isNmstrt4 isNever
115
116	#define utf8_isInvalid2 isNever
117
118	static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
119	{
120	return UTF8_INVALID3((const unsigned char *)p);
121	}
122
123	static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
124	{
125	return UTF8_INVALID4((const unsigned char *)p);
126	}
127
128	struct normal_encoding
129	{
130	ENCODING enc;
131	unsigned char type[256];
132	#ifdef XML_MIN_SIZE
133	int (* EXPATENTRY byteType) (const ENCODING , const char );
134	int (* EXPATENTRY isNameMin) (const ENCODING , const char );
135	int (* EXPATENTRY isNmstrtMin) (const ENCODING , const char );
136	int (* EXPATENTRY byteToAscii) (const ENCODING , const char );
137	int (* EXPATENTRY charMatches) (const ENCODING , const char , int);
138	#endif /* XML_MIN_SIZE */
139	int (* EXPATENTRY isName2) (const ENCODING , const char );
140	int (* EXPATENTRY isName3) (const ENCODING , const char );
141	int (* EXPATENTRY isName4) (const ENCODING , const char );
142	int (* EXPATENTRY isNmstrt2) (const ENCODING , const char );
143	int (* EXPATENTRY isNmstrt3) (const ENCODING , const char );
144	int (* EXPATENTRY isNmstrt4) (const ENCODING , const char );
145	int (* EXPATENTRY isInvalid2) (const ENCODING , const char );
146	int (* EXPATENTRY isInvalid3) (const ENCODING , const char );
147	int (* EXPATENTRY isInvalid4) (const ENCODING , const char );
148	};
149
150	#ifdef XML_MIN_SIZE
151
152	#define STANDARD_VTABLE(E) \
153	E ## byteType, \
154	E ## isNameMin, \
155	E ## isNmstrtMin, \
156	E ## byteToAscii, \
157	E ## charMatches,
158
159	#else
160
161	#define STANDARD_VTABLE(E) /* as nothing */
162
163	#endif
164
165	#define NORMAL_VTABLE(E) \
166	E ## isName2, \
167	E ## isName3, \
168	E ## isName4, \
169	E ## isNmstrt2, \
170	E ## isNmstrt3, \
171	E ## isNmstrt4, \
172	E ## isInvalid2, \
173	E ## isInvalid3, \
174	E ## isInvalid4
175
176	static int checkCharRefNumber(int);
177
178	#include "expat\xmltok_impl.h"
179	#include "expat\ascii.h"
180
181	#ifdef XML_MIN_SIZE
182	#define sb_isNameMin isNever
183	#define sb_isNmstrtMin isNever
184	#endif
185
186	#ifdef XML_MIN_SIZE
187	#define MINBPC(enc) ((enc)->minBytesPerChar)
188	#else
189	/* minimum bytes per character */
190	#define MINBPC(enc) 1
191	#endif
192
/* Byte type of the byte p points to, read from the type table of the
 * struct normal_encoding that enc points to. */
#define SB_BYTE_TYPE(enc, p) \
    (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
195
196	#ifdef XML_MIN_SIZE
197	static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
198	{
199	return SB_BYTE_TYPE(enc, p);
200	}
201	#define BYTE_TYPE(enc, p) \
202	(((const struct normal_encoding *)(enc))->byteType(enc, p))
203	#else
204	#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
205	#endif
206
207	#ifdef XML_MIN_SIZE
208	#define BYTE_TO_ASCII(enc, p) \
209	(((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
210	static
211	int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
212	{
213	return *p;
214	}
215	#else
216	#define BYTE_TO_ASCII(enc, p) (*(p))
217	#endif
218
219	#define IS_NAME_CHAR(enc, p, n) \
220	(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
221	#define IS_NMSTRT_CHAR(enc, p, n) \
222	(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
223	#define IS_INVALID_CHAR(enc, p, n) \
224	(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
225
226	#ifdef XML_MIN_SIZE
227	#define IS_NAME_CHAR_MINBPC(enc, p) \
228	(((const struct normal_encoding *)(enc))->isNameMin(enc, p))
229	#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
230	(((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
231	#else
232	#define IS_NAME_CHAR_MINBPC(enc, p) (0)
233	#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
234	#endif
235
236	#ifdef XML_MIN_SIZE
237	#define CHAR_MATCHES(enc, p, c) \
238	(((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
239	static
240	int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
241	{
242	return *p == c;
243	}
244	#else
245	/* c is an ASCII character */
246	#define CHAR_MATCHES(enc, p, c) (*(p) == c)
247	#endif
248
249	#define PREFIX(ident) normal_ ## ident
250	#include "xmltok_impl.c"
251
252	#undef MINBPC
253	#undef BYTE_TYPE
254	#undef BYTE_TO_ASCII
255	#undef CHAR_MATCHES
256	#undef IS_NAME_CHAR
257	#undef IS_NAME_CHAR_MINBPC
258	#undef IS_NMSTRT_CHAR
259	#undef IS_NMSTRT_CHAR_MINBPC
260	#undef IS_INVALID_CHAR
261
/* UTF8_cvalN is the marker pattern OR-ed into the first byte of an
 * N byte UTF-8 sequence. */
enum
{
    UTF8_cval1 = 0x00,  /* 0xxxxxxx */
    UTF8_cval2 = 0xC0,  /* 110xxxxx */
    UTF8_cval3 = 0xE0,  /* 1110xxxx */
    UTF8_cval4 = 0xF0   /* 11110xxx */
};
269
270	static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
271	const char **fromP,
272	const char *fromLim,
273	char **toP,
274	const char *toLim)
275	{
276	char *to;
277	const char *from;
278
279	if (fromLim - fromP > toLim - toP)
280	{
281	/* Avoid copying partial characters. */
282	for (fromLim = fromP + (toLim - toP);
283	fromLim > *fromP;
284	fromLim--)
285	if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
286	break;
287	}
288	for (to = toP, from = fromP;
289	from != fromLim;
290	from++, to++)
291	to = from;
292	*fromP = from;
293	*toP = to;
294	}
295
296	static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
297	const char **fromP,
298	const char *fromLim,
299	unsigned short **toP,
300	const unsigned short *toLim)
301	{
302	unsigned short to = toP;
303	const char from = fromP;
304
305	while (from != fromLim && to != toLim)
306	{
307	switch (((struct normal_encoding )enc)->type[(unsigned char)from])
308	{
309	case BT_LEAD2:
310	*to++ = ((from[0] & 0x1f) << 6) \| (from[1] & 0x3f);
311	from += 2;
312	break;
313	case BT_LEAD3:
314	*to++ = ((from[0] & 0xf) << 12) \| ((from[1] & 0x3f) << 6) \| (from[2] & 0x3f);
315	from += 3;
316	break;
317	case BT_LEAD4:
318	{
319	unsigned long n;
320
321	if (to + 1 == toLim)
322	break;
323	n = ((from[0] & 0x7) << 18) \| ((from[1] & 0x3f) << 12) \| ((from[2] & 0x3f) << 6) \| (from[3] & 0x3f);
324	n -= 0x10000;
325	to[0] = (unsigned short)((n >> 10) \| 0xD800);
326	to[1] = (unsigned short)((n & 0x3FF) \| 0xDC00);
327	to += 2;
328	from += 4;
329	}
330	break;
331	default:
332	to++ = from++;
333	break;
334	}
335	}
336	*fromP = from;
337	*toP = to;
338	}
339
340	#ifdef XML_NS
341	static const struct normal_encoding utf8_encoding_ns =
342	{
343	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
344	{
345	#include "expat\asciitab.h"
346	#include "expat\utf8tab.h"
347	},
348	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
349	};
350
351	#endif
352
353	static const struct normal_encoding utf8_encoding =
354	{
355	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
356	{
357	#define BT_COLON BT_NMSTRT
358	#include "expat\asciitab.h"
359	#undef BT_COLON
360	#include "expat\utf8tab.h"
361	},
362	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
363	};
364
365	#ifdef XML_NS
366
367	static const struct normal_encoding internal_utf8_encoding_ns =
368	{
369	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
370	{
371	#include "expat\iasciitab.h"
372	#include "expat\utf8tab.h"
373	},
374	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
375	};
376
377	#endif
378
379	static const struct normal_encoding internal_utf8_encoding =
380	{
381	{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
382	{
383	#define BT_COLON BT_NMSTRT
384	#include "expat\iasciitab.h"
385	#undef BT_COLON
386	#include "expat\utf8tab.h"
387	},
388	STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
389	};
390
391	static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
392	const char **fromP,
393	const char *fromLim,
394	char **toP,
395	const char *toLim)
396	{
397	for (;;)
398	{
399	unsigned char c;
400
401	if (*fromP == fromLim)
402	break;
403	c = (unsigned char)**fromP;
404	if (c & 0x80)
405	{
406	if (toLim - *toP < 2)
407	break;
408	(toP)++ = ((c >> 6) \| UTF8_cval2);
409	(toP)++ = ((c & 0x3f) \| 0x80);
410	(*fromP)++;
411	}
412	else
413	{
414	if (*toP == toLim)
415	break;
416	(toP)++ = (fromP)++;
417	}
418	}
419	}
420
421	static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
422	const char **fromP,
423	const char *fromLim,
424	unsigned short **toP,
425	const unsigned short *toLim)
426	{
427	while (fromP != fromLim && toP != toLim)
428	(toP)++ = (unsigned char)(fromP)++;
429	}
430
431	#ifdef XML_NS
432
433	static const struct normal_encoding latin1_encoding_ns =
434	{
435	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
436	{
437	#include "expat\asciitab.h"
438	#include "expat\latin1tab.h"
439	},
440	STANDARD_VTABLE(sb_)
441	};
442
443	#endif
444
445	static const struct normal_encoding latin1_encoding =
446	{
447	{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
448	{
449	#define BT_COLON BT_NMSTRT
450	#include "expat\asciitab.h"
451	#undef BT_COLON
452	#include "expat\latin1tab.h"
453	},
454	STANDARD_VTABLE(sb_)
455	};
456
457	static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
458	const char **fromP,
459	const char *fromLim,
460	char **toP,
461	const char *toLim)
462	{
463	while (fromP != fromLim && toP != toLim)
464	(toP)++ = (fromP)++;
465	}
466
467	#ifdef XML_NS
468
469	static const struct normal_encoding ascii_encoding_ns =
470	{
471	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
472	{
473	#include "expat\asciitab.h"
474	/* BT_NONXML == 0 */
475	},
476	STANDARD_VTABLE(sb_)
477	};
478
479	#endif
480
481	static const struct normal_encoding ascii_encoding =
482	{
483	{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
484	{
485	#define BT_COLON BT_NMSTRT
486	#include "expat\asciitab.h"
487	#undef BT_COLON
488	/* BT_NONXML == 0 */
489	},
490	STANDARD_VTABLE(sb_)
491	};
492
493	static int unicode_byte_type(char hi, char lo)
494	{
495	switch ((unsigned char)hi)
496	{
497	case 0xD8:
498	case 0xD9:
499	case 0xDA:
500	case 0xDB:
501	return BT_LEAD4;
502	case 0xDC:
503	case 0xDD:
504	case 0xDE:
505	case 0xDF:
506	return BT_TRAIL;
507	case 0xFF:
508	switch ((unsigned char)lo)
509	{
510	case 0xFF:
511	case 0xFE:
512	return BT_NONXML;
513	}
514	break;
515	}
516	return BT_NONASCII;
517	}
518
/* Expands to the definition of E##toUtf8, which converts UTF-16 code units
 * (two bytes each, read through GET_LO / GET_HI as defined at the point of
 * expansion) from *fromP up to fromLim into UTF-8 at *toP up to toLim.
 * Conversion stops before any character that no longer fits in the output;
 * *fromP is then left at that character. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
                                   const char **fromP, const char *fromLim, \
                                   char **toP, const char *toLim) \
{ \
    const char *from; \
    for (from = *fromP; from != fromLim; from += 2) { \
        int plane; \
        unsigned char lo2; \
        unsigned char lo = GET_LO(from); \
        unsigned char hi = GET_HI(from); \
        switch (hi) { \
        case 0: \
            if (lo < 0x80) { \
                if (*toP == toLim) { \
                    *fromP = from; \
                    return; \
                } \
                *(*toP)++ = lo; \
                break; \
            } \
            /* fall through */ \
        case 0x1: case 0x2: case 0x3: \
        case 0x4: case 0x5: case 0x6: case 0x7: \
            if (toLim - *toP < 2) { \
                *fromP = from; \
                return; \
            } \
            *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
            *(*toP)++ = ((lo & 0x3f) | 0x80); \
            break; \
        default: \
            if (toLim - *toP < 3) { \
                *fromP = from; \
                return; \
            } \
            /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
            *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
            *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
            *(*toP)++ = ((lo & 0x3f) | 0x80); \
            break; \
        case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
            /* surrogate pair: consumes two code units, emits 4 bytes */ \
            if (toLim - *toP < 4) { \
                *fromP = from; \
                return; \
            } \
            plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
            *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
            *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
            from += 2; \
            lo2 = GET_LO(from); \
            *(*toP)++ = (((lo & 0x3) << 4) \
                         | ((GET_HI(from) & 0x3) << 2) \
                         | (lo2 >> 6) \
                         | 0x80); \
            *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
            break; \
        } \
    } \
    *fromP = from; \
}
580
/* Expands to the definition of E##toUtf16, which copies UTF-16 code units
 * (two bytes each, read through GET_LO / GET_HI as defined at the point of
 * expansion) from *fromP up to fromLim into unsigned shorts at *toP up to
 * toLim, advancing both pointers in place. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
                                    const char **fromP, const char *fromLim, \
                                    unsigned short **toP, const unsigned short *toLim) \
{ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
        fromLim -= 2; \
    for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
        *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
593
594	#define SET2(ptr, ch) \
595	(((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
596	#define GET_LO(ptr) ((unsigned char)(ptr)[0])
597	#define GET_HI(ptr) ((unsigned char)(ptr)[1])
598
599	DEFINE_UTF16_TO_UTF8(little2_)
600	DEFINE_UTF16_TO_UTF16(little2_)
601
602	#undef SET2
603	#undef GET_LO
604	#undef GET_HI
605
606	#define SET2(ptr, ch) \
607	(((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
608	#define GET_LO(ptr) ((unsigned char)(ptr)[1])
609	#define GET_HI(ptr) ((unsigned char)(ptr)[0])
610
611	DEFINE_UTF16_TO_UTF8(big2_)
612	DEFINE_UTF16_TO_UTF16(big2_)
613
614	#undef SET2
615	#undef GET_LO
616	#undef GET_HI
617
/* Byte type of the little-endian 2 byte character at p: when the high byte
 * (p[1]) is zero the low byte is looked up in the encoding's type table,
 * otherwise the pair is classified by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
    ((p)[1] == 0 \
     ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
     : unicode_byte_type((p)[1], (p)[0]))
622	#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
623	#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
624	#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
625	UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
626	#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
627	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
628
629	#ifdef XML_MIN_SIZE
630	static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
631	{
632	return LITTLE2_BYTE_TYPE(enc, p);
633	}
634
635	static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
636	{
637	return LITTLE2_BYTE_TO_ASCII(enc, p);
638	}
639
640	static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
641	{
642	return LITTLE2_CHAR_MATCHES(enc, p, c);
643	}
644
645	static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
646	{
647	return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
648	}
649
650	static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
651	{
652	return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
653	}
654
655	#undef VTABLE
656	#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
657
658	#else /* not XML_MIN_SIZE */
659
660	#undef PREFIX
661	#define PREFIX(ident) little2_ ## ident
662	#define MINBPC(enc) 2
663	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
664	#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
665	#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
666	#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
667	#define IS_NAME_CHAR(enc, p, n) 0
668	#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
669	#define IS_NMSTRT_CHAR(enc, p, n) (0)
670	#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
671
672	#include "xmltok_impl.c"
673
674	#undef MINBPC
675	#undef BYTE_TYPE
676	#undef BYTE_TO_ASCII
677	#undef CHAR_MATCHES
678	#undef IS_NAME_CHAR
679	#undef IS_NAME_CHAR_MINBPC
680	#undef IS_NMSTRT_CHAR
681	#undef IS_NMSTRT_CHAR_MINBPC
682	#undef IS_INVALID_CHAR
683
684	#endif /* not XML_MIN_SIZE */
685
686	#ifdef XML_NS
687
688	static const struct normal_encoding little2_encoding_ns =
689	{
690	{VTABLE, 2, 0,
691	#if XML_BYTE_ORDER == 12
692	1
693	#else
694	0
695	#endif
696	},
697	{
698	#include "expat\asciitab.h"
699	#include "expat\latin1tab.h"
700	},
701	STANDARD_VTABLE(little2_)
702	};
703
704	#endif
705
706	static const struct normal_encoding little2_encoding =
707	{
708	{VTABLE, 2, 0,
709	#if XML_BYTE_ORDER == 12
710	1
711	#else
712	0
713	#endif
714	},
715	{
716	#define BT_COLON BT_NMSTRT
717	#include "expat\asciitab.h"
718	#undef BT_COLON
719	#include "expat\latin1tab.h"
720	},
721	STANDARD_VTABLE(little2_)
722	};
723
724	#if XML_BYTE_ORDER != 21
725
726	#ifdef XML_NS
727
728	static const struct normal_encoding internal_little2_encoding_ns =
729	{
730	{VTABLE, 2, 0, 1},
731	{
732	#include "expat\iasciitab.h"
733	#include "expat\latin1tab.h"
734	},
735	STANDARD_VTABLE(little2_)
736	};
737
738	#endif
739
740	static const struct normal_encoding internal_little2_encoding =
741	{
742	{VTABLE, 2, 0, 1},
743	{
744	#define BT_COLON BT_NMSTRT
745	#include "expat\iasciitab.h"
746	#undef BT_COLON
747	#include "expat\latin1tab.h"
748	},
749	STANDARD_VTABLE(little2_)
750	};
751
752	#endif
753
754
755	#define BIG2_BYTE_TYPE(enc, p) \
756	((p)[0] == 0 \
757	? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
758	: unicode_byte_type((p)[0], (p)[1]))
759	#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
760	#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
761	#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
762	UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
763	#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
764	UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
765
766	#ifdef XML_MIN_SIZE
767
768	static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
769	{
770	return BIG2_BYTE_TYPE(enc, p);
771	}
772
773	static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
774	{
775	return BIG2_BYTE_TO_ASCII(enc, p);
776	}
777
778	static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
779	{
780	return BIG2_CHAR_MATCHES(enc, p, c);
781	}
782
783	static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
784	{
785	return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
786	}
787
788	static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
789	{
790	return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
791	}
792
793	#undef VTABLE
794	#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
795
796	#else /* not XML_MIN_SIZE */
797
798	#undef PREFIX
799	#define PREFIX(ident) big2_ ## ident
800	#define MINBPC(enc) 2
801	/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
802	#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
803	#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
804	#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
805	#define IS_NAME_CHAR(enc, p, n) 0
806	#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
807	#define IS_NMSTRT_CHAR(enc, p, n) (0)
808	#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
809
810	#include "xmltok_impl.c"
811
812	#undef MINBPC
813	#undef BYTE_TYPE
814	#undef BYTE_TO_ASCII
815	#undef CHAR_MATCHES
816	#undef IS_NAME_CHAR
817	#undef IS_NAME_CHAR_MINBPC
818	#undef IS_NMSTRT_CHAR
819	#undef IS_NMSTRT_CHAR_MINBPC
820	#undef IS_INVALID_CHAR
821
822	#endif /* not XML_MIN_SIZE */
823
824	#ifdef XML_NS
825
826	static const struct normal_encoding big2_encoding_ns =
827	{
828	{VTABLE, 2, 0,
829	#if XML_BYTE_ORDER == 21
830	1
831	#else
832	0
833	#endif
834	},
835	{
836	#include "expat\asciitab.h"
837	#include "expat\latin1tab.h"
838	},
839	STANDARD_VTABLE(big2_)
840	};
841
842	#endif
843
844	static const struct normal_encoding big2_encoding =
845	{
846	{VTABLE, 2, 0,
847	#if XML_BYTE_ORDER == 21
848	1
849	#else
850	0
851	#endif
852	},
853	{
854	#define BT_COLON BT_NMSTRT
855	#include "expat\asciitab.h"
856	#undef BT_COLON
857	#include "expat\latin1tab.h"
858	},
859	STANDARD_VTABLE(big2_)
860	};
861
862	#if XML_BYTE_ORDER != 12
863
864	#ifdef XML_NS
865
866	static const struct normal_encoding internal_big2_encoding_ns =
867	{
868	{VTABLE, 2, 0, 1},
869	{
870	#include "expat\iasciitab.h"
871	#include "expat\latin1tab.h"
872	},
873	STANDARD_VTABLE(big2_)
874	};
875
876	#endif
877
878	static const struct normal_encoding internal_big2_encoding =
879	{
880	{VTABLE, 2, 0, 1},
881	{
882	#define BT_COLON BT_NMSTRT
883	#include "expat\iasciitab.h"
884	#undef BT_COLON
885	#include "expat\latin1tab.h"
886	},
887	STANDARD_VTABLE(big2_)
888	};
889
890	#endif
891
892	#undef PREFIX
893
894	static int streqci(const char s1, const char s2)
895	{
896	for (;;)
897	{
898	char c1 = *s1++;
899	char c2 = *s2++;
900
901	if (ASCII_a <= c1 && c1 <= ASCII_z)
902	c1 += ASCII_A - ASCII_a;
903	if (ASCII_a <= c2 && c2 <= ASCII_z)
904	c2 += ASCII_A - ASCII_a;
905	if (c1 != c2)
906	return 0;
907	if (!c1)
908	break;
909	}
910	return 1;
911	}
912
913	static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
914	const char *ptr,
915	const char *end,
916	POSITION * pos)
917	{
918	normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
919	}
920
921	static int EXPATENTRY toAscii(const ENCODING * enc,
922	const char *ptr,
923	const char *end)
924	{
925	char buf[1];
926	char *p = buf;
927
928	XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
929	if (p == buf)
930	return -1;
931	else
932	return buf[0];
933	}
934
935	static int EXPATENTRY isSpace(int c)
936	{
937	switch (c)
938	{
939	case 0x20:
940	case 0xD:
941	case 0xA:
942	case 0x9:
943	return 1;
944	}
945	return 0;
946	}
947
948	/* Return 1 if there's just optional white space
949	* or there's an S followed by name=val. */
950	static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
951	const char *ptr,
952	const char *end,
953	const char **namePtr,
954	const char **nameEndPtr,
955	const char **valPtr,
956	const char **nextTokPtr)
957	{
958	int c;
959	char open;
960
961	if (ptr == end)
962	{
963	*namePtr = 0;
964	return 1;
965	}
966	if (!isSpace(toAscii(enc, ptr, end)))
967	{
968	*nextTokPtr = ptr;
969	return 0;
970	}
971	do
972	{
973	ptr += enc->minBytesPerChar;
974	}
975	while (isSpace(toAscii(enc, ptr, end)));
976	if (ptr == end)
977	{
978	*namePtr = 0;
979	return 1;
980	}
981	*namePtr = ptr;
982	for (;;)
983	{
984	c = toAscii(enc, ptr, end);
985	if (c == -1)
986	{
987	*nextTokPtr = ptr;
988	return 0;
989	}
990	if (c == ASCII_EQUALS)
991	{
992	*nameEndPtr = ptr;
993	break;
994	}
995	if (isSpace(c))
996	{
997	*nameEndPtr = ptr;
998	do
999	{
1000	ptr += enc->minBytesPerChar;
1001	}
1002	while (isSpace(c = toAscii(enc, ptr, end)));
1003	if (c != ASCII_EQUALS)
1004	{
1005	*nextTokPtr = ptr;
1006	return 0;
1007	}
1008	break;
1009	}
1010	ptr += enc->minBytesPerChar;
1011	}
1012	if (ptr == *namePtr)
1013	{
1014	*nextTokPtr = ptr;
1015	return 0;
1016	}
1017	ptr += enc->minBytesPerChar;
1018	c = toAscii(enc, ptr, end);
1019	while (isSpace(c))
1020	{
1021	ptr += enc->minBytesPerChar;
1022	c = toAscii(enc, ptr, end);
1023	}
1024	if (c != ASCII_QUOT && c != ASCII_APOS)
1025	{
1026	*nextTokPtr = ptr;
1027	return 0;
1028	}
1029	open = c;
1030	ptr += enc->minBytesPerChar;
1031	*valPtr = ptr;
1032	for (;; ptr += enc->minBytesPerChar)
1033	{
1034	c = toAscii(enc, ptr, end);
1035	if (c == open)
1036	break;
1037	if (!(ASCII_a <= c && c <= ASCII_z)
1038	&& !(ASCII_A <= c && c <= ASCII_Z)
1039	&& !(ASCII_0 <= c && c <= ASCII_9)
1040	&& c != ASCII_PERIOD
1041	&& c != ASCII_MINUS
1042	&& c != ASCII_UNDERSCORE)
1043	{
1044	*nextTokPtr = ptr;
1045	return 0;
1046	}
1047	}
1048	*nextTokPtr = ptr + enc->minBytesPerChar;
1049	return 1;
1050	}
1051
1052	static const char KW_version[] =
1053	{
1054	ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1055	};
1056
1057	static const char KW_encoding[] =
1058	{
1059	ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1060	};
1061
1062	static const char KW_standalone[] =
1063	{
1064	ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1065	};
1066
1067	static const char KW_yes[] =
1068	{
1069	ASCII_y, ASCII_e, ASCII_s, '\0'
1070	};
1071
1072	static const char KW_no[] =
1073	{
1074	ASCII_n, ASCII_o, '\0'
1075	};
1076
1077	static int doParseXmlDecl(const ENCODING * (encodingFinder) (const ENCODING ,
1078	const char *,
1079	const char *),
1080	int isGeneralTextEntity,
1081	const ENCODING * enc,
1082	const char *ptr,
1083	const char *end,
1084	const char **badPtr,
1085	const char **versionPtr,
1086	const char **versionEndPtr,
1087	const char **encodingName,
1088	const ENCODING ** encoding,
1089	int *standalone)
1090	{
1091	const char *val = 0;
1092	const char *name = 0;
1093	const char *nameEnd = 0;
1094
1095	ptr += 5 * enc->minBytesPerChar;
1096	end -= 2 * enc->minBytesPerChar;
1097	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) \|\| !name)
1098	{
1099	*badPtr = ptr;
1100	return 0;
1101	}
1102	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1103	{
1104	if (!isGeneralTextEntity)
1105	{
1106	*badPtr = name;
1107	return 0;
1108	}
1109	}
1110	else
1111	{
1112	if (versionPtr)
1113	*versionPtr = val;
1114	if (versionEndPtr)
1115	*versionEndPtr = ptr;
1116	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1117	{
1118	*badPtr = ptr;
1119	return 0;
1120	}
1121	if (!name)
1122	{
1123	if (isGeneralTextEntity)
1124	{
1125	/* a TextDecl must have an EncodingDecl */
1126	*badPtr = ptr;
1127	return 0;
1128	}
1129	return 1;
1130	}
1131	}
1132	if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1133	{
1134	int c = toAscii(enc, val, end);
1135
1136	if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1137	{
1138	*badPtr = val;
1139	return 0;
1140	}
1141	if (encodingName)
1142	*encodingName = val;
1143	if (encoding)
1144	*encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1145	if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1146	{
1147	*badPtr = ptr;
1148	return 0;
1149	}
1150	if (!name)
1151	return 1;
1152	}
1153	if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) \|\| isGeneralTextEntity)
1154	{
1155	*badPtr = name;
1156	return 0;
1157	}
1158	if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1159	{
1160	if (standalone)
1161	*standalone = 1;
1162	}
1163	else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1164	{
1165	if (standalone)
1166	*standalone = 0;
1167	}
1168	else
1169	{
1170	*badPtr = val;
1171	return 0;
1172	}
1173	while (isSpace(toAscii(enc, ptr, end)))
1174	ptr += enc->minBytesPerChar;
1175	if (ptr != end)
1176	{
1177	*badPtr = ptr;
1178	return 0;
1179	}
1180	return 1;
1181	}
1182
1183	static int checkCharRefNumber(int result)
1184	{
1185	switch (result >> 8)
1186	{
1187	case 0xD8:
1188	case 0xD9:
1189	case 0xDA:
1190	case 0xDB:
1191	case 0xDC:
1192	case 0xDD:
1193	case 0xDE:
1194	case 0xDF:
1195	return -1;
1196	case 0:
1197	if (latin1_encoding.type[result] == BT_NONXML)
1198	return -1;
1199	break;
1200	case 0xFF:
1201	if (result == 0xFFFE \|\| result == 0xFFFF)
1202	return -1;
1203	break;
1204	}
1205	return result;
1206	}
1207
1208	int XmlUtf8Encode(int c, char *buf)
1209	{
1210	enum
1211	{
1212	/* minN is minimum legal resulting value for N byte sequence */
1213	min2 = 0x80,
1214	min3 = 0x800,
1215	min4 = 0x10000
1216	};
1217
1218	if (c < 0)
1219	return 0;
1220	if (c < min2)
1221	{
1222	buf[0] = (c \| UTF8_cval1);
1223	return 1;
1224	}
1225	if (c < min3)
1226	{
1227	buf[0] = ((c >> 6) \| UTF8_cval2);
1228	buf[1] = ((c & 0x3f) \| 0x80);
1229	return 2;
1230	}
1231	if (c < min4)
1232	{
1233	buf[0] = ((c >> 12) \| UTF8_cval3);
1234	buf[1] = (((c >> 6) & 0x3f) \| 0x80);
1235	buf[2] = ((c & 0x3f) \| 0x80);
1236	return 3;
1237	}
1238	if (c < 0x110000)
1239	{
1240	buf[0] = ((c >> 18) \| UTF8_cval4);
1241	buf[1] = (((c >> 12) & 0x3f) \| 0x80);
1242	buf[2] = (((c >> 6) & 0x3f) \| 0x80);
1243	buf[3] = ((c & 0x3f) \| 0x80);
1244	return 4;
1245	}
1246	return 0;
1247	}
1248
/* Encode the code point charNum as UTF-16 into buf. Values below 0x10000
 * are stored as a single unit; values below 0x110000 are stored as a
 * surrogate pair. Returns the number of units written (1 or 2), or 0 when
 * charNum is negative or not below 0x110000 (buf is left untouched). */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum < 0x10000)
    {
        buf[0] = (unsigned short)charNum;
        return 1;
    }
    else
    {
        /* 20 bit offset into the supplementary planes */
        const int offset = charNum - 0x10000;

        buf[0] = (unsigned short)(0xD800 + (offset >> 10));
        buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
        return 2;
    }
}
1267
1268	struct unknown_encoding
1269	{
1270	struct normal_encoding normal;
1271	int (convert) (void userData, const char *p);
1272	void *userData;
1273	unsigned short utf16[256];
1274	char utf8[256][4];
1275	};
1276
1277	int XmlSizeOfUnknownEncoding(void)
1278	{
1279	return sizeof(struct unknown_encoding);
1280	}
1281
1282	static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1283	{
1284	int c = ((const struct unknown_encoding *)enc)
1285	->convert(((const struct unknown_encoding *)enc)->userData, p);
1286
1287	if (c & ~0xFFFF)
1288	return 0;
1289	return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1290	}
1291
1292	static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1293	{
1294	int c = ((const struct unknown_encoding *)enc)
1295	->convert(((const struct unknown_encoding *)enc)->userData, p);
1296
1297	if (c & ~0xFFFF)
1298	return 0;
1299	return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1300	}
1301
1302	static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1303	{
1304	int c = ((const struct unknown_encoding *)enc)
1305	->convert(((const struct unknown_encoding *)enc)->userData, p);
1306
1307	return (c & ~0xFFFF) \|\| checkCharRefNumber(c) < 0;
1308	}
1309
1310	static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1311	const char **fromP,
1312	const char *fromLim,
1313	char **toP,
1314	const char *toLim)
1315	{
1316	char buf[XML_UTF8_ENCODE_MAX];
1317
1318	for (;;)
1319	{
1320	const char *utf8;
1321	int n;
1322
1323	if (*fromP == fromLim)
1324	break;
1325	utf8 = ((const struct unknown_encoding )enc)->utf8[(unsigned char)*fromP];
1326	n = *utf8++;
1327	if (n == 0)
1328	{
1329	int c = ((const struct unknown_encoding *)enc)
1330	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1331
1332	n = XmlUtf8Encode(c, buf);
1333	if (n > toLim - *toP)
1334	break;
1335	utf8 = buf;
1336	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1337	- (BT_LEAD2 - 2);
1338	}
1339	else
1340	{
1341	if (n > toLim - *toP)
1342	break;
1343	(*fromP)++;
1344	}
1345	do
1346	{
1347	(toP)++ = *utf8++;
1348	}
1349	while (--n != 0);
1350	}
1351	}
1352
1353	static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1354	const char **fromP,
1355	const char *fromLim,
1356	unsigned short **toP,
1357	const unsigned short *toLim)
1358	{
1359	while (fromP != fromLim && toP != toLim)
1360	{
1361	unsigned short c
1362	= ((const struct unknown_encoding )enc)->utf16[(unsigned char)*fromP];
1363
1364	if (c == 0)
1365	{
1366	c = (unsigned short)((const struct unknown_encoding *)enc)
1367	->convert(((const struct unknown_encoding )enc)->userData, fromP);
1368	fromP += ((const struct normal_encoding )enc)->type[(unsigned char)**fromP]
1369	- (BT_LEAD2 - 2);
1370	}
1371	else
1372	(*fromP)++;
1373	(toP)++ = c;
1374	}
1375	}
1376
1377	/*
1378	*@@ XmlInitUnknownEncoding:
1379	*
1380	*@@changed V0.9.14 (2001-08-09) [umoeller]: couple of performance hacks
1381	*/
1382
1383	ENCODING* XmlInitUnknownEncoding(void *mem,
1384	int *table,
1385	int (convert) (void userData, const char *p),
1386	void *userData)
1387	{
1388	int i;
1389	struct unknown_encoding e = (struct unknown_encoding)mem;
1390
1391	// gee, isn't this a regular memcpy?!?
1392	/* for (i = 0;
1393	i < (int)sizeof(struct normal_encoding);
1394	i++)
1395	((char )mem)[i] = ((char )&latin1_encoding)[i]; */
1396
1397	// replaced the above with this V0.9.14 (2001-08-09) [umoeller]
1398	memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1399
1400	for (i = 0; i < 128; i++)
1401	if ( latin1_encoding.type[i] != BT_OTHER
1402	&& latin1_encoding.type[i] != BT_NONXML
1403	&& table[i] != i
1404	)
1405	return 0;
1406
1407	for (i = 0; i < 256; i++)
1408	{
1409	int c = table[i];
1410
1411	if (c == -1)
1412	{
1413	e->normal.type[i] = BT_MALFORM;
1414	/* This shouldn't really get used. */
1415	e->utf16[i] = 0xFFFF;
1416	e->utf8[i][0] = 1;
1417	e->utf8[i][1] = 0;
1418	}
1419	else if (c < 0)
1420	{
1421	if (c < -4)
1422	return 0;
1423	e->normal.type[i] = BT_LEAD2 - (c + 2);
1424	e->utf8[i][0] = 0;
1425	e->utf16[i] = 0;
1426	}
1427	else if (c < 0x80)
1428	{
1429	if ( latin1_encoding.type[c] != BT_OTHER
1430	&& latin1_encoding.type[c] != BT_NONXML
1431	&& c != i
1432	)
1433	return 0;
1434	e->normal.type[i] = latin1_encoding.type[c];
1435	e->utf8[i][0] = 1;
1436	e->utf8[i][1] = (char)c;
1437	e->utf16[i] = c == 0 ? 0xFFFF : c;
1438	}
1439	else if (checkCharRefNumber(c) < 0)
1440	{
1441	e->normal.type[i] = BT_NONXML;
1442	/* This shouldn't really get used. */
1443	e->utf16[i] = 0xFFFF;
1444	e->utf8[i][0] = 1;
1445	e->utf8[i][1] = 0;
1446	}
1447	else
1448	{
1449	if (c > 0xFFFF)
1450	return 0;
1451	if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1452	e->normal.type[i] = BT_NMSTRT;
1453	else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1454	e->normal.type[i] = BT_NAME;
1455	else
1456	e->normal.type[i] = BT_OTHER;
1457	e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1458	e->utf16[i] = c;
1459	}
1460	}
1461	e->userData = userData;
1462	e->convert = convert;
1463	if (convert)
1464	{
1465	e->normal.isName2 = unknown_isName;
1466	e->normal.isName3 = unknown_isName;
1467	e->normal.isName4 = unknown_isName;
1468	e->normal.isNmstrt2 = unknown_isNmstrt;
1469	e->normal.isNmstrt3 = unknown_isNmstrt;
1470	e->normal.isNmstrt4 = unknown_isNmstrt;
1471	e->normal.isInvalid2 = unknown_isInvalid;
1472	e->normal.isInvalid3 = unknown_isInvalid;
1473	e->normal.isInvalid4 = unknown_isInvalid;
1474	}
1475	e->normal.enc.utf8Convert = unknown_toUtf8;
1476	e->normal.enc.utf16Convert = unknown_toUtf16;
1477	return &(e->normal.enc);
1478	}
1479
/* Indices of the built-in encodings. getEncodingIndex and the
 * encodings tables rely on these values and must be changed
 * together with this enumeration. */
enum
{
    UNKNOWN_ENC    = -1,
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC   = 1,
    UTF_8_ENC      = 2,
    UTF_16_ENC     = 3,
    UTF_16BE_ENC   = 4,
    UTF_16LE_ENC   = 5,
    /* must match encodingNames up to here */
    NO_ENC         = 6
};
1494
1495	static const char KW_ISO_8859_1[] =
1496	{
1497	ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1498	};
1499	static const char KW_US_ASCII[] =
1500	{
1501	ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1502	};
1503	static const char KW_UTF_8[] =
1504	{
1505	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1506	};
1507	static const char KW_UTF_16[] =
1508	{
1509	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1510	};
1511	static const char KW_UTF_16BE[] =
1512	{
1513	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1514	};
1515	static const char KW_UTF_16LE[] =
1516	{
1517	ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1518	};
1519
1520	static int getEncodingIndex(const char *name)
1521	{
1522	static const char *encodingNames[] =
1523	{
1524	KW_ISO_8859_1,
1525	KW_US_ASCII,
1526	KW_UTF_8,
1527	KW_UTF_16,
1528	KW_UTF_16BE,
1529	KW_UTF_16LE,
1530	};
1531	int i;
1532
1533	if (name == 0)
1534	return NO_ENC;
1535	for (i = 0;
1536	i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0]));
1537	i++)
1538	if (streqci(name, encodingNames[i]))
1539	return i;
1540	return UNKNOWN_ENC;
1541	}
1542
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 * INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
 * stores i, converted to char. The argument i is parenthesized so that
 * the cast applies to the whole expression passed in. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1548
1549	/* This is what detects the encoding.
1550	* encodingTable maps from encoding indices to encodings;
1551	* INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1552	* state is XML_CONTENT_STATE if we're parsing an external text entity,
1553	* and XML_PROLOG_STATE otherwise.
1554	*/
1555
1556
1557	static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1558	const INIT_ENCODING * enc,
1559	int state,
1560	const char *ptr,
1561	const char *end,
1562	const char **nextTokPtr)
1563	{
1564	const ENCODING **encPtr;
1565
1566	if (ptr == end)
1567	return XML_TOK_NONE;
1568	encPtr = enc->encPtr;
1569	if (ptr + 1 == end)
1570	{
1571	/* only a single byte available for auto-detection */
1572	#ifndef XML_DTD /* FIXME */
1573	/* a well-formed document entity must have more than one byte */
1574	if (state != XML_CONTENT_STATE)
1575	return XML_TOK_PARTIAL;
1576	#endif
1577	/* so we're parsing an external text entity... */
1578	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1579	switch (INIT_ENC_INDEX(enc))
1580	{
1581	case UTF_16_ENC:
1582	case UTF_16LE_ENC:
1583	case UTF_16BE_ENC:
1584	return XML_TOK_PARTIAL;
1585	}
1586	switch ((unsigned char)*ptr)
1587	{
1588	case 0xFE:
1589	case 0xFF:
1590	case 0xEF: /* possibly first byte of UTF-8 BOM */
1591	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1592	&& state == XML_CONTENT_STATE)
1593	break;
1594	/* fall through */
1595	case 0x00:
1596	case 0x3C:
1597	return XML_TOK_PARTIAL;
1598	}
1599	}
1600	else
1601	{
1602	switch (((unsigned char)ptr[0] << 8) \| (unsigned char)ptr[1])
1603	{
1604	case 0xFEFF:
1605	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1606	&& state == XML_CONTENT_STATE)
1607	break;
1608	*nextTokPtr = ptr + 2;
1609	*encPtr = encodingTable[UTF_16BE_ENC];
1610	return XML_TOK_BOM;
1611	/* 00 3C is handled in the default case */
1612	case 0x3C00:
1613	if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1614	\|\| INIT_ENC_INDEX(enc) == UTF_16_ENC)
1615	&& state == XML_CONTENT_STATE)
1616	break;
1617	*encPtr = encodingTable[UTF_16LE_ENC];
1618	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1619	case 0xFFFE:
1620	if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1621	&& state == XML_CONTENT_STATE)
1622	break;
1623	*nextTokPtr = ptr + 2;
1624	*encPtr = encodingTable[UTF_16LE_ENC];
1625	return XML_TOK_BOM;
1626	case 0xEFBB:
1627	/* Maybe a UTF-8 BOM (EF BB BF) */
1628	/* If there's an explicitly specified (external) encoding
1629	* of ISO-8859-1 or some flavour of UTF-16
1630	* and this is an external text entity,
1631	* don't look for the BOM,
1632	* because it might be a legal data. */
1633	if (state == XML_CONTENT_STATE)
1634	{
1635	int e = INIT_ENC_INDEX(enc);
1636
1637	if (e == ISO_8859_1_ENC \|\| e == UTF_16BE_ENC \|\| e == UTF_16LE_ENC \|\| e == UTF_16_ENC)
1638	break;
1639	}
1640	if (ptr + 2 == end)
1641	return XML_TOK_PARTIAL;
1642	if ((unsigned char)ptr[2] == 0xBF)
1643	{
1644	*nextTokPtr = ptr + 3;
1645	*encPtr = encodingTable[UTF_8_ENC];
1646	return XML_TOK_BOM;
1647	}
1648	break;
1649	default:
1650	if (ptr[0] == '\0')
1651	{
1652	/* 0 isn't a legal data character. Furthermore a document entity can only
1653	* start with ASCII characters. So the only way this can fail to be big-endian
1654	* UTF-16 if it it's an external parsed general entity that's labelled as
1655	* UTF-16LE. */
1656	if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1657	break;
1658	*encPtr = encodingTable[UTF_16BE_ENC];
1659	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1660	}
1661	else if (ptr[1] == '\0')
1662	{
1663	/* We could recover here in the case:
1664	* - parsing an external entity
1665	* - second byte is 0
1666	* - no externally specified encoding
1667	* - no encoding declaration
1668	* by assuming UTF-16LE. But we don't, because this would mean when
1669	* presented just with a single byte, we couldn't reliably determine
1670	* whether we needed further bytes. */
1671	if (state == XML_CONTENT_STATE)
1672	break;
1673	*encPtr = encodingTable[UTF_16LE_ENC];
1674	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1675	}
1676	break;
1677	}
1678	}
1679	*encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1680	return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1681	}
1682
1683
1684	#define NS(x) x
1685	#define ns(x) x
1686	#include "xmltok_ns.c"
1687	#undef NS
1688	#undef ns
1689
1690	#ifdef XML_NS
1691
1692	#define NS(x) x ## NS
1693	#define ns(x) x ## _ns
1694
1695	#include "xmltok_ns.c"
1696
1697	#undef NS
1698	#undef ns
1699
1700	ENCODING * XmlInitUnknownEncodingNS(void *mem,
1701	int *table,
1702	int (* EXPATENTRY convert) (void userData, const char p),
1703	void *userData)
1704	{
1705	ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1706
1707	if (enc)
1708	((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1709	return enc;
1710	}
1711
1712	#endif /* XML_NS */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/branch-1-0/src/helpers/xmltok.c@ 297

Download in other formats: