source: branches/branch-1-0/src/helpers/xmltok.c@ 297

Last change on this file since 297 was 147, checked in by umoeller, 23 years ago

Misc updates for Unicode.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 46.9 KB
Line 
1/*
2 * Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3 * See the file COPYING for copying permission.
4 */
5
6/* #ifdef COMPILED_FROM_DSP
7 * # include "winconfig.h"
8 * #else
9 * # include <config.h>
10 * #endif
11 */
12
13#include <memory.h>
14
15#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
16
17#pragma info(norea, nogen)
18 // disable "statement unreachable" and "missing break statement"
19 // this code generates those options HEAVILY
20
21#include "expat\xmltok.h"
22#include "expat\nametab.h"
23
24#ifdef XML_DTD
25#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
26#else
27#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
28#endif
29
30#define VTABLE1 \
31 { PREFIX(prologTok), PREFIX(contentTok), \
32 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
33 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
34 PREFIX(sameName), \
35 PREFIX(nameMatchesAscii), \
36 PREFIX(nameLength), \
37 PREFIX(skipS), \
38 PREFIX(getAtts), \
39 PREFIX(charRefNumber), \
40 PREFIX(predefinedEntityName), \
41 PREFIX(updatePosition), \
42 PREFIX(isPublicId)
43
44#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
45
/* Test one bit of namingBitmap for the character whose high byte is hi and
 * whose low byte is lo: pages[hi] selects a group of eight bitmap words,
 * (lo >> 5) picks the word inside the group and (lo & 0x1F) the bit.
 * Evaluates to non-zero when the bit is set.
 * The mask is built from an unsigned 1 so that selecting bit 31 never
 * shifts into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
48
/* A 2 byte UTF-8 representation splits the character's 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask.
 * The mask uses an unsigned 1 so that bit 31 can be selected without
 * shifting into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))
58
/* A 3 byte UTF-8 representation splits the character's 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask.
 * The mask uses an unsigned 1 so that bit 31 can be selected without
 * shifting into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
70
/* Naming-table lookup for an n-byte UTF-8 sequence starting at p.
 * Only 2- and 3-byte sequences are looked up; any other n yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : (n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   : 0)
77
/* Non-zero when the 3-byte sequence at p is one this tokenizer rejects:
 * a lead byte 0xED whose second byte has bit 0x20 set, or the exact
 * sequences EF BF BE / EF BF BF.
 * The argument is parenthesized before dereferencing so that expression
 * arguments such as (buf + 1) are handled correctly. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Non-zero when the 4-byte sequence at p has lead byte 0xF4 and a second
 * byte with any of the bits 0x30 set. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
86
87static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
88{
89 return 0;
90}
91
92static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
93{
94 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
95}
96
97static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
98{
99 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
100}
101
102#define utf8_isName4 isNever
103
104static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
105{
106 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
107}
108
109static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
110{
111 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
112}
113
114#define utf8_isNmstrt4 isNever
115
116#define utf8_isInvalid2 isNever
117
118static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
119{
120 return UTF8_INVALID3((const unsigned char *)p);
121}
122
123static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
124{
125 return UTF8_INVALID4((const unsigned char *)p);
126}
127
128struct normal_encoding
129{
130 ENCODING enc;
131 unsigned char type[256];
132#ifdef XML_MIN_SIZE
133 int (* EXPATENTRY byteType) (const ENCODING *, const char *);
134 int (* EXPATENTRY isNameMin) (const ENCODING *, const char *);
135 int (* EXPATENTRY isNmstrtMin) (const ENCODING *, const char *);
136 int (* EXPATENTRY byteToAscii) (const ENCODING *, const char *);
137 int (* EXPATENTRY charMatches) (const ENCODING *, const char *, int);
138#endif /* XML_MIN_SIZE */
139 int (* EXPATENTRY isName2) (const ENCODING *, const char *);
140 int (* EXPATENTRY isName3) (const ENCODING *, const char *);
141 int (* EXPATENTRY isName4) (const ENCODING *, const char *);
142 int (* EXPATENTRY isNmstrt2) (const ENCODING *, const char *);
143 int (* EXPATENTRY isNmstrt3) (const ENCODING *, const char *);
144 int (* EXPATENTRY isNmstrt4) (const ENCODING *, const char *);
145 int (* EXPATENTRY isInvalid2) (const ENCODING *, const char *);
146 int (* EXPATENTRY isInvalid3) (const ENCODING *, const char *);
147 int (* EXPATENTRY isInvalid4) (const ENCODING *, const char *);
148};
149
150#ifdef XML_MIN_SIZE
151
152#define STANDARD_VTABLE(E) \
153 E ## byteType, \
154 E ## isNameMin, \
155 E ## isNmstrtMin, \
156 E ## byteToAscii, \
157 E ## charMatches,
158
159#else
160
161#define STANDARD_VTABLE(E) /* as nothing */
162
163#endif
164
165#define NORMAL_VTABLE(E) \
166 E ## isName2, \
167 E ## isName3, \
168 E ## isName4, \
169 E ## isNmstrt2, \
170 E ## isNmstrt3, \
171 E ## isNmstrt4, \
172 E ## isInvalid2, \
173 E ## isInvalid3, \
174 E ## isInvalid4
175
176static int checkCharRefNumber(int);
177
178#include "expat\xmltok_impl.h"
179#include "expat\ascii.h"
180
181#ifdef XML_MIN_SIZE
182#define sb_isNameMin isNever
183#define sb_isNmstrtMin isNever
184#endif
185
186#ifdef XML_MIN_SIZE
187#define MINBPC(enc) ((enc)->minBytesPerChar)
188#else
189/* minimum bytes per character */
190#define MINBPC(enc) 1
191#endif
192
/* Byte type of the single byte at p, read from the type table of the
 * struct normal_encoding that enc points to.  The table is only read,
 * so the cast keeps the const qualifier. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
195
196#ifdef XML_MIN_SIZE
197static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
198{
199 return SB_BYTE_TYPE(enc, p);
200}
201#define BYTE_TYPE(enc, p) \
202 (((const struct normal_encoding *)(enc))->byteType(enc, p))
203#else
204#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
205#endif
206
207#ifdef XML_MIN_SIZE
208#define BYTE_TO_ASCII(enc, p) \
209 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
210static
211int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
212{
213 return *p;
214}
215#else
216#define BYTE_TO_ASCII(enc, p) (*(p))
217#endif
218
219#define IS_NAME_CHAR(enc, p, n) \
220 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
221#define IS_NMSTRT_CHAR(enc, p, n) \
222 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
223#define IS_INVALID_CHAR(enc, p, n) \
224 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
225
226#ifdef XML_MIN_SIZE
227#define IS_NAME_CHAR_MINBPC(enc, p) \
228 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
229#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
230 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
231#else
232#define IS_NAME_CHAR_MINBPC(enc, p) (0)
233#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
234#endif
235
236#ifdef XML_MIN_SIZE
237#define CHAR_MATCHES(enc, p, c) \
238 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
239static
240int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
241{
242 return *p == c;
243}
244#else
/* c is an ASCII character; it is parenthesized so that an expression
 * argument cannot be split by the == comparison. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
247#endif
248
249#define PREFIX(ident) normal_ ## ident
250#include "xmltok_impl.c"
251
252#undef MINBPC
253#undef BYTE_TYPE
254#undef BYTE_TO_ASCII
255#undef CHAR_MATCHES
256#undef IS_NAME_CHAR
257#undef IS_NAME_CHAR_MINBPC
258#undef IS_NMSTRT_CHAR
259#undef IS_NMSTRT_CHAR_MINBPC
260#undef IS_INVALID_CHAR
261
/* UTF8_cvalN is the value of the masked first byte of an N byte sequence */
enum
{
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xC0,
    UTF8_cval3 = 0xE0,
    UTF8_cval4 = 0xF0
};
269
270static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
271 const char **fromP,
272 const char *fromLim,
273 char **toP,
274 const char *toLim)
275{
276 char *to;
277 const char *from;
278
279 if (fromLim - *fromP > toLim - *toP)
280 {
281 /* Avoid copying partial characters. */
282 for (fromLim = *fromP + (toLim - *toP);
283 fromLim > *fromP;
284 fromLim--)
285 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
286 break;
287 }
288 for (to = *toP, from = *fromP;
289 from != fromLim;
290 from++, to++)
291 *to = *from;
292 *fromP = from;
293 *toP = to;
294}
295
296static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
297 const char **fromP,
298 const char *fromLim,
299 unsigned short **toP,
300 const unsigned short *toLim)
301{
302 unsigned short *to = *toP;
303 const char *from = *fromP;
304
305 while (from != fromLim && to != toLim)
306 {
307 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from])
308 {
309 case BT_LEAD2:
310 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
311 from += 2;
312 break;
313 case BT_LEAD3:
314 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
315 from += 3;
316 break;
317 case BT_LEAD4:
318 {
319 unsigned long n;
320
321 if (to + 1 == toLim)
322 break;
323 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
324 n -= 0x10000;
325 to[0] = (unsigned short)((n >> 10) | 0xD800);
326 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
327 to += 2;
328 from += 4;
329 }
330 break;
331 default:
332 *to++ = *from++;
333 break;
334 }
335 }
336 *fromP = from;
337 *toP = to;
338}
339
340#ifdef XML_NS
341static const struct normal_encoding utf8_encoding_ns =
342{
343 {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
344 {
345#include "expat\asciitab.h"
346#include "expat\utf8tab.h"
347 },
348 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
349};
350
351#endif
352
353static const struct normal_encoding utf8_encoding =
354{
355 {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
356 {
357#define BT_COLON BT_NMSTRT
358#include "expat\asciitab.h"
359#undef BT_COLON
360#include "expat\utf8tab.h"
361 },
362 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
363};
364
365#ifdef XML_NS
366
367static const struct normal_encoding internal_utf8_encoding_ns =
368{
369 {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
370 {
371#include "expat\iasciitab.h"
372#include "expat\utf8tab.h"
373 },
374 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
375};
376
377#endif
378
379static const struct normal_encoding internal_utf8_encoding =
380{
381 {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
382 {
383#define BT_COLON BT_NMSTRT
384#include "expat\iasciitab.h"
385#undef BT_COLON
386#include "expat\utf8tab.h"
387 },
388 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
389};
390
391static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
392 const char **fromP,
393 const char *fromLim,
394 char **toP,
395 const char *toLim)
396{
397 for (;;)
398 {
399 unsigned char c;
400
401 if (*fromP == fromLim)
402 break;
403 c = (unsigned char)**fromP;
404 if (c & 0x80)
405 {
406 if (toLim - *toP < 2)
407 break;
408 *(*toP)++ = ((c >> 6) | UTF8_cval2);
409 *(*toP)++ = ((c & 0x3f) | 0x80);
410 (*fromP)++;
411 }
412 else
413 {
414 if (*toP == toLim)
415 break;
416 *(*toP)++ = *(*fromP)++;
417 }
418 }
419}
420
421static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
422 const char **fromP,
423 const char *fromLim,
424 unsigned short **toP,
425 const unsigned short *toLim)
426{
427 while (*fromP != fromLim && *toP != toLim)
428 *(*toP)++ = (unsigned char)*(*fromP)++;
429}
430
431#ifdef XML_NS
432
433static const struct normal_encoding latin1_encoding_ns =
434{
435 {VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
436 {
437#include "expat\asciitab.h"
438#include "expat\latin1tab.h"
439 },
440 STANDARD_VTABLE(sb_)
441};
442
443#endif
444
445static const struct normal_encoding latin1_encoding =
446{
447 {VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
448 {
449#define BT_COLON BT_NMSTRT
450#include "expat\asciitab.h"
451#undef BT_COLON
452#include "expat\latin1tab.h"
453 },
454 STANDARD_VTABLE(sb_)
455};
456
457static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
458 const char **fromP,
459 const char *fromLim,
460 char **toP,
461 const char *toLim)
462{
463 while (*fromP != fromLim && *toP != toLim)
464 *(*toP)++ = *(*fromP)++;
465}
466
467#ifdef XML_NS
468
469static const struct normal_encoding ascii_encoding_ns =
470{
471 {VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
472 {
473#include "expat\asciitab.h"
474/* BT_NONXML == 0 */
475 },
476 STANDARD_VTABLE(sb_)
477};
478
479#endif
480
481static const struct normal_encoding ascii_encoding =
482{
483 {VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
484 {
485#define BT_COLON BT_NMSTRT
486#include "expat\asciitab.h"
487#undef BT_COLON
488/* BT_NONXML == 0 */
489 },
490 STANDARD_VTABLE(sb_)
491};
492
493static int unicode_byte_type(char hi, char lo)
494{
495 switch ((unsigned char)hi)
496 {
497 case 0xD8:
498 case 0xD9:
499 case 0xDA:
500 case 0xDB:
501 return BT_LEAD4;
502 case 0xDC:
503 case 0xDD:
504 case 0xDE:
505 case 0xDF:
506 return BT_TRAIL;
507 case 0xFF:
508 switch ((unsigned char)lo)
509 {
510 case 0xFF:
511 case 0xFE:
512 return BT_NONXML;
513 }
514 break;
515 }
516 return BT_NONASCII;
517}
518
/*
 * DEFINE_UTF16_TO_UTF8(E):
 *      expands to the definition of E##toUtf8, which converts
 *      2-byte units (read through GET_LO / GET_HI) to UTF-8.
 *      The function stops and stores the current input position
 *      in *fromP when the next character does not fit into the
 *      output.
 *
 *      Fixes: the input limit is first rounded down to a whole
 *      number of 2-byte units (an odd length made the loop step
 *      over fromLim), and a lead surrogate is only converted when
 *      its second unit is inside the input (the second unit used
 *      to be read without any check).
 */
#define DEFINE_UTF16_TO_UTF8(E) \
static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
                                   const char **fromP, const char *fromLim, \
                                   char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* shrink to a whole number of 2-byte units */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* the second half of the pair must be part of the input */ \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
580
/*
 * DEFINE_UTF16_TO_UTF16(E):
 *      expands to the definition of E##toUtf16, which copies
 *      2-byte units (read through GET_LO / GET_HI) into unsigned
 *      shorts until the input is used up or the output is full.
 *
 *      Fix: the input limit is first rounded down to a whole
 *      number of 2-byte units; with an odd length the loop used
 *      to step over fromLim and never terminate on the input
 *      condition.
 */
#define DEFINE_UTF16_TO_UTF16(E) \
static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
                                    const char **fromP, const char *fromLim, \
                                    unsigned short **toP, const unsigned short *toLim) \
{ \
  /* shrink to a whole number of 2-byte units */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
593
594#define SET2(ptr, ch) \
595 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
596#define GET_LO(ptr) ((unsigned char)(ptr)[0])
597#define GET_HI(ptr) ((unsigned char)(ptr)[1])
598
599DEFINE_UTF16_TO_UTF8(little2_)
600DEFINE_UTF16_TO_UTF16(little2_)
601
602#undef SET2
603#undef GET_LO
604#undef GET_HI
605
606#define SET2(ptr, ch) \
607 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
608#define GET_LO(ptr) ((unsigned char)(ptr)[1])
609#define GET_HI(ptr) ((unsigned char)(ptr)[0])
610
611DEFINE_UTF16_TO_UTF8(big2_)
612DEFINE_UTF16_TO_UTF16(big2_)
613
614#undef SET2
615#undef GET_LO
616#undef GET_HI
617
/* Accessors for little-endian 2-byte characters: (p)[0] is the low byte,
 * (p)[1] the high byte.  Characters with a zero high byte use the byte
 * type table; all others are classified by unicode_byte_type().
 * Every macro argument is parenthesized, and the type table is read
 * through a const-qualified pointer. */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
   : unicode_byte_type((p)[1], (p)[0]))
/* the low byte if the high byte is zero, else -1 */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
628
629#ifdef XML_MIN_SIZE
630static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
631{
632 return LITTLE2_BYTE_TYPE(enc, p);
633}
634
635static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
636{
637 return LITTLE2_BYTE_TO_ASCII(enc, p);
638}
639
640static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
641{
642 return LITTLE2_CHAR_MATCHES(enc, p, c);
643}
644
645static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
646{
647 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
648}
649
650static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
651{
652 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
653}
654
655#undef VTABLE
656#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
657
658#else /* not XML_MIN_SIZE */
659
660#undef PREFIX
661#define PREFIX(ident) little2_ ## ident
662#define MINBPC(enc) 2
663/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
664#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
665#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
666#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
667#define IS_NAME_CHAR(enc, p, n) 0
668#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
669#define IS_NMSTRT_CHAR(enc, p, n) (0)
670#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
671
672#include "xmltok_impl.c"
673
674#undef MINBPC
675#undef BYTE_TYPE
676#undef BYTE_TO_ASCII
677#undef CHAR_MATCHES
678#undef IS_NAME_CHAR
679#undef IS_NAME_CHAR_MINBPC
680#undef IS_NMSTRT_CHAR
681#undef IS_NMSTRT_CHAR_MINBPC
682#undef IS_INVALID_CHAR
683
684#endif /* not XML_MIN_SIZE */
685
686#ifdef XML_NS
687
688 static const struct normal_encoding little2_encoding_ns =
689 {
690 {VTABLE, 2, 0,
691#if XML_BYTE_ORDER == 12
692 1
693#else
694 0
695#endif
696 },
697 {
698#include "expat\asciitab.h"
699#include "expat\latin1tab.h"
700 },
701 STANDARD_VTABLE(little2_)
702};
703
704#endif
705
706 static const struct normal_encoding little2_encoding =
707 {
708 {VTABLE, 2, 0,
709#if XML_BYTE_ORDER == 12
710 1
711#else
712 0
713#endif
714 },
715 {
716#define BT_COLON BT_NMSTRT
717#include "expat\asciitab.h"
718#undef BT_COLON
719#include "expat\latin1tab.h"
720 },
721 STANDARD_VTABLE(little2_)
722};
723
724#if XML_BYTE_ORDER != 21
725
726#ifdef XML_NS
727
728 static const struct normal_encoding internal_little2_encoding_ns =
729 {
730 {VTABLE, 2, 0, 1},
731 {
732#include "expat\iasciitab.h"
733#include "expat\latin1tab.h"
734 },
735 STANDARD_VTABLE(little2_)
736};
737
738#endif
739
740 static const struct normal_encoding internal_little2_encoding =
741 {
742 {VTABLE, 2, 0, 1},
743 {
744#define BT_COLON BT_NMSTRT
745#include "expat\iasciitab.h"
746#undef BT_COLON
747#include "expat\latin1tab.h"
748 },
749 STANDARD_VTABLE(little2_)
750};
751
752#endif
753
754
/* Accessors for big-endian 2-byte characters: (p)[0] is the high byte,
 * (p)[1] the low byte.  Characters with a zero high byte use the byte
 * type table; all others are classified by unicode_byte_type().
 * Every macro argument is parenthesized, and the type table is read
 * through a const-qualified pointer. */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
   ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
   : unicode_byte_type((p)[0], (p)[1]))
/* the low byte if the high byte is zero, else -1 */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
765
766#ifdef XML_MIN_SIZE
767
768static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
769{
770 return BIG2_BYTE_TYPE(enc, p);
771}
772
773static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
774{
775 return BIG2_BYTE_TO_ASCII(enc, p);
776}
777
778static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
779{
780 return BIG2_CHAR_MATCHES(enc, p, c);
781}
782
783static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
784{
785 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
786}
787
788static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
789{
790 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
791}
792
793#undef VTABLE
794#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
795
796#else /* not XML_MIN_SIZE */
797
798#undef PREFIX
799#define PREFIX(ident) big2_ ## ident
800#define MINBPC(enc) 2
801/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
802#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
803#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
804#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
805#define IS_NAME_CHAR(enc, p, n) 0
806#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
807#define IS_NMSTRT_CHAR(enc, p, n) (0)
808#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
809
810#include "xmltok_impl.c"
811
812#undef MINBPC
813#undef BYTE_TYPE
814#undef BYTE_TO_ASCII
815#undef CHAR_MATCHES
816#undef IS_NAME_CHAR
817#undef IS_NAME_CHAR_MINBPC
818#undef IS_NMSTRT_CHAR
819#undef IS_NMSTRT_CHAR_MINBPC
820#undef IS_INVALID_CHAR
821
822#endif /* not XML_MIN_SIZE */
823
824#ifdef XML_NS
825
826 static const struct normal_encoding big2_encoding_ns =
827 {
828 {VTABLE, 2, 0,
829#if XML_BYTE_ORDER == 21
830 1
831#else
832 0
833#endif
834 },
835 {
836#include "expat\asciitab.h"
837#include "expat\latin1tab.h"
838 },
839 STANDARD_VTABLE(big2_)
840};
841
842#endif
843
844 static const struct normal_encoding big2_encoding =
845 {
846 {VTABLE, 2, 0,
847#if XML_BYTE_ORDER == 21
848 1
849#else
850 0
851#endif
852 },
853 {
854#define BT_COLON BT_NMSTRT
855#include "expat\asciitab.h"
856#undef BT_COLON
857#include "expat\latin1tab.h"
858 },
859 STANDARD_VTABLE(big2_)
860};
861
862#if XML_BYTE_ORDER != 12
863
864#ifdef XML_NS
865
866 static const struct normal_encoding internal_big2_encoding_ns =
867 {
868 {VTABLE, 2, 0, 1},
869 {
870#include "expat\iasciitab.h"
871#include "expat\latin1tab.h"
872 },
873 STANDARD_VTABLE(big2_)
874};
875
876#endif
877
878 static const struct normal_encoding internal_big2_encoding =
879 {
880 {VTABLE, 2, 0, 1},
881 {
882#define BT_COLON BT_NMSTRT
883#include "expat\iasciitab.h"
884#undef BT_COLON
885#include "expat\latin1tab.h"
886 },
887 STANDARD_VTABLE(big2_)
888};
889
890#endif
891
892#undef PREFIX
893
894static int streqci(const char *s1, const char *s2)
895{
896 for (;;)
897 {
898 char c1 = *s1++;
899 char c2 = *s2++;
900
901 if (ASCII_a <= c1 && c1 <= ASCII_z)
902 c1 += ASCII_A - ASCII_a;
903 if (ASCII_a <= c2 && c2 <= ASCII_z)
904 c2 += ASCII_A - ASCII_a;
905 if (c1 != c2)
906 return 0;
907 if (!c1)
908 break;
909 }
910 return 1;
911}
912
913static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
914 const char *ptr,
915 const char *end,
916 POSITION * pos)
917{
918 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
919}
920
921static int EXPATENTRY toAscii(const ENCODING * enc,
922 const char *ptr,
923 const char *end)
924{
925 char buf[1];
926 char *p = buf;
927
928 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
929 if (p == buf)
930 return -1;
931 else
932 return buf[0];
933}
934
935static int EXPATENTRY isSpace(int c)
936{
937 switch (c)
938 {
939 case 0x20:
940 case 0xD:
941 case 0xA:
942 case 0x9:
943 return 1;
944 }
945 return 0;
946}
947
948/* Return 1 if there's just optional white space
949 * or there's an S followed by name=val. */
950static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
951 const char *ptr,
952 const char *end,
953 const char **namePtr,
954 const char **nameEndPtr,
955 const char **valPtr,
956 const char **nextTokPtr)
957{
958 int c;
959 char open;
960
961 if (ptr == end)
962 {
963 *namePtr = 0;
964 return 1;
965 }
966 if (!isSpace(toAscii(enc, ptr, end)))
967 {
968 *nextTokPtr = ptr;
969 return 0;
970 }
971 do
972 {
973 ptr += enc->minBytesPerChar;
974 }
975 while (isSpace(toAscii(enc, ptr, end)));
976 if (ptr == end)
977 {
978 *namePtr = 0;
979 return 1;
980 }
981 *namePtr = ptr;
982 for (;;)
983 {
984 c = toAscii(enc, ptr, end);
985 if (c == -1)
986 {
987 *nextTokPtr = ptr;
988 return 0;
989 }
990 if (c == ASCII_EQUALS)
991 {
992 *nameEndPtr = ptr;
993 break;
994 }
995 if (isSpace(c))
996 {
997 *nameEndPtr = ptr;
998 do
999 {
1000 ptr += enc->minBytesPerChar;
1001 }
1002 while (isSpace(c = toAscii(enc, ptr, end)));
1003 if (c != ASCII_EQUALS)
1004 {
1005 *nextTokPtr = ptr;
1006 return 0;
1007 }
1008 break;
1009 }
1010 ptr += enc->minBytesPerChar;
1011 }
1012 if (ptr == *namePtr)
1013 {
1014 *nextTokPtr = ptr;
1015 return 0;
1016 }
1017 ptr += enc->minBytesPerChar;
1018 c = toAscii(enc, ptr, end);
1019 while (isSpace(c))
1020 {
1021 ptr += enc->minBytesPerChar;
1022 c = toAscii(enc, ptr, end);
1023 }
1024 if (c != ASCII_QUOT && c != ASCII_APOS)
1025 {
1026 *nextTokPtr = ptr;
1027 return 0;
1028 }
1029 open = c;
1030 ptr += enc->minBytesPerChar;
1031 *valPtr = ptr;
1032 for (;; ptr += enc->minBytesPerChar)
1033 {
1034 c = toAscii(enc, ptr, end);
1035 if (c == open)
1036 break;
1037 if (!(ASCII_a <= c && c <= ASCII_z)
1038 && !(ASCII_A <= c && c <= ASCII_Z)
1039 && !(ASCII_0 <= c && c <= ASCII_9)
1040 && c != ASCII_PERIOD
1041 && c != ASCII_MINUS
1042 && c != ASCII_UNDERSCORE)
1043 {
1044 *nextTokPtr = ptr;
1045 return 0;
1046 }
1047 }
1048 *nextTokPtr = ptr + enc->minBytesPerChar;
1049 return 1;
1050}
1051
1052static const char KW_version[] =
1053{
1054 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1055};
1056
1057static const char KW_encoding[] =
1058{
1059 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1060};
1061
1062static const char KW_standalone[] =
1063{
1064 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1065};
1066
1067static const char KW_yes[] =
1068{
1069 ASCII_y, ASCII_e, ASCII_s, '\0'
1070};
1071
1072static const char KW_no[] =
1073{
1074 ASCII_n, ASCII_o, '\0'
1075};
1076
1077static int doParseXmlDecl(const ENCODING * (*encodingFinder) (const ENCODING *,
1078 const char *,
1079 const char *),
1080 int isGeneralTextEntity,
1081 const ENCODING * enc,
1082 const char *ptr,
1083 const char *end,
1084 const char **badPtr,
1085 const char **versionPtr,
1086 const char **versionEndPtr,
1087 const char **encodingName,
1088 const ENCODING ** encoding,
1089 int *standalone)
1090{
1091 const char *val = 0;
1092 const char *name = 0;
1093 const char *nameEnd = 0;
1094
1095 ptr += 5 * enc->minBytesPerChar;
1096 end -= 2 * enc->minBytesPerChar;
1097 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name)
1098 {
1099 *badPtr = ptr;
1100 return 0;
1101 }
1102 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1103 {
1104 if (!isGeneralTextEntity)
1105 {
1106 *badPtr = name;
1107 return 0;
1108 }
1109 }
1110 else
1111 {
1112 if (versionPtr)
1113 *versionPtr = val;
1114 if (versionEndPtr)
1115 *versionEndPtr = ptr;
1116 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1117 {
1118 *badPtr = ptr;
1119 return 0;
1120 }
1121 if (!name)
1122 {
1123 if (isGeneralTextEntity)
1124 {
1125 /* a TextDecl must have an EncodingDecl */
1126 *badPtr = ptr;
1127 return 0;
1128 }
1129 return 1;
1130 }
1131 }
1132 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1133 {
1134 int c = toAscii(enc, val, end);
1135
1136 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1137 {
1138 *badPtr = val;
1139 return 0;
1140 }
1141 if (encodingName)
1142 *encodingName = val;
1143 if (encoding)
1144 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1145 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1146 {
1147 *badPtr = ptr;
1148 return 0;
1149 }
1150 if (!name)
1151 return 1;
1152 }
1153 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) || isGeneralTextEntity)
1154 {
1155 *badPtr = name;
1156 return 0;
1157 }
1158 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1159 {
1160 if (standalone)
1161 *standalone = 1;
1162 }
1163 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1164 {
1165 if (standalone)
1166 *standalone = 0;
1167 }
1168 else
1169 {
1170 *badPtr = val;
1171 return 0;
1172 }
1173 while (isSpace(toAscii(enc, ptr, end)))
1174 ptr += enc->minBytesPerChar;
1175 if (ptr != end)
1176 {
1177 *badPtr = ptr;
1178 return 0;
1179 }
1180 return 1;
1181}
1182
1183static int checkCharRefNumber(int result)
1184{
1185 switch (result >> 8)
1186 {
1187 case 0xD8:
1188 case 0xD9:
1189 case 0xDA:
1190 case 0xDB:
1191 case 0xDC:
1192 case 0xDD:
1193 case 0xDE:
1194 case 0xDF:
1195 return -1;
1196 case 0:
1197 if (latin1_encoding.type[result] == BT_NONXML)
1198 return -1;
1199 break;
1200 case 0xFF:
1201 if (result == 0xFFFE || result == 0xFFFF)
1202 return -1;
1203 break;
1204 }
1205 return result;
1206}
1207
1208int XmlUtf8Encode(int c, char *buf)
1209{
1210 enum
1211 {
1212 /* minN is minimum legal resulting value for N byte sequence */
1213 min2 = 0x80,
1214 min3 = 0x800,
1215 min4 = 0x10000
1216 };
1217
1218 if (c < 0)
1219 return 0;
1220 if (c < min2)
1221 {
1222 buf[0] = (c | UTF8_cval1);
1223 return 1;
1224 }
1225 if (c < min3)
1226 {
1227 buf[0] = ((c >> 6) | UTF8_cval2);
1228 buf[1] = ((c & 0x3f) | 0x80);
1229 return 2;
1230 }
1231 if (c < min4)
1232 {
1233 buf[0] = ((c >> 12) | UTF8_cval3);
1234 buf[1] = (((c >> 6) & 0x3f) | 0x80);
1235 buf[2] = ((c & 0x3f) | 0x80);
1236 return 3;
1237 }
1238 if (c < 0x110000)
1239 {
1240 buf[0] = ((c >> 18) | UTF8_cval4);
1241 buf[1] = (((c >> 12) & 0x3f) | 0x80);
1242 buf[2] = (((c >> 6) & 0x3f) | 0x80);
1243 buf[3] = ((c & 0x3f) | 0x80);
1244 return 4;
1245 }
1246 return 0;
1247}
1248
/*
 * XmlUtf16Encode:
 *      writes the UTF-16 encoding of charNum to buf and returns
 *      the number of 16-bit units written: 1 for values below
 *      0x10000, 2 (a surrogate pair) for values below 0x110000.
 *      Returns 0 without writing anything for all other values.
 */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum < 0x10000)
    {
        buf[0] = (unsigned short)charNum;
        return 1;
    }

    /* split the 20 bits above 0x10000 into two halves of 10 bits */
    offset = charNum - 0x10000;
    buf[0] = (unsigned short)(0xD800 + (offset >> 10));
    buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
    return 2;
}
1267
1268struct unknown_encoding
1269{
1270 struct normal_encoding normal;
1271 int (*convert) (void *userData, const char *p);
1272 void *userData;
1273 unsigned short utf16[256];
1274 char utf8[256][4];
1275};
1276
1277int XmlSizeOfUnknownEncoding(void)
1278{
1279 return sizeof(struct unknown_encoding);
1280}
1281
1282static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1283{
1284 int c = ((const struct unknown_encoding *)enc)
1285 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1286
1287 if (c & ~0xFFFF)
1288 return 0;
1289 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1290}
1291
1292static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1293{
1294 int c = ((const struct unknown_encoding *)enc)
1295 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1296
1297 if (c & ~0xFFFF)
1298 return 0;
1299 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1300}
1301
1302static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1303{
1304 int c = ((const struct unknown_encoding *)enc)
1305 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1306
1307 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1308}
1309
1310static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1311 const char **fromP,
1312 const char *fromLim,
1313 char **toP,
1314 const char *toLim)
1315{
1316 char buf[XML_UTF8_ENCODE_MAX];
1317
1318 for (;;)
1319 {
1320 const char *utf8;
1321 int n;
1322
1323 if (*fromP == fromLim)
1324 break;
1325 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1326 n = *utf8++;
1327 if (n == 0)
1328 {
1329 int c = ((const struct unknown_encoding *)enc)
1330 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1331
1332 n = XmlUtf8Encode(c, buf);
1333 if (n > toLim - *toP)
1334 break;
1335 utf8 = buf;
1336 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1337 - (BT_LEAD2 - 2);
1338 }
1339 else
1340 {
1341 if (n > toLim - *toP)
1342 break;
1343 (*fromP)++;
1344 }
1345 do
1346 {
1347 *(*toP)++ = *utf8++;
1348 }
1349 while (--n != 0);
1350 }
1351}
1352
1353static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1354 const char **fromP,
1355 const char *fromLim,
1356 unsigned short **toP,
1357 const unsigned short *toLim)
1358{
1359 while (*fromP != fromLim && *toP != toLim)
1360 {
1361 unsigned short c
1362 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1363
1364 if (c == 0)
1365 {
1366 c = (unsigned short)((const struct unknown_encoding *)enc)
1367 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1368 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1369 - (BT_LEAD2 - 2);
1370 }
1371 else
1372 (*fromP)++;
1373 *(*toP)++ = c;
1374 }
1375}
1376
1377/*
1378 *@@ XmlInitUnknownEncoding:
1379 *
1380 *@@changed V0.9.14 (2001-08-09) [umoeller]: couple of performance hacks
1381 */
1382
1383ENCODING* XmlInitUnknownEncoding(void *mem,
1384 int *table,
1385 int (*convert) (void *userData, const char *p),
1386 void *userData)
1387{
1388 int i;
1389 struct unknown_encoding *e = (struct unknown_encoding*)mem;
1390
1391 // gee, isn't this a regular memcpy?!?
1392 /* for (i = 0;
1393 i < (int)sizeof(struct normal_encoding);
1394 i++)
1395 ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; */
1396
1397 // replaced the above with this V0.9.14 (2001-08-09) [umoeller]
1398 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1399
1400 for (i = 0; i < 128; i++)
1401 if ( latin1_encoding.type[i] != BT_OTHER
1402 && latin1_encoding.type[i] != BT_NONXML
1403 && table[i] != i
1404 )
1405 return 0;
1406
1407 for (i = 0; i < 256; i++)
1408 {
1409 int c = table[i];
1410
1411 if (c == -1)
1412 {
1413 e->normal.type[i] = BT_MALFORM;
1414 /* This shouldn't really get used. */
1415 e->utf16[i] = 0xFFFF;
1416 e->utf8[i][0] = 1;
1417 e->utf8[i][1] = 0;
1418 }
1419 else if (c < 0)
1420 {
1421 if (c < -4)
1422 return 0;
1423 e->normal.type[i] = BT_LEAD2 - (c + 2);
1424 e->utf8[i][0] = 0;
1425 e->utf16[i] = 0;
1426 }
1427 else if (c < 0x80)
1428 {
1429 if ( latin1_encoding.type[c] != BT_OTHER
1430 && latin1_encoding.type[c] != BT_NONXML
1431 && c != i
1432 )
1433 return 0;
1434 e->normal.type[i] = latin1_encoding.type[c];
1435 e->utf8[i][0] = 1;
1436 e->utf8[i][1] = (char)c;
1437 e->utf16[i] = c == 0 ? 0xFFFF : c;
1438 }
1439 else if (checkCharRefNumber(c) < 0)
1440 {
1441 e->normal.type[i] = BT_NONXML;
1442 /* This shouldn't really get used. */
1443 e->utf16[i] = 0xFFFF;
1444 e->utf8[i][0] = 1;
1445 e->utf8[i][1] = 0;
1446 }
1447 else
1448 {
1449 if (c > 0xFFFF)
1450 return 0;
1451 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1452 e->normal.type[i] = BT_NMSTRT;
1453 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1454 e->normal.type[i] = BT_NAME;
1455 else
1456 e->normal.type[i] = BT_OTHER;
1457 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1458 e->utf16[i] = c;
1459 }
1460 }
1461 e->userData = userData;
1462 e->convert = convert;
1463 if (convert)
1464 {
1465 e->normal.isName2 = unknown_isName;
1466 e->normal.isName3 = unknown_isName;
1467 e->normal.isName4 = unknown_isName;
1468 e->normal.isNmstrt2 = unknown_isNmstrt;
1469 e->normal.isNmstrt3 = unknown_isNmstrt;
1470 e->normal.isNmstrt4 = unknown_isNmstrt;
1471 e->normal.isInvalid2 = unknown_isInvalid;
1472 e->normal.isInvalid3 = unknown_isInvalid;
1473 e->normal.isInvalid4 = unknown_isInvalid;
1474 }
1475 e->normal.enc.utf8Convert = unknown_toUtf8;
1476 e->normal.enc.utf16Convert = unknown_toUtf16;
1477 return &(e->normal.enc);
1478}
1479
/* Indices of the built-in encodings.
 * If this enumeration is changed, getEncodingIndex and encodings
 * must also be changed. */
enum
{
    UNKNOWN_ENC    = -1,
    /* the values 0..5 must match the order of encodingNames */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC   = 1,
    UTF_8_ENC      = 2,
    UTF_16_ENC     = 3,
    UTF_16BE_ENC   = 4,
    UTF_16LE_ENC   = 5,
    /* no name was given at all */
    NO_ENC         = 6
};
1494
1495static const char KW_ISO_8859_1[] =
1496{
1497 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1498};
1499static const char KW_US_ASCII[] =
1500{
1501 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1502};
1503static const char KW_UTF_8[] =
1504{
1505 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1506};
1507static const char KW_UTF_16[] =
1508{
1509 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1510};
1511static const char KW_UTF_16BE[] =
1512{
1513 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1514};
1515static const char KW_UTF_16LE[] =
1516{
1517 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1518};
1519
1520static int getEncodingIndex(const char *name)
1521{
1522 static const char *encodingNames[] =
1523 {
1524 KW_ISO_8859_1,
1525 KW_US_ASCII,
1526 KW_UTF_8,
1527 KW_UTF_16,
1528 KW_UTF_16BE,
1529 KW_UTF_16LE,
1530 };
1531 int i;
1532
1533 if (name == 0)
1534 return NO_ENC;
1535 for (i = 0;
1536 i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0]));
1537 i++)
1538 if (streqci(name, encodingNames[i]))
1539 return i;
1540 return UNKNOWN_ENC;
1541}
1542
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 *
 * INIT_ENC_INDEX(enc) reads that index back as an int;
 * SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.
 * The argument i is parenthesized so that the cast applies to the
 * whole argument expression and not just to its first operand. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1548
1549/* This is what detects the encoding.
1550 * encodingTable maps from encoding indices to encodings;
1551 * INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1552 * state is XML_CONTENT_STATE if we're parsing an external text entity,
1553 * and XML_PROLOG_STATE otherwise.
1554 */
1555
1556
/*
 * initScan:
 *      looks at the first bytes in [ptr, end) to choose the
 *      encoding, stores the chosen encoding through enc->encPtr and
 *      then either reports a byte order mark or hands the input to
 *      XmlTok with the chosen encoding.
 *
 *      Returns
 *
 *      --  XML_TOK_NONE if the range is empty;
 *
 *      --  XML_TOK_PARTIAL if more bytes are needed to decide;
 *
 *      --  XML_TOK_BOM if a byte order mark was recognized, with
 *          *nextTokPtr set just past it;
 *
 *      --  otherwise whatever XmlTok returns.
 *
 *      Wherever a case below leaves its switch with "break", the
 *      bytes are treated as ordinary data and the externally
 *      specified encoding, encodingTable[INIT_ENC_INDEX(enc)], is
 *      used at the bottom of the function.
 */
1557static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1558 const INIT_ENCODING * enc,
1559 int state,
1560 const char *ptr,
1561 const char *end,
1562 const char **nextTokPtr)
1563{
1564 const ENCODING **encPtr;
1565
1566 if (ptr == end)
1567 return XML_TOK_NONE;
1568 encPtr = enc->encPtr;
1569 if (ptr + 1 == end)
1570 {
1571 /* only a single byte available for auto-detection */
1572#ifndef XML_DTD /* FIXME */
1573 /* a well-formed document entity must have more than one byte */
1574 if (state != XML_CONTENT_STATE)
1575 return XML_TOK_PARTIAL;
1576#endif
1577 /* so we're parsing an external text entity... */
1578 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1579 switch (INIT_ENC_INDEX(enc))
1580 {
1581 case UTF_16_ENC:
1582 case UTF_16LE_ENC:
1583 case UTF_16BE_ENC:
1584 return XML_TOK_PARTIAL;
1585 }
1586 switch ((unsigned char)*ptr)
1587 {
 /* 0xFE and 0xFF are the possible first bytes of the two-byte
    byte order marks tested for in the else branch below */
1588 case 0xFE:
1589 case 0xFF:
1590 case 0xEF: /* possibly first byte of UTF-8 BOM */
 /* with ISO-8859-1 specified externally for an external text
    entity, these bytes are taken as data */
1591 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1592 && state == XML_CONTENT_STATE)
1593 break;
1594 /* fall through */
 /* a lone 0x00 or 0x3C can be one half of a 16-bit unit (see
    the 0x3C00 case and the default case below), so wait for
    the second byte */
1595 case 0x00:
1596 case 0x3C:
1597 return XML_TOK_PARTIAL;
1598 }
1599 }
1600 else
1601 {
 /* at least two bytes: examine them as one big-endian 16-bit value */
1602 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1])
1603 {
 /* FE FF: byte order mark, selects UTF-16BE */
1604 case 0xFEFF:
1605 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1606 && state == XML_CONTENT_STATE)
1607 break;
1608 *nextTokPtr = ptr + 2;
1609 *encPtr = encodingTable[UTF_16BE_ENC];
1610 return XML_TOK_BOM;
1611 /* 00 3C is handled in the default case */
 /* 3C 00: no byte order mark; UTF-16LE is selected unless
    UTF-16BE or UTF-16 was specified externally for an
    external text entity */
1612 case 0x3C00:
1613 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1614 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1615 && state == XML_CONTENT_STATE)
1616 break;
1617 *encPtr = encodingTable[UTF_16LE_ENC];
1618 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
 /* FF FE: byte order mark, selects UTF-16LE */
1619 case 0xFFFE:
1620 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1621 && state == XML_CONTENT_STATE)
1622 break;
1623 *nextTokPtr = ptr + 2;
1624 *encPtr = encodingTable[UTF_16LE_ENC];
1625 return XML_TOK_BOM;
1626 case 0xEFBB:
1627 /* Maybe a UTF-8 BOM (EF BB BF) */
1628 /* If there's an explicitly specified (external) encoding
1629 * of ISO-8859-1 or some flavour of UTF-16
1630 * and this is an external text entity,
1631 * don't look for the BOM,
1632 * because it might be legal data. */
1633 if (state == XML_CONTENT_STATE)
1634 {
1635 int e = INIT_ENC_INDEX(enc);
1636
1637 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1638 break;
1639 }
 /* the third byte decides; ask for more input if it is missing */
1640 if (ptr + 2 == end)
1641 return XML_TOK_PARTIAL;
1642 if ((unsigned char)ptr[2] == 0xBF)
1643 {
1644 *nextTokPtr = ptr + 3;
1645 *encPtr = encodingTable[UTF_8_ENC];
1646 return XML_TOK_BOM;
1647 }
1648 break;
1649 default:
1650 if (ptr[0] == '\0')
1651 {
1652 /* 0 isn't a legal data character. Furthermore a document entity can only
1653 * start with ASCII characters. So the only way this can fail to be big-endian
1654 * UTF-16 is if it's an external parsed general entity that's labelled as
1655 * UTF-16LE. */
1656 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1657 break;
1658 *encPtr = encodingTable[UTF_16BE_ENC];
1659 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1660 }
1661 else if (ptr[1] == '\0')
1662 {
1663 /* We could recover here in the case:
1664 * - parsing an external entity
1665 * - second byte is 0
1666 * - no externally specified encoding
1667 * - no encoding declaration
1668 * by assuming UTF-16LE. But we don't, because this would mean when
1669 * presented just with a single byte, we couldn't reliably determine
1670 * whether we needed further bytes. */
1671 if (state == XML_CONTENT_STATE)
1672 break;
1673 *encPtr = encodingTable[UTF_16LE_ENC];
1674 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1675 }
1676 break;
1677 }
1678 }
 /* nothing was detected: use the externally specified encoding */
1679 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1680 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1681}
1682
1683
/* xmltok_ns.c is included as a template: NS(x) and ns(x) decide how
 * the identifiers it uses are spelled. In this first inclusion both
 * macros leave their argument unchanged.
 * NOTE(review): what xmltok_ns.c defines is not visible from this
 * file; confirm there. */
1684#define NS(x) x
1685#define ns(x) x
1686#include "xmltok_ns.c"
1687#undef NS
1688#undef ns
1689
/* With XML_NS defined, the template is included a second time with
 * "NS" and "_ns" pasted onto the identifiers. */
1690#ifdef XML_NS
1691
1692#define NS(x) x ## NS
1693#define ns(x) x ## _ns
1694
1695#include "xmltok_ns.c"
1696
1697#undef NS
1698#undef ns
1699
1700ENCODING * XmlInitUnknownEncodingNS(void *mem,
1701 int *table,
1702 int (* EXPATENTRY convert) (void *userData, const char *p),
1703 void *userData)
1704{
1705 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1706
1707 if (enc)
1708 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1709 return enc;
1710}
1711
1712#endif /* XML_NS */
Note: See TracBrowser for help on using the repository browser.