source: trunk/src/helpers/xmltok.c@ 38

Last change on this file since 38 was 38, checked in by umoeller, 25 years ago

Updates to XML.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 47.5 KB
Line 
1
2/*
3 *sourcefile xmltok.c
4 * part of the expat implementation. See xmlparse.c.
5 *
6 */
7
8/*
9 * Copyright (C) 2001 Ulrich Möller.
10 * Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd.
11 * and Clark Cooper.
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining
14 * a copy of this software and associated documentation files (the
15 * "Software"), to deal in the Software without restriction, including
16 * without limitation the rights to use, copy, modify, merge, publish,
17 * distribute, sublicense, and/or sell copies of the Software, and to
18 * permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included
22 * in all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33#include "setup.h"
34
35#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
36
37#pragma info(norea, nogen)
38 // disable "statement unreachable" and "missing break statement"
39 // diagnostics; this code triggers them HEAVILY
40
41#ifdef COMPILED_FROM_DSP
42#include "winconfig.h"
43#else
44// #include <config.h>
45#endif /* ndef COMPILED_FROM_DSP */
46
47#include "expat\xmltok.h"
48#include "expat\nametab.h"
49
/* IGNORE_SECTION_TOK_VTABLE adds the ignoreSectionTok scanner to the first
 * brace group of VTABLE1, but only when XML_DTD is defined; otherwise it
 * expands to nothing. */
50#ifdef XML_DTD
51#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
52#else
53#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
54#endif
55
/* VTABLE1 expands to the leading initializers of an encoding: a brace
 * group with the prolog/content/CDATA scanners, a brace group with the
 * attribute-value and entity-value scanners, then nine helper functions.
 * Every name goes through PREFIX(), so the expansion depends on how PREFIX
 * is defined at the point of use. */
56#define VTABLE1 \
57 { PREFIX(prologTok), PREFIX(contentTok), \
58 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
59 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
60 PREFIX(sameName), \
61 PREFIX(nameMatchesAscii), \
62 PREFIX(nameLength), \
63 PREFIX(skipS), \
64 PREFIX(getAtts), \
65 PREFIX(charRefNumber), \
66 PREFIX(predefinedEntityName), \
67 PREFIX(updatePosition), \
68 PREFIX(isPublicId)
69
/* VTABLE is VTABLE1 followed by the PREFIX'd UTF-8 and UTF-16 converters. */
70#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
71
/* Test the naming bitmap for a 16-bit character given as separate high and
 * low bytes.  pages[hi] is scaled by 8 to select a group of eight words in
 * namingBitmap, the top 3 bits of lo pick the word inside that group and
 * the bottom 5 bits of lo pick the bit.  Evaluates to non-zero when the bit
 * is set.
 * Every argument is parenthesized so that expressions can be passed, and
 * the mask is built from an unsigned 1 so that selecting bit 31 does not
 * shift into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
74
/* A 2 byte UTF-8 representation splits the character's 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask.
 * The mask is built from an unsigned 1 so that selecting bit 31 does not
 * shift into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))
84
/* A 3 byte UTF-8 representation splits the character's 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask.
 * The mask is built from an unsigned 1 so that selecting bit 31 does not
 * shift into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                           + ((((byte)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((byte)[1]) & 3) << 1) \
                  + ((((byte)[2]) >> 5) & 1)] \
     & (1u << (((byte)[2]) & 0x1F)))
96
/* Dispatch on n, the byte length of the UTF-8 sequence at p: 2 and 3 byte
 * sequences are looked up through UTF8_GET_NAMING2 / UTF8_GET_NAMING3;
 * any other length evaluates to 0. */
97#define UTF8_GET_NAMING(pages, p, n) \
98 ((n) == 2 \
99 ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
100 : ((n) == 3 \
101 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
102 : 0))
103
/* Non-zero when the 3 byte UTF-8 sequence at p must be rejected:
 *   - lead byte 0xED with bit 0x20 set in the second byte (this is the
 *     encoding of the surrogate range U+D800..U+DFFF), or
 *   - 0xEF 0xBF followed by 0xBE or 0xBF (U+FFFE and U+FFFF).
 * p must point to unsigned bytes.  The argument is fully parenthesized so
 * that pointer expressions such as "s + 1" are dereferenced as a whole. */
#define UTF8_INVALID3(p) \
    (*(p) == 0xED \
     ? (((p)[1] & 0x20) != 0) \
     : (*(p) == 0xEF \
        ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
        : 0))
110
/* Non-zero when the 4 byte UTF-8 sequence at p has lead byte 0xF4 and a
 * second byte with any of the 0x30 bits set, i.e. encodes a value above
 * U+10FFFF.  p must point to unsigned bytes; the argument is fully
 * parenthesized so that pointer expressions are dereferenced as a whole. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
112
113static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
114{
115 return 0;
116}
117
118static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
119{
120 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
121}
122
123static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
124{
125 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
126}
127
128#define utf8_isName4 isNever
129
130static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
131{
132 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
133}
134
135static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
136{
137 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
138}
139
140#define utf8_isNmstrt4 isNever
141
142#define utf8_isInvalid2 isNever
143
144static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
145{
146 return UTF8_INVALID3((const unsigned char *)p);
147}
148
149static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
150{
151 return UTF8_INVALID4((const unsigned char *)p);
152}
153
/* An ENCODING extended with a per-byte type table and the predicate
 * functions that the tokenizer macros in this file (BYTE_TYPE,
 * IS_NAME_CHAR, IS_NMSTRT_CHAR, IS_INVALID_CHAR, ...) dispatch through.
 * enc is the first member, so the macros cast an ENCODING pointer to a
 * struct normal_encoding pointer to reach the other members. */
154struct normal_encoding
155{
156 ENCODING enc;
157 unsigned char type[256]; /* byte type, indexed by the unsigned byte value */
158#ifdef XML_MIN_SIZE
 /* only in XML_MIN_SIZE builds: called by BYTE_TYPE, IS_NAME_CHAR_MINBPC,
  * IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES respectively */
159 int (* EXPATENTRY byteType) (const ENCODING *, const char *);
160 int (* EXPATENTRY isNameMin) (const ENCODING *, const char *);
161 int (* EXPATENTRY isNmstrtMin) (const ENCODING *, const char *);
162 int (* EXPATENTRY byteToAscii) (const ENCODING *, const char *);
163 int (* EXPATENTRY charMatches) (const ENCODING *, const char *, int);
164#endif /* XML_MIN_SIZE */
 /* called by IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR; the
  * digit is the n argument those macros paste onto the member name */
165 int (* EXPATENTRY isName2) (const ENCODING *, const char *);
166 int (* EXPATENTRY isName3) (const ENCODING *, const char *);
167 int (* EXPATENTRY isName4) (const ENCODING *, const char *);
168 int (* EXPATENTRY isNmstrt2) (const ENCODING *, const char *);
169 int (* EXPATENTRY isNmstrt3) (const ENCODING *, const char *);
170 int (* EXPATENTRY isNmstrt4) (const ENCODING *, const char *);
171 int (* EXPATENTRY isInvalid2) (const ENCODING *, const char *);
172 int (* EXPATENTRY isInvalid3) (const ENCODING *, const char *);
173 int (* EXPATENTRY isInvalid4) (const ENCODING *, const char *);
174};
175
/* STANDARD_VTABLE(E) supplies initializers for the five XML_MIN_SIZE-only
 * members of struct normal_encoding, using functions whose names start
 * with E.  The expansion ends with a comma, so it must be followed by
 * further initializers or the end of the list.  Without XML_MIN_SIZE
 * those members do not exist and the macro expands to nothing. */
176#ifdef XML_MIN_SIZE
177
178#define STANDARD_VTABLE(E) \
179 E ## byteType, \
180 E ## isNameMin, \
181 E ## isNmstrtMin, \
182 E ## byteToAscii, \
183 E ## charMatches,
184
185#else
186
187#define STANDARD_VTABLE(E) /* as nothing */
188
189#endif
190
/* NORMAL_VTABLE(E) supplies initializers for the nine isName / isNmstrt /
 * isInvalid members of struct normal_encoding, in declaration order. */
191#define NORMAL_VTABLE(E) \
192 E ## isName2, \
193 E ## isName3, \
194 E ## isName4, \
195 E ## isNmstrt2, \
196 E ## isNmstrt3, \
197 E ## isNmstrt4, \
198 E ## isInvalid2, \
199 E ## isInvalid3, \
200 E ## isInvalid4
201
202static int checkCharRefNumber(int);
203
204#include "expat\xmltok_impl.h"
205#include "expat\ascii.h"
206
207#ifdef XML_MIN_SIZE
208#define sb_isNameMin isNever
209#define sb_isNmstrtMin isNever
210#endif
211
212#ifdef XML_MIN_SIZE
213#define MINBPC(enc) ((enc)->minBytesPerChar)
214#else
215/* minimum bytes per character */
216#define MINBPC(enc) 1
217#endif
218
/* Look up the type of the single byte at p in the encoding's type table.
 * The byte is converted to unsigned char first so that values >= 0x80
 * index the upper half of the table.  enc is only read, so the cast keeps
 * it const like the other accessor macros in this file. */
#define SB_BYTE_TYPE(enc, p) \
    (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
221
222#ifdef XML_MIN_SIZE
223static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
224{
225 return SB_BYTE_TYPE(enc, p);
226}
227#define BYTE_TYPE(enc, p) \
228 (((const struct normal_encoding *)(enc))->byteType(enc, p))
229#else
230#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
231#endif
232
233#ifdef XML_MIN_SIZE
234#define BYTE_TO_ASCII(enc, p) \
235 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
236static int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
237{
238 return *p;
239}
240#else
241#define BYTE_TO_ASCII(enc, p) (*(p))
242#endif
243
244#define IS_NAME_CHAR(enc, p, n) \
245 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
246#define IS_NMSTRT_CHAR(enc, p, n) \
247 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
248#define IS_INVALID_CHAR(enc, p, n) \
249 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
250
251#ifdef XML_MIN_SIZE
252#define IS_NAME_CHAR_MINBPC(enc, p) \
253 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
254#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
255 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
256#else
257#define IS_NAME_CHAR_MINBPC(enc, p) (0)
258#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
259#endif
260
261#ifdef XML_MIN_SIZE
262#define CHAR_MATCHES(enc, p, c) \
263 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
264static int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
265{
266 return *p == c;
267}
268#else
/* c is an ASCII character; it is parenthesized so that an expression
 * passed as c is compared as a whole. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
271#endif
272
273#define PREFIX(ident) normal_ ## ident
274#include "xmltok_impl.c"
275
276#undef MINBPC
277#undef BYTE_TYPE
278#undef BYTE_TO_ASCII
279#undef CHAR_MATCHES
280#undef IS_NAME_CHAR
281#undef IS_NAME_CHAR_MINBPC
282#undef IS_NMSTRT_CHAR
283#undef IS_NMSTRT_CHAR_MINBPC
284#undef IS_INVALID_CHAR
285
/* Marker bits for the first byte of a UTF-8 sequence.  The encoders in
 * this file OR the leading payload bits into these values. */
286enum
287{ /* UTF8_cvalN is value of masked first byte of N byte sequence */
288 UTF8_cval1 = 0x00,
289 UTF8_cval2 = 0xc0,
290 UTF8_cval3 = 0xe0,
291 UTF8_cval4 = 0xf0
292};
293
294static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
295 const char **fromP,
296 const char *fromLim,
297 char **toP,
298 const char *toLim)
299{
300 char *to;
301 const char *from;
302
303 if (fromLim - *fromP > toLim - *toP)
304 {
305 /* Avoid copying partial characters. */
306 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
307 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
308 break;
309 }
310 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
311 *to = *from;
312 *fromP = from;
313 *toP = to;
314}
315
316static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
317 const char **fromP, const char *fromLim,
318 unsigned short **toP, const unsigned short *toLim)
319{
320 unsigned short *to = *toP;
321 const char *from = *fromP;
322
323 while (from != fromLim && to != toLim)
324 {
325 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from])
326 {
327 case BT_LEAD2:
328 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
329 from += 2;
330 break;
331 case BT_LEAD3:
332 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
333 from += 3;
334 break;
335 case BT_LEAD4:
336 {
337 unsigned long n;
338
339 if (to + 1 == toLim)
340 break;
341 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
342 n -= 0x10000;
343 to[0] = (unsigned short)((n >> 10) | 0xD800);
344 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
345 to += 2;
346 from += 4;
347 }
348 break;
349 default:
350 *to++ = *from++;
351 break;
352 }
353 }
354 *fromP = from;
355 *toP = to;
356}
357
/* UTF-8 encoding table for XML_NS builds.  Unlike utf8_encoding it includes
 * the ASCII table without remapping BT_COLON.  The table headers are
 * included through the "expat\" directory, like every other table include
 * in this file. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns =
{
    {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
    {
#include "expat\asciitab.h"
#include "expat\utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
370
/* UTF-8 encoding table.  BT_COLON is defined as BT_NMSTRT while the ASCII
 * table is included, so entries that asciitab.h marks as BT_COLON end up
 * as BT_NMSTRT here.  initUpdatePosition uses this table's enc member. */
371static const struct normal_encoding utf8_encoding =
372{
373 {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
374 {
375#define BT_COLON BT_NMSTRT
376#include "expat\asciitab.h"
377#undef BT_COLON
378#include "expat\utf8tab.h"
379 },
380 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
381};
382
/* "Internal" UTF-8 encoding table for XML_NS builds: same as
 * utf8_encoding_ns but built from iasciitab.h instead of asciitab.h.  The
 * table headers are included through the "expat\" directory, like every
 * other table include in this file. */
#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns =
{
    {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
    {
#include "expat\iasciitab.h"
#include "expat\utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
396
/* "Internal" UTF-8 encoding table: same initializers as utf8_encoding but
 * built from iasciitab.h instead of asciitab.h.  BT_COLON is defined as
 * BT_NMSTRT while that table is included. */
397static const struct normal_encoding internal_utf8_encoding =
398{
399 {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
400 {
401#define BT_COLON BT_NMSTRT
402#include "expat\iasciitab.h"
403#undef BT_COLON
404#include "expat\utf8tab.h"
405 },
406 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
407};
408
409static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
410 const char **fromP, const char *fromLim,
411 char **toP, const char *toLim)
412{
413 for (;;)
414 {
415 unsigned char c;
416
417 if (*fromP == fromLim)
418 break;
419 c = (unsigned char)**fromP;
420 if (c & 0x80)
421 {
422 if (toLim - *toP < 2)
423 break;
424 *(*toP)++ = ((c >> 6) | UTF8_cval2);
425 *(*toP)++ = ((c & 0x3f) | 0x80);
426 (*fromP)++;
427 }
428 else
429 {
430 if (*toP == toLim)
431 break;
432 *(*toP)++ = *(*fromP)++;
433 }
434 }
435}
436
437static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
438 const char **fromP, const char *fromLim,
439 unsigned short **toP, const unsigned short *toLim)
440{
441 while (*fromP != fromLim && *toP != toLim)
442 *(*toP)++ = (unsigned char)*(*fromP)++;
443}
444
/* Latin-1 encoding table for XML_NS builds.  Unlike latin1_encoding it
 * includes the ASCII table without remapping BT_COLON.  The table headers
 * are included through the "expat\" directory, like every other table
 * include in this file. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns =
{
    {VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
    {
#include "expat\asciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(sb_)
};

#endif
458
/* Latin-1 encoding table.  BT_COLON is defined as BT_NMSTRT while the
 * ASCII table is included.  checkCharRefNumber reads the type[] array of
 * this table to reject code points below 0x100 that are BT_NONXML.  The
 * isName / isNmstrt / isInvalid members are not initialized explicitly. */
459static const struct normal_encoding latin1_encoding =
460{
461 {VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
462 {
463#define BT_COLON BT_NMSTRT
464#include "expat\asciitab.h"
465#undef BT_COLON
466#include "expat\latin1tab.h"
467 },
468 STANDARD_VTABLE(sb_)
469};
470
471static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
472 const char **fromP, const char *fromLim,
473 char **toP, const char *toLim)
474{
475 while (*fromP != fromLim && *toP != toLim)
476 *(*toP)++ = *(*fromP)++;
477}
478
/* ASCII encoding table for XML_NS builds.  Only the ASCII half of the type
 * table is initialized; the remaining entries default to 0, which is
 * BT_NONXML.  The table header is included through the "expat\" directory,
 * like every other table include in this file. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns =
{
    {VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
    {
#include "expat\asciitab.h"
/* BT_NONXML == 0 */
    },
    STANDARD_VTABLE(sb_)
};

#endif
492
/* ASCII encoding table.  Only the ASCII table is included, so the type
 * entries after it are zero-initialized; per the note below that value is
 * BT_NONXML.  BT_COLON is defined as BT_NMSTRT while the table is
 * included.  UTF-16 output reuses latin1_toUtf16. */
493static const struct normal_encoding ascii_encoding =
494{
495 {VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
496 {
497#define BT_COLON BT_NMSTRT
498#include "expat\asciitab.h"
499#undef BT_COLON
500/* BT_NONXML == 0 */
501 },
502 STANDARD_VTABLE(sb_)
503};
504
505static int unicode_byte_type(char hi, char lo)
506{
507 switch ((unsigned char)hi)
508 {
509 case 0xD8:
510 case 0xD9:
511 case 0xDA:
512 case 0xDB:
513 return BT_LEAD4;
514 case 0xDC:
515 case 0xDD:
516 case 0xDE:
517 case 0xDF:
518 return BT_TRAIL;
519 case 0xFF:
520 switch ((unsigned char)lo)
521 {
522 case 0xFF:
523 case 0xFE:
524 return BT_NONXML;
525 }
526 break;
527 }
528 return BT_NONASCII;
529}
530
/* DEFINE_UTF16_TO_UTF8(E) expands to the definition of E##toUtf8, a
 * converter from 2-byte units to UTF-8.  GET_LO and GET_HI must be defined
 * where the macro is expanded; they select the byte order.
 *
 * The generated function walks the input two bytes at a time and switches
 * on the high byte:
 *   0 with low byte < 0x80  -> 1 output byte
 *   0 (otherwise), 1..7     -> 2 output bytes
 *   0xD8..0xDB              -> 4 output bytes, built from this unit and
 *                              the following one
 *   everything else         -> 3 output bytes
 * When the output lacks room for the next character, the current input
 * position is stored in *fromP and the function returns.
 *
 * NOTE(review): the 0xD8..0xDB case reads the following unit without
 * comparing against fromLim, and the loop only ends when from is exactly
 * equal to fromLim - presumably callers pass an even number of bytes and
 * complete surrogate pairs; confirm. */
531#define DEFINE_UTF16_TO_UTF8(E) \
532static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
533 const char **fromP, const char *fromLim, \
534 char **toP, const char *toLim) \
535{ \
536 const char *from; \
537 for (from = *fromP; from != fromLim; from += 2) { \
538 int plane; \
539 unsigned char lo2; \
540 unsigned char lo = GET_LO(from); \
541 unsigned char hi = GET_HI(from); \
542 switch (hi) { \
543 case 0: \
544 if (lo < 0x80) { \
545 if (*toP == toLim) { \
546 *fromP = from; \
547 return; \
548 } \
549 *(*toP)++ = lo; \
550 break; \
551 } \
552 /* fall through */ \
553 case 0x1: case 0x2: case 0x3: \
554 case 0x4: case 0x5: case 0x6: case 0x7: \
555 if (toLim - *toP < 2) { \
556 *fromP = from; \
557 return; \
558 } \
559 *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
560 *(*toP)++ = ((lo & 0x3f) | 0x80); \
561 break; \
562 default: \
563 if (toLim - *toP < 3) { \
564 *fromP = from; \
565 return; \
566 } \
567 /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
568 *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
569 *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
570 *(*toP)++ = ((lo & 0x3f) | 0x80); \
571 break; \
572 case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
573 if (toLim - *toP < 4) { \
574 *fromP = from; \
575 return; \
576 } \
577 plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
578 *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
579 *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
580 from += 2; \
581 lo2 = GET_LO(from); \
582 *(*toP)++ = (((lo & 0x3) << 4) \
583 | ((GET_HI(from) & 0x3) << 2) \
584 | (lo2 >> 6) \
585 | 0x80); \
586 *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
587 break; \
588 } \
589 } \
590 *fromP = from; \
591}
592
/* DEFINE_UTF16_TO_UTF16(E) expands to the definition of E##toUtf16, which
 * copies 2-byte input units into unsigned short output units, assembling
 * each value from GET_HI and GET_LO (both must be defined where the macro
 * is expanded).  If the input holds more units than the output has room
 * for and the last input unit has a high byte in 0xD8..0xDF, fromLim is
 * first pulled back by one unit.  *fromP and *toP are advanced in place. */
593#define DEFINE_UTF16_TO_UTF16(E) \
594static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
595 const char **fromP, const char *fromLim, \
596 unsigned short **toP, const unsigned short *toLim) \
597{ \
598 /* Avoid copying first half only of surrogate */ \
599 if (fromLim - *fromP > ((toLim - *toP) << 1) \
600 && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
601 fromLim -= 2; \
602 for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
603 *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
604}
605
/* Little-endian instantiation: the low byte of a unit is at offset 0 and
 * the high byte at offset 1.  Generates little2_toUtf8 and
 * little2_toUtf16, then removes the helper macros again. */
606#define SET2(ptr, ch) \
607 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
608#define GET_LO(ptr) ((unsigned char)(ptr)[0])
609#define GET_HI(ptr) ((unsigned char)(ptr)[1])
610
611DEFINE_UTF16_TO_UTF8(little2_)
612DEFINE_UTF16_TO_UTF16(little2_)
613
614#undef SET2
615#undef GET_LO
616#undef GET_HI
617
/* Big-endian instantiation: the high byte of a unit is at offset 0 and
 * the low byte at offset 1.  Generates big2_toUtf8 and big2_toUtf16. */
618#define SET2(ptr, ch) \
619 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
620#define GET_LO(ptr) ((unsigned char)(ptr)[1])
621#define GET_HI(ptr) ((unsigned char)(ptr)[0])
622
623DEFINE_UTF16_TO_UTF8(big2_)
624DEFINE_UTF16_TO_UTF16(big2_)
625
626#undef SET2
627#undef GET_LO
628#undef GET_HI
629
/* Accessors for 2-byte little-endian units: (p)[0] is the low byte and
 * (p)[1] the high byte.
 *   LITTLE2_BYTE_TYPE     type-table lookup when the high byte is 0,
 *                         otherwise unicode_byte_type(high, low)
 *   LITTLE2_BYTE_TO_ASCII the low byte when the high byte is 0, else -1
 *   LITTLE2_CHAR_MATCHES  non-zero if the unit equals the character c
 *   LITTLE2_IS_*_MINBPC   naming bitmap lookups for the unit
 * All macro arguments are parenthesized so that expressions can be passed,
 * and the encoding is only read through a const pointer. */
#define LITTLE2_BYTE_TYPE(enc, p) \
    ((p)[1] == 0 \
     ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
     : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
640
/* Little-endian 2-byte tokenizer instantiation.
 *
 * With XML_MIN_SIZE the LITTLE2_* accessors are wrapped in functions (they
 * are stored in the encoding tables through STANDARD_VTABLE(little2_)) and
 * VTABLE is redefined to VTABLE1 plus the little2_ converters; PREFIX is
 * not changed in this branch.
 *
 * Without XML_MIN_SIZE, PREFIX is switched to little2_, the tokenizer
 * macros are mapped onto the LITTLE2_* accessors and xmltok_impl.c is
 * included once more to generate a dedicated set of functions. */
641#ifdef XML_MIN_SIZE
642
643 static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
644 {
645 return LITTLE2_BYTE_TYPE(enc, p);
646 }
647
648 static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
649 {
650 return LITTLE2_BYTE_TO_ASCII(enc, p);
651 }
652
653 static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
654 {
655 return LITTLE2_CHAR_MATCHES(enc, p, c);
656 }
657
658 static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
659 {
660 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
661 }
662
663 static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
664 {
665 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
666 }
667
668 #undef VTABLE
669 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
670
671#else /* not XML_MIN_SIZE */
672
673 #undef PREFIX
674 #define PREFIX(ident) little2_ ## ident
675 #define MINBPC(enc) 2
676 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
677 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
678 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
679 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
680 #define IS_NAME_CHAR(enc, p, n) 0
681 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
682 #define IS_NMSTRT_CHAR(enc, p, n) (0)
683 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
684
685 #include "xmltok_impl.c"
686
 /* IS_INVALID_CHAR is not defined in this branch; the #undef below is a
  * harmless no-op kept symmetrical with the list used further up. */
687 #undef MINBPC
688 #undef BYTE_TYPE
689 #undef BYTE_TO_ASCII
690 #undef CHAR_MATCHES
691 #undef IS_NAME_CHAR
692 #undef IS_NAME_CHAR_MINBPC
693 #undef IS_NMSTRT_CHAR
694 #undef IS_NMSTRT_CHAR_MINBPC
695 #undef IS_INVALID_CHAR
696
697#endif /* not XML_MIN_SIZE */
698
/* Little-endian 2-byte encoding table for XML_NS builds.  The last value
 * of the first brace group is 1 only when XML_BYTE_ORDER is 12.  The table
 * headers are included through the "expat\" directory, like every other
 * table include in this file. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns =
{
    {VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
     1
#else
     0
#endif
    },
    {
#include "expat\asciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
};

#endif
718
/* Little-endian 2-byte encoding table.  The last value of the first brace
 * group is 1 only when XML_BYTE_ORDER is 12.  BT_COLON is defined as
 * BT_NMSTRT while the ASCII table is included. */
719 static const struct normal_encoding little2_encoding =
720 {
721 {VTABLE, 2, 0,
722#if XML_BYTE_ORDER == 12
723 1
724#else
725 0
726#endif
727 },
728 {
729#define BT_COLON BT_NMSTRT
730#include "expat\asciitab.h"
731#undef BT_COLON
732#include "expat\latin1tab.h"
733 },
734 STANDARD_VTABLE(little2_)
735};
736
737#if XML_BYTE_ORDER != 21
738
/* "Internal" little-endian 2-byte encoding table for XML_NS builds, built
 * from iasciitab.h.  Only compiled when XML_BYTE_ORDER is not 21.  The
 * table headers are included through the "expat\" directory, like every
 * other table include in this file. */
#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns =
{
    {VTABLE, 2, 0, 1},
    {
#include "expat\iasciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
};

#endif
752
/* "Internal" little-endian 2-byte encoding table, built from iasciitab.h
 * with BT_COLON defined as BT_NMSTRT.  Only compiled when XML_BYTE_ORDER
 * is not 21. */
753 static const struct normal_encoding internal_little2_encoding =
754 {
755 {VTABLE, 2, 0, 1},
756 {
757#define BT_COLON BT_NMSTRT
758#include "expat\iasciitab.h"
759#undef BT_COLON
760#include "expat\latin1tab.h"
761 },
762 STANDARD_VTABLE(little2_)
763};
764
765#endif
766
767
/* Accessors for 2-byte big-endian units: (p)[0] is the high byte and
 * (p)[1] the low byte.
 *   BIG2_BYTE_TYPE     type-table lookup when the high byte is 0,
 *                      otherwise unicode_byte_type(high, low)
 *   BIG2_BYTE_TO_ASCII the low byte when the high byte is 0, else -1
 *   BIG2_CHAR_MATCHES  non-zero if the unit equals the character c
 *   BIG2_IS_*_MINBPC   naming bitmap lookups for the unit
 * All macro arguments are parenthesized so that expressions can be passed,
 * and the encoding is only read through a const pointer. */
#define BIG2_BYTE_TYPE(enc, p) \
    ((p)[0] == 0 \
     ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
     : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
778
/* Big-endian 2-byte tokenizer instantiation.
 *
 * With XML_MIN_SIZE the BIG2_* accessors are wrapped in functions (they
 * are stored in the encoding tables through STANDARD_VTABLE(big2_)) and
 * VTABLE is redefined to VTABLE1 plus the big2_ converters; PREFIX is not
 * changed in this branch.
 *
 * Without XML_MIN_SIZE, PREFIX is switched to big2_, the tokenizer macros
 * are mapped onto the BIG2_* accessors and xmltok_impl.c is included once
 * more to generate a dedicated set of functions. */
779#ifdef XML_MIN_SIZE
780
781 static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
782 {
783 return BIG2_BYTE_TYPE(enc, p);
784 }
785
786 static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
787 {
788 return BIG2_BYTE_TO_ASCII(enc, p);
789 }
790
791 static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
792 {
793 return BIG2_CHAR_MATCHES(enc, p, c);
794 }
795
796 static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
797 {
798 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
799 }
800
801 static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
802 {
803 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
804 }
805
806 #undef VTABLE
807 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
808
809#else /* not XML_MIN_SIZE */
810
811 #undef PREFIX
812 #define PREFIX(ident) big2_ ## ident
813 #define MINBPC(enc) 2
814 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
815 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
816 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
817 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
818 #define IS_NAME_CHAR(enc, p, n) 0
819 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
820 #define IS_NMSTRT_CHAR(enc, p, n) (0)
821 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
822
823 #include "xmltok_impl.c"
824
 /* IS_INVALID_CHAR is not defined in this branch; the #undef below is a
  * harmless no-op kept symmetrical with the list used further up. */
825 #undef MINBPC
826 #undef BYTE_TYPE
827 #undef BYTE_TO_ASCII
828 #undef CHAR_MATCHES
829 #undef IS_NAME_CHAR
830 #undef IS_NAME_CHAR_MINBPC
831 #undef IS_NMSTRT_CHAR
832 #undef IS_NMSTRT_CHAR_MINBPC
833 #undef IS_INVALID_CHAR
834
835#endif /* not XML_MIN_SIZE */
836
/* Big-endian 2-byte encoding table for XML_NS builds.  The last value of
 * the first brace group is 1 only when XML_BYTE_ORDER is 21.  The table
 * headers are included through the "expat\" directory, like every other
 * table include in this file. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns =
{
    {VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
     1
#else
     0
#endif
    },
    {
#include "expat\asciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
};

#endif
856
/* Big-endian 2-byte encoding table.  The last value of the first brace
 * group is 1 only when XML_BYTE_ORDER is 21.  BT_COLON is defined as
 * BT_NMSTRT while the ASCII table is included. */
857 static const struct normal_encoding big2_encoding =
858 {
859 {VTABLE, 2, 0,
860#if XML_BYTE_ORDER == 21
861 1
862#else
863 0
864#endif
865 },
866 {
867#define BT_COLON BT_NMSTRT
868#include "expat\asciitab.h"
869#undef BT_COLON
870#include "expat\latin1tab.h"
871 },
872 STANDARD_VTABLE(big2_)
873};
874
875#if XML_BYTE_ORDER != 12
876
/* "Internal" big-endian 2-byte encoding table for XML_NS builds, built
 * from iasciitab.h.  Only compiled when XML_BYTE_ORDER is not 12.  The
 * table headers are included through the "expat\" directory, like every
 * other table include in this file. */
#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns =
{
    {VTABLE, 2, 0, 1},
    {
#include "expat\iasciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
};

#endif
890
/* "Internal" big-endian 2-byte encoding table, built from iasciitab.h with
 * BT_COLON defined as BT_NMSTRT.  Only compiled when XML_BYTE_ORDER is
 * not 12. */
891 static const struct normal_encoding internal_big2_encoding =
892 {
893 {VTABLE, 2, 0, 1},
894 {
895#define BT_COLON BT_NMSTRT
896#include "expat\iasciitab.h"
897#undef BT_COLON
898#include "expat\latin1tab.h"
899 },
900 STANDARD_VTABLE(big2_)
901};
902
903#endif
904
905#undef PREFIX
906
907 static
908 int streqci(const char *s1, const char *s2)
909{
910 for (;;)
911 {
912 char c1 = *s1++;
913 char c2 = *s2++;
914
915 if (ASCII_a <= c1 && c1 <= ASCII_z)
916 c1 += ASCII_A - ASCII_a;
917 if (ASCII_a <= c2 && c2 <= ASCII_z)
918 c2 += ASCII_A - ASCII_a;
919 if (c1 != c2)
920 return 0;
921 if (!c1)
922 break;
923 }
924 return 1;
925}
926
927static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
928 const char *ptr,
929 const char *end,
930 POSITION * pos)
931{
932 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
933}
934
935static int EXPATENTRY toAscii(const ENCODING * enc, const char *ptr, const char *end)
936{
937 char buf[1];
938 char *p = buf;
939
940 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
941 if (p == buf)
942 return -1;
943 else
944 return buf[0];
945}
946
/* Returns 1 if c is a space (0x20), carriage return (0xD), line feed
 * (0xA) or tab (0x9), otherwise 0. */
static int isSpace(int c)
{
    return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
959
960/* Return 1 if there's just optional white space
961 * or there's an S followed by name=val. */
962static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
963 const char *ptr,
964 const char *end,
965 const char **namePtr,
966 const char **nameEndPtr,
967 const char **valPtr,
968 const char **nextTokPtr)
969{
970 int c;
971 char open;
972
973 if (ptr == end)
974 {
975 *namePtr = 0;
976 return 1;
977 }
978 if (!isSpace(toAscii(enc, ptr, end)))
979 {
980 *nextTokPtr = ptr;
981 return 0;
982 }
983 do
984 {
985 ptr += enc->minBytesPerChar;
986 }
987 while (isSpace(toAscii(enc, ptr, end)));
988 if (ptr == end)
989 {
990 *namePtr = 0;
991 return 1;
992 }
993 *namePtr = ptr;
994 for (;;)
995 {
996 c = toAscii(enc, ptr, end);
997 if (c == -1)
998 {
999 *nextTokPtr = ptr;
1000 return 0;
1001 }
1002 if (c == ASCII_EQUALS)
1003 {
1004 *nameEndPtr = ptr;
1005 break;
1006 }
1007 if (isSpace(c))
1008 {
1009 *nameEndPtr = ptr;
1010 do
1011 {
1012 ptr += enc->minBytesPerChar;
1013 }
1014 while (isSpace(c = toAscii(enc, ptr, end)));
1015 if (c != ASCII_EQUALS)
1016 {
1017 *nextTokPtr = ptr;
1018 return 0;
1019 }
1020 break;
1021 }
1022 ptr += enc->minBytesPerChar;
1023 }
1024 if (ptr == *namePtr)
1025 {
1026 *nextTokPtr = ptr;
1027 return 0;
1028 }
1029 ptr += enc->minBytesPerChar;
1030 c = toAscii(enc, ptr, end);
1031 while (isSpace(c))
1032 {
1033 ptr += enc->minBytesPerChar;
1034 c = toAscii(enc, ptr, end);
1035 }
1036 if (c != ASCII_QUOT && c != ASCII_APOS)
1037 {
1038 *nextTokPtr = ptr;
1039 return 0;
1040 }
1041 open = c;
1042 ptr += enc->minBytesPerChar;
1043 *valPtr = ptr;
1044 for (;; ptr += enc->minBytesPerChar)
1045 {
1046 c = toAscii(enc, ptr, end);
1047 if (c == open)
1048 break;
1049 if (!(ASCII_a <= c && c <= ASCII_z)
1050 && !(ASCII_A <= c && c <= ASCII_Z)
1051 && !(ASCII_0 <= c && c <= ASCII_9)
1052 && c != ASCII_PERIOD
1053 && c != ASCII_MINUS
1054 && c != ASCII_UNDERSCORE)
1055 {
1056 *nextTokPtr = ptr;
1057 return 0;
1058 }
1059 }
1060 *nextTokPtr = ptr + enc->minBytesPerChar;
1061 return 1;
1062}
1063
/* Keywords recognized by doParseXmlDecl, spelled with the ASCII_*
 * character constants and NUL-terminated: the three pseudo-attribute
 * names ... */
1064static const char KW_version[] =
1065{
1066 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1067};
1068
1069static const char KW_encoding[] =
1070{
1071 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1072};
1073
1074static const char KW_standalone[] =
1075{
1076 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1077};
1078
/* ... and the two values accepted for "standalone". */
1079static const char KW_yes[] =
1080{
1081 ASCII_y, ASCII_e, ASCII_s, '\0'
1082};
1083
1084static const char KW_no[] =
1085{
1086 ASCII_n, ASCII_o, '\0'
1087};
1088
1089static int doParseXmlDecl(const ENCODING* (* EXPATENTRY encodingFinder)(const ENCODING *,
1090 const char *,
1091 const char *),
1092 int isGeneralTextEntity,
1093 const ENCODING * enc,
1094 const char *ptr,
1095 const char *end,
1096 const char **badPtr,
1097 const char **versionPtr,
1098 const char **versionEndPtr,
1099 const char **encodingName,
1100 const ENCODING ** encoding,
1101 int *standalone)
1102{
1103 const char *val = 0;
1104 const char *name = 0;
1105 const char *nameEnd = 0;
1106
1107 ptr += 5 * enc->minBytesPerChar;
1108 end -= 2 * enc->minBytesPerChar;
1109 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name)
1110 {
1111 *badPtr = ptr;
1112 return 0;
1113 }
1114 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
1115 {
1116 if (!isGeneralTextEntity)
1117 {
1118 *badPtr = name;
1119 return 0;
1120 }
1121 }
1122 else
1123 {
1124 if (versionPtr)
1125 *versionPtr = val;
1126 if (versionEndPtr)
1127 *versionEndPtr = ptr;
1128 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1129 {
1130 *badPtr = ptr;
1131 return 0;
1132 }
1133 if (!name)
1134 {
1135 if (isGeneralTextEntity)
1136 {
1137 /* a TextDecl must have an EncodingDecl */
1138 *badPtr = ptr;
1139 return 0;
1140 }
1141 return 1;
1142 }
1143 }
1144 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
1145 {
1146 int c = toAscii(enc, val, end);
1147
1148 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
1149 {
1150 *badPtr = val;
1151 return 0;
1152 }
1153 if (encodingName)
1154 *encodingName = val;
1155 if (encoding)
1156 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1157 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
1158 {
1159 *badPtr = ptr;
1160 return 0;
1161 }
1162 if (!name)
1163 return 1;
1164 }
1165 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) || isGeneralTextEntity)
1166 {
1167 *badPtr = name;
1168 return 0;
1169 }
1170 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
1171 {
1172 if (standalone)
1173 *standalone = 1;
1174 }
1175 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
1176 {
1177 if (standalone)
1178 *standalone = 0;
1179 }
1180 else
1181 {
1182 *badPtr = val;
1183 return 0;
1184 }
1185 while (isSpace(toAscii(enc, ptr, end)))
1186 ptr += enc->minBytesPerChar;
1187 if (ptr != end)
1188 {
1189 *badPtr = ptr;
1190 return 0;
1191 }
1192 return 1;
1193}
1194
1195static int checkCharRefNumber(int result)
1196{
1197 switch (result >> 8)
1198 {
1199 case 0xD8:
1200 case 0xD9:
1201 case 0xDA:
1202 case 0xDB:
1203 case 0xDC:
1204 case 0xDD:
1205 case 0xDE:
1206 case 0xDF:
1207 return -1;
1208 case 0:
1209 if (latin1_encoding.type[result] == BT_NONXML)
1210 return -1;
1211 break;
1212 case 0xFF:
1213 if (result == 0xFFFE || result == 0xFFFF)
1214 return -1;
1215 break;
1216 }
1217 return result;
1218}
1219
1220int XmlUtf8Encode(int c, char *buf)
1221{
1222 enum
1223 {
1224 /* minN is minimum legal resulting value for N byte sequence */
1225 min2 = 0x80,
1226 min3 = 0x800,
1227 min4 = 0x10000
1228 };
1229
1230 if (c < 0)
1231 return 0;
1232 if (c < min2)
1233 {
1234 buf[0] = (c | UTF8_cval1);
1235 return 1;
1236 }
1237 if (c < min3)
1238 {
1239 buf[0] = ((c >> 6) | UTF8_cval2);
1240 buf[1] = ((c & 0x3f) | 0x80);
1241 return 2;
1242 }
1243 if (c < min4)
1244 {
1245 buf[0] = ((c >> 12) | UTF8_cval3);
1246 buf[1] = (((c >> 6) & 0x3f) | 0x80);
1247 buf[2] = ((c & 0x3f) | 0x80);
1248 return 3;
1249 }
1250 if (c < 0x110000)
1251 {
1252 buf[0] = ((c >> 18) | UTF8_cval4);
1253 buf[1] = (((c >> 12) & 0x3f) | 0x80);
1254 buf[2] = (((c >> 6) & 0x3f) | 0x80);
1255 buf[3] = ((c & 0x3f) | 0x80);
1256 return 4;
1257 }
1258 return 0;
1259}
1260
/*
 * Encode the character number charNum as UTF-16 into buf.
 *
 * Returns 1 when a single unit was written (charNum below 0x10000), 2
 * when a surrogate pair was written, and 0 when charNum is negative or
 * not below 0x110000; buf is not touched in that case.
 */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum < 0x10000)
    {
        buf[0] = charNum;
        return 1;
    }

    /* split the 20 bit offset into two 10 bit halves */
    offset = charNum - 0x10000;
    buf[0] = 0xD800 + (offset >> 10);
    buf[1] = 0xDC00 + (offset & 0x3FF);
    return 2;
}
1279
/*
 * Encoding object for a caller-described encoding, filled in by
 * XmlInitUnknownEncoding.
 */
struct unknown_encoding
{
    /* base encoding; kept first because the code casts between
     * ENCODING pointers and struct unknown_encoding pointers */
    struct normal_encoding normal;
    /* callback returning the character value of the byte sequence at p;
     * used for bytes whose table entry below is marked "use convert" */
    int (*convert) (void *userData, const char *p);
    /* opaque pointer handed back to convert as its first argument */
    void *userData;
    /* per lead byte: UTF-16 code unit, or 0 meaning "call convert" */
    unsigned short utf16[256];
    /* per lead byte: [0] is the UTF-8 length (0 meaning "call convert"),
     * followed by up to three UTF-8 bytes */
    char utf8[256][4];
};
1288
1289int EXPATENTRY XmlSizeOfUnknownEncoding(void)
1290{
1291 return sizeof(struct unknown_encoding);
1292}
1293
1294static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1295{
1296 int c = ((const struct unknown_encoding *)enc)
1297 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1298
1299 if (c & ~0xFFFF)
1300 return 0;
1301 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1302}
1303
1304static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1305{
1306 int c = ((const struct unknown_encoding *)enc)
1307 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1308
1309 if (c & ~0xFFFF)
1310 return 0;
1311 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1312}
1313
1314static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1315{
1316 int c = ((const struct unknown_encoding *)enc)
1317 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1318
1319 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1320}
1321
1322static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1323 const char **fromP,
1324 const char *fromLim,
1325 char **toP,
1326 const char *toLim)
1327{
1328 char buf[XML_UTF8_ENCODE_MAX];
1329
1330 for (;;)
1331 {
1332 const char *utf8;
1333 int n;
1334
1335 if (*fromP == fromLim)
1336 break;
1337 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1338 n = *utf8++;
1339 if (n == 0)
1340 {
1341 int c = ((const struct unknown_encoding *)enc)
1342 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1343
1344 n = XmlUtf8Encode(c, buf);
1345 if (n > toLim - *toP)
1346 break;
1347 utf8 = buf;
1348 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1349 - (BT_LEAD2 - 2);
1350 }
1351 else
1352 {
1353 if (n > toLim - *toP)
1354 break;
1355 (*fromP)++;
1356 }
1357 do
1358 {
1359 *(*toP)++ = *utf8++;
1360 }
1361 while (--n != 0);
1362 }
1363}
1364
1365static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1366 const char **fromP,
1367 const char *fromLim,
1368 unsigned short **toP,
1369 const unsigned short *toLim)
1370{
1371 while (*fromP != fromLim && *toP != toLim)
1372 {
1373 unsigned short c
1374 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1375
1376 if (c == 0)
1377 {
1378 c = (unsigned short)((const struct unknown_encoding *)enc)
1379 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1380 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1381 - (BT_LEAD2 - 2);
1382 }
1383 else
1384 (*fromP)++;
1385 *(*toP)++ = c;
1386 }
1387}
1388
1389ENCODING * XmlInitUnknownEncoding(void *mem,
1390 int *table,
1391 int (*convert) (void *userData, const char *p),
1392 void *userData)
1393{
1394 int i;
1395 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1396 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1397
1398 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1399 for (i = 0; i < 128; i++)
1400 if (latin1_encoding.type[i] != BT_OTHER
1401 && latin1_encoding.type[i] != BT_NONXML
1402 && table[i] != i)
1403 return 0;
1404 for (i = 0; i < 256; i++)
1405 {
1406 int c = table[i];
1407
1408 if (c == -1)
1409 {
1410 e->normal.type[i] = BT_MALFORM;
1411 /* This shouldn't really get used. */
1412 e->utf16[i] = 0xFFFF;
1413 e->utf8[i][0] = 1;
1414 e->utf8[i][1] = 0;
1415 }
1416 else if (c < 0)
1417 {
1418 if (c < -4)
1419 return 0;
1420 e->normal.type[i] = BT_LEAD2 - (c + 2);
1421 e->utf8[i][0] = 0;
1422 e->utf16[i] = 0;
1423 }
1424 else if (c < 0x80)
1425 {
1426 if (latin1_encoding.type[c] != BT_OTHER
1427 && latin1_encoding.type[c] != BT_NONXML
1428 && c != i)
1429 return 0;
1430 e->normal.type[i] = latin1_encoding.type[c];
1431 e->utf8[i][0] = 1;
1432 e->utf8[i][1] = (char)c;
1433 e->utf16[i] = c == 0 ? 0xFFFF : c;
1434 }
1435 else if (checkCharRefNumber(c) < 0)
1436 {
1437 e->normal.type[i] = BT_NONXML;
1438 /* This shouldn't really get used. */
1439 e->utf16[i] = 0xFFFF;
1440 e->utf8[i][0] = 1;
1441 e->utf8[i][1] = 0;
1442 }
1443 else
1444 {
1445 if (c > 0xFFFF)
1446 return 0;
1447 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1448 e->normal.type[i] = BT_NMSTRT;
1449 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1450 e->normal.type[i] = BT_NAME;
1451 else
1452 e->normal.type[i] = BT_OTHER;
1453 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1454 e->utf16[i] = c;
1455 }
1456 }
1457 e->userData = userData;
1458 e->convert = convert;
1459 if (convert)
1460 {
1461 e->normal.isName2 = unknown_isName;
1462 e->normal.isName3 = unknown_isName;
1463 e->normal.isName4 = unknown_isName;
1464 e->normal.isNmstrt2 = unknown_isNmstrt;
1465 e->normal.isNmstrt3 = unknown_isNmstrt;
1466 e->normal.isNmstrt4 = unknown_isNmstrt;
1467 e->normal.isInvalid2 = unknown_isInvalid;
1468 e->normal.isInvalid3 = unknown_isInvalid;
1469 e->normal.isInvalid4 = unknown_isInvalid;
1470 }
1471 e->normal.enc.utf8Convert = unknown_toUtf8;
1472 e->normal.enc.utf16Convert = unknown_toUtf16;
1473 return &(e->normal.enc);
1474}
1475
/* Indices of the built-in encodings.
 * If this enumeration is changed, getEncodingIndex and the encoding
 * tables indexed with these values must also be changed. */
enum
{
    UNKNOWN_ENC = -1,       /* returned by getEncodingIndex for an unrecognized name */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC,
    UTF_8_ENC,
    UTF_16_ENC,
    UTF_16BE_ENC,
    UTF_16LE_ENC,
    /* must match encodingNames up to here */
    NO_ENC                  /* returned by getEncodingIndex when no name was given */
};
1490
/* Zero-terminated names of the built-in encodings, spelled with the
 * ASCII_* character constants instead of string literals (presumably
 * so that they are ASCII regardless of the compiler's character set
 * -- confirm). getEncodingIndex matches names against these. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] =
{
    ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] =
{
    ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
};
1515
1516static int getEncodingIndex(const char *name)
1517{
1518 static const char *encodingNames[] =
1519 {
1520 KW_ISO_8859_1,
1521 KW_US_ASCII,
1522 KW_UTF_8,
1523 KW_UTF_16,
1524 KW_UTF_16BE,
1525 KW_UTF_16LE,
1526 };
1527 int i;
1528
1529 if (name == 0)
1530 return NO_ENC;
1531 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1532 if (streqci(name, encodingNames[i]))
1533 return i;
1534 return UNKNOWN_ENC;
1535}
1536
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 *
 * INIT_ENC_INDEX(enc) reads that index back as an int;
 * SET_INIT_ENC_INDEX(enc, i) stores i, converted to char. Both macro
 * arguments are parenthesized so that expression arguments are
 * converted as a whole. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1542
1543/* This is what detects the encoding.
1544 * encodingTable maps from encoding indices to encodings;
1545 * INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1546 * state is XML_CONTENT_STATE if we're parsing an external text entity,
1547 * and XML_PROLOG_STATE otherwise.
1548 */
1549
1550
1551static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1552 const INIT_ENCODING * enc,
1553 int state,
1554 const char *ptr,
1555 const char *end,
1556 const char **nextTokPtr)
1557{
1558 const ENCODING **encPtr;
1559
1560 if (ptr == end)
1561 return XML_TOK_NONE;
1562 encPtr = enc->encPtr;
1563 if (ptr + 1 == end)
1564 {
1565 /* only a single byte available for auto-detection */
1566#ifndef XML_DTD /* FIXME */
1567 /* a well-formed document entity must have more than one byte */
1568 if (state != XML_CONTENT_STATE)
1569 return XML_TOK_PARTIAL;
1570#endif
1571 /* so we're parsing an external text entity... */
1572 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1573 switch (INIT_ENC_INDEX(enc))
1574 {
1575 case UTF_16_ENC:
1576 case UTF_16LE_ENC:
1577 case UTF_16BE_ENC:
1578 return XML_TOK_PARTIAL;
1579 }
1580 switch ((unsigned char)*ptr)
1581 {
1582 case 0xFE:
1583 case 0xFF:
1584 case 0xEF: /* possibly first byte of UTF-8 BOM */
1585 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1586 && state == XML_CONTENT_STATE)
1587 break;
1588 /* fall through */
1589 case 0x00:
1590 case 0x3C:
1591 return XML_TOK_PARTIAL;
1592 }
1593 }
1594 else
1595 {
1596 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1])
1597 {
1598 case 0xFEFF:
1599 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1600 && state == XML_CONTENT_STATE)
1601 break;
1602 *nextTokPtr = ptr + 2;
1603 *encPtr = encodingTable[UTF_16BE_ENC];
1604 return XML_TOK_BOM;
1605 /* 00 3C is handled in the default case */
1606 case 0x3C00:
1607 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1608 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1609 && state == XML_CONTENT_STATE)
1610 break;
1611 *encPtr = encodingTable[UTF_16LE_ENC];
1612 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1613 case 0xFFFE:
1614 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1615 && state == XML_CONTENT_STATE)
1616 break;
1617 *nextTokPtr = ptr + 2;
1618 *encPtr = encodingTable[UTF_16LE_ENC];
1619 return XML_TOK_BOM;
1620 case 0xEFBB:
1621 /* Maybe a UTF-8 BOM (EF BB BF) */
1622 /* If there's an explicitly specified (external) encoding
1623 * of ISO-8859-1 or some flavour of UTF-16
1624 * and this is an external text entity,
1625 * don't look for the BOM,
1626 * because it might be a legal data. */
1627 if (state == XML_CONTENT_STATE)
1628 {
1629 int e = INIT_ENC_INDEX(enc);
1630
1631 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1632 break;
1633 }
1634 if (ptr + 2 == end)
1635 return XML_TOK_PARTIAL;
1636 if ((unsigned char)ptr[2] == 0xBF)
1637 {
1638 *encPtr = encodingTable[UTF_8_ENC];
1639 return XML_TOK_BOM;
1640 }
1641 break;
1642 default:
1643 if (ptr[0] == '\0')
1644 {
1645 /* 0 isn't a legal data character. Furthermore a document entity can only
1646 * start with ASCII characters. So the only way this can fail to be big-endian
1647 * UTF-16 if it it's an external parsed general entity that's labelled as
1648 * UTF-16LE. */
1649 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1650 break;
1651 *encPtr = encodingTable[UTF_16BE_ENC];
1652 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1653 }
1654 else if (ptr[1] == '\0')
1655 {
1656 /* We could recover here in the case:
1657 * - parsing an external entity
1658 * - second byte is 0
1659 * - no externally specified encoding
1660 * - no encoding declaration
1661 * by assuming UTF-16LE. But we don't, because this would mean when
1662 * presented just with a single byte, we couldn't reliably determine
1663 * whether we needed further bytes. */
1664 if (state == XML_CONTENT_STATE)
1665 break;
1666 *encPtr = encodingTable[UTF_16LE_ENC];
1667 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1668 }
1669 break;
1670 }
1671 }
1672 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1673 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1674}
1675
1676
/* xmltok_ns.c is written in terms of the NS() and ns() macros and is
 * included once per name variant. First pass: both macros leave the
 * names unchanged. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns

#ifdef XML_NS

/* Second pass, only with XML_NS defined: NS() appends "NS" and ns()
 * appends "_ns" to every name, yielding a second set of definitions
 * (presumably the namespace-aware variants -- confirm in xmltok_ns.c). */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
1692
1693ENCODING * XmlInitUnknownEncodingNS(void *mem,
1694 int *table,
1695 int (* EXPATENTRY convert) (void *userData, const char *p),
1696 void *userData)
1697{
1698 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1699
1700 if (enc)
1701 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1702 return enc;
1703}
1704
1705#endif /* XML_NS */
Note: See TracBrowser for help on using the repository browser.