source: trunk/src/helpers/xmltok.c@ 90

Last change on this file since 90 was 75, checked in by umoeller, 24 years ago

Misc changes.

  • Property svn:eol-style set to CRLF
  • Property svn:keywords set to Author Date Id Revision
File size: 47.5 KB
Line 
1
2/*
3 *sourcefile xmltok.c
4 * part of the expat implementation. See xmlparse.c.
5 *
6 */
7
8/*
 * Copyright (C) 2001 Ulrich Möller.
10 * Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd.
11 * and Clark Cooper.
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining
14 * a copy of this software and associated documentation files (the
15 * "Software"), to deal in the Software without restriction, including
16 * without limitation the rights to use, copy, modify, merge, publish,
17 * distribute, sublicense, and/or sell copies of the Software, and to
18 * permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included
22 * in all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33#include "setup.h"
34
35#include "expat\expat_setup.h" // V0.9.9 (2001-02-10) [umoeller]
36
37#pragma info(norea, nogen)
38 // disable "statement unreachable" and "missing break statement"
39 // this code generates those options HEAVILY
40
41#include "expat\xmltok.h"
42#include "expat\nametab.h"
43
/* Extra scanner entry for IGNORE sections: present only when XML_DTD is
 * defined, otherwise the macro expands to nothing. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment listing the per-encoding tokenizer functions.
 * Every name is routed through PREFIX(), so the functions that end up in
 * the table depend on how PREFIX is defined at the point of expansion. */
#define VTABLE1 \
    { PREFIX(prologTok), PREFIX(contentTok), \
      PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
    { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
    PREFIX(sameName), \
    PREFIX(nameMatchesAscii), \
    PREFIX(nameLength), \
    PREFIX(skipS), \
    PREFIX(getAtts), \
    PREFIX(charRefNumber), \
    PREFIX(predefinedEntityName), \
    PREFIX(updatePosition), \
    PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-qualified UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
65
/* Look up the naming bit of the UCS-2 character whose high byte is `hi`
 * and whose low byte is `lo`: pages[hi] selects a group of eight words in
 * namingBitmap, the top three bits of `lo` pick the word and the low five
 * bits pick the bit inside it.  The result is nonzero when the bit is set.
 *
 * The mask is built from an unsigned 1: bit 31 must be selectable, and
 * shifting a signed 1 into the sign bit is undefined behaviour. */
#define UCS2_GET_NAMING(pages, hi, lo) \
    (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits
 * between the bottom 5 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits
 * between the bottom 4, 6 and 6 bits of the bytes.
 * We need 8 bits to index into pages, 3 bits to add to that index and
 * 5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
    (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                           + ((((byte)[1]) >> 2) & 0xF)] \
                   << 3) \
                  + ((((byte)[1]) & 3) << 1) \
                  + ((((byte)[2]) >> 5) & 1)] \
     & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
 * in the bitmap, anything else yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
    ((n) == 2 \
     ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
     : ((n) == 3 \
        ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
        : 0))

/* Nonzero for the 3 byte sequences that are rejected: a 0xED lead byte
 * whose second byte has bit 0x20 set (the encoded surrogates), and
 * EF BF BE / EF BF BF (U+FFFE, U+FFFF).  `p` must point to unsigned char.
 * The argument is parenthesised so pointer expressions such as `buf + 1`
 * are dereferenced as a whole. */
#define UTF8_INVALID3(p) \
    (*(p) == 0xED \
     ? (((p)[1] & 0x20) != 0) \
     : (*(p) == 0xEF \
        ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
        : 0))

/* Nonzero for a 4 byte sequence with lead byte 0xF4 whose second byte has
 * any of the 0x30 bits set, i.e. a code point above U+10FFFF. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
106
107static int EXPATENTRY isNever(const ENCODING * enc, const char *p)
108{
109 return 0;
110}
111
112static int EXPATENTRY utf8_isName2(const ENCODING * enc, const char *p)
113{
114 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
115}
116
117static int EXPATENTRY utf8_isName3(const ENCODING * enc, const char *p)
118{
119 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
120}
121
122#define utf8_isName4 isNever
123
124static int EXPATENTRY utf8_isNmstrt2(const ENCODING * enc, const char *p)
125{
126 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
127}
128
129static int EXPATENTRY utf8_isNmstrt3(const ENCODING * enc, const char *p)
130{
131 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
132}
133
134#define utf8_isNmstrt4 isNever
135
136#define utf8_isInvalid2 isNever
137
138static int EXPATENTRY utf8_isInvalid3(const ENCODING * enc, const char *p)
139{
140 return UTF8_INVALID3((const unsigned char *)p);
141}
142
143static int EXPATENTRY utf8_isInvalid4(const ENCODING * enc, const char *p)
144{
145 return UTF8_INVALID4((const unsigned char *)p);
146}
147
/* An ENCODING extended with a per-byte classification table and a set of
 * character-class callbacks.  `enc` is the first member, so the code in
 * this file casts between ENCODING * and struct normal_encoding *. */
struct normal_encoding
{
    ENCODING enc;
    /* byte type, indexed by the unsigned value of a byte */
    unsigned char type[256];
#ifdef XML_MIN_SIZE
    /* only present in the XML_MIN_SIZE build, where the BYTE_TYPE,
     * IS_*_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES macros call
     * through these pointers */
    int (* EXPATENTRY byteType) (const ENCODING *, const char *);
    int (* EXPATENTRY isNameMin) (const ENCODING *, const char *);
    int (* EXPATENTRY isNmstrtMin) (const ENCODING *, const char *);
    int (* EXPATENTRY byteToAscii) (const ENCODING *, const char *);
    int (* EXPATENTRY charMatches) (const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
    /* reached through IS_NAME_CHAR / IS_NMSTRT_CHAR / IS_INVALID_CHAR,
     * which paste the sequence length (2, 3 or 4) onto the member name */
    int (* EXPATENTRY isName2) (const ENCODING *, const char *);
    int (* EXPATENTRY isName3) (const ENCODING *, const char *);
    int (* EXPATENTRY isName4) (const ENCODING *, const char *);
    int (* EXPATENTRY isNmstrt2) (const ENCODING *, const char *);
    int (* EXPATENTRY isNmstrt3) (const ENCODING *, const char *);
    int (* EXPATENTRY isNmstrt4) (const ENCODING *, const char *);
    int (* EXPATENTRY isInvalid2) (const ENCODING *, const char *);
    int (* EXPATENTRY isInvalid3) (const ENCODING *, const char *);
    int (* EXPATENTRY isInvalid4) (const ENCODING *, const char *);
};
169
/* Initializer fragment for the XML_MIN_SIZE-only members of
 * struct normal_encoding; E is the name prefix of the functions to use.
 * Note the trailing comma, so it can be followed directly by
 * NORMAL_VTABLE().  Expands to nothing in the regular build. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
    E ## byteType, \
    E ## isNameMin, \
    E ## isNmstrtMin, \
    E ## byteToAscii, \
    E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializer fragment for the nine isName / isNmstrt / isInvalid members
 * of struct normal_encoding, using functions (or macros) prefixed with E. */
#define NORMAL_VTABLE(E) \
    E ## isName2, \
    E ## isName3, \
    E ## isName4, \
    E ## isNmstrt2, \
    E ## isNmstrt3, \
    E ## isNmstrt4, \
    E ## isInvalid2, \
    E ## isInvalid3, \
    E ## isInvalid4
195
196static int checkCharRefNumber(int);
197
198#include "expat\xmltok_impl.h"
199#include "expat\ascii.h"
200
/*
 * Parameter macros for the first inclusion of xmltok_impl.c, which is
 * compiled with PREFIX(ident) expanding to normal_ident.  In the regular
 * build the macros expand inline; with XML_MIN_SIZE they call through the
 * function pointers stored in struct normal_encoding instead.
 */

/* single-byte encodings have no "minimum bytes per char" name tests */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif

/* byte type of the byte at p, read from the encoding's type[] table */
#define SB_BYTE_TYPE(enc, p) \
    (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
static int EXPATENTRY sb_byteType(const ENCODING * enc, const char *p)
{
    return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
    (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

/* for single-byte encodings the byte itself is used as the ASCII value */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
    (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
static int EXPATENTRY sb_byteToAscii(const ENCODING * enc, const char *p)
{
    return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* n is pasted onto the member name, selecting e.g. isName2 / isName3 */
#define IS_NAME_CHAR(enc, p, n) \
    (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
    (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
    (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))

#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
    (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
    (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
    (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
static int EXPATENTRY sb_charMatches(const ENCODING * enc, const char *p, int c)
{
    return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
#endif

/* instantiate the tokenizer as normal_* */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl.c"

/* drop the parameter macros so the 2 byte sections can redefine them */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
279
/* Marker bits OR-ed into the first byte of a UTF-8 sequence by the
 * encoders in this file. */
enum
{   /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
};
287
288static void EXPATENTRY utf8_toUtf8(const ENCODING * enc,
289 const char **fromP,
290 const char *fromLim,
291 char **toP,
292 const char *toLim)
293{
294 char *to;
295 const char *from;
296
297 if (fromLim - *fromP > toLim - *toP)
298 {
299 /* Avoid copying partial characters. */
300 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
301 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
302 break;
303 }
304 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
305 *to = *from;
306 *fromP = from;
307 *toP = to;
308}
309
310static void EXPATENTRY utf8_toUtf16(const ENCODING * enc,
311 const char **fromP, const char *fromLim,
312 unsigned short **toP, const unsigned short *toLim)
313{
314 unsigned short *to = *toP;
315 const char *from = *fromP;
316
317 while (from != fromLim && to != toLim)
318 {
319 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from])
320 {
321 case BT_LEAD2:
322 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
323 from += 2;
324 break;
325 case BT_LEAD3:
326 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
327 from += 3;
328 break;
329 case BT_LEAD4:
330 {
331 unsigned long n;
332
333 if (to + 1 == toLim)
334 break;
335 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
336 n -= 0x10000;
337 to[0] = (unsigned short)((n >> 10) | 0xD800);
338 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
339 to += 2;
340 from += 4;
341 }
342 break;
343 default:
344 *to++ = *from++;
345 break;
346 }
347 }
348 *fromP = from;
349 *toP = to;
350}
351
/*
 * Encoding objects for UTF-8.  All four share the same converters and
 * classification callbacks and differ only in the byte-type table:
 *   - the "_ns" variants (built only with XML_NS) include the ASCII table
 *     unchanged;
 *   - the plain variants include it with BT_COLON defined as BT_NMSTRT,
 *     so table entries written as BT_COLON become BT_NMSTRT;
 *   - the "internal_" variants use iasciitab.h instead of asciitab.h.
 * NOTE(review): the three integers after the converters initialise further
 * ENCODING members declared in xmltok.h; only minBytesPerChar is referenced
 * in this file -- confirm field order there.
 */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns =
{
    {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
    {
#include "expat\asciitab.h"
#include "expat\utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

static const struct normal_encoding utf8_encoding =
{
    {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
    {
#define BT_COLON BT_NMSTRT
#include "expat\asciitab.h"
#undef BT_COLON
#include "expat\utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns =
{
    {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
    {
#include "expat\iasciitab.h"
#include "expat\utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

static const struct normal_encoding internal_utf8_encoding =
{
    {VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
    {
#define BT_COLON BT_NMSTRT
#include "expat\iasciitab.h"
#undef BT_COLON
#include "expat\utf8tab.h"
    },
    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
402
403static void EXPATENTRY latin1_toUtf8(const ENCODING * enc,
404 const char **fromP, const char *fromLim,
405 char **toP, const char *toLim)
406{
407 for (;;)
408 {
409 unsigned char c;
410
411 if (*fromP == fromLim)
412 break;
413 c = (unsigned char)**fromP;
414 if (c & 0x80)
415 {
416 if (toLim - *toP < 2)
417 break;
418 *(*toP)++ = ((c >> 6) | UTF8_cval2);
419 *(*toP)++ = ((c & 0x3f) | 0x80);
420 (*fromP)++;
421 }
422 else
423 {
424 if (*toP == toLim)
425 break;
426 *(*toP)++ = *(*fromP)++;
427 }
428 }
429}
430
431static void EXPATENTRY latin1_toUtf16(const ENCODING * enc,
432 const char **fromP, const char *fromLim,
433 unsigned short **toP, const unsigned short *toLim)
434{
435 while (*fromP != fromLim && *toP != toLim)
436 *(*toP)++ = (unsigned char)*(*fromP)++;
437}
438
/*
 * Encoding objects for Latin-1.  The byte-type table is the ASCII table
 * followed by latin1tab.h; the plain variant includes the ASCII table with
 * BT_COLON defined as BT_NMSTRT, the "_ns" variant (XML_NS only) includes
 * it unchanged.  No NORMAL_VTABLE() entries are given, so the isName /
 * isNmstrt / isInvalid members are left zero-initialised.
 */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns =
{
    {VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
    {
#include "expat\asciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(sb_)
};

#endif

static const struct normal_encoding latin1_encoding =
{
    {VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
    {
#define BT_COLON BT_NMSTRT
#include "expat\asciitab.h"
#undef BT_COLON
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(sb_)
};
464
465static void EXPATENTRY ascii_toUtf8(const ENCODING * enc,
466 const char **fromP, const char *fromLim,
467 char **toP, const char *toLim)
468{
469 while (*fromP != fromLim && *toP != toLim)
470 *(*toP)++ = *(*fromP)++;
471}
472
/*
 * Encoding objects for US-ASCII.  Only the ASCII half of the byte-type
 * table is initialised; the remaining entries are implicitly zero, which
 * is BT_NONXML.  UTF-16 conversion reuses latin1_toUtf16.  The plain
 * variant includes the table with BT_COLON defined as BT_NMSTRT, the
 * "_ns" variant (XML_NS only) includes it unchanged.
 */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns =
{
    {VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
    {
#include "expat\asciitab.h"
/* BT_NONXML == 0 */
    },
    STANDARD_VTABLE(sb_)
};

#endif

static const struct normal_encoding ascii_encoding =
{
    {VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
    {
#define BT_COLON BT_NMSTRT
#include "expat\asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
    },
    STANDARD_VTABLE(sb_)
};
498
499static int unicode_byte_type(char hi, char lo)
500{
501 switch ((unsigned char)hi)
502 {
503 case 0xD8:
504 case 0xD9:
505 case 0xDA:
506 case 0xDB:
507 return BT_LEAD4;
508 case 0xDC:
509 case 0xDD:
510 case 0xDE:
511 case 0xDF:
512 return BT_TRAIL;
513 case 0xFF:
514 switch ((unsigned char)lo)
515 {
516 case 0xFF:
517 case 0xFE:
518 return BT_NONXML;
519 }
520 break;
521 }
522 return BT_NONASCII;
523}
524
/*
 * Generates the function E##toUtf8, which converts UTF-16 input to UTF-8.
 * The byte order comes from the GET_LO / GET_HI macros in effect where the
 * macro is expanded.  The input is walked two bytes at a time and the
 * switch on the high byte selects the output length:
 *   hi == 0 and lo < 0x80   1 byte
 *   hi <= 0x07 otherwise    2 bytes
 *   hi 0xD8..0xDB           4 bytes, built from this unit and the next one
 *   anything else           3 bytes
 * Whenever the character does not fit into the remaining output, *fromP is
 * set to the current character and the function returns.
 *
 * NOTE(review): the loop only terminates on from == fromLim, so an odd
 * input length would step over the end; and the 0xD8..0xDB case reads the
 * following unit without comparing against fromLim.  Presumably callers
 * guarantee complete characters -- confirm.
 */
#define DEFINE_UTF16_TO_UTF8(E) \
static void EXPATENTRY E ## toUtf8(const ENCODING *enc, \
                        const char **fromP, const char *fromLim, \
                        char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
586
/*
 * Generates the function E##toUtf16, which copies UTF-16 input (byte order
 * given by GET_LO / GET_HI) into an array of unsigned short, one unit per
 * two input bytes, until either buffer is exhausted.  When the input is
 * larger than the output and the high byte of the last input unit matches
 * 0xD8 under the 0xF8 mask, the input limit is first reduced by one unit.
 *
 * NOTE(review): the surrogate test looks at the last unit of the whole
 * input, not at the unit where the output buffer actually runs out --
 * confirm that this is intended.
 */
#define DEFINE_UTF16_TO_UTF16(E) \
static void EXPATENTRY E ## toUtf16(const ENCODING *enc, \
                         const char **fromP, const char *fromLim, \
                         unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
599
/* Little-endian byte access: byte 0 is the low byte, byte 1 the high byte.
 * With these in effect the converter generators produce little2_toUtf8
 * and little2_toUtf16. */
#define SET2(ptr, ch) \
    (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte access: byte 0 is the high byte, byte 1 the low byte.
 * With these in effect the converter generators produce big2_toUtf8 and
 * big2_toUtf16. */
#define SET2(ptr, ch) \
    (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
623
/*
 * Character classification for little-endian 2 byte input: p[0] is the
 * low byte and p[1] the high byte.  Characters with a zero high byte are
 * classified through the encoding's type[] table, all others through
 * unicode_byte_type().
 */
#define LITTLE2_BYTE_TYPE(enc, p) \
    ((p)[1] == 0 \
     ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
     : unicode_byte_type((p)[1], (p)[0]))
/* the low byte if the high byte is zero, otherwise -1 */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: no second tokenizer instance is compiled; the
 * macros above are wrapped in functions that are stored in the encoding
 * object through STANDARD_VTABLE(little2_). */

static int EXPATENTRY little2_byteType(const ENCODING * enc, const char *p)
{
    return LITTLE2_BYTE_TYPE(enc, p);
}

static int EXPATENTRY little2_byteToAscii(const ENCODING * enc, const char *p)
{
    return LITTLE2_BYTE_TO_ASCII(enc, p);
}

static int EXPATENTRY little2_charMatches(const ENCODING * enc, const char *p, int c)
{
    return LITTLE2_CHAR_MATCHES(enc, p, c);
}

static int EXPATENTRY little2_isNameMin(const ENCODING * enc, const char *p)
{
    return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int EXPATENTRY little2_isNmstrtMin(const ENCODING * enc, const char *p)
{
    return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is not redefined in this branch, so the converters are named
 * explicitly */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: compile xmltok_impl.c a second time with PREFIX
 * expanding to little2_ and the parameter macros bound to the LITTLE2_
 * versions above. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
692
/*
 * Encoding objects for little-endian 2 byte input.  minBytesPerChar is 2;
 * the last ENCODING initializer is 1 when XML_BYTE_ORDER == 12 and 0
 * otherwise.  NOTE(review): presumably that flag marks "same byte order
 * as the host's UTF-16" -- confirm against the ENCODING declaration.
 * The byte-type table (ASCII + Latin-1) is only consulted for characters
 * whose high byte is zero.  As elsewhere, the plain variants include the
 * ASCII table with BT_COLON defined as BT_NMSTRT and the "_ns" variants
 * exist only with XML_NS.
 */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns =
{
    {VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
     1
#else
     0
#endif
    },
    {
#include "expat\asciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding little2_encoding =
{
    {VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
     1
#else
     0
#endif
    },
    {
#define BT_COLON BT_NMSTRT
#include "expat\asciitab.h"
#undef BT_COLON
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
};

/* the "internal" little-endian encodings (iasciitab.h instead of
 * asciitab.h) are compiled unless XML_BYTE_ORDER is 21 */
#if XML_BYTE_ORDER != 21

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns =
{
    {VTABLE, 2, 0, 1},
    {
#include "expat\iasciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
};

#endif

static const struct normal_encoding internal_little2_encoding =
{
    {VTABLE, 2, 0, 1},
    {
#define BT_COLON BT_NMSTRT
#include "expat\iasciitab.h"
#undef BT_COLON
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(little2_)
};

#endif
760
761
/*
 * Character classification for big-endian 2 byte input: p[0] is the high
 * byte and p[1] the low byte.  Characters with a zero high byte are
 * classified through the encoding's type[] table, all others through
 * unicode_byte_type().
 */
#define BIG2_BYTE_TYPE(enc, p) \
    ((p)[0] == 0 \
     ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
     : unicode_byte_type((p)[0], (p)[1]))
/* the low byte if the high byte is zero, otherwise -1 */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
    UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

#ifdef XML_MIN_SIZE

/* XML_MIN_SIZE build: no further tokenizer instance is compiled; the
 * macros above are wrapped in functions that are stored in the encoding
 * object through STANDARD_VTABLE(big2_). */

static int EXPATENTRY big2_byteType(const ENCODING * enc, const char *p)
{
    return BIG2_BYTE_TYPE(enc, p);
}

static int EXPATENTRY big2_byteToAscii(const ENCODING * enc, const char *p)
{
    return BIG2_BYTE_TO_ASCII(enc, p);
}

static int EXPATENTRY big2_charMatches(const ENCODING * enc, const char *p, int c)
{
    return BIG2_CHAR_MATCHES(enc, p, c);
}

static int EXPATENTRY big2_isNameMin(const ENCODING * enc, const char *p)
{
    return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

static int EXPATENTRY big2_isNmstrtMin(const ENCODING * enc, const char *p)
{
    return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is not redefined in this branch, so the converters are named
 * explicitly */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Regular build: compile xmltok_impl.c once more with PREFIX expanding to
 * big2_ and the parameter macros bound to the BIG2_ versions above. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
830
/*
 * Encoding objects for big-endian 2 byte input.  minBytesPerChar is 2;
 * the last ENCODING initializer is 1 when XML_BYTE_ORDER == 21 and 0
 * otherwise.  NOTE(review): presumably that flag marks "same byte order
 * as the host's UTF-16" -- confirm against the ENCODING declaration.
 * The byte-type table (ASCII + Latin-1) is only consulted for characters
 * whose high byte is zero.  As elsewhere, the plain variants include the
 * ASCII table with BT_COLON defined as BT_NMSTRT and the "_ns" variants
 * exist only with XML_NS.
 */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns =
{
    {VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
     1
#else
     0
#endif
    },
    {
#include "expat\asciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding big2_encoding =
{
    {VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
     1
#else
     0
#endif
    },
    {
#define BT_COLON BT_NMSTRT
#include "expat\asciitab.h"
#undef BT_COLON
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
};

/* the "internal" big-endian encodings (iasciitab.h instead of
 * asciitab.h) are compiled unless XML_BYTE_ORDER is 12 */
#if XML_BYTE_ORDER != 12

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns =
{
    {VTABLE, 2, 0, 1},
    {
#include "expat\iasciitab.h"
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
};

#endif

static const struct normal_encoding internal_big2_encoding =
{
    {VTABLE, 2, 0, 1},
    {
#define BT_COLON BT_NMSTRT
#include "expat\iasciitab.h"
#undef BT_COLON
#include "expat\latin1tab.h"
    },
    STANDARD_VTABLE(big2_)
};

#endif

/* no further tokenizer instances follow */
#undef PREFIX
900
901 static
902 int streqci(const char *s1, const char *s2)
903{
904 for (;;)
905 {
906 char c1 = *s1++;
907 char c2 = *s2++;
908
909 if (ASCII_a <= c1 && c1 <= ASCII_z)
910 c1 += ASCII_A - ASCII_a;
911 if (ASCII_a <= c2 && c2 <= ASCII_z)
912 c2 += ASCII_A - ASCII_a;
913 if (c1 != c2)
914 return 0;
915 if (!c1)
916 break;
917 }
918 return 1;
919}
920
921static void EXPATENTRY initUpdatePosition(const ENCODING * enc,
922 const char *ptr,
923 const char *end,
924 POSITION * pos)
925{
926 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
927}
928
929static int EXPATENTRY toAscii(const ENCODING * enc, const char *ptr, const char *end)
930{
931 char buf[1];
932 char *p = buf;
933
934 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
935 if (p == buf)
936 return -1;
937 else
938 return buf[0];
939}
940
/* Returns 1 if c is a space (0x20), carriage return (0xD), line feed
 * (0xA) or tab (0x9), and 0 for any other value. */
static int isSpace(int c)
{
    return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
953
/*
 * Parse one pseudo-attribute (name = "value") starting at ptr.
 *
 * Return 1 if there's just optional white space
 * or there's an S followed by name=val.
 *
 * enc         encoding of the input; characters are read via toAscii()
 * ptr, end    input range
 * namePtr     out: start of the name, or 0 if only white space was left
 * nameEndPtr  out: end of the name
 * valPtr      out: first character after the opening quote
 * nextTokPtr  out: on success the position after the closing quote,
 *             on failure the position of the offending character
 *
 * Returns 0 on a syntax error.  Note that when *namePtr is set to 0 the
 * function returns 1 without touching the other output parameters.
 */
static int EXPATENTRY parsePseudoAttribute(const ENCODING * enc,
                                           const char *ptr,
                                           const char *end,
                                           const char **namePtr,
                                           const char **nameEndPtr,
                                           const char **valPtr,
                                           const char **nextTokPtr)
{
    int c;
    char open;

    if (ptr == end)
    {
        *namePtr = 0;
        return 1;
    }
    /* at least one white space character is required first */
    if (!isSpace(toAscii(enc, ptr, end)))
    {
        *nextTokPtr = ptr;
        return 0;
    }
    do
    {
        ptr += enc->minBytesPerChar;
    }
    while (isSpace(toAscii(enc, ptr, end)));
    if (ptr == end)
    {
        *namePtr = 0;
        return 1;
    }
    *namePtr = ptr;
    /* scan the name up to '=' or white space */
    for (;;)
    {
        c = toAscii(enc, ptr, end);
        if (c == -1)
        {
            *nextTokPtr = ptr;
            return 0;
        }
        if (c == ASCII_EQUALS)
        {
            *nameEndPtr = ptr;
            break;
        }
        if (isSpace(c))
        {
            *nameEndPtr = ptr;
            /* white space after the name must be followed by '=' */
            do
            {
                ptr += enc->minBytesPerChar;
            }
            while (isSpace(c = toAscii(enc, ptr, end)));
            if (c != ASCII_EQUALS)
            {
                *nextTokPtr = ptr;
                return 0;
            }
            break;
        }
        ptr += enc->minBytesPerChar;
    }
    /* '=' directly at the name start: the name is empty */
    if (ptr == *namePtr)
    {
        *nextTokPtr = ptr;
        return 0;
    }
    /* step over '=' and any white space in front of the value */
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
    while (isSpace(c))
    {
        ptr += enc->minBytesPerChar;
        c = toAscii(enc, ptr, end);
    }
    if (c != ASCII_QUOT && c != ASCII_APOS)
    {
        *nextTokPtr = ptr;
        return 0;
    }
    /* remember which quote opened the value; the same one must close it */
    open = c;
    ptr += enc->minBytesPerChar;
    *valPtr = ptr;
    /* the value may only consist of letters, digits, '.', '-' and '_' */
    for (;; ptr += enc->minBytesPerChar)
    {
        c = toAscii(enc, ptr, end);
        if (c == open)
            break;
        if (!(ASCII_a <= c && c <= ASCII_z)
            && !(ASCII_A <= c && c <= ASCII_Z)
            && !(ASCII_0 <= c && c <= ASCII_9)
            && c != ASCII_PERIOD
            && c != ASCII_MINUS
            && c != ASCII_UNDERSCORE)
        {
            *nextTokPtr = ptr;
            return 0;
        }
    }
    *nextTokPtr = ptr + enc->minBytesPerChar;
    return 1;
}
1057
/*
 * Keywords recognised by doParseXmlDecl(), spelled with the ASCII_*
 * constants instead of character literals and terminated with '\0'.
 */

/* "version" */
static const char KW_version[] =
{
    ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* "encoding" */
static const char KW_encoding[] =
{
    ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* "standalone" */
static const char KW_standalone[] =
{
    ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
};

/* "yes" */
static const char KW_yes[] =
{
    ASCII_y, ASCII_e, ASCII_s, '\0'
};

/* "no" */
static const char KW_no[] =
{
    ASCII_n, ASCII_o, '\0'
};
1082
/*
 * Parse the pseudo-attributes of an XML or text declaration.
 *
 * encodingFinder       callback that maps an encoding name (given as a
 *                      range in the input) to an ENCODING
 * isGeneralTextEntity  nonzero for a text declaration: "version" becomes
 *                      optional, an encoding is required after a version,
 *                      and "standalone" is rejected
 * enc                  encoding of the input
 * ptr, end             range of the whole declaration
 * badPtr               out: position of the error when 0 is returned
 * versionPtr           out, optional: start of the version value
 * versionEndPtr        out, optional: position after the version's
 *                      closing quote
 * encodingName         out, optional: start of the encoding value
 * encoding             out, optional: result of encodingFinder
 * standalone           out, optional: 1 for "yes", 0 for "no"
 *
 * Returns 1 on success, 0 on error.  Optional output parameters may be
 * passed as null pointers and are only written when the corresponding
 * pseudo-attribute is present.
 */
static int doParseXmlDecl(const ENCODING* (* EXPATENTRY encodingFinder)(const ENCODING *,
                                                                        const char *,
                                                                        const char *),
                          int isGeneralTextEntity,
                          const ENCODING * enc,
                          const char *ptr,
                          const char *end,
                          const char **badPtr,
                          const char **versionPtr,
                          const char **versionEndPtr,
                          const char **encodingName,
                          const ENCODING ** encoding,
                          int *standalone)
{
    const char *val = 0;
    const char *name = 0;
    const char *nameEnd = 0;

    /* skip five characters at the start and two at the end --
     * presumably the "<?xml" and "?>" delimiters */
    ptr += 5 * enc->minBytesPerChar;
    end -= 2 * enc->minBytesPerChar;
    /* at least one pseudo-attribute must be present */
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name)
    {
        *badPtr = ptr;
        return 0;
    }
    if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version))
    {
        /* only a text declaration may start with something else */
        if (!isGeneralTextEntity)
        {
            *badPtr = name;
            return 0;
        }
    }
    else
    {
        if (versionPtr)
            *versionPtr = val;
        if (versionEndPtr)
            *versionEndPtr = ptr;
        if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
        {
            *badPtr = ptr;
            return 0;
        }
        if (!name)
        {
            if (isGeneralTextEntity)
            {
                /* a TextDecl must have an EncodingDecl */
                *badPtr = ptr;
                return 0;
            }
            return 1;
        }
    }
    if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding))
    {
        int c = toAscii(enc, val, end);

        /* an encoding name has to start with a letter */
        if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z))
        {
            *badPtr = val;
            return 0;
        }
        if (encodingName)
            *encodingName = val;
        /* ptr is just behind the closing quote, so the value ends one
         * character earlier */
        if (encoding)
            *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
        if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr))
        {
            *badPtr = ptr;
            return 0;
        }
        if (!name)
            return 1;
    }
    /* whatever is left must be "standalone", which a text declaration
     * does not allow */
    if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) || isGeneralTextEntity)
    {
        *badPtr = name;
        return 0;
    }
    if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes))
    {
        if (standalone)
            *standalone = 1;
    }
    else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no))
    {
        if (standalone)
            *standalone = 0;
    }
    else
    {
        *badPtr = val;
        return 0;
    }
    /* only white space may follow the last pseudo-attribute */
    while (isSpace(toAscii(enc, ptr, end)))
        ptr += enc->minBytesPerChar;
    if (ptr != end)
    {
        *badPtr = ptr;
        return 0;
    }
    return 1;
}
1188
1189static int checkCharRefNumber(int result)
1190{
1191 switch (result >> 8)
1192 {
1193 case 0xD8:
1194 case 0xD9:
1195 case 0xDA:
1196 case 0xDB:
1197 case 0xDC:
1198 case 0xDD:
1199 case 0xDE:
1200 case 0xDF:
1201 return -1;
1202 case 0:
1203 if (latin1_encoding.type[result] == BT_NONXML)
1204 return -1;
1205 break;
1206 case 0xFF:
1207 if (result == 0xFFFE || result == 0xFFFF)
1208 return -1;
1209 break;
1210 }
1211 return result;
1212}
1213
1214int XmlUtf8Encode(int c, char *buf)
1215{
1216 enum
1217 {
1218 /* minN is minimum legal resulting value for N byte sequence */
1219 min2 = 0x80,
1220 min3 = 0x800,
1221 min4 = 0x10000
1222 };
1223
1224 if (c < 0)
1225 return 0;
1226 if (c < min2)
1227 {
1228 buf[0] = (c | UTF8_cval1);
1229 return 1;
1230 }
1231 if (c < min3)
1232 {
1233 buf[0] = ((c >> 6) | UTF8_cval2);
1234 buf[1] = ((c & 0x3f) | 0x80);
1235 return 2;
1236 }
1237 if (c < min4)
1238 {
1239 buf[0] = ((c >> 12) | UTF8_cval3);
1240 buf[1] = (((c >> 6) & 0x3f) | 0x80);
1241 buf[2] = ((c & 0x3f) | 0x80);
1242 return 3;
1243 }
1244 if (c < 0x110000)
1245 {
1246 buf[0] = ((c >> 18) | UTF8_cval4);
1247 buf[1] = (((c >> 12) & 0x3f) | 0x80);
1248 buf[2] = (((c >> 6) & 0x3f) | 0x80);
1249 buf[3] = ((c & 0x3f) | 0x80);
1250 return 4;
1251 }
1252 return 0;
1253}
1254
/*
 * Encode the code point charNum as UTF-16 into buf, which must have room
 * for two units.  Returns 1 for values below 0x10000, 2 (a surrogate
 * pair) for values below 0x110000, and 0 -- with buf untouched -- for
 * anything else.
 */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum >= 0x10000)
    {
        /* split the 20 bit offset into high and low surrogate */
        const int offset = charNum - 0x10000;

        buf[0] = (offset >> 10) + 0xD800;
        buf[1] = (offset & 0x3FF) + 0xDC00;
        return 2;
    }

    buf[0] = charNum;
    return 1;
}
1273
/*
 * unknown_encoding:
 *      encoding object for a caller-described encoding, filled in
 *      by XmlInitUnknownEncoding from a 256-entry table and an
 *      optional convert callback.
 */
struct unknown_encoding
{
    /* Must remain the first member: the functions in this file cast
     * the same pointer to ENCODING, struct normal_encoding and
     * struct unknown_encoding. */
    struct normal_encoding normal;
    /* Maps the byte sequence starting at p to a character number.
     * Called for bytes whose utf16[] entry is 0 resp. whose utf8[]
     * length byte is 0. */
    int (*convert) (void *userData, const char *p);
    /* Handed to convert as its first argument. */
    void *userData;
    /* UTF-16 unit for each input byte; 0 means "ask convert". */
    unsigned short utf16[256];
    /* For each input byte: [0] is the number of UTF-8 bytes that
     * follow in [1..3]; a length of 0 means "ask convert". */
    char utf8[256][4];
};
1282
1283int EXPATENTRY XmlSizeOfUnknownEncoding(void)
1284{
1285 return sizeof(struct unknown_encoding);
1286}
1287
1288static int EXPATENTRY unknown_isName(const ENCODING * enc, const char *p)
1289{
1290 int c = ((const struct unknown_encoding *)enc)
1291 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1292
1293 if (c & ~0xFFFF)
1294 return 0;
1295 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1296}
1297
1298static int EXPATENTRY unknown_isNmstrt(const ENCODING * enc, const char *p)
1299{
1300 int c = ((const struct unknown_encoding *)enc)
1301 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1302
1303 if (c & ~0xFFFF)
1304 return 0;
1305 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1306}
1307
1308static int EXPATENTRY unknown_isInvalid(const ENCODING * enc, const char *p)
1309{
1310 int c = ((const struct unknown_encoding *)enc)
1311 ->convert(((const struct unknown_encoding *)enc)->userData, p);
1312
1313 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1314}
1315
1316static void EXPATENTRY unknown_toUtf8(const ENCODING * enc,
1317 const char **fromP,
1318 const char *fromLim,
1319 char **toP,
1320 const char *toLim)
1321{
1322 char buf[XML_UTF8_ENCODE_MAX];
1323
1324 for (;;)
1325 {
1326 const char *utf8;
1327 int n;
1328
1329 if (*fromP == fromLim)
1330 break;
1331 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1332 n = *utf8++;
1333 if (n == 0)
1334 {
1335 int c = ((const struct unknown_encoding *)enc)
1336 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1337
1338 n = XmlUtf8Encode(c, buf);
1339 if (n > toLim - *toP)
1340 break;
1341 utf8 = buf;
1342 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1343 - (BT_LEAD2 - 2);
1344 }
1345 else
1346 {
1347 if (n > toLim - *toP)
1348 break;
1349 (*fromP)++;
1350 }
1351 do
1352 {
1353 *(*toP)++ = *utf8++;
1354 }
1355 while (--n != 0);
1356 }
1357}
1358
1359static void EXPATENTRY unknown_toUtf16(const ENCODING * enc,
1360 const char **fromP,
1361 const char *fromLim,
1362 unsigned short **toP,
1363 const unsigned short *toLim)
1364{
1365 while (*fromP != fromLim && *toP != toLim)
1366 {
1367 unsigned short c
1368 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1369
1370 if (c == 0)
1371 {
1372 c = (unsigned short)((const struct unknown_encoding *)enc)
1373 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1374 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1375 - (BT_LEAD2 - 2);
1376 }
1377 else
1378 (*fromP)++;
1379 *(*toP)++ = c;
1380 }
1381}
1382
1383ENCODING * XmlInitUnknownEncoding(void *mem,
1384 int *table,
1385 int (*convert) (void *userData, const char *p),
1386 void *userData)
1387{
1388 int i;
1389 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1390 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1391
1392 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1393 for (i = 0; i < 128; i++)
1394 if (latin1_encoding.type[i] != BT_OTHER
1395 && latin1_encoding.type[i] != BT_NONXML
1396 && table[i] != i)
1397 return 0;
1398 for (i = 0; i < 256; i++)
1399 {
1400 int c = table[i];
1401
1402 if (c == -1)
1403 {
1404 e->normal.type[i] = BT_MALFORM;
1405 /* This shouldn't really get used. */
1406 e->utf16[i] = 0xFFFF;
1407 e->utf8[i][0] = 1;
1408 e->utf8[i][1] = 0;
1409 }
1410 else if (c < 0)
1411 {
1412 if (c < -4)
1413 return 0;
1414 e->normal.type[i] = BT_LEAD2 - (c + 2);
1415 e->utf8[i][0] = 0;
1416 e->utf16[i] = 0;
1417 }
1418 else if (c < 0x80)
1419 {
1420 if (latin1_encoding.type[c] != BT_OTHER
1421 && latin1_encoding.type[c] != BT_NONXML
1422 && c != i)
1423 return 0;
1424 e->normal.type[i] = latin1_encoding.type[c];
1425 e->utf8[i][0] = 1;
1426 e->utf8[i][1] = (char)c;
1427 e->utf16[i] = c == 0 ? 0xFFFF : c;
1428 }
1429 else if (checkCharRefNumber(c) < 0)
1430 {
1431 e->normal.type[i] = BT_NONXML;
1432 /* This shouldn't really get used. */
1433 e->utf16[i] = 0xFFFF;
1434 e->utf8[i][0] = 1;
1435 e->utf8[i][1] = 0;
1436 }
1437 else
1438 {
1439 if (c > 0xFFFF)
1440 return 0;
1441 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1442 e->normal.type[i] = BT_NMSTRT;
1443 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1444 e->normal.type[i] = BT_NAME;
1445 else
1446 e->normal.type[i] = BT_OTHER;
1447 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1448 e->utf16[i] = c;
1449 }
1450 }
1451 e->userData = userData;
1452 e->convert = convert;
1453 if (convert)
1454 {
1455 e->normal.isName2 = unknown_isName;
1456 e->normal.isName3 = unknown_isName;
1457 e->normal.isName4 = unknown_isName;
1458 e->normal.isNmstrt2 = unknown_isNmstrt;
1459 e->normal.isNmstrt3 = unknown_isNmstrt;
1460 e->normal.isNmstrt4 = unknown_isNmstrt;
1461 e->normal.isInvalid2 = unknown_isInvalid;
1462 e->normal.isInvalid3 = unknown_isInvalid;
1463 e->normal.isInvalid4 = unknown_isInvalid;
1464 }
1465 e->normal.enc.utf8Convert = unknown_toUtf8;
1466 e->normal.enc.utf16Convert = unknown_toUtf16;
1467 return &(e->normal.enc);
1468}
1469
/* Indices of the built-in encodings. If this enumeration is changed,
 * getEncodingIndex and encodings must also be changed. */
enum
{
    UNKNOWN_ENC = -1,       /* name not recognized */
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC = 1,
    UTF_8_ENC = 2,
    UTF_16_ENC = 3,
    UTF_16BE_ENC = 4,
    UTF_16LE_ENC = 5,
    /* must match encodingNames up to here */
    NO_ENC = 6              /* no encoding name given at all */
};
1484
/* Names of the built-in encodings, as compared against by
 * getEncodingIndex. They are spelled with the ASCII_* constants
 * instead of string literals; going by the constant names, the
 * strings are the ones noted on each table. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] =
{
    ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] =
{
    ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] =
{
    ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
};
1509
1510static int getEncodingIndex(const char *name)
1511{
1512 static const char *encodingNames[] =
1513 {
1514 KW_ISO_8859_1,
1515 KW_US_ASCII,
1516 KW_UTF_8,
1517 KW_UTF_16,
1518 KW_UTF_16BE,
1519 KW_UTF_16LE,
1520 };
1521 int i;
1522
1523 if (name == 0)
1524 return NO_ENC;
1525 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1526 if (streqci(name, encodingNames[i]))
1527 return i;
1528 return UNKNOWN_ENC;
1529}
1530
/* For binary compatibility, we store the index of the encoding specified
 * at initialization in the isUtf16 member.
 *
 * INIT_ENC_INDEX(enc) reads that index back as an int;
 * SET_INIT_ENC_INDEX(enc, i) stores i, converted to char. The argument
 * i is parenthesized so that the conversion applies to the whole
 * argument expression and not just to its first operand. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1536
1537/* This is what detects the encoding.
1538 * encodingTable maps from encoding indices to encodings;
1539 * INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1540 * state is XML_CONTENT_STATE if we're parsing an external text entity,
1541 * and XML_PROLOG_STATE otherwise.
1542 */
1543
1544
1545static int EXPATENTRY initScan(const ENCODING ** encodingTable,
1546 const INIT_ENCODING * enc,
1547 int state,
1548 const char *ptr,
1549 const char *end,
1550 const char **nextTokPtr)
1551{
1552 const ENCODING **encPtr;
1553
1554 if (ptr == end)
1555 return XML_TOK_NONE;
1556 encPtr = enc->encPtr;
1557 if (ptr + 1 == end)
1558 {
1559 /* only a single byte available for auto-detection */
1560#ifndef XML_DTD /* FIXME */
1561 /* a well-formed document entity must have more than one byte */
1562 if (state != XML_CONTENT_STATE)
1563 return XML_TOK_PARTIAL;
1564#endif
1565 /* so we're parsing an external text entity... */
1566 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1567 switch (INIT_ENC_INDEX(enc))
1568 {
1569 case UTF_16_ENC:
1570 case UTF_16LE_ENC:
1571 case UTF_16BE_ENC:
1572 return XML_TOK_PARTIAL;
1573 }
1574 switch ((unsigned char)*ptr)
1575 {
1576 case 0xFE:
1577 case 0xFF:
1578 case 0xEF: /* possibly first byte of UTF-8 BOM */
1579 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1580 && state == XML_CONTENT_STATE)
1581 break;
1582 /* fall through */
1583 case 0x00:
1584 case 0x3C:
1585 return XML_TOK_PARTIAL;
1586 }
1587 }
1588 else
1589 {
1590 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1])
1591 {
1592 case 0xFEFF:
1593 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1594 && state == XML_CONTENT_STATE)
1595 break;
1596 *nextTokPtr = ptr + 2;
1597 *encPtr = encodingTable[UTF_16BE_ENC];
1598 return XML_TOK_BOM;
1599 /* 00 3C is handled in the default case */
1600 case 0x3C00:
1601 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1602 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1603 && state == XML_CONTENT_STATE)
1604 break;
1605 *encPtr = encodingTable[UTF_16LE_ENC];
1606 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1607 case 0xFFFE:
1608 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1609 && state == XML_CONTENT_STATE)
1610 break;
1611 *nextTokPtr = ptr + 2;
1612 *encPtr = encodingTable[UTF_16LE_ENC];
1613 return XML_TOK_BOM;
1614 case 0xEFBB:
1615 /* Maybe a UTF-8 BOM (EF BB BF) */
1616 /* If there's an explicitly specified (external) encoding
1617 * of ISO-8859-1 or some flavour of UTF-16
1618 * and this is an external text entity,
1619 * don't look for the BOM,
1620 * because it might be a legal data. */
1621 if (state == XML_CONTENT_STATE)
1622 {
1623 int e = INIT_ENC_INDEX(enc);
1624
1625 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1626 break;
1627 }
1628 if (ptr + 2 == end)
1629 return XML_TOK_PARTIAL;
1630 if ((unsigned char)ptr[2] == 0xBF)
1631 {
1632 *encPtr = encodingTable[UTF_8_ENC];
1633 return XML_TOK_BOM;
1634 }
1635 break;
1636 default:
1637 if (ptr[0] == '\0')
1638 {
1639 /* 0 isn't a legal data character. Furthermore a document entity can only
1640 * start with ASCII characters. So the only way this can fail to be big-endian
1641 * UTF-16 if it it's an external parsed general entity that's labelled as
1642 * UTF-16LE. */
1643 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1644 break;
1645 *encPtr = encodingTable[UTF_16BE_ENC];
1646 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1647 }
1648 else if (ptr[1] == '\0')
1649 {
1650 /* We could recover here in the case:
1651 * - parsing an external entity
1652 * - second byte is 0
1653 * - no externally specified encoding
1654 * - no encoding declaration
1655 * by assuming UTF-16LE. But we don't, because this would mean when
1656 * presented just with a single byte, we couldn't reliably determine
1657 * whether we needed further bytes. */
1658 if (state == XML_CONTENT_STATE)
1659 break;
1660 *encPtr = encodingTable[UTF_16LE_ENC];
1661 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1662 }
1663 break;
1664 }
1665 }
1666 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1667 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1668}
1669
1670
/* First inclusion of the xmltok_ns.c template: NS() and ns() expand
 * to their argument unchanged, so every name the template wraps in
 * them keeps its plain spelling. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns
1676
1677#ifdef XML_NS
1678
/* Second inclusion of the xmltok_ns.c template: NS(x) appends "NS"
 * and ns(x) appends "_ns" to the wrapped name, which yields a second,
 * suffixed set of whatever the template defines. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
1686
1687ENCODING * XmlInitUnknownEncodingNS(void *mem,
1688 int *table,
1689 int (* EXPATENTRY convert) (void *userData, const char *p),
1690 void *userData)
1691{
1692 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1693
1694 if (enc)
1695 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1696 return enc;
1697}
1698
1699#endif /* XML_NS */
Note: See TracBrowser for help on using the repository browser.