source: trunk/src/xmlpatterns/parser/qxquerytokenizer.cpp@ 624

Last change on this file since 624 was 561, checked in by Dmitry A. Kuminov, 16 years ago

trunk: Merged in qt 4.6.1 sources.

File size: 68.5 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation (qt-info@nokia.com)
6**
7** This file is part of the QtXmlPatterns module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at qt-info@nokia.com.
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include <QByteArray>
43
44#include "qquerytransformparser_p.h"
45
46#include "qxquerytokenizer_p.h"
47
48#include "qtokenlookup.cpp"
49
50QT_BEGIN_NAMESPACE
51
52namespace QPatternist
53{
54
/*
 * Skips whitespace and XQuery comments at the current position by calling
 * consumeWhitespace(). If that call reports anything other than SUCCESS,
 * the enclosing function returns a Token carrying that result.
 *
 * The body is wrapped in do { } while(false) so that "handleWhitespace();"
 * expands to exactly one statement, which keeps the macro safe inside an
 * unbraced if/else.
 */
#define handleWhitespace()                                      \
do                                                              \
{                                                               \
    const TokenType whitespaceResult = consumeWhitespace();     \
    if(whitespaceResult != SUCCESS)                             \
        return Token(whitespaceResult);                         \
}                                                               \
while(false)
61
62XQueryTokenizer::XQueryTokenizer(const QString &query,
63 const QUrl &location,
64 const State startingState) : Tokenizer(location)
65 , m_data(query)
66 , m_length(query.length())
67 , m_state(startingState)
68 , m_pos(0)
69 , m_line(1)
70 , m_columnOffset(0)
71 , m_scanOnly(false)
72{
73 Q_ASSERT(location.isValid() || location.isEmpty());
74}
75
76const QChar XQueryTokenizer::current() const
77{
78 if(m_pos < m_length)
79 return m_data.at(m_pos);
80 else
81 return QChar();
82}
83
84char XQueryTokenizer::peekCurrent() const
85{
86 return current().toAscii();
87}
88
89int XQueryTokenizer::peekForColonColon() const
90{
91 /* Note, we don't modify m_pos in this function, so we need to do offset
92 * calculations. */
93 int pos = m_pos;
94
95 while(pos < m_length)
96 {
97 switch(m_data.at(pos).toAscii())
98 {
99 /* Fallthrough these four. */
100 case ' ':
101 case '\t':
102 case '\n':
103 case '\r':
104 break;
105 case ':':
106 {
107 if(peekAhead((pos - m_pos) + 1) == ':')
108 return pos - m_pos;
109 /* Fallthrough. */
110 }
111 default:
112 return -1;
113 }
114 ++pos;
115 }
116
117 return -1;
118}
119
120Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
121 const State s,
122 const int advance)
123{
124 Q_ASSERT(advance >= 0);
125 m_pos += advance;
126 setState(s);
127 return Token(code);
128}
129
130Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
131 const QString &value,
132 const State s)
133{
134 setState(s);
135 return Token(code, value);
136}
137
138Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
139 const int advance)
140{
141 Q_ASSERT(advance >= 0);
142 m_pos += advance;
143 return Token(code);
144}
145
146QString XQueryTokenizer::normalizeEOL(const QString &input,
147 const CharacterSkips &characterSkips)
148{
149 const int len = input.count();
150 QString result;
151
152 /* The likely hood is rather high it'll be the same content. */
153 result.reserve(len);
154
155 for(int i = 0; i < len; ++i)
156 {
157 const QChar &at = input.at(i);
158
159 if(characterSkips.contains(i))
160 {
161 result.append(at);
162 continue;
163 }
164 switch(input.at(i).unicode())
165 {
166 case '\r':
167 {
168 if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
169 ++i;
170
171 /* Else, fallthrough. */
172 }
173 case '\n':
174 {
175 result.append(QLatin1Char('\n'));
176 continue;
177 }
178 default:
179 {
180 result.append(at);
181 }
182 }
183 }
184
185 return result;
186}
187
188Tokenizer::TokenType XQueryTokenizer::consumeComment()
189{
190 /* Below, we return ERROR instead of END_OF_FILE such that the parser
191 * sees an invalid comment. */
192 while(m_pos < m_length)
193 {
194 switch(peekCurrent())
195 {
196 case ':':
197 {
198 ++m_pos; /* Consume ':' */
199 if(atEnd())
200 return ERROR;
201
202 if(peekCurrent() == ')')
203 {
204 ++m_pos; /* Consume ')' */
205 return SUCCESS; /* The comment closed nicely. */
206 }
207 continue; /* We don't want to increment m_pos twice. */
208 }
209 case '(':
210 { /* It looks like the start of a comment. */
211 ++m_pos;
212
213 if(atEnd())
214 return END_OF_FILE;
215 else if(peekCurrent() == ':')
216 {
217 /* And it is a nested comment -- parse it. */
218 const TokenType retval = consumeComment();
219 if(retval == SUCCESS)
220 continue; /* Continue with our "own" comment. */
221 else
222 return retval; /* Return the error in the nested comment. */
223 }
224 break;
225 }
226 case '\n':
227 /* Fallthrough. */
228 case '\r':
229 {
230 /* We want to count \r\n as a single line break. */
231 if(peekAhead() == '\n')
232 ++m_pos;
233
234 m_columnOffset = m_pos;
235 ++m_line;
236
237 break;
238 }
239 }
240 ++m_pos;
241 }
242
243 return ERROR; /* Error: we reached the end while inside a comment. */
244}
245
246bool XQueryTokenizer::consumeRawWhitespace()
247{
248 while(m_pos < m_length)
249 {
250 switch(peekCurrent())
251 {
252 case ' ':
253 case '\t':
254 break;
255 case '\n':
256 case '\r':
257 {
258 if(peekAhead() == '\n')
259 ++m_pos;
260
261 m_columnOffset = m_pos;
262 ++m_line;
263
264 break;
265 }
266 default:
267 return false;
268 }
269 ++m_pos;
270 }
271 return true;
272}
273
274Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
275{
276 while(m_pos < m_length)
277 {
278 switch(peekCurrent())
279 {
280 case ' ':
281 case '\t':
282 break;
283 case '\n':
284 case '\r':
285 {
286 /* We want to count \r\n as a single line break. */
287 if(peekAhead() == '\n')
288 ++m_pos;
289
290 m_columnOffset = m_pos;
291 ++m_line;
292
293 break;
294 }
295 case '(':
296 {
297 if(peekAhead() == ':')
298 {
299 m_pos += 2; /* Consume "(:" */
300
301 const TokenType comment = consumeComment();
302 if(comment == SUCCESS)
303 continue;
304 else
305 return comment;
306 }
307 }
308 default:
309 return SUCCESS;
310 }
311 ++m_pos;
312 }
313
314 return END_OF_FILE;
315}
316
317char XQueryTokenizer::peekAhead(const int length) const
318{
319 if(m_pos + length < m_length)
320 return m_data.at(m_pos + length).toAscii();
321 else
322 return 0;
323}
324
325Tokenizer::Token XQueryTokenizer::error()
326{
327 return Token(ERROR);
328}
329
330bool XQueryTokenizer::isDigit(const char ch)
331{
332 return ch >= '0' && ch <= '9';
333}
334
335/* Replace with function in QXmlUtils. Write test cases for this. */
336bool XQueryTokenizer::isNCNameStart(const QChar ch)
337{
338 if(ch == QLatin1Char('_'))
339 return true;
340
341 switch(ch.category())
342 {
343 case QChar::Letter_Lowercase:
344 case QChar::Letter_Uppercase:
345 case QChar::Letter_Other:
346 case QChar::Letter_Titlecase:
347 case QChar::Number_Letter:
348 return true;
349 default:
350 return false;
351 }
352}
353
354bool XQueryTokenizer::isNCNameBody(const QChar ch)
355{
356 switch(ch.unicode())
357 {
358 case '.':
359 case '_':
360 case '-':
361 return true;
362 }
363
364 switch(ch.category())
365 {
366 case QChar::Letter_Lowercase:
367 case QChar::Letter_Uppercase:
368 case QChar::Letter_Other:
369 case QChar::Letter_Titlecase:
370 case QChar::Number_Letter:
371 case QChar::Mark_SpacingCombining:
372 case QChar::Mark_Enclosing:
373 case QChar::Mark_NonSpacing:
374 case QChar::Letter_Modifier:
375 case QChar::Number_DecimalDigit:
376 return true;
377 default:
378 return false;
379 }
380}
381
382bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
383{
384 switch(code)
385 {
386 /* Fallthrough all these. */
387 case CASTABLE:
388 case CAST:
389 case COPY_NAMESPACES:
390 case DECLARE:
391 case EMPTY:
392 case MODULE:
393 case IMPORT:
394 case INSTANCE:
395 case ORDER:
396 case ORDERING:
397 case XQUERY:
398 case STABLE:
399 case TREAT:
400 return true;
401 default:
402 return false;
403 }
404}
405
406bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
407{
408 switch(code)
409 {
410 /* Fallthrough all these. */
411 case AS:
412 case ASCENDING:
413 case AT:
414 case CASE:
415 case CAST:
416 case CASTABLE:
417 case EQ:
418 case EXTERNAL:
419 case GE:
420 case G_EQ:
421 case G_GT:
422 case G_LT:
423 case G_NE:
424 case GT:
425 case IN:
426 case INHERIT:
427 case INSTANCE:
428 case IS:
429 case ITEM:
430 case LE:
431 case LT:
432 case NE:
433 case NO_INHERIT:
434 case NO_PRESERVE:
435 case OF:
436 case PRESERVE:
437 case RETURN:
438 case STABLE:
439 case TO:
440 case TREAT:
441 return true;
442 default:
443 return false;
444 };
445}
446
447bool XQueryTokenizer::isTypeToken(const TokenType t)
448{
449 switch(t)
450 {
451 /* Fallthrough all these. */
452 case ATTRIBUTE:
453 case COMMENT:
454 case DOCUMENT:
455 case DOCUMENT_NODE:
456 case ELEMENT:
457 case ITEM:
458 case NODE:
459 case PROCESSING_INSTRUCTION:
460 case SCHEMA_ATTRIBUTE:
461 case SCHEMA_ELEMENT:
462 case TEXT:
463 return true;
464 default:
465 return false;
466 }
467}
468
469Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
470{
471 const int start = m_pos;
472
473 const Token t1 = tokenizeNCName();
474 if(t1.hasError())
475 return t1;
476
477 if(peekCurrent() != ':' || peekAhead() == '=')
478 return t1;
479
480 ++m_pos;
481
482 const Token t2 = tokenizeNCName();
483 if(t2.hasError())
484 return t2;
485 else
486 return Token(QNAME, m_data.mid(start, m_pos - start));
487}
488
489Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
490{
491 setState(Operator);
492 const int startPos = m_pos;
493 bool hasDot = false;
494 bool isXPath20 = false;
495
496 for(; m_pos < m_length; ++m_pos)
497 {
498 QChar ch(current());
499
500 char cell = ch.cell();
501
502 if(cell == 'e' || cell == 'E')
503 {
504 isXPath20 = true;
505 ++m_pos;
506 ch = current();
507
508 if(ch.row() != 0)
509 break;
510
511 cell = ch.cell();
512
513 if(cell == '+' || cell == '-')
514 continue;
515 }
516
517 if(isNCNameStart(ch))
518 return error();
519
520 if(cell < '0' || cell > '9')
521 {
522 if(cell == '.' && !hasDot)
523 hasDot = true;
524 else
525 break;
526 }
527 }
528
529 return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
530}
531
532QString XQueryTokenizer::tokenizeCharacterReference()
533{
534 Q_ASSERT(peekCurrent() == '&');
535
536 const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
537
538 if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
539 return QString();
540
541 QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
542 m_pos = theEnd;
543
544 const QChar charRef(charForReference(content));
545
546 if(!charRef.isNull())
547 return charRef;
548 else if(content.startsWith(QLatin1Char('#')))
549 {
550 int base;
551
552 /* It is only '#' or '#x'. */
553 if(content.length() < 2)
554 return QString();
555
556 /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
557 if(content.at(1) == QLatin1Char('x'))
558 {
559 base = 16;
560 content = content.mid(2); /* Remove "#x". */
561 }
562 else
563 {
564 base = 10;
565 content = content.mid(1); /* Remove "#". */
566 }
567
568 bool conversionOK = false;
569 const int codepoint = content.toInt(&conversionOK, base);
570
571 if(conversionOK)
572 {
573 const QChar ch(codepoint);
574
575 if(ch.isNull())
576 {
577 /* We likely have something which require surrogate pairs. */
578 QString result;
579 result += QChar(QChar::highSurrogate(codepoint));
580 result += QChar(QChar::lowSurrogate(codepoint));
581 return result;
582 }
583 else
584 return ch;
585 }
586 else
587 return QString();
588 }
589 else
590 return QString();
591}
592
593int XQueryTokenizer::scanUntil(const char *const content)
594{
595 const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
596
597 if(end == -1)
598 return -1;
599 else
600 {
601 const int len = end - m_pos;
602 m_pos += len;
603 return len;
604 }
605}
606
607QChar XQueryTokenizer::charForReference(const QString &reference)
608{
609 if(m_charRefs.isEmpty())
610 {
611 /* Initialize. */
612 m_charRefs.reserve(5);
613 m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<'));
614 m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>'));
615 m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&'));
616 m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"'));
617 m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
618 }
619
620 return m_charRefs.value(reference);
621}
622
623Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
624{
625 const QChar delimiter(current());
626 /* We cannot unfortunately just scan and then do mid(),
627 * since we can encounter character references. */
628 QString result;
629
630 /* This is more likely than QString's default allocation. */
631 result.reserve(8);
632
633 CharacterSkips skipEOLNormalization;
634
635 /* Advance over the initial quote character. */
636 ++m_pos;
637
638 for(; m_pos < m_length; ++m_pos)
639 {
640 const QChar c(current());
641
642 if(c == QLatin1Char('&'))
643 {
644 const QString charRef(tokenizeCharacterReference());
645
646 if(charRef.isNull())
647 return error();
648 else
649 {
650 skipEOLNormalization.insert(result.count());
651 result.append(charRef);
652 }
653
654 }
655 else if(c == delimiter)
656 {
657 /* Maybe the escaping mechanism is used. For instance, "s""s"
658 * has the value `s"s'. */
659 ++m_pos;
660
661 if(current() == delimiter) /* Double quote. */
662 result += delimiter;
663 else
664 return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
665 }
666 else
667 result += c;
668 }
669
670 return error();
671}
672
673Tokenizer::Token XQueryTokenizer::tokenizeNCName()
674{
675 const int startPos = m_pos;
676
677 if(m_pos < m_length && isNCNameStart(current()))
678 {
679 ++m_pos;
680
681 for(; m_pos < m_length; ++m_pos)
682 {
683 if(!isNCNameBody(current()))
684 break;
685 }
686
687 return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
688 }
689 else
690 return error();
691}
692
693bool XQueryTokenizer::aheadEquals(const char *const chs,
694 const int len,
695 const int offset) const
696{
697 Q_ASSERT(len > 0);
698 Q_ASSERT(qstrlen(chs) == uint(len));
699
700 if(m_pos + len >= m_length)
701 return false;
702
703 for(int i = offset; i < (len + offset); ++i)
704 {
705 if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
706 return false;
707 }
708
709 return true;
710}
711
712const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
713{
714 return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
715}
716
717XQueryTokenizer::State XQueryTokenizer::state() const
718{
719 return m_state;
720}
721
722void XQueryTokenizer::setState(const State s)
723{
724 m_state = s;
725}
726
727void XQueryTokenizer::pushState(const State s)
728{
729 m_stateStack.push(s);
730}
731
732void XQueryTokenizer::pushState()
733{
734 m_stateStack.push(m_state);
735}
736
737void XQueryTokenizer::popState()
738{
739 /* QStack::pop() asserts if it's empty, so we need to check
740 * it, since we might receive unbalanced curlies. */
741 if(!m_stateStack.isEmpty())
742 m_state = m_stateStack.pop();
743}
744
745Tokenizer::Token XQueryTokenizer::nextToken()
746{
747 switch(state())
748 {
749 /* We want to skip or do special whitespace handling for these
750 * states. So fallthrough all of the following. */
751 case AposAttributeContent:
752 case Axis:
753 case ElementContent:
754 case EndTag:
755 case Pragma:
756 case PragmaContent:
757 case ProcessingInstructionName:
758 case QuotAttributeContent:
759 case StartTag:
760 case XMLComment:
761 break;
762 default:
763 handleWhitespace();
764 }
765
766 switch(state())
767 {
768 case XMLSpaceDecl:
769 /* Fallthrough. */
770 case NamespaceKeyword:
771 {
772 switch(peekCurrent())
773 {
774 case ',':
775 return tokenAndAdvance(COMMA);
776 case '"':
777 /* Fallthrough. */
778 case '\'':
779 {
780 setState(NamespaceDecl);
781 return tokenizeStringLiteral();
782 }
783 }
784
785 const Token id(tokenizeNCName());
786
787 if(id.type != NCNAME)
788 return id;
789
790 const TokenMap *const keyword = lookupKeyword(id.value);
791 if(keyword)
792 {
793 switch(keyword->token)
794 {
795 case INHERIT:
796 /* Fallthrough. */
797 case NO_INHERIT:
798 {
799 setState(Default);
800 break;
801 }
802 case NAMESPACE:
803 {
804 setState(NamespaceDecl);
805 break;
806 }
807 case ORDERED:
808 /* Fallthrough. */
809 case UNORDERED:
810 /* Fallthrough. */
811 case STRIP:
812 {
813 setState(Default);
814 break;
815 }
816 case PRESERVE:
817 {
818 if(state() != NamespaceKeyword)
819 setState(Default);
820 }
821 default:
822 break;
823 }
824
825 return Token(keyword->token);
826 }
827 else
828 return id;
829
830 Q_ASSERT(false);
831 }
832 case NamespaceDecl:
833 {
834 switch(peekCurrent())
835 {
836 case '=':
837 return tokenAndAdvance(G_EQ);
838 case ';':
839 return tokenAndChangeState(SEMI_COLON, Default);
840 case '\'':
841 /* Fallthrough. */
842 case '\"':
843 return tokenizeStringLiteral();
844 }
845
846 const Token nc(tokenizeNCName());
847
848 handleWhitespace();
849
850 const char pc = peekCurrent();
851 const TokenMap* const t = lookupKeyword(nc.value);
852
853 if(pc == '\'' || (pc == '"' && t))
854 return tokenAndChangeState(t->token, Default, 0);
855 else
856 return nc;
857
858 Q_ASSERT(false);
859 }
860 case Axis:
861 {
862 if(peekCurrent() == ':')
863 {
864 Q_ASSERT(peekAhead() == ':');
865 m_pos += 2;
866 setState(AfterAxisSeparator);
867 return Token(COLONCOLON);
868 }
869 /* Fallthrough. */
870 }
871 case AfterAxisSeparator:
872 /* Fallthrough. */
873 case Default:
874 /* State Operator and state Default have a lot of tokens in common except
875 * for minor differences. So we treat them the same way, and sprinkles logic
876 * here and there to handle the small differences. */
877 /* Fallthrough. */
878 case Operator:
879 {
880 switch(peekCurrent())
881 {
882 case '=':
883 return tokenAndChangeState(G_EQ, Default);
884 case '-':
885 return tokenAndChangeState(MINUS, Default);
886 case '+':
887 return tokenAndChangeState(PLUS, Default);
888 case '[':
889 return tokenAndChangeState(LBRACKET, Default);
890 case ']':
891 return tokenAndChangeState(RBRACKET, Operator);
892 case ',':
893 return tokenAndChangeState(COMMA, Default);
894 case ';':
895 return tokenAndChangeState(SEMI_COLON, Default);
896 case '$':
897 return tokenAndChangeState(DOLLAR, VarName);
898 case '|':
899 return tokenAndChangeState(BAR, Default);
900 case '?':
901 return tokenAndChangeState(QUESTION, Operator);
902 case ')':
903 return tokenAndChangeState(RPAREN, Operator);
904 case '@':
905 return tokenAndChangeState(AT_SIGN, Default);
906 /* Fallthrough all these. */
907 case '1':
908 case '2':
909 case '3':
910 case '4':
911 case '5':
912 case '6':
913 case '7':
914 case '8':
915 case '9':
916 case '0':
917 return tokenizeNumberLiteral();
918 case '.':
919 {
920 const char next = peekAhead();
921 if(next == '.')
922 return tokenAndChangeState(DOTDOT, Operator, 2);
923 /* .5 is allowed, as short form for 0.5:
924 * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt>
925 */
926 else if(isDigit(next))
927 return tokenizeNumberLiteral();
928 else
929 return tokenAndChangeState(DOT, Operator);
930 }
931 case '\'':
932 /* Fallthrough. */
933 case '"':
934 {
935 setState(Operator);
936 return tokenizeStringLiteral();
937
938 }
939 case '(':
940 {
941 if(peekAhead() == '#')
942 return tokenAndChangeState(PRAGMA_START, Pragma, 2);
943 else
944 return tokenAndChangeState(LPAREN, Default);
945 }
946 case '*':
947 {
948 if(peekAhead() == ':')
949 {
950 m_pos += 2; /* Consume *:. */
951 const Token nc = tokenizeNCName();
952
953 if(nc.hasError())
954 return error();
955 else
956 return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
957 }
958 else
959 return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
960 }
961 case ':':
962 {
963 switch(peekAhead())
964 {
965 case '=':
966 return tokenAndChangeState(ASSIGN, Default, 2);
967 case ':':
968 return tokenAndChangeState(COLONCOLON, Default, 2);
969 default:
970 return error();
971 }
972 }
973 case '!':
974 {
975 if(peekAhead() == '=')
976 return tokenAndChangeState(G_NE, Default, 2);
977 else
978 return error();
979 }
980 case '<':
981 {
982 switch(peekAhead())
983 {
984 case '=':
985 return tokenAndChangeState(G_LE, Default, 2);
986 case '<':
987 return tokenAndChangeState(PRECEDES, Default, 2);
988 case '?':
989 {
990 pushState(Operator);
991 return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
992 }
993 case '!':
994 {
995 if(aheadEquals("!--", 3))
996 {
997 m_pos += 3; /* Consume "!--". */
998 pushState(Operator);
999 return tokenAndChangeState(COMMENT_START, XMLComment);
1000 }
1001 /* Fallthrough. It's a syntax error, and this is a good way to report it. */
1002 }
1003 default:
1004 {
1005 if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
1006 {
1007 /* We assume it's an element constructor. */
1008 pushState(Operator);
1009 }
1010
1011 return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
1012 }
1013 }
1014 }
1015 case '>':
1016 {
1017 switch(peekAhead())
1018 {
1019 case '=':
1020 return tokenAndChangeState(G_GE, Default, 2);
1021 case '>':
1022 return tokenAndChangeState(FOLLOWS, Default, 2);
1023 default:
1024 return tokenAndChangeState(G_GT, Default);
1025 }
1026 }
1027 case '/':
1028 {
1029 if(peekAhead() == '/')
1030 return tokenAndChangeState(SLASHSLASH, Default, 2);
1031 else
1032 return tokenAndChangeState(SLASH, Default);
1033 }
1034 case '{':
1035 {
1036 pushState(Operator);
1037 return tokenAndChangeState(CURLY_LBRACE, Default);
1038 }
1039 case '}':
1040 {
1041 popState();
1042
1043 return tokenAndAdvance(CURLY_RBRACE);
1044 }
1045 }
1046
1047 /* Ok. We're in state Default or Operator, and it wasn't a simple
1048 * character. */
1049
1050 const Token id(tokenizeNCName());
1051
1052 if(id.type != NCNAME)
1053 return id;
1054
1055 const TokenMap *const keyword = lookupKeyword(id.value);
1056
1057 if(state() == Operator)
1058 {
1059 if(keyword)
1060 {
1061 if(keyword->token == DEFAULT || keyword->token == ASCENDING || keyword->token == DESCENDING)
1062 setState(Operator);
1063 else if(keyword->token == RETURN)
1064 setState(Default);
1065 else if(isPhraseKeyword(keyword->token))
1066 {
1067 const TokenType ws = consumeWhitespace();
1068 if(ws == ERROR)
1069 return error();
1070
1071 const Token id2(tokenizeNCName());
1072 const TokenMap *const keyword2 = lookupKeyword(id2.value);
1073
1074 if(keyword2)
1075 {
1076 if(keyword->token == TREAT && keyword2->token == AS)
1077 setState(ItemType);
1078 else if (keyword->token == CAST || (keyword->token == CASTABLE && keyword2->token == AS) || keyword2->token == BY)
1079 setState(Default);
1080
1081 m_tokenStack.push(Token(keyword2->token));
1082 }
1083 else
1084 m_tokenStack.push(id2);
1085
1086 return Token(keyword->token);
1087 }
1088 else
1089 {
1090 /* Such that we tokenize the second token in "empty greatest". */
1091 if(keyword->token != EMPTY)
1092 setState(Default);
1093 }
1094
1095 if(keyword->token == AS || keyword->token == CASE)
1096 setState(ItemType);
1097
1098 return Token(keyword->token);
1099 }
1100 else
1101 return id;
1102 }
1103
1104 Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
1105
1106 /*
1107 * This is hard. Consider this:
1108 *
1109 * Valid: child ::nameTest
1110 * Valid: child:: nameTest
1111 * Syntax Error: child :localName
1112 * Syntax Error: child: localName
1113 *
1114 * Consider "child ::name". Right now, we're here:
1115 * ^
1116 * We don't know whether "child" is a prefix and hence the whitespace is invalid,
1117 * or whether it's an axis and hence skippable. */
1118 {
1119 const int wsLength = peekForColonColon();
1120 /* We cannot call handleWhitespace() because it returns on
1121 * END_OF_FILE, and we have parsed up keyword, and we need to
1122 * deal with that.
1123 *
1124 * If we have a colon colon, which means the whitespace is
1125 * allowed, we skip it. */
1126 if(wsLength != -1)
1127 m_pos += wsLength;
1128 }
1129
1130 /* Handle name tests. */
1131 if(peekCurrent() == ':')
1132 {
1133 switch(peekAhead())
1134 {
1135 case '=':
1136 return id;
1137 case '*':
1138 {
1139 m_pos += 2;
1140 return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
1141 }
1142 case ':':
1143 {
1144 /* We have an axis. */
1145 setState(Axis);
1146 return keyword ? Token(keyword->token) : id;
1147 }
1148 default:
1149 {
1150 /* It's a QName. */
1151 ++m_pos; /* Consume the colon. */
1152
1153 const Token id2(tokenizeNCName());
1154
1155 if(id2.type != NCNAME)
1156 {
1157 --m_pos;
1158 return id;
1159 }
1160
1161 setState(Operator);
1162 const int qNameLen = id.value.length() + id2.value.length() + 1;
1163 return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
1164 }
1165 }
1166 }
1167
1168 if(!keyword || isOperatorKeyword(keyword->token))
1169 {
1170 setState(Operator);
1171 return id;
1172 }
1173
1174 const TokenType ws = consumeWhitespace();
1175 if(ws == ERROR) // TODO this should test for success. Write test.
1176 return Token(ERROR);
1177
1178 if(atEnd())
1179 {
1180 setState(Operator);
1181 return id;
1182 }
1183
1184 /* Let the if-body apply for constructors, and node type tests. */
1185 if(isTypeToken(keyword->token) ||
1186 keyword->token == TYPESWITCH ||
1187 keyword->token == ORDERED ||
1188 keyword->token == UNORDERED ||
1189 keyword->token == IF)
1190 {
1191 switch(peekCurrent())
1192 {
1193 case '(':
1194 {
1195 // TODO See if we can remove DOCUMENT from isTypeToken.
1196 if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
1197 {
1198 m_tokenStack.push(Token(LPAREN));
1199 ++m_pos; /* Consume '('. */
1200 pushState(Operator);
1201
1202 if(keyword->token == PROCESSING_INSTRUCTION)
1203 setState(KindTestForPI);
1204 else
1205 setState(KindTest);
1206
1207 return Token(keyword->token);
1208 }
1209 else if(keyword->token == TYPESWITCH || keyword->token == IF)
1210 return Token(keyword->token);
1211 else /* It's a function call. */
1212 return id;
1213 }
1214 case '{':
1215 {
1216 m_tokenStack.push(Token(CURLY_LBRACE));
1217 ++m_pos; /* Consume '{'. */
1218 pushState(Operator);
1219 /* Stay in state Default. */
1220 return Token(keyword->token);
1221 }
1222 default:
1223 {
1224 /* We have read in a token which is for instance
1225 * "return", and now it can be an element
1226 * test("element") a node kind test("element()"), or a
1227 * computed element constructor("element name {...").
1228 * We need to do a two-token lookahead here, because
1229 * "element return" can be an element test followed by
1230 * the return keyword, but it can also be an element
1231 * constructor("element return {"). */
1232 if(isNCNameStart(current()))
1233 {
1234 const int currentPos = m_pos;
1235 const Token token2 = tokenizeNCNameOrQName();
1236
1237 if(token2.hasError())
1238 return token2;
1239
1240 handleWhitespace();
1241
1242 if(peekCurrent() == '{')
1243 {
1244 /* An element constructor. */
1245 m_tokenStack.push(token2);
1246 return Token(keyword->token);
1247 }
1248
1249 /* We jump back in the stream, we need to tokenize token2 according
1250 * to the state. */
1251 m_pos = currentPos;
1252 setState(Operator);
1253 return Token(NCNAME, QLatin1String(keyword->name));
1254 }
1255 }
1256 }
1257 }
1258
1259 if(peekCurrent() == '$')
1260 {
1261 setState(VarName);
1262 return Token(keyword->token);
1263 }
1264
1265 /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1266 if(peekCurrent() == '(')
1267 return id;
1268 else if(peekCurrent() == '{' && keyword->token == VALIDATE)
1269 return Token(keyword->token);
1270
1271 if(!isNCNameStart(current()))
1272 {
1273 setState(Operator);
1274 return id;
1275 }
1276
1277 const Token id2(tokenizeNCName());
1278 const TokenMap *const keyword2 = lookupKeyword(id2.value);
1279
1280 if(!keyword2)
1281 {
1282 /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
1283 setState(Operator);
1284 return id;
1285 }
1286
1287 switch(keyword->token)
1288 {
1289 case DECLARE:
1290 {
1291 switch(keyword2->token)
1292 {
1293 case VARIABLE:
1294 /* Fallthrough. */
1295 case FUNCTION:
1296 {
1297 m_tokenStack.push(Token(keyword2->token));
1298 setState(Default);
1299 return Token(keyword->token);
1300 }
1301 case OPTION:
1302 {
1303 m_tokenStack.push(Token(keyword2->token));
1304 setState(Default);
1305 return Token(keyword->token);
1306 }
1307 case COPY_NAMESPACES:
1308 /* Fallthrough. */
1309 case ORDERING:
1310 {
1311 m_tokenStack.push(Token(keyword2->token));
1312 setState(NamespaceKeyword);
1313 return Token(keyword->token);
1314 }
1315 case CONSTRUCTION:
1316 {
1317 // TODO identical to CONSTRUCTION?
1318 m_tokenStack.push(Token(keyword2->token));
1319 setState(Operator);
1320 return Token(keyword->token);
1321 }
1322 case NAMESPACE:
1323 /* Fallthrough. */
1324 case BASEURI:
1325 {
1326 m_tokenStack.push(Token(keyword2->token));
1327 setState(NamespaceDecl);
1328 return Token(keyword->token);
1329 }
1330 case BOUNDARY_SPACE:
1331 {
1332 m_tokenStack.push(Token(keyword2->token));
1333 setState(XMLSpaceDecl);
1334 return Token(keyword->token);
1335 }
1336 case DEFAULT:
1337 {
1338 m_tokenStack.push(Token(keyword2->token));
1339
1340 const TokenType ws2 = consumeWhitespace();
1341 if(ws2 != SUCCESS)
1342 {
1343 m_tokenStack.prepend(Token(ws2));
1344 return Token(keyword->token);
1345 }
1346
1347 const Token id3(tokenizeNCName());
1348
1349 if(id3.type != NCNAME)
1350 {
1351 m_tokenStack.prepend(id3);
1352 return Token(keyword->token);
1353 }
1354
1355 const TokenMap *const keyword3 = lookupKeyword(id3.value);
1356 if(!keyword3)
1357 {
1358 m_tokenStack.prepend(id3);
1359 return Token(keyword->token);
1360 }
1361 else
1362 {
1363 m_tokenStack.prepend(Token(keyword3->token));
1364
1365 if(keyword3->token == ORDER)
1366 setState(Operator);
1367 else
1368 setState(NamespaceDecl);
1369 }
1370
1371 return Token(keyword->token);
1372 }
1373 default:
1374 {
1375 m_tokenStack.push(Token(keyword2->token));
1376 setState(Default);
1377 return id;
1378 }
1379 }
1380 }
1381 case XQUERY:
1382 {
1383 m_tokenStack.push(Token(keyword2->token));
1384
1385 if(keyword2->token == VERSION)
1386 {
1387 setState(NamespaceDecl);
1388 return Token(keyword->token);
1389 }
1390 else
1391 {
1392 setState(Operator);
1393 return id;
1394 }
1395 }
1396 case IMPORT:
1397 {
1398 m_tokenStack.push(Token(keyword2->token));
1399
1400 switch(keyword2->token)
1401 {
1402 case SCHEMA:
1403 /* Fallthrough. */
1404 case MODULE:
1405 {
1406 setState(NamespaceKeyword);
1407 return Token(keyword->token);
1408 }
1409 default:
1410 {
1411 setState(Operator);
1412 return id;
1413 }
1414 }
1415 }
1416 case VALIDATE:
1417 {
1418 m_tokenStack.push(Token(keyword2->token));
1419
1420 switch(keyword2->token)
1421 {
1422 case LAX:
1423 case STRICT:
1424 {
1425 pushState(Operator);
1426 return Token(keyword->token);
1427 }
1428 default:
1429 {
1430 setState(Operator);
1431 return id;
1432 }
1433 }
1434 }
1435 default:
1436 {
1437 m_tokenStack.push(Token(keyword2->token));
1438 setState(Operator);
1439 return id;
1440 }
1441 }
1442
1443 Q_ASSERT(false);
1444
1445 }
1446 case VarName:
1447 {
1448 if(peekCurrent() == '$')
1449 return tokenAndAdvance(DOLLAR);
1450
1451 setState(Operator);
1452 return tokenizeNCNameOrQName();
1453 Q_ASSERT(false);
1454 }
1455 case ItemType:
1456 {
1457 switch(peekCurrent())
1458 {
1459 case '(':
1460 return tokenAndChangeState(LPAREN, KindTest);
1461 case '$':
1462 return tokenAndChangeState(DOLLAR, VarName);
1463 }
1464
1465 const Token name(tokenizeNCNameOrQName());
1466
1467 if(name.hasError())
1468 return error();
1469
1470 else if(name.type == QNAME)
1471 {
1472 setState(OccurrenceIndicator);
1473 return name;
1474 }
1475 else
1476 {
1477 const TokenMap *const keyword = lookupKeyword(name.value);
1478
1479 if(keyword)
1480 {
1481 pushState(OccurrenceIndicator);
1482 return Token(keyword->token);
1483 }
1484 else
1485 {
1486 setState(Default);
1487 return name;
1488 }
1489 }
1490 Q_ASSERT(false);
1491 }
1492 case KindTest:
1493 {
1494 switch(peekCurrent())
1495 {
1496 case ')':
1497 {
1498 popState();
1499 return tokenAndAdvance(RPAREN);
1500 }
1501 case '(':
1502 return tokenAndAdvance(LPAREN);
1503 case ',':
1504 return tokenAndAdvance(COMMA);
1505 case '*':
1506 return tokenAndAdvance(STAR);
1507 case '?':
1508 return tokenAndAdvance(QUESTION);
1509 case '\'':
1510 /* Fallthrough. */
1511 case '"':
1512 return tokenizeStringLiteral();
1513 }
1514
1515 const Token nc(tokenizeNCNameOrQName());
1516 if(nc.hasError())
1517 return nc;
1518
1519 const TokenType ws = consumeWhitespace();
1520 if(ws == ERROR)
1521 return error();
1522
1523 if(peekCurrent() == '(')
1524 {
1525 const TokenMap *const keyword = lookupKeyword(nc.value);
1526 if(keyword)
1527 {
1528 pushState(KindTest);
1529 return Token(keyword->token);
1530 }
1531 else
1532 return nc;
1533 }
1534 else
1535 return nc;
1536 Q_ASSERT(false);
1537 }
1538 case KindTestForPI:
1539 {
1540 switch(peekCurrent())
1541 {
1542 case ')':
1543 {
1544 popState();
1545 return tokenAndAdvance(RPAREN);
1546 }
1547 case '\'':
1548 /* Fallthrough. */
1549 case '"':
1550 return tokenizeStringLiteral();
1551 default:
1552 return tokenizeNCName();
1553 }
1554 Q_ASSERT(false);
1555 }
1556 case OccurrenceIndicator:
1557 {
1558 switch(peekCurrent())
1559 {
1560 case '?':
1561 return tokenAndChangeState(QUESTION, Operator);
1562 case '*':
1563 return tokenAndChangeState(STAR, Operator);
1564 case '+':
1565 return tokenAndChangeState(PLUS, Operator);
1566 default:
1567 {
1568 setState(Operator);
1569 return nextToken();
1570 }
1571 }
1572 Q_ASSERT(false);
1573 }
1574 case XQueryVersion:
1575 {
1576 switch(peekCurrent())
1577 {
1578 case '\'':
1579 /* Fallthrough. */
1580 case '"':
1581 return tokenizeStringLiteral();
1582 case ';':
1583 return tokenAndChangeState(SEMI_COLON, Default);
1584 }
1585
1586 const Token id(tokenizeNCName());
1587
1588 if(id.type != NCNAME)
1589 return id;
1590
1591 const TokenMap *const keyword = lookupKeyword(id.value);
1592 if(keyword)
1593 return tokenAndChangeState(keyword->token, Default);
1594 else
1595 return id;
1596 Q_ASSERT(false);
1597 }
1598 case StartTag:
1599 {
1600 if(peekAhead(-1) == '<')
1601 {
1602 if(current().isSpace())
1603 return Token(ERROR);
1604 }
1605 else
1606 {
1607 if(consumeRawWhitespace())
1608 return Token(END_OF_FILE);
1609 }
1610
1611 switch(peekCurrent())
1612 {
1613 case '/':
1614 {
1615 if(peekAhead() == '>')
1616 {
1617 m_pos += 2;
1618
1619 if(m_scanOnly)
1620 return Token(POSITION_SET);
1621 else
1622 {
1623 popState();
1624 return Token(QUICK_TAG_END);
1625 }
1626 }
1627 else
1628 return error();
1629 }
1630 case '>':
1631 {
1632 if(m_scanOnly)
1633 return tokenAndChangeState(POSITION_SET, StartTag);
1634 else
1635 return tokenAndChangeState(G_GT, ElementContent);
1636 }
1637 case '=':
1638 return tokenAndAdvance(G_EQ);
1639 case '\'':
1640 return tokenAndChangeState(APOS, AposAttributeContent);
1641 case '"':
1642 return tokenAndChangeState(QUOTE, QuotAttributeContent);
1643 default:
1644 return tokenizeNCNameOrQName();
1645 }
1646 Q_ASSERT(false);
1647 }
1648 case AposAttributeContent:
1649 /* Fallthrough. */
1650 case QuotAttributeContent:
1651 {
1652 const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1653 QString result;
1654 result.reserve(20);
1655
1656 if(m_scanOnly)
1657 {
1658 int stack = 0;
1659 return attributeAsRaw(sep, stack, m_pos, true, result);
1660 }
1661
1662 Q_ASSERT(!m_scanOnly);
1663 while(true)
1664 {
1665 if(atEnd())
1666 {
1667 /* In the case that the XSL-T tokenizer invokes us with
1668 * default state QuotAttributeContent, we need to be able
1669 * to return a single string, in case that is all we have
1670 * accumulated. */
1671 if(result.isEmpty())
1672 return Token(END_OF_FILE);
1673 else
1674 return Token(STRING_LITERAL, result);
1675 }
1676
1677 const QChar curr(current());
1678
1679 if(curr == sep)
1680 {
1681 if(m_pos + 1 == m_length)
1682 return Token(END_OF_FILE);
1683
1684 if(m_data.at(m_pos + 1) == sep)
1685 {
1686 /* The quoting mechanism was used. */
1687 m_pos += 2;
1688 result.append(sep);
1689 continue;
1690 }
1691
1692 const QChar next(m_data.at(m_pos + 1));
1693 if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1694 return Token(ERROR); // i18n Space must separate attributes
1695 else if(result.isEmpty())
1696 {
1697 return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
1698 StartTag, 1);
1699 }
1700 else
1701 {
1702 /* Don't consume the sep, but leave it so we next time return a token for it. */
1703 return Token(STRING_LITERAL, result);
1704 }
1705
1706 ++m_pos;
1707 continue;
1708 }
1709 else if(curr == QLatin1Char('{'))
1710 {
1711 if(m_pos + 1 == m_length)
1712 return Token(END_OF_FILE);
1713 else if(peekAhead() == '{')
1714 {
1715 ++m_pos;
1716 result.append(QLatin1Char('{'));
1717 }
1718 else
1719 {
1720 if(result.isEmpty())
1721 {
1722 /* The Attribute Value Template appeared directly in the attribute. */
1723 pushState();
1724 return tokenAndChangeState(CURLY_LBRACE, Default);
1725 }
1726 else
1727 {
1728 /* We don't advance, keep '{' as next token. */
1729 return Token(STRING_LITERAL, result);
1730 }
1731 }
1732 }
1733 else if(curr == QLatin1Char('}'))
1734 {
1735 if(m_pos + 1 == m_length)
1736 return Token(END_OF_FILE);
1737 else if(peekAhead() == '}')
1738 {
1739 ++m_pos;
1740 result.append(QLatin1Char('}'));
1741 }
1742 else
1743 return Token(ERROR);
1744 }
1745 else if(curr == QLatin1Char('&'))
1746 {
1747 const QString ret(tokenizeCharacterReference());
1748 if(ret.isNull())
1749 return Token(ERROR);
1750 else
1751 result.append(ret);
1752 }
1753 else if(curr == QLatin1Char('<'))
1754 return Token(STRING_LITERAL, result);
1755 else
1756 {
1757 /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1758 * 3.3.3 Attribute-Value Normalization.
1759 *
1760 * However, it is complicated a bit by that AVN is defined on top of
1761 * EOL normalization and we do those two in one go here. */
1762 switch(curr.unicode())
1763 {
1764 case 0xD:
1765 {
1766 if(peekAhead() == '\n')
1767 {
1768 result.append(QLatin1Char(' '));
1769 ++m_pos;
1770 break;
1771 }
1772 }
1773 case 0xA:
1774 /* Fallthrough. */
1775 case 0x9:
1776 {
1777 result.append(QLatin1Char(' '));
1778 break;
1779 }
1780 default:
1781 result.append(curr);
1782 }
1783 }
1784
1785 ++m_pos;
1786 }
1787 Q_ASSERT(false);
1788 }
1789 case ElementContent:
1790 {
1791 QString result;
1792 result.reserve(20);
1793
1794 /* Whether the text node, result, may be whitespace only. Character references
1795 * and CDATA sections disables that. */
1796 bool mayBeWS = true;
1797
1798 CharacterSkips skipEOLNormalization;
1799
1800 while(true)
1801 {
1802 if(atEnd())
1803 return Token(END_OF_FILE);
1804
1805 switch(peekCurrent())
1806 {
1807 case '<':
1808 {
1809 if(!result.isEmpty() && peekAhead(2) != '[')
1810 {
1811 /* We encountered the end, and it was not a CDATA section. */
1812 /* We don't advance. Next time we'll handle the <... stuff. */
1813 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1814 }
1815
1816 ++m_pos;
1817 if(atEnd())
1818 return Token(END_OF_FILE);
1819
1820 const QChar ahead(current());
1821 if(ahead.isSpace())
1822 return error();
1823 else if(ahead == QLatin1Char('/'))
1824 {
1825 if(m_pos + 1 == m_length)
1826 return Token(END_OF_FILE);
1827 else if(m_data.at(m_pos + 1).isSpace())
1828 return error();
1829 else
1830 return tokenAndChangeState(BEGIN_END_TAG, EndTag);
1831 }
1832 else if(isNCNameStart(ahead))
1833 {
1834 pushState();
1835 return tokenAndChangeState(G_LT, StartTag, 0);
1836 }
1837 else if(aheadEquals("!--", 3, 0))
1838 {
1839 pushState();
1840 m_pos += 3;
1841 return tokenAndChangeState(COMMENT_START, XMLComment, 0);
1842 }
1843 else if(aheadEquals("![CDATA[", 8, 0))
1844 {
1845 mayBeWS = false;
1846 m_pos += 8;
1847 const int start = m_pos;
1848 const int len = scanUntil("]]>");
1849
1850 if(len == -1)
1851 return Token(END_OF_FILE);
1852
1853 m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1854 result.append(m_data.mid(start, len));
1855 break;
1856 }
1857 else if(ahead == QLatin1Char('?'))
1858 {
1859 pushState();
1860 return tokenAndChangeState(PI_START, ProcessingInstructionName);
1861 }
1862 else
1863 return Token(G_LT);
1864 }
1865 case '&':
1866 {
1867 const QString ret(tokenizeCharacterReference());
1868 if(ret.isNull())
1869 return Token(ERROR);
1870 else
1871 {
1872 skipEOLNormalization.insert(result.count());
1873 result.append(ret);
1874 mayBeWS = false;
1875 break;
1876 }
1877 }
1878 case '{':
1879 {
1880 // TODO remove this check, also below.
1881 if(m_pos + 1 == m_length)
1882 return Token(END_OF_FILE);
1883 else if(peekAhead() == '{')
1884 {
1885 ++m_pos;
1886 result.append(QLatin1Char('{'));
1887 }
1888 else
1889 {
1890 if(result.isEmpty())
1891 {
1892 pushState();
1893 return tokenAndChangeState(CURLY_LBRACE, Default);
1894 }
1895 else
1896 {
1897 /* We don't advance here. */
1898 return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1899 }
1900 }
1901 break;
1902 }
1903 case '}':
1904 {
1905 if(m_pos + 1 == m_length)
1906 return Token(END_OF_FILE);
1907 else if(peekAhead() == '}')
1908 {
1909 ++m_pos;
1910 result.append(QLatin1Char('}'));
1911 }
1912 else
1913 {
1914 /* This is a parse error, and the grammar won't be able
1915 * to reduce this CURLY_RBRACE. */
1916 return tokenAndChangeState(CURLY_RBRACE, Default);
1917 }
1918 break;
1919 }
1920 case '\n':
1921 {
1922 /* We want to translate \r\n into \n. */
1923 if(peekAhead(-1) == '\r')
1924 break;
1925 /* else, fallthrough. */
1926 }
1927 case '\r':
1928 {
1929 result.append(QLatin1Char('\n'));
1930 break;
1931 }
1932 default:
1933 {
1934 result.append(current());
1935 break;
1936 }
1937 }
1938 ++m_pos;
1939 }
1940 Q_ASSERT(false);
1941 }
1942 case ProcessingInstructionName:
1943 {
1944 const int start = m_pos;
1945
1946 while(true)
1947 {
1948 ++m_pos;
1949 if(m_pos >= m_length)
1950 return Token(END_OF_FILE);
1951
1952 const QChar next(current());
1953 if(next.isSpace() || next == QLatin1Char('?'))
1954 {
1955 return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
1956 ProcessingInstructionContent);
1957 }
1958 }
1959 Q_ASSERT(false);
1960 }
1961 case ProcessingInstructionContent:
1962 {
1963 /* Consume whitespace between the name and the content. */
1964 if(consumeRawWhitespace())
1965 return Token(END_OF_FILE);
1966
1967 const int start = m_pos;
1968 const int len = scanUntil("?>");
1969
1970 if(len == -1)
1971 return Token(END_OF_FILE);
1972 else
1973 {
1974 m_pos += 2; /* Consume "?>" */
1975 popState();
1976 return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
1977 }
1978 Q_ASSERT(false);
1979 }
1980 case EndTag:
1981 {
1982 if(consumeRawWhitespace())
1983 return END_OF_FILE;
1984
1985 if(peekCurrent() == '>')
1986 {
1987 popState();
1988 return tokenAndAdvance(G_GT);
1989 }
1990 else
1991 return tokenizeNCNameOrQName();
1992 Q_ASSERT(false);
1993 }
1994 case XMLComment:
1995 {
1996 const int start = m_pos;
1997 const int len = scanUntil("--");
1998
1999 if(len == -1)
2000 return END_OF_FILE;
2001 else
2002 {
2003 m_pos += 2; /* Consume "--". */
2004 popState();
2005
2006 if(peekCurrent() == '>')
2007 {
2008 ++m_pos;
2009 return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
2010 }
2011 else
2012 return error();
2013 }
2014 Q_ASSERT(false);
2015 }
2016 case Pragma:
2017 {
2018 /* Consume whitespace. */
2019 if(consumeRawWhitespace())
2020 return Token(END_OF_FILE);
2021
2022 setState(PragmaContent);
2023 return tokenizeNCNameOrQName();
2024 }
2025 case PragmaContent:
2026 {
2027 QString result;
2028 result.reserve(20);
2029
2030 const bool hasWS = m_pos < m_length && current().isSpace();
2031
2032 /* Consume all whitespace up to the pragma content(if any). */
2033 if(consumeRawWhitespace())
2034 return Token(END_OF_FILE);
2035
2036 if(peekCurrent() == '#' && peekAhead() == ')')
2037 {
2038 /* We reached the end, and there's no pragma content. */
2039 return tokenAndChangeState(PRAGMA_END, Default, 2);
2040 }
2041 else if(!hasWS)
2042 {
2043 /* A separating space is required if there's pragma content. */
2044 return error(); /* i18n */
2045 }
2046
2047 const int start = m_pos;
2048 const int len = scanUntil("#)");
2049 if(len == -1)
2050 return Token(END_OF_FILE);
2051
2052 return Token(STRING_LITERAL, m_data.mid(start, len));
2053 Q_ASSERT(false);
2054 }
2055 }
2056
2057 Q_ASSERT(false);
2058 return error();
2059}
2060
2061Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
2062 int &sepStack,
2063 const int startPos,
2064 const bool aInLiteral,
2065 QString &result)
2066{
2067 bool inLiteral = aInLiteral;
2068 const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2069
2070 while(true)
2071 {
2072 if(atEnd())
2073 return END_OF_FILE;
2074
2075 if(peekCurrent() == sep.unicode())
2076 {
2077 if(inLiteral)
2078 inLiteral = false;
2079 else
2080 inLiteral = true;
2081
2082 if(peekAhead() == sep.unicode())
2083 {
2084 /* The quoting mechanism was used. */
2085 result.append(current());
2086 m_pos += 2;
2087 continue;
2088 }
2089 else
2090 {
2091 /* Don't consume the separator, such that we
2092 * return a token for it next time. */
2093 if(m_pos == startPos)
2094 {
2095 ++m_pos;
2096 setState(StartTag);
2097 return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
2098 }
2099
2100
2101 if(sepStack == 0)
2102 {
2103 return Token(STRING_LITERAL, result);
2104 }
2105 else
2106 {
2107 result.append(current());
2108 ++m_pos;
2109 continue;
2110 }
2111 }
2112 }
2113 else if(peekCurrent() == '&')
2114 {
2115 const QString ret(tokenizeCharacterReference());
2116 if(ret.isNull())
2117 return Token(ERROR);
2118 else
2119 {
2120 result.append(ret);
2121 ++m_pos;
2122 continue;
2123 }
2124 }
2125 else if(peekCurrent() == otherSep)
2126 {
2127 result.append(current());
2128 ++m_pos;
2129
2130 if(peekCurrent() == otherSep)
2131 ++m_pos;
2132
2133 if(inLiteral)
2134 inLiteral = false;
2135 else
2136 inLiteral = true;
2137
2138 continue;
2139 }
2140 else if(peekCurrent() == '{')
2141 {
2142 result.append(current());
2143
2144 if(peekAhead() == '{')
2145 {
2146 m_pos += 2;
2147 continue;
2148 }
2149 else
2150 {
2151 ++m_pos;
2152 ++sepStack;
2153 const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
2154 if(t.type != SUCCESS)
2155 return t;
2156 }
2157
2158 }
2159 else if(peekCurrent() == '}')
2160 {
2161 if(inLiteral && peekAhead() == '}')
2162 {
2163 result.append(current());
2164 m_pos += 2;
2165 continue;
2166 }
2167 else
2168 {
2169 ++m_pos;
2170 --sepStack;
2171 return Token(SUCCESS); /* The return value is arbitrary. */
2172 }
2173 }
2174 else
2175 {
2176 result.append(current());
2177 ++m_pos;
2178 }
2179 }
2180}
2181
2182Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
2183{
2184 sourceLocator->first_line = m_line;
2185 sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2186
2187 if(m_tokenStack.isEmpty())
2188 return nextToken();
2189 else
2190 {
2191 const Token retval(m_tokenStack.pop());
2192
2193 switch(retval.type)
2194 {
2195 case MODULE:
2196 /* Fallthrough.*/
2197 case SCHEMA:
2198 /* Fallthrough.*/
2199 case COPY_NAMESPACES:
2200 {
2201 setState(NamespaceKeyword);
2202 break;
2203 }
2204 case VERSION:
2205 {
2206 setState(XQueryVersion);
2207 break;
2208 }
2209 case AS:
2210 /* Fallthrough. */
2211 case OF:
2212 {
2213 setState(ItemType);
2214 break;
2215 }
2216 default:
2217 {
2218 if(isOperatorKeyword(retval.type))
2219 setState(Default);
2220
2221 break;
2222 }
2223 };
2224
2225 return retval;
2226 }
2227}
2228
2229int XQueryTokenizer::commenceScanOnly()
2230{
2231 m_scanOnly = true;
2232 return m_pos;
2233}
2234
2235void XQueryTokenizer::resumeTokenizationFrom(const int pos)
2236{
2237 m_scanOnly = false;
2238 m_pos = pos;
2239}
2240
2241void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
2242{
2243}
2244
2245#undef handleWhitespace
2246
2247} // namespace QPatternist
2248
2249QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.