Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

qxquerytokenizer.cpp@ 490

Last change on this file since 490 was 2, checked in by Dmitry A. Kuminov, 16 years ago
Initially imported qt-all-opensource-src-4.5.1 from Trolltech.
File size: 68.5 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4	** Contact: Qt Software Information (qt-info@nokia.com)
5	**
6	** This file is part of the QtXmlPatterns module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial Usage
10	** Licensees holding valid Qt Commercial licenses may use this file in
11	** accordance with the Qt Commercial License Agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and Nokia.
14	**
15	** GNU Lesser General Public License Usage
16	** Alternatively, this file may be used under the terms of the GNU Lesser
17	** General Public License version 2.1 as published by the Free Software
18	** Foundation and appearing in the file LICENSE.LGPL included in the
19	** packaging of this file. Please review the following information to
20	** ensure the GNU Lesser General Public License version 2.1 requirements
21	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22	**
23	** In addition, as a special exception, Nokia gives you certain
24	** additional rights. These rights are described in the Nokia Qt LGPL
25	** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26	** package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you are unsure which license is appropriate for your use, please
37	** contact the sales department at qt-sales@nokia.com.
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include <QByteArray>
43
44	#include "qquerytransformparser_p.h"
45
46	#include "qxquerytokenizer_p.h"
47
48	#include "qtokenlookup.cpp"
49
50	QT_BEGIN_NAMESPACE
51
52	namespace QPatternist
53	{
54
/* Skips whitespace/comments via consumeWhitespace(). If the result is anything
 * other than SUCCESS, the *enclosing* function returns a Token carrying that
 * result. Must be used inside a function returning Token. */
#define handleWhitespace()                                          \
{                                                                   \
    const TokenType whitespaceResult = consumeWhitespace();         \
    if(whitespaceResult != SUCCESS)                                 \
        return Token(whitespaceResult);                             \
}
61
62	XQueryTokenizer::XQueryTokenizer(const QString &query,
63	const QUrl &location,
64	const State startingState) : Tokenizer(location)
65	, m_data(query)
66	, m_length(query.length())
67	, m_state(startingState)
68	, m_pos(0)
69	, m_line(1)
70	, m_columnOffset(0)
71	, m_scanOnly(false)
72	{
73	Q_ASSERT(location.isValid() \|\| location.isEmpty());
74	}
75
76	const QChar XQueryTokenizer::current() const
77	{
78	if(m_pos < m_length)
79	return m_data.at(m_pos);
80	else
81	return QChar();
82	}
83
84	char XQueryTokenizer::peekCurrent() const
85	{
86	return current().toAscii();
87	}
88
89	int XQueryTokenizer::peekForColonColon() const
90	{
91	/* Note, we don't modify m_pos in this function, so we need to do offset
92	* calculations. */
93	int pos = m_pos;
94
95	while(pos < m_length)
96	{
97	switch(m_data.at(pos).toAscii())
98	{
99	/* Fallthrough these four. */
100	case ' ':
101	case '\t':
102	case '\n':
103	case '\r':
104	break;
105	case ':':
106	{
107	if(peekAhead((pos - m_pos) + 1) == ':')
108	return pos - m_pos;
109	/* Fallthrough. */
110	}
111	default:
112	return -1;
113	}
114	++pos;
115	}
116
117	return -1;
118	}
119
120	Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
121	const State s,
122	const int advance)
123	{
124	Q_ASSERT(advance >= 0);
125	m_pos += advance;
126	setState(s);
127	return Token(code);
128	}
129
130	Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
131	const QString &value,
132	const State s)
133	{
134	setState(s);
135	return Token(code, value);
136	}
137
138	Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
139	const int advance)
140	{
141	Q_ASSERT(advance >= 0);
142	m_pos += advance;
143	return Token(code);
144	}
145
146	QString XQueryTokenizer::normalizeEOL(const QString &input,
147	const CharacterSkips &characterSkips)
148	{
149	const int len = input.count();
150	QString result;
151
152	/* The likely hood is rather high it'll be the same content. */
153	result.reserve(len);
154
155	for(int i = 0; i < len; ++i)
156	{
157	const QChar &at = input.at(i);
158
159	if(characterSkips.contains(i))
160	{
161	result.append(at);
162	continue;
163	}
164	switch(input.at(i).unicode())
165	{
166	case '\r':
167	{
168	if(i + 1 < len && input.at(i + 1) == QLatin1Char('\n'))
169	++i;
170
171	/* Else, fallthrough. */
172	}
173	case '\n':
174	{
175	result.append(QLatin1Char('\n'));
176	continue;
177	}
178	default:
179	{
180	result.append(at);
181	}
182	}
183	}
184
185	return result;
186	}
187
188	Tokenizer::TokenType XQueryTokenizer::consumeComment()
189	{
190	/* Below, we return ERROR instead of END_OF_FILE such that the parser
191	* sees an invalid comment. */
192	while(m_pos < m_length)
193	{
194	switch(peekCurrent())
195	{
196	case ':':
197	{
198	++m_pos; /* Consume ':' */
199	if(atEnd())
200	return ERROR;
201
202	if(peekCurrent() == ')')
203	{
204	++m_pos; /* Consume ')' */
205	return SUCCESS; /* The comment closed nicely. */
206	}
207	continue; /* We don't want to increment m_pos twice. */
208	}
209	case '(':
210	{ /* It looks like the start of a comment. */
211	++m_pos;
212
213	if(atEnd())
214	return END_OF_FILE;
215	else if(peekCurrent() == ':')
216	{
217	/* And it is a nested comment -- parse it. */
218	const TokenType retval = consumeComment();
219	if(retval == SUCCESS)
220	continue; /* Continue with our "own" comment. */
221	else
222	return retval; /* Return the error in the nested comment. */
223	}
224	break;
225	}
226	case '\n':
227	/* Fallthrough. */
228	case '\r':
229	{
230	/* We want to count \r\n as a single line break. */
231	if(peekAhead() == '\n')
232	++m_pos;
233
234	m_columnOffset = m_pos;
235	++m_line;
236
237	break;
238	}
239	}
240	++m_pos;
241	}
242
243	return ERROR; /* Error: we reached the end while inside a comment. */
244	}
245
246	bool XQueryTokenizer::consumeRawWhitespace()
247	{
248	while(m_pos < m_length)
249	{
250	switch(peekCurrent())
251	{
252	case ' ':
253	case '\t':
254	break;
255	case '\n':
256	case '\r':
257	{
258	if(peekAhead() == '\n')
259	++m_pos;
260
261	m_columnOffset = m_pos;
262	++m_line;
263
264	break;
265	}
266	default:
267	return false;
268	}
269	++m_pos;
270	}
271	return true;
272	}
273
274	Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
275	{
276	while(m_pos < m_length)
277	{
278	switch(peekCurrent())
279	{
280	case ' ':
281	case '\t':
282	break;
283	case '\n':
284	case '\r':
285	{
286	/* We want to count \r\n as a single line break. */
287	if(peekAhead() == '\n')
288	++m_pos;
289
290	m_columnOffset = m_pos;
291	++m_line;
292
293	break;
294	}
295	case '(':
296	{
297	if(peekAhead() == ':')
298	{
299	m_pos += 2; /* Consume "(:" */
300
301	const TokenType comment = consumeComment();
302	if(comment == SUCCESS)
303	continue;
304	else
305	return comment;
306	}
307	}
308	default:
309	return SUCCESS;
310	}
311	++m_pos;
312	}
313
314	return END_OF_FILE;
315	}
316
317	char XQueryTokenizer::peekAhead(const int length) const
318	{
319	if(m_pos + length < m_length)
320	return m_data.at(m_pos + length).toAscii();
321	else
322	return 0;
323	}
324
325	Tokenizer::Token XQueryTokenizer::error()
326	{
327	return Token(ERROR);
328	}
329
330	bool XQueryTokenizer::isDigit(const char ch)
331	{
332	return ch >= '0' && ch <= '9';
333	}
334
335	/* Replace with function in QXmlUtils. Write test cases for this. */
336	bool XQueryTokenizer::isNCNameStart(const QChar ch)
337	{
338	if(ch == QLatin1Char('_'))
339	return true;
340
341	switch(ch.category())
342	{
343	case QChar::Letter_Lowercase:
344	case QChar::Letter_Uppercase:
345	case QChar::Letter_Other:
346	case QChar::Letter_Titlecase:
347	case QChar::Number_Letter:
348	return true;
349	default:
350	return false;
351	}
352	}
353
354	bool XQueryTokenizer::isNCNameBody(const QChar ch)
355	{
356	switch(ch.unicode())
357	{
358	case '.':
359	case '_':
360	case '-':
361	return true;
362	}
363
364	switch(ch.category())
365	{
366	case QChar::Letter_Lowercase:
367	case QChar::Letter_Uppercase:
368	case QChar::Letter_Other:
369	case QChar::Letter_Titlecase:
370	case QChar::Number_Letter:
371	case QChar::Mark_SpacingCombining:
372	case QChar::Mark_Enclosing:
373	case QChar::Mark_NonSpacing:
374	case QChar::Letter_Modifier:
375	case QChar::Number_DecimalDigit:
376	return true;
377	default:
378	return false;
379	}
380	}
381
382	bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
383	{
384	switch(code)
385	{
386	/* Fallthrough all these. */
387	case CASTABLE:
388	case CAST:
389	case COPY_NAMESPACES:
390	case DECLARE:
391	case EMPTY:
392	case MODULE:
393	case IMPORT:
394	case INSTANCE:
395	case ORDER:
396	case ORDERING:
397	case XQUERY:
398	case STABLE:
399	case TREAT:
400	return true;
401	default:
402	return false;
403	}
404	}
405
406	bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
407	{
408	switch(code)
409	{
410	/* Fallthrough all these. */
411	case AS:
412	case ASCENDING:
413	case AT:
414	case CASE:
415	case CAST:
416	case CASTABLE:
417	case EQ:
418	case EXTERNAL:
419	case GE:
420	case G_EQ:
421	case G_GT:
422	case G_LT:
423	case G_NE:
424	case GT:
425	case IN:
426	case INHERIT:
427	case INSTANCE:
428	case IS:
429	case ITEM:
430	case LE:
431	case LT:
432	case NE:
433	case NO_INHERIT:
434	case NO_PRESERVE:
435	case OF:
436	case PRESERVE:
437	case RETURN:
438	case STABLE:
439	case TO:
440	case TREAT:
441	return true;
442	default:
443	return false;
444	};
445	}
446
447	bool XQueryTokenizer::isTypeToken(const TokenType t)
448	{
449	switch(t)
450	{
451	/* Fallthrough all these. */
452	case ATTRIBUTE:
453	case COMMENT:
454	case DOCUMENT:
455	case DOCUMENT_NODE:
456	case ELEMENT:
457	case ITEM:
458	case NODE:
459	case PROCESSING_INSTRUCTION:
460	case SCHEMA_ATTRIBUTE:
461	case SCHEMA_ELEMENT:
462	case TEXT:
463	return true;
464	default:
465	return false;
466	}
467	}
468
469	Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
470	{
471	const int start = m_pos;
472
473	const Token t1 = tokenizeNCName();
474	if(t1.hasError())
475	return t1;
476
477	if(peekCurrent() != ':' \|\| peekAhead() == '=')
478	return t1;
479
480	++m_pos;
481
482	const Token t2 = tokenizeNCName();
483	if(t2.hasError())
484	return t2;
485	else
486	return Token(QNAME, m_data.mid(start, m_pos - start));
487	}
488
489	Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
490	{
491	setState(Operator);
492	const int startPos = m_pos;
493	bool hasDot = false;
494	bool isXPath20 = false;
495
496	for(; m_pos < m_length; ++m_pos)
497	{
498	QChar ch(current());
499
500	char cell = ch.cell();
501
502	if(cell == 'e' \|\| cell == 'E')
503	{
504	isXPath20 = true;
505	++m_pos;
506	ch = current();
507
508	if(ch.row() != 0)
509	break;
510
511	cell = ch.cell();
512
513	if(cell == '+' \|\| cell == '-')
514	continue;
515	}
516
517	if(isNCNameStart(ch))
518	return error();
519
520	if(cell < '0' \|\| cell > '9')
521	{
522	if(cell == '.' && !hasDot)
523	hasDot = true;
524	else
525	break;
526	}
527	}
528
529	return Token(isXPath20 ? XPATH2_NUMBER : NUMBER, m_data.mid(startPos, m_pos - startPos));
530	}
531
532	QString XQueryTokenizer::tokenizeCharacterReference()
533	{
534	Q_ASSERT(peekCurrent() == '&');
535
536	const int theEnd = m_data.indexOf(QLatin1Char(';'), m_pos + 1);
537
538	if(theEnd == -1) /* No ';' found, a syntax error. i18n. */
539	return QString();
540
541	QString content(m_data.mid(m_pos + 1, (theEnd - m_pos) - 1));
542	m_pos = theEnd;
543
544	const QChar charRef(charForReference(content));
545
546	if(!charRef.isNull())
547	return charRef;
548	else if(content.startsWith(QLatin1Char('#')))
549	{
550	int base;
551
552	/* It is only '#' or '#x'. */
553	if(content.length() < 2)
554	return QString();
555
556	/* We got a hex number if it starts with 'x', otherwise it's a decimal. */
557	if(content.at(1) == QLatin1Char('x'))
558	{
559	base = 16;
560	content = content.mid(2); /* Remove "#x". */
561	}
562	else
563	{
564	base = 10;
565	content = content.mid(1); /* Remove "#". */
566	}
567
568	bool conversionOK = false;
569	const int codepoint = content.toInt(&conversionOK, base);
570
571	if(conversionOK)
572	{
573	const QChar ch(codepoint);
574
575	if(ch.isNull())
576	{
577	/* We likely have something which require surrogate pairs. */
578	QString result;
579	result += QChar(QChar::highSurrogate(codepoint));
580	result += QChar(QChar::lowSurrogate(codepoint));
581	return result;
582	}
583	else
584	return ch;
585	}
586	else
587	return QString();
588	}
589	else
590	return QString();
591	}
592
593	int XQueryTokenizer::scanUntil(const char *const content)
594	{
595	const int end = m_data.indexOf(QString::fromLatin1(content), m_pos);
596
597	if(end == -1)
598	return -1;
599	else
600	{
601	const int len = end - m_pos;
602	m_pos += len;
603	return len;
604	}
605	}
606
607	QChar XQueryTokenizer::charForReference(const QString &reference)
608	{
609	if(m_charRefs.isEmpty())
610	{
611	/* Initialize. */
612	m_charRefs.reserve(5);
613	m_charRefs.insert(QLatin1String("lt"), QLatin1Char('<'));
614	m_charRefs.insert(QLatin1String("gt"), QLatin1Char('>'));
615	m_charRefs.insert(QLatin1String("amp"), QLatin1Char('&'));
616	m_charRefs.insert(QLatin1String("quot"), QLatin1Char('"'));
617	m_charRefs.insert(QLatin1String("apos"), QLatin1Char('\''));
618	}
619
620	return m_charRefs.value(reference);
621	}
622
623	Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
624	{
625	const QChar delimiter(current());
626	/* We cannot unfortunately just scan and then do mid(),
627	* since we can encounter character references. */
628	QString result;
629
630	/* This is more likely than QString's default allocation. */
631	result.reserve(8);
632
633	CharacterSkips skipEOLNormalization;
634
635	/* Advance over the initial quote character. */
636	++m_pos;
637
638	for(; m_pos < m_length; ++m_pos)
639	{
640	const QChar c(current());
641
642	if(c == QLatin1Char('&'))
643	{
644	const QString charRef(tokenizeCharacterReference());
645
646	if(charRef.isNull())
647	return error();
648	else
649	{
650	skipEOLNormalization.insert(result.count());
651	result.append(charRef);
652	}
653
654	}
655	else if(c == delimiter)
656	{
657	/* Maybe the escaping mechanism is used. For instance, "s""s"
658	* has the value `s"s'. */
659	++m_pos;
660
661	if(current() == delimiter) /* Double quote. */
662	result += delimiter;
663	else
664	return Token(STRING_LITERAL, normalizeEOL(result, skipEOLNormalization));
665	}
666	else
667	result += c;
668	}
669
670	return error();
671	}
672
673	Tokenizer::Token XQueryTokenizer::tokenizeNCName()
674	{
675	const int startPos = m_pos;
676
677	if(m_pos < m_length && isNCNameStart(current()))
678	{
679	++m_pos;
680
681	for(; m_pos < m_length; ++m_pos)
682	{
683	if(!isNCNameBody(current()))
684	break;
685	}
686
687	return Token(NCNAME, m_data.mid(startPos, m_pos - startPos));
688	}
689	else
690	return error();
691	}
692
693	bool XQueryTokenizer::aheadEquals(const char *const chs,
694	const int len,
695	const int offset) const
696	{
697	Q_ASSERT(len > 0);
698	Q_ASSERT(qstrlen(chs) == uint(len));
699
700	if(m_pos + len >= m_length)
701	return false;
702
703	for(int i = offset; i < (len + offset); ++i)
704	{
705	if(m_data.at(m_pos + i).toAscii() != chs[i - offset])
706	return false;
707	}
708
709	return true;
710	}
711
712	const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
713	{
714	return TokenLookup::value(keyword.toAscii().constData(), keyword.length());
715	}
716
717	XQueryTokenizer::State XQueryTokenizer::state() const
718	{
719	return m_state;
720	}
721
722	void XQueryTokenizer::setState(const State s)
723	{
724	m_state = s;
725	}
726
727	void XQueryTokenizer::pushState(const State s)
728	{
729	m_stateStack.push(s);
730	}
731
732	void XQueryTokenizer::pushState()
733	{
734	m_stateStack.push(m_state);
735	}
736
737	void XQueryTokenizer::popState()
738	{
739	/* QStack::pop() asserts if it's empty, so we need to check
740	* it, since we might receive unbalanced curlies. */
741	if(!m_stateStack.isEmpty())
742	m_state = m_stateStack.pop();
743	}
744
745	Tokenizer::Token XQueryTokenizer::nextToken()
746	{
747	switch(state())
748	{
749	/* We want to skip or do special whitespace handling for these
750	* states. So fallthrough all of the following. */
751	case AposAttributeContent:
752	case Axis:
753	case ElementContent:
754	case EndTag:
755	case Pragma:
756	case PragmaContent:
757	case ProcessingInstructionName:
758	case QuotAttributeContent:
759	case StartTag:
760	case XMLComment:
761	break;
762	default:
763	handleWhitespace();
764	}
765
766	switch(state())
767	{
768	case XMLSpaceDecl:
769	/* Fallthrough. */
770	case NamespaceKeyword:
771	{
772	switch(peekCurrent())
773	{
774	case ',':
775	return tokenAndAdvance(COMMA);
776	case '"':
777	/* Fallthrough. */
778	case '\'':
779	{
780	setState(NamespaceDecl);
781	return tokenizeStringLiteral();
782	}
783	}
784
785	const Token id(tokenizeNCName());
786
787	if(id.type != NCNAME)
788	return id;
789
790	const TokenMap *const keyword = lookupKeyword(id.value);
791	if(keyword)
792	{
793	switch(keyword->token)
794	{
795	case INHERIT:
796	/* Fallthrough. */
797	case NO_INHERIT:
798	{
799	setState(Default);
800	break;
801	}
802	case NAMESPACE:
803	{
804	setState(NamespaceDecl);
805	break;
806	}
807	case ORDERED:
808	/* Fallthrough. */
809	case UNORDERED:
810	/* Fallthrough. */
811	case STRIP:
812	{
813	setState(Default);
814	break;
815	}
816	case PRESERVE:
817	{
818	if(state() != NamespaceKeyword)
819	setState(Default);
820	}
821	default:
822	break;
823	}
824
825	return Token(keyword->token);
826	}
827	else
828	return id;
829
830	Q_ASSERT(false);
831	}
832	case NamespaceDecl:
833	{
834	switch(peekCurrent())
835	{
836	case '=':
837	return tokenAndAdvance(G_EQ);
838	case ';':
839	return tokenAndChangeState(SEMI_COLON, Default);
840	case '\'':
841	/* Fallthrough. */
842	case '\"':
843	return tokenizeStringLiteral();
844	}
845
846	const Token nc(tokenizeNCName());
847
848	handleWhitespace();
849
850	const char pc = peekCurrent();
851	const TokenMap* const t = lookupKeyword(nc.value);
852
853	if(pc == '\'' \|\| (pc == '"' && t))
854	return tokenAndChangeState(t->token, Default, 0);
855	else
856	return nc;
857
858	Q_ASSERT(false);
859	}
860	case Axis:
861	{
862	if(peekCurrent() == ':')
863	{
864	Q_ASSERT(peekAhead() == ':');
865	m_pos += 2;
866	setState(AfterAxisSeparator);
867	return Token(COLONCOLON);
868	}
869	/* Fallthrough. */
870	}
871	case AfterAxisSeparator:
872	/* Fallthrough. */
873	case Default:
874	/* State Operator and state Default have a lot of tokens in common except
875	* for minor differences. So we treat them the same way, and sprinkles logic
876	* here and there to handle the small differences. */
877	/* Fallthrough. */
878	case Operator:
879	{
880	switch(peekCurrent())
881	{
882	case '=':
883	return tokenAndChangeState(G_EQ, Default);
884	case '-':
885	return tokenAndChangeState(MINUS, Default);
886	case '+':
887	return tokenAndChangeState(PLUS, Default);
888	case '[':
889	return tokenAndChangeState(LBRACKET, Default);
890	case ']':
891	return tokenAndChangeState(RBRACKET, Operator);
892	case ',':
893	return tokenAndChangeState(COMMA, Default);
894	case ';':
895	return tokenAndChangeState(SEMI_COLON, Default);
896	case '$':
897	return tokenAndChangeState(DOLLAR, VarName);
898	case '\|':
899	return tokenAndChangeState(BAR, Default);
900	case '?':
901	return tokenAndChangeState(QUESTION, Operator);
902	case ')':
903	return tokenAndChangeState(RPAREN, Operator);
904	case '@':
905	return tokenAndChangeState(AT_SIGN, Default);
906	/* Fallthrough all these. */
907	case '1':
908	case '2':
909	case '3':
910	case '4':
911	case '5':
912	case '6':
913	case '7':
914	case '8':
915	case '9':
916	case '0':
917	return tokenizeNumberLiteral();
918	case '.':
919	{
920	const char next = peekAhead();
921	if(next == '.')
922	return tokenAndChangeState(DOTDOT, Operator, 2);
923	/* .5 is allowed, as short form for 0.5:
924	* <tt>[142] DecimalLiteral ::= ("." Digits) \| (Digits "." [0-9]*)</tt>
925	*/
926	else if(isDigit(next))
927	return tokenizeNumberLiteral();
928	else
929	return tokenAndChangeState(DOT, Operator);
930	}
931	case '\'':
932	/* Fallthrough. */
933	case '"':
934	{
935	setState(Operator);
936	return tokenizeStringLiteral();
937
938	}
939	case '(':
940	{
941	if(peekAhead() == '#')
942	return tokenAndChangeState(PRAGMA_START, Pragma, 2);
943	else
944	return tokenAndChangeState(LPAREN, Default);
945	}
946	case '*':
947	{
948	if(peekAhead() == ':')
949	{
950	m_pos += 2; /* Consume :. /
951	const Token nc = tokenizeNCName();
952
953	if(nc.hasError())
954	return error();
955	else
956	return tokenAndChangeState(ANY_PREFIX, nc.value, Operator);
957	}
958	else
959	return tokenAndChangeState(STAR, state() == Default ? Operator : Default);
960	}
961	case ':':
962	{
963	switch(peekAhead())
964	{
965	case '=':
966	return tokenAndChangeState(ASSIGN, Default, 2);
967	case ':':
968	return tokenAndChangeState(COLONCOLON, Default, 2);
969	default:
970	return error();
971	}
972	}
973	case '!':
974	{
975	if(peekAhead() == '=')
976	return tokenAndChangeState(G_NE, Default, 2);
977	else
978	return error();
979	}
980	case '<':
981	{
982	switch(peekAhead())
983	{
984	case '=':
985	return tokenAndChangeState(G_LE, Default, 2);
986	case '<':
987	return tokenAndChangeState(PRECEDES, Default, 2);
988	case '?':
989	{
990	pushState(Operator);
991	return tokenAndChangeState(PI_START, ProcessingInstructionName, 2);
992	}
993	case '!':
994	{
995	if(aheadEquals("!--", 3))
996	{
997	m_pos += 3; /* Consume "!--". */
998	pushState(Operator);
999	return tokenAndChangeState(COMMENT_START, XMLComment);
1000	}
1001	/* Fallthrough. It's a syntax error, and this is a good way to report it. */
1002	}
1003	default:
1004	{
1005	if((m_pos + 1) < m_length && isNCNameStart(m_data.at(m_pos + 1)))
1006	{
1007	/* We assume it's an element constructor. */
1008	pushState(Operator);
1009	}
1010
1011	return tokenAndChangeState(G_LT, state() == Operator ? Default : StartTag);
1012	}
1013	}
1014	}
1015	case '>':
1016	{
1017	switch(peekAhead())
1018	{
1019	case '=':
1020	return tokenAndChangeState(G_GE, Default, 2);
1021	case '>':
1022	return tokenAndChangeState(FOLLOWS, Default, 2);
1023	default:
1024	return tokenAndChangeState(G_GT, Default);
1025	}
1026	}
1027	case '/':
1028	{
1029	if(peekAhead() == '/')
1030	return tokenAndChangeState(SLASHSLASH, Default, 2);
1031	else
1032	return tokenAndChangeState(SLASH, Default);
1033	}
1034	case '{':
1035	{
1036	pushState(Operator);
1037	return tokenAndChangeState(CURLY_LBRACE, Default);
1038	}
1039	case '}':
1040	{
1041	popState();
1042
1043	return tokenAndAdvance(CURLY_RBRACE);
1044	}
1045	}
1046
1047	/* Ok. We're in state Default or Operator, and it wasn't a simple
1048	* character. */
1049
1050	const Token id(tokenizeNCName());
1051
1052	if(id.type != NCNAME)
1053	return id;
1054
1055	const TokenMap *const keyword = lookupKeyword(id.value);
1056
1057	if(state() == Operator)
1058	{
1059	if(keyword)
1060	{
1061	if(keyword->token == DEFAULT \|\| keyword->token == ASCENDING \|\| keyword->token == DESCENDING)
1062	setState(Operator);
1063	else if(keyword->token == RETURN)
1064	setState(Default);
1065	else if(isPhraseKeyword(keyword->token))
1066	{
1067	const TokenType ws = consumeWhitespace();
1068	if(ws == ERROR)
1069	return error();
1070
1071	const Token id2(tokenizeNCName());
1072	const TokenMap *const keyword2 = lookupKeyword(id2.value);
1073
1074	if(keyword2)
1075	{
1076	if(keyword->token == TREAT && keyword2->token == AS)
1077	setState(ItemType);
1078	else if (keyword->token == CAST \|\| (keyword->token == CASTABLE && keyword2->token == AS) \|\| keyword2->token == BY)
1079	setState(Default);
1080
1081	m_tokenStack.push(Token(keyword2->token));
1082	}
1083	else
1084	m_tokenStack.push(id2);
1085
1086	return Token(keyword->token);
1087	}
1088	else
1089	{
1090	/* Such that we tokenize the second token in "empty greatest". */
1091	if(keyword->token != EMPTY)
1092	setState(Default);
1093	}
1094
1095	if(keyword->token == AS \|\| keyword->token == CASE)
1096	setState(ItemType);
1097
1098	return Token(keyword->token);
1099	}
1100	else
1101	return id;
1102	}
1103
1104	Q_ASSERT(state() == Default \|\| state() == Axis \|\| state() == AfterAxisSeparator);
1105
1106	/*
1107	* This is hard. Consider this:
1108	*
1109	* Valid: child ::nameTest
1110	* Valid: child:: nameTest
1111	* Syntax Error: child :localName
1112	* Syntax Error: child: localName
1113	*
1114	* Consider "child ::name". Right now, we're here:
1115	* ^
1116	* We don't know whether "child" is a prefix and hence the whitespace is invalid,
1117	* or whether it's an axis and hence skippable. */
1118	{
1119	const int wsLength = peekForColonColon();
1120	/* We cannot call handleWhitespace() because it returns on
1121	* END_OF_FILE, and we have parsed up keyword, and we need to
1122	* deal with that.
1123	*
1124	* If we have a colon colon, which means the whitespace is
1125	* allowed, we skip it. */
1126	if(wsLength != -1)
1127	m_pos += wsLength;
1128	}
1129
1130	/* Handle name tests. */
1131	if(peekCurrent() == ':')
1132	{
1133	switch(peekAhead())
1134	{
1135	case '=':
1136	return id;
1137	case '*':
1138	{
1139	m_pos += 2;
1140	return tokenAndChangeState(ANY_LOCAL_NAME, id.value, Operator);
1141	}
1142	case ':':
1143	{
1144	/* We have an axis. */
1145	setState(Axis);
1146	return keyword ? Token(keyword->token) : id;
1147	}
1148	default:
1149	{
1150	/* It's a QName. */
1151	++m_pos; /* Consume the colon. */
1152
1153	const Token id2(tokenizeNCName());
1154
1155	if(id2.type != NCNAME)
1156	{
1157	--m_pos;
1158	return id;
1159	}
1160
1161	setState(Operator);
1162	const int qNameLen = id.value.length() + id2.value.length() + 1;
1163	return Token(QNAME, m_data.mid(m_pos - qNameLen, qNameLen));
1164	}
1165	}
1166	}
1167
1168	if(!keyword \|\| isOperatorKeyword(keyword->token))
1169	{
1170	setState(Operator);
1171	return id;
1172	}
1173
1174	const TokenType ws = consumeWhitespace();
1175	if(ws == ERROR) // TODO this should test for success. Write test.
1176	return Token(ERROR);
1177
1178	if(atEnd())
1179	{
1180	setState(Operator);
1181	return id;
1182	}
1183
1184	/* Let the if-body apply for constructors, and node type tests. */
1185	if(isTypeToken(keyword->token) \|\|
1186	keyword->token == TYPESWITCH \|\|
1187	keyword->token == ORDERED \|\|
1188	keyword->token == UNORDERED \|\|
1189	keyword->token == IF)
1190	{
1191	switch(peekCurrent())
1192	{
1193	case '(':
1194	{
1195	// TODO See if we can remove DOCUMENT from isTypeToken.
1196	if(isTypeToken(keyword->token) && keyword->token != DOCUMENT)
1197	{
1198	m_tokenStack.push(Token(LPAREN));
1199	++m_pos; /* Consume '('. */
1200	pushState(Operator);
1201
1202	if(keyword->token == PROCESSING_INSTRUCTION)
1203	setState(KindTestForPI);
1204	else
1205	setState(KindTest);
1206
1207	return Token(keyword->token);
1208	}
1209	else if(keyword->token == TYPESWITCH \|\| keyword->token == IF)
1210	return Token(keyword->token);
1211	else /* It's a function call. */
1212	return id;
1213	}
1214	case '{':
1215	{
1216	m_tokenStack.push(Token(CURLY_LBRACE));
1217	++m_pos; /* Consume '{'. */
1218	pushState(Operator);
1219	/* Stay in state Default. */
1220	return Token(keyword->token);
1221	}
1222	default:
1223	{
1224	/* We have read in a token which is for instance
1225	* "return", and now it can be an element
1226	* test("element") a node kind test("element()"), or a
1227	* computed element constructor("element name {...").
1228	* We need to do a two-token lookahead here, because
1229	* "element return" can be an element test followed by
1230	* the return keyword, but it can also be an element
1231	* constructor("element return {"). */
1232	if(isNCNameStart(current()))
1233	{
1234	const int currentPos = m_pos;
1235	const Token token2 = tokenizeNCNameOrQName();
1236
1237	if(token2.hasError())
1238	return token2;
1239
1240	handleWhitespace();
1241
1242	if(peekCurrent() == '{')
1243	{
1244	/* An element constructor. */
1245	m_tokenStack.push(token2);
1246	return Token(keyword->token);
1247	}
1248
1249	/* We jump back in the stream, we need to tokenize token2 according
1250	* to the state. */
1251	m_pos = currentPos;
1252	setState(Operator);
1253	return Token(NCNAME, QLatin1String(keyword->name));
1254	}
1255	}
1256	}
1257	}
1258
1259	if(peekCurrent() == '$')
1260	{
1261	setState(VarName);
1262	return Token(keyword->token);
1263	}
1264
1265	/* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1266	if(peekCurrent() == '(')
1267	return id;
1268	else if(peekCurrent() == '{' && keyword->token == VALIDATE)
1269	return Token(keyword->token);
1270
1271	if(!isNCNameStart(current()))
1272	{
1273	setState(Operator);
1274	return id;
1275	}
1276
1277	const Token id2(tokenizeNCName());
1278	const TokenMap *const keyword2 = lookupKeyword(id2.value);
1279
1280	if(!keyword2)
1281	{
1282	/* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
1283	setState(Operator);
1284	return id;
1285	}
1286
1287	switch(keyword->token)
1288	{
1289	case DECLARE:
1290	{
1291	switch(keyword2->token)
1292	{
1293	case VARIABLE:
1294	/* Fallthrough. */
1295	case FUNCTION:
1296	{
1297	m_tokenStack.push(Token(keyword2->token));
1298	setState(Default);
1299	return Token(keyword->token);
1300	}
1301	case OPTION:
1302	{
1303	m_tokenStack.push(Token(keyword2->token));
1304	setState(Default);
1305	return Token(keyword->token);
1306	}
1307	case COPY_NAMESPACES:
1308	/* Fallthrough. */
1309	case ORDERING:
1310	{
1311	m_tokenStack.push(Token(keyword2->token));
1312	setState(NamespaceKeyword);
1313	return Token(keyword->token);
1314	}
1315	case CONSTRUCTION:
1316	{
1317	// TODO identical to CONSTRUCTION?
1318	m_tokenStack.push(Token(keyword2->token));
1319	setState(Operator);
1320	return Token(keyword->token);
1321	}
1322	case NAMESPACE:
1323	/* Fallthrough. */
1324	case BASEURI:
1325	{
1326	m_tokenStack.push(Token(keyword2->token));
1327	setState(NamespaceDecl);
1328	return Token(keyword->token);
1329	}
1330	case BOUNDARY_SPACE:
1331	{
1332	m_tokenStack.push(Token(keyword2->token));
1333	setState(XMLSpaceDecl);
1334	return Token(keyword->token);
1335	}
1336	case DEFAULT:
1337	{
1338	m_tokenStack.push(Token(keyword2->token));
1339
1340	const TokenType ws2 = consumeWhitespace();
1341	if(ws2 != SUCCESS)
1342	{
1343	m_tokenStack.prepend(Token(ws2));
1344	return Token(keyword->token);
1345	}
1346
1347	const Token id3(tokenizeNCName());
1348
1349	if(id3.type != NCNAME)
1350	{
1351	m_tokenStack.prepend(id3);
1352	return Token(keyword->token);
1353	}
1354
1355	const TokenMap *const keyword3 = lookupKeyword(id3.value);
1356	if(!keyword3)
1357	{
1358	m_tokenStack.prepend(id3);
1359	return Token(keyword->token);
1360	}
1361	else
1362	{
1363	m_tokenStack.prepend(Token(keyword3->token));
1364
1365	if(keyword3->token == ORDER)
1366	setState(Operator);
1367	else
1368	setState(NamespaceDecl);
1369	}
1370
1371	return Token(keyword->token);
1372	}
1373	default:
1374	{
1375	m_tokenStack.push(Token(keyword2->token));
1376	setState(Default);
1377	return id;
1378	}
1379	}
1380	}
1381	case XQUERY:
1382	{
1383	m_tokenStack.push(Token(keyword2->token));
1384
1385	if(keyword2->token == VERSION)
1386	{
1387	setState(NamespaceDecl);
1388	return Token(keyword->token);
1389	}
1390	else
1391	{
1392	setState(Operator);
1393	return id;
1394	}
1395	}
1396	case IMPORT:
1397	{
1398	m_tokenStack.push(Token(keyword2->token));
1399
1400	switch(keyword2->token)
1401	{
1402	case SCHEMA:
1403	/* Fallthrough. */
1404	case MODULE:
1405	{
1406	setState(NamespaceKeyword);
1407	return Token(keyword->token);
1408	}
1409	default:
1410	{
1411	setState(Operator);
1412	return id;
1413	}
1414	}
1415	}
1416	case VALIDATE:
1417	{
1418	m_tokenStack.push(Token(keyword2->token));
1419
1420	switch(keyword2->token)
1421	{
1422	case LAX:
1423	case STRICT:
1424	{
1425	pushState(Operator);
1426	return Token(keyword->token);
1427	}
1428	default:
1429	{
1430	setState(Operator);
1431	return id;
1432	}
1433	}
1434	}
1435	default:
1436	{
1437	m_tokenStack.push(Token(keyword2->token));
1438	setState(Operator);
1439	return id;
1440	}
1441	}
1442
1443	Q_ASSERT(false);
1444
1445	}
1446	case VarName:
1447	{
1448	if(peekCurrent() == '$')
1449	return tokenAndAdvance(DOLLAR);
1450
1451	setState(Operator);
1452	return tokenizeNCNameOrQName();
1453	Q_ASSERT(false);
1454	}
1455	case ItemType:
1456	{
1457	switch(peekCurrent())
1458	{
1459	case '(':
1460	return tokenAndChangeState(LPAREN, KindTest);
1461	case '$':
1462	return tokenAndChangeState(DOLLAR, VarName);
1463	}
1464
1465	const Token name(tokenizeNCNameOrQName());
1466
1467	if(name.hasError())
1468	return error();
1469
1470	else if(name.type == QNAME)
1471	{
1472	setState(OccurrenceIndicator);
1473	return name;
1474	}
1475	else
1476	{
1477	const TokenMap *const keyword = lookupKeyword(name.value);
1478
1479	if(keyword)
1480	{
1481	pushState(OccurrenceIndicator);
1482	return Token(keyword->token);
1483	}
1484	else
1485	{
1486	setState(Default);
1487	return name;
1488	}
1489	}
1490	Q_ASSERT(false);
1491	}
1492	case KindTest:
1493	{
1494	switch(peekCurrent())
1495	{
1496	case ')':
1497	{
1498	popState();
1499	return tokenAndAdvance(RPAREN);
1500	}
1501	case '(':
1502	return tokenAndAdvance(LPAREN);
1503	case ',':
1504	return tokenAndAdvance(COMMA);
1505	case '*':
1506	return tokenAndAdvance(STAR);
1507	case '?':
1508	return tokenAndAdvance(QUESTION);
1509	case '\'':
1510	/* Fallthrough. */
1511	case '"':
1512	return tokenizeStringLiteral();
1513	}
1514
1515	const Token nc(tokenizeNCNameOrQName());
1516	if(nc.hasError())
1517	return nc;
1518
1519	const TokenType ws = consumeWhitespace();
1520	if(ws == ERROR)
1521	return error();
1522
1523	if(peekCurrent() == '(')
1524	{
1525	const TokenMap *const keyword = lookupKeyword(nc.value);
1526	if(keyword)
1527	{
1528	pushState(KindTest);
1529	return Token(keyword->token);
1530	}
1531	else
1532	return nc;
1533	}
1534	else
1535	return nc;
1536	Q_ASSERT(false);
1537	}
1538	case KindTestForPI:
1539	{
1540	switch(peekCurrent())
1541	{
1542	case ')':
1543	{
1544	popState();
1545	return tokenAndAdvance(RPAREN);
1546	}
1547	case '\'':
1548	/* Fallthrough. */
1549	case '"':
1550	return tokenizeStringLiteral();
1551	default:
1552	return tokenizeNCName();
1553	}
1554	Q_ASSERT(false);
1555	}
1556	case OccurrenceIndicator:
1557	{
1558	switch(peekCurrent())
1559	{
1560	case '?':
1561	return tokenAndChangeState(QUESTION, Operator);
1562	case '*':
1563	return tokenAndChangeState(STAR, Operator);
1564	case '+':
1565	return tokenAndChangeState(PLUS, Operator);
1566	default:
1567	{
1568	setState(Operator);
1569	return nextToken();
1570	}
1571	}
1572	Q_ASSERT(false);
1573	}
1574	case XQueryVersion:
1575	{
1576	switch(peekCurrent())
1577	{
1578	case '\'':
1579	/* Fallthrough. */
1580	case '"':
1581	return tokenizeStringLiteral();
1582	case ';':
1583	return tokenAndChangeState(SEMI_COLON, Default);
1584	}
1585
1586	const Token id(tokenizeNCName());
1587
1588	if(id.type != NCNAME)
1589	return id;
1590
1591	const TokenMap *const keyword = lookupKeyword(id.value);
1592	if(keyword)
1593	return tokenAndChangeState(keyword->token, Default);
1594	else
1595	return id;
1596	Q_ASSERT(false);
1597	}
1598	case StartTag:
1599	{
1600	if(peekAhead(-1) == '<')
1601	{
1602	if(current().isSpace())
1603	return Token(ERROR);
1604	}
1605	else
1606	{
1607	if(consumeRawWhitespace())
1608	return Token(END_OF_FILE);
1609	}
1610
1611	switch(peekCurrent())
1612	{
1613	case '/':
1614	{
1615	if(peekAhead() == '>')
1616	{
1617	m_pos += 2;
1618
1619	if(m_scanOnly)
1620	return Token(POSITION_SET);
1621	else
1622	{
1623	popState();
1624	return Token(QUICK_TAG_END);
1625	}
1626	}
1627	else
1628	return error();
1629	}
1630	case '>':
1631	{
1632	if(m_scanOnly)
1633	return tokenAndChangeState(POSITION_SET, StartTag);
1634	else
1635	return tokenAndChangeState(G_GT, ElementContent);
1636	}
1637	case '=':
1638	return tokenAndAdvance(G_EQ);
1639	case '\'':
1640	return tokenAndChangeState(APOS, AposAttributeContent);
1641	case '"':
1642	return tokenAndChangeState(QUOTE, QuotAttributeContent);
1643	default:
1644	return tokenizeNCNameOrQName();
1645	}
1646	Q_ASSERT(false);
1647	}
1648	case AposAttributeContent:
1649	/* Fallthrough. */
1650	case QuotAttributeContent:
1651	{
1652	const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1653	QString result;
1654	result.reserve(20);
1655
1656	if(m_scanOnly)
1657	{
1658	int stack = 0;
1659	return attributeAsRaw(sep, stack, m_pos, true, result);
1660	}
1661
1662	Q_ASSERT(!m_scanOnly);
1663	while(true)
1664	{
1665	if(atEnd())
1666	{
1667	/* In the case that the XSL-T tokenizer invokes us with
1668	* default state QuotAttributeContent, we need to be able
1669	* to return a single string, in case that is all we have
1670	* accumulated. */
1671	if(result.isEmpty())
1672	return Token(END_OF_FILE);
1673	else
1674	return Token(STRING_LITERAL, result);
1675	}
1676
1677	const QChar curr(current());
1678
1679	if(curr == sep)
1680	{
1681	if(m_pos + 1 == m_length)
1682	return Token(END_OF_FILE);
1683
1684	if(m_data.at(m_pos + 1) == sep)
1685	{
1686	/* The quoting mechanism was used. */
1687	m_pos += 2;
1688	result.append(sep);
1689	continue;
1690	}
1691
1692	const QChar next(m_data.at(m_pos + 1));
1693	if(!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1694	return Token(ERROR); // i18n Space must separate attributes
1695	else if(result.isEmpty())
1696	{
1697	return tokenAndChangeState(state() == AposAttributeContent ? APOS : QUOTE,
1698	StartTag, 1);
1699	}
1700	else
1701	{
1702	/* Don't consume the sep, but leave it so we next time return a token for it. */
1703	return Token(STRING_LITERAL, result);
1704	}
1705
1706	++m_pos;
1707	continue;
1708	}
1709	else if(curr == QLatin1Char('{'))
1710	{
1711	if(m_pos + 1 == m_length)
1712	return Token(END_OF_FILE);
1713	else if(peekAhead() == '{')
1714	{
1715	++m_pos;
1716	result.append(QLatin1Char('{'));
1717	}
1718	else
1719	{
1720	if(result.isEmpty())
1721	{
1722	/* The Attribute Value Template appeared directly in the attribute. */
1723	pushState();
1724	return tokenAndChangeState(CURLY_LBRACE, Default);
1725	}
1726	else
1727	{
1728	/* We don't advance, keep '{' as next token. */
1729	return Token(STRING_LITERAL, result);
1730	}
1731	}
1732	}
1733	else if(curr == QLatin1Char('}'))
1734	{
1735	if(m_pos + 1 == m_length)
1736	return Token(END_OF_FILE);
1737	else if(peekAhead() == '}')
1738	{
1739	++m_pos;
1740	result.append(QLatin1Char('}'));
1741	}
1742	else
1743	return Token(ERROR);
1744	}
1745	else if(curr == QLatin1Char('&'))
1746	{
1747	const QString ret(tokenizeCharacterReference());
1748	if(ret.isNull())
1749	return Token(ERROR);
1750	else
1751	result.append(ret);
1752	}
1753	else if(curr == QLatin1Char('<'))
1754	return Token(STRING_LITERAL, result);
1755	else
1756	{
1757	/* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1758	* 3.3.3 Attribute-Value Normalization.
1759	*
1760	* However, it is complicated a bit by that AVN is defined on top of
1761	* EOL normalization and we do those two in one go here. */
1762	switch(curr.unicode())
1763	{
1764	case 0xD:
1765	{
1766	if(peekAhead() == '\n')
1767	{
1768	result.append(QLatin1Char(' '));
1769	++m_pos;
1770	break;
1771	}
1772	}
1773	case 0xA:
1774	/* Fallthrough. */
1775	case 0x9:
1776	{
1777	result.append(QLatin1Char(' '));
1778	break;
1779	}
1780	default:
1781	result.append(curr);
1782	}
1783	}
1784
1785	++m_pos;
1786	}
1787	Q_ASSERT(false);
1788	}
1789	case ElementContent:
1790	{
1791	QString result;
1792	result.reserve(20);
1793
1794	/* Whether the text node, result, may be whitespace only. Character references
1795	* and CDATA sections disables that. */
1796	bool mayBeWS = true;
1797
1798	CharacterSkips skipEOLNormalization;
1799
1800	while(true)
1801	{
1802	if(atEnd())
1803	return Token(END_OF_FILE);
1804
1805	switch(peekCurrent())
1806	{
1807	case '<':
1808	{
1809	if(!result.isEmpty() && peekAhead(2) != '[')
1810	{
1811	/* We encountered the end, and it was not a CDATA section. */
1812	/* We don't advance. Next time we'll handle the <... stuff. */
1813	return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1814	}
1815
1816	++m_pos;
1817	if(atEnd())
1818	return Token(END_OF_FILE);
1819
1820	const QChar ahead(current());
1821	if(ahead.isSpace())
1822	return error();
1823	else if(ahead == QLatin1Char('/'))
1824	{
1825	if(m_pos + 1 == m_length)
1826	return Token(END_OF_FILE);
1827	else if(m_data.at(m_pos + 1).isSpace())
1828	return error();
1829	else
1830	return tokenAndChangeState(BEGIN_END_TAG, EndTag);
1831	}
1832	else if(isNCNameStart(ahead))
1833	{
1834	pushState();
1835	return tokenAndChangeState(G_LT, StartTag, 0);
1836	}
1837	else if(aheadEquals("!--", 3, 0))
1838	{
1839	pushState();
1840	m_pos += 3;
1841	return tokenAndChangeState(COMMENT_START, XMLComment, 0);
1842	}
1843	else if(aheadEquals("![CDATA[", 8, 0))
1844	{
1845	mayBeWS = false;
1846	m_pos += 8;
1847	const int start = m_pos;
1848	const int len = scanUntil("]]>");
1849
1850	if(len == -1)
1851	return Token(END_OF_FILE);
1852
1853	m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1854	result.append(m_data.mid(start, len));
1855	break;
1856	}
1857	else if(ahead == QLatin1Char('?'))
1858	{
1859	pushState();
1860	return tokenAndChangeState(PI_START, ProcessingInstructionName);
1861	}
1862	else
1863	return Token(G_LT);
1864	}
1865	case '&':
1866	{
1867	const QString ret(tokenizeCharacterReference());
1868	if(ret.isNull())
1869	return Token(ERROR);
1870	else
1871	{
1872	skipEOLNormalization.insert(result.count());
1873	result.append(ret);
1874	mayBeWS = false;
1875	break;
1876	}
1877	}
1878	case '{':
1879	{
1880	// TODO remove this check, also below.
1881	if(m_pos + 1 == m_length)
1882	return Token(END_OF_FILE);
1883	else if(peekAhead() == '{')
1884	{
1885	++m_pos;
1886	result.append(QLatin1Char('{'));
1887	}
1888	else
1889	{
1890	if(result.isEmpty())
1891	{
1892	pushState();
1893	return tokenAndChangeState(CURLY_LBRACE, Default);
1894	}
1895	else
1896	{
1897	/* We don't advance here. */
1898	return Token(mayBeWS ? STRING_LITERAL : NON_BOUNDARY_WS, normalizeEOL(result, skipEOLNormalization));
1899	}
1900	}
1901	break;
1902	}
1903	case '}':
1904	{
1905	if(m_pos + 1 == m_length)
1906	return Token(END_OF_FILE);
1907	else if(peekAhead() == '}')
1908	{
1909	++m_pos;
1910	result.append(QLatin1Char('}'));
1911	}
1912	else
1913	{
1914	/* This is a parse error, and the grammar won't be able
1915	* to reduce this CURLY_RBRACE. */
1916	return tokenAndChangeState(CURLY_RBRACE, Default);
1917	}
1918	break;
1919	}
1920	case '\n':
1921	{
1922	/* We want to translate \r\n into \n. */
1923	if(peekAhead(-1) == '\r')
1924	break;
1925	/* else, fallthrough. */
1926	}
1927	case '\r':
1928	{
1929	result.append(QLatin1Char('\n'));
1930	break;
1931	}
1932	default:
1933	{
1934	result.append(current());
1935	break;
1936	}
1937	}
1938	++m_pos;
1939	}
1940	Q_ASSERT(false);
1941	}
1942	case ProcessingInstructionName:
1943	{
1944	const int start = m_pos;
1945
1946	while(true)
1947	{
1948	++m_pos;
1949	if(m_pos >= m_length)
1950	return Token(END_OF_FILE);
1951
1952	const QChar next(current());
1953	if(next.isSpace() \|\| next == QLatin1Char('?'))
1954	{
1955	return tokenAndChangeState(PI_TARGET, m_data.mid(start, m_pos - start),
1956	ProcessingInstructionContent);
1957	}
1958	}
1959	Q_ASSERT(false);
1960	}
1961	case ProcessingInstructionContent:
1962	{
1963	/* Consume whitespace between the name and the content. */
1964	if(consumeRawWhitespace())
1965	return Token(END_OF_FILE);
1966
1967	const int start = m_pos;
1968	const int len = scanUntil("?>");
1969
1970	if(len == -1)
1971	return Token(END_OF_FILE);
1972	else
1973	{
1974	m_pos += 2; /* Consume "?>" */
1975	popState();
1976	return Token(PI_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
1977	}
1978	Q_ASSERT(false);
1979	}
1980	case EndTag:
1981	{
1982	if(consumeRawWhitespace())
1983	return END_OF_FILE;
1984
1985	if(peekCurrent() == '>')
1986	{
1987	popState();
1988	return tokenAndAdvance(G_GT);
1989	}
1990	else
1991	return tokenizeNCNameOrQName();
1992	Q_ASSERT(false);
1993	}
1994	case XMLComment:
1995	{
1996	const int start = m_pos;
1997	const int len = scanUntil("--");
1998
1999	if(len == -1)
2000	return END_OF_FILE;
2001	else
2002	{
2003	m_pos += 2; /* Consume "--". */
2004	popState();
2005
2006	if(peekCurrent() == '>')
2007	{
2008	++m_pos;
2009	return Token(COMMENT_CONTENT, normalizeEOL(m_data.mid(start, len), CharacterSkips()));
2010	}
2011	else
2012	return error();
2013	}
2014	Q_ASSERT(false);
2015	}
2016	case Pragma:
2017	{
2018	/* Consume whitespace. */
2019	if(consumeRawWhitespace())
2020	return Token(END_OF_FILE);
2021
2022	setState(PragmaContent);
2023	return tokenizeNCNameOrQName();
2024	}
2025	case PragmaContent:
2026	{
2027	QString result;
2028	result.reserve(20);
2029
2030	const bool hasWS = m_pos < m_length && current().isSpace();
2031
2032	/* Consume all whitespace up to the pragma content(if any). */
2033	if(consumeRawWhitespace())
2034	return Token(END_OF_FILE);
2035
2036	if(peekCurrent() == '#' && peekAhead() == ')')
2037	{
2038	/* We reached the end, and there's no pragma content. */
2039	return tokenAndChangeState(PRAGMA_END, Default, 2);
2040	}
2041	else if(!hasWS)
2042	{
2043	/* A separating space is required if there's pragma content. */
2044	return error(); /* i18n */
2045	}
2046
2047	const int start = m_pos;
2048	const int len = scanUntil("#)");
2049	if(len == -1)
2050	return Token(END_OF_FILE);
2051
2052	return Token(STRING_LITERAL, m_data.mid(start, len));
2053	Q_ASSERT(false);
2054	}
2055	}
2056
2057	Q_ASSERT(false);
2058	return error();
2059	}
2060
2061	Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
2062	int &sepStack,
2063	const int startPos,
2064	const bool aInLiteral,
2065	QString &result)
2066	{
2067	bool inLiteral = aInLiteral;
2068	const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2069
2070	while(true)
2071	{
2072	if(atEnd())
2073	return END_OF_FILE;
2074
2075	if(peekCurrent() == sep.unicode())
2076	{
2077	if(inLiteral)
2078	inLiteral = false;
2079	else
2080	inLiteral = true;
2081
2082	if(peekAhead() == sep.unicode())
2083	{
2084	/* The quoting mechanism was used. */
2085	result.append(current());
2086	m_pos += 2;
2087	continue;
2088	}
2089	else
2090	{
2091	/* Don't consume the separator, such that we
2092	* return a token for it next time. */
2093	if(m_pos == startPos)
2094	{
2095	++m_pos;
2096	setState(StartTag);
2097	return Token(sep == QLatin1Char('"') ? QUOTE : APOS);
2098	}
2099
2100
2101	if(sepStack == 0)
2102	{
2103	return Token(STRING_LITERAL, result);
2104	}
2105	else
2106	{
2107	result.append(current());
2108	++m_pos;
2109	continue;
2110	}
2111	}
2112	}
2113	else if(peekCurrent() == '&')
2114	{
2115	const QString ret(tokenizeCharacterReference());
2116	if(ret.isNull())
2117	return Token(ERROR);
2118	else
2119	{
2120	result.append(ret);
2121	++m_pos;
2122	continue;
2123	}
2124	}
2125	else if(peekCurrent() == otherSep)
2126	{
2127	result.append(current());
2128	++m_pos;
2129
2130	if(peekCurrent() == otherSep)
2131	++m_pos;
2132
2133	if(inLiteral)
2134	inLiteral = false;
2135	else
2136	inLiteral = true;
2137
2138	continue;
2139	}
2140	else if(peekCurrent() == '{')
2141	{
2142	result.append(current());
2143
2144	if(peekAhead() == '{')
2145	{
2146	m_pos += 2;
2147	continue;
2148	}
2149	else
2150	{
2151	++m_pos;
2152	++sepStack;
2153	const Token t(attributeAsRaw(sep, sepStack, startPos, false, result));
2154	if(t.type != SUCCESS)
2155	return t;
2156	}
2157
2158	}
2159	else if(peekCurrent() == '}')
2160	{
2161	if(inLiteral && peekAhead() == '}')
2162	{
2163	result.append(current());
2164	m_pos += 2;
2165	continue;
2166	}
2167	else
2168	{
2169	++m_pos;
2170	--sepStack;
2171	return Token(SUCCESS); /* The return value is arbitrary. */
2172	}
2173	}
2174	else
2175	{
2176	result.append(current());
2177	++m_pos;
2178	}
2179	}
2180	}
2181
2182	Tokenizer::Token XQueryTokenizer::nextToken(YYLTYPE *const sourceLocator)
2183	{
2184	sourceLocator->first_line = m_line;
2185	sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2186
2187	if(m_tokenStack.isEmpty())
2188	return nextToken();
2189	else
2190	{
2191	const Token retval(m_tokenStack.pop());
2192
2193	switch(retval.type)
2194	{
2195	case MODULE:
2196	/* Fallthrough.*/
2197	case SCHEMA:
2198	/* Fallthrough.*/
2199	case COPY_NAMESPACES:
2200	{
2201	setState(NamespaceKeyword);
2202	break;
2203	}
2204	case VERSION:
2205	{
2206	setState(XQueryVersion);
2207	break;
2208	}
2209	case AS:
2210	/* Fallthrough. */
2211	case OF:
2212	{
2213	setState(ItemType);
2214	break;
2215	}
2216	default:
2217	{
2218	if(isOperatorKeyword(retval.type))
2219	setState(Default);
2220
2221	break;
2222	}
2223	};
2224
2225	return retval;
2226	}
2227	}
2228
2229	int XQueryTokenizer::commenceScanOnly()
2230	{
2231	m_scanOnly = true;
2232	return m_pos;
2233	}
2234
2235	void XQueryTokenizer::resumeTokenizationFrom(const int pos)
2236	{
2237	m_scanOnly = false;
2238	m_pos = pos;
2239	}
2240
2241	void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
2242	{
2243	}
2244
2245	#undef handleWhitespace
2246
2247	} // namespace QPatternist
2248
2249	QT_END_NAMESPACE

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/xmlpatterns/parser/qxquerytokenizer.cpp@ 490

Download in other formats: