1 | /****************************************************************************
|
---|
2 | ** $Id: qregexp.cpp 2 2005-11-16 15:49:26Z dmik $
|
---|
3 | **
|
---|
4 | ** Implementation of QRegExp class
|
---|
5 | **
|
---|
6 | ** Created : 950126
|
---|
7 | **
|
---|
8 | ** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
|
---|
9 | **
|
---|
10 | ** This file is part of the tools module of the Qt GUI Toolkit.
|
---|
11 | **
|
---|
12 | ** This file may be distributed under the terms of the Q Public License
|
---|
13 | ** as defined by Trolltech AS of Norway and appearing in the file
|
---|
14 | ** LICENSE.QPL included in the packaging of this file.
|
---|
15 | **
|
---|
16 | ** This file may be distributed and/or modified under the terms of the
|
---|
17 | ** GNU General Public License version 2 as published by the Free Software
|
---|
18 | ** Foundation and appearing in the file LICENSE.GPL included in the
|
---|
19 | ** packaging of this file.
|
---|
20 | **
|
---|
21 | ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
|
---|
22 | ** licenses may use this file in accordance with the Qt Commercial License
|
---|
23 | ** Agreement provided with the Software.
|
---|
24 | **
|
---|
25 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
|
---|
26 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
---|
27 | **
|
---|
28 | ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
|
---|
29 | ** information about Qt Commercial License Agreements.
|
---|
30 | ** See http://www.trolltech.com/qpl/ for QPL licensing information.
|
---|
31 | ** See http://www.trolltech.com/gpl/ for GPL licensing information.
|
---|
32 | **
|
---|
33 | ** Contact info@trolltech.com if any conditions of this licensing are
|
---|
34 | ** not clear to you.
|
---|
35 | **
|
---|
36 | **********************************************************************/
|
---|
37 |
|
---|
38 | #include "qregexp.h"
|
---|
39 |
|
---|
40 | #ifndef QT_NO_REGEXP
|
---|
41 |
|
---|
42 | #include "qmemarray.h"
|
---|
43 | #include "qbitarray.h"
|
---|
44 | #include "qcache.h"
|
---|
45 | #include "qcleanuphandler.h"
|
---|
46 | #include "qintdict.h"
|
---|
47 | #include "qmap.h"
|
---|
48 | #include "qptrvector.h"
|
---|
49 | #include "qstring.h"
|
---|
50 | #include "qtl.h"
|
---|
51 |
|
---|
52 | #ifdef QT_THREAD_SUPPORT
|
---|
53 | #include "qthreadstorage.h"
|
---|
54 | #endif // QT_THREAD_SUPPORT
|
---|
55 |
|
---|
56 | #undef QT_TRANSLATE_NOOP
|
---|
57 | #define QT_TRANSLATE_NOOP( context, sourceText ) sourceText
|
---|
58 |
|
---|
59 | #include <limits.h>
|
---|
60 |
|
---|
61 | // error strings for the regexp parser; QT_TRANSLATE_NOOP( context, sourceText ) is redefined above to expand to sourceText alone, so each RXERR_* macro is just its string literal (the "QRegExp" context argument is dropped at compile time)
|
---|
62 | #define RXERR_OK QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" )
|
---|
63 | #define RXERR_DISABLED QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" )
|
---|
64 | #define RXERR_CHARCLASS QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" )
|
---|
65 | #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" )
|
---|
66 | #define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" )
|
---|
67 | #define RXERR_OCTAL QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" )
|
---|
68 | #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" )
|
---|
69 | #define RXERR_END QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" )
|
---|
70 | #define RXERR_LIMIT QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" )
|
---|
71 |
|
---|
72 | /*
|
---|
73 | WARNING! Be sure to read qregexp.tex before modifying this file.
|
---|
74 | */
|
---|
75 |
|
---|
76 | /*!
|
---|
77 | \class QRegExp qregexp.h
|
---|
78 | \reentrant
|
---|
79 | \brief The QRegExp class provides pattern matching using regular expressions.
|
---|
80 |
|
---|
81 | \ingroup tools
|
---|
82 | \ingroup misc
|
---|
83 | \ingroup shared
|
---|
84 | \mainclass
|
---|
85 | \keyword regular expression
|
---|
86 |
|
---|
87 | Regular expressions, or "regexps", provide a way to find patterns
|
---|
88 | within text. This is useful in many contexts, for example:
|
---|
89 |
|
---|
90 | \table
|
---|
91 | \row \i Validation
|
---|
92 | \i A regexp can be used to check whether a piece of text
|
---|
93 | meets some criteria, e.g. is an integer or contains no
|
---|
94 | whitespace.
|
---|
95 | \row \i Searching
|
---|
96 | \i Regexps provide a much more powerful means of searching
|
---|
97 | text than simple string matching does. For example we can
|
---|
98 | create a regexp which says "find one of the words 'mail',
|
---|
99 | 'letter' or 'correspondence' but not any of the words
|
---|
100 | 'email', 'mailman', 'mailer', 'letterbox' etc."
|
---|
101 | \row \i Search and Replace
|
---|
102 | \i A regexp can be used to replace a pattern with a piece of
|
---|
103 | text, for example replace all occurrences of '&' with
|
---|
104 | '\&amp;' except where the '&' is already followed by 'amp;'.
|
---|
105 | \row \i String Splitting
|
---|
106 | \i A regexp can be used to identify where a string should be
|
---|
107 | split into its component fields, e.g. splitting tab-delimited
|
---|
108 | strings.
|
---|
109 | \endtable
|
---|
110 |
|
---|
111 | We present a very brief introduction to regexps, a description of
|
---|
112 | Qt's regexp language, some code examples, and finally the function
|
---|
113 | documentation itself. QRegExp is modeled on Perl's regexp
|
---|
114 | language, and also fully supports Unicode. QRegExp can also be
|
---|
115 | used in the weaker 'wildcard' (globbing) mode which works in a
|
---|
116 | similar way to command shells. A good text on regexps is \e
|
---|
117 | {Mastering Regular Expressions: Powerful Techniques for Perl and
|
---|
118 | Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
|
---|
119 |
|
---|
120 | Experienced regexp users may prefer to skip the introduction and
|
---|
121 | go directly to the relevant information.
|
---|
122 |
|
---|
123 | \tableofcontents
|
---|
124 |
|
---|
125 | \section1 Introduction
|
---|
126 |
|
---|
127 | Regexps are built up from expressions, quantifiers, and assertions.
|
---|
128 | The simplest form of expression is simply a character, e.g.
|
---|
129 | <b>x</b> or <b>5</b>. An expression can also be a set of
|
---|
130 | characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
|
---|
131 | a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
|
---|
132 | write this as <b>[A-D]</b>. If we want to match any of the
|
---|
133 | capital letters in the English alphabet we can write
|
---|
134 | <b>[A-Z]</b>. A quantifier tells the regexp engine how many
|
---|
135 | occurrences of the expression we want, e.g. <b>x{1,1}</b> means
|
---|
136 | match an <b>x</b> which occurs at least once and at most once.
|
---|
137 | We'll look at assertions and more complex expressions later.
|
---|
138 |
|
---|
139 | Note that in general regexps cannot be used to check for balanced
|
---|
140 | brackets or tags. For example if you want to match an opening html
|
---|
141 | \c <b> and its closing \c </b> you can only use a regexp if you
|
---|
142 | know that these tags are not nested; the html fragment, \c{<b>bold
|
---|
143 | <b>bolder</b></b>} will not match as expected. If you know the
|
---|
144 | maximum level of nesting it is possible to create a regexp that
|
---|
145 | will match correctly, but for an unknown level of nesting, regexps
|
---|
146 | will fail.
|
---|
147 |
|
---|
148 | We'll start by writing a regexp to match integers in the range 0
|
---|
149 | to 99. We will require at least one digit so we will start with
|
---|
150 | <b>[0-9]{1,1}</b> which means match a digit exactly once. This
|
---|
151 | regexp alone will match integers in the range 0 to 9. To match one
|
---|
152 | or two digits we can increase the maximum number of occurrences so
|
---|
153 | the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
|
---|
154 | least once and at most twice. However, this regexp as it stands
|
---|
155 | will not match correctly. This regexp will match one or two digits
|
---|
156 | \e within a string. To ensure that we match against the whole
|
---|
157 | string we must use the anchor assertions. We need <b>^</b> (caret)
|
---|
158 | which when it is the first character in the regexp means that the
|
---|
159 | regexp must match from the beginning of the string. And we also
|
---|
160 | need <b>$</b> (dollar) which when it is the last character in the
|
---|
161 | regexp means that the regexp must match until the end of the
|
---|
162 | string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
|
---|
163 | assertions, such as <b>^</b> and <b>$</b>, do not match any
|
---|
164 | characters.
|
---|
165 |
|
---|
166 | If you've seen regexps elsewhere they may have looked different from
|
---|
167 | the ones above. This is because some sets of characters and some
|
---|
168 | quantifiers are so common that they have special symbols to
|
---|
169 | represent them. <b>[0-9]</b> can be replaced with the symbol
|
---|
170 | <b>\d</b>. The quantifier to match exactly one occurrence,
|
---|
171 | <b>{1,1}</b>, can be replaced with the expression itself. This means
|
---|
172 | that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
|
---|
173 | to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
|
---|
174 | writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
|
---|
175 | string match a digit followed by zero or one digits. In practice
|
---|
176 | most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
|
---|
177 | shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
|
---|
178 | occurrences and a maximum of one occurrence. This is used to make an
|
---|
179 | expression optional. The regexp <b>^\d\d?$</b> means "from the
|
---|
180 | beginning of the string match one digit followed by zero or one
|
---|
181 | digits and then the end of the string".
|
---|
182 |
|
---|
183 | Our second example is matching the words 'mail', 'letter' or
|
---|
184 | 'correspondence' but without matching 'email', 'mailman',
|
---|
185 | 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
|
---|
186 | full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
|
---|
187 | each expression itself is automatically quantified by <b>{1,1}</b>
|
---|
188 | we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
|
---|
189 | followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
|
---|
190 | used for \e alternation, so our regexp now becomes
|
---|
191 | <b>mail|letter|correspondence</b> which means match 'mail' \e or
|
---|
192 | 'letter' \e or 'correspondence'. Whilst this regexp will find the
|
---|
193 | words we want it will also find words we don't want such as
|
---|
194 | 'email'. We will start by putting our regexp in parentheses,
|
---|
195 | <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
|
---|
196 | firstly they group expressions together and secondly they identify
|
---|
197 | parts of the regexp that we wish to \link #capturing-text capture
|
---|
198 | \endlink. Our regexp still matches any of the three words but now
|
---|
199 | they are grouped together as a unit. This is useful for building
|
---|
200 | up more complex regexps. It is also useful because it allows us to
|
---|
201 | examine which of the words actually matched. We need to use
|
---|
202 | another assertion, this time <b>\b</b> "word boundary":
|
---|
203 | <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
|
---|
204 | a word boundary followed by the expression in parentheses followed
|
---|
205 | by another word boundary". The <b>\b</b> assertion matches at a \e
|
---|
206 | position in the regexp not a \e character in the regexp. A word
|
---|
207 | boundary is any non-word character such as a space, a newline, or
|
---|
208 | the beginning or end of the string.
|
---|
209 |
|
---|
210 | For our third example we want to replace ampersands with the HTML
|
---|
211 | entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
|
---|
212 | match one ampersand. Unfortunately this will mess up our text if
|
---|
213 | some of the ampersands have already been turned into HTML
|
---|
214 | entities. So what we really want to say is replace an ampersand
|
---|
215 | providing it is not followed by 'amp;'. For this we need the
|
---|
216 | negative lookahead assertion and our regexp becomes:
|
---|
217 | <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
|
---|
218 | with '(?!' and finishes at the ')'. It means that the text it
|
---|
219 | contains, 'amp;' in our example, must \e not follow the expression
|
---|
220 | that precedes it.
|
---|
221 |
|
---|
222 | Regexps provide a rich language that can be used in a variety of
|
---|
223 | ways. For example suppose we want to count all the occurrences of
|
---|
224 | 'Eric' and 'Eirik' in a string. Two valid regexps to match these
|
---|
225 | are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
|
---|
226 | the word boundary '\b' so we don't get 'Ericsson' etc. The second
|
---|
227 | regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
|
---|
228 | and 'Eirik'.
|
---|
229 |
|
---|
230 | We will implement some of the examples above in the
|
---|
231 | \link #code-examples code examples \endlink section.
|
---|
232 |
|
---|
233 | \target characters-and-abbreviations-for-sets-of-characters
|
---|
234 | \section1 Characters and Abbreviations for Sets of Characters
|
---|
235 |
|
---|
236 | \table
|
---|
237 | \header \i Element \i Meaning
|
---|
238 | \row \i <b>c</b>
|
---|
239 | \i Any character represents itself unless it has a special
|
---|
240 | regexp meaning. Thus <b>c</b> matches the character \e c.
|
---|
241 | \row \i <b>\\c</b>
|
---|
242 | \i A character that follows a backslash matches the character
|
---|
243 | itself except where mentioned below. For example if you
|
---|
244 | wished to match a literal caret at the beginning of a string
|
---|
245 | you would write <b>\^</b>.
|
---|
246 | \row \i <b>\\a</b>
|
---|
247 | \i This matches the ASCII bell character (BEL, 0x07).
|
---|
248 | \row \i <b>\\f</b>
|
---|
249 | \i This matches the ASCII form feed character (FF, 0x0C).
|
---|
250 | \row \i <b>\\n</b>
|
---|
251 | \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
|
---|
252 | \row \i <b>\\r</b>
|
---|
253 | \i This matches the ASCII carriage return character (CR, 0x0D).
|
---|
254 | \row \i <b>\\t</b>
|
---|
255 | \i This matches the ASCII horizontal tab character (HT, 0x09).
|
---|
256 | \row \i <b>\\v</b>
|
---|
257 | \i This matches the ASCII vertical tab character (VT, 0x0B).
|
---|
258 | \row \i <b>\\xhhhh</b>
|
---|
259 | \i This matches the Unicode character corresponding to the
|
---|
260 | hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
|
---|
261 | (i.e., \zero ooo) matches the ASCII/Latin-1 character
|
---|
262 | corresponding to the octal number ooo (between 0 and 0377).
|
---|
263 | \row \i <b>. (dot)</b>
|
---|
264 | \i This matches any character (including newline).
|
---|
265 | \row \i <b>\\d</b>
|
---|
266 | \i This matches a digit (QChar::isDigit()).
|
---|
267 | \row \i <b>\\D</b>
|
---|
268 | \i This matches a non-digit.
|
---|
269 | \row \i <b>\\s</b>
|
---|
270 | \i This matches a whitespace (QChar::isSpace()).
|
---|
271 | \row \i <b>\\S</b>
|
---|
272 | \i This matches a non-whitespace.
|
---|
273 | \row \i <b>\\w</b>
|
---|
274 | \i This matches a word character (QChar::isLetterOrNumber() or '_').
|
---|
275 | \row \i <b>\\W</b>
|
---|
276 | \i This matches a non-word character.
|
---|
277 | \row \i <b>\\n</b>
|
---|
278 | \i The n-th \link #capturing-text backreference \endlink,
|
---|
279 | e.g. \1, \2, etc.
|
---|
280 | \endtable
|
---|
281 |
|
---|
282 | \e {Note that the C++ compiler transforms backslashes in strings
|
---|
283 | so to include a <b>\\</b> in a regexp you will need to enter it
|
---|
284 | twice, i.e. <b>\\\\</b>.}
|
---|
285 |
|
---|
286 | \target sets-of-characters
|
---|
287 | \section1 Sets of Characters
|
---|
288 |
|
---|
289 | Square brackets are used to match any character in the set of
|
---|
290 | characters contained within the square brackets. All the character
|
---|
291 | set abbreviations described above can be used within square
|
---|
292 | brackets. Apart from the character set abbreviations and the
|
---|
293 | following two exceptions no characters have special meanings in
|
---|
294 | square brackets.
|
---|
295 |
|
---|
296 | \table
|
---|
297 | \row \i <b>^</b>
|
---|
298 | \i The caret negates the character set if it occurs as the
|
---|
299 | first character, i.e. immediately after the opening square
|
---|
300 | bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
|
---|
301 | but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
|
---|
302 | 'c'.
|
---|
303 | \row \i <b>-</b>
|
---|
304 | \i The dash is used to indicate a range of characters, for
|
---|
305 | example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
|
---|
306 | \endtable
|
---|
307 |
|
---|
308 | Using the predefined character set abbreviations is more portable
|
---|
309 | than using character ranges across platforms and languages. For
|
---|
310 | example, <b>[0-9]</b> matches a digit in Western alphabets but
|
---|
311 | <b>\d</b> matches a digit in \e any alphabet.
|
---|
312 |
|
---|
313 | Note that in most regexp literature sets of characters are called
|
---|
314 | "character classes".
|
---|
315 |
|
---|
316 | \target quantifiers
|
---|
317 | \section1 Quantifiers
|
---|
318 |
|
---|
319 | By default an expression is automatically quantified by
|
---|
320 | <b>{1,1}</b>, i.e. it should occur exactly once. In the following
|
---|
321 | list <b>\e {E}</b> stands for any expression. An expression is a
|
---|
322 | character or an abbreviation for a set of characters or a set of
|
---|
323 | characters in square brackets or any parenthesised expression.
|
---|
324 |
|
---|
325 | \table
|
---|
326 | \row \i <b>\e {E}?</b>
|
---|
327 | \i Matches zero or one occurrence of \e E. This quantifier
|
---|
328 | means "the previous expression is optional" since it will
|
---|
329 | match whether or not the expression occurs in the string. It
|
---|
330 | is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
|
---|
331 | will match 'dent' and 'dents'.
|
---|
332 |
|
---|
333 | \row \i <b>\e {E}+</b>
|
---|
334 | \i Matches one or more occurrences of \e E. This is the same
|
---|
335 | as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
|
---|
336 | '0', '00', '000', etc.
|
---|
337 |
|
---|
338 | \row \i <b>\e {E}*</b>
|
---|
339 | \i Matches zero or more occurrences of \e E. This is the same
|
---|
340 | as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
|
---|
341 | used by mistake. Since it matches \e zero or more
|
---|
342 | occurrences it will match no occurrences at all. For example
|
---|
343 | if we want to match strings that end in whitespace and use
|
---|
344 | the regexp <b>\s*$</b> we would get a match on every string.
|
---|
345 | This is because we have said find zero or more whitespace
|
---|
346 | followed by the end of string, so even strings that don't end
|
---|
347 | in whitespace will match. The regexp we want in this case is
|
---|
348 | <b>\s+$</b> to match strings that have at least one
|
---|
349 | whitespace at the end.
|
---|
350 |
|
---|
351 | \row \i <b>\e {E}{n}</b>
|
---|
352 | \i Matches exactly \e n occurrences of the expression. This
|
---|
353 | is the same as repeating the expression \e n times. For
|
---|
354 | example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
|
---|
355 | the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
|
---|
356 |
|
---|
357 | \row \i <b>\e {E}{n,}</b>
|
---|
358 | \i Matches at least \e n occurrences of the expression. This
|
---|
359 | is the same as <b>\e {E}{n,MAXINT}</b>.
|
---|
360 |
|
---|
361 | \row \i <b>\e {E}{,m}</b>
|
---|
362 | \i Matches at most \e m occurrences of the expression. This
|
---|
363 | is the same as <b>\e {E}{0,m}</b>.
|
---|
364 |
|
---|
365 | \row \i <b>\e {E}{n,m}</b>
|
---|
366 | \i Matches at least \e n occurrences of the expression and at
|
---|
367 | most \e m occurrences of the expression.
|
---|
368 | \endtable
|
---|
369 |
|
---|
370 | (MAXINT is implementation dependent but will not be smaller than
|
---|
371 | 1024.)
|
---|
372 |
|
---|
373 | If we wish to apply a quantifier to more than just the preceding
|
---|
374 | character we can use parentheses to group characters together in
|
---|
375 | an expression. For example, <b>tag+</b> matches a 't' followed by
|
---|
376 | an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
|
---|
377 | at least one occurrence of 'tag'.
|
---|
378 |
|
---|
379 | Note that quantifiers are "greedy". They will match as much text
|
---|
380 | as they can. For example, <b>0+</b> will match as many zeros as it
|
---|
381 | can from the first zero it finds, e.g. '2.<u>000</u>5'.
|
---|
382 | Quantifiers can be made non-greedy, see setMinimal().
|
---|
383 |
|
---|
384 | \target capturing-text
|
---|
385 | \section1 Capturing Text
|
---|
386 |
|
---|
387 | Parentheses allow us to group elements together so that we can
|
---|
388 | quantify and capture them. For example if we have the expression
|
---|
389 | <b>mail|letter|correspondence</b> that matches a string we know
|
---|
390 | that \e one of the words matched but not which one. Using
|
---|
391 | parentheses allows us to "capture" whatever is matched within
|
---|
392 | their bounds, so if we used <b>(mail|letter|correspondence)</b>
|
---|
393 | and matched this regexp against the string "I sent you some email"
|
---|
394 | we can use the cap() or capturedTexts() functions to extract the
|
---|
395 | matched characters, in this case 'mail'.
|
---|
396 |
|
---|
397 | We can use captured text within the regexp itself. To refer to the
|
---|
398 | captured text we use \e backreferences which are indexed from 1,
|
---|
399 | the same as for cap(). For example we could search for duplicate
|
---|
400 | words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
|
---|
401 | word boundary followed by one or more word characters followed by
|
---|
402 | one or more non-word characters followed by the same text as the
|
---|
403 | first parenthesised expression followed by a word boundary.
|
---|
404 |
|
---|
405 | If we want to use parentheses purely for grouping and not for
|
---|
406 | capturing we can use the non-capturing syntax, e.g.
|
---|
407 | <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
|
---|
408 | end ')'. In this example we match either 'green' or 'blue' but we
|
---|
409 | do not capture the match so we only know whether or not we matched
|
---|
410 | but not which color we actually found. Using non-capturing
|
---|
411 | parentheses is more efficient than using capturing parentheses
|
---|
412 | since the regexp engine has to do less book-keeping.
|
---|
413 |
|
---|
414 | Both capturing and non-capturing parentheses may be nested.
|
---|
415 |
|
---|
416 | \target assertions
|
---|
417 | \section1 Assertions
|
---|
418 |
|
---|
419 | Assertions make some statement about the text at the point where
|
---|
420 | they occur in the regexp but they do not match any characters. In
|
---|
421 | the following list <b>\e {E}</b> stands for any expression.
|
---|
422 |
|
---|
423 | \table
|
---|
424 | \row \i <b>^</b>
|
---|
425 | \i The caret signifies the beginning of the string. If you
|
---|
426 | wish to match a literal \c{^} you must escape it by
|
---|
427 | writing \c{\\^}. For example, <b>^#include</b> will only
|
---|
428 | match strings which \e begin with the characters '#include'.
|
---|
429 | (When the caret is the first character of a character set it
|
---|
430 | has a special meaning, see \link #sets-of-characters Sets of
|
---|
431 | Characters \endlink.)
|
---|
432 |
|
---|
433 | \row \i <b>$</b>
|
---|
434 | \i The dollar signifies the end of the string. For example
|
---|
435 | <b>\d\s*$</b> will match strings which end with a digit
|
---|
436 | optionally followed by whitespace. If you wish to match a
|
---|
437 | literal \c{$} you must escape it by writing
|
---|
438 | \c{\\$}.
|
---|
439 |
|
---|
440 | \row \i <b>\\b</b>
|
---|
441 | \i A word boundary. For example the regexp
|
---|
442 | <b>\\bOK\\b</b> means match immediately after a word
|
---|
443 | boundary (e.g. start of string or whitespace) the letter 'O'
|
---|
444 | then the letter 'K' immediately before another word boundary
|
---|
445 | (e.g. end of string or whitespace). But note that the
|
---|
446 | assertion does not actually match any whitespace so if we
|
---|
447 | write <b>(\\bOK\\b)</b> and we have a match it will only
|
---|
448 | contain 'OK' even if the string is "Its <u>OK</u> now".
|
---|
449 |
|
---|
450 | \row \i <b>\\B</b>
|
---|
451 | \i A non-word boundary. This assertion is true wherever
|
---|
452 | <b>\\b</b> is false. For example if we searched for
|
---|
453 | <b>\\Bon\\B</b> in "Left on" the match would fail (space
|
---|
454 | and end of string aren't non-word boundaries), but it would
|
---|
455 | match in "t<u>on</u>ne".
|
---|
456 |
|
---|
457 | \row \i <b>(?=\e E)</b>
|
---|
458 | \i Positive lookahead. This assertion is true if the
|
---|
459 | expression matches at this point in the regexp. For example,
|
---|
460 | <b>const(?=\\s+char)</b> matches 'const' whenever it is
|
---|
461 | followed by 'char', as in 'static <u>const</u> char *'.
|
---|
462 | (Compare with <b>const\\s+char</b>, which matches 'static
|
---|
463 | <u>const char</u> *'.)
|
---|
464 |
|
---|
465 | \row \i <b>(?!\e E)</b>
|
---|
466 | \i Negative lookahead. This assertion is true if the
|
---|
467 | expression does not match at this point in the regexp. For
|
---|
468 | example, <b>const(?!\\s+char)</b> matches 'const' \e except
|
---|
469 | when it is followed by 'char'.
|
---|
470 | \endtable
|
---|
471 |
|
---|
472 | \target wildcard-matching
|
---|
473 | \section1 Wildcard Matching (globbing)
|
---|
474 |
|
---|
475 | Most command shells such as \e bash or \e cmd.exe support "file
|
---|
476 | globbing", the ability to identify a group of files by using
|
---|
477 | wildcards. The setWildcard() function is used to switch between
|
---|
478 | regexp and wildcard mode. Wildcard matching is much simpler than
|
---|
479 | full regexps and has only four features:
|
---|
480 |
|
---|
481 | \table
|
---|
482 | \row \i <b>c</b>
|
---|
483 | \i Any character represents itself apart from those mentioned
|
---|
484 | below. Thus <b>c</b> matches the character \e c.
|
---|
485 | \row \i <b>?</b>
|
---|
486 | \i This matches any single character. It is the same as
|
---|
487 | <b>.</b> in full regexps.
|
---|
488 | \row \i <b>*</b>
|
---|
489 | \i This matches zero or more of any characters. It is the
|
---|
490 | same as <b>.*</b> in full regexps.
|
---|
491 | \row \i <b>[...]</b>
|
---|
492 | \i Sets of characters can be represented in square brackets,
|
---|
493 | similar to full regexps. Within the character class, like
|
---|
494 | outside, backslash has no special meaning.
|
---|
495 | \endtable
|
---|
496 |
|
---|
497 | For example if we are in wildcard mode and have strings which
|
---|
498 | contain filenames we could identify HTML files with <b>*.html</b>.
|
---|
499 | This will match zero or more characters followed by a dot followed
|
---|
500 | by 'h', 't', 'm' and 'l'.
|
---|
501 |
|
---|
502 | \target perl-users
|
---|
503 | \section1 Notes for Perl Users
|
---|
504 |
|
---|
505 | Most of the character class abbreviations supported by Perl are
|
---|
506 | supported by QRegExp, see \link
|
---|
507 | #characters-and-abbreviations-for-sets-of-characters characters
|
---|
508 | and abbreviations for sets of characters \endlink.
|
---|
509 |
|
---|
510 | In QRegExp, apart from within character classes, \c{^} always
|
---|
511 | signifies the start of the string, so carets must always be
|
---|
512 | escaped unless used for that purpose. In Perl the meaning of caret
|
---|
513 | varies automagically depending on where it occurs so escaping it
|
---|
514 | is rarely necessary. The same applies to \c{$} which in
|
---|
515 | QRegExp always signifies the end of the string.
|
---|
516 |
|
---|
517 | QRegExp's quantifiers are the same as Perl's greedy quantifiers.
|
---|
518 | Non-greedy matching cannot be applied to individual quantifiers,
|
---|
519 | but can be applied to all the quantifiers in the pattern. For
|
---|
520 | example, to match the Perl regexp <b>ro+?m</b> requires:
|
---|
521 | \code
|
---|
522 | QRegExp rx( "ro+m" );
|
---|
523 | rx.setMinimal( TRUE );
|
---|
524 | \endcode
|
---|
525 |
|
---|
526 | The equivalent of Perl's \c{/i} option is
|
---|
527 | setCaseSensitive(FALSE).
|
---|
528 |
|
---|
529 | Perl's \c{/g} option can be emulated using a \link
|
---|
530 | #cap_in_a_loop loop \endlink.
|
---|
531 |
|
---|
532 | In QRegExp <b>.</b> matches any character, therefore all QRegExp
|
---|
533 | regexps have the equivalent of Perl's \c{/s} option. QRegExp
|
---|
534 | does not have an equivalent to Perl's \c{/m} option, but this
|
---|
535 | can be emulated in various ways for example by splitting the input
|
---|
536 | into lines or by looping with a regexp that searches for newlines.
|
---|
537 |
|
---|
538 | Because QRegExp is string oriented there are no \A, \Z or \z
|
---|
539 | assertions. The \G assertion is not supported but can be emulated
|
---|
540 | in a loop.
|
---|
541 |
|
---|
542 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
|
---|
543 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
|
---|
544 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or
|
---|
545 | capturedTexts()[2], etc.
|
---|
546 |
|
---|
547 | To substitute a pattern use QString::replace().
|
---|
548 |
|
---|
549 | Perl's extended \c{/x} syntax is not supported, nor are
|
---|
550 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
|
---|
551 | the other hand, C++'s rules for literal strings can be used to
|
---|
552 | achieve the same:
|
---|
553 | \code
|
---|
554 | QRegExp mark( "\\b" // word boundary
|
---|
555 | "[Mm]ark" // the word we want to match
|
---|
556 | );
|
---|
557 | \endcode
|
---|
558 |
|
---|
559 | Both zero-width positive and zero-width negative lookahead
|
---|
560 | assertions (?=pattern) and (?!pattern) are supported with the same
|
---|
561 | syntax as Perl. Perl's lookbehind assertions, "independent"
|
---|
562 | subexpressions and conditional expressions are not supported.
|
---|
563 |
|
---|
564 | Non-capturing parentheses are also supported, with the same
|
---|
565 | (?:pattern) syntax.
|
---|
566 |
|
---|
567 | See QStringList::split() and QStringList::join() for equivalents
|
---|
568 | to Perl's split and join functions.
|
---|
569 |
|
---|
570 | Note: because C++ transforms \\'s they must be written \e twice in
|
---|
571 | code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
|
---|
572 |
|
---|
573 | \target code-examples
|
---|
574 | \section1 Code Examples
|
---|
575 |
|
---|
576 | \code
|
---|
577 | QRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99
|
---|
578 | rx.search( "123" ); // returns -1 (no match)
|
---|
579 | rx.search( "-6" ); // returns -1 (no match)
|
---|
580 | rx.search( "6" ); // returns 0 (matched at position 0)
|
---|
581 | \endcode
|
---|
582 |
|
---|
583 | The third string matches '<u>6</u>'. This is a simple validation
|
---|
584 | regexp for integers in the range 0 to 99.
|
---|
585 |
|
---|
586 | \code
|
---|
587 | QRegExp rx( "^\\S+$" ); // match strings without whitespace
|
---|
588 | rx.search( "Hello world" ); // returns -1 (no match)
|
---|
589 | rx.search( "This_is-OK" ); // returns 0 (matched at position 0)
|
---|
590 | \endcode
|
---|
591 |
|
---|
592 | The second string matches '<u>This_is-OK</u>'. We've used the
|
---|
593 | character set abbreviation '\S' (non-whitespace) and the anchors
|
---|
594 | to match strings which contain no whitespace.
|
---|
595 |
|
---|
596 | In the following example we match strings containing 'mail' or
|
---|
597 | 'letter' or 'correspondence' but only match whole words i.e. not
|
---|
598 | 'email'
|
---|
599 |
|
---|
600 | \code
|
---|
601 | QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
|
---|
602 | rx.search( "I sent you an email" ); // returns -1 (no match)
|
---|
603 | rx.search( "Please write the letter" ); // returns 17
|
---|
604 | \endcode
|
---|
605 |
|
---|
606 | The second string matches "Please write the <u>letter</u>". The
|
---|
607 | word 'letter' is also captured (because of the parentheses). We
|
---|
608 | can see what text we've captured like this:
|
---|
609 |
|
---|
610 | \code
|
---|
611 | QString captured = rx.cap( 1 ); // captured == "letter"
|
---|
612 | \endcode
|
---|
613 |
|
---|
614 | This will capture the text from the first set of capturing
|
---|
615 | parentheses (counting capturing left parentheses from left to
|
---|
616 | right). The parentheses are counted from 1 since cap( 0 ) is the
|
---|
617 | whole matched regexp (equivalent to '&' in most regexp engines).
|
---|
618 |
|
---|
619 | \code
|
---|
620 | QRegExp rx( "&(?!amp;)" ); // match ampersands but not &amp;
|
---|
621 | QString line1 = "This & that";
|
---|
622 | line1.replace( rx, "&amp;" );
|
---|
623 | // line1 == "This &amp; that"
|
---|
624 | QString line2 = "His &amp; hers & theirs";
|
---|
625 | line2.replace( rx, "&amp;" );
|
---|
626 | // line2 == "His &amp; hers &amp; theirs"
|
---|
627 | \endcode
|
---|
628 |
|
---|
629 | Here we've passed the QRegExp to QString's replace() function to
|
---|
630 | replace the matched text with new text.
|
---|
631 |
|
---|
632 | \code
|
---|
633 | QString str = "One Eric another Eirik, and an Ericsson."
|
---|
634 | " How many Eiriks, Eric?";
|
---|
635 | QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
|
---|
636 | int pos = 0; // where we are in the string
|
---|
637 | int count = 0; // how many Eric and Eirik's we've counted
|
---|
638 | while ( pos >= 0 ) {
|
---|
639 | pos = rx.search( str, pos );
|
---|
640 | if ( pos >= 0 ) {
|
---|
641 | pos++; // move along in str
|
---|
642 | count++; // count our Eric or Eirik
|
---|
643 | }
|
---|
644 | }
|
---|
645 | \endcode
|
---|
646 |
|
---|
647 | We've used the search() function to repeatedly match the regexp in
|
---|
648 | the string. Note that instead of moving forward by one character
|
---|
649 | at a time \c pos++ we could have written \c {pos +=
|
---|
650 | rx.matchedLength()} to skip over the already matched string. The
|
---|
651 | count will equal 3, matching 'One <u>Eric</u> another
|
---|
652 | <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
|
---|
653 | doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
|
---|
654 | by word boundaries.
|
---|
655 |
|
---|
656 | One common use of regexps is to split lines of delimited data into
|
---|
657 | their component fields.
|
---|
658 |
|
---|
659 | \code
|
---|
660 | str = "Trolltech AS\twww.trolltech.com\tNorway";
|
---|
661 | QString company, web, country;
|
---|
662 | rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
|
---|
663 | if ( rx.search( str ) != -1 ) {
|
---|
664 | company = rx.cap( 1 );
|
---|
665 | web = rx.cap( 2 );
|
---|
666 | country = rx.cap( 3 );
|
---|
667 | }
|
---|
668 | \endcode
|
---|
669 |
|
---|
670 | In this example our input lines have the format company name, web
|
---|
671 | address and country. Unfortunately the regexp is rather long and
|
---|
672 | not very versatile -- the code will break if we add any more
|
---|
673 | fields. A simpler and better solution is to look for the
|
---|
674 | separator, '\t' in this case, and take the surrounding text. The
|
---|
675 | QStringList split() function can take a separator string or regexp
|
---|
676 | as an argument and split a string accordingly.
|
---|
677 |
|
---|
678 | \code
|
---|
679 | QStringList field = QStringList::split( "\t", str );
|
---|
680 | \endcode
|
---|
681 |
|
---|
682 | Here field[0] is the company, field[1] the web address and so on.
|
---|
683 |
|
---|
684 | To imitate the matching of a shell we can use wildcard mode.
|
---|
685 |
|
---|
686 | \code
|
---|
687 | QRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything
|
---|
688 | rx.setWildcard( TRUE ); // now it's a valid wildcard regexp
|
---|
689 | rx.exactMatch( "index.html" ); // returns TRUE
|
---|
690 | rx.exactMatch( "default.htm" ); // returns FALSE
|
---|
691 | rx.exactMatch( "readme.txt" ); // returns FALSE
|
---|
692 | \endcode
|
---|
693 |
|
---|
694 | Wildcard matching can be convenient because of its simplicity, but
|
---|
695 | any wildcard regexp can be defined using full regexps, e.g.
|
---|
696 | <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
|
---|
697 | .htm files with a wildcard unless we use <b>*.htm*</b> which will
|
---|
698 | also match 'test.html.bak'. A full regexp gives us the precision
|
---|
699 | we need, <b>.*\\.html?$</b>.
|
---|
700 |
|
---|
701 | QRegExp can match case insensitively using setCaseSensitive(), and
|
---|
702 | can use non-greedy matching, see setMinimal(). By default QRegExp
|
---|
703 | uses full regexps but this can be changed with setWildcard().
|
---|
704 | Searching can be forward with search() or backward with
|
---|
705 | searchRev(). Captured text can be accessed using capturedTexts()
|
---|
706 | which returns a string list of all captured strings, or using
|
---|
707 | cap() which returns the captured string for the given index. The
|
---|
708 | pos() function takes a match index and returns the position in the
|
---|
709 | string where the match was made (or -1 if there was no match).
|
---|
710 |
|
---|
711 | \sa QRegExpValidator QString QStringList
|
---|
712 |
|
---|
713 | \target member-function-documentation
|
---|
714 | */
|
---|
715 |
|
---|
/*
  Number of buckets characters are hashed into by BadChar().
*/
const int NumBadChars = 64;

/*
  Maps a character (anything providing unicode()) to its bucket in
  the range 0 to NumBadChars - 1.
*/
#define BadChar( ch ) ( (ch).unicode() % NumBadChars )

/*
  Sentinel values. The first three share the largest int value;
  InftyLen is the "possibly infinite" length referred to by Box::maxl.
  NOTE(review): the exact roles of NoOccurrence, EmptyCapture,
  InftyRep and EOS are defined by code outside this section.
*/
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025;
const int EOS = -1;
|
---|
724 |
|
---|
725 | static bool isWord( QChar ch )
|
---|
726 | {
|
---|
727 | return ch.isLetterOrNumber() || ch == QChar( '_' );
|
---|
728 | }
|
---|
729 |
|
---|
730 | /*
|
---|
731 | Merges two QMemArrays of ints and puts the result into the first
|
---|
732 | one.
|
---|
733 | */
|
---|
734 | static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b )
|
---|
735 | {
|
---|
736 | int asize = a->size();
|
---|
737 | int bsize = b.size();
|
---|
738 | if ( asize == 0 ) {
|
---|
739 | *a = b.copy();
|
---|
740 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
741 | } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
|
---|
742 | a->resize( asize + 1 );
|
---|
743 | (*a)[asize] = b[0];
|
---|
744 | #endif
|
---|
745 | } else if ( bsize >= 1 ) {
|
---|
746 | int csize = asize + bsize;
|
---|
747 | QMemArray<int> c( csize );
|
---|
748 | int i = 0, j = 0, k = 0;
|
---|
749 | while ( i < asize ) {
|
---|
750 | if ( j < bsize ) {
|
---|
751 | if ( (*a)[i] == b[j] ) {
|
---|
752 | i++;
|
---|
753 | csize--;
|
---|
754 | } else if ( (*a)[i] < b[j] ) {
|
---|
755 | c[k++] = (*a)[i++];
|
---|
756 | } else {
|
---|
757 | c[k++] = b[j++];
|
---|
758 | }
|
---|
759 | } else {
|
---|
760 | memcpy( c.data() + k, (*a).data() + i,
|
---|
761 | (asize - i) * sizeof(int) );
|
---|
762 | break;
|
---|
763 | }
|
---|
764 | }
|
---|
765 | c.resize( csize );
|
---|
766 | if ( j < bsize )
|
---|
767 | memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
|
---|
768 | *a = c;
|
---|
769 | }
|
---|
770 | }
|
---|
771 |
|
---|
772 | /*
|
---|
773 | Merges two disjoint QMaps of (int, int) pairs and puts the result
|
---|
774 | into the first one.
|
---|
775 | */
|
---|
776 | static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
|
---|
777 | {
|
---|
778 | QMap<int, int>::ConstIterator it;
|
---|
779 | for ( it = b.begin(); it != b.end(); ++it )
|
---|
780 | a->insert( it.key(), *it );
|
---|
781 | }
|
---|
782 |
|
---|
783 | /*
|
---|
784 | Returns the value associated to key k in QMap m of (int, int)
|
---|
785 | pairs, or 0 if no such value is explicitly present.
|
---|
786 | */
|
---|
787 | static int at( const QMap<int, int>& m, int k )
|
---|
788 | {
|
---|
789 | QMap<int, int>::ConstIterator it = m.find( k );
|
---|
790 | if ( it == m.end() )
|
---|
791 | return 0;
|
---|
792 | else
|
---|
793 | return *it;
|
---|
794 | }
|
---|
795 |
|
---|
796 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
797 | /*
|
---|
798 | Translates a wildcard pattern to an equivalent regular expression
|
---|
799 | pattern (e.g., *.cpp to .*\.cpp).
|
---|
800 | */
|
---|
801 | static QString wc2rx( const QString& wc_str )
|
---|
802 | {
|
---|
803 | int wclen = wc_str.length();
|
---|
804 | QString rx = QString::fromLatin1( "" );
|
---|
805 | int i = 0;
|
---|
806 | const QChar *wc = wc_str.unicode();
|
---|
807 | while ( i < wclen ) {
|
---|
808 | QChar c = wc[i++];
|
---|
809 | switch ( c.unicode() ) {
|
---|
810 | case '*':
|
---|
811 | rx += QString::fromLatin1( ".*" );
|
---|
812 | break;
|
---|
813 | case '?':
|
---|
814 | rx += QChar( '.' );
|
---|
815 | break;
|
---|
816 | case '$':
|
---|
817 | case '(':
|
---|
818 | case ')':
|
---|
819 | case '+':
|
---|
820 | case '.':
|
---|
821 | case '\\':
|
---|
822 | case '^':
|
---|
823 | case '{':
|
---|
824 | case '|':
|
---|
825 | case '}':
|
---|
826 | rx += QChar( '\\' );
|
---|
827 | rx += c;
|
---|
828 | break;
|
---|
829 | case '[':
|
---|
830 | rx += c;
|
---|
831 | if ( wc[i] == QChar('^') )
|
---|
832 | rx += wc[i++];
|
---|
833 | if ( i < wclen ) {
|
---|
834 | if ( rx[i] == ']' )
|
---|
835 | rx += wc[i++];
|
---|
836 | while ( i < wclen && wc[i] != QChar(']') ) {
|
---|
837 | if ( wc[i] == '\\' )
|
---|
838 | rx += QChar( '\\' );
|
---|
839 | rx += wc[i++];
|
---|
840 | }
|
---|
841 | }
|
---|
842 | break;
|
---|
843 | default:
|
---|
844 | rx += c;
|
---|
845 | }
|
---|
846 | }
|
---|
847 | return rx;
|
---|
848 | }
|
---|
849 | #endif
|
---|
850 |
|
---|
851 | /*
|
---|
852 | The class QRegExpEngine encapsulates a modified nondeterministic
|
---|
853 | finite automaton (NFA).
|
---|
854 | */
|
---|
855 | class QRegExpEngine : public QShared
|
---|
856 | {
|
---|
857 | public:
|
---|
858 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
859 | /*
|
---|
860 | The class CharClass represents a set of characters, such as can
|
---|
861 | be found in regular expressions (e.g., [a-z] denotes the set
|
---|
862 | {a, b, ..., z}).
|
---|
863 | */
|
---|
864 | class CharClass
|
---|
865 | {
|
---|
866 | public:
|
---|
867 | CharClass();
|
---|
868 | CharClass( const CharClass& cc ) { operator=( cc ); }
|
---|
869 |
|
---|
870 | CharClass& operator=( const CharClass& cc );
|
---|
871 |
|
---|
872 | void clear();
|
---|
873 | bool negative() const { return n; }
|
---|
874 | void setNegative( bool negative );
|
---|
875 | void addCategories( int cats );
|
---|
876 | void addRange( ushort from, ushort to );
|
---|
877 | void addSingleton( ushort ch ) { addRange( ch, ch ); }
|
---|
878 |
|
---|
879 | bool in( QChar ch ) const;
|
---|
880 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
881 | const QMemArray<int>& firstOccurrence() const { return occ1; }
|
---|
882 | #endif
|
---|
883 |
|
---|
884 | #if defined(QT_DEBUG)
|
---|
885 | void dump() const;
|
---|
886 | #endif
|
---|
887 |
|
---|
888 | private:
|
---|
889 | /*
|
---|
890 | The struct Range represents a range of characters (e.g.,
|
---|
891 | [0-9] denotes range 48 to 57).
|
---|
892 | */
|
---|
893 | struct Range
|
---|
894 | {
|
---|
895 | ushort from; // 48
|
---|
896 | ushort to; // 57
|
---|
897 | };
|
---|
898 |
|
---|
899 | int c; // character classes
|
---|
900 | QMemArray<Range> r; // character ranges
|
---|
901 | bool n; // negative?
|
---|
902 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
903 | QMemArray<int> occ1; // first-occurrence array
|
---|
904 | #endif
|
---|
905 | };
|
---|
906 | #else
|
---|
907 | struct CharClass
|
---|
908 | {
|
---|
909 | int dummy;
|
---|
910 |
|
---|
911 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
912 | CharClass() { occ1.fill( 0, NumBadChars ); }
|
---|
913 |
|
---|
914 | const QMemArray<int>& firstOccurrence() const { return occ1; }
|
---|
915 | QMemArray<int> occ1;
|
---|
916 | #endif
|
---|
917 | };
|
---|
918 | #endif
|
---|
919 |
|
---|
920 | QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
|
---|
921 | QRegExpEngine( const QString& rx, bool caseSensitive );
|
---|
922 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
923 | ~QRegExpEngine();
|
---|
924 | #endif
|
---|
925 |
|
---|
926 | bool isValid() const { return valid; }
|
---|
927 | bool caseSensitive() const { return cs; }
|
---|
928 | const QString& errorString() const { return yyError; }
|
---|
929 | int numCaptures() const { return officialncap; }
|
---|
930 | void match( const QString& str, int pos, bool minimal, bool oneTest,
|
---|
931 | int caretIndex, QMemArray<int>& captured );
|
---|
932 | int partialMatchLength() const { return mmOneTestMatchedLen; }
|
---|
933 |
|
---|
934 | int createState( QChar ch );
|
---|
935 | int createState( const CharClass& cc );
|
---|
936 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
937 | int createState( int bref );
|
---|
938 | #endif
|
---|
939 |
|
---|
940 | void addCatTransitions( const QMemArray<int>& from,
|
---|
941 | const QMemArray<int>& to );
|
---|
942 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
943 | void addPlusTransitions( const QMemArray<int>& from,
|
---|
944 | const QMemArray<int>& to, int atom );
|
---|
945 | #endif
|
---|
946 |
|
---|
947 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
948 | int anchorAlternation( int a, int b );
|
---|
949 | int anchorConcatenation( int a, int b );
|
---|
950 | #else
|
---|
951 | int anchorAlternation( int a, int b ) { return a & b; }
|
---|
952 | int anchorConcatenation( int a, int b ) { return a | b; }
|
---|
953 | #endif
|
---|
954 | void addAnchors( int from, int to, int a );
|
---|
955 |
|
---|
956 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
957 | void heuristicallyChooseHeuristic();
|
---|
958 | #endif
|
---|
959 |
|
---|
960 | #if defined(QT_DEBUG)
|
---|
961 | void dump() const;
|
---|
962 | #endif
|
---|
963 |
|
---|
964 | private:
|
---|
965 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
|
---|
966 |
|
---|
967 | /*
|
---|
968 | The struct State represents one state in a modified NFA. The
|
---|
969 | input characters matched are stored in the state instead of on
|
---|
970 | the transitions, something possible for an automaton
|
---|
971 | constructed from a regular expression.
|
---|
972 | */
|
---|
973 | struct State
|
---|
974 | {
|
---|
975 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
976 | int atom; // which atom does this state belong to?
|
---|
977 | #endif
|
---|
978 | int match; // what does it match? (see CharClassBit and BackRefBit)
|
---|
979 | QMemArray<int> outs; // out-transitions
|
---|
980 | QMap<int, int> *reenter; // atoms reentered when transiting out
|
---|
981 | QMap<int, int> *anchors; // anchors met when transiting out
|
---|
982 |
|
---|
983 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
984 | State( int a, int m )
|
---|
985 | : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
|
---|
986 | #else
|
---|
987 | State( int m )
|
---|
988 | : match( m ), reenter( 0 ), anchors( 0 ) { }
|
---|
989 | #endif
|
---|
990 | ~State() { delete reenter; delete anchors; }
|
---|
991 | };
|
---|
992 |
|
---|
993 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
994 | /*
|
---|
995 | The struct Lookahead represents a lookahead a la Perl (e.g.,
|
---|
996 | (?=foo) and (?!bar)).
|
---|
997 | */
|
---|
998 | struct Lookahead
|
---|
999 | {
|
---|
1000 | QRegExpEngine *eng; // NFA representing the embedded regular expression
|
---|
1001 | bool neg; // negative lookahead?
|
---|
1002 |
|
---|
1003 | Lookahead( QRegExpEngine *eng0, bool neg0 )
|
---|
1004 | : eng( eng0 ), neg( neg0 ) { }
|
---|
1005 | ~Lookahead() { delete eng; }
|
---|
1006 | };
|
---|
1007 | #endif
|
---|
1008 |
|
---|
1009 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1010 | /*
|
---|
1011 | The struct Atom represents one node in the hierarchy of regular
|
---|
1012 | expression atoms.
|
---|
1013 | */
|
---|
1014 | struct Atom
|
---|
1015 | {
|
---|
1016 | int parent; // index of parent in array of atoms
|
---|
1017 | int capture; // index of capture, from 1 to ncap
|
---|
1018 | };
|
---|
1019 | #endif
|
---|
1020 |
|
---|
1021 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1022 | /*
|
---|
1023 | The struct AnchorAlternation represents a pair of anchors with
|
---|
1024 | OR semantics.
|
---|
1025 | */
|
---|
1026 | struct AnchorAlternation
|
---|
1027 | {
|
---|
1028 | int a; // this anchor...
|
---|
1029 | int b; // ...or this one
|
---|
1030 | };
|
---|
1031 | #endif
|
---|
1032 |
|
---|
1033 | enum { InitialState = 0, FinalState = 1 };
|
---|
1034 | void setup( bool caseSensitive );
|
---|
1035 | int setupState( int match );
|
---|
1036 |
|
---|
1037 | /*
|
---|
1038 | Let's hope that 13 lookaheads and 14 back-references are
|
---|
1039 | enough.
|
---|
1040 | */
|
---|
1041 | enum { MaxLookaheads = 13, MaxBackRefs = 14 };
|
---|
1042 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
|
---|
1043 | Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
|
---|
1044 | Anchor_FirstLookahead = 0x00000010,
|
---|
1045 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
|
---|
1046 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
|
---|
1047 | Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
|
---|
1048 |
|
---|
1049 | Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
|
---|
1050 | ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
|
---|
1051 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1052 | int startAtom( bool capture );
|
---|
1053 | void finishAtom( int atom ) { cf = f[atom].parent; }
|
---|
1054 | #endif
|
---|
1055 |
|
---|
1056 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1057 | int addLookahead( QRegExpEngine *eng, bool negative );
|
---|
1058 | #endif
|
---|
1059 |
|
---|
1060 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1061 | bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
|
---|
1062 | const int *end2 );
|
---|
1063 | #endif
|
---|
1064 | bool testAnchor( int i, int a, const int *capBegin );
|
---|
1065 |
|
---|
1066 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1067 | bool goodStringMatch();
|
---|
1068 | bool badCharMatch();
|
---|
1069 | #else
|
---|
1070 | bool bruteMatch();
|
---|
1071 | #endif
|
---|
1072 | bool matchHere();
|
---|
1073 |
|
---|
1074 | QPtrVector<State> s; // array of states
|
---|
1075 | int ns; // number of states
|
---|
1076 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1077 | QMemArray<Atom> f; // atom hierarchy
|
---|
1078 | int nf; // number of atoms
|
---|
1079 | int cf; // current atom
|
---|
1080 | #endif
|
---|
1081 | int officialncap; // number of captures, seen from the outside
|
---|
1082 | int ncap; // number of captures, seen from the inside
|
---|
1083 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1084 | QPtrVector<CharClass> cl; // array of character classes
|
---|
1085 | #endif
|
---|
1086 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1087 | QPtrVector<Lookahead> ahead; // array of lookaheads
|
---|
1088 | #endif
|
---|
1089 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1090 | QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
|
---|
1091 | #endif
|
---|
1092 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1093 | bool caretAnchored; // does the regexp start with ^?
|
---|
1094 | bool trivial; // is the good-string all that needs to match?
|
---|
1095 | #endif
|
---|
1096 | bool valid; // is the regular expression valid?
|
---|
1097 | bool cs; // case sensitive?
|
---|
1098 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1099 | int nbrefs; // number of back-references
|
---|
1100 | #endif
|
---|
1101 |
|
---|
1102 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1103 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
|
---|
1104 |
|
---|
1105 | int goodEarlyStart; // the index where goodStr can first occur in a match
|
---|
1106 | int goodLateStart; // the index where goodStr can last occur in a match
|
---|
1107 | QString goodStr; // the string that any match has to contain
|
---|
1108 |
|
---|
1109 | int minl; // the minimum length of a match
|
---|
1110 | QMemArray<int> occ1; // first-occurrence array
|
---|
1111 | #endif
|
---|
1112 |
|
---|
1113 | /*
|
---|
1114 | The class Box is an abstraction for a regular expression
|
---|
1115 | fragment. It can also be seen as one node in the syntax tree of
|
---|
1116 | a regular expression with synthetized attributes.
|
---|
1117 |
|
---|
1118 | Its interface is ugly for performance reasons.
|
---|
1119 | */
|
---|
1120 | class Box
|
---|
1121 | {
|
---|
1122 | public:
|
---|
1123 | Box( QRegExpEngine *engine );
|
---|
1124 | Box( const Box& b ) { operator=( b ); }
|
---|
1125 |
|
---|
1126 | Box& operator=( const Box& b );
|
---|
1127 |
|
---|
1128 | void clear() { operator=( Box(eng) ); }
|
---|
1129 | void set( QChar ch );
|
---|
1130 | void set( const CharClass& cc );
|
---|
1131 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1132 | void set( int bref );
|
---|
1133 | #endif
|
---|
1134 |
|
---|
1135 | void cat( const Box& b );
|
---|
1136 | void orx( const Box& b );
|
---|
1137 | void plus( int atom );
|
---|
1138 | void opt();
|
---|
1139 | void catAnchor( int a );
|
---|
1140 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1141 | void setupHeuristics();
|
---|
1142 | #endif
|
---|
1143 |
|
---|
1144 | #if defined(QT_DEBUG)
|
---|
1145 | void dump() const;
|
---|
1146 | #endif
|
---|
1147 |
|
---|
1148 | private:
|
---|
1149 | void addAnchorsToEngine( const Box& to ) const;
|
---|
1150 |
|
---|
1151 | QRegExpEngine *eng; // the automaton under construction
|
---|
1152 | QMemArray<int> ls; // the left states (firstpos)
|
---|
1153 | QMemArray<int> rs; // the right states (lastpos)
|
---|
1154 | QMap<int, int> lanchors; // the left anchors
|
---|
1155 | QMap<int, int> ranchors; // the right anchors
|
---|
1156 | int skipanchors; // the anchors to match if the box is skipped
|
---|
1157 |
|
---|
1158 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1159 | int earlyStart; // the index where str can first occur
|
---|
1160 | int lateStart; // the index where str can last occur
|
---|
1161 | QString str; // a string that has to occur in any match
|
---|
1162 | QString leftStr; // a string occurring at the left of this box
|
---|
1163 | QString rightStr; // a string occurring at the right of this box
|
---|
1164 | int maxl; // the maximum length of this box (possibly InftyLen)
|
---|
1165 | #endif
|
---|
1166 |
|
---|
1167 | int minl; // the minimum length of this box
|
---|
1168 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1169 | QMemArray<int> occ1; // first-occurrence array
|
---|
1170 | #endif
|
---|
1171 | };
|
---|
1172 | friend class Box;
|
---|
1173 |
|
---|
1174 | /*
|
---|
1175 | This is the lexical analyzer for regular expressions.
|
---|
1176 | */
|
---|
1177 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
|
---|
1178 | Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
|
---|
1179 | Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
|
---|
1180 | Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
|
---|
1181 | int getChar();
|
---|
1182 | int getEscape();
|
---|
1183 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
1184 | int getRep( int def );
|
---|
1185 | #endif
|
---|
1186 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1187 | void skipChars( int n );
|
---|
1188 | #endif
|
---|
1189 | void error( const char *msg );
|
---|
1190 | void startTokenizer( const QChar *rx, int len );
|
---|
1191 | int getToken();
|
---|
1192 |
|
---|
1193 | const QChar *yyIn; // a pointer to the input regular expression pattern
|
---|
1194 | int yyPos0; // the position of yyTok in the input pattern
|
---|
1195 | int yyPos; // the position of the next character to read
|
---|
1196 | int yyLen; // the length of yyIn
|
---|
1197 | int yyCh; // the last character read
|
---|
1198 | CharClass *yyCharClass; // attribute for Tok_CharClass tokens
|
---|
1199 | int yyMinRep; // attribute for Tok_Quantifier
|
---|
1200 | int yyMaxRep; // ditto
|
---|
1201 | QString yyError; // syntax error or overflow during parsing?
|
---|
1202 |
|
---|
1203 | /*
|
---|
1204 | This is the syntactic analyzer for regular expressions.
|
---|
1205 | */
|
---|
1206 | int parse( const QChar *rx, int len );
|
---|
1207 | void parseAtom( Box *box );
|
---|
1208 | void parseFactor( Box *box );
|
---|
1209 | void parseTerm( Box *box );
|
---|
1210 | void parseExpression( Box *box );
|
---|
1211 |
|
---|
1212 | int yyTok; // the last token read
|
---|
1213 | bool yyMayCapture; // set this to FALSE to disable capturing
|
---|
1214 |
|
---|
1215 | /*
|
---|
1216 | This is the engine state during matching.
|
---|
1217 | */
|
---|
1218 | const QString *mmStr; // a pointer to the input QString
|
---|
1219 | const QChar *mmIn; // a pointer to the input string data
|
---|
1220 | int mmPos; // the current position in the string
|
---|
1221 | int mmCaretPos;
|
---|
1222 | int mmLen; // the length of the input string
|
---|
1223 | bool mmMinimal; // minimal matching?
|
---|
1224 | QMemArray<int> mmBigArray; // big QMemArray<int> array
|
---|
1225 | int *mmInNextStack; // is state is mmNextStack?
|
---|
1226 | int *mmCurStack; // stack of current states
|
---|
1227 | int *mmNextStack; // stack of next states
|
---|
1228 | int *mmCurCapBegin; // start of current states' captures
|
---|
1229 | int *mmNextCapBegin; // start of next states' captures
|
---|
1230 | int *mmCurCapEnd; // end of current states' captures
|
---|
1231 | int *mmNextCapEnd; // end of next states' captures
|
---|
1232 | int *mmTempCapBegin; // start of temporary captures
|
---|
1233 | int *mmTempCapEnd; // end of temporary captures
|
---|
1234 | int *mmCapBegin; // start of captures for a next state
|
---|
1235 | int *mmCapEnd; // end of captures for a next state
|
---|
1236 | int *mmSlideTab; // bump-along slide table for bad-character heuristic
|
---|
1237 | int mmSlideTabSize; // size of slide table
|
---|
1238 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1239 | QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
|
---|
1240 | #endif
|
---|
1241 | int mmMatchLen; // length of match
|
---|
1242 | int mmOneTestMatchedLen; // length of partial match
|
---|
1243 | };
|
---|
1244 |
|
---|
1245 | QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
|
---|
1246 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1247 | : mmSleeping( 101 )
|
---|
1248 | #endif
|
---|
1249 | {
|
---|
1250 | setup( caseSensitive );
|
---|
1251 | valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
|
---|
1252 | if ( !valid ) {
|
---|
1253 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1254 | trivial = FALSE;
|
---|
1255 | #endif
|
---|
1256 | error( RXERR_LEFTDELIM );
|
---|
1257 | }
|
---|
1258 | }
|
---|
1259 |
|
---|
1260 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1261 | QRegExpEngine::~QRegExpEngine()
|
---|
1262 | {
|
---|
1263 | }
|
---|
1264 | #endif
|
---|
1265 |
|
---|
1266 | /*
|
---|
1267 | Tries to match in str and returns an array of (begin, length) pairs
|
---|
1268 | for captured text. If there is no match, all pairs are (-1, -1).
|
---|
1269 | */
|
---|
1270 | void QRegExpEngine::match( const QString& str, int pos, bool minimal,
|
---|
1271 | bool oneTest, int caretIndex,
|
---|
1272 | QMemArray<int>& captured )
|
---|
1273 | {
|
---|
1274 | bool matched = FALSE;
|
---|
1275 |
|
---|
1276 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1277 | if ( trivial && !oneTest ) {
|
---|
1278 | mmPos = str.find( goodStr, pos, cs );
|
---|
1279 | mmMatchLen = goodStr.length();
|
---|
1280 | matched = ( mmPos != -1 );
|
---|
1281 | } else
|
---|
1282 | #endif
|
---|
1283 | {
|
---|
1284 | mmStr = &str;
|
---|
1285 | mmIn = str.unicode();
|
---|
1286 | if ( mmIn == 0 )
|
---|
1287 | mmIn = &QChar::null;
|
---|
1288 | mmPos = pos;
|
---|
1289 | mmCaretPos = caretIndex;
|
---|
1290 | mmLen = str.length();
|
---|
1291 | mmMinimal = minimal;
|
---|
1292 | mmMatchLen = 0;
|
---|
1293 | mmOneTestMatchedLen = 0;
|
---|
1294 |
|
---|
1295 | if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
|
---|
1296 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1297 | if ( oneTest ) {
|
---|
1298 | matched = matchHere();
|
---|
1299 | } else {
|
---|
1300 | if ( mmPos <= mmLen - minl ) {
|
---|
1301 | if ( caretAnchored ) {
|
---|
1302 | matched = matchHere();
|
---|
1303 | } else if ( useGoodStringHeuristic ) {
|
---|
1304 | matched = goodStringMatch();
|
---|
1305 | } else {
|
---|
1306 | matched = badCharMatch();
|
---|
1307 | }
|
---|
1308 | }
|
---|
1309 | }
|
---|
1310 | #else
|
---|
1311 | matched = oneTest ? matchHere() : bruteMatch();
|
---|
1312 | #endif
|
---|
1313 | }
|
---|
1314 | }
|
---|
1315 |
|
---|
1316 | int capturedSize = 2 + 2 * officialncap;
|
---|
1317 | captured.detach();
|
---|
1318 | captured.resize( capturedSize );
|
---|
1319 | if ( matched ) {
|
---|
1320 | captured[0] = mmPos;
|
---|
1321 | captured[1] = mmMatchLen;
|
---|
1322 | for ( int j = 0; j < officialncap; j++ ) {
|
---|
1323 | int len = mmCapEnd[j] - mmCapBegin[j];
|
---|
1324 | captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
|
---|
1325 | captured[2 + 2 * j + 1] = len;
|
---|
1326 | }
|
---|
1327 | } else {
|
---|
1328 | // we rely on 2's complement here
|
---|
1329 | memset( captured.data(), -1, capturedSize * sizeof(int) );
|
---|
1330 | }
|
---|
1331 | }
|
---|
1332 |
|
---|
1333 | /*
|
---|
1334 | The three following functions add one state to the automaton and
|
---|
1335 | return the number of the state.
|
---|
1336 | */
|
---|
1337 |
|
---|
1338 | int QRegExpEngine::createState( QChar ch )
|
---|
1339 | {
|
---|
1340 | return setupState( ch.unicode() );
|
---|
1341 | }
|
---|
1342 |
|
---|
1343 | int QRegExpEngine::createState( const CharClass& cc )
|
---|
1344 | {
|
---|
1345 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1346 | int n = cl.size();
|
---|
1347 | cl.resize( n + 1 );
|
---|
1348 | cl.insert( n, new CharClass(cc) );
|
---|
1349 | return setupState( CharClassBit | n );
|
---|
1350 | #else
|
---|
1351 | Q_UNUSED( cc );
|
---|
1352 | return setupState( CharClassBit );
|
---|
1353 | #endif
|
---|
1354 | }
|
---|
1355 |
|
---|
1356 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1357 | int QRegExpEngine::createState( int bref )
|
---|
1358 | {
|
---|
1359 | if ( bref > nbrefs ) {
|
---|
1360 | nbrefs = bref;
|
---|
1361 | if ( nbrefs > MaxBackRefs ) {
|
---|
1362 | error( RXERR_LIMIT );
|
---|
1363 | return 0;
|
---|
1364 | }
|
---|
1365 | }
|
---|
1366 | return setupState( BackRefBit | bref );
|
---|
1367 | }
|
---|
1368 | #endif
|
---|
1369 |
|
---|
1370 | /*
|
---|
1371 | The two following functions add a transition between all pairs of
|
---|
1372 | states (i, j) where i is found in from, and j is found in to.
|
---|
1373 |
|
---|
1374 | Cat-transitions are distinguished from plus-transitions for
|
---|
1375 | capturing.
|
---|
1376 | */
|
---|
1377 |
|
---|
1378 | void QRegExpEngine::addCatTransitions( const QMemArray<int>& from,
|
---|
1379 | const QMemArray<int>& to )
|
---|
1380 | {
|
---|
1381 | for ( int i = 0; i < (int) from.size(); i++ ) {
|
---|
1382 | State *st = s[from[i]];
|
---|
1383 | mergeInto( &st->outs, to );
|
---|
1384 | }
|
---|
1385 | }
|
---|
1386 |
|
---|
1387 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1388 | void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from,
|
---|
1389 | const QMemArray<int>& to, int atom )
|
---|
1390 | {
|
---|
1391 | for ( int i = 0; i < (int) from.size(); i++ ) {
|
---|
1392 | State *st = s[from[i]];
|
---|
1393 | QMemArray<int> oldOuts = st->outs.copy();
|
---|
1394 | mergeInto( &st->outs, to );
|
---|
1395 | if ( f[atom].capture >= 0 ) {
|
---|
1396 | if ( st->reenter == 0 )
|
---|
1397 | st->reenter = new QMap<int, int>;
|
---|
1398 | for ( int j = 0; j < (int) to.size(); j++ ) {
|
---|
1399 | if ( !st->reenter->contains(to[j]) &&
|
---|
1400 | oldOuts.bsearch(to[j]) < 0 )
|
---|
1401 | st->reenter->insert( to[j], atom );
|
---|
1402 | }
|
---|
1403 | }
|
---|
1404 | }
|
---|
1405 | }
|
---|
1406 | #endif
|
---|
1407 |
|
---|
1408 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1409 | /*
|
---|
1410 | Returns an anchor that means a OR b.
|
---|
1411 | */
|
---|
1412 | int QRegExpEngine::anchorAlternation( int a, int b )
|
---|
1413 | {
|
---|
1414 | if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
|
---|
1415 | return a & b;
|
---|
1416 |
|
---|
1417 | int n = aa.size();
|
---|
1418 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1419 | if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
|
---|
1420 | return Anchor_Alternation | ( n - 1 );
|
---|
1421 | #endif
|
---|
1422 |
|
---|
1423 | aa.resize( n + 1 );
|
---|
1424 | aa[n].a = a;
|
---|
1425 | aa[n].b = b;
|
---|
1426 | return Anchor_Alternation | n;
|
---|
1427 | }
|
---|
1428 |
|
---|
1429 | /*
|
---|
1430 | Returns an anchor that means a AND b.
|
---|
1431 | */
|
---|
1432 | int QRegExpEngine::anchorConcatenation( int a, int b )
|
---|
1433 | {
|
---|
1434 | if ( ((a | b) & Anchor_Alternation) == 0 )
|
---|
1435 | return a | b;
|
---|
1436 | if ( (b & Anchor_Alternation) != 0 )
|
---|
1437 | qSwap( a, b );
|
---|
1438 |
|
---|
1439 | int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
|
---|
1440 | int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
|
---|
1441 | return anchorAlternation( aprime, bprime );
|
---|
1442 | }
|
---|
1443 | #endif
|
---|
1444 |
|
---|
1445 | /*
|
---|
1446 | Adds anchor a on a transition caracterised by its from state and
|
---|
1447 | its to state.
|
---|
1448 | */
|
---|
1449 | void QRegExpEngine::addAnchors( int from, int to, int a )
|
---|
1450 | {
|
---|
1451 | State *st = s[from];
|
---|
1452 | if ( st->anchors == 0 )
|
---|
1453 | st->anchors = new QMap<int, int>;
|
---|
1454 | if ( st->anchors->contains(to) )
|
---|
1455 | a = anchorAlternation( (*st->anchors)[to], a );
|
---|
1456 | st->anchors->insert( to, a );
|
---|
1457 | }
|
---|
1458 |
|
---|
1459 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1460 | /*
|
---|
1461 | This function chooses between the good-string and the bad-character
|
---|
1462 | heuristics. It computes two scores and chooses the heuristic with
|
---|
1463 | the highest score.
|
---|
1464 |
|
---|
1465 | Here are some common-sense constraints on the scores that should be
|
---|
1466 | respected if the formulas are ever modified: (1) If goodStr is
|
---|
1467 | empty, the good-string heuristic scores 0. (2) If the regular
|
---|
1468 | expression is trivial, the good-string heuristic should be used.
|
---|
1469 | (3) If the search is case insensitive, the good-string heuristic
|
---|
1470 | should be used, unless it scores 0. (Case insensitivity turns all
|
---|
1471 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
|
---|
1472 | big, the good-string heuristic should score less.
|
---|
1473 | */
|
---|
1474 | void QRegExpEngine::heuristicallyChooseHeuristic()
|
---|
1475 | {
|
---|
1476 | if ( minl == 0 ) {
|
---|
1477 | useGoodStringHeuristic = FALSE;
|
---|
1478 | } else if ( trivial ) {
|
---|
1479 | useGoodStringHeuristic = TRUE;
|
---|
1480 | } else {
|
---|
1481 | /*
|
---|
1482 | Magic formula: The good string has to constitute a good
|
---|
1483 | proportion of the minimum-length string, and appear at a
|
---|
1484 | more-or-less known index.
|
---|
1485 | */
|
---|
1486 | int goodStringScore = ( 64 * goodStr.length() / minl ) -
|
---|
1487 | ( goodLateStart - goodEarlyStart );
|
---|
1488 | /*
|
---|
1489 | Less magic formula: We pick some characters at random, and
|
---|
1490 | check whether they are good or bad.
|
---|
1491 | */
|
---|
1492 | int badCharScore = 0;
|
---|
1493 | int step = QMAX( 1, NumBadChars / 32 );
|
---|
1494 | for ( int i = 1; i < NumBadChars; i += step ) {
|
---|
1495 | if ( occ1[i] == NoOccurrence )
|
---|
1496 | badCharScore += minl;
|
---|
1497 | else
|
---|
1498 | badCharScore += occ1[i];
|
---|
1499 | }
|
---|
1500 | badCharScore /= minl;
|
---|
1501 | useGoodStringHeuristic = ( goodStringScore > badCharScore );
|
---|
1502 | }
|
---|
1503 | }
|
---|
1504 | #endif
|
---|
1505 |
|
---|
1506 | #if defined(QT_DEBUG)
|
---|
1507 | void QRegExpEngine::dump() const
|
---|
1508 | {
|
---|
1509 | int i, j;
|
---|
1510 | qDebug( "Case %ssensitive engine", cs ? "" : "in" );
|
---|
1511 | qDebug( " States" );
|
---|
1512 | for ( i = 0; i < ns; i++ ) {
|
---|
1513 | qDebug( " %d%s", i,
|
---|
1514 | i == InitialState ? " (initial)" :
|
---|
1515 | i == FinalState ? " (final)" : "" );
|
---|
1516 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1517 | qDebug( " in atom %d", s[i]->atom );
|
---|
1518 | #endif
|
---|
1519 | int m = s[i]->match;
|
---|
1520 | if ( (m & CharClassBit) != 0 ) {
|
---|
1521 | qDebug( " match character class %d", m ^ CharClassBit );
|
---|
1522 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1523 | cl[m ^ CharClassBit]->dump();
|
---|
1524 | #else
|
---|
1525 | qDebug( " negative character class" );
|
---|
1526 | #endif
|
---|
1527 | } else if ( (m & BackRefBit) != 0 ) {
|
---|
1528 | qDebug( " match back-reference %d", m ^ BackRefBit );
|
---|
1529 | } else if ( m >= 0x20 && m <= 0x7e ) {
|
---|
1530 | qDebug( " match 0x%.4x (%c)", m, m );
|
---|
1531 | } else {
|
---|
1532 | qDebug( " match 0x%.4x", m );
|
---|
1533 | }
|
---|
1534 | for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
|
---|
1535 | int next = s[i]->outs[j];
|
---|
1536 | qDebug( " -> %d", next );
|
---|
1537 | if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
|
---|
1538 | qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
|
---|
1539 | if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
|
---|
1540 | qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
|
---|
1541 | }
|
---|
1542 | }
|
---|
1543 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1544 | if ( nf > 0 ) {
|
---|
1545 | qDebug( " Atom Parent Capture" );
|
---|
1546 | for ( i = 0; i < nf; i++ )
|
---|
1547 | qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
|
---|
1548 | }
|
---|
1549 | #endif
|
---|
1550 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1551 | for ( i = 0; i < (int) aa.size(); i++ )
|
---|
1552 | qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
|
---|
1553 | aa[i].b );
|
---|
1554 | #endif
|
---|
1555 | }
|
---|
1556 | #endif
|
---|
1557 |
|
---|
1558 | void QRegExpEngine::setup( bool caseSensitive )
|
---|
1559 | {
|
---|
1560 | s.setAutoDelete( TRUE );
|
---|
1561 | s.resize( 32 );
|
---|
1562 | ns = 0;
|
---|
1563 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1564 | f.resize( 32 );
|
---|
1565 | nf = 0;
|
---|
1566 | cf = -1;
|
---|
1567 | #endif
|
---|
1568 | officialncap = 0;
|
---|
1569 | ncap = 0;
|
---|
1570 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1571 | cl.setAutoDelete( TRUE );
|
---|
1572 | #endif
|
---|
1573 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1574 | ahead.setAutoDelete( TRUE );
|
---|
1575 | #endif
|
---|
1576 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1577 | caretAnchored = TRUE;
|
---|
1578 | trivial = TRUE;
|
---|
1579 | #endif
|
---|
1580 | valid = FALSE;
|
---|
1581 | cs = caseSensitive;
|
---|
1582 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1583 | nbrefs = 0;
|
---|
1584 | #endif
|
---|
1585 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1586 | useGoodStringHeuristic = TRUE;
|
---|
1587 | minl = 0;
|
---|
1588 | occ1.fill( 0, NumBadChars );
|
---|
1589 | #endif
|
---|
1590 | }
|
---|
1591 |
|
---|
1592 | int QRegExpEngine::setupState( int match )
|
---|
1593 | {
|
---|
1594 | if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
|
---|
1595 | s.resize( (ns + 1) << 1 );
|
---|
1596 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1597 | s.insert( ns, new State(cf, match) );
|
---|
1598 | #else
|
---|
1599 | s.insert( ns, new State(match) );
|
---|
1600 | #endif
|
---|
1601 | return ns++;
|
---|
1602 | }
|
---|
1603 |
|
---|
1604 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1605 | /*
|
---|
1606 | Functions startAtom() and finishAtom() should be called to delimit
|
---|
1607 | atoms. When a state is created, it is assigned to the current atom.
|
---|
1608 | The information is later used for capturing.
|
---|
1609 | */
|
---|
1610 | int QRegExpEngine::startAtom( bool capture )
|
---|
1611 | {
|
---|
1612 | if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
|
---|
1613 | f.resize( (nf + 1) << 1 );
|
---|
1614 | f[nf].parent = cf;
|
---|
1615 | cf = nf++;
|
---|
1616 | f[cf].capture = capture ? ncap++ : -1;
|
---|
1617 | return cf;
|
---|
1618 | }
|
---|
1619 | #endif
|
---|
1620 |
|
---|
1621 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1622 | /*
|
---|
1623 | Creates a lookahead anchor.
|
---|
1624 | */
|
---|
1625 | int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
|
---|
1626 | {
|
---|
1627 | int n = ahead.size();
|
---|
1628 | if ( n == MaxLookaheads ) {
|
---|
1629 | error( RXERR_LIMIT );
|
---|
1630 | return 0;
|
---|
1631 | }
|
---|
1632 | ahead.resize( n + 1 );
|
---|
1633 | ahead.insert( n, new Lookahead(eng, negative) );
|
---|
1634 | return Anchor_FirstLookahead << n;
|
---|
1635 | }
|
---|
1636 | #endif
|
---|
1637 |
|
---|
1638 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1639 | /*
|
---|
1640 | We want the longest leftmost captures.
|
---|
1641 | */
|
---|
1642 | bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
|
---|
1643 | const int *begin2, const int *end2 )
|
---|
1644 | {
|
---|
1645 | for ( int i = 0; i < ncap; i++ ) {
|
---|
1646 | int delta = begin2[i] - begin1[i]; // it has to start early...
|
---|
1647 | if ( delta == 0 )
|
---|
1648 | delta = end1[i] - end2[i]; // ...and end late (like a party)
|
---|
1649 |
|
---|
1650 | if ( delta != 0 )
|
---|
1651 | return delta > 0;
|
---|
1652 | }
|
---|
1653 | return FALSE;
|
---|
1654 | }
|
---|
1655 | #endif
|
---|
1656 |
|
---|
1657 | /*
|
---|
1658 | Returns TRUE if anchor a matches at position mmPos + i in the input
|
---|
1659 | string, otherwise FALSE.
|
---|
1660 | */
|
---|
1661 | bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
|
---|
1662 | {
|
---|
1663 | int j;
|
---|
1664 |
|
---|
1665 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1666 | if ( (a & Anchor_Alternation) != 0 ) {
|
---|
1667 | return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
|
---|
1668 | testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
|
---|
1669 | }
|
---|
1670 | #endif
|
---|
1671 |
|
---|
1672 | if ( (a & Anchor_Caret) != 0 ) {
|
---|
1673 | if ( mmPos + i != mmCaretPos )
|
---|
1674 | return FALSE;
|
---|
1675 | }
|
---|
1676 | if ( (a & Anchor_Dollar) != 0 ) {
|
---|
1677 | if ( mmPos + i != mmLen )
|
---|
1678 | return FALSE;
|
---|
1679 | }
|
---|
1680 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
1681 | if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
|
---|
1682 | bool before = FALSE;
|
---|
1683 | bool after = FALSE;
|
---|
1684 | if ( mmPos + i != 0 )
|
---|
1685 | before = isWord( mmIn[mmPos + i - 1] );
|
---|
1686 | if ( mmPos + i != mmLen )
|
---|
1687 | after = isWord( mmIn[mmPos + i] );
|
---|
1688 | if ( (a & Anchor_Word) != 0 && (before == after) )
|
---|
1689 | return FALSE;
|
---|
1690 | if ( (a & Anchor_NonWord) != 0 && (before != after) )
|
---|
1691 | return FALSE;
|
---|
1692 | }
|
---|
1693 | #endif
|
---|
1694 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1695 | if ( (a & Anchor_LookaheadMask) != 0 ) {
|
---|
1696 | QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
|
---|
1697 | mmLen - mmPos - i );
|
---|
1698 | for ( j = 0; j < (int) ahead.size(); j++ ) {
|
---|
1699 | if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
|
---|
1700 | QMemArray<int> captured;
|
---|
1701 | ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
|
---|
1702 | mmCaretPos - mmPos - i, captured );
|
---|
1703 | if ( (captured[0] == 0) == ahead[j]->neg )
|
---|
1704 | return FALSE;
|
---|
1705 | }
|
---|
1706 | }
|
---|
1707 | }
|
---|
1708 | #endif
|
---|
1709 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1710 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1711 | for ( j = 0; j < nbrefs; j++ ) {
|
---|
1712 | if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
|
---|
1713 | if ( capBegin[j] != EmptyCapture )
|
---|
1714 | return FALSE;
|
---|
1715 | }
|
---|
1716 | }
|
---|
1717 | #endif
|
---|
1718 | #endif
|
---|
1719 | return TRUE;
|
---|
1720 | }
|
---|
1721 |
|
---|
1722 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1723 | /*
|
---|
1724 | The three following functions are what Jeffrey Friedl would call
|
---|
1725 | transmissions (or bump-alongs). Using one or the other should make
|
---|
1726 | no difference except in performance.
|
---|
1727 | */
|
---|
1728 |
|
---|
1729 | bool QRegExpEngine::goodStringMatch()
|
---|
1730 | {
|
---|
1731 | int k = mmPos + goodEarlyStart;
|
---|
1732 | while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
|
---|
1733 | int from = k - goodLateStart;
|
---|
1734 | int to = k - goodEarlyStart;
|
---|
1735 | if ( from > mmPos )
|
---|
1736 | mmPos = from;
|
---|
1737 |
|
---|
1738 | while ( mmPos <= to ) {
|
---|
1739 | if ( matchHere() )
|
---|
1740 | return TRUE;
|
---|
1741 | mmPos++;
|
---|
1742 | }
|
---|
1743 | k++;
|
---|
1744 | }
|
---|
1745 | return FALSE;
|
---|
1746 | }
|
---|
1747 |
|
---|
1748 | bool QRegExpEngine::badCharMatch()
|
---|
1749 | {
|
---|
1750 | int slideHead = 0;
|
---|
1751 | int slideNext = 0;
|
---|
1752 | int i;
|
---|
1753 | int lastPos = mmLen - minl;
|
---|
1754 | memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
|
---|
1755 |
|
---|
1756 | /*
|
---|
1757 | Set up the slide table, used for the bad-character heuristic,
|
---|
1758 | using the table of first occurrence of each character.
|
---|
1759 | */
|
---|
1760 | for ( i = 0; i < minl; i++ ) {
|
---|
1761 | int sk = occ1[BadChar(mmIn[mmPos + i])];
|
---|
1762 | if ( sk == NoOccurrence )
|
---|
1763 | sk = i + 1;
|
---|
1764 | if ( sk > 0 ) {
|
---|
1765 | int k = i + 1 - sk;
|
---|
1766 | if ( k < 0 ) {
|
---|
1767 | sk = i + 1;
|
---|
1768 | k = 0;
|
---|
1769 | }
|
---|
1770 | if ( sk > mmSlideTab[k] )
|
---|
1771 | mmSlideTab[k] = sk;
|
---|
1772 | }
|
---|
1773 | }
|
---|
1774 |
|
---|
1775 | if ( mmPos > lastPos )
|
---|
1776 | return FALSE;
|
---|
1777 |
|
---|
1778 | for ( ;; ) {
|
---|
1779 | if ( ++slideNext >= mmSlideTabSize )
|
---|
1780 | slideNext = 0;
|
---|
1781 | if ( mmSlideTab[slideHead] > 0 ) {
|
---|
1782 | if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
|
---|
1783 | mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
|
---|
1784 | mmSlideTab[slideHead] = 0;
|
---|
1785 | } else {
|
---|
1786 | if ( matchHere() )
|
---|
1787 | return TRUE;
|
---|
1788 | }
|
---|
1789 |
|
---|
1790 | if ( mmPos == lastPos )
|
---|
1791 | break;
|
---|
1792 |
|
---|
1793 | /*
|
---|
1794 | Update the slide table. This code has much in common with
|
---|
1795 | the initialization code.
|
---|
1796 | */
|
---|
1797 | int sk = occ1[BadChar(mmIn[mmPos + minl])];
|
---|
1798 | if ( sk == NoOccurrence ) {
|
---|
1799 | mmSlideTab[slideNext] = minl;
|
---|
1800 | } else if ( sk > 0 ) {
|
---|
1801 | int k = slideNext + minl - sk;
|
---|
1802 | if ( k >= mmSlideTabSize )
|
---|
1803 | k -= mmSlideTabSize;
|
---|
1804 | if ( sk > mmSlideTab[k] )
|
---|
1805 | mmSlideTab[k] = sk;
|
---|
1806 | }
|
---|
1807 | slideHead = slideNext;
|
---|
1808 | mmPos++;
|
---|
1809 | }
|
---|
1810 | return FALSE;
|
---|
1811 | }
|
---|
1812 | #else
|
---|
1813 | bool QRegExpEngine::bruteMatch()
|
---|
1814 | {
|
---|
1815 | while ( mmPos <= mmLen ) {
|
---|
1816 | if ( matchHere() )
|
---|
1817 | return TRUE;
|
---|
1818 | mmPos++;
|
---|
1819 | }
|
---|
1820 | return FALSE;
|
---|
1821 | }
|
---|
1822 | #endif
|
---|
1823 |
|
---|
1824 | /*
|
---|
1825 | Here's the core of the engine. It tries to do a match here and now.
|
---|
1826 | */
|
---|
1827 | bool QRegExpEngine::matchHere()
|
---|
1828 | {
|
---|
1829 | int ncur = 1, nnext = 0;
|
---|
1830 | int i = 0, j, k, m;
|
---|
1831 | bool stop = FALSE;
|
---|
1832 |
|
---|
1833 | mmMatchLen = -1;
|
---|
1834 | mmOneTestMatchedLen = -1;
|
---|
1835 | mmCurStack[0] = InitialState;
|
---|
1836 |
|
---|
1837 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1838 | if ( ncap > 0 ) {
|
---|
1839 | for ( j = 0; j < ncap; j++ ) {
|
---|
1840 | mmCurCapBegin[j] = EmptyCapture;
|
---|
1841 | mmCurCapEnd[j] = EmptyCapture;
|
---|
1842 | }
|
---|
1843 | }
|
---|
1844 | #endif
|
---|
1845 |
|
---|
1846 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1847 | int *zzZ = 0;
|
---|
1848 |
|
---|
1849 | while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
|
---|
1850 | !stop )
|
---|
1851 | #else
|
---|
1852 | while ( ncur > 0 && i <= mmLen - mmPos && !stop )
|
---|
1853 | #endif
|
---|
1854 | {
|
---|
1855 | int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
|
---|
1856 | for ( j = 0; j < ncur; j++ ) {
|
---|
1857 | int cur = mmCurStack[j];
|
---|
1858 | State *scur = s[cur];
|
---|
1859 | QMemArray<int>& outs = scur->outs;
|
---|
1860 | for ( k = 0; k < (int) outs.size(); k++ ) {
|
---|
1861 | int next = outs[k];
|
---|
1862 | State *snext = s[next];
|
---|
1863 | bool in = TRUE;
|
---|
1864 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1865 | int needSomeSleep = 0;
|
---|
1866 | #endif
|
---|
1867 |
|
---|
1868 | /*
|
---|
1869 | First, check if the anchors are anchored properly.
|
---|
1870 | */
|
---|
1871 | if ( scur->anchors != 0 ) {
|
---|
1872 | int a = at( *scur->anchors, next );
|
---|
1873 | if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
|
---|
1874 | in = FALSE;
|
---|
1875 | }
|
---|
1876 | /*
|
---|
1877 | If indeed they are, check if the input character is
|
---|
1878 | correct for this transition.
|
---|
1879 | */
|
---|
1880 | if ( in ) {
|
---|
1881 | m = snext->match;
|
---|
1882 | if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
|
---|
1883 | if ( cs )
|
---|
1884 | in = ( m == ch );
|
---|
1885 | else
|
---|
1886 | in = ( QChar(m).lower() == QChar(ch).lower() );
|
---|
1887 | } else if ( next == FinalState ) {
|
---|
1888 | mmMatchLen = i;
|
---|
1889 | stop = mmMinimal;
|
---|
1890 | in = TRUE;
|
---|
1891 | } else if ( (m & CharClassBit) != 0 ) {
|
---|
1892 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1893 | const CharClass *cc = cl[m ^ CharClassBit];
|
---|
1894 | if ( cs )
|
---|
1895 | in = cc->in( ch );
|
---|
1896 | else if ( cc->negative() )
|
---|
1897 | in = cc->in( QChar(ch).lower() ) &&
|
---|
1898 | cc->in( QChar(ch).upper() );
|
---|
1899 | else
|
---|
1900 | in = cc->in( QChar(ch).lower() ) ||
|
---|
1901 | cc->in( QChar(ch).upper() );
|
---|
1902 | #endif
|
---|
1903 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1904 | } else { /* ( (m & BackRefBit) != 0 ) */
|
---|
1905 | int bref = m ^ BackRefBit;
|
---|
1906 | int ell = j * ncap + ( bref - 1 );
|
---|
1907 |
|
---|
1908 | in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
|
---|
1909 | if ( in ) {
|
---|
1910 | if ( cs )
|
---|
1911 | in = ( mmIn[mmPos + mmCurCapBegin[ell]]
|
---|
1912 | == QChar(ch) );
|
---|
1913 | else
|
---|
1914 | in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
|
---|
1915 | == QChar(ch).lower() );
|
---|
1916 | }
|
---|
1917 |
|
---|
1918 | if ( in ) {
|
---|
1919 | int delta;
|
---|
1920 | if ( mmCurCapEnd[ell] == EmptyCapture )
|
---|
1921 | delta = i - mmCurCapBegin[ell];
|
---|
1922 | else
|
---|
1923 | delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
|
---|
1924 |
|
---|
1925 | in = ( delta <= mmLen - (mmPos + i) );
|
---|
1926 | if ( in && delta > 1 ) {
|
---|
1927 | int n = 1;
|
---|
1928 | if ( cs ) {
|
---|
1929 | while ( n < delta ) {
|
---|
1930 | if ( mmIn[mmPos +
|
---|
1931 | mmCurCapBegin[ell] + n] !=
|
---|
1932 | mmIn[mmPos + i + n] )
|
---|
1933 | break;
|
---|
1934 | n++;
|
---|
1935 | }
|
---|
1936 | } else {
|
---|
1937 | while ( n < delta ) {
|
---|
1938 | QChar a = mmIn[mmPos +
|
---|
1939 | mmCurCapBegin[ell] + n];
|
---|
1940 | QChar b = mmIn[mmPos + i + n];
|
---|
1941 | if ( a.lower() != b.lower() )
|
---|
1942 | break;
|
---|
1943 | n++;
|
---|
1944 | }
|
---|
1945 | }
|
---|
1946 | in = ( n == delta );
|
---|
1947 | if ( in )
|
---|
1948 | needSomeSleep = delta - 1;
|
---|
1949 | }
|
---|
1950 | }
|
---|
1951 | #endif
|
---|
1952 | }
|
---|
1953 | }
|
---|
1954 |
|
---|
1955 | /*
|
---|
1956 | We must now update our data structures.
|
---|
1957 | */
|
---|
1958 | if ( in ) {
|
---|
1959 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1960 | int *capBegin, *capEnd;
|
---|
1961 | #endif
|
---|
1962 | /*
|
---|
1963 | If the next state was not encountered yet, all
|
---|
1964 | is fine.
|
---|
1965 | */
|
---|
1966 | if ( (m = mmInNextStack[next]) == -1 ) {
|
---|
1967 | m = nnext++;
|
---|
1968 | mmNextStack[m] = next;
|
---|
1969 | mmInNextStack[next] = m;
|
---|
1970 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1971 | capBegin = mmNextCapBegin + m * ncap;
|
---|
1972 | capEnd = mmNextCapEnd + m * ncap;
|
---|
1973 |
|
---|
1974 | /*
|
---|
1975 | Otherwise, we'll first maintain captures in
|
---|
1976 | temporary arrays, and decide at the end whether
|
---|
1977 | it's best to keep the previous capture zones or
|
---|
1978 | the new ones.
|
---|
1979 | */
|
---|
1980 | } else {
|
---|
1981 | capBegin = mmTempCapBegin;
|
---|
1982 | capEnd = mmTempCapEnd;
|
---|
1983 | #endif
|
---|
1984 | }
|
---|
1985 |
|
---|
1986 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1987 | /*
|
---|
1988 | Updating the capture zones is much of a task.
|
---|
1989 | */
|
---|
1990 | if ( ncap > 0 ) {
|
---|
1991 | memcpy( capBegin, mmCurCapBegin + j * ncap,
|
---|
1992 | ncap * sizeof(int) );
|
---|
1993 | memcpy( capEnd, mmCurCapEnd + j * ncap,
|
---|
1994 | ncap * sizeof(int) );
|
---|
1995 | int c = scur->atom, n = snext->atom;
|
---|
1996 | int p = -1, q = -1;
|
---|
1997 | int cap;
|
---|
1998 |
|
---|
1999 | /*
|
---|
2000 | Lemma 1. For any x in the range [0..nf), we
|
---|
2001 | have f[x].parent < x.
|
---|
2002 |
|
---|
2003 | Proof. By looking at startAtom(), it is
|
---|
2004 | clear that cf < nf holds all the time, and
|
---|
2005 | thus that f[nf].parent < nf.
|
---|
2006 | */
|
---|
2007 |
|
---|
2008 | /*
|
---|
2009 | If we are reentering an atom, we empty all
|
---|
2010 | capture zones inside it.
|
---|
2011 | */
|
---|
2012 | if ( scur->reenter != 0 &&
|
---|
2013 | (q = at(*scur->reenter, next)) != 0 ) {
|
---|
2014 | QBitArray b;
|
---|
2015 | b.fill( FALSE, nf );
|
---|
2016 | b.setBit( q, TRUE );
|
---|
2017 | for ( int ell = q + 1; ell < nf; ell++ ) {
|
---|
2018 | if ( b.testBit(f[ell].parent) ) {
|
---|
2019 | b.setBit( ell, TRUE );
|
---|
2020 | cap = f[ell].capture;
|
---|
2021 | if ( cap >= 0 ) {
|
---|
2022 | capBegin[cap] = EmptyCapture;
|
---|
2023 | capEnd[cap] = EmptyCapture;
|
---|
2024 | }
|
---|
2025 | }
|
---|
2026 | }
|
---|
2027 | p = f[q].parent;
|
---|
2028 |
|
---|
2029 | /*
|
---|
2030 | Otherwise, close the capture zones we are
|
---|
2031 | leaving. We are leaving f[c].capture,
|
---|
2032 | f[f[c].parent].capture,
|
---|
2033 | f[f[f[c].parent].parent].capture, ...,
|
---|
2034 | until f[x].capture, with x such that
|
---|
2035 | f[x].parent is the youngest common ancestor
|
---|
2036 | for c and n.
|
---|
2037 |
|
---|
2038 | We go up along c's and n's ancestry until
|
---|
2039 | we find x.
|
---|
2040 | */
|
---|
2041 | } else {
|
---|
2042 | p = c;
|
---|
2043 | q = n;
|
---|
2044 | while ( p != q ) {
|
---|
2045 | if ( p > q ) {
|
---|
2046 | cap = f[p].capture;
|
---|
2047 | if ( cap >= 0 ) {
|
---|
2048 | if ( capBegin[cap] == i ) {
|
---|
2049 | capBegin[cap] = EmptyCapture;
|
---|
2050 | capEnd[cap] = EmptyCapture;
|
---|
2051 | } else {
|
---|
2052 | capEnd[cap] = i;
|
---|
2053 | }
|
---|
2054 | }
|
---|
2055 | p = f[p].parent;
|
---|
2056 | } else {
|
---|
2057 | q = f[q].parent;
|
---|
2058 | }
|
---|
2059 | }
|
---|
2060 | }
|
---|
2061 |
|
---|
2062 | /*
|
---|
2063 | In any case, we now open the capture zones
|
---|
2064 | we are entering. We work upwards from n
|
---|
2065 | until we reach p (the parent of the atom we
|
---|
2066 | reenter or the youngest common ancestor).
|
---|
2067 | */
|
---|
2068 | while ( n > p ) {
|
---|
2069 | cap = f[n].capture;
|
---|
2070 | if ( cap >= 0 ) {
|
---|
2071 | capBegin[cap] = i;
|
---|
2072 | capEnd[cap] = EmptyCapture;
|
---|
2073 | }
|
---|
2074 | n = f[n].parent;
|
---|
2075 | }
|
---|
2076 | /*
|
---|
2077 | If the next state was already in
|
---|
2078 | mmNextStack, we must choose carefully which
|
---|
2079 | capture zones we want to keep.
|
---|
2080 | */
|
---|
2081 | if ( capBegin == mmTempCapBegin &&
|
---|
2082 | isBetterCapture(capBegin, capEnd,
|
---|
2083 | mmNextCapBegin + m * ncap,
|
---|
2084 | mmNextCapEnd + m * ncap) ) {
|
---|
2085 | memcpy( mmNextCapBegin + m * ncap, capBegin,
|
---|
2086 | ncap * sizeof(int) );
|
---|
2087 | memcpy( mmNextCapEnd + m * ncap, capEnd,
|
---|
2088 | ncap * sizeof(int) );
|
---|
2089 | }
|
---|
2090 | }
|
---|
2091 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2092 | /*
|
---|
2093 | We are done with updating the capture zones.
|
---|
2094 | It's now time to put the next state to sleep,
|
---|
2095 | if it needs to, and to remove it from
|
---|
2096 | mmNextStack.
|
---|
2097 | */
|
---|
2098 | if ( needSomeSleep > 0 ) {
|
---|
2099 | zzZ = new int[1 + 2 * ncap];
|
---|
2100 | zzZ[0] = next;
|
---|
2101 | if ( ncap > 0 ) {
|
---|
2102 | memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
|
---|
2103 | memcpy( zzZ + 1 + ncap, capEnd,
|
---|
2104 | ncap * sizeof(int) );
|
---|
2105 | }
|
---|
2106 | mmInNextStack[mmNextStack[--nnext]] = -1;
|
---|
2107 | mmSleeping.insert( i + needSomeSleep, zzZ );
|
---|
2108 | }
|
---|
2109 | #endif
|
---|
2110 | #endif
|
---|
2111 | }
|
---|
2112 | }
|
---|
2113 | }
|
---|
2114 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2115 | /*
|
---|
2116 | If we reached the final state, hurray! Copy the captured
|
---|
2117 | zone.
|
---|
2118 | */
|
---|
2119 | if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
|
---|
2120 | memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
|
---|
2121 | memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
|
---|
2122 | }
|
---|
2123 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2124 | /*
|
---|
2125 | It's time to wake up the sleepers.
|
---|
2126 | */
|
---|
2127 | if ( !mmSleeping.isEmpty() ) {
|
---|
2128 | while ( (zzZ = mmSleeping.take(i)) != 0 ) {
|
---|
2129 | int next = zzZ[0];
|
---|
2130 | int *capBegin = zzZ + 1;
|
---|
2131 | int *capEnd = zzZ + 1 + ncap;
|
---|
2132 | bool copyOver = TRUE;
|
---|
2133 |
|
---|
2134 | if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
|
---|
2135 | m = nnext++;
|
---|
2136 | mmNextStack[m] = next;
|
---|
2137 | mmInNextStack[next] = m;
|
---|
2138 | } else {
|
---|
2139 | copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
|
---|
2140 | mmNextCapEnd + m * ncap,
|
---|
2141 | capBegin, capEnd );
|
---|
2142 | }
|
---|
2143 | if ( copyOver ) {
|
---|
2144 | memcpy( mmNextCapBegin + m * ncap, capBegin,
|
---|
2145 | ncap * sizeof(int) );
|
---|
2146 | memcpy( mmNextCapEnd + m * ncap, capEnd,
|
---|
2147 | ncap * sizeof(int) );
|
---|
2148 | }
|
---|
2149 | delete[] zzZ;
|
---|
2150 | }
|
---|
2151 | }
|
---|
2152 | #endif
|
---|
2153 | #endif
|
---|
2154 | for ( j = 0; j < nnext; j++ )
|
---|
2155 | mmInNextStack[mmNextStack[j]] = -1;
|
---|
2156 |
|
---|
2157 | // avoid needless iteration that confuses mmOneTestMatchedLen
|
---|
2158 | if ( nnext == 1 && mmNextStack[0] == FinalState
|
---|
2159 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2160 | && mmSleeping.isEmpty()
|
---|
2161 | #endif
|
---|
2162 | )
|
---|
2163 | stop = TRUE;
|
---|
2164 |
|
---|
2165 | qSwap( mmCurStack, mmNextStack );
|
---|
2166 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2167 | qSwap( mmCurCapBegin, mmNextCapBegin );
|
---|
2168 | qSwap( mmCurCapEnd, mmNextCapEnd );
|
---|
2169 | #endif
|
---|
2170 | ncur = nnext;
|
---|
2171 | nnext = 0;
|
---|
2172 | i++;
|
---|
2173 | }
|
---|
2174 |
|
---|
2175 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2176 | /*
|
---|
2177 | If minimal matching is enabled, we might have some sleepers
|
---|
2178 | left.
|
---|
2179 | */
|
---|
2180 | while ( !mmSleeping.isEmpty() ) {
|
---|
2181 | zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
|
---|
2182 | delete[] zzZ;
|
---|
2183 | }
|
---|
2184 | #endif
|
---|
2185 |
|
---|
2186 | mmOneTestMatchedLen = i - 1;
|
---|
2187 | return ( mmMatchLen >= 0 );
|
---|
2188 | }
|
---|
2189 |
|
---|
2190 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2191 |
|
---|
2192 | QRegExpEngine::CharClass::CharClass()
|
---|
2193 | : c( 0 ), n( FALSE )
|
---|
2194 | {
|
---|
2195 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2196 | occ1.fill( NoOccurrence, NumBadChars );
|
---|
2197 | #endif
|
---|
2198 | }
|
---|
2199 |
|
---|
2200 | QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
|
---|
2201 | const CharClass& cc )
|
---|
2202 | {
|
---|
2203 | c = cc.c;
|
---|
2204 | r = cc.r.copy();
|
---|
2205 | n = cc.n;
|
---|
2206 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2207 | occ1 = cc.occ1;
|
---|
2208 | #endif
|
---|
2209 | return *this;
|
---|
2210 | }
|
---|
2211 |
|
---|
2212 | void QRegExpEngine::CharClass::clear()
|
---|
2213 | {
|
---|
2214 | c = 0;
|
---|
2215 | r.resize( 0 );
|
---|
2216 | n = FALSE;
|
---|
2217 | }
|
---|
2218 |
|
---|
2219 | void QRegExpEngine::CharClass::setNegative( bool negative )
|
---|
2220 | {
|
---|
2221 | n = negative;
|
---|
2222 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2223 | occ1.fill( 0, NumBadChars );
|
---|
2224 | #endif
|
---|
2225 | }
|
---|
2226 |
|
---|
2227 | void QRegExpEngine::CharClass::addCategories( int cats )
|
---|
2228 | {
|
---|
2229 | c |= cats;
|
---|
2230 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2231 | occ1.fill( 0, NumBadChars );
|
---|
2232 | #endif
|
---|
2233 | }
|
---|
2234 |
|
---|
2235 | void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
|
---|
2236 | {
|
---|
2237 | if ( from > to )
|
---|
2238 | qSwap( from, to );
|
---|
2239 | int m = r.size();
|
---|
2240 | r.resize( m + 1 );
|
---|
2241 | r[m].from = from;
|
---|
2242 | r[m].to = to;
|
---|
2243 |
|
---|
2244 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2245 | int i;
|
---|
2246 |
|
---|
2247 | if ( to - from < NumBadChars ) {
|
---|
2248 | occ1.detach();
|
---|
2249 | if ( from % NumBadChars <= to % NumBadChars ) {
|
---|
2250 | for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
|
---|
2251 | occ1[i] = 0;
|
---|
2252 | } else {
|
---|
2253 | for ( i = 0; i <= to % NumBadChars; i++ )
|
---|
2254 | occ1[i] = 0;
|
---|
2255 | for ( i = from % NumBadChars; i < NumBadChars; i++ )
|
---|
2256 | occ1[i] = 0;
|
---|
2257 | }
|
---|
2258 | } else {
|
---|
2259 | occ1.fill( 0, NumBadChars );
|
---|
2260 | }
|
---|
2261 | #endif
|
---|
2262 | }
|
---|
2263 |
|
---|
2264 | bool QRegExpEngine::CharClass::in( QChar ch ) const
|
---|
2265 | {
|
---|
2266 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2267 | if ( occ1[BadChar(ch)] == NoOccurrence )
|
---|
2268 | return n;
|
---|
2269 | #endif
|
---|
2270 |
|
---|
2271 | if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
|
---|
2272 | return !n;
|
---|
2273 | for ( int i = 0; i < (int) r.size(); i++ ) {
|
---|
2274 | if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
|
---|
2275 | return !n;
|
---|
2276 | }
|
---|
2277 | return n;
|
---|
2278 | }
|
---|
2279 |
|
---|
2280 | #if defined(QT_DEBUG)
|
---|
2281 | void QRegExpEngine::CharClass::dump() const
|
---|
2282 | {
|
---|
2283 | int i;
|
---|
2284 | qDebug( " %stive character class", n ? "nega" : "posi" );
|
---|
2285 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2286 | if ( c != 0 )
|
---|
2287 | qDebug( " categories 0x%.8x", c );
|
---|
2288 | #endif
|
---|
2289 | for ( i = 0; i < (int) r.size(); i++ )
|
---|
2290 | qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to );
|
---|
2291 | }
|
---|
2292 | #endif
|
---|
2293 | #endif
|
---|
2294 |
|
---|
2295 | QRegExpEngine::Box::Box( QRegExpEngine *engine )
|
---|
2296 | : eng( engine ), skipanchors( 0 )
|
---|
2297 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2298 | , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
|
---|
2299 | #endif
|
---|
2300 | {
|
---|
2301 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2302 | occ1.fill( NoOccurrence, NumBadChars );
|
---|
2303 | #endif
|
---|
2304 | minl = 0;
|
---|
2305 | }
|
---|
2306 |
|
---|
2307 | QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
|
---|
2308 | {
|
---|
2309 | eng = b.eng;
|
---|
2310 | ls = b.ls;
|
---|
2311 | rs = b.rs;
|
---|
2312 | lanchors = b.lanchors;
|
---|
2313 | ranchors = b.ranchors;
|
---|
2314 | skipanchors = b.skipanchors;
|
---|
2315 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2316 | earlyStart = b.earlyStart;
|
---|
2317 | lateStart = b.lateStart;
|
---|
2318 | str = b.str;
|
---|
2319 | leftStr = b.leftStr;
|
---|
2320 | rightStr = b.rightStr;
|
---|
2321 | maxl = b.maxl;
|
---|
2322 | occ1 = b.occ1;
|
---|
2323 | #endif
|
---|
2324 | minl = b.minl;
|
---|
2325 | return *this;
|
---|
2326 | }
|
---|
2327 |
|
---|
2328 | void QRegExpEngine::Box::set( QChar ch )
|
---|
2329 | {
|
---|
2330 | ls.resize( 1 );
|
---|
2331 | ls[0] = eng->createState( ch );
|
---|
2332 | rs = ls;
|
---|
2333 | rs.detach();
|
---|
2334 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2335 | str = ch;
|
---|
2336 | leftStr = ch;
|
---|
2337 | rightStr = ch;
|
---|
2338 | maxl = 1;
|
---|
2339 | occ1.detach();
|
---|
2340 | occ1[BadChar(ch)] = 0;
|
---|
2341 | #endif
|
---|
2342 | minl = 1;
|
---|
2343 | }
|
---|
2344 |
|
---|
2345 | void QRegExpEngine::Box::set( const CharClass& cc )
|
---|
2346 | {
|
---|
2347 | ls.resize( 1 );
|
---|
2348 | ls[0] = eng->createState( cc );
|
---|
2349 | rs = ls;
|
---|
2350 | rs.detach();
|
---|
2351 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2352 | maxl = 1;
|
---|
2353 | occ1 = cc.firstOccurrence();
|
---|
2354 | #endif
|
---|
2355 | minl = 1;
|
---|
2356 | }
|
---|
2357 |
|
---|
2358 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2359 | void QRegExpEngine::Box::set( int bref )
|
---|
2360 | {
|
---|
2361 | ls.resize( 1 );
|
---|
2362 | ls[0] = eng->createState( bref );
|
---|
2363 | rs = ls;
|
---|
2364 | rs.detach();
|
---|
2365 | if ( bref >= 1 && bref <= MaxBackRefs )
|
---|
2366 | skipanchors = Anchor_BackRef0Empty << bref;
|
---|
2367 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2368 | maxl = InftyLen;
|
---|
2369 | #endif
|
---|
2370 | minl = 0;
|
---|
2371 | }
|
---|
2372 | #endif
|
---|
2373 |
|
---|
2374 | void QRegExpEngine::Box::cat( const Box& b )
|
---|
2375 | {
|
---|
2376 | eng->addCatTransitions( rs, b.ls );
|
---|
2377 | addAnchorsToEngine( b );
|
---|
2378 | if ( minl == 0 ) {
|
---|
2379 | mergeInto( &lanchors, b.lanchors );
|
---|
2380 | if ( skipanchors != 0 ) {
|
---|
2381 | for ( int i = 0; i < (int) b.ls.size(); i++ ) {
|
---|
2382 | int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
|
---|
2383 | skipanchors );
|
---|
2384 | lanchors.insert( b.ls[i], a );
|
---|
2385 | }
|
---|
2386 | }
|
---|
2387 | mergeInto( &ls, b.ls );
|
---|
2388 | }
|
---|
2389 | if ( b.minl == 0 ) {
|
---|
2390 | mergeInto( &ranchors, b.ranchors );
|
---|
2391 | if ( b.skipanchors != 0 ) {
|
---|
2392 | for ( int i = 0; i < (int) rs.size(); i++ ) {
|
---|
2393 | int a = eng->anchorConcatenation( at(ranchors, rs[i]),
|
---|
2394 | b.skipanchors );
|
---|
2395 | ranchors.insert( rs[i], a );
|
---|
2396 | }
|
---|
2397 | }
|
---|
2398 | mergeInto( &rs, b.rs );
|
---|
2399 | } else {
|
---|
2400 | ranchors = b.ranchors;
|
---|
2401 | rs = b.rs;
|
---|
2402 | }
|
---|
2403 |
|
---|
2404 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2405 | if ( maxl != InftyLen ) {
|
---|
2406 | if ( rightStr.length() + b.leftStr.length() >
|
---|
2407 | QMAX(str.length(), b.str.length()) ) {
|
---|
2408 | earlyStart = minl - rightStr.length();
|
---|
2409 | lateStart = maxl - rightStr.length();
|
---|
2410 | str = rightStr + b.leftStr;
|
---|
2411 | } else if ( b.str.length() > str.length() ) {
|
---|
2412 | earlyStart = minl + b.earlyStart;
|
---|
2413 | lateStart = maxl + b.lateStart;
|
---|
2414 | str = b.str;
|
---|
2415 | }
|
---|
2416 | }
|
---|
2417 |
|
---|
2418 | if ( (int) leftStr.length() == maxl )
|
---|
2419 | leftStr += b.leftStr;
|
---|
2420 |
|
---|
2421 | if ( (int) b.rightStr.length() == b.maxl ) {
|
---|
2422 | rightStr += b.rightStr;
|
---|
2423 | } else {
|
---|
2424 | rightStr = b.rightStr;
|
---|
2425 | }
|
---|
2426 |
|
---|
2427 | if ( maxl == InftyLen || b.maxl == InftyLen ) {
|
---|
2428 | maxl = InftyLen;
|
---|
2429 | } else {
|
---|
2430 | maxl += b.maxl;
|
---|
2431 | }
|
---|
2432 |
|
---|
2433 | occ1.detach();
|
---|
2434 | for ( int i = 0; i < NumBadChars; i++ ) {
|
---|
2435 | if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
|
---|
2436 | occ1[i] = minl + b.occ1[i];
|
---|
2437 | }
|
---|
2438 | #endif
|
---|
2439 |
|
---|
2440 | minl += b.minl;
|
---|
2441 | if ( minl == 0 )
|
---|
2442 | skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
|
---|
2443 | else
|
---|
2444 | skipanchors = 0;
|
---|
2445 | }
|
---|
2446 |
|
---|
2447 | void QRegExpEngine::Box::orx( const Box& b )
|
---|
2448 | {
|
---|
2449 | mergeInto( &ls, b.ls );
|
---|
2450 | mergeInto( &lanchors, b.lanchors );
|
---|
2451 | mergeInto( &rs, b.rs );
|
---|
2452 | mergeInto( &ranchors, b.ranchors );
|
---|
2453 |
|
---|
2454 | if ( b.minl == 0 ) {
|
---|
2455 | if ( minl == 0 )
|
---|
2456 | skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
|
---|
2457 | else
|
---|
2458 | skipanchors = b.skipanchors;
|
---|
2459 | }
|
---|
2460 |
|
---|
2461 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2462 | occ1.detach();
|
---|
2463 | for ( int i = 0; i < NumBadChars; i++ ) {
|
---|
2464 | if ( occ1[i] > b.occ1[i] )
|
---|
2465 | occ1[i] = b.occ1[i];
|
---|
2466 | }
|
---|
2467 | earlyStart = 0;
|
---|
2468 | lateStart = 0;
|
---|
2469 | str = QString();
|
---|
2470 | leftStr = QString();
|
---|
2471 | rightStr = QString();
|
---|
2472 | if ( b.maxl > maxl )
|
---|
2473 | maxl = b.maxl;
|
---|
2474 | #endif
|
---|
2475 | if ( b.minl < minl )
|
---|
2476 | minl = b.minl;
|
---|
2477 | }
|
---|
2478 |
|
---|
2479 | void QRegExpEngine::Box::plus( int atom )
|
---|
2480 | {
|
---|
2481 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2482 | eng->addPlusTransitions( rs, ls, atom );
|
---|
2483 | #else
|
---|
2484 | Q_UNUSED( atom );
|
---|
2485 | eng->addCatTransitions( rs, ls );
|
---|
2486 | #endif
|
---|
2487 | addAnchorsToEngine( *this );
|
---|
2488 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2489 | maxl = InftyLen;
|
---|
2490 | #endif
|
---|
2491 | }
|
---|
2492 |
|
---|
2493 | void QRegExpEngine::Box::opt()
|
---|
2494 | {
|
---|
2495 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2496 | earlyStart = 0;
|
---|
2497 | lateStart = 0;
|
---|
2498 | str = QString();
|
---|
2499 | leftStr = QString();
|
---|
2500 | rightStr = QString();
|
---|
2501 | #endif
|
---|
2502 | skipanchors = 0;
|
---|
2503 | minl = 0;
|
---|
2504 | }
|
---|
2505 |
|
---|
2506 | void QRegExpEngine::Box::catAnchor( int a )
|
---|
2507 | {
|
---|
2508 | if ( a != 0 ) {
|
---|
2509 | for ( int i = 0; i < (int) rs.size(); i++ ) {
|
---|
2510 | a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
|
---|
2511 | ranchors.insert( rs[i], a );
|
---|
2512 | }
|
---|
2513 | if ( minl == 0 )
|
---|
2514 | skipanchors = eng->anchorConcatenation( skipanchors, a );
|
---|
2515 | }
|
---|
2516 | }
|
---|
2517 |
|
---|
2518 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2519 | void QRegExpEngine::Box::setupHeuristics()
|
---|
2520 | {
|
---|
2521 | eng->goodEarlyStart = earlyStart;
|
---|
2522 | eng->goodLateStart = lateStart;
|
---|
2523 | eng->goodStr = eng->cs ? str : str.lower();
|
---|
2524 |
|
---|
2525 | eng->minl = minl;
|
---|
2526 | if ( eng->cs ) {
|
---|
2527 | /*
|
---|
2528 | A regular expression such as 112|1 has occ1['2'] = 2 and minl =
|
---|
2529 | 1 at this point. An entry of occ1 has to be at most minl or
|
---|
2530 | infinity for the rest of the algorithm to go well.
|
---|
2531 |
|
---|
2532 | We waited until here before normalizing these cases (instead of
|
---|
2533 | doing it in Box::orx()) because sometimes things improve by
|
---|
2534 | themselves. Consider for example (112|1)34.
|
---|
2535 | */
|
---|
2536 | for ( int i = 0; i < NumBadChars; i++ ) {
|
---|
2537 | if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
|
---|
2538 | occ1[i] = minl;
|
---|
2539 | }
|
---|
2540 | eng->occ1 = occ1;
|
---|
2541 | } else {
|
---|
2542 | eng->occ1.fill( 0, NumBadChars );
|
---|
2543 | }
|
---|
2544 |
|
---|
2545 | eng->heuristicallyChooseHeuristic();
|
---|
2546 | }
|
---|
2547 | #endif
|
---|
2548 |
|
---|
#if defined(QT_DEBUG)
/*
  Debugging aid: prints the minimum length, the left and right states
  (with their anchors, if any) and the skip anchors through qDebug().
*/
void QRegExpEngine::Box::dump() const
{
    qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );

    qDebug( " Left states:" );
    const int numLeft = (int) ls.size();
    for ( int k = 0; k < numLeft; k++ ) {
        if ( at(lanchors, ls[k]) != 0 )
            qDebug( " %d [anchors 0x%.8x]", ls[k], lanchors[ls[k]] );
        else
            qDebug( " %d", ls[k] );
    }

    qDebug( " Right states:" );
    const int numRight = (int) rs.size();
    for ( int k = 0; k < numRight; k++ ) {
        if ( at(ranchors, rs[k]) != 0 )
            qDebug( " %d [anchors 0x%.8x]", rs[k], ranchors[rs[k]] );
        else
            qDebug( " %d", rs[k] );
    }

    qDebug( " Skip anchors: 0x%.8x", skipanchors );
}
#endif
|
---|
2571 |
|
---|
2572 | void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
|
---|
2573 | {
|
---|
2574 | for ( int i = 0; i < (int) to.ls.size(); i++ ) {
|
---|
2575 | for ( int j = 0; j < (int) rs.size(); j++ ) {
|
---|
2576 | int a = eng->anchorConcatenation( at(ranchors, rs[j]),
|
---|
2577 | at(to.lanchors, to.ls[i]) );
|
---|
2578 | eng->addAnchors( rs[j], to.ls[i], a );
|
---|
2579 | }
|
---|
2580 | }
|
---|
2581 | }
|
---|
2582 |
|
---|
2583 | int QRegExpEngine::getChar()
|
---|
2584 | {
|
---|
2585 | return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
|
---|
2586 | }
|
---|
2587 |
|
---|
2588 | int QRegExpEngine::getEscape()
|
---|
2589 | {
|
---|
2590 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2591 | const char tab[] = "afnrtv"; // no b, as \b means word boundary
|
---|
2592 | const char backTab[] = "\a\f\n\r\t\v";
|
---|
2593 | ushort low;
|
---|
2594 | int i;
|
---|
2595 | #endif
|
---|
2596 | ushort val;
|
---|
2597 | int prevCh = yyCh;
|
---|
2598 |
|
---|
2599 | if ( prevCh == EOS ) {
|
---|
2600 | error( RXERR_END );
|
---|
2601 | return Tok_Char | '\\';
|
---|
2602 | }
|
---|
2603 | yyCh = getChar();
|
---|
2604 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2605 | if ( (prevCh & ~0xff) == 0 ) {
|
---|
2606 | const char *p = strchr( tab, prevCh );
|
---|
2607 | if ( p != 0 )
|
---|
2608 | return Tok_Char | backTab[p - tab];
|
---|
2609 | }
|
---|
2610 | #endif
|
---|
2611 |
|
---|
2612 | switch ( prevCh ) {
|
---|
2613 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2614 | case '0':
|
---|
2615 | val = 0;
|
---|
2616 | for ( i = 0; i < 3; i++ ) {
|
---|
2617 | if ( yyCh >= '0' && yyCh <= '7' )
|
---|
2618 | val = ( val << 3 ) | ( yyCh - '0' );
|
---|
2619 | else
|
---|
2620 | break;
|
---|
2621 | yyCh = getChar();
|
---|
2622 | }
|
---|
2623 | if ( (val & ~0377) != 0 )
|
---|
2624 | error( RXERR_OCTAL );
|
---|
2625 | return Tok_Char | val;
|
---|
2626 | #endif
|
---|
2627 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2628 | case 'B':
|
---|
2629 | return Tok_NonWord;
|
---|
2630 | #endif
|
---|
2631 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2632 | case 'D':
|
---|
2633 | // see QChar::isDigit()
|
---|
2634 | yyCharClass->addCategories( 0x7fffffef );
|
---|
2635 | return Tok_CharClass;
|
---|
2636 | case 'S':
|
---|
2637 | // see QChar::isSpace()
|
---|
2638 | yyCharClass->addCategories( 0x7ffff87f );
|
---|
2639 | yyCharClass->addRange( 0x0000, 0x0008 );
|
---|
2640 | yyCharClass->addRange( 0x000e, 0x001f );
|
---|
2641 | yyCharClass->addRange( 0x007f, 0x009f );
|
---|
2642 | return Tok_CharClass;
|
---|
2643 | case 'W':
|
---|
2644 | // see QChar::isLetterOrNumber()
|
---|
2645 | yyCharClass->addCategories( 0x7fe07f8f );
|
---|
2646 | yyCharClass->addRange( 0x203f, 0x2040 );
|
---|
2647 | yyCharClass->addSingleton( 0x2040 );
|
---|
2648 | yyCharClass->addSingleton( 0x30fb );
|
---|
2649 | yyCharClass->addRange( 0xfe33, 0xfe34 );
|
---|
2650 | yyCharClass->addRange( 0xfe4d, 0xfe4f );
|
---|
2651 | yyCharClass->addSingleton( 0xff3f );
|
---|
2652 | yyCharClass->addSingleton( 0xff65 );
|
---|
2653 | return Tok_CharClass;
|
---|
2654 | #endif
|
---|
2655 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2656 | case 'b':
|
---|
2657 | return Tok_Word;
|
---|
2658 | #endif
|
---|
2659 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2660 | case 'd':
|
---|
2661 | // see QChar::isDigit()
|
---|
2662 | yyCharClass->addCategories( 0x00000010 );
|
---|
2663 | return Tok_CharClass;
|
---|
2664 | case 's':
|
---|
2665 | // see QChar::isSpace()
|
---|
2666 | yyCharClass->addCategories( 0x00000380 );
|
---|
2667 | yyCharClass->addRange( 0x0009, 0x000d );
|
---|
2668 | return Tok_CharClass;
|
---|
2669 | case 'w':
|
---|
2670 | // see QChar::isLetterOrNumber()
|
---|
2671 | yyCharClass->addCategories( 0x000f8070 );
|
---|
2672 | yyCharClass->addSingleton( 0x005f ); // '_'
|
---|
2673 | return Tok_CharClass;
|
---|
2674 | #endif
|
---|
2675 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2676 | case 'x':
|
---|
2677 | val = 0;
|
---|
2678 | for ( i = 0; i < 4; i++ ) {
|
---|
2679 | low = QChar( yyCh ).lower();
|
---|
2680 | if ( low >= '0' && low <= '9' )
|
---|
2681 | val = ( val << 4 ) | ( low - '0' );
|
---|
2682 | else if ( low >= 'a' && low <= 'f' )
|
---|
2683 | val = ( val << 4 ) | ( low - 'a' + 10 );
|
---|
2684 | else
|
---|
2685 | break;
|
---|
2686 | yyCh = getChar();
|
---|
2687 | }
|
---|
2688 | return Tok_Char | val;
|
---|
2689 | #endif
|
---|
2690 | default:
|
---|
2691 | if ( prevCh >= '1' && prevCh <= '9' ) {
|
---|
2692 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2693 | val = prevCh - '0';
|
---|
2694 | while ( yyCh >= '0' && yyCh <= '9' ) {
|
---|
2695 | val = ( val *= 10 ) | ( yyCh - '0' );
|
---|
2696 | yyCh = getChar();
|
---|
2697 | }
|
---|
2698 | return Tok_BackRef | val;
|
---|
2699 | #else
|
---|
2700 | error( RXERR_DISABLED );
|
---|
2701 | #endif
|
---|
2702 | }
|
---|
2703 | return Tok_Char | prevCh;
|
---|
2704 | }
|
---|
2705 | }
|
---|
2706 |
|
---|
2707 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
2708 | int QRegExpEngine::getRep( int def )
|
---|
2709 | {
|
---|
2710 | if ( yyCh >= '0' && yyCh <= '9' ) {
|
---|
2711 | int rep = 0;
|
---|
2712 | do {
|
---|
2713 | rep = 10 * rep + yyCh - '0';
|
---|
2714 | if ( rep >= InftyRep ) {
|
---|
2715 | error( RXERR_REPETITION );
|
---|
2716 | rep = def;
|
---|
2717 | }
|
---|
2718 | yyCh = getChar();
|
---|
2719 | } while ( yyCh >= '0' && yyCh <= '9' );
|
---|
2720 | return rep;
|
---|
2721 | } else {
|
---|
2722 | return def;
|
---|
2723 | }
|
---|
2724 | }
|
---|
2725 | #endif
|
---|
2726 |
|
---|
2727 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
2728 | void QRegExpEngine::skipChars( int n )
|
---|
2729 | {
|
---|
2730 | if ( n > 0 ) {
|
---|
2731 | yyPos += n - 1;
|
---|
2732 | yyCh = getChar();
|
---|
2733 | }
|
---|
2734 | }
|
---|
2735 | #endif
|
---|
2736 |
|
---|
2737 | void QRegExpEngine::error( const char *msg )
|
---|
2738 | {
|
---|
2739 | if ( yyError.isEmpty() )
|
---|
2740 | yyError = QString::fromLatin1( msg );
|
---|
2741 | }
|
---|
2742 |
|
---|
2743 | void QRegExpEngine::startTokenizer( const QChar *rx, int len )
|
---|
2744 | {
|
---|
2745 | yyIn = rx;
|
---|
2746 | yyPos0 = 0;
|
---|
2747 | yyPos = 0;
|
---|
2748 | yyLen = len;
|
---|
2749 | yyCh = getChar();
|
---|
2750 | yyCharClass = new CharClass;
|
---|
2751 | yyMinRep = 0;
|
---|
2752 | yyMaxRep = 0;
|
---|
2753 | yyError = QString();
|
---|
2754 | }
|
---|
2755 |
|
---|
2756 | int QRegExpEngine::getToken()
|
---|
2757 | {
|
---|
2758 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2759 | ushort pendingCh = 0;
|
---|
2760 | bool charPending;
|
---|
2761 | bool rangePending;
|
---|
2762 | int tok;
|
---|
2763 | #endif
|
---|
2764 | int prevCh = yyCh;
|
---|
2765 |
|
---|
2766 | yyPos0 = yyPos - 1;
|
---|
2767 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2768 | yyCharClass->clear();
|
---|
2769 | #endif
|
---|
2770 | yyMinRep = 0;
|
---|
2771 | yyMaxRep = 0;
|
---|
2772 | yyCh = getChar();
|
---|
2773 |
|
---|
2774 | switch ( prevCh ) {
|
---|
2775 | case EOS:
|
---|
2776 | yyPos0 = yyPos;
|
---|
2777 | return Tok_Eos;
|
---|
2778 | case '$':
|
---|
2779 | return Tok_Dollar;
|
---|
2780 | case '(':
|
---|
2781 | if ( yyCh == '?' ) {
|
---|
2782 | prevCh = getChar();
|
---|
2783 | yyCh = getChar();
|
---|
2784 | switch ( prevCh ) {
|
---|
2785 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
2786 | case '!':
|
---|
2787 | return Tok_NegLookahead;
|
---|
2788 | case '=':
|
---|
2789 | return Tok_PosLookahead;
|
---|
2790 | #endif
|
---|
2791 | case ':':
|
---|
2792 | return Tok_MagicLeftParen;
|
---|
2793 | default:
|
---|
2794 | error( RXERR_LOOKAHEAD );
|
---|
2795 | return Tok_MagicLeftParen;
|
---|
2796 | }
|
---|
2797 | } else {
|
---|
2798 | return Tok_LeftParen;
|
---|
2799 | }
|
---|
2800 | case ')':
|
---|
2801 | return Tok_RightParen;
|
---|
2802 | case '*':
|
---|
2803 | yyMinRep = 0;
|
---|
2804 | yyMaxRep = InftyRep;
|
---|
2805 | return Tok_Quantifier;
|
---|
2806 | case '+':
|
---|
2807 | yyMinRep = 1;
|
---|
2808 | yyMaxRep = InftyRep;
|
---|
2809 | return Tok_Quantifier;
|
---|
2810 | case '.':
|
---|
2811 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2812 | yyCharClass->setNegative( TRUE );
|
---|
2813 | #endif
|
---|
2814 | return Tok_CharClass;
|
---|
2815 | case '?':
|
---|
2816 | yyMinRep = 0;
|
---|
2817 | yyMaxRep = 1;
|
---|
2818 | return Tok_Quantifier;
|
---|
2819 | case '[':
|
---|
2820 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2821 | if ( yyCh == '^' ) {
|
---|
2822 | yyCharClass->setNegative( TRUE );
|
---|
2823 | yyCh = getChar();
|
---|
2824 | }
|
---|
2825 | charPending = FALSE;
|
---|
2826 | rangePending = FALSE;
|
---|
2827 | do {
|
---|
2828 | if ( yyCh == '-' && charPending && !rangePending ) {
|
---|
2829 | rangePending = TRUE;
|
---|
2830 | yyCh = getChar();
|
---|
2831 | } else {
|
---|
2832 | if ( charPending && !rangePending ) {
|
---|
2833 | yyCharClass->addSingleton( pendingCh );
|
---|
2834 | charPending = FALSE;
|
---|
2835 | }
|
---|
2836 | if ( yyCh == '\\' ) {
|
---|
2837 | yyCh = getChar();
|
---|
2838 | tok = getEscape();
|
---|
2839 | if ( tok == Tok_Word )
|
---|
2840 | tok = '\b';
|
---|
2841 | } else {
|
---|
2842 | tok = Tok_Char | yyCh;
|
---|
2843 | yyCh = getChar();
|
---|
2844 | }
|
---|
2845 | if ( tok == Tok_CharClass ) {
|
---|
2846 | if ( rangePending ) {
|
---|
2847 | yyCharClass->addSingleton( '-' );
|
---|
2848 | yyCharClass->addSingleton( pendingCh );
|
---|
2849 | charPending = FALSE;
|
---|
2850 | rangePending = FALSE;
|
---|
2851 | }
|
---|
2852 | } else if ( (tok & Tok_Char) != 0 ) {
|
---|
2853 | if ( rangePending ) {
|
---|
2854 | yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
|
---|
2855 | charPending = FALSE;
|
---|
2856 | rangePending = FALSE;
|
---|
2857 | } else {
|
---|
2858 | pendingCh = tok ^ Tok_Char;
|
---|
2859 | charPending = TRUE;
|
---|
2860 | }
|
---|
2861 | } else {
|
---|
2862 | error( RXERR_CHARCLASS );
|
---|
2863 | }
|
---|
2864 | }
|
---|
2865 | } while ( yyCh != ']' && yyCh != EOS );
|
---|
2866 | if ( rangePending )
|
---|
2867 | yyCharClass->addSingleton( '-' );
|
---|
2868 | if ( charPending )
|
---|
2869 | yyCharClass->addSingleton( pendingCh );
|
---|
2870 | if ( yyCh == EOS )
|
---|
2871 | error( RXERR_END );
|
---|
2872 | else
|
---|
2873 | yyCh = getChar();
|
---|
2874 | return Tok_CharClass;
|
---|
2875 | #else
|
---|
2876 | error( RXERR_END );
|
---|
2877 | return Tok_Char | '[';
|
---|
2878 | #endif
|
---|
2879 | case '\\':
|
---|
2880 | return getEscape();
|
---|
2881 | case ']':
|
---|
2882 | error( RXERR_LEFTDELIM );
|
---|
2883 | return Tok_Char | ']';
|
---|
2884 | case '^':
|
---|
2885 | return Tok_Caret;
|
---|
2886 | case '{':
|
---|
2887 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
2888 | yyMinRep = getRep( 0 );
|
---|
2889 | yyMaxRep = yyMinRep;
|
---|
2890 | if ( yyCh == ',' ) {
|
---|
2891 | yyCh = getChar();
|
---|
2892 | yyMaxRep = getRep( InftyRep );
|
---|
2893 | }
|
---|
2894 | if ( yyMaxRep < yyMinRep )
|
---|
2895 | qSwap( yyMinRep, yyMaxRep );
|
---|
2896 | if ( yyCh != '}' )
|
---|
2897 | error( RXERR_REPETITION );
|
---|
2898 | yyCh = getChar();
|
---|
2899 | return Tok_Quantifier;
|
---|
2900 | #else
|
---|
2901 | error( RXERR_DISABLED );
|
---|
2902 | return Tok_Char | '{';
|
---|
2903 | #endif
|
---|
2904 | case '|':
|
---|
2905 | return Tok_Bar;
|
---|
2906 | case '}':
|
---|
2907 | error( RXERR_LEFTDELIM );
|
---|
2908 | return Tok_Char | '}';
|
---|
2909 | default:
|
---|
2910 | return Tok_Char | prevCh;
|
---|
2911 | }
|
---|
2912 | }
|
---|
2913 |
|
---|
2914 | int QRegExpEngine::parse( const QChar *pattern, int len )
|
---|
2915 | {
|
---|
2916 | valid = TRUE;
|
---|
2917 | startTokenizer( pattern, len );
|
---|
2918 | yyTok = getToken();
|
---|
2919 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2920 | yyMayCapture = TRUE;
|
---|
2921 | #else
|
---|
2922 | yyMayCapture = FALSE;
|
---|
2923 | #endif
|
---|
2924 |
|
---|
2925 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2926 | int atom = startAtom( FALSE );
|
---|
2927 | #endif
|
---|
2928 | CharClass anything;
|
---|
2929 | Box box( this ); // create InitialState
|
---|
2930 | box.set( anything );
|
---|
2931 | Box rightBox( this ); // create FinalState
|
---|
2932 | rightBox.set( anything );
|
---|
2933 |
|
---|
2934 | Box middleBox( this );
|
---|
2935 | parseExpression( &middleBox );
|
---|
2936 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2937 | finishAtom( atom );
|
---|
2938 | #endif
|
---|
2939 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2940 | middleBox.setupHeuristics();
|
---|
2941 | #endif
|
---|
2942 | box.cat( middleBox );
|
---|
2943 | box.cat( rightBox );
|
---|
2944 | delete yyCharClass;
|
---|
2945 | yyCharClass = 0;
|
---|
2946 |
|
---|
2947 | officialncap = ncap;
|
---|
2948 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2949 | if ( nbrefs > ncap )
|
---|
2950 | ncap = nbrefs;
|
---|
2951 | #endif
|
---|
2952 |
|
---|
2953 | /*
|
---|
2954 | We use one QMemArray<int> for all the big data used a lot in
|
---|
2955 | matchHere() and friends.
|
---|
2956 | */
|
---|
2957 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2958 | mmSlideTabSize = QMAX( minl + 1, 16 );
|
---|
2959 | #else
|
---|
2960 | mmSlideTabSize = 0;
|
---|
2961 | #endif
|
---|
2962 | mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
|
---|
2963 |
|
---|
2964 | mmInNextStack = mmBigArray.data();
|
---|
2965 | memset( mmInNextStack, -1, ns * sizeof(int) );
|
---|
2966 | mmCurStack = mmInNextStack + ns;
|
---|
2967 | mmNextStack = mmInNextStack + 2 * ns;
|
---|
2968 |
|
---|
2969 | mmCurCapBegin = mmInNextStack + 3 * ns;
|
---|
2970 | mmNextCapBegin = mmCurCapBegin + ncap * ns;
|
---|
2971 | mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
|
---|
2972 | mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
|
---|
2973 |
|
---|
2974 | mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
|
---|
2975 | mmTempCapEnd = mmTempCapBegin + ncap;
|
---|
2976 | mmCapBegin = mmTempCapBegin + 2 * ncap;
|
---|
2977 | mmCapEnd = mmTempCapBegin + 3 * ncap;
|
---|
2978 |
|
---|
2979 | mmSlideTab = mmTempCapBegin + 4 * ncap;
|
---|
2980 |
|
---|
2981 | if ( !yyError.isEmpty() )
|
---|
2982 | return -1;
|
---|
2983 |
|
---|
2984 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2985 | State *sinit = s[InitialState];
|
---|
2986 | caretAnchored = ( sinit->anchors != 0 );
|
---|
2987 | if ( caretAnchored ) {
|
---|
2988 | QMap<int, int>& anchors = *sinit->anchors;
|
---|
2989 | QMap<int, int>::ConstIterator a;
|
---|
2990 | for ( a = anchors.begin(); a != anchors.end(); ++a ) {
|
---|
2991 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
2992 | if ( (*a & Anchor_Alternation) != 0 )
|
---|
2993 | break;
|
---|
2994 | #endif
|
---|
2995 | if ( (*a & Anchor_Caret) == 0 ) {
|
---|
2996 | caretAnchored = FALSE;
|
---|
2997 | break;
|
---|
2998 | }
|
---|
2999 | }
|
---|
3000 | }
|
---|
3001 | #endif
|
---|
3002 | return yyPos0;
|
---|
3003 | }
|
---|
3004 |
|
---|
3005 | void QRegExpEngine::parseAtom( Box *box )
|
---|
3006 | {
|
---|
3007 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
3008 | QRegExpEngine *eng = 0;
|
---|
3009 | bool neg;
|
---|
3010 | int len;
|
---|
3011 | #endif
|
---|
3012 |
|
---|
3013 | if ( (yyTok & Tok_Char) != 0 ) {
|
---|
3014 | box->set( QChar(yyTok ^ Tok_Char) );
|
---|
3015 | } else {
|
---|
3016 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3017 | trivial = FALSE;
|
---|
3018 | #endif
|
---|
3019 | switch ( yyTok ) {
|
---|
3020 | case Tok_Dollar:
|
---|
3021 | box->catAnchor( Anchor_Dollar );
|
---|
3022 | break;
|
---|
3023 | case Tok_Caret:
|
---|
3024 | box->catAnchor( Anchor_Caret );
|
---|
3025 | break;
|
---|
3026 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
3027 | case Tok_PosLookahead:
|
---|
3028 | case Tok_NegLookahead:
|
---|
3029 | neg = ( yyTok == Tok_NegLookahead );
|
---|
3030 | eng = new QRegExpEngine( cs );
|
---|
3031 | len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
|
---|
3032 | if ( len >= 0 )
|
---|
3033 | skipChars( len );
|
---|
3034 | else
|
---|
3035 | error( RXERR_LOOKAHEAD );
|
---|
3036 | box->catAnchor( addLookahead(eng, neg) );
|
---|
3037 | yyTok = getToken();
|
---|
3038 | if ( yyTok != Tok_RightParen )
|
---|
3039 | error( RXERR_LOOKAHEAD );
|
---|
3040 | break;
|
---|
3041 | #endif
|
---|
3042 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
3043 | case Tok_Word:
|
---|
3044 | box->catAnchor( Anchor_Word );
|
---|
3045 | break;
|
---|
3046 | case Tok_NonWord:
|
---|
3047 | box->catAnchor( Anchor_NonWord );
|
---|
3048 | break;
|
---|
3049 | #endif
|
---|
3050 | case Tok_LeftParen:
|
---|
3051 | case Tok_MagicLeftParen:
|
---|
3052 | yyTok = getToken();
|
---|
3053 | parseExpression( box );
|
---|
3054 | if ( yyTok != Tok_RightParen )
|
---|
3055 | error( RXERR_END );
|
---|
3056 | break;
|
---|
3057 | case Tok_CharClass:
|
---|
3058 | box->set( *yyCharClass );
|
---|
3059 | break;
|
---|
3060 | case Tok_Quantifier:
|
---|
3061 | error( RXERR_REPETITION );
|
---|
3062 | break;
|
---|
3063 | default:
|
---|
3064 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
3065 | if ( (yyTok & Tok_BackRef) != 0 )
|
---|
3066 | box->set( yyTok ^ Tok_BackRef );
|
---|
3067 | else
|
---|
3068 | #endif
|
---|
3069 | error( RXERR_DISABLED );
|
---|
3070 | }
|
---|
3071 | }
|
---|
3072 | yyTok = getToken();
|
---|
3073 | }
|
---|
3074 |
|
---|
3075 | void QRegExpEngine::parseFactor( Box *box )
|
---|
3076 | {
|
---|
3077 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3078 | int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
|
---|
3079 | #else
|
---|
3080 | static const int atom = 0;
|
---|
3081 | #endif
|
---|
3082 |
|
---|
3083 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3084 | #define YYREDO() \
|
---|
3085 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
|
---|
3086 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
|
---|
3087 |
|
---|
3088 | const QChar *in = yyIn;
|
---|
3089 | int pos0 = yyPos0;
|
---|
3090 | int pos = yyPos;
|
---|
3091 | int len = yyLen;
|
---|
3092 | int ch = yyCh;
|
---|
3093 | CharClass charClass;
|
---|
3094 | if ( yyTok == Tok_CharClass )
|
---|
3095 | charClass = *yyCharClass;
|
---|
3096 | int tok = yyTok;
|
---|
3097 | bool mayCapture = yyMayCapture;
|
---|
3098 | #endif
|
---|
3099 |
|
---|
3100 | parseAtom( box );
|
---|
3101 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3102 | finishAtom( atom );
|
---|
3103 | #endif
|
---|
3104 |
|
---|
3105 | if ( yyTok == Tok_Quantifier ) {
|
---|
3106 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3107 | trivial = FALSE;
|
---|
3108 | #endif
|
---|
3109 | if ( yyMaxRep == InftyRep ) {
|
---|
3110 | box->plus( atom );
|
---|
3111 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3112 | } else if ( yyMaxRep == 0 ) {
|
---|
3113 | box->clear();
|
---|
3114 | #endif
|
---|
3115 | }
|
---|
3116 | if ( yyMinRep == 0 )
|
---|
3117 | box->opt();
|
---|
3118 |
|
---|
3119 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3120 | yyMayCapture = FALSE;
|
---|
3121 | int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
|
---|
3122 | int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
|
---|
3123 |
|
---|
3124 | Box rightBox( this );
|
---|
3125 | int i;
|
---|
3126 |
|
---|
3127 | for ( i = 0; i < beta; i++ ) {
|
---|
3128 | YYREDO();
|
---|
3129 | Box leftBox( this );
|
---|
3130 | parseAtom( &leftBox );
|
---|
3131 | leftBox.cat( rightBox );
|
---|
3132 | leftBox.opt();
|
---|
3133 | rightBox = leftBox;
|
---|
3134 | }
|
---|
3135 | for ( i = 0; i < alpha; i++ ) {
|
---|
3136 | YYREDO();
|
---|
3137 | Box leftBox( this );
|
---|
3138 | parseAtom( &leftBox );
|
---|
3139 | leftBox.cat( rightBox );
|
---|
3140 | rightBox = leftBox;
|
---|
3141 | }
|
---|
3142 | rightBox.cat( *box );
|
---|
3143 | *box = rightBox;
|
---|
3144 | #endif
|
---|
3145 | yyTok = getToken();
|
---|
3146 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3147 | yyMayCapture = mayCapture;
|
---|
3148 | #endif
|
---|
3149 | }
|
---|
3150 | #undef YYREDO
|
---|
3151 | }
|
---|
3152 |
|
---|
3153 | void QRegExpEngine::parseTerm( Box *box )
|
---|
3154 | {
|
---|
3155 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3156 | if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
|
---|
3157 | parseFactor( box );
|
---|
3158 | #endif
|
---|
3159 | while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
|
---|
3160 | Box rightBox( this );
|
---|
3161 | parseFactor( &rightBox );
|
---|
3162 | box->cat( rightBox );
|
---|
3163 | }
|
---|
3164 | }
|
---|
3165 |
|
---|
3166 | void QRegExpEngine::parseExpression( Box *box )
|
---|
3167 | {
|
---|
3168 | parseTerm( box );
|
---|
3169 | while ( yyTok == Tok_Bar ) {
|
---|
3170 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3171 | trivial = FALSE;
|
---|
3172 | #endif
|
---|
3173 | Box rightBox( this );
|
---|
3174 | yyTok = getToken();
|
---|
3175 | parseTerm( &rightBox );
|
---|
3176 | box->orx( rightBox );
|
---|
3177 | }
|
---|
3178 | }
|
---|
3179 |
|
---|
3180 | /*
|
---|
3181 | The struct QRegExpPrivate contains the private data of a regular
|
---|
3182 | expression other than the automaton. It makes it possible for many
|
---|
3183 | QRegExp objects to use the same QRegExpEngine object with different
|
---|
3184 | QRegExpPrivate objects.
|
---|
3185 | */
|
---|
3186 | struct QRegExpPrivate
|
---|
3187 | {
|
---|
3188 | QString pattern; // regular-expression or wildcard pattern
|
---|
3189 | QString rxpattern; // regular-expression pattern
|
---|
3190 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3191 | bool wc : 1; // wildcard mode?
|
---|
3192 | #endif
|
---|
3193 | bool min : 1; // minimal matching? (instead of maximal)
|
---|
3194 | bool cs : 1; // case sensitive?
|
---|
3195 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3196 | QString t; // last string passed to QRegExp::search() or searchRev()
|
---|
3197 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last
|
---|
3198 | #endif
|
---|
3199 | QMemArray<int> captured; // what QRegExpEngine::search() returned last
|
---|
3200 |
|
---|
3201 | QRegExpPrivate() { captured.fill( -1, 2 ); }
|
---|
3202 | };
|
---|
3203 |
|
---|
3204 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3205 | static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache;
|
---|
3206 | # ifndef QT_THREAD_SUPPORT
|
---|
3207 | static QCache<QRegExpEngine> *engineCache = 0;
|
---|
3208 | # endif // QT_THREAD_SUPPORT
|
---|
3209 | #endif // QT_NO_REGEXP_OPTIM
|
---|
3210 |
|
---|
3211 | static void regexpEngine( QRegExpEngine *&eng, const QString &pattern,
|
---|
3212 | bool caseSensitive, bool deref )
|
---|
3213 | {
|
---|
3214 | # ifdef QT_THREAD_SUPPORT
|
---|
3215 | static QThreadStorage<QCache<QRegExpEngine> *> engineCaches;
|
---|
3216 | QCache<QRegExpEngine> *&engineCache = engineCaches.localData();
|
---|
3217 | #endif // QT_THREAD_SUPPORT
|
---|
3218 |
|
---|
3219 | if ( !deref ) {
|
---|
3220 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3221 | if ( engineCache != 0 ) {
|
---|
3222 | eng = engineCache->take( pattern );
|
---|
3223 | if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
|
---|
3224 | delete eng;
|
---|
3225 | } else {
|
---|
3226 | eng->ref();
|
---|
3227 | return;
|
---|
3228 | }
|
---|
3229 | }
|
---|
3230 | #endif // QT_NO_REGEXP_OPTIM
|
---|
3231 | eng = new QRegExpEngine( pattern, caseSensitive );
|
---|
3232 | return;
|
---|
3233 | }
|
---|
3234 |
|
---|
3235 | if ( eng->deref() ) {
|
---|
3236 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3237 | if ( engineCache == 0 ) {
|
---|
3238 | engineCache = new QCache<QRegExpEngine>;
|
---|
3239 | engineCache->setAutoDelete( TRUE );
|
---|
3240 | # ifndef QT_THREAD_SUPPORT
|
---|
3241 | cleanup_cache.set( &engineCache );
|
---|
3242 | # endif // !QT_THREAD_SUPPORT
|
---|
3243 | }
|
---|
3244 | if ( !pattern.isNull() &&
|
---|
3245 | engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
|
---|
3246 | return;
|
---|
3247 | #else
|
---|
3248 | Q_UNUSED( pattern );
|
---|
3249 | #endif // QT_NO_REGEXP_OPTIM
|
---|
3250 | delete eng;
|
---|
3251 | eng = 0;
|
---|
3252 | }
|
---|
3253 | }
|
---|
3254 |
|
---|
3255 | /*!
|
---|
3256 | \enum QRegExp::CaretMode
|
---|
3257 |
|
---|
3258 | The CaretMode enum defines the different meanings of the caret
|
---|
3259 | (<b>^</b>) in a regular expression. The possible values are:
|
---|
3260 |
|
---|
3261 | \value CaretAtZero
|
---|
3262 | The caret corresponds to index 0 in the searched string.
|
---|
3263 |
|
---|
3264 | \value CaretAtOffset
|
---|
3265 | The caret corresponds to the start offset of the search.
|
---|
3266 |
|
---|
3267 | \value CaretWontMatch
|
---|
3268 | The caret never matches.
|
---|
3269 | */
|
---|
3270 |
|
---|
3271 | /*!
|
---|
3272 | Constructs an empty regexp.
|
---|
3273 |
|
---|
3274 | \sa isValid() errorString()
|
---|
3275 | */
|
---|
3276 | QRegExp::QRegExp()
|
---|
3277 | : eng( 0 )
|
---|
3278 | {
|
---|
3279 | priv = new QRegExpPrivate;
|
---|
3280 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3281 | priv->wc = FALSE;
|
---|
3282 | #endif
|
---|
3283 | priv->min = FALSE;
|
---|
3284 | priv->cs = TRUE;
|
---|
3285 | }
|
---|
3286 |
|
---|
3287 | /*!
|
---|
3288 | Constructs a regular expression object for the given \a pattern
|
---|
3289 | string. The pattern must be given using wildcard notation if \a
|
---|
3290 | wildcard is TRUE (default is FALSE). The pattern is case
|
---|
3291 | sensitive, unless \a caseSensitive is FALSE. Matching is greedy
|
---|
3292 | (maximal), but can be changed by calling setMinimal().
|
---|
3293 |
|
---|
3294 | \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
|
---|
3295 | */
|
---|
3296 | QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard )
|
---|
3297 | : eng( 0 )
|
---|
3298 | {
|
---|
3299 | priv = new QRegExpPrivate;
|
---|
3300 | priv->pattern = pattern;
|
---|
3301 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3302 | priv->wc = wildcard;
|
---|
3303 | #endif
|
---|
3304 | priv->min = FALSE;
|
---|
3305 | priv->cs = caseSensitive;
|
---|
3306 | }
|
---|
3307 |
|
---|
3308 | /*!
|
---|
3309 | Constructs a regular expression as a copy of \a rx.
|
---|
3310 |
|
---|
3311 | \sa operator=()
|
---|
3312 | */
|
---|
3313 | QRegExp::QRegExp( const QRegExp& rx )
|
---|
3314 | : eng( 0 )
|
---|
3315 | {
|
---|
3316 | priv = new QRegExpPrivate;
|
---|
3317 | operator=( rx );
|
---|
3318 | }
|
---|
3319 |
|
---|
3320 | /*!
|
---|
3321 | Destroys the regular expression and cleans up its internal data.
|
---|
3322 | */
|
---|
3323 | QRegExp::~QRegExp()
|
---|
3324 | {
|
---|
3325 | invalidateEngine();
|
---|
3326 | delete priv;
|
---|
3327 | }
|
---|
3328 |
|
---|
3329 | /*!
|
---|
3330 | Copies the regular expression \a rx and returns a reference to the
|
---|
3331 | copy. The case sensitivity, wildcard and minimal matching options
|
---|
3332 | are also copied.
|
---|
3333 | */
|
---|
3334 | QRegExp& QRegExp::operator=( const QRegExp& rx )
|
---|
3335 | {
|
---|
3336 | QRegExpEngine *otherEng = rx.eng;
|
---|
3337 | if ( otherEng != 0 )
|
---|
3338 | otherEng->ref();
|
---|
3339 | invalidateEngine();
|
---|
3340 | eng = otherEng;
|
---|
3341 | priv->pattern = rx.priv->pattern;
|
---|
3342 | priv->rxpattern = rx.priv->rxpattern;
|
---|
3343 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3344 | priv->wc = rx.priv->wc;
|
---|
3345 | #endif
|
---|
3346 | priv->min = rx.priv->min;
|
---|
3347 | priv->cs = rx.priv->cs;
|
---|
3348 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3349 | priv->t = rx.priv->t;
|
---|
3350 | priv->capturedCache = rx.priv->capturedCache;
|
---|
3351 | #endif
|
---|
3352 | priv->captured = rx.priv->captured;
|
---|
3353 | return *this;
|
---|
3354 | }
|
---|
3355 |
|
---|
3356 | /*!
|
---|
3357 | Returns TRUE if this regular expression is equal to \a rx;
|
---|
3358 | otherwise returns FALSE.
|
---|
3359 |
|
---|
3360 | Two QRegExp objects are equal if they have the same pattern
|
---|
3361 | strings and the same settings for case sensitivity, wildcard and
|
---|
3362 | minimal matching.
|
---|
3363 | */
|
---|
3364 | bool QRegExp::operator==( const QRegExp& rx ) const
|
---|
3365 | {
|
---|
3366 | return priv->pattern == rx.priv->pattern &&
|
---|
3367 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3368 | priv->wc == rx.priv->wc &&
|
---|
3369 | #endif
|
---|
3370 | priv->min == rx.priv->min &&
|
---|
3371 | priv->cs == rx.priv->cs;
|
---|
3372 | }
|
---|
3373 |
|
---|
3374 | /*!
|
---|
3375 | \fn bool QRegExp::operator!=( const QRegExp& rx ) const
|
---|
3376 |
|
---|
3377 | Returns TRUE if this regular expression is not equal to \a rx;
|
---|
3378 | otherwise returns FALSE.
|
---|
3379 |
|
---|
3380 | \sa operator==()
|
---|
3381 | */
|
---|
3382 |
|
---|
3383 | /*!
|
---|
3384 | Returns TRUE if the pattern string is empty; otherwise returns
|
---|
3385 | FALSE.
|
---|
3386 |
|
---|
3387 | If you call exactMatch() with an empty pattern on an empty string
|
---|
3388 | it will return TRUE; otherwise it returns FALSE since it operates
|
---|
3389 | over the whole string. If you call search() with an empty pattern
|
---|
3390 | on \e any string it will return the start offset (0 by default)
|
---|
3391 | because the empty pattern matches the 'emptiness' at the start of
|
---|
3392 | the string. In this case the length of the match returned by
|
---|
3393 | matchedLength() will be 0.
|
---|
3394 |
|
---|
3395 | See QString::isEmpty().
|
---|
3396 | */
|
---|
3397 |
|
---|
3398 | bool QRegExp::isEmpty() const
|
---|
3399 | {
|
---|
3400 | return priv->pattern.isEmpty();
|
---|
3401 | }
|
---|
3402 |
|
---|
3403 | /*!
|
---|
3404 | Returns TRUE if the regular expression is valid; otherwise returns
|
---|
3405 | FALSE. An invalid regular expression never matches.
|
---|
3406 |
|
---|
3407 | The pattern <b>[a-z</b> is an example of an invalid pattern, since
|
---|
3408 | it lacks a closing square bracket.
|
---|
3409 |
|
---|
3410 | Note that the validity of a regexp may also depend on the setting
|
---|
3411 | of the wildcard flag, for example <b>*.html</b> is a valid
|
---|
3412 | wildcard regexp but an invalid full regexp.
|
---|
3413 |
|
---|
3414 | \sa errorString()
|
---|
3415 | */
|
---|
3416 | bool QRegExp::isValid() const
|
---|
3417 | {
|
---|
3418 | if ( priv->pattern.isEmpty() ) {
|
---|
3419 | return TRUE;
|
---|
3420 | } else {
|
---|
3421 | prepareEngine();
|
---|
3422 | return eng->isValid();
|
---|
3423 | }
|
---|
3424 | }
|
---|
3425 |
|
---|
3426 | /*!
|
---|
3427 | Returns the pattern string of the regular expression. The pattern
|
---|
3428 | has either regular expression syntax or wildcard syntax, depending
|
---|
3429 | on wildcard().
|
---|
3430 |
|
---|
3431 | \sa setPattern()
|
---|
3432 | */
|
---|
3433 | QString QRegExp::pattern() const
|
---|
3434 | {
|
---|
3435 | return priv->pattern;
|
---|
3436 | }
|
---|
3437 |
|
---|
3438 | /*!
|
---|
3439 | Sets the pattern string to \a pattern. The case sensitivity,
|
---|
3440 | wildcard and minimal matching options are not changed.
|
---|
3441 |
|
---|
3442 | \sa pattern()
|
---|
3443 | */
|
---|
3444 | void QRegExp::setPattern( const QString& pattern )
|
---|
3445 | {
|
---|
3446 | if ( priv->pattern != pattern ) {
|
---|
3447 | priv->pattern = pattern;
|
---|
3448 | invalidateEngine();
|
---|
3449 | }
|
---|
3450 | }
|
---|
3451 |
|
---|
3452 | /*!
|
---|
3453 | Returns TRUE if case sensitivity is enabled; otherwise returns
|
---|
3454 | FALSE. The default is TRUE.
|
---|
3455 |
|
---|
3456 | \sa setCaseSensitive()
|
---|
3457 | */
|
---|
3458 | bool QRegExp::caseSensitive() const
|
---|
3459 | {
|
---|
3460 | return priv->cs;
|
---|
3461 | }
|
---|
3462 |
|
---|
3463 | /*!
|
---|
3464 | Sets case sensitive matching to \a sensitive.
|
---|
3465 |
|
---|
3466 | If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
|
---|
3467 | not \c{README.TXT}.
|
---|
3468 |
|
---|
3469 | \sa caseSensitive()
|
---|
3470 | */
|
---|
3471 | void QRegExp::setCaseSensitive( bool sensitive )
|
---|
3472 | {
|
---|
3473 | if ( sensitive != priv->cs ) {
|
---|
3474 | priv->cs = sensitive;
|
---|
3475 | invalidateEngine();
|
---|
3476 | }
|
---|
3477 | }
|
---|
3478 |
|
---|
3479 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3480 | /*!
|
---|
3481 | Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
|
---|
3482 | The default is FALSE.
|
---|
3483 |
|
---|
3484 | \sa setWildcard()
|
---|
3485 | */
|
---|
3486 | bool QRegExp::wildcard() const
|
---|
3487 | {
|
---|
3488 | return priv->wc;
|
---|
3489 | }
|
---|
3490 |
|
---|
3491 | /*!
|
---|
3492 | Sets the wildcard mode for the regular expression. The default is
|
---|
3493 | FALSE.
|
---|
3494 |
|
---|
3495 | Setting \a wildcard to TRUE enables simple shell-like wildcard
|
---|
3496 | matching. (See \link #wildcard-matching wildcard matching
|
---|
3497 | (globbing) \endlink.)
|
---|
3498 |
|
---|
3499 | For example, <b>r*.txt</b> matches the string \c{readme.txt} in
|
---|
3500 | wildcard mode, but does not match \c{readme}.
|
---|
3501 |
|
---|
3502 | \sa wildcard()
|
---|
3503 | */
|
---|
3504 | void QRegExp::setWildcard( bool wildcard )
|
---|
3505 | {
|
---|
3506 | if ( wildcard != priv->wc ) {
|
---|
3507 | priv->wc = wildcard;
|
---|
3508 | invalidateEngine();
|
---|
3509 | }
|
---|
3510 | }
|
---|
3511 | #endif
|
---|
3512 |
|
---|
3513 | /*!
|
---|
3514 | Returns TRUE if minimal (non-greedy) matching is enabled;
|
---|
3515 | otherwise returns FALSE.
|
---|
3516 |
|
---|
3517 | \sa setMinimal()
|
---|
3518 | */
|
---|
3519 | bool QRegExp::minimal() const
|
---|
3520 | {
|
---|
3521 | return priv->min;
|
---|
3522 | }
|
---|
3523 |
|
---|
3524 | /*!
|
---|
3525 | Enables or disables minimal matching. If \a minimal is FALSE,
|
---|
3526 | matching is greedy (maximal) which is the default.
|
---|
3527 |
|
---|
3528 | For example, suppose we have the input string "We must be
|
---|
3529 | \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
|
---|
3530 | <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
|
---|
3531 | the match is "We must be <u>\<b>bold\</b>, very
|
---|
3532 | \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
|
---|
3533 | first match is: "We must be <u>\<b>bold\</b></u>, very
|
---|
3534 | \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
|
---|
3535 | very <u>\<b>bold\</b></u>!". In practice we might use the pattern
|
---|
3536 | <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
|
---|
3537 | nested tags.
|
---|
3538 |
|
---|
3539 | \sa minimal()
|
---|
3540 | */
|
---|
3541 | void QRegExp::setMinimal( bool minimal )
|
---|
3542 | {
|
---|
3543 | priv->min = minimal;
|
---|
3544 | }
|
---|
3545 |
|
---|
3546 | /*!
|
---|
3547 | Returns TRUE if \a str is matched exactly by this regular
|
---|
3548 | expression; otherwise returns FALSE. You can determine how much of
|
---|
3549 | the string was matched by calling matchedLength().
|
---|
3550 |
|
---|
3551 | For a given regexp string, R, exactMatch("R") is the equivalent of
|
---|
3552 | search("^R$") since exactMatch() effectively encloses the regexp
|
---|
3553 | in the start of string and end of string anchors, except that it
|
---|
3554 | sets matchedLength() differently.
|
---|
3555 |
|
---|
3556 | For example, if the regular expression is <b>blue</b>, then
|
---|
3557 | exactMatch() returns TRUE only for input \c blue. For inputs \c
|
---|
3558 | bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
|
---|
3559 | and matchedLength() will return 4, 3 and 0 respectively.
|
---|
3560 |
|
---|
3561 | Although const, this function sets matchedLength(),
|
---|
3562 | capturedTexts() and pos().
|
---|
3563 |
|
---|
3564 | \sa search() searchRev() QRegExpValidator
|
---|
3565 | */
|
---|
3566 | bool QRegExp::exactMatch( const QString& str ) const
|
---|
3567 | {
|
---|
3568 | prepareEngineForMatch( str );
|
---|
3569 | eng->match( str, 0, priv->min, TRUE, 0, priv->captured );
|
---|
3570 | if ( priv->captured[1] == (int) str.length() ) {
|
---|
3571 | return TRUE;
|
---|
3572 | } else {
|
---|
3573 | priv->captured[0] = 0;
|
---|
3574 | priv->captured[1] = eng->partialMatchLength();
|
---|
3575 | return FALSE;
|
---|
3576 | }
|
---|
3577 | }
|
---|
3578 |
|
---|
3579 | #ifndef QT_NO_COMPAT
|
---|
3580 | /*! \obsolete
|
---|
3581 |
|
---|
3582 | Attempts to match in \a str, starting from position \a index.
|
---|
3583 | Returns the position of the match, or -1 if there was no match.
|
---|
3584 |
|
---|
3585 | The length of the match is stored in \a *len, unless \a len is a
|
---|
3586 | null pointer.
|
---|
3587 |
|
---|
3588 | If \a indexIsStart is TRUE (the default), the position \a index in
|
---|
3589 | the string will match the start of string anchor, <b>^</b>, in the
|
---|
3590 | regexp, if present. Otherwise, position 0 in \a str will match.
|
---|
3591 |
|
---|
3592 | Use search() and matchedLength() instead of this function.
|
---|
3593 |
|
---|
3594 | \sa QString::mid() QConstString
|
---|
3595 | */
|
---|
3596 | int QRegExp::match( const QString& str, int index, int *len,
|
---|
3597 | bool indexIsStart ) const
|
---|
3598 | {
|
---|
3599 | int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
|
---|
3600 | if ( len != 0 )
|
---|
3601 | *len = matchedLength();
|
---|
3602 | return pos;
|
---|
3603 | }
|
---|
3604 | #endif // QT_NO_COMPAT
|
---|
3605 |
|
---|
3606 | int QRegExp::search( const QString& str, int offset ) const
|
---|
3607 | {
|
---|
3608 | return search( str, offset, CaretAtZero );
|
---|
3609 | }
|
---|
3610 |
|
---|
3611 | /*!
|
---|
3612 | Attempts to find a match in \a str from position \a offset (0 by
|
---|
3613 | default). If \a offset is -1, the search starts at the last
|
---|
3614 | character; if -2, at the next to last character; etc.
|
---|
3615 |
|
---|
3616 | Returns the position of the first match, or -1 if there was no
|
---|
3617 | match.
|
---|
3618 |
|
---|
3619 | The \a caretMode parameter can be used to instruct whether <b>^</b>
|
---|
3620 | should match at index 0 or at \a offset.
|
---|
3621 |
|
---|
3622 | You might prefer to use QString::find(), QString::contains() or
|
---|
3623 | even QStringList::grep(). To replace matches use
|
---|
3624 | QString::replace().
|
---|
3625 |
|
---|
3626 | Example:
|
---|
3627 | \code
|
---|
3628 | QString str = "offsets: 1.23 .50 71.00 6.00";
|
---|
3629 | QRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching
|
---|
3630 | int count = 0;
|
---|
3631 | int pos = 0;
|
---|
3632 | while ( (pos = rx.search(str, pos)) != -1 ) {
|
---|
3633 | count++;
|
---|
3634 | pos += rx.matchedLength();
|
---|
3635 | }
|
---|
3636 | // pos will be 9, 14, 18 and finally 24; count will end up as 4
|
---|
3637 | \endcode
|
---|
3638 |
|
---|
3639 | Although const, this function sets matchedLength(),
|
---|
3640 | capturedTexts() and pos().
|
---|
3641 |
|
---|
3642 | \sa searchRev() exactMatch()
|
---|
3643 | */
|
---|
3644 |
|
---|
3645 | int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const
|
---|
3646 | {
|
---|
3647 | prepareEngineForMatch( str );
|
---|
3648 | if ( offset < 0 )
|
---|
3649 | offset += str.length();
|
---|
3650 | eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode),
|
---|
3651 | priv->captured );
|
---|
3652 | return priv->captured[0];
|
---|
3653 | }
|
---|
3654 |
|
---|
3655 |
|
---|
3656 | int QRegExp::searchRev( const QString& str, int offset ) const
|
---|
3657 | {
|
---|
3658 | return searchRev( str, offset, CaretAtZero );
|
---|
3659 | }
|
---|
3660 |
|
---|
3661 | /*!
|
---|
3662 | Attempts to find a match backwards in \a str from position \a
|
---|
3663 | offset. If \a offset is -1 (the default), the search starts at the
|
---|
3664 | last character; if -2, at the next to last character; etc.
|
---|
3665 |
|
---|
3666 | Returns the position of the first match, or -1 if there was no
|
---|
3667 | match.
|
---|
3668 |
|
---|
3669 | The \a caretMode parameter can be used to instruct whether <b>^</b>
|
---|
3670 | should match at index 0 or at \a offset.
|
---|
3671 |
|
---|
3672 | Although const, this function sets matchedLength(),
|
---|
3673 | capturedTexts() and pos().
|
---|
3674 |
|
---|
3675 | \warning Searching backwards is much slower than searching
|
---|
3676 | forwards.
|
---|
3677 |
|
---|
3678 | \sa search() exactMatch()
|
---|
3679 | */
|
---|
3680 |
|
---|
3681 | int QRegExp::searchRev( const QString& str, int offset,
|
---|
3682 | CaretMode caretMode ) const
|
---|
3683 | {
|
---|
3684 | prepareEngineForMatch( str );
|
---|
3685 | if ( offset < 0 )
|
---|
3686 | offset += str.length();
|
---|
3687 | if ( offset < 0 || offset > (int) str.length() ) {
|
---|
3688 | priv->captured.detach();
|
---|
3689 | priv->captured.fill( -1 );
|
---|
3690 | return -1;
|
---|
3691 | }
|
---|
3692 |
|
---|
3693 | while ( offset >= 0 ) {
|
---|
3694 | eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode),
|
---|
3695 | priv->captured );
|
---|
3696 | if ( priv->captured[0] == offset )
|
---|
3697 | return offset;
|
---|
3698 | offset--;
|
---|
3699 | }
|
---|
3700 | return -1;
|
---|
3701 | }
|
---|
3702 |
|
---|
3703 | /*!
|
---|
3704 | Returns the length of the last matched string, or -1 if there was
|
---|
3705 | no match.
|
---|
3706 |
|
---|
3707 | \sa exactMatch() search() searchRev()
|
---|
3708 | */
|
---|
3709 | int QRegExp::matchedLength() const
|
---|
3710 | {
|
---|
3711 | return priv->captured[1];
|
---|
3712 | }
|
---|
3713 |
|
---|
3714 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3715 | /*!
|
---|
3716 | Returns the number of captures contained in the regular expression.
|
---|
3717 | */
|
---|
3718 | int QRegExp::numCaptures() const
|
---|
3719 | {
|
---|
3720 | prepareEngine();
|
---|
3721 | return eng->numCaptures();
|
---|
3722 | }
|
---|
3723 |
|
---|
3724 | /*!
|
---|
3725 | Returns a list of the captured text strings.
|
---|
3726 |
|
---|
3727 | The first string in the list is the entire matched string. Each
|
---|
3728 | subsequent list element contains a string that matched a
|
---|
3729 | (capturing) subexpression of the regexp.
|
---|
3730 |
|
---|
3731 | For example:
|
---|
3732 | \code
|
---|
3733 | QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
|
---|
3734 | int pos = rx.search( "Length: 36 inches" );
|
---|
3735 | QStringList list = rx.capturedTexts();
|
---|
3736 | // list is now ( "36 inches", "36", " ", "inches", "es" )
|
---|
3737 | \endcode
|
---|
3738 |
|
---|
3739 | The above example also captures elements that may be present but
|
---|
3740 | which we have no interest in. This problem can be solved by using
|
---|
3741 | non-capturing parentheses:
|
---|
3742 |
|
---|
3743 | \code
|
---|
3744 | QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
|
---|
3745 | int pos = rx.search( "Length: 36 inches" );
|
---|
3746 | QStringList list = rx.capturedTexts();
|
---|
3747 | // list is now ( "36 inches", "36", "inches" )
|
---|
3748 | \endcode
|
---|
3749 |
|
---|
3750 | Note that if you want to iterate over the list, you should iterate
|
---|
3751 | over a copy, e.g.
|
---|
3752 | \code
|
---|
3753 | QStringList list = rx.capturedTexts();
|
---|
3754 | QStringList::Iterator it = list.begin();
|
---|
3755 | while( it != list.end() ) {
|
---|
3756 | myProcessing( *it );
|
---|
3757 | ++it;
|
---|
3758 | }
|
---|
3759 | \endcode
|
---|
3760 |
|
---|
3761 | Some regexps can match an indeterminate number of times. For
|
---|
3762 | example if the input string is "Offsets: 12 14 99 231 7" and the
|
---|
3763 | regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
|
---|
3764 | all the numbers matched. However, after calling
|
---|
3765 | \c{rx.search(str)}, capturedTexts() will return the list ( "12",
|
---|
3766 | "12" ), i.e. the entire match was "12" and the first subexpression
|
---|
3767 | matched was "12". The correct approach is to use cap() in a \link
|
---|
3768 | #cap_in_a_loop loop \endlink.
|
---|
3769 |
|
---|
3770 | The order of elements in the string list is as follows. The first
|
---|
3771 | element is the entire matching string. Each subsequent element
|
---|
3772 | corresponds to the next capturing open left parentheses. Thus
|
---|
3773 | capturedTexts()[1] is the text of the first capturing parentheses,
|
---|
3774 | capturedTexts()[2] is the text of the second and so on
|
---|
3775 | (corresponding to $1, $2, etc., in some other regexp languages).
|
---|
3776 |
|
---|
3777 | \sa cap() pos() exactMatch() search() searchRev()
|
---|
3778 | */
|
---|
3779 | QStringList QRegExp::capturedTexts()
|
---|
3780 | {
|
---|
3781 | if ( priv->capturedCache.isEmpty() ) {
|
---|
3782 | for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
|
---|
3783 | QString m;
|
---|
3784 | if ( priv->captured[i + 1] == 0 )
|
---|
3785 | m = QString::fromLatin1( "" );
|
---|
3786 | else if ( priv->captured[i] >= 0 )
|
---|
3787 | m = priv->t.mid( priv->captured[i],
|
---|
3788 | priv->captured[i + 1] );
|
---|
3789 | priv->capturedCache.append( m );
|
---|
3790 | }
|
---|
3791 | priv->t = QString::null;
|
---|
3792 | }
|
---|
3793 | return priv->capturedCache;
|
---|
3794 | }
|
---|
3795 |
|
---|
3796 | /*!
|
---|
3797 | Returns the text captured by the \a nth subexpression. The entire
|
---|
3798 | match has index 0 and the parenthesized subexpressions have
|
---|
3799 | indices starting from 1 (excluding non-capturing parentheses).
|
---|
3800 |
|
---|
3801 | \code
|
---|
3802 | QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
|
---|
3803 | int pos = rxlen.search( "Length: 189cm" );
|
---|
3804 | if ( pos > -1 ) {
|
---|
3805 | QString value = rxlen.cap( 1 ); // "189"
|
---|
3806 | QString unit = rxlen.cap( 2 ); // "cm"
|
---|
3807 | // ...
|
---|
3808 | }
|
---|
3809 | \endcode
|
---|
3810 |
|
---|
3811 | The order of elements matched by cap() is as follows. The first
|
---|
3812 | element, cap(0), is the entire matching string. Each subsequent
|
---|
3813 | element corresponds to the next capturing open left parentheses.
|
---|
3814 | Thus cap(1) is the text of the first capturing parentheses, cap(2)
|
---|
3815 | is the text of the second, and so on.
|
---|
3816 |
|
---|
3817 | \target cap_in_a_loop
|
---|
3818 | Some patterns may lead to a number of matches which cannot be
|
---|
3819 | determined in advance, for example:
|
---|
3820 |
|
---|
3821 | \code
|
---|
3822 | QRegExp rx( "(\\d+)" );
|
---|
3823 | str = "Offsets: 12 14 99 231 7";
|
---|
3824 | QStringList list;
|
---|
3825 | pos = 0;
|
---|
3826 | while ( pos >= 0 ) {
|
---|
3827 | pos = rx.search( str, pos );
|
---|
3828 | if ( pos > -1 ) {
|
---|
3829 | list += rx.cap( 1 );
|
---|
3830 | pos += rx.matchedLength();
|
---|
3831 | }
|
---|
3832 | }
|
---|
3833 | // list contains "12", "14", "99", "231", "7"
|
---|
3834 | \endcode
|
---|
3835 |
|
---|
3836 | \sa capturedTexts() pos() exactMatch() search() searchRev()
|
---|
3837 | */
|
---|
3838 | QString QRegExp::cap( int nth )
|
---|
3839 | {
|
---|
3840 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) {
|
---|
3841 | return QString::null;
|
---|
3842 | } else {
|
---|
3843 | return capturedTexts()[nth];
|
---|
3844 | }
|
---|
3845 | }
|
---|
3846 |
|
---|
3847 | /*!
|
---|
3848 | Returns the position of the \a nth captured text in the searched
|
---|
3849 | string. If \a nth is 0 (the default), pos() returns the position
|
---|
3850 | of the whole match.
|
---|
3851 |
|
---|
3852 | Example:
|
---|
3853 | \code
|
---|
3854 | QRegExp rx( "/([a-z]+)/([a-z]+)" );
|
---|
3855 | rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null)
|
---|
3856 | rx.pos( 0 ); // returns 7 (position of /dev/null)
|
---|
3857 | rx.pos( 1 ); // returns 8 (position of dev)
|
---|
3858 | rx.pos( 2 ); // returns 12 (position of null)
|
---|
3859 | \endcode
|
---|
3860 |
|
---|
3861 | For zero-length matches, pos() always returns -1. (For example, if
|
---|
3862 | cap(4) would return an empty string, pos(4) returns -1.) This is
|
---|
3863 | due to an implementation tradeoff.
|
---|
3864 |
|
---|
3865 | \sa capturedTexts() exactMatch() search() searchRev()
|
---|
3866 | */
|
---|
3867 | int QRegExp::pos( int nth )
|
---|
3868 | {
|
---|
3869 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
|
---|
3870 | return -1;
|
---|
3871 | else
|
---|
3872 | return priv->captured[2 * nth];
|
---|
3873 | }
|
---|
3874 |
|
---|
3875 | /*!
|
---|
3876 | Returns a text string that explains why a regexp pattern is
|
---|
3877 | invalid, if that is the case; otherwise returns "no error occurred".
|
---|
3878 |
|
---|
3879 | \sa isValid()
|
---|
3880 | */
|
---|
3881 | QString QRegExp::errorString()
|
---|
3882 | {
|
---|
3883 | if ( isValid() ) {
|
---|
3884 | return QString( RXERR_OK );
|
---|
3885 | } else {
|
---|
3886 | return eng->errorString();
|
---|
3887 | }
|
---|
3888 | }
|
---|
3889 | #endif
|
---|
3890 |
|
---|
3891 | /*!
|
---|
3892 | Returns the string \a str with every regexp special character
|
---|
3893 | escaped with a backslash. The special characters are $, (, ), *, +,
|
---|
3894 | ., ?, [, \, ], ^, {, | and }.
|
---|
3895 |
|
---|
3896 | Example:
|
---|
3897 | \code
|
---|
3898 | s1 = QRegExp::escape( "bingo" ); // s1 == "bingo"
|
---|
3899 | s2 = QRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)"
|
---|
3900 | \endcode
|
---|
3901 |
|
---|
3902 | This function is useful to construct regexp patterns dynamically:
|
---|
3903 |
|
---|
3904 | \code
|
---|
3905 | QRegExp rx( "(" + QRegExp::escape(name) +
|
---|
3906 | "|" + QRegExp::escape(alias) + ")" );
|
---|
3907 | \endcode
|
---|
3908 | */
|
---|
3909 | QString QRegExp::escape( const QString& str )
|
---|
3910 | {
|
---|
3911 | static const char meta[] = "$()*+.?[\\]^{|}";
|
---|
3912 | QString quoted = str;
|
---|
3913 | int i = 0;
|
---|
3914 |
|
---|
3915 | while ( i < (int) quoted.length() ) {
|
---|
3916 | if ( strchr(meta, quoted[i].latin1()) != 0 )
|
---|
3917 | quoted.insert( i++, "\\" );
|
---|
3918 | i++;
|
---|
3919 | }
|
---|
3920 | return quoted;
|
---|
3921 | }
|
---|
3922 |
|
---|
3923 | void QRegExp::prepareEngine() const
|
---|
3924 | {
|
---|
3925 | if ( eng == 0 ) {
|
---|
3926 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
3927 | if ( priv->wc )
|
---|
3928 | priv->rxpattern = wc2rx( priv->pattern );
|
---|
3929 | else
|
---|
3930 | #endif
|
---|
3931 | priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
|
---|
3932 | : priv->pattern;
|
---|
3933 | QRegExp *that = (QRegExp *) this;
|
---|
3934 | // that->eng = newEngine( priv->rxpattern, priv->cs );
|
---|
3935 | regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE );
|
---|
3936 | priv->captured.detach();
|
---|
3937 | priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
|
---|
3938 | }
|
---|
3939 | }
|
---|
3940 |
|
---|
3941 | void QRegExp::prepareEngineForMatch( const QString& str ) const
|
---|
3942 | {
|
---|
3943 | prepareEngine();
|
---|
3944 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3945 | priv->t = str;
|
---|
3946 | priv->capturedCache.clear();
|
---|
3947 | #else
|
---|
3948 | Q_UNUSED( str );
|
---|
3949 | #endif
|
---|
3950 | }
|
---|
3951 |
|
---|
3952 | void QRegExp::invalidateEngine()
|
---|
3953 | {
|
---|
3954 | if ( eng != 0 ) {
|
---|
3955 | regexpEngine( eng, priv->rxpattern, priv->cs, TRUE );
|
---|
3956 | priv->rxpattern = QString();
|
---|
3957 | eng = 0;
|
---|
3958 | }
|
---|
3959 | }
|
---|
3960 |
|
---|
3961 | int QRegExp::caretIndex( int offset, CaretMode caretMode )
|
---|
3962 | {
|
---|
3963 | if ( caretMode == CaretAtZero ) {
|
---|
3964 | return 0;
|
---|
3965 | } else if ( caretMode == CaretAtOffset ) {
|
---|
3966 | return offset;
|
---|
3967 | } else { // CaretWontMatch
|
---|
3968 | return -1;
|
---|
3969 | }
|
---|
3970 | }
|
---|
3971 |
|
---|
3972 | #endif // QT_NO_REGEXP
|
---|