source: vendor/trolltech/current/src/tools/qregexp.cpp

Last change on this file was 2, checked in by dmik, 20 years ago

Imported xplatform parts of the official release 3.3.1 from Trolltech

  • Property svn:keywords set to Id
File size: 113.7 KB
Line 
1/****************************************************************************
2** $Id: qregexp.cpp 2 2005-11-16 15:49:26Z dmik $
3**
4** Implementation of QRegExp class
5**
6** Created : 950126
7**
8** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38#include "qregexp.h"
39
40#ifndef QT_NO_REGEXP
41
42#include "qmemarray.h"
43#include "qbitarray.h"
44#include "qcache.h"
45#include "qcleanuphandler.h"
46#include "qintdict.h"
47#include "qmap.h"
48#include "qptrvector.h"
49#include "qstring.h"
50#include "qtl.h"
51
52#ifdef QT_THREAD_SUPPORT
53#include "qthreadstorage.h"
54#endif // QT_THREAD_SUPPORT
55
56#undef QT_TRANSLATE_NOOP
57#define QT_TRANSLATE_NOOP( context, sourceText ) sourceText
58
59#include <limits.h>
60
61// error strings for the regexp parser
62#define RXERR_OK QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" )
63#define RXERR_DISABLED QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" )
64#define RXERR_CHARCLASS QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" )
65#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" )
66#define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" )
67#define RXERR_OCTAL QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" )
68#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" )
69#define RXERR_END QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" )
70#define RXERR_LIMIT QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" )
71
72/*
73 WARNING! Be sure to read qregexp.tex before modifying this file.
74*/
75
76/*!
77 \class QRegExp qregexp.h
78 \reentrant
79 \brief The QRegExp class provides pattern matching using regular expressions.
80
81 \ingroup tools
82 \ingroup misc
83 \ingroup shared
84 \mainclass
85 \keyword regular expression
86
87 Regular expressions, or "regexps", provide a way to find patterns
88 within text. This is useful in many contexts, for example:
89
90 \table
91 \row \i Validation
92 \i A regexp can be used to check whether a piece of text
93 meets some criteria, e.g. is an integer or contains no
94 whitespace.
95 \row \i Searching
96 \i Regexps provide a much more powerful means of searching
97 text than simple string matching does. For example we can
98 create a regexp which says "find one of the words 'mail',
99 'letter' or 'correspondence' but not any of the words
100 'email', 'mailman' 'mailer', 'letterbox' etc."
101 \row \i Search and Replace
102 \i A regexp can be used to replace a pattern with a piece of
103 text, for example replace all occurrences of '&' with
104 '\&amp;' except where the '&' is already followed by 'amp;'.
105 \row \i String Splitting
106 \i A regexp can be used to identify where a string should be
107 split into its component fields, e.g. splitting tab-delimited
108 strings.
109 \endtable
110
111 We present a very brief introduction to regexps, a description of
112 Qt's regexp language, some code examples, and finally the function
113 documentation itself. QRegExp is modeled on Perl's regexp
114 language, and also fully supports Unicode. QRegExp can also be
115 used in the weaker 'wildcard' (globbing) mode which works in a
116 similar way to command shells. A good text on regexps is \e
117 {Mastering Regular Expressions: Powerful Techniques for Perl and
118 Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
119
120 Experienced regexp users may prefer to skip the introduction and
121 go directly to the relevant information.
122
123 \tableofcontents
124
125 \section1 Introduction
126
127 Regexps are built up from expressions, quantifiers, and assertions.
128 The simplest form of expression is simply a character, e.g.
129 <b>x</b> or <b>5</b>. An expression can also be a set of
130 characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
131 a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
132 write this as <b>[A-D]</b>. If we want to match any of the
133 captital letters in the English alphabet we can write
134 <b>[A-Z]</b>. A quantifier tells the regexp engine how many
135 occurrences of the expression we want, e.g. <b>x{1,1}</b> means
136 match an <b>x</b> which occurs at least once and at most once.
137 We'll look at assertions and more complex expressions later.
138
139 Note that in general regexps cannot be used to check for balanced
140 brackets or tags. For example if you want to match an opening html
141 \c <b> and its closing \c </b> you can only use a regexp if you
142 know that these tags are not nested; the html fragment, \c{<b>bold
143 <b>bolder</b></b>} will not match as expected. If you know the
144 maximum level of nesting it is possible to create a regexp that
145 will match correctly, but for an unknown level of nesting, regexps
146 will fail.
147
148 We'll start by writing a regexp to match integers in the range 0
149 to 99. We will require at least one digit so we will start with
150 <b>[0-9]{1,1}</b> which means match a digit exactly once. This
151 regexp alone will match integers in the range 0 to 9. To match one
152 or two digits we can increase the maximum number of occurrences so
153 the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
154 least once and at most twice. However, this regexp as it stands
155 will not match correctly. This regexp will match one or two digits
156 \e within a string. To ensure that we match against the whole
157 string we must use the anchor assertions. We need <b>^</b> (caret)
158 which when it is the first character in the regexp means that the
159 regexp must match from the beginning of the string. And we also
160 need <b>$</b> (dollar) which when it is the last character in the
161 regexp means that the regexp must match until the end of the
162 string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
163 assertions, such as <b>^</b> and <b>$</b>, do not match any
164 characters.
165
166 If you've seen regexps elsewhere they may have looked different from
167 the ones above. This is because some sets of characters and some
168 quantifiers are so common that they have special symbols to
169 represent them. <b>[0-9]</b> can be replaced with the symbol
170 <b>\d</b>. The quantifier to match exactly one occurrence,
171 <b>{1,1}</b>, can be replaced with the expression itself. This means
172 that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
173 to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
174 writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
175 string match a digit followed by zero or one digits. In practice
176 most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
177 shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
178 occurrences a maximum of one occurrence. This is used to make an
179 expression optional. The regexp <b>^\d\d?$</b> means "from the
180 beginning of the string match one digit followed by zero or one
181 digits and then the end of the string".
182
183 Our second example is matching the words 'mail', 'letter' or
184 'correspondence' but without matching 'email', 'mailman',
185 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
186 full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
187 each expression itself is automatically quantified by <b>{1,1}</b>
188 we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
189 followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
190 used for \e alternation, so our regexp now becomes
191 <b>mail|letter|correspondence</b> which means match 'mail' \e or
192 'letter' \e or 'correspondence'. Whilst this regexp will find the
193 words we want it will also find words we don't want such as
194 'email'. We will start by putting our regexp in parentheses,
195 <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
196 firstly they group expressions together and secondly they identify
197 parts of the regexp that we wish to \link #capturing-text capture
198 \endlink. Our regexp still matches any of the three words but now
199 they are grouped together as a unit. This is useful for building
200 up more complex regexps. It is also useful because it allows us to
201 examine which of the words actually matched. We need to use
202 another assertion, this time <b>\b</b> "word boundary":
203 <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
204 a word boundary followed by the expression in parentheses followed
205 by another word boundary". The <b>\b</b> assertion matches at a \e
206 position in the regexp not a \e character in the regexp. A word
207 boundary is any non-word character such as a space a newline or
208 the beginning or end of the string.
209
210 For our third example we want to replace ampersands with the HTML
211 entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
212 match one ampersand. Unfortunately this will mess up our text if
213 some of the ampersands have already been turned into HTML
214 entities. So what we really want to say is replace an ampersand
215 providing it is not followed by 'amp;'. For this we need the
216 negative lookahead assertion and our regexp becomes:
217 <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
218 with '(?!' and finishes at the ')'. It means that the text it
219 contains, 'amp;' in our example, must \e not follow the expression
220 that preceeds it.
221
222 Regexps provide a rich language that can be used in a variety of
223 ways. For example suppose we want to count all the occurrences of
224 'Eric' and 'Eirik' in a string. Two valid regexps to match these
225 are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
226 the word boundary '\b' so we don't get 'Ericsson' etc. The second
227 regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
228 and 'Eirik'.
229
230 We will implement some the examples above in the
231 \link #code-examples code examples \endlink section.
232
233 \target characters-and-abbreviations-for-sets-of-characters
234 \section1 Characters and Abbreviations for Sets of Characters
235
236 \table
237 \header \i Element \i Meaning
238 \row \i <b>c</b>
239 \i Any character represents itself unless it has a special
240 regexp meaning. Thus <b>c</b> matches the character \e c.
241 \row \i <b>\\c</b>
242 \i A character that follows a backslash matches the character
243 itself except where mentioned below. For example if you
244 wished to match a literal caret at the beginning of a string
245 you would write <b>\^</b>.
246 \row \i <b>\\a</b>
247 \i This matches the ASCII bell character (BEL, 0x07).
248 \row \i <b>\\f</b>
249 \i This matches the ASCII form feed character (FF, 0x0C).
250 \row \i <b>\\n</b>
251 \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
252 \row \i <b>\\r</b>
253 \i This matches the ASCII carriage return character (CR, 0x0D).
254 \row \i <b>\\t</b>
255 \i This matches the ASCII horizontal tab character (HT, 0x09).
256 \row \i <b>\\v</b>
257 \i This matches the ASCII vertical tab character (VT, 0x0B).
258 \row \i <b>\\xhhhh</b>
259 \i This matches the Unicode character corresponding to the
260 hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
261 (i.e., \zero ooo) matches the ASCII/Latin-1 character
262 corresponding to the octal number ooo (between 0 and 0377).
263 \row \i <b>. (dot)</b>
264 \i This matches any character (including newline).
265 \row \i <b>\\d</b>
266 \i This matches a digit (QChar::isDigit()).
267 \row \i <b>\\D</b>
268 \i This matches a non-digit.
269 \row \i <b>\\s</b>
270 \i This matches a whitespace (QChar::isSpace()).
271 \row \i <b>\\S</b>
272 \i This matches a non-whitespace.
273 \row \i <b>\\w</b>
274 \i This matches a word character (QChar::isLetterOrNumber() or '_').
275 \row \i <b>\\W</b>
276 \i This matches a non-word character.
277 \row \i <b>\\n</b>
278 \i The n-th \link #capturing-text backreference \endlink,
279 e.g. \1, \2, etc.
280 \endtable
281
282 \e {Note that the C++ compiler transforms backslashes in strings
283 so to include a <b>\\</b> in a regexp you will need to enter it
284 twice, i.e. <b>\\\\</b>.}
285
286 \target sets-of-characters
287 \section1 Sets of Characters
288
289 Square brackets are used to match any character in the set of
290 characters contained within the square brackets. All the character
291 set abbreviations described above can be used within square
292 brackets. Apart from the character set abbreviations and the
293 following two exceptions no characters have special meanings in
294 square brackets.
295
296 \table
297 \row \i <b>^</b>
298 \i The caret negates the character set if it occurs as the
299 first character, i.e. immediately after the opening square
300 bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
301 but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
302 'c'.
303 \row \i <b>-</b>
304 \i The dash is used to indicate a range of characters, for
305 example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
306 \endtable
307
308 Using the predefined character set abbreviations is more portable
309 than using character ranges across platforms and languages. For
310 example, <b>[0-9]</b> matches a digit in Western alphabets but
311 <b>\d</b> matches a digit in \e any alphabet.
312
313 Note that in most regexp literature sets of characters are called
314 "character classes".
315
316 \target quantifiers
317 \section1 Quantifiers
318
319 By default an expression is automatically quantified by
320 <b>{1,1}</b>, i.e. it should occur exactly once. In the following
321 list <b>\e {E}</b> stands for any expression. An expression is a
322 character or an abbreviation for a set of characters or a set of
323 characters in square brackets or any parenthesised expression.
324
325 \table
326 \row \i <b>\e {E}?</b>
327 \i Matches zero or one occurrence of \e E. This quantifier
328 means "the previous expression is optional" since it will
329 match whether or not the expression occurs in the string. It
330 is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
331 will match 'dent' and 'dents'.
332
333 \row \i <b>\e {E}+</b>
334 \i Matches one or more occurrences of \e E. This is the same
335 as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
336 '0', '00', '000', etc.
337
338 \row \i <b>\e {E}*</b>
339 \i Matches zero or more occurrences of \e E. This is the same
340 as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
341 used by a mistake. Since it matches \e zero or more
342 occurrences it will match no occurrences at all. For example
343 if we want to match strings that end in whitespace and use
344 the regexp <b>\s*$</b> we would get a match on every string.
345 This is because we have said find zero or more whitespace
346 followed by the end of string, so even strings that don't end
347 in whitespace will match. The regexp we want in this case is
348 <b>\s+$</b> to match strings that have at least one
349 whitespace at the end.
350
351 \row \i <b>\e {E}{n}</b>
352 \i Matches exactly \e n occurrences of the expression. This
353 is the same as repeating the expression \e n times. For
354 example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
355 the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
356
357 \row \i <b>\e {E}{n,}</b>
358 \i Matches at least \e n occurrences of the expression. This
359 is the same as <b>\e {E}{n,MAXINT}</b>.
360
361 \row \i <b>\e {E}{,m}</b>
362 \i Matches at most \e m occurrences of the expression. This
363 is the same as <b>\e {E}{0,m}</b>.
364
365 \row \i <b>\e {E}{n,m}</b>
366 \i Matches at least \e n occurrences of the expression and at
367 most \e m occurrences of the expression.
368 \endtable
369
370 (MAXINT is implementation dependent but will not be smaller than
371 1024.)
372
373 If we wish to apply a quantifier to more than just the preceding
374 character we can use parentheses to group characters together in
375 an expression. For example, <b>tag+</b> matches a 't' followed by
376 an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
377 at least one occurrence of 'tag'.
378
379 Note that quantifiers are "greedy". They will match as much text
380 as they can. For example, <b>0+</b> will match as many zeros as it
381 can from the first zero it finds, e.g. '2.<u>000</u>5'.
382 Quantifiers can be made non-greedy, see setMinimal().
383
384 \target capturing-text
385 \section1 Capturing Text
386
387 Parentheses allow us to group elements together so that we can
388 quantify and capture them. For example if we have the expression
389 <b>mail|letter|correspondence</b> that matches a string we know
390 that \e one of the words matched but not which one. Using
391 parentheses allows us to "capture" whatever is matched within
392 their bounds, so if we used <b>(mail|letter|correspondence)</b>
393 and matched this regexp against the string "I sent you some email"
394 we can use the cap() or capturedTexts() functions to extract the
395 matched characters, in this case 'mail'.
396
397 We can use captured text within the regexp itself. To refer to the
398 captured text we use \e backreferences which are indexed from 1,
399 the same as for cap(). For example we could search for duplicate
400 words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
401 word boundary followed by one or more word characters followed by
402 one or more non-word characters followed by the same text as the
403 first parenthesised expression followed by a word boundary.
404
405 If we want to use parentheses purely for grouping and not for
406 capturing we can use the non-capturing syntax, e.g.
407 <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
408 end ')'. In this example we match either 'green' or 'blue' but we
409 do not capture the match so we only know whether or not we matched
410 but not which color we actually found. Using non-capturing
411 parentheses is more efficient than using capturing parentheses
412 since the regexp engine has to do less book-keeping.
413
414 Both capturing and non-capturing parentheses may be nested.
415
416 \target assertions
417 \section1 Assertions
418
419 Assertions make some statement about the text at the point where
420 they occur in the regexp but they do not match any characters. In
421 the following list <b>\e {E}</b> stands for any expression.
422
423 \table
424 \row \i <b>^</b>
425 \i The caret signifies the beginning of the string. If you
426 wish to match a literal \c{^} you must escape it by
427 writing \c{\\^}. For example, <b>^#include</b> will only
428 match strings which \e begin with the characters '#include'.
429 (When the caret is the first character of a character set it
430 has a special meaning, see \link #sets-of-characters Sets of
431 Characters \endlink.)
432
433 \row \i <b>$</b>
434 \i The dollar signifies the end of the string. For example
435 <b>\d\s*$</b> will match strings which end with a digit
436 optionally followed by whitespace. If you wish to match a
437 literal \c{$} you must escape it by writing
438 \c{\\$}.
439
440 \row \i <b>\\b</b>
441 \i A word boundary. For example the regexp
442 <b>\\bOK\\b</b> means match immediately after a word
443 boundary (e.g. start of string or whitespace) the letter 'O'
444 then the letter 'K' immediately before another word boundary
445 (e.g. end of string or whitespace). But note that the
446 assertion does not actually match any whitespace so if we
447 write <b>(\\bOK\\b)</b> and we have a match it will only
448 contain 'OK' even if the string is "Its <u>OK</u> now".
449
450 \row \i <b>\\B</b>
451 \i A non-word boundary. This assertion is true wherever
452 <b>\\b</b> is false. For example if we searched for
453 <b>\\Bon\\B</b> in "Left on" the match would fail (space
454 and end of string aren't non-word boundaries), but it would
455 match in "t<u>on</u>ne".
456
457 \row \i <b>(?=\e E)</b>
458 \i Positive lookahead. This assertion is true if the
459 expression matches at this point in the regexp. For example,
460 <b>const(?=\\s+char)</b> matches 'const' whenever it is
461 followed by 'char', as in 'static <u>const</u> char *'.
462 (Compare with <b>const\\s+char</b>, which matches 'static
463 <u>const char</u> *'.)
464
465 \row \i <b>(?!\e E)</b>
466 \i Negative lookahead. This assertion is true if the
467 expression does not match at this point in the regexp. For
468 example, <b>const(?!\\s+char)</b> matches 'const' \e except
469 when it is followed by 'char'.
470 \endtable
471
472 \target wildcard-matching
473 \section1 Wildcard Matching (globbing)
474
475 Most command shells such as \e bash or \e cmd.exe support "file
476 globbing", the ability to identify a group of files by using
477 wildcards. The setWildcard() function is used to switch between
478 regexp and wildcard mode. Wildcard matching is much simpler than
479 full regexps and has only four features:
480
481 \table
482 \row \i <b>c</b>
483 \i Any character represents itself apart from those mentioned
484 below. Thus <b>c</b> matches the character \e c.
485 \row \i <b>?</b>
486 \i This matches any single character. It is the same as
487 <b>.</b> in full regexps.
488 \row \i <b>*</b>
489 \i This matches zero or more of any characters. It is the
490 same as <b>.*</b> in full regexps.
491 \row \i <b>[...]</b>
492 \i Sets of characters can be represented in square brackets,
493 similar to full regexps. Within the character class, like
494 outside, backslash has no special meaning.
495 \endtable
496
497 For example if we are in wildcard mode and have strings which
498 contain filenames we could identify HTML files with <b>*.html</b>.
499 This will match zero or more characters followed by a dot followed
500 by 'h', 't', 'm' and 'l'.
501
502 \target perl-users
503 \section1 Notes for Perl Users
504
505 Most of the character class abbreviations supported by Perl are
506 supported by QRegExp, see \link
507 #characters-and-abbreviations-for-sets-of-characters characters
508 and abbreviations for sets of characters \endlink.
509
510 In QRegExp, apart from within character classes, \c{^} always
511 signifies the start of the string, so carets must always be
512 escaped unless used for that purpose. In Perl the meaning of caret
513 varies automagically depending on where it occurs so escaping it
514 is rarely necessary. The same applies to \c{$} which in
515 QRegExp always signifies the end of the string.
516
517 QRegExp's quantifiers are the same as Perl's greedy quantifiers.
518 Non-greedy matching cannot be applied to individual quantifiers,
519 but can be applied to all the quantifiers in the pattern. For
520 example, to match the Perl regexp <b>ro+?m</b> requires:
521 \code
522 QRegExp rx( "ro+m" );
523 rx.setMinimal( TRUE );
524 \endcode
525
526 The equivalent of Perl's \c{/i} option is
527 setCaseSensitive(FALSE).
528
529 Perl's \c{/g} option can be emulated using a \link
530 #cap_in_a_loop loop \endlink.
531
532 In QRegExp <b>.</b> matches any character, therefore all QRegExp
533 regexps have the equivalent of Perl's \c{/s} option. QRegExp
534 does not have an equivalent to Perl's \c{/m} option, but this
535 can be emulated in various ways for example by splitting the input
536 into lines or by looping with a regexp that searches for newlines.
537
538 Because QRegExp is string oriented there are no \A, \Z or \z
539 assertions. The \G assertion is not supported but can be emulated
540 in a loop.
541
542 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
543 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
544 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
545 capturedTexts()[2], etc.
546
547 To substitute a pattern use QString::replace().
548
549 Perl's extended \c{/x} syntax is not supported, nor are
550 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
551 the other hand, C++'s rules for literal strings can be used to
552 achieve the same:
553 \code
554 QRegExp mark( "\\b" // word boundary
555 "[Mm]ark" // the word we want to match
556 );
557 \endcode
558
559 Both zero-width positive and zero-width negative lookahead
560 assertions (?=pattern) and (?!pattern) are supported with the same
561 syntax as Perl. Perl's lookbehind assertions, "independent"
562 subexpressions and conditional expressions are not supported.
563
564 Non-capturing parentheses are also supported, with the same
565 (?:pattern) syntax.
566
567 See QStringList::split() and QStringList::join() for equivalents
568 to Perl's split and join functions.
569
570 Note: because C++ transforms \\'s they must be written \e twice in
571 code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
572
573 \target code-examples
574 \section1 Code Examples
575
576 \code
577 QRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99
578 rx.search( "123" ); // returns -1 (no match)
579 rx.search( "-6" ); // returns -1 (no match)
580 rx.search( "6" ); // returns 0 (matched as position 0)
581 \endcode
582
583 The third string matches '<u>6</u>'. This is a simple validation
584 regexp for integers in the range 0 to 99.
585
586 \code
587 QRegExp rx( "^\\S+$" ); // match strings without whitespace
588 rx.search( "Hello world" ); // returns -1 (no match)
589 rx.search( "This_is-OK" ); // returns 0 (matched at position 0)
590 \endcode
591
592 The second string matches '<u>This_is-OK</u>'. We've used the
593 character set abbreviation '\S' (non-whitespace) and the anchors
594 to match strings which contain no whitespace.
595
596 In the following example we match strings containing 'mail' or
597 'letter' or 'correspondence' but only match whole words i.e. not
598 'email'
599
600 \code
601 QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
602 rx.search( "I sent you an email" ); // returns -1 (no match)
603 rx.search( "Please write the letter" ); // returns 17
604 \endcode
605
606 The second string matches "Please write the <u>letter</u>". The
607 word 'letter' is also captured (because of the parentheses). We
608 can see what text we've captured like this:
609
610 \code
611 QString captured = rx.cap( 1 ); // captured == "letter"
612 \endcode
613
614 This will capture the text from the first set of capturing
615 parentheses (counting capturing left parentheses from left to
616 right). The parentheses are counted from 1 since cap( 0 ) is the
617 whole matched regexp (equivalent to '&' in most regexp engines).
618
619 \code
620 QRegExp rx( "&(?!amp;)" ); // match ampersands but not &amp;
621 QString line1 = "This & that";
622 line1.replace( rx, "&amp;" );
623 // line1 == "This &amp; that"
624 QString line2 = "His &amp; hers & theirs";
625 line2.replace( rx, "&amp;" );
626 // line2 == "His &amp; hers &amp; theirs"
627 \endcode
628
629 Here we've passed the QRegExp to QString's replace() function to
630 replace the matched text with new text.
631
632 \code
633 QString str = "One Eric another Eirik, and an Ericsson."
634 " How many Eiriks, Eric?";
635 QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
636 int pos = 0; // where we are in the string
637 int count = 0; // how many Eric and Eirik's we've counted
638 while ( pos >= 0 ) {
639 pos = rx.search( str, pos );
640 if ( pos >= 0 ) {
641 pos++; // move along in str
642 count++; // count our Eric or Eirik
643 }
644 }
645 \endcode
646
647 We've used the search() function to repeatedly match the regexp in
648 the string. Note that instead of moving forward by one character
649 at a time \c pos++ we could have written \c {pos +=
650 rx.matchedLength()} to skip over the already matched string. The
651 count will equal 3, matching 'One <u>Eric</u> another
652 <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
653 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
654 by non-word boundaries.
655
656 One common use of regexps is to split lines of delimited data into
657 their component fields.
658
659 \code
660 str = "Trolltech AS\twww.trolltech.com\tNorway";
661 QString company, web, country;
662 rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
663 if ( rx.search( str ) != -1 ) {
664 company = rx.cap( 1 );
665 web = rx.cap( 2 );
666 country = rx.cap( 3 );
667 }
668 \endcode
669
670 In this example our input lines have the format company name, web
671 address and country. Unfortunately the regexp is rather long and
672 not very versatile -- the code will break if we add any more
673 fields. A simpler and better solution is to look for the
674 separator, '\t' in this case, and take the surrounding text. The
675 QStringList split() function can take a separator string or regexp
676 as an argument and split a string accordingly.
677
678 \code
679 QStringList field = QStringList::split( "\t", str );
680 \endcode
681
682 Here field[0] is the company, field[1] the web address and so on.
683
684 To imitate the matching of a shell we can use wildcard mode.
685
686 \code
687 QRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything
688 rx.setWildcard( TRUE ); // now it's a valid wildcard regexp
689 rx.exactMatch( "index.html" ); // returns TRUE
690 rx.exactMatch( "default.htm" ); // returns FALSE
691 rx.exactMatch( "readme.txt" ); // returns FALSE
692 \endcode
693
694 Wildcard matching can be convenient because of its simplicity, but
695 any wildcard regexp can be defined using full regexps, e.g.
696 <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
697 .htm files with a wildcard unless we use <b>*.htm*</b> which will
698 also match 'test.html.bak'. A full regexp gives us the precision
699 we need, <b>.*\\.html?$</b>.
700
701 QRegExp can match case insensitively using setCaseSensitive(), and
702 can use non-greedy matching, see setMinimal(). By default QRegExp
703 uses full regexps but this can be changed with setWildcard().
704 Searching can be forward with search() or backward with
705 searchRev(). Captured text can be accessed using capturedTexts()
706 which returns a string list of all captured strings, or using
707 cap() which returns the captured string for the given index. The
708 pos() function takes a match index and returns the position in the
709 string where the match was made (or -1 if there was no match).
710
711 \sa QRegExpValidator QString QStringList
712
713 \target member-function-documentation
714*/
715
716const int NumBadChars = 64;
717#define BadChar( ch ) ( (ch).unicode() % NumBadChars )
718
719const int NoOccurrence = INT_MAX;
720const int EmptyCapture = INT_MAX;
721const int InftyLen = INT_MAX;
722const int InftyRep = 1025;
723const int EOS = -1;
724
725static bool isWord( QChar ch )
726{
727 return ch.isLetterOrNumber() || ch == QChar( '_' );
728}
729
730/*
731 Merges two QMemArrays of ints and puts the result into the first
732 one.
733*/
734static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b )
735{
736 int asize = a->size();
737 int bsize = b.size();
738 if ( asize == 0 ) {
739 *a = b.copy();
740#ifndef QT_NO_REGEXP_OPTIM
741 } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
742 a->resize( asize + 1 );
743 (*a)[asize] = b[0];
744#endif
745 } else if ( bsize >= 1 ) {
746 int csize = asize + bsize;
747 QMemArray<int> c( csize );
748 int i = 0, j = 0, k = 0;
749 while ( i < asize ) {
750 if ( j < bsize ) {
751 if ( (*a)[i] == b[j] ) {
752 i++;
753 csize--;
754 } else if ( (*a)[i] < b[j] ) {
755 c[k++] = (*a)[i++];
756 } else {
757 c[k++] = b[j++];
758 }
759 } else {
760 memcpy( c.data() + k, (*a).data() + i,
761 (asize - i) * sizeof(int) );
762 break;
763 }
764 }
765 c.resize( csize );
766 if ( j < bsize )
767 memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
768 *a = c;
769 }
770}
771
772/*
773 Merges two disjoint QMaps of (int, int) pairs and puts the result
774 into the first one.
775*/
776static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
777{
778 QMap<int, int>::ConstIterator it;
779 for ( it = b.begin(); it != b.end(); ++it )
780 a->insert( it.key(), *it );
781}
782
783/*
784 Returns the value associated to key k in QMap m of (int, int)
785 pairs, or 0 if no such value is explicitly present.
786*/
787static int at( const QMap<int, int>& m, int k )
788{
789 QMap<int, int>::ConstIterator it = m.find( k );
790 if ( it == m.end() )
791 return 0;
792 else
793 return *it;
794}
795
796#ifndef QT_NO_REGEXP_WILDCARD
797/*
798 Translates a wildcard pattern to an equivalent regular expression
799 pattern (e.g., *.cpp to .*\.cpp).
800*/
801static QString wc2rx( const QString& wc_str )
802{
803 int wclen = wc_str.length();
804 QString rx = QString::fromLatin1( "" );
805 int i = 0;
806 const QChar *wc = wc_str.unicode();
807 while ( i < wclen ) {
808 QChar c = wc[i++];
809 switch ( c.unicode() ) {
810 case '*':
811 rx += QString::fromLatin1( ".*" );
812 break;
813 case '?':
814 rx += QChar( '.' );
815 break;
816 case '$':
817 case '(':
818 case ')':
819 case '+':
820 case '.':
821 case '\\':
822 case '^':
823 case '{':
824 case '|':
825 case '}':
826 rx += QChar( '\\' );
827 rx += c;
828 break;
829 case '[':
830 rx += c;
831 if ( wc[i] == QChar('^') )
832 rx += wc[i++];
833 if ( i < wclen ) {
834 if ( rx[i] == ']' )
835 rx += wc[i++];
836 while ( i < wclen && wc[i] != QChar(']') ) {
837 if ( wc[i] == '\\' )
838 rx += QChar( '\\' );
839 rx += wc[i++];
840 }
841 }
842 break;
843 default:
844 rx += c;
845 }
846 }
847 return rx;
848}
849#endif
850
851/*
852 The class QRegExpEngine encapsulates a modified nondeterministic
853 finite automaton (NFA).
854*/
855class QRegExpEngine : public QShared
856{
857public:
858#ifndef QT_NO_REGEXP_CCLASS
859 /*
860 The class CharClass represents a set of characters, such as can
861 be found in regular expressions (e.g., [a-z] denotes the set
862 {a, b, ..., z}).
863 */
864 class CharClass
865 {
866 public:
867 CharClass();
868 CharClass( const CharClass& cc ) { operator=( cc ); }
869
870 CharClass& operator=( const CharClass& cc );
871
872 void clear();
873 bool negative() const { return n; }
874 void setNegative( bool negative );
875 void addCategories( int cats );
876 void addRange( ushort from, ushort to );
877 void addSingleton( ushort ch ) { addRange( ch, ch ); }
878
879 bool in( QChar ch ) const;
880#ifndef QT_NO_REGEXP_OPTIM
881 const QMemArray<int>& firstOccurrence() const { return occ1; }
882#endif
883
884#if defined(QT_DEBUG)
885 void dump() const;
886#endif
887
888 private:
889 /*
890 The struct Range represents a range of characters (e.g.,
891 [0-9] denotes range 48 to 57).
892 */
893 struct Range
894 {
895 ushort from; // 48
896 ushort to; // 57
897 };
898
899 int c; // character classes
900 QMemArray<Range> r; // character ranges
901 bool n; // negative?
902#ifndef QT_NO_REGEXP_OPTIM
903 QMemArray<int> occ1; // first-occurrence array
904#endif
905 };
906#else
907 struct CharClass
908 {
909 int dummy;
910
911#ifndef QT_NO_REGEXP_OPTIM
912 CharClass() { occ1.fill( 0, NumBadChars ); }
913
914 const QMemArray<int>& firstOccurrence() const { return occ1; }
915 QMemArray<int> occ1;
916#endif
917 };
918#endif
919
920 QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
921 QRegExpEngine( const QString& rx, bool caseSensitive );
922#ifndef QT_NO_REGEXP_OPTIM
923 ~QRegExpEngine();
924#endif
925
926 bool isValid() const { return valid; }
927 bool caseSensitive() const { return cs; }
928 const QString& errorString() const { return yyError; }
929 int numCaptures() const { return officialncap; }
930 void match( const QString& str, int pos, bool minimal, bool oneTest,
931 int caretIndex, QMemArray<int>& captured );
932 int partialMatchLength() const { return mmOneTestMatchedLen; }
933
934 int createState( QChar ch );
935 int createState( const CharClass& cc );
936#ifndef QT_NO_REGEXP_BACKREF
937 int createState( int bref );
938#endif
939
940 void addCatTransitions( const QMemArray<int>& from,
941 const QMemArray<int>& to );
942#ifndef QT_NO_REGEXP_CAPTURE
943 void addPlusTransitions( const QMemArray<int>& from,
944 const QMemArray<int>& to, int atom );
945#endif
946
947#ifndef QT_NO_REGEXP_ANCHOR_ALT
948 int anchorAlternation( int a, int b );
949 int anchorConcatenation( int a, int b );
950#else
951 int anchorAlternation( int a, int b ) { return a & b; }
952 int anchorConcatenation( int a, int b ) { return a | b; }
953#endif
954 void addAnchors( int from, int to, int a );
955
956#ifndef QT_NO_REGEXP_OPTIM
957 void heuristicallyChooseHeuristic();
958#endif
959
960#if defined(QT_DEBUG)
961 void dump() const;
962#endif
963
964private:
965 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
966
967 /*
968 The struct State represents one state in a modified NFA. The
969 input characters matched are stored in the state instead of on
970 the transitions, something possible for an automaton
971 constructed from a regular expression.
972 */
973 struct State
974 {
975#ifndef QT_NO_REGEXP_CAPTURE
976 int atom; // which atom does this state belong to?
977#endif
978 int match; // what does it match? (see CharClassBit and BackRefBit)
979 QMemArray<int> outs; // out-transitions
980 QMap<int, int> *reenter; // atoms reentered when transiting out
981 QMap<int, int> *anchors; // anchors met when transiting out
982
983#ifndef QT_NO_REGEXP_CAPTURE
984 State( int a, int m )
985 : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
986#else
987 State( int m )
988 : match( m ), reenter( 0 ), anchors( 0 ) { }
989#endif
990 ~State() { delete reenter; delete anchors; }
991 };
992
993#ifndef QT_NO_REGEXP_LOOKAHEAD
994 /*
995 The struct Lookahead represents a lookahead a la Perl (e.g.,
996 (?=foo) and (?!bar)).
997 */
998 struct Lookahead
999 {
1000 QRegExpEngine *eng; // NFA representing the embedded regular expression
1001 bool neg; // negative lookahead?
1002
1003 Lookahead( QRegExpEngine *eng0, bool neg0 )
1004 : eng( eng0 ), neg( neg0 ) { }
1005 ~Lookahead() { delete eng; }
1006 };
1007#endif
1008
1009#ifndef QT_NO_REGEXP_CAPTURE
1010 /*
1011 The struct Atom represents one node in the hierarchy of regular
1012 expression atoms.
1013 */
1014 struct Atom
1015 {
1016 int parent; // index of parent in array of atoms
1017 int capture; // index of capture, from 1 to ncap
1018 };
1019#endif
1020
1021#ifndef QT_NO_REGEXP_ANCHOR_ALT
1022 /*
1023 The struct AnchorAlternation represents a pair of anchors with
1024 OR semantics.
1025 */
1026 struct AnchorAlternation
1027 {
1028 int a; // this anchor...
1029 int b; // ...or this one
1030 };
1031#endif
1032
1033 enum { InitialState = 0, FinalState = 1 };
1034 void setup( bool caseSensitive );
1035 int setupState( int match );
1036
1037 /*
1038 Let's hope that 13 lookaheads and 14 back-references are
1039 enough.
1040 */
1041 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1042 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
1043 Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
1044 Anchor_FirstLookahead = 0x00000010,
1045 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1046 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1047 Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
1048
1049 Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
1050 ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
1051#ifndef QT_NO_REGEXP_CAPTURE
1052 int startAtom( bool capture );
1053 void finishAtom( int atom ) { cf = f[atom].parent; }
1054#endif
1055
1056#ifndef QT_NO_REGEXP_LOOKAHEAD
1057 int addLookahead( QRegExpEngine *eng, bool negative );
1058#endif
1059
1060#ifndef QT_NO_REGEXP_CAPTURE
1061 bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
1062 const int *end2 );
1063#endif
1064 bool testAnchor( int i, int a, const int *capBegin );
1065
1066#ifndef QT_NO_REGEXP_OPTIM
1067 bool goodStringMatch();
1068 bool badCharMatch();
1069#else
1070 bool bruteMatch();
1071#endif
1072 bool matchHere();
1073
1074 QPtrVector<State> s; // array of states
1075 int ns; // number of states
1076#ifndef QT_NO_REGEXP_CAPTURE
1077 QMemArray<Atom> f; // atom hierarchy
1078 int nf; // number of atoms
1079 int cf; // current atom
1080#endif
1081 int officialncap; // number of captures, seen from the outside
1082 int ncap; // number of captures, seen from the inside
1083#ifndef QT_NO_REGEXP_CCLASS
1084 QPtrVector<CharClass> cl; // array of character classes
1085#endif
1086#ifndef QT_NO_REGEXP_LOOKAHEAD
1087 QPtrVector<Lookahead> ahead; // array of lookaheads
1088#endif
1089#ifndef QT_NO_REGEXP_ANCHOR_ALT
1090 QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
1091#endif
1092#ifndef QT_NO_REGEXP_OPTIM
1093 bool caretAnchored; // does the regexp start with ^?
1094 bool trivial; // is the good-string all that needs to match?
1095#endif
1096 bool valid; // is the regular expression valid?
1097 bool cs; // case sensitive?
1098#ifndef QT_NO_REGEXP_BACKREF
1099 int nbrefs; // number of back-references
1100#endif
1101
1102#ifndef QT_NO_REGEXP_OPTIM
1103 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1104
1105 int goodEarlyStart; // the index where goodStr can first occur in a match
1106 int goodLateStart; // the index where goodStr can last occur in a match
1107 QString goodStr; // the string that any match has to contain
1108
1109 int minl; // the minimum length of a match
1110 QMemArray<int> occ1; // first-occurrence array
1111#endif
1112
1113 /*
1114 The class Box is an abstraction for a regular expression
1115 fragment. It can also be seen as one node in the syntax tree of
1116 a regular expression with synthetized attributes.
1117
1118 Its interface is ugly for performance reasons.
1119 */
1120 class Box
1121 {
1122 public:
1123 Box( QRegExpEngine *engine );
1124 Box( const Box& b ) { operator=( b ); }
1125
1126 Box& operator=( const Box& b );
1127
1128 void clear() { operator=( Box(eng) ); }
1129 void set( QChar ch );
1130 void set( const CharClass& cc );
1131#ifndef QT_NO_REGEXP_BACKREF
1132 void set( int bref );
1133#endif
1134
1135 void cat( const Box& b );
1136 void orx( const Box& b );
1137 void plus( int atom );
1138 void opt();
1139 void catAnchor( int a );
1140#ifndef QT_NO_REGEXP_OPTIM
1141 void setupHeuristics();
1142#endif
1143
1144#if defined(QT_DEBUG)
1145 void dump() const;
1146#endif
1147
1148 private:
1149 void addAnchorsToEngine( const Box& to ) const;
1150
1151 QRegExpEngine *eng; // the automaton under construction
1152 QMemArray<int> ls; // the left states (firstpos)
1153 QMemArray<int> rs; // the right states (lastpos)
1154 QMap<int, int> lanchors; // the left anchors
1155 QMap<int, int> ranchors; // the right anchors
1156 int skipanchors; // the anchors to match if the box is skipped
1157
1158#ifndef QT_NO_REGEXP_OPTIM
1159 int earlyStart; // the index where str can first occur
1160 int lateStart; // the index where str can last occur
1161 QString str; // a string that has to occur in any match
1162 QString leftStr; // a string occurring at the left of this box
1163 QString rightStr; // a string occurring at the right of this box
1164 int maxl; // the maximum length of this box (possibly InftyLen)
1165#endif
1166
1167 int minl; // the minimum length of this box
1168#ifndef QT_NO_REGEXP_OPTIM
1169 QMemArray<int> occ1; // first-occurrence array
1170#endif
1171 };
1172 friend class Box;
1173
1174 /*
1175 This is the lexical analyzer for regular expressions.
1176 */
1177 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
1178 Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
1179 Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
1180 Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1181 int getChar();
1182 int getEscape();
1183#ifndef QT_NO_REGEXP_INTERVAL
1184 int getRep( int def );
1185#endif
1186#ifndef QT_NO_REGEXP_LOOKAHEAD
1187 void skipChars( int n );
1188#endif
1189 void error( const char *msg );
1190 void startTokenizer( const QChar *rx, int len );
1191 int getToken();
1192
1193 const QChar *yyIn; // a pointer to the input regular expression pattern
1194 int yyPos0; // the position of yyTok in the input pattern
1195 int yyPos; // the position of the next character to read
1196 int yyLen; // the length of yyIn
1197 int yyCh; // the last character read
1198 CharClass *yyCharClass; // attribute for Tok_CharClass tokens
1199 int yyMinRep; // attribute for Tok_Quantifier
1200 int yyMaxRep; // ditto
1201 QString yyError; // syntax error or overflow during parsing?
1202
1203 /*
1204 This is the syntactic analyzer for regular expressions.
1205 */
1206 int parse( const QChar *rx, int len );
1207 void parseAtom( Box *box );
1208 void parseFactor( Box *box );
1209 void parseTerm( Box *box );
1210 void parseExpression( Box *box );
1211
1212 int yyTok; // the last token read
1213 bool yyMayCapture; // set this to FALSE to disable capturing
1214
1215 /*
1216 This is the engine state during matching.
1217 */
1218 const QString *mmStr; // a pointer to the input QString
1219 const QChar *mmIn; // a pointer to the input string data
1220 int mmPos; // the current position in the string
1221 int mmCaretPos;
1222 int mmLen; // the length of the input string
1223 bool mmMinimal; // minimal matching?
1224 QMemArray<int> mmBigArray; // big QMemArray<int> array
1225 int *mmInNextStack; // is state is mmNextStack?
1226 int *mmCurStack; // stack of current states
1227 int *mmNextStack; // stack of next states
1228 int *mmCurCapBegin; // start of current states' captures
1229 int *mmNextCapBegin; // start of next states' captures
1230 int *mmCurCapEnd; // end of current states' captures
1231 int *mmNextCapEnd; // end of next states' captures
1232 int *mmTempCapBegin; // start of temporary captures
1233 int *mmTempCapEnd; // end of temporary captures
1234 int *mmCapBegin; // start of captures for a next state
1235 int *mmCapEnd; // end of captures for a next state
1236 int *mmSlideTab; // bump-along slide table for bad-character heuristic
1237 int mmSlideTabSize; // size of slide table
1238#ifndef QT_NO_REGEXP_BACKREF
1239 QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
1240#endif
1241 int mmMatchLen; // length of match
1242 int mmOneTestMatchedLen; // length of partial match
1243};
1244
1245QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
1246#ifndef QT_NO_REGEXP_BACKREF
1247 : mmSleeping( 101 )
1248#endif
1249{
1250 setup( caseSensitive );
1251 valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
1252 if ( !valid ) {
1253#ifndef QT_NO_REGEXP_OPTIM
1254 trivial = FALSE;
1255#endif
1256 error( RXERR_LEFTDELIM );
1257 }
1258}
1259
1260#ifndef QT_NO_REGEXP_OPTIM
1261QRegExpEngine::~QRegExpEngine()
1262{
1263}
1264#endif
1265
1266/*
1267 Tries to match in str and returns an array of (begin, length) pairs
1268 for captured text. If there is no match, all pairs are (-1, -1).
1269*/
1270void QRegExpEngine::match( const QString& str, int pos, bool minimal,
1271 bool oneTest, int caretIndex,
1272 QMemArray<int>& captured )
1273{
1274 bool matched = FALSE;
1275
1276#ifndef QT_NO_REGEXP_OPTIM
1277 if ( trivial && !oneTest ) {
1278 mmPos = str.find( goodStr, pos, cs );
1279 mmMatchLen = goodStr.length();
1280 matched = ( mmPos != -1 );
1281 } else
1282#endif
1283 {
1284 mmStr = &str;
1285 mmIn = str.unicode();
1286 if ( mmIn == 0 )
1287 mmIn = &QChar::null;
1288 mmPos = pos;
1289 mmCaretPos = caretIndex;
1290 mmLen = str.length();
1291 mmMinimal = minimal;
1292 mmMatchLen = 0;
1293 mmOneTestMatchedLen = 0;
1294
1295 if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
1296#ifndef QT_NO_REGEXP_OPTIM
1297 if ( oneTest ) {
1298 matched = matchHere();
1299 } else {
1300 if ( mmPos <= mmLen - minl ) {
1301 if ( caretAnchored ) {
1302 matched = matchHere();
1303 } else if ( useGoodStringHeuristic ) {
1304 matched = goodStringMatch();
1305 } else {
1306 matched = badCharMatch();
1307 }
1308 }
1309 }
1310#else
1311 matched = oneTest ? matchHere() : bruteMatch();
1312#endif
1313 }
1314 }
1315
1316 int capturedSize = 2 + 2 * officialncap;
1317 captured.detach();
1318 captured.resize( capturedSize );
1319 if ( matched ) {
1320 captured[0] = mmPos;
1321 captured[1] = mmMatchLen;
1322 for ( int j = 0; j < officialncap; j++ ) {
1323 int len = mmCapEnd[j] - mmCapBegin[j];
1324 captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
1325 captured[2 + 2 * j + 1] = len;
1326 }
1327 } else {
1328 // we rely on 2's complement here
1329 memset( captured.data(), -1, capturedSize * sizeof(int) );
1330 }
1331}
1332
1333/*
1334 The three following functions add one state to the automaton and
1335 return the number of the state.
1336*/
1337
1338int QRegExpEngine::createState( QChar ch )
1339{
1340 return setupState( ch.unicode() );
1341}
1342
1343int QRegExpEngine::createState( const CharClass& cc )
1344{
1345#ifndef QT_NO_REGEXP_CCLASS
1346 int n = cl.size();
1347 cl.resize( n + 1 );
1348 cl.insert( n, new CharClass(cc) );
1349 return setupState( CharClassBit | n );
1350#else
1351 Q_UNUSED( cc );
1352 return setupState( CharClassBit );
1353#endif
1354}
1355
1356#ifndef QT_NO_REGEXP_BACKREF
1357int QRegExpEngine::createState( int bref )
1358{
1359 if ( bref > nbrefs ) {
1360 nbrefs = bref;
1361 if ( nbrefs > MaxBackRefs ) {
1362 error( RXERR_LIMIT );
1363 return 0;
1364 }
1365 }
1366 return setupState( BackRefBit | bref );
1367}
1368#endif
1369
1370/*
1371 The two following functions add a transition between all pairs of
1372 states (i, j) where i is fond in from, and j is found in to.
1373
1374 Cat-transitions are distinguished from plus-transitions for
1375 capturing.
1376*/
1377
1378void QRegExpEngine::addCatTransitions( const QMemArray<int>& from,
1379 const QMemArray<int>& to )
1380{
1381 for ( int i = 0; i < (int) from.size(); i++ ) {
1382 State *st = s[from[i]];
1383 mergeInto( &st->outs, to );
1384 }
1385}
1386
1387#ifndef QT_NO_REGEXP_CAPTURE
1388void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from,
1389 const QMemArray<int>& to, int atom )
1390{
1391 for ( int i = 0; i < (int) from.size(); i++ ) {
1392 State *st = s[from[i]];
1393 QMemArray<int> oldOuts = st->outs.copy();
1394 mergeInto( &st->outs, to );
1395 if ( f[atom].capture >= 0 ) {
1396 if ( st->reenter == 0 )
1397 st->reenter = new QMap<int, int>;
1398 for ( int j = 0; j < (int) to.size(); j++ ) {
1399 if ( !st->reenter->contains(to[j]) &&
1400 oldOuts.bsearch(to[j]) < 0 )
1401 st->reenter->insert( to[j], atom );
1402 }
1403 }
1404 }
1405}
1406#endif
1407
1408#ifndef QT_NO_REGEXP_ANCHOR_ALT
1409/*
1410 Returns an anchor that means a OR b.
1411*/
1412int QRegExpEngine::anchorAlternation( int a, int b )
1413{
1414 if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
1415 return a & b;
1416
1417 int n = aa.size();
1418#ifndef QT_NO_REGEXP_OPTIM
1419 if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
1420 return Anchor_Alternation | ( n - 1 );
1421#endif
1422
1423 aa.resize( n + 1 );
1424 aa[n].a = a;
1425 aa[n].b = b;
1426 return Anchor_Alternation | n;
1427}
1428
1429/*
1430 Returns an anchor that means a AND b.
1431*/
1432int QRegExpEngine::anchorConcatenation( int a, int b )
1433{
1434 if ( ((a | b) & Anchor_Alternation) == 0 )
1435 return a | b;
1436 if ( (b & Anchor_Alternation) != 0 )
1437 qSwap( a, b );
1438
1439 int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
1440 int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
1441 return anchorAlternation( aprime, bprime );
1442}
1443#endif
1444
1445/*
1446 Adds anchor a on a transition caracterised by its from state and
1447 its to state.
1448*/
1449void QRegExpEngine::addAnchors( int from, int to, int a )
1450{
1451 State *st = s[from];
1452 if ( st->anchors == 0 )
1453 st->anchors = new QMap<int, int>;
1454 if ( st->anchors->contains(to) )
1455 a = anchorAlternation( (*st->anchors)[to], a );
1456 st->anchors->insert( to, a );
1457}
1458
1459#ifndef QT_NO_REGEXP_OPTIM
1460/*
1461 This function chooses between the good-string and the bad-character
1462 heuristics. It computes two scores and chooses the heuristic with
1463 the highest score.
1464
1465 Here are some common-sense constraints on the scores that should be
1466 respected if the formulas are ever modified: (1) If goodStr is
1467 empty, the good-string heuristic scores 0. (2) If the regular
1468 expression is trivial, the good-string heuristic should be used.
1469 (3) If the search is case insensitive, the good-string heuristic
1470 should be used, unless it scores 0. (Case insensitivity turns all
1471 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1472 big, the good-string heuristic should score less.
1473*/
1474void QRegExpEngine::heuristicallyChooseHeuristic()
1475{
1476 if ( minl == 0 ) {
1477 useGoodStringHeuristic = FALSE;
1478 } else if ( trivial ) {
1479 useGoodStringHeuristic = TRUE;
1480 } else {
1481 /*
1482 Magic formula: The good string has to constitute a good
1483 proportion of the minimum-length string, and appear at a
1484 more-or-less known index.
1485 */
1486 int goodStringScore = ( 64 * goodStr.length() / minl ) -
1487 ( goodLateStart - goodEarlyStart );
1488 /*
1489 Less magic formula: We pick some characters at random, and
1490 check whether they are good or bad.
1491 */
1492 int badCharScore = 0;
1493 int step = QMAX( 1, NumBadChars / 32 );
1494 for ( int i = 1; i < NumBadChars; i += step ) {
1495 if ( occ1[i] == NoOccurrence )
1496 badCharScore += minl;
1497 else
1498 badCharScore += occ1[i];
1499 }
1500 badCharScore /= minl;
1501 useGoodStringHeuristic = ( goodStringScore > badCharScore );
1502 }
1503}
1504#endif
1505
1506#if defined(QT_DEBUG)
1507void QRegExpEngine::dump() const
1508{
1509 int i, j;
1510 qDebug( "Case %ssensitive engine", cs ? "" : "in" );
1511 qDebug( " States" );
1512 for ( i = 0; i < ns; i++ ) {
1513 qDebug( " %d%s", i,
1514 i == InitialState ? " (initial)" :
1515 i == FinalState ? " (final)" : "" );
1516#ifndef QT_NO_REGEXP_CAPTURE
1517 qDebug( " in atom %d", s[i]->atom );
1518#endif
1519 int m = s[i]->match;
1520 if ( (m & CharClassBit) != 0 ) {
1521 qDebug( " match character class %d", m ^ CharClassBit );
1522#ifndef QT_NO_REGEXP_CCLASS
1523 cl[m ^ CharClassBit]->dump();
1524#else
1525 qDebug( " negative character class" );
1526#endif
1527 } else if ( (m & BackRefBit) != 0 ) {
1528 qDebug( " match back-reference %d", m ^ BackRefBit );
1529 } else if ( m >= 0x20 && m <= 0x7e ) {
1530 qDebug( " match 0x%.4x (%c)", m, m );
1531 } else {
1532 qDebug( " match 0x%.4x", m );
1533 }
1534 for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
1535 int next = s[i]->outs[j];
1536 qDebug( " -> %d", next );
1537 if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
1538 qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
1539 if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
1540 qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
1541 }
1542 }
1543#ifndef QT_NO_REGEXP_CAPTURE
1544 if ( nf > 0 ) {
1545 qDebug( " Atom Parent Capture" );
1546 for ( i = 0; i < nf; i++ )
1547 qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
1548 }
1549#endif
1550#ifndef QT_NO_REGEXP_ANCHOR_ALT
1551 for ( i = 0; i < (int) aa.size(); i++ )
1552 qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
1553 aa[i].b );
1554#endif
1555}
1556#endif
1557
1558void QRegExpEngine::setup( bool caseSensitive )
1559{
1560 s.setAutoDelete( TRUE );
1561 s.resize( 32 );
1562 ns = 0;
1563#ifndef QT_NO_REGEXP_CAPTURE
1564 f.resize( 32 );
1565 nf = 0;
1566 cf = -1;
1567#endif
1568 officialncap = 0;
1569 ncap = 0;
1570#ifndef QT_NO_REGEXP_CCLASS
1571 cl.setAutoDelete( TRUE );
1572#endif
1573#ifndef QT_NO_REGEXP_LOOKAHEAD
1574 ahead.setAutoDelete( TRUE );
1575#endif
1576#ifndef QT_NO_REGEXP_OPTIM
1577 caretAnchored = TRUE;
1578 trivial = TRUE;
1579#endif
1580 valid = FALSE;
1581 cs = caseSensitive;
1582#ifndef QT_NO_REGEXP_BACKREF
1583 nbrefs = 0;
1584#endif
1585#ifndef QT_NO_REGEXP_OPTIM
1586 useGoodStringHeuristic = TRUE;
1587 minl = 0;
1588 occ1.fill( 0, NumBadChars );
1589#endif
1590}
1591
1592int QRegExpEngine::setupState( int match )
1593{
1594 if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
1595 s.resize( (ns + 1) << 1 );
1596#ifndef QT_NO_REGEXP_CAPTURE
1597 s.insert( ns, new State(cf, match) );
1598#else
1599 s.insert( ns, new State(match) );
1600#endif
1601 return ns++;
1602}
1603
1604#ifndef QT_NO_REGEXP_CAPTURE
1605/*
1606 Functions startAtom() and finishAtom() should be called to delimit
1607 atoms. When a state is created, it is assigned to the current atom.
1608 The information is later used for capturing.
1609*/
1610int QRegExpEngine::startAtom( bool capture )
1611{
1612 if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
1613 f.resize( (nf + 1) << 1 );
1614 f[nf].parent = cf;
1615 cf = nf++;
1616 f[cf].capture = capture ? ncap++ : -1;
1617 return cf;
1618}
1619#endif
1620
1621#ifndef QT_NO_REGEXP_LOOKAHEAD
1622/*
1623 Creates a lookahead anchor.
1624*/
1625int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
1626{
1627 int n = ahead.size();
1628 if ( n == MaxLookaheads ) {
1629 error( RXERR_LIMIT );
1630 return 0;
1631 }
1632 ahead.resize( n + 1 );
1633 ahead.insert( n, new Lookahead(eng, negative) );
1634 return Anchor_FirstLookahead << n;
1635}
1636#endif
1637
1638#ifndef QT_NO_REGEXP_CAPTURE
1639/*
1640 We want the longest leftmost captures.
1641*/
1642bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
1643 const int *begin2, const int *end2 )
1644{
1645 for ( int i = 0; i < ncap; i++ ) {
1646 int delta = begin2[i] - begin1[i]; // it has to start early...
1647 if ( delta == 0 )
1648 delta = end1[i] - end2[i]; // ...and end late (like a party)
1649
1650 if ( delta != 0 )
1651 return delta > 0;
1652 }
1653 return FALSE;
1654}
1655#endif
1656
1657/*
1658 Returns TRUE if anchor a matches at position mmPos + i in the input
1659 string, otherwise FALSE.
1660*/
1661bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
1662{
1663 int j;
1664
1665#ifndef QT_NO_REGEXP_ANCHOR_ALT
1666 if ( (a & Anchor_Alternation) != 0 ) {
1667 return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
1668 testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
1669 }
1670#endif
1671
1672 if ( (a & Anchor_Caret) != 0 ) {
1673 if ( mmPos + i != mmCaretPos )
1674 return FALSE;
1675 }
1676 if ( (a & Anchor_Dollar) != 0 ) {
1677 if ( mmPos + i != mmLen )
1678 return FALSE;
1679 }
1680#ifndef QT_NO_REGEXP_ESCAPE
1681 if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
1682 bool before = FALSE;
1683 bool after = FALSE;
1684 if ( mmPos + i != 0 )
1685 before = isWord( mmIn[mmPos + i - 1] );
1686 if ( mmPos + i != mmLen )
1687 after = isWord( mmIn[mmPos + i] );
1688 if ( (a & Anchor_Word) != 0 && (before == after) )
1689 return FALSE;
1690 if ( (a & Anchor_NonWord) != 0 && (before != after) )
1691 return FALSE;
1692 }
1693#endif
1694#ifndef QT_NO_REGEXP_LOOKAHEAD
1695 if ( (a & Anchor_LookaheadMask) != 0 ) {
1696 QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
1697 mmLen - mmPos - i );
1698 for ( j = 0; j < (int) ahead.size(); j++ ) {
1699 if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
1700 QMemArray<int> captured;
1701 ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
1702 mmCaretPos - mmPos - i, captured );
1703 if ( (captured[0] == 0) == ahead[j]->neg )
1704 return FALSE;
1705 }
1706 }
1707 }
1708#endif
1709#ifndef QT_NO_REGEXP_CAPTURE
1710#ifndef QT_NO_REGEXP_BACKREF
1711 for ( j = 0; j < nbrefs; j++ ) {
1712 if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
1713 if ( capBegin[j] != EmptyCapture )
1714 return FALSE;
1715 }
1716 }
1717#endif
1718#endif
1719 return TRUE;
1720}
1721
1722#ifndef QT_NO_REGEXP_OPTIM
1723/*
1724 The three following functions are what Jeffrey Friedl would call
1725 transmissions (or bump-alongs). Using one or the other should make
1726 no difference except in performance.
1727*/
1728
1729bool QRegExpEngine::goodStringMatch()
1730{
1731 int k = mmPos + goodEarlyStart;
1732 while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
1733 int from = k - goodLateStart;
1734 int to = k - goodEarlyStart;
1735 if ( from > mmPos )
1736 mmPos = from;
1737
1738 while ( mmPos <= to ) {
1739 if ( matchHere() )
1740 return TRUE;
1741 mmPos++;
1742 }
1743 k++;
1744 }
1745 return FALSE;
1746}
1747
1748bool QRegExpEngine::badCharMatch()
1749{
1750 int slideHead = 0;
1751 int slideNext = 0;
1752 int i;
1753 int lastPos = mmLen - minl;
1754 memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
1755
1756 /*
1757 Set up the slide table, used for the bad-character heuristic,
1758 using the table of first occurrence of each character.
1759 */
1760 for ( i = 0; i < minl; i++ ) {
1761 int sk = occ1[BadChar(mmIn[mmPos + i])];
1762 if ( sk == NoOccurrence )
1763 sk = i + 1;
1764 if ( sk > 0 ) {
1765 int k = i + 1 - sk;
1766 if ( k < 0 ) {
1767 sk = i + 1;
1768 k = 0;
1769 }
1770 if ( sk > mmSlideTab[k] )
1771 mmSlideTab[k] = sk;
1772 }
1773 }
1774
1775 if ( mmPos > lastPos )
1776 return FALSE;
1777
1778 for ( ;; ) {
1779 if ( ++slideNext >= mmSlideTabSize )
1780 slideNext = 0;
1781 if ( mmSlideTab[slideHead] > 0 ) {
1782 if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
1783 mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
1784 mmSlideTab[slideHead] = 0;
1785 } else {
1786 if ( matchHere() )
1787 return TRUE;
1788 }
1789
1790 if ( mmPos == lastPos )
1791 break;
1792
1793 /*
1794 Update the slide table. This code has much in common with
1795 the initialization code.
1796 */
1797 int sk = occ1[BadChar(mmIn[mmPos + minl])];
1798 if ( sk == NoOccurrence ) {
1799 mmSlideTab[slideNext] = minl;
1800 } else if ( sk > 0 ) {
1801 int k = slideNext + minl - sk;
1802 if ( k >= mmSlideTabSize )
1803 k -= mmSlideTabSize;
1804 if ( sk > mmSlideTab[k] )
1805 mmSlideTab[k] = sk;
1806 }
1807 slideHead = slideNext;
1808 mmPos++;
1809 }
1810 return FALSE;
1811}
1812#else
1813bool QRegExpEngine::bruteMatch()
1814{
1815 while ( mmPos <= mmLen ) {
1816 if ( matchHere() )
1817 return TRUE;
1818 mmPos++;
1819 }
1820 return FALSE;
1821}
1822#endif
1823
1824/*
1825 Here's the core of the engine. It tries to do a match here and now.
1826*/
1827bool QRegExpEngine::matchHere()
1828{
1829 int ncur = 1, nnext = 0;
1830 int i = 0, j, k, m;
1831 bool stop = FALSE;
1832
1833 mmMatchLen = -1;
1834 mmOneTestMatchedLen = -1;
1835 mmCurStack[0] = InitialState;
1836
1837#ifndef QT_NO_REGEXP_CAPTURE
1838 if ( ncap > 0 ) {
1839 for ( j = 0; j < ncap; j++ ) {
1840 mmCurCapBegin[j] = EmptyCapture;
1841 mmCurCapEnd[j] = EmptyCapture;
1842 }
1843 }
1844#endif
1845
1846#ifndef QT_NO_REGEXP_BACKREF
1847 int *zzZ = 0;
1848
1849 while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
1850 !stop )
1851#else
1852 while ( ncur > 0 && i <= mmLen - mmPos && !stop )
1853#endif
1854 {
1855 int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
1856 for ( j = 0; j < ncur; j++ ) {
1857 int cur = mmCurStack[j];
1858 State *scur = s[cur];
1859 QMemArray<int>& outs = scur->outs;
1860 for ( k = 0; k < (int) outs.size(); k++ ) {
1861 int next = outs[k];
1862 State *snext = s[next];
1863 bool in = TRUE;
1864#ifndef QT_NO_REGEXP_BACKREF
1865 int needSomeSleep = 0;
1866#endif
1867
1868 /*
1869 First, check if the anchors are anchored properly.
1870 */
1871 if ( scur->anchors != 0 ) {
1872 int a = at( *scur->anchors, next );
1873 if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
1874 in = FALSE;
1875 }
1876 /*
1877 If indeed they are, check if the input character is
1878 correct for this transition.
1879 */
1880 if ( in ) {
1881 m = snext->match;
1882 if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
1883 if ( cs )
1884 in = ( m == ch );
1885 else
1886 in = ( QChar(m).lower() == QChar(ch).lower() );
1887 } else if ( next == FinalState ) {
1888 mmMatchLen = i;
1889 stop = mmMinimal;
1890 in = TRUE;
1891 } else if ( (m & CharClassBit) != 0 ) {
1892#ifndef QT_NO_REGEXP_CCLASS
1893 const CharClass *cc = cl[m ^ CharClassBit];
1894 if ( cs )
1895 in = cc->in( ch );
1896 else if ( cc->negative() )
1897 in = cc->in( QChar(ch).lower() ) &&
1898 cc->in( QChar(ch).upper() );
1899 else
1900 in = cc->in( QChar(ch).lower() ) ||
1901 cc->in( QChar(ch).upper() );
1902#endif
1903#ifndef QT_NO_REGEXP_BACKREF
1904 } else { /* ( (m & BackRefBit) != 0 ) */
1905 int bref = m ^ BackRefBit;
1906 int ell = j * ncap + ( bref - 1 );
1907
1908 in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
1909 if ( in ) {
1910 if ( cs )
1911 in = ( mmIn[mmPos + mmCurCapBegin[ell]]
1912 == QChar(ch) );
1913 else
1914 in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
1915 == QChar(ch).lower() );
1916 }
1917
1918 if ( in ) {
1919 int delta;
1920 if ( mmCurCapEnd[ell] == EmptyCapture )
1921 delta = i - mmCurCapBegin[ell];
1922 else
1923 delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
1924
1925 in = ( delta <= mmLen - (mmPos + i) );
1926 if ( in && delta > 1 ) {
1927 int n = 1;
1928 if ( cs ) {
1929 while ( n < delta ) {
1930 if ( mmIn[mmPos +
1931 mmCurCapBegin[ell] + n] !=
1932 mmIn[mmPos + i + n] )
1933 break;
1934 n++;
1935 }
1936 } else {
1937 while ( n < delta ) {
1938 QChar a = mmIn[mmPos +
1939 mmCurCapBegin[ell] + n];
1940 QChar b = mmIn[mmPos + i + n];
1941 if ( a.lower() != b.lower() )
1942 break;
1943 n++;
1944 }
1945 }
1946 in = ( n == delta );
1947 if ( in )
1948 needSomeSleep = delta - 1;
1949 }
1950 }
1951#endif
1952 }
1953 }
1954
1955 /*
1956 We must now update our data structures.
1957 */
1958 if ( in ) {
1959#ifndef QT_NO_REGEXP_CAPTURE
1960 int *capBegin, *capEnd;
1961#endif
1962 /*
1963 If the next state was not encountered yet, all
1964 is fine.
1965 */
1966 if ( (m = mmInNextStack[next]) == -1 ) {
1967 m = nnext++;
1968 mmNextStack[m] = next;
1969 mmInNextStack[next] = m;
1970#ifndef QT_NO_REGEXP_CAPTURE
1971 capBegin = mmNextCapBegin + m * ncap;
1972 capEnd = mmNextCapEnd + m * ncap;
1973
1974 /*
1975 Otherwise, we'll first maintain captures in
1976 temporary arrays, and decide at the end whether
1977 it's best to keep the previous capture zones or
1978 the new ones.
1979 */
1980 } else {
1981 capBegin = mmTempCapBegin;
1982 capEnd = mmTempCapEnd;
1983#endif
1984 }
1985
1986#ifndef QT_NO_REGEXP_CAPTURE
1987 /*
1988 Updating the capture zones is much of a task.
1989 */
1990 if ( ncap > 0 ) {
1991 memcpy( capBegin, mmCurCapBegin + j * ncap,
1992 ncap * sizeof(int) );
1993 memcpy( capEnd, mmCurCapEnd + j * ncap,
1994 ncap * sizeof(int) );
1995 int c = scur->atom, n = snext->atom;
1996 int p = -1, q = -1;
1997 int cap;
1998
1999 /*
2000 Lemma 1. For any x in the range [0..nf), we
2001 have f[x].parent < x.
2002
2003 Proof. By looking at startAtom(), it is
2004 clear that cf < nf holds all the time, and
2005 thus that f[nf].parent < nf.
2006 */
2007
2008 /*
2009 If we are reentering an atom, we empty all
2010 capture zones inside it.
2011 */
2012 if ( scur->reenter != 0 &&
2013 (q = at(*scur->reenter, next)) != 0 ) {
2014 QBitArray b;
2015 b.fill( FALSE, nf );
2016 b.setBit( q, TRUE );
2017 for ( int ell = q + 1; ell < nf; ell++ ) {
2018 if ( b.testBit(f[ell].parent) ) {
2019 b.setBit( ell, TRUE );
2020 cap = f[ell].capture;
2021 if ( cap >= 0 ) {
2022 capBegin[cap] = EmptyCapture;
2023 capEnd[cap] = EmptyCapture;
2024 }
2025 }
2026 }
2027 p = f[q].parent;
2028
2029 /*
2030 Otherwise, close the capture zones we are
2031 leaving. We are leaving f[c].capture,
2032 f[f[c].parent].capture,
2033 f[f[f[c].parent].parent].capture, ...,
2034 until f[x].capture, with x such that
2035 f[x].parent is the youngest common ancestor
2036 for c and n.
2037
2038 We go up along c's and n's ancestry until
2039 we find x.
2040 */
2041 } else {
2042 p = c;
2043 q = n;
2044 while ( p != q ) {
2045 if ( p > q ) {
2046 cap = f[p].capture;
2047 if ( cap >= 0 ) {
2048 if ( capBegin[cap] == i ) {
2049 capBegin[cap] = EmptyCapture;
2050 capEnd[cap] = EmptyCapture;
2051 } else {
2052 capEnd[cap] = i;
2053 }
2054 }
2055 p = f[p].parent;
2056 } else {
2057 q = f[q].parent;
2058 }
2059 }
2060 }
2061
2062 /*
2063 In any case, we now open the capture zones
2064 we are entering. We work upwards from n
2065 until we reach p (the parent of the atom we
2066 reenter or the youngest common ancestor).
2067 */
2068 while ( n > p ) {
2069 cap = f[n].capture;
2070 if ( cap >= 0 ) {
2071 capBegin[cap] = i;
2072 capEnd[cap] = EmptyCapture;
2073 }
2074 n = f[n].parent;
2075 }
2076 /*
2077 If the next state was already in
2078 mmNextStack, we must choose carefully which
2079 capture zones we want to keep.
2080 */
2081 if ( capBegin == mmTempCapBegin &&
2082 isBetterCapture(capBegin, capEnd,
2083 mmNextCapBegin + m * ncap,
2084 mmNextCapEnd + m * ncap) ) {
2085 memcpy( mmNextCapBegin + m * ncap, capBegin,
2086 ncap * sizeof(int) );
2087 memcpy( mmNextCapEnd + m * ncap, capEnd,
2088 ncap * sizeof(int) );
2089 }
2090 }
2091#ifndef QT_NO_REGEXP_BACKREF
2092 /*
2093 We are done with updating the capture zones.
2094 It's now time to put the next state to sleep,
2095 if it needs to, and to remove it from
2096 mmNextStack.
2097 */
2098 if ( needSomeSleep > 0 ) {
2099 zzZ = new int[1 + 2 * ncap];
2100 zzZ[0] = next;
2101 if ( ncap > 0 ) {
2102 memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
2103 memcpy( zzZ + 1 + ncap, capEnd,
2104 ncap * sizeof(int) );
2105 }
2106 mmInNextStack[mmNextStack[--nnext]] = -1;
2107 mmSleeping.insert( i + needSomeSleep, zzZ );
2108 }
2109#endif
2110#endif
2111 }
2112 }
2113 }
2114#ifndef QT_NO_REGEXP_CAPTURE
2115 /*
2116 If we reached the final state, hurray! Copy the captured
2117 zone.
2118 */
2119 if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
2120 memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
2121 memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
2122 }
2123#ifndef QT_NO_REGEXP_BACKREF
2124 /*
2125 It's time to wake up the sleepers.
2126 */
2127 if ( !mmSleeping.isEmpty() ) {
2128 while ( (zzZ = mmSleeping.take(i)) != 0 ) {
2129 int next = zzZ[0];
2130 int *capBegin = zzZ + 1;
2131 int *capEnd = zzZ + 1 + ncap;
2132 bool copyOver = TRUE;
2133
2134 if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
2135 m = nnext++;
2136 mmNextStack[m] = next;
2137 mmInNextStack[next] = m;
2138 } else {
2139 copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
2140 mmNextCapEnd + m * ncap,
2141 capBegin, capEnd );
2142 }
2143 if ( copyOver ) {
2144 memcpy( mmNextCapBegin + m * ncap, capBegin,
2145 ncap * sizeof(int) );
2146 memcpy( mmNextCapEnd + m * ncap, capEnd,
2147 ncap * sizeof(int) );
2148 }
2149 delete[] zzZ;
2150 }
2151 }
2152#endif
2153#endif
2154 for ( j = 0; j < nnext; j++ )
2155 mmInNextStack[mmNextStack[j]] = -1;
2156
2157 // avoid needless iteration that confuses mmOneTestMatchedLen
2158 if ( nnext == 1 && mmNextStack[0] == FinalState
2159#ifndef QT_NO_REGEXP_BACKREF
2160 && mmSleeping.isEmpty()
2161#endif
2162 )
2163 stop = TRUE;
2164
2165 qSwap( mmCurStack, mmNextStack );
2166#ifndef QT_NO_REGEXP_CAPTURE
2167 qSwap( mmCurCapBegin, mmNextCapBegin );
2168 qSwap( mmCurCapEnd, mmNextCapEnd );
2169#endif
2170 ncur = nnext;
2171 nnext = 0;
2172 i++;
2173 }
2174
2175#ifndef QT_NO_REGEXP_BACKREF
2176 /*
2177 If minimal matching is enabled, we might have some sleepers
2178 left.
2179 */
2180 while ( !mmSleeping.isEmpty() ) {
2181 zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
2182 delete[] zzZ;
2183 }
2184#endif
2185
2186 mmOneTestMatchedLen = i - 1;
2187 return ( mmMatchLen >= 0 );
2188}
2189
2190#ifndef QT_NO_REGEXP_CCLASS
2191
2192QRegExpEngine::CharClass::CharClass()
2193 : c( 0 ), n( FALSE )
2194{
2195#ifndef QT_NO_REGEXP_OPTIM
2196 occ1.fill( NoOccurrence, NumBadChars );
2197#endif
2198}
2199
2200QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
2201 const CharClass& cc )
2202{
2203 c = cc.c;
2204 r = cc.r.copy();
2205 n = cc.n;
2206#ifndef QT_NO_REGEXP_OPTIM
2207 occ1 = cc.occ1;
2208#endif
2209 return *this;
2210}
2211
2212void QRegExpEngine::CharClass::clear()
2213{
2214 c = 0;
2215 r.resize( 0 );
2216 n = FALSE;
2217}
2218
2219void QRegExpEngine::CharClass::setNegative( bool negative )
2220{
2221 n = negative;
2222#ifndef QT_NO_REGEXP_OPTIM
2223 occ1.fill( 0, NumBadChars );
2224#endif
2225}
2226
2227void QRegExpEngine::CharClass::addCategories( int cats )
2228{
2229 c |= cats;
2230#ifndef QT_NO_REGEXP_OPTIM
2231 occ1.fill( 0, NumBadChars );
2232#endif
2233}
2234
2235void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
2236{
2237 if ( from > to )
2238 qSwap( from, to );
2239 int m = r.size();
2240 r.resize( m + 1 );
2241 r[m].from = from;
2242 r[m].to = to;
2243
2244#ifndef QT_NO_REGEXP_OPTIM
2245 int i;
2246
2247 if ( to - from < NumBadChars ) {
2248 occ1.detach();
2249 if ( from % NumBadChars <= to % NumBadChars ) {
2250 for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
2251 occ1[i] = 0;
2252 } else {
2253 for ( i = 0; i <= to % NumBadChars; i++ )
2254 occ1[i] = 0;
2255 for ( i = from % NumBadChars; i < NumBadChars; i++ )
2256 occ1[i] = 0;
2257 }
2258 } else {
2259 occ1.fill( 0, NumBadChars );
2260 }
2261#endif
2262}
2263
2264bool QRegExpEngine::CharClass::in( QChar ch ) const
2265{
2266#ifndef QT_NO_REGEXP_OPTIM
2267 if ( occ1[BadChar(ch)] == NoOccurrence )
2268 return n;
2269#endif
2270
2271 if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
2272 return !n;
2273 for ( int i = 0; i < (int) r.size(); i++ ) {
2274 if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
2275 return !n;
2276 }
2277 return n;
2278}
2279
2280#if defined(QT_DEBUG)
2281void QRegExpEngine::CharClass::dump() const
2282{
2283 int i;
2284 qDebug( " %stive character class", n ? "nega" : "posi" );
2285#ifndef QT_NO_REGEXP_CCLASS
2286 if ( c != 0 )
2287 qDebug( " categories 0x%.8x", c );
2288#endif
2289 for ( i = 0; i < (int) r.size(); i++ )
2290 qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to );
2291}
2292#endif
2293#endif
2294
2295QRegExpEngine::Box::Box( QRegExpEngine *engine )
2296 : eng( engine ), skipanchors( 0 )
2297#ifndef QT_NO_REGEXP_OPTIM
2298 , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
2299#endif
2300{
2301#ifndef QT_NO_REGEXP_OPTIM
2302 occ1.fill( NoOccurrence, NumBadChars );
2303#endif
2304 minl = 0;
2305}
2306
2307QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
2308{
2309 eng = b.eng;
2310 ls = b.ls;
2311 rs = b.rs;
2312 lanchors = b.lanchors;
2313 ranchors = b.ranchors;
2314 skipanchors = b.skipanchors;
2315#ifndef QT_NO_REGEXP_OPTIM
2316 earlyStart = b.earlyStart;
2317 lateStart = b.lateStart;
2318 str = b.str;
2319 leftStr = b.leftStr;
2320 rightStr = b.rightStr;
2321 maxl = b.maxl;
2322 occ1 = b.occ1;
2323#endif
2324 minl = b.minl;
2325 return *this;
2326}
2327
2328void QRegExpEngine::Box::set( QChar ch )
2329{
2330 ls.resize( 1 );
2331 ls[0] = eng->createState( ch );
2332 rs = ls;
2333 rs.detach();
2334#ifndef QT_NO_REGEXP_OPTIM
2335 str = ch;
2336 leftStr = ch;
2337 rightStr = ch;
2338 maxl = 1;
2339 occ1.detach();
2340 occ1[BadChar(ch)] = 0;
2341#endif
2342 minl = 1;
2343}
2344
2345void QRegExpEngine::Box::set( const CharClass& cc )
2346{
2347 ls.resize( 1 );
2348 ls[0] = eng->createState( cc );
2349 rs = ls;
2350 rs.detach();
2351#ifndef QT_NO_REGEXP_OPTIM
2352 maxl = 1;
2353 occ1 = cc.firstOccurrence();
2354#endif
2355 minl = 1;
2356}
2357
2358#ifndef QT_NO_REGEXP_BACKREF
2359void QRegExpEngine::Box::set( int bref )
2360{
2361 ls.resize( 1 );
2362 ls[0] = eng->createState( bref );
2363 rs = ls;
2364 rs.detach();
2365 if ( bref >= 1 && bref <= MaxBackRefs )
2366 skipanchors = Anchor_BackRef0Empty << bref;
2367#ifndef QT_NO_REGEXP_OPTIM
2368 maxl = InftyLen;
2369#endif
2370 minl = 0;
2371}
2372#endif
2373
2374void QRegExpEngine::Box::cat( const Box& b )
2375{
2376 eng->addCatTransitions( rs, b.ls );
2377 addAnchorsToEngine( b );
2378 if ( minl == 0 ) {
2379 mergeInto( &lanchors, b.lanchors );
2380 if ( skipanchors != 0 ) {
2381 for ( int i = 0; i < (int) b.ls.size(); i++ ) {
2382 int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
2383 skipanchors );
2384 lanchors.insert( b.ls[i], a );
2385 }
2386 }
2387 mergeInto( &ls, b.ls );
2388 }
2389 if ( b.minl == 0 ) {
2390 mergeInto( &ranchors, b.ranchors );
2391 if ( b.skipanchors != 0 ) {
2392 for ( int i = 0; i < (int) rs.size(); i++ ) {
2393 int a = eng->anchorConcatenation( at(ranchors, rs[i]),
2394 b.skipanchors );
2395 ranchors.insert( rs[i], a );
2396 }
2397 }
2398 mergeInto( &rs, b.rs );
2399 } else {
2400 ranchors = b.ranchors;
2401 rs = b.rs;
2402 }
2403
2404#ifndef QT_NO_REGEXP_OPTIM
2405 if ( maxl != InftyLen ) {
2406 if ( rightStr.length() + b.leftStr.length() >
2407 QMAX(str.length(), b.str.length()) ) {
2408 earlyStart = minl - rightStr.length();
2409 lateStart = maxl - rightStr.length();
2410 str = rightStr + b.leftStr;
2411 } else if ( b.str.length() > str.length() ) {
2412 earlyStart = minl + b.earlyStart;
2413 lateStart = maxl + b.lateStart;
2414 str = b.str;
2415 }
2416 }
2417
2418 if ( (int) leftStr.length() == maxl )
2419 leftStr += b.leftStr;
2420
2421 if ( (int) b.rightStr.length() == b.maxl ) {
2422 rightStr += b.rightStr;
2423 } else {
2424 rightStr = b.rightStr;
2425 }
2426
2427 if ( maxl == InftyLen || b.maxl == InftyLen ) {
2428 maxl = InftyLen;
2429 } else {
2430 maxl += b.maxl;
2431 }
2432
2433 occ1.detach();
2434 for ( int i = 0; i < NumBadChars; i++ ) {
2435 if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
2436 occ1[i] = minl + b.occ1[i];
2437 }
2438#endif
2439
2440 minl += b.minl;
2441 if ( minl == 0 )
2442 skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
2443 else
2444 skipanchors = 0;
2445}
2446
2447void QRegExpEngine::Box::orx( const Box& b )
2448{
2449 mergeInto( &ls, b.ls );
2450 mergeInto( &lanchors, b.lanchors );
2451 mergeInto( &rs, b.rs );
2452 mergeInto( &ranchors, b.ranchors );
2453
2454 if ( b.minl == 0 ) {
2455 if ( minl == 0 )
2456 skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
2457 else
2458 skipanchors = b.skipanchors;
2459 }
2460
2461#ifndef QT_NO_REGEXP_OPTIM
2462 occ1.detach();
2463 for ( int i = 0; i < NumBadChars; i++ ) {
2464 if ( occ1[i] > b.occ1[i] )
2465 occ1[i] = b.occ1[i];
2466 }
2467 earlyStart = 0;
2468 lateStart = 0;
2469 str = QString();
2470 leftStr = QString();
2471 rightStr = QString();
2472 if ( b.maxl > maxl )
2473 maxl = b.maxl;
2474#endif
2475 if ( b.minl < minl )
2476 minl = b.minl;
2477}
2478
2479void QRegExpEngine::Box::plus( int atom )
2480{
2481#ifndef QT_NO_REGEXP_CAPTURE
2482 eng->addPlusTransitions( rs, ls, atom );
2483#else
2484 Q_UNUSED( atom );
2485 eng->addCatTransitions( rs, ls );
2486#endif
2487 addAnchorsToEngine( *this );
2488#ifndef QT_NO_REGEXP_OPTIM
2489 maxl = InftyLen;
2490#endif
2491}
2492
2493void QRegExpEngine::Box::opt()
2494{
2495#ifndef QT_NO_REGEXP_OPTIM
2496 earlyStart = 0;
2497 lateStart = 0;
2498 str = QString();
2499 leftStr = QString();
2500 rightStr = QString();
2501#endif
2502 skipanchors = 0;
2503 minl = 0;
2504}
2505
2506void QRegExpEngine::Box::catAnchor( int a )
2507{
2508 if ( a != 0 ) {
2509 for ( int i = 0; i < (int) rs.size(); i++ ) {
2510 a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
2511 ranchors.insert( rs[i], a );
2512 }
2513 if ( minl == 0 )
2514 skipanchors = eng->anchorConcatenation( skipanchors, a );
2515 }
2516}
2517
2518#ifndef QT_NO_REGEXP_OPTIM
2519void QRegExpEngine::Box::setupHeuristics()
2520{
2521 eng->goodEarlyStart = earlyStart;
2522 eng->goodLateStart = lateStart;
2523 eng->goodStr = eng->cs ? str : str.lower();
2524
2525 eng->minl = minl;
2526 if ( eng->cs ) {
2527 /*
2528 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2529 1 at this point. An entry of occ1 has to be at most minl or
2530 infinity for the rest of the algorithm to go well.
2531
2532 We waited until here before normalizing these cases (instead of
2533 doing it in Box::orx()) because sometimes things improve by
2534 themselves. Consider for example (112|1)34.
2535 */
2536 for ( int i = 0; i < NumBadChars; i++ ) {
2537 if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
2538 occ1[i] = minl;
2539 }
2540 eng->occ1 = occ1;
2541 } else {
2542 eng->occ1.fill( 0, NumBadChars );
2543 }
2544
2545 eng->heuristicallyChooseHeuristic();
2546}
2547#endif
2548
2549#if defined(QT_DEBUG)
2550void QRegExpEngine::Box::dump() const
2551{
2552 int i;
2553 qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );
2554 qDebug( " Left states:" );
2555 for ( i = 0; i < (int) ls.size(); i++ ) {
2556 if ( at(lanchors, ls[i]) == 0 )
2557 qDebug( " %d", ls[i] );
2558 else
2559 qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] );
2560 }
2561 qDebug( " Right states:" );
2562 for ( i = 0; i < (int) rs.size(); i++ ) {
2563 if ( at(ranchors, rs[i]) == 0 )
2564 qDebug( " %d", rs[i] );
2565 else
2566 qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] );
2567 }
2568 qDebug( " Skip anchors: 0x%.8x", skipanchors );
2569}
2570#endif
2571
2572void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
2573{
2574 for ( int i = 0; i < (int) to.ls.size(); i++ ) {
2575 for ( int j = 0; j < (int) rs.size(); j++ ) {
2576 int a = eng->anchorConcatenation( at(ranchors, rs[j]),
2577 at(to.lanchors, to.ls[i]) );
2578 eng->addAnchors( rs[j], to.ls[i], a );
2579 }
2580 }
2581}
2582
2583int QRegExpEngine::getChar()
2584{
2585 return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
2586}
2587
2588int QRegExpEngine::getEscape()
2589{
2590#ifndef QT_NO_REGEXP_ESCAPE
2591 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2592 const char backTab[] = "\a\f\n\r\t\v";
2593 ushort low;
2594 int i;
2595#endif
2596 ushort val;
2597 int prevCh = yyCh;
2598
2599 if ( prevCh == EOS ) {
2600 error( RXERR_END );
2601 return Tok_Char | '\\';
2602 }
2603 yyCh = getChar();
2604#ifndef QT_NO_REGEXP_ESCAPE
2605 if ( (prevCh & ~0xff) == 0 ) {
2606 const char *p = strchr( tab, prevCh );
2607 if ( p != 0 )
2608 return Tok_Char | backTab[p - tab];
2609 }
2610#endif
2611
2612 switch ( prevCh ) {
2613#ifndef QT_NO_REGEXP_ESCAPE
2614 case '0':
2615 val = 0;
2616 for ( i = 0; i < 3; i++ ) {
2617 if ( yyCh >= '0' && yyCh <= '7' )
2618 val = ( val << 3 ) | ( yyCh - '0' );
2619 else
2620 break;
2621 yyCh = getChar();
2622 }
2623 if ( (val & ~0377) != 0 )
2624 error( RXERR_OCTAL );
2625 return Tok_Char | val;
2626#endif
2627#ifndef QT_NO_REGEXP_ESCAPE
2628 case 'B':
2629 return Tok_NonWord;
2630#endif
2631#ifndef QT_NO_REGEXP_CCLASS
2632 case 'D':
2633 // see QChar::isDigit()
2634 yyCharClass->addCategories( 0x7fffffef );
2635 return Tok_CharClass;
2636 case 'S':
2637 // see QChar::isSpace()
2638 yyCharClass->addCategories( 0x7ffff87f );
2639 yyCharClass->addRange( 0x0000, 0x0008 );
2640 yyCharClass->addRange( 0x000e, 0x001f );
2641 yyCharClass->addRange( 0x007f, 0x009f );
2642 return Tok_CharClass;
2643 case 'W':
2644 // see QChar::isLetterOrNumber()
2645 yyCharClass->addCategories( 0x7fe07f8f );
2646 yyCharClass->addRange( 0x203f, 0x2040 );
2647 yyCharClass->addSingleton( 0x2040 );
2648 yyCharClass->addSingleton( 0x30fb );
2649 yyCharClass->addRange( 0xfe33, 0xfe34 );
2650 yyCharClass->addRange( 0xfe4d, 0xfe4f );
2651 yyCharClass->addSingleton( 0xff3f );
2652 yyCharClass->addSingleton( 0xff65 );
2653 return Tok_CharClass;
2654#endif
2655#ifndef QT_NO_REGEXP_ESCAPE
2656 case 'b':
2657 return Tok_Word;
2658#endif
2659#ifndef QT_NO_REGEXP_CCLASS
2660 case 'd':
2661 // see QChar::isDigit()
2662 yyCharClass->addCategories( 0x00000010 );
2663 return Tok_CharClass;
2664 case 's':
2665 // see QChar::isSpace()
2666 yyCharClass->addCategories( 0x00000380 );
2667 yyCharClass->addRange( 0x0009, 0x000d );
2668 return Tok_CharClass;
2669 case 'w':
2670 // see QChar::isLetterOrNumber()
2671 yyCharClass->addCategories( 0x000f8070 );
2672 yyCharClass->addSingleton( 0x005f ); // '_'
2673 return Tok_CharClass;
2674#endif
2675#ifndef QT_NO_REGEXP_ESCAPE
2676 case 'x':
2677 val = 0;
2678 for ( i = 0; i < 4; i++ ) {
2679 low = QChar( yyCh ).lower();
2680 if ( low >= '0' && low <= '9' )
2681 val = ( val << 4 ) | ( low - '0' );
2682 else if ( low >= 'a' && low <= 'f' )
2683 val = ( val << 4 ) | ( low - 'a' + 10 );
2684 else
2685 break;
2686 yyCh = getChar();
2687 }
2688 return Tok_Char | val;
2689#endif
2690 default:
2691 if ( prevCh >= '1' && prevCh <= '9' ) {
2692#ifndef QT_NO_REGEXP_BACKREF
2693 val = prevCh - '0';
2694 while ( yyCh >= '0' && yyCh <= '9' ) {
2695 val = ( val *= 10 ) | ( yyCh - '0' );
2696 yyCh = getChar();
2697 }
2698 return Tok_BackRef | val;
2699#else
2700 error( RXERR_DISABLED );
2701#endif
2702 }
2703 return Tok_Char | prevCh;
2704 }
2705}
2706
2707#ifndef QT_NO_REGEXP_INTERVAL
2708int QRegExpEngine::getRep( int def )
2709{
2710 if ( yyCh >= '0' && yyCh <= '9' ) {
2711 int rep = 0;
2712 do {
2713 rep = 10 * rep + yyCh - '0';
2714 if ( rep >= InftyRep ) {
2715 error( RXERR_REPETITION );
2716 rep = def;
2717 }
2718 yyCh = getChar();
2719 } while ( yyCh >= '0' && yyCh <= '9' );
2720 return rep;
2721 } else {
2722 return def;
2723 }
2724}
2725#endif
2726
2727#ifndef QT_NO_REGEXP_LOOKAHEAD
2728void QRegExpEngine::skipChars( int n )
2729{
2730 if ( n > 0 ) {
2731 yyPos += n - 1;
2732 yyCh = getChar();
2733 }
2734}
2735#endif
2736
2737void QRegExpEngine::error( const char *msg )
2738{
2739 if ( yyError.isEmpty() )
2740 yyError = QString::fromLatin1( msg );
2741}
2742
2743void QRegExpEngine::startTokenizer( const QChar *rx, int len )
2744{
2745 yyIn = rx;
2746 yyPos0 = 0;
2747 yyPos = 0;
2748 yyLen = len;
2749 yyCh = getChar();
2750 yyCharClass = new CharClass;
2751 yyMinRep = 0;
2752 yyMaxRep = 0;
2753 yyError = QString();
2754}
2755
2756int QRegExpEngine::getToken()
2757{
2758#ifndef QT_NO_REGEXP_CCLASS
2759 ushort pendingCh = 0;
2760 bool charPending;
2761 bool rangePending;
2762 int tok;
2763#endif
2764 int prevCh = yyCh;
2765
2766 yyPos0 = yyPos - 1;
2767#ifndef QT_NO_REGEXP_CCLASS
2768 yyCharClass->clear();
2769#endif
2770 yyMinRep = 0;
2771 yyMaxRep = 0;
2772 yyCh = getChar();
2773
2774 switch ( prevCh ) {
2775 case EOS:
2776 yyPos0 = yyPos;
2777 return Tok_Eos;
2778 case '$':
2779 return Tok_Dollar;
2780 case '(':
2781 if ( yyCh == '?' ) {
2782 prevCh = getChar();
2783 yyCh = getChar();
2784 switch ( prevCh ) {
2785#ifndef QT_NO_REGEXP_LOOKAHEAD
2786 case '!':
2787 return Tok_NegLookahead;
2788 case '=':
2789 return Tok_PosLookahead;
2790#endif
2791 case ':':
2792 return Tok_MagicLeftParen;
2793 default:
2794 error( RXERR_LOOKAHEAD );
2795 return Tok_MagicLeftParen;
2796 }
2797 } else {
2798 return Tok_LeftParen;
2799 }
2800 case ')':
2801 return Tok_RightParen;
2802 case '*':
2803 yyMinRep = 0;
2804 yyMaxRep = InftyRep;
2805 return Tok_Quantifier;
2806 case '+':
2807 yyMinRep = 1;
2808 yyMaxRep = InftyRep;
2809 return Tok_Quantifier;
2810 case '.':
2811#ifndef QT_NO_REGEXP_CCLASS
2812 yyCharClass->setNegative( TRUE );
2813#endif
2814 return Tok_CharClass;
2815 case '?':
2816 yyMinRep = 0;
2817 yyMaxRep = 1;
2818 return Tok_Quantifier;
2819 case '[':
2820#ifndef QT_NO_REGEXP_CCLASS
2821 if ( yyCh == '^' ) {
2822 yyCharClass->setNegative( TRUE );
2823 yyCh = getChar();
2824 }
2825 charPending = FALSE;
2826 rangePending = FALSE;
2827 do {
2828 if ( yyCh == '-' && charPending && !rangePending ) {
2829 rangePending = TRUE;
2830 yyCh = getChar();
2831 } else {
2832 if ( charPending && !rangePending ) {
2833 yyCharClass->addSingleton( pendingCh );
2834 charPending = FALSE;
2835 }
2836 if ( yyCh == '\\' ) {
2837 yyCh = getChar();
2838 tok = getEscape();
2839 if ( tok == Tok_Word )
2840 tok = '\b';
2841 } else {
2842 tok = Tok_Char | yyCh;
2843 yyCh = getChar();
2844 }
2845 if ( tok == Tok_CharClass ) {
2846 if ( rangePending ) {
2847 yyCharClass->addSingleton( '-' );
2848 yyCharClass->addSingleton( pendingCh );
2849 charPending = FALSE;
2850 rangePending = FALSE;
2851 }
2852 } else if ( (tok & Tok_Char) != 0 ) {
2853 if ( rangePending ) {
2854 yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
2855 charPending = FALSE;
2856 rangePending = FALSE;
2857 } else {
2858 pendingCh = tok ^ Tok_Char;
2859 charPending = TRUE;
2860 }
2861 } else {
2862 error( RXERR_CHARCLASS );
2863 }
2864 }
2865 } while ( yyCh != ']' && yyCh != EOS );
2866 if ( rangePending )
2867 yyCharClass->addSingleton( '-' );
2868 if ( charPending )
2869 yyCharClass->addSingleton( pendingCh );
2870 if ( yyCh == EOS )
2871 error( RXERR_END );
2872 else
2873 yyCh = getChar();
2874 return Tok_CharClass;
2875#else
2876 error( RXERR_END );
2877 return Tok_Char | '[';
2878#endif
2879 case '\\':
2880 return getEscape();
2881 case ']':
2882 error( RXERR_LEFTDELIM );
2883 return Tok_Char | ']';
2884 case '^':
2885 return Tok_Caret;
2886 case '{':
2887#ifndef QT_NO_REGEXP_INTERVAL
2888 yyMinRep = getRep( 0 );
2889 yyMaxRep = yyMinRep;
2890 if ( yyCh == ',' ) {
2891 yyCh = getChar();
2892 yyMaxRep = getRep( InftyRep );
2893 }
2894 if ( yyMaxRep < yyMinRep )
2895 qSwap( yyMinRep, yyMaxRep );
2896 if ( yyCh != '}' )
2897 error( RXERR_REPETITION );
2898 yyCh = getChar();
2899 return Tok_Quantifier;
2900#else
2901 error( RXERR_DISABLED );
2902 return Tok_Char | '{';
2903#endif
2904 case '|':
2905 return Tok_Bar;
2906 case '}':
2907 error( RXERR_LEFTDELIM );
2908 return Tok_Char | '}';
2909 default:
2910 return Tok_Char | prevCh;
2911 }
2912}
2913
2914int QRegExpEngine::parse( const QChar *pattern, int len )
2915{
2916 valid = TRUE;
2917 startTokenizer( pattern, len );
2918 yyTok = getToken();
2919#ifndef QT_NO_REGEXP_CAPTURE
2920 yyMayCapture = TRUE;
2921#else
2922 yyMayCapture = FALSE;
2923#endif
2924
2925#ifndef QT_NO_REGEXP_CAPTURE
2926 int atom = startAtom( FALSE );
2927#endif
2928 CharClass anything;
2929 Box box( this ); // create InitialState
2930 box.set( anything );
2931 Box rightBox( this ); // create FinalState
2932 rightBox.set( anything );
2933
2934 Box middleBox( this );
2935 parseExpression( &middleBox );
2936#ifndef QT_NO_REGEXP_CAPTURE
2937 finishAtom( atom );
2938#endif
2939#ifndef QT_NO_REGEXP_OPTIM
2940 middleBox.setupHeuristics();
2941#endif
2942 box.cat( middleBox );
2943 box.cat( rightBox );
2944 delete yyCharClass;
2945 yyCharClass = 0;
2946
2947 officialncap = ncap;
2948#ifndef QT_NO_REGEXP_BACKREF
2949 if ( nbrefs > ncap )
2950 ncap = nbrefs;
2951#endif
2952
2953 /*
2954 We use one QMemArray<int> for all the big data used a lot in
2955 matchHere() and friends.
2956 */
2957#ifndef QT_NO_REGEXP_OPTIM
2958 mmSlideTabSize = QMAX( minl + 1, 16 );
2959#else
2960 mmSlideTabSize = 0;
2961#endif
2962 mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
2963
2964 mmInNextStack = mmBigArray.data();
2965 memset( mmInNextStack, -1, ns * sizeof(int) );
2966 mmCurStack = mmInNextStack + ns;
2967 mmNextStack = mmInNextStack + 2 * ns;
2968
2969 mmCurCapBegin = mmInNextStack + 3 * ns;
2970 mmNextCapBegin = mmCurCapBegin + ncap * ns;
2971 mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
2972 mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
2973
2974 mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
2975 mmTempCapEnd = mmTempCapBegin + ncap;
2976 mmCapBegin = mmTempCapBegin + 2 * ncap;
2977 mmCapEnd = mmTempCapBegin + 3 * ncap;
2978
2979 mmSlideTab = mmTempCapBegin + 4 * ncap;
2980
2981 if ( !yyError.isEmpty() )
2982 return -1;
2983
2984#ifndef QT_NO_REGEXP_OPTIM
2985 State *sinit = s[InitialState];
2986 caretAnchored = ( sinit->anchors != 0 );
2987 if ( caretAnchored ) {
2988 QMap<int, int>& anchors = *sinit->anchors;
2989 QMap<int, int>::ConstIterator a;
2990 for ( a = anchors.begin(); a != anchors.end(); ++a ) {
2991#ifndef QT_NO_REGEXP_ANCHOR_ALT
2992 if ( (*a & Anchor_Alternation) != 0 )
2993 break;
2994#endif
2995 if ( (*a & Anchor_Caret) == 0 ) {
2996 caretAnchored = FALSE;
2997 break;
2998 }
2999 }
3000 }
3001#endif
3002 return yyPos0;
3003}
3004
3005void QRegExpEngine::parseAtom( Box *box )
3006{
3007#ifndef QT_NO_REGEXP_LOOKAHEAD
3008 QRegExpEngine *eng = 0;
3009 bool neg;
3010 int len;
3011#endif
3012
3013 if ( (yyTok & Tok_Char) != 0 ) {
3014 box->set( QChar(yyTok ^ Tok_Char) );
3015 } else {
3016#ifndef QT_NO_REGEXP_OPTIM
3017 trivial = FALSE;
3018#endif
3019 switch ( yyTok ) {
3020 case Tok_Dollar:
3021 box->catAnchor( Anchor_Dollar );
3022 break;
3023 case Tok_Caret:
3024 box->catAnchor( Anchor_Caret );
3025 break;
3026#ifndef QT_NO_REGEXP_LOOKAHEAD
3027 case Tok_PosLookahead:
3028 case Tok_NegLookahead:
3029 neg = ( yyTok == Tok_NegLookahead );
3030 eng = new QRegExpEngine( cs );
3031 len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
3032 if ( len >= 0 )
3033 skipChars( len );
3034 else
3035 error( RXERR_LOOKAHEAD );
3036 box->catAnchor( addLookahead(eng, neg) );
3037 yyTok = getToken();
3038 if ( yyTok != Tok_RightParen )
3039 error( RXERR_LOOKAHEAD );
3040 break;
3041#endif
3042#ifndef QT_NO_REGEXP_ESCAPE
3043 case Tok_Word:
3044 box->catAnchor( Anchor_Word );
3045 break;
3046 case Tok_NonWord:
3047 box->catAnchor( Anchor_NonWord );
3048 break;
3049#endif
3050 case Tok_LeftParen:
3051 case Tok_MagicLeftParen:
3052 yyTok = getToken();
3053 parseExpression( box );
3054 if ( yyTok != Tok_RightParen )
3055 error( RXERR_END );
3056 break;
3057 case Tok_CharClass:
3058 box->set( *yyCharClass );
3059 break;
3060 case Tok_Quantifier:
3061 error( RXERR_REPETITION );
3062 break;
3063 default:
3064#ifndef QT_NO_REGEXP_BACKREF
3065 if ( (yyTok & Tok_BackRef) != 0 )
3066 box->set( yyTok ^ Tok_BackRef );
3067 else
3068#endif
3069 error( RXERR_DISABLED );
3070 }
3071 }
3072 yyTok = getToken();
3073}
3074
3075void QRegExpEngine::parseFactor( Box *box )
3076{
3077#ifndef QT_NO_REGEXP_CAPTURE
3078 int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
3079#else
3080 static const int atom = 0;
3081#endif
3082
3083#ifndef QT_NO_REGEXP_INTERVAL
3084#define YYREDO() \
3085 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3086 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3087
3088 const QChar *in = yyIn;
3089 int pos0 = yyPos0;
3090 int pos = yyPos;
3091 int len = yyLen;
3092 int ch = yyCh;
3093 CharClass charClass;
3094 if ( yyTok == Tok_CharClass )
3095 charClass = *yyCharClass;
3096 int tok = yyTok;
3097 bool mayCapture = yyMayCapture;
3098#endif
3099
3100 parseAtom( box );
3101#ifndef QT_NO_REGEXP_CAPTURE
3102 finishAtom( atom );
3103#endif
3104
3105 if ( yyTok == Tok_Quantifier ) {
3106#ifndef QT_NO_REGEXP_OPTIM
3107 trivial = FALSE;
3108#endif
3109 if ( yyMaxRep == InftyRep ) {
3110 box->plus( atom );
3111#ifndef QT_NO_REGEXP_INTERVAL
3112 } else if ( yyMaxRep == 0 ) {
3113 box->clear();
3114#endif
3115 }
3116 if ( yyMinRep == 0 )
3117 box->opt();
3118
3119#ifndef QT_NO_REGEXP_INTERVAL
3120 yyMayCapture = FALSE;
3121 int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
3122 int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
3123
3124 Box rightBox( this );
3125 int i;
3126
3127 for ( i = 0; i < beta; i++ ) {
3128 YYREDO();
3129 Box leftBox( this );
3130 parseAtom( &leftBox );
3131 leftBox.cat( rightBox );
3132 leftBox.opt();
3133 rightBox = leftBox;
3134 }
3135 for ( i = 0; i < alpha; i++ ) {
3136 YYREDO();
3137 Box leftBox( this );
3138 parseAtom( &leftBox );
3139 leftBox.cat( rightBox );
3140 rightBox = leftBox;
3141 }
3142 rightBox.cat( *box );
3143 *box = rightBox;
3144#endif
3145 yyTok = getToken();
3146#ifndef QT_NO_REGEXP_INTERVAL
3147 yyMayCapture = mayCapture;
3148#endif
3149 }
3150#undef YYREDO
3151}
3152
3153void QRegExpEngine::parseTerm( Box *box )
3154{
3155#ifndef QT_NO_REGEXP_OPTIM
3156 if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
3157 parseFactor( box );
3158#endif
3159 while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
3160 Box rightBox( this );
3161 parseFactor( &rightBox );
3162 box->cat( rightBox );
3163 }
3164}
3165
3166void QRegExpEngine::parseExpression( Box *box )
3167{
3168 parseTerm( box );
3169 while ( yyTok == Tok_Bar ) {
3170#ifndef QT_NO_REGEXP_OPTIM
3171 trivial = FALSE;
3172#endif
3173 Box rightBox( this );
3174 yyTok = getToken();
3175 parseTerm( &rightBox );
3176 box->orx( rightBox );
3177 }
3178}
3179
3180/*
3181 The struct QRegExpPrivate contains the private data of a regular
3182 expression other than the automaton. It makes it possible for many
3183 QRegExp objects to use the same QRegExpEngine object with different
3184 QRegExpPrivate objects.
3185*/
3186struct QRegExpPrivate
3187{
3188 QString pattern; // regular-expression or wildcard pattern
3189 QString rxpattern; // regular-expression pattern
3190#ifndef QT_NO_REGEXP_WILDCARD
3191 bool wc : 1; // wildcard mode?
3192#endif
3193 bool min : 1; // minimal matching? (instead of maximal)
3194 bool cs : 1; // case sensitive?
3195#ifndef QT_NO_REGEXP_CAPTURE
3196 QString t; // last string passed to QRegExp::search() or searchRev()
3197 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3198#endif
3199 QMemArray<int> captured; // what QRegExpEngine::search() returned last
3200
3201 QRegExpPrivate() { captured.fill( -1, 2 ); }
3202};
3203
3204#ifndef QT_NO_REGEXP_OPTIM
3205static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache;
3206# ifndef QT_THREAD_SUPPORT
3207static QCache<QRegExpEngine> *engineCache = 0;
3208# endif // QT_THREAD_SUPPORT
3209#endif // QT_NO_REGEXP_OPTIM
3210
3211static void regexpEngine( QRegExpEngine *&eng, const QString &pattern,
3212 bool caseSensitive, bool deref )
3213{
3214# ifdef QT_THREAD_SUPPORT
3215 static QThreadStorage<QCache<QRegExpEngine> *> engineCaches;
3216 QCache<QRegExpEngine> *&engineCache = engineCaches.localData();
3217#endif // QT_THREAD_SUPPORT
3218
3219 if ( !deref ) {
3220#ifndef QT_NO_REGEXP_OPTIM
3221 if ( engineCache != 0 ) {
3222 eng = engineCache->take( pattern );
3223 if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
3224 delete eng;
3225 } else {
3226 eng->ref();
3227 return;
3228 }
3229 }
3230#endif // QT_NO_REGEXP_OPTIM
3231 eng = new QRegExpEngine( pattern, caseSensitive );
3232 return;
3233 }
3234
3235 if ( eng->deref() ) {
3236#ifndef QT_NO_REGEXP_OPTIM
3237 if ( engineCache == 0 ) {
3238 engineCache = new QCache<QRegExpEngine>;
3239 engineCache->setAutoDelete( TRUE );
3240# ifndef QT_THREAD_SUPPORT
3241 cleanup_cache.set( &engineCache );
3242# endif // !QT_THREAD_SUPPORT
3243 }
3244 if ( !pattern.isNull() &&
3245 engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
3246 return;
3247#else
3248 Q_UNUSED( pattern );
3249#endif // QT_NO_REGEXP_OPTIM
3250 delete eng;
3251 eng = 0;
3252 }
3253}
3254
3255/*!
3256 \enum QRegExp::CaretMode
3257
3258 The CaretMode enum defines the different meanings of the caret
3259 (<b>^</b>) in a regular expression. The possible values are:
3260
3261 \value CaretAtZero
3262 The caret corresponds to index 0 in the searched string.
3263
3264 \value CaretAtOffset
3265 The caret corresponds to the start offset of the search.
3266
3267 \value CaretWontMatch
3268 The caret never matches.
3269*/
3270
3271/*!
3272 Constructs an empty regexp.
3273
3274 \sa isValid() errorString()
3275*/
3276QRegExp::QRegExp()
3277 : eng( 0 )
3278{
3279 priv = new QRegExpPrivate;
3280#ifndef QT_NO_REGEXP_WILDCARD
3281 priv->wc = FALSE;
3282#endif
3283 priv->min = FALSE;
3284 priv->cs = TRUE;
3285}
3286
3287/*!
3288 Constructs a regular expression object for the given \a pattern
3289 string. The pattern must be given using wildcard notation if \a
3290 wildcard is TRUE (default is FALSE). The pattern is case
3291 sensitive, unless \a caseSensitive is FALSE. Matching is greedy
3292 (maximal), but can be changed by calling setMinimal().
3293
3294 \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
3295*/
3296QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard )
3297 : eng( 0 )
3298{
3299 priv = new QRegExpPrivate;
3300 priv->pattern = pattern;
3301#ifndef QT_NO_REGEXP_WILDCARD
3302 priv->wc = wildcard;
3303#endif
3304 priv->min = FALSE;
3305 priv->cs = caseSensitive;
3306}
3307
3308/*!
3309 Constructs a regular expression as a copy of \a rx.
3310
3311 \sa operator=()
3312*/
3313QRegExp::QRegExp( const QRegExp& rx )
3314 : eng( 0 )
3315{
3316 priv = new QRegExpPrivate;
3317 operator=( rx );
3318}
3319
3320/*!
3321 Destroys the regular expression and cleans up its internal data.
3322*/
3323QRegExp::~QRegExp()
3324{
3325 invalidateEngine();
3326 delete priv;
3327}
3328
3329/*!
3330 Copies the regular expression \a rx and returns a reference to the
3331 copy. The case sensitivity, wildcard and minimal matching options
3332 are also copied.
3333*/
3334QRegExp& QRegExp::operator=( const QRegExp& rx )
3335{
3336 QRegExpEngine *otherEng = rx.eng;
3337 if ( otherEng != 0 )
3338 otherEng->ref();
3339 invalidateEngine();
3340 eng = otherEng;
3341 priv->pattern = rx.priv->pattern;
3342 priv->rxpattern = rx.priv->rxpattern;
3343#ifndef QT_NO_REGEXP_WILDCARD
3344 priv->wc = rx.priv->wc;
3345#endif
3346 priv->min = rx.priv->min;
3347 priv->cs = rx.priv->cs;
3348#ifndef QT_NO_REGEXP_CAPTURE
3349 priv->t = rx.priv->t;
3350 priv->capturedCache = rx.priv->capturedCache;
3351#endif
3352 priv->captured = rx.priv->captured;
3353 return *this;
3354}
3355
3356/*!
3357 Returns TRUE if this regular expression is equal to \a rx;
3358 otherwise returns FALSE.
3359
3360 Two QRegExp objects are equal if they have the same pattern
3361 strings and the same settings for case sensitivity, wildcard and
3362 minimal matching.
3363*/
3364bool QRegExp::operator==( const QRegExp& rx ) const
3365{
3366 return priv->pattern == rx.priv->pattern &&
3367#ifndef QT_NO_REGEXP_WILDCARD
3368 priv->wc == rx.priv->wc &&
3369#endif
3370 priv->min == rx.priv->min &&
3371 priv->cs == rx.priv->cs;
3372}
3373
3374/*!
3375 \fn bool QRegExp::operator!=( const QRegExp& rx ) const
3376
3377 Returns TRUE if this regular expression is not equal to \a rx;
3378 otherwise returns FALSE.
3379
3380 \sa operator==()
3381*/
3382
3383/*!
3384 Returns TRUE if the pattern string is empty; otherwise returns
3385 FALSE.
3386
3387 If you call exactMatch() with an empty pattern on an empty string
3388 it will return TRUE; otherwise it returns FALSE since it operates
3389 over the whole string. If you call search() with an empty pattern
3390 on \e any string it will return the start offset (0 by default)
3391 because the empty pattern matches the 'emptiness' at the start of
3392 the string. In this case the length of the match returned by
3393 matchedLength() will be 0.
3394
3395 See QString::isEmpty().
3396*/
3397
3398bool QRegExp::isEmpty() const
3399{
3400 return priv->pattern.isEmpty();
3401}
3402
3403/*!
3404 Returns TRUE if the regular expression is valid; otherwise returns
3405 FALSE. An invalid regular expression never matches.
3406
3407 The pattern <b>[a-z</b> is an example of an invalid pattern, since
3408 it lacks a closing square bracket.
3409
3410 Note that the validity of a regexp may also depend on the setting
3411 of the wildcard flag, for example <b>*.html</b> is a valid
3412 wildcard regexp but an invalid full regexp.
3413
3414 \sa errorString()
3415*/
3416bool QRegExp::isValid() const
3417{
3418 if ( priv->pattern.isEmpty() ) {
3419 return TRUE;
3420 } else {
3421 prepareEngine();
3422 return eng->isValid();
3423 }
3424}
3425
3426/*!
3427 Returns the pattern string of the regular expression. The pattern
3428 has either regular expression syntax or wildcard syntax, depending
3429 on wildcard().
3430
3431 \sa setPattern()
3432*/
3433QString QRegExp::pattern() const
3434{
3435 return priv->pattern;
3436}
3437
3438/*!
3439 Sets the pattern string to \a pattern. The case sensitivity,
3440 wildcard and minimal matching options are not changed.
3441
3442 \sa pattern()
3443*/
3444void QRegExp::setPattern( const QString& pattern )
3445{
3446 if ( priv->pattern != pattern ) {
3447 priv->pattern = pattern;
3448 invalidateEngine();
3449 }
3450}
3451
3452/*!
3453 Returns TRUE if case sensitivity is enabled; otherwise returns
3454 FALSE. The default is TRUE.
3455
3456 \sa setCaseSensitive()
3457*/
3458bool QRegExp::caseSensitive() const
3459{
3460 return priv->cs;
3461}
3462
3463/*!
3464 Sets case sensitive matching to \a sensitive.
3465
3466 If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
3467 not \c{README.TXT}.
3468
3469 \sa caseSensitive()
3470*/
3471void QRegExp::setCaseSensitive( bool sensitive )
3472{
3473 if ( sensitive != priv->cs ) {
3474 priv->cs = sensitive;
3475 invalidateEngine();
3476 }
3477}
3478
3479#ifndef QT_NO_REGEXP_WILDCARD
3480/*!
3481 Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
3482 The default is FALSE.
3483
3484 \sa setWildcard()
3485*/
3486bool QRegExp::wildcard() const
3487{
3488 return priv->wc;
3489}
3490
3491/*!
3492 Sets the wildcard mode for the regular expression. The default is
3493 FALSE.
3494
3495 Setting \a wildcard to TRUE enables simple shell-like wildcard
3496 matching. (See \link #wildcard-matching wildcard matching
3497 (globbing) \endlink.)
3498
3499 For example, <b>r*.txt</b> matches the string \c{readme.txt} in
3500 wildcard mode, but does not match \c{readme}.
3501
3502 \sa wildcard()
3503*/
3504void QRegExp::setWildcard( bool wildcard )
3505{
3506 if ( wildcard != priv->wc ) {
3507 priv->wc = wildcard;
3508 invalidateEngine();
3509 }
3510}
3511#endif
3512
3513/*!
3514 Returns TRUE if minimal (non-greedy) matching is enabled;
3515 otherwise returns FALSE.
3516
3517 \sa setMinimal()
3518*/
3519bool QRegExp::minimal() const
3520{
3521 return priv->min;
3522}
3523
3524/*!
3525 Enables or disables minimal matching. If \a minimal is FALSE,
3526 matching is greedy (maximal) which is the default.
3527
3528 For example, suppose we have the input string "We must be
3529 \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
3530 <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
3531 the match is "We must be <u>\<b>bold\</b>, very
3532 \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
3533 first match is: "We must be <u>\<b>bold\</b></u>, very
3534 \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
3535 very <u>\<b>bold\</b></u>!". In practice we might use the pattern
3536 <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
3537 nested tags.
3538
3539 \sa minimal()
3540*/
3541void QRegExp::setMinimal( bool minimal )
3542{
3543 priv->min = minimal;
3544}
3545
3546/*!
3547 Returns TRUE if \a str is matched exactly by this regular
3548 expression; otherwise returns FALSE. You can determine how much of
3549 the string was matched by calling matchedLength().
3550
3551 For a given regexp string, R, exactMatch("R") is the equivalent of
3552 search("^R$") since exactMatch() effectively encloses the regexp
3553 in the start of string and end of string anchors, except that it
3554 sets matchedLength() differently.
3555
3556 For example, if the regular expression is <b>blue</b>, then
3557 exactMatch() returns TRUE only for input \c blue. For inputs \c
3558 bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
3559 and matchedLength() will return 4, 3 and 0 respectively.
3560
3561 Although const, this function sets matchedLength(),
3562 capturedTexts() and pos().
3563
3564 \sa search() searchRev() QRegExpValidator
3565*/
3566bool QRegExp::exactMatch( const QString& str ) const
3567{
3568 prepareEngineForMatch( str );
3569 eng->match( str, 0, priv->min, TRUE, 0, priv->captured );
3570 if ( priv->captured[1] == (int) str.length() ) {
3571 return TRUE;
3572 } else {
3573 priv->captured[0] = 0;
3574 priv->captured[1] = eng->partialMatchLength();
3575 return FALSE;
3576 }
3577}
3578
3579#ifndef QT_NO_COMPAT
3580/*! \obsolete
3581
3582 Attempts to match in \a str, starting from position \a index.
3583 Returns the position of the match, or -1 if there was no match.
3584
3585 The length of the match is stored in \a *len, unless \a len is a
3586 null pointer.
3587
3588 If \a indexIsStart is TRUE (the default), the position \a index in
3589 the string will match the start of string anchor, <b>^</b>, in the
3590 regexp, if present. Otherwise, position 0 in \a str will match.
3591
3592 Use search() and matchedLength() instead of this function.
3593
3594 \sa QString::mid() QConstString
3595*/
3596int QRegExp::match( const QString& str, int index, int *len,
3597 bool indexIsStart ) const
3598{
3599 int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
3600 if ( len != 0 )
3601 *len = matchedLength();
3602 return pos;
3603}
3604#endif // QT_NO_COMPAT
3605
3606int QRegExp::search( const QString& str, int offset ) const
3607{
3608 return search( str, offset, CaretAtZero );
3609}
3610
3611/*!
3612 Attempts to find a match in \a str from position \a offset (0 by
3613 default). If \a offset is -1, the search starts at the last
3614 character; if -2, at the next to last character; etc.
3615
3616 Returns the position of the first match, or -1 if there was no
3617 match.
3618
3619 The \a caretMode parameter can be used to instruct whether <b>^</b>
3620 should match at index 0 or at \a offset.
3621
3622 You might prefer to use QString::find(), QString::contains() or
3623 even QStringList::grep(). To replace matches use
3624 QString::replace().
3625
3626 Example:
3627 \code
3628 QString str = "offsets: 1.23 .50 71.00 6.00";
3629 QRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching
3630 int count = 0;
3631 int pos = 0;
3632 while ( (pos = rx.search(str, pos)) != -1 ) {
3633 count++;
3634 pos += rx.matchedLength();
3635 }
3636 // pos will be 9, 14, 18 and finally 24; count will end up as 4
3637 \endcode
3638
3639 Although const, this function sets matchedLength(),
3640 capturedTexts() and pos().
3641
3642 \sa searchRev() exactMatch()
3643*/
3644
3645int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const
3646{
3647 prepareEngineForMatch( str );
3648 if ( offset < 0 )
3649 offset += str.length();
3650 eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode),
3651 priv->captured );
3652 return priv->captured[0];
3653}
3654
3655
3656int QRegExp::searchRev( const QString& str, int offset ) const
3657{
3658 return searchRev( str, offset, CaretAtZero );
3659}
3660
3661/*!
3662 Attempts to find a match backwards in \a str from position \a
3663 offset. If \a offset is -1 (the default), the search starts at the
3664 last character; if -2, at the next to last character; etc.
3665
3666 Returns the position of the first match, or -1 if there was no
3667 match.
3668
3669 The \a caretMode parameter can be used to instruct whether <b>^</b>
3670 should match at index 0 or at \a offset.
3671
3672 Although const, this function sets matchedLength(),
3673 capturedTexts() and pos().
3674
3675 \warning Searching backwards is much slower than searching
3676 forwards.
3677
3678 \sa search() exactMatch()
3679*/
3680
3681int QRegExp::searchRev( const QString& str, int offset,
3682 CaretMode caretMode ) const
3683{
3684 prepareEngineForMatch( str );
3685 if ( offset < 0 )
3686 offset += str.length();
3687 if ( offset < 0 || offset > (int) str.length() ) {
3688 priv->captured.detach();
3689 priv->captured.fill( -1 );
3690 return -1;
3691 }
3692
3693 while ( offset >= 0 ) {
3694 eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode),
3695 priv->captured );
3696 if ( priv->captured[0] == offset )
3697 return offset;
3698 offset--;
3699 }
3700 return -1;
3701}
3702
3703/*!
3704 Returns the length of the last matched string, or -1 if there was
3705 no match.
3706
3707 \sa exactMatch() search() searchRev()
3708*/
3709int QRegExp::matchedLength() const
3710{
3711 return priv->captured[1];
3712}
3713
3714#ifndef QT_NO_REGEXP_CAPTURE
3715/*!
3716 Returns the number of captures contained in the regular expression.
3717 */
3718int QRegExp::numCaptures() const
3719{
3720 prepareEngine();
3721 return eng->numCaptures();
3722}
3723
3724/*!
3725 Returns a list of the captured text strings.
3726
3727 The first string in the list is the entire matched string. Each
3728 subsequent list element contains a string that matched a
3729 (capturing) subexpression of the regexp.
3730
3731 For example:
3732 \code
3733 QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
3734 int pos = rx.search( "Length: 36 inches" );
3735 QStringList list = rx.capturedTexts();
3736 // list is now ( "36 inches", "36", " ", "inches", "es" )
3737 \endcode
3738
3739 The above example also captures elements that may be present but
3740 which we have no interest in. This problem can be solved by using
3741 non-capturing parentheses:
3742
3743 \code
3744 QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
3745 int pos = rx.search( "Length: 36 inches" );
3746 QStringList list = rx.capturedTexts();
3747 // list is now ( "36 inches", "36", "inches" )
3748 \endcode
3749
3750 Note that if you want to iterate over the list, you should iterate
3751 over a copy, e.g.
3752 \code
3753 QStringList list = rx.capturedTexts();
3754 QStringList::Iterator it = list.begin();
3755 while( it != list.end() ) {
3756 myProcessing( *it );
3757 ++it;
3758 }
3759 \endcode
3760
3761 Some regexps can match an indeterminate number of times. For
3762 example if the input string is "Offsets: 12 14 99 231 7" and the
3763 regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
3764 all the numbers matched. However, after calling
3765 \c{rx.search(str)}, capturedTexts() will return the list ( "12",
3766 "12" ), i.e. the entire match was "12" and the first subexpression
3767 matched was "12". The correct approach is to use cap() in a \link
3768 #cap_in_a_loop loop \endlink.
3769
3770 The order of elements in the string list is as follows. The first
3771 element is the entire matching string. Each subsequent element
3772 corresponds to the next capturing open left parentheses. Thus
3773 capturedTexts()[1] is the text of the first capturing parentheses,
3774 capturedTexts()[2] is the text of the second and so on
3775 (corresponding to $1, $2, etc., in some other regexp languages).
3776
3777 \sa cap() pos() exactMatch() search() searchRev()
3778*/
3779QStringList QRegExp::capturedTexts()
3780{
3781 if ( priv->capturedCache.isEmpty() ) {
3782 for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
3783 QString m;
3784 if ( priv->captured[i + 1] == 0 )
3785 m = QString::fromLatin1( "" );
3786 else if ( priv->captured[i] >= 0 )
3787 m = priv->t.mid( priv->captured[i],
3788 priv->captured[i + 1] );
3789 priv->capturedCache.append( m );
3790 }
3791 priv->t = QString::null;
3792 }
3793 return priv->capturedCache;
3794}
3795
3796/*!
3797 Returns the text captured by the \a nth subexpression. The entire
3798 match has index 0 and the parenthesized subexpressions have
3799 indices starting from 1 (excluding non-capturing parentheses).
3800
3801 \code
3802 QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
3803 int pos = rxlen.search( "Length: 189cm" );
3804 if ( pos > -1 ) {
3805 QString value = rxlen.cap( 1 ); // "189"
3806 QString unit = rxlen.cap( 2 ); // "cm"
3807 // ...
3808 }
3809 \endcode
3810
3811 The order of elements matched by cap() is as follows. The first
3812 element, cap(0), is the entire matching string. Each subsequent
3813 element corresponds to the next capturing open left parentheses.
3814 Thus cap(1) is the text of the first capturing parentheses, cap(2)
3815 is the text of the second, and so on.
3816
3817 \target cap_in_a_loop
3818 Some patterns may lead to a number of matches which cannot be
3819 determined in advance, for example:
3820
3821 \code
3822 QRegExp rx( "(\\d+)" );
3823 str = "Offsets: 12 14 99 231 7";
3824 QStringList list;
3825 pos = 0;
3826 while ( pos >= 0 ) {
3827 pos = rx.search( str, pos );
3828 if ( pos > -1 ) {
3829 list += rx.cap( 1 );
3830 pos += rx.matchedLength();
3831 }
3832 }
3833 // list contains "12", "14", "99", "231", "7"
3834 \endcode
3835
3836 \sa capturedTexts() pos() exactMatch() search() searchRev()
3837*/
3838QString QRegExp::cap( int nth )
3839{
3840 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) {
3841 return QString::null;
3842 } else {
3843 return capturedTexts()[nth];
3844 }
3845}
3846
3847/*!
3848 Returns the position of the \a nth captured text in the searched
3849 string. If \a nth is 0 (the default), pos() returns the position
3850 of the whole match.
3851
3852 Example:
3853 \code
3854 QRegExp rx( "/([a-z]+)/([a-z]+)" );
3855 rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null)
3856 rx.pos( 0 ); // returns 7 (position of /dev/null)
3857 rx.pos( 1 ); // returns 8 (position of dev)
3858 rx.pos( 2 ); // returns 12 (position of null)
3859 \endcode
3860
3861 For zero-length matches, pos() always returns -1. (For example, if
3862 cap(4) would return an empty string, pos(4) returns -1.) This is
3863 due to an implementation tradeoff.
3864
3865 \sa capturedTexts() exactMatch() search() searchRev()
3866*/
3867int QRegExp::pos( int nth )
3868{
3869 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3870 return -1;
3871 else
3872 return priv->captured[2 * nth];
3873}
3874
3875/*!
3876 Returns a text string that explains why a regexp pattern is
3877 invalid the case being; otherwise returns "no error occurred".
3878
3879 \sa isValid()
3880*/
3881QString QRegExp::errorString()
3882{
3883 if ( isValid() ) {
3884 return QString( RXERR_OK );
3885 } else {
3886 return eng->errorString();
3887 }
3888}
3889#endif
3890
3891/*!
3892 Returns the string \a str with every regexp special character
3893 escaped with a backslash. The special characters are $, (, ), *, +,
3894 ., ?, [, \, ], ^, {, | and }.
3895
3896 Example:
3897 \code
3898 s1 = QRegExp::escape( "bingo" ); // s1 == "bingo"
3899 s2 = QRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)"
3900 \endcode
3901
3902 This function is useful to construct regexp patterns dynamically:
3903
3904 \code
3905 QRegExp rx( "(" + QRegExp::escape(name) +
3906 "|" + QRegExp::escape(alias) + ")" );
3907 \endcode
3908*/
3909QString QRegExp::escape( const QString& str )
3910{
3911 static const char meta[] = "$()*+.?[\\]^{|}";
3912 QString quoted = str;
3913 int i = 0;
3914
3915 while ( i < (int) quoted.length() ) {
3916 if ( strchr(meta, quoted[i].latin1()) != 0 )
3917 quoted.insert( i++, "\\" );
3918 i++;
3919 }
3920 return quoted;
3921}
3922
3923void QRegExp::prepareEngine() const
3924{
3925 if ( eng == 0 ) {
3926#ifndef QT_NO_REGEXP_WILDCARD
3927 if ( priv->wc )
3928 priv->rxpattern = wc2rx( priv->pattern );
3929 else
3930#endif
3931 priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
3932 : priv->pattern;
3933 QRegExp *that = (QRegExp *) this;
3934 // that->eng = newEngine( priv->rxpattern, priv->cs );
3935 regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE );
3936 priv->captured.detach();
3937 priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
3938 }
3939}
3940
3941void QRegExp::prepareEngineForMatch( const QString& str ) const
3942{
3943 prepareEngine();
3944#ifndef QT_NO_REGEXP_CAPTURE
3945 priv->t = str;
3946 priv->capturedCache.clear();
3947#else
3948 Q_UNUSED( str );
3949#endif
3950}
3951
3952void QRegExp::invalidateEngine()
3953{
3954 if ( eng != 0 ) {
3955 regexpEngine( eng, priv->rxpattern, priv->cs, TRUE );
3956 priv->rxpattern = QString();
3957 eng = 0;
3958 }
3959}
3960
3961int QRegExp::caretIndex( int offset, CaretMode caretMode )
3962{
3963 if ( caretMode == CaretAtZero ) {
3964 return 0;
3965 } else if ( caretMode == CaretAtOffset ) {
3966 return offset;
3967 } else { // CaretWontMatch
3968 return -1;
3969 }
3970}
3971
3972#endif // QT_NO_REGEXP
Note: See TracBrowser for help on using the repository browser.