| 1 | /**************************************************************************** | 
|---|
| 2 | ** $Id: qregexp.cpp 2 2005-11-16 15:49:26Z dmik $ | 
|---|
| 3 | ** | 
|---|
| 4 | ** Implementation of QRegExp class | 
|---|
| 5 | ** | 
|---|
| 6 | ** Created : 950126 | 
|---|
| 7 | ** | 
|---|
| 8 | ** Copyright (C) 1992-2000 Trolltech AS.  All rights reserved. | 
|---|
| 9 | ** | 
|---|
| 10 | ** This file is part of the tools module of the Qt GUI Toolkit. | 
|---|
| 11 | ** | 
|---|
| 12 | ** This file may be distributed under the terms of the Q Public License | 
|---|
| 13 | ** as defined by Trolltech AS of Norway and appearing in the file | 
|---|
| 14 | ** LICENSE.QPL included in the packaging of this file. | 
|---|
| 15 | ** | 
|---|
| 16 | ** This file may be distributed and/or modified under the terms of the | 
|---|
| 17 | ** GNU General Public License version 2 as published by the Free Software | 
|---|
| 18 | ** Foundation and appearing in the file LICENSE.GPL included in the | 
|---|
| 19 | ** packaging of this file. | 
|---|
| 20 | ** | 
|---|
| 21 | ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition | 
|---|
| 22 | ** licenses may use this file in accordance with the Qt Commercial License | 
|---|
| 23 | ** Agreement provided with the Software. | 
|---|
| 24 | ** | 
|---|
| 25 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE | 
|---|
| 26 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | 
|---|
| 27 | ** | 
|---|
| 28 | ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for | 
|---|
| 29 | **   information about Qt Commercial License Agreements. | 
|---|
| 30 | ** See http://www.trolltech.com/qpl/ for QPL licensing information. | 
|---|
| 31 | ** See http://www.trolltech.com/gpl/ for GPL licensing information. | 
|---|
| 32 | ** | 
|---|
| 33 | ** Contact info@trolltech.com if any conditions of this licensing are | 
|---|
| 34 | ** not clear to you. | 
|---|
| 35 | ** | 
|---|
| 36 | **********************************************************************/ | 
|---|
| 37 |  | 
|---|
| 38 | #include "qregexp.h" | 
|---|
| 39 |  | 
|---|
| 40 | #ifndef QT_NO_REGEXP | 
|---|
| 41 |  | 
|---|
| 42 | #include "qmemarray.h" | 
|---|
| 43 | #include "qbitarray.h" | 
|---|
| 44 | #include "qcache.h" | 
|---|
| 45 | #include "qcleanuphandler.h" | 
|---|
| 46 | #include "qintdict.h" | 
|---|
| 47 | #include "qmap.h" | 
|---|
| 48 | #include "qptrvector.h" | 
|---|
| 49 | #include "qstring.h" | 
|---|
| 50 | #include "qtl.h" | 
|---|
| 51 |  | 
|---|
| 52 | #ifdef QT_THREAD_SUPPORT | 
|---|
| 53 | #include "qthreadstorage.h" | 
|---|
| 54 | #endif // QT_THREAD_SUPPORT | 
|---|
| 55 |  | 
|---|
| 56 | #undef QT_TRANSLATE_NOOP | 
|---|
| 57 | #define QT_TRANSLATE_NOOP( context, sourceText ) sourceText | 
|---|
| 58 |  | 
|---|
| 59 | #include <limits.h> | 
|---|
| 60 |  | 
|---|
| 61 | // error strings for the regexp parser | 
|---|
| 62 | #define RXERR_OK         QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" ) | 
|---|
| 63 | #define RXERR_DISABLED   QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" ) | 
|---|
| 64 | #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" ) | 
|---|
| 65 | #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" ) | 
|---|
| 66 | #define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" ) | 
|---|
| 67 | #define RXERR_OCTAL      QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" ) | 
|---|
| 68 | #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" ) | 
|---|
| 69 | #define RXERR_END        QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" ) | 
|---|
| 70 | #define RXERR_LIMIT      QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" ) | 
|---|
| 71 |  | 
|---|
| 72 | /* | 
|---|
| 73 | WARNING! Be sure to read qregexp.tex before modifying this file. | 
|---|
| 74 | */ | 
|---|
| 75 |  | 
|---|
| 76 | /*! | 
|---|
| 77 | \class QRegExp qregexp.h | 
|---|
| 78 | \reentrant | 
|---|
| 79 | \brief The QRegExp class provides pattern matching using regular expressions. | 
|---|
| 80 |  | 
|---|
| 81 | \ingroup tools | 
|---|
| 82 | \ingroup misc | 
|---|
| 83 | \ingroup shared | 
|---|
| 84 | \mainclass | 
|---|
| 85 | \keyword regular expression | 
|---|
| 86 |  | 
|---|
| 87 | Regular expressions, or "regexps", provide a way to find patterns | 
|---|
| 88 | within text. This is useful in many contexts, for example: | 
|---|
| 89 |  | 
|---|
| 90 | \table | 
|---|
| 91 | \row \i Validation | 
|---|
| 92 | \i A regexp can be used to check whether a piece of text | 
|---|
| 93 | meets some criteria, e.g. is an integer or contains no | 
|---|
| 94 | whitespace. | 
|---|
| 95 | \row \i Searching | 
|---|
| 96 | \i Regexps provide a much more powerful means of searching | 
|---|
| 97 | text than simple string matching does. For example we can | 
|---|
| 98 | create a regexp which says "find one of the words 'mail', | 
|---|
| 99 | 'letter' or 'correspondence' but not any of the words | 
|---|
| 100 | 'email', 'mailman' 'mailer', 'letterbox' etc." | 
|---|
| 101 | \row \i Search and Replace | 
|---|
| 102 | \i A regexp can be used to replace a pattern with a piece of | 
|---|
| 103 | text, for example replace all occurrences of '&' with | 
|---|
| 104 | '\&' except where the '&' is already followed by 'amp;'. | 
|---|
| 105 | \row \i String Splitting | 
|---|
| 106 | \i A regexp can be used to identify where a string should be | 
|---|
| 107 | split into its component fields, e.g. splitting tab-delimited | 
|---|
| 108 | strings. | 
|---|
| 109 | \endtable | 
|---|
| 110 |  | 
|---|
| 111 | We present a very brief introduction to regexps, a description of | 
|---|
| 112 | Qt's regexp language, some code examples, and finally the function | 
|---|
| 113 | documentation itself. QRegExp is modeled on Perl's regexp | 
|---|
| 114 | language, and also fully supports Unicode. QRegExp can also be | 
|---|
| 115 | used in the weaker 'wildcard' (globbing) mode which works in a | 
|---|
| 116 | similar way to command shells. A good text on regexps is \e | 
|---|
| 117 | {Mastering Regular Expressions: Powerful Techniques for Perl and | 
|---|
| 118 | Other Tools} by Jeffrey E. Friedl, ISBN 1565922573. | 
|---|
| 119 |  | 
|---|
| 120 | Experienced regexp users may prefer to skip the introduction and | 
|---|
| 121 | go directly to the relevant information. | 
|---|
| 122 |  | 
|---|
| 123 | \tableofcontents | 
|---|
| 124 |  | 
|---|
| 125 | \section1 Introduction | 
|---|
| 126 |  | 
|---|
| 127 | Regexps are built up from expressions, quantifiers, and assertions. | 
|---|
| 128 | The simplest form of expression is simply a character, e.g. | 
|---|
| 129 | <b>x</b> or <b>5</b>. An expression can also be a set of | 
|---|
| 130 | characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or | 
|---|
| 131 | a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could | 
|---|
| 132 | write this as <b>[A-D]</b>. If we want to match any of the | 
|---|
| 133 | captital letters in the English alphabet we can write | 
|---|
| 134 | <b>[A-Z]</b>. A quantifier tells the regexp engine how many | 
|---|
| 135 | occurrences of the expression we want, e.g. <b>x{1,1}</b> means | 
|---|
| 136 | match an <b>x</b> which occurs at least once and at most once. | 
|---|
| 137 | We'll look at assertions and more complex expressions later. | 
|---|
| 138 |  | 
|---|
| 139 | Note that in general regexps cannot be used to check for balanced | 
|---|
| 140 | brackets or tags. For example if you want to match an opening html | 
|---|
| 141 | \c <b> and its closing \c </b> you can only use a regexp if you | 
|---|
| 142 | know that these tags are not nested; the html fragment, \c{<b>bold | 
|---|
| 143 | <b>bolder</b></b>} will not match as expected. If you know the | 
|---|
| 144 | maximum level of nesting it is possible to create a regexp that | 
|---|
| 145 | will match correctly, but for an unknown level of nesting, regexps | 
|---|
| 146 | will fail. | 
|---|
| 147 |  | 
|---|
| 148 | We'll start by writing a regexp to match integers in the range 0 | 
|---|
| 149 | to 99. We will require at least one digit so we will start with | 
|---|
| 150 | <b>[0-9]{1,1}</b> which means match a digit exactly once. This | 
|---|
| 151 | regexp alone will match integers in the range 0 to 9. To match one | 
|---|
| 152 | or two digits we can increase the maximum number of occurrences so | 
|---|
| 153 | the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at | 
|---|
| 154 | least once and at most twice. However, this regexp as it stands | 
|---|
| 155 | will not match correctly. This regexp will match one or two digits | 
|---|
| 156 | \e within a string. To ensure that we match against the whole | 
|---|
| 157 | string we must use the anchor assertions. We need <b>^</b> (caret) | 
|---|
| 158 | which when it is the first character in the regexp means that the | 
|---|
| 159 | regexp must match from the beginning of the string. And we also | 
|---|
| 160 | need <b>$</b> (dollar) which when it is the last character in the | 
|---|
| 161 | regexp means that the regexp must match until the end of the | 
|---|
| 162 | string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that | 
|---|
| 163 | assertions, such as <b>^</b> and <b>$</b>, do not match any | 
|---|
| 164 | characters. | 
|---|
| 165 |  | 
|---|
| 166 | If you've seen regexps elsewhere they may have looked different from | 
|---|
| 167 | the ones above. This is because some sets of characters and some | 
|---|
| 168 | quantifiers are so common that they have special symbols to | 
|---|
| 169 | represent them. <b>[0-9]</b> can be replaced with the symbol | 
|---|
| 170 | <b>\d</b>. The quantifier to match exactly one occurrence, | 
|---|
| 171 | <b>{1,1}</b>, can be replaced with the expression itself. This means | 
|---|
| 172 | that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0 | 
|---|
| 173 | to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of | 
|---|
| 174 | writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the | 
|---|
| 175 | string match a digit followed by zero or one digits. In practice | 
|---|
| 176 | most people would write it <b>^\d\d?$</b>. The <b>?</b> is a | 
|---|
| 177 | shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no | 
|---|
| 178 | occurrences a maximum of one occurrence. This is used to make an | 
|---|
| 179 | expression optional. The regexp <b>^\d\d?$</b> means "from the | 
|---|
| 180 | beginning of the string match one digit followed by zero or one | 
|---|
| 181 | digits and then the end of the string". | 
|---|
| 182 |  | 
|---|
| 183 | Our second example is matching the words 'mail', 'letter' or | 
|---|
| 184 | 'correspondence' but without matching 'email', 'mailman', | 
|---|
| 185 | 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In | 
|---|
| 186 | full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since | 
|---|
| 187 | each expression itself is automatically quantified by <b>{1,1}</b> | 
|---|
| 188 | we can simply write this as <b>mail</b>; an 'm' followed by an 'a' | 
|---|
| 189 | followed by an 'i' followed by an 'l'. The symbol '|' (bar) is | 
|---|
| 190 | used for \e alternation, so our regexp now becomes | 
|---|
| 191 | <b>mail|letter|correspondence</b> which means match 'mail' \e or | 
|---|
| 192 | 'letter' \e or 'correspondence'. Whilst this regexp will find the | 
|---|
| 193 | words we want it will also find words we don't want such as | 
|---|
| 194 | 'email'. We will start by putting our regexp in parentheses, | 
|---|
| 195 | <b>(mail|letter|correspondence)</b>. Parentheses have two effects, | 
|---|
| 196 | firstly they group expressions together and secondly they identify | 
|---|
| 197 | parts of the regexp that we wish to \link #capturing-text capture | 
|---|
| 198 | \endlink. Our regexp still matches any of the three words but now | 
|---|
| 199 | they are grouped together as a unit. This is useful for building | 
|---|
| 200 | up more complex regexps. It is also useful because it allows us to | 
|---|
| 201 | examine which of the words actually matched. We need to use | 
|---|
| 202 | another assertion, this time <b>\b</b> "word boundary": | 
|---|
| 203 | <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match | 
|---|
| 204 | a word boundary followed by the expression in parentheses followed | 
|---|
| 205 | by another word boundary". The <b>\b</b> assertion matches at a \e | 
|---|
| 206 | position in the regexp not a \e character in the regexp. A word | 
|---|
| 207 | boundary is any non-word character such as a space a newline or | 
|---|
| 208 | the beginning or end of the string. | 
|---|
| 209 |  | 
|---|
| 210 | For our third example we want to replace ampersands with the HTML | 
|---|
| 211 | entity '\&'. The regexp to match is simple: <b>\&</b>, i.e. | 
|---|
| 212 | match one ampersand. Unfortunately this will mess up our text if | 
|---|
| 213 | some of the ampersands have already been turned into HTML | 
|---|
| 214 | entities. So what we really want to say is replace an ampersand | 
|---|
| 215 | providing it is not followed by 'amp;'. For this we need the | 
|---|
| 216 | negative lookahead assertion and our regexp becomes: | 
|---|
| 217 | <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced | 
|---|
| 218 | with '(?!' and finishes at the ')'. It means that the text it | 
|---|
| 219 | contains, 'amp;' in our example, must \e not follow the expression | 
|---|
| 220 | that preceeds it. | 
|---|
| 221 |  | 
|---|
| 222 | Regexps provide a rich language that can be used in a variety of | 
|---|
| 223 | ways. For example suppose we want to count all the occurrences of | 
|---|
| 224 | 'Eric' and 'Eirik' in a string. Two valid regexps to match these | 
|---|
| 225 | are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need | 
|---|
| 226 | the word boundary '\b' so we don't get 'Ericsson' etc. The second | 
|---|
| 227 | regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric' | 
|---|
| 228 | and 'Eirik'. | 
|---|
| 229 |  | 
|---|
| 230 | We will implement some the examples above in the | 
|---|
| 231 | \link #code-examples code examples \endlink section. | 
|---|
| 232 |  | 
|---|
| 233 | \target characters-and-abbreviations-for-sets-of-characters | 
|---|
| 234 | \section1 Characters and Abbreviations for Sets of Characters | 
|---|
| 235 |  | 
|---|
| 236 | \table | 
|---|
| 237 | \header \i Element \i Meaning | 
|---|
| 238 | \row \i <b>c</b> | 
|---|
| 239 | \i Any character represents itself unless it has a special | 
|---|
| 240 | regexp meaning. Thus <b>c</b> matches the character \e c. | 
|---|
| 241 | \row \i <b>\\c</b> | 
|---|
| 242 | \i A character that follows a backslash matches the character | 
|---|
| 243 | itself except where mentioned below. For example if you | 
|---|
| 244 | wished to match a literal caret at the beginning of a string | 
|---|
| 245 | you would write <b>\^</b>. | 
|---|
| 246 | \row \i <b>\\a</b> | 
|---|
| 247 | \i This matches the ASCII bell character (BEL, 0x07). | 
|---|
| 248 | \row \i <b>\\f</b> | 
|---|
| 249 | \i This matches the ASCII form feed character (FF, 0x0C). | 
|---|
| 250 | \row \i <b>\\n</b> | 
|---|
| 251 | \i This matches the ASCII line feed character (LF, 0x0A, Unix newline). | 
|---|
| 252 | \row \i <b>\\r</b> | 
|---|
| 253 | \i This matches the ASCII carriage return character (CR, 0x0D). | 
|---|
| 254 | \row \i <b>\\t</b> | 
|---|
| 255 | \i This matches the ASCII horizontal tab character (HT, 0x09). | 
|---|
| 256 | \row \i <b>\\v</b> | 
|---|
| 257 | \i This matches the ASCII vertical tab character (VT, 0x0B). | 
|---|
| 258 | \row \i <b>\\xhhhh</b> | 
|---|
| 259 | \i This matches the Unicode character corresponding to the | 
|---|
| 260 | hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo | 
|---|
| 261 | (i.e., \zero ooo) matches the ASCII/Latin-1 character | 
|---|
| 262 | corresponding to the octal number ooo (between 0 and 0377). | 
|---|
| 263 | \row \i <b>. (dot)</b> | 
|---|
| 264 | \i This matches any character (including newline). | 
|---|
| 265 | \row \i <b>\\d</b> | 
|---|
| 266 | \i This matches a digit (QChar::isDigit()). | 
|---|
| 267 | \row \i <b>\\D</b> | 
|---|
| 268 | \i This matches a non-digit. | 
|---|
| 269 | \row \i <b>\\s</b> | 
|---|
| 270 | \i This matches a whitespace (QChar::isSpace()). | 
|---|
| 271 | \row \i <b>\\S</b> | 
|---|
| 272 | \i This matches a non-whitespace. | 
|---|
| 273 | \row \i <b>\\w</b> | 
|---|
| 274 | \i This matches a word character (QChar::isLetterOrNumber() or '_'). | 
|---|
| 275 | \row \i <b>\\W</b> | 
|---|
| 276 | \i This matches a non-word character. | 
|---|
| 277 | \row \i <b>\\n</b> | 
|---|
| 278 | \i The n-th \link #capturing-text backreference \endlink, | 
|---|
| 279 | e.g. \1, \2, etc. | 
|---|
| 280 | \endtable | 
|---|
| 281 |  | 
|---|
| 282 | \e {Note that the C++ compiler transforms backslashes in strings | 
|---|
| 283 | so to include a <b>\\</b> in a regexp you will need to enter it | 
|---|
| 284 | twice, i.e. <b>\\\\</b>.} | 
|---|
| 285 |  | 
|---|
| 286 | \target sets-of-characters | 
|---|
| 287 | \section1 Sets of Characters | 
|---|
| 288 |  | 
|---|
| 289 | Square brackets are used to match any character in the set of | 
|---|
| 290 | characters contained within the square brackets. All the character | 
|---|
| 291 | set abbreviations described above can be used within square | 
|---|
| 292 | brackets. Apart from the character set abbreviations and the | 
|---|
| 293 | following two exceptions no characters have special meanings in | 
|---|
| 294 | square brackets. | 
|---|
| 295 |  | 
|---|
| 296 | \table | 
|---|
| 297 | \row \i <b>^</b> | 
|---|
| 298 | \i The caret negates the character set if it occurs as the | 
|---|
| 299 | first character, i.e. immediately after the opening square | 
|---|
| 300 | bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c', | 
|---|
| 301 | but <b>[^abc]</b> matches anything \e except 'a' or 'b' or | 
|---|
| 302 | 'c'. | 
|---|
| 303 | \row \i <b>-</b> | 
|---|
| 304 | \i The dash is used to indicate a range of characters, for | 
|---|
| 305 | example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'. | 
|---|
| 306 | \endtable | 
|---|
| 307 |  | 
|---|
| 308 | Using the predefined character set abbreviations is more portable | 
|---|
| 309 | than using character ranges across platforms and languages. For | 
|---|
| 310 | example, <b>[0-9]</b> matches a digit in Western alphabets but | 
|---|
| 311 | <b>\d</b> matches a digit in \e any alphabet. | 
|---|
| 312 |  | 
|---|
| 313 | Note that in most regexp literature sets of characters are called | 
|---|
| 314 | "character classes". | 
|---|
| 315 |  | 
|---|
| 316 | \target quantifiers | 
|---|
| 317 | \section1 Quantifiers | 
|---|
| 318 |  | 
|---|
| 319 | By default an expression is automatically quantified by | 
|---|
| 320 | <b>{1,1}</b>, i.e. it should occur exactly once. In the following | 
|---|
| 321 | list <b>\e {E}</b> stands for any expression. An expression is a | 
|---|
| 322 | character or an abbreviation for a set of characters or a set of | 
|---|
| 323 | characters in square brackets or any parenthesised expression. | 
|---|
| 324 |  | 
|---|
| 325 | \table | 
|---|
| 326 | \row \i <b>\e {E}?</b> | 
|---|
| 327 | \i Matches zero or one occurrence of \e E. This quantifier | 
|---|
| 328 | means "the previous expression is optional" since it will | 
|---|
| 329 | match whether or not the expression occurs in the string. It | 
|---|
| 330 | is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b> | 
|---|
| 331 | will match 'dent' and 'dents'. | 
|---|
| 332 |  | 
|---|
| 333 | \row \i <b>\e {E}+</b> | 
|---|
| 334 | \i Matches one or more occurrences of \e E. This is the same | 
|---|
| 335 | as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match | 
|---|
| 336 | '0', '00', '000', etc. | 
|---|
| 337 |  | 
|---|
| 338 | \row \i <b>\e {E}*</b> | 
|---|
| 339 | \i Matches zero or more occurrences of \e E. This is the same | 
|---|
| 340 | as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often | 
|---|
| 341 | used by a mistake. Since it matches \e zero or more | 
|---|
| 342 | occurrences it will match no occurrences at all. For example | 
|---|
| 343 | if we want to match strings that end in whitespace and use | 
|---|
| 344 | the regexp <b>\s*$</b> we would get a match on every string. | 
|---|
| 345 | This is because we have said find zero or more whitespace | 
|---|
| 346 | followed by the end of string, so even strings that don't end | 
|---|
| 347 | in whitespace will match. The regexp we want in this case is | 
|---|
| 348 | <b>\s+$</b> to match strings that have at least one | 
|---|
| 349 | whitespace at the end. | 
|---|
| 350 |  | 
|---|
| 351 | \row \i <b>\e {E}{n}</b> | 
|---|
| 352 | \i Matches exactly \e n occurrences of the expression. This | 
|---|
| 353 | is the same as repeating the expression \e n times. For | 
|---|
| 354 | example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also | 
|---|
| 355 | the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>. | 
|---|
| 356 |  | 
|---|
| 357 | \row \i <b>\e {E}{n,}</b> | 
|---|
| 358 | \i Matches at least \e n occurrences of the expression. This | 
|---|
| 359 | is the same as <b>\e {E}{n,MAXINT}</b>. | 
|---|
| 360 |  | 
|---|
| 361 | \row \i <b>\e {E}{,m}</b> | 
|---|
| 362 | \i Matches at most \e m occurrences of the expression. This | 
|---|
| 363 | is the same as <b>\e {E}{0,m}</b>. | 
|---|
| 364 |  | 
|---|
| 365 | \row \i <b>\e {E}{n,m}</b> | 
|---|
| 366 | \i Matches at least \e n occurrences of the expression and at | 
|---|
| 367 | most \e m occurrences of the expression. | 
|---|
| 368 | \endtable | 
|---|
| 369 |  | 
|---|
| 370 | (MAXINT is implementation dependent but will not be smaller than | 
|---|
| 371 | 1024.) | 
|---|
| 372 |  | 
|---|
| 373 | If we wish to apply a quantifier to more than just the preceding | 
|---|
| 374 | character we can use parentheses to group characters together in | 
|---|
| 375 | an expression. For example, <b>tag+</b> matches a 't' followed by | 
|---|
| 376 | an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches | 
|---|
| 377 | at least one occurrence of 'tag'. | 
|---|
| 378 |  | 
|---|
| 379 | Note that quantifiers are "greedy". They will match as much text | 
|---|
| 380 | as they can. For example, <b>0+</b> will match as many zeros as it | 
|---|
| 381 | can from the first zero it finds, e.g. '2.<u>000</u>5'. | 
|---|
| 382 | Quantifiers can be made non-greedy, see setMinimal(). | 
|---|
| 383 |  | 
|---|
| 384 | \target capturing-text | 
|---|
| 385 | \section1 Capturing Text | 
|---|
| 386 |  | 
|---|
| 387 | Parentheses allow us to group elements together so that we can | 
|---|
| 388 | quantify and capture them. For example if we have the expression | 
|---|
| 389 | <b>mail|letter|correspondence</b> that matches a string we know | 
|---|
| 390 | that \e one of the words matched but not which one. Using | 
|---|
| 391 | parentheses allows us to "capture" whatever is matched within | 
|---|
| 392 | their bounds, so if we used <b>(mail|letter|correspondence)</b> | 
|---|
| 393 | and matched this regexp against the string "I sent you some email" | 
|---|
| 394 | we can use the cap() or capturedTexts() functions to extract the | 
|---|
| 395 | matched characters, in this case 'mail'. | 
|---|
| 396 |  | 
|---|
| 397 | We can use captured text within the regexp itself. To refer to the | 
|---|
| 398 | captured text we use \e backreferences which are indexed from 1, | 
|---|
| 399 | the same as for cap(). For example we could search for duplicate | 
|---|
| 400 | words in a string using <b>\b(\w+)\W+\1\b</b> which means match a | 
|---|
| 401 | word boundary followed by one or more word characters followed by | 
|---|
| 402 | one or more non-word characters followed by the same text as the | 
|---|
| 403 | first parenthesised expression followed by a word boundary. | 
|---|
| 404 |  | 
|---|
| 405 | If we want to use parentheses purely for grouping and not for | 
|---|
| 406 | capturing we can use the non-capturing syntax, e.g. | 
|---|
| 407 | <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and | 
|---|
| 408 | end ')'. In this example we match either 'green' or 'blue' but we | 
|---|
| 409 | do not capture the match so we only know whether or not we matched | 
|---|
| 410 | but not which color we actually found. Using non-capturing | 
|---|
| 411 | parentheses is more efficient than using capturing parentheses | 
|---|
| 412 | since the regexp engine has to do less book-keeping. | 
|---|
| 413 |  | 
|---|
| 414 | Both capturing and non-capturing parentheses may be nested. | 
|---|
| 415 |  | 
|---|
| 416 | \target assertions | 
|---|
| 417 | \section1 Assertions | 
|---|
| 418 |  | 
|---|
| 419 | Assertions make some statement about the text at the point where | 
|---|
| 420 | they occur in the regexp but they do not match any characters. In | 
|---|
| 421 | the following list <b>\e {E}</b> stands for any expression. | 
|---|
| 422 |  | 
|---|
| 423 | \table | 
|---|
| 424 | \row \i <b>^</b> | 
|---|
| 425 | \i The caret signifies the beginning of the string. If you | 
|---|
| 426 | wish to match a literal \c{^} you must escape it by | 
|---|
| 427 | writing \c{\\^}. For example, <b>^#include</b> will only | 
|---|
| 428 | match strings which \e begin with the characters '#include'. | 
|---|
| 429 | (When the caret is the first character of a character set it | 
|---|
| 430 | has a special meaning, see \link #sets-of-characters Sets of | 
|---|
| 431 | Characters \endlink.) | 
|---|
| 432 |  | 
|---|
| 433 | \row \i <b>$</b> | 
|---|
| 434 | \i The dollar signifies the end of the string. For example | 
|---|
| 435 | <b>\d\s*$</b> will match strings which end with a digit | 
|---|
| 436 | optionally followed by whitespace. If you wish to match a | 
|---|
| 437 | literal \c{$} you must escape it by writing | 
|---|
| 438 | \c{\\$}. | 
|---|
| 439 |  | 
|---|
| 440 | \row \i <b>\\b</b> | 
|---|
| 441 | \i A word boundary. For example the regexp | 
|---|
| 442 | <b>\\bOK\\b</b> means match immediately after a word | 
|---|
| 443 | boundary (e.g. start of string or whitespace) the letter 'O' | 
|---|
| 444 | then the letter 'K' immediately before another word boundary | 
|---|
| 445 | (e.g. end of string or whitespace). But note that the | 
|---|
| 446 | assertion does not actually match any whitespace so if we | 
|---|
| 447 | write <b>(\\bOK\\b)</b> and we have a match it will only | 
|---|
| 448 | contain 'OK' even if the string is "Its <u>OK</u> now". | 
|---|
| 449 |  | 
|---|
| 450 | \row \i <b>\\B</b> | 
|---|
| 451 | \i A non-word boundary. This assertion is true wherever | 
|---|
| 452 | <b>\\b</b> is false. For example if we searched for | 
|---|
| 453 | <b>\\Bon\\B</b> in "Left on" the match would fail (space | 
|---|
| 454 | and end of string aren't non-word boundaries), but it would | 
|---|
| 455 | match in "t<u>on</u>ne". | 
|---|
| 456 |  | 
|---|
| 457 | \row \i <b>(?=\e E)</b> | 
|---|
| 458 | \i Positive lookahead. This assertion is true if the | 
|---|
| 459 | expression matches at this point in the regexp. For example, | 
|---|
| 460 | <b>const(?=\\s+char)</b> matches 'const' whenever it is | 
|---|
| 461 | followed by 'char', as in 'static <u>const</u> char *'. | 
|---|
| 462 | (Compare with <b>const\\s+char</b>, which matches 'static | 
|---|
| 463 | <u>const char</u> *'.) | 
|---|
| 464 |  | 
|---|
| 465 | \row \i <b>(?!\e E)</b> | 
|---|
| 466 | \i Negative lookahead. This assertion is true if the | 
|---|
| 467 | expression does not match at this point in the regexp. For | 
|---|
| 468 | example, <b>const(?!\\s+char)</b> matches 'const' \e except | 
|---|
| 469 | when it is followed by 'char'. | 
|---|
| 470 | \endtable | 
|---|
| 471 |  | 
|---|
| 472 | \target wildcard-matching | 
|---|
| 473 | \section1 Wildcard Matching (globbing) | 
|---|
| 474 |  | 
|---|
| 475 | Most command shells such as \e bash or \e cmd.exe support "file | 
|---|
| 476 | globbing", the ability to identify a group of files by using | 
|---|
| 477 | wildcards. The setWildcard() function is used to switch between | 
|---|
| 478 | regexp and wildcard mode. Wildcard matching is much simpler than | 
|---|
| 479 | full regexps and has only four features: | 
|---|
| 480 |  | 
|---|
| 481 | \table | 
|---|
| 482 | \row \i <b>c</b> | 
|---|
| 483 | \i Any character represents itself apart from those mentioned | 
|---|
| 484 | below. Thus <b>c</b> matches the character \e c. | 
|---|
| 485 | \row \i <b>?</b> | 
|---|
| 486 | \i This matches any single character. It is the same as | 
|---|
| 487 | <b>.</b> in full regexps. | 
|---|
| 488 | \row \i <b>*</b> | 
|---|
| 489 | \i This matches zero or more of any characters. It is the | 
|---|
| 490 | same as <b>.*</b> in full regexps. | 
|---|
| 491 | \row \i <b>[...]</b> | 
|---|
| 492 | \i Sets of characters can be represented in square brackets, | 
|---|
| 493 | similar to full regexps. Within the character class, like | 
|---|
| 494 | outside, backslash has no special meaning. | 
|---|
| 495 | \endtable | 
|---|
| 496 |  | 
|---|
| 497 | For example if we are in wildcard mode and have strings which | 
|---|
| 498 | contain filenames we could identify HTML files with <b>*.html</b>. | 
|---|
| 499 | This will match zero or more characters followed by a dot followed | 
|---|
| 500 | by 'h', 't', 'm' and 'l'. | 
|---|
| 501 |  | 
|---|
| 502 | \target perl-users | 
|---|
| 503 | \section1 Notes for Perl Users | 
|---|
| 504 |  | 
|---|
| 505 | Most of the character class abbreviations supported by Perl are | 
|---|
| 506 | supported by QRegExp, see \link | 
|---|
| 507 | #characters-and-abbreviations-for-sets-of-characters characters | 
|---|
| 508 | and abbreviations for sets of characters \endlink. | 
|---|
| 509 |  | 
|---|
| 510 | In QRegExp, apart from within character classes, \c{^} always | 
|---|
| 511 | signifies the start of the string, so carets must always be | 
|---|
| 512 | escaped unless used for that purpose. In Perl the meaning of caret | 
|---|
| 513 | varies automagically depending on where it occurs so escaping it | 
|---|
| 514 | is rarely necessary. The same applies to \c{$} which in | 
|---|
| 515 | QRegExp always signifies the end of the string. | 
|---|
| 516 |  | 
|---|
| 517 | QRegExp's quantifiers are the same as Perl's greedy quantifiers. | 
|---|
| 518 | Non-greedy matching cannot be applied to individual quantifiers, | 
|---|
| 519 | but can be applied to all the quantifiers in the pattern. For | 
|---|
| 520 | example, to match the Perl regexp <b>ro+?m</b> requires: | 
|---|
| 521 | \code | 
|---|
| 522 | QRegExp rx( "ro+m" ); | 
|---|
| 523 | rx.setMinimal( TRUE ); | 
|---|
| 524 | \endcode | 
|---|
| 525 |  | 
|---|
| 526 | The equivalent of Perl's \c{/i} option is | 
|---|
| 527 | setCaseSensitive(FALSE). | 
|---|
| 528 |  | 
|---|
| 529 | Perl's \c{/g} option can be emulated using a \link | 
|---|
| 530 | #cap_in_a_loop loop \endlink. | 
|---|
| 531 |  | 
|---|
| 532 | In QRegExp <b>.</b> matches any character, therefore all QRegExp | 
|---|
| 533 | regexps have the equivalent of Perl's \c{/s} option. QRegExp | 
|---|
| 534 | does not have an equivalent to Perl's \c{/m} option, but this | 
|---|
| 535 | can be emulated in various ways for example by splitting the input | 
|---|
| 536 | into lines or by looping with a regexp that searches for newlines. | 
|---|
| 537 |  | 
|---|
| 538 | Because QRegExp is string oriented there are no \A, \Z or \z | 
|---|
| 539 | assertions. The \G assertion is not supported but can be emulated | 
|---|
| 540 | in a loop. | 
|---|
| 541 |  | 
|---|
| 542 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp | 
|---|
| 543 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, | 
|---|
| 544 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or | 
|---|
| 545 | capturedTexts()[2], etc. | 
|---|
| 546 |  | 
|---|
| 547 | To substitute a pattern use QString::replace(). | 
|---|
| 548 |  | 
|---|
| 549 | Perl's extended \c{/x} syntax is not supported, nor are | 
|---|
| 550 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On | 
|---|
| 551 | the other hand, C++'s rules for literal strings can be used to | 
|---|
| 552 | achieve the same: | 
|---|
| 553 | \code | 
|---|
| 554 | QRegExp mark( "\\b" // word boundary | 
|---|
| 555 | "[Mm]ark" // the word we want to match | 
|---|
| 556 | ); | 
|---|
| 557 | \endcode | 
|---|
| 558 |  | 
|---|
| 559 | Both zero-width positive and zero-width negative lookahead | 
|---|
| 560 | assertions (?=pattern) and (?!pattern) are supported with the same | 
|---|
| 561 | syntax as Perl. Perl's lookbehind assertions, "independent" | 
|---|
| 562 | subexpressions and conditional expressions are not supported. | 
|---|
| 563 |  | 
|---|
| 564 | Non-capturing parentheses are also supported, with the same | 
|---|
| 565 | (?:pattern) syntax. | 
|---|
| 566 |  | 
|---|
| 567 | See QStringList::split() and QStringList::join() for equivalents | 
|---|
| 568 | to Perl's split and join functions. | 
|---|
| 569 |  | 
|---|
| 570 | Note: because C++ transforms \\'s they must be written \e twice in | 
|---|
| 571 | code, e.g. <b>\\b</b> must be written <b>\\\\b</b>. | 
|---|
| 572 |  | 
|---|
| 573 | \target code-examples | 
|---|
| 574 | \section1 Code Examples | 
|---|
| 575 |  | 
|---|
| 576 | \code | 
|---|
| 577 | QRegExp rx( "^\\d\\d?$" );  // match integers 0 to 99 | 
|---|
| 578 | rx.search( "123" );         // returns -1 (no match) | 
|---|
| 579 | rx.search( "-6" );          // returns -1 (no match) | 
|---|
| 580 | rx.search( "6" );           // returns 0 (matched as position 0) | 
|---|
| 581 | \endcode | 
|---|
| 582 |  | 
|---|
| 583 | The third string matches '<u>6</u>'. This is a simple validation | 
|---|
| 584 | regexp for integers in the range 0 to 99. | 
|---|
| 585 |  | 
|---|
| 586 | \code | 
|---|
| 587 | QRegExp rx( "^\\S+$" );     // match strings without whitespace | 
|---|
| 588 | rx.search( "Hello world" ); // returns -1 (no match) | 
|---|
| 589 | rx.search( "This_is-OK" );  // returns 0 (matched at position 0) | 
|---|
| 590 | \endcode | 
|---|
| 591 |  | 
|---|
| 592 | The second string matches '<u>This_is-OK</u>'. We've used the | 
|---|
| 593 | character set abbreviation '\S' (non-whitespace) and the anchors | 
|---|
| 594 | to match strings which contain no whitespace. | 
|---|
| 595 |  | 
|---|
| 596 | In the following example we match strings containing 'mail' or | 
|---|
| 597 | 'letter' or 'correspondence' but only match whole words i.e. not | 
|---|
| 598 | 'email' | 
|---|
| 599 |  | 
|---|
| 600 | \code | 
|---|
| 601 | QRegExp rx( "\\b(mail|letter|correspondence)\\b" ); | 
|---|
| 602 | rx.search( "I sent you an email" );     // returns -1 (no match) | 
|---|
| 603 | rx.search( "Please write the letter" ); // returns 17 | 
|---|
| 604 | \endcode | 
|---|
| 605 |  | 
|---|
| 606 | The second string matches "Please write the <u>letter</u>". The | 
|---|
| 607 | word 'letter' is also captured (because of the parentheses). We | 
|---|
| 608 | can see what text we've captured like this: | 
|---|
| 609 |  | 
|---|
| 610 | \code | 
|---|
| 611 | QString captured = rx.cap( 1 ); // captured == "letter" | 
|---|
| 612 | \endcode | 
|---|
| 613 |  | 
|---|
| 614 | This will capture the text from the first set of capturing | 
|---|
| 615 | parentheses (counting capturing left parentheses from left to | 
|---|
| 616 | right). The parentheses are counted from 1 since cap( 0 ) is the | 
|---|
| 617 | whole matched regexp (equivalent to '&' in most regexp engines). | 
|---|
| 618 |  | 
|---|
| 619 | \code | 
|---|
| 620 | QRegExp rx( "&(?!amp;)" );      // match ampersands but not & | 
|---|
| 621 | QString line1 = "This & that"; | 
|---|
| 622 | line1.replace( rx, "&" ); | 
|---|
| 623 | // line1 == "This & that" | 
|---|
| 624 | QString line2 = "His & hers & theirs"; | 
|---|
| 625 | line2.replace( rx, "&" ); | 
|---|
| 626 | // line2 == "His & hers & theirs" | 
|---|
| 627 | \endcode | 
|---|
| 628 |  | 
|---|
| 629 | Here we've passed the QRegExp to QString's replace() function to | 
|---|
| 630 | replace the matched text with new text. | 
|---|
| 631 |  | 
|---|
| 632 | \code | 
|---|
| 633 | QString str = "One Eric another Eirik, and an Ericsson." | 
|---|
| 634 | " How many Eiriks, Eric?"; | 
|---|
| 635 | QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik | 
|---|
| 636 | int pos = 0;    // where we are in the string | 
|---|
| 637 | int count = 0;  // how many Eric and Eirik's we've counted | 
|---|
| 638 | while ( pos >= 0 ) { | 
|---|
| 639 | pos = rx.search( str, pos ); | 
|---|
| 640 | if ( pos >= 0 ) { | 
|---|
| 641 | pos++;      // move along in str | 
|---|
| 642 | count++;    // count our Eric or Eirik | 
|---|
| 643 | } | 
|---|
| 644 | } | 
|---|
| 645 | \endcode | 
|---|
| 646 |  | 
|---|
| 647 | We've used the search() function to repeatedly match the regexp in | 
|---|
| 648 | the string. Note that instead of moving forward by one character | 
|---|
| 649 | at a time \c pos++ we could have written \c {pos += | 
|---|
| 650 | rx.matchedLength()} to skip over the already matched string. The | 
|---|
| 651 | count will equal 3, matching 'One <u>Eric</u> another | 
|---|
| 652 | <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it | 
|---|
| 653 | doesn't match 'Ericsson' or 'Eiriks' because they are not bounded | 
|---|
| 654 | by non-word boundaries. | 
|---|
| 655 |  | 
|---|
| 656 | One common use of regexps is to split lines of delimited data into | 
|---|
| 657 | their component fields. | 
|---|
| 658 |  | 
|---|
| 659 | \code | 
|---|
| 660 | str = "Trolltech AS\twww.trolltech.com\tNorway"; | 
|---|
| 661 | QString company, web, country; | 
|---|
| 662 | rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" ); | 
|---|
| 663 | if ( rx.search( str ) != -1 ) { | 
|---|
| 664 | company = rx.cap( 1 ); | 
|---|
| 665 | web = rx.cap( 2 ); | 
|---|
| 666 | country = rx.cap( 3 ); | 
|---|
| 667 | } | 
|---|
| 668 | \endcode | 
|---|
| 669 |  | 
|---|
| 670 | In this example our input lines have the format company name, web | 
|---|
| 671 | address and country. Unfortunately the regexp is rather long and | 
|---|
| 672 | not very versatile -- the code will break if we add any more | 
|---|
| 673 | fields. A simpler and better solution is to look for the | 
|---|
| 674 | separator, '\t' in this case, and take the surrounding text. The | 
|---|
| 675 | QStringList split() function can take a separator string or regexp | 
|---|
| 676 | as an argument and split a string accordingly. | 
|---|
| 677 |  | 
|---|
| 678 | \code | 
|---|
| 679 | QStringList field = QStringList::split( "\t", str ); | 
|---|
| 680 | \endcode | 
|---|
| 681 |  | 
|---|
| 682 | Here field[0] is the company, field[1] the web address and so on. | 
|---|
| 683 |  | 
|---|
| 684 | To imitate the matching of a shell we can use wildcard mode. | 
|---|
| 685 |  | 
|---|
| 686 | \code | 
|---|
| 687 | QRegExp rx( "*.html" );         // invalid regexp: * doesn't quantify anything | 
|---|
| 688 | rx.setWildcard( TRUE );         // now it's a valid wildcard regexp | 
|---|
| 689 | rx.exactMatch( "index.html" );  // returns TRUE | 
|---|
| 690 | rx.exactMatch( "default.htm" ); // returns FALSE | 
|---|
| 691 | rx.exactMatch( "readme.txt" );  // returns FALSE | 
|---|
| 692 | \endcode | 
|---|
| 693 |  | 
|---|
| 694 | Wildcard matching can be convenient because of its simplicity, but | 
|---|
| 695 | any wildcard regexp can be defined using full regexps, e.g. | 
|---|
| 696 | <b>.*\.html$</b>. Notice that we can't match both \c .html and \c | 
|---|
| 697 | .htm files with a wildcard unless we use <b>*.htm*</b> which will | 
|---|
| 698 | also match 'test.html.bak'. A full regexp gives us the precision | 
|---|
| 699 | we need, <b>.*\\.html?$</b>. | 
|---|
| 700 |  | 
|---|
| 701 | QRegExp can match case insensitively using setCaseSensitive(), and | 
|---|
| 702 | can use non-greedy matching, see setMinimal(). By default QRegExp | 
|---|
| 703 | uses full regexps but this can be changed with setWildcard(). | 
|---|
| 704 | Searching can be forward with search() or backward with | 
|---|
| 705 | searchRev(). Captured text can be accessed using capturedTexts() | 
|---|
| 706 | which returns a string list of all captured strings, or using | 
|---|
| 707 | cap() which returns the captured string for the given index. The | 
|---|
| 708 | pos() function takes a match index and returns the position in the | 
|---|
| 709 | string where the match was made (or -1 if there was no match). | 
|---|
| 710 |  | 
|---|
| 711 | \sa QRegExpValidator QString QStringList | 
|---|
| 712 |  | 
|---|
| 713 | \target member-function-documentation | 
|---|
| 714 | */ | 
|---|
| 715 |  | 
|---|
| 716 | const int NumBadChars = 64; | 
|---|
| 717 | #define BadChar( ch ) ( (ch).unicode() % NumBadChars ) | 
|---|
| 718 |  | 
|---|
| 719 | const int NoOccurrence = INT_MAX; | 
|---|
| 720 | const int EmptyCapture = INT_MAX; | 
|---|
| 721 | const int InftyLen = INT_MAX; | 
|---|
| 722 | const int InftyRep = 1025; | 
|---|
| 723 | const int EOS = -1; | 
|---|
| 724 |  | 
|---|
| 725 | static bool isWord( QChar ch ) | 
|---|
| 726 | { | 
|---|
| 727 | return ch.isLetterOrNumber() || ch == QChar( '_' ); | 
|---|
| 728 | } | 
|---|
| 729 |  | 
|---|
| 730 | /* | 
|---|
| 731 | Merges two QMemArrays of ints and puts the result into the first | 
|---|
| 732 | one. | 
|---|
| 733 | */ | 
|---|
| 734 | static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b ) | 
|---|
| 735 | { | 
|---|
| 736 | int asize = a->size(); | 
|---|
| 737 | int bsize = b.size(); | 
|---|
| 738 | if ( asize == 0 ) { | 
|---|
| 739 | *a = b.copy(); | 
|---|
| 740 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 741 | } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) { | 
|---|
| 742 | a->resize( asize + 1 ); | 
|---|
| 743 | (*a)[asize] = b[0]; | 
|---|
| 744 | #endif | 
|---|
| 745 | } else if ( bsize >= 1 ) { | 
|---|
| 746 | int csize = asize + bsize; | 
|---|
| 747 | QMemArray<int> c( csize ); | 
|---|
| 748 | int i = 0, j = 0, k = 0; | 
|---|
| 749 | while ( i < asize ) { | 
|---|
| 750 | if ( j < bsize ) { | 
|---|
| 751 | if ( (*a)[i] == b[j] ) { | 
|---|
| 752 | i++; | 
|---|
| 753 | csize--; | 
|---|
| 754 | } else if ( (*a)[i] < b[j] ) { | 
|---|
| 755 | c[k++] = (*a)[i++]; | 
|---|
| 756 | } else { | 
|---|
| 757 | c[k++] = b[j++]; | 
|---|
| 758 | } | 
|---|
| 759 | } else { | 
|---|
| 760 | memcpy( c.data() + k, (*a).data() + i, | 
|---|
| 761 | (asize - i) * sizeof(int) ); | 
|---|
| 762 | break; | 
|---|
| 763 | } | 
|---|
| 764 | } | 
|---|
| 765 | c.resize( csize ); | 
|---|
| 766 | if ( j < bsize ) | 
|---|
| 767 | memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) ); | 
|---|
| 768 | *a = c; | 
|---|
| 769 | } | 
|---|
| 770 | } | 
|---|
| 771 |  | 
|---|
| 772 | /* | 
|---|
| 773 | Merges two disjoint QMaps of (int, int) pairs and puts the result | 
|---|
| 774 | into the first one. | 
|---|
| 775 | */ | 
|---|
| 776 | static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b ) | 
|---|
| 777 | { | 
|---|
| 778 | QMap<int, int>::ConstIterator it; | 
|---|
| 779 | for ( it = b.begin(); it != b.end(); ++it ) | 
|---|
| 780 | a->insert( it.key(), *it ); | 
|---|
| 781 | } | 
|---|
| 782 |  | 
|---|
| 783 | /* | 
|---|
| 784 | Returns the value associated to key k in QMap m of (int, int) | 
|---|
| 785 | pairs, or 0 if no such value is explicitly present. | 
|---|
| 786 | */ | 
|---|
| 787 | static int at( const QMap<int, int>& m, int k ) | 
|---|
| 788 | { | 
|---|
| 789 | QMap<int, int>::ConstIterator it = m.find( k ); | 
|---|
| 790 | if ( it == m.end() ) | 
|---|
| 791 | return 0; | 
|---|
| 792 | else | 
|---|
| 793 | return *it; | 
|---|
| 794 | } | 
|---|
| 795 |  | 
|---|
| 796 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 797 | /* | 
|---|
| 798 | Translates a wildcard pattern to an equivalent regular expression | 
|---|
| 799 | pattern (e.g., *.cpp to .*\.cpp). | 
|---|
| 800 | */ | 
|---|
| 801 | static QString wc2rx( const QString& wc_str ) | 
|---|
| 802 | { | 
|---|
| 803 | int wclen = wc_str.length(); | 
|---|
| 804 | QString rx = QString::fromLatin1( "" ); | 
|---|
| 805 | int i = 0; | 
|---|
| 806 | const QChar *wc = wc_str.unicode(); | 
|---|
| 807 | while ( i < wclen ) { | 
|---|
| 808 | QChar c = wc[i++]; | 
|---|
| 809 | switch ( c.unicode() ) { | 
|---|
| 810 | case '*': | 
|---|
| 811 | rx += QString::fromLatin1( ".*" ); | 
|---|
| 812 | break; | 
|---|
| 813 | case '?': | 
|---|
| 814 | rx += QChar( '.' ); | 
|---|
| 815 | break; | 
|---|
| 816 | case '$': | 
|---|
| 817 | case '(': | 
|---|
| 818 | case ')': | 
|---|
| 819 | case '+': | 
|---|
| 820 | case '.': | 
|---|
| 821 | case '\\': | 
|---|
| 822 | case '^': | 
|---|
| 823 | case '{': | 
|---|
| 824 | case '|': | 
|---|
| 825 | case '}': | 
|---|
| 826 | rx += QChar( '\\' ); | 
|---|
| 827 | rx += c; | 
|---|
| 828 | break; | 
|---|
| 829 | case '[': | 
|---|
| 830 | rx += c; | 
|---|
| 831 | if ( wc[i] == QChar('^') ) | 
|---|
| 832 | rx += wc[i++]; | 
|---|
| 833 | if ( i < wclen ) { | 
|---|
| 834 | if ( rx[i] == ']' ) | 
|---|
| 835 | rx += wc[i++]; | 
|---|
| 836 | while ( i < wclen && wc[i] != QChar(']') ) { | 
|---|
| 837 | if ( wc[i] == '\\' ) | 
|---|
| 838 | rx += QChar( '\\' ); | 
|---|
| 839 | rx += wc[i++]; | 
|---|
| 840 | } | 
|---|
| 841 | } | 
|---|
| 842 | break; | 
|---|
| 843 | default: | 
|---|
| 844 | rx += c; | 
|---|
| 845 | } | 
|---|
| 846 | } | 
|---|
| 847 | return rx; | 
|---|
| 848 | } | 
|---|
| 849 | #endif | 
|---|
| 850 |  | 
|---|
| 851 | /* | 
|---|
| 852 | The class QRegExpEngine encapsulates a modified nondeterministic | 
|---|
| 853 | finite automaton (NFA). | 
|---|
| 854 | */ | 
|---|
| 855 | class QRegExpEngine : public QShared | 
|---|
| 856 | { | 
|---|
| 857 | public: | 
|---|
| 858 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 859 | /* | 
|---|
| 860 | The class CharClass represents a set of characters, such as can | 
|---|
| 861 | be found in regular expressions (e.g., [a-z] denotes the set | 
|---|
| 862 | {a, b, ..., z}). | 
|---|
| 863 | */ | 
|---|
| 864 | class CharClass | 
|---|
| 865 | { | 
|---|
| 866 | public: | 
|---|
| 867 | CharClass(); | 
|---|
| 868 | CharClass( const CharClass& cc ) { operator=( cc ); } | 
|---|
| 869 |  | 
|---|
| 870 | CharClass& operator=( const CharClass& cc ); | 
|---|
| 871 |  | 
|---|
| 872 | void clear(); | 
|---|
| 873 | bool negative() const { return n; } | 
|---|
| 874 | void setNegative( bool negative ); | 
|---|
| 875 | void addCategories( int cats ); | 
|---|
| 876 | void addRange( ushort from, ushort to ); | 
|---|
| 877 | void addSingleton( ushort ch ) { addRange( ch, ch ); } | 
|---|
| 878 |  | 
|---|
| 879 | bool in( QChar ch ) const; | 
|---|
| 880 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 881 | const QMemArray<int>& firstOccurrence() const { return occ1; } | 
|---|
| 882 | #endif | 
|---|
| 883 |  | 
|---|
| 884 | #if defined(QT_DEBUG) | 
|---|
| 885 | void dump() const; | 
|---|
| 886 | #endif | 
|---|
| 887 |  | 
|---|
| 888 | private: | 
|---|
| 889 | /* | 
|---|
| 890 | The struct Range represents a range of characters (e.g., | 
|---|
| 891 | [0-9] denotes range 48 to 57). | 
|---|
| 892 | */ | 
|---|
| 893 | struct Range | 
|---|
| 894 | { | 
|---|
| 895 | ushort from; // 48 | 
|---|
| 896 | ushort to; // 57 | 
|---|
| 897 | }; | 
|---|
| 898 |  | 
|---|
| 899 | int c; // character classes | 
|---|
| 900 | QMemArray<Range> r; // character ranges | 
|---|
| 901 | bool n; // negative? | 
|---|
| 902 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 903 | QMemArray<int> occ1; // first-occurrence array | 
|---|
| 904 | #endif | 
|---|
| 905 | }; | 
|---|
| 906 | #else | 
|---|
| 907 | struct CharClass | 
|---|
| 908 | { | 
|---|
| 909 | int dummy; | 
|---|
| 910 |  | 
|---|
| 911 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 912 | CharClass() { occ1.fill( 0, NumBadChars ); } | 
|---|
| 913 |  | 
|---|
| 914 | const QMemArray<int>& firstOccurrence() const { return occ1; } | 
|---|
| 915 | QMemArray<int> occ1; | 
|---|
| 916 | #endif | 
|---|
| 917 | }; | 
|---|
| 918 | #endif | 
|---|
| 919 |  | 
|---|
| 920 | QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); } | 
|---|
| 921 | QRegExpEngine( const QString& rx, bool caseSensitive ); | 
|---|
| 922 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 923 | ~QRegExpEngine(); | 
|---|
| 924 | #endif | 
|---|
| 925 |  | 
|---|
| 926 | bool isValid() const { return valid; } | 
|---|
| 927 | bool caseSensitive() const { return cs; } | 
|---|
| 928 | const QString& errorString() const { return yyError; } | 
|---|
| 929 | int numCaptures() const { return officialncap; } | 
|---|
| 930 | void match( const QString& str, int pos, bool minimal, bool oneTest, | 
|---|
| 931 | int caretIndex, QMemArray<int>& captured ); | 
|---|
| 932 | int partialMatchLength() const { return mmOneTestMatchedLen; } | 
|---|
| 933 |  | 
|---|
| 934 | int createState( QChar ch ); | 
|---|
| 935 | int createState( const CharClass& cc ); | 
|---|
| 936 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 937 | int createState( int bref ); | 
|---|
| 938 | #endif | 
|---|
| 939 |  | 
|---|
| 940 | void addCatTransitions( const QMemArray<int>& from, | 
|---|
| 941 | const QMemArray<int>& to ); | 
|---|
| 942 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 943 | void addPlusTransitions( const QMemArray<int>& from, | 
|---|
| 944 | const QMemArray<int>& to, int atom ); | 
|---|
| 945 | #endif | 
|---|
| 946 |  | 
|---|
| 947 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 948 | int anchorAlternation( int a, int b ); | 
|---|
| 949 | int anchorConcatenation( int a, int b ); | 
|---|
| 950 | #else | 
|---|
| 951 | int anchorAlternation( int a, int b ) { return a & b; } | 
|---|
| 952 | int anchorConcatenation( int a, int b ) { return a | b; } | 
|---|
| 953 | #endif | 
|---|
| 954 | void addAnchors( int from, int to, int a ); | 
|---|
| 955 |  | 
|---|
| 956 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 957 | void heuristicallyChooseHeuristic(); | 
|---|
| 958 | #endif | 
|---|
| 959 |  | 
|---|
| 960 | #if defined(QT_DEBUG) | 
|---|
| 961 | void dump() const; | 
|---|
| 962 | #endif | 
|---|
| 963 |  | 
|---|
| 964 | private: | 
|---|
| 965 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; | 
|---|
| 966 |  | 
|---|
| 967 | /* | 
|---|
| 968 | The struct State represents one state in a modified NFA. The | 
|---|
| 969 | input characters matched are stored in the state instead of on | 
|---|
| 970 | the transitions, something possible for an automaton | 
|---|
| 971 | constructed from a regular expression. | 
|---|
| 972 | */ | 
|---|
| 973 | struct State | 
|---|
| 974 | { | 
|---|
| 975 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 976 | int atom; // which atom does this state belong to? | 
|---|
| 977 | #endif | 
|---|
| 978 | int match; // what does it match? (see CharClassBit and BackRefBit) | 
|---|
| 979 | QMemArray<int> outs; // out-transitions | 
|---|
| 980 | QMap<int, int> *reenter; // atoms reentered when transiting out | 
|---|
| 981 | QMap<int, int> *anchors; // anchors met when transiting out | 
|---|
| 982 |  | 
|---|
| 983 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 984 | State( int a, int m ) | 
|---|
| 985 | : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { } | 
|---|
| 986 | #else | 
|---|
| 987 | State( int m ) | 
|---|
| 988 | : match( m ), reenter( 0 ), anchors( 0 ) { } | 
|---|
| 989 | #endif | 
|---|
| 990 | ~State() { delete reenter; delete anchors; } | 
|---|
| 991 | }; | 
|---|
| 992 |  | 
|---|
| 993 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 994 | /* | 
|---|
| 995 | The struct Lookahead represents a lookahead a la Perl (e.g., | 
|---|
| 996 | (?=foo) and (?!bar)). | 
|---|
| 997 | */ | 
|---|
| 998 | struct Lookahead | 
|---|
| 999 | { | 
|---|
| 1000 | QRegExpEngine *eng; // NFA representing the embedded regular expression | 
|---|
| 1001 | bool neg; // negative lookahead? | 
|---|
| 1002 |  | 
|---|
| 1003 | Lookahead( QRegExpEngine *eng0, bool neg0 ) | 
|---|
| 1004 | : eng( eng0 ), neg( neg0 ) { } | 
|---|
| 1005 | ~Lookahead() { delete eng; } | 
|---|
| 1006 | }; | 
|---|
| 1007 | #endif | 
|---|
| 1008 |  | 
|---|
| 1009 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1010 | /* | 
|---|
| 1011 | The struct Atom represents one node in the hierarchy of regular | 
|---|
| 1012 | expression atoms. | 
|---|
| 1013 | */ | 
|---|
| 1014 | struct Atom | 
|---|
| 1015 | { | 
|---|
| 1016 | int parent; // index of parent in array of atoms | 
|---|
| 1017 | int capture; // index of capture, from 1 to ncap | 
|---|
| 1018 | }; | 
|---|
| 1019 | #endif | 
|---|
| 1020 |  | 
|---|
| 1021 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 1022 | /* | 
|---|
| 1023 | The struct AnchorAlternation represents a pair of anchors with | 
|---|
| 1024 | OR semantics. | 
|---|
| 1025 | */ | 
|---|
| 1026 | struct AnchorAlternation | 
|---|
| 1027 | { | 
|---|
| 1028 | int a; // this anchor... | 
|---|
| 1029 | int b; // ...or this one | 
|---|
| 1030 | }; | 
|---|
| 1031 | #endif | 
|---|
| 1032 |  | 
|---|
| 1033 | enum { InitialState = 0, FinalState = 1 }; | 
|---|
| 1034 | void setup( bool caseSensitive ); | 
|---|
| 1035 | int setupState( int match ); | 
|---|
| 1036 |  | 
|---|
| 1037 | /* | 
|---|
| 1038 | Let's hope that 13 lookaheads and 14 back-references are | 
|---|
| 1039 | enough. | 
|---|
| 1040 | */ | 
|---|
| 1041 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; | 
|---|
| 1042 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, | 
|---|
| 1043 | Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008, | 
|---|
| 1044 | Anchor_FirstLookahead = 0x00000010, | 
|---|
| 1045 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, | 
|---|
| 1046 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, | 
|---|
| 1047 | Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs, | 
|---|
| 1048 |  | 
|---|
| 1049 | Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^ | 
|---|
| 1050 | ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) }; | 
|---|
| 1051 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1052 | int startAtom( bool capture ); | 
|---|
| 1053 | void finishAtom( int atom ) { cf = f[atom].parent; } | 
|---|
| 1054 | #endif | 
|---|
| 1055 |  | 
|---|
| 1056 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 1057 | int addLookahead( QRegExpEngine *eng, bool negative ); | 
|---|
| 1058 | #endif | 
|---|
| 1059 |  | 
|---|
| 1060 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1061 | bool isBetterCapture( const int *begin1, const int *end1, const int *begin2, | 
|---|
| 1062 | const int *end2 ); | 
|---|
| 1063 | #endif | 
|---|
| 1064 | bool testAnchor( int i, int a, const int *capBegin ); | 
|---|
| 1065 |  | 
|---|
| 1066 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1067 | bool goodStringMatch(); | 
|---|
| 1068 | bool badCharMatch(); | 
|---|
| 1069 | #else | 
|---|
| 1070 | bool bruteMatch(); | 
|---|
| 1071 | #endif | 
|---|
| 1072 | bool matchHere(); | 
|---|
| 1073 |  | 
|---|
| 1074 | QPtrVector<State> s; // array of states | 
|---|
| 1075 | int ns; // number of states | 
|---|
| 1076 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1077 | QMemArray<Atom> f; // atom hierarchy | 
|---|
| 1078 | int nf; // number of atoms | 
|---|
| 1079 | int cf; // current atom | 
|---|
| 1080 | #endif | 
|---|
| 1081 | int officialncap; // number of captures, seen from the outside | 
|---|
| 1082 | int ncap; // number of captures, seen from the inside | 
|---|
| 1083 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 1084 | QPtrVector<CharClass> cl; // array of character classes | 
|---|
| 1085 | #endif | 
|---|
| 1086 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 1087 | QPtrVector<Lookahead> ahead; // array of lookaheads | 
|---|
| 1088 | #endif | 
|---|
| 1089 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 1090 | QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors | 
|---|
| 1091 | #endif | 
|---|
| 1092 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1093 | bool caretAnchored; // does the regexp start with ^? | 
|---|
| 1094 | bool trivial; // is the good-string all that needs to match? | 
|---|
| 1095 | #endif | 
|---|
| 1096 | bool valid; // is the regular expression valid? | 
|---|
| 1097 | bool cs; // case sensitive? | 
|---|
| 1098 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1099 | int nbrefs; // number of back-references | 
|---|
| 1100 | #endif | 
|---|
| 1101 |  | 
|---|
| 1102 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1103 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch | 
|---|
| 1104 |  | 
|---|
| 1105 | int goodEarlyStart; // the index where goodStr can first occur in a match | 
|---|
| 1106 | int goodLateStart; // the index where goodStr can last occur in a match | 
|---|
| 1107 | QString goodStr; // the string that any match has to contain | 
|---|
| 1108 |  | 
|---|
| 1109 | int minl; // the minimum length of a match | 
|---|
| 1110 | QMemArray<int> occ1; // first-occurrence array | 
|---|
| 1111 | #endif | 
|---|
| 1112 |  | 
|---|
| 1113 | /* | 
|---|
| 1114 | The class Box is an abstraction for a regular expression | 
|---|
| 1115 | fragment. It can also be seen as one node in the syntax tree of | 
|---|
| 1116 | a regular expression with synthetized attributes. | 
|---|
| 1117 |  | 
|---|
| 1118 | Its interface is ugly for performance reasons. | 
|---|
| 1119 | */ | 
|---|
| 1120 | class Box | 
|---|
| 1121 | { | 
|---|
| 1122 | public: | 
|---|
| 1123 | Box( QRegExpEngine *engine ); | 
|---|
| 1124 | Box( const Box& b ) { operator=( b ); } | 
|---|
| 1125 |  | 
|---|
| 1126 | Box& operator=( const Box& b ); | 
|---|
| 1127 |  | 
|---|
| 1128 | void clear() { operator=( Box(eng) ); } | 
|---|
| 1129 | void set( QChar ch ); | 
|---|
| 1130 | void set( const CharClass& cc ); | 
|---|
| 1131 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1132 | void set( int bref ); | 
|---|
| 1133 | #endif | 
|---|
| 1134 |  | 
|---|
| 1135 | void cat( const Box& b ); | 
|---|
| 1136 | void orx( const Box& b ); | 
|---|
| 1137 | void plus( int atom ); | 
|---|
| 1138 | void opt(); | 
|---|
| 1139 | void catAnchor( int a ); | 
|---|
| 1140 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1141 | void setupHeuristics(); | 
|---|
| 1142 | #endif | 
|---|
| 1143 |  | 
|---|
| 1144 | #if defined(QT_DEBUG) | 
|---|
| 1145 | void dump() const; | 
|---|
| 1146 | #endif | 
|---|
| 1147 |  | 
|---|
| 1148 | private: | 
|---|
| 1149 | void addAnchorsToEngine( const Box& to ) const; | 
|---|
| 1150 |  | 
|---|
| 1151 | QRegExpEngine *eng; // the automaton under construction | 
|---|
| 1152 | QMemArray<int> ls; // the left states (firstpos) | 
|---|
| 1153 | QMemArray<int> rs; // the right states (lastpos) | 
|---|
| 1154 | QMap<int, int> lanchors; // the left anchors | 
|---|
| 1155 | QMap<int, int> ranchors; // the right anchors | 
|---|
| 1156 | int skipanchors; // the anchors to match if the box is skipped | 
|---|
| 1157 |  | 
|---|
| 1158 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1159 | int earlyStart; // the index where str can first occur | 
|---|
| 1160 | int lateStart; // the index where str can last occur | 
|---|
| 1161 | QString str; // a string that has to occur in any match | 
|---|
| 1162 | QString leftStr; // a string occurring at the left of this box | 
|---|
| 1163 | QString rightStr; // a string occurring at the right of this box | 
|---|
| 1164 | int maxl; // the maximum length of this box (possibly InftyLen) | 
|---|
| 1165 | #endif | 
|---|
| 1166 |  | 
|---|
| 1167 | int minl; // the minimum length of this box | 
|---|
| 1168 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1169 | QMemArray<int> occ1; // first-occurrence array | 
|---|
| 1170 | #endif | 
|---|
| 1171 | }; | 
|---|
| 1172 | friend class Box; | 
|---|
| 1173 |  | 
|---|
| 1174 | /* | 
|---|
| 1175 | This is the lexical analyzer for regular expressions. | 
|---|
| 1176 | */ | 
|---|
| 1177 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, | 
|---|
| 1178 | Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass, | 
|---|
| 1179 | Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord, | 
|---|
| 1180 | Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; | 
|---|
| 1181 | int getChar(); | 
|---|
| 1182 | int getEscape(); | 
|---|
| 1183 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 1184 | int getRep( int def ); | 
|---|
| 1185 | #endif | 
|---|
| 1186 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 1187 | void skipChars( int n ); | 
|---|
| 1188 | #endif | 
|---|
| 1189 | void error( const char *msg ); | 
|---|
| 1190 | void startTokenizer( const QChar *rx, int len ); | 
|---|
| 1191 | int getToken(); | 
|---|
| 1192 |  | 
|---|
| 1193 | const QChar *yyIn; // a pointer to the input regular expression pattern | 
|---|
| 1194 | int yyPos0; // the position of yyTok in the input pattern | 
|---|
| 1195 | int yyPos; // the position of the next character to read | 
|---|
| 1196 | int yyLen; // the length of yyIn | 
|---|
| 1197 | int yyCh; // the last character read | 
|---|
| 1198 | CharClass *yyCharClass; // attribute for Tok_CharClass tokens | 
|---|
| 1199 | int yyMinRep; // attribute for Tok_Quantifier | 
|---|
| 1200 | int yyMaxRep; // ditto | 
|---|
| 1201 | QString yyError; // syntax error or overflow during parsing? | 
|---|
| 1202 |  | 
|---|
| 1203 | /* | 
|---|
| 1204 | This is the syntactic analyzer for regular expressions. | 
|---|
| 1205 | */ | 
|---|
| 1206 | int parse( const QChar *rx, int len ); | 
|---|
| 1207 | void parseAtom( Box *box ); | 
|---|
| 1208 | void parseFactor( Box *box ); | 
|---|
| 1209 | void parseTerm( Box *box ); | 
|---|
| 1210 | void parseExpression( Box *box ); | 
|---|
| 1211 |  | 
|---|
| 1212 | int yyTok; // the last token read | 
|---|
| 1213 | bool yyMayCapture; // set this to FALSE to disable capturing | 
|---|
| 1214 |  | 
|---|
| 1215 | /* | 
|---|
| 1216 | This is the engine state during matching. | 
|---|
| 1217 | */ | 
|---|
| 1218 | const QString *mmStr; // a pointer to the input QString | 
|---|
| 1219 | const QChar *mmIn; // a pointer to the input string data | 
|---|
| 1220 | int mmPos; // the current position in the string | 
|---|
| 1221 | int mmCaretPos; | 
|---|
| 1222 | int mmLen; // the length of the input string | 
|---|
| 1223 | bool mmMinimal; // minimal matching? | 
|---|
| 1224 | QMemArray<int> mmBigArray; // big QMemArray<int> array | 
|---|
| 1225 | int *mmInNextStack; // is state is mmNextStack? | 
|---|
| 1226 | int *mmCurStack; // stack of current states | 
|---|
| 1227 | int *mmNextStack; // stack of next states | 
|---|
| 1228 | int *mmCurCapBegin; // start of current states' captures | 
|---|
| 1229 | int *mmNextCapBegin; // start of next states' captures | 
|---|
| 1230 | int *mmCurCapEnd; // end of current states' captures | 
|---|
| 1231 | int *mmNextCapEnd; // end of next states' captures | 
|---|
| 1232 | int *mmTempCapBegin; // start of temporary captures | 
|---|
| 1233 | int *mmTempCapEnd; // end of temporary captures | 
|---|
| 1234 | int *mmCapBegin; // start of captures for a next state | 
|---|
| 1235 | int *mmCapEnd; // end of captures for a next state | 
|---|
| 1236 | int *mmSlideTab; // bump-along slide table for bad-character heuristic | 
|---|
| 1237 | int mmSlideTabSize; // size of slide table | 
|---|
| 1238 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1239 | QIntDict<int> mmSleeping; // dictionary of back-reference sleepers | 
|---|
| 1240 | #endif | 
|---|
| 1241 | int mmMatchLen; // length of match | 
|---|
| 1242 | int mmOneTestMatchedLen; // length of partial match | 
|---|
| 1243 | }; | 
|---|
| 1244 |  | 
|---|
| 1245 | QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive ) | 
|---|
| 1246 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1247 | : mmSleeping( 101 ) | 
|---|
| 1248 | #endif | 
|---|
| 1249 | { | 
|---|
| 1250 | setup( caseSensitive ); | 
|---|
| 1251 | valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() ); | 
|---|
| 1252 | if ( !valid ) { | 
|---|
| 1253 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1254 | trivial = FALSE; | 
|---|
| 1255 | #endif | 
|---|
| 1256 | error( RXERR_LEFTDELIM ); | 
|---|
| 1257 | } | 
|---|
| 1258 | } | 
|---|
| 1259 |  | 
|---|
| 1260 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1261 | QRegExpEngine::~QRegExpEngine() | 
|---|
| 1262 | { | 
|---|
| 1263 | } | 
|---|
| 1264 | #endif | 
|---|
| 1265 |  | 
|---|
| 1266 | /* | 
|---|
| 1267 | Tries to match in str and returns an array of (begin, length) pairs | 
|---|
| 1268 | for captured text. If there is no match, all pairs are (-1, -1). | 
|---|
| 1269 | */ | 
|---|
| 1270 | void QRegExpEngine::match( const QString& str, int pos, bool minimal, | 
|---|
| 1271 | bool oneTest, int caretIndex, | 
|---|
| 1272 | QMemArray<int>& captured ) | 
|---|
| 1273 | { | 
|---|
| 1274 | bool matched = FALSE; | 
|---|
| 1275 |  | 
|---|
| 1276 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1277 | if ( trivial && !oneTest ) { | 
|---|
| 1278 | mmPos = str.find( goodStr, pos, cs ); | 
|---|
| 1279 | mmMatchLen = goodStr.length(); | 
|---|
| 1280 | matched = ( mmPos != -1 ); | 
|---|
| 1281 | } else | 
|---|
| 1282 | #endif | 
|---|
| 1283 | { | 
|---|
| 1284 | mmStr = &str; | 
|---|
| 1285 | mmIn = str.unicode(); | 
|---|
| 1286 | if ( mmIn == 0 ) | 
|---|
| 1287 | mmIn = &QChar::null; | 
|---|
| 1288 | mmPos = pos; | 
|---|
| 1289 | mmCaretPos = caretIndex; | 
|---|
| 1290 | mmLen = str.length(); | 
|---|
| 1291 | mmMinimal = minimal; | 
|---|
| 1292 | mmMatchLen = 0; | 
|---|
| 1293 | mmOneTestMatchedLen = 0; | 
|---|
| 1294 |  | 
|---|
| 1295 | if ( valid && mmPos >= 0 && mmPos <= mmLen ) { | 
|---|
| 1296 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1297 | if ( oneTest ) { | 
|---|
| 1298 | matched = matchHere(); | 
|---|
| 1299 | } else { | 
|---|
| 1300 | if ( mmPos <= mmLen - minl ) { | 
|---|
| 1301 | if ( caretAnchored ) { | 
|---|
| 1302 | matched = matchHere(); | 
|---|
| 1303 | } else if ( useGoodStringHeuristic ) { | 
|---|
| 1304 | matched = goodStringMatch(); | 
|---|
| 1305 | } else { | 
|---|
| 1306 | matched = badCharMatch(); | 
|---|
| 1307 | } | 
|---|
| 1308 | } | 
|---|
| 1309 | } | 
|---|
| 1310 | #else | 
|---|
| 1311 | matched = oneTest ? matchHere() : bruteMatch(); | 
|---|
| 1312 | #endif | 
|---|
| 1313 | } | 
|---|
| 1314 | } | 
|---|
| 1315 |  | 
|---|
| 1316 | int capturedSize = 2 + 2 * officialncap; | 
|---|
| 1317 | captured.detach(); | 
|---|
| 1318 | captured.resize( capturedSize ); | 
|---|
| 1319 | if ( matched ) { | 
|---|
| 1320 | captured[0] = mmPos; | 
|---|
| 1321 | captured[1] = mmMatchLen; | 
|---|
| 1322 | for ( int j = 0; j < officialncap; j++ ) { | 
|---|
| 1323 | int len = mmCapEnd[j] - mmCapBegin[j]; | 
|---|
| 1324 | captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0; | 
|---|
| 1325 | captured[2 + 2 * j + 1] = len; | 
|---|
| 1326 | } | 
|---|
| 1327 | } else { | 
|---|
| 1328 | // we rely on 2's complement here | 
|---|
| 1329 | memset( captured.data(), -1, capturedSize * sizeof(int) ); | 
|---|
| 1330 | } | 
|---|
| 1331 | } | 
|---|
| 1332 |  | 
|---|
| 1333 | /* | 
|---|
| 1334 | The three following functions add one state to the automaton and | 
|---|
| 1335 | return the number of the state. | 
|---|
| 1336 | */ | 
|---|
| 1337 |  | 
|---|
| 1338 | int QRegExpEngine::createState( QChar ch ) | 
|---|
| 1339 | { | 
|---|
| 1340 | return setupState( ch.unicode() ); | 
|---|
| 1341 | } | 
|---|
| 1342 |  | 
|---|
| 1343 | int QRegExpEngine::createState( const CharClass& cc ) | 
|---|
| 1344 | { | 
|---|
| 1345 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 1346 | int n = cl.size(); | 
|---|
| 1347 | cl.resize( n + 1 ); | 
|---|
| 1348 | cl.insert( n, new CharClass(cc) ); | 
|---|
| 1349 | return setupState( CharClassBit | n ); | 
|---|
| 1350 | #else | 
|---|
| 1351 | Q_UNUSED( cc ); | 
|---|
| 1352 | return setupState( CharClassBit ); | 
|---|
| 1353 | #endif | 
|---|
| 1354 | } | 
|---|
| 1355 |  | 
|---|
| 1356 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1357 | int QRegExpEngine::createState( int bref ) | 
|---|
| 1358 | { | 
|---|
| 1359 | if ( bref > nbrefs ) { | 
|---|
| 1360 | nbrefs = bref; | 
|---|
| 1361 | if ( nbrefs > MaxBackRefs ) { | 
|---|
| 1362 | error( RXERR_LIMIT ); | 
|---|
| 1363 | return 0; | 
|---|
| 1364 | } | 
|---|
| 1365 | } | 
|---|
| 1366 | return setupState( BackRefBit | bref ); | 
|---|
| 1367 | } | 
|---|
| 1368 | #endif | 
|---|
| 1369 |  | 
|---|
| 1370 | /* | 
|---|
| 1371 | The two following functions add a transition between all pairs of | 
|---|
| 1372 | states (i, j) where i is fond in from, and j is found in to. | 
|---|
| 1373 |  | 
|---|
| 1374 | Cat-transitions are distinguished from plus-transitions for | 
|---|
| 1375 | capturing. | 
|---|
| 1376 | */ | 
|---|
| 1377 |  | 
|---|
| 1378 | void QRegExpEngine::addCatTransitions( const QMemArray<int>& from, | 
|---|
| 1379 | const QMemArray<int>& to ) | 
|---|
| 1380 | { | 
|---|
| 1381 | for ( int i = 0; i < (int) from.size(); i++ ) { | 
|---|
| 1382 | State *st = s[from[i]]; | 
|---|
| 1383 | mergeInto( &st->outs, to ); | 
|---|
| 1384 | } | 
|---|
| 1385 | } | 
|---|
| 1386 |  | 
|---|
| 1387 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1388 | void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from, | 
|---|
| 1389 | const QMemArray<int>& to, int atom ) | 
|---|
| 1390 | { | 
|---|
| 1391 | for ( int i = 0; i < (int) from.size(); i++ ) { | 
|---|
| 1392 | State *st = s[from[i]]; | 
|---|
| 1393 | QMemArray<int> oldOuts = st->outs.copy(); | 
|---|
| 1394 | mergeInto( &st->outs, to ); | 
|---|
| 1395 | if ( f[atom].capture >= 0 ) { | 
|---|
| 1396 | if ( st->reenter == 0 ) | 
|---|
| 1397 | st->reenter = new QMap<int, int>; | 
|---|
| 1398 | for ( int j = 0; j < (int) to.size(); j++ ) { | 
|---|
| 1399 | if ( !st->reenter->contains(to[j]) && | 
|---|
| 1400 | oldOuts.bsearch(to[j]) < 0 ) | 
|---|
| 1401 | st->reenter->insert( to[j], atom ); | 
|---|
| 1402 | } | 
|---|
| 1403 | } | 
|---|
| 1404 | } | 
|---|
| 1405 | } | 
|---|
| 1406 | #endif | 
|---|
| 1407 |  | 
|---|
| 1408 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 1409 | /* | 
|---|
| 1410 | Returns an anchor that means a OR b. | 
|---|
| 1411 | */ | 
|---|
| 1412 | int QRegExpEngine::anchorAlternation( int a, int b ) | 
|---|
| 1413 | { | 
|---|
| 1414 | if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 ) | 
|---|
| 1415 | return a & b; | 
|---|
| 1416 |  | 
|---|
| 1417 | int n = aa.size(); | 
|---|
| 1418 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1419 | if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b ) | 
|---|
| 1420 | return Anchor_Alternation | ( n - 1 ); | 
|---|
| 1421 | #endif | 
|---|
| 1422 |  | 
|---|
| 1423 | aa.resize( n + 1 ); | 
|---|
| 1424 | aa[n].a = a; | 
|---|
| 1425 | aa[n].b = b; | 
|---|
| 1426 | return Anchor_Alternation | n; | 
|---|
| 1427 | } | 
|---|
| 1428 |  | 
|---|
| 1429 | /* | 
|---|
| 1430 | Returns an anchor that means a AND b. | 
|---|
| 1431 | */ | 
|---|
| 1432 | int QRegExpEngine::anchorConcatenation( int a, int b ) | 
|---|
| 1433 | { | 
|---|
| 1434 | if ( ((a | b) & Anchor_Alternation) == 0 ) | 
|---|
| 1435 | return a | b; | 
|---|
| 1436 | if ( (b & Anchor_Alternation) != 0 ) | 
|---|
| 1437 | qSwap( a, b ); | 
|---|
| 1438 |  | 
|---|
| 1439 | int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b ); | 
|---|
| 1440 | int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b ); | 
|---|
| 1441 | return anchorAlternation( aprime, bprime ); | 
|---|
| 1442 | } | 
|---|
| 1443 | #endif | 
|---|
| 1444 |  | 
|---|
| 1445 | /* | 
|---|
| 1446 | Adds anchor a on a transition caracterised by its from state and | 
|---|
| 1447 | its to state. | 
|---|
| 1448 | */ | 
|---|
| 1449 | void QRegExpEngine::addAnchors( int from, int to, int a ) | 
|---|
| 1450 | { | 
|---|
| 1451 | State *st = s[from]; | 
|---|
| 1452 | if ( st->anchors == 0 ) | 
|---|
| 1453 | st->anchors = new QMap<int, int>; | 
|---|
| 1454 | if ( st->anchors->contains(to) ) | 
|---|
| 1455 | a = anchorAlternation( (*st->anchors)[to], a ); | 
|---|
| 1456 | st->anchors->insert( to, a ); | 
|---|
| 1457 | } | 
|---|
| 1458 |  | 
|---|
| 1459 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1460 | /* | 
|---|
| 1461 | This function chooses between the good-string and the bad-character | 
|---|
| 1462 | heuristics. It computes two scores and chooses the heuristic with | 
|---|
| 1463 | the highest score. | 
|---|
| 1464 |  | 
|---|
| 1465 | Here are some common-sense constraints on the scores that should be | 
|---|
| 1466 | respected if the formulas are ever modified: (1) If goodStr is | 
|---|
| 1467 | empty, the good-string heuristic scores 0. (2) If the regular | 
|---|
| 1468 | expression is trivial, the good-string heuristic should be used. | 
|---|
| 1469 | (3) If the search is case insensitive, the good-string heuristic | 
|---|
| 1470 | should be used, unless it scores 0. (Case insensitivity turns all | 
|---|
| 1471 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is | 
|---|
| 1472 | big, the good-string heuristic should score less. | 
|---|
| 1473 | */ | 
|---|
| 1474 | void QRegExpEngine::heuristicallyChooseHeuristic() | 
|---|
| 1475 | { | 
|---|
| 1476 | if ( minl == 0 ) { | 
|---|
| 1477 | useGoodStringHeuristic = FALSE; | 
|---|
| 1478 | } else if ( trivial ) { | 
|---|
| 1479 | useGoodStringHeuristic = TRUE; | 
|---|
| 1480 | } else { | 
|---|
| 1481 | /* | 
|---|
| 1482 | Magic formula: The good string has to constitute a good | 
|---|
| 1483 | proportion of the minimum-length string, and appear at a | 
|---|
| 1484 | more-or-less known index. | 
|---|
| 1485 | */ | 
|---|
| 1486 | int goodStringScore = ( 64 * goodStr.length() / minl ) - | 
|---|
| 1487 | ( goodLateStart - goodEarlyStart ); | 
|---|
| 1488 | /* | 
|---|
| 1489 | Less magic formula: We pick some characters at random, and | 
|---|
| 1490 | check whether they are good or bad. | 
|---|
| 1491 | */ | 
|---|
| 1492 | int badCharScore = 0; | 
|---|
| 1493 | int step = QMAX( 1, NumBadChars / 32 ); | 
|---|
| 1494 | for ( int i = 1; i < NumBadChars; i += step ) { | 
|---|
| 1495 | if ( occ1[i] == NoOccurrence ) | 
|---|
| 1496 | badCharScore += minl; | 
|---|
| 1497 | else | 
|---|
| 1498 | badCharScore += occ1[i]; | 
|---|
| 1499 | } | 
|---|
| 1500 | badCharScore /= minl; | 
|---|
| 1501 | useGoodStringHeuristic = ( goodStringScore > badCharScore ); | 
|---|
| 1502 | } | 
|---|
| 1503 | } | 
|---|
| 1504 | #endif | 
|---|
| 1505 |  | 
|---|
| 1506 | #if defined(QT_DEBUG) | 
|---|
| 1507 | void QRegExpEngine::dump() const | 
|---|
| 1508 | { | 
|---|
| 1509 | int i, j; | 
|---|
| 1510 | qDebug( "Case %ssensitive engine", cs ? "" : "in" ); | 
|---|
| 1511 | qDebug( "  States" ); | 
|---|
| 1512 | for ( i = 0; i < ns; i++ ) { | 
|---|
| 1513 | qDebug( "  %d%s", i, | 
|---|
| 1514 | i == InitialState ? " (initial)" : | 
|---|
| 1515 | i == FinalState ? " (final)" : "" ); | 
|---|
| 1516 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1517 | qDebug( "    in atom %d", s[i]->atom ); | 
|---|
| 1518 | #endif | 
|---|
| 1519 | int m = s[i]->match; | 
|---|
| 1520 | if ( (m & CharClassBit) != 0 ) { | 
|---|
| 1521 | qDebug( "    match character class %d", m ^ CharClassBit ); | 
|---|
| 1522 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 1523 | cl[m ^ CharClassBit]->dump(); | 
|---|
| 1524 | #else | 
|---|
| 1525 | qDebug( "    negative character class" ); | 
|---|
| 1526 | #endif | 
|---|
| 1527 | } else if ( (m & BackRefBit) != 0 ) { | 
|---|
| 1528 | qDebug( "    match back-reference %d", m ^ BackRefBit ); | 
|---|
| 1529 | } else if ( m >= 0x20 && m <= 0x7e ) { | 
|---|
| 1530 | qDebug( "    match 0x%.4x (%c)", m, m ); | 
|---|
| 1531 | } else { | 
|---|
| 1532 | qDebug( "    match 0x%.4x", m ); | 
|---|
| 1533 | } | 
|---|
| 1534 | for ( j = 0; j < (int) s[i]->outs.size(); j++ ) { | 
|---|
| 1535 | int next = s[i]->outs[j]; | 
|---|
| 1536 | qDebug( "    -> %d", next ); | 
|---|
| 1537 | if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) ) | 
|---|
| 1538 | qDebug( "       [reenter %d]", (*s[i]->reenter)[next] ); | 
|---|
| 1539 | if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 ) | 
|---|
| 1540 | qDebug( "       [anchors 0x%.8x]", (*s[i]->anchors)[next] ); | 
|---|
| 1541 | } | 
|---|
| 1542 | } | 
|---|
| 1543 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1544 | if ( nf > 0 ) { | 
|---|
| 1545 | qDebug( "  Atom    Parent  Capture" ); | 
|---|
| 1546 | for ( i = 0; i < nf; i++ ) | 
|---|
| 1547 | qDebug( "  %6d  %6d  %6d", i, f[i].parent, f[i].capture ); | 
|---|
| 1548 | } | 
|---|
| 1549 | #endif | 
|---|
| 1550 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 1551 | for ( i = 0; i < (int) aa.size(); i++ ) | 
|---|
| 1552 | qDebug( "  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, | 
|---|
| 1553 | aa[i].b ); | 
|---|
| 1554 | #endif | 
|---|
| 1555 | } | 
|---|
| 1556 | #endif | 
|---|
| 1557 |  | 
|---|
| 1558 | void QRegExpEngine::setup( bool caseSensitive ) | 
|---|
| 1559 | { | 
|---|
| 1560 | s.setAutoDelete( TRUE ); | 
|---|
| 1561 | s.resize( 32 ); | 
|---|
| 1562 | ns = 0; | 
|---|
| 1563 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1564 | f.resize( 32 ); | 
|---|
| 1565 | nf = 0; | 
|---|
| 1566 | cf = -1; | 
|---|
| 1567 | #endif | 
|---|
| 1568 | officialncap = 0; | 
|---|
| 1569 | ncap = 0; | 
|---|
| 1570 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 1571 | cl.setAutoDelete( TRUE ); | 
|---|
| 1572 | #endif | 
|---|
| 1573 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 1574 | ahead.setAutoDelete( TRUE ); | 
|---|
| 1575 | #endif | 
|---|
| 1576 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1577 | caretAnchored = TRUE; | 
|---|
| 1578 | trivial = TRUE; | 
|---|
| 1579 | #endif | 
|---|
| 1580 | valid = FALSE; | 
|---|
| 1581 | cs = caseSensitive; | 
|---|
| 1582 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1583 | nbrefs = 0; | 
|---|
| 1584 | #endif | 
|---|
| 1585 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1586 | useGoodStringHeuristic = TRUE; | 
|---|
| 1587 | minl = 0; | 
|---|
| 1588 | occ1.fill( 0, NumBadChars ); | 
|---|
| 1589 | #endif | 
|---|
| 1590 | } | 
|---|
| 1591 |  | 
|---|
| 1592 | int QRegExpEngine::setupState( int match ) | 
|---|
| 1593 | { | 
|---|
| 1594 | if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() ) | 
|---|
| 1595 | s.resize( (ns + 1) << 1 ); | 
|---|
| 1596 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1597 | s.insert( ns, new State(cf, match) ); | 
|---|
| 1598 | #else | 
|---|
| 1599 | s.insert( ns, new State(match) ); | 
|---|
| 1600 | #endif | 
|---|
| 1601 | return ns++; | 
|---|
| 1602 | } | 
|---|
| 1603 |  | 
|---|
| 1604 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1605 | /* | 
|---|
| 1606 | Functions startAtom() and finishAtom() should be called to delimit | 
|---|
| 1607 | atoms. When a state is created, it is assigned to the current atom. | 
|---|
| 1608 | The information is later used for capturing. | 
|---|
| 1609 | */ | 
|---|
| 1610 | int QRegExpEngine::startAtom( bool capture ) | 
|---|
| 1611 | { | 
|---|
| 1612 | if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() ) | 
|---|
| 1613 | f.resize( (nf + 1) << 1 ); | 
|---|
| 1614 | f[nf].parent = cf; | 
|---|
| 1615 | cf = nf++; | 
|---|
| 1616 | f[cf].capture = capture ? ncap++ : -1; | 
|---|
| 1617 | return cf; | 
|---|
| 1618 | } | 
|---|
| 1619 | #endif | 
|---|
| 1620 |  | 
|---|
| 1621 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 1622 | /* | 
|---|
| 1623 | Creates a lookahead anchor. | 
|---|
| 1624 | */ | 
|---|
| 1625 | int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative ) | 
|---|
| 1626 | { | 
|---|
| 1627 | int n = ahead.size(); | 
|---|
| 1628 | if ( n == MaxLookaheads ) { | 
|---|
| 1629 | error( RXERR_LIMIT ); | 
|---|
| 1630 | return 0; | 
|---|
| 1631 | } | 
|---|
| 1632 | ahead.resize( n + 1 ); | 
|---|
| 1633 | ahead.insert( n, new Lookahead(eng, negative) ); | 
|---|
| 1634 | return Anchor_FirstLookahead << n; | 
|---|
| 1635 | } | 
|---|
| 1636 | #endif | 
|---|
| 1637 |  | 
|---|
| 1638 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1639 | /* | 
|---|
| 1640 | We want the longest leftmost captures. | 
|---|
| 1641 | */ | 
|---|
| 1642 | bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1, | 
|---|
| 1643 | const int *begin2, const int *end2 ) | 
|---|
| 1644 | { | 
|---|
| 1645 | for ( int i = 0; i < ncap; i++ ) { | 
|---|
| 1646 | int delta = begin2[i] - begin1[i]; // it has to start early... | 
|---|
| 1647 | if ( delta == 0 ) | 
|---|
| 1648 | delta = end1[i] - end2[i]; // ...and end late (like a party) | 
|---|
| 1649 |  | 
|---|
| 1650 | if ( delta != 0 ) | 
|---|
| 1651 | return delta > 0; | 
|---|
| 1652 | } | 
|---|
| 1653 | return FALSE; | 
|---|
| 1654 | } | 
|---|
| 1655 | #endif | 
|---|
| 1656 |  | 
|---|
| 1657 | /* | 
|---|
| 1658 | Returns TRUE if anchor a matches at position mmPos + i in the input | 
|---|
| 1659 | string, otherwise FALSE. | 
|---|
| 1660 | */ | 
|---|
| 1661 | bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin ) | 
|---|
| 1662 | { | 
|---|
| 1663 | int j; | 
|---|
| 1664 |  | 
|---|
| 1665 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 1666 | if ( (a & Anchor_Alternation) != 0 ) { | 
|---|
| 1667 | return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) || | 
|---|
| 1668 | testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin ); | 
|---|
| 1669 | } | 
|---|
| 1670 | #endif | 
|---|
| 1671 |  | 
|---|
| 1672 | if ( (a & Anchor_Caret) != 0 ) { | 
|---|
| 1673 | if ( mmPos + i != mmCaretPos ) | 
|---|
| 1674 | return FALSE; | 
|---|
| 1675 | } | 
|---|
| 1676 | if ( (a & Anchor_Dollar) != 0 ) { | 
|---|
| 1677 | if ( mmPos + i != mmLen ) | 
|---|
| 1678 | return FALSE; | 
|---|
| 1679 | } | 
|---|
| 1680 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 1681 | if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) { | 
|---|
| 1682 | bool before = FALSE; | 
|---|
| 1683 | bool after = FALSE; | 
|---|
| 1684 | if ( mmPos + i != 0 ) | 
|---|
| 1685 | before = isWord( mmIn[mmPos + i - 1] ); | 
|---|
| 1686 | if ( mmPos + i != mmLen ) | 
|---|
| 1687 | after = isWord( mmIn[mmPos + i] ); | 
|---|
| 1688 | if ( (a & Anchor_Word) != 0 && (before == after) ) | 
|---|
| 1689 | return FALSE; | 
|---|
| 1690 | if ( (a & Anchor_NonWord) != 0 && (before != after) ) | 
|---|
| 1691 | return FALSE; | 
|---|
| 1692 | } | 
|---|
| 1693 | #endif | 
|---|
| 1694 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 1695 | if ( (a & Anchor_LookaheadMask) != 0 ) { | 
|---|
| 1696 | QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i, | 
|---|
| 1697 | mmLen - mmPos - i ); | 
|---|
| 1698 | for ( j = 0; j < (int) ahead.size(); j++ ) { | 
|---|
| 1699 | if ( (a & (Anchor_FirstLookahead << j)) != 0 ) { | 
|---|
| 1700 | QMemArray<int> captured; | 
|---|
| 1701 | ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE, | 
|---|
| 1702 | mmCaretPos - mmPos - i, captured ); | 
|---|
| 1703 | if ( (captured[0] == 0) == ahead[j]->neg ) | 
|---|
| 1704 | return FALSE; | 
|---|
| 1705 | } | 
|---|
| 1706 | } | 
|---|
| 1707 | } | 
|---|
| 1708 | #endif | 
|---|
| 1709 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1710 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1711 | for ( j = 0; j < nbrefs; j++ ) { | 
|---|
| 1712 | if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) { | 
|---|
| 1713 | if ( capBegin[j] != EmptyCapture ) | 
|---|
| 1714 | return FALSE; | 
|---|
| 1715 | } | 
|---|
| 1716 | } | 
|---|
| 1717 | #endif | 
|---|
| 1718 | #endif | 
|---|
| 1719 | return TRUE; | 
|---|
| 1720 | } | 
|---|
| 1721 |  | 
|---|
| 1722 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 1723 | /* | 
|---|
| 1724 | The three following functions are what Jeffrey Friedl would call | 
|---|
| 1725 | transmissions (or bump-alongs). Using one or the other should make | 
|---|
| 1726 | no difference except in performance. | 
|---|
| 1727 | */ | 
|---|
| 1728 |  | 
|---|
| 1729 | bool QRegExpEngine::goodStringMatch() | 
|---|
| 1730 | { | 
|---|
| 1731 | int k = mmPos + goodEarlyStart; | 
|---|
| 1732 | while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) { | 
|---|
| 1733 | int from = k - goodLateStart; | 
|---|
| 1734 | int to = k - goodEarlyStart; | 
|---|
| 1735 | if ( from > mmPos ) | 
|---|
| 1736 | mmPos = from; | 
|---|
| 1737 |  | 
|---|
| 1738 | while ( mmPos <= to ) { | 
|---|
| 1739 | if ( matchHere() ) | 
|---|
| 1740 | return TRUE; | 
|---|
| 1741 | mmPos++; | 
|---|
| 1742 | } | 
|---|
| 1743 | k++; | 
|---|
| 1744 | } | 
|---|
| 1745 | return FALSE; | 
|---|
| 1746 | } | 
|---|
| 1747 |  | 
|---|
| 1748 | bool QRegExpEngine::badCharMatch() | 
|---|
| 1749 | { | 
|---|
| 1750 | int slideHead = 0; | 
|---|
| 1751 | int slideNext = 0; | 
|---|
| 1752 | int i; | 
|---|
| 1753 | int lastPos = mmLen - minl; | 
|---|
| 1754 | memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) ); | 
|---|
| 1755 |  | 
|---|
| 1756 | /* | 
|---|
| 1757 | Set up the slide table, used for the bad-character heuristic, | 
|---|
| 1758 | using the table of first occurrence of each character. | 
|---|
| 1759 | */ | 
|---|
| 1760 | for ( i = 0; i < minl; i++ ) { | 
|---|
| 1761 | int sk = occ1[BadChar(mmIn[mmPos + i])]; | 
|---|
| 1762 | if ( sk == NoOccurrence ) | 
|---|
| 1763 | sk = i + 1; | 
|---|
| 1764 | if ( sk > 0 ) { | 
|---|
| 1765 | int k = i + 1 - sk; | 
|---|
| 1766 | if ( k < 0 ) { | 
|---|
| 1767 | sk = i + 1; | 
|---|
| 1768 | k = 0; | 
|---|
| 1769 | } | 
|---|
| 1770 | if ( sk > mmSlideTab[k] ) | 
|---|
| 1771 | mmSlideTab[k] = sk; | 
|---|
| 1772 | } | 
|---|
| 1773 | } | 
|---|
| 1774 |  | 
|---|
| 1775 | if ( mmPos > lastPos ) | 
|---|
| 1776 | return FALSE; | 
|---|
| 1777 |  | 
|---|
| 1778 | for ( ;; ) { | 
|---|
| 1779 | if ( ++slideNext >= mmSlideTabSize ) | 
|---|
| 1780 | slideNext = 0; | 
|---|
| 1781 | if ( mmSlideTab[slideHead] > 0 ) { | 
|---|
| 1782 | if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] ) | 
|---|
| 1783 | mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1; | 
|---|
| 1784 | mmSlideTab[slideHead] = 0; | 
|---|
| 1785 | } else { | 
|---|
| 1786 | if ( matchHere() ) | 
|---|
| 1787 | return TRUE; | 
|---|
| 1788 | } | 
|---|
| 1789 |  | 
|---|
| 1790 | if ( mmPos == lastPos ) | 
|---|
| 1791 | break; | 
|---|
| 1792 |  | 
|---|
| 1793 | /* | 
|---|
| 1794 | Update the slide table. This code has much in common with | 
|---|
| 1795 | the initialization code. | 
|---|
| 1796 | */ | 
|---|
| 1797 | int sk = occ1[BadChar(mmIn[mmPos + minl])]; | 
|---|
| 1798 | if ( sk == NoOccurrence ) { | 
|---|
| 1799 | mmSlideTab[slideNext] = minl; | 
|---|
| 1800 | } else if ( sk > 0 ) { | 
|---|
| 1801 | int k = slideNext + minl - sk; | 
|---|
| 1802 | if ( k >= mmSlideTabSize ) | 
|---|
| 1803 | k -= mmSlideTabSize; | 
|---|
| 1804 | if ( sk > mmSlideTab[k] ) | 
|---|
| 1805 | mmSlideTab[k] = sk; | 
|---|
| 1806 | } | 
|---|
| 1807 | slideHead = slideNext; | 
|---|
| 1808 | mmPos++; | 
|---|
| 1809 | } | 
|---|
| 1810 | return FALSE; | 
|---|
| 1811 | } | 
|---|
| 1812 | #else | 
|---|
| 1813 | bool QRegExpEngine::bruteMatch() | 
|---|
| 1814 | { | 
|---|
| 1815 | while ( mmPos <= mmLen ) { | 
|---|
| 1816 | if ( matchHere() ) | 
|---|
| 1817 | return TRUE; | 
|---|
| 1818 | mmPos++; | 
|---|
| 1819 | } | 
|---|
| 1820 | return FALSE; | 
|---|
| 1821 | } | 
|---|
| 1822 | #endif | 
|---|
| 1823 |  | 
|---|
| 1824 | /* | 
|---|
| 1825 | Here's the core of the engine. It tries to do a match here and now. | 
|---|
| 1826 | */ | 
|---|
| 1827 | bool QRegExpEngine::matchHere() | 
|---|
| 1828 | { | 
|---|
| 1829 | int ncur = 1, nnext = 0; | 
|---|
| 1830 | int i = 0, j, k, m; | 
|---|
| 1831 | bool stop = FALSE; | 
|---|
| 1832 |  | 
|---|
| 1833 | mmMatchLen = -1; | 
|---|
| 1834 | mmOneTestMatchedLen = -1; | 
|---|
| 1835 | mmCurStack[0] = InitialState; | 
|---|
| 1836 |  | 
|---|
| 1837 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1838 | if ( ncap > 0 ) { | 
|---|
| 1839 | for ( j = 0; j < ncap; j++ ) { | 
|---|
| 1840 | mmCurCapBegin[j] = EmptyCapture; | 
|---|
| 1841 | mmCurCapEnd[j] = EmptyCapture; | 
|---|
| 1842 | } | 
|---|
| 1843 | } | 
|---|
| 1844 | #endif | 
|---|
| 1845 |  | 
|---|
| 1846 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1847 | int *zzZ = 0; | 
|---|
| 1848 |  | 
|---|
| 1849 | while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos && | 
|---|
| 1850 | !stop ) | 
|---|
| 1851 | #else | 
|---|
| 1852 | while ( ncur > 0 && i <= mmLen - mmPos && !stop ) | 
|---|
| 1853 | #endif | 
|---|
| 1854 | { | 
|---|
| 1855 | int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0; | 
|---|
| 1856 | for ( j = 0; j < ncur; j++ ) { | 
|---|
| 1857 | int cur = mmCurStack[j]; | 
|---|
| 1858 | State *scur = s[cur]; | 
|---|
| 1859 | QMemArray<int>& outs = scur->outs; | 
|---|
| 1860 | for ( k = 0; k < (int) outs.size(); k++ ) { | 
|---|
| 1861 | int next = outs[k]; | 
|---|
| 1862 | State *snext = s[next]; | 
|---|
| 1863 | bool in = TRUE; | 
|---|
| 1864 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1865 | int needSomeSleep = 0; | 
|---|
| 1866 | #endif | 
|---|
| 1867 |  | 
|---|
| 1868 | /* | 
|---|
| 1869 | First, check if the anchors are anchored properly. | 
|---|
| 1870 | */ | 
|---|
| 1871 | if ( scur->anchors != 0 ) { | 
|---|
| 1872 | int a = at( *scur->anchors, next ); | 
|---|
| 1873 | if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) ) | 
|---|
| 1874 | in = FALSE; | 
|---|
| 1875 | } | 
|---|
| 1876 | /* | 
|---|
| 1877 | If indeed they are, check if the input character is | 
|---|
| 1878 | correct for this transition. | 
|---|
| 1879 | */ | 
|---|
| 1880 | if ( in ) { | 
|---|
| 1881 | m = snext->match; | 
|---|
| 1882 | if ( (m & (CharClassBit | BackRefBit)) == 0 ) { | 
|---|
| 1883 | if ( cs ) | 
|---|
| 1884 | in = ( m == ch ); | 
|---|
| 1885 | else | 
|---|
| 1886 | in = ( QChar(m).lower() == QChar(ch).lower() ); | 
|---|
| 1887 | } else if ( next == FinalState ) { | 
|---|
| 1888 | mmMatchLen = i; | 
|---|
| 1889 | stop = mmMinimal; | 
|---|
| 1890 | in = TRUE; | 
|---|
| 1891 | } else if ( (m & CharClassBit) != 0 ) { | 
|---|
| 1892 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 1893 | const CharClass *cc = cl[m ^ CharClassBit]; | 
|---|
| 1894 | if ( cs ) | 
|---|
| 1895 | in = cc->in( ch ); | 
|---|
| 1896 | else if ( cc->negative() ) | 
|---|
| 1897 | in = cc->in( QChar(ch).lower() ) && | 
|---|
| 1898 | cc->in( QChar(ch).upper() ); | 
|---|
| 1899 | else | 
|---|
| 1900 | in = cc->in( QChar(ch).lower() ) || | 
|---|
| 1901 | cc->in( QChar(ch).upper() ); | 
|---|
| 1902 | #endif | 
|---|
| 1903 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 1904 | } else { /* ( (m & BackRefBit) != 0 ) */ | 
|---|
| 1905 | int bref = m ^ BackRefBit; | 
|---|
| 1906 | int ell = j * ncap + ( bref - 1 ); | 
|---|
| 1907 |  | 
|---|
| 1908 | in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture; | 
|---|
| 1909 | if ( in ) { | 
|---|
| 1910 | if ( cs ) | 
|---|
| 1911 | in = ( mmIn[mmPos + mmCurCapBegin[ell]] | 
|---|
| 1912 | == QChar(ch) ); | 
|---|
| 1913 | else | 
|---|
| 1914 | in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower() | 
|---|
| 1915 | == QChar(ch).lower() ); | 
|---|
| 1916 | } | 
|---|
| 1917 |  | 
|---|
| 1918 | if ( in ) { | 
|---|
| 1919 | int delta; | 
|---|
| 1920 | if ( mmCurCapEnd[ell] == EmptyCapture ) | 
|---|
| 1921 | delta = i - mmCurCapBegin[ell]; | 
|---|
| 1922 | else | 
|---|
| 1923 | delta = mmCurCapEnd[ell] - mmCurCapBegin[ell]; | 
|---|
| 1924 |  | 
|---|
| 1925 | in = ( delta <= mmLen - (mmPos + i) ); | 
|---|
| 1926 | if ( in && delta > 1 ) { | 
|---|
| 1927 | int n = 1; | 
|---|
| 1928 | if ( cs ) { | 
|---|
| 1929 | while ( n < delta ) { | 
|---|
| 1930 | if ( mmIn[mmPos + | 
|---|
| 1931 | mmCurCapBegin[ell] + n] != | 
|---|
| 1932 | mmIn[mmPos + i + n] ) | 
|---|
| 1933 | break; | 
|---|
| 1934 | n++; | 
|---|
| 1935 | } | 
|---|
| 1936 | } else { | 
|---|
| 1937 | while ( n < delta ) { | 
|---|
| 1938 | QChar a = mmIn[mmPos + | 
|---|
| 1939 | mmCurCapBegin[ell] + n]; | 
|---|
| 1940 | QChar b = mmIn[mmPos + i + n]; | 
|---|
| 1941 | if ( a.lower() != b.lower() ) | 
|---|
| 1942 | break; | 
|---|
| 1943 | n++; | 
|---|
| 1944 | } | 
|---|
| 1945 | } | 
|---|
| 1946 | in = ( n == delta ); | 
|---|
| 1947 | if ( in ) | 
|---|
| 1948 | needSomeSleep = delta - 1; | 
|---|
| 1949 | } | 
|---|
| 1950 | } | 
|---|
| 1951 | #endif | 
|---|
| 1952 | } | 
|---|
| 1953 | } | 
|---|
| 1954 |  | 
|---|
| 1955 | /* | 
|---|
| 1956 | We must now update our data structures. | 
|---|
| 1957 | */ | 
|---|
| 1958 | if ( in ) { | 
|---|
| 1959 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1960 | int *capBegin, *capEnd; | 
|---|
| 1961 | #endif | 
|---|
| 1962 | /* | 
|---|
| 1963 | If the next state was not encountered yet, all | 
|---|
| 1964 | is fine. | 
|---|
| 1965 | */ | 
|---|
| 1966 | if ( (m = mmInNextStack[next]) == -1 ) { | 
|---|
| 1967 | m = nnext++; | 
|---|
| 1968 | mmNextStack[m] = next; | 
|---|
| 1969 | mmInNextStack[next] = m; | 
|---|
| 1970 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1971 | capBegin = mmNextCapBegin + m * ncap; | 
|---|
| 1972 | capEnd = mmNextCapEnd + m * ncap; | 
|---|
| 1973 |  | 
|---|
| 1974 | /* | 
|---|
| 1975 | Otherwise, we'll first maintain captures in | 
|---|
| 1976 | temporary arrays, and decide at the end whether | 
|---|
| 1977 | it's best to keep the previous capture zones or | 
|---|
| 1978 | the new ones. | 
|---|
| 1979 | */ | 
|---|
| 1980 | } else { | 
|---|
| 1981 | capBegin = mmTempCapBegin; | 
|---|
| 1982 | capEnd = mmTempCapEnd; | 
|---|
| 1983 | #endif | 
|---|
| 1984 | } | 
|---|
| 1985 |  | 
|---|
| 1986 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 1987 | /* | 
|---|
| 1988 | Updating the capture zones is much of a task. | 
|---|
| 1989 | */ | 
|---|
| 1990 | if ( ncap > 0 ) { | 
|---|
| 1991 | memcpy( capBegin, mmCurCapBegin + j * ncap, | 
|---|
| 1992 | ncap * sizeof(int) ); | 
|---|
| 1993 | memcpy( capEnd, mmCurCapEnd + j * ncap, | 
|---|
| 1994 | ncap * sizeof(int) ); | 
|---|
| 1995 | int c = scur->atom, n = snext->atom; | 
|---|
| 1996 | int p = -1, q = -1; | 
|---|
| 1997 | int cap; | 
|---|
| 1998 |  | 
|---|
| 1999 | /* | 
|---|
| 2000 | Lemma 1. For any x in the range [0..nf), we | 
|---|
| 2001 | have f[x].parent < x. | 
|---|
| 2002 |  | 
|---|
| 2003 | Proof. By looking at startAtom(), it is | 
|---|
| 2004 | clear that cf < nf holds all the time, and | 
|---|
| 2005 | thus that f[nf].parent < nf. | 
|---|
| 2006 | */ | 
|---|
| 2007 |  | 
|---|
| 2008 | /* | 
|---|
| 2009 | If we are reentering an atom, we empty all | 
|---|
| 2010 | capture zones inside it. | 
|---|
| 2011 | */ | 
|---|
| 2012 | if ( scur->reenter != 0 && | 
|---|
| 2013 | (q = at(*scur->reenter, next)) != 0 ) { | 
|---|
| 2014 | QBitArray b; | 
|---|
| 2015 | b.fill( FALSE, nf ); | 
|---|
| 2016 | b.setBit( q, TRUE ); | 
|---|
| 2017 | for ( int ell = q + 1; ell < nf; ell++ ) { | 
|---|
| 2018 | if ( b.testBit(f[ell].parent) ) { | 
|---|
| 2019 | b.setBit( ell, TRUE ); | 
|---|
| 2020 | cap = f[ell].capture; | 
|---|
| 2021 | if ( cap >= 0 ) { | 
|---|
| 2022 | capBegin[cap] = EmptyCapture; | 
|---|
| 2023 | capEnd[cap] = EmptyCapture; | 
|---|
| 2024 | } | 
|---|
| 2025 | } | 
|---|
| 2026 | } | 
|---|
| 2027 | p = f[q].parent; | 
|---|
| 2028 |  | 
|---|
| 2029 | /* | 
|---|
| 2030 | Otherwise, close the capture zones we are | 
|---|
| 2031 | leaving. We are leaving f[c].capture, | 
|---|
| 2032 | f[f[c].parent].capture, | 
|---|
| 2033 | f[f[f[c].parent].parent].capture, ..., | 
|---|
| 2034 | until f[x].capture, with x such that | 
|---|
| 2035 | f[x].parent is the youngest common ancestor | 
|---|
| 2036 | for c and n. | 
|---|
| 2037 |  | 
|---|
| 2038 | We go up along c's and n's ancestry until | 
|---|
| 2039 | we find x. | 
|---|
| 2040 | */ | 
|---|
| 2041 | } else { | 
|---|
| 2042 | p = c; | 
|---|
| 2043 | q = n; | 
|---|
| 2044 | while ( p != q ) { | 
|---|
| 2045 | if ( p > q ) { | 
|---|
| 2046 | cap = f[p].capture; | 
|---|
| 2047 | if ( cap >= 0 ) { | 
|---|
| 2048 | if ( capBegin[cap] == i ) { | 
|---|
| 2049 | capBegin[cap] = EmptyCapture; | 
|---|
| 2050 | capEnd[cap] = EmptyCapture; | 
|---|
| 2051 | } else { | 
|---|
| 2052 | capEnd[cap] = i; | 
|---|
| 2053 | } | 
|---|
| 2054 | } | 
|---|
| 2055 | p = f[p].parent; | 
|---|
| 2056 | } else { | 
|---|
| 2057 | q = f[q].parent; | 
|---|
| 2058 | } | 
|---|
| 2059 | } | 
|---|
| 2060 | } | 
|---|
| 2061 |  | 
|---|
| 2062 | /* | 
|---|
| 2063 | In any case, we now open the capture zones | 
|---|
| 2064 | we are entering. We work upwards from n | 
|---|
| 2065 | until we reach p (the parent of the atom we | 
|---|
| 2066 | reenter or the youngest common ancestor). | 
|---|
| 2067 | */ | 
|---|
| 2068 | while ( n > p ) { | 
|---|
| 2069 | cap = f[n].capture; | 
|---|
| 2070 | if ( cap >= 0 ) { | 
|---|
| 2071 | capBegin[cap] = i; | 
|---|
| 2072 | capEnd[cap] = EmptyCapture; | 
|---|
| 2073 | } | 
|---|
| 2074 | n = f[n].parent; | 
|---|
| 2075 | } | 
|---|
| 2076 | /* | 
|---|
| 2077 | If the next state was already in | 
|---|
| 2078 | mmNextStack, we must choose carefully which | 
|---|
| 2079 | capture zones we want to keep. | 
|---|
| 2080 | */ | 
|---|
| 2081 | if ( capBegin == mmTempCapBegin && | 
|---|
| 2082 | isBetterCapture(capBegin, capEnd, | 
|---|
| 2083 | mmNextCapBegin + m * ncap, | 
|---|
| 2084 | mmNextCapEnd + m * ncap) ) { | 
|---|
| 2085 | memcpy( mmNextCapBegin + m * ncap, capBegin, | 
|---|
| 2086 | ncap * sizeof(int) ); | 
|---|
| 2087 | memcpy( mmNextCapEnd + m * ncap, capEnd, | 
|---|
| 2088 | ncap * sizeof(int) ); | 
|---|
| 2089 | } | 
|---|
| 2090 | } | 
|---|
| 2091 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2092 | /* | 
|---|
| 2093 | We are done with updating the capture zones. | 
|---|
| 2094 | It's now time to put the next state to sleep, | 
|---|
| 2095 | if it needs to, and to remove it from | 
|---|
| 2096 | mmNextStack. | 
|---|
| 2097 | */ | 
|---|
| 2098 | if ( needSomeSleep > 0 ) { | 
|---|
| 2099 | zzZ = new int[1 + 2 * ncap]; | 
|---|
| 2100 | zzZ[0] = next; | 
|---|
| 2101 | if ( ncap > 0 ) { | 
|---|
| 2102 | memcpy( zzZ + 1, capBegin, ncap * sizeof(int) ); | 
|---|
| 2103 | memcpy( zzZ + 1 + ncap, capEnd, | 
|---|
| 2104 | ncap * sizeof(int) ); | 
|---|
| 2105 | } | 
|---|
| 2106 | mmInNextStack[mmNextStack[--nnext]] = -1; | 
|---|
| 2107 | mmSleeping.insert( i + needSomeSleep, zzZ ); | 
|---|
| 2108 | } | 
|---|
| 2109 | #endif | 
|---|
| 2110 | #endif | 
|---|
| 2111 | } | 
|---|
| 2112 | } | 
|---|
| 2113 | } | 
|---|
| 2114 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 2115 | /* | 
|---|
| 2116 | If we reached the final state, hurray! Copy the captured | 
|---|
| 2117 | zone. | 
|---|
| 2118 | */ | 
|---|
| 2119 | if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) { | 
|---|
| 2120 | memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) ); | 
|---|
| 2121 | memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) ); | 
|---|
| 2122 | } | 
|---|
| 2123 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2124 | /* | 
|---|
| 2125 | It's time to wake up the sleepers. | 
|---|
| 2126 | */ | 
|---|
| 2127 | if ( !mmSleeping.isEmpty() ) { | 
|---|
| 2128 | while ( (zzZ = mmSleeping.take(i)) != 0 ) { | 
|---|
| 2129 | int next = zzZ[0]; | 
|---|
| 2130 | int *capBegin = zzZ + 1; | 
|---|
| 2131 | int *capEnd = zzZ + 1 + ncap; | 
|---|
| 2132 | bool copyOver = TRUE; | 
|---|
| 2133 |  | 
|---|
| 2134 | if ( (m = mmInNextStack[zzZ[0]]) == -1 ) { | 
|---|
| 2135 | m = nnext++; | 
|---|
| 2136 | mmNextStack[m] = next; | 
|---|
| 2137 | mmInNextStack[next] = m; | 
|---|
| 2138 | } else { | 
|---|
| 2139 | copyOver = isBetterCapture( mmNextCapBegin + m * ncap, | 
|---|
| 2140 | mmNextCapEnd + m * ncap, | 
|---|
| 2141 | capBegin, capEnd ); | 
|---|
| 2142 | } | 
|---|
| 2143 | if ( copyOver ) { | 
|---|
| 2144 | memcpy( mmNextCapBegin + m * ncap, capBegin, | 
|---|
| 2145 | ncap * sizeof(int) ); | 
|---|
| 2146 | memcpy( mmNextCapEnd + m * ncap, capEnd, | 
|---|
| 2147 | ncap * sizeof(int) ); | 
|---|
| 2148 | } | 
|---|
| 2149 | delete[] zzZ; | 
|---|
| 2150 | } | 
|---|
| 2151 | } | 
|---|
| 2152 | #endif | 
|---|
| 2153 | #endif | 
|---|
| 2154 | for ( j = 0; j < nnext; j++ ) | 
|---|
| 2155 | mmInNextStack[mmNextStack[j]] = -1; | 
|---|
| 2156 |  | 
|---|
| 2157 | // avoid needless iteration that confuses mmOneTestMatchedLen | 
|---|
| 2158 | if ( nnext == 1 && mmNextStack[0] == FinalState | 
|---|
| 2159 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2160 | && mmSleeping.isEmpty() | 
|---|
| 2161 | #endif | 
|---|
| 2162 | ) | 
|---|
| 2163 | stop = TRUE; | 
|---|
| 2164 |  | 
|---|
| 2165 | qSwap( mmCurStack, mmNextStack ); | 
|---|
| 2166 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 2167 | qSwap( mmCurCapBegin, mmNextCapBegin ); | 
|---|
| 2168 | qSwap( mmCurCapEnd, mmNextCapEnd ); | 
|---|
| 2169 | #endif | 
|---|
| 2170 | ncur = nnext; | 
|---|
| 2171 | nnext = 0; | 
|---|
| 2172 | i++; | 
|---|
| 2173 | } | 
|---|
| 2174 |  | 
|---|
| 2175 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2176 | /* | 
|---|
| 2177 | If minimal matching is enabled, we might have some sleepers | 
|---|
| 2178 | left. | 
|---|
| 2179 | */ | 
|---|
| 2180 | while ( !mmSleeping.isEmpty() ) { | 
|---|
| 2181 | zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) ); | 
|---|
| 2182 | delete[] zzZ; | 
|---|
| 2183 | } | 
|---|
| 2184 | #endif | 
|---|
| 2185 |  | 
|---|
| 2186 | mmOneTestMatchedLen = i - 1; | 
|---|
| 2187 | return ( mmMatchLen >= 0 ); | 
|---|
| 2188 | } | 
|---|
| 2189 |  | 
|---|
| 2190 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2191 |  | 
|---|
| 2192 | QRegExpEngine::CharClass::CharClass() | 
|---|
| 2193 | : c( 0 ), n( FALSE ) | 
|---|
| 2194 | { | 
|---|
| 2195 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2196 | occ1.fill( NoOccurrence, NumBadChars ); | 
|---|
| 2197 | #endif | 
|---|
| 2198 | } | 
|---|
| 2199 |  | 
|---|
| 2200 | QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=( | 
|---|
| 2201 | const CharClass& cc ) | 
|---|
| 2202 | { | 
|---|
| 2203 | c = cc.c; | 
|---|
| 2204 | r = cc.r.copy(); | 
|---|
| 2205 | n = cc.n; | 
|---|
| 2206 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2207 | occ1 = cc.occ1; | 
|---|
| 2208 | #endif | 
|---|
| 2209 | return *this; | 
|---|
| 2210 | } | 
|---|
| 2211 |  | 
|---|
| 2212 | void QRegExpEngine::CharClass::clear() | 
|---|
| 2213 | { | 
|---|
| 2214 | c = 0; | 
|---|
| 2215 | r.resize( 0 ); | 
|---|
| 2216 | n = FALSE; | 
|---|
| 2217 | } | 
|---|
| 2218 |  | 
|---|
| 2219 | void QRegExpEngine::CharClass::setNegative( bool negative ) | 
|---|
| 2220 | { | 
|---|
| 2221 | n = negative; | 
|---|
| 2222 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2223 | occ1.fill( 0, NumBadChars ); | 
|---|
| 2224 | #endif | 
|---|
| 2225 | } | 
|---|
| 2226 |  | 
|---|
| 2227 | void QRegExpEngine::CharClass::addCategories( int cats ) | 
|---|
| 2228 | { | 
|---|
| 2229 | c |= cats; | 
|---|
| 2230 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2231 | occ1.fill( 0, NumBadChars ); | 
|---|
| 2232 | #endif | 
|---|
| 2233 | } | 
|---|
| 2234 |  | 
|---|
| 2235 | void QRegExpEngine::CharClass::addRange( ushort from, ushort to ) | 
|---|
| 2236 | { | 
|---|
| 2237 | if ( from > to ) | 
|---|
| 2238 | qSwap( from, to ); | 
|---|
| 2239 | int m = r.size(); | 
|---|
| 2240 | r.resize( m + 1 ); | 
|---|
| 2241 | r[m].from = from; | 
|---|
| 2242 | r[m].to = to; | 
|---|
| 2243 |  | 
|---|
| 2244 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2245 | int i; | 
|---|
| 2246 |  | 
|---|
| 2247 | if ( to - from < NumBadChars ) { | 
|---|
| 2248 | occ1.detach(); | 
|---|
| 2249 | if ( from % NumBadChars <= to % NumBadChars ) { | 
|---|
| 2250 | for ( i = from % NumBadChars; i <= to % NumBadChars; i++ ) | 
|---|
| 2251 | occ1[i] = 0; | 
|---|
| 2252 | } else { | 
|---|
| 2253 | for ( i = 0; i <= to % NumBadChars; i++ ) | 
|---|
| 2254 | occ1[i] = 0; | 
|---|
| 2255 | for ( i = from % NumBadChars; i < NumBadChars; i++ ) | 
|---|
| 2256 | occ1[i] = 0; | 
|---|
| 2257 | } | 
|---|
| 2258 | } else { | 
|---|
| 2259 | occ1.fill( 0, NumBadChars ); | 
|---|
| 2260 | } | 
|---|
| 2261 | #endif | 
|---|
| 2262 | } | 
|---|
| 2263 |  | 
|---|
| 2264 | bool QRegExpEngine::CharClass::in( QChar ch ) const | 
|---|
| 2265 | { | 
|---|
| 2266 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2267 | if ( occ1[BadChar(ch)] == NoOccurrence ) | 
|---|
| 2268 | return n; | 
|---|
| 2269 | #endif | 
|---|
| 2270 |  | 
|---|
| 2271 | if ( c != 0 && (c & (1 << (int) ch.category())) != 0 ) | 
|---|
| 2272 | return !n; | 
|---|
| 2273 | for ( int i = 0; i < (int) r.size(); i++ ) { | 
|---|
| 2274 | if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to ) | 
|---|
| 2275 | return !n; | 
|---|
| 2276 | } | 
|---|
| 2277 | return n; | 
|---|
| 2278 | } | 
|---|
| 2279 |  | 
|---|
| 2280 | #if defined(QT_DEBUG) | 
|---|
| 2281 | void QRegExpEngine::CharClass::dump() const | 
|---|
| 2282 | { | 
|---|
| 2283 | int i; | 
|---|
| 2284 | qDebug( "    %stive character class", n ? "nega" : "posi" ); | 
|---|
| 2285 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2286 | if ( c != 0 ) | 
|---|
| 2287 | qDebug( "      categories 0x%.8x", c ); | 
|---|
| 2288 | #endif | 
|---|
| 2289 | for ( i = 0; i < (int) r.size(); i++ ) | 
|---|
| 2290 | qDebug( "      0x%.4x through 0x%.4x", r[i].from, r[i].to ); | 
|---|
| 2291 | } | 
|---|
| 2292 | #endif | 
|---|
| 2293 | #endif | 
|---|
| 2294 |  | 
|---|
| 2295 | QRegExpEngine::Box::Box( QRegExpEngine *engine ) | 
|---|
| 2296 | : eng( engine ), skipanchors( 0 ) | 
|---|
| 2297 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2298 | , earlyStart( 0 ), lateStart( 0 ), maxl( 0 ) | 
|---|
| 2299 | #endif | 
|---|
| 2300 | { | 
|---|
| 2301 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2302 | occ1.fill( NoOccurrence, NumBadChars ); | 
|---|
| 2303 | #endif | 
|---|
| 2304 | minl = 0; | 
|---|
| 2305 | } | 
|---|
| 2306 |  | 
|---|
| 2307 | QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b ) | 
|---|
| 2308 | { | 
|---|
| 2309 | eng = b.eng; | 
|---|
| 2310 | ls = b.ls; | 
|---|
| 2311 | rs = b.rs; | 
|---|
| 2312 | lanchors = b.lanchors; | 
|---|
| 2313 | ranchors = b.ranchors; | 
|---|
| 2314 | skipanchors = b.skipanchors; | 
|---|
| 2315 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2316 | earlyStart = b.earlyStart; | 
|---|
| 2317 | lateStart = b.lateStart; | 
|---|
| 2318 | str = b.str; | 
|---|
| 2319 | leftStr = b.leftStr; | 
|---|
| 2320 | rightStr = b.rightStr; | 
|---|
| 2321 | maxl = b.maxl; | 
|---|
| 2322 | occ1 = b.occ1; | 
|---|
| 2323 | #endif | 
|---|
| 2324 | minl = b.minl; | 
|---|
| 2325 | return *this; | 
|---|
| 2326 | } | 
|---|
| 2327 |  | 
|---|
| 2328 | void QRegExpEngine::Box::set( QChar ch ) | 
|---|
| 2329 | { | 
|---|
| 2330 | ls.resize( 1 ); | 
|---|
| 2331 | ls[0] = eng->createState( ch ); | 
|---|
| 2332 | rs = ls; | 
|---|
| 2333 | rs.detach(); | 
|---|
| 2334 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2335 | str = ch; | 
|---|
| 2336 | leftStr = ch; | 
|---|
| 2337 | rightStr = ch; | 
|---|
| 2338 | maxl = 1; | 
|---|
| 2339 | occ1.detach(); | 
|---|
| 2340 | occ1[BadChar(ch)] = 0; | 
|---|
| 2341 | #endif | 
|---|
| 2342 | minl = 1; | 
|---|
| 2343 | } | 
|---|
| 2344 |  | 
|---|
| 2345 | void QRegExpEngine::Box::set( const CharClass& cc ) | 
|---|
| 2346 | { | 
|---|
| 2347 | ls.resize( 1 ); | 
|---|
| 2348 | ls[0] = eng->createState( cc ); | 
|---|
| 2349 | rs = ls; | 
|---|
| 2350 | rs.detach(); | 
|---|
| 2351 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2352 | maxl = 1; | 
|---|
| 2353 | occ1 = cc.firstOccurrence(); | 
|---|
| 2354 | #endif | 
|---|
| 2355 | minl = 1; | 
|---|
| 2356 | } | 
|---|
| 2357 |  | 
|---|
| 2358 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2359 | void QRegExpEngine::Box::set( int bref ) | 
|---|
| 2360 | { | 
|---|
| 2361 | ls.resize( 1 ); | 
|---|
| 2362 | ls[0] = eng->createState( bref ); | 
|---|
| 2363 | rs = ls; | 
|---|
| 2364 | rs.detach(); | 
|---|
| 2365 | if ( bref >= 1 && bref <= MaxBackRefs ) | 
|---|
| 2366 | skipanchors = Anchor_BackRef0Empty << bref; | 
|---|
| 2367 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2368 | maxl = InftyLen; | 
|---|
| 2369 | #endif | 
|---|
| 2370 | minl = 0; | 
|---|
| 2371 | } | 
|---|
| 2372 | #endif | 
|---|
| 2373 |  | 
|---|
| 2374 | void QRegExpEngine::Box::cat( const Box& b ) | 
|---|
| 2375 | { | 
|---|
| 2376 | eng->addCatTransitions( rs, b.ls ); | 
|---|
| 2377 | addAnchorsToEngine( b ); | 
|---|
| 2378 | if ( minl == 0 ) { | 
|---|
| 2379 | mergeInto( &lanchors, b.lanchors ); | 
|---|
| 2380 | if ( skipanchors != 0 ) { | 
|---|
| 2381 | for ( int i = 0; i < (int) b.ls.size(); i++ ) { | 
|---|
| 2382 | int a = eng->anchorConcatenation( at(lanchors, b.ls[i]), | 
|---|
| 2383 | skipanchors ); | 
|---|
| 2384 | lanchors.insert( b.ls[i], a ); | 
|---|
| 2385 | } | 
|---|
| 2386 | } | 
|---|
| 2387 | mergeInto( &ls, b.ls ); | 
|---|
| 2388 | } | 
|---|
| 2389 | if ( b.minl == 0 ) { | 
|---|
| 2390 | mergeInto( &ranchors, b.ranchors ); | 
|---|
| 2391 | if ( b.skipanchors != 0 ) { | 
|---|
| 2392 | for ( int i = 0; i < (int) rs.size(); i++ ) { | 
|---|
| 2393 | int a = eng->anchorConcatenation( at(ranchors, rs[i]), | 
|---|
| 2394 | b.skipanchors ); | 
|---|
| 2395 | ranchors.insert( rs[i], a ); | 
|---|
| 2396 | } | 
|---|
| 2397 | } | 
|---|
| 2398 | mergeInto( &rs, b.rs ); | 
|---|
| 2399 | } else { | 
|---|
| 2400 | ranchors = b.ranchors; | 
|---|
| 2401 | rs = b.rs; | 
|---|
| 2402 | } | 
|---|
| 2403 |  | 
|---|
| 2404 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2405 | if ( maxl != InftyLen ) { | 
|---|
| 2406 | if ( rightStr.length() + b.leftStr.length() > | 
|---|
| 2407 | QMAX(str.length(), b.str.length()) ) { | 
|---|
| 2408 | earlyStart = minl - rightStr.length(); | 
|---|
| 2409 | lateStart = maxl - rightStr.length(); | 
|---|
| 2410 | str = rightStr + b.leftStr; | 
|---|
| 2411 | } else if ( b.str.length() > str.length() ) { | 
|---|
| 2412 | earlyStart = minl + b.earlyStart; | 
|---|
| 2413 | lateStart = maxl + b.lateStart; | 
|---|
| 2414 | str = b.str; | 
|---|
| 2415 | } | 
|---|
| 2416 | } | 
|---|
| 2417 |  | 
|---|
| 2418 | if ( (int) leftStr.length() == maxl ) | 
|---|
| 2419 | leftStr += b.leftStr; | 
|---|
| 2420 |  | 
|---|
| 2421 | if ( (int) b.rightStr.length() == b.maxl ) { | 
|---|
| 2422 | rightStr += b.rightStr; | 
|---|
| 2423 | } else { | 
|---|
| 2424 | rightStr = b.rightStr; | 
|---|
| 2425 | } | 
|---|
| 2426 |  | 
|---|
| 2427 | if ( maxl == InftyLen || b.maxl == InftyLen ) { | 
|---|
| 2428 | maxl = InftyLen; | 
|---|
| 2429 | } else { | 
|---|
| 2430 | maxl += b.maxl; | 
|---|
| 2431 | } | 
|---|
| 2432 |  | 
|---|
| 2433 | occ1.detach(); | 
|---|
| 2434 | for ( int i = 0; i < NumBadChars; i++ ) { | 
|---|
| 2435 | if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] ) | 
|---|
| 2436 | occ1[i] = minl + b.occ1[i]; | 
|---|
| 2437 | } | 
|---|
| 2438 | #endif | 
|---|
| 2439 |  | 
|---|
| 2440 | minl += b.minl; | 
|---|
| 2441 | if ( minl == 0 ) | 
|---|
| 2442 | skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors ); | 
|---|
| 2443 | else | 
|---|
| 2444 | skipanchors = 0; | 
|---|
| 2445 | } | 
|---|
| 2446 |  | 
|---|
| 2447 | void QRegExpEngine::Box::orx( const Box& b ) | 
|---|
| 2448 | { | 
|---|
| 2449 | mergeInto( &ls, b.ls ); | 
|---|
| 2450 | mergeInto( &lanchors, b.lanchors ); | 
|---|
| 2451 | mergeInto( &rs, b.rs ); | 
|---|
| 2452 | mergeInto( &ranchors, b.ranchors ); | 
|---|
| 2453 |  | 
|---|
| 2454 | if ( b.minl == 0 ) { | 
|---|
| 2455 | if ( minl == 0 ) | 
|---|
| 2456 | skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors ); | 
|---|
| 2457 | else | 
|---|
| 2458 | skipanchors = b.skipanchors; | 
|---|
| 2459 | } | 
|---|
| 2460 |  | 
|---|
| 2461 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2462 | occ1.detach(); | 
|---|
| 2463 | for ( int i = 0; i < NumBadChars; i++ ) { | 
|---|
| 2464 | if ( occ1[i] > b.occ1[i] ) | 
|---|
| 2465 | occ1[i] = b.occ1[i]; | 
|---|
| 2466 | } | 
|---|
| 2467 | earlyStart = 0; | 
|---|
| 2468 | lateStart = 0; | 
|---|
| 2469 | str = QString(); | 
|---|
| 2470 | leftStr = QString(); | 
|---|
| 2471 | rightStr = QString(); | 
|---|
| 2472 | if ( b.maxl > maxl ) | 
|---|
| 2473 | maxl = b.maxl; | 
|---|
| 2474 | #endif | 
|---|
| 2475 | if ( b.minl < minl ) | 
|---|
| 2476 | minl = b.minl; | 
|---|
| 2477 | } | 
|---|
| 2478 |  | 
|---|
| 2479 | void QRegExpEngine::Box::plus( int atom ) | 
|---|
| 2480 | { | 
|---|
| 2481 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 2482 | eng->addPlusTransitions( rs, ls, atom ); | 
|---|
| 2483 | #else | 
|---|
| 2484 | Q_UNUSED( atom ); | 
|---|
| 2485 | eng->addCatTransitions( rs, ls ); | 
|---|
| 2486 | #endif | 
|---|
| 2487 | addAnchorsToEngine( *this ); | 
|---|
| 2488 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2489 | maxl = InftyLen; | 
|---|
| 2490 | #endif | 
|---|
| 2491 | } | 
|---|
| 2492 |  | 
|---|
| 2493 | void QRegExpEngine::Box::opt() | 
|---|
| 2494 | { | 
|---|
| 2495 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2496 | earlyStart = 0; | 
|---|
| 2497 | lateStart = 0; | 
|---|
| 2498 | str = QString(); | 
|---|
| 2499 | leftStr = QString(); | 
|---|
| 2500 | rightStr = QString(); | 
|---|
| 2501 | #endif | 
|---|
| 2502 | skipanchors = 0; | 
|---|
| 2503 | minl = 0; | 
|---|
| 2504 | } | 
|---|
| 2505 |  | 
|---|
| 2506 | void QRegExpEngine::Box::catAnchor( int a ) | 
|---|
| 2507 | { | 
|---|
| 2508 | if ( a != 0 ) { | 
|---|
| 2509 | for ( int i = 0; i < (int) rs.size(); i++ ) { | 
|---|
| 2510 | a = eng->anchorConcatenation( at(ranchors, rs[i]), a ); | 
|---|
| 2511 | ranchors.insert( rs[i], a ); | 
|---|
| 2512 | } | 
|---|
| 2513 | if ( minl == 0 ) | 
|---|
| 2514 | skipanchors = eng->anchorConcatenation( skipanchors, a ); | 
|---|
| 2515 | } | 
|---|
| 2516 | } | 
|---|
| 2517 |  | 
|---|
| 2518 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2519 | void QRegExpEngine::Box::setupHeuristics() | 
|---|
| 2520 | { | 
|---|
| 2521 | eng->goodEarlyStart = earlyStart; | 
|---|
| 2522 | eng->goodLateStart = lateStart; | 
|---|
| 2523 | eng->goodStr = eng->cs ? str : str.lower(); | 
|---|
| 2524 |  | 
|---|
| 2525 | eng->minl = minl; | 
|---|
| 2526 | if ( eng->cs ) { | 
|---|
| 2527 | /* | 
|---|
| 2528 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = | 
|---|
| 2529 | 1 at this point. An entry of occ1 has to be at most minl or | 
|---|
| 2530 | infinity for the rest of the algorithm to go well. | 
|---|
| 2531 |  | 
|---|
| 2532 | We waited until here before normalizing these cases (instead of | 
|---|
| 2533 | doing it in Box::orx()) because sometimes things improve by | 
|---|
| 2534 | themselves. Consider for example (112|1)34. | 
|---|
| 2535 | */ | 
|---|
| 2536 | for ( int i = 0; i < NumBadChars; i++ ) { | 
|---|
| 2537 | if ( occ1[i] != NoOccurrence && occ1[i] >= minl ) | 
|---|
| 2538 | occ1[i] = minl; | 
|---|
| 2539 | } | 
|---|
| 2540 | eng->occ1 = occ1; | 
|---|
| 2541 | } else { | 
|---|
| 2542 | eng->occ1.fill( 0, NumBadChars ); | 
|---|
| 2543 | } | 
|---|
| 2544 |  | 
|---|
| 2545 | eng->heuristicallyChooseHeuristic(); | 
|---|
| 2546 | } | 
|---|
| 2547 | #endif | 
|---|
| 2548 |  | 
|---|
| 2549 | #if defined(QT_DEBUG) | 
|---|
| 2550 | void QRegExpEngine::Box::dump() const | 
|---|
| 2551 | { | 
|---|
| 2552 | int i; | 
|---|
| 2553 | qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" ); | 
|---|
| 2554 | qDebug( "  Left states:" ); | 
|---|
| 2555 | for ( i = 0; i < (int) ls.size(); i++ ) { | 
|---|
| 2556 | if ( at(lanchors, ls[i]) == 0 ) | 
|---|
| 2557 | qDebug( "    %d", ls[i] ); | 
|---|
| 2558 | else | 
|---|
| 2559 | qDebug( "    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] ); | 
|---|
| 2560 | } | 
|---|
| 2561 | qDebug( "  Right states:" ); | 
|---|
| 2562 | for ( i = 0; i < (int) rs.size(); i++ ) { | 
|---|
| 2563 | if ( at(ranchors, rs[i]) == 0 ) | 
|---|
| 2564 | qDebug( "    %d", rs[i] ); | 
|---|
| 2565 | else | 
|---|
| 2566 | qDebug( "    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] ); | 
|---|
| 2567 | } | 
|---|
| 2568 | qDebug( "  Skip anchors: 0x%.8x", skipanchors ); | 
|---|
| 2569 | } | 
|---|
| 2570 | #endif | 
|---|
| 2571 |  | 
|---|
| 2572 | void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const | 
|---|
| 2573 | { | 
|---|
| 2574 | for ( int i = 0; i < (int) to.ls.size(); i++ ) { | 
|---|
| 2575 | for ( int j = 0; j < (int) rs.size(); j++ ) { | 
|---|
| 2576 | int a = eng->anchorConcatenation( at(ranchors, rs[j]), | 
|---|
| 2577 | at(to.lanchors, to.ls[i]) ); | 
|---|
| 2578 | eng->addAnchors( rs[j], to.ls[i], a ); | 
|---|
| 2579 | } | 
|---|
| 2580 | } | 
|---|
| 2581 | } | 
|---|
| 2582 |  | 
|---|
| 2583 | int QRegExpEngine::getChar() | 
|---|
| 2584 | { | 
|---|
| 2585 | return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode(); | 
|---|
| 2586 | } | 
|---|
| 2587 |  | 
|---|
| 2588 | int QRegExpEngine::getEscape() | 
|---|
| 2589 | { | 
|---|
| 2590 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 2591 | const char tab[] = "afnrtv"; // no b, as \b means word boundary | 
|---|
| 2592 | const char backTab[] = "\a\f\n\r\t\v"; | 
|---|
| 2593 | ushort low; | 
|---|
| 2594 | int i; | 
|---|
| 2595 | #endif | 
|---|
| 2596 | ushort val; | 
|---|
| 2597 | int prevCh = yyCh; | 
|---|
| 2598 |  | 
|---|
| 2599 | if ( prevCh == EOS ) { | 
|---|
| 2600 | error( RXERR_END ); | 
|---|
| 2601 | return Tok_Char | '\\'; | 
|---|
| 2602 | } | 
|---|
| 2603 | yyCh = getChar(); | 
|---|
| 2604 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 2605 | if ( (prevCh & ~0xff) == 0 ) { | 
|---|
| 2606 | const char *p = strchr( tab, prevCh ); | 
|---|
| 2607 | if ( p != 0 ) | 
|---|
| 2608 | return Tok_Char | backTab[p - tab]; | 
|---|
| 2609 | } | 
|---|
| 2610 | #endif | 
|---|
| 2611 |  | 
|---|
| 2612 | switch ( prevCh ) { | 
|---|
| 2613 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 2614 | case '0': | 
|---|
| 2615 | val = 0; | 
|---|
| 2616 | for ( i = 0; i < 3; i++ ) { | 
|---|
| 2617 | if ( yyCh >= '0' && yyCh <= '7' ) | 
|---|
| 2618 | val = ( val << 3 ) | ( yyCh - '0' ); | 
|---|
| 2619 | else | 
|---|
| 2620 | break; | 
|---|
| 2621 | yyCh = getChar(); | 
|---|
| 2622 | } | 
|---|
| 2623 | if ( (val & ~0377) != 0 ) | 
|---|
| 2624 | error( RXERR_OCTAL ); | 
|---|
| 2625 | return Tok_Char | val; | 
|---|
| 2626 | #endif | 
|---|
| 2627 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 2628 | case 'B': | 
|---|
| 2629 | return Tok_NonWord; | 
|---|
| 2630 | #endif | 
|---|
| 2631 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2632 | case 'D': | 
|---|
| 2633 | // see QChar::isDigit() | 
|---|
| 2634 | yyCharClass->addCategories( 0x7fffffef ); | 
|---|
| 2635 | return Tok_CharClass; | 
|---|
| 2636 | case 'S': | 
|---|
| 2637 | // see QChar::isSpace() | 
|---|
| 2638 | yyCharClass->addCategories( 0x7ffff87f ); | 
|---|
| 2639 | yyCharClass->addRange( 0x0000, 0x0008 ); | 
|---|
| 2640 | yyCharClass->addRange( 0x000e, 0x001f ); | 
|---|
| 2641 | yyCharClass->addRange( 0x007f, 0x009f ); | 
|---|
| 2642 | return Tok_CharClass; | 
|---|
| 2643 | case 'W': | 
|---|
| 2644 | // see QChar::isLetterOrNumber() | 
|---|
| 2645 | yyCharClass->addCategories( 0x7fe07f8f ); | 
|---|
| 2646 | yyCharClass->addRange( 0x203f, 0x2040 ); | 
|---|
| 2647 | yyCharClass->addSingleton( 0x2040 ); | 
|---|
| 2648 | yyCharClass->addSingleton( 0x30fb ); | 
|---|
| 2649 | yyCharClass->addRange( 0xfe33, 0xfe34 ); | 
|---|
| 2650 | yyCharClass->addRange( 0xfe4d, 0xfe4f ); | 
|---|
| 2651 | yyCharClass->addSingleton( 0xff3f ); | 
|---|
| 2652 | yyCharClass->addSingleton( 0xff65 ); | 
|---|
| 2653 | return Tok_CharClass; | 
|---|
| 2654 | #endif | 
|---|
| 2655 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 2656 | case 'b': | 
|---|
| 2657 | return Tok_Word; | 
|---|
| 2658 | #endif | 
|---|
| 2659 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2660 | case 'd': | 
|---|
| 2661 | // see QChar::isDigit() | 
|---|
| 2662 | yyCharClass->addCategories( 0x00000010 ); | 
|---|
| 2663 | return Tok_CharClass; | 
|---|
| 2664 | case 's': | 
|---|
| 2665 | // see QChar::isSpace() | 
|---|
| 2666 | yyCharClass->addCategories( 0x00000380 ); | 
|---|
| 2667 | yyCharClass->addRange( 0x0009, 0x000d ); | 
|---|
| 2668 | return Tok_CharClass; | 
|---|
| 2669 | case 'w': | 
|---|
| 2670 | // see QChar::isLetterOrNumber() | 
|---|
| 2671 | yyCharClass->addCategories( 0x000f8070 ); | 
|---|
| 2672 | yyCharClass->addSingleton( 0x005f ); // '_' | 
|---|
| 2673 | return Tok_CharClass; | 
|---|
| 2674 | #endif | 
|---|
| 2675 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 2676 | case 'x': | 
|---|
| 2677 | val = 0; | 
|---|
| 2678 | for ( i = 0; i < 4; i++ ) { | 
|---|
| 2679 | low = QChar( yyCh ).lower(); | 
|---|
| 2680 | if ( low >= '0' && low <= '9' ) | 
|---|
| 2681 | val = ( val << 4 ) | ( low - '0' ); | 
|---|
| 2682 | else if ( low >= 'a' && low <= 'f' ) | 
|---|
| 2683 | val = ( val << 4 ) | ( low - 'a' + 10 ); | 
|---|
| 2684 | else | 
|---|
| 2685 | break; | 
|---|
| 2686 | yyCh = getChar(); | 
|---|
| 2687 | } | 
|---|
| 2688 | return Tok_Char | val; | 
|---|
| 2689 | #endif | 
|---|
| 2690 | default: | 
|---|
| 2691 | if ( prevCh >= '1' && prevCh <= '9' ) { | 
|---|
| 2692 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2693 | val = prevCh - '0'; | 
|---|
| 2694 | while ( yyCh >= '0' && yyCh <= '9' ) { | 
|---|
| 2695 | val = ( val *= 10 ) | ( yyCh - '0' ); | 
|---|
| 2696 | yyCh = getChar(); | 
|---|
| 2697 | } | 
|---|
| 2698 | return Tok_BackRef | val; | 
|---|
| 2699 | #else | 
|---|
| 2700 | error( RXERR_DISABLED ); | 
|---|
| 2701 | #endif | 
|---|
| 2702 | } | 
|---|
| 2703 | return Tok_Char | prevCh; | 
|---|
| 2704 | } | 
|---|
| 2705 | } | 
|---|
| 2706 |  | 
|---|
| 2707 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 2708 | int QRegExpEngine::getRep( int def ) | 
|---|
| 2709 | { | 
|---|
| 2710 | if ( yyCh >= '0' && yyCh <= '9' ) { | 
|---|
| 2711 | int rep = 0; | 
|---|
| 2712 | do { | 
|---|
| 2713 | rep = 10 * rep + yyCh - '0'; | 
|---|
| 2714 | if ( rep >= InftyRep ) { | 
|---|
| 2715 | error( RXERR_REPETITION ); | 
|---|
| 2716 | rep = def; | 
|---|
| 2717 | } | 
|---|
| 2718 | yyCh = getChar(); | 
|---|
| 2719 | } while ( yyCh >= '0' && yyCh <= '9' ); | 
|---|
| 2720 | return rep; | 
|---|
| 2721 | } else { | 
|---|
| 2722 | return def; | 
|---|
| 2723 | } | 
|---|
| 2724 | } | 
|---|
| 2725 | #endif | 
|---|
| 2726 |  | 
|---|
| 2727 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 2728 | void QRegExpEngine::skipChars( int n ) | 
|---|
| 2729 | { | 
|---|
| 2730 | if ( n > 0 ) { | 
|---|
| 2731 | yyPos += n - 1; | 
|---|
| 2732 | yyCh = getChar(); | 
|---|
| 2733 | } | 
|---|
| 2734 | } | 
|---|
| 2735 | #endif | 
|---|
| 2736 |  | 
|---|
| 2737 | void QRegExpEngine::error( const char *msg ) | 
|---|
| 2738 | { | 
|---|
| 2739 | if ( yyError.isEmpty() ) | 
|---|
| 2740 | yyError = QString::fromLatin1( msg ); | 
|---|
| 2741 | } | 
|---|
| 2742 |  | 
|---|
| 2743 | void QRegExpEngine::startTokenizer( const QChar *rx, int len ) | 
|---|
| 2744 | { | 
|---|
| 2745 | yyIn = rx; | 
|---|
| 2746 | yyPos0 = 0; | 
|---|
| 2747 | yyPos = 0; | 
|---|
| 2748 | yyLen = len; | 
|---|
| 2749 | yyCh = getChar(); | 
|---|
| 2750 | yyCharClass = new CharClass; | 
|---|
| 2751 | yyMinRep = 0; | 
|---|
| 2752 | yyMaxRep = 0; | 
|---|
| 2753 | yyError = QString(); | 
|---|
| 2754 | } | 
|---|
| 2755 |  | 
|---|
| 2756 | int QRegExpEngine::getToken() | 
|---|
| 2757 | { | 
|---|
| 2758 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2759 | ushort pendingCh = 0; | 
|---|
| 2760 | bool charPending; | 
|---|
| 2761 | bool rangePending; | 
|---|
| 2762 | int tok; | 
|---|
| 2763 | #endif | 
|---|
| 2764 | int prevCh = yyCh; | 
|---|
| 2765 |  | 
|---|
| 2766 | yyPos0 = yyPos - 1; | 
|---|
| 2767 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2768 | yyCharClass->clear(); | 
|---|
| 2769 | #endif | 
|---|
| 2770 | yyMinRep = 0; | 
|---|
| 2771 | yyMaxRep = 0; | 
|---|
| 2772 | yyCh = getChar(); | 
|---|
| 2773 |  | 
|---|
| 2774 | switch ( prevCh ) { | 
|---|
| 2775 | case EOS: | 
|---|
| 2776 | yyPos0 = yyPos; | 
|---|
| 2777 | return Tok_Eos; | 
|---|
| 2778 | case '$': | 
|---|
| 2779 | return Tok_Dollar; | 
|---|
| 2780 | case '(': | 
|---|
| 2781 | if ( yyCh == '?' ) { | 
|---|
| 2782 | prevCh = getChar(); | 
|---|
| 2783 | yyCh = getChar(); | 
|---|
| 2784 | switch ( prevCh ) { | 
|---|
| 2785 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 2786 | case '!': | 
|---|
| 2787 | return Tok_NegLookahead; | 
|---|
| 2788 | case '=': | 
|---|
| 2789 | return Tok_PosLookahead; | 
|---|
| 2790 | #endif | 
|---|
| 2791 | case ':': | 
|---|
| 2792 | return Tok_MagicLeftParen; | 
|---|
| 2793 | default: | 
|---|
| 2794 | error( RXERR_LOOKAHEAD ); | 
|---|
| 2795 | return Tok_MagicLeftParen; | 
|---|
| 2796 | } | 
|---|
| 2797 | } else { | 
|---|
| 2798 | return Tok_LeftParen; | 
|---|
| 2799 | } | 
|---|
| 2800 | case ')': | 
|---|
| 2801 | return Tok_RightParen; | 
|---|
| 2802 | case '*': | 
|---|
| 2803 | yyMinRep = 0; | 
|---|
| 2804 | yyMaxRep = InftyRep; | 
|---|
| 2805 | return Tok_Quantifier; | 
|---|
| 2806 | case '+': | 
|---|
| 2807 | yyMinRep = 1; | 
|---|
| 2808 | yyMaxRep = InftyRep; | 
|---|
| 2809 | return Tok_Quantifier; | 
|---|
| 2810 | case '.': | 
|---|
| 2811 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2812 | yyCharClass->setNegative( TRUE ); | 
|---|
| 2813 | #endif | 
|---|
| 2814 | return Tok_CharClass; | 
|---|
| 2815 | case '?': | 
|---|
| 2816 | yyMinRep = 0; | 
|---|
| 2817 | yyMaxRep = 1; | 
|---|
| 2818 | return Tok_Quantifier; | 
|---|
| 2819 | case '[': | 
|---|
| 2820 | #ifndef QT_NO_REGEXP_CCLASS | 
|---|
| 2821 | if ( yyCh == '^' ) { | 
|---|
| 2822 | yyCharClass->setNegative( TRUE ); | 
|---|
| 2823 | yyCh = getChar(); | 
|---|
| 2824 | } | 
|---|
| 2825 | charPending = FALSE; | 
|---|
| 2826 | rangePending = FALSE; | 
|---|
| 2827 | do { | 
|---|
| 2828 | if ( yyCh == '-' && charPending && !rangePending ) { | 
|---|
| 2829 | rangePending = TRUE; | 
|---|
| 2830 | yyCh = getChar(); | 
|---|
| 2831 | } else { | 
|---|
| 2832 | if ( charPending && !rangePending ) { | 
|---|
| 2833 | yyCharClass->addSingleton( pendingCh ); | 
|---|
| 2834 | charPending = FALSE; | 
|---|
| 2835 | } | 
|---|
| 2836 | if ( yyCh == '\\' ) { | 
|---|
| 2837 | yyCh = getChar(); | 
|---|
| 2838 | tok = getEscape(); | 
|---|
| 2839 | if ( tok == Tok_Word ) | 
|---|
| 2840 | tok = '\b'; | 
|---|
| 2841 | } else { | 
|---|
| 2842 | tok = Tok_Char | yyCh; | 
|---|
| 2843 | yyCh = getChar(); | 
|---|
| 2844 | } | 
|---|
| 2845 | if ( tok == Tok_CharClass ) { | 
|---|
| 2846 | if ( rangePending ) { | 
|---|
| 2847 | yyCharClass->addSingleton( '-' ); | 
|---|
| 2848 | yyCharClass->addSingleton( pendingCh ); | 
|---|
| 2849 | charPending = FALSE; | 
|---|
| 2850 | rangePending = FALSE; | 
|---|
| 2851 | } | 
|---|
| 2852 | } else if ( (tok & Tok_Char) != 0 ) { | 
|---|
| 2853 | if ( rangePending ) { | 
|---|
| 2854 | yyCharClass->addRange( pendingCh, tok ^ Tok_Char ); | 
|---|
| 2855 | charPending = FALSE; | 
|---|
| 2856 | rangePending = FALSE; | 
|---|
| 2857 | } else { | 
|---|
| 2858 | pendingCh = tok ^ Tok_Char; | 
|---|
| 2859 | charPending = TRUE; | 
|---|
| 2860 | } | 
|---|
| 2861 | } else { | 
|---|
| 2862 | error( RXERR_CHARCLASS ); | 
|---|
| 2863 | } | 
|---|
| 2864 | } | 
|---|
| 2865 | }  while ( yyCh != ']' && yyCh != EOS ); | 
|---|
| 2866 | if ( rangePending ) | 
|---|
| 2867 | yyCharClass->addSingleton( '-' ); | 
|---|
| 2868 | if ( charPending ) | 
|---|
| 2869 | yyCharClass->addSingleton( pendingCh ); | 
|---|
| 2870 | if ( yyCh == EOS ) | 
|---|
| 2871 | error( RXERR_END ); | 
|---|
| 2872 | else | 
|---|
| 2873 | yyCh = getChar(); | 
|---|
| 2874 | return Tok_CharClass; | 
|---|
| 2875 | #else | 
|---|
| 2876 | error( RXERR_END ); | 
|---|
| 2877 | return Tok_Char | '['; | 
|---|
| 2878 | #endif | 
|---|
| 2879 | case '\\': | 
|---|
| 2880 | return getEscape(); | 
|---|
| 2881 | case ']': | 
|---|
| 2882 | error( RXERR_LEFTDELIM ); | 
|---|
| 2883 | return Tok_Char | ']'; | 
|---|
| 2884 | case '^': | 
|---|
| 2885 | return Tok_Caret; | 
|---|
| 2886 | case '{': | 
|---|
| 2887 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 2888 | yyMinRep = getRep( 0 ); | 
|---|
| 2889 | yyMaxRep = yyMinRep; | 
|---|
| 2890 | if ( yyCh == ',' ) { | 
|---|
| 2891 | yyCh = getChar(); | 
|---|
| 2892 | yyMaxRep = getRep( InftyRep ); | 
|---|
| 2893 | } | 
|---|
| 2894 | if ( yyMaxRep < yyMinRep ) | 
|---|
| 2895 | qSwap( yyMinRep, yyMaxRep ); | 
|---|
| 2896 | if ( yyCh != '}' ) | 
|---|
| 2897 | error( RXERR_REPETITION ); | 
|---|
| 2898 | yyCh = getChar(); | 
|---|
| 2899 | return Tok_Quantifier; | 
|---|
| 2900 | #else | 
|---|
| 2901 | error( RXERR_DISABLED ); | 
|---|
| 2902 | return Tok_Char | '{'; | 
|---|
| 2903 | #endif | 
|---|
| 2904 | case '|': | 
|---|
| 2905 | return Tok_Bar; | 
|---|
| 2906 | case '}': | 
|---|
| 2907 | error( RXERR_LEFTDELIM ); | 
|---|
| 2908 | return Tok_Char | '}'; | 
|---|
| 2909 | default: | 
|---|
| 2910 | return Tok_Char | prevCh; | 
|---|
| 2911 | } | 
|---|
| 2912 | } | 
|---|
| 2913 |  | 
|---|
| 2914 | int QRegExpEngine::parse( const QChar *pattern, int len ) | 
|---|
| 2915 | { | 
|---|
| 2916 | valid = TRUE; | 
|---|
| 2917 | startTokenizer( pattern, len ); | 
|---|
| 2918 | yyTok = getToken(); | 
|---|
| 2919 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 2920 | yyMayCapture = TRUE; | 
|---|
| 2921 | #else | 
|---|
| 2922 | yyMayCapture = FALSE; | 
|---|
| 2923 | #endif | 
|---|
| 2924 |  | 
|---|
| 2925 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 2926 | int atom = startAtom( FALSE ); | 
|---|
| 2927 | #endif | 
|---|
| 2928 | CharClass anything; | 
|---|
| 2929 | Box box( this ); // create InitialState | 
|---|
| 2930 | box.set( anything ); | 
|---|
| 2931 | Box rightBox( this ); // create FinalState | 
|---|
| 2932 | rightBox.set( anything ); | 
|---|
| 2933 |  | 
|---|
| 2934 | Box middleBox( this ); | 
|---|
| 2935 | parseExpression( &middleBox ); | 
|---|
| 2936 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 2937 | finishAtom( atom ); | 
|---|
| 2938 | #endif | 
|---|
| 2939 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2940 | middleBox.setupHeuristics(); | 
|---|
| 2941 | #endif | 
|---|
| 2942 | box.cat( middleBox ); | 
|---|
| 2943 | box.cat( rightBox ); | 
|---|
| 2944 | delete yyCharClass; | 
|---|
| 2945 | yyCharClass = 0; | 
|---|
| 2946 |  | 
|---|
| 2947 | officialncap = ncap; | 
|---|
| 2948 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 2949 | if ( nbrefs > ncap ) | 
|---|
| 2950 | ncap = nbrefs; | 
|---|
| 2951 | #endif | 
|---|
| 2952 |  | 
|---|
| 2953 | /* | 
|---|
| 2954 | We use one QMemArray<int> for all the big data used a lot in | 
|---|
| 2955 | matchHere() and friends. | 
|---|
| 2956 | */ | 
|---|
| 2957 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2958 | mmSlideTabSize = QMAX( minl + 1, 16 ); | 
|---|
| 2959 | #else | 
|---|
| 2960 | mmSlideTabSize = 0; | 
|---|
| 2961 | #endif | 
|---|
| 2962 | mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize ); | 
|---|
| 2963 |  | 
|---|
| 2964 | mmInNextStack = mmBigArray.data(); | 
|---|
| 2965 | memset( mmInNextStack, -1, ns * sizeof(int) ); | 
|---|
| 2966 | mmCurStack = mmInNextStack + ns; | 
|---|
| 2967 | mmNextStack = mmInNextStack + 2 * ns; | 
|---|
| 2968 |  | 
|---|
| 2969 | mmCurCapBegin = mmInNextStack + 3 * ns; | 
|---|
| 2970 | mmNextCapBegin = mmCurCapBegin + ncap * ns; | 
|---|
| 2971 | mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns; | 
|---|
| 2972 | mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns; | 
|---|
| 2973 |  | 
|---|
| 2974 | mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns; | 
|---|
| 2975 | mmTempCapEnd = mmTempCapBegin + ncap; | 
|---|
| 2976 | mmCapBegin = mmTempCapBegin + 2 * ncap; | 
|---|
| 2977 | mmCapEnd = mmTempCapBegin + 3 * ncap; | 
|---|
| 2978 |  | 
|---|
| 2979 | mmSlideTab = mmTempCapBegin + 4 * ncap; | 
|---|
| 2980 |  | 
|---|
| 2981 | if ( !yyError.isEmpty() ) | 
|---|
| 2982 | return -1; | 
|---|
| 2983 |  | 
|---|
| 2984 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 2985 | State *sinit = s[InitialState]; | 
|---|
| 2986 | caretAnchored = ( sinit->anchors != 0 ); | 
|---|
| 2987 | if ( caretAnchored ) { | 
|---|
| 2988 | QMap<int, int>& anchors = *sinit->anchors; | 
|---|
| 2989 | QMap<int, int>::ConstIterator a; | 
|---|
| 2990 | for ( a = anchors.begin(); a != anchors.end(); ++a ) { | 
|---|
| 2991 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | 
|---|
| 2992 | if ( (*a & Anchor_Alternation) != 0 ) | 
|---|
| 2993 | break; | 
|---|
| 2994 | #endif | 
|---|
| 2995 | if ( (*a & Anchor_Caret) == 0 ) { | 
|---|
| 2996 | caretAnchored = FALSE; | 
|---|
| 2997 | break; | 
|---|
| 2998 | } | 
|---|
| 2999 | } | 
|---|
| 3000 | } | 
|---|
| 3001 | #endif | 
|---|
| 3002 | return yyPos0; | 
|---|
| 3003 | } | 
|---|
| 3004 |  | 
|---|
| 3005 | void QRegExpEngine::parseAtom( Box *box ) | 
|---|
| 3006 | { | 
|---|
| 3007 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 3008 | QRegExpEngine *eng = 0; | 
|---|
| 3009 | bool neg; | 
|---|
| 3010 | int len; | 
|---|
| 3011 | #endif | 
|---|
| 3012 |  | 
|---|
| 3013 | if ( (yyTok & Tok_Char) != 0 ) { | 
|---|
| 3014 | box->set( QChar(yyTok ^ Tok_Char) ); | 
|---|
| 3015 | } else { | 
|---|
| 3016 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3017 | trivial = FALSE; | 
|---|
| 3018 | #endif | 
|---|
| 3019 | switch ( yyTok ) { | 
|---|
| 3020 | case Tok_Dollar: | 
|---|
| 3021 | box->catAnchor( Anchor_Dollar ); | 
|---|
| 3022 | break; | 
|---|
| 3023 | case Tok_Caret: | 
|---|
| 3024 | box->catAnchor( Anchor_Caret ); | 
|---|
| 3025 | break; | 
|---|
| 3026 | #ifndef QT_NO_REGEXP_LOOKAHEAD | 
|---|
| 3027 | case Tok_PosLookahead: | 
|---|
| 3028 | case Tok_NegLookahead: | 
|---|
| 3029 | neg = ( yyTok == Tok_NegLookahead ); | 
|---|
| 3030 | eng = new QRegExpEngine( cs ); | 
|---|
| 3031 | len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 ); | 
|---|
| 3032 | if ( len >= 0 ) | 
|---|
| 3033 | skipChars( len ); | 
|---|
| 3034 | else | 
|---|
| 3035 | error( RXERR_LOOKAHEAD ); | 
|---|
| 3036 | box->catAnchor( addLookahead(eng, neg) ); | 
|---|
| 3037 | yyTok = getToken(); | 
|---|
| 3038 | if ( yyTok != Tok_RightParen ) | 
|---|
| 3039 | error( RXERR_LOOKAHEAD ); | 
|---|
| 3040 | break; | 
|---|
| 3041 | #endif | 
|---|
| 3042 | #ifndef QT_NO_REGEXP_ESCAPE | 
|---|
| 3043 | case Tok_Word: | 
|---|
| 3044 | box->catAnchor( Anchor_Word ); | 
|---|
| 3045 | break; | 
|---|
| 3046 | case Tok_NonWord: | 
|---|
| 3047 | box->catAnchor( Anchor_NonWord ); | 
|---|
| 3048 | break; | 
|---|
| 3049 | #endif | 
|---|
| 3050 | case Tok_LeftParen: | 
|---|
| 3051 | case Tok_MagicLeftParen: | 
|---|
| 3052 | yyTok = getToken(); | 
|---|
| 3053 | parseExpression( box ); | 
|---|
| 3054 | if ( yyTok != Tok_RightParen ) | 
|---|
| 3055 | error( RXERR_END ); | 
|---|
| 3056 | break; | 
|---|
| 3057 | case Tok_CharClass: | 
|---|
| 3058 | box->set( *yyCharClass ); | 
|---|
| 3059 | break; | 
|---|
| 3060 | case Tok_Quantifier: | 
|---|
| 3061 | error( RXERR_REPETITION ); | 
|---|
| 3062 | break; | 
|---|
| 3063 | default: | 
|---|
| 3064 | #ifndef QT_NO_REGEXP_BACKREF | 
|---|
| 3065 | if ( (yyTok & Tok_BackRef) != 0 ) | 
|---|
| 3066 | box->set( yyTok ^ Tok_BackRef ); | 
|---|
| 3067 | else | 
|---|
| 3068 | #endif | 
|---|
| 3069 | error( RXERR_DISABLED ); | 
|---|
| 3070 | } | 
|---|
| 3071 | } | 
|---|
| 3072 | yyTok = getToken(); | 
|---|
| 3073 | } | 
|---|
| 3074 |  | 
|---|
| 3075 | void QRegExpEngine::parseFactor( Box *box ) | 
|---|
| 3076 | { | 
|---|
| 3077 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 3078 | int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen ); | 
|---|
| 3079 | #else | 
|---|
| 3080 | static const int atom = 0; | 
|---|
| 3081 | #endif | 
|---|
| 3082 |  | 
|---|
| 3083 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 3084 | #define YYREDO() \ | 
|---|
| 3085 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ | 
|---|
| 3086 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok | 
|---|
| 3087 |  | 
|---|
| 3088 | const QChar *in = yyIn; | 
|---|
| 3089 | int pos0 = yyPos0; | 
|---|
| 3090 | int pos = yyPos; | 
|---|
| 3091 | int len = yyLen; | 
|---|
| 3092 | int ch = yyCh; | 
|---|
| 3093 | CharClass charClass; | 
|---|
| 3094 | if ( yyTok == Tok_CharClass ) | 
|---|
| 3095 | charClass = *yyCharClass; | 
|---|
| 3096 | int tok = yyTok; | 
|---|
| 3097 | bool mayCapture = yyMayCapture; | 
|---|
| 3098 | #endif | 
|---|
| 3099 |  | 
|---|
| 3100 | parseAtom( box ); | 
|---|
| 3101 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 3102 | finishAtom( atom ); | 
|---|
| 3103 | #endif | 
|---|
| 3104 |  | 
|---|
| 3105 | if ( yyTok == Tok_Quantifier ) { | 
|---|
| 3106 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3107 | trivial = FALSE; | 
|---|
| 3108 | #endif | 
|---|
| 3109 | if ( yyMaxRep == InftyRep ) { | 
|---|
| 3110 | box->plus( atom ); | 
|---|
| 3111 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 3112 | } else if ( yyMaxRep == 0 ) { | 
|---|
| 3113 | box->clear(); | 
|---|
| 3114 | #endif | 
|---|
| 3115 | } | 
|---|
| 3116 | if ( yyMinRep == 0 ) | 
|---|
| 3117 | box->opt(); | 
|---|
| 3118 |  | 
|---|
| 3119 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 3120 | yyMayCapture = FALSE; | 
|---|
| 3121 | int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1; | 
|---|
| 3122 | int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 ); | 
|---|
| 3123 |  | 
|---|
| 3124 | Box rightBox( this ); | 
|---|
| 3125 | int i; | 
|---|
| 3126 |  | 
|---|
| 3127 | for ( i = 0; i < beta; i++ ) { | 
|---|
| 3128 | YYREDO(); | 
|---|
| 3129 | Box leftBox( this ); | 
|---|
| 3130 | parseAtom( &leftBox ); | 
|---|
| 3131 | leftBox.cat( rightBox ); | 
|---|
| 3132 | leftBox.opt(); | 
|---|
| 3133 | rightBox = leftBox; | 
|---|
| 3134 | } | 
|---|
| 3135 | for ( i = 0; i < alpha; i++ ) { | 
|---|
| 3136 | YYREDO(); | 
|---|
| 3137 | Box leftBox( this ); | 
|---|
| 3138 | parseAtom( &leftBox ); | 
|---|
| 3139 | leftBox.cat( rightBox ); | 
|---|
| 3140 | rightBox = leftBox; | 
|---|
| 3141 | } | 
|---|
| 3142 | rightBox.cat( *box ); | 
|---|
| 3143 | *box = rightBox; | 
|---|
| 3144 | #endif | 
|---|
| 3145 | yyTok = getToken(); | 
|---|
| 3146 | #ifndef QT_NO_REGEXP_INTERVAL | 
|---|
| 3147 | yyMayCapture = mayCapture; | 
|---|
| 3148 | #endif | 
|---|
| 3149 | } | 
|---|
| 3150 | #undef YYREDO | 
|---|
| 3151 | } | 
|---|
| 3152 |  | 
|---|
| 3153 | void QRegExpEngine::parseTerm( Box *box ) | 
|---|
| 3154 | { | 
|---|
| 3155 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3156 | if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) | 
|---|
| 3157 | parseFactor( box ); | 
|---|
| 3158 | #endif | 
|---|
| 3159 | while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) { | 
|---|
| 3160 | Box rightBox( this ); | 
|---|
| 3161 | parseFactor( &rightBox ); | 
|---|
| 3162 | box->cat( rightBox ); | 
|---|
| 3163 | } | 
|---|
| 3164 | } | 
|---|
| 3165 |  | 
|---|
| 3166 | void QRegExpEngine::parseExpression( Box *box ) | 
|---|
| 3167 | { | 
|---|
| 3168 | parseTerm( box ); | 
|---|
| 3169 | while ( yyTok == Tok_Bar ) { | 
|---|
| 3170 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3171 | trivial = FALSE; | 
|---|
| 3172 | #endif | 
|---|
| 3173 | Box rightBox( this ); | 
|---|
| 3174 | yyTok = getToken(); | 
|---|
| 3175 | parseTerm( &rightBox ); | 
|---|
| 3176 | box->orx( rightBox ); | 
|---|
| 3177 | } | 
|---|
| 3178 | } | 
|---|
| 3179 |  | 
|---|
| 3180 | /* | 
|---|
| 3181 | The struct QRegExpPrivate contains the private data of a regular | 
|---|
| 3182 | expression other than the automaton. It makes it possible for many | 
|---|
| 3183 | QRegExp objects to use the same QRegExpEngine object with different | 
|---|
| 3184 | QRegExpPrivate objects. | 
|---|
| 3185 | */ | 
|---|
| 3186 | struct QRegExpPrivate | 
|---|
| 3187 | { | 
|---|
| 3188 | QString pattern; // regular-expression or wildcard pattern | 
|---|
| 3189 | QString rxpattern; // regular-expression pattern | 
|---|
| 3190 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3191 | bool wc : 1; // wildcard mode? | 
|---|
| 3192 | #endif | 
|---|
| 3193 | bool min : 1; // minimal matching? (instead of maximal) | 
|---|
| 3194 | bool cs : 1; // case sensitive? | 
|---|
| 3195 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 3196 | QString t; // last string passed to QRegExp::search() or searchRev() | 
|---|
| 3197 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last | 
|---|
| 3198 | #endif | 
|---|
| 3199 | QMemArray<int> captured; // what QRegExpEngine::search() returned last | 
|---|
| 3200 |  | 
|---|
| 3201 | QRegExpPrivate() { captured.fill( -1, 2 ); } | 
|---|
| 3202 | }; | 
|---|
| 3203 |  | 
|---|
| 3204 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3205 | static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache; | 
|---|
| 3206 | #  ifndef QT_THREAD_SUPPORT | 
|---|
| 3207 | static QCache<QRegExpEngine> *engineCache = 0; | 
|---|
| 3208 | #  endif // QT_THREAD_SUPPORT | 
|---|
| 3209 | #endif // QT_NO_REGEXP_OPTIM | 
|---|
| 3210 |  | 
|---|
| 3211 | static void regexpEngine( QRegExpEngine *&eng, const QString &pattern, | 
|---|
| 3212 | bool caseSensitive, bool deref ) | 
|---|
| 3213 | { | 
|---|
| 3214 | #  ifdef QT_THREAD_SUPPORT | 
|---|
| 3215 | static QThreadStorage<QCache<QRegExpEngine> *> engineCaches; | 
|---|
| 3216 | QCache<QRegExpEngine> *&engineCache = engineCaches.localData(); | 
|---|
| 3217 | #endif // QT_THREAD_SUPPORT | 
|---|
| 3218 |  | 
|---|
| 3219 | if ( !deref ) { | 
|---|
| 3220 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3221 | if ( engineCache != 0 ) { | 
|---|
| 3222 | eng = engineCache->take( pattern ); | 
|---|
| 3223 | if ( eng == 0 || eng->caseSensitive() != caseSensitive ) { | 
|---|
| 3224 | delete eng; | 
|---|
| 3225 | } else { | 
|---|
| 3226 | eng->ref(); | 
|---|
| 3227 | return; | 
|---|
| 3228 | } | 
|---|
| 3229 | } | 
|---|
| 3230 | #endif // QT_NO_REGEXP_OPTIM | 
|---|
| 3231 | eng = new QRegExpEngine( pattern, caseSensitive ); | 
|---|
| 3232 | return; | 
|---|
| 3233 | } | 
|---|
| 3234 |  | 
|---|
| 3235 | if ( eng->deref() ) { | 
|---|
| 3236 | #ifndef QT_NO_REGEXP_OPTIM | 
|---|
| 3237 | if ( engineCache == 0 ) { | 
|---|
| 3238 | engineCache = new QCache<QRegExpEngine>; | 
|---|
| 3239 | engineCache->setAutoDelete( TRUE ); | 
|---|
| 3240 | #  ifndef QT_THREAD_SUPPORT | 
|---|
| 3241 | cleanup_cache.set( &engineCache ); | 
|---|
| 3242 | #  endif // !QT_THREAD_SUPPORT | 
|---|
| 3243 | } | 
|---|
| 3244 | if ( !pattern.isNull() && | 
|---|
| 3245 | engineCache->insert(pattern, eng, 4 + pattern.length() / 4) ) | 
|---|
| 3246 | return; | 
|---|
| 3247 | #else | 
|---|
| 3248 | Q_UNUSED( pattern ); | 
|---|
| 3249 | #endif // QT_NO_REGEXP_OPTIM | 
|---|
| 3250 | delete eng; | 
|---|
| 3251 | eng = 0; | 
|---|
| 3252 | } | 
|---|
| 3253 | } | 
|---|
| 3254 |  | 
|---|
| 3255 | /*! | 
|---|
| 3256 | \enum QRegExp::CaretMode | 
|---|
| 3257 |  | 
|---|
| 3258 | The CaretMode enum defines the different meanings of the caret | 
|---|
| 3259 | (<b>^</b>) in a regular expression. The possible values are: | 
|---|
| 3260 |  | 
|---|
| 3261 | \value CaretAtZero | 
|---|
| 3262 | The caret corresponds to index 0 in the searched string. | 
|---|
| 3263 |  | 
|---|
| 3264 | \value CaretAtOffset | 
|---|
| 3265 | The caret corresponds to the start offset of the search. | 
|---|
| 3266 |  | 
|---|
| 3267 | \value CaretWontMatch | 
|---|
| 3268 | The caret never matches. | 
|---|
| 3269 | */ | 
|---|
| 3270 |  | 
|---|
| 3271 | /*! | 
|---|
| 3272 | Constructs an empty regexp. | 
|---|
| 3273 |  | 
|---|
| 3274 | \sa isValid() errorString() | 
|---|
| 3275 | */ | 
|---|
| 3276 | QRegExp::QRegExp() | 
|---|
| 3277 | : eng( 0 ) | 
|---|
| 3278 | { | 
|---|
| 3279 | priv = new QRegExpPrivate; | 
|---|
| 3280 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3281 | priv->wc = FALSE; | 
|---|
| 3282 | #endif | 
|---|
| 3283 | priv->min = FALSE; | 
|---|
| 3284 | priv->cs = TRUE; | 
|---|
| 3285 | } | 
|---|
| 3286 |  | 
|---|
| 3287 | /*! | 
|---|
| 3288 | Constructs a regular expression object for the given \a pattern | 
|---|
| 3289 | string. The pattern must be given using wildcard notation if \a | 
|---|
| 3290 | wildcard is TRUE (default is FALSE). The pattern is case | 
|---|
| 3291 | sensitive, unless \a caseSensitive is FALSE. Matching is greedy | 
|---|
| 3292 | (maximal), but can be changed by calling setMinimal(). | 
|---|
| 3293 |  | 
|---|
| 3294 | \sa setPattern() setCaseSensitive() setWildcard() setMinimal() | 
|---|
| 3295 | */ | 
|---|
| 3296 | QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard ) | 
|---|
| 3297 | : eng( 0 ) | 
|---|
| 3298 | { | 
|---|
| 3299 | priv = new QRegExpPrivate; | 
|---|
| 3300 | priv->pattern = pattern; | 
|---|
| 3301 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3302 | priv->wc = wildcard; | 
|---|
| 3303 | #endif | 
|---|
| 3304 | priv->min = FALSE; | 
|---|
| 3305 | priv->cs = caseSensitive; | 
|---|
| 3306 | } | 
|---|
| 3307 |  | 
|---|
| 3308 | /*! | 
|---|
| 3309 | Constructs a regular expression as a copy of \a rx. | 
|---|
| 3310 |  | 
|---|
| 3311 | \sa operator=() | 
|---|
| 3312 | */ | 
|---|
| 3313 | QRegExp::QRegExp( const QRegExp& rx ) | 
|---|
| 3314 | : eng( 0 ) | 
|---|
| 3315 | { | 
|---|
| 3316 | priv = new QRegExpPrivate; | 
|---|
| 3317 | operator=( rx ); | 
|---|
| 3318 | } | 
|---|
| 3319 |  | 
|---|
| 3320 | /*! | 
|---|
| 3321 | Destroys the regular expression and cleans up its internal data. | 
|---|
| 3322 | */ | 
|---|
| 3323 | QRegExp::~QRegExp() | 
|---|
| 3324 | { | 
|---|
| 3325 | invalidateEngine(); | 
|---|
| 3326 | delete priv; | 
|---|
| 3327 | } | 
|---|
| 3328 |  | 
|---|
| 3329 | /*! | 
|---|
| 3330 | Copies the regular expression \a rx and returns a reference to the | 
|---|
| 3331 | copy. The case sensitivity, wildcard and minimal matching options | 
|---|
| 3332 | are also copied. | 
|---|
| 3333 | */ | 
|---|
| 3334 | QRegExp& QRegExp::operator=( const QRegExp& rx ) | 
|---|
| 3335 | { | 
|---|
| 3336 | QRegExpEngine *otherEng = rx.eng; | 
|---|
| 3337 | if ( otherEng != 0 ) | 
|---|
| 3338 | otherEng->ref(); | 
|---|
| 3339 | invalidateEngine(); | 
|---|
| 3340 | eng = otherEng; | 
|---|
| 3341 | priv->pattern = rx.priv->pattern; | 
|---|
| 3342 | priv->rxpattern = rx.priv->rxpattern; | 
|---|
| 3343 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3344 | priv->wc = rx.priv->wc; | 
|---|
| 3345 | #endif | 
|---|
| 3346 | priv->min = rx.priv->min; | 
|---|
| 3347 | priv->cs = rx.priv->cs; | 
|---|
| 3348 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 3349 | priv->t = rx.priv->t; | 
|---|
| 3350 | priv->capturedCache = rx.priv->capturedCache; | 
|---|
| 3351 | #endif | 
|---|
| 3352 | priv->captured = rx.priv->captured; | 
|---|
| 3353 | return *this; | 
|---|
| 3354 | } | 
|---|
| 3355 |  | 
|---|
| 3356 | /*! | 
|---|
| 3357 | Returns TRUE if this regular expression is equal to \a rx; | 
|---|
| 3358 | otherwise returns FALSE. | 
|---|
| 3359 |  | 
|---|
| 3360 | Two QRegExp objects are equal if they have the same pattern | 
|---|
| 3361 | strings and the same settings for case sensitivity, wildcard and | 
|---|
| 3362 | minimal matching. | 
|---|
| 3363 | */ | 
|---|
| 3364 | bool QRegExp::operator==( const QRegExp& rx ) const | 
|---|
| 3365 | { | 
|---|
| 3366 | return priv->pattern == rx.priv->pattern && | 
|---|
| 3367 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3368 | priv->wc == rx.priv->wc && | 
|---|
| 3369 | #endif | 
|---|
| 3370 | priv->min == rx.priv->min && | 
|---|
| 3371 | priv->cs == rx.priv->cs; | 
|---|
| 3372 | } | 
|---|
| 3373 |  | 
|---|
| 3374 | /*! | 
|---|
| 3375 | \fn bool QRegExp::operator!=( const QRegExp& rx ) const | 
|---|
| 3376 |  | 
|---|
| 3377 | Returns TRUE if this regular expression is not equal to \a rx; | 
|---|
| 3378 | otherwise returns FALSE. | 
|---|
| 3379 |  | 
|---|
| 3380 | \sa operator==() | 
|---|
| 3381 | */ | 
|---|
| 3382 |  | 
|---|
| 3383 | /*! | 
|---|
| 3384 | Returns TRUE if the pattern string is empty; otherwise returns | 
|---|
| 3385 | FALSE. | 
|---|
| 3386 |  | 
|---|
| 3387 | If you call exactMatch() with an empty pattern on an empty string | 
|---|
| 3388 | it will return TRUE; otherwise it returns FALSE since it operates | 
|---|
| 3389 | over the whole string. If you call search() with an empty pattern | 
|---|
| 3390 | on \e any string it will return the start offset (0 by default) | 
|---|
| 3391 | because the empty pattern matches the 'emptiness' at the start of | 
|---|
| 3392 | the string. In this case the length of the match returned by | 
|---|
| 3393 | matchedLength() will be 0. | 
|---|
| 3394 |  | 
|---|
| 3395 | See QString::isEmpty(). | 
|---|
| 3396 | */ | 
|---|
| 3397 |  | 
|---|
| 3398 | bool QRegExp::isEmpty() const | 
|---|
| 3399 | { | 
|---|
| 3400 | return priv->pattern.isEmpty(); | 
|---|
| 3401 | } | 
|---|
| 3402 |  | 
|---|
| 3403 | /*! | 
|---|
| 3404 | Returns TRUE if the regular expression is valid; otherwise returns | 
|---|
| 3405 | FALSE. An invalid regular expression never matches. | 
|---|
| 3406 |  | 
|---|
| 3407 | The pattern <b>[a-z</b> is an example of an invalid pattern, since | 
|---|
| 3408 | it lacks a closing square bracket. | 
|---|
| 3409 |  | 
|---|
| 3410 | Note that the validity of a regexp may also depend on the setting | 
|---|
| 3411 | of the wildcard flag, for example <b>*.html</b> is a valid | 
|---|
| 3412 | wildcard regexp but an invalid full regexp. | 
|---|
| 3413 |  | 
|---|
| 3414 | \sa errorString() | 
|---|
| 3415 | */ | 
|---|
| 3416 | bool QRegExp::isValid() const | 
|---|
| 3417 | { | 
|---|
| 3418 | if ( priv->pattern.isEmpty() ) { | 
|---|
| 3419 | return TRUE; | 
|---|
| 3420 | } else { | 
|---|
| 3421 | prepareEngine(); | 
|---|
| 3422 | return eng->isValid(); | 
|---|
| 3423 | } | 
|---|
| 3424 | } | 
|---|
| 3425 |  | 
|---|
| 3426 | /*! | 
|---|
| 3427 | Returns the pattern string of the regular expression. The pattern | 
|---|
| 3428 | has either regular expression syntax or wildcard syntax, depending | 
|---|
| 3429 | on wildcard(). | 
|---|
| 3430 |  | 
|---|
| 3431 | \sa setPattern() | 
|---|
| 3432 | */ | 
|---|
| 3433 | QString QRegExp::pattern() const | 
|---|
| 3434 | { | 
|---|
| 3435 | return priv->pattern; | 
|---|
| 3436 | } | 
|---|
| 3437 |  | 
|---|
| 3438 | /*! | 
|---|
| 3439 | Sets the pattern string to \a pattern. The case sensitivity, | 
|---|
| 3440 | wildcard and minimal matching options are not changed. | 
|---|
| 3441 |  | 
|---|
| 3442 | \sa pattern() | 
|---|
| 3443 | */ | 
|---|
| 3444 | void QRegExp::setPattern( const QString& pattern ) | 
|---|
| 3445 | { | 
|---|
| 3446 | if ( priv->pattern != pattern ) { | 
|---|
| 3447 | priv->pattern = pattern; | 
|---|
| 3448 | invalidateEngine(); | 
|---|
| 3449 | } | 
|---|
| 3450 | } | 
|---|
| 3451 |  | 
|---|
| 3452 | /*! | 
|---|
| 3453 | Returns TRUE if case sensitivity is enabled; otherwise returns | 
|---|
| 3454 | FALSE. The default is TRUE. | 
|---|
| 3455 |  | 
|---|
| 3456 | \sa setCaseSensitive() | 
|---|
| 3457 | */ | 
|---|
| 3458 | bool QRegExp::caseSensitive() const | 
|---|
| 3459 | { | 
|---|
| 3460 | return priv->cs; | 
|---|
| 3461 | } | 
|---|
| 3462 |  | 
|---|
| 3463 | /*! | 
|---|
| 3464 | Sets case sensitive matching to \a sensitive. | 
|---|
| 3465 |  | 
|---|
| 3466 | If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but | 
|---|
| 3467 | not \c{README.TXT}. | 
|---|
| 3468 |  | 
|---|
| 3469 | \sa caseSensitive() | 
|---|
| 3470 | */ | 
|---|
| 3471 | void QRegExp::setCaseSensitive( bool sensitive ) | 
|---|
| 3472 | { | 
|---|
| 3473 | if ( sensitive != priv->cs ) { | 
|---|
| 3474 | priv->cs = sensitive; | 
|---|
| 3475 | invalidateEngine(); | 
|---|
| 3476 | } | 
|---|
| 3477 | } | 
|---|
| 3478 |  | 
|---|
| 3479 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3480 | /*! | 
|---|
| 3481 | Returns TRUE if wildcard mode is enabled; otherwise returns FALSE. | 
|---|
| 3482 | The default is FALSE. | 
|---|
| 3483 |  | 
|---|
| 3484 | \sa setWildcard() | 
|---|
| 3485 | */ | 
|---|
| 3486 | bool QRegExp::wildcard() const | 
|---|
| 3487 | { | 
|---|
| 3488 | return priv->wc; | 
|---|
| 3489 | } | 
|---|
| 3490 |  | 
|---|
| 3491 | /*! | 
|---|
| 3492 | Sets the wildcard mode for the regular expression. The default is | 
|---|
| 3493 | FALSE. | 
|---|
| 3494 |  | 
|---|
| 3495 | Setting \a wildcard to TRUE enables simple shell-like wildcard | 
|---|
| 3496 | matching. (See \link #wildcard-matching wildcard matching | 
|---|
| 3497 | (globbing) \endlink.) | 
|---|
| 3498 |  | 
|---|
| 3499 | For example, <b>r*.txt</b> matches the string \c{readme.txt} in | 
|---|
| 3500 | wildcard mode, but does not match \c{readme}. | 
|---|
| 3501 |  | 
|---|
| 3502 | \sa wildcard() | 
|---|
| 3503 | */ | 
|---|
| 3504 | void QRegExp::setWildcard( bool wildcard ) | 
|---|
| 3505 | { | 
|---|
| 3506 | if ( wildcard != priv->wc ) { | 
|---|
| 3507 | priv->wc = wildcard; | 
|---|
| 3508 | invalidateEngine(); | 
|---|
| 3509 | } | 
|---|
| 3510 | } | 
|---|
| 3511 | #endif | 
|---|
| 3512 |  | 
|---|
| 3513 | /*! | 
|---|
| 3514 | Returns TRUE if minimal (non-greedy) matching is enabled; | 
|---|
| 3515 | otherwise returns FALSE. | 
|---|
| 3516 |  | 
|---|
| 3517 | \sa setMinimal() | 
|---|
| 3518 | */ | 
|---|
| 3519 | bool QRegExp::minimal() const | 
|---|
| 3520 | { | 
|---|
| 3521 | return priv->min; | 
|---|
| 3522 | } | 
|---|
| 3523 |  | 
|---|
| 3524 | /*! | 
|---|
| 3525 | Enables or disables minimal matching. If \a minimal is FALSE, | 
|---|
| 3526 | matching is greedy (maximal) which is the default. | 
|---|
| 3527 |  | 
|---|
| 3528 | For example, suppose we have the input string "We must be | 
|---|
| 3529 | \<b>bold\</b>, very \<b>bold\</b>!" and the pattern | 
|---|
| 3530 | <b>\<b>.*\</b></b>. With the default greedy (maximal) matching, | 
|---|
| 3531 | the match is "We must be <u>\<b>bold\</b>, very | 
|---|
| 3532 | \<b>bold\</b></u>!". But with minimal (non-greedy) matching the | 
|---|
| 3533 | first match is: "We must be <u>\<b>bold\</b></u>, very | 
|---|
| 3534 | \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>, | 
|---|
| 3535 | very <u>\<b>bold\</b></u>!". In practice we might use the pattern | 
|---|
| 3536 | <b>\<b>[^\<]+\</b></b> instead, although this will still fail for | 
|---|
| 3537 | nested tags. | 
|---|
| 3538 |  | 
|---|
| 3539 | \sa minimal() | 
|---|
| 3540 | */ | 
|---|
| 3541 | void QRegExp::setMinimal( bool minimal ) | 
|---|
| 3542 | { | 
|---|
| 3543 | priv->min = minimal; | 
|---|
| 3544 | } | 
|---|
| 3545 |  | 
|---|
| 3546 | /*! | 
|---|
| 3547 | Returns TRUE if \a str is matched exactly by this regular | 
|---|
| 3548 | expression; otherwise returns FALSE. You can determine how much of | 
|---|
| 3549 | the string was matched by calling matchedLength(). | 
|---|
| 3550 |  | 
|---|
| 3551 | For a given regexp string, R, exactMatch("R") is the equivalent of | 
|---|
| 3552 | search("^R$") since exactMatch() effectively encloses the regexp | 
|---|
| 3553 | in the start of string and end of string anchors, except that it | 
|---|
| 3554 | sets matchedLength() differently. | 
|---|
| 3555 |  | 
|---|
| 3556 | For example, if the regular expression is <b>blue</b>, then | 
|---|
| 3557 | exactMatch() returns TRUE only for input \c blue. For inputs \c | 
|---|
| 3558 | bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE | 
|---|
| 3559 | and matchedLength() will return 4, 3 and 0 respectively. | 
|---|
| 3560 |  | 
|---|
| 3561 | Although const, this function sets matchedLength(), | 
|---|
| 3562 | capturedTexts() and pos(). | 
|---|
| 3563 |  | 
|---|
| 3564 | \sa search() searchRev() QRegExpValidator | 
|---|
| 3565 | */ | 
|---|
| 3566 | bool QRegExp::exactMatch( const QString& str ) const | 
|---|
| 3567 | { | 
|---|
| 3568 | prepareEngineForMatch( str ); | 
|---|
| 3569 | eng->match( str, 0, priv->min, TRUE, 0, priv->captured ); | 
|---|
| 3570 | if ( priv->captured[1] == (int) str.length() ) { | 
|---|
| 3571 | return TRUE; | 
|---|
| 3572 | } else { | 
|---|
| 3573 | priv->captured[0] = 0; | 
|---|
| 3574 | priv->captured[1] = eng->partialMatchLength(); | 
|---|
| 3575 | return FALSE; | 
|---|
| 3576 | } | 
|---|
| 3577 | } | 
|---|
| 3578 |  | 
|---|
| 3579 | #ifndef QT_NO_COMPAT | 
|---|
| 3580 | /*! \obsolete | 
|---|
| 3581 |  | 
|---|
| 3582 | Attempts to match in \a str, starting from position \a index. | 
|---|
| 3583 | Returns the position of the match, or -1 if there was no match. | 
|---|
| 3584 |  | 
|---|
| 3585 | The length of the match is stored in \a *len, unless \a len is a | 
|---|
| 3586 | null pointer. | 
|---|
| 3587 |  | 
|---|
| 3588 | If \a indexIsStart is TRUE (the default), the position \a index in | 
|---|
| 3589 | the string will match the start of string anchor, <b>^</b>, in the | 
|---|
| 3590 | regexp, if present. Otherwise, position 0 in \a str will match. | 
|---|
| 3591 |  | 
|---|
| 3592 | Use search() and matchedLength() instead of this function. | 
|---|
| 3593 |  | 
|---|
| 3594 | \sa QString::mid() QConstString | 
|---|
| 3595 | */ | 
|---|
| 3596 | int QRegExp::match( const QString& str, int index, int *len, | 
|---|
| 3597 | bool indexIsStart ) const | 
|---|
| 3598 | { | 
|---|
| 3599 | int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero ); | 
|---|
| 3600 | if ( len != 0 ) | 
|---|
| 3601 | *len = matchedLength(); | 
|---|
| 3602 | return pos; | 
|---|
| 3603 | } | 
|---|
| 3604 | #endif // QT_NO_COMPAT | 
|---|
| 3605 |  | 
|---|
| 3606 | int QRegExp::search( const QString& str, int offset ) const | 
|---|
| 3607 | { | 
|---|
| 3608 | return search( str, offset, CaretAtZero ); | 
|---|
| 3609 | } | 
|---|
| 3610 |  | 
|---|
| 3611 | /*! | 
|---|
| 3612 | Attempts to find a match in \a str from position \a offset (0 by | 
|---|
| 3613 | default). If \a offset is -1, the search starts at the last | 
|---|
| 3614 | character; if -2, at the next to last character; etc. | 
|---|
| 3615 |  | 
|---|
| 3616 | Returns the position of the first match, or -1 if there was no | 
|---|
| 3617 | match. | 
|---|
| 3618 |  | 
|---|
| 3619 | The \a caretMode parameter can be used to instruct whether <b>^</b> | 
|---|
| 3620 | should match at index 0 or at \a offset. | 
|---|
| 3621 |  | 
|---|
| 3622 | You might prefer to use QString::find(), QString::contains() or | 
|---|
| 3623 | even QStringList::grep(). To replace matches use | 
|---|
| 3624 | QString::replace(). | 
|---|
| 3625 |  | 
|---|
| 3626 | Example: | 
|---|
| 3627 | \code | 
|---|
| 3628 | QString str = "offsets: 1.23 .50 71.00 6.00"; | 
|---|
| 3629 | QRegExp rx( "\\d*\\.\\d+" );    // primitive floating point matching | 
|---|
| 3630 | int count = 0; | 
|---|
| 3631 | int pos = 0; | 
|---|
| 3632 | while ( (pos = rx.search(str, pos)) != -1 ) { | 
|---|
| 3633 | count++; | 
|---|
| 3634 | pos += rx.matchedLength(); | 
|---|
| 3635 | } | 
|---|
| 3636 | // pos will be 9, 14, 18 and finally 24; count will end up as 4 | 
|---|
| 3637 | \endcode | 
|---|
| 3638 |  | 
|---|
| 3639 | Although const, this function sets matchedLength(), | 
|---|
| 3640 | capturedTexts() and pos(). | 
|---|
| 3641 |  | 
|---|
| 3642 | \sa searchRev() exactMatch() | 
|---|
| 3643 | */ | 
|---|
| 3644 |  | 
|---|
| 3645 | int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const | 
|---|
| 3646 | { | 
|---|
| 3647 | prepareEngineForMatch( str ); | 
|---|
| 3648 | if ( offset < 0 ) | 
|---|
| 3649 | offset += str.length(); | 
|---|
| 3650 | eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode), | 
|---|
| 3651 | priv->captured ); | 
|---|
| 3652 | return priv->captured[0]; | 
|---|
| 3653 | } | 
|---|
| 3654 |  | 
|---|
| 3655 |  | 
|---|
| 3656 | int QRegExp::searchRev( const QString& str, int offset ) const | 
|---|
| 3657 | { | 
|---|
| 3658 | return searchRev( str, offset, CaretAtZero ); | 
|---|
| 3659 | } | 
|---|
| 3660 |  | 
|---|
| 3661 | /*! | 
|---|
| 3662 | Attempts to find a match backwards in \a str from position \a | 
|---|
| 3663 | offset. If \a offset is -1 (the default), the search starts at the | 
|---|
| 3664 | last character; if -2, at the next to last character; etc. | 
|---|
| 3665 |  | 
|---|
| 3666 | Returns the position of the first match, or -1 if there was no | 
|---|
| 3667 | match. | 
|---|
| 3668 |  | 
|---|
| 3669 | The \a caretMode parameter can be used to instruct whether <b>^</b> | 
|---|
| 3670 | should match at index 0 or at \a offset. | 
|---|
| 3671 |  | 
|---|
| 3672 | Although const, this function sets matchedLength(), | 
|---|
| 3673 | capturedTexts() and pos(). | 
|---|
| 3674 |  | 
|---|
| 3675 | \warning Searching backwards is much slower than searching | 
|---|
| 3676 | forwards. | 
|---|
| 3677 |  | 
|---|
| 3678 | \sa search() exactMatch() | 
|---|
| 3679 | */ | 
|---|
| 3680 |  | 
|---|
| 3681 | int QRegExp::searchRev( const QString& str, int offset, | 
|---|
| 3682 | CaretMode caretMode ) const | 
|---|
| 3683 | { | 
|---|
| 3684 | prepareEngineForMatch( str ); | 
|---|
| 3685 | if ( offset < 0 ) | 
|---|
| 3686 | offset += str.length(); | 
|---|
| 3687 | if ( offset < 0 || offset > (int) str.length() ) { | 
|---|
| 3688 | priv->captured.detach(); | 
|---|
| 3689 | priv->captured.fill( -1 ); | 
|---|
| 3690 | return -1; | 
|---|
| 3691 | } | 
|---|
| 3692 |  | 
|---|
| 3693 | while ( offset >= 0 ) { | 
|---|
| 3694 | eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode), | 
|---|
| 3695 | priv->captured ); | 
|---|
| 3696 | if ( priv->captured[0] == offset ) | 
|---|
| 3697 | return offset; | 
|---|
| 3698 | offset--; | 
|---|
| 3699 | } | 
|---|
| 3700 | return -1; | 
|---|
| 3701 | } | 
|---|
| 3702 |  | 
|---|
| 3703 | /*! | 
|---|
| 3704 | Returns the length of the last matched string, or -1 if there was | 
|---|
| 3705 | no match. | 
|---|
| 3706 |  | 
|---|
| 3707 | \sa exactMatch() search() searchRev() | 
|---|
| 3708 | */ | 
|---|
| 3709 | int QRegExp::matchedLength() const | 
|---|
| 3710 | { | 
|---|
| 3711 | return priv->captured[1]; | 
|---|
| 3712 | } | 
|---|
| 3713 |  | 
|---|
| 3714 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 3715 | /*! | 
|---|
| 3716 | Returns the number of captures contained in the regular expression. | 
|---|
| 3717 | */ | 
|---|
| 3718 | int QRegExp::numCaptures() const | 
|---|
| 3719 | { | 
|---|
| 3720 | prepareEngine(); | 
|---|
| 3721 | return eng->numCaptures(); | 
|---|
| 3722 | } | 
|---|
| 3723 |  | 
|---|
| 3724 | /*! | 
|---|
| 3725 | Returns a list of the captured text strings. | 
|---|
| 3726 |  | 
|---|
| 3727 | The first string in the list is the entire matched string. Each | 
|---|
| 3728 | subsequent list element contains a string that matched a | 
|---|
| 3729 | (capturing) subexpression of the regexp. | 
|---|
| 3730 |  | 
|---|
| 3731 | For example: | 
|---|
| 3732 | \code | 
|---|
| 3733 | QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" ); | 
|---|
| 3734 | int pos = rx.search( "Length: 36 inches" ); | 
|---|
| 3735 | QStringList list = rx.capturedTexts(); | 
|---|
| 3736 | // list is now ( "36 inches", "36", " ", "inches", "es" ) | 
|---|
| 3737 | \endcode | 
|---|
| 3738 |  | 
|---|
| 3739 | The above example also captures elements that may be present but | 
|---|
| 3740 | which we have no interest in. This problem can be solved by using | 
|---|
| 3741 | non-capturing parentheses: | 
|---|
| 3742 |  | 
|---|
| 3743 | \code | 
|---|
| 3744 | QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" ); | 
|---|
| 3745 | int pos = rx.search( "Length: 36 inches" ); | 
|---|
| 3746 | QStringList list = rx.capturedTexts(); | 
|---|
| 3747 | // list is now ( "36 inches", "36", "inches" ) | 
|---|
| 3748 | \endcode | 
|---|
| 3749 |  | 
|---|
| 3750 | Note that if you want to iterate over the list, you should iterate | 
|---|
| 3751 | over a copy, e.g. | 
|---|
| 3752 | \code | 
|---|
| 3753 | QStringList list = rx.capturedTexts(); | 
|---|
| 3754 | QStringList::Iterator it = list.begin(); | 
|---|
| 3755 | while( it != list.end() ) { | 
|---|
| 3756 | myProcessing( *it ); | 
|---|
| 3757 | ++it; | 
|---|
| 3758 | } | 
|---|
| 3759 | \endcode | 
|---|
| 3760 |  | 
|---|
| 3761 | Some regexps can match an indeterminate number of times. For | 
|---|
| 3762 | example if the input string is "Offsets: 12 14 99 231 7" and the | 
|---|
| 3763 | regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of | 
|---|
| 3764 | all the numbers matched. However, after calling | 
|---|
| 3765 | \c{rx.search(str)}, capturedTexts() will return the list ( "12", | 
|---|
| 3766 | "12" ), i.e. the entire match was "12" and the first subexpression | 
|---|
| 3767 | matched was "12". The correct approach is to use cap() in a \link | 
|---|
| 3768 | #cap_in_a_loop loop \endlink. | 
|---|
| 3769 |  | 
|---|
| 3770 | The order of elements in the string list is as follows. The first | 
|---|
| 3771 | element is the entire matching string. Each subsequent element | 
|---|
| 3772 | corresponds to the next capturing open left parentheses. Thus | 
|---|
| 3773 | capturedTexts()[1] is the text of the first capturing parentheses, | 
|---|
| 3774 | capturedTexts()[2] is the text of the second and so on | 
|---|
| 3775 | (corresponding to $1, $2, etc., in some other regexp languages). | 
|---|
| 3776 |  | 
|---|
| 3777 | \sa cap() pos() exactMatch() search() searchRev() | 
|---|
| 3778 | */ | 
|---|
| 3779 | QStringList QRegExp::capturedTexts() | 
|---|
| 3780 | { | 
|---|
| 3781 | if ( priv->capturedCache.isEmpty() ) { | 
|---|
| 3782 | for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) { | 
|---|
| 3783 | QString m; | 
|---|
| 3784 | if ( priv->captured[i + 1] == 0 ) | 
|---|
| 3785 | m = QString::fromLatin1( "" ); | 
|---|
| 3786 | else if ( priv->captured[i] >= 0 ) | 
|---|
| 3787 | m = priv->t.mid( priv->captured[i], | 
|---|
| 3788 | priv->captured[i + 1] ); | 
|---|
| 3789 | priv->capturedCache.append( m ); | 
|---|
| 3790 | } | 
|---|
| 3791 | priv->t = QString::null; | 
|---|
| 3792 | } | 
|---|
| 3793 | return priv->capturedCache; | 
|---|
| 3794 | } | 
|---|
| 3795 |  | 
|---|
| 3796 | /*! | 
|---|
| 3797 | Returns the text captured by the \a nth subexpression. The entire | 
|---|
| 3798 | match has index 0 and the parenthesized subexpressions have | 
|---|
| 3799 | indices starting from 1 (excluding non-capturing parentheses). | 
|---|
| 3800 |  | 
|---|
| 3801 | \code | 
|---|
| 3802 | QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" ); | 
|---|
| 3803 | int pos = rxlen.search( "Length: 189cm" ); | 
|---|
| 3804 | if ( pos > -1 ) { | 
|---|
| 3805 | QString value = rxlen.cap( 1 ); // "189" | 
|---|
| 3806 | QString unit = rxlen.cap( 2 );  // "cm" | 
|---|
| 3807 | // ... | 
|---|
| 3808 | } | 
|---|
| 3809 | \endcode | 
|---|
| 3810 |  | 
|---|
| 3811 | The order of elements matched by cap() is as follows. The first | 
|---|
| 3812 | element, cap(0), is the entire matching string. Each subsequent | 
|---|
| 3813 | element corresponds to the next capturing open left parentheses. | 
|---|
| 3814 | Thus cap(1) is the text of the first capturing parentheses, cap(2) | 
|---|
| 3815 | is the text of the second, and so on. | 
|---|
| 3816 |  | 
|---|
| 3817 | \target cap_in_a_loop | 
|---|
| 3818 | Some patterns may lead to a number of matches which cannot be | 
|---|
| 3819 | determined in advance, for example: | 
|---|
| 3820 |  | 
|---|
| 3821 | \code | 
|---|
| 3822 | QRegExp rx( "(\\d+)" ); | 
|---|
| 3823 | str = "Offsets: 12 14 99 231 7"; | 
|---|
| 3824 | QStringList list; | 
|---|
| 3825 | pos = 0; | 
|---|
| 3826 | while ( pos >= 0 ) { | 
|---|
| 3827 | pos = rx.search( str, pos ); | 
|---|
| 3828 | if ( pos > -1 ) { | 
|---|
| 3829 | list += rx.cap( 1 ); | 
|---|
| 3830 | pos  += rx.matchedLength(); | 
|---|
| 3831 | } | 
|---|
| 3832 | } | 
|---|
| 3833 | // list contains "12", "14", "99", "231", "7" | 
|---|
| 3834 | \endcode | 
|---|
| 3835 |  | 
|---|
| 3836 | \sa capturedTexts() pos() exactMatch() search() searchRev() | 
|---|
| 3837 | */ | 
|---|
| 3838 | QString QRegExp::cap( int nth ) | 
|---|
| 3839 | { | 
|---|
| 3840 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) { | 
|---|
| 3841 | return QString::null; | 
|---|
| 3842 | } else { | 
|---|
| 3843 | return capturedTexts()[nth]; | 
|---|
| 3844 | } | 
|---|
| 3845 | } | 
|---|
| 3846 |  | 
|---|
| 3847 | /*! | 
|---|
| 3848 | Returns the position of the \a nth captured text in the searched | 
|---|
| 3849 | string. If \a nth is 0 (the default), pos() returns the position | 
|---|
| 3850 | of the whole match. | 
|---|
| 3851 |  | 
|---|
| 3852 | Example: | 
|---|
| 3853 | \code | 
|---|
| 3854 | QRegExp rx( "/([a-z]+)/([a-z]+)" ); | 
|---|
| 3855 | rx.search( "Output /dev/null" );    // returns 7 (position of /dev/null) | 
|---|
| 3856 | rx.pos( 0 );                        // returns 7 (position of /dev/null) | 
|---|
| 3857 | rx.pos( 1 );                        // returns 8 (position of dev) | 
|---|
| 3858 | rx.pos( 2 );                        // returns 12 (position of null) | 
|---|
| 3859 | \endcode | 
|---|
| 3860 |  | 
|---|
| 3861 | For zero-length matches, pos() always returns -1. (For example, if | 
|---|
| 3862 | cap(4) would return an empty string, pos(4) returns -1.) This is | 
|---|
| 3863 | due to an implementation tradeoff. | 
|---|
| 3864 |  | 
|---|
| 3865 | \sa capturedTexts() exactMatch() search() searchRev() | 
|---|
| 3866 | */ | 
|---|
| 3867 | int QRegExp::pos( int nth ) | 
|---|
| 3868 | { | 
|---|
| 3869 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) | 
|---|
| 3870 | return -1; | 
|---|
| 3871 | else | 
|---|
| 3872 | return priv->captured[2 * nth]; | 
|---|
| 3873 | } | 
|---|
| 3874 |  | 
|---|
| 3875 | /*! | 
|---|
| 3876 | Returns a text string that explains why a regexp pattern is | 
|---|
| 3877 | invalid the case being; otherwise returns "no error occurred". | 
|---|
| 3878 |  | 
|---|
| 3879 | \sa isValid() | 
|---|
| 3880 | */ | 
|---|
| 3881 | QString QRegExp::errorString() | 
|---|
| 3882 | { | 
|---|
| 3883 | if ( isValid() ) { | 
|---|
| 3884 | return QString( RXERR_OK ); | 
|---|
| 3885 | } else { | 
|---|
| 3886 | return eng->errorString(); | 
|---|
| 3887 | } | 
|---|
| 3888 | } | 
|---|
| 3889 | #endif | 
|---|
| 3890 |  | 
|---|
| 3891 | /*! | 
|---|
| 3892 | Returns the string \a str with every regexp special character | 
|---|
| 3893 | escaped with a backslash. The special characters are $, (, ), *, +, | 
|---|
| 3894 | ., ?, [, \, ], ^, {, | and }. | 
|---|
| 3895 |  | 
|---|
| 3896 | Example: | 
|---|
| 3897 | \code | 
|---|
| 3898 | s1 = QRegExp::escape( "bingo" );   // s1 == "bingo" | 
|---|
| 3899 | s2 = QRegExp::escape( "f(x)" );    // s2 == "f\\(x\\)" | 
|---|
| 3900 | \endcode | 
|---|
| 3901 |  | 
|---|
| 3902 | This function is useful to construct regexp patterns dynamically: | 
|---|
| 3903 |  | 
|---|
| 3904 | \code | 
|---|
| 3905 | QRegExp rx( "(" + QRegExp::escape(name) + | 
|---|
| 3906 | "|" + QRegExp::escape(alias) + ")" ); | 
|---|
| 3907 | \endcode | 
|---|
| 3908 | */ | 
|---|
| 3909 | QString QRegExp::escape( const QString& str ) | 
|---|
| 3910 | { | 
|---|
| 3911 | static const char meta[] = "$()*+.?[\\]^{|}"; | 
|---|
| 3912 | QString quoted = str; | 
|---|
| 3913 | int i = 0; | 
|---|
| 3914 |  | 
|---|
| 3915 | while ( i < (int) quoted.length() ) { | 
|---|
| 3916 | if ( strchr(meta, quoted[i].latin1()) != 0 ) | 
|---|
| 3917 | quoted.insert( i++, "\\" ); | 
|---|
| 3918 | i++; | 
|---|
| 3919 | } | 
|---|
| 3920 | return quoted; | 
|---|
| 3921 | } | 
|---|
| 3922 |  | 
|---|
| 3923 | void QRegExp::prepareEngine() const | 
|---|
| 3924 | { | 
|---|
| 3925 | if ( eng == 0 ) { | 
|---|
| 3926 | #ifndef QT_NO_REGEXP_WILDCARD | 
|---|
| 3927 | if ( priv->wc ) | 
|---|
| 3928 | priv->rxpattern = wc2rx( priv->pattern ); | 
|---|
| 3929 | else | 
|---|
| 3930 | #endif | 
|---|
| 3931 | priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" ) | 
|---|
| 3932 | : priv->pattern; | 
|---|
| 3933 | QRegExp *that = (QRegExp *) this; | 
|---|
| 3934 | // that->eng = newEngine( priv->rxpattern, priv->cs ); | 
|---|
| 3935 | regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE ); | 
|---|
| 3936 | priv->captured.detach(); | 
|---|
| 3937 | priv->captured.fill( -1, 2 + 2 * eng->numCaptures() ); | 
|---|
| 3938 | } | 
|---|
| 3939 | } | 
|---|
| 3940 |  | 
|---|
| 3941 | void QRegExp::prepareEngineForMatch( const QString& str ) const | 
|---|
| 3942 | { | 
|---|
| 3943 | prepareEngine(); | 
|---|
| 3944 | #ifndef QT_NO_REGEXP_CAPTURE | 
|---|
| 3945 | priv->t = str; | 
|---|
| 3946 | priv->capturedCache.clear(); | 
|---|
| 3947 | #else | 
|---|
| 3948 | Q_UNUSED( str ); | 
|---|
| 3949 | #endif | 
|---|
| 3950 | } | 
|---|
| 3951 |  | 
|---|
| 3952 | void QRegExp::invalidateEngine() | 
|---|
| 3953 | { | 
|---|
| 3954 | if ( eng != 0 ) { | 
|---|
| 3955 | regexpEngine( eng, priv->rxpattern, priv->cs, TRUE ); | 
|---|
| 3956 | priv->rxpattern = QString(); | 
|---|
| 3957 | eng = 0; | 
|---|
| 3958 | } | 
|---|
| 3959 | } | 
|---|
| 3960 |  | 
|---|
| 3961 | int QRegExp::caretIndex( int offset, CaretMode caretMode ) | 
|---|
| 3962 | { | 
|---|
| 3963 | if ( caretMode == CaretAtZero ) { | 
|---|
| 3964 | return 0; | 
|---|
| 3965 | } else if ( caretMode == CaretAtOffset ) { | 
|---|
| 3966 | return offset; | 
|---|
| 3967 | } else { // CaretWontMatch | 
|---|
| 3968 | return -1; | 
|---|
| 3969 | } | 
|---|
| 3970 | } | 
|---|
| 3971 |  | 
|---|
| 3972 | #endif // QT_NO_REGEXP | 
|---|