| 1 | /****************************************************************************
|
|---|
| 2 | ** $Id: qregexp.cpp 2 2005-11-16 15:49:26Z dmik $
|
|---|
| 3 | **
|
|---|
| 4 | ** Implementation of QRegExp class
|
|---|
| 5 | **
|
|---|
| 6 | ** Created : 950126
|
|---|
| 7 | **
|
|---|
| 8 | ** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
|
|---|
| 9 | **
|
|---|
| 10 | ** This file is part of the tools module of the Qt GUI Toolkit.
|
|---|
| 11 | **
|
|---|
| 12 | ** This file may be distributed under the terms of the Q Public License
|
|---|
| 13 | ** as defined by Trolltech AS of Norway and appearing in the file
|
|---|
| 14 | ** LICENSE.QPL included in the packaging of this file.
|
|---|
| 15 | **
|
|---|
| 16 | ** This file may be distributed and/or modified under the terms of the
|
|---|
| 17 | ** GNU General Public License version 2 as published by the Free Software
|
|---|
| 18 | ** Foundation and appearing in the file LICENSE.GPL included in the
|
|---|
| 19 | ** packaging of this file.
|
|---|
| 20 | **
|
|---|
| 21 | ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
|
|---|
| 22 | ** licenses may use this file in accordance with the Qt Commercial License
|
|---|
| 23 | ** Agreement provided with the Software.
|
|---|
| 24 | **
|
|---|
| 25 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
|
|---|
| 26 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
|---|
| 27 | **
|
|---|
| 28 | ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
|
|---|
| 29 | ** information about Qt Commercial License Agreements.
|
|---|
| 30 | ** See http://www.trolltech.com/qpl/ for QPL licensing information.
|
|---|
| 31 | ** See http://www.trolltech.com/gpl/ for GPL licensing information.
|
|---|
| 32 | **
|
|---|
| 33 | ** Contact info@trolltech.com if any conditions of this licensing are
|
|---|
| 34 | ** not clear to you.
|
|---|
| 35 | **
|
|---|
| 36 | **********************************************************************/
|
|---|
| 37 |
|
|---|
| 38 | #include "qregexp.h"
|
|---|
| 39 |
|
|---|
| 40 | #ifndef QT_NO_REGEXP
|
|---|
| 41 |
|
|---|
| 42 | #include "qmemarray.h"
|
|---|
| 43 | #include "qbitarray.h"
|
|---|
| 44 | #include "qcache.h"
|
|---|
| 45 | #include "qcleanuphandler.h"
|
|---|
| 46 | #include "qintdict.h"
|
|---|
| 47 | #include "qmap.h"
|
|---|
| 48 | #include "qptrvector.h"
|
|---|
| 49 | #include "qstring.h"
|
|---|
| 50 | #include "qtl.h"
|
|---|
| 51 |
|
|---|
| 52 | #ifdef QT_THREAD_SUPPORT
|
|---|
| 53 | #include "qthreadstorage.h"
|
|---|
| 54 | #endif // QT_THREAD_SUPPORT
|
|---|
| 55 |
|
|---|
| 56 | #undef QT_TRANSLATE_NOOP
|
|---|
| 57 | #define QT_TRANSLATE_NOOP( context, sourceText ) sourceText
|
|---|
| 58 |
|
|---|
| 59 | #include <limits.h>
|
|---|
| 60 |
|
|---|
| 61 | // error strings for the regexp parser
|
|---|
| 62 | #define RXERR_OK QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" )
|
|---|
| 63 | #define RXERR_DISABLED QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" )
|
|---|
| 64 | #define RXERR_CHARCLASS QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" )
|
|---|
| 65 | #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" )
|
|---|
| 66 | #define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" )
|
|---|
| 67 | #define RXERR_OCTAL QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" )
|
|---|
| 68 | #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" )
|
|---|
| 69 | #define RXERR_END QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" )
|
|---|
| 70 | #define RXERR_LIMIT QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" )
|
|---|
| 71 |
|
|---|
| 72 | /*
|
|---|
| 73 | WARNING! Be sure to read qregexp.tex before modifying this file.
|
|---|
| 74 | */
|
|---|
| 75 |
|
|---|
| 76 | /*!
|
|---|
| 77 | \class QRegExp qregexp.h
|
|---|
| 78 | \reentrant
|
|---|
| 79 | \brief The QRegExp class provides pattern matching using regular expressions.
|
|---|
| 80 |
|
|---|
| 81 | \ingroup tools
|
|---|
| 82 | \ingroup misc
|
|---|
| 83 | \ingroup shared
|
|---|
| 84 | \mainclass
|
|---|
| 85 | \keyword regular expression
|
|---|
| 86 |
|
|---|
| 87 | Regular expressions, or "regexps", provide a way to find patterns
|
|---|
| 88 | within text. This is useful in many contexts, for example:
|
|---|
| 89 |
|
|---|
| 90 | \table
|
|---|
| 91 | \row \i Validation
|
|---|
| 92 | \i A regexp can be used to check whether a piece of text
|
|---|
| 93 | meets some criteria, e.g. is an integer or contains no
|
|---|
| 94 | whitespace.
|
|---|
| 95 | \row \i Searching
|
|---|
| 96 | \i Regexps provide a much more powerful means of searching
|
|---|
| 97 | text than simple string matching does. For example we can
|
|---|
| 98 | create a regexp which says "find one of the words 'mail',
|
|---|
| 99 | 'letter' or 'correspondence' but not any of the words
|
|---|
| 100 | 'email', 'mailman', 'mailer', 'letterbox' etc."
|
|---|
| 101 | \row \i Search and Replace
|
|---|
| 102 | \i A regexp can be used to replace a pattern with a piece of
|
|---|
| 103 | text, for example replace all occurrences of '&' with
|
|---|
| 104 | '\&amp;' except where the '&' is already followed by 'amp;'.
|
|---|
| 105 | \row \i String Splitting
|
|---|
| 106 | \i A regexp can be used to identify where a string should be
|
|---|
| 107 | split into its component fields, e.g. splitting tab-delimited
|
|---|
| 108 | strings.
|
|---|
| 109 | \endtable
|
|---|
| 110 |
|
|---|
| 111 | We present a very brief introduction to regexps, a description of
|
|---|
| 112 | Qt's regexp language, some code examples, and finally the function
|
|---|
| 113 | documentation itself. QRegExp is modeled on Perl's regexp
|
|---|
| 114 | language, and also fully supports Unicode. QRegExp can also be
|
|---|
| 115 | used in the weaker 'wildcard' (globbing) mode which works in a
|
|---|
| 116 | similar way to command shells. A good text on regexps is \e
|
|---|
| 117 | {Mastering Regular Expressions: Powerful Techniques for Perl and
|
|---|
| 118 | Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
|
|---|
| 119 |
|
|---|
| 120 | Experienced regexp users may prefer to skip the introduction and
|
|---|
| 121 | go directly to the relevant information.
|
|---|
| 122 |
|
|---|
| 123 | \tableofcontents
|
|---|
| 124 |
|
|---|
| 125 | \section1 Introduction
|
|---|
| 126 |
|
|---|
| 127 | Regexps are built up from expressions, quantifiers, and assertions.
|
|---|
| 128 | The simplest form of expression is simply a character, e.g.
|
|---|
| 129 | <b>x</b> or <b>5</b>. An expression can also be a set of
|
|---|
| 130 | characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
|
|---|
| 131 | a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
|
|---|
| 132 | write this as <b>[A-D]</b>. If we want to match any of the
|
|---|
| 133 | capital letters in the English alphabet we can write
|
|---|
| 134 | <b>[A-Z]</b>. A quantifier tells the regexp engine how many
|
|---|
| 135 | occurrences of the expression we want, e.g. <b>x{1,1}</b> means
|
|---|
| 136 | match an <b>x</b> which occurs at least once and at most once.
|
|---|
| 137 | We'll look at assertions and more complex expressions later.
|
|---|
| 138 |
|
|---|
| 139 | Note that in general regexps cannot be used to check for balanced
|
|---|
| 140 | brackets or tags. For example if you want to match an opening html
|
|---|
| 141 | \c <b> and its closing \c </b> you can only use a regexp if you
|
|---|
| 142 | know that these tags are not nested; the html fragment, \c{<b>bold
|
|---|
| 143 | <b>bolder</b></b>} will not match as expected. If you know the
|
|---|
| 144 | maximum level of nesting it is possible to create a regexp that
|
|---|
| 145 | will match correctly, but for an unknown level of nesting, regexps
|
|---|
| 146 | will fail.
|
|---|
| 147 |
|
|---|
| 148 | We'll start by writing a regexp to match integers in the range 0
|
|---|
| 149 | to 99. We will require at least one digit so we will start with
|
|---|
| 150 | <b>[0-9]{1,1}</b> which means match a digit exactly once. This
|
|---|
| 151 | regexp alone will match integers in the range 0 to 9. To match one
|
|---|
| 152 | or two digits we can increase the maximum number of occurrences so
|
|---|
| 153 | the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
|
|---|
| 154 | least once and at most twice. However, this regexp as it stands
|
|---|
| 155 | will not match correctly. This regexp will match one or two digits
|
|---|
| 156 | \e within a string. To ensure that we match against the whole
|
|---|
| 157 | string we must use the anchor assertions. We need <b>^</b> (caret)
|
|---|
| 158 | which when it is the first character in the regexp means that the
|
|---|
| 159 | regexp must match from the beginning of the string. And we also
|
|---|
| 160 | need <b>$</b> (dollar) which when it is the last character in the
|
|---|
| 161 | regexp means that the regexp must match until the end of the
|
|---|
| 162 | string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
|
|---|
| 163 | assertions, such as <b>^</b> and <b>$</b>, do not match any
|
|---|
| 164 | characters.
|
|---|
| 165 |
|
|---|
| 166 | If you've seen regexps elsewhere they may have looked different from
|
|---|
| 167 | the ones above. This is because some sets of characters and some
|
|---|
| 168 | quantifiers are so common that they have special symbols to
|
|---|
| 169 | represent them. <b>[0-9]</b> can be replaced with the symbol
|
|---|
| 170 | <b>\d</b>. The quantifier to match exactly one occurrence,
|
|---|
| 171 | <b>{1,1}</b>, can be replaced with the expression itself. This means
|
|---|
| 172 | that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
|
|---|
| 173 | to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
|
|---|
| 174 | writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
|
|---|
| 175 | string match a digit followed by zero or one digits. In practice
|
|---|
| 176 | most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
|
|---|
| 177 | shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
|
|---|
| 178 | occurrences and a maximum of one occurrence. This is used to make an
|
|---|
| 179 | expression optional. The regexp <b>^\d\d?$</b> means "from the
|
|---|
| 180 | beginning of the string match one digit followed by zero or one
|
|---|
| 181 | digits and then the end of the string".
|
|---|
| 182 |
|
|---|
| 183 | Our second example is matching the words 'mail', 'letter' or
|
|---|
| 184 | 'correspondence' but without matching 'email', 'mailman',
|
|---|
| 185 | 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
|
|---|
| 186 | full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
|
|---|
| 187 | each expression itself is automatically quantified by <b>{1,1}</b>
|
|---|
| 188 | we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
|
|---|
| 189 | followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
|
|---|
| 190 | used for \e alternation, so our regexp now becomes
|
|---|
| 191 | <b>mail|letter|correspondence</b> which means match 'mail' \e or
|
|---|
| 192 | 'letter' \e or 'correspondence'. Whilst this regexp will find the
|
|---|
| 193 | words we want it will also find words we don't want such as
|
|---|
| 194 | 'email'. We will start by putting our regexp in parentheses,
|
|---|
| 195 | <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
|
|---|
| 196 | firstly they group expressions together and secondly they identify
|
|---|
| 197 | parts of the regexp that we wish to \link #capturing-text capture
|
|---|
| 198 | \endlink. Our regexp still matches any of the three words but now
|
|---|
| 199 | they are grouped together as a unit. This is useful for building
|
|---|
| 200 | up more complex regexps. It is also useful because it allows us to
|
|---|
| 201 | examine which of the words actually matched. We need to use
|
|---|
| 202 | another assertion, this time <b>\b</b> "word boundary":
|
|---|
| 203 | <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
|
|---|
| 204 | a word boundary followed by the expression in parentheses followed
|
|---|
| 205 | by another word boundary". The <b>\b</b> assertion matches at a \e
|
|---|
| 206 | position in the regexp not a \e character in the regexp. A word
|
|---|
| 207 | boundary is any non-word character such as a space, a newline, or
|
|---|
| 208 | the beginning or end of the string.
|
|---|
| 209 |
|
|---|
| 210 | For our third example we want to replace ampersands with the HTML
|
|---|
| 211 | entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
|
|---|
| 212 | match one ampersand. Unfortunately this will mess up our text if
|
|---|
| 213 | some of the ampersands have already been turned into HTML
|
|---|
| 214 | entities. So what we really want to say is replace an ampersand
|
|---|
| 215 | providing it is not followed by 'amp;'. For this we need the
|
|---|
| 216 | negative lookahead assertion and our regexp becomes:
|
|---|
| 217 | <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
|
|---|
| 218 | with '(?!' and finishes at the ')'. It means that the text it
|
|---|
| 219 | contains, 'amp;' in our example, must \e not follow the expression
|
|---|
| 220 | that precedes it.
|
|---|
| 221 |
|
|---|
| 222 | Regexps provide a rich language that can be used in a variety of
|
|---|
| 223 | ways. For example suppose we want to count all the occurrences of
|
|---|
| 224 | 'Eric' and 'Eirik' in a string. Two valid regexps to match these
|
|---|
| 225 | are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
|
|---|
| 226 | the word boundary '\b' so we don't get 'Ericsson' etc. The second
|
|---|
| 227 | regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
|
|---|
| 228 | and 'Eirik'.
|
|---|
| 229 |
|
|---|
| 230 | We will implement some of the examples above in the
|
|---|
| 231 | \link #code-examples code examples \endlink section.
|
|---|
| 232 |
|
|---|
| 233 | \target characters-and-abbreviations-for-sets-of-characters
|
|---|
| 234 | \section1 Characters and Abbreviations for Sets of Characters
|
|---|
| 235 |
|
|---|
| 236 | \table
|
|---|
| 237 | \header \i Element \i Meaning
|
|---|
| 238 | \row \i <b>c</b>
|
|---|
| 239 | \i Any character represents itself unless it has a special
|
|---|
| 240 | regexp meaning. Thus <b>c</b> matches the character \e c.
|
|---|
| 241 | \row \i <b>\\c</b>
|
|---|
| 242 | \i A character that follows a backslash matches the character
|
|---|
| 243 | itself except where mentioned below. For example if you
|
|---|
| 244 | wished to match a literal caret at the beginning of a string
|
|---|
| 245 | you would write <b>\^</b>.
|
|---|
| 246 | \row \i <b>\\a</b>
|
|---|
| 247 | \i This matches the ASCII bell character (BEL, 0x07).
|
|---|
| 248 | \row \i <b>\\f</b>
|
|---|
| 249 | \i This matches the ASCII form feed character (FF, 0x0C).
|
|---|
| 250 | \row \i <b>\\n</b>
|
|---|
| 251 | \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
|
|---|
| 252 | \row \i <b>\\r</b>
|
|---|
| 253 | \i This matches the ASCII carriage return character (CR, 0x0D).
|
|---|
| 254 | \row \i <b>\\t</b>
|
|---|
| 255 | \i This matches the ASCII horizontal tab character (HT, 0x09).
|
|---|
| 256 | \row \i <b>\\v</b>
|
|---|
| 257 | \i This matches the ASCII vertical tab character (VT, 0x0B).
|
|---|
| 258 | \row \i <b>\\xhhhh</b>
|
|---|
| 259 | \i This matches the Unicode character corresponding to the
|
|---|
| 260 | hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
|
|---|
| 261 | (i.e., \zero ooo) matches the ASCII/Latin-1 character
|
|---|
| 262 | corresponding to the octal number ooo (between 0 and 0377).
|
|---|
| 263 | \row \i <b>. (dot)</b>
|
|---|
| 264 | \i This matches any character (including newline).
|
|---|
| 265 | \row \i <b>\\d</b>
|
|---|
| 266 | \i This matches a digit (QChar::isDigit()).
|
|---|
| 267 | \row \i <b>\\D</b>
|
|---|
| 268 | \i This matches a non-digit.
|
|---|
| 269 | \row \i <b>\\s</b>
|
|---|
| 270 | \i This matches a whitespace (QChar::isSpace()).
|
|---|
| 271 | \row \i <b>\\S</b>
|
|---|
| 272 | \i This matches a non-whitespace.
|
|---|
| 273 | \row \i <b>\\w</b>
|
|---|
| 274 | \i This matches a word character (QChar::isLetterOrNumber() or '_').
|
|---|
| 275 | \row \i <b>\\W</b>
|
|---|
| 276 | \i This matches a non-word character.
|
|---|
| 277 | \row \i <b>\\n</b>
|
|---|
| 278 | \i The n-th \link #capturing-text backreference \endlink,
|
|---|
| 279 | e.g. \1, \2, etc.
|
|---|
| 280 | \endtable
|
|---|
| 281 |
|
|---|
| 282 | \e {Note that the C++ compiler transforms backslashes in strings
|
|---|
| 283 | so to include a <b>\\</b> in a regexp you will need to enter it
|
|---|
| 284 | twice, i.e. <b>\\\\</b>.}
|
|---|
| 285 |
|
|---|
| 286 | \target sets-of-characters
|
|---|
| 287 | \section1 Sets of Characters
|
|---|
| 288 |
|
|---|
| 289 | Square brackets are used to match any character in the set of
|
|---|
| 290 | characters contained within the square brackets. All the character
|
|---|
| 291 | set abbreviations described above can be used within square
|
|---|
| 292 | brackets. Apart from the character set abbreviations and the
|
|---|
| 293 | following two exceptions no characters have special meanings in
|
|---|
| 294 | square brackets.
|
|---|
| 295 |
|
|---|
| 296 | \table
|
|---|
| 297 | \row \i <b>^</b>
|
|---|
| 298 | \i The caret negates the character set if it occurs as the
|
|---|
| 299 | first character, i.e. immediately after the opening square
|
|---|
| 300 | bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
|
|---|
| 301 | but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
|
|---|
| 302 | 'c'.
|
|---|
| 303 | \row \i <b>-</b>
|
|---|
| 304 | \i The dash is used to indicate a range of characters, for
|
|---|
| 305 | example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
|
|---|
| 306 | \endtable
|
|---|
| 307 |
|
|---|
| 308 | Using the predefined character set abbreviations is more portable
|
|---|
| 309 | than using character ranges across platforms and languages. For
|
|---|
| 310 | example, <b>[0-9]</b> matches a digit in Western alphabets but
|
|---|
| 311 | <b>\d</b> matches a digit in \e any alphabet.
|
|---|
| 312 |
|
|---|
| 313 | Note that in most regexp literature sets of characters are called
|
|---|
| 314 | "character classes".
|
|---|
| 315 |
|
|---|
| 316 | \target quantifiers
|
|---|
| 317 | \section1 Quantifiers
|
|---|
| 318 |
|
|---|
| 319 | By default an expression is automatically quantified by
|
|---|
| 320 | <b>{1,1}</b>, i.e. it should occur exactly once. In the following
|
|---|
| 321 | list <b>\e {E}</b> stands for any expression. An expression is a
|
|---|
| 322 | character or an abbreviation for a set of characters or a set of
|
|---|
| 323 | characters in square brackets or any parenthesised expression.
|
|---|
| 324 |
|
|---|
| 325 | \table
|
|---|
| 326 | \row \i <b>\e {E}?</b>
|
|---|
| 327 | \i Matches zero or one occurrence of \e E. This quantifier
|
|---|
| 328 | means "the previous expression is optional" since it will
|
|---|
| 329 | match whether or not the expression occurs in the string. It
|
|---|
| 330 | is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
|
|---|
| 331 | will match 'dent' and 'dents'.
|
|---|
| 332 |
|
|---|
| 333 | \row \i <b>\e {E}+</b>
|
|---|
| 334 | \i Matches one or more occurrences of \e E. This is the same
|
|---|
| 335 | as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
|
|---|
| 336 | '0', '00', '000', etc.
|
|---|
| 337 |
|
|---|
| 338 | \row \i <b>\e {E}*</b>
|
|---|
| 339 | \i Matches zero or more occurrences of \e E. This is the same
|
|---|
| 340 | as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
|
|---|
| 341 | used by mistake. Since it matches \e zero or more
|
|---|
| 342 | occurrences it will match no occurrences at all. For example
|
|---|
| 343 | if we want to match strings that end in whitespace and use
|
|---|
| 344 | the regexp <b>\s*$</b> we would get a match on every string.
|
|---|
| 345 | This is because we have said find zero or more whitespace
|
|---|
| 346 | followed by the end of string, so even strings that don't end
|
|---|
| 347 | in whitespace will match. The regexp we want in this case is
|
|---|
| 348 | <b>\s+$</b> to match strings that have at least one
|
|---|
| 349 | whitespace at the end.
|
|---|
| 350 |
|
|---|
| 351 | \row \i <b>\e {E}{n}</b>
|
|---|
| 352 | \i Matches exactly \e n occurrences of the expression. This
|
|---|
| 353 | is the same as repeating the expression \e n times. For
|
|---|
| 354 | example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
|
|---|
| 355 | the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
|
|---|
| 356 |
|
|---|
| 357 | \row \i <b>\e {E}{n,}</b>
|
|---|
| 358 | \i Matches at least \e n occurrences of the expression. This
|
|---|
| 359 | is the same as <b>\e {E}{n,MAXINT}</b>.
|
|---|
| 360 |
|
|---|
| 361 | \row \i <b>\e {E}{,m}</b>
|
|---|
| 362 | \i Matches at most \e m occurrences of the expression. This
|
|---|
| 363 | is the same as <b>\e {E}{0,m}</b>.
|
|---|
| 364 |
|
|---|
| 365 | \row \i <b>\e {E}{n,m}</b>
|
|---|
| 366 | \i Matches at least \e n occurrences of the expression and at
|
|---|
| 367 | most \e m occurrences of the expression.
|
|---|
| 368 | \endtable
|
|---|
| 369 |
|
|---|
| 370 | (MAXINT is implementation dependent but will not be smaller than
|
|---|
| 371 | 1024.)
|
|---|
| 372 |
|
|---|
| 373 | If we wish to apply a quantifier to more than just the preceding
|
|---|
| 374 | character we can use parentheses to group characters together in
|
|---|
| 375 | an expression. For example, <b>tag+</b> matches a 't' followed by
|
|---|
| 376 | an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
|
|---|
| 377 | at least one occurrence of 'tag'.
|
|---|
| 378 |
|
|---|
| 379 | Note that quantifiers are "greedy". They will match as much text
|
|---|
| 380 | as they can. For example, <b>0+</b> will match as many zeros as it
|
|---|
| 381 | can from the first zero it finds, e.g. '2.<u>000</u>5'.
|
|---|
| 382 | Quantifiers can be made non-greedy, see setMinimal().
|
|---|
| 383 |
|
|---|
| 384 | \target capturing-text
|
|---|
| 385 | \section1 Capturing Text
|
|---|
| 386 |
|
|---|
| 387 | Parentheses allow us to group elements together so that we can
|
|---|
| 388 | quantify and capture them. For example if we have the expression
|
|---|
| 389 | <b>mail|letter|correspondence</b> that matches a string we know
|
|---|
| 390 | that \e one of the words matched but not which one. Using
|
|---|
| 391 | parentheses allows us to "capture" whatever is matched within
|
|---|
| 392 | their bounds, so if we used <b>(mail|letter|correspondence)</b>
|
|---|
| 393 | and matched this regexp against the string "I sent you some email"
|
|---|
| 394 | we can use the cap() or capturedTexts() functions to extract the
|
|---|
| 395 | matched characters, in this case 'mail'.
|
|---|
| 396 |
|
|---|
| 397 | We can use captured text within the regexp itself. To refer to the
|
|---|
| 398 | captured text we use \e backreferences which are indexed from 1,
|
|---|
| 399 | the same as for cap(). For example we could search for duplicate
|
|---|
| 400 | words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
|
|---|
| 401 | word boundary followed by one or more word characters followed by
|
|---|
| 402 | one or more non-word characters followed by the same text as the
|
|---|
| 403 | first parenthesised expression followed by a word boundary.
|
|---|
| 404 |
|
|---|
| 405 | If we want to use parentheses purely for grouping and not for
|
|---|
| 406 | capturing we can use the non-capturing syntax, e.g.
|
|---|
| 407 | <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
|
|---|
| 408 | end ')'. In this example we match either 'green' or 'blue' but we
|
|---|
| 409 | do not capture the match so we only know whether or not we matched
|
|---|
| 410 | but not which color we actually found. Using non-capturing
|
|---|
| 411 | parentheses is more efficient than using capturing parentheses
|
|---|
| 412 | since the regexp engine has to do less book-keeping.
|
|---|
| 413 |
|
|---|
| 414 | Both capturing and non-capturing parentheses may be nested.
|
|---|
| 415 |
|
|---|
| 416 | \target assertions
|
|---|
| 417 | \section1 Assertions
|
|---|
| 418 |
|
|---|
| 419 | Assertions make some statement about the text at the point where
|
|---|
| 420 | they occur in the regexp but they do not match any characters. In
|
|---|
| 421 | the following list <b>\e {E}</b> stands for any expression.
|
|---|
| 422 |
|
|---|
| 423 | \table
|
|---|
| 424 | \row \i <b>^</b>
|
|---|
| 425 | \i The caret signifies the beginning of the string. If you
|
|---|
| 426 | wish to match a literal \c{^} you must escape it by
|
|---|
| 427 | writing \c{\\^}. For example, <b>^#include</b> will only
|
|---|
| 428 | match strings which \e begin with the characters '#include'.
|
|---|
| 429 | (When the caret is the first character of a character set it
|
|---|
| 430 | has a special meaning, see \link #sets-of-characters Sets of
|
|---|
| 431 | Characters \endlink.)
|
|---|
| 432 |
|
|---|
| 433 | \row \i <b>$</b>
|
|---|
| 434 | \i The dollar signifies the end of the string. For example
|
|---|
| 435 | <b>\d\s*$</b> will match strings which end with a digit
|
|---|
| 436 | optionally followed by whitespace. If you wish to match a
|
|---|
| 437 | literal \c{$} you must escape it by writing
|
|---|
| 438 | \c{\\$}.
|
|---|
| 439 |
|
|---|
| 440 | \row \i <b>\\b</b>
|
|---|
| 441 | \i A word boundary. For example the regexp
|
|---|
| 442 | <b>\\bOK\\b</b> means match immediately after a word
|
|---|
| 443 | boundary (e.g. start of string or whitespace) the letter 'O'
|
|---|
| 444 | then the letter 'K' immediately before another word boundary
|
|---|
| 445 | (e.g. end of string or whitespace). But note that the
|
|---|
| 446 | assertion does not actually match any whitespace so if we
|
|---|
| 447 | write <b>(\\bOK\\b)</b> and we have a match it will only
|
|---|
| 448 | contain 'OK' even if the string is "Its <u>OK</u> now".
|
|---|
| 449 |
|
|---|
| 450 | \row \i <b>\\B</b>
|
|---|
| 451 | \i A non-word boundary. This assertion is true wherever
|
|---|
| 452 | <b>\\b</b> is false. For example if we searched for
|
|---|
| 453 | <b>\\Bon\\B</b> in "Left on" the match would fail (space
|
|---|
| 454 | and end of string aren't non-word boundaries), but it would
|
|---|
| 455 | match in "t<u>on</u>ne".
|
|---|
| 456 |
|
|---|
| 457 | \row \i <b>(?=\e E)</b>
|
|---|
| 458 | \i Positive lookahead. This assertion is true if the
|
|---|
| 459 | expression matches at this point in the regexp. For example,
|
|---|
| 460 | <b>const(?=\\s+char)</b> matches 'const' whenever it is
|
|---|
| 461 | followed by 'char', as in 'static <u>const</u> char *'.
|
|---|
| 462 | (Compare with <b>const\\s+char</b>, which matches 'static
|
|---|
| 463 | <u>const char</u> *'.)
|
|---|
| 464 |
|
|---|
| 465 | \row \i <b>(?!\e E)</b>
|
|---|
| 466 | \i Negative lookahead. This assertion is true if the
|
|---|
| 467 | expression does not match at this point in the regexp. For
|
|---|
| 468 | example, <b>const(?!\\s+char)</b> matches 'const' \e except
|
|---|
| 469 | when it is followed by 'char'.
|
|---|
| 470 | \endtable
|
|---|
| 471 |
|
|---|
| 472 | \target wildcard-matching
|
|---|
| 473 | \section1 Wildcard Matching (globbing)
|
|---|
| 474 |
|
|---|
| 475 | Most command shells such as \e bash or \e cmd.exe support "file
|
|---|
| 476 | globbing", the ability to identify a group of files by using
|
|---|
| 477 | wildcards. The setWildcard() function is used to switch between
|
|---|
| 478 | regexp and wildcard mode. Wildcard matching is much simpler than
|
|---|
| 479 | full regexps and has only four features:
|
|---|
| 480 |
|
|---|
| 481 | \table
|
|---|
| 482 | \row \i <b>c</b>
|
|---|
| 483 | \i Any character represents itself apart from those mentioned
|
|---|
| 484 | below. Thus <b>c</b> matches the character \e c.
|
|---|
| 485 | \row \i <b>?</b>
|
|---|
| 486 | \i This matches any single character. It is the same as
|
|---|
| 487 | <b>.</b> in full regexps.
|
|---|
| 488 | \row \i <b>*</b>
|
|---|
| 489 | \i This matches zero or more of any characters. It is the
|
|---|
| 490 | same as <b>.*</b> in full regexps.
|
|---|
| 491 | \row \i <b>[...]</b>
|
|---|
| 492 | \i Sets of characters can be represented in square brackets,
|
|---|
| 493 | similar to full regexps. Within the character class, like
|
|---|
| 494 | outside, backslash has no special meaning.
|
|---|
| 495 | \endtable
|
|---|
| 496 |
|
|---|
| 497 | For example if we are in wildcard mode and have strings which
|
|---|
| 498 | contain filenames we could identify HTML files with <b>*.html</b>.
|
|---|
| 499 | This will match zero or more characters followed by a dot followed
|
|---|
| 500 | by 'h', 't', 'm' and 'l'.
|
|---|
| 501 |
|
|---|
| 502 | \target perl-users
|
|---|
| 503 | \section1 Notes for Perl Users
|
|---|
| 504 |
|
|---|
| 505 | Most of the character class abbreviations supported by Perl are
|
|---|
| 506 | supported by QRegExp, see \link
|
|---|
| 507 | #characters-and-abbreviations-for-sets-of-characters characters
|
|---|
| 508 | and abbreviations for sets of characters \endlink.
|
|---|
| 509 |
|
|---|
| 510 | In QRegExp, apart from within character classes, \c{^} always
|
|---|
| 511 | signifies the start of the string, so carets must always be
|
|---|
| 512 | escaped unless used for that purpose. In Perl the meaning of caret
|
|---|
| 513 | varies automagically depending on where it occurs so escaping it
|
|---|
| 514 | is rarely necessary. The same applies to \c{$} which in
|
|---|
| 515 | QRegExp always signifies the end of the string.
|
|---|
| 516 |
|
|---|
| 517 | QRegExp's quantifiers are the same as Perl's greedy quantifiers.
|
|---|
| 518 | Non-greedy matching cannot be applied to individual quantifiers,
|
|---|
| 519 | but can be applied to all the quantifiers in the pattern. For
|
|---|
| 520 | example, to match the Perl regexp <b>ro+?m</b> requires:
|
|---|
| 521 | \code
|
|---|
| 522 | QRegExp rx( "ro+m" );
|
|---|
| 523 | rx.setMinimal( TRUE );
|
|---|
| 524 | \endcode
|
|---|
| 525 |
|
|---|
| 526 | The equivalent of Perl's \c{/i} option is
|
|---|
| 527 | setCaseSensitive(FALSE).
|
|---|
| 528 |
|
|---|
| 529 | Perl's \c{/g} option can be emulated using a \link
|
|---|
| 530 | #cap_in_a_loop loop \endlink.
|
|---|
| 531 |
|
|---|
| 532 | In QRegExp <b>.</b> matches any character, therefore all QRegExp
|
|---|
| 533 | regexps have the equivalent of Perl's \c{/s} option. QRegExp
|
|---|
| 534 | does not have an equivalent to Perl's \c{/m} option, but this
|
|---|
| 535 | can be emulated in various ways for example by splitting the input
|
|---|
| 536 | into lines or by looping with a regexp that searches for newlines.
|
|---|
| 537 |
|
|---|
| 538 | Because QRegExp is string oriented there are no \A, \Z or \z
|
|---|
| 539 | assertions. The \G assertion is not supported but can be emulated
|
|---|
| 540 | in a loop.
|
|---|
| 541 |
|
|---|
| 542 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
|
|---|
| 543 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
|
|---|
| 544 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or
|
|---|
| 545 | capturedTexts()[2], etc.
|
|---|
| 546 |
|
|---|
| 547 | To substitute a pattern use QString::replace().
|
|---|
| 548 |
|
|---|
| 549 | Perl's extended \c{/x} syntax is not supported, nor are
|
|---|
| 550 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
|
|---|
| 551 | the other hand, C++'s rules for literal strings can be used to
|
|---|
| 552 | achieve the same:
|
|---|
| 553 | \code
|
|---|
| 554 | QRegExp mark( "\\b" // word boundary
|
|---|
| 555 | "[Mm]ark" // the word we want to match
|
|---|
| 556 | );
|
|---|
| 557 | \endcode
|
|---|
| 558 |
|
|---|
| 559 | Both zero-width positive and zero-width negative lookahead
|
|---|
| 560 | assertions (?=pattern) and (?!pattern) are supported with the same
|
|---|
| 561 | syntax as Perl. Perl's lookbehind assertions, "independent"
|
|---|
| 562 | subexpressions and conditional expressions are not supported.
|
|---|
| 563 |
|
|---|
| 564 | Non-capturing parentheses are also supported, with the same
|
|---|
| 565 | (?:pattern) syntax.
|
|---|
| 566 |
|
|---|
| 567 | See QStringList::split() and QStringList::join() for equivalents
|
|---|
| 568 | to Perl's split and join functions.
|
|---|
| 569 |
|
|---|
| 570 | Note: because C++ transforms \\'s they must be written \e twice in
|
|---|
| 571 | code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
|
|---|
| 572 |
|
|---|
| 573 | \target code-examples
|
|---|
| 574 | \section1 Code Examples
|
|---|
| 575 |
|
|---|
| 576 | \code
|
|---|
| 577 | QRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99
|
|---|
| 578 | rx.search( "123" ); // returns -1 (no match)
|
|---|
| 579 | rx.search( "-6" ); // returns -1 (no match)
|
|---|
| 580 | rx.search( "6" ); // returns 0 (matched at position 0)
|
|---|
| 581 | \endcode
|
|---|
| 582 |
|
|---|
| 583 | The third string matches '<u>6</u>'. This is a simple validation
|
|---|
| 584 | regexp for integers in the range 0 to 99.
|
|---|
| 585 |
|
|---|
| 586 | \code
|
|---|
| 587 | QRegExp rx( "^\\S+$" ); // match strings without whitespace
|
|---|
| 588 | rx.search( "Hello world" ); // returns -1 (no match)
|
|---|
| 589 | rx.search( "This_is-OK" ); // returns 0 (matched at position 0)
|
|---|
| 590 | \endcode
|
|---|
| 591 |
|
|---|
| 592 | The second string matches '<u>This_is-OK</u>'. We've used the
|
|---|
| 593 | character set abbreviation '\S' (non-whitespace) and the anchors
|
|---|
| 594 | to match strings which contain no whitespace.
|
|---|
| 595 |
|
|---|
| 596 | In the following example we match strings containing 'mail' or
|
|---|
| 597 | 'letter' or 'correspondence' but only match whole words i.e. not
|
|---|
| 598 | 'email':
|
|---|
| 599 |
|
|---|
| 600 | \code
|
|---|
| 601 | QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
|
|---|
| 602 | rx.search( "I sent you an email" ); // returns -1 (no match)
|
|---|
| 603 | rx.search( "Please write the letter" ); // returns 17
|
|---|
| 604 | \endcode
|
|---|
| 605 |
|
|---|
| 606 | The second string matches "Please write the <u>letter</u>". The
|
|---|
| 607 | word 'letter' is also captured (because of the parentheses). We
|
|---|
| 608 | can see what text we've captured like this:
|
|---|
| 609 |
|
|---|
| 610 | \code
|
|---|
| 611 | QString captured = rx.cap( 1 ); // captured == "letter"
|
|---|
| 612 | \endcode
|
|---|
| 613 |
|
|---|
| 614 | This will capture the text from the first set of capturing
|
|---|
| 615 | parentheses (counting capturing left parentheses from left to
|
|---|
| 616 | right). The parentheses are counted from 1 since cap( 0 ) is the
|
|---|
| 617 | whole matched regexp (equivalent to '&' in most regexp engines).
|
|---|
| 618 |
|
|---|
| 619 | \code
|
|---|
| 620 | QRegExp rx( "&(?!amp;)" ); // match ampersands but not &amp;
|
|---|
| 621 | QString line1 = "This & that";
|
|---|
| 622 | line1.replace( rx, "&amp;" );
|
|---|
| 623 | // line1 == "This &amp; that"
|
|---|
| 624 | QString line2 = "His &amp; hers & theirs";
|
|---|
| 625 | line2.replace( rx, "&amp;" );
|
|---|
| 626 | // line2 == "His &amp; hers &amp; theirs"
|
|---|
| 627 | \endcode
|
|---|
| 628 |
|
|---|
| 629 | Here we've passed the QRegExp to QString's replace() function to
|
|---|
| 630 | replace the matched text with new text.
|
|---|
| 631 |
|
|---|
| 632 | \code
|
|---|
| 633 | QString str = "One Eric another Eirik, and an Ericsson."
|
|---|
| 634 | " How many Eiriks, Eric?";
|
|---|
| 635 | QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
|
|---|
| 636 | int pos = 0; // where we are in the string
|
|---|
| 637 | int count = 0; // how many Eric and Eirik's we've counted
|
|---|
| 638 | while ( pos >= 0 ) {
|
|---|
| 639 | pos = rx.search( str, pos );
|
|---|
| 640 | if ( pos >= 0 ) {
|
|---|
| 641 | pos++; // move along in str
|
|---|
| 642 | count++; // count our Eric or Eirik
|
|---|
| 643 | }
|
|---|
| 644 | }
|
|---|
| 645 | \endcode
|
|---|
| 646 |
|
|---|
| 647 | We've used the search() function to repeatedly match the regexp in
|
|---|
| 648 | the string. Note that instead of moving forward by one character
|
|---|
| 649 | at a time \c pos++ we could have written \c {pos +=
|
|---|
| 650 | rx.matchedLength()} to skip over the already matched string. The
|
|---|
| 651 | count will equal 3, matching 'One <u>Eric</u> another
|
|---|
| 652 | <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
|
|---|
| 653 | doesn't match 'Ericsson' or 'Eiriks' because in those words the name
|
|---|
| 654 | is not delimited by word boundaries on both sides.
|
|---|
| 655 |
|
|---|
| 656 | One common use of regexps is to split lines of delimited data into
|
|---|
| 657 | their component fields.
|
|---|
| 658 |
|
|---|
| 659 | \code
|
|---|
| 660 | str = "Trolltech AS\twww.trolltech.com\tNorway";
|
|---|
| 661 | QString company, web, country;
|
|---|
| 662 | rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
|
|---|
| 663 | if ( rx.search( str ) != -1 ) {
|
|---|
| 664 | company = rx.cap( 1 );
|
|---|
| 665 | web = rx.cap( 2 );
|
|---|
| 666 | country = rx.cap( 3 );
|
|---|
| 667 | }
|
|---|
| 668 | \endcode
|
|---|
| 669 |
|
|---|
| 670 | In this example our input lines have the format company name, web
|
|---|
| 671 | address and country. Unfortunately the regexp is rather long and
|
|---|
| 672 | not very versatile -- the code will break if we add any more
|
|---|
| 673 | fields. A simpler and better solution is to look for the
|
|---|
| 674 | separator, '\t' in this case, and take the surrounding text. The
|
|---|
| 675 | QStringList split() function can take a separator string or regexp
|
|---|
| 676 | as an argument and split a string accordingly.
|
|---|
| 677 |
|
|---|
| 678 | \code
|
|---|
| 679 | QStringList field = QStringList::split( "\t", str );
|
|---|
| 680 | \endcode
|
|---|
| 681 |
|
|---|
| 682 | Here field[0] is the company, field[1] the web address and so on.
|
|---|
| 683 |
|
|---|
| 684 | To imitate the matching of a shell we can use wildcard mode.
|
|---|
| 685 |
|
|---|
| 686 | \code
|
|---|
| 687 | QRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything
|
|---|
| 688 | rx.setWildcard( TRUE ); // now it's a valid wildcard regexp
|
|---|
| 689 | rx.exactMatch( "index.html" ); // returns TRUE
|
|---|
| 690 | rx.exactMatch( "default.htm" ); // returns FALSE
|
|---|
| 691 | rx.exactMatch( "readme.txt" ); // returns FALSE
|
|---|
| 692 | \endcode
|
|---|
| 693 |
|
|---|
| 694 | Wildcard matching can be convenient because of its simplicity, but
|
|---|
| 695 | any wildcard regexp can be defined using full regexps, e.g.
|
|---|
| 696 | <b>.*\\.html$</b>. Notice that we can't match both \c .html and \c
|
|---|
| 697 | .htm files with a wildcard unless we use <b>*.htm*</b> which will
|
|---|
| 698 | also match 'test.html.bak'. A full regexp gives us the precision
|
|---|
| 699 | we need, <b>.*\\.html?$</b>.
|
|---|
| 700 |
|
|---|
| 701 | QRegExp can match case insensitively using setCaseSensitive(), and
|
|---|
| 702 | can use non-greedy matching, see setMinimal(). By default QRegExp
|
|---|
| 703 | uses full regexps but this can be changed with setWildcard().
|
|---|
| 704 | Searching can be forward with search() or backward with
|
|---|
| 705 | searchRev(). Captured text can be accessed using capturedTexts()
|
|---|
| 706 | which returns a string list of all captured strings, or using
|
|---|
| 707 | cap() which returns the captured string for the given index. The
|
|---|
| 708 | pos() function takes a match index and returns the position in the
|
|---|
| 709 | string where the match was made (or -1 if there was no match).
|
|---|
| 710 |
|
|---|
| 711 | \sa QRegExpValidator QString QStringList
|
|---|
| 712 |
|
|---|
| 713 | \target member-function-documentation
|
|---|
| 714 | */
|
|---|
| 715 |
|
|---|
| 716 | const int NumBadChars = 64; // modulus used by BadChar(); also the length given to a first-occurrence array (occ1)
|
|---|
| 717 | #define BadChar( ch ) ( (ch).unicode() % NumBadChars ) // folds a character's Unicode value into the range 0..NumBadChars-1
|
|---|
| 718 |
|
|---|
| 719 | const int NoOccurrence = INT_MAX; // sentinel; presumably "character never occurs" in an occ1 array -- confirm at use sites
|
|---|
| 720 | const int EmptyCapture = INT_MAX; // sentinel; presumably marks a capture that holds nothing -- confirm at use sites
|
|---|
| 721 | const int InftyLen = INT_MAX; // stands for an unbounded length (see Box::maxl)
|
|---|
| 722 | const int InftyRep = 1025; // presumably the repetition count treated as "unbounded" -- confirm in the quantifier parser
|
|---|
| 723 | const int EOS = -1; // presumably the end-of-input marker of the lexer -- confirm in getChar()
|
|---|
| 724 |
|
|---|
| 725 | static bool isWord( QChar ch )
|
|---|
| 726 | {
|
|---|
| 727 | return ch.isLetterOrNumber() || ch == QChar( '_' );
|
|---|
| 728 | }
|
|---|
| 729 |
|
|---|
| 730 | /*
|
|---|
| 731 | Merges two QMemArrays of ints and puts the result into the first
|
|---|
| 732 | one.
|
|---|
| 733 | */
|
|---|
| 734 | static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b )
|
|---|
| 735 | {
|
|---|
| 736 | int asize = a->size();
|
|---|
| 737 | int bsize = b.size();
|
|---|
| 738 | if ( asize == 0 ) {
|
|---|
| 739 | *a = b.copy();
|
|---|
| 740 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 741 | } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
|
|---|
| 742 | a->resize( asize + 1 );
|
|---|
| 743 | (*a)[asize] = b[0];
|
|---|
| 744 | #endif
|
|---|
| 745 | } else if ( bsize >= 1 ) {
|
|---|
| 746 | int csize = asize + bsize;
|
|---|
| 747 | QMemArray<int> c( csize );
|
|---|
| 748 | int i = 0, j = 0, k = 0;
|
|---|
| 749 | while ( i < asize ) {
|
|---|
| 750 | if ( j < bsize ) {
|
|---|
| 751 | if ( (*a)[i] == b[j] ) {
|
|---|
| 752 | i++;
|
|---|
| 753 | csize--;
|
|---|
| 754 | } else if ( (*a)[i] < b[j] ) {
|
|---|
| 755 | c[k++] = (*a)[i++];
|
|---|
| 756 | } else {
|
|---|
| 757 | c[k++] = b[j++];
|
|---|
| 758 | }
|
|---|
| 759 | } else {
|
|---|
| 760 | memcpy( c.data() + k, (*a).data() + i,
|
|---|
| 761 | (asize - i) * sizeof(int) );
|
|---|
| 762 | break;
|
|---|
| 763 | }
|
|---|
| 764 | }
|
|---|
| 765 | c.resize( csize );
|
|---|
| 766 | if ( j < bsize )
|
|---|
| 767 | memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
|
|---|
| 768 | *a = c;
|
|---|
| 769 | }
|
|---|
| 770 | }
|
|---|
| 771 |
|
|---|
| 772 | /*
|
|---|
| 773 | Merges two disjoint QMaps of (int, int) pairs and puts the result
|
|---|
| 774 | into the first one.
|
|---|
| 775 | */
|
|---|
| 776 | static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
|
|---|
| 777 | {
|
|---|
| 778 | QMap<int, int>::ConstIterator it;
|
|---|
| 779 | for ( it = b.begin(); it != b.end(); ++it )
|
|---|
| 780 | a->insert( it.key(), *it );
|
|---|
| 781 | }
|
|---|
| 782 |
|
|---|
| 783 | /*
|
|---|
| 784 | Returns the value associated to key k in QMap m of (int, int)
|
|---|
| 785 | pairs, or 0 if no such value is explicitly present.
|
|---|
| 786 | */
|
|---|
| 787 | static int at( const QMap<int, int>& m, int k )
|
|---|
| 788 | {
|
|---|
| 789 | QMap<int, int>::ConstIterator it = m.find( k );
|
|---|
| 790 | if ( it == m.end() )
|
|---|
| 791 | return 0;
|
|---|
| 792 | else
|
|---|
| 793 | return *it;
|
|---|
| 794 | }
|
|---|
| 795 |
|
|---|
| 796 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 797 | /*
|
|---|
| 798 | Translates a wildcard pattern to an equivalent regular expression
|
|---|
| 799 | pattern (e.g., *.cpp to .*\.cpp).
|
|---|
| 800 | */
|
|---|
| 801 | static QString wc2rx( const QString& wc_str )
|
|---|
| 802 | {
|
|---|
| 803 | int wclen = wc_str.length();
|
|---|
| 804 | QString rx = QString::fromLatin1( "" );
|
|---|
| 805 | int i = 0;
|
|---|
| 806 | const QChar *wc = wc_str.unicode();
|
|---|
| 807 | while ( i < wclen ) {
|
|---|
| 808 | QChar c = wc[i++];
|
|---|
| 809 | switch ( c.unicode() ) {
|
|---|
| 810 | case '*':
|
|---|
| 811 | rx += QString::fromLatin1( ".*" );
|
|---|
| 812 | break;
|
|---|
| 813 | case '?':
|
|---|
| 814 | rx += QChar( '.' );
|
|---|
| 815 | break;
|
|---|
| 816 | case '$':
|
|---|
| 817 | case '(':
|
|---|
| 818 | case ')':
|
|---|
| 819 | case '+':
|
|---|
| 820 | case '.':
|
|---|
| 821 | case '\\':
|
|---|
| 822 | case '^':
|
|---|
| 823 | case '{':
|
|---|
| 824 | case '|':
|
|---|
| 825 | case '}':
|
|---|
| 826 | rx += QChar( '\\' );
|
|---|
| 827 | rx += c;
|
|---|
| 828 | break;
|
|---|
| 829 | case '[':
|
|---|
| 830 | rx += c;
|
|---|
| 831 | if ( wc[i] == QChar('^') )
|
|---|
| 832 | rx += wc[i++];
|
|---|
| 833 | if ( i < wclen ) {
|
|---|
| 834 | if ( rx[i] == ']' )
|
|---|
| 835 | rx += wc[i++];
|
|---|
| 836 | while ( i < wclen && wc[i] != QChar(']') ) {
|
|---|
| 837 | if ( wc[i] == '\\' )
|
|---|
| 838 | rx += QChar( '\\' );
|
|---|
| 839 | rx += wc[i++];
|
|---|
| 840 | }
|
|---|
| 841 | }
|
|---|
| 842 | break;
|
|---|
| 843 | default:
|
|---|
| 844 | rx += c;
|
|---|
| 845 | }
|
|---|
| 846 | }
|
|---|
| 847 | return rx;
|
|---|
| 848 | }
|
|---|
| 849 | #endif
|
|---|
| 850 |
|
|---|
| 851 | /*
|
|---|
| 852 | The class QRegExpEngine encapsulates a modified nondeterministic
|
|---|
| 853 | finite automaton (NFA).
|
|---|
| 854 | */
|
|---|
| 855 | class QRegExpEngine : public QShared
|
|---|
| 856 | {
|
|---|
| 857 | public:
|
|---|
| 858 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 859 | /*
|
|---|
| 860 | The class CharClass represents a set of characters, such as can
|
|---|
| 861 | be found in regular expressions (e.g., [a-z] denotes the set
|
|---|
| 862 | {a, b, ..., z}).
|
|---|
| 863 | */
|
|---|
| 864 | class CharClass
|
|---|
| 865 | {
|
|---|
| 866 | public:
|
|---|
| 867 | CharClass();
|
|---|
| 868 | CharClass( const CharClass& cc ) { operator=( cc ); }
|
|---|
| 869 |
|
|---|
| 870 | CharClass& operator=( const CharClass& cc );
|
|---|
| 871 |
|
|---|
| 872 | void clear();
|
|---|
| 873 | bool negative() const { return n; }
|
|---|
| 874 | void setNegative( bool negative );
|
|---|
| 875 | void addCategories( int cats );
|
|---|
| 876 | void addRange( ushort from, ushort to );
|
|---|
| 877 | void addSingleton( ushort ch ) { addRange( ch, ch ); }
|
|---|
| 878 |
|
|---|
| 879 | bool in( QChar ch ) const;
|
|---|
| 880 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 881 | const QMemArray<int>& firstOccurrence() const { return occ1; }
|
|---|
| 882 | #endif
|
|---|
| 883 |
|
|---|
| 884 | #if defined(QT_DEBUG)
|
|---|
| 885 | void dump() const;
|
|---|
| 886 | #endif
|
|---|
| 887 |
|
|---|
| 888 | private:
|
|---|
| 889 | /*
|
|---|
| 890 | The struct Range represents a range of characters (e.g.,
|
|---|
| 891 | [0-9] denotes range 48 to 57).
|
|---|
| 892 | */
|
|---|
| 893 | struct Range
|
|---|
| 894 | {
|
|---|
| 895 | ushort from; // 48
|
|---|
| 896 | ushort to; // 57
|
|---|
| 897 | };
|
|---|
| 898 |
|
|---|
| 899 | int c; // character classes
|
|---|
| 900 | QMemArray<Range> r; // character ranges
|
|---|
| 901 | bool n; // negative?
|
|---|
| 902 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 903 | QMemArray<int> occ1; // first-occurrence array
|
|---|
| 904 | #endif
|
|---|
| 905 | };
|
|---|
| 906 | #else
|
|---|
| 907 | struct CharClass
|
|---|
| 908 | {
|
|---|
| 909 | int dummy;
|
|---|
| 910 |
|
|---|
| 911 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 912 | CharClass() { occ1.fill( 0, NumBadChars ); }
|
|---|
| 913 |
|
|---|
| 914 | const QMemArray<int>& firstOccurrence() const { return occ1; }
|
|---|
| 915 | QMemArray<int> occ1;
|
|---|
| 916 | #endif
|
|---|
| 917 | };
|
|---|
| 918 | #endif
|
|---|
| 919 |
|
|---|
| 920 | QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
|
|---|
| 921 | QRegExpEngine( const QString& rx, bool caseSensitive );
|
|---|
| 922 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 923 | ~QRegExpEngine();
|
|---|
| 924 | #endif
|
|---|
| 925 |
|
|---|
| 926 | bool isValid() const { return valid; }
|
|---|
| 927 | bool caseSensitive() const { return cs; }
|
|---|
| 928 | const QString& errorString() const { return yyError; }
|
|---|
| 929 | int numCaptures() const { return officialncap; }
|
|---|
| 930 | void match( const QString& str, int pos, bool minimal, bool oneTest,
|
|---|
| 931 | int caretIndex, QMemArray<int>& captured );
|
|---|
| 932 | int partialMatchLength() const { return mmOneTestMatchedLen; }
|
|---|
| 933 |
|
|---|
| 934 | int createState( QChar ch );
|
|---|
| 935 | int createState( const CharClass& cc );
|
|---|
| 936 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 937 | int createState( int bref );
|
|---|
| 938 | #endif
|
|---|
| 939 |
|
|---|
| 940 | void addCatTransitions( const QMemArray<int>& from,
|
|---|
| 941 | const QMemArray<int>& to );
|
|---|
| 942 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 943 | void addPlusTransitions( const QMemArray<int>& from,
|
|---|
| 944 | const QMemArray<int>& to, int atom );
|
|---|
| 945 | #endif
|
|---|
| 946 |
|
|---|
| 947 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 948 | int anchorAlternation( int a, int b );
|
|---|
| 949 | int anchorConcatenation( int a, int b );
|
|---|
| 950 | #else
|
|---|
| 951 | int anchorAlternation( int a, int b ) { return a & b; }
|
|---|
| 952 | int anchorConcatenation( int a, int b ) { return a | b; }
|
|---|
| 953 | #endif
|
|---|
| 954 | void addAnchors( int from, int to, int a );
|
|---|
| 955 |
|
|---|
| 956 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 957 | void heuristicallyChooseHeuristic();
|
|---|
| 958 | #endif
|
|---|
| 959 |
|
|---|
| 960 | #if defined(QT_DEBUG)
|
|---|
| 961 | void dump() const;
|
|---|
| 962 | #endif
|
|---|
| 963 |
|
|---|
| 964 | private:
|
|---|
| 965 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
|
|---|
| 966 |
|
|---|
| 967 | /*
|
|---|
| 968 | The struct State represents one state in a modified NFA. The
|
|---|
| 969 | input characters matched are stored in the state instead of on
|
|---|
| 970 | the transitions, something possible for an automaton
|
|---|
| 971 | constructed from a regular expression.
|
|---|
| 972 | */
|
|---|
| 973 | struct State
|
|---|
| 974 | {
|
|---|
| 975 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 976 | int atom; // which atom does this state belong to?
|
|---|
| 977 | #endif
|
|---|
| 978 | int match; // what does it match? (see CharClassBit and BackRefBit)
|
|---|
| 979 | QMemArray<int> outs; // out-transitions
|
|---|
| 980 | QMap<int, int> *reenter; // atoms reentered when transiting out
|
|---|
| 981 | QMap<int, int> *anchors; // anchors met when transiting out
|
|---|
| 982 |
|
|---|
| 983 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 984 | State( int a, int m )
|
|---|
| 985 | : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
|
|---|
| 986 | #else
|
|---|
| 987 | State( int m )
|
|---|
| 988 | : match( m ), reenter( 0 ), anchors( 0 ) { }
|
|---|
| 989 | #endif
|
|---|
| 990 | ~State() { delete reenter; delete anchors; }
|
|---|
| 991 | };
|
|---|
| 992 |
|
|---|
| 993 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 994 | /*
|
|---|
| 995 | The struct Lookahead represents a lookahead a la Perl (e.g.,
|
|---|
| 996 | (?=foo) and (?!bar)).
|
|---|
| 997 | */
|
|---|
| 998 | struct Lookahead
|
|---|
| 999 | {
|
|---|
| 1000 | QRegExpEngine *eng; // NFA representing the embedded regular expression
|
|---|
| 1001 | bool neg; // negative lookahead?
|
|---|
| 1002 |
|
|---|
| 1003 | Lookahead( QRegExpEngine *eng0, bool neg0 )
|
|---|
| 1004 | : eng( eng0 ), neg( neg0 ) { }
|
|---|
| 1005 | ~Lookahead() { delete eng; }
|
|---|
| 1006 | };
|
|---|
| 1007 | #endif
|
|---|
| 1008 |
|
|---|
| 1009 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1010 | /*
|
|---|
| 1011 | The struct Atom represents one node in the hierarchy of regular
|
|---|
| 1012 | expression atoms.
|
|---|
| 1013 | */
|
|---|
| 1014 | struct Atom
|
|---|
| 1015 | {
|
|---|
| 1016 | int parent; // index of parent in array of atoms
|
|---|
| 1017 | int capture; // index of capture, from 1 to ncap
|
|---|
| 1018 | };
|
|---|
| 1019 | #endif
|
|---|
| 1020 |
|
|---|
| 1021 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 1022 | /*
|
|---|
| 1023 | The struct AnchorAlternation represents a pair of anchors with
|
|---|
| 1024 | OR semantics.
|
|---|
| 1025 | */
|
|---|
| 1026 | struct AnchorAlternation
|
|---|
| 1027 | {
|
|---|
| 1028 | int a; // this anchor...
|
|---|
| 1029 | int b; // ...or this one
|
|---|
| 1030 | };
|
|---|
| 1031 | #endif
|
|---|
| 1032 |
|
|---|
| 1033 | enum { InitialState = 0, FinalState = 1 };
|
|---|
| 1034 | void setup( bool caseSensitive );
|
|---|
| 1035 | int setupState( int match );
|
|---|
| 1036 |
|
|---|
| 1037 | /*
|
|---|
| 1038 | Let's hope that 13 lookaheads and 14 back-references are
|
|---|
| 1039 | enough.
|
|---|
| 1040 | */
|
|---|
| 1041 | enum { MaxLookaheads = 13, MaxBackRefs = 14 };
|
|---|
| 1042 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
|
|---|
| 1043 | Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
|
|---|
| 1044 | Anchor_FirstLookahead = 0x00000010,
|
|---|
| 1045 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
|
|---|
| 1046 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
|
|---|
| 1047 | Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
|
|---|
| 1048 |
|
|---|
| 1049 | Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
|
|---|
| 1050 | ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
|
|---|
| 1051 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1052 | int startAtom( bool capture );
|
|---|
| 1053 | void finishAtom( int atom ) { cf = f[atom].parent; }
|
|---|
| 1054 | #endif
|
|---|
| 1055 |
|
|---|
| 1056 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 1057 | int addLookahead( QRegExpEngine *eng, bool negative );
|
|---|
| 1058 | #endif
|
|---|
| 1059 |
|
|---|
| 1060 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1061 | bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
|
|---|
| 1062 | const int *end2 );
|
|---|
| 1063 | #endif
|
|---|
| 1064 | bool testAnchor( int i, int a, const int *capBegin );
|
|---|
| 1065 |
|
|---|
| 1066 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1067 | bool goodStringMatch();
|
|---|
| 1068 | bool badCharMatch();
|
|---|
| 1069 | #else
|
|---|
| 1070 | bool bruteMatch();
|
|---|
| 1071 | #endif
|
|---|
| 1072 | bool matchHere();
|
|---|
| 1073 |
|
|---|
| 1074 | QPtrVector<State> s; // array of states
|
|---|
| 1075 | int ns; // number of states
|
|---|
| 1076 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1077 | QMemArray<Atom> f; // atom hierarchy
|
|---|
| 1078 | int nf; // number of atoms
|
|---|
| 1079 | int cf; // current atom
|
|---|
| 1080 | #endif
|
|---|
| 1081 | int officialncap; // number of captures, seen from the outside
|
|---|
| 1082 | int ncap; // number of captures, seen from the inside
|
|---|
| 1083 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 1084 | QPtrVector<CharClass> cl; // array of character classes
|
|---|
| 1085 | #endif
|
|---|
| 1086 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 1087 | QPtrVector<Lookahead> ahead; // array of lookaheads
|
|---|
| 1088 | #endif
|
|---|
| 1089 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 1090 | QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
|
|---|
| 1091 | #endif
|
|---|
| 1092 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1093 | bool caretAnchored; // does the regexp start with ^?
|
|---|
| 1094 | bool trivial; // is the good-string all that needs to match?
|
|---|
| 1095 | #endif
|
|---|
| 1096 | bool valid; // is the regular expression valid?
|
|---|
| 1097 | bool cs; // case sensitive?
|
|---|
| 1098 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1099 | int nbrefs; // number of back-references
|
|---|
| 1100 | #endif
|
|---|
| 1101 |
|
|---|
| 1102 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1103 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
|
|---|
| 1104 |
|
|---|
| 1105 | int goodEarlyStart; // the index where goodStr can first occur in a match
|
|---|
| 1106 | int goodLateStart; // the index where goodStr can last occur in a match
|
|---|
| 1107 | QString goodStr; // the string that any match has to contain
|
|---|
| 1108 |
|
|---|
| 1109 | int minl; // the minimum length of a match
|
|---|
| 1110 | QMemArray<int> occ1; // first-occurrence array
|
|---|
| 1111 | #endif
|
|---|
| 1112 |
|
|---|
| 1113 | /*
|
|---|
| 1114 | The class Box is an abstraction for a regular expression
|
|---|
| 1115 | fragment. It can also be seen as one node in the syntax tree of
|
|---|
| 1116 | a regular expression with synthetized attributes.
|
|---|
| 1117 |
|
|---|
| 1118 | Its interface is ugly for performance reasons.
|
|---|
| 1119 | */
|
|---|
| 1120 | class Box
|
|---|
| 1121 | {
|
|---|
| 1122 | public:
|
|---|
| 1123 | Box( QRegExpEngine *engine );
|
|---|
| 1124 | Box( const Box& b ) { operator=( b ); }
|
|---|
| 1125 |
|
|---|
| 1126 | Box& operator=( const Box& b );
|
|---|
| 1127 |
|
|---|
| 1128 | void clear() { operator=( Box(eng) ); }
|
|---|
| 1129 | void set( QChar ch );
|
|---|
| 1130 | void set( const CharClass& cc );
|
|---|
| 1131 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1132 | void set( int bref );
|
|---|
| 1133 | #endif
|
|---|
| 1134 |
|
|---|
| 1135 | void cat( const Box& b );
|
|---|
| 1136 | void orx( const Box& b );
|
|---|
| 1137 | void plus( int atom );
|
|---|
| 1138 | void opt();
|
|---|
| 1139 | void catAnchor( int a );
|
|---|
| 1140 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1141 | void setupHeuristics();
|
|---|
| 1142 | #endif
|
|---|
| 1143 |
|
|---|
| 1144 | #if defined(QT_DEBUG)
|
|---|
| 1145 | void dump() const;
|
|---|
| 1146 | #endif
|
|---|
| 1147 |
|
|---|
| 1148 | private:
|
|---|
| 1149 | void addAnchorsToEngine( const Box& to ) const;
|
|---|
| 1150 |
|
|---|
| 1151 | QRegExpEngine *eng; // the automaton under construction
|
|---|
| 1152 | QMemArray<int> ls; // the left states (firstpos)
|
|---|
| 1153 | QMemArray<int> rs; // the right states (lastpos)
|
|---|
| 1154 | QMap<int, int> lanchors; // the left anchors
|
|---|
| 1155 | QMap<int, int> ranchors; // the right anchors
|
|---|
| 1156 | int skipanchors; // the anchors to match if the box is skipped
|
|---|
| 1157 |
|
|---|
| 1158 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1159 | int earlyStart; // the index where str can first occur
|
|---|
| 1160 | int lateStart; // the index where str can last occur
|
|---|
| 1161 | QString str; // a string that has to occur in any match
|
|---|
| 1162 | QString leftStr; // a string occurring at the left of this box
|
|---|
| 1163 | QString rightStr; // a string occurring at the right of this box
|
|---|
| 1164 | int maxl; // the maximum length of this box (possibly InftyLen)
|
|---|
| 1165 | #endif
|
|---|
| 1166 |
|
|---|
| 1167 | int minl; // the minimum length of this box
|
|---|
| 1168 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1169 | QMemArray<int> occ1; // first-occurrence array
|
|---|
| 1170 | #endif
|
|---|
| 1171 | };
|
|---|
| 1172 | friend class Box;
|
|---|
| 1173 |
|
|---|
| 1174 | /*
|
|---|
| 1175 | This is the lexical analyzer for regular expressions.
|
|---|
| 1176 | */
|
|---|
| 1177 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
|
|---|
| 1178 | Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
|
|---|
| 1179 | Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
|
|---|
| 1180 | Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
|
|---|
| 1181 | int getChar();
|
|---|
| 1182 | int getEscape();
|
|---|
| 1183 | #ifndef QT_NO_REGEXP_INTERVAL
|
|---|
| 1184 | int getRep( int def );
|
|---|
| 1185 | #endif
|
|---|
| 1186 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 1187 | void skipChars( int n );
|
|---|
| 1188 | #endif
|
|---|
| 1189 | void error( const char *msg );
|
|---|
| 1190 | void startTokenizer( const QChar *rx, int len );
|
|---|
| 1191 | int getToken();
|
|---|
| 1192 |
|
|---|
| 1193 | const QChar *yyIn; // a pointer to the input regular expression pattern
|
|---|
| 1194 | int yyPos0; // the position of yyTok in the input pattern
|
|---|
| 1195 | int yyPos; // the position of the next character to read
|
|---|
| 1196 | int yyLen; // the length of yyIn
|
|---|
| 1197 | int yyCh; // the last character read
|
|---|
| 1198 | CharClass *yyCharClass; // attribute for Tok_CharClass tokens
|
|---|
| 1199 | int yyMinRep; // attribute for Tok_Quantifier
|
|---|
| 1200 | int yyMaxRep; // ditto
|
|---|
| 1201 | QString yyError; // syntax error or overflow during parsing?
|
|---|
| 1202 |
|
|---|
| 1203 | /*
|
|---|
| 1204 | This is the syntactic analyzer for regular expressions.
|
|---|
| 1205 | */
|
|---|
| 1206 | int parse( const QChar *rx, int len );
|
|---|
| 1207 | void parseAtom( Box *box );
|
|---|
| 1208 | void parseFactor( Box *box );
|
|---|
| 1209 | void parseTerm( Box *box );
|
|---|
| 1210 | void parseExpression( Box *box );
|
|---|
| 1211 |
|
|---|
| 1212 | int yyTok; // the last token read
|
|---|
| 1213 | bool yyMayCapture; // set this to FALSE to disable capturing
|
|---|
| 1214 |
|
|---|
| 1215 | /*
|
|---|
| 1216 | This is the engine state during matching.
|
|---|
| 1217 | */
|
|---|
| 1218 | const QString *mmStr; // a pointer to the input QString
|
|---|
| 1219 | const QChar *mmIn; // a pointer to the input string data
|
|---|
| 1220 | int mmPos; // the current position in the string
|
|---|
| 1221 | int mmCaretPos;
|
|---|
| 1222 | int mmLen; // the length of the input string
|
|---|
| 1223 | bool mmMinimal; // minimal matching?
|
|---|
| 1224 | QMemArray<int> mmBigArray; // big QMemArray<int> array
|
|---|
| 1225 | int *mmInNextStack; // is state is mmNextStack?
|
|---|
| 1226 | int *mmCurStack; // stack of current states
|
|---|
| 1227 | int *mmNextStack; // stack of next states
|
|---|
| 1228 | int *mmCurCapBegin; // start of current states' captures
|
|---|
| 1229 | int *mmNextCapBegin; // start of next states' captures
|
|---|
| 1230 | int *mmCurCapEnd; // end of current states' captures
|
|---|
| 1231 | int *mmNextCapEnd; // end of next states' captures
|
|---|
| 1232 | int *mmTempCapBegin; // start of temporary captures
|
|---|
| 1233 | int *mmTempCapEnd; // end of temporary captures
|
|---|
| 1234 | int *mmCapBegin; // start of captures for a next state
|
|---|
| 1235 | int *mmCapEnd; // end of captures for a next state
|
|---|
| 1236 | int *mmSlideTab; // bump-along slide table for bad-character heuristic
|
|---|
| 1237 | int mmSlideTabSize; // size of slide table
|
|---|
| 1238 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1239 | QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
|
|---|
| 1240 | #endif
|
|---|
| 1241 | int mmMatchLen; // length of match
|
|---|
| 1242 | int mmOneTestMatchedLen; // length of partial match
|
|---|
| 1243 | };
|
|---|
| 1244 |
|
|---|
| 1245 | QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
|
|---|
| 1246 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1247 | : mmSleeping( 101 )
|
|---|
| 1248 | #endif
|
|---|
| 1249 | {
|
|---|
| 1250 | setup( caseSensitive );
|
|---|
| 1251 | valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
|
|---|
| 1252 | if ( !valid ) {
|
|---|
| 1253 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1254 | trivial = FALSE;
|
|---|
| 1255 | #endif
|
|---|
| 1256 | error( RXERR_LEFTDELIM );
|
|---|
| 1257 | }
|
|---|
| 1258 | }
|
|---|
| 1259 |
|
|---|
| 1260 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1261 | QRegExpEngine::~QRegExpEngine()
|
|---|
| 1262 | {
|
|---|
| 1263 | }
|
|---|
| 1264 | #endif
|
|---|
| 1265 |
|
|---|
| 1266 | /*
|
|---|
| 1267 | Tries to match in str and returns an array of (begin, length) pairs
|
|---|
| 1268 | for captured text. If there is no match, all pairs are (-1, -1).
|
|---|
| 1269 | */
|
|---|
| 1270 | void QRegExpEngine::match( const QString& str, int pos, bool minimal,
|
|---|
| 1271 | bool oneTest, int caretIndex,
|
|---|
| 1272 | QMemArray<int>& captured )
|
|---|
| 1273 | {
|
|---|
| 1274 | bool matched = FALSE;
|
|---|
| 1275 |
|
|---|
| 1276 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1277 | if ( trivial && !oneTest ) {
|
|---|
| 1278 | mmPos = str.find( goodStr, pos, cs );
|
|---|
| 1279 | mmMatchLen = goodStr.length();
|
|---|
| 1280 | matched = ( mmPos != -1 );
|
|---|
| 1281 | } else
|
|---|
| 1282 | #endif
|
|---|
| 1283 | {
|
|---|
| 1284 | mmStr = &str;
|
|---|
| 1285 | mmIn = str.unicode();
|
|---|
| 1286 | if ( mmIn == 0 )
|
|---|
| 1287 | mmIn = &QChar::null;
|
|---|
| 1288 | mmPos = pos;
|
|---|
| 1289 | mmCaretPos = caretIndex;
|
|---|
| 1290 | mmLen = str.length();
|
|---|
| 1291 | mmMinimal = minimal;
|
|---|
| 1292 | mmMatchLen = 0;
|
|---|
| 1293 | mmOneTestMatchedLen = 0;
|
|---|
| 1294 |
|
|---|
| 1295 | if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
|
|---|
| 1296 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1297 | if ( oneTest ) {
|
|---|
| 1298 | matched = matchHere();
|
|---|
| 1299 | } else {
|
|---|
| 1300 | if ( mmPos <= mmLen - minl ) {
|
|---|
| 1301 | if ( caretAnchored ) {
|
|---|
| 1302 | matched = matchHere();
|
|---|
| 1303 | } else if ( useGoodStringHeuristic ) {
|
|---|
| 1304 | matched = goodStringMatch();
|
|---|
| 1305 | } else {
|
|---|
| 1306 | matched = badCharMatch();
|
|---|
| 1307 | }
|
|---|
| 1308 | }
|
|---|
| 1309 | }
|
|---|
| 1310 | #else
|
|---|
| 1311 | matched = oneTest ? matchHere() : bruteMatch();
|
|---|
| 1312 | #endif
|
|---|
| 1313 | }
|
|---|
| 1314 | }
|
|---|
| 1315 |
|
|---|
| 1316 | int capturedSize = 2 + 2 * officialncap;
|
|---|
| 1317 | captured.detach();
|
|---|
| 1318 | captured.resize( capturedSize );
|
|---|
| 1319 | if ( matched ) {
|
|---|
| 1320 | captured[0] = mmPos;
|
|---|
| 1321 | captured[1] = mmMatchLen;
|
|---|
| 1322 | for ( int j = 0; j < officialncap; j++ ) {
|
|---|
| 1323 | int len = mmCapEnd[j] - mmCapBegin[j];
|
|---|
| 1324 | captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
|
|---|
| 1325 | captured[2 + 2 * j + 1] = len;
|
|---|
| 1326 | }
|
|---|
| 1327 | } else {
|
|---|
| 1328 | // we rely on 2's complement here
|
|---|
| 1329 | memset( captured.data(), -1, capturedSize * sizeof(int) );
|
|---|
| 1330 | }
|
|---|
| 1331 | }
|
|---|
| 1332 |
|
|---|
| 1333 | /*
|
|---|
| 1334 | The three following functions add one state to the automaton and
|
|---|
| 1335 | return the number of the state.
|
|---|
| 1336 | */
|
|---|
| 1337 |
|
|---|
| 1338 | int QRegExpEngine::createState( QChar ch )
|
|---|
| 1339 | {
|
|---|
| 1340 | return setupState( ch.unicode() );
|
|---|
| 1341 | }
|
|---|
| 1342 |
|
|---|
| 1343 | int QRegExpEngine::createState( const CharClass& cc )
|
|---|
| 1344 | {
|
|---|
| 1345 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 1346 | int n = cl.size();
|
|---|
| 1347 | cl.resize( n + 1 );
|
|---|
| 1348 | cl.insert( n, new CharClass(cc) );
|
|---|
| 1349 | return setupState( CharClassBit | n );
|
|---|
| 1350 | #else
|
|---|
| 1351 | Q_UNUSED( cc );
|
|---|
| 1352 | return setupState( CharClassBit );
|
|---|
| 1353 | #endif
|
|---|
| 1354 | }
|
|---|
| 1355 |
|
|---|
| 1356 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1357 | int QRegExpEngine::createState( int bref )
|
|---|
| 1358 | {
|
|---|
| 1359 | if ( bref > nbrefs ) {
|
|---|
| 1360 | nbrefs = bref;
|
|---|
| 1361 | if ( nbrefs > MaxBackRefs ) {
|
|---|
| 1362 | error( RXERR_LIMIT );
|
|---|
| 1363 | return 0;
|
|---|
| 1364 | }
|
|---|
| 1365 | }
|
|---|
| 1366 | return setupState( BackRefBit | bref );
|
|---|
| 1367 | }
|
|---|
| 1368 | #endif
|
|---|
| 1369 |
|
|---|
| 1370 | /*
|
|---|
| 1371 | The two following functions add a transition between all pairs of
|
|---|
| 1372 | states (i, j) where i is found in from, and j is found in to.
|
|---|
| 1373 |
|
|---|
| 1374 | Cat-transitions are distinguished from plus-transitions for
|
|---|
| 1375 | capturing.
|
|---|
| 1376 | */
|
|---|
| 1377 |
|
|---|
| 1378 | void QRegExpEngine::addCatTransitions( const QMemArray<int>& from,
|
|---|
| 1379 | const QMemArray<int>& to )
|
|---|
| 1380 | {
|
|---|
| 1381 | for ( int i = 0; i < (int) from.size(); i++ ) {
|
|---|
| 1382 | State *st = s[from[i]];
|
|---|
| 1383 | mergeInto( &st->outs, to );
|
|---|
| 1384 | }
|
|---|
| 1385 | }
|
|---|
| 1386 |
|
|---|
| 1387 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1388 | void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from,
|
|---|
| 1389 | const QMemArray<int>& to, int atom )
|
|---|
| 1390 | {
|
|---|
| 1391 | for ( int i = 0; i < (int) from.size(); i++ ) {
|
|---|
| 1392 | State *st = s[from[i]];
|
|---|
| 1393 | QMemArray<int> oldOuts = st->outs.copy();
|
|---|
| 1394 | mergeInto( &st->outs, to );
|
|---|
| 1395 | if ( f[atom].capture >= 0 ) {
|
|---|
| 1396 | if ( st->reenter == 0 )
|
|---|
| 1397 | st->reenter = new QMap<int, int>;
|
|---|
| 1398 | for ( int j = 0; j < (int) to.size(); j++ ) {
|
|---|
| 1399 | if ( !st->reenter->contains(to[j]) &&
|
|---|
| 1400 | oldOuts.bsearch(to[j]) < 0 )
|
|---|
| 1401 | st->reenter->insert( to[j], atom );
|
|---|
| 1402 | }
|
|---|
| 1403 | }
|
|---|
| 1404 | }
|
|---|
| 1405 | }
|
|---|
| 1406 | #endif
|
|---|
| 1407 |
|
|---|
| 1408 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 1409 | /*
|
|---|
| 1410 | Returns an anchor that means a OR b.
|
|---|
| 1411 | */
|
|---|
| 1412 | int QRegExpEngine::anchorAlternation( int a, int b )
|
|---|
| 1413 | {
|
|---|
| 1414 | if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
|
|---|
| 1415 | return a & b;
|
|---|
| 1416 |
|
|---|
| 1417 | int n = aa.size();
|
|---|
| 1418 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1419 | if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
|
|---|
| 1420 | return Anchor_Alternation | ( n - 1 );
|
|---|
| 1421 | #endif
|
|---|
| 1422 |
|
|---|
| 1423 | aa.resize( n + 1 );
|
|---|
| 1424 | aa[n].a = a;
|
|---|
| 1425 | aa[n].b = b;
|
|---|
| 1426 | return Anchor_Alternation | n;
|
|---|
| 1427 | }
|
|---|
| 1428 |
|
|---|
| 1429 | /*
|
|---|
| 1430 | Returns an anchor that means a AND b.
|
|---|
| 1431 | */
|
|---|
| 1432 | int QRegExpEngine::anchorConcatenation( int a, int b )
|
|---|
| 1433 | {
|
|---|
| 1434 | if ( ((a | b) & Anchor_Alternation) == 0 )
|
|---|
| 1435 | return a | b;
|
|---|
| 1436 | if ( (b & Anchor_Alternation) != 0 )
|
|---|
| 1437 | qSwap( a, b );
|
|---|
| 1438 |
|
|---|
| 1439 | int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
|
|---|
| 1440 | int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
|
|---|
| 1441 | return anchorAlternation( aprime, bprime );
|
|---|
| 1442 | }
|
|---|
| 1443 | #endif
|
|---|
| 1444 |
|
|---|
| 1445 | /*
|
|---|
| 1446 | Adds anchor a on a transition caracterised by its from state and
|
|---|
| 1447 | its to state.
|
|---|
| 1448 | */
|
|---|
| 1449 | void QRegExpEngine::addAnchors( int from, int to, int a )
|
|---|
| 1450 | {
|
|---|
| 1451 | State *st = s[from];
|
|---|
| 1452 | if ( st->anchors == 0 )
|
|---|
| 1453 | st->anchors = new QMap<int, int>;
|
|---|
| 1454 | if ( st->anchors->contains(to) )
|
|---|
| 1455 | a = anchorAlternation( (*st->anchors)[to], a );
|
|---|
| 1456 | st->anchors->insert( to, a );
|
|---|
| 1457 | }
|
|---|
| 1458 |
|
|---|
| 1459 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1460 | /*
|
|---|
| 1461 | This function chooses between the good-string and the bad-character
|
|---|
| 1462 | heuristics. It computes two scores and chooses the heuristic with
|
|---|
| 1463 | the highest score.
|
|---|
| 1464 |
|
|---|
| 1465 | Here are some common-sense constraints on the scores that should be
|
|---|
| 1466 | respected if the formulas are ever modified: (1) If goodStr is
|
|---|
| 1467 | empty, the good-string heuristic scores 0. (2) If the regular
|
|---|
| 1468 | expression is trivial, the good-string heuristic should be used.
|
|---|
| 1469 | (3) If the search is case insensitive, the good-string heuristic
|
|---|
| 1470 | should be used, unless it scores 0. (Case insensitivity turns all
|
|---|
| 1471 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
|
|---|
| 1472 | big, the good-string heuristic should score less.
|
|---|
| 1473 | */
|
|---|
| 1474 | void QRegExpEngine::heuristicallyChooseHeuristic()
|
|---|
| 1475 | {
|
|---|
| 1476 | if ( minl == 0 ) {
|
|---|
| 1477 | useGoodStringHeuristic = FALSE;
|
|---|
| 1478 | } else if ( trivial ) {
|
|---|
| 1479 | useGoodStringHeuristic = TRUE;
|
|---|
| 1480 | } else {
|
|---|
| 1481 | /*
|
|---|
| 1482 | Magic formula: The good string has to constitute a good
|
|---|
| 1483 | proportion of the minimum-length string, and appear at a
|
|---|
| 1484 | more-or-less known index.
|
|---|
| 1485 | */
|
|---|
| 1486 | int goodStringScore = ( 64 * goodStr.length() / minl ) -
|
|---|
| 1487 | ( goodLateStart - goodEarlyStart );
|
|---|
| 1488 | /*
|
|---|
| 1489 | Less magic formula: We pick some characters at random, and
|
|---|
| 1490 | check whether they are good or bad.
|
|---|
| 1491 | */
|
|---|
| 1492 | int badCharScore = 0;
|
|---|
| 1493 | int step = QMAX( 1, NumBadChars / 32 );
|
|---|
| 1494 | for ( int i = 1; i < NumBadChars; i += step ) {
|
|---|
| 1495 | if ( occ1[i] == NoOccurrence )
|
|---|
| 1496 | badCharScore += minl;
|
|---|
| 1497 | else
|
|---|
| 1498 | badCharScore += occ1[i];
|
|---|
| 1499 | }
|
|---|
| 1500 | badCharScore /= minl;
|
|---|
| 1501 | useGoodStringHeuristic = ( goodStringScore > badCharScore );
|
|---|
| 1502 | }
|
|---|
| 1503 | }
|
|---|
| 1504 | #endif
|
|---|
| 1505 |
|
|---|
| 1506 | #if defined(QT_DEBUG)
|
|---|
| 1507 | void QRegExpEngine::dump() const
|
|---|
| 1508 | {
|
|---|
| 1509 | int i, j;
|
|---|
| 1510 | qDebug( "Case %ssensitive engine", cs ? "" : "in" );
|
|---|
| 1511 | qDebug( " States" );
|
|---|
| 1512 | for ( i = 0; i < ns; i++ ) {
|
|---|
| 1513 | qDebug( " %d%s", i,
|
|---|
| 1514 | i == InitialState ? " (initial)" :
|
|---|
| 1515 | i == FinalState ? " (final)" : "" );
|
|---|
| 1516 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1517 | qDebug( " in atom %d", s[i]->atom );
|
|---|
| 1518 | #endif
|
|---|
| 1519 | int m = s[i]->match;
|
|---|
| 1520 | if ( (m & CharClassBit) != 0 ) {
|
|---|
| 1521 | qDebug( " match character class %d", m ^ CharClassBit );
|
|---|
| 1522 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 1523 | cl[m ^ CharClassBit]->dump();
|
|---|
| 1524 | #else
|
|---|
| 1525 | qDebug( " negative character class" );
|
|---|
| 1526 | #endif
|
|---|
| 1527 | } else if ( (m & BackRefBit) != 0 ) {
|
|---|
| 1528 | qDebug( " match back-reference %d", m ^ BackRefBit );
|
|---|
| 1529 | } else if ( m >= 0x20 && m <= 0x7e ) {
|
|---|
| 1530 | qDebug( " match 0x%.4x (%c)", m, m );
|
|---|
| 1531 | } else {
|
|---|
| 1532 | qDebug( " match 0x%.4x", m );
|
|---|
| 1533 | }
|
|---|
| 1534 | for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
|
|---|
| 1535 | int next = s[i]->outs[j];
|
|---|
| 1536 | qDebug( " -> %d", next );
|
|---|
| 1537 | if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
|
|---|
| 1538 | qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
|
|---|
| 1539 | if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
|
|---|
| 1540 | qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
|
|---|
| 1541 | }
|
|---|
| 1542 | }
|
|---|
| 1543 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1544 | if ( nf > 0 ) {
|
|---|
| 1545 | qDebug( " Atom Parent Capture" );
|
|---|
| 1546 | for ( i = 0; i < nf; i++ )
|
|---|
| 1547 | qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
|
|---|
| 1548 | }
|
|---|
| 1549 | #endif
|
|---|
| 1550 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 1551 | for ( i = 0; i < (int) aa.size(); i++ )
|
|---|
| 1552 | qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
|
|---|
| 1553 | aa[i].b );
|
|---|
| 1554 | #endif
|
|---|
| 1555 | }
|
|---|
| 1556 | #endif
|
|---|
| 1557 |
|
|---|
| 1558 | void QRegExpEngine::setup( bool caseSensitive )
|
|---|
| 1559 | {
|
|---|
| 1560 | s.setAutoDelete( TRUE );
|
|---|
| 1561 | s.resize( 32 );
|
|---|
| 1562 | ns = 0;
|
|---|
| 1563 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1564 | f.resize( 32 );
|
|---|
| 1565 | nf = 0;
|
|---|
| 1566 | cf = -1;
|
|---|
| 1567 | #endif
|
|---|
| 1568 | officialncap = 0;
|
|---|
| 1569 | ncap = 0;
|
|---|
| 1570 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 1571 | cl.setAutoDelete( TRUE );
|
|---|
| 1572 | #endif
|
|---|
| 1573 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 1574 | ahead.setAutoDelete( TRUE );
|
|---|
| 1575 | #endif
|
|---|
| 1576 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1577 | caretAnchored = TRUE;
|
|---|
| 1578 | trivial = TRUE;
|
|---|
| 1579 | #endif
|
|---|
| 1580 | valid = FALSE;
|
|---|
| 1581 | cs = caseSensitive;
|
|---|
| 1582 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1583 | nbrefs = 0;
|
|---|
| 1584 | #endif
|
|---|
| 1585 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1586 | useGoodStringHeuristic = TRUE;
|
|---|
| 1587 | minl = 0;
|
|---|
| 1588 | occ1.fill( 0, NumBadChars );
|
|---|
| 1589 | #endif
|
|---|
| 1590 | }
|
|---|
| 1591 |
|
|---|
| 1592 | int QRegExpEngine::setupState( int match )
|
|---|
| 1593 | {
|
|---|
| 1594 | if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
|
|---|
| 1595 | s.resize( (ns + 1) << 1 );
|
|---|
| 1596 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1597 | s.insert( ns, new State(cf, match) );
|
|---|
| 1598 | #else
|
|---|
| 1599 | s.insert( ns, new State(match) );
|
|---|
| 1600 | #endif
|
|---|
| 1601 | return ns++;
|
|---|
| 1602 | }
|
|---|
| 1603 |
|
|---|
| 1604 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1605 | /*
|
|---|
| 1606 | Functions startAtom() and finishAtom() should be called to delimit
|
|---|
| 1607 | atoms. When a state is created, it is assigned to the current atom.
|
|---|
| 1608 | The information is later used for capturing.
|
|---|
| 1609 | */
|
|---|
| 1610 | int QRegExpEngine::startAtom( bool capture )
|
|---|
| 1611 | {
|
|---|
| 1612 | if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
|
|---|
| 1613 | f.resize( (nf + 1) << 1 );
|
|---|
| 1614 | f[nf].parent = cf;
|
|---|
| 1615 | cf = nf++;
|
|---|
| 1616 | f[cf].capture = capture ? ncap++ : -1;
|
|---|
| 1617 | return cf;
|
|---|
| 1618 | }
|
|---|
| 1619 | #endif
|
|---|
| 1620 |
|
|---|
| 1621 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 1622 | /*
|
|---|
| 1623 | Creates a lookahead anchor.
|
|---|
| 1624 | */
|
|---|
| 1625 | int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
|
|---|
| 1626 | {
|
|---|
| 1627 | int n = ahead.size();
|
|---|
| 1628 | if ( n == MaxLookaheads ) {
|
|---|
| 1629 | error( RXERR_LIMIT );
|
|---|
| 1630 | return 0;
|
|---|
| 1631 | }
|
|---|
| 1632 | ahead.resize( n + 1 );
|
|---|
| 1633 | ahead.insert( n, new Lookahead(eng, negative) );
|
|---|
| 1634 | return Anchor_FirstLookahead << n;
|
|---|
| 1635 | }
|
|---|
| 1636 | #endif
|
|---|
| 1637 |
|
|---|
| 1638 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1639 | /*
|
|---|
| 1640 | We want the longest leftmost captures.
|
|---|
| 1641 | */
|
|---|
| 1642 | bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
|
|---|
| 1643 | const int *begin2, const int *end2 )
|
|---|
| 1644 | {
|
|---|
| 1645 | for ( int i = 0; i < ncap; i++ ) {
|
|---|
| 1646 | int delta = begin2[i] - begin1[i]; // it has to start early...
|
|---|
| 1647 | if ( delta == 0 )
|
|---|
| 1648 | delta = end1[i] - end2[i]; // ...and end late (like a party)
|
|---|
| 1649 |
|
|---|
| 1650 | if ( delta != 0 )
|
|---|
| 1651 | return delta > 0;
|
|---|
| 1652 | }
|
|---|
| 1653 | return FALSE;
|
|---|
| 1654 | }
|
|---|
| 1655 | #endif
|
|---|
| 1656 |
|
|---|
| 1657 | /*
|
|---|
| 1658 | Returns TRUE if anchor a matches at position mmPos + i in the input
|
|---|
| 1659 | string, otherwise FALSE.
|
|---|
| 1660 | */
|
|---|
| 1661 | bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
|
|---|
| 1662 | {
|
|---|
| 1663 | int j;
|
|---|
| 1664 |
|
|---|
| 1665 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 1666 | if ( (a & Anchor_Alternation) != 0 ) {
|
|---|
| 1667 | return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
|
|---|
| 1668 | testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
|
|---|
| 1669 | }
|
|---|
| 1670 | #endif
|
|---|
| 1671 |
|
|---|
| 1672 | if ( (a & Anchor_Caret) != 0 ) {
|
|---|
| 1673 | if ( mmPos + i != mmCaretPos )
|
|---|
| 1674 | return FALSE;
|
|---|
| 1675 | }
|
|---|
| 1676 | if ( (a & Anchor_Dollar) != 0 ) {
|
|---|
| 1677 | if ( mmPos + i != mmLen )
|
|---|
| 1678 | return FALSE;
|
|---|
| 1679 | }
|
|---|
| 1680 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 1681 | if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
|
|---|
| 1682 | bool before = FALSE;
|
|---|
| 1683 | bool after = FALSE;
|
|---|
| 1684 | if ( mmPos + i != 0 )
|
|---|
| 1685 | before = isWord( mmIn[mmPos + i - 1] );
|
|---|
| 1686 | if ( mmPos + i != mmLen )
|
|---|
| 1687 | after = isWord( mmIn[mmPos + i] );
|
|---|
| 1688 | if ( (a & Anchor_Word) != 0 && (before == after) )
|
|---|
| 1689 | return FALSE;
|
|---|
| 1690 | if ( (a & Anchor_NonWord) != 0 && (before != after) )
|
|---|
| 1691 | return FALSE;
|
|---|
| 1692 | }
|
|---|
| 1693 | #endif
|
|---|
| 1694 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 1695 | if ( (a & Anchor_LookaheadMask) != 0 ) {
|
|---|
| 1696 | QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
|
|---|
| 1697 | mmLen - mmPos - i );
|
|---|
| 1698 | for ( j = 0; j < (int) ahead.size(); j++ ) {
|
|---|
| 1699 | if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
|
|---|
| 1700 | QMemArray<int> captured;
|
|---|
| 1701 | ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
|
|---|
| 1702 | mmCaretPos - mmPos - i, captured );
|
|---|
| 1703 | if ( (captured[0] == 0) == ahead[j]->neg )
|
|---|
| 1704 | return FALSE;
|
|---|
| 1705 | }
|
|---|
| 1706 | }
|
|---|
| 1707 | }
|
|---|
| 1708 | #endif
|
|---|
| 1709 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1710 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1711 | for ( j = 0; j < nbrefs; j++ ) {
|
|---|
| 1712 | if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
|
|---|
| 1713 | if ( capBegin[j] != EmptyCapture )
|
|---|
| 1714 | return FALSE;
|
|---|
| 1715 | }
|
|---|
| 1716 | }
|
|---|
| 1717 | #endif
|
|---|
| 1718 | #endif
|
|---|
| 1719 | return TRUE;
|
|---|
| 1720 | }
|
|---|
| 1721 |
|
|---|
| 1722 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 1723 | /*
|
|---|
| 1724 | The three following functions are what Jeffrey Friedl would call
|
|---|
| 1725 | transmissions (or bump-alongs). Using one or the other should make
|
|---|
| 1726 | no difference except in performance.
|
|---|
| 1727 | */
|
|---|
| 1728 |
|
|---|
| 1729 | bool QRegExpEngine::goodStringMatch()
|
|---|
| 1730 | {
|
|---|
| 1731 | int k = mmPos + goodEarlyStart;
|
|---|
| 1732 | while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
|
|---|
| 1733 | int from = k - goodLateStart;
|
|---|
| 1734 | int to = k - goodEarlyStart;
|
|---|
| 1735 | if ( from > mmPos )
|
|---|
| 1736 | mmPos = from;
|
|---|
| 1737 |
|
|---|
| 1738 | while ( mmPos <= to ) {
|
|---|
| 1739 | if ( matchHere() )
|
|---|
| 1740 | return TRUE;
|
|---|
| 1741 | mmPos++;
|
|---|
| 1742 | }
|
|---|
| 1743 | k++;
|
|---|
| 1744 | }
|
|---|
| 1745 | return FALSE;
|
|---|
| 1746 | }
|
|---|
| 1747 |
|
|---|
| 1748 | bool QRegExpEngine::badCharMatch()
|
|---|
| 1749 | {
|
|---|
| 1750 | int slideHead = 0;
|
|---|
| 1751 | int slideNext = 0;
|
|---|
| 1752 | int i;
|
|---|
| 1753 | int lastPos = mmLen - minl;
|
|---|
| 1754 | memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
|
|---|
| 1755 |
|
|---|
| 1756 | /*
|
|---|
| 1757 | Set up the slide table, used for the bad-character heuristic,
|
|---|
| 1758 | using the table of first occurrence of each character.
|
|---|
| 1759 | */
|
|---|
| 1760 | for ( i = 0; i < minl; i++ ) {
|
|---|
| 1761 | int sk = occ1[BadChar(mmIn[mmPos + i])];
|
|---|
| 1762 | if ( sk == NoOccurrence )
|
|---|
| 1763 | sk = i + 1;
|
|---|
| 1764 | if ( sk > 0 ) {
|
|---|
| 1765 | int k = i + 1 - sk;
|
|---|
| 1766 | if ( k < 0 ) {
|
|---|
| 1767 | sk = i + 1;
|
|---|
| 1768 | k = 0;
|
|---|
| 1769 | }
|
|---|
| 1770 | if ( sk > mmSlideTab[k] )
|
|---|
| 1771 | mmSlideTab[k] = sk;
|
|---|
| 1772 | }
|
|---|
| 1773 | }
|
|---|
| 1774 |
|
|---|
| 1775 | if ( mmPos > lastPos )
|
|---|
| 1776 | return FALSE;
|
|---|
| 1777 |
|
|---|
| 1778 | for ( ;; ) {
|
|---|
| 1779 | if ( ++slideNext >= mmSlideTabSize )
|
|---|
| 1780 | slideNext = 0;
|
|---|
| 1781 | if ( mmSlideTab[slideHead] > 0 ) {
|
|---|
| 1782 | if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
|
|---|
| 1783 | mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
|
|---|
| 1784 | mmSlideTab[slideHead] = 0;
|
|---|
| 1785 | } else {
|
|---|
| 1786 | if ( matchHere() )
|
|---|
| 1787 | return TRUE;
|
|---|
| 1788 | }
|
|---|
| 1789 |
|
|---|
| 1790 | if ( mmPos == lastPos )
|
|---|
| 1791 | break;
|
|---|
| 1792 |
|
|---|
| 1793 | /*
|
|---|
| 1794 | Update the slide table. This code has much in common with
|
|---|
| 1795 | the initialization code.
|
|---|
| 1796 | */
|
|---|
| 1797 | int sk = occ1[BadChar(mmIn[mmPos + minl])];
|
|---|
| 1798 | if ( sk == NoOccurrence ) {
|
|---|
| 1799 | mmSlideTab[slideNext] = minl;
|
|---|
| 1800 | } else if ( sk > 0 ) {
|
|---|
| 1801 | int k = slideNext + minl - sk;
|
|---|
| 1802 | if ( k >= mmSlideTabSize )
|
|---|
| 1803 | k -= mmSlideTabSize;
|
|---|
| 1804 | if ( sk > mmSlideTab[k] )
|
|---|
| 1805 | mmSlideTab[k] = sk;
|
|---|
| 1806 | }
|
|---|
| 1807 | slideHead = slideNext;
|
|---|
| 1808 | mmPos++;
|
|---|
| 1809 | }
|
|---|
| 1810 | return FALSE;
|
|---|
| 1811 | }
|
|---|
| 1812 | #else
|
|---|
| 1813 | bool QRegExpEngine::bruteMatch()
|
|---|
| 1814 | {
|
|---|
| 1815 | while ( mmPos <= mmLen ) {
|
|---|
| 1816 | if ( matchHere() )
|
|---|
| 1817 | return TRUE;
|
|---|
| 1818 | mmPos++;
|
|---|
| 1819 | }
|
|---|
| 1820 | return FALSE;
|
|---|
| 1821 | }
|
|---|
| 1822 | #endif
|
|---|
| 1823 |
|
|---|
| 1824 | /*
|
|---|
| 1825 | Here's the core of the engine. It tries to do a match here and now.
|
|---|
| 1826 | */
|
|---|
| 1827 | bool QRegExpEngine::matchHere()
|
|---|
| 1828 | {
|
|---|
| 1829 | int ncur = 1, nnext = 0;
|
|---|
| 1830 | int i = 0, j, k, m;
|
|---|
| 1831 | bool stop = FALSE;
|
|---|
| 1832 |
|
|---|
| 1833 | mmMatchLen = -1;
|
|---|
| 1834 | mmOneTestMatchedLen = -1;
|
|---|
| 1835 | mmCurStack[0] = InitialState;
|
|---|
| 1836 |
|
|---|
| 1837 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1838 | if ( ncap > 0 ) {
|
|---|
| 1839 | for ( j = 0; j < ncap; j++ ) {
|
|---|
| 1840 | mmCurCapBegin[j] = EmptyCapture;
|
|---|
| 1841 | mmCurCapEnd[j] = EmptyCapture;
|
|---|
| 1842 | }
|
|---|
| 1843 | }
|
|---|
| 1844 | #endif
|
|---|
| 1845 |
|
|---|
| 1846 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1847 | int *zzZ = 0;
|
|---|
| 1848 |
|
|---|
| 1849 | while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
|
|---|
| 1850 | !stop )
|
|---|
| 1851 | #else
|
|---|
| 1852 | while ( ncur > 0 && i <= mmLen - mmPos && !stop )
|
|---|
| 1853 | #endif
|
|---|
| 1854 | {
|
|---|
| 1855 | int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
|
|---|
| 1856 | for ( j = 0; j < ncur; j++ ) {
|
|---|
| 1857 | int cur = mmCurStack[j];
|
|---|
| 1858 | State *scur = s[cur];
|
|---|
| 1859 | QMemArray<int>& outs = scur->outs;
|
|---|
| 1860 | for ( k = 0; k < (int) outs.size(); k++ ) {
|
|---|
| 1861 | int next = outs[k];
|
|---|
| 1862 | State *snext = s[next];
|
|---|
| 1863 | bool in = TRUE;
|
|---|
| 1864 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1865 | int needSomeSleep = 0;
|
|---|
| 1866 | #endif
|
|---|
| 1867 |
|
|---|
| 1868 | /*
|
|---|
| 1869 | First, check if the anchors are anchored properly.
|
|---|
| 1870 | */
|
|---|
| 1871 | if ( scur->anchors != 0 ) {
|
|---|
| 1872 | int a = at( *scur->anchors, next );
|
|---|
| 1873 | if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
|
|---|
| 1874 | in = FALSE;
|
|---|
| 1875 | }
|
|---|
| 1876 | /*
|
|---|
| 1877 | If indeed they are, check if the input character is
|
|---|
| 1878 | correct for this transition.
|
|---|
| 1879 | */
|
|---|
| 1880 | if ( in ) {
|
|---|
| 1881 | m = snext->match;
|
|---|
| 1882 | if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
|
|---|
| 1883 | if ( cs )
|
|---|
| 1884 | in = ( m == ch );
|
|---|
| 1885 | else
|
|---|
| 1886 | in = ( QChar(m).lower() == QChar(ch).lower() );
|
|---|
| 1887 | } else if ( next == FinalState ) {
|
|---|
| 1888 | mmMatchLen = i;
|
|---|
| 1889 | stop = mmMinimal;
|
|---|
| 1890 | in = TRUE;
|
|---|
| 1891 | } else if ( (m & CharClassBit) != 0 ) {
|
|---|
| 1892 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 1893 | const CharClass *cc = cl[m ^ CharClassBit];
|
|---|
| 1894 | if ( cs )
|
|---|
| 1895 | in = cc->in( ch );
|
|---|
| 1896 | else if ( cc->negative() )
|
|---|
| 1897 | in = cc->in( QChar(ch).lower() ) &&
|
|---|
| 1898 | cc->in( QChar(ch).upper() );
|
|---|
| 1899 | else
|
|---|
| 1900 | in = cc->in( QChar(ch).lower() ) ||
|
|---|
| 1901 | cc->in( QChar(ch).upper() );
|
|---|
| 1902 | #endif
|
|---|
| 1903 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 1904 | } else { /* ( (m & BackRefBit) != 0 ) */
|
|---|
| 1905 | int bref = m ^ BackRefBit;
|
|---|
| 1906 | int ell = j * ncap + ( bref - 1 );
|
|---|
| 1907 |
|
|---|
| 1908 | in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
|
|---|
| 1909 | if ( in ) {
|
|---|
| 1910 | if ( cs )
|
|---|
| 1911 | in = ( mmIn[mmPos + mmCurCapBegin[ell]]
|
|---|
| 1912 | == QChar(ch) );
|
|---|
| 1913 | else
|
|---|
| 1914 | in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
|
|---|
| 1915 | == QChar(ch).lower() );
|
|---|
| 1916 | }
|
|---|
| 1917 |
|
|---|
| 1918 | if ( in ) {
|
|---|
| 1919 | int delta;
|
|---|
| 1920 | if ( mmCurCapEnd[ell] == EmptyCapture )
|
|---|
| 1921 | delta = i - mmCurCapBegin[ell];
|
|---|
| 1922 | else
|
|---|
| 1923 | delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
|
|---|
| 1924 |
|
|---|
| 1925 | in = ( delta <= mmLen - (mmPos + i) );
|
|---|
| 1926 | if ( in && delta > 1 ) {
|
|---|
| 1927 | int n = 1;
|
|---|
| 1928 | if ( cs ) {
|
|---|
| 1929 | while ( n < delta ) {
|
|---|
| 1930 | if ( mmIn[mmPos +
|
|---|
| 1931 | mmCurCapBegin[ell] + n] !=
|
|---|
| 1932 | mmIn[mmPos + i + n] )
|
|---|
| 1933 | break;
|
|---|
| 1934 | n++;
|
|---|
| 1935 | }
|
|---|
| 1936 | } else {
|
|---|
| 1937 | while ( n < delta ) {
|
|---|
| 1938 | QChar a = mmIn[mmPos +
|
|---|
| 1939 | mmCurCapBegin[ell] + n];
|
|---|
| 1940 | QChar b = mmIn[mmPos + i + n];
|
|---|
| 1941 | if ( a.lower() != b.lower() )
|
|---|
| 1942 | break;
|
|---|
| 1943 | n++;
|
|---|
| 1944 | }
|
|---|
| 1945 | }
|
|---|
| 1946 | in = ( n == delta );
|
|---|
| 1947 | if ( in )
|
|---|
| 1948 | needSomeSleep = delta - 1;
|
|---|
| 1949 | }
|
|---|
| 1950 | }
|
|---|
| 1951 | #endif
|
|---|
| 1952 | }
|
|---|
| 1953 | }
|
|---|
| 1954 |
|
|---|
| 1955 | /*
|
|---|
| 1956 | We must now update our data structures.
|
|---|
| 1957 | */
|
|---|
| 1958 | if ( in ) {
|
|---|
| 1959 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1960 | int *capBegin, *capEnd;
|
|---|
| 1961 | #endif
|
|---|
| 1962 | /*
|
|---|
| 1963 | If the next state was not encountered yet, all
|
|---|
| 1964 | is fine.
|
|---|
| 1965 | */
|
|---|
| 1966 | if ( (m = mmInNextStack[next]) == -1 ) {
|
|---|
| 1967 | m = nnext++;
|
|---|
| 1968 | mmNextStack[m] = next;
|
|---|
| 1969 | mmInNextStack[next] = m;
|
|---|
| 1970 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1971 | capBegin = mmNextCapBegin + m * ncap;
|
|---|
| 1972 | capEnd = mmNextCapEnd + m * ncap;
|
|---|
| 1973 |
|
|---|
| 1974 | /*
|
|---|
| 1975 | Otherwise, we'll first maintain captures in
|
|---|
| 1976 | temporary arrays, and decide at the end whether
|
|---|
| 1977 | it's best to keep the previous capture zones or
|
|---|
| 1978 | the new ones.
|
|---|
| 1979 | */
|
|---|
| 1980 | } else {
|
|---|
| 1981 | capBegin = mmTempCapBegin;
|
|---|
| 1982 | capEnd = mmTempCapEnd;
|
|---|
| 1983 | #endif
|
|---|
| 1984 | }
|
|---|
| 1985 |
|
|---|
| 1986 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 1987 | /*
|
|---|
| 1988 | Updating the capture zones is much of a task.
|
|---|
| 1989 | */
|
|---|
| 1990 | if ( ncap > 0 ) {
|
|---|
| 1991 | memcpy( capBegin, mmCurCapBegin + j * ncap,
|
|---|
| 1992 | ncap * sizeof(int) );
|
|---|
| 1993 | memcpy( capEnd, mmCurCapEnd + j * ncap,
|
|---|
| 1994 | ncap * sizeof(int) );
|
|---|
| 1995 | int c = scur->atom, n = snext->atom;
|
|---|
| 1996 | int p = -1, q = -1;
|
|---|
| 1997 | int cap;
|
|---|
| 1998 |
|
|---|
| 1999 | /*
|
|---|
| 2000 | Lemma 1. For any x in the range [0..nf), we
|
|---|
| 2001 | have f[x].parent < x.
|
|---|
| 2002 |
|
|---|
| 2003 | Proof. By looking at startAtom(), it is
|
|---|
| 2004 | clear that cf < nf holds all the time, and
|
|---|
| 2005 | thus that f[nf].parent < nf.
|
|---|
| 2006 | */
|
|---|
| 2007 |
|
|---|
| 2008 | /*
|
|---|
| 2009 | If we are reentering an atom, we empty all
|
|---|
| 2010 | capture zones inside it.
|
|---|
| 2011 | */
|
|---|
| 2012 | if ( scur->reenter != 0 &&
|
|---|
| 2013 | (q = at(*scur->reenter, next)) != 0 ) {
|
|---|
| 2014 | QBitArray b;
|
|---|
| 2015 | b.fill( FALSE, nf );
|
|---|
| 2016 | b.setBit( q, TRUE );
|
|---|
| 2017 | for ( int ell = q + 1; ell < nf; ell++ ) {
|
|---|
| 2018 | if ( b.testBit(f[ell].parent) ) {
|
|---|
| 2019 | b.setBit( ell, TRUE );
|
|---|
| 2020 | cap = f[ell].capture;
|
|---|
| 2021 | if ( cap >= 0 ) {
|
|---|
| 2022 | capBegin[cap] = EmptyCapture;
|
|---|
| 2023 | capEnd[cap] = EmptyCapture;
|
|---|
| 2024 | }
|
|---|
| 2025 | }
|
|---|
| 2026 | }
|
|---|
| 2027 | p = f[q].parent;
|
|---|
| 2028 |
|
|---|
| 2029 | /*
|
|---|
| 2030 | Otherwise, close the capture zones we are
|
|---|
| 2031 | leaving. We are leaving f[c].capture,
|
|---|
| 2032 | f[f[c].parent].capture,
|
|---|
| 2033 | f[f[f[c].parent].parent].capture, ...,
|
|---|
| 2034 | until f[x].capture, with x such that
|
|---|
| 2035 | f[x].parent is the youngest common ancestor
|
|---|
| 2036 | for c and n.
|
|---|
| 2037 |
|
|---|
| 2038 | We go up along c's and n's ancestry until
|
|---|
| 2039 | we find x.
|
|---|
| 2040 | */
|
|---|
| 2041 | } else {
|
|---|
| 2042 | p = c;
|
|---|
| 2043 | q = n;
|
|---|
| 2044 | while ( p != q ) {
|
|---|
| 2045 | if ( p > q ) {
|
|---|
| 2046 | cap = f[p].capture;
|
|---|
| 2047 | if ( cap >= 0 ) {
|
|---|
| 2048 | if ( capBegin[cap] == i ) {
|
|---|
| 2049 | capBegin[cap] = EmptyCapture;
|
|---|
| 2050 | capEnd[cap] = EmptyCapture;
|
|---|
| 2051 | } else {
|
|---|
| 2052 | capEnd[cap] = i;
|
|---|
| 2053 | }
|
|---|
| 2054 | }
|
|---|
| 2055 | p = f[p].parent;
|
|---|
| 2056 | } else {
|
|---|
| 2057 | q = f[q].parent;
|
|---|
| 2058 | }
|
|---|
| 2059 | }
|
|---|
| 2060 | }
|
|---|
| 2061 |
|
|---|
| 2062 | /*
|
|---|
| 2063 | In any case, we now open the capture zones
|
|---|
| 2064 | we are entering. We work upwards from n
|
|---|
| 2065 | until we reach p (the parent of the atom we
|
|---|
| 2066 | reenter or the youngest common ancestor).
|
|---|
| 2067 | */
|
|---|
| 2068 | while ( n > p ) {
|
|---|
| 2069 | cap = f[n].capture;
|
|---|
| 2070 | if ( cap >= 0 ) {
|
|---|
| 2071 | capBegin[cap] = i;
|
|---|
| 2072 | capEnd[cap] = EmptyCapture;
|
|---|
| 2073 | }
|
|---|
| 2074 | n = f[n].parent;
|
|---|
| 2075 | }
|
|---|
| 2076 | /*
|
|---|
| 2077 | If the next state was already in
|
|---|
| 2078 | mmNextStack, we must choose carefully which
|
|---|
| 2079 | capture zones we want to keep.
|
|---|
| 2080 | */
|
|---|
| 2081 | if ( capBegin == mmTempCapBegin &&
|
|---|
| 2082 | isBetterCapture(capBegin, capEnd,
|
|---|
| 2083 | mmNextCapBegin + m * ncap,
|
|---|
| 2084 | mmNextCapEnd + m * ncap) ) {
|
|---|
| 2085 | memcpy( mmNextCapBegin + m * ncap, capBegin,
|
|---|
| 2086 | ncap * sizeof(int) );
|
|---|
| 2087 | memcpy( mmNextCapEnd + m * ncap, capEnd,
|
|---|
| 2088 | ncap * sizeof(int) );
|
|---|
| 2089 | }
|
|---|
| 2090 | }
|
|---|
| 2091 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2092 | /*
|
|---|
| 2093 | We are done with updating the capture zones.
|
|---|
| 2094 | It's now time to put the next state to sleep,
|
|---|
| 2095 | if it needs to, and to remove it from
|
|---|
| 2096 | mmNextStack.
|
|---|
| 2097 | */
|
|---|
| 2098 | if ( needSomeSleep > 0 ) {
|
|---|
| 2099 | zzZ = new int[1 + 2 * ncap];
|
|---|
| 2100 | zzZ[0] = next;
|
|---|
| 2101 | if ( ncap > 0 ) {
|
|---|
| 2102 | memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
|
|---|
| 2103 | memcpy( zzZ + 1 + ncap, capEnd,
|
|---|
| 2104 | ncap * sizeof(int) );
|
|---|
| 2105 | }
|
|---|
| 2106 | mmInNextStack[mmNextStack[--nnext]] = -1;
|
|---|
| 2107 | mmSleeping.insert( i + needSomeSleep, zzZ );
|
|---|
| 2108 | }
|
|---|
| 2109 | #endif
|
|---|
| 2110 | #endif
|
|---|
| 2111 | }
|
|---|
| 2112 | }
|
|---|
| 2113 | }
|
|---|
| 2114 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 2115 | /*
|
|---|
| 2116 | If we reached the final state, hurray! Copy the captured
|
|---|
| 2117 | zone.
|
|---|
| 2118 | */
|
|---|
| 2119 | if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
|
|---|
| 2120 | memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
|
|---|
| 2121 | memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
|
|---|
| 2122 | }
|
|---|
| 2123 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2124 | /*
|
|---|
| 2125 | It's time to wake up the sleepers.
|
|---|
| 2126 | */
|
|---|
| 2127 | if ( !mmSleeping.isEmpty() ) {
|
|---|
| 2128 | while ( (zzZ = mmSleeping.take(i)) != 0 ) {
|
|---|
| 2129 | int next = zzZ[0];
|
|---|
| 2130 | int *capBegin = zzZ + 1;
|
|---|
| 2131 | int *capEnd = zzZ + 1 + ncap;
|
|---|
| 2132 | bool copyOver = TRUE;
|
|---|
| 2133 |
|
|---|
| 2134 | if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
|
|---|
| 2135 | m = nnext++;
|
|---|
| 2136 | mmNextStack[m] = next;
|
|---|
| 2137 | mmInNextStack[next] = m;
|
|---|
| 2138 | } else {
|
|---|
| 2139 | copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
|
|---|
| 2140 | mmNextCapEnd + m * ncap,
|
|---|
| 2141 | capBegin, capEnd );
|
|---|
| 2142 | }
|
|---|
| 2143 | if ( copyOver ) {
|
|---|
| 2144 | memcpy( mmNextCapBegin + m * ncap, capBegin,
|
|---|
| 2145 | ncap * sizeof(int) );
|
|---|
| 2146 | memcpy( mmNextCapEnd + m * ncap, capEnd,
|
|---|
| 2147 | ncap * sizeof(int) );
|
|---|
| 2148 | }
|
|---|
| 2149 | delete[] zzZ;
|
|---|
| 2150 | }
|
|---|
| 2151 | }
|
|---|
| 2152 | #endif
|
|---|
| 2153 | #endif
|
|---|
| 2154 | for ( j = 0; j < nnext; j++ )
|
|---|
| 2155 | mmInNextStack[mmNextStack[j]] = -1;
|
|---|
| 2156 |
|
|---|
| 2157 | // avoid needless iteration that confuses mmOneTestMatchedLen
|
|---|
| 2158 | if ( nnext == 1 && mmNextStack[0] == FinalState
|
|---|
| 2159 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2160 | && mmSleeping.isEmpty()
|
|---|
| 2161 | #endif
|
|---|
| 2162 | )
|
|---|
| 2163 | stop = TRUE;
|
|---|
| 2164 |
|
|---|
| 2165 | qSwap( mmCurStack, mmNextStack );
|
|---|
| 2166 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 2167 | qSwap( mmCurCapBegin, mmNextCapBegin );
|
|---|
| 2168 | qSwap( mmCurCapEnd, mmNextCapEnd );
|
|---|
| 2169 | #endif
|
|---|
| 2170 | ncur = nnext;
|
|---|
| 2171 | nnext = 0;
|
|---|
| 2172 | i++;
|
|---|
| 2173 | }
|
|---|
| 2174 |
|
|---|
| 2175 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2176 | /*
|
|---|
| 2177 | If minimal matching is enabled, we might have some sleepers
|
|---|
| 2178 | left.
|
|---|
| 2179 | */
|
|---|
| 2180 | while ( !mmSleeping.isEmpty() ) {
|
|---|
| 2181 | zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
|
|---|
| 2182 | delete[] zzZ;
|
|---|
| 2183 | }
|
|---|
| 2184 | #endif
|
|---|
| 2185 |
|
|---|
| 2186 | mmOneTestMatchedLen = i - 1;
|
|---|
| 2187 | return ( mmMatchLen >= 0 );
|
|---|
| 2188 | }
|
|---|
| 2189 |
|
|---|
| 2190 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2191 |
|
|---|
| 2192 | QRegExpEngine::CharClass::CharClass()
|
|---|
| 2193 | : c( 0 ), n( FALSE )
|
|---|
| 2194 | {
|
|---|
| 2195 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2196 | occ1.fill( NoOccurrence, NumBadChars );
|
|---|
| 2197 | #endif
|
|---|
| 2198 | }
|
|---|
| 2199 |
|
|---|
| 2200 | QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
|
|---|
| 2201 | const CharClass& cc )
|
|---|
| 2202 | {
|
|---|
| 2203 | c = cc.c;
|
|---|
| 2204 | r = cc.r.copy();
|
|---|
| 2205 | n = cc.n;
|
|---|
| 2206 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2207 | occ1 = cc.occ1;
|
|---|
| 2208 | #endif
|
|---|
| 2209 | return *this;
|
|---|
| 2210 | }
|
|---|
| 2211 |
|
|---|
| 2212 | void QRegExpEngine::CharClass::clear()
|
|---|
| 2213 | {
|
|---|
| 2214 | c = 0;
|
|---|
| 2215 | r.resize( 0 );
|
|---|
| 2216 | n = FALSE;
|
|---|
| 2217 | }
|
|---|
| 2218 |
|
|---|
| 2219 | void QRegExpEngine::CharClass::setNegative( bool negative )
|
|---|
| 2220 | {
|
|---|
| 2221 | n = negative;
|
|---|
| 2222 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2223 | occ1.fill( 0, NumBadChars );
|
|---|
| 2224 | #endif
|
|---|
| 2225 | }
|
|---|
| 2226 |
|
|---|
| 2227 | void QRegExpEngine::CharClass::addCategories( int cats )
|
|---|
| 2228 | {
|
|---|
| 2229 | c |= cats;
|
|---|
| 2230 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2231 | occ1.fill( 0, NumBadChars );
|
|---|
| 2232 | #endif
|
|---|
| 2233 | }
|
|---|
| 2234 |
|
|---|
| 2235 | void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
|
|---|
| 2236 | {
|
|---|
| 2237 | if ( from > to )
|
|---|
| 2238 | qSwap( from, to );
|
|---|
| 2239 | int m = r.size();
|
|---|
| 2240 | r.resize( m + 1 );
|
|---|
| 2241 | r[m].from = from;
|
|---|
| 2242 | r[m].to = to;
|
|---|
| 2243 |
|
|---|
| 2244 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2245 | int i;
|
|---|
| 2246 |
|
|---|
| 2247 | if ( to - from < NumBadChars ) {
|
|---|
| 2248 | occ1.detach();
|
|---|
| 2249 | if ( from % NumBadChars <= to % NumBadChars ) {
|
|---|
| 2250 | for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
|
|---|
| 2251 | occ1[i] = 0;
|
|---|
| 2252 | } else {
|
|---|
| 2253 | for ( i = 0; i <= to % NumBadChars; i++ )
|
|---|
| 2254 | occ1[i] = 0;
|
|---|
| 2255 | for ( i = from % NumBadChars; i < NumBadChars; i++ )
|
|---|
| 2256 | occ1[i] = 0;
|
|---|
| 2257 | }
|
|---|
| 2258 | } else {
|
|---|
| 2259 | occ1.fill( 0, NumBadChars );
|
|---|
| 2260 | }
|
|---|
| 2261 | #endif
|
|---|
| 2262 | }
|
|---|
| 2263 |
|
|---|
| 2264 | bool QRegExpEngine::CharClass::in( QChar ch ) const
|
|---|
| 2265 | {
|
|---|
| 2266 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2267 | if ( occ1[BadChar(ch)] == NoOccurrence )
|
|---|
| 2268 | return n;
|
|---|
| 2269 | #endif
|
|---|
| 2270 |
|
|---|
| 2271 | if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
|
|---|
| 2272 | return !n;
|
|---|
| 2273 | for ( int i = 0; i < (int) r.size(); i++ ) {
|
|---|
| 2274 | if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
|
|---|
| 2275 | return !n;
|
|---|
| 2276 | }
|
|---|
| 2277 | return n;
|
|---|
| 2278 | }
|
|---|
| 2279 |
|
|---|
#if defined(QT_DEBUG)
/*
  Debugging aid: prints the polarity of the class, its category mask
  (if non-zero) and each of its ranges through qDebug().
*/
void QRegExpEngine::CharClass::dump() const
{
    qDebug( " %stive character class", n ? "nega" : "posi" );
#ifndef QT_NO_REGEXP_CCLASS
    if ( c != 0 )
        qDebug( " categories 0x%.8x", c );
#endif
    int k = 0;
    while ( k < (int) r.size() ) {
        qDebug( " 0x%.4x through 0x%.4x", r[k].from, r[k].to );
        k++;
    }
}
#endif
|
|---|
| 2293 | #endif
|
|---|
| 2294 |
|
|---|
| 2295 | QRegExpEngine::Box::Box( QRegExpEngine *engine )
|
|---|
| 2296 | : eng( engine ), skipanchors( 0 )
|
|---|
| 2297 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2298 | , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
|
|---|
| 2299 | #endif
|
|---|
| 2300 | {
|
|---|
| 2301 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2302 | occ1.fill( NoOccurrence, NumBadChars );
|
|---|
| 2303 | #endif
|
|---|
| 2304 | minl = 0;
|
|---|
| 2305 | }
|
|---|
| 2306 |
|
|---|
| 2307 | QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
|
|---|
| 2308 | {
|
|---|
| 2309 | eng = b.eng;
|
|---|
| 2310 | ls = b.ls;
|
|---|
| 2311 | rs = b.rs;
|
|---|
| 2312 | lanchors = b.lanchors;
|
|---|
| 2313 | ranchors = b.ranchors;
|
|---|
| 2314 | skipanchors = b.skipanchors;
|
|---|
| 2315 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2316 | earlyStart = b.earlyStart;
|
|---|
| 2317 | lateStart = b.lateStart;
|
|---|
| 2318 | str = b.str;
|
|---|
| 2319 | leftStr = b.leftStr;
|
|---|
| 2320 | rightStr = b.rightStr;
|
|---|
| 2321 | maxl = b.maxl;
|
|---|
| 2322 | occ1 = b.occ1;
|
|---|
| 2323 | #endif
|
|---|
| 2324 | minl = b.minl;
|
|---|
| 2325 | return *this;
|
|---|
| 2326 | }
|
|---|
| 2327 |
|
|---|
| 2328 | void QRegExpEngine::Box::set( QChar ch )
|
|---|
| 2329 | {
|
|---|
| 2330 | ls.resize( 1 );
|
|---|
| 2331 | ls[0] = eng->createState( ch );
|
|---|
| 2332 | rs = ls;
|
|---|
| 2333 | rs.detach();
|
|---|
| 2334 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2335 | str = ch;
|
|---|
| 2336 | leftStr = ch;
|
|---|
| 2337 | rightStr = ch;
|
|---|
| 2338 | maxl = 1;
|
|---|
| 2339 | occ1.detach();
|
|---|
| 2340 | occ1[BadChar(ch)] = 0;
|
|---|
| 2341 | #endif
|
|---|
| 2342 | minl = 1;
|
|---|
| 2343 | }
|
|---|
| 2344 |
|
|---|
| 2345 | void QRegExpEngine::Box::set( const CharClass& cc )
|
|---|
| 2346 | {
|
|---|
| 2347 | ls.resize( 1 );
|
|---|
| 2348 | ls[0] = eng->createState( cc );
|
|---|
| 2349 | rs = ls;
|
|---|
| 2350 | rs.detach();
|
|---|
| 2351 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2352 | maxl = 1;
|
|---|
| 2353 | occ1 = cc.firstOccurrence();
|
|---|
| 2354 | #endif
|
|---|
| 2355 | minl = 1;
|
|---|
| 2356 | }
|
|---|
| 2357 |
|
|---|
| 2358 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2359 | void QRegExpEngine::Box::set( int bref )
|
|---|
| 2360 | {
|
|---|
| 2361 | ls.resize( 1 );
|
|---|
| 2362 | ls[0] = eng->createState( bref );
|
|---|
| 2363 | rs = ls;
|
|---|
| 2364 | rs.detach();
|
|---|
| 2365 | if ( bref >= 1 && bref <= MaxBackRefs )
|
|---|
| 2366 | skipanchors = Anchor_BackRef0Empty << bref;
|
|---|
| 2367 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2368 | maxl = InftyLen;
|
|---|
| 2369 | #endif
|
|---|
| 2370 | minl = 0;
|
|---|
| 2371 | }
|
|---|
| 2372 | #endif
|
|---|
| 2373 |
|
|---|
| 2374 | void QRegExpEngine::Box::cat( const Box& b )
|
|---|
| 2375 | {
|
|---|
| 2376 | eng->addCatTransitions( rs, b.ls );
|
|---|
| 2377 | addAnchorsToEngine( b );
|
|---|
| 2378 | if ( minl == 0 ) {
|
|---|
| 2379 | mergeInto( &lanchors, b.lanchors );
|
|---|
| 2380 | if ( skipanchors != 0 ) {
|
|---|
| 2381 | for ( int i = 0; i < (int) b.ls.size(); i++ ) {
|
|---|
| 2382 | int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
|
|---|
| 2383 | skipanchors );
|
|---|
| 2384 | lanchors.insert( b.ls[i], a );
|
|---|
| 2385 | }
|
|---|
| 2386 | }
|
|---|
| 2387 | mergeInto( &ls, b.ls );
|
|---|
| 2388 | }
|
|---|
| 2389 | if ( b.minl == 0 ) {
|
|---|
| 2390 | mergeInto( &ranchors, b.ranchors );
|
|---|
| 2391 | if ( b.skipanchors != 0 ) {
|
|---|
| 2392 | for ( int i = 0; i < (int) rs.size(); i++ ) {
|
|---|
| 2393 | int a = eng->anchorConcatenation( at(ranchors, rs[i]),
|
|---|
| 2394 | b.skipanchors );
|
|---|
| 2395 | ranchors.insert( rs[i], a );
|
|---|
| 2396 | }
|
|---|
| 2397 | }
|
|---|
| 2398 | mergeInto( &rs, b.rs );
|
|---|
| 2399 | } else {
|
|---|
| 2400 | ranchors = b.ranchors;
|
|---|
| 2401 | rs = b.rs;
|
|---|
| 2402 | }
|
|---|
| 2403 |
|
|---|
| 2404 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2405 | if ( maxl != InftyLen ) {
|
|---|
| 2406 | if ( rightStr.length() + b.leftStr.length() >
|
|---|
| 2407 | QMAX(str.length(), b.str.length()) ) {
|
|---|
| 2408 | earlyStart = minl - rightStr.length();
|
|---|
| 2409 | lateStart = maxl - rightStr.length();
|
|---|
| 2410 | str = rightStr + b.leftStr;
|
|---|
| 2411 | } else if ( b.str.length() > str.length() ) {
|
|---|
| 2412 | earlyStart = minl + b.earlyStart;
|
|---|
| 2413 | lateStart = maxl + b.lateStart;
|
|---|
| 2414 | str = b.str;
|
|---|
| 2415 | }
|
|---|
| 2416 | }
|
|---|
| 2417 |
|
|---|
| 2418 | if ( (int) leftStr.length() == maxl )
|
|---|
| 2419 | leftStr += b.leftStr;
|
|---|
| 2420 |
|
|---|
| 2421 | if ( (int) b.rightStr.length() == b.maxl ) {
|
|---|
| 2422 | rightStr += b.rightStr;
|
|---|
| 2423 | } else {
|
|---|
| 2424 | rightStr = b.rightStr;
|
|---|
| 2425 | }
|
|---|
| 2426 |
|
|---|
| 2427 | if ( maxl == InftyLen || b.maxl == InftyLen ) {
|
|---|
| 2428 | maxl = InftyLen;
|
|---|
| 2429 | } else {
|
|---|
| 2430 | maxl += b.maxl;
|
|---|
| 2431 | }
|
|---|
| 2432 |
|
|---|
| 2433 | occ1.detach();
|
|---|
| 2434 | for ( int i = 0; i < NumBadChars; i++ ) {
|
|---|
| 2435 | if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
|
|---|
| 2436 | occ1[i] = minl + b.occ1[i];
|
|---|
| 2437 | }
|
|---|
| 2438 | #endif
|
|---|
| 2439 |
|
|---|
| 2440 | minl += b.minl;
|
|---|
| 2441 | if ( minl == 0 )
|
|---|
| 2442 | skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
|
|---|
| 2443 | else
|
|---|
| 2444 | skipanchors = 0;
|
|---|
| 2445 | }
|
|---|
| 2446 |
|
|---|
| 2447 | void QRegExpEngine::Box::orx( const Box& b )
|
|---|
| 2448 | {
|
|---|
| 2449 | mergeInto( &ls, b.ls );
|
|---|
| 2450 | mergeInto( &lanchors, b.lanchors );
|
|---|
| 2451 | mergeInto( &rs, b.rs );
|
|---|
| 2452 | mergeInto( &ranchors, b.ranchors );
|
|---|
| 2453 |
|
|---|
| 2454 | if ( b.minl == 0 ) {
|
|---|
| 2455 | if ( minl == 0 )
|
|---|
| 2456 | skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
|
|---|
| 2457 | else
|
|---|
| 2458 | skipanchors = b.skipanchors;
|
|---|
| 2459 | }
|
|---|
| 2460 |
|
|---|
| 2461 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2462 | occ1.detach();
|
|---|
| 2463 | for ( int i = 0; i < NumBadChars; i++ ) {
|
|---|
| 2464 | if ( occ1[i] > b.occ1[i] )
|
|---|
| 2465 | occ1[i] = b.occ1[i];
|
|---|
| 2466 | }
|
|---|
| 2467 | earlyStart = 0;
|
|---|
| 2468 | lateStart = 0;
|
|---|
| 2469 | str = QString();
|
|---|
| 2470 | leftStr = QString();
|
|---|
| 2471 | rightStr = QString();
|
|---|
| 2472 | if ( b.maxl > maxl )
|
|---|
| 2473 | maxl = b.maxl;
|
|---|
| 2474 | #endif
|
|---|
| 2475 | if ( b.minl < minl )
|
|---|
| 2476 | minl = b.minl;
|
|---|
| 2477 | }
|
|---|
| 2478 |
|
|---|
| 2479 | void QRegExpEngine::Box::plus( int atom )
|
|---|
| 2480 | {
|
|---|
| 2481 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 2482 | eng->addPlusTransitions( rs, ls, atom );
|
|---|
| 2483 | #else
|
|---|
| 2484 | Q_UNUSED( atom );
|
|---|
| 2485 | eng->addCatTransitions( rs, ls );
|
|---|
| 2486 | #endif
|
|---|
| 2487 | addAnchorsToEngine( *this );
|
|---|
| 2488 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2489 | maxl = InftyLen;
|
|---|
| 2490 | #endif
|
|---|
| 2491 | }
|
|---|
| 2492 |
|
|---|
| 2493 | void QRegExpEngine::Box::opt()
|
|---|
| 2494 | {
|
|---|
| 2495 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2496 | earlyStart = 0;
|
|---|
| 2497 | lateStart = 0;
|
|---|
| 2498 | str = QString();
|
|---|
| 2499 | leftStr = QString();
|
|---|
| 2500 | rightStr = QString();
|
|---|
| 2501 | #endif
|
|---|
| 2502 | skipanchors = 0;
|
|---|
| 2503 | minl = 0;
|
|---|
| 2504 | }
|
|---|
| 2505 |
|
|---|
| 2506 | void QRegExpEngine::Box::catAnchor( int a )
|
|---|
| 2507 | {
|
|---|
| 2508 | if ( a != 0 ) {
|
|---|
| 2509 | for ( int i = 0; i < (int) rs.size(); i++ ) {
|
|---|
| 2510 | a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
|
|---|
| 2511 | ranchors.insert( rs[i], a );
|
|---|
| 2512 | }
|
|---|
| 2513 | if ( minl == 0 )
|
|---|
| 2514 | skipanchors = eng->anchorConcatenation( skipanchors, a );
|
|---|
| 2515 | }
|
|---|
| 2516 | }
|
|---|
| 2517 |
|
|---|
| 2518 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2519 | void QRegExpEngine::Box::setupHeuristics()
|
|---|
| 2520 | {
|
|---|
| 2521 | eng->goodEarlyStart = earlyStart;
|
|---|
| 2522 | eng->goodLateStart = lateStart;
|
|---|
| 2523 | eng->goodStr = eng->cs ? str : str.lower();
|
|---|
| 2524 |
|
|---|
| 2525 | eng->minl = minl;
|
|---|
| 2526 | if ( eng->cs ) {
|
|---|
| 2527 | /*
|
|---|
| 2528 | A regular expression such as 112|1 has occ1['2'] = 2 and minl =
|
|---|
| 2529 | 1 at this point. An entry of occ1 has to be at most minl or
|
|---|
| 2530 | infinity for the rest of the algorithm to go well.
|
|---|
| 2531 |
|
|---|
| 2532 | We waited until here before normalizing these cases (instead of
|
|---|
| 2533 | doing it in Box::orx()) because sometimes things improve by
|
|---|
| 2534 | themselves. Consider for example (112|1)34.
|
|---|
| 2535 | */
|
|---|
| 2536 | for ( int i = 0; i < NumBadChars; i++ ) {
|
|---|
| 2537 | if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
|
|---|
| 2538 | occ1[i] = minl;
|
|---|
| 2539 | }
|
|---|
| 2540 | eng->occ1 = occ1;
|
|---|
| 2541 | } else {
|
|---|
| 2542 | eng->occ1.fill( 0, NumBadChars );
|
|---|
| 2543 | }
|
|---|
| 2544 |
|
|---|
| 2545 | eng->heuristicallyChooseHeuristic();
|
|---|
| 2546 | }
|
|---|
| 2547 | #endif
|
|---|
| 2548 |
|
|---|
#if defined(QT_DEBUG)
/*
  Debugging aid: prints the minimum length, the left and right states
  (with their anchors, when non-zero) and the skip anchors of the box
  through qDebug().
*/
void QRegExpEngine::Box::dump() const
{
    qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );

    qDebug( " Left states:" );
    for ( int k = 0; k < (int) ls.size(); k++ ) {
        if ( at(lanchors, ls[k]) != 0 )
            qDebug( " %d [anchors 0x%.8x]", ls[k], lanchors[ls[k]] );
        else
            qDebug( " %d", ls[k] );
    }

    qDebug( " Right states:" );
    for ( int k = 0; k < (int) rs.size(); k++ ) {
        if ( at(ranchors, rs[k]) != 0 )
            qDebug( " %d [anchors 0x%.8x]", rs[k], ranchors[rs[k]] );
        else
            qDebug( " %d", rs[k] );
    }

    qDebug( " Skip anchors: 0x%.8x", skipanchors );
}
#endif
|
|---|
| 2571 |
|
|---|
| 2572 | void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
|
|---|
| 2573 | {
|
|---|
| 2574 | for ( int i = 0; i < (int) to.ls.size(); i++ ) {
|
|---|
| 2575 | for ( int j = 0; j < (int) rs.size(); j++ ) {
|
|---|
| 2576 | int a = eng->anchorConcatenation( at(ranchors, rs[j]),
|
|---|
| 2577 | at(to.lanchors, to.ls[i]) );
|
|---|
| 2578 | eng->addAnchors( rs[j], to.ls[i], a );
|
|---|
| 2579 | }
|
|---|
| 2580 | }
|
|---|
| 2581 | }
|
|---|
| 2582 |
|
|---|
| 2583 | int QRegExpEngine::getChar()
|
|---|
| 2584 | {
|
|---|
| 2585 | return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
|
|---|
| 2586 | }
|
|---|
| 2587 |
|
|---|
| 2588 | int QRegExpEngine::getEscape()
|
|---|
| 2589 | {
|
|---|
| 2590 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 2591 | const char tab[] = "afnrtv"; // no b, as \b means word boundary
|
|---|
| 2592 | const char backTab[] = "\a\f\n\r\t\v";
|
|---|
| 2593 | ushort low;
|
|---|
| 2594 | int i;
|
|---|
| 2595 | #endif
|
|---|
| 2596 | ushort val;
|
|---|
| 2597 | int prevCh = yyCh;
|
|---|
| 2598 |
|
|---|
| 2599 | if ( prevCh == EOS ) {
|
|---|
| 2600 | error( RXERR_END );
|
|---|
| 2601 | return Tok_Char | '\\';
|
|---|
| 2602 | }
|
|---|
| 2603 | yyCh = getChar();
|
|---|
| 2604 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 2605 | if ( (prevCh & ~0xff) == 0 ) {
|
|---|
| 2606 | const char *p = strchr( tab, prevCh );
|
|---|
| 2607 | if ( p != 0 )
|
|---|
| 2608 | return Tok_Char | backTab[p - tab];
|
|---|
| 2609 | }
|
|---|
| 2610 | #endif
|
|---|
| 2611 |
|
|---|
| 2612 | switch ( prevCh ) {
|
|---|
| 2613 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 2614 | case '0':
|
|---|
| 2615 | val = 0;
|
|---|
| 2616 | for ( i = 0; i < 3; i++ ) {
|
|---|
| 2617 | if ( yyCh >= '0' && yyCh <= '7' )
|
|---|
| 2618 | val = ( val << 3 ) | ( yyCh - '0' );
|
|---|
| 2619 | else
|
|---|
| 2620 | break;
|
|---|
| 2621 | yyCh = getChar();
|
|---|
| 2622 | }
|
|---|
| 2623 | if ( (val & ~0377) != 0 )
|
|---|
| 2624 | error( RXERR_OCTAL );
|
|---|
| 2625 | return Tok_Char | val;
|
|---|
| 2626 | #endif
|
|---|
| 2627 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 2628 | case 'B':
|
|---|
| 2629 | return Tok_NonWord;
|
|---|
| 2630 | #endif
|
|---|
| 2631 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2632 | case 'D':
|
|---|
| 2633 | // see QChar::isDigit()
|
|---|
| 2634 | yyCharClass->addCategories( 0x7fffffef );
|
|---|
| 2635 | return Tok_CharClass;
|
|---|
| 2636 | case 'S':
|
|---|
| 2637 | // see QChar::isSpace()
|
|---|
| 2638 | yyCharClass->addCategories( 0x7ffff87f );
|
|---|
| 2639 | yyCharClass->addRange( 0x0000, 0x0008 );
|
|---|
| 2640 | yyCharClass->addRange( 0x000e, 0x001f );
|
|---|
| 2641 | yyCharClass->addRange( 0x007f, 0x009f );
|
|---|
| 2642 | return Tok_CharClass;
|
|---|
| 2643 | case 'W':
|
|---|
| 2644 | // see QChar::isLetterOrNumber()
|
|---|
| 2645 | yyCharClass->addCategories( 0x7fe07f8f );
|
|---|
| 2646 | yyCharClass->addRange( 0x203f, 0x2040 );
|
|---|
| 2647 | yyCharClass->addSingleton( 0x2040 );
|
|---|
| 2648 | yyCharClass->addSingleton( 0x30fb );
|
|---|
| 2649 | yyCharClass->addRange( 0xfe33, 0xfe34 );
|
|---|
| 2650 | yyCharClass->addRange( 0xfe4d, 0xfe4f );
|
|---|
| 2651 | yyCharClass->addSingleton( 0xff3f );
|
|---|
| 2652 | yyCharClass->addSingleton( 0xff65 );
|
|---|
| 2653 | return Tok_CharClass;
|
|---|
| 2654 | #endif
|
|---|
| 2655 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 2656 | case 'b':
|
|---|
| 2657 | return Tok_Word;
|
|---|
| 2658 | #endif
|
|---|
| 2659 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2660 | case 'd':
|
|---|
| 2661 | // see QChar::isDigit()
|
|---|
| 2662 | yyCharClass->addCategories( 0x00000010 );
|
|---|
| 2663 | return Tok_CharClass;
|
|---|
| 2664 | case 's':
|
|---|
| 2665 | // see QChar::isSpace()
|
|---|
| 2666 | yyCharClass->addCategories( 0x00000380 );
|
|---|
| 2667 | yyCharClass->addRange( 0x0009, 0x000d );
|
|---|
| 2668 | return Tok_CharClass;
|
|---|
| 2669 | case 'w':
|
|---|
| 2670 | // see QChar::isLetterOrNumber()
|
|---|
| 2671 | yyCharClass->addCategories( 0x000f8070 );
|
|---|
| 2672 | yyCharClass->addSingleton( 0x005f ); // '_'
|
|---|
| 2673 | return Tok_CharClass;
|
|---|
| 2674 | #endif
|
|---|
| 2675 | #ifndef QT_NO_REGEXP_ESCAPE
|
|---|
| 2676 | case 'x':
|
|---|
| 2677 | val = 0;
|
|---|
| 2678 | for ( i = 0; i < 4; i++ ) {
|
|---|
| 2679 | low = QChar( yyCh ).lower();
|
|---|
| 2680 | if ( low >= '0' && low <= '9' )
|
|---|
| 2681 | val = ( val << 4 ) | ( low - '0' );
|
|---|
| 2682 | else if ( low >= 'a' && low <= 'f' )
|
|---|
| 2683 | val = ( val << 4 ) | ( low - 'a' + 10 );
|
|---|
| 2684 | else
|
|---|
| 2685 | break;
|
|---|
| 2686 | yyCh = getChar();
|
|---|
| 2687 | }
|
|---|
| 2688 | return Tok_Char | val;
|
|---|
| 2689 | #endif
|
|---|
| 2690 | default:
|
|---|
| 2691 | if ( prevCh >= '1' && prevCh <= '9' ) {
|
|---|
| 2692 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2693 | val = prevCh - '0';
|
|---|
| 2694 | while ( yyCh >= '0' && yyCh <= '9' ) {
|
|---|
| 2695 | val = ( val *= 10 ) | ( yyCh - '0' );
|
|---|
| 2696 | yyCh = getChar();
|
|---|
| 2697 | }
|
|---|
| 2698 | return Tok_BackRef | val;
|
|---|
| 2699 | #else
|
|---|
| 2700 | error( RXERR_DISABLED );
|
|---|
| 2701 | #endif
|
|---|
| 2702 | }
|
|---|
| 2703 | return Tok_Char | prevCh;
|
|---|
| 2704 | }
|
|---|
| 2705 | }
|
|---|
| 2706 |
|
|---|
| 2707 | #ifndef QT_NO_REGEXP_INTERVAL
|
|---|
| 2708 | int QRegExpEngine::getRep( int def )
|
|---|
| 2709 | {
|
|---|
| 2710 | if ( yyCh >= '0' && yyCh <= '9' ) {
|
|---|
| 2711 | int rep = 0;
|
|---|
| 2712 | do {
|
|---|
| 2713 | rep = 10 * rep + yyCh - '0';
|
|---|
| 2714 | if ( rep >= InftyRep ) {
|
|---|
| 2715 | error( RXERR_REPETITION );
|
|---|
| 2716 | rep = def;
|
|---|
| 2717 | }
|
|---|
| 2718 | yyCh = getChar();
|
|---|
| 2719 | } while ( yyCh >= '0' && yyCh <= '9' );
|
|---|
| 2720 | return rep;
|
|---|
| 2721 | } else {
|
|---|
| 2722 | return def;
|
|---|
| 2723 | }
|
|---|
| 2724 | }
|
|---|
| 2725 | #endif
|
|---|
| 2726 |
|
|---|
| 2727 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 2728 | void QRegExpEngine::skipChars( int n )
|
|---|
| 2729 | {
|
|---|
| 2730 | if ( n > 0 ) {
|
|---|
| 2731 | yyPos += n - 1;
|
|---|
| 2732 | yyCh = getChar();
|
|---|
| 2733 | }
|
|---|
| 2734 | }
|
|---|
| 2735 | #endif
|
|---|
| 2736 |
|
|---|
| 2737 | void QRegExpEngine::error( const char *msg )
|
|---|
| 2738 | {
|
|---|
| 2739 | if ( yyError.isEmpty() )
|
|---|
| 2740 | yyError = QString::fromLatin1( msg );
|
|---|
| 2741 | }
|
|---|
| 2742 |
|
|---|
| 2743 | void QRegExpEngine::startTokenizer( const QChar *rx, int len )
|
|---|
| 2744 | {
|
|---|
| 2745 | yyIn = rx;
|
|---|
| 2746 | yyPos0 = 0;
|
|---|
| 2747 | yyPos = 0;
|
|---|
| 2748 | yyLen = len;
|
|---|
| 2749 | yyCh = getChar();
|
|---|
| 2750 | yyCharClass = new CharClass;
|
|---|
| 2751 | yyMinRep = 0;
|
|---|
| 2752 | yyMaxRep = 0;
|
|---|
| 2753 | yyError = QString();
|
|---|
| 2754 | }
|
|---|
| 2755 |
|
|---|
| 2756 | int QRegExpEngine::getToken()
|
|---|
| 2757 | {
|
|---|
| 2758 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2759 | ushort pendingCh = 0;
|
|---|
| 2760 | bool charPending;
|
|---|
| 2761 | bool rangePending;
|
|---|
| 2762 | int tok;
|
|---|
| 2763 | #endif
|
|---|
| 2764 | int prevCh = yyCh;
|
|---|
| 2765 |
|
|---|
| 2766 | yyPos0 = yyPos - 1;
|
|---|
| 2767 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2768 | yyCharClass->clear();
|
|---|
| 2769 | #endif
|
|---|
| 2770 | yyMinRep = 0;
|
|---|
| 2771 | yyMaxRep = 0;
|
|---|
| 2772 | yyCh = getChar();
|
|---|
| 2773 |
|
|---|
| 2774 | switch ( prevCh ) {
|
|---|
| 2775 | case EOS:
|
|---|
| 2776 | yyPos0 = yyPos;
|
|---|
| 2777 | return Tok_Eos;
|
|---|
| 2778 | case '$':
|
|---|
| 2779 | return Tok_Dollar;
|
|---|
| 2780 | case '(':
|
|---|
| 2781 | if ( yyCh == '?' ) {
|
|---|
| 2782 | prevCh = getChar();
|
|---|
| 2783 | yyCh = getChar();
|
|---|
| 2784 | switch ( prevCh ) {
|
|---|
| 2785 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
|---|
| 2786 | case '!':
|
|---|
| 2787 | return Tok_NegLookahead;
|
|---|
| 2788 | case '=':
|
|---|
| 2789 | return Tok_PosLookahead;
|
|---|
| 2790 | #endif
|
|---|
| 2791 | case ':':
|
|---|
| 2792 | return Tok_MagicLeftParen;
|
|---|
| 2793 | default:
|
|---|
| 2794 | error( RXERR_LOOKAHEAD );
|
|---|
| 2795 | return Tok_MagicLeftParen;
|
|---|
| 2796 | }
|
|---|
| 2797 | } else {
|
|---|
| 2798 | return Tok_LeftParen;
|
|---|
| 2799 | }
|
|---|
| 2800 | case ')':
|
|---|
| 2801 | return Tok_RightParen;
|
|---|
| 2802 | case '*':
|
|---|
| 2803 | yyMinRep = 0;
|
|---|
| 2804 | yyMaxRep = InftyRep;
|
|---|
| 2805 | return Tok_Quantifier;
|
|---|
| 2806 | case '+':
|
|---|
| 2807 | yyMinRep = 1;
|
|---|
| 2808 | yyMaxRep = InftyRep;
|
|---|
| 2809 | return Tok_Quantifier;
|
|---|
| 2810 | case '.':
|
|---|
| 2811 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2812 | yyCharClass->setNegative( TRUE );
|
|---|
| 2813 | #endif
|
|---|
| 2814 | return Tok_CharClass;
|
|---|
| 2815 | case '?':
|
|---|
| 2816 | yyMinRep = 0;
|
|---|
| 2817 | yyMaxRep = 1;
|
|---|
| 2818 | return Tok_Quantifier;
|
|---|
| 2819 | case '[':
|
|---|
| 2820 | #ifndef QT_NO_REGEXP_CCLASS
|
|---|
| 2821 | if ( yyCh == '^' ) {
|
|---|
| 2822 | yyCharClass->setNegative( TRUE );
|
|---|
| 2823 | yyCh = getChar();
|
|---|
| 2824 | }
|
|---|
| 2825 | charPending = FALSE;
|
|---|
| 2826 | rangePending = FALSE;
|
|---|
| 2827 | do {
|
|---|
| 2828 | if ( yyCh == '-' && charPending && !rangePending ) {
|
|---|
| 2829 | rangePending = TRUE;
|
|---|
| 2830 | yyCh = getChar();
|
|---|
| 2831 | } else {
|
|---|
| 2832 | if ( charPending && !rangePending ) {
|
|---|
| 2833 | yyCharClass->addSingleton( pendingCh );
|
|---|
| 2834 | charPending = FALSE;
|
|---|
| 2835 | }
|
|---|
| 2836 | if ( yyCh == '\\' ) {
|
|---|
| 2837 | yyCh = getChar();
|
|---|
| 2838 | tok = getEscape();
|
|---|
| 2839 | if ( tok == Tok_Word )
|
|---|
| 2840 | tok = '\b';
|
|---|
| 2841 | } else {
|
|---|
| 2842 | tok = Tok_Char | yyCh;
|
|---|
| 2843 | yyCh = getChar();
|
|---|
| 2844 | }
|
|---|
| 2845 | if ( tok == Tok_CharClass ) {
|
|---|
| 2846 | if ( rangePending ) {
|
|---|
| 2847 | yyCharClass->addSingleton( '-' );
|
|---|
| 2848 | yyCharClass->addSingleton( pendingCh );
|
|---|
| 2849 | charPending = FALSE;
|
|---|
| 2850 | rangePending = FALSE;
|
|---|
| 2851 | }
|
|---|
| 2852 | } else if ( (tok & Tok_Char) != 0 ) {
|
|---|
| 2853 | if ( rangePending ) {
|
|---|
| 2854 | yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
|
|---|
| 2855 | charPending = FALSE;
|
|---|
| 2856 | rangePending = FALSE;
|
|---|
| 2857 | } else {
|
|---|
| 2858 | pendingCh = tok ^ Tok_Char;
|
|---|
| 2859 | charPending = TRUE;
|
|---|
| 2860 | }
|
|---|
| 2861 | } else {
|
|---|
| 2862 | error( RXERR_CHARCLASS );
|
|---|
| 2863 | }
|
|---|
| 2864 | }
|
|---|
| 2865 | } while ( yyCh != ']' && yyCh != EOS );
|
|---|
| 2866 | if ( rangePending )
|
|---|
| 2867 | yyCharClass->addSingleton( '-' );
|
|---|
| 2868 | if ( charPending )
|
|---|
| 2869 | yyCharClass->addSingleton( pendingCh );
|
|---|
| 2870 | if ( yyCh == EOS )
|
|---|
| 2871 | error( RXERR_END );
|
|---|
| 2872 | else
|
|---|
| 2873 | yyCh = getChar();
|
|---|
| 2874 | return Tok_CharClass;
|
|---|
| 2875 | #else
|
|---|
| 2876 | error( RXERR_END );
|
|---|
| 2877 | return Tok_Char | '[';
|
|---|
| 2878 | #endif
|
|---|
| 2879 | case '\\':
|
|---|
| 2880 | return getEscape();
|
|---|
| 2881 | case ']':
|
|---|
| 2882 | error( RXERR_LEFTDELIM );
|
|---|
| 2883 | return Tok_Char | ']';
|
|---|
| 2884 | case '^':
|
|---|
| 2885 | return Tok_Caret;
|
|---|
| 2886 | case '{':
|
|---|
| 2887 | #ifndef QT_NO_REGEXP_INTERVAL
|
|---|
| 2888 | yyMinRep = getRep( 0 );
|
|---|
| 2889 | yyMaxRep = yyMinRep;
|
|---|
| 2890 | if ( yyCh == ',' ) {
|
|---|
| 2891 | yyCh = getChar();
|
|---|
| 2892 | yyMaxRep = getRep( InftyRep );
|
|---|
| 2893 | }
|
|---|
| 2894 | if ( yyMaxRep < yyMinRep )
|
|---|
| 2895 | qSwap( yyMinRep, yyMaxRep );
|
|---|
| 2896 | if ( yyCh != '}' )
|
|---|
| 2897 | error( RXERR_REPETITION );
|
|---|
| 2898 | yyCh = getChar();
|
|---|
| 2899 | return Tok_Quantifier;
|
|---|
| 2900 | #else
|
|---|
| 2901 | error( RXERR_DISABLED );
|
|---|
| 2902 | return Tok_Char | '{';
|
|---|
| 2903 | #endif
|
|---|
| 2904 | case '|':
|
|---|
| 2905 | return Tok_Bar;
|
|---|
| 2906 | case '}':
|
|---|
| 2907 | error( RXERR_LEFTDELIM );
|
|---|
| 2908 | return Tok_Char | '}';
|
|---|
| 2909 | default:
|
|---|
| 2910 | return Tok_Char | prevCh;
|
|---|
| 2911 | }
|
|---|
| 2912 | }
|
|---|
| 2913 |
|
|---|
| 2914 | int QRegExpEngine::parse( const QChar *pattern, int len )
|
|---|
| 2915 | {
|
|---|
| 2916 | valid = TRUE;
|
|---|
| 2917 | startTokenizer( pattern, len );
|
|---|
| 2918 | yyTok = getToken();
|
|---|
| 2919 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 2920 | yyMayCapture = TRUE;
|
|---|
| 2921 | #else
|
|---|
| 2922 | yyMayCapture = FALSE;
|
|---|
| 2923 | #endif
|
|---|
| 2924 |
|
|---|
| 2925 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 2926 | int atom = startAtom( FALSE );
|
|---|
| 2927 | #endif
|
|---|
| 2928 | CharClass anything;
|
|---|
| 2929 | Box box( this ); // create InitialState
|
|---|
| 2930 | box.set( anything );
|
|---|
| 2931 | Box rightBox( this ); // create FinalState
|
|---|
| 2932 | rightBox.set( anything );
|
|---|
| 2933 |
|
|---|
| 2934 | Box middleBox( this );
|
|---|
| 2935 | parseExpression( &middleBox );
|
|---|
| 2936 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 2937 | finishAtom( atom );
|
|---|
| 2938 | #endif
|
|---|
| 2939 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2940 | middleBox.setupHeuristics();
|
|---|
| 2941 | #endif
|
|---|
| 2942 | box.cat( middleBox );
|
|---|
| 2943 | box.cat( rightBox );
|
|---|
| 2944 | delete yyCharClass;
|
|---|
| 2945 | yyCharClass = 0;
|
|---|
| 2946 |
|
|---|
| 2947 | officialncap = ncap;
|
|---|
| 2948 | #ifndef QT_NO_REGEXP_BACKREF
|
|---|
| 2949 | if ( nbrefs > ncap )
|
|---|
| 2950 | ncap = nbrefs;
|
|---|
| 2951 | #endif
|
|---|
| 2952 |
|
|---|
| 2953 | /*
|
|---|
| 2954 | We use one QMemArray<int> for all the big data used a lot in
|
|---|
| 2955 | matchHere() and friends.
|
|---|
| 2956 | */
|
|---|
| 2957 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2958 | mmSlideTabSize = QMAX( minl + 1, 16 );
|
|---|
| 2959 | #else
|
|---|
| 2960 | mmSlideTabSize = 0;
|
|---|
| 2961 | #endif
|
|---|
| 2962 | mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
|
|---|
| 2963 |
|
|---|
| 2964 | mmInNextStack = mmBigArray.data();
|
|---|
| 2965 | memset( mmInNextStack, -1, ns * sizeof(int) );
|
|---|
| 2966 | mmCurStack = mmInNextStack + ns;
|
|---|
| 2967 | mmNextStack = mmInNextStack + 2 * ns;
|
|---|
| 2968 |
|
|---|
| 2969 | mmCurCapBegin = mmInNextStack + 3 * ns;
|
|---|
| 2970 | mmNextCapBegin = mmCurCapBegin + ncap * ns;
|
|---|
| 2971 | mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
|
|---|
| 2972 | mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
|
|---|
| 2973 |
|
|---|
| 2974 | mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
|
|---|
| 2975 | mmTempCapEnd = mmTempCapBegin + ncap;
|
|---|
| 2976 | mmCapBegin = mmTempCapBegin + 2 * ncap;
|
|---|
| 2977 | mmCapEnd = mmTempCapBegin + 3 * ncap;
|
|---|
| 2978 |
|
|---|
| 2979 | mmSlideTab = mmTempCapBegin + 4 * ncap;
|
|---|
| 2980 |
|
|---|
| 2981 | if ( !yyError.isEmpty() )
|
|---|
| 2982 | return -1;
|
|---|
| 2983 |
|
|---|
| 2984 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 2985 | State *sinit = s[InitialState];
|
|---|
| 2986 | caretAnchored = ( sinit->anchors != 0 );
|
|---|
| 2987 | if ( caretAnchored ) {
|
|---|
| 2988 | QMap<int, int>& anchors = *sinit->anchors;
|
|---|
| 2989 | QMap<int, int>::ConstIterator a;
|
|---|
| 2990 | for ( a = anchors.begin(); a != anchors.end(); ++a ) {
|
|---|
| 2991 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
|---|
| 2992 | if ( (*a & Anchor_Alternation) != 0 )
|
|---|
| 2993 | break;
|
|---|
| 2994 | #endif
|
|---|
| 2995 | if ( (*a & Anchor_Caret) == 0 ) {
|
|---|
| 2996 | caretAnchored = FALSE;
|
|---|
| 2997 | break;
|
|---|
| 2998 | }
|
|---|
| 2999 | }
|
|---|
| 3000 | }
|
|---|
| 3001 | #endif
|
|---|
| 3002 | return yyPos0;
|
|---|
| 3003 | }
|
|---|
| 3004 |
|
|---|
/*
  Parses the atom that starts at the current token (yyTok) and stores
  the result in *box. On return, yyTok holds the token that follows
  the atom. Problems are reported through error().
*/
void QRegExpEngine::parseAtom( Box *box )
{
#ifndef QT_NO_REGEXP_LOOKAHEAD
    QRegExpEngine *eng = 0;
    bool neg;
    int len;
#endif

    if ( (yyTok & Tok_Char) != 0 ) {
        // literal character: XOR the Tok_Char flag away to get its code
        box->set( QChar(yyTok ^ Tok_Char) );
    } else {
#ifndef QT_NO_REGEXP_OPTIM
        // every token other than a plain character clears the trivial flag
        trivial = FALSE;
#endif
        switch ( yyTok ) {
        case Tok_Dollar:
            box->catAnchor( Anchor_Dollar );
            break;
        case Tok_Caret:
            box->catAnchor( Anchor_Caret );
            break;
#ifndef QT_NO_REGEXP_LOOKAHEAD
        case Tok_PosLookahead:
        case Tok_NegLookahead:
            neg = ( yyTok == Tok_NegLookahead );
            // the lookahead body is parsed by a separate engine created
            // with the same cs setting, starting at yyIn + yyPos - 1
            eng = new QRegExpEngine( cs );
            len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
            // a non-negative result is the number of characters to skip;
            // a negative one is reported as a lookahead error
            if ( len >= 0 )
                skipChars( len );
            else
                error( RXERR_LOOKAHEAD );
            // eng is handed to addLookahead() whether or not parsing
            // succeeded; the value it returns is added as an anchor
            box->catAnchor( addLookahead(eng, neg) );
            // the lookahead must be followed by Tok_RightParen
            yyTok = getToken();
            if ( yyTok != Tok_RightParen )
                error( RXERR_LOOKAHEAD );
            break;
#endif
#ifndef QT_NO_REGEXP_ESCAPE
        case Tok_Word:
            box->catAnchor( Anchor_Word );
            break;
        case Tok_NonWord:
            box->catAnchor( Anchor_NonWord );
            break;
#endif
        case Tok_LeftParen:
        case Tok_MagicLeftParen:
            // parenthesized sub-expression: parse it into the same box
            // and require Tok_RightParen afterwards
            yyTok = getToken();
            parseExpression( box );
            if ( yyTok != Tok_RightParen )
                error( RXERR_END );
            break;
        case Tok_CharClass:
            box->set( *yyCharClass );
            break;
        case Tok_Quantifier:
            // a quantifier where an atom is expected
            error( RXERR_REPETITION );
            break;
        default:
#ifndef QT_NO_REGEXP_BACKREF
            // token with the Tok_BackRef flag: XOR the flag away and
            // pass the remaining value to Box::set()
            if ( (yyTok & Tok_BackRef) != 0 )
                box->set( yyTok ^ Tok_BackRef );
            else
#endif
                // any other token (and, without back-reference support,
                // every token reaching this point) is reported as disabled
                error( RXERR_DISABLED );
        }
    }
    // move on to the token that follows the atom
    yyTok = getToken();
}
|
|---|
| 3074 |
|
|---|
/*
  Parses one factor, i.e. an atom optionally followed by a quantifier,
  and stores the result in *box.

  With interval support, a quantifier with bounds yyMinRep and yyMaxRep
  is implemented by re-parsing the atom several times from the saved
  scanner state and concatenating the copies in front of the atom that
  was parsed first.
*/
void QRegExpEngine::parseFactor( Box *box )
{
#ifndef QT_NO_REGEXP_CAPTURE
    // startAtom() is told whether this factor begins with Tok_LeftParen
    // while capturing is currently allowed
    int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
#else
    static const int atom = 0;
#endif

#ifndef QT_NO_REGEXP_INTERVAL
// YYREDO() restores the scanner state saved below, so that the next
// parseAtom() call parses the same atom again. It also resets yyMinRep
// and yyMaxRep to 0.
#define YYREDO() \
        yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
        *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok

    // snapshot of the scanner state taken before the atom is parsed
    const QChar *in = yyIn;
    int pos0 = yyPos0;
    int pos = yyPos;
    int len = yyLen;
    int ch = yyCh;
    CharClass charClass;
    // *yyCharClass is copied only when the current token is a character class
    if ( yyTok == Tok_CharClass )
        charClass = *yyCharClass;
    int tok = yyTok;
    bool mayCapture = yyMayCapture;
#endif

    parseAtom( box );
#ifndef QT_NO_REGEXP_CAPTURE
    finishAtom( atom );
#endif

    if ( yyTok == Tok_Quantifier ) {
#ifndef QT_NO_REGEXP_OPTIM
        trivial = FALSE;
#endif
        // adjust the first copy of the atom: unbounded maximum -> plus(),
        // maximum of 0 -> clear(), minimum of 0 -> opt()
        if ( yyMaxRep == InftyRep ) {
            box->plus( atom );
#ifndef QT_NO_REGEXP_INTERVAL
        } else if ( yyMaxRep == 0 ) {
            box->clear();
#endif
        }
        if ( yyMinRep == 0 )
            box->opt();

#ifndef QT_NO_REGEXP_INTERVAL
        // yyMayCapture is cleared while the atom is re-parsed, so nested
        // parseFactor() calls pass FALSE to startAtom(); it is restored
        // from mayCapture further down
        yyMayCapture = FALSE;
        // alpha: additional mandatory copies; beta: additional optional
        // copies. Both must be computed before the first YYREDO(), which
        // resets yyMinRep and yyMaxRep.
        int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
        int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );

        Box rightBox( this );
        int i;

        // optional copies, nested: each pass wraps "atom followed by
        // what has been built so far" and makes the whole thing optional
        for ( i = 0; i < beta; i++ ) {
            YYREDO();
            Box leftBox( this );
            parseAtom( &leftBox );
            leftBox.cat( rightBox );
            leftBox.opt();
            rightBox = leftBox;
        }
        // mandatory copies, prepended one at a time
        for ( i = 0; i < alpha; i++ ) {
            YYREDO();
            Box leftBox( this );
            parseAtom( &leftBox );
            leftBox.cat( rightBox );
            rightBox = leftBox;
        }
        // the atom parsed first goes last in the sequence
        rightBox.cat( *box );
        *box = rightBox;
#endif
        // skip the quantifier token
        yyTok = getToken();
#ifndef QT_NO_REGEXP_INTERVAL
        yyMayCapture = mayCapture;
#endif
    }
#undef YYREDO
}
|
|---|
| 3152 |
|
|---|
| 3153 | void QRegExpEngine::parseTerm( Box *box )
|
|---|
| 3154 | {
|
|---|
| 3155 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 3156 | if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
|
|---|
| 3157 | parseFactor( box );
|
|---|
| 3158 | #endif
|
|---|
| 3159 | while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
|
|---|
| 3160 | Box rightBox( this );
|
|---|
| 3161 | parseFactor( &rightBox );
|
|---|
| 3162 | box->cat( rightBox );
|
|---|
| 3163 | }
|
|---|
| 3164 | }
|
|---|
| 3165 |
|
|---|
| 3166 | void QRegExpEngine::parseExpression( Box *box )
|
|---|
| 3167 | {
|
|---|
| 3168 | parseTerm( box );
|
|---|
| 3169 | while ( yyTok == Tok_Bar ) {
|
|---|
| 3170 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 3171 | trivial = FALSE;
|
|---|
| 3172 | #endif
|
|---|
| 3173 | Box rightBox( this );
|
|---|
| 3174 | yyTok = getToken();
|
|---|
| 3175 | parseTerm( &rightBox );
|
|---|
| 3176 | box->orx( rightBox );
|
|---|
| 3177 | }
|
|---|
| 3178 | }
|
|---|
| 3179 |
|
|---|
| 3180 | /*
|
|---|
| 3181 | The struct QRegExpPrivate contains the private data of a regular
|
|---|
| 3182 | expression other than the automaton. It makes it possible for many
|
|---|
| 3183 | QRegExp objects to use the same QRegExpEngine object with different
|
|---|
| 3184 | QRegExpPrivate objects.
|
|---|
| 3185 | */
|
|---|
| 3186 | struct QRegExpPrivate
|
|---|
| 3187 | {
|
|---|
| 3188 | QString pattern; // regular-expression or wildcard pattern
|
|---|
| 3189 | QString rxpattern; // regular-expression pattern
|
|---|
| 3190 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3191 | bool wc : 1; // wildcard mode?
|
|---|
| 3192 | #endif
|
|---|
| 3193 | bool min : 1; // minimal matching? (instead of maximal)
|
|---|
| 3194 | bool cs : 1; // case sensitive?
|
|---|
| 3195 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 3196 | QString t; // last string passed to QRegExp::search() or searchRev()
|
|---|
| 3197 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last
|
|---|
| 3198 | #endif
|
|---|
| 3199 | QMemArray<int> captured; // what QRegExpEngine::search() returned last
|
|---|
| 3200 |
|
|---|
| 3201 | QRegExpPrivate() { captured.fill( -1, 2 ); }
|
|---|
| 3202 | };
|
|---|
| 3203 |
|
|---|
#ifndef QT_NO_REGEXP_OPTIM
// Cleanup handler for the engine cache. regexpEngine() registers the
// cache with it only when QT_THREAD_SUPPORT is not defined.
static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache;
# ifndef QT_THREAD_SUPPORT
// Cache of engines keyed by pattern, allocated on demand by
// regexpEngine(). With QT_THREAD_SUPPORT, regexpEngine() instead uses the
// cache pointer returned by a QThreadStorage's localData().
static QCache<QRegExpEngine> *engineCache = 0;
# endif // QT_THREAD_SUPPORT
#endif // QT_NO_REGEXP_OPTIM
|
|---|
| 3210 |
|
|---|
| 3211 | static void regexpEngine( QRegExpEngine *&eng, const QString &pattern,
|
|---|
| 3212 | bool caseSensitive, bool deref )
|
|---|
| 3213 | {
|
|---|
| 3214 | # ifdef QT_THREAD_SUPPORT
|
|---|
| 3215 | static QThreadStorage<QCache<QRegExpEngine> *> engineCaches;
|
|---|
| 3216 | QCache<QRegExpEngine> *&engineCache = engineCaches.localData();
|
|---|
| 3217 | #endif // QT_THREAD_SUPPORT
|
|---|
| 3218 |
|
|---|
| 3219 | if ( !deref ) {
|
|---|
| 3220 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 3221 | if ( engineCache != 0 ) {
|
|---|
| 3222 | eng = engineCache->take( pattern );
|
|---|
| 3223 | if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
|
|---|
| 3224 | delete eng;
|
|---|
| 3225 | } else {
|
|---|
| 3226 | eng->ref();
|
|---|
| 3227 | return;
|
|---|
| 3228 | }
|
|---|
| 3229 | }
|
|---|
| 3230 | #endif // QT_NO_REGEXP_OPTIM
|
|---|
| 3231 | eng = new QRegExpEngine( pattern, caseSensitive );
|
|---|
| 3232 | return;
|
|---|
| 3233 | }
|
|---|
| 3234 |
|
|---|
| 3235 | if ( eng->deref() ) {
|
|---|
| 3236 | #ifndef QT_NO_REGEXP_OPTIM
|
|---|
| 3237 | if ( engineCache == 0 ) {
|
|---|
| 3238 | engineCache = new QCache<QRegExpEngine>;
|
|---|
| 3239 | engineCache->setAutoDelete( TRUE );
|
|---|
| 3240 | # ifndef QT_THREAD_SUPPORT
|
|---|
| 3241 | cleanup_cache.set( &engineCache );
|
|---|
| 3242 | # endif // !QT_THREAD_SUPPORT
|
|---|
| 3243 | }
|
|---|
| 3244 | if ( !pattern.isNull() &&
|
|---|
| 3245 | engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
|
|---|
| 3246 | return;
|
|---|
| 3247 | #else
|
|---|
| 3248 | Q_UNUSED( pattern );
|
|---|
| 3249 | #endif // QT_NO_REGEXP_OPTIM
|
|---|
| 3250 | delete eng;
|
|---|
| 3251 | eng = 0;
|
|---|
| 3252 | }
|
|---|
| 3253 | }
|
|---|
| 3254 |
|
|---|
| 3255 | /*!
|
|---|
| 3256 | \enum QRegExp::CaretMode
|
|---|
| 3257 |
|
|---|
| 3258 | The CaretMode enum defines the different meanings of the caret
|
|---|
| 3259 | (<b>^</b>) in a regular expression. The possible values are:
|
|---|
| 3260 |
|
|---|
| 3261 | \value CaretAtZero
|
|---|
| 3262 | The caret corresponds to index 0 in the searched string.
|
|---|
| 3263 |
|
|---|
| 3264 | \value CaretAtOffset
|
|---|
| 3265 | The caret corresponds to the start offset of the search.
|
|---|
| 3266 |
|
|---|
| 3267 | \value CaretWontMatch
|
|---|
| 3268 | The caret never matches.
|
|---|
| 3269 | */
|
|---|
| 3270 |
|
|---|
| 3271 | /*!
|
|---|
| 3272 | Constructs an empty regexp.
|
|---|
| 3273 |
|
|---|
| 3274 | \sa isValid() errorString()
|
|---|
| 3275 | */
|
|---|
| 3276 | QRegExp::QRegExp()
|
|---|
| 3277 | : eng( 0 )
|
|---|
| 3278 | {
|
|---|
| 3279 | priv = new QRegExpPrivate;
|
|---|
| 3280 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3281 | priv->wc = FALSE;
|
|---|
| 3282 | #endif
|
|---|
| 3283 | priv->min = FALSE;
|
|---|
| 3284 | priv->cs = TRUE;
|
|---|
| 3285 | }
|
|---|
| 3286 |
|
|---|
| 3287 | /*!
|
|---|
| 3288 | Constructs a regular expression object for the given \a pattern
|
|---|
| 3289 | string. The pattern must be given using wildcard notation if \a
|
|---|
| 3290 | wildcard is TRUE (default is FALSE). The pattern is case
|
|---|
| 3291 | sensitive, unless \a caseSensitive is FALSE. Matching is greedy
|
|---|
| 3292 | (maximal), but can be changed by calling setMinimal().
|
|---|
| 3293 |
|
|---|
| 3294 | \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
|
|---|
| 3295 | */
|
|---|
| 3296 | QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard )
|
|---|
| 3297 | : eng( 0 )
|
|---|
| 3298 | {
|
|---|
| 3299 | priv = new QRegExpPrivate;
|
|---|
| 3300 | priv->pattern = pattern;
|
|---|
| 3301 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3302 | priv->wc = wildcard;
|
|---|
| 3303 | #endif
|
|---|
| 3304 | priv->min = FALSE;
|
|---|
| 3305 | priv->cs = caseSensitive;
|
|---|
| 3306 | }
|
|---|
| 3307 |
|
|---|
| 3308 | /*!
|
|---|
| 3309 | Constructs a regular expression as a copy of \a rx.
|
|---|
| 3310 |
|
|---|
| 3311 | \sa operator=()
|
|---|
| 3312 | */
|
|---|
| 3313 | QRegExp::QRegExp( const QRegExp& rx )
|
|---|
| 3314 | : eng( 0 )
|
|---|
| 3315 | {
|
|---|
| 3316 | priv = new QRegExpPrivate;
|
|---|
| 3317 | operator=( rx );
|
|---|
| 3318 | }
|
|---|
| 3319 |
|
|---|
| 3320 | /*!
|
|---|
| 3321 | Destroys the regular expression and cleans up its internal data.
|
|---|
| 3322 | */
|
|---|
| 3323 | QRegExp::~QRegExp()
|
|---|
| 3324 | {
|
|---|
| 3325 | invalidateEngine();
|
|---|
| 3326 | delete priv;
|
|---|
| 3327 | }
|
|---|
| 3328 |
|
|---|
| 3329 | /*!
|
|---|
| 3330 | Copies the regular expression \a rx and returns a reference to the
|
|---|
| 3331 | copy. The case sensitivity, wildcard and minimal matching options
|
|---|
| 3332 | are also copied.
|
|---|
| 3333 | */
|
|---|
| 3334 | QRegExp& QRegExp::operator=( const QRegExp& rx )
|
|---|
| 3335 | {
|
|---|
| 3336 | QRegExpEngine *otherEng = rx.eng;
|
|---|
| 3337 | if ( otherEng != 0 )
|
|---|
| 3338 | otherEng->ref();
|
|---|
| 3339 | invalidateEngine();
|
|---|
| 3340 | eng = otherEng;
|
|---|
| 3341 | priv->pattern = rx.priv->pattern;
|
|---|
| 3342 | priv->rxpattern = rx.priv->rxpattern;
|
|---|
| 3343 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3344 | priv->wc = rx.priv->wc;
|
|---|
| 3345 | #endif
|
|---|
| 3346 | priv->min = rx.priv->min;
|
|---|
| 3347 | priv->cs = rx.priv->cs;
|
|---|
| 3348 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 3349 | priv->t = rx.priv->t;
|
|---|
| 3350 | priv->capturedCache = rx.priv->capturedCache;
|
|---|
| 3351 | #endif
|
|---|
| 3352 | priv->captured = rx.priv->captured;
|
|---|
| 3353 | return *this;
|
|---|
| 3354 | }
|
|---|
| 3355 |
|
|---|
| 3356 | /*!
|
|---|
| 3357 | Returns TRUE if this regular expression is equal to \a rx;
|
|---|
| 3358 | otherwise returns FALSE.
|
|---|
| 3359 |
|
|---|
| 3360 | Two QRegExp objects are equal if they have the same pattern
|
|---|
| 3361 | strings and the same settings for case sensitivity, wildcard and
|
|---|
| 3362 | minimal matching.
|
|---|
| 3363 | */
|
|---|
| 3364 | bool QRegExp::operator==( const QRegExp& rx ) const
|
|---|
| 3365 | {
|
|---|
| 3366 | return priv->pattern == rx.priv->pattern &&
|
|---|
| 3367 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3368 | priv->wc == rx.priv->wc &&
|
|---|
| 3369 | #endif
|
|---|
| 3370 | priv->min == rx.priv->min &&
|
|---|
| 3371 | priv->cs == rx.priv->cs;
|
|---|
| 3372 | }
|
|---|
| 3373 |
|
|---|
| 3374 | /*!
|
|---|
| 3375 | \fn bool QRegExp::operator!=( const QRegExp& rx ) const
|
|---|
| 3376 |
|
|---|
| 3377 | Returns TRUE if this regular expression is not equal to \a rx;
|
|---|
| 3378 | otherwise returns FALSE.
|
|---|
| 3379 |
|
|---|
| 3380 | \sa operator==()
|
|---|
| 3381 | */
|
|---|
| 3382 |
|
|---|
| 3383 | /*!
|
|---|
| 3384 | Returns TRUE if the pattern string is empty; otherwise returns
|
|---|
| 3385 | FALSE.
|
|---|
| 3386 |
|
|---|
| 3387 | If you call exactMatch() with an empty pattern on an empty string
|
|---|
| 3388 | it will return TRUE; otherwise it returns FALSE since it operates
|
|---|
| 3389 | over the whole string. If you call search() with an empty pattern
|
|---|
| 3390 | on \e any string it will return the start offset (0 by default)
|
|---|
| 3391 | because the empty pattern matches the 'emptiness' at the start of
|
|---|
| 3392 | the string. In this case the length of the match returned by
|
|---|
| 3393 | matchedLength() will be 0.
|
|---|
| 3394 |
|
|---|
| 3395 | See QString::isEmpty().
|
|---|
| 3396 | */
|
|---|
| 3397 |
|
|---|
| 3398 | bool QRegExp::isEmpty() const
|
|---|
| 3399 | {
|
|---|
| 3400 | return priv->pattern.isEmpty();
|
|---|
| 3401 | }
|
|---|
| 3402 |
|
|---|
| 3403 | /*!
|
|---|
| 3404 | Returns TRUE if the regular expression is valid; otherwise returns
|
|---|
| 3405 | FALSE. An invalid regular expression never matches.
|
|---|
| 3406 |
|
|---|
| 3407 | The pattern <b>[a-z</b> is an example of an invalid pattern, since
|
|---|
| 3408 | it lacks a closing square bracket.
|
|---|
| 3409 |
|
|---|
| 3410 | Note that the validity of a regexp may also depend on the setting
|
|---|
| 3411 | of the wildcard flag, for example <b>*.html</b> is a valid
|
|---|
| 3412 | wildcard regexp but an invalid full regexp.
|
|---|
| 3413 |
|
|---|
| 3414 | \sa errorString()
|
|---|
| 3415 | */
|
|---|
| 3416 | bool QRegExp::isValid() const
|
|---|
| 3417 | {
|
|---|
| 3418 | if ( priv->pattern.isEmpty() ) {
|
|---|
| 3419 | return TRUE;
|
|---|
| 3420 | } else {
|
|---|
| 3421 | prepareEngine();
|
|---|
| 3422 | return eng->isValid();
|
|---|
| 3423 | }
|
|---|
| 3424 | }
|
|---|
| 3425 |
|
|---|
| 3426 | /*!
|
|---|
| 3427 | Returns the pattern string of the regular expression. The pattern
|
|---|
| 3428 | has either regular expression syntax or wildcard syntax, depending
|
|---|
| 3429 | on wildcard().
|
|---|
| 3430 |
|
|---|
| 3431 | \sa setPattern()
|
|---|
| 3432 | */
|
|---|
| 3433 | QString QRegExp::pattern() const
|
|---|
| 3434 | {
|
|---|
| 3435 | return priv->pattern;
|
|---|
| 3436 | }
|
|---|
| 3437 |
|
|---|
| 3438 | /*!
|
|---|
| 3439 | Sets the pattern string to \a pattern. The case sensitivity,
|
|---|
| 3440 | wildcard and minimal matching options are not changed.
|
|---|
| 3441 |
|
|---|
| 3442 | \sa pattern()
|
|---|
| 3443 | */
|
|---|
| 3444 | void QRegExp::setPattern( const QString& pattern )
|
|---|
| 3445 | {
|
|---|
| 3446 | if ( priv->pattern != pattern ) {
|
|---|
| 3447 | priv->pattern = pattern;
|
|---|
| 3448 | invalidateEngine();
|
|---|
| 3449 | }
|
|---|
| 3450 | }
|
|---|
| 3451 |
|
|---|
| 3452 | /*!
|
|---|
| 3453 | Returns TRUE if case sensitivity is enabled; otherwise returns
|
|---|
| 3454 | FALSE. The default is TRUE.
|
|---|
| 3455 |
|
|---|
| 3456 | \sa setCaseSensitive()
|
|---|
| 3457 | */
|
|---|
| 3458 | bool QRegExp::caseSensitive() const
|
|---|
| 3459 | {
|
|---|
| 3460 | return priv->cs;
|
|---|
| 3461 | }
|
|---|
| 3462 |
|
|---|
| 3463 | /*!
|
|---|
| 3464 | Sets case sensitive matching to \a sensitive.
|
|---|
| 3465 |
|
|---|
| 3466 | If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
|
|---|
| 3467 | not \c{README.TXT}.
|
|---|
| 3468 |
|
|---|
| 3469 | \sa caseSensitive()
|
|---|
| 3470 | */
|
|---|
| 3471 | void QRegExp::setCaseSensitive( bool sensitive )
|
|---|
| 3472 | {
|
|---|
| 3473 | if ( sensitive != priv->cs ) {
|
|---|
| 3474 | priv->cs = sensitive;
|
|---|
| 3475 | invalidateEngine();
|
|---|
| 3476 | }
|
|---|
| 3477 | }
|
|---|
| 3478 |
|
|---|
| 3479 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3480 | /*!
|
|---|
| 3481 | Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
|
|---|
| 3482 | The default is FALSE.
|
|---|
| 3483 |
|
|---|
| 3484 | \sa setWildcard()
|
|---|
| 3485 | */
|
|---|
| 3486 | bool QRegExp::wildcard() const
|
|---|
| 3487 | {
|
|---|
| 3488 | return priv->wc;
|
|---|
| 3489 | }
|
|---|
| 3490 |
|
|---|
| 3491 | /*!
|
|---|
| 3492 | Sets the wildcard mode for the regular expression. The default is
|
|---|
| 3493 | FALSE.
|
|---|
| 3494 |
|
|---|
| 3495 | Setting \a wildcard to TRUE enables simple shell-like wildcard
|
|---|
| 3496 | matching. (See \link #wildcard-matching wildcard matching
|
|---|
| 3497 | (globbing) \endlink.)
|
|---|
| 3498 |
|
|---|
| 3499 | For example, <b>r*.txt</b> matches the string \c{readme.txt} in
|
|---|
| 3500 | wildcard mode, but does not match \c{readme}.
|
|---|
| 3501 |
|
|---|
| 3502 | \sa wildcard()
|
|---|
| 3503 | */
|
|---|
| 3504 | void QRegExp::setWildcard( bool wildcard )
|
|---|
| 3505 | {
|
|---|
| 3506 | if ( wildcard != priv->wc ) {
|
|---|
| 3507 | priv->wc = wildcard;
|
|---|
| 3508 | invalidateEngine();
|
|---|
| 3509 | }
|
|---|
| 3510 | }
|
|---|
| 3511 | #endif
|
|---|
| 3512 |
|
|---|
| 3513 | /*!
|
|---|
| 3514 | Returns TRUE if minimal (non-greedy) matching is enabled;
|
|---|
| 3515 | otherwise returns FALSE.
|
|---|
| 3516 |
|
|---|
| 3517 | \sa setMinimal()
|
|---|
| 3518 | */
|
|---|
| 3519 | bool QRegExp::minimal() const
|
|---|
| 3520 | {
|
|---|
| 3521 | return priv->min;
|
|---|
| 3522 | }
|
|---|
| 3523 |
|
|---|
| 3524 | /*!
|
|---|
| 3525 | Enables or disables minimal matching. If \a minimal is FALSE,
|
|---|
| 3526 | matching is greedy (maximal) which is the default.
|
|---|
| 3527 |
|
|---|
| 3528 | For example, suppose we have the input string "We must be
|
|---|
| 3529 | \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
|
|---|
| 3530 | <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
|
|---|
| 3531 | the match is "We must be <u>\<b>bold\</b>, very
|
|---|
| 3532 | \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
|
|---|
| 3533 | first match is: "We must be <u>\<b>bold\</b></u>, very
|
|---|
| 3534 | \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
|
|---|
| 3535 | very <u>\<b>bold\</b></u>!". In practice we might use the pattern
|
|---|
| 3536 | <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
|
|---|
| 3537 | nested tags.
|
|---|
| 3538 |
|
|---|
| 3539 | \sa minimal()
|
|---|
| 3540 | */
|
|---|
| 3541 | void QRegExp::setMinimal( bool minimal )
|
|---|
| 3542 | {
|
|---|
| 3543 | priv->min = minimal;
|
|---|
| 3544 | }
|
|---|
| 3545 |
|
|---|
| 3546 | /*!
|
|---|
| 3547 | Returns TRUE if \a str is matched exactly by this regular
|
|---|
| 3548 | expression; otherwise returns FALSE. You can determine how much of
|
|---|
| 3549 | the string was matched by calling matchedLength().
|
|---|
| 3550 |
|
|---|
| 3551 | For a given regexp string, R, exactMatch("R") is the equivalent of
|
|---|
| 3552 | search("^R$") since exactMatch() effectively encloses the regexp
|
|---|
| 3553 | in the start of string and end of string anchors, except that it
|
|---|
| 3554 | sets matchedLength() differently.
|
|---|
| 3555 |
|
|---|
| 3556 | For example, if the regular expression is <b>blue</b>, then
|
|---|
| 3557 | exactMatch() returns TRUE only for input \c blue. For inputs \c
|
|---|
| 3558 | bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
|
|---|
| 3559 | and matchedLength() will return 4, 3 and 0 respectively.
|
|---|
| 3560 |
|
|---|
| 3561 | Although const, this function sets matchedLength(),
|
|---|
| 3562 | capturedTexts() and pos().
|
|---|
| 3563 |
|
|---|
| 3564 | \sa search() searchRev() QRegExpValidator
|
|---|
| 3565 | */
|
|---|
| 3566 | bool QRegExp::exactMatch( const QString& str ) const
|
|---|
| 3567 | {
|
|---|
| 3568 | prepareEngineForMatch( str );
|
|---|
| 3569 | eng->match( str, 0, priv->min, TRUE, 0, priv->captured );
|
|---|
| 3570 | if ( priv->captured[1] == (int) str.length() ) {
|
|---|
| 3571 | return TRUE;
|
|---|
| 3572 | } else {
|
|---|
| 3573 | priv->captured[0] = 0;
|
|---|
| 3574 | priv->captured[1] = eng->partialMatchLength();
|
|---|
| 3575 | return FALSE;
|
|---|
| 3576 | }
|
|---|
| 3577 | }
|
|---|
| 3578 |
|
|---|
| 3579 | #ifndef QT_NO_COMPAT
|
|---|
| 3580 | /*! \obsolete
|
|---|
| 3581 |
|
|---|
| 3582 | Attempts to match in \a str, starting from position \a index.
|
|---|
| 3583 | Returns the position of the match, or -1 if there was no match.
|
|---|
| 3584 |
|
|---|
| 3585 | The length of the match is stored in \a *len, unless \a len is a
|
|---|
| 3586 | null pointer.
|
|---|
| 3587 |
|
|---|
| 3588 | If \a indexIsStart is TRUE (the default), the position \a index in
|
|---|
| 3589 | the string will match the start of string anchor, <b>^</b>, in the
|
|---|
| 3590 | regexp, if present. Otherwise, position 0 in \a str will match.
|
|---|
| 3591 |
|
|---|
| 3592 | Use search() and matchedLength() instead of this function.
|
|---|
| 3593 |
|
|---|
| 3594 | \sa QString::mid() QConstString
|
|---|
| 3595 | */
|
|---|
| 3596 | int QRegExp::match( const QString& str, int index, int *len,
|
|---|
| 3597 | bool indexIsStart ) const
|
|---|
| 3598 | {
|
|---|
| 3599 | int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
|
|---|
| 3600 | if ( len != 0 )
|
|---|
| 3601 | *len = matchedLength();
|
|---|
| 3602 | return pos;
|
|---|
| 3603 | }
|
|---|
| 3604 | #endif // QT_NO_COMPAT
|
|---|
| 3605 |
|
|---|
| 3606 | int QRegExp::search( const QString& str, int offset ) const
|
|---|
| 3607 | {
|
|---|
| 3608 | return search( str, offset, CaretAtZero );
|
|---|
| 3609 | }
|
|---|
| 3610 |
|
|---|
| 3611 | /*!
|
|---|
| 3612 | Attempts to find a match in \a str from position \a offset (0 by
|
|---|
| 3613 | default). If \a offset is -1, the search starts at the last
|
|---|
| 3614 | character; if -2, at the next to last character; etc.
|
|---|
| 3615 |
|
|---|
| 3616 | Returns the position of the first match, or -1 if there was no
|
|---|
| 3617 | match.
|
|---|
| 3618 |
|
|---|
| 3619 | The \a caretMode parameter can be used to instruct whether <b>^</b>
|
|---|
| 3620 | should match at index 0 or at \a offset.
|
|---|
| 3621 |
|
|---|
| 3622 | You might prefer to use QString::find(), QString::contains() or
|
|---|
| 3623 | even QStringList::grep(). To replace matches use
|
|---|
| 3624 | QString::replace().
|
|---|
| 3625 |
|
|---|
| 3626 | Example:
|
|---|
| 3627 | \code
|
|---|
| 3628 | QString str = "offsets: 1.23 .50 71.00 6.00";
|
|---|
| 3629 | QRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching
|
|---|
| 3630 | int count = 0;
|
|---|
| 3631 | int pos = 0;
|
|---|
| 3632 | while ( (pos = rx.search(str, pos)) != -1 ) {
|
|---|
| 3633 | count++;
|
|---|
| 3634 | pos += rx.matchedLength();
|
|---|
| 3635 | }
|
|---|
| 3636 | // pos will be 9, 14, 18 and finally 24; count will end up as 4
|
|---|
| 3637 | \endcode
|
|---|
| 3638 |
|
|---|
| 3639 | Although const, this function sets matchedLength(),
|
|---|
| 3640 | capturedTexts() and pos().
|
|---|
| 3641 |
|
|---|
| 3642 | \sa searchRev() exactMatch()
|
|---|
| 3643 | */
|
|---|
| 3644 |
|
|---|
| 3645 | int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const
|
|---|
| 3646 | {
|
|---|
| 3647 | prepareEngineForMatch( str );
|
|---|
| 3648 | if ( offset < 0 )
|
|---|
| 3649 | offset += str.length();
|
|---|
| 3650 | eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode),
|
|---|
| 3651 | priv->captured );
|
|---|
| 3652 | return priv->captured[0];
|
|---|
| 3653 | }
|
|---|
| 3654 |
|
|---|
| 3655 |
|
|---|
| 3656 | int QRegExp::searchRev( const QString& str, int offset ) const
|
|---|
| 3657 | {
|
|---|
| 3658 | return searchRev( str, offset, CaretAtZero );
|
|---|
| 3659 | }
|
|---|
| 3660 |
|
|---|
| 3661 | /*!
|
|---|
| 3662 | Attempts to find a match backwards in \a str from position \a
|
|---|
| 3663 | offset. If \a offset is -1 (the default), the search starts at the
|
|---|
| 3664 | last character; if -2, at the next to last character; etc.
|
|---|
| 3665 |
|
|---|
| 3666 | Returns the position of the first match, or -1 if there was no
|
|---|
| 3667 | match.
|
|---|
| 3668 |
|
|---|
| 3669 | The \a caretMode parameter can be used to instruct whether <b>^</b>
|
|---|
| 3670 | should match at index 0 or at \a offset.
|
|---|
| 3671 |
|
|---|
| 3672 | Although const, this function sets matchedLength(),
|
|---|
| 3673 | capturedTexts() and pos().
|
|---|
| 3674 |
|
|---|
| 3675 | \warning Searching backwards is much slower than searching
|
|---|
| 3676 | forwards.
|
|---|
| 3677 |
|
|---|
| 3678 | \sa search() exactMatch()
|
|---|
| 3679 | */
|
|---|
| 3680 |
|
|---|
| 3681 | int QRegExp::searchRev( const QString& str, int offset,
|
|---|
| 3682 | CaretMode caretMode ) const
|
|---|
| 3683 | {
|
|---|
| 3684 | prepareEngineForMatch( str );
|
|---|
| 3685 | if ( offset < 0 )
|
|---|
| 3686 | offset += str.length();
|
|---|
| 3687 | if ( offset < 0 || offset > (int) str.length() ) {
|
|---|
| 3688 | priv->captured.detach();
|
|---|
| 3689 | priv->captured.fill( -1 );
|
|---|
| 3690 | return -1;
|
|---|
| 3691 | }
|
|---|
| 3692 |
|
|---|
| 3693 | while ( offset >= 0 ) {
|
|---|
| 3694 | eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode),
|
|---|
| 3695 | priv->captured );
|
|---|
| 3696 | if ( priv->captured[0] == offset )
|
|---|
| 3697 | return offset;
|
|---|
| 3698 | offset--;
|
|---|
| 3699 | }
|
|---|
| 3700 | return -1;
|
|---|
| 3701 | }
|
|---|
| 3702 |
|
|---|
| 3703 | /*!
|
|---|
| 3704 | Returns the length of the last matched string, or -1 if there was
|
|---|
| 3705 | no match.
|
|---|
| 3706 |
|
|---|
| 3707 | \sa exactMatch() search() searchRev()
|
|---|
| 3708 | */
|
|---|
| 3709 | int QRegExp::matchedLength() const
|
|---|
| 3710 | {
|
|---|
| 3711 | return priv->captured[1];
|
|---|
| 3712 | }
|
|---|
| 3713 |
|
|---|
| 3714 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 3715 | /*!
|
|---|
| 3716 | Returns the number of captures contained in the regular expression.
|
|---|
| 3717 | */
|
|---|
| 3718 | int QRegExp::numCaptures() const
|
|---|
| 3719 | {
|
|---|
| 3720 | prepareEngine();
|
|---|
| 3721 | return eng->numCaptures();
|
|---|
| 3722 | }
|
|---|
| 3723 |
|
|---|
| 3724 | /*!
|
|---|
| 3725 | Returns a list of the captured text strings.
|
|---|
| 3726 |
|
|---|
| 3727 | The first string in the list is the entire matched string. Each
|
|---|
| 3728 | subsequent list element contains a string that matched a
|
|---|
| 3729 | (capturing) subexpression of the regexp.
|
|---|
| 3730 |
|
|---|
| 3731 | For example:
|
|---|
| 3732 | \code
|
|---|
| 3733 | QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
|
|---|
| 3734 | int pos = rx.search( "Length: 36 inches" );
|
|---|
| 3735 | QStringList list = rx.capturedTexts();
|
|---|
| 3736 | // list is now ( "36 inches", "36", " ", "inches", "es" )
|
|---|
| 3737 | \endcode
|
|---|
| 3738 |
|
|---|
| 3739 | The above example also captures elements that may be present but
|
|---|
| 3740 | which we have no interest in. This problem can be solved by using
|
|---|
| 3741 | non-capturing parentheses:
|
|---|
| 3742 |
|
|---|
| 3743 | \code
|
|---|
| 3744 | QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
|
|---|
| 3745 | int pos = rx.search( "Length: 36 inches" );
|
|---|
| 3746 | QStringList list = rx.capturedTexts();
|
|---|
| 3747 | // list is now ( "36 inches", "36", "inches" )
|
|---|
| 3748 | \endcode
|
|---|
| 3749 |
|
|---|
| 3750 | Note that if you want to iterate over the list, you should iterate
|
|---|
| 3751 | over a copy, e.g.
|
|---|
| 3752 | \code
|
|---|
| 3753 | QStringList list = rx.capturedTexts();
|
|---|
| 3754 | QStringList::Iterator it = list.begin();
|
|---|
| 3755 | while( it != list.end() ) {
|
|---|
| 3756 | myProcessing( *it );
|
|---|
| 3757 | ++it;
|
|---|
| 3758 | }
|
|---|
| 3759 | \endcode
|
|---|
| 3760 |
|
|---|
| 3761 | Some regexps can match an indeterminate number of times. For
|
|---|
| 3762 | example if the input string is "Offsets: 12 14 99 231 7" and the
|
|---|
| 3763 | regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
|
|---|
| 3764 | all the numbers matched. However, after calling
|
|---|
| 3765 | \c{rx.search(str)}, capturedTexts() will return the list ( "12",
|
|---|
| 3766 | "12" ), i.e. the entire match was "12" and the first subexpression
|
|---|
| 3767 | matched was "12". The correct approach is to use cap() in a \link
|
|---|
| 3768 | #cap_in_a_loop loop \endlink.
|
|---|
| 3769 |
|
|---|
| 3770 | The order of elements in the string list is as follows. The first
|
|---|
| 3771 | element is the entire matching string. Each subsequent element
|
|---|
| 3772 | corresponds to the next capturing left parenthesis. Thus
|
|---|
| 3773 | capturedTexts()[1] is the text of the first capturing parentheses,
|
|---|
| 3774 | capturedTexts()[2] is the text of the second and so on
|
|---|
| 3775 | (corresponding to $1, $2, etc., in some other regexp languages).
|
|---|
| 3776 |
|
|---|
| 3777 | \sa cap() pos() exactMatch() search() searchRev()
|
|---|
| 3778 | */
|
|---|
| 3779 | QStringList QRegExp::capturedTexts()
|
|---|
| 3780 | {
|
|---|
| 3781 | if ( priv->capturedCache.isEmpty() ) {
|
|---|
| 3782 | for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
|
|---|
| 3783 | QString m;
|
|---|
| 3784 | if ( priv->captured[i + 1] == 0 )
|
|---|
| 3785 | m = QString::fromLatin1( "" );
|
|---|
| 3786 | else if ( priv->captured[i] >= 0 )
|
|---|
| 3787 | m = priv->t.mid( priv->captured[i],
|
|---|
| 3788 | priv->captured[i + 1] );
|
|---|
| 3789 | priv->capturedCache.append( m );
|
|---|
| 3790 | }
|
|---|
| 3791 | priv->t = QString::null;
|
|---|
| 3792 | }
|
|---|
| 3793 | return priv->capturedCache;
|
|---|
| 3794 | }
|
|---|
| 3795 |
|
|---|
| 3796 | /*!
|
|---|
| 3797 | Returns the text captured by the \a nth subexpression. The entire
|
|---|
| 3798 | match has index 0 and the parenthesized subexpressions have
|
|---|
| 3799 | indices starting from 1 (excluding non-capturing parentheses).
|
|---|
| 3800 |
|
|---|
| 3801 | \code
|
|---|
| 3802 | QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
|
|---|
| 3803 | int pos = rxlen.search( "Length: 189cm" );
|
|---|
| 3804 | if ( pos > -1 ) {
|
|---|
| 3805 | QString value = rxlen.cap( 1 ); // "189"
|
|---|
| 3806 | QString unit = rxlen.cap( 2 ); // "cm"
|
|---|
| 3807 | // ...
|
|---|
| 3808 | }
|
|---|
| 3809 | \endcode
|
|---|
| 3810 |
|
|---|
| 3811 | The order of elements matched by cap() is as follows. The first
|
|---|
| 3812 | element, cap(0), is the entire matching string. Each subsequent
|
|---|
| 3813 | element corresponds to the next capturing left parenthesis.
|
|---|
| 3814 | Thus cap(1) is the text of the first capturing parentheses, cap(2)
|
|---|
| 3815 | is the text of the second, and so on.
|
|---|
| 3816 |
|
|---|
| 3817 | \target cap_in_a_loop
|
|---|
| 3818 | Some patterns may lead to a number of matches which cannot be
|
|---|
| 3819 | determined in advance, for example:
|
|---|
| 3820 |
|
|---|
| 3821 | \code
|
|---|
| 3822 | QRegExp rx( "(\\d+)" );
|
|---|
| 3823 | str = "Offsets: 12 14 99 231 7";
|
|---|
| 3824 | QStringList list;
|
|---|
| 3825 | pos = 0;
|
|---|
| 3826 | while ( pos >= 0 ) {
|
|---|
| 3827 | pos = rx.search( str, pos );
|
|---|
| 3828 | if ( pos > -1 ) {
|
|---|
| 3829 | list += rx.cap( 1 );
|
|---|
| 3830 | pos += rx.matchedLength();
|
|---|
| 3831 | }
|
|---|
| 3832 | }
|
|---|
| 3833 | // list contains "12", "14", "99", "231", "7"
|
|---|
| 3834 | \endcode
|
|---|
| 3835 |
|
|---|
| 3836 | \sa capturedTexts() pos() exactMatch() search() searchRev()
|
|---|
| 3837 | */
|
|---|
| 3838 | QString QRegExp::cap( int nth )
|
|---|
| 3839 | {
|
|---|
| 3840 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) {
|
|---|
| 3841 | return QString::null;
|
|---|
| 3842 | } else {
|
|---|
| 3843 | return capturedTexts()[nth];
|
|---|
| 3844 | }
|
|---|
| 3845 | }
|
|---|
| 3846 |
|
|---|
| 3847 | /*!
|
|---|
| 3848 | Returns the position of the \a nth captured text in the searched
|
|---|
| 3849 | string. If \a nth is 0 (the default), pos() returns the position
|
|---|
| 3850 | of the whole match.
|
|---|
| 3851 |
|
|---|
| 3852 | Example:
|
|---|
| 3853 | \code
|
|---|
| 3854 | QRegExp rx( "/([a-z]+)/([a-z]+)" );
|
|---|
| 3855 | rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null)
|
|---|
| 3856 | rx.pos( 0 ); // returns 7 (position of /dev/null)
|
|---|
| 3857 | rx.pos( 1 ); // returns 8 (position of dev)
|
|---|
| 3858 | rx.pos( 2 ); // returns 12 (position of null)
|
|---|
| 3859 | \endcode
|
|---|
| 3860 |
|
|---|
| 3861 | For zero-length matches, pos() always returns -1. (For example, if
|
|---|
| 3862 | cap(4) would return an empty string, pos(4) returns -1.) This is
|
|---|
| 3863 | due to an implementation tradeoff.
|
|---|
| 3864 |
|
|---|
| 3865 | \sa capturedTexts() exactMatch() search() searchRev()
|
|---|
| 3866 | */
|
|---|
| 3867 | int QRegExp::pos( int nth )
|
|---|
| 3868 | {
|
|---|
| 3869 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
|
|---|
| 3870 | return -1;
|
|---|
| 3871 | else
|
|---|
| 3872 | return priv->captured[2 * nth];
|
|---|
| 3873 | }
|
|---|
| 3874 |
|
|---|
| 3875 | /*!
|
|---|
| 3876 | Returns a text string that explains why a regexp pattern is
|
|---|
| 3877 | invalid, if that is the case; otherwise returns "no error occurred".
|
|---|
| 3878 |
|
|---|
| 3879 | \sa isValid()
|
|---|
| 3880 | */
|
|---|
| 3881 | QString QRegExp::errorString()
|
|---|
| 3882 | {
|
|---|
| 3883 | if ( isValid() ) {
|
|---|
| 3884 | return QString( RXERR_OK );
|
|---|
| 3885 | } else {
|
|---|
| 3886 | return eng->errorString();
|
|---|
| 3887 | }
|
|---|
| 3888 | }
|
|---|
| 3889 | #endif
|
|---|
| 3890 |
|
|---|
| 3891 | /*!
|
|---|
| 3892 | Returns the string \a str with every regexp special character
|
|---|
| 3893 | escaped with a backslash. The special characters are $, (, ), *, +,
|
|---|
| 3894 | ., ?, [, \, ], ^, {, | and }.
|
|---|
| 3895 |
|
|---|
| 3896 | Example:
|
|---|
| 3897 | \code
|
|---|
| 3898 | s1 = QRegExp::escape( "bingo" ); // s1 == "bingo"
|
|---|
| 3899 | s2 = QRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)"
|
|---|
| 3900 | \endcode
|
|---|
| 3901 |
|
|---|
| 3902 | This function is useful to construct regexp patterns dynamically:
|
|---|
| 3903 |
|
|---|
| 3904 | \code
|
|---|
| 3905 | QRegExp rx( "(" + QRegExp::escape(name) +
|
|---|
| 3906 | "|" + QRegExp::escape(alias) + ")" );
|
|---|
| 3907 | \endcode
|
|---|
| 3908 | */
|
|---|
| 3909 | QString QRegExp::escape( const QString& str )
|
|---|
| 3910 | {
|
|---|
| 3911 | static const char meta[] = "$()*+.?[\\]^{|}";
|
|---|
| 3912 | QString quoted = str;
|
|---|
| 3913 | int i = 0;
|
|---|
| 3914 |
|
|---|
| 3915 | while ( i < (int) quoted.length() ) {
|
|---|
| 3916 | if ( strchr(meta, quoted[i].latin1()) != 0 )
|
|---|
| 3917 | quoted.insert( i++, "\\" );
|
|---|
| 3918 | i++;
|
|---|
| 3919 | }
|
|---|
| 3920 | return quoted;
|
|---|
| 3921 | }
|
|---|
| 3922 |
|
|---|
| 3923 | void QRegExp::prepareEngine() const
|
|---|
| 3924 | {
|
|---|
| 3925 | if ( eng == 0 ) {
|
|---|
| 3926 | #ifndef QT_NO_REGEXP_WILDCARD
|
|---|
| 3927 | if ( priv->wc )
|
|---|
| 3928 | priv->rxpattern = wc2rx( priv->pattern );
|
|---|
| 3929 | else
|
|---|
| 3930 | #endif
|
|---|
| 3931 | priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
|
|---|
| 3932 | : priv->pattern;
|
|---|
| 3933 | QRegExp *that = (QRegExp *) this;
|
|---|
| 3934 | // that->eng = newEngine( priv->rxpattern, priv->cs );
|
|---|
| 3935 | regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE );
|
|---|
| 3936 | priv->captured.detach();
|
|---|
| 3937 | priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
|
|---|
| 3938 | }
|
|---|
| 3939 | }
|
|---|
| 3940 |
|
|---|
| 3941 | void QRegExp::prepareEngineForMatch( const QString& str ) const
|
|---|
| 3942 | {
|
|---|
| 3943 | prepareEngine();
|
|---|
| 3944 | #ifndef QT_NO_REGEXP_CAPTURE
|
|---|
| 3945 | priv->t = str;
|
|---|
| 3946 | priv->capturedCache.clear();
|
|---|
| 3947 | #else
|
|---|
| 3948 | Q_UNUSED( str );
|
|---|
| 3949 | #endif
|
|---|
| 3950 | }
|
|---|
| 3951 |
|
|---|
| 3952 | void QRegExp::invalidateEngine()
|
|---|
| 3953 | {
|
|---|
| 3954 | if ( eng != 0 ) {
|
|---|
| 3955 | regexpEngine( eng, priv->rxpattern, priv->cs, TRUE );
|
|---|
| 3956 | priv->rxpattern = QString();
|
|---|
| 3957 | eng = 0;
|
|---|
| 3958 | }
|
|---|
| 3959 | }
|
|---|
| 3960 |
|
|---|
| 3961 | int QRegExp::caretIndex( int offset, CaretMode caretMode )
|
|---|
| 3962 | {
|
|---|
| 3963 | if ( caretMode == CaretAtZero ) {
|
|---|
| 3964 | return 0;
|
|---|
| 3965 | } else if ( caretMode == CaretAtOffset ) {
|
|---|
| 3966 | return offset;
|
|---|
| 3967 | } else { // CaretWontMatch
|
|---|
| 3968 | return -1;
|
|---|
| 3969 | }
|
|---|
| 3970 | }
|
|---|
| 3971 |
|
|---|
| 3972 | #endif // QT_NO_REGEXP
|
|---|