[2] | 1 | #
|
---|
| 2 | # Secret Labs' Regular Expression Engine
|
---|
| 3 | #
|
---|
| 4 | # convert re-style regular expression to sre pattern
|
---|
| 5 | #
|
---|
| 6 | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
---|
| 7 | #
|
---|
| 8 | # See the sre.py file for information on usage and redistribution.
|
---|
| 9 | #
|
---|
| 10 |
|
---|
| 11 | """Internal support module for sre"""
|
---|
| 12 |
|
---|
| 13 | # XXX: show string offset and offending character for all errors
|
---|
| 14 |
|
---|
| 15 | import sys
|
---|
| 16 |
|
---|
| 17 | from sre_constants import *
|
---|
| 18 |
|
---|
# characters that are not taken as plain literals by the parser
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that start a repeat operator
REPEAT_CHARS = "*+?{"

# digit classes used when scanning numeric escapes and repeat counts
OCTDIGITS = set("01234567")
DIGITS = OCTDIGITS | set("89")
HEXDIGITS = DIGITS | set("abcdefABCDEF")

# characters skipped by the parser when the VERBOSE flag is set
WHITESPACE = set(" \t\n\r\v\f")
|
---|
| 28 |
|
---|
# single-character escapes that stand for one literal character.
# keys are the two-character escape tokens (backslash + letter).
ESCAPES = dict(
    ("\\" + letter, (LITERAL, ord(char)))
    for letter, char in (
        ("a", "\a"),
        ("b", "\b"),
        ("f", "\f"),
        ("n", "\n"),
        ("r", "\r"),
        ("t", "\t"),
        ("v", "\v"),
        ("\\", "\\"),
    )
)

# escapes that stand for a position assertion (AT) or a character
# category (IN).  note that r"\b" appears both here and in ESCAPES:
# _escape consults this table first, _class_escape consults ESCAPES first.
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING),  # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING),  # end of string
}

# letters accepted in an inline "(?...)" flag group, with their flag bits
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
|
---|
| 64 |
|
---|
class Pattern:
    # master pattern object.  keeps track of global attributes
    #
    # flags     -- flag bits for the pattern (starts at 0)
    # open      -- ids of groups that are opened but not yet closed
    # groups    -- id the next opened group will get (starts at 1)
    # groupdict -- maps group names to group ids

    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}

    def opengroup(self, name=None):
        # allocate the next group id, register the name (if any) and mark
        # the group as open.  raises error if the name is already taken;
        # the id counter has been advanced by then.
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            previous = self.groupdict.get(name)
            if previous is not None:
                raise error("redefinition of group name %s as group %d; "
                            "was group %d" % (repr(name), gid, previous))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid

    def closegroup(self, gid):
        # mark a group as closed again
        self.open.remove(gid)

    def checkgroup(self, gid):
        # true if gid has been allocated and is not currently open
        if gid >= self.groups:
            return False
        return gid not in self.open
|
---|
| 87 |
|
---|
class SubPattern:
    # a subpattern, in intermediate form
    #
    # pattern -- the object passed to the constructor (the parser passes
    #            its shared state object here)
    # data    -- list of (opcode, argument) items
    # width   -- (min, max) result cached by getwidth(), or None

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None

    def dump(self, level=0):
        # write an indented listing of the items to sys.stdout; nested
        # subpatterns are listed one level deeper.  each output line is
        # assembled first and then written in one piece, so no print
        # statement (and no softspace bookkeeping) is needed.
        write = sys.stdout.write
        indent = level*" "
        inner = (level+1)*" "
        seqtypes = type(()), type([])
        for op, av in self.data:
            # fields of the line being built; empty means "at line start"
            line = [indent + op]
            if op == "in":
                # member sublanguage
                write(line[0] + "\n")
                line = []
                for member_op, member_av in av:
                    write("%s %s\n" % (inner + member_op, member_av))
            elif op == "branch":
                write(line[0] + "\n")
                line = []
                for count, alternative in enumerate(av[1]):
                    if count > 0:
                        write(indent + "or" + "\n")
                    alternative.dump(level+1)
            elif type(av) in seqtypes:
                for a in av:
                    if isinstance(a, SubPattern):
                        # finish the current line before the nested listing
                        if line:
                            write(" ".join(line) + "\n")
                            line = []
                        a.dump(level+1)
                    else:
                        line.append("%s" % (a,))
            else:
                line.append("%s" % (av,))
            if line:
                write(" ".join(line) + "\n")

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # a slice yields a new SubPattern sharing the same pattern object
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        # determine the width (min, max) for this subpattern
        # the result is computed once and cached in self.width
        if self.width is not None:
            return self.width
        lo = hi = 0
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum and widest maximum over all alternatives
                i = MAXREPEAT - 1
                j = 0
                for alternative in av[1]:
                    l, h = alternative.getwidth()
                    i = min(i, l)
                    j = max(j, h)
            elif op is CALL:
                i, j = av.getwidth()
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
            elif op in REPEATCODES:
                i, j = av[2].getwidth()
                i = i * av[0]
                j = j * av[1]
            elif op in UNITCODES:
                i = j = 1
            elif op == SUCCESS:
                break
            else:
                # other items do not contribute to the width
                continue
            lo = lo + i
            hi = hi + j
        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
        return self.width
|
---|
| 176 |
|
---|
class Tokenizer:
    # splits a string into tokens.  a token is a single character, or a
    # backslash together with the character following it.  "next" holds
    # the look-ahead token, or None when the string is exhausted.

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # load the token at self.index into self.next and step past it
        start = self.index
        if start >= len(self.string):
            self.next = None
            return
        token = self.string[start]
        if token[0] == "\\":
            # an escape takes the following character along
            try:
                follower = self.string[start + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            token = token + follower
        self.index = start + len(token)
        self.next = token

    def match(self, char, skip=1):
        # return 1 if the look-ahead token equals char, else 0;
        # on a hit the token is consumed unless skip is false
        if char != self.next:
            return 0
        if skip:
            self.__next()
        return 1

    def get(self):
        # return the look-ahead token and advance
        token = self.next
        self.__next()
        return token

    def tell(self):
        # return the current position, for use with seek()
        return self.index, self.next

    def seek(self, index):
        # restore a position obtained from tell()
        self.index, self.next = index
|
---|
| 209 |
|
---|
def isident(char):
    # true for an ASCII letter or an underscore
    if char == "_":
        return True
    return "a" <= char <= "z" or "A" <= char <= "Z"
|
---|
| 212 |
|
---|
def isdigit(char):
    # true for an ASCII decimal digit
    return "0" <= char and char <= "9"
|
---|
| 215 |
|
---|
def isname(name):
    # check that group name is a valid string: a letter or underscore,
    # followed by letters, digits and underscores.
    # an empty name is reported as invalid instead of raising IndexError.
    if not name or not isident(name[0]):
        return False
    for char in name[1:]:
        if not (isident(char) or isdigit(char)):
            return False
    return True
|
---|
| 224 |
|
---|
def _class_escape(source, escape):
    # handle escape code inside character class
    #
    # source -- tokenizer; further digits of a numeric escape are read
    #           from it
    # escape -- the escape token (backslash plus one character)
    #
    # returns an (opcode, argument) item, or raises error ("bogus
    # escape") if the escape is not valid inside a character class.
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # only category sets are returned from this table; other entries
    # fall through to the generic handling below
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                # report the escape as written, including the "x"
                raise error("bogus escape: %s" % repr(escape))
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # 8 and 9 cannot start an octal escape
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
|
---|
| 256 |
|
---|
def _escape(source, escape, state):
    # handle escape code in expression
    #
    # source -- tokenizer; further digits of a numeric escape are read
    #           from it
    # escape -- the escape token (backslash plus one character)
    # state  -- pattern state, used to validate group references
    #
    # returns an (opcode, argument) item or raises error.
    code = CATEGORIES.get(escape) or ESCAPES.get(escape)
    if code:
        return code
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif kind == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                three_octal = (escape[1] in OCTDIGITS and
                               escape[2] in OCTDIGITS and
                               source.next in OCTDIGITS)
                if three_octal:
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group >= state.groups:
                raise ValueError
            if not state.checkgroup(group):
                raise error("cannot refer to open group")
            return GROUPREF, group
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
|
---|
| 300 |
|
---|
def _parse_sub(source, state, nested=1):
    # parse an alternation: a|b|c
    #
    # returns the single alternative itself if there is only one,
    # otherwise a new SubPattern holding the combined result.

    alternatives = [_parse(source, state)]
    while source.match("|"):
        alternatives.append(_parse(source, state))
    if nested and source.next and not source.match(")", 0):
        raise error("pattern not properly closed")

    if len(alternatives) == 1:
        return alternatives[0]

    result = SubPattern(state)

    # check if all items share a common prefix
    while 1:
        if not all(alternatives):
            break  # at least one alternative is empty
        prefix = alternatives[0][0]
        if any(alternative[0] != prefix for alternative in alternatives):
            break
        # all subitems start with a common "prefix".
        # move it out of the branch
        for alternative in alternatives:
            del alternative[0]
        result.append(prefix)

    # check if the branch can be replaced by a character set
    if all(len(alternative) == 1 and alternative[0][0] == LITERAL
           for alternative in alternatives):
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        result.append((IN, [alternative[0] for alternative in alternatives]))
        return result

    result.append((BRANCH, (None, alternatives)))
    return result
|
---|
| 359 |
|
---|
def _parse_sub_cond(source, state, condgroup):
    # parse the body of a conditional group: a "yes" pattern optionally
    # followed by "|" and a "no" pattern.  returns a SubPattern holding
    # a single GROUPREF_EXISTS item.
    item_no = None
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    return SubPattern(state, [(GROUPREF_EXISTS, (condgroup, item_yes, item_no))])
|
---|
| 373 |
|
---|
# token sets consulted by _parse
_PATTERNENDERS = set(["|", ")"])  # tokens that end a subpattern
_ASSERTCHARS = set(["=", "!", "<"])  # after "(?": start of an assertion
_LOOKBEHINDASSERTCHARS = set(["=", "!"])  # required after "(?<"
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))
|
---|
| 378 |
|
---|
def _read_group_name(source, terminator):
    # helper for _parse: collect tokens from source up to the terminator
    # token.  the terminator is consumed but not part of the result.
    name = ""
    while 1:
        char = source.get()
        if char is None:
            raise error("unterminated name")
        if char == terminator:
            return name
        name = name + char

def _parse(source, state):
    # parse a simple pattern
    #
    # source -- tokenizer supplying the pattern tokens
    # state  -- pattern state (flags, group bookkeeping)
    #
    # reads tokens until a "|" or ")" token or the end of the pattern is
    # reached (the terminating token is left in the tokenizer) and
    # returns the parsed items as a SubPattern.  raises error for
    # malformed input and OverflowError for oversized repeat counts.
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            charsetappend = charset.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters
            # ("]" only closes the set once something has been added)
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # trailing "-" is a literal minus sign
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charsetappend(code1)
                        charsetappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("bad character range")
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error("bad character range")
                        charsetappend((RANGE, (lo, hi)))
                    else:
                        raise error("unexpected end of regular expression")
                else:
                    if code1[0] is IN:
                        # unwrap a category escape into a plain set member
                        code1 = code1[1][0]
                    charsetappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset)==1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (_len(charset)==2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                rmin, rmax = 0, 1
            elif this == "*":
                rmin, rmax = 0, MAXREPEAT

            elif this == "+":
                rmin, rmax = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                rmin, rmax = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a repeat count after all: treat "{" as a
                    # literal and rescan what followed it
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    rmin = int(lo)
                    if rmin >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    rmax = int(hi)
                    if rmax >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if rmax < rmin:
                        raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (rmin, rmax, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (rmin, rmax, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = no group body
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = _read_group_name(source, ">")
                        group = 1
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in group name %r" %
                                        name)
                    elif sourcematch("="):
                        # named backreference
                        name = _read_group_name(source, ")")
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in backref group name "
                                        "%r" % name)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = _read_group_name(source, ")")
                    group = 2
                    if not condname:
                        raise error("missing group name")
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error("unknown group name")
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
                            raise error("bad character in group name")
                        if not condgroup:
                            # group 0 is the whole match and cannot be
                            # used as a condition
                            raise error("bad group number")
                else:
                    # flags
                    if not source.next in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup is not None:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # a flag group must be closed right after the flags
                char = sourceget()
                if char is None:
                    raise error("unexpected end of pattern")
                if char != ")":
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern
|
---|
| 674 |
|
---|
def parse(str, flags=0, pattern=None):
    # parse 're' pattern into list of (opcode, argument) tuples
    #
    # str     -- the pattern string
    # flags   -- initial flag bits
    # pattern -- optional state object to use; a fresh Pattern is
    #            created when omitted
    #
    # returns the parsed SubPattern; raises error on malformed input.

    source = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    p = _parse_sub(source, pattern, 0)

    # anything left over means the pattern was not consumed completely
    tail = source.get()
    if tail == ")":
        raise error("unbalanced parenthesis")
    if tail:
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        p.dump()

    verbose_on_entry = flags & SRE_FLAG_VERBOSE
    if not verbose_on_entry and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, p.pattern.flags)

    return p
|
---|
| 702 |
|
---|
def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references
    #
    # returns (groups, literals): groups is a list of (position, group)
    # pairs; literals has one slot per template part, holding the
    # literal text or None at the positions listed in groups.
    tokens = Tokenizer(source)
    nexttoken = tokens.get
    parts = []
    addpart = parts.append

    def addliteral(text):
        # append literal text, merging it into a preceding literal part
        if parts and parts[-1][0] is LITERAL:
            parts[-1] = LITERAL, parts[-1][1] + text
        else:
            addpart((LITERAL, text))

    # build characters of the same string type as the template
    if type(source[:0]) is type(""):
        makechar = chr
    else:
        makechar = unichr

    while 1:
        this = nexttoken()
        if this is None:
            break # end of replacement string
        if not (this and this[0] == "\\"):
            addliteral(this)
            continue
        # group
        c = this[1:2]
        if c == "g":
            name = ""
            if tokens.match("<"):
                while 1:
                    char = nexttoken()
                    if char is None:
                        raise error("unterminated group name")
                    if char == ">":
                        break
                    name = name + char
            if not name:
                raise error("missing group name")
            try:
                index = int(name)
                if index < 0:
                    raise error("negative group number")
            except ValueError:
                # not a number, so it has to be a group name
                if not isname(name):
                    raise error("bad character in group name")
                try:
                    index = pattern.groupindex[name]
                except KeyError:
                    raise IndexError("unknown group name")
            addpart((MARK, index))
        elif c == "0":
            # octal escape with up to two more digits
            if tokens.next in OCTDIGITS:
                this = this + nexttoken()
                if tokens.next in OCTDIGITS:
                    this = this + nexttoken()
            addliteral(makechar(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form a character, anything else is a
            # group number of one or two digits
            isoctal = False
            if tokens.next in DIGITS:
                this = this + nexttoken()
                if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        tokens.next in OCTDIGITS):
                    this = this + nexttoken()
                    isoctal = True
                    addliteral(makechar(int(this[1:], 8) & 0xff))
            if not isoctal:
                addpart((MARK, int(this[1:])))
        else:
            # known character escapes are translated, unknown ones are
            # kept as written
            try:
                this = makechar(ESCAPES[this][1])
            except KeyError:
                pass
            addliteral(this)

    # convert template to groups and literals lists
    groups = []
    literals = [None] * len(parts)
    for position, (code, value) in enumerate(parts):
        if code is MARK:
            groups.append((position, value))
            # literals[position] is already None
        else:
            literals[position] = value
    return groups, literals
|
---|
| 789 |
|
---|
def expand_template(template, match):
    # fill a (groups, literals) template with the groups of a match
    # object and return the joined result.  raises error if a referenced
    # group is unmatched or does not exist.
    fetch = match.group
    empty = match.string[:0]
    groups, literals = template
    filled = literals[:]  # fill in a copy, not the caller's list
    try:
        for position, group in groups:
            text = fetch(group)
            filled[position] = text
            if text is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return empty.join(filled)
|
---|