[391] | 1 | r"""HTTP cookie handling for web clients.
|
---|
[2] | 2 |
|
---|
| 3 | This module has (now fairly distant) origins in Gisle Aas' Perl module
|
---|
| 4 | HTTP::Cookies, from the libwww-perl library.
|
---|
| 5 |
|
---|
| 6 | Docstrings, comments and debug strings in this code refer to the
|
---|
| 7 | attributes of the HTTP cookie system as cookie-attributes, to distinguish
|
---|
| 8 | them clearly from Python attributes.
|
---|
| 9 |
|
---|
| 10 | Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
|
---|
| 11 | distributed with the Python standard library, but are available from
|
---|
| 12 | http://wwwsearch.sf.net/):
|
---|
| 13 |
|
---|
| 14 | CookieJar____
|
---|
| 15 | / \ \
|
---|
| 16 | FileCookieJar \ \
|
---|
| 17 | / | \ \ \
|
---|
| 18 | MozillaCookieJar | LWPCookieJar \ \
|
---|
| 19 | | | \
|
---|
| 20 | | ---MSIEBase | \
|
---|
| 21 | | / | | \
|
---|
| 22 | | / MSIEDBCookieJar BSDDBCookieJar
|
---|
| 23 | |/
|
---|
| 24 | MSIECookieJar
|
---|
| 25 |
|
---|
| 26 | """
|
---|
| 27 |
|
---|
# Names exported by "from cookielib import *".
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError',
           'MozillaCookieJar']
|
---|
| 31 |
|
---|
| 32 | import re, urlparse, copy, time, urllib
|
---|
| 33 | try:
|
---|
| 34 | import threading as _threading
|
---|
| 35 | except ImportError:
|
---|
| 36 | import dummy_threading as _threading
|
---|
| 37 | import httplib # only for the default HTTP port
|
---|
| 38 | from calendar import timegm
|
---|
| 39 |
|
---|
| 40 | debug = False # set to True to enable debugging via the logging module
|
---|
| 41 | logger = None
|
---|
| 42 |
|
---|
| 43 | def _debug(*args):
|
---|
| 44 | if not debug:
|
---|
| 45 | return
|
---|
| 46 | global logger
|
---|
| 47 | if not logger:
|
---|
| 48 | import logging
|
---|
| 49 | logger = logging.getLogger("cookielib")
|
---|
| 50 | return logger.debug(*args)
|
---|
| 51 |
|
---|
| 52 |
|
---|
# Default HTTP port, kept as a string so it compares directly against ports
# sliced out of host strings (see request_port).
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Message used when a filename-based operation is attempted but no filename
# is available.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
|
---|
| 56 |
|
---|
def _warn_unhandled_exception():
    """Emit a warning carrying the currently-handled traceback.

    This module contains a few catch-all except: statements that guard
    against input which is bad in unexpected ways; any exception swallowed
    there is surfaced as a warning instead of disappearing silently.
    """
    import warnings, traceback, StringIO
    buf = StringIO.StringIO()
    traceback.print_exc(None, buf)
    warnings.warn("cookielib bug!\n%s" % buf.getvalue(), stacklevel=2)
|
---|
| 66 |
|
---|
| 67 |
|
---|
| 68 | # Date/time conversion
|
---|
| 69 | # -----------------------------------------------------------------------------
|
---|
| 70 |
|
---|
| 71 | EPOCH_YEAR = 1970
|
---|
| 72 | def _timegm(tt):
|
---|
| 73 | year, month, mday, hour, min, sec = tt[:6]
|
---|
| 74 | if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
|
---|
| 75 | (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
|
---|
| 76 | return timegm(tt)
|
---|
| 77 | else:
|
---|
| 78 | return None
|
---|
| 79 |
|
---|
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month names, for case-insensitive month-name lookup.
MONTHS_LOWER = [month.lower() for month in MONTHS]
|
---|
| 85 |
|
---|
| 86 | def time2isoz(t=None):
|
---|
| 87 | """Return a string representing time in seconds since epoch, t.
|
---|
| 88 |
|
---|
| 89 | If the function is called without an argument, it will use the current
|
---|
| 90 | time.
|
---|
| 91 |
|
---|
| 92 | The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
|
---|
| 93 | representing Universal Time (UTC, aka GMT). An example of this format is:
|
---|
| 94 |
|
---|
| 95 | 1994-11-24 08:49:37Z
|
---|
| 96 |
|
---|
| 97 | """
|
---|
| 98 | if t is None: t = time.time()
|
---|
| 99 | year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
|
---|
| 100 | return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
|
---|
| 101 | year, mon, mday, hour, min, sec)
|
---|
| 102 |
|
---|
def time2netscape(t=None):
    """Format a seconds-since-epoch value in Netscape cookie style.

    When *t* is omitted the current time is used.  The result looks like
    "Wed 09-Feb-1994 22:23:32 GMT".
    """
    if t is None:
        t = time.time()
    tm = time.gmtime(t)
    return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[tm[6]], tm[2], MONTHS[tm[1] - 1], tm[0], tm[3], tm[4], tm[5])
|
---|
| 118 |
|
---|
| 119 |
|
---|
# Timezone names that denote a zero offset from UTC.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric timezone such as "-0800", "+01:00" or "5": optional sign,
# one-or-two-digit hours, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")

def offset_from_tz_string(tz):
    """Return the offset in seconds east of UTC for timezone string *tz*.

    Recognises the UTC aliases in UTC_ZONES plus numeric offsets matched
    by TIMEZONE_RE; any other string yields None.
    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if not m:
        return None
    offset = 3600 * int(m.group(2))
    if m.group(3):
        offset += 60 * int(m.group(3))
    if m.group(1) == '-':
        offset = -offset
    return offset
|
---|
| 136 |
|
---|
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Build a POSIX timestamp from parsed date fields, or return None.

    Shared back end for http2time and iso2time.  *mon* may be a month
    name or a number (as a string); *hr*, *min* and *sec* may be None
    (treated as 0); *tz* is a timezone specifier string or None (treated
    as UTC).  None is returned when any field cannot be interpreted or
    the resulting time is unrepresentable.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year: pick the century that places a two-digit
        # year within 50 years of the current date
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
|
---|
| 189 |
|
---|
# Strict RFC 1123 date, e.g. "Wed, 09 Feb 1994 22:23:32 GMT".
# FIX: the second component below was previously a plain (non-raw) string
# containing "\d", which only worked because Python passes unknown escapes
# through verbatim (a DeprecationWarning in 3.6+, an error in 3.12).  It is
# now a raw string; the compiled regex is unchanged.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Optional leading weekday name (abbreviated or full), with trailing comma
# and whitespace, e.g. "Tuesday, " -- stripped before loose parsing.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
# Loose HTTP / rfc850-style date: day, month and year with flexible
# separators, optional clock, optional timezone.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
|
---|
def http2time(text):
    """Parse an HTTP date string and return an integer seconds-since-epoch.

    None is returned when the format is unrecognised, the time falls
    outside the representable range, or the timezone string is unknown.
    A missing timezone is taken as UTC.

    The timezone may be numeric ("-0800", "+0100") or symbolic ("UTC",
    "GMT", "BST", "EST"); currently only the symbolic names equivalent to
    UTC (zero offset) are understood.

    Loosely accepted formats include:

     Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
     Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
     Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
     09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
     08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
     08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    Leading and trailing whitespace are ignored and the clock may be
    absent.  A two-digit year is resolved to the century that places it
    closest to the current date.
    """
    # Fast path for strictly conforming RFC 1123 dates.
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        mon = MONTHS_LOWER.index(g[1].lower()) + 1
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # Otherwise fall back to messy parsing: strip leading whitespace and
    # any (redundant) weekday name, then apply the loose regexp.
    text = WEEKDAY_RE.sub("", text.lstrip(), 1)

    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # groups: day, month, year, hour, minute, second, timezone string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
|
---|
| 267 |
|
---|
# ISO 8601-ish date, e.g. "1994-02-03 14:15:29 -0100" or "19940203T141529Z".
# FIX: the pattern was previously a plain (non-raw) triple-quoted string
# whose \d, \/ and \. escapes relied on Python passing unknown escapes
# through verbatim (a DeprecationWarning in 3.6+, an error in 3.12).  It is
# now a raw string; the compiled regex is unchanged.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
|
---|
def iso2time(text):
    """Like http2time, but for ISO 8601-style date strings.

    Accepted forms include:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    m = ISO_DATE_RE.search(text.lstrip())
    if m is None:
        return None  # bad format
    # The final group is an extra trailing chunk of the timezone, which is
    # discarded here (XXX as in the original: unclear whether ignoring it
    # is the right thing to do).
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
|
---|
| 312 |
|
---|
| 313 |
|
---|
| 314 | # Header parsing
|
---|
| 315 | # -----------------------------------------------------------------------------
|
---|
| 316 |
|
---|
def unmatched(match):
    """Return the part of match.string lying outside the span of *match*."""
    start, end = match.span(0)
    subject = match.string
    return subject[:start] + subject[end:]
|
---|
| 321 |
|
---|
# A token at the start of the remaining text: everything up to the next
# "=", ";", "," or whitespace.
HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
# "=" followed by a double-quoted value, honouring backslash escapes.
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# "=" followed by an unquoted value (runs up to ";", "," or whitespace).
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash escape inside a quoted value; "\X" is replaced by "X".
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
|
---|
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A single string would be iterated character-by-character: reject it.
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text  # kept only for the "skip junk" assertion message
        pairs = []
        # Repeatedly chew tokens/values off the front of `text` until it
        # is exhausted; `unmatched` returns what each regexp did not eat.
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # collapse backslash escapes: \X -> X
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
|
---|
| 410 |
|
---|
# Characters that must be backslash-escaped inside a quoted attribute value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")

def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single
    header value.  A value of None emits the bare key; any value that is
    not purely word characters is backslash-escaped and double-quoted.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        words = []
        for key, val in pairs:
            if val is None:
                words.append(key)
                continue
            if not re.search(r"^\w+$", val):
                # escape " and \, then wrap in double quotes
                val = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", val)
                val = '"%s"' % val
            words.append("%s=%s" % (key, val))
        if words:
            headers.append("; ".join(words))
    return ", ".join(headers)
|
---|
| 436 |
|
---|
[391] | 437 | def _strip_quotes(text):
|
---|
| 438 | if text.startswith('"'):
|
---|
| 439 | text = text[1:]
|
---|
| 440 | if text.endswith('"'):
|
---|
| 441 | text = text[:-1]
|
---|
| 442 | return text
|
---|
| 443 |
|
---|
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                # lone token, e.g. "secure"
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # ii == 0 is the cookie's NAME=VALUE pair, which must not be
            # case-normalised or otherwise interpreted.
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    v = _strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch
                    v = http2time(_strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                # no Version attribute seen: treat as a version-0
                # (Netscape) cookie
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
|
---|
| 494 |
|
---|
| 495 |
|
---|
# Matches a trailing ".<digits>", i.e. the final component of a dotted-quad
# IP address.  (XXX IPv6 is not considered here; check the other IPV4_RE
# uses too if this ever changes.)
IPV4_RE = re.compile(r"\.\d+$")

def is_HDN(text):
    """Return True if *text* looks like a host domain name.

    Any non-empty string that does not end in a numeric component and
    neither starts nor ends with "." qualifies.  (XXX this may well be
    wrong -- it is unclear which RFC, if any, defines HDN for the
    purposes of RFC 2965.)
    """
    if not text:
        return False
    if IPV4_RE.search(text):
        return False
    if text.startswith(".") or text.endswith("."):
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B (RFC 2965, sect. 1).

    A and B may be host domain names or IP addresses; comparison is
    case-insensitive.  A matches B when the strings compare equal, or
    when B has the form .B' with B' an HDN and A is an HDN of the form
    NB for some non-empty N -- so x.y.com domain-matches .Y.com but not
    Y.com.  The operation is not commutative: a.b.c.com domain-matches
    .c.com, but not the reverse.
    """
    # For IP addresses only the direct string comparison can ever match.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    i = A.rfind(B)
    if i <= 0:
        # A does not have the form NB, or N is the empty string
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
|
---|
| 550 |
|
---|
def liberal_is_HDN(text):
    """Return True if *text* is sort-of like a host domain name.

    Used when matching user-supplied accept/block domain lists, where
    anything that is not an IP address counts as a domain name.
    """
    return not IPV4_RE.search(text)
|
---|
| 560 |
|
---|
def user_domain_match(A, B):
    """Loose, case-insensitive domain match for blocking/accepting lists.

    A and B may be host domain names or IP addresses.  IP addresses only
    match on exact equality.  A domain B with a leading dot matches any A
    that ends in B; otherwise A and B must be equal.
    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # at least one side is an IP address: only exact equality counts
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
|
---|
| 580 |
|
---|
# Trailing ":<port>" on a host string.
cut_port_re = re.compile(r":\d+$")

def request_host(request):
    """Return the request-host (RFC 2965) for *request*.

    Deviates from the RFC in that the result is lowercased, which makes
    later comparisons convenient.  Falls back to the Host header when
    the URL itself carries no host.
    """
    host = urlparse.urlparse(request.get_full_url())[1]
    if not host:
        host = request.get_header("Host", "")
    # strip any :port suffix
    host = cut_port_re.sub("", host, 1)
    return host.lower()
|
---|
| 597 |
|
---|
def eff_request_host(request):
    """Return (request-host, effective request-host name), both lowercased.

    As defined by RFC 2965: a host containing no dots that is not an IP
    address gets ".local" appended to form the effective request-host
    name.
    """
    req_host = request_host(request)
    erhn = req_host
    if "." not in req_host and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn
|
---|
| 608 |
|
---|
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965."""
    parts = urlparse.urlsplit(request.get_full_url())
    path = escape_path(parts.path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
|
---|
[2] | 618 |
|
---|
def request_port(request):
    """Return the request's port as a string.

    Yields DEFAULT_HTTP_PORT when the host carries no explicit ":port",
    and None (after a debug message) when the port is non-numeric.
    """
    host = request.get_host()
    sep = host.find(':')
    if sep < 0:
        return DEFAULT_HTTP_PORT
    port = host[sep + 1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
|
---|
| 632 |
|
---|
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"

# A single %-escape such as "%2f"; used to normalise escapes to upper case.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")

def uppercase_escaped_char(match):
    """re.sub callback: upper-case the hex digits of one %-escape."""
    return "%%%s" % match.group(1).upper()

def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # The character encoding used to build existing %-escapes is
    # unknowable; when escaping invalid path characters ourselves we pick
    # UTF-8, as recommended by the HTML 4.0 specification
    # (http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1; see
    # also draft-fielding-uri-rfc2396bis-03, draft-duerst-iri-05, RFC 2718).
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    path = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
|
---|
| 654 |
|
---|
def reach(h):
    """Return the reach of host name *h*, per RFC 2965, section 1.

    If h is an HDN of the form A.B, where A has no interior dots and B
    either has an embedded dot or is the string "local", the reach is
    ".B"; otherwise the reach is h itself.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    dot = h.find(".")
    if dot >= 0:
        b = h[dot + 1:]
        if is_HDN(h) and (b.find(".") >= 0 or b == "local"):
            return "." + b
    return h
|
---|
| 689 |
|
---|
def is_third_party(request):
    """Return True if *request* is to a third-party host.

    RFC 2965, section 3.3.6: an unverifiable transaction is to a
    third-party host if its request-host does not domain-match the reach
    of the request-host of the origin transaction.
    """
    req_host = request_host(request)
    return not domain_match(req_host, reach(request.get_origin_req_host()))
|
---|
| 705 |
|
---|
| 706 |
|
---|
| 707 | class Cookie:
|
---|
| 708 | """HTTP Cookie.
|
---|
| 709 |
|
---|
| 710 | This class represents both Netscape and RFC 2965 cookies.
|
---|
| 711 |
|
---|
| 712 | This is deliberately a very simple class. It just holds attributes. It's
|
---|
| 713 | possible to construct Cookie instances that don't comply with the cookie
|
---|
| 714 | standards. CookieJar.make_cookies is the factory function for Cookie
|
---|
| 715 | objects -- it deals with cookie parsing, supplying defaults, and
|
---|
| 716 | normalising to the representation used in this class. CookiePolicy is
|
---|
| 717 | responsible for checking them to see whether they should be accepted from
|
---|
| 718 | and returned to the server.
|
---|
| 719 |
|
---|
| 720 | Note that the port may be present in the headers, but unspecified ("Port"
|
---|
| 721 | rather than"Port=80", for example); if this is the case, port is None.
|
---|
| 722 |
|
---|
| 723 | """
|
---|
| 724 |
|
---|
| 725 | def __init__(self, version, name, value,
|
---|
| 726 | port, port_specified,
|
---|
| 727 | domain, domain_specified, domain_initial_dot,
|
---|
| 728 | path, path_specified,
|
---|
| 729 | secure,
|
---|
| 730 | expires,
|
---|
| 731 | discard,
|
---|
| 732 | comment,
|
---|
| 733 | comment_url,
|
---|
| 734 | rest,
|
---|
| 735 | rfc2109=False,
|
---|
| 736 | ):
|
---|
| 737 |
|
---|
| 738 | if version is not None: version = int(version)
|
---|
| 739 | if expires is not None: expires = int(expires)
|
---|
| 740 | if port is None and port_specified is True:
|
---|
| 741 | raise ValueError("if port is None, port_specified must be false")
|
---|
| 742 |
|
---|
| 743 | self.version = version
|
---|
| 744 | self.name = name
|
---|
| 745 | self.value = value
|
---|
| 746 | self.port = port
|
---|
| 747 | self.port_specified = port_specified
|
---|
| 748 | # normalise case, as per RFC 2965 section 3.3.3
|
---|
| 749 | self.domain = domain.lower()
|
---|
| 750 | self.domain_specified = domain_specified
|
---|
| 751 | # Sigh. We need to know whether the domain given in the
|
---|
| 752 | # cookie-attribute had an initial dot, in order to follow RFC 2965
|
---|
| 753 | # (as clarified in draft errata). Needed for the returned $Domain
|
---|
| 754 | # value.
|
---|
| 755 | self.domain_initial_dot = domain_initial_dot
|
---|
| 756 | self.path = path
|
---|
| 757 | self.path_specified = path_specified
|
---|
| 758 | self.secure = secure
|
---|
| 759 | self.expires = expires
|
---|
| 760 | self.discard = discard
|
---|
| 761 | self.comment = comment
|
---|
| 762 | self.comment_url = comment_url
|
---|
| 763 | self.rfc2109 = rfc2109
|
---|
| 764 |
|
---|
| 765 | self._rest = copy.copy(rest)
|
---|
| 766 |
|
---|
    def has_nonstandard_attr(self, name):
        """Return True if the named non-standard cookie-attribute is present."""
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        """Return the named non-standard cookie-attribute, or *default*."""
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        """Set the named non-standard cookie-attribute to *value*."""
        self._rest[name] = value
| 773 |
|
---|
| 774 | def is_expired(self, now=None):
|
---|
| 775 | if now is None: now = time.time()
|
---|
| 776 | if (self.expires is not None) and (self.expires <= now):
|
---|
| 777 | return True
|
---|
| 778 | return False
|
---|
| 779 |
|
---|
| 780 | def __str__(self):
|
---|
| 781 | if self.port is None: p = ""
|
---|
| 782 | else: p = ":"+self.port
|
---|
| 783 | limit = self.domain + p + self.path
|
---|
| 784 | if self.value is not None:
|
---|
| 785 | namevalue = "%s=%s" % (self.name, self.value)
|
---|
| 786 | else:
|
---|
| 787 | namevalue = self.name
|
---|
| 788 | return "<Cookie %s for %s>" % (namevalue, limit)
|
---|
| 789 |
|
---|
| 790 | def __repr__(self):
|
---|
| 791 | args = []
|
---|
| 792 | for name in ("version", "name", "value",
|
---|
| 793 | "port", "port_specified",
|
---|
| 794 | "domain", "domain_specified", "domain_initial_dot",
|
---|
| 795 | "path", "path_specified",
|
---|
| 796 | "secure", "expires", "discard", "comment", "comment_url",
|
---|
| 797 | ):
|
---|
| 798 | attr = getattr(self, name)
|
---|
| 799 | args.append("%s=%s" % (name, repr(attr)))
|
---|
| 800 | args.append("rest=%s" % repr(self._rest))
|
---|
| 801 | args.append("rfc2109=%s" % repr(self.rfc2109))
|
---|
| 802 | return "Cookie(%s)" % ", ".join(args)
|
---|
| 803 |
|
---|
| 804 |
|
---|
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Abstract: subclasses must override.
        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Abstract: subclasses must override.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        Liberal default (accepts everything); exists so subclasses can prune
        whole domains cheaply before per-cookie checks.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        Liberal default (accepts everything).
        """
        return True
| 836 |
|
---|
| 837 |
|
---|
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies."""

    # bit flags combinable into the strict_ns_domain constructor argument
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        # stored as tuples so later mutation of the caller's sequences
        # cannot change the policy behind our back
        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        # true if *domain* matches any entry in the user block-list
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        # an allow-list of None means "allow everything"
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # delegate to one set_ok_<aspect> method per aspect; all must pass
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug("   Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        # reject third-party cookies when the relevant strictness flag is on
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug("   path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    # refuse registrations directly under well-known
                    # country-code second-level domains (e.g. .co.uk)
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            # Port attribute may list several ports; request port must be one
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug("   request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        # delegate to one return_ok_<aspect> method per aspect; all must pass
        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        # secure cookies are only sent over https
        if cookie.secure and request.get_type() != "https":
            _debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        # self._now is set by CookieJar before the checks run
        if cookie.is_expired(self._now):
            _debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug("   request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug("   cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(domain):
            _debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        # prepend a dot so the endswith tests below treat an exact host
        # match and a parent-domain match uniformly
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        if not (req_host.endswith(domain) or erhn.endswith(domain)):
            #_debug("   request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        if not req_path.startswith(path):
            _debug("  %s does not path-match %s", req_path, path)
            return False
        return True
| 1173 |
|
---|
| 1174 |
|
---|
def vals_sorted_by_key(adict):
    """Return the values of *adict* as a list, ordered by sorted key.

    Rewritten to use the sorted() built-in instead of fetching .keys()
    and sorting the list in place: same result on Python 2, but also
    correct for mappings whose .keys() is not a list (e.g. Python 3
    dict views, which have no .sort() method), and always returns a
    real list (the old map() form does not on Python 3).
    """
    return [adict[key] for key in sorted(adict)]
| 1179 |
|
---|
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    for obj in vals_sorted_by_key(mapping):
        try:
            obj.items
        except AttributeError:
            # a leaf value: yield it directly
            yield obj
        else:
            # a nested mapping (has .items): recurse into it
            for subobj in deepvalues(obj):
                yield subobj
| 1195 |
|
---|
| 1196 |
|
---|
| 1197 | # Used as second parameter to dict.get() method, to distinguish absent
|
---|
| 1198 | # dict key from one with a None value.
|
---|
class Absent:
    """Sentinel type: distinguishes an absent dict key from a None value."""
    pass
| 1200 |
|
---|
| 1201 | class CookieJar:
|
---|
| 1202 | """Collection of HTTP cookies.
|
---|
| 1203 |
|
---|
| 1204 | You may not need to know about this class: try
|
---|
| 1205 | urllib2.build_opener(HTTPCookieProcessor).open(url).
|
---|
| 1206 |
|
---|
| 1207 | """
|
---|
| 1208 |
|
---|
    # value characters that force quoting of an RFC 2965 cookie value
    # (see _cookie_attrs)
    non_word_re = re.compile(r"\W")
    # characters that get backslash-escaped inside a quoted value
    quote_re = re.compile(r"([\"\\])")
    # NOTE(review): the three patterns below are not used within this part
    # of the module; presumably consumed by subclasses/helpers -- confirm.
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    # first-line signature of an LWP cookies file (captures format version);
    # NOTE(review): not referenced in this part of the module -- confirm use.
    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
| 1216 |
|
---|
| 1217 | def __init__(self, policy=None):
|
---|
| 1218 | if policy is None:
|
---|
| 1219 | policy = DefaultCookiePolicy()
|
---|
| 1220 | self._policy = policy
|
---|
| 1221 |
|
---|
| 1222 | self._cookies_lock = _threading.RLock()
|
---|
| 1223 | self._cookies = {}
|
---|
| 1224 |
|
---|
    def set_policy(self, policy):
        """Replace the CookiePolicy instance consulted by this jar."""
        self._policy = policy
| 1227 |
|
---|
| 1228 | def _cookies_for_domain(self, domain, request):
|
---|
| 1229 | cookies = []
|
---|
| 1230 | if not self._policy.domain_return_ok(domain, request):
|
---|
| 1231 | return []
|
---|
| 1232 | _debug("Checking %s for cookies to return", domain)
|
---|
| 1233 | cookies_by_path = self._cookies[domain]
|
---|
| 1234 | for path in cookies_by_path.keys():
|
---|
| 1235 | if not self._policy.path_return_ok(path, request):
|
---|
| 1236 | continue
|
---|
| 1237 | cookies_by_name = cookies_by_path[path]
|
---|
| 1238 | for cookie in cookies_by_name.values():
|
---|
| 1239 | if not self._policy.return_ok(cookie, request):
|
---|
| 1240 | _debug(" not returning cookie")
|
---|
| 1241 | continue
|
---|
| 1242 | _debug(" it's a match")
|
---|
| 1243 | cookies.append(cookie)
|
---|
| 1244 | return cookies
|
---|
| 1245 |
|
---|
| 1246 | def _cookies_for_request(self, request):
|
---|
| 1247 | """Return a list of cookies to be returned to server."""
|
---|
| 1248 | cookies = []
|
---|
| 1249 | for domain in self._cookies.keys():
|
---|
| 1250 | cookies.extend(self._cookies_for_domain(domain, request))
|
---|
| 1251 | return cookies
|
---|
| 1252 |
|
---|
    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note: sorts *cookies* in place (longest path first).
        """
        # add cookies in order of most specific (ie. longest) path first
        cookies.sort(key=lambda arg: len(arg.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                # only the first cookie's version is advertised
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                # backslash-escape quotes/backslashes in the value
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                # RFC 2965 cookies also echo $Path/$Domain/$Port as received
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    # strip the dot we added if the server's Domain had none
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs
| 1311 |
|
---|
    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        Also clears expired cookies from the jar as a side effect.
        """
        _debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:

            # freeze "now" for this request so all expiry checks agree
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                # don't clobber a Cookie header the caller already set
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header("Cookie2", '$Version="1"')
                        break

        finally:
            self._cookies_lock.release()

        self.clear_expired_cookies()
| 1344 |
|
---|
    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Malformed cookies (missing required attribute values, non-numeric
        max-age) are skipped rather than raising.
        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            # the first pair is the cookie itself; the rest are attributes
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        _debug("   missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        _debug("   missing or invalid value for expires "
                               "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except ValueError:
                        _debug("   missing or invalid (non-numeric) value for "
                               "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age is a
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        _debug("   missing value for %s attribute" % k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    # unknown cookie-attribute: kept with original case
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples
| 1441 |
|
---|
    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, or return None.

        Returns None when the version value is invalid, and also when the
        expiry time is already in the past -- which is treated as a server
        request to delete any matching stored cookie.
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        # Absent distinguishes "attribute not given" from "given as None"
        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            # no Path attribute: default from the request path
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                # strip any whitespace from the port list
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            # session cookie: no expiry, discarded at end of session
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)
| 1533 |
|
---|
| 1534 | def _cookies_from_attrs_set(self, attrs_set, request):
|
---|
| 1535 | cookie_tuples = self._normalized_cookie_tuples(attrs_set)
|
---|
| 1536 |
|
---|
| 1537 | cookies = []
|
---|
| 1538 | for tup in cookie_tuples:
|
---|
| 1539 | cookie = self._cookie_from_cookie_tuple(tup, request)
|
---|
| 1540 | if cookie: cookies.append(cookie)
|
---|
| 1541 | return cookies
|
---|
| 1542 |
|
---|
| 1543 | def _process_rfc2109_cookies(self, cookies):
|
---|
| 1544 | rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
|
---|
| 1545 | if rfc2109_as_ns is None:
|
---|
| 1546 | rfc2109_as_ns = not self._policy.rfc2965
|
---|
| 1547 | for cookie in cookies:
|
---|
| 1548 | if cookie.version == 1:
|
---|
| 1549 | cookie.rfc2109 = True
|
---|
| 1550 | if rfc2109_as_ns:
|
---|
| 1551 | # treat 2109 cookies as Netscape cookies rather than
|
---|
| 1552 | # as RFC2965 cookies
|
---|
| 1553 | cookie.version = 0
|
---|
| 1554 |
|
---|
    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object.

        response must provide an .info() method whose result supports
        .getheaders(name) (as with httplib/urllib2 responses); request is
        passed through unchanged to the cookie-construction helpers.
        """
        # get cookie-attributes for RFC 2965 and Netscape protocols
        # (Set-Cookie2 carries RFC 2965 cookies, Set-Cookie carries
        # Netscape / RFC 2109 cookies)
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        # bail out early if the headers present cannot produce any cookie
        # under the currently-enabled protocols
        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # parsing a malformed header must never break cookie extraction as
        # a whole, hence the broad Exception handlers below
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                # index the RFC 2965 cookies by (domain, path, name) so
                # duplicate Netscape cookies can be discarded in one pass
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                def no_matching_rfc2965(ns_cookie, lookup=lookup):
                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
                    return key not in lookup
                ns_cookies = filter(no_matching_rfc2965, ns_cookies)

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies
|
---|
| 1608 |
|
---|
| 1609 | def set_cookie_if_ok(self, cookie, request):
|
---|
| 1610 | """Set a cookie if policy says it's OK to do so."""
|
---|
| 1611 | self._cookies_lock.acquire()
|
---|
| 1612 | try:
|
---|
| 1613 | self._policy._now = self._now = int(time.time())
|
---|
| 1614 |
|
---|
| 1615 | if self._policy.set_ok(cookie, request):
|
---|
| 1616 | self.set_cookie(cookie)
|
---|
| 1617 |
|
---|
| 1618 |
|
---|
| 1619 | finally:
|
---|
| 1620 | self._cookies_lock.release()
|
---|
| 1621 |
|
---|
| 1622 | def set_cookie(self, cookie):
|
---|
| 1623 | """Set a cookie, without checking whether or not it should be set."""
|
---|
| 1624 | c = self._cookies
|
---|
| 1625 | self._cookies_lock.acquire()
|
---|
| 1626 | try:
|
---|
| 1627 | if cookie.domain not in c: c[cookie.domain] = {}
|
---|
| 1628 | c2 = c[cookie.domain]
|
---|
| 1629 | if cookie.path not in c2: c2[cookie.path] = {}
|
---|
| 1630 | c3 = c2[cookie.path]
|
---|
| 1631 | c3[cookie.name] = cookie
|
---|
| 1632 | finally:
|
---|
| 1633 | self._cookies_lock.release()
|
---|
| 1634 |
|
---|
| 1635 | def extract_cookies(self, response, request):
|
---|
| 1636 | """Extract cookies from response, where allowable given the request."""
|
---|
| 1637 | _debug("extract_cookies: %s", response.info())
|
---|
| 1638 | self._cookies_lock.acquire()
|
---|
| 1639 | try:
|
---|
| 1640 | self._policy._now = self._now = int(time.time())
|
---|
| 1641 |
|
---|
| 1642 | for cookie in self.make_cookies(response, request):
|
---|
| 1643 | if self._policy.set_ok(cookie, request):
|
---|
| 1644 | _debug(" setting cookie: %s", cookie)
|
---|
| 1645 | self.set_cookie(cookie)
|
---|
| 1646 | finally:
|
---|
| 1647 | self._cookies_lock.release()
|
---|
| 1648 |
|
---|
| 1649 | def clear(self, domain=None, path=None, name=None):
|
---|
| 1650 | """Clear some cookies.
|
---|
| 1651 |
|
---|
| 1652 | Invoking this method without arguments will clear all cookies. If
|
---|
| 1653 | given a single argument, only cookies belonging to that domain will be
|
---|
| 1654 | removed. If given two arguments, cookies belonging to the specified
|
---|
| 1655 | path within that domain are removed. If given three arguments, then
|
---|
| 1656 | the cookie with the specified name, path and domain is removed.
|
---|
| 1657 |
|
---|
| 1658 | Raises KeyError if no matching cookie exists.
|
---|
| 1659 |
|
---|
| 1660 | """
|
---|
| 1661 | if name is not None:
|
---|
| 1662 | if (domain is None) or (path is None):
|
---|
| 1663 | raise ValueError(
|
---|
| 1664 | "domain and path must be given to remove a cookie by name")
|
---|
| 1665 | del self._cookies[domain][path][name]
|
---|
| 1666 | elif path is not None:
|
---|
| 1667 | if domain is None:
|
---|
| 1668 | raise ValueError(
|
---|
| 1669 | "domain must be given to remove cookies by path")
|
---|
| 1670 | del self._cookies[domain][path]
|
---|
| 1671 | elif domain is not None:
|
---|
| 1672 | del self._cookies[domain]
|
---|
| 1673 | else:
|
---|
| 1674 | self._cookies = {}
|
---|
| 1675 |
|
---|
| 1676 | def clear_session_cookies(self):
|
---|
| 1677 | """Discard all session cookies.
|
---|
| 1678 |
|
---|
| 1679 | Note that the .save() method won't save session cookies anyway, unless
|
---|
| 1680 | you ask otherwise by passing a true ignore_discard argument.
|
---|
| 1681 |
|
---|
| 1682 | """
|
---|
| 1683 | self._cookies_lock.acquire()
|
---|
| 1684 | try:
|
---|
| 1685 | for cookie in self:
|
---|
| 1686 | if cookie.discard:
|
---|
| 1687 | self.clear(cookie.domain, cookie.path, cookie.name)
|
---|
| 1688 | finally:
|
---|
| 1689 | self._cookies_lock.release()
|
---|
| 1690 |
|
---|
| 1691 | def clear_expired_cookies(self):
|
---|
| 1692 | """Discard all expired cookies.
|
---|
| 1693 |
|
---|
| 1694 | You probably don't need to call this method: expired cookies are never
|
---|
| 1695 | sent back to the server (provided you're using DefaultCookiePolicy),
|
---|
| 1696 | this method is called by CookieJar itself every so often, and the
|
---|
| 1697 | .save() method won't save expired cookies anyway (unless you ask
|
---|
| 1698 | otherwise by passing a true ignore_expires argument).
|
---|
| 1699 |
|
---|
| 1700 | """
|
---|
| 1701 | self._cookies_lock.acquire()
|
---|
| 1702 | try:
|
---|
| 1703 | now = time.time()
|
---|
| 1704 | for cookie in self:
|
---|
| 1705 | if cookie.is_expired(now):
|
---|
| 1706 | self.clear(cookie.domain, cookie.path, cookie.name)
|
---|
| 1707 | finally:
|
---|
| 1708 | self._cookies_lock.release()
|
---|
| 1709 |
|
---|
    def __iter__(self):
        """Iterate over all Cookie objects in the jar.

        deepvalues walks the nested domain -> path -> name mapping and
        yields the leaf values, i.e. the stored Cookie objects.
        """
        return deepvalues(self._cookies)
|
---|
| 1712 |
|
---|
| 1713 | def __len__(self):
|
---|
| 1714 | """Return number of contained cookies."""
|
---|
| 1715 | i = 0
|
---|
| 1716 | for cookie in self: i = i + 1
|
---|
| 1717 | return i
|
---|
| 1718 |
|
---|
| 1719 | def __repr__(self):
|
---|
| 1720 | r = []
|
---|
| 1721 | for cookie in self: r.append(repr(cookie))
|
---|
| 1722 | return "<%s[%s]>" % (self.__class__, ", ".join(r))
|
---|
| 1723 |
|
---|
| 1724 | def __str__(self):
|
---|
| 1725 | r = []
|
---|
| 1726 | for cookie in self: r.append(str(cookie))
|
---|
| 1727 | return "<%s[%s]>" % (self.__class__, ", ".join(r))
|
---|
| 1728 |
|
---|
| 1729 |
|
---|
| 1730 | # derives from IOError for backwards-compatibility with Python 2.4.0
|
---|
class LoadError(IOError):
    """Raised when a saved cookie file cannot be parsed."""
|
---|
| 1732 |
|
---|
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except TypeError:
                # concatenating with "" is the string-likeness probe; only
                # TypeError can result from it, so don't use a bare except
                # (which would also swallow KeyboardInterrupt/SystemExit)
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.  Subclasses must implement this."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Falls back to self.filename when no filename is given; raises
        ValueError if neither is available.  The actual parsing is done
        by the subclass's _really_load().
        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            # snapshot current state so a failed load can be rolled back
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                # restore the snapshot so a failed revert is a no-op
                self._cookies = old_state
                raise
        finally:
            self._cookies_lock.release()
|
---|
| 1792 |
|
---|
| 1793 | from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
|
---|
| 1794 | from _MozillaCookieJar import MozillaCookieJar
|
---|