1 | """HTTP cookie handling for web clients.
|
---|
2 |
|
---|
3 | This module has (now fairly distant) origins in Gisle Aas' Perl module
|
---|
4 | HTTP::Cookies, from the libwww-perl library.
|
---|
5 |
|
---|
6 | Docstrings, comments and debug strings in this code refer to the
|
---|
7 | attributes of the HTTP cookie system as cookie-attributes, to distinguish
|
---|
8 | them clearly from Python attributes.
|
---|
9 |
|
---|
10 | Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
|
---|
11 | distributed with the Python standard library, but are available from
|
---|
12 | http://wwwsearch.sf.net/):
|
---|
13 |
|
---|
14 | CookieJar____
|
---|
15 | / \ \
|
---|
16 | FileCookieJar \ \
|
---|
17 | / | \ \ \
|
---|
18 | MozillaCookieJar | LWPCookieJar \ \
|
---|
19 | | | \
|
---|
20 | | ---MSIEBase | \
|
---|
21 | | / | | \
|
---|
22 | | / MSIEDBCookieJar BSDDBCookieJar
|
---|
23 | |/
|
---|
24 | MSIECookieJar
|
---|
25 |
|
---|
26 | """
|
---|
27 |
|
---|
28 | __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
|
---|
29 | 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
|
---|
30 |
|
---|
31 | import re, urlparse, copy, time, urllib
|
---|
32 | try:
|
---|
33 | import threading as _threading
|
---|
34 | except ImportError:
|
---|
35 | import dummy_threading as _threading
|
---|
36 | import httplib # only for the default HTTP port
|
---|
37 | from calendar import timegm
|
---|
38 |
|
---|
39 | debug = False # set to True to enable debugging via the logging module
|
---|
40 | logger = None
|
---|
41 |
|
---|
42 | def _debug(*args):
|
---|
43 | if not debug:
|
---|
44 | return
|
---|
45 | global logger
|
---|
46 | if not logger:
|
---|
47 | import logging
|
---|
48 | logger = logging.getLogger("cookielib")
|
---|
49 | return logger.debug(*args)
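
# Illustrative sketch (not part of the original module): debugging is off by
# default; a client that wants the _debug() calls above to reach the
# "cookielib" logger can flip the module flag and configure logging, e.g.:
#
#   import logging, cookielib
#   logging.basicConfig(level=logging.DEBUG)
#   cookielib.debug = True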


DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")

def _warn_unhandled_exception():
    # There are a few catch-all except: statements in this module, for
    # catching input that's bad in unexpected ways.  Warn if any
    # exceptions are caught there.
    import warnings, traceback, StringIO
    f = StringIO.StringIO()
    traceback.print_exc(None, f)
    msg = f.getvalue()
    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)


# Date/time conversion
# -----------------------------------------------------------------------------

EPOCH_YEAR = 1970
def _timegm(tt):
    year, month, mday, hour, min, sec = tt[:6]
    if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
        (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
        return timegm(tt)
    else:
        return None

DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
MONTHS_LOWER = []
for month in MONTHS: MONTHS_LOWER.append(month.lower())

def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, min, sec)

def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None: t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
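
# Illustrative sketch (not part of the original module): both helpers format
# the same UTC instant; epoch zero is a convenient fixed input.  Note that the
# format string above omits the comma after the weekday that the docstring
# shows.
#
#   time2isoz(0)       ->  "1970-01-01 00:00:00Z"
#   time2netscape(0)   ->  "Thu 01-Jan-1970 00:00:00 GMT"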


UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    offset = None
    if tz in UTC_ZONES:
        offset = 0
    else:
        m = TIMEZONE_RE.search(tz)
        if m:
            offset = 3600 * int(m.group(2))
            if m.group(3):
                offset = offset + 60 * int(m.group(3))
            if m.group(1) == '-':
                offset = -offset
    return offset
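
# Illustrative sketch (not part of the original module): expected results for
# a few representative timezone strings (offsets are in seconds).
#
#   offset_from_tz_string("GMT")     ->  0
#   offset_from_tz_string("+0100")   ->  3600
#   offset_from_tz_string("-0800")   ->  -28800
#   offset_from_tz_string("PST")     ->  None   (unknown named zone)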

def _str2time(day, mon, yr, hr, min, sec, tz):
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t

STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        mon = MONTHS_LOWER.index(g[1].lower()) + 1
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is not None:
        day, mon, yr, hr, min, sec, tz = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)
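
# Illustrative sketch (not part of the original module): a round trip through
# time2isoz() shows what was parsed without pinning down the raw epoch number.
#
#   time2isoz(http2time("09 Feb 1994 22:23:32 GMT"))
#       ->  "1994-02-09 22:23:32Z"
#   http2time("not a date")   ->  None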

ISO_DATE_RE = re.compile(
    """^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # clean up
    text = text.lstrip()

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = ISO_DATE_RE.search(text)
    if m is not None:
        # XXX there's an extra bit of the timezone I'm ignoring here: is
        #   this the right thing to do?
        yr, mon, day, hr, min, sec, tz, _ = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)
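
# Illustrative sketch (not part of the original module): the compact and the
# hyphenated ISO 8601 spellings of the same instant parse to the same value.
#
#   iso2time("1994-02-03T14:15:29Z") == iso2time("19940203T141529Z")   ->  True
#   time2isoz(iso2time("1994-02-03 14:15:29"))   ->  "1994-02-03 14:15:29Z"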


# Header parsing
# -----------------------------------------------------------------------------

def unmatched(match):
    """Return unmatched part of re.Match object."""
    start, end = match.span(0)
    return match.string[:start]+match.string[end:]

HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers       = #header
      header        = (token | parameter) *( [";"] (token | parameter))

      token         = 1*<any CHAR except CTLs or separators>
      separators    = "(" | ")" | "<" | ">" | "@"
                    | "," | ";" | ":" | "\" | <">
                    | "/" | "[" | "]" | "?" | "="
                    | "{" | "}" | SP | HT

      quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext        = <any TEXT except <">>
      quoted-pair   = "\" CHAR

      parameter     = attribute "=" value
      attribute     = token
      value         = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result

HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                k = "%s=%s" % (k, v)
            attr.append(k)
        if attr: headers.append("; ".join(attr))
    return ", ".join(headers)

def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
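
# Illustrative sketch (not part of the original module): a Netscape-style
# Set-Cookie value with no "version" attribute gets an implicit version of "0".
#
#   parse_ns_headers(['foo=bar; path=/; domain=.example.com'])
#       ->  [[('foo', 'bar'), ('path', '/'), ('domain', '.example.com'),
#             ('version', '0')]]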


IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    i = A.rfind(B)
    if i == -1 or i == 0:
        # A does not have form NB, or N is the empty string
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
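
# Illustrative sketch (not part of the original module): domain-matching is
# one-directional and requires the leading dot on B.
#
#   domain_match("www.acme.com", ".acme.com")   ->  True
#   domain_match("acme.com", ".acme.com")       ->  False
#   domain_match(".acme.com", "www.acme.com")   ->  False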

def liberal_is_HDN(text):
545 | """Return True if text is a sort-of-like a host domain name.
|
---|

    For accepting/blocking domains.

    """
    if IPV4_RE.search(text):
        return False
    return True

def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        if A == B:
            # equal IP addresses
            return True
        return False
    initial_dot = B.startswith(".")
    if initial_dot and A.endswith(B):
        return True
    if not initial_dot and A == B:
        return True
    return False

cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]
    if host == "":
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()

def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    erhn = req_host = request_host(request)
    if req_host.find(".") == -1 and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn

def request_path(request):
    """request-URI, as defined by RFC 2965."""
    url = request.get_full_url()
    #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
    #req_path = escape_path("".join(urlparse.urlparse(url)[2:]))
    path, parameters, query, frag = urlparse.urlparse(url)[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    path = escape_path(path)
    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
    if not req_path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        req_path = "/"+req_path
    return req_path
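
# Illustrative sketch (not part of the original module): the request helpers
# operate on urllib2.Request-style objects ("http://www.example.com/..." is a
# placeholder URL).
#
#   import urllib2
#   req = urllib2.Request("http://www.example.com/rest;type=a?q=1")
#   request_host(req)       ->  "www.example.com"
#   eff_request_host(req)   ->  ("www.example.com", "www.example.com")
#   request_path(req)       ->  "/rest;type=a?q=1"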

def request_port(request):
    host = request.get_host()
    i = host.find(':')
    if i >= 0:
        port = host[i+1:]
        try:
            int(port)
        except ValueError:
            _debug("nonnumeric port: '%s'", port)
            return None
    else:
        port = DEFAULT_HTTP_PORT
    return port

# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    return "%%%s" % match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    path = urllib.quote(path, HTTP_PATH_SAFE)
    path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
    return path
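
# Illustrative sketch (not part of the original module): characters outside the
# safe set are %-escaped (UTF-8 for unicode input) and existing escapes are
# uppercased.
#
#   escape_path("/a b/%7e")   ->  "/a%20b/%7E"
#   escape_path(u"/caf\xe9")  ->  "/caf%C3%A9"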

def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    i = h.find(".")
    if i >= 0:
        #a = h[:i]  # this line is only here to show what a is
        b = h[i+1:]
        i = b.find(".")
        if is_HDN(h) and (i >= 0 or b == "local"):
            return "."+b
    return h

def is_third_party(request):
    """

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    if not domain_match(req_host, reach(request.get_origin_req_host())):
        return True
    else:
        return False
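
# Illustrative sketch (not part of the original module): a request is
# third-party when its host falls outside the reach of the originating host
# ("ads.tracker.net" and "www.example.com" are placeholder names).
#
#   import urllib2
#   req = urllib2.Request("http://ads.tracker.net/img.gif",
#                         origin_req_host="www.example.com")
#   is_third_party(req)   ->  True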


class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):

        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        return name in self._rest
    def get_nonstandard_attr(self, name, default=None):
        return self._rest.get(name, default)
    def set_nonstandard_attr(self, name, value):
        self._rest[name] = value

    def is_expired(self, now=None):
        if now is None: now = time.time()
        if (self.expires is not None) and (self.expires <= now):
            return True
        return False

    def __str__(self):
        if self.port is None: p = ""
        else: p = ":"+self.port
        limit = self.domain + p + self.path
        if self.value is not None:
            namevalue = "%s=%s" % (self.name, self.value)
        else:
            namevalue = self.name
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = []
        for name in ("version", "name", "value",
                     "port", "port_specified",
                     "domain", "domain_specified", "domain_initial_dot",
                     "path", "path_specified",
                     "secure", "expires", "discard", "comment", "comment_url",
                     ):
            attr = getattr(self, name)
            args.append("%s=%s" % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
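
# Illustrative sketch (not part of the original module): Cookie instances are
# normally built by CookieJar.make_cookies, but constructing one directly shows
# the positional attributes ("www.example.com" is a placeholder).
#
#   c = Cookie(0, "session", "abc123",
#              None, False,                  # port, port_specified
#              "www.example.com", False, False,
#              "/", False,                   # path, path_specified
#              False, None, True,            # secure, expires, discard
#              None, None, {})               # comment, comment_url, rest
#   str(c)   ->  '<Cookie session=abc123 for www.example.com/>'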


class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.
        """
        return True


class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies."""

    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True
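
    # Illustrative sketch (not part of the original module): a policy with a
    # user block-list entry (domain names here are placeholders).
    #
    #   policy = DefaultCookiePolicy(blocked_domains=[".doubleclick.net"])
    #   policy.is_blocked("ad.doubleclick.net")   ->  True
    #   policy.is_blocked("www.example.com")      ->  False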

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug("   Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during "
                       "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during "
                       "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug("   path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
1015 | "initial dot) does not end end with %s",
|
---|
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug("   request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        if request.is_unverifiable() and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        if cookie.secure and request.get_type() != "https":
            _debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        if cookie.is_expired(self._now):
            _debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug("   request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug("   cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(domain):
            _debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        # Liberal check of the domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        if not (req_host.endswith(domain) or erhn.endswith(domain)):
            #_debug("   request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        if not req_path.startswith(path):
            _debug("  %s does not path-match %s", req_path, path)
            return False
        return True


def vals_sorted_by_key(adict):
    keys = adict.keys()
    keys.sort()
    return map(adict.get, keys)

def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    values = vals_sorted_by_key(mapping)
    for obj in values:
        mapping = False
        try:
            obj.items
        except AttributeError:
            pass
        else:
            mapping = True
            for subobj in deepvalues(obj):
                yield subobj
        if not mapping:
            yield obj
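
# Illustrative sketch (not part of the original module): deepvalues() flattens
# nested mappings down to their leaf values, sorted by key at each level.
#
#   list(deepvalues({"a": {"y": 1, "x": 2}, "b": 3}))   ->  [2, 1, 3]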


# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent: pass

class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        _debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    _debug("   not returning cookie")
                    continue
                _debug("   it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        """
        # add cookies in order of most specific (ie. longest) path first
        def decreasing_size(a, b): return cmp(len(b.path), len(a.path))
        cookies.sort(decreasing_size)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        """
        _debug("add_cookie_header")
        self._cookies_lock.acquire()

        self._policy._now = self._now = int(time.time())

        cookies = self._cookies_for_request(request)

        attrs = self._cookie_attrs(cookies)
        if attrs:
            if not request.has_header("Cookie"):
                request.add_unredirected_header(
                    "Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
            not request.has_header("Cookie2")):
            for cookie in cookies:
                if cookie.version != 1:
                    request.add_unredirected_header("Cookie2", '$Version="1"')
                    break

        self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        _debug("   missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        _debug("   missing or invalid value for expires "
                               "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except ValueError:
                        _debug("   missing or invalid (non-numeric) value for "
                               "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
1418 | # age-calculation rules. Remember that zero Max-Age is a
|
---|
1419 | # is a request to discard (old and new) cookie, though.
|
---|
1420 | k = "expires"
|
---|
1421 | v = self._now + v
|
---|
1422 | if (k in value_attrs) or (k in boolean_attrs):
|
---|
1423 | if (v is None and
|
---|
1424 | k not in ("port", "comment", "commenturl")):
|
---|
1425 | _debug(" missing value for %s attribute" % k)
|
---|
1426 | bad_cookie = True
|
---|
1427 | break
|
---|
1428 | standard[k] = v
|
---|
1429 | else:
|
---|
1430 | rest[k] = v
|
---|
1431 |
|
---|
1432 | if bad_cookie:
|
---|
1433 | continue
|
---|
1434 |
|
---|
1435 | cookie_tuples.append((name, value, standard, rest))
|
---|
1436 |
|
---|
1437 | return cookie_tuples
|
---|
1438 |
|
---|
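    # Rough sketch of the normalisation above, with made-up values: an
    # attrs_set of [[("sid", "abc"), ("Max-Age", "3600"), ("Path", "/")]]
    # comes back as
    #
    #     [("sid", "abc",
    #       {"expires": self._now + 3600, "path": "/"},   # standard
    #       {})]                                          # rest
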
    def _cookie_from_cookie_tuple(self, tup, request):
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None: version = int(version)
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # An expiry date in the past is a request to delete the cookie.
            # This can't live in DefaultCookiePolicy, because cookies can't be
            # deleted there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

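    # Sketch of the domain defaulting above (host and cookie names invented):
    # for a request to http://www.example.com/,
    #
    #     Set-Cookie: a=1                      -> cookie.domain == "www.example.com"
    #     Set-Cookie: b=2; domain=example.com  -> cookie.domain == ".example.com"
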
    def _cookies_from_attrs_set(self, attrs_set, request):
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie: cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

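    # Sketch: the downgrade above is controlled by the policy's optional
    # rfc2109_as_netscape attribute (read via getattr); for example, to keep
    # RFC 2109 cookies at version 1 even without RFC 2965 support:
    #
    #     policy = DefaultCookiePolicy()        # rfc2965 is off by default
    #     policy.rfc2109_as_netscape = False
    #     jar = CookieJar(policy)
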
    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                def no_matching_rfc2965(ns_cookie, lookup=lookup):
                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
                    return key not in lookup
                ns_cookies = filter(no_matching_rfc2965, ns_cookies)

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

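    # Sketch: make_cookies() only builds Cookie objects; it neither stores
    # them nor consults the policy's set_ok().  A caller wanting to inspect
    # candidates before storing them might do (filtering rule is made up):
    #
    #     for cookie in jar.make_cookies(response, request):
    #         if cookie.name != "tracker":
    #             jar.set_cookie_if_ok(cookie, request)
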
    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        self._cookies_lock.acquire()
        try:
            if cookie.domain not in c: c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2: c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

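    # Sketch: the private mapping maintained by set_cookie() is
    # self._cookies[domain][path][name] -> Cookie, so after storing a cookie
    # named "sid" for ".example.com" (names invented for illustration):
    #
    #     jar._cookies[".example.com"]["/"]["sid"]   # -> that Cookie object
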
    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        _debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    _debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

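    # Sketch: calling extract_cookies() by hand, outside of
    # urllib2.HTTPCookieProcessor (the URL is illustrative only):
    #
    #     request = urllib2.Request("http://example.com/")
    #     response = urllib2.urlopen(request)
    #     jar.extract_cookies(response, request)  # stores whatever policy allows
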
    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

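    # Sketch of the three calling patterns (domain, path and name invented):
    #
    #     jar.clear()                            # remove every cookie
    #     jar.clear(".example.com")              # remove a whole domain
    #     jar.clear(".example.com", "/", "sid")  # remove one specific cookie
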
    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self: i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self: r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self: r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

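    # Sketch: iteration yields the stored Cookie objects directly, so simple
    # reporting needs no access to the internal mapping:
    #
    #     for cookie in jar:
    #         print cookie.name, cookie.value, cookie.domain
    #     print len(jar), "cookies held"
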

# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError): pass

class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except:
                raise ValueError("filename must be string-like")
            self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
        finally:
            self._cookies_lock.release()

from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
from _MozillaCookieJar import MozillaCookieJar
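
# Usage sketch: a typical save/load round trip with one of the concrete
# FileCookieJar subclasses imported above (the filename is invented for the
# example):
#
#     jar = LWPCookieJar("cookies.lwp")
#     # ... use jar with urllib2 as sketched earlier ...
#     jar.save(ignore_discard=True)    # also write session cookies
#     jar.load(ignore_discard=True)    # later: read them back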