source: python/trunk/Lib/urllib2.py@ 391

Last change on this file since 391 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 50.7 KB
Line 
1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- Basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- Function that creates a new OpenerDirector instance.
25Will install the default handlers. Accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
27instantiate. If one of the argument is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- Installs a new opener as the default opener.
31
32objects of interest:
33
34OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
35the Handler classes, while dealing with requests and responses.
36
37Request -- An object that encapsulates the state of a request. The
38state can be as simple as the URL. It can also include extra HTTP
39headers, e.g. a User-Agent.
40
41BaseHandler --
42
43exceptions:
44URLError -- A subclass of IOError, individual protocols have their own
45specific subclass.
46
47HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
48as an exceptional event or valid response.
49
50internals:
51BaseHandler and parent
52_call_chain conventions
53
54Example usage:
55
56import urllib2
57
58# set up authentication info
59authinfo = urllib2.HTTPBasicAuthHandler()
60authinfo.add_password(realm='PDQ Application',
61 uri='https://mahler:8092/site-updates.py',
62 user='klem',
63 passwd='geheim$parole')
64
65proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
66
67# build a new opener that adds authentication and caching FTP handlers
68opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
69
70# install it
71urllib2.install_opener(opener)
72
73f = urllib2.urlopen('http://www.python.org/')
74
75
76"""
77
78# XXX issues:
79# If an authentication error handler that tries to perform
80# authentication for some reason but fails, how should the error be
81# signalled? The client needs to know the HTTP error code. But if
82# the handler knows that the problem was, e.g., that it didn't know
83# that hash algo that requested in the challenge, it would be good to
84# pass that information along to the client, too.
85# ftp errors aren't handled cleanly
86# check digest against correct (i.e. non-apache) implementation
87
88# Possible extensions:
89# complex proxies XXX not sure what exactly was meant by this
90# abstract factory for opener
91
92import base64
93import hashlib
94import httplib
95import mimetools
96import os
97import posixpath
98import random
99import re
100import socket
101import sys
102import time
103import urlparse
104import bisect
105import warnings
106
107try:
108 from cStringIO import StringIO
109except ImportError:
110 from StringIO import StringIO
111
112from urllib import (unwrap, unquote, splittype, splithost, quote,
113 addinfourl, splitport, splittag, toBytes,
114 splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
115
116# support for FileHandler, proxies via environment variables
117from urllib import localhost, url2pathname, getproxies, proxy_bypass
118
119# used in User-Agent header sent
120__version__ = sys.version[:3]
121
122_opener = None
123def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
124 global _opener
125 if _opener is None:
126 _opener = build_opener()
127 return _opener.open(url, data, timeout)
128
129def install_opener(opener):
130 global _opener
131 _opener = opener
132
133# do these error classes make sense?
134# make sure all of the IOError stuff is overridden. we just want to be
135# subtypes.
136
137class URLError(IOError):
138 # URLError is a sub-type of IOError, but it doesn't share any of
139 # the implementation. need to override __init__ and __str__.
140 # It sets self.args for compatibility with other EnvironmentError
141 # subclasses, but args doesn't have the typical format with errno in
142 # slot 0 and strerror in slot 1. This may be better than nothing.
143 def __init__(self, reason):
144 self.args = reason,
145 self.reason = reason
146
147 def __str__(self):
148 return '<urlopen error %s>' % self.reason
149
150class HTTPError(URLError, addinfourl):
151 """Raised when HTTP error occurs, but also acts like non-error return"""
152 __super_init = addinfourl.__init__
153
154 def __init__(self, url, code, msg, hdrs, fp):
155 self.code = code
156 self.msg = msg
157 self.hdrs = hdrs
158 self.fp = fp
159 self.filename = url
160 # The addinfourl classes depend on fp being a valid file
161 # object. In some cases, the HTTPError may not have a valid
162 # file object. If this happens, the simplest workaround is to
163 # not initialize the base classes.
164 if fp is not None:
165 self.__super_init(fp, hdrs, url, code)
166
167 def __str__(self):
168 return 'HTTP Error %s: %s' % (self.code, self.msg)
169
170 # since URLError specifies a .reason attribute, HTTPError should also
171 # provide this attribute. See issue13211 fo discussion.
172 @property
173 def reason(self):
174 return self.msg
175
176 def info(self):
177 return self.hdrs
178
179# copied from cookielib.py
180_cut_port_re = re.compile(r":\d+$")
181def request_host(request):
182 """Return request-host, as defined by RFC 2965.
183
184 Variation from RFC: returned value is lowercased, for convenient
185 comparison.
186
187 """
188 url = request.get_full_url()
189 host = urlparse.urlparse(url)[1]
190 if host == "":
191 host = request.get_header("Host", "")
192
193 # remove port, if present
194 host = _cut_port_re.sub("", host, 1)
195 return host.lower()
196
197class Request:
198
199 def __init__(self, url, data=None, headers={},
200 origin_req_host=None, unverifiable=False):
201 # unwrap('<URL:type://host/path>') --> 'type://host/path'
202 self.__original = unwrap(url)
203 self.__original, self.__fragment = splittag(self.__original)
204 self.type = None
205 # self.__r_type is what's left after doing the splittype
206 self.host = None
207 self.port = None
208 self._tunnel_host = None
209 self.data = data
210 self.headers = {}
211 for key, value in headers.items():
212 self.add_header(key, value)
213 self.unredirected_hdrs = {}
214 if origin_req_host is None:
215 origin_req_host = request_host(self)
216 self.origin_req_host = origin_req_host
217 self.unverifiable = unverifiable
218
219 def __getattr__(self, attr):
220 # XXX this is a fallback mechanism to guard against these
221 # methods getting called in a non-standard order. this may be
222 # too complicated and/or unnecessary.
223 # XXX should the __r_XXX attributes be public?
224 if attr[:12] == '_Request__r_':
225 name = attr[12:]
226 if hasattr(Request, 'get_' + name):
227 getattr(self, 'get_' + name)()
228 return getattr(self, attr)
229 raise AttributeError, attr
230
231 def get_method(self):
232 if self.has_data():
233 return "POST"
234 else:
235 return "GET"
236
237 # XXX these helper methods are lame
238
239 def add_data(self, data):
240 self.data = data
241
242 def has_data(self):
243 return self.data is not None
244
245 def get_data(self):
246 return self.data
247
248 def get_full_url(self):
249 if self.__fragment:
250 return '%s#%s' % (self.__original, self.__fragment)
251 else:
252 return self.__original
253
254 def get_type(self):
255 if self.type is None:
256 self.type, self.__r_type = splittype(self.__original)
257 if self.type is None:
258 raise ValueError, "unknown url type: %s" % self.__original
259 return self.type
260
261 def get_host(self):
262 if self.host is None:
263 self.host, self.__r_host = splithost(self.__r_type)
264 if self.host:
265 self.host = unquote(self.host)
266 return self.host
267
268 def get_selector(self):
269 return self.__r_host
270
271 def set_proxy(self, host, type):
272 if self.type == 'https' and not self._tunnel_host:
273 self._tunnel_host = self.host
274 else:
275 self.type = type
276 self.__r_host = self.__original
277
278 self.host = host
279
280 def has_proxy(self):
281 return self.__r_host == self.__original
282
283 def get_origin_req_host(self):
284 return self.origin_req_host
285
286 def is_unverifiable(self):
287 return self.unverifiable
288
289 def add_header(self, key, val):
290 # useful for something like authentication
291 self.headers[key.capitalize()] = val
292
293 def add_unredirected_header(self, key, val):
294 # will not be added to a redirected request
295 self.unredirected_hdrs[key.capitalize()] = val
296
297 def has_header(self, header_name):
298 return (header_name in self.headers or
299 header_name in self.unredirected_hdrs)
300
301 def get_header(self, header_name, default=None):
302 return self.headers.get(
303 header_name,
304 self.unredirected_hdrs.get(header_name, default))
305
306 def header_items(self):
307 hdrs = self.unredirected_hdrs.copy()
308 hdrs.update(self.headers)
309 return hdrs.items()
310
311class OpenerDirector:
312 def __init__(self):
313 client_version = "Python-urllib/%s" % __version__
314 self.addheaders = [('User-agent', client_version)]
315 # self.handlers is retained only for backward compatibility
316 self.handlers = []
317 # manage the individual handlers
318 self.handle_open = {}
319 self.handle_error = {}
320 self.process_response = {}
321 self.process_request = {}
322
323 def add_handler(self, handler):
324 if not hasattr(handler, "add_parent"):
325 raise TypeError("expected BaseHandler instance, got %r" %
326 type(handler))
327
328 added = False
329 for meth in dir(handler):
330 if meth in ["redirect_request", "do_open", "proxy_open"]:
331 # oops, coincidental match
332 continue
333
334 i = meth.find("_")
335 protocol = meth[:i]
336 condition = meth[i+1:]
337
338 if condition.startswith("error"):
339 j = condition.find("_") + i + 1
340 kind = meth[j+1:]
341 try:
342 kind = int(kind)
343 except ValueError:
344 pass
345 lookup = self.handle_error.get(protocol, {})
346 self.handle_error[protocol] = lookup
347 elif condition == "open":
348 kind = protocol
349 lookup = self.handle_open
350 elif condition == "response":
351 kind = protocol
352 lookup = self.process_response
353 elif condition == "request":
354 kind = protocol
355 lookup = self.process_request
356 else:
357 continue
358
359 handlers = lookup.setdefault(kind, [])
360 if handlers:
361 bisect.insort(handlers, handler)
362 else:
363 handlers.append(handler)
364 added = True
365
366 if added:
367 bisect.insort(self.handlers, handler)
368 handler.add_parent(self)
369
370 def close(self):
371 # Only exists for backwards compatibility.
372 pass
373
374 def _call_chain(self, chain, kind, meth_name, *args):
375 # Handlers raise an exception if no one else should try to handle
376 # the request, or return None if they can't but another handler
377 # could. Otherwise, they return the response.
378 handlers = chain.get(kind, ())
379 for handler in handlers:
380 func = getattr(handler, meth_name)
381
382 result = func(*args)
383 if result is not None:
384 return result
385
386 def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
387 # accept a URL or a Request object
388 if isinstance(fullurl, basestring):
389 req = Request(fullurl, data)
390 else:
391 req = fullurl
392 if data is not None:
393 req.add_data(data)
394
395 req.timeout = timeout
396 protocol = req.get_type()
397
398 # pre-process request
399 meth_name = protocol+"_request"
400 for processor in self.process_request.get(protocol, []):
401 meth = getattr(processor, meth_name)
402 req = meth(req)
403
404 response = self._open(req, data)
405
406 # post-process response
407 meth_name = protocol+"_response"
408 for processor in self.process_response.get(protocol, []):
409 meth = getattr(processor, meth_name)
410 response = meth(req, response)
411
412 return response
413
414 def _open(self, req, data=None):
415 result = self._call_chain(self.handle_open, 'default',
416 'default_open', req)
417 if result:
418 return result
419
420 protocol = req.get_type()
421 result = self._call_chain(self.handle_open, protocol, protocol +
422 '_open', req)
423 if result:
424 return result
425
426 return self._call_chain(self.handle_open, 'unknown',
427 'unknown_open', req)
428
429 def error(self, proto, *args):
430 if proto in ('http', 'https'):
431 # XXX http[s] protocols are special-cased
432 dict = self.handle_error['http'] # https is not different than http
433 proto = args[2] # YUCK!
434 meth_name = 'http_error_%s' % proto
435 http_err = 1
436 orig_args = args
437 else:
438 dict = self.handle_error
439 meth_name = proto + '_error'
440 http_err = 0
441 args = (dict, proto, meth_name) + args
442 result = self._call_chain(*args)
443 if result:
444 return result
445
446 if http_err:
447 args = (dict, 'default', 'http_error_default') + orig_args
448 return self._call_chain(*args)
449
450# XXX probably also want an abstract factory that knows when it makes
451# sense to skip a superclass in favor of a subclass and when it might
452# make sense to include both
453
454def build_opener(*handlers):
455 """Create an opener object from a list of handlers.
456
457 The opener will use several default handlers, including support
458 for HTTP, FTP and when applicable, HTTPS.
459
460 If any of the handlers passed as arguments are subclasses of the
461 default handlers, the default handlers will not be used.
462 """
463 import types
464 def isclass(obj):
465 return isinstance(obj, (types.ClassType, type))
466
467 opener = OpenerDirector()
468 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
469 HTTPDefaultErrorHandler, HTTPRedirectHandler,
470 FTPHandler, FileHandler, HTTPErrorProcessor]
471 if hasattr(httplib, 'HTTPS'):
472 default_classes.append(HTTPSHandler)
473 skip = set()
474 for klass in default_classes:
475 for check in handlers:
476 if isclass(check):
477 if issubclass(check, klass):
478 skip.add(klass)
479 elif isinstance(check, klass):
480 skip.add(klass)
481 for klass in skip:
482 default_classes.remove(klass)
483
484 for klass in default_classes:
485 opener.add_handler(klass())
486
487 for h in handlers:
488 if isclass(h):
489 h = h()
490 opener.add_handler(h)
491 return opener
492
493class BaseHandler:
494 handler_order = 500
495
496 def add_parent(self, parent):
497 self.parent = parent
498
499 def close(self):
500 # Only exists for backwards compatibility
501 pass
502
503 def __lt__(self, other):
504 if not hasattr(other, "handler_order"):
505 # Try to preserve the old behavior of having custom classes
506 # inserted after default ones (works only for custom user
507 # classes which are not aware of handler_order).
508 return True
509 return self.handler_order < other.handler_order
510
511
512class HTTPErrorProcessor(BaseHandler):
513 """Process HTTP error responses."""
514 handler_order = 1000 # after all other processing
515
516 def http_response(self, request, response):
517 code, msg, hdrs = response.code, response.msg, response.info()
518
519 # According to RFC 2616, "2xx" code indicates that the client's
520 # request was successfully received, understood, and accepted.
521 if not (200 <= code < 300):
522 response = self.parent.error(
523 'http', request, response, code, msg, hdrs)
524
525 return response
526
527 https_response = http_response
528
529class HTTPDefaultErrorHandler(BaseHandler):
530 def http_error_default(self, req, fp, code, msg, hdrs):
531 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
532
533class HTTPRedirectHandler(BaseHandler):
534 # maximum number of redirections to any single URL
535 # this is needed because of the state that cookies introduce
536 max_repeats = 4
537 # maximum total number of redirections (regardless of URL) before
538 # assuming we're in a loop
539 max_redirections = 10
540
541 def redirect_request(self, req, fp, code, msg, headers, newurl):
542 """Return a Request or None in response to a redirect.
543
544 This is called by the http_error_30x methods when a
545 redirection response is received. If a redirection should
546 take place, return a new Request to allow http_error_30x to
547 perform the redirect. Otherwise, raise HTTPError if no-one
548 else should try to handle this url. Return None if you can't
549 but another Handler might.
550 """
551 m = req.get_method()
552 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
553 or code in (301, 302, 303) and m == "POST"):
554 # Strictly (according to RFC 2616), 301 or 302 in response
555 # to a POST MUST NOT cause a redirection without confirmation
556 # from the user (of urllib2, in this case). In practice,
557 # essentially all clients do redirect in this case, so we
558 # do the same.
559 # be conciliant with URIs containing a space
560 newurl = newurl.replace(' ', '%20')
561 newheaders = dict((k,v) for k,v in req.headers.items()
562 if k.lower() not in ("content-length", "content-type")
563 )
564 return Request(newurl,
565 headers=newheaders,
566 origin_req_host=req.get_origin_req_host(),
567 unverifiable=True)
568 else:
569 raise HTTPError(req.get_full_url(), code, msg, headers, fp)
570
571 # Implementation note: To avoid the server sending us into an
572 # infinite loop, the request object needs to track what URLs we
573 # have already seen. Do this by adding a handler-specific
574 # attribute to the Request object.
575 def http_error_302(self, req, fp, code, msg, headers):
576 # Some servers (incorrectly) return multiple Location headers
577 # (so probably same goes for URI). Use first header.
578 if 'location' in headers:
579 newurl = headers.getheaders('location')[0]
580 elif 'uri' in headers:
581 newurl = headers.getheaders('uri')[0]
582 else:
583 return
584
585 # fix a possible malformed URL
586 urlparts = urlparse.urlparse(newurl)
587 if not urlparts.path:
588 urlparts = list(urlparts)
589 urlparts[2] = "/"
590 newurl = urlparse.urlunparse(urlparts)
591
592 newurl = urlparse.urljoin(req.get_full_url(), newurl)
593
594 # For security reasons we do not allow redirects to protocols
595 # other than HTTP, HTTPS or FTP.
596 newurl_lower = newurl.lower()
597 if not (newurl_lower.startswith('http://') or
598 newurl_lower.startswith('https://') or
599 newurl_lower.startswith('ftp://')):
600 raise HTTPError(newurl, code,
601 msg + " - Redirection to url '%s' is not allowed" %
602 newurl,
603 headers, fp)
604
605 # XXX Probably want to forget about the state of the current
606 # request, although that might interact poorly with other
607 # handlers that also use handler-specific request attributes
608 new = self.redirect_request(req, fp, code, msg, headers, newurl)
609 if new is None:
610 return
611
612 # loop detection
613 # .redirect_dict has a key url if url was previously visited.
614 if hasattr(req, 'redirect_dict'):
615 visited = new.redirect_dict = req.redirect_dict
616 if (visited.get(newurl, 0) >= self.max_repeats or
617 len(visited) >= self.max_redirections):
618 raise HTTPError(req.get_full_url(), code,
619 self.inf_msg + msg, headers, fp)
620 else:
621 visited = new.redirect_dict = req.redirect_dict = {}
622 visited[newurl] = visited.get(newurl, 0) + 1
623
624 # Don't close the fp until we are sure that we won't use it
625 # with HTTPError.
626 fp.read()
627 fp.close()
628
629 return self.parent.open(new, timeout=req.timeout)
630
631 http_error_301 = http_error_303 = http_error_307 = http_error_302
632
633 inf_msg = "The HTTP server returned a redirect error that would " \
634 "lead to an infinite loop.\n" \
635 "The last 30x error message was:\n"
636
637
638def _parse_proxy(proxy):
639 """Return (scheme, user, password, host/port) given a URL or an authority.
640
641 If a URL is supplied, it must have an authority (host:port) component.
642 According to RFC 3986, having an authority component means the URL must
643 have two slashes after the scheme:
644
645 >>> _parse_proxy('file:/ftp.example.com/')
646 Traceback (most recent call last):
647 ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
648
649 The first three items of the returned tuple may be None.
650
651 Examples of authority parsing:
652
653 >>> _parse_proxy('proxy.example.com')
654 (None, None, None, 'proxy.example.com')
655 >>> _parse_proxy('proxy.example.com:3128')
656 (None, None, None, 'proxy.example.com:3128')
657
658 The authority component may optionally include userinfo (assumed to be
659 username:password):
660
661 >>> _parse_proxy('joe:password@proxy.example.com')
662 (None, 'joe', 'password', 'proxy.example.com')
663 >>> _parse_proxy('joe:password@proxy.example.com:3128')
664 (None, 'joe', 'password', 'proxy.example.com:3128')
665
666 Same examples, but with URLs instead:
667
668 >>> _parse_proxy('http://proxy.example.com/')
669 ('http', None, None, 'proxy.example.com')
670 >>> _parse_proxy('http://proxy.example.com:3128/')
671 ('http', None, None, 'proxy.example.com:3128')
672 >>> _parse_proxy('http://joe:password@proxy.example.com/')
673 ('http', 'joe', 'password', 'proxy.example.com')
674 >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
675 ('http', 'joe', 'password', 'proxy.example.com:3128')
676
677 Everything after the authority is ignored:
678
679 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
680 ('ftp', 'joe', 'password', 'proxy.example.com')
681
682 Test for no trailing '/' case:
683
684 >>> _parse_proxy('http://joe:password@proxy.example.com')
685 ('http', 'joe', 'password', 'proxy.example.com')
686
687 """
688 scheme, r_scheme = splittype(proxy)
689 if not r_scheme.startswith("/"):
690 # authority
691 scheme = None
692 authority = proxy
693 else:
694 # URL
695 if not r_scheme.startswith("//"):
696 raise ValueError("proxy URL with no authority: %r" % proxy)
697 # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
698 # and 3.3.), path is empty or starts with '/'
699 end = r_scheme.find("/", 2)
700 if end == -1:
701 end = None
702 authority = r_scheme[2:end]
703 userinfo, hostport = splituser(authority)
704 if userinfo is not None:
705 user, password = splitpasswd(userinfo)
706 else:
707 user = password = None
708 return scheme, user, password, hostport
709
710class ProxyHandler(BaseHandler):
711 # Proxies must be in front
712 handler_order = 100
713
714 def __init__(self, proxies=None):
715 if proxies is None:
716 proxies = getproxies()
717 assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
718 self.proxies = proxies
719 for type, url in proxies.items():
720 setattr(self, '%s_open' % type,
721 lambda r, proxy=url, type=type, meth=self.proxy_open: \
722 meth(r, proxy, type))
723
724 def proxy_open(self, req, proxy, type):
725 orig_type = req.get_type()
726 proxy_type, user, password, hostport = _parse_proxy(proxy)
727
728 if proxy_type is None:
729 proxy_type = orig_type
730
731 if req.host and proxy_bypass(req.host):
732 return None
733
734 if user and password:
735 user_pass = '%s:%s' % (unquote(user), unquote(password))
736 creds = base64.b64encode(user_pass).strip()
737 req.add_header('Proxy-authorization', 'Basic ' + creds)
738 hostport = unquote(hostport)
739 req.set_proxy(hostport, proxy_type)
740
741 if orig_type == proxy_type or orig_type == 'https':
742 # let other handlers take care of it
743 return None
744 else:
745 # need to start over, because the other handlers don't
746 # grok the proxy's URL type
747 # e.g. if we have a constructor arg proxies like so:
748 # {'http': 'ftp://proxy.example.com'}, we may end up turning
749 # a request for http://acme.example.com/a into one for
750 # ftp://proxy.example.com/a
751 return self.parent.open(req, timeout=req.timeout)
752
753class HTTPPasswordMgr:
754
755 def __init__(self):
756 self.passwd = {}
757
758 def add_password(self, realm, uri, user, passwd):
759 # uri could be a single URI or a sequence
760 if isinstance(uri, basestring):
761 uri = [uri]
762 if not realm in self.passwd:
763 self.passwd[realm] = {}
764 for default_port in True, False:
765 reduced_uri = tuple(
766 [self.reduce_uri(u, default_port) for u in uri])
767 self.passwd[realm][reduced_uri] = (user, passwd)
768
769 def find_user_password(self, realm, authuri):
770 domains = self.passwd.get(realm, {})
771 for default_port in True, False:
772 reduced_authuri = self.reduce_uri(authuri, default_port)
773 for uris, authinfo in domains.iteritems():
774 for uri in uris:
775 if self.is_suburi(uri, reduced_authuri):
776 return authinfo
777 return None, None
778
779 def reduce_uri(self, uri, default_port=True):
780 """Accept authority or URI and extract only the authority and path."""
781 # note HTTP URLs do not have a userinfo component
782 parts = urlparse.urlsplit(uri)
783 if parts[1]:
784 # URI
785 scheme = parts[0]
786 authority = parts[1]
787 path = parts[2] or '/'
788 else:
789 # host or host:port
790 scheme = None
791 authority = uri
792 path = '/'
793 host, port = splitport(authority)
794 if default_port and port is None and scheme is not None:
795 dport = {"http": 80,
796 "https": 443,
797 }.get(scheme)
798 if dport is not None:
799 authority = "%s:%d" % (host, dport)
800 return authority, path
801
802 def is_suburi(self, base, test):
803 """Check if test is below base in a URI tree
804
805 Both args must be URIs in reduced form.
806 """
807 if base == test:
808 return True
809 if base[0] != test[0]:
810 return False
811 common = posixpath.commonprefix((base[1], test[1]))
812 if len(common) == len(base[1]):
813 return True
814 return False
815
816
817class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
818
819 def find_user_password(self, realm, authuri):
820 user, password = HTTPPasswordMgr.find_user_password(self, realm,
821 authuri)
822 if user is not None:
823 return user, password
824 return HTTPPasswordMgr.find_user_password(self, None, authuri)
825
826
827class AbstractBasicAuthHandler:
828
829 # XXX this allows for multiple auth-schemes, but will stupidly pick
830 # the last one with a realm specified.
831
832 # allow for double- and single-quoted realm values
833 # (single quotes are a violation of the RFC, but appear in the wild)
834 rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
835 'realm=(["\']?)([^"\']*)\\2', re.I)
836
837 # XXX could pre-emptively send auth info already accepted (RFC 2617,
838 # end of section 2, and section 1.2 immediately after "credentials"
839 # production).
840
841 def __init__(self, password_mgr=None):
842 if password_mgr is None:
843 password_mgr = HTTPPasswordMgr()
844 self.passwd = password_mgr
845 self.add_password = self.passwd.add_password
846 self.retried = 0
847
848 def reset_retry_count(self):
849 self.retried = 0
850
851 def http_error_auth_reqed(self, authreq, host, req, headers):
852 # host may be an authority (without userinfo) or a URL with an
853 # authority
854 # XXX could be multiple headers
855 authreq = headers.get(authreq, None)
856
857 if self.retried > 5:
858 # retry sending the username:password 5 times before failing.
859 raise HTTPError(req.get_full_url(), 401, "basic auth failed",
860 headers, None)
861 else:
862 self.retried += 1
863
864 if authreq:
865 mo = AbstractBasicAuthHandler.rx.search(authreq)
866 if mo:
867 scheme, quote, realm = mo.groups()
868 if quote not in ['"', "'"]:
869 warnings.warn("Basic Auth Realm was unquoted",
870 UserWarning, 2)
871 if scheme.lower() == 'basic':
872 response = self.retry_http_basic_auth(host, req, realm)
873 if response and response.code != 401:
874 self.retried = 0
875 return response
876
877 def retry_http_basic_auth(self, host, req, realm):
878 user, pw = self.passwd.find_user_password(realm, host)
879 if pw is not None:
880 raw = "%s:%s" % (user, pw)
881 auth = 'Basic %s' % base64.b64encode(raw).strip()
882 if req.headers.get(self.auth_header, None) == auth:
883 return None
884 req.add_unredirected_header(self.auth_header, auth)
885 return self.parent.open(req, timeout=req.timeout)
886 else:
887 return None
888
889
890class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
891
892 auth_header = 'Authorization'
893
894 def http_error_401(self, req, fp, code, msg, headers):
895 url = req.get_full_url()
896 response = self.http_error_auth_reqed('www-authenticate',
897 url, req, headers)
898 self.reset_retry_count()
899 return response
900
901
902class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
903
904 auth_header = 'Proxy-authorization'
905
906 def http_error_407(self, req, fp, code, msg, headers):
907 # http_error_auth_reqed requires that there is no userinfo component in
908 # authority. Assume there isn't one, since urllib2 does not (and
909 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
910 # userinfo.
911 authority = req.get_host()
912 response = self.http_error_auth_reqed('proxy-authenticate',
913 authority, req, headers)
914 self.reset_retry_count()
915 return response
916
917
918def randombytes(n):
919 """Return n random bytes."""
920 # Use /dev/urandom if it is available. Fall back to random module
921 # if not. It might be worthwhile to extend this function to use
922 # other platform-specific mechanisms for getting random bytes.
923 if os.path.exists("/dev/urandom"):
924 f = open("/dev/urandom")
925 s = f.read(n)
926 f.close()
927 return s
928 else:
929 L = [chr(random.randrange(0, 256)) for i in range(n)]
930 return "".join(L)
931
class AbstractDigestAuthHandler:
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        # passwd must offer the HTTPPasswordMgr interface
        # (add_password / find_user_password); a fresh manager is
        # created when none is supplied.
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0
        self.last_nonce = None

    def reset_retry_count(self):
        # Called by the concrete handler after each request cycle so a
        # later request starts with a clean retry budget.
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req with digest credentials if auth_header carries a
        digest challenge; raise HTTPError after too many attempts."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue req with an auth header built from the server's
        challenge; return the new response, or None on failure."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # The identical header already failed once; resending the
                # same credentials would just loop.
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req, timeout=req.timeout)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Authorization header value from the challenge dict
        chal, or return None when the challenge is malformed, uses an
        unsupported algorithm, or no credentials are known."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            # Track the nonce-count per RFC 2617 section 3.2.2.
            if nonce == self.last_nonce:
                self.nonce_count += 1
            else:
                self.nonce_count = 1
                self.last_nonce = nonce

            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest helpers for algorithm, or (None, None)
        when the algorithm is not supported."""
        # algorithm should be case-insensitive according to RFC2617
        algorithm = algorithm.upper()
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        else:
            # Bug fix: an unrecognized algorithm (e.g. "MD5-sess") used
            # to fall through with H unbound and raise UnboundLocalError
            # at the return below.  Return (None, None) instead, so the
            # `H is None` check in get_authorization() can reject it.
            return None, None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
1070
1071
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The challenge host is the network location of the request URL.
        url = req.get_full_url()
        host = urlparse.urlparse(url)[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              host, req, headers)
        self.reset_retry_count()
        return response
1088
1089
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication against a proxy (HTTP 407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # For proxy auth the challenge host is simply the request host.
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              req.get_host(), req, headers)
        self.reset_retry_count()
        return response
1101
class AbstractHTTPHandler(BaseHandler):
    # Shared machinery for HTTPHandler and HTTPSHandler: request
    # preprocessing (do_request_) and the actual network exchange (do_open).

    def __init__(self, debuglevel=0):
        # debuglevel is forwarded to the httplib connection object.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        # Change the httplib debug level used for subsequent requests.
        self._debuglevel = level

    def do_request_(self, request):
        # Fill in headers every HTTP(S) request needs before it is sent:
        # Content-type/Content-length for request bodies, Host, and any
        # default headers registered on the parent OpenerDirector.
        # Returns the same request object, mutated in place.
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        sel_host = host
        if request.has_proxy():
            # Through a proxy the selector is a full URL; the Host header
            # must name the origin server, not the proxy.
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)

        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        Raises URLError on socket errors while sending the request.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take priority over normal headers of the
        # same name: the update below only copies keys not already present.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Normalize capitalization (e.g. 'content-TYPE' -> 'Content-Type')
        # so the Proxy-Authorization lookup below matches reliably.
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            # Tunnelled (proxied SSL) request: only Proxy-Authorization may
            # accompany the tunnel setup.
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
        except socket.error, err: # XXX what error?
            h.close()
            raise URLError(err)
        else:
            try:
                r = h.getresponse(buffering=True)
            except TypeError: # buffering kw not supported
                r = h.getresponse()

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1209
1210
class HTTPHandler(AbstractHTTPHandler):
    """Open http:// URLs over a plain httplib.HTTPConnection."""

    def http_open(self, req):
        # All real work happens in the shared AbstractHTTPHandler code.
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_
1217
if hasattr(httplib, 'HTTPS'):
    # Only provide an https handler when this Python was built with SSL.
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https:// URLs over httplib.HTTPSConnection."""

        def https_open(self, req):
            # Identical machinery to HTTP, with an SSL-wrapped connection.
            return self.do_open(httplib.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
1225
class HTTPCookieProcessor(BaseHandler):
    """Attach stored cookies to requests and harvest cookies from responses."""

    def __init__(self, cookiejar=None):
        # Imported lazily so merely importing urllib2 stays cheap.
        import cookielib
        self.cookiejar = cookielib.CookieJar() if cookiejar is None else cookiejar

    def http_request(self, request):
        # Add any matching stored cookies to the outgoing request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record cookies the server set, for use by later requests.
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
1243
class UnknownHandler(BaseHandler):
    """Fallback handler: any scheme nobody else claimed is an error."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
1248
def parse_keqv_list(l):
    """Parse a list of key=value strings into a dict.

    Keys are assumed not to be duplicated (a later duplicate silently
    overwrites an earlier one).  One level of surrounding double quotes
    is stripped from the value.  Raises ValueError if an element
    contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Guard against an empty value ('k='), which previously raised
        # IndexError on v[0].  A lone '"' still strips to '' as before.
        if v and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1258
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = []                      # characters of the element in progress
    in_quotes = False             # inside a double-quoted string?
    pending_escape = False        # previous char was a backslash in quotes?

    for ch in s:
        if pending_escape:
            # Escaped character: keep it verbatim (the backslash itself
            # was dropped when the escape was seen).
            buf.append(ch)
            pending_escape = False
        elif in_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    in_quotes = False
                buf.append(ch)
        elif ch == ',':
            # Unquoted comma terminates the current element.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                in_quotes = True
            buf.append(ch)

    # Flush the trailing element, if any.
    tail = ''.join(buf)
    if tail:
        items.append(tail)

    return [item.strip() for item in items]
1301
def _safe_gethostbyname(host):
    # Resolve host to an address, mapping resolution failure to None
    # instead of letting socket.gaierror escape.
    try:
        addr = socket.gethostbyname(host)
    except socket.gaierror:
        addr = None
    return addr
1307
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        # 'file://host/...' with a non-local host is re-dispatched as FTP;
        # everything else is read from the local filesystem.
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    # Class-level cache of this machine's IP addresses; filled lazily.
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                # Resolution of our own hostname failed; fall back to
                # just the loopback address.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.utils
        import mimetypes
        host = req.get_host()
        filename = req.get_selector()
        # YD hack: add again drive name
        # (OS/2 port: 'file://C:/path' parses the drive letter as the
        # host, so glue it back onto the filename.)
        if os.name == 'os2' and len(host)>1 and host[1] == ':':
            filename = host + filename
            host = ""
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            # Synthesize HTTP-style headers so the result looks like any
            # other urlopen() response.
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified)))
            if host:
                host, port = splitport(host)
            # Serve the file only when the URL names no host, or names
            # this machine (no explicit port allowed in that case).
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError, msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
1363
class FTPHandler(BaseHandler):
    # Open ftp:// URLs.  The response mimics an HTTP one: a file-like
    # object plus synthesized Content-type/Content-length headers.
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise URLError('ftp error: no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = user or ''
        passwd = passwd or ''

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        # Split the selector into path components and optional
        # ';attr=value' attributes (e.g. ';type=a').
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
            # Default transfer type: binary ('I') for a file, directory
            # listing ('D') when the path ends in '/'.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # Re-raise as URLError but keep the original traceback.
            raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # One-shot connection; CacheFTPHandler overrides this to reuse
        # connections.
        fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
                        persistent=False)
##        fw.ftp.set_debuglevel(1)
        return fw
1423
class CacheFTPHandler(FTPHandler):
    """FTP handler that keeps a small pool of recent connections alive.

    Connections are keyed by (user, host, port, path, timeout), expire
    self.delay seconds after last use, and the pool is capped at
    self.max_conns entries.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> live ftpwrapper connection
        self.timeout = {}     # key -> absolute expiry time (epoch seconds)
        self.soonest = 0      # earliest expiry among cached connections
        self.delay = 60       # seconds a connection stays cached after use
        self.max_conns = 16   # hard cap on cached connections

    def setTimeout(self, t):
        # Set how long (seconds) an idle connection stays cached.
        self.delay = t

    def setMaxConns(self, m):
        # Set the maximum number of simultaneously cached connections.
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        # Reuse a cached connection when one matches; otherwise open a
        # new one.  Either way, refresh its expiry time.
        key = user, host, port, '/'.join(dirs), timeout
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in self.timeout.items():
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # Bug fix: min() raises ValueError on an empty sequence, which
            # happened whenever every cached entry had just expired.
            if self.timeout:
                self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in self.timeout.items():
                if v == self.soonest:
                    # NOTE(review): the evicted connection is not close()d
                    # here — presumably because a caller may still hold a
                    # reference to it; confirm before changing.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            # Same empty-dict guard as above (reachable when max_conns == 1).
            if self.timeout:
                self.soonest = min(self.timeout.values())

    def clear_cache(self):
        # Close and forget every cached connection.
        for conn in self.cache.values():
            conn.close()
        self.cache.clear()
        self.timeout.clear()
Note: See TracBrowser for help on using the repository browser.