source: trunk/essentials/dev-lang/python/Lib/urllib2.py

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 46.2 KB
Line 
1"""An extensible library for opening URLs using a variety of protocols
2
3The simplest way to use this module is to call the urlopen function,
4which accepts a string containing a URL or a Request object (described
5below). It opens the URL and returns the results as file-like
6object; the returned object has some extra methods described below.
7
8The OpenerDirector manages a collection of Handler objects that do
9all the actual work. Each Handler implements a particular protocol or
10option. The OpenerDirector is a composite object that invokes the
11Handlers needed to open the requested URL. For example, the
12HTTPHandler performs HTTP GET and POST requests and deals with
13non-error returns. The HTTPRedirectHandler automatically deals with
14HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15deals with digest authentication.
16
17urlopen(url, data=None) -- basic usage is the same as original
18urllib. pass the url and optionally data to post to an HTTP URL, and
19get a file-like object back. One difference is that you can also pass
20a Request instance instead of URL. Raises a URLError (subclass of
21IOError); for HTTP errors, raises an HTTPError, which can also be
22treated as a valid response.
23
24build_opener -- function that creates a new OpenerDirector instance.
25will install the default handlers. accepts one or more Handlers as
26arguments, either instances or Handler classes that it will
instantiate. if one of the arguments is a subclass of the default
28handler, the argument will be installed instead of the default.
29
30install_opener -- installs a new opener as the default opener.
31
32objects of interest:
33OpenerDirector --
34
35Request -- an object that encapsulates the state of a request. the
state can be as simple as the URL. it can also include extra HTTP
37headers, e.g. a User-Agent.
38
39BaseHandler --
40
41exceptions:
42URLError-- a subclass of IOError, individual protocols have their own
43specific subclass
44
45HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46as an exceptional event or valid response
47
48internals:
49BaseHandler and parent
50_call_chain conventions
51
52Example usage:
53
54import urllib2
55
56# set up authentication info
57authinfo = urllib2.HTTPBasicAuthHandler()
58authinfo.add_password('realm', 'host', 'username', 'password')
59
60proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
61
62# build a new opener that adds authentication and caching FTP handlers
63opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
64
65# install it
66urllib2.install_opener(opener)
67
68f = urllib2.urlopen('http://www.python.org/')
69
70
71"""
72
73# XXX issues:
74# If an authentication error handler that tries to perform
75# authentication for some reason but fails, how should the error be
76# signalled? The client needs to know the HTTP error code. But if
# the handler knows what the problem was, e.g., that it didn't know
# the hash algorithm requested in the challenge, it would be good to
79# pass that information along to the client, too.
80# ftp errors aren't handled cleanly
81# check digest against correct (i.e. non-apache) implementation
82
83# Possible extensions:
84# complex proxies XXX not sure what exactly was meant by this
85# abstract factory for opener
86
87import base64
88import hashlib
89import httplib
90import mimetools
91import os
92import posixpath
93import random
94import re
95import socket
96import sys
97import time
98import urlparse
99import bisect
100
101try:
102 from cStringIO import StringIO
103except ImportError:
104 from StringIO import StringIO
105
106from urllib import (unwrap, unquote, splittype, splithost, quote,
107 addinfourl, splitport, splitgophertype, splitquery,
108 splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
109
110# support for FileHandler, proxies via environment variables
111from urllib import localhost, url2pathname, getproxies
112
# used in User-Agent header sent
__version__ = sys.version[:3]

_opener = None
def urlopen(url, data=None):
    """Open *url* (a URL string or a Request object) using the
    module-wide default opener, creating that opener lazily on
    first use.  Returns a file-like response object."""
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
122
def install_opener(opener):
    """Make *opener* the module-wide default used by urlopen()."""
    global _opener
    _opener = opener
126
127# do these error classes make sense?
128# make sure all of the IOError stuff is overridden. we just want to be
129# subtypes.
130
class URLError(IOError):
    """Error raised when a URL cannot be opened.

    A sub-type of IOError for backwards compatibility, but it shares
    none of the implementation: __init__ and __str__ are overridden,
    and args is (reason,) rather than the usual (errno, strerror)
    layout of EnvironmentError subclasses.
    """

    def __init__(self, reason):
        # Populate args for compatibility with other EnvironmentError
        # subclasses, even though the slot layout differs.
        self.args = (reason,)
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
143
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    # Alias the base initializer so it can be invoked conditionally below
    # (name-mangled to _HTTPError__super_init).
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        # code: numeric HTTP status; msg: reason phrase; hdrs: response
        # headers; fp: response body file object (may be None).
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
163
class GopherError(URLError):
    """Error specific to the gopher protocol handler."""
166
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse.urlparse(request.get_full_url())[1]
    if not host:
        # No netloc in the URL itself; fall back to the Host header.
        host = request.get_header("Host", "")
    # strip a trailing :port, if any, and normalize case
    return _cut_port_re.sub("", host, 1).lower()
184
class Request:
    """Encapsulate the state of one URL request.

    The URL's scheme, host and selector are parsed lazily by the
    get_*() accessors and cached on the instance; set_proxy() can
    rewrite host/type/selector after construction.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        # NOTE(review): the headers={} default is a shared mutable dict;
        # it is only iterated here, never mutated, so the aliasing is
        # harmless in practice.
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            # Default to this request's own RFC 2965 request-host.
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order. this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # A missing name-mangled _Request__r_<name> attribute triggers
        # the matching get_<name>() accessor, which computes and caches
        # it as a side effect; then the attribute lookup is retried.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def get_method(self):
        # The method is implied by the presence of POST data.
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        # Parse the scheme lazily on first use; cache the remainder in
        # the name-mangled __r_type for get_host().
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        # Parse (and percent-decode) the host lazily on first use.
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        # The request-URI sent to the server: the path portion, or the
        # full original URL after set_proxy().
        return self.__r_host

    def set_proxy(self, host, type):
        # Route via a proxy: the proxy becomes the connection target and
        # the complete original URL becomes the selector.
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        # Regular headers take precedence over unredirected ones.
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
285
class OpenerDirector:
    """Manage a chain of handlers and dispatch URL opens through them.

    Handlers advertise their abilities by method name:
    <protocol>_open, <protocol>_request, <protocol>_response and
    <protocol>_error_<code>; add_handler() discovers these by
    introspection and files the handler into the matching chains.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        # protocol -> [handler, ...]
        self.handle_open = {}
        # protocol -> {code-or-name: [handler, ...]}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        # Register every protocol method the handler defines.
        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            # Split e.g. "http_error_404" into protocol "http" and
            # condition "error_404".
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # kind is the status code as an int when numeric,
                # otherwise the trailing name (e.g. "default").
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each chain sorted by handler_order (BaseHandler.__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open *fullurl* (URL string or Request); return the response
        after running the request/response processor chains."""
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first crack, then the protocol's own
        # handlers, then unknown_open as a last resort.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            # For HTTP the third positional argument is the status code,
            # which selects the http_error_<code> chain.
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Nothing claimed the specific status code; fall back to the
            # catch-all http_error_default chain.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
419
420# XXX probably also want an abstract factory that knows when it makes
421# sense to skip a superclass in favor of a subclass and when it might
422# make sense to include both
423
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    import types
    def isclass(obj):
        # Old-style classes are ClassType; new-style ones have __bases__.
        return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # Collect defaults displaced by user-supplied handlers.  Use a set:
    # with a list, two arguments replacing the same default would queue
    # it for removal twice and the second list.remove() would raise
    # ValueError.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
462
class BaseHandler:
    """Common machinery shared by every protocol handler."""

    # Handlers run in increasing handler_order within each chain.
    handler_order = 500

    def add_parent(self, parent):
        # Back-reference to the OpenerDirector that owns this handler.
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def __lt__(self, other):
        # Objects that don't define handler_order (custom user classes
        # written before ordering existed) always sort after us,
        # preserving the old insertion behavior.
        return (not hasattr(other, "handler_order")
                or self.handler_order < other.handler_order)
480
481
class HTTPErrorProcessor(BaseHandler):
    """Route non-success HTTP responses through the error chain."""

    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # 200 (OK) and 206 (Partial Content) pass through untouched.
        if code in (200, 206):
            return response
        # Everything else is handed to the opener's error machinery,
        # whose result (if any) replaces the response.
        return self.parent.error(
            'http', request, response, code, msg, hdrs)

    https_response = http_response
496
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: any unhandled HTTP error becomes HTTPError."""
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
500
class HTTPRedirectHandler(BaseHandler):
    """Follow 301/302/303/307 responses, with loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            # The redirected request is re-issued without the original
            # body, so entity headers describing that body must not be
            # copied across: a forwarded Content-length would make the
            # server wait for a body that never arrives.
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in
                              ("content-length", "content-type"))
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Open the redirect target, enforcing per-URL and total limits."""
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
582
583
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')

    """
    scheme, r_scheme = splittype(proxy)
    if r_scheme.startswith("/"):
        # URL form: RFC 3986 requires two slashes before an authority.
        if not r_scheme.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
        # and 3.3.), path is empty or starts with '/'; the authority ends
        # at the first '/' after the two slashes.
        end = r_scheme.find("/", 2)
        if end == -1:
            end = None
        authority = r_scheme[2:end]
    else:
        # Bare authority with no scheme at all.
        scheme = None
        authority = proxy
    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
655
class ProxyHandler(BaseHandler):
    """Route requests through the proxies in a {scheme: proxy-url} map."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # Synthesize a <scheme>_open method for each configured scheme so
        # add_handler() discovers it.  The lambda's default arguments bind
        # the per-scheme url/type at definition time, not at call time.
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        # Point req at the proxy; if the proxy speaks a different scheme,
        # restart the open so the right protocol handler is used.
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            # Pre-emptively attach Basic proxy credentials taken from the
            # proxy URL's userinfo.
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.encodestring(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
692
class HTTPPasswordMgr:
    """Map (realm, URI prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # {realm: {tuple-of-reduced-uris: (user, passwd)}}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        # Store one entry with default ports made explicit and one
        # without, so lookups succeed whether or not the port is spelled
        # out in the URI being authenticated.
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        # Return (user, password) for the first registered URI that is a
        # prefix of authuri within this realm, else (None, None).
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.iteritems():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # Normalize by making the scheme's well-known port explicit.
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # NOTE(review): commonprefix is character-based, so base path
        # '/foo' also matches '/foobar' -- confirm this is intended.
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
755
756
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that treats realm None as a wildcard fallback."""

    def find_user_password(self, realm, authuri):
        # Try the exact realm first, then the default (None) realm.
        user, password = HTTPPasswordMgr.find_user_password(
            self, realm, authuri)
        if user is None:
            return HTTPPasswordMgr.find_user_password(self, None, authuri)
        return user, password
765
766
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP Basic auth (server and proxy variants).

    Subclasses supply auth_header and the http_error_40x entry point.
    """

    # Matches challenges of the form: <scheme> realm="<realm>"
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # authreq here is the *name* of the challenge header to inspect.
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        # Re-send req with Basic credentials for (realm, host), or return
        # None when no credentials are known.
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.encodestring(raw).strip()
            # If these exact credentials were already sent, give up rather
            # than retrying the identical request indefinitely.
            if req.headers.get(self.auth_header, None) == auth:
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None
808
809
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication against the origin server (401 responses)."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The full URL is the password-manager lookup key.
        return self.http_error_auth_reqed('www-authenticate',
                                          req.get_full_url(), req, headers)
818
819
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication against an intermediary proxy (407 responses)."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component
        # in authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        return self.http_error_auth_reqed('proxy-authenticate',
                                          req.get_host(), req, headers)
832
833
def randombytes(n):
    """Return n random bytes."""
    # Prefer the OS-level CSPRNG: os.urandom reads /dev/urandom on Unix
    # and the platform equivalent elsewhere (e.g. CryptGenRandom on
    # Windows), and returns raw bytes without any text-mode mangling.
    # Fall back to the random module only where no OS entropy source
    # exists.
    try:
        return os.urandom(n)
    except (AttributeError, NotImplementedError):
        return "".join(chr(random.randrange(0, 256)) for i in range(n))
847
class AbstractDigestAuthHandler:
    """Shared implementation of HTTP Digest authentication (RFC 2617)."""

    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Answer a digest challenge found in header *auth_header*,
        giving up with HTTPError after five failed attempts."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-send *req* with an Authorization computed from challenge
        *auth*; return None if the same header was already tried."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value for challenge dict
        *chal*; return None when the challenge cannot be answered."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            # Previously this branch fell through and crashed later with
            # an UnboundLocalError on respdig; fail clearly instead.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) hash primitives for *algorithm*; H is None when
        the algorithm is not supported."""
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        else:
            # Unknown algorithm: previously H was simply left unbound, so
            # the caller's `H is None` check never ran and this raised
            # UnboundLocalError instead of failing gracefully.
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
978
979
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # The URL's authority component identifies the protection space.
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        # Each fresh 401 gets a fresh retry budget.
        self.reset_retry_count()
        return retry
996
997
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication for proxies (HTTP 407 challenges)."""

    # Proxy credentials travel in their own header, distinct from the
    # end-server Authorization header.
    auth_header = 'Proxy-Authorization'
    handler_order = 490  # run before the Basic auth handlers

    def http_error_407(self, req, fp, code, msg, headers):
        proxy_host = req.get_host()
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              proxy_host, req, headers)
        # Start the next challenge with a clean retry count.
        self.reset_retry_count()
        return response
1009
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler/HTTPSHandler.

    do_request_ normalizes outgoing requests (content and Host headers);
    do_open performs the actual exchange over an httplib connection
    class supplied by the subclass.
    """

    def __init__(self, debuglevel=0):
        # Forwarded to the connection's set_debuglevel() in do_open.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in the headers a well-formed request needs.

        Adds default Content-type/Content-length for POST data, a Host
        header derived from the selector (or the request host), and any
        opener-wide headers from self.parent.addheaders that the request
        does not already carry.  Raises URLError if the URL has no host.
        Returns the (mutated) request.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            # Prefer the host embedded in the selector (absolute URL,
            # e.g. when going through a proxy) over the request host.
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                # "unredirected" so these are not replayed automatically
                # to a different host after a redirect.
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
        - info(): return a mimetools.Message object for the headers
        - geturl(): return the original request URL
        - code: HTTP status code

        Raises URLError (wrapping the socket.error) on network failure.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Merge ordinary and unredirected headers; unredirected ones win.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Title-case header names so duplicates differing only in case
        # collapse to a single entry.
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1096
1097
class HTTPHandler(AbstractHTTPHandler):
    # Opens http: URLs through httplib.HTTPConnection.

    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    # Reuse the shared request fix-up (Content-*/Host headers).
    http_request = AbstractHTTPHandler.do_request_
1104
# HTTPSHandler only exists when httplib was built with SSL support.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            return self.do_open(httplib.HTTPSConnection, req)

        # Same request fix-up as for plain http.
        https_request = AbstractHTTPHandler.do_request_
1112
1113class HTTPCookieProcessor(BaseHandler):
1114 def __init__(self, cookiejar=None):
1115 import cookielib
1116 if cookiejar is None:
1117 cookiejar = cookielib.CookieJar()
1118 self.cookiejar = cookiejar
1119
1120 def http_request(self, request):
1121 self.cookiejar.add_cookie_header(request)
1122 return request
1123
1124 def http_response(self, request, response):
1125 self.cookiejar.extract_cookies(response, request)
1126 return response
1127
1128 https_request = http_request
1129 https_response = http_response
1130
class UnknownHandler(BaseHandler):
    # Fallback handler: any scheme no other handler claims is an error.
    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
1135
1136def parse_keqv_list(l):
1137 """Parse list of key=value strings where keys are not duplicated."""
1138 parsed = {}
1139 for elt in l:
1140 k, v = elt.split('=', 1)
1141 if v[0] == '"' and v[-1] == '"':
1142 v = v[1:-1]
1143 parsed[k] = v
1144 return parsed
1145
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.
    Only double-quotes count, not single-quotes.

    Returns a list of the comma-separated elements, each stripped of
    surrounding whitespace; quotes are kept but backslash escapes
    inside quoted-strings are resolved.
    """
    items = []
    token = ''
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Previous char was a backslash inside a quoted-string:
            # take this char literally (the backslash is dropped).
            token += ch
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                token += ch
                if ch == '"':
                    in_quotes = False
        elif ch == ',':
            # Unquoted comma ends the current element.
            items.append(token)
            token = ''
        else:
            token += ch
            if ch == '"':
                in_quotes = True

    # append last element, if any
    if token:
        items.append(token)

    return [item.strip() for item in items]
1188
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        url = req.get_selector()
        # 'file://host/...' (non-empty host) is re-dispatched as FTP;
        # 'file:///path' and 'file:/path' are opened locally.
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost, resolved lazily and cached on the class
    names = None
    def get_names(self):
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # gethostname() may not resolve; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        import email.Utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(file)[0]
        # Synthesize HTTP-style headers so callers see a uniform
        # addinfourl interface regardless of scheme.
        headers = mimetools.Message(StringIO(
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if host:
            host, port = splitport(host)
        # Serve only when no host was given, or the (portless) host
        # resolves to this machine.  Note 'not host' short-circuits, so
        # 'port' is only read when splitport above actually ran.
        if not host or \
            (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+file)
        raise URLError('file not on local host')
1231
class FTPHandler(BaseHandler):
    # Opens ftp: URLs; credentials may be embedded as user:pass@host.
    def ftp_open(self, req):
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        # URL components are percent-encoded; decode before use.
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        # Last component is the file; the rest are directories to CWD into.
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            # Default transfer type: binary ('I') for a file, directory
            # listing ('D') otherwise; a ";type=x" URL attribute overrides.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            # Build HTTP-like headers so callers get a uniform interface.
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # Re-raise as IOError but keep the original traceback.
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs):
        # Hook point: CacheFTPHandler overrides this to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs)
##        fw.ftp.set_debuglevel(1)
        return fw
1290
class CacheFTPHandler(FTPHandler):
    """FTPHandler that caches and reuses ftpwrapper connections.

    Connections are keyed by (user, host, port, path); an entry expires
    self.delay seconds after its last use, and at most self.max_conns
    entries are kept.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> expiry time (epoch seconds)
        self.soonest = 0      # earliest expiry among cached entries
        self.delay = 60       # idle lifetime of a cached connection
        self.max_conns = 16   # hard cap on simultaneously cached connections

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        key = user, host, port, '/'.join(dirs)
        if key in self.cache:
            # Reuse: just push the expiry forward.
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            for k, v in self.timeout.items():
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # Guard the min(): if every entry just expired, the dict is
            # empty and min() would raise ValueError.
            if self.timeout:
                self.soonest = min(self.timeout.values())

        # then check the size
        if len(self.cache) == self.max_conns:
            # Evict the entry that would expire soonest.
            # NOTE(review): the evicted wrapper is not close()d here,
            # unlike in the age-based path above -- possible leak.
            for k, v in self.timeout.items():
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            # Same empty-dict guard as above (e.g. max_conns == 1).
            if self.timeout:
                self.soonest = min(self.timeout.values())
1336
class GopherHandler(BaseHandler):
    # Opens gopher: URLs via the gopherlib module.
    def gopher_open(self, req):
        # XXX can raise socket.error
        import gopherlib  # this raises DeprecationWarning in 2.5
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        selector = req.get_selector()
        # Split off the gopher item type, then any query string.
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        # Gopher has no headers; supply an empty message object.
        return addinfourl(fp, noheaders(), req.get_full_url())
# Note: See TracBrowser for help on using the repository browser.