Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

urllib2.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 50.7 KB

Rev	Line
[2]	1	"""An extensible library for opening URLs using a variety of protocols
	2
	3	The simplest way to use this module is to call the urlopen function,
	4	which accepts a string containing a URL or a Request object (described
	5	below). It opens the URL and returns the results as a file-like
	6	object; the returned object has some extra methods described below.
	7
	8	The OpenerDirector manages a collection of Handler objects that do
	9	all the actual work. Each Handler implements a particular protocol or
	10	option. The OpenerDirector is a composite object that invokes the
	11	Handlers needed to open the requested URL. For example, the
	12	HTTPHandler performs HTTP GET and POST requests and deals with
	13	non-error returns. The HTTPRedirectHandler automatically deals with
	14	HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
	15	deals with digest authentication.
	16
	17	urlopen(url, data=None) -- Basic usage is the same as original
	18	urllib. pass the url and optionally data to post to an HTTP URL, and
	19	get a file-like object back. One difference is that you can also pass
	20	a Request instance instead of URL. Raises a URLError (subclass of
	21	IOError); for HTTP errors, raises an HTTPError, which can also be
	22	treated as a valid response.
	23
	24	build_opener -- Function that creates a new OpenerDirector instance.
	25	Will install the default handlers. Accepts one or more Handlers as
	26	arguments, either instances or Handler classes that it will
	27	instantiate. If one of the arguments is a subclass of the default
	28	handler, the argument will be installed instead of the default.
	29
	30	install_opener -- Installs a new opener as the default opener.
	31
	32	objects of interest:
	33
	34	OpenerDirector -- Sets up the User Agent as the Python-urllib client and manages
	35	the Handler classes, while dealing with requests and responses.
	36
	37	Request -- An object that encapsulates the state of a request. The
	38	state can be as simple as the URL. It can also include extra HTTP
	39	headers, e.g. a User-Agent.
	40
	41	BaseHandler --
	42
	43	exceptions:
	44	URLError -- A subclass of IOError, individual protocols have their own
	45	specific subclass.
	46
	47	HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
	48	as an exceptional event or valid response.
	49
	50	internals:
	51	BaseHandler and parent
	52	_call_chain conventions
	53
	54	Example usage:
	55
	56	import urllib2
	57
	58	# set up authentication info
	59	authinfo = urllib2.HTTPBasicAuthHandler()
	60	authinfo.add_password(realm='PDQ Application',
	61	uri='https://mahler:8092/site-updates.py',
	62	user='klem',
	63	passwd='geheim$parole')
	64
	65	proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
	66
	67	# build a new opener that adds authentication and caching FTP handlers
	68	opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
	69
	70	# install it
	71	urllib2.install_opener(opener)
	72
	73	f = urllib2.urlopen('http://www.python.org/')
	74
	75
	76	"""
	77
	78	# XXX issues:
	79	# If an authentication error handler tries to perform
	80	# authentication for some reason but fails, how should the error be
	81	# signalled? The client needs to know the HTTP error code. But if
	82	# the handler knows what the problem was, e.g., that it didn't know
	83	# the hash algo that was requested in the challenge, it would be good to
	84	# pass that information along to the client, too.
	85	# ftp errors aren't handled cleanly
	86	# check digest against correct (i.e. non-apache) implementation
	87
	88	# Possible extensions:
	89	# complex proxies XXX not sure what exactly was meant by this
	90	# abstract factory for opener
	91
	92	import base64
	93	import hashlib
	94	import httplib
	95	import mimetools
	96	import os
	97	import posixpath
	98	import random
	99	import re
	100	import socket
	101	import sys
	102	import time
	103	import urlparse
	104	import bisect
[391]	105	import warnings
[2]	106
	107	try:
	108	from cStringIO import StringIO
	109	except ImportError:
	110	from StringIO import StringIO
	111
	112	from urllib import (unwrap, unquote, splittype, splithost, quote,
[391]	113	addinfourl, splitport, splittag, toBytes,
[2]	114	splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
	115
	116	# support for FileHandler, proxies via environment variables
	117	from urllib import localhost, url2pathname, getproxies, proxy_bypass
	118
	119	# used in User-Agent header sent
	120	__version__ = sys.version[:3]
	121
	122	_opener = None
	123	def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
	124	global _opener
	125	if _opener is None:
	126	_opener = build_opener()
	127	return _opener.open(url, data, timeout)
	128
	129	def install_opener(opener):
	130	global _opener
	131	_opener = opener
	132
	133	# do these error classes make sense?
	134	# make sure all of the IOError stuff is overridden. we just want to be
	135	# subtypes.
	136
	137	class URLError(IOError):
	138	# URLError is a sub-type of IOError, but it doesn't share any of
	139	# the implementation. need to override __init__ and __str__.
	140	# It sets self.args for compatibility with other EnvironmentError
	141	# subclasses, but args doesn't have the typical format with errno in
	142	# slot 0 and strerror in slot 1. This may be better than nothing.
	143	def __init__(self, reason):
	144	self.args = reason,
	145	self.reason = reason
	146
	147	def __str__(self):
	148	return '<urlopen error %s>' % self.reason
	149
	150	class HTTPError(URLError, addinfourl):
	151	"""Raised when HTTP error occurs, but also acts like non-error return"""
	152	__super_init = addinfourl.__init__
	153
	154	def __init__(self, url, code, msg, hdrs, fp):
	155	self.code = code
	156	self.msg = msg
	157	self.hdrs = hdrs
	158	self.fp = fp
	159	self.filename = url
	160	# The addinfourl classes depend on fp being a valid file
	161	# object. In some cases, the HTTPError may not have a valid
	162	# file object. If this happens, the simplest workaround is to
	163	# not initialize the base classes.
	164	if fp is not None:
	165	self.__super_init(fp, hdrs, url, code)
	166
	167	def __str__(self):
	168	return 'HTTP Error %s: %s' % (self.code, self.msg)
	169
[391]	170	# since URLError specifies a .reason attribute, HTTPError should also
	171	# provide this attribute. See issue13211 fo discussion.
	172	@property
	173	def reason(self):
	174	return self.msg
	175
	176	def info(self):
	177	return self.hdrs
	178
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of the request's
    full URL; when that is empty the request's "Host" header is used
    instead.  A trailing ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return _cut_port_re.sub("", netloc, 1).lower()
	196
class Request:
    """Encapsulate the state of a single URL request.

    Holds the URL (with any '#fragment' split off), optional request
    data, two header dictionaries (normal headers and headers that are
    not to be carried over to a redirected request) and the
    origin-request-host / unverifiable values.  The URL is decomposed
    lazily: type, host and selector are only computed when the matching
    get_* method is called, or indirectly through __getattr__.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # headers is only read here (never mutated), so the shared default
        # dict is not modified by this method.
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.__original, self.__fragment = splittag(self.__original)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self._tunnel_host = None
        self.data = data
        self.headers = {}
        # go through add_header() so every key is stored capitalized
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order.  this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # '_Request__r_' is the name-mangled prefix of self.__r_<name>; when
        # such an attribute is missing, the matching get_<name>() is called
        # to populate it and the lookup is retried.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def get_method(self):
        """Return "POST" when the request carries data, otherwise "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        """Replace the request data."""
        self.data = data

    def has_data(self):
        """Return True when data is not None."""
        return self.data is not None

    def get_data(self):
        """Return the request data (may be None)."""
        return self.data

    def get_full_url(self):
        """Return the URL, with the '#fragment' re-attached if there is one."""
        if self.__fragment:
            return '%s#%s' % (self.__original, self.__fragment)
        else:
            return self.__original

    def get_type(self):
        """Return the URL type, computing and caching it on first use.

        Raises ValueError when splittype() finds no type in the URL.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        """Return the unquoted host, computing and caching it on first use."""
        if self.host is None:
            # self.__r_type may be filled in on demand by __getattr__
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what is left of the URL after the host has been split off."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Redirect this request to the proxy at host.

        For an 'https' request with no tunnel host recorded yet, the current
        host is remembered in _tunnel_host and the type is left alone.
        Otherwise the type is replaced and the selector becomes the whole
        original URL.  In both cases self.host becomes the proxy host.
        """
        if self.type == 'https' and not self._tunnel_host:
            self._tunnel_host = self.host
        else:
            self.type = type
            self.__r_host = self.__original

        self.host = host

    def has_proxy(self):
        """Return True when the selector equals the original URL (the state
        set_proxy() leaves behind in its non-tunnel branch)."""
        return self.__r_host == self.__original

    def get_origin_req_host(self):
        """Return the stored origin_req_host value."""
        return self.origin_req_host

    def is_unverifiable(self):
        """Return the stored unverifiable flag."""
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        # the key is stored in str.capitalize() form
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        """Return True when header_name is a key of either header dict.

        The name is looked up exactly as given, while keys are stored
        capitalized by add_header()/add_unredirected_header().
        """
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        """Return the header value, preferring the normal headers over the
        unredirected ones, or default when neither has header_name."""
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        """Return (name, value) pairs of both header dicts merged; on a name
        clash the normal header wins."""
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
	310
	311	class OpenerDirector:
	312	def __init__(self):
	313	client_version = "Python-urllib/%s" % __version__
	314	self.addheaders = [('User-agent', client_version)]
[391]	315	# self.handlers is retained only for backward compatibility
	316	self.handlers = []
[2]	317	# manage the individual handlers
	318	self.handle_open = {}
	319	self.handle_error = {}
	320	self.process_response = {}
	321	self.process_request = {}
	322
	323	def add_handler(self, handler):
	324	if not hasattr(handler, "add_parent"):
	325	raise TypeError("expected BaseHandler instance, got %r" %
	326	type(handler))
	327
	328	added = False
	329	for meth in dir(handler):
	330	if meth in ["redirect_request", "do_open", "proxy_open"]:
	331	# oops, coincidental match
	332	continue
	333
	334	i = meth.find("_")
	335	protocol = meth[:i]
	336	condition = meth[i+1:]
	337
	338	if condition.startswith("error"):
	339	j = condition.find("_") + i + 1
	340	kind = meth[j+1:]
	341	try:
	342	kind = int(kind)
	343	except ValueError:
	344	pass
	345	lookup = self.handle_error.get(protocol, {})
	346	self.handle_error[protocol] = lookup
	347	elif condition == "open":
	348	kind = protocol
	349	lookup = self.handle_open
	350	elif condition == "response":
	351	kind = protocol
	352	lookup = self.process_response
	353	elif condition == "request":
	354	kind = protocol
	355	lookup = self.process_request
	356	else:
	357	continue
	358
	359	handlers = lookup.setdefault(kind, [])
	360	if handlers:
	361	bisect.insort(handlers, handler)
	362	else:
	363	handlers.append(handler)
	364	added = True
	365
	366	if added:
	367	bisect.insort(self.handlers, handler)
	368	handler.add_parent(self)
	369
	370	def close(self):
	371	# Only exists for backwards compatibility.
	372	pass
	373
	374	def _call_chain(self, chain, kind, meth_name, *args):
	375	# Handlers raise an exception if no one else should try to handle
	376	# the request, or return None if they can't but another handler
	377	# could. Otherwise, they return the response.
	378	handlers = chain.get(kind, ())
	379	for handler in handlers:
	380	func = getattr(handler, meth_name)
	381
	382	result = func(*args)
	383	if result is not None:
	384	return result
	385
	386	def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
	387	# accept a URL or a Request object
	388	if isinstance(fullurl, basestring):
	389	req = Request(fullurl, data)
	390	else:
	391	req = fullurl
	392	if data is not None:
	393	req.add_data(data)
	394
	395	req.timeout = timeout
	396	protocol = req.get_type()
	397
	398	# pre-process request
	399	meth_name = protocol+"_request"
	400	for processor in self.process_request.get(protocol, []):
	401	meth = getattr(processor, meth_name)
	402	req = meth(req)
	403
	404	response = self._open(req, data)
	405
	406	# post-process response
	407	meth_name = protocol+"_response"
	408	for processor in self.process_response.get(protocol, []):
	409	meth = getattr(processor, meth_name)
	410	response = meth(req, response)
	411
	412	return response
	413
	414	def _open(self, req, data=None):
	415	result = self._call_chain(self.handle_open, 'default',
	416	'default_open', req)
	417	if result:
	418	return result
	419
	420	protocol = req.get_type()
	421	result = self._call_chain(self.handle_open, protocol, protocol +
	422	'_open', req)
	423	if result:
	424	return result
	425
	426	return self._call_chain(self.handle_open, 'unknown',
	427	'unknown_open', req)
	428
	429	def error(self, proto, *args):
	430	if proto in ('http', 'https'):
	431	# XXX http[s] protocols are special-cased
	432	dict = self.handle_error['http'] # https is not different than http
	433	proto = args[2] # YUCK!
	434	meth_name = 'http_error_%s' % proto
	435	http_err = 1
	436	orig_args = args
	437	else:
	438	dict = self.handle_error
	439	meth_name = proto + '_error'
	440	http_err = 0
	441	args = (dict, proto, meth_name) + args
	442	result = self._call_chain(*args)
	443	if result:
	444	return result
	445
	446	if http_err:
	447	args = (dict, 'default', 'http_error_default') + orig_args
	448	return self._call_chain(*args)
	449
	450	# XXX probably also want an abstract factory that knows when it makes
	451	# sense to skip a superclass in favor of a subclass and when it might
	452	# make sense to include both
	453
	454	def build_opener(*handlers):
	455	"""Create an opener object from a list of handlers.
	456
	457	The opener will use several default handlers, including support
	458	for HTTP, FTP and when applicable, HTTPS.
	459
	460	If any of the handlers passed as arguments are subclasses of the
	461	default handlers, the default handlers will not be used.
	462	"""
	463	import types
	464	def isclass(obj):
[391]	465	return isinstance(obj, (types.ClassType, type))
[2]	466
	467	opener = OpenerDirector()
	468	default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
	469	HTTPDefaultErrorHandler, HTTPRedirectHandler,
	470	FTPHandler, FileHandler, HTTPErrorProcessor]
	471	if hasattr(httplib, 'HTTPS'):
	472	default_classes.append(HTTPSHandler)
	473	skip = set()
	474	for klass in default_classes:
	475	for check in handlers:
	476	if isclass(check):
	477	if issubclass(check, klass):
	478	skip.add(klass)
	479	elif isinstance(check, klass):
	480	skip.add(klass)
	481	for klass in skip:
	482	default_classes.remove(klass)
	483
	484	for klass in default_classes:
	485	opener.add_handler(klass())
	486
	487	for h in handlers:
	488	if isclass(h):
	489	h = h()
	490	opener.add_handler(h)
	491	return opener
	492
	493	class BaseHandler:
	494	handler_order = 500
	495
	496	def add_parent(self, parent):
	497	self.parent = parent
	498
	499	def close(self):
	500	# Only exists for backwards compatibility
	501	pass
	502
	503	def __lt__(self, other):
	504	if not hasattr(other, "handler_order"):
	505	# Try to preserve the old behavior of having custom classes
	506	# inserted after default ones (works only for custom user
	507	# classes which are not aware of handler_order).
	508	return True
	509	return self.handler_order < other.handler_order
	510
	511
	512	class HTTPErrorProcessor(BaseHandler):
	513	"""Process HTTP error responses."""
	514	handler_order = 1000 # after all other processing
	515
	516	def http_response(self, request, response):
	517	code, msg, hdrs = response.code, response.msg, response.info()
	518
	519	# According to RFC 2616, "2xx" code indicates that the client's
	520	# request was successfully received, understood, and accepted.
	521	if not (200 <= code < 300):
	522	response = self.parent.error(
	523	'http', request, response, code, msg, hdrs)
	524
	525	return response
	526
	527	https_response = http_response
	528
	529	class HTTPDefaultErrorHandler(BaseHandler):
	530	def http_error_default(self, req, fp, code, msg, hdrs):
	531	raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
	532
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects.

    Loop protection is kept in a redirect_dict attribute that is shared
    between the original request and every request derived from it.
    """
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            # The new Request is built without data, so the headers that
            # describe a request body are not copied over.
            newheaders = dict((k,v) for k,v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type")
                             )
            return Request(newurl,
                           headers=newheaders,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a redirect response (also bound to 301, 303 and 307).

        Returns None when the response has neither a 'location' nor a
        'uri' header or when redirect_request() declines; raises HTTPError
        for a disallowed target scheme or a detected redirect loop;
        otherwise returns the result of opening the new request.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse.urlparse(newurl)
        if not urlparts.path:
            urlparts = list(urlparts)
            urlparts[2] = "/"
        newurl = urlparse.urlunparse(urlparts)

        # resolve a relative redirect target against the current URL
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # For security reasons we do not allow redirects to protocols
        # other than HTTP, HTTPS or FTP.
        newurl_lower = newurl.lower()
        if not (newurl_lower.startswith('http://') or
                newurl_lower.startswith('https://') or
                newurl_lower.startswith('ftp://')):
            raise HTTPError(newurl, code,
                            msg + " - Redirection to url '%s' is not allowed" %
                            newurl,
                            headers, fp)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            # the same dict object is shared by the old and new request
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    # prefix of the HTTPError message raised when a redirect loop is detected
    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
	636
	637
	638	def _parse_proxy(proxy):
	639	"""Return (scheme, user, password, host/port) given a URL or an authority.
	640
	641	If a URL is supplied, it must have an authority (host:port) component.
	642	According to RFC 3986, having an authority component means the URL must
	643	have two slashes after the scheme:
	644
	645	>>> _parse_proxy('file:/ftp.example.com/')
	646	Traceback (most recent call last):
	647	ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
	648
	649	The first three items of the returned tuple may be None.
	650
	651	Examples of authority parsing:
	652
	653	>>> _parse_proxy('proxy.example.com')
	654	(None, None, None, 'proxy.example.com')
	655	>>> _parse_proxy('proxy.example.com:3128')
	656	(None, None, None, 'proxy.example.com:3128')
	657
	658	The authority component may optionally include userinfo (assumed to be
	659	username:password):
	660
	661	>>> _parse_proxy('joe:password@proxy.example.com')
	662	(None, 'joe', 'password', 'proxy.example.com')
	663	>>> _parse_proxy('joe:password@proxy.example.com:3128')
	664	(None, 'joe', 'password', 'proxy.example.com:3128')
	665
	666	Same examples, but with URLs instead:
	667
	668	>>> _parse_proxy('http://proxy.example.com/')
	669	('http', None, None, 'proxy.example.com')
	670	>>> _parse_proxy('http://proxy.example.com:3128/')
	671	('http', None, None, 'proxy.example.com:3128')
	672	>>> _parse_proxy('http://joe:password@proxy.example.com/')
	673	('http', 'joe', 'password', 'proxy.example.com')
	674	>>> _parse_proxy('http://joe:password@proxy.example.com:3128')
	675	('http', 'joe', 'password', 'proxy.example.com:3128')
	676
	677	Everything after the authority is ignored:
	678
	679	>>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
	680	('ftp', 'joe', 'password', 'proxy.example.com')
	681
	682	Test for no trailing '/' case:
	683
	684	>>> _parse_proxy('http://joe:password@proxy.example.com')
	685	('http', 'joe', 'password', 'proxy.example.com')
	686
	687	"""
	688	scheme, r_scheme = splittype(proxy)
	689	if not r_scheme.startswith("/"):
	690	# authority
	691	scheme = None
	692	authority = proxy
	693	else:
	694	# URL
	695	if not r_scheme.startswith("//"):
	696	raise ValueError("proxy URL with no authority: %r" % proxy)
	697	# We have an authority, so for RFC 3986-compliant URLs (by ss 3.
	698	# and 3.3.), path is empty or starts with '/'
	699	end = r_scheme.find("/", 2)
	700	if end == -1:
	701	end = None
	702	authority = r_scheme[2:end]
	703	userinfo, hostport = splituser(authority)
	704	if userinfo is not None:
	705	user, password = splitpasswd(userinfo)
	706	else:
	707	user = password = None
	708	return scheme, user, password, hostport
	709
	710	class ProxyHandler(BaseHandler):
	711	# Proxies must be in front
	712	handler_order = 100
	713
	714	def __init__(self, proxies=None):
	715	if proxies is None:
	716	proxies = getproxies()
	717	assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
	718	self.proxies = proxies
	719	for type, url in proxies.items():
	720	setattr(self, '%s_open' % type,
	721	lambda r, proxy=url, type=type, meth=self.proxy_open: \
	722	meth(r, proxy, type))
	723
	724	def proxy_open(self, req, proxy, type):
	725	orig_type = req.get_type()
	726	proxy_type, user, password, hostport = _parse_proxy(proxy)
	727
	728	if proxy_type is None:
	729	proxy_type = orig_type
	730
	731	if req.host and proxy_bypass(req.host):
	732	return None
	733
	734	if user and password:
	735	user_pass = '%s:%s' % (unquote(user), unquote(password))
	736	creds = base64.b64encode(user_pass).strip()
	737	req.add_header('Proxy-authorization', 'Basic ' + creds)
	738	hostport = unquote(hostport)
	739	req.set_proxy(hostport, proxy_type)
	740
	741	if orig_type == proxy_type or orig_type == 'https':
	742	# let other handlers take care of it
	743	return None
	744	else:
	745	# need to start over, because the other handlers don't
	746	# grok the proxy's URL type
	747	# e.g. if we have a constructor arg proxies like so:
	748	# {'http': 'ftp://proxy.example.com'}, we may end up turning
	749	# a request for http://acme.example.com/a into one for
	750	# ftp://proxy.example.com/a
	751	return self.parent.open(req, timeout=req.timeout)
	752
	753	class HTTPPasswordMgr:
	754
	755	def __init__(self):
	756	self.passwd = {}
	757
	758	def add_password(self, realm, uri, user, passwd):
	759	# uri could be a single URI or a sequence
	760	if isinstance(uri, basestring):
	761	uri = [uri]
	762	if not realm in self.passwd:
	763	self.passwd[realm] = {}
	764	for default_port in True, False:
	765	reduced_uri = tuple(
	766	[self.reduce_uri(u, default_port) for u in uri])
	767	self.passwd[realm][reduced_uri] = (user, passwd)
	768
	769	def find_user_password(self, realm, authuri):
	770	domains = self.passwd.get(realm, {})
	771	for default_port in True, False:
	772	reduced_authuri = self.reduce_uri(authuri, default_port)
	773	for uris, authinfo in domains.iteritems():
	774	for uri in uris:
	775	if self.is_suburi(uri, reduced_authuri):
	776	return authinfo
	777	return None, None
	778
	779	def reduce_uri(self, uri, default_port=True):
	780	"""Accept authority or URI and extract only the authority and path."""
	781	# note HTTP URLs do not have a userinfo component
	782	parts = urlparse.urlsplit(uri)
	783	if parts[1]:
	784	# URI
	785	scheme = parts[0]
	786	authority = parts[1]
	787	path = parts[2] or '/'
	788	else:
	789	# host or host:port
	790	scheme = None
	791	authority = uri
	792	path = '/'
	793	host, port = splitport(authority)
	794	if default_port and port is None and scheme is not None:
	795	dport = {"http": 80,
	796	"https": 443,
	797	}.get(scheme)
	798	if dport is not None:
	799	authority = "%s:%d" % (host, dport)
	800	return authority, path
	801
	802	def is_suburi(self, base, test):
	803	"""Check if test is below base in a URI tree
	804
	805	Both args must be URIs in reduced form.
	806	"""
	807	if base == test:
	808	return True
	809	if base[0] != test[0]:
	810	return False
	811	common = posixpath.commonprefix((base[1], test[1]))
	812	if len(common) == len(base[1]):
	813	return True
	814	return False
	815
	816
	817	class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
	818
	819	def find_user_password(self, realm, authuri):
	820	user, password = HTTPPasswordMgr.find_user_password(self, realm,
	821	authuri)
	822	if user is not None:
	823	return user, password
	824	return HTTPPasswordMgr.find_user_password(self, None, authuri)
	825
	826
	827	class AbstractBasicAuthHandler:
	828
	829	# XXX this allows for multiple auth-schemes, but will stupidly pick
	830	# the last one with a realm specified.
	831
	832	# allow for double- and single-quoted realm values
	833	# (single quotes are a violation of the RFC, but appear in the wild)
	834	rx = re.compile('(?:.,)[ \t]*([^ \t]+)[ \t]+'
[391]	835	'realm=(["\']?)([^"\']*)\\2', re.I)
[2]	836
	837	# XXX could pre-emptively send auth info already accepted (RFC 2617,
	838	# end of section 2, and section 1.2 immediately after "credentials"
	839	# production).
	840
	841	def __init__(self, password_mgr=None):
	842	if password_mgr is None:
	843	password_mgr = HTTPPasswordMgr()
	844	self.passwd = password_mgr
	845	self.add_password = self.passwd.add_password
[391]	846	self.retried = 0
[2]	847
[391]	848	def reset_retry_count(self):
	849	self.retried = 0
	850
[2]	851	def http_error_auth_reqed(self, authreq, host, req, headers):
	852	# host may be an authority (without userinfo) or a URL with an
	853	# authority
	854	# XXX could be multiple headers
	855	authreq = headers.get(authreq, None)
[391]	856
	857	if self.retried > 5:
	858	# retry sending the username:password 5 times before failing.
	859	raise HTTPError(req.get_full_url(), 401, "basic auth failed",
	860	headers, None)
	861	else:
	862	self.retried += 1
	863
[2]	864	if authreq:
	865	mo = AbstractBasicAuthHandler.rx.search(authreq)
	866	if mo:
	867	scheme, quote, realm = mo.groups()
[391]	868	if quote not in ['"', "'"]:
	869	warnings.warn("Basic Auth Realm was unquoted",
	870	UserWarning, 2)
[2]	871	if scheme.lower() == 'basic':
[391]	872	response = self.retry_http_basic_auth(host, req, realm)
	873	if response and response.code != 401:
	874	self.retried = 0
	875	return response
[2]	876
	877	def retry_http_basic_auth(self, host, req, realm):
	878	user, pw = self.passwd.find_user_password(realm, host)
	879	if pw is not None:
	880	raw = "%s:%s" % (user, pw)
	881	auth = 'Basic %s' % base64.b64encode(raw).strip()
	882	if req.headers.get(self.auth_header, None) == auth:
	883	return None
	884	req.add_unredirected_header(self.auth_header, auth)
	885	return self.parent.open(req, timeout=req.timeout)
	886	else:
	887	return None
	888
	889
	890	class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
	891
	892	auth_header = 'Authorization'
	893
	894	def http_error_401(self, req, fp, code, msg, headers):
	895	url = req.get_full_url()
[391]	896	response = self.http_error_auth_reqed('www-authenticate',
	897	url, req, headers)
	898	self.reset_retry_count()
	899	return response
[2]	900
	901
	902	class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
	903
	904	auth_header = 'Proxy-authorization'
	905
	906	def http_error_407(self, req, fp, code, msg, headers):
	907	# http_error_auth_reqed requires that there is no userinfo component in
	908	# authority. Assume there isn't one, since urllib2 does not (and
	909	# should not, RFC 3986 s. 3.2.1) support requests for URLs containing
	910	# userinfo.
	911	authority = req.get_host()
[391]	912	response = self.http_error_auth_reqed('proxy-authenticate',
[2]	913	authority, req, headers)
[391]	914	self.reset_retry_count()
	915	return response
[2]	916
	917
	918	def randombytes(n):
	919	"""Return n random bytes."""
	920	# Use /dev/urandom if it is available. Fall back to random module
	921	# if not. It might be worthwhile to extend this function to use
	922	# other platform-specific mechanisms for getting random bytes.
	923	if os.path.exists("/dev/urandom"):
	924	f = open("/dev/urandom")
	925	s = f.read(n)
	926	f.close()
	927	return s
	928	else:
	929	L = [chr(random.randrange(0, 256)) for i in range(n)]
	930	return "".join(L)
	931
	932	class AbstractDigestAuthHandler:
	933	# Digest authentication is specified in RFC 2617.
	934
	935	# XXX The client does not inspect the Authentication-Info header
	936	# in a successful response.
	937
	938	# XXX It should be possible to test this implementation against
	939	# a mock server that just generates a static set of challenges.
	940
	941	# XXX qop="auth-int" supports is shaky
	942
	943	def __init__(self, passwd=None):
	944	if passwd is None:
	945	passwd = HTTPPasswordMgr()
	946	self.passwd = passwd
	947	self.add_password = self.passwd.add_password
	948	self.retried = 0
	949	self.nonce_count = 0
	950	self.last_nonce = None
	951
	952	def reset_retry_count(self):
	953	self.retried = 0
	954
	955	def http_error_auth_reqed(self, auth_header, host, req, headers):
	956	authreq = headers.get(auth_header, None)
	957	if self.retried > 5:
	958	# Don't fail endlessly - if we failed once, we'll probably
	959	# fail a second time. Hm. Unless the Password Manager is
	960	# prompting for the information. Crap. This isn't great
	961	# but it's better than the current 'repeat until recursion
	962	# depth exceeded' approach <wink>
	963	raise HTTPError(req.get_full_url(), 401, "digest auth failed",
	964	headers, None)
	965	else:
	966	self.retried += 1
	967	if authreq:
	968	scheme = authreq.split()[0]
	969	if scheme.lower() == 'digest':
	970	return self.retry_http_digest_auth(req, authreq)
	971
	972	def retry_http_digest_auth(self, req, auth):
	973	token, challenge = auth.split(' ', 1)
	974	chal = parse_keqv_list(parse_http_list(challenge))
	975	auth = self.get_authorization(req, chal)
	976	if auth:
	977	auth_val = 'Digest %s' % auth
	978	if req.headers.get(self.auth_header, None) == auth_val:
	979	return None
	980	req.add_unredirected_header(self.auth_header, auth_val)
	981	resp = self.parent.open(req, timeout=req.timeout)
	982	return resp
	983
	984	def get_cnonce(self, nonce):
	985	# The cnonce-value is an opaque
	986	# quoted string value provided by the client and used by both client
	987	# and server to avoid chosen plaintext attacks, to provide mutual
	988	# authentication, and to provide some message integrity protection.
	989	# This isn't a fabulous effort, but it's probably Good Enough.
	990	dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
	991	randombytes(8))).hexdigest()
	992	return dig[:16]
	993
	994	def get_authorization(self, req, chal):
	995	try:
	996	realm = chal['realm']
	997	nonce = chal['nonce']
	998	qop = chal.get('qop')
	999	algorithm = chal.get('algorithm', 'MD5')
	1000	# mod_digest doesn't send an opaque, even though it isn't
	1001	# supposed to be optional
	1002	opaque = chal.get('opaque', None)
	1003	except KeyError:
	1004	return None
	1005
	1006	H, KD = self.get_algorithm_impls(algorithm)
	1007	if H is None:
	1008	return None
	1009
	1010	user, pw = self.passwd.find_user_password(realm, req.get_full_url())
	1011	if user is None:
	1012	return None
	1013
	1014	# XXX not implemented yet
	1015	if req.has_data():
	1016	entdig = self.get_entity_digest(req.get_data(), chal)
	1017	else:
	1018	entdig = None
	1019
	1020	A1 = "%s:%s:%s" % (user, realm, pw)
	1021	A2 = "%s:%s" % (req.get_method(),
	1022	# XXX selector: what about proxies and full urls
	1023	req.get_selector())
	1024	if qop == 'auth':
	1025	if nonce == self.last_nonce:
	1026	self.nonce_count += 1
	1027	else:
	1028	self.nonce_count = 1
	1029	self.last_nonce = nonce
	1030
	1031	ncvalue = '%08x' % self.nonce_count
	1032	cnonce = self.get_cnonce(nonce)
	1033	noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
	1034	respdig = KD(H(A1), noncebit)
	1035	elif qop is None:
	1036	respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
	1037	else:
	1038	# XXX handle auth-int.
	1039	raise URLError("qop '%s' is not supported." % qop)
	1040
	1041	# XXX should the partial digests be encoded too?
	1042
	1043	base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
	1044	'response="%s"' % (user, realm, nonce, req.get_selector(),
	1045	respdig)
	1046	if opaque:
	1047	base += ', opaque="%s"' % opaque
	1048	if entdig:
	1049	base += ', digest="%s"' % entdig
	1050	base += ', algorithm="%s"' % algorithm
	1051	if qop:
	1052	base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
	1053	return base
	1054
	1055	def get_algorithm_impls(self, algorithm):
	1056	# algorithm should be case-insensitive according to RFC2617
	1057	algorithm = algorithm.upper()
	1058	# lambdas assume digest modules are imported at the top level
	1059	if algorithm == 'MD5':
	1060	H = lambda x: hashlib.md5(x).hexdigest()
	1061	elif algorithm == 'SHA':
	1062	H = lambda x: hashlib.sha1(x).hexdigest()
	1063	# XXX MD5-sess
	1064	KD = lambda s, d: H("%s:%s" % (s, d))
	1065	return H, KD
	1066
	1067	def get_entity_digest(self, data, chal):
	1068	# XXX not implemented yet
	1069	return None
	1070
	1071
	1072	class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
	1073	"""An authentication protocol defined by RFC 2069
	1074
	1075	Digest authentication improves on basic authentication because it
	1076	does not transmit passwords in the clear.
	1077	"""
	1078
	1079	auth_header = 'Authorization'
	1080	handler_order = 490 # before Basic auth
	1081
	1082	def http_error_401(self, req, fp, code, msg, headers):
	1083	host = urlparse.urlparse(req.get_full_url())[1]
	1084	retry = self.http_error_auth_reqed('www-authenticate',
	1085	host, req, headers)
	1086	self.reset_retry_count()
	1087	return retry
	1088
	1089
	1090	class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
	1091
	1092	auth_header = 'Proxy-Authorization'
	1093	handler_order = 490 # before Basic auth
	1094
	1095	def http_error_407(self, req, fp, code, msg, headers):
	1096	host = req.get_host()
	1097	retry = self.http_error_auth_reqed('proxy-authenticate',
	1098	host, req, headers)
	1099	self.reset_retry_count()
	1100	return retry
	1101
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and connection logic for HTTP handlers.

    Subclasses supply the connection class to do_open() and expose
    do_request_ under a protocol-specific name.
    """

    def __init__(self, debuglevel=0):
        """Remember the debug level later applied to each connection."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level applied to connections opened from now on."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        Adds Content-type and Content-length when the request has data,
        a Host header, and every header from self.parent.addheaders,
        each only if the request does not already carry it.  Raises
        URLError when the request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # When a proxy is in use, take the Host value from the selector
        # rather than from request.get_host().
        sel_host = host
        if request.has_proxy():
            scheme, sel = splittype(request.get_selector())
            sel_host, sel_path = splithost(sel)

        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        Raises URLError when the request has no host or when sending the
        request fails with socket.error.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers take precedence over regular ones that
        # share the same key.
        headers = dict(req.unredirected_hdrs)
        headers.update(dict((k, v) for k, v in req.headers.items()
                            if k not in headers))

        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())

        if req._tunnel_host:
            tunnel_headers = {}
            proxy_auth_hdr = "Proxy-Authorization"
            if proxy_auth_hdr in headers:
                tunnel_headers[proxy_auth_hdr] = headers[proxy_auth_hdr]
                # Proxy-Authorization should not be sent to origin
                # server.
                del headers[proxy_auth_hdr]
            h.set_tunnel(req._tunnel_host, headers=tunnel_headers)

        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
        except socket.error, err: # XXX what error?
            # Close the connection before reporting the failure.
            h.close()
            raise URLError(err)
        else:
            try:
                r = h.getresponse(buffering=True)
            except TypeError: # buffering kw not supported
                r = h.getresponse()

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
	1209
	1210
	1211	class HTTPHandler(AbstractHTTPHandler):
	1212
	1213	def http_open(self, req):
	1214	return self.do_open(httplib.HTTPConnection, req)
	1215
	1216	http_request = AbstractHTTPHandler.do_request_
	1217
	1218	if hasattr(httplib, 'HTTPS'):
	1219	class HTTPSHandler(AbstractHTTPHandler):
	1220
	1221	def https_open(self, req):
	1222	return self.do_open(httplib.HTTPSConnection, req)
	1223
	1224	https_request = AbstractHTTPHandler.do_request_
	1225
	1226	class HTTPCookieProcessor(BaseHandler):
	1227	def __init__(self, cookiejar=None):
	1228	import cookielib
	1229	if cookiejar is None:
	1230	cookiejar = cookielib.CookieJar()
	1231	self.cookiejar = cookiejar
	1232
	1233	def http_request(self, request):
	1234	self.cookiejar.add_cookie_header(request)
	1235	return request
	1236
	1237	def http_response(self, request, response):
	1238	self.cookiejar.extract_cookies(response, request)
	1239	return response
	1240
	1241	https_request = http_request
	1242	https_response = http_response
	1243
	1244	class UnknownHandler(BaseHandler):
	1245	def unknown_open(self, req):
	1246	type = req.get_type()
	1247	raise URLError('unknown url type: %s' % type)
	1248
	1249	def parse_keqv_list(l):
	1250	"""Parse list of key=value strings where keys are not duplicated."""
	1251	parsed = {}
	1252	for elt in l:
	1253	k, v = elt.split('=', 1)
	1254	if v[0] == '"' and v[-1] == '"':
	1255	v = v[1:-1]
	1256	parsed[k] = v
	1257	return parsed
	1258
	1259	def parse_http_list(s):
	1260	"""Parse lists as described by RFC 2068 Section 2.
	1261
	1262	In particular, parse comma-separated lists where the elements of
	1263	the list may include quoted-strings. A quoted-string could
	1264	contain a comma. A non-quoted string could have quotes in the
	1265	middle. Neither commas nor quotes count if they are escaped.
	1266	Only double-quotes count, not single-quotes.
	1267	"""
	1268	res = []
	1269	part = ''
	1270
	1271	escape = quote = False
	1272	for cur in s:
	1273	if escape:
	1274	part += cur
	1275	escape = False
	1276	continue
	1277	if quote:
	1278	if cur == '\\':
	1279	escape = True
	1280	continue
	1281	elif cur == '"':
	1282	quote = False
	1283	part += cur
	1284	continue
	1285
	1286	if cur == ',':
	1287	res.append(part)
	1288	part = ''
	1289	continue
	1290
	1291	if cur == '"':
	1292	quote = True
	1293
	1294	part += cur
	1295
	1296	# append last part
	1297	if part:
	1298	res.append(part)
	1299
	1300	return [part.strip() for part in res]
	1301
[391]	1302	def _safe_gethostbyname(host):
	1303	try:
	1304	return socket.gethostbyname(host)
	1305	except socket.gaierror:
	1306	return None
	1307
class FileHandler(BaseHandler):
    """Open file: URLs, serving local files and handing others to FTP."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as an ftp request.

        A selector of the form //host/... whose host is set and is not
        'localhost' has its type changed to 'ftp' and is re-opened through
        the parent opener; everything else goes to open_local_file().
        """
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/' and (req.host and
                req.host != 'localhost'):
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    # (cached on the class, so the lookup in get_names runs at most once)
    names = None
    def get_names(self):
        """Return the tuple of addresses regarded as the local host.

        Combines the address lists for 'localhost' and for this machine's
        host name; if that lookup raises gaierror, only the address of
        'localhost' is used.
        """
        if FileHandler.names is None:
            try:
                FileHandler.names = tuple(
                    socket.gethostbyname_ex('localhost')[2] +
                    socket.gethostbyname_ex(socket.gethostname())[2])
            except socket.gaierror:
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The response headers carry a guessed Content-type (default
        'text/plain'), the file size and its modification time.  Raises
        URLError when os.stat or open fails with OSError, or when the URL
        names a host that is not this machine (or includes a port).
        """
        import email.utils
        import mimetypes
        host = req.get_host()
        filename = req.get_selector()
        # YD hack: add again drive name
        # NOTE(review): len(host) fails if get_host() can return None on
        # OS/2 -- confirm against Request.get_host().
        if os.name == 'os2' and len(host)>1 and host[1] == ':':
            filename = host + filename
            host = ""
        localfile = url2pathname(filename)
        try:
            stats = os.stat(localfile)
            size = stats.st_size
            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
            mtype = mimetypes.guess_type(filename)[0]
            headers = mimetools.Message(StringIO(
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (mtype or 'text/plain', size, modified)))
            if host:
                host, port = splitport(host)
            # Serve the file only when no host was given, or the host
            # (without a port) resolves to one of the local addresses.
            if not host or \
                (not port and _safe_gethostbyname(host) in self.get_names()):
                if host:
                    origurl = 'file://' + host + filename
                else:
                    origurl = 'file://' + filename
                return addinfourl(open(localfile, 'rb'), headers, origurl)
        except OSError, msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
	1363
	1364	class FTPHandler(BaseHandler):
	1365	def ftp_open(self, req):
	1366	import ftplib
	1367	import mimetypes
	1368	host = req.get_host()
	1369	if not host:
	1370	raise URLError('ftp error: no host given')
	1371	host, port = splitport(host)
	1372	if port is None:
	1373	port = ftplib.FTP_PORT
	1374	else:
	1375	port = int(port)
	1376
	1377	# username/password handling
	1378	user, host = splituser(host)
	1379	if user:
	1380	user, passwd = splitpasswd(user)
	1381	else:
	1382	passwd = None
	1383	host = unquote(host)
[391]	1384	user = user or ''
	1385	passwd = passwd or ''
[2]	1386
	1387	try:
	1388	host = socket.gethostbyname(host)
	1389	except socket.error, msg:
	1390	raise URLError(msg)
	1391	path, attrs = splitattr(req.get_selector())
	1392	dirs = path.split('/')
	1393	dirs = map(unquote, dirs)
	1394	dirs, file = dirs[:-1], dirs[-1]
	1395	if dirs and not dirs[0]:
	1396	dirs = dirs[1:]
	1397	try:
	1398	fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
	1399	type = file and 'I' or 'D'
	1400	for attr in attrs:
	1401	attr, value = splitvalue(attr)
	1402	if attr.lower() == 'type' and \
	1403	value in ('a', 'A', 'i', 'I', 'd', 'D'):
	1404	type = value.upper()
	1405	fp, retrlen = fw.retrfile(file, type)
	1406	headers = ""
	1407	mtype = mimetypes.guess_type(req.get_full_url())[0]
	1408	if mtype:
	1409	headers += "Content-type: %s\n" % mtype
	1410	if retrlen is not None and retrlen >= 0:
	1411	headers += "Content-length: %d\n" % retrlen
	1412	sf = StringIO(headers)
	1413	headers = mimetools.Message(sf)
	1414	return addinfourl(fp, headers, req.get_full_url())
	1415	except ftplib.all_errors, msg:
	1416	raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
	1417
	1418	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
[391]	1419	fw = ftpwrapper(user, passwd, host, port, dirs, timeout,
	1420	persistent=False)
[2]	1421	## fw.ftp.set_debuglevel(1)
	1422	return fw
	1423
	1424	class CacheFTPHandler(FTPHandler):
	1425	# XXX would be nice to have pluggable cache strategies
	1426	# XXX this stuff is definitely not thread safe
	1427	def __init__(self):
	1428	self.cache = {}
	1429	self.timeout = {}
	1430	self.soonest = 0
	1431	self.delay = 60
	1432	self.max_conns = 16
	1433
	1434	def setTimeout(self, t):
	1435	self.delay = t
	1436
	1437	def setMaxConns(self, m):
	1438	self.max_conns = m
	1439
	1440	def connect_ftp(self, user, passwd, host, port, dirs, timeout):
	1441	key = user, host, port, '/'.join(dirs), timeout
	1442	if key in self.cache:
	1443	self.timeout[key] = time.time() + self.delay
	1444	else:
	1445	self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
	1446	self.timeout[key] = time.time() + self.delay
	1447	self.check_cache()
	1448	return self.cache[key]
	1449
	1450	def check_cache(self):
	1451	# first check for old ones
	1452	t = time.time()
	1453	if self.soonest <= t:
	1454	for k, v in self.timeout.items():
	1455	if v < t:
	1456	self.cache[k].close()
	1457	del self.cache[k]
	1458	del self.timeout[k]
	1459	self.soonest = min(self.timeout.values())
	1460
	1461	# then check the size
	1462	if len(self.cache) == self.max_conns:
	1463	for k, v in self.timeout.items():
	1464	if v == self.soonest:
	1465	del self.cache[k]
	1466	del self.timeout[k]
	1467	break
	1468	self.soonest = min(self.timeout.values())
[391]	1469
	1470	def clear_cache(self):
	1471	for conn in self.cache.values():
	1472	conn.close()
	1473	self.cache.clear()
	1474	self.timeout.clear()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/urllib2.py

Download in other formats: