Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

urlparse.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 14.8 KB

Rev	Line
[2]	1	"""Parse (absolute and relative) URLs.
	2
[391]	3	urlparse module is based upon the following RFC specifications.
	4
	5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
	6	and L. Masinter, January 2005.
	7
	8	RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
	9	and L.Masinter, December 1999.
	10
	11	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
	12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
	13
	14	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
	15
	16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
	17	1995.
	18
	19	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
	20	McCahill, December 1994
	21
	22	RFC 3986 is considered the current standard and any future changes to
	23	urlparse module should conform with it. The urlparse module is
	24	currently not entirely compliant with this RFC due to de facto
	25	scenarios for parsing, and for backward compatibility purposes, some
	26	parsing quirks from older RFCs are retained. The test cases in
	27	test_urlparse.py provide a good indicator of parsing behavior.
	28
[2]	29	"""
	30
[391]	31	import re
	32
# Names exported by "from urlparse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
# Schemes for which urljoin() resolves a relative reference against the base;
# for any other scheme urljoin() returns the second argument unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes consulted by urljoin() and urlunsplit() when deciding whether a
# "//netloc" part belongs in the URL.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# urlsplit() only accepts the text before the first ':' as a scheme when
# every character of it appears in this string.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Memo of urlsplit() results, keyed by
# (url, scheme, allow_fragments, type(url), type(scheme)).
_parse_cache = {}
	67
def clear_cache():
    """Clear the parse cache.

    Empties the module-level _parse_cache dict in place.  urlsplit() calls
    this itself when the cache has reached MAX_CACHE_SIZE entries.
    """
    _parse_cache.clear()
	71
	72
	73	class ResultMixin(object):
	74	"""Shared methods for the parsed result objects."""
	75
	76	@property
	77	def username(self):
	78	netloc = self.netloc
	79	if "@" in netloc:
	80	userinfo = netloc.rsplit("@", 1)[0]
	81	if ":" in userinfo:
	82	userinfo = userinfo.split(":", 1)[0]
	83	return userinfo
	84	return None
	85
	86	@property
	87	def password(self):
	88	netloc = self.netloc
	89	if "@" in netloc:
	90	userinfo = netloc.rsplit("@", 1)[0]
	91	if ":" in userinfo:
	92	return userinfo.split(":", 1)[1]
	93	return None
	94
	95	@property
	96	def hostname(self):
[391]	97	netloc = self.netloc.split('@')[-1]
	98	if '[' in netloc and ']' in netloc:
	99	return netloc.split(']')[0][1:].lower()
	100	elif ':' in netloc:
	101	return netloc.split(':')[0].lower()
	102	elif netloc == '':
	103	return None
	104	else:
	105	return netloc.lower()
[2]	106
	107	@property
	108	def port(self):
[391]	109	netloc = self.netloc.split('@')[-1].split(']')[-1]
	110	if ':' in netloc:
	111	port = netloc.split(':')[1]
	112	port = int(port, 10)
	113	# verify legal port
	114	if (0 <= port <= 65535):
	115	return port
[2]	116	return None
	117
	118	from collections import namedtuple
	119
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """Five-field named tuple built by urlsplit().

    The fields are scheme, netloc, path, query and fragment; ResultMixin
    supplies the username, password, hostname and port properties.
    """

    # No per-instance attributes beyond the tuple fields.
    __slots__ = ()

    def geturl(self):
        """Return the URL string obtained by passing this tuple to urlunsplit()."""
        return urlunsplit(self)
	126
	127
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """Six-field named tuple built by urlparse().

    The fields are scheme, netloc, path, params, query and fragment;
    ResultMixin supplies the username, password, hostname and port
    properties.
    """

    # No per-instance attributes beyond the tuple fields.
    __slots__ = ()

    def geturl(self):
        """Return the URL string obtained by passing this tuple to urlunparse()."""
        return urlunparse(self)
	134
	135
	136	def urlparse(url, scheme='', allow_fragments=True):
	137	"""Parse a URL into 6 components:
	138	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
	139	Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
	140	Note that we don't break the components up in smaller bits
	141	(e.g. netloc is a single string) and we don't expand % escapes."""
	142	tuple = urlsplit(url, scheme, allow_fragments)
	143	scheme, netloc, url, query, fragment = tuple
	144	if scheme in uses_params and ';' in url:
	145	url, params = _splitparams(url)
	146	else:
	147	params = ''
	148	return ParseResult(scheme, netloc, url, params, query, fragment)
	149
	150	def _splitparams(url):
	151	if '/' in url:
	152	i = url.find(';', url.rfind('/'))
	153	if i < 0:
	154	return url, ''
	155	else:
	156	i = url.find(';')
	157	return url[:i], url[i+1:]
	158
	159	def _splitnetloc(url, start=0):
	160	delim = len(url) # position of end of domain part of url, default is end
	161	for c in '/?#': # look for delimiters; the order is NOT important
	162	wdelim = url.find(c, start) # find first of this delim
	163	if wdelim >= 0: # if found
	164	delim = min(delim, wdelim) # use earliest delim position
	165	return url[start:delim], url[delim:] # return (domain, rest)
	166
	167	def urlsplit(url, scheme='', allow_fragments=True):
	168	"""Parse a URL into 5 components:
	169	<scheme>://<netloc>/<path>?<query>#<fragment>
	170	Return a 5-tuple: (scheme, netloc, path, query, fragment).
	171	Note that we don't break the components up in smaller bits
	172	(e.g. netloc is a single string) and we don't expand % escapes."""
	173	allow_fragments = bool(allow_fragments)
	174	key = url, scheme, allow_fragments, type(url), type(scheme)
	175	cached = _parse_cache.get(key, None)
	176	if cached:
	177	return cached
	178	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
	179	clear_cache()
	180	netloc = query = fragment = ''
	181	i = url.find(':')
	182	if i > 0:
	183	if url[:i] == 'http': # optimize the common case
	184	scheme = url[:i].lower()
	185	url = url[i+1:]
	186	if url[:2] == '//':
	187	netloc, url = _splitnetloc(url, 2)
[391]	188	if (('[' in netloc and ']' not in netloc) or
	189	(']' in netloc and '[' not in netloc)):
	190	raise ValueError("Invalid IPv6 URL")
[2]	191	if allow_fragments and '#' in url:
	192	url, fragment = url.split('#', 1)
	193	if '?' in url:
	194	url, query = url.split('?', 1)
	195	v = SplitResult(scheme, netloc, url, query, fragment)
	196	_parse_cache[key] = v
	197	return v
	198	for c in url[:i]:
	199	if c not in scheme_chars:
	200	break
	201	else:
[391]	202	# make sure "url" is not actually a port number (in which case
	203	# "scheme" is really part of the path)
	204	rest = url[i+1:]
	205	if not rest or any(c not in '0123456789' for c in rest):
	206	# not a port number
	207	scheme, url = url[:i].lower(), rest
[2]	208
	209	if url[:2] == '//':
	210	netloc, url = _splitnetloc(url, 2)
[391]	211	if (('[' in netloc and ']' not in netloc) or
	212	(']' in netloc and '[' not in netloc)):
	213	raise ValueError("Invalid IPv6 URL")
	214	if allow_fragments and '#' in url:
[2]	215	url, fragment = url.split('#', 1)
[391]	216	if '?' in url:
[2]	217	url, query = url.split('?', 1)
	218	v = SplitResult(scheme, netloc, url, query, fragment)
	219	_parse_cache[key] = v
	220	return v
	221
	222	def urlunparse(data):
	223	"""Put a parsed URL back together again. This may result in a
	224	slightly different, but equivalent URL, if the URL that was parsed
	225	originally had redundant delimiters, e.g. a ? with an empty query
	226	(the draft states that these are equivalent)."""
	227	scheme, netloc, url, params, query, fragment = data
	228	if params:
	229	url = "%s;%s" % (url, params)
	230	return urlunsplit((scheme, netloc, url, query, fragment))
	231
	232	def urlunsplit(data):
[391]	233	"""Combine the elements of a tuple as returned by urlsplit() into a
	234	complete URL as a string. The data argument can be any five-item iterable.
	235	This may result in a slightly different, but equivalent URL, if the URL that
	236	was parsed originally had unnecessary delimiters (for example, a ? with an
	237	empty query; the RFC states that these are equivalent)."""
[2]	238	scheme, netloc, url, query, fragment = data
	239	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
	240	if url and url[:1] != '/': url = '/' + url
	241	url = '//' + (netloc or '') + url
	242	if scheme:
	243	url = scheme + ':' + url
	244	if query:
	245	url = url + '?' + query
	246	if fragment:
	247	url = url + '#' + fragment
	248	return url
	249
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    base: the URL to resolve against; if empty, url is returned as is.
    url: the reference to resolve; if empty, base is returned as is.
    allow_fragments: passed through to urlparse() for both arguments.

    Returns the joined URL as a string.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default for the reference, so a reference
    # without a scheme compares equal to the base below.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that does not support relative
    # resolution: the reference is returned untouched.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc; nothing is inherited
            # from the base beyond the scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, with the netloc chosen above.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Reference is only a query and/or fragment: reuse the base path
        # and params, and the base query too when none was given.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # A trailing '.' becomes '' so the joined path ends with '/'.
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first "<name>/.." pair found, restarting the
    # scan after every removal.  Only indices 1 .. len-2 are inspected, so
    # a '..' in the last position is left for the checks after the loop.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            # Full scan without a removal: nothing left to collapse.
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # Trailing "<name>/.." collapses to a single empty segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
	301
	302	def urldefrag(url):
	303	"""Removes any existing fragment from URL.
	304
	305	Returns a tuple of the defragmented URL and the fragment. If
	306	the URL contained no fragments, the second element is the
	307	empty string.
	308	"""
	309	if '#' in url:
	310	s, n, p, a, q, frag = urlparse(url)
	311	defrag = urlunparse((s, n, p, a, q, ''))
	312	return defrag, frag
	313	else:
	314	return url, ''
	315
# Define _is_unicode() according to whether this interpreter has a
# "unicode" builtin: evaluating the bare name raises NameError when it
# does not.
try:
    unicode
except NameError:
    def _is_unicode(x):
        # No unicode type available: nothing can be an instance of it.
        return 0
else:
    def _is_unicode(x):
        # True only for instances of the unicode type.
        return isinstance(x, unicode)
	324
[2]	325	# unquote method for parse_qs and parse_qsl
[391]	326	# Cannot use directly from urllib as it would create a circular reference
	327	# because urllib uses urlparse methods (urljoin). If you update this function,
	328	# update it also in urllib. This code duplication does not exist in Python 3.
[2]	329
# Hexadecimal digits, in both cases.
_hexdig = '0123456789ABCDEFabcdef'
# Lookup table from each two-hex-digit string ('41', 'e9', 'E9', ...) to the
# single character with that code; unquote() uses it to decode %XX escapes.
_hextochr = dict((a+b, chr(int(a+b,16)))
                 for a in _hexdig for b in _hexdig)
# Matches (and, thanks to the group, keeps when splitting) runs of ASCII
# characters; unquote() uses it to split unicode input.
_asciire = re.compile('([\x00-\x7f]+)')
[2]	334
	335	def unquote(s):
	336	"""unquote('abc%20def') -> 'abc def'."""
[391]	337	if _is_unicode(s):
	338	if '%' not in s:
	339	return s
	340	bits = _asciire.split(s)
	341	res = [bits[0]]
	342	append = res.append
	343	for i in range(1, len(bits), 2):
	344	append(unquote(str(bits[i])).decode('latin1'))
	345	append(bits[i + 1])
	346	return ''.join(res)
	347
	348	bits = s.split('%')
	349	# fastpath
	350	if len(bits) == 1:
	351	return s
	352	res = [bits[0]]
	353	append = res.append
	354	for item in bits[1:]:
[2]	355	try:
[391]	356	append(_hextochr[item[:2]])
	357	append(item[2:])
[2]	358	except KeyError:
[391]	359	append('%')
	360	append(item)
	361	return ''.join(res)
[2]	362
	363	def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
	364	"""Parse a query given as a string argument.
	365
	366	Arguments:
	367
[391]	368	qs: percent-encoded query string to be parsed
[2]	369
	370	keep_blank_values: flag indicating whether blank values in
[391]	371	percent-encoded queries should be treated as blank strings.
[2]	372	A true value indicates that blanks should be retained as
	373	blank strings. The default false value indicates that
	374	blank values are to be ignored and treated as if they were
	375	not included.
	376
	377	strict_parsing: flag indicating what to do with parsing errors.
	378	If false (the default), errors are silently ignored.
	379	If true, errors raise a ValueError exception.
	380	"""
	381	dict = {}
	382	for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
	383	if name in dict:
	384	dict[name].append(value)
	385	else:
	386	dict[name] = [value]
	387	return dict
	388
	389	def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
	390	"""Parse a query given as a string argument.
	391
	392	Arguments:
	393
[391]	394	qs: percent-encoded query string to be parsed
[2]	395
	396	keep_blank_values: flag indicating whether blank values in
[391]	397	percent-encoded queries should be treated as blank strings. A
[2]	398	true value indicates that blanks should be retained as blank
	399	strings. The default false value indicates that blank values
	400	are to be ignored and treated as if they were not included.
	401
	402	strict_parsing: flag indicating what to do with parsing errors. If
	403	false (the default), errors are silently ignored. If true,
	404	errors raise a ValueError exception.
	405
	406	Returns a list, as G-d intended.
	407	"""
	408	pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
	409	r = []
	410	for name_value in pairs:
	411	if not name_value and not strict_parsing:
	412	continue
	413	nv = name_value.split('=', 1)
	414	if len(nv) != 2:
	415	if strict_parsing:
	416	raise ValueError, "bad query field: %r" % (name_value,)
	417	# Handle case of a control-name with no equal sign
	418	if keep_blank_values:
	419	nv.append('')
	420	else:
	421	continue
	422	if len(nv[1]) or keep_blank_values:
	423	name = unquote(nv[0].replace('+', ' '))
	424	value = unquote(nv[1].replace('+', ' '))
	425	r.append((name, value))
	426
	427	return r

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/urlparse.py

Download in other formats: