Context Navigation

urlparse.py@ 611

Last change on this file since 611 was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 14.8 KB

Line
1	"""Parse (absolute and relative) URLs.
2
3	urlparse module is based upon the following RFC specifications.
4
5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6	and L. Masinter, January 2005.
7
8	RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
9	and L.Masinter, December 1999.
10
11	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
17	1995.
18
19	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20	McCahill, December 1994
21
22	RFC 3986 is considered the current standard and any future changes to
23	urlparse module should conform with it. The urlparse module is
24	currently not entirely compliant with this RFC due to de facto
25	scenarios for parsing, and for backward compatibility purposes, some
26	parsing quirks from older RFCs are retained. The testcases in
27	test_urlparse.py provide a good indicator of parsing behavior.
28
29	"""
30
31	import re
32
33	__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
34	"urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
35
36	# A classification of schemes ('' means apply by default)
37	uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
38	'wais', 'file', 'https', 'shttp', 'mms',
39	'prospero', 'rtsp', 'rtspu', '', 'sftp',
40	'svn', 'svn+ssh']
41	uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
42	'imap', 'wais', 'file', 'mms', 'https', 'shttp',
43	'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
44	'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
45	uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
46	'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
47	'mms', '', 'sftp', 'tel']
48
# Nothing in this module reads the three lists below any longer.  They are
# undocumented but carry public-looking names, so they are kept for
# backwards compatibility.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews',
    'sip', 'sips',
]
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]
58
# Every character that may appear in a scheme name: ASCII letters, digits
# and the three punctuation marks '+', '-' and '.'.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789+-.'
)
64
65	MAX_CACHE_SIZE = 20
66	_parse_cache = {}
67
68	def clear_cache():
69	"""Clear the parse cache."""
70	_parse_cache.clear()
71
72
class ResultMixin(object):
    """Shared accessors that take apart the ``netloc`` of a parsed URL.

    The class it is mixed into must expose a ``netloc`` string attribute of
    the general form ``[user[:password]@]host[:port]``, where ``host`` may
    be a bracketed IPv6 literal such as ``[::1]``.
    """

    @property
    def username(self):
        """User name in front of the last '@', or None without an '@'."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                # Everything after the first ':' is the password.
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password after the first ':' of the user info, or None."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when the netloc is empty.

        For a bracketed IPv6 literal the brackets are stripped.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # "[::1]:80" -> "::1"
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        """Port number as an int, or None.

        None is returned when no port is given, when the port is empty
        (``"host:"``) or when it lies outside 0..65535.  A non-numeric
        port makes int() raise ValueError.
        """
        # Drop the user info and any bracketed IPv6 host first, so that the
        # only ':' left is the one introducing the port.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            # An empty port is legal and simply means "not specified";
            # int('') would raise ValueError.
            if port:
                port = int(port, 10)
                # verify legal port
                if 0 <= port <= 65535:
                    return port
        return None
117
118	from collections import namedtuple
119
class SplitResult(
        namedtuple('SplitResult',
                   ['scheme', 'netloc', 'path', 'query', 'fragment']),
        ResultMixin):
    """Five-field named tuple produced by urlsplit()."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunsplit()."""
        return urlunsplit(self)
126
127
class ParseResult(
        namedtuple('ParseResult',
                   ['scheme', 'netloc', 'path', 'params', 'query',
                    'fragment']),
        ResultMixin):
    """Six-field named tuple produced by urlparse()."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the fields into a URL string via urlunparse()."""
        return urlunparse(self)
134
135
def urlparse(url, scheme='', allow_fragments=True):
    """Split a URL into six components.

    The general shape is
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is a ParseResult
    (scheme, netloc, path, params, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Parameters are only split off for schemes listed in uses_params.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
149
150	def _splitparams(url):
151	if '/' in url:
152	i = url.find(';', url.rfind('/'))
153	if i < 0:
154	return url, ''
155	else:
156	i = url.find(';')
157	return url[:i], url[i+1:]
158
159	def _splitnetloc(url, start=0):
160	delim = len(url) # position of end of domain part of url, default is end
161	for c in '/?#': # look for delimiters; the order is NOT important
162	wdelim = url.find(c, start) # find first of this delim
163	if wdelim >= 0: # if found
164	delim = min(delim, wdelim) # use earliest delim position
165	return url[start:delim], url[delim:] # return (domain, rest)
166
def urlsplit(url, scheme='', allow_fragments=True):
    """Split a URL into five components.

    The general shape is <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is a SplitResult
    (scheme, netloc, path, query, fragment).

    Components are not broken down any further (netloc stays one string)
    and % escapes are not expanded.  *scheme* is the value reported when
    the URL does not name one.  With a false *allow_fragments*, a '#' is
    left in place instead of starting the fragment.

    Results are memoised in _parse_cache.  ValueError("Invalid IPv6 URL")
    is raised when the netloc contains only one of '[' and ']'.
    """
    allow_fragments = bool(allow_fragments)
    # The key records the argument types as well as their values.
    cache_key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        remainder = url[colon + 1:]
        if prefix == 'http':
            # Common case: accepted straight away, without the
            # port-number check applied to other schemes below.
            scheme, url = prefix.lower(), remainder
        elif all(ch in scheme_chars for ch in prefix):
            # "name:1234" is a path containing a port-like number, not a
            # scheme followed by a path, so digits-only remainders are
            # left alone.
            if not remainder or any(ch not in '0123456789'
                                    for ch in remainder):
                scheme, url = prefix.lower(), remainder

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        # Exactly one of the two brackets present means a broken IPv6
        # literal.
        if ('[' in netloc) != (']' in netloc):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
221
def urlunparse(data):
    """Rebuild a URL string from the six items returned by urlparse().

    The outcome can differ slightly from the text that was parsed while
    still being equivalent, for instance when the original carried a
    redundant delimiter such as a '?' followed by an empty query.
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        # Re-attach the parameters to the path; urlunsplit() does the rest.
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
231
def urlunsplit(data):
    """Rebuild a URL string from the five items returned by urlsplit().

    *data* may be any iterable yielding exactly five items.  The outcome
    can differ slightly from the text that was parsed while still being
    equivalent, for instance when the original carried an unnecessary
    delimiter such as a '?' followed by an empty query.
    """
    scheme, netloc, path, query, fragment = data
    # A "//" authority part is written when there is a netloc, or when the
    # scheme is one of uses_netloc and the path does not already begin
    # with "//".
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and path[:2] != '//')
    if wants_authority:
        if path and path[:1] != '/':
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
249
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *allow_fragments* is passed through to urlparse()
    for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default for a reference that names none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that is not in uses_relative, means the
    # reference is returned exactly as given.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own netloc: nothing is taken from
            # the base beyond the (identical) scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Path starting with '/': used as is, no merging with bpath.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Neither path nor params given: take both from the base, and the
        # base's query too when the reference has none.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: the base path minus its last segment, followed by the
    # segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        # Replace a trailing '.' by an empty segment, so the joined path
        # ends with '/'.
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete a '..' together with the segment before it,
    # restarting the scan after every deletion.  Only indices
    # 1 .. len(segments) - 2 are examined; a '..' in the last position is
    # dealt with after the loop.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            # A full scan deleted nothing: done.
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # Final '..': replace it and the segment before it by one empty
        # segment.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
301
def urldefrag(url):
    """Strip the fragment, if any, from *url*.

    Returns a ``(url_without_fragment, fragment)`` pair.  When *url* holds
    no '#', it is returned untouched together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
315
316	try:
317	unicode
318	except NameError:
319	def _is_unicode(x):
320	return 0
321	else:
322	def _is_unicode(x):
323	return isinstance(x, unicode)
324
325	# unquote method for parse_qs and parse_qsl
326	# Cannot use directly from urllib as it would create a circular reference
327	# because urllib uses urlparse methods (urljoin). If you update this function,
328	# update it also in urllib. This code duplication does not exist in Python3.
329
330	_hexdig = '0123456789ABCDEFabcdef'
331	_hextochr = dict((a+b, chr(int(a+b,16)))
332	for a in _hexdig for b in _hexdig)
333	_asciire = re.compile('([\x00-\x7f]+)')
334
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%' followed by two hex digits is replaced by the character it
    encodes; a '%' without two hex digits after it is kept literally.
    """
    if _is_unicode(s):
        if '%' not in s:
            return s
        # Splitting on a capturing pattern puts the ASCII runs at the odd
        # indices.  Only those are unquoted (as byte strings, then decoded
        # as latin1); the pieces in between are copied through.
        pieces = _asciire.split(s)
        out = [pieces[0]]
        for idx in range(1, len(pieces), 2):
            out.append(unquote(str(pieces[idx])).decode('latin1'))
            out.append(pieces[idx + 1])
        return ''.join(out)

    chunks = s.split('%')
    # fastpath: nothing to decode
    if len(chunks) == 1:
        return s
    out = [chunks[0]]
    for chunk in chunks[1:]:
        decoded = _hextochr.get(chunk[:2])
        if decoded is None:
            # Not a valid escape: put the '%' back and keep the text.
            out.append('%')
            out.append(chunk)
        else:
            out.append(decoded)
            out.append(chunk[2:])
    return ''.join(out)
362
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values, in
    the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
388
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples in the order the fields appear.
    Fields are separated by '&' or ';'.  In names and values '+' is turned
    into a space before % escapes are decoded.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise: valid on Python 2 and Python 3 alike.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/urlparse.py@ 611

Download in other formats: