Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

urlparse.py@ 389

Last change on this file since 389 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 14.1 KB

Line
1	"""Parse (absolute and relative) URLs.
2
3	See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4	UC Irvine, June 1995.
5	"""
6
# Names exported by ``from urlparse import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)

# urljoin() only resolves a reference against the base when the scheme is
# listed here; otherwise it returns the reference unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# urlunsplit() emits '//' for these schemes even when netloc is empty, and
# urljoin() inherits the base netloc for them when the reference has none.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp']
# Not referenced by any of the functions visible in this file.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# urlparse() splits ';params' off the path only for these schemes.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# urlsplit() splits off '?query' only for these schemes (its fast path for
# a literal 'http' prefix splits the query without consulting this list).
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# urlsplit() splits off '#fragment' only for these schemes (same 'http'
# fast-path exception as for uses_query).
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
# urlsplit() accepts the text before the first ':' as a scheme only if every
# character of it appears in this string.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() empties the whole cache once it holds this many entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult previously computed by urlsplit() for those arguments.
_parse_cache = {}
37
def clear_cache():
    """Clear the parse cache.

    Empties the module-level ``_parse_cache`` dict in place, so every
    reference to that dict sees the cleared state.  Returns None.
    """
    _parse_cache.clear()
41
42
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Provides read-only ``username``, ``password``, ``hostname`` and ``port``
    properties derived from ``self.netloc``, which the concrete class must
    supply as a string of the form ``[user[:password]@]host[:port]``.
    """

    def _userinfo(self):
        """Return ``(username, password)`` taken from ``self.netloc``.

        Either element is None when the corresponding part is absent.
        """
        netloc = self.netloc
        if "@" not in netloc:
            return None, None
        # rsplit: only the last '@' separates user info from the host part.
        userinfo = netloc.rsplit("@", 1)[0]
        if ":" in userinfo:
            username, password = userinfo.split(":", 1)
            return username, password
        return userinfo, None

    def _hostinfo(self):
        """Return ``(host, port)`` as raw strings taken from ``self.netloc``.

        ``port`` is '' when no port is given.  A bracketed IPv6 literal such
        as ``[::1]:80`` is returned without its brackets, so the colons inside
        the address are not mistaken for the port separator.
        """
        hostport = self.netloc
        if "@" in hostport:
            hostport = hostport.rsplit("@", 1)[1]
        if hostport[:1] == "[" and "]" in hostport:
            host, _, tail = hostport[1:].partition("]")
            # Whatever follows ']' is a port only if introduced by ':'.
            if tail[:1] == ":":
                return host, tail[1:]
            return host, ""
        if ":" in hostport:
            host, port = hostport.split(":", 1)
            return host, port
        return hostport, ""

    @property
    def username(self):
        """User name from the netloc, or None if there is no '@' part."""
        return self._userinfo()[0]

    @property
    def password(self):
        """Password from the netloc, or None if none is given."""
        return self._userinfo()[1]

    @property
    def hostname(self):
        """Lower-cased host name from the netloc, or None if it is empty."""
        return self._hostinfo()[0].lower() or None

    @property
    def port(self):
        """Port from the netloc as an int, or None if absent or empty.

        Raises ValueError if the port text is not a base-10 integer.
        """
        port = self._hostinfo()[1]
        if port:
            return int(port, 10)
        # An empty port ("host:") means "no port given", not an error.
        return None
83
84	from collections import namedtuple
85
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-field named tuple (scheme, netloc, path, query, fragment).

    This is the type urlsplit() returns.  It also inherits the netloc
    accessors (username, password, hostname, port) from ResultMixin.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble the five fields into a URL string via urlunsplit()."""
        return urlunsplit(self)
92
93
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-field named tuple (scheme, netloc, path, params, query, fragment).

    This is the type urlparse() returns.  It also inherits the netloc
    accessors (username, password, hostname, port) from ResultMixin.
    """

    __slots__ = ()

    def geturl(self):
        """Reassemble the six fields into a URL string via urlunparse()."""
        return urlunparse(self)
100
101
def urlparse(url, scheme='', allow_fragments=True):
    """Break a URL into six parts, following the general layout
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a ParseResult, i.e. a 6-tuple
    (scheme, netloc, path, params, query, fragment).  The parts are not
    decomposed any further (netloc stays one string) and % escapes are
    left untouched.  'scheme' is the default used when the URL has none;
    'allow_fragments' is passed straight through to urlsplit()."""
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    # Only schemes listed in uses_params carry ';params' on the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
115
116	def _splitparams(url):
117	if '/' in url:
118	i = url.find(';', url.rfind('/'))
119	if i < 0:
120	return url, ''
121	else:
122	i = url.find(';')
123	return url[:i], url[i+1:]
124
125	def _splitnetloc(url, start=0):
126	delim = len(url) # position of end of domain part of url, default is end
127	for c in '/?#': # look for delimiters; the order is NOT important
128	wdelim = url.find(c, start) # find first of this delim
129	if wdelim >= 0: # if found
130	delim = min(delim, wdelim) # use earliest delim position
131	return url[start:delim], url[delim:] # return (domain, rest)
132
def urlsplit(url, scheme='', allow_fragments=True):
    """Break a URL into five parts, following the general layout
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a SplitResult, i.e. a 5-tuple
    (scheme, netloc, path, query, fragment).  The parts are not decomposed
    any further (netloc stays one string) and % escapes are left untouched.
    'scheme' is the default used when the URL carries none; a false
    'allow_fragments' leaves any '#...' attached to the preceding part.
    Results are memoised in the module-level parse cache."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix is the common case: it is accepted as a scheme
    # without the character check and always has its query and fragment
    # split off, without consulting uses_query / uses_fragment.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        if is_http or all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and '#' in url and (is_http or scheme in uses_fragment):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
176
def urlunparse(data):
    """Rebuild a URL string from a 6-item (scheme, netloc, path, params,
    query, fragment) sequence.

    The output can differ slightly from the URL that was originally parsed
    while still being equivalent, e.g. when the original had a redundant
    delimiter such as a '?' followed by an empty query (the draft states
    that these are equivalent)."""
    scheme, netloc, path, params, query, fragment = data
    # Non-empty params are re-attached to the path behind a ';'.
    if params:
        path_and_params = "%s;%s" % (path, params)
    else:
        path_and_params = path
    return urlunsplit((scheme, netloc, path_and_params, query, fragment))
186
def urlunsplit(data):
    """Rebuild a URL string from a 5-item (scheme, netloc, path, query,
    fragment) sequence.

    A '//netloc' part is written when netloc is non-empty, or when the
    scheme is listed in uses_netloc and the path does not already start
    with '//'.  Empty scheme, query and fragment parts are omitted together
    with their delimiters."""
    scheme, netloc, rest, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and rest[:2] != '//'):
        # A non-empty path must be absolute once it follows an authority.
        if rest and rest[:1] != '/':
            rest = '/' + rest
        rest = '//' + (netloc or '') + rest
    assembled = rest
    if scheme:
        assembled = scheme + ':' + assembled
    if query:
        assembled = assembled + '?' + query
    if fragment:
        assembled = assembled + '#' + fragment
    return assembled
199
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If 'base' is empty, 'url' is returned as is; if 'url' is empty, 'base'
    is returned as is.  'url' is also returned unchanged when its scheme
    differs from the base's or is not listed in uses_relative.
    'allow_fragments' is passed through to urlparse() for both arguments."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default scheme for the reference.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A reference with its own netloc replaces everything after the
        # scheme; otherwise the base's netloc is inherited.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path in the reference replaces the base path outright.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: reuse the base path, and fall back to the base's
        # params and query when the reference has none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            # Reference has params but no path: path was just set to bpath,
            # so this drops the last character of the base path.
            # NOTE(review): intent of the truncation is not evident here.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: drop the last segment of the base path and append the
    # reference's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment so the result ends in '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly remove the first 'name/..' pair, rescanning from the start
    # after each removal.  The first and last segments are never treated as
    # the '..' of a pair (i runs from 1 to len(segments) - 2).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            # Inner scan found nothing to remove: normalisation is done.
            break
    # Handle a '..' left in last position, which the loop above skips.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
256
def urldefrag(url):
    """Strip the fragment, if any, from 'url'.

    The result is a 2-tuple (url_without_fragment, fragment).  When 'url'
    contains no '#', it is returned unchanged and the fragment is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
270
# Local unquote() for parse_qs and parse_qsl.
# It cannot simply be imported from urllib: urllib itself uses functions
# from this module (urljoin), so that would create a circular reference.

# Two-hex-digit string -> character, for all-lowercase and all-uppercase
# spellings only (a mixed-case pair such as 'aB' is deliberately absent).
_hextochr = dict((fmt % i, chr(i))
                 for i in range(256)
                 for fmt in ('%02x', '%02X'))

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    pieces = s.split('%')
    # The text before the first '%' is never an escape.
    decoded = [pieces[0]]
    for chunk in pieces[1:]:
        try:
            decoded.append(_hextochr[chunk[:2]] + chunk[2:])
        except KeyError:
            # Not a recognised escape: put the '%' back untouched.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            decoded.append(unichr(int(chunk[:2], 16)) + chunk[2:])
    return "".join(decoded)
290
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, fields with a blank value are kept
        and reported with an empty-string value.  When false (the
        default), such fields are dropped as if they had not been
        present at all.

    strict_parsing: when true, a parsing error raises ValueError.
        When false (the default), errors are silently ignored.

    Returns a dict mapping each field name to the list of its values,
    in the order in which they occur in qs.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
316
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: when true, fields with a blank value are kept
        and reported with an empty-string value.  When false (the
        default), such fields are dropped as if they had not been
        present at all.

    strict_parsing: when true, a parsing error raises ValueError.
        When false (the default), errors are silently ignored.

    Fields are separated by '&' or ';'.  In both names and values '+' is
    turned into a space and % escapes are decoded.  Returns a list of
    2-tuples in the order in which the fields occur in qs.
    """
    result = []
    for amp_chunk in qs.split('&'):
        for field in amp_chunk.split(';'):
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign counts as a blank value.
                if not keep_blank_values:
                    continue
                parts.append('')
            if len(parts[1]) or keep_blank_values:
                name = unquote(parts[0].replace('+', ' '))
                value = unquote(parts[1].replace('+', ' '))
                result.append((name, value))

    return result
356
357
# Built-in data for test(), used when no file name is given on the command
# line.  The first word of the first non-blank line becomes the base URL.
# On every line the first word is joined against that base with urljoin();
# when the line has the form "ref = <URL:expected>", the joined result
# wrapped as '<URL:...>' is compared with the third word.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
391
def test():
    """Run the urljoin() self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from the built-in test_input
    when no argument is given.  The first word of the first non-blank line
    becomes the base URL.  For every non-blank line the parsed form of its
    first word and the result of joining it to the base are printed; for a
    line of the form "ref = <URL:expected>" a mismatch with the expected
    value is reported on an 'EXPECTED' line.
    """
    import sys
    base = ''
    opened = None  # set only for a file opened here, which must be closed
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # The first URL read serves as the base for all later ones.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % (words[2],))
    finally:
        # Never close sys.stdin or the in-memory buffer, only our own file.
        if opened is not None:
            opened.close()

if __name__ == '__main__':
    test()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/urlparse.py@ 389

Download in other formats: