source: python/vendor/Python-2.6.5/Lib/urlparse.py

Last change on this file was 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

  • Property svn:eol-style set to native
File size: 14.1 KB
Line 
1"""Parse (absolute and relative) URLs.
2
3See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4UC Irvine, June 1995.
5"""
6
# Public names exported by ``from urlparse import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative reference against the
# base; for any other scheme urljoin() returns the reference unchanged.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
# Schemes that take a '//netloc' part.  Consulted by urlunsplit() when
# deciding whether to emit '//' and by urljoin() when inheriting the
# base's netloc.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp']
# Not referenced by any function in this module.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# Schemes whose path may carry ';params'; consulted by urlparse().
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes for which urlsplit() splits off a '?query'.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache.  Once the cache holds
# MAX_CACHE_SIZE entries it is emptied completely before the next insert.
MAX_CACHE_SIZE = 20
_parse_cache = {}
37
def clear_cache():
    """Clear the parse cache.

    Empties the module-level ``_parse_cache`` dict in place.
    """
    _parse_cache.clear()
41
42
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Every property reads ``self.netloc`` and picks one piece out of the
    ``[user[:password]@]host[:port]`` layout.  The host may be a
    bracketed IPv6 literal such as ``[::1]``.
    """

    @property
    def username(self):
        """User name from the netloc, or None if there is no '@'."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the netloc, or None if none was given."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None if the host part is empty.

        The brackets around an IPv6 literal are stripped, so
        '[::1]:80' yields '::1'.
        """
        hostport = self.netloc.rsplit("@", 1)[-1]
        if hostport[:1] == "[" and "]" in hostport:
            # The colons inside the brackets belong to the address, not
            # to a port separator.
            host = hostport[1:hostport.index("]")]
        else:
            host = hostport.split(":", 1)[0]
        return host.lower() or None

    @property
    def port(self):
        """Port number as an int, or None if absent or empty.

        Raises ValueError if the port is present but not a decimal
        number.
        """
        hostport = self.netloc.rsplit("@", 1)[-1]
        if hostport[:1] == "[" and "]" in hostport:
            # Skip the bracketed IPv6 literal before looking for ':port'.
            hostport = hostport[hostport.index("]") + 1:]
        if ":" in hostport:
            port = hostport.split(":", 1)[1]
            # 'host:' carries no port at all; do not feed '' to int().
            if port:
                return int(port, 10)
        return None
83
84from collections import namedtuple
85
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query',
                              'fragment']),
                  ResultMixin):
    """Named 5-tuple built by urlsplit(), with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string with urlunsplit()."""
        return urlunsplit(self)
92
93
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params',
                              'query', 'fragment']),
                  ResultMixin):
    """Named 6-tuple built by urlparse(), with the ResultMixin accessors."""

    __slots__ = ()

    def geturl(self):
        """Reassemble the components into a URL string with urlunparse()."""
        return urlunparse(self)
100
101
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The URL is first split with urlsplit(); the params are then
    separated from the path when the scheme is listed in uses_params.

    Return a ParseResult 6-tuple:
    (scheme, netloc, path, params, query, fragment).
    The components are not broken up any further (e.g. netloc stays a
    single string) and % escapes are not expanded."""
    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
115
def _splitparams(url):
    """Split a path into ``(path, params)``.

    Only a ';' in the last path segment (at or after the final '/')
    starts the params.  If there is no such ';', the path is returned
    unchanged together with an empty params string.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    # find() returns -1 when nothing matched; slicing with -1 would
    # silently chop the last character off the path.
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
124
def _splitnetloc(url, start=0):
    """Split *url* into ``(netloc, rest)``.

    The netloc runs from index *start* up to the earliest '/', '?' or
    '#' found at or after *start*, or to the end of *url* if none of
    them occurs.  *rest* begins with that delimiter.
    """
    candidates = [url.find(ch, start) for ch in '/?#']
    # Drop the -1 "not found" markers; the end of the string is always
    # a valid fallback boundary.
    end = min([pos for pos in candidates if pos >= 0] + [len(url)])
    return url[start:end], url[end:]
132
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    *scheme* is the default used when the URL itself names none.  If
    *allow_fragments* is false, a '#' is left in place instead of
    starting a fragment.

    Return a SplitResult 5-tuple:
    (scheme, netloc, path, query, fragment).
    The components are not broken up any further (e.g. netloc stays a
    single string) and % escapes are not expanded.  Results are
    memoized in _parse_cache."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key -- presumably so that str
    # and unicode inputs which compare equal get separate entries.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    hit = _parse_cache.get(key, None)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: flush everything before inserting.
        clear_cache()

    netloc = query = fragment = ''
    # A literal lower-case 'http' prefix is the common case: it skips
    # the character scan and the uses_fragment / uses_query lookups.
    plain_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        plain_http = prefix == 'http'
        if plain_http or all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if (allow_fragments and (plain_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (plain_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result
176
def urlunparse(data):
    """Put a parsed URL back together again.

    *data* is a 6-item sequence
    (scheme, netloc, path, params, query, fragment).  Non-empty params
    are re-attached to the path with ';' and the rest is delegated to
    urlunsplit().

    The result may be slightly different from, but equivalent to, the
    URL that was originally parsed if that URL had redundant
    delimiters, e.g. a ? with an empty query (the draft states that
    these are equivalent)."""
    scheme, netloc, path, params, query, fragment = data
    path_with_params = path
    if params:
        path_with_params = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path_with_params, query, fragment))
186
def urlunsplit(data):
    """Combine a 5-item sequence
    (scheme, netloc, path, query, fragment) into a URL string.

    A '//netloc' part is emitted when netloc is non-empty, or when the
    scheme is listed in uses_netloc and the path does not already start
    with '//'.  Empty scheme, query and fragment are omitted together
    with their delimiters."""
    scheme, netloc, url, query, fragment = data
    wants_authority = netloc or (scheme and scheme in uses_netloc
                                 and url[:2] != '//')
    if wants_authority:
        # A non-empty path following an authority must be absolute.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
199
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    Arguments:

        base: the URL that *url* is resolved against.  If it is empty,
            *url* is returned unchanged.

        url: the reference to resolve.  If it is empty, *base* is
            returned unchanged.

        allow_fragments: passed through to urlparse() for both URLs.

    Returns the joined URL as a string.  *url* itself is returned when
    its scheme differs from the base's scheme or is not listed in
    uses_relative.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default for a reference that names none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own authority: nothing is
            # inherited from the base beyond the scheme.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it, only the netloc comes from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Nothing but an optional query and/or fragment: reuse the
        # base's path and params, and its query too unless overridden.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # A reference with params but an empty path (e.g. ';x') is merged
    # like any other relative path, so it replaces the base's whole
    # last segment rather than just its final character.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel one '<name>/..' pair, rescanning from the start
    # after each removal, until no interior pair is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # The scan above never looks at the final segment, so a trailing
    # '..' is handled separately here.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
256
def urldefrag(url):
    """Split any fragment off *url*.

    Returns a ``(defragmented_url, fragment)`` tuple.  When *url*
    contains no '#', it is returned untouched and the fragment is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
270
# unquote() is defined here for use by parse_qs() and parse_qsl().
# It cannot be imported from urllib: urllib itself uses urlparse
# (for urljoin), so the import would create a circular reference.
274
# Lookup table from a two-character hex string to the character it
# encodes.  It is built from every pair of hex digits so that
# mixed-case escapes such as '%aB' decode just like '%ab' and '%AB'.
_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)
277
def unquote(s):
    """unquote('abc%20def') -> 'abc def'.

    Each '%XX' whose two characters are a key of _hextochr is replaced
    by the mapped character; any other '%' sequence is left as-is.
    """
    pieces = s.split('%')
    # Everything before the first '%' is literal text.
    decoded = [pieces[0]]
    for chunk in pieces[1:]:
        code, tail = chunk[:2], chunk[2:]
        try:
            decoded.append(_hextochr[code] + tail)
        except KeyError:
            # Not a known escape: put the '%' back untouched.
            decoded.append('%' + chunk)
        except UnicodeDecodeError:
            # Raised by the concatenation above; retry with a unicode
            # character instead of the table's byte string.
            decoded.append(unichr(int(code, 16)) + tail)
    return "".join(decoded)
290
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a dictionary of value lists.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to the list of its values,
    in the order parse_qsl() produced them.
    """
    values_by_name = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        values_by_name.setdefault(name, []).append(value)
    return values_by_name
316
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Fields are separated by '&' or ';'.  In both names and values '+'
    becomes a space before % escapes are decoded.

    Returns a list of (name, value) tuples in query order.
    """
    fields = []
    for chunk in qs.split('&'):
        fields.extend(chunk.split(';'))

    result = []
    for field in fields:
        # Empty fields are skipped silently unless parsing is strict.
        if not (field or strict_parsing):
            continue
        parts = field.split('=', 1)
        if len(parts) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (field,))
            # A control name with no equal sign counts as a blank value.
            if not keep_blank_values:
                continue
            parts.append('')
        raw_name, raw_value = parts
        if raw_value or keep_blank_values:
            result.append((unquote(raw_name.replace('+', ' ')),
                           unquote(raw_value.replace('+', ' '))))

    return result
356
357
# Default input for test().  test() splits each non-blank line on
# whitespace: the first word is a URL to join onto the current base, and
# an optional "= <URL:...>" tail is the expected result.  The first URL
# is joined onto an empty base and thereby becomes the base for the rest.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
391
def test():
    """Run the urljoin() self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  Each non-blank line is split on whitespace: the
    first word is a URL, which is parsed and joined onto the current
    base; the first URL processed becomes the base.  When a line has
    the form ``url = <URL:expected>``, a mismatch with the joined
    result is reported on an 'EXPECTED' line.
    """
    import sys
    base = ''
    opened = None  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % (words[2],))
    finally:
        # Close only what was opened here; sys.stdin is left alone.
        if opened is not None:
            opened.close()
422
# Run the urljoin() self-test when the module is executed as a script.
if __name__ == '__main__':
    test()
Note: See TracBrowser for help on using the repository browser.