source: python/trunk/Lib/urlparse.py@ 611

Last change on this file since 611 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 14.8 KB
Line 
1"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
8RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9and L.Masinter, December 1999.
10
11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any future changes to
23urlparse module should conform with it. The urlparse module is
24currently not entirely compliant with this RFC due to defacto
25scenarios for parsing, and for backward compatibility purposes, some
26parsing quirks from older RFCs are retained. The testcases in
27test_urlparse.py provides a good indicator of parsing behavior.
28
29"""
30
31import re
32
# Public API of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
# Schemes whose URLs may be joined relative to a base URL.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
# Schemes whose URLs carry a '//netloc' authority component.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
# Schemes whose last path segment may carry ';parameters'.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlsplit() memoizes its results in _parse_cache; the cache is wiped
# wholesale (see clear_cache()) once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
67
def clear_cache():
    """Empty the module-level cache of urlsplit() results."""
    _parse_cache.clear()
71
72
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    Each property is derived on demand from the ``netloc`` attribute
    supplied by the concrete result class.
    """

    @property
    def username(self):
        netloc = self.netloc
        if "@" not in netloc:
            return None
        # Userinfo is everything before the last '@'; drop any ':password'.
        return netloc.rsplit("@", 1)[0].partition(":")[0]

    @property
    def password(self):
        netloc = self.netloc
        if "@" not in netloc:
            return None
        userinfo = netloc.rsplit("@", 1)[0]
        _, sep, secret = userinfo.partition(":")
        # No ':' at all means no password component was given.
        return secret if sep else None

    @property
    def hostname(self):
        host = self.netloc.rsplit("@", 1)[-1]
        if "[" in host and "]" in host:
            # Bracketed IPv6 literal: return the text inside the brackets.
            return host.partition("]")[0][1:].lower()
        if ":" in host:
            return host.partition(":")[0].lower()
        # An empty netloc yields None rather than ''.
        return host.lower() or None

    @property
    def port(self):
        # Skip userinfo and any bracketed IPv6 literal before looking
        # for the ':port' suffix.
        tail = self.netloc.rsplit("@", 1)[-1].rsplit("]", 1)[-1]
        if ":" in tail:
            value = int(tail.split(":")[1], 10)
            # Out-of-range ports are silently reported as absent.
            if 0 <= value <= 65535:
                return value
        return None
117
118from collections import namedtuple
119
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """Concrete 5-tuple result type returned by urlsplit()."""

    __slots__ = ()

    def geturl(self):
        # Reassemble an equivalent URL string from the five components.
        return urlunsplit(self)
126
127
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """Concrete 6-tuple result type returned by urlparse()."""

    __slots__ = ()

    def geturl(self):
        # Reassemble an equivalent URL string from the six components.
        return urlunparse(self)
135
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Unpack the urlsplit() result directly instead of binding it to a
    # local named 'tuple', which shadowed the builtin of that name.
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    # Only schemes known to use ';params' get them split off the path.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
149
150def _splitparams(url):
151 if '/' in url:
152 i = url.find(';', url.rfind('/'))
153 if i < 0:
154 return url, ''
155 else:
156 i = url.find(';')
157 return url[:i], url[i+1:]
158
159def _splitnetloc(url, start=0):
160 delim = len(url) # position of end of domain part of url, default is end
161 for c in '/?#': # look for delimiters; the order is NOT important
162 wdelim = url.find(c, start) # find first of this delim
163 if wdelim >= 0: # if found
164 delim = min(delim, wdelim) # use earliest delim position
165 return url[start:delim], url[delim:] # return (domain, rest)
166
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The key includes the argument types so str and unicode inputs
    # (whose results differ in type) are cached under distinct keys.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    # NOTE(review): truthiness test, not 'is not None' -- an all-empty
    # SplitResult is a falsy tuple and is simply re-parsed each time.
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # A bracket without its mate cannot be a valid IPv6 literal.
                if (('[' in netloc and ']' not in netloc) or
                    (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # General case: accept the prefix as a scheme only if every
        # character is legal in a scheme name.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            # NOTE(review): the 'http' fast path above skips this check, so
            # e.g. 'http:8080' is treated differently from 'ftp:8080'.
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
            (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
221
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, path, params, query, fragment = data
    # Fold any params back onto the last path segment before rejoining.
    if params:
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
231
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string.  The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, path, query, fragment = data
    # Emit a '//authority' part when one is present, or when the scheme
    # conventionally uses one (unless the path already starts with '//').
    if netloc or (scheme and scheme in uses_netloc and path[:2] != '//'):
        if path and not path.startswith('/'):
            path = '/' + path
        path = '//' + (netloc or '') + path
    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result = result + '?' + query
    if fragment:
        result = result + '#' + fragment
    return result
249
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme (or one not known to support relative
    # references) means *url* already stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # A netloc in *url* makes it authority-absolute; otherwise the
        # base's netloc carries over.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep everything from *url* except the netloc.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Empty path and params: inherit the base's, and the base's
        # query too unless *url* supplies one (RFC 1808-style rules).
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge the base's directory with the relative path, then resolve
    # '.' and '..' segments textually on the joined segment list.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' keeps the trailing slash of the result.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first 'segment/..' pair found; restart the
    # scan after each removal until no more pairs can be collapsed.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Normalize a '..' that survived at the very end of the list.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
301
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, frag = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, frag
315
# Feature-detect the Python 2 'unicode' builtin so _is_unicode() can be
# defined safely on interpreters that lack it.
try:
    unicode
except NameError:
    # No separate unicode type exists, so nothing is an instance of it.
    def _is_unicode(x):
        return 0
else:
    # Python 2: report whether x is a unicode string.
    def _is_unicode(x):
        return isinstance(x, unicode)
324
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin). If you update this function,
# update it also in urllib. This code duplication does not exist in Python3.

_hexdig = '0123456789ABCDEFabcdef'
# Map every two-hex-digit string (in either case) to its character value.
_hextochr = dict((a+b, chr(int(a+b,16)))
                 for a in _hexdig for b in _hexdig)
# Captures maximal runs of ASCII characters (used to split unicode input
# so that only ASCII runs are scanned for % escapes).
_asciire = re.compile('([\x00-\x7f]+)')
334
def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    if _is_unicode(s):
        if '%' not in s:
            return s
        # Split into alternating non-ASCII / ASCII runs; escapes can only
        # occur inside ASCII runs.  Each ASCII run is unquoted as a byte
        # string and re-decoded as latin-1 to get unicode back.
        bits = _asciire.split(s)
        res = [bits[0]]
        append = res.append
        for i in range(1, len(bits), 2):
            append(unquote(str(bits[i])).decode('latin1'))
            append(bits[i + 1])
        return ''.join(res)

    bits = s.split('%')
    # fastpath
    if len(bits) == 1:
        return s
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            # The two characters after '%' must form a hex byte.
            append(_hextochr[item[:2]])
            append(item[2:])
        except KeyError:
            # Malformed escape: emit the '%' and the text unchanged.
            append('%')
            append(item)
    return ''.join(res)
362
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each name to the list of its values, in the
    order they appeared in the query string.
    """
    # The accumulator was previously named 'dict', shadowing the builtin.
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        # setdefault groups repeated names into one list in input order.
        result.setdefault(name, []).append(value)
    return result
388
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) pairs, in the order encountered.
    """
    # Both '&' and ';' historically act as field separators.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call-style raise (valid in Python 2 and 3) replaces the
                # Python-2-only 'raise ValueError, msg' statement form.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' means space in query strings; decode escapes afterwards.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
Note: See TracBrowser for help on using the repository browser.