Changeset 391 for python/trunk/Lib/urlparse.py
- Timestamp:
- Mar 19, 2014, 11:31:01 PM (11 years ago)
- Location:
- python/trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
python/trunk
-
Property svn:mergeinfo
set to
/python/vendor/Python-2.7.6 merged eligible /python/vendor/current merged eligible
-
Property svn:mergeinfo
set to
-
python/trunk/Lib/urlparse.py
r2 r391 1 1 """Parse (absolute and relative) URLs. 2 2 3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, 4 UC Irvine, June 1995. 3 urlparse module is based upon the following RFC specifications. 4 5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding 6 and L. Masinter, January 2005. 7 8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter 9 and L.Masinter, December 1999. 10 11 RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. 12 Berners-Lee, R. Fielding, and L. Masinter, August 1998. 13 14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998. 15 16 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June 17 1995. 18 19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. 20 McCahill, December 1994 21 22 RFC 3986 is considered the current standard and any future changes to 23 urlparse module should conform with it. The urlparse module is 24 currently not entirely compliant with this RFC due to de facto 25 scenarios for parsing, and for backward compatibility purposes, some 26 parsing quirks from older RFCs are retained. The test cases in 27 test_urlparse.py provide a good indicator of parsing behavior. 
28 5 29 """ 30 31 import re 6 32 7 33 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", … … 11 37 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', 12 38 'wais', 'file', 'https', 'shttp', 'mms', 13 'prospero', 'rtsp', 'rtspu', '', 'sftp'] 39 'prospero', 'rtsp', 'rtspu', '', 'sftp', 40 'svn', 'svn+ssh'] 14 41 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 15 42 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 16 43 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', 17 'svn', 'svn+ssh', 'sftp'] 44 'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh'] 45 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', 46 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', 47 'mms', '', 'sftp', 'tel'] 48 49 # These are not actually used anymore, but should stay for backwards 50 # compatibility. (They are undocumented, but have a public-looking name.) 18 51 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 19 52 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] 20 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',21 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',22 'mms', '', 'sftp']23 53 uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', 24 54 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] … … 65 95 @property 66 96 def hostname(self): 67 netloc = self.netloc 68 if "@" in netloc: 69 netloc = netloc.rsplit("@", 1)[1] 70 if ":" in netloc: 71 netloc = netloc.split(":", 1)[0] 72 return netloc.lower() or None 97 netloc = self.netloc.split('@')[-1] 98 if '[' in netloc and ']' in netloc: 99 return netloc.split(']')[0][1:].lower() 100 elif ':' in netloc: 101 return netloc.split(':')[0].lower() 102 elif netloc == '': 103 return None 104 else: 105 return netloc.lower() 73 106 74 107 @property 75 108 def port(self): 76 netloc = self.netloc 77 if "@" in netloc: 78 netloc = netloc.rsplit("@", 1)[1] 79 if ":" in netloc: 80 port = netloc.split(":", 1)[1] 81 return int(port, 10) 109 netloc = self.netloc.split('@')[-1].split(']')[-1] 110 if ':' in 
netloc: 111 port = netloc.split(':')[1] 112 port = int(port, 10) 113 # verify legal port 114 if (0 <= port <= 65535): 115 return port 82 116 return None 83 117 … … 152 186 if url[:2] == '//': 153 187 netloc, url = _splitnetloc(url, 2) 188 if (('[' in netloc and ']' not in netloc) or 189 (']' in netloc and '[' not in netloc)): 190 raise ValueError("Invalid IPv6 URL") 154 191 if allow_fragments and '#' in url: 155 192 url, fragment = url.split('#', 1) … … 163 200 break 164 201 else: 165 scheme, url = url[:i].lower(), url[i+1:] 202 # make sure "url" is not actually a port number (in which case 203 # "scheme" is really part of the path) 204 rest = url[i+1:] 205 if not rest or any(c not in '0123456789' for c in rest): 206 # not a port number 207 scheme, url = url[:i].lower(), rest 166 208 167 209 if url[:2] == '//': 168 210 netloc, url = _splitnetloc(url, 2) 169 if allow_fragments and scheme in uses_fragment and '#' in url: 211 if (('[' in netloc and ']' not in netloc) or 212 (']' in netloc and '[' not in netloc)): 213 raise ValueError("Invalid IPv6 URL") 214 if allow_fragments and '#' in url: 170 215 url, fragment = url.split('#', 1) 171 if scheme in uses_query and'?' in url:216 if '?' in url: 172 217 url, query = url.split('?', 1) 173 218 v = SplitResult(scheme, netloc, url, query, fragment) … … 186 231 187 232 def urlunsplit(data): 233 """Combine the elements of a tuple as returned by urlsplit() into a 234 complete URL as a string. The data argument can be any five-item iterable. 235 This may result in a slightly different, but equivalent URL, if the URL that 236 was parsed originally had unnecessary delimiters (for example, a ? 
with an 237 empty query; the RFC states that these are equivalent).""" 188 238 scheme, netloc, url, query, fragment = data 189 239 if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): … … 219 269 return urlunparse((scheme, netloc, path, 220 270 params, query, fragment)) 221 if not path :271 if not path and not params: 222 272 path = bpath 223 if not params: 224 params = bparams 225 else: 226 path = path[:-1] 227 return urlunparse((scheme, netloc, path, 228 params, query, fragment)) 273 params = bparams 229 274 if not query: 230 275 query = bquery … … 269 314 return url, '' 270 315 316 try: 317 unicode 318 except NameError: 319 def _is_unicode(x): 320 return 0 321 else: 322 def _is_unicode(x): 323 return isinstance(x, unicode) 324 271 325 # unquote method for parse_qs and parse_qsl 272 # Cannot use directly from urllib as it would create circular reference. 273 # urllib uses urlparse methods ( urljoin) 274 275 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) 276 _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) 326 # Cannot use directly from urllib as it would create a circular reference 327 # because urllib uses urlparse methods (urljoin). If you update this function, 328 # update it also in urllib. This code duplication does not exist in Python3. 
329 330 _hexdig = '0123456789ABCDEFabcdef' 331 _hextochr = dict((a+b, chr(int(a+b,16))) 332 for a in _hexdig for b in _hexdig) 333 _asciire = re.compile('([\x00-\x7f]+)') 277 334 278 335 def unquote(s): 279 336 """unquote('abc%20def') -> 'abc def'.""" 280 res = s.split('%') 281 for i in xrange(1, len(res)): 282 item = res[i] 337 if _is_unicode(s): 338 if '%' not in s: 339 return s 340 bits = _asciire.split(s) 341 res = [bits[0]] 342 append = res.append 343 for i in range(1, len(bits), 2): 344 append(unquote(str(bits[i])).decode('latin1')) 345 append(bits[i + 1]) 346 return ''.join(res) 347 348 bits = s.split('%') 349 # fastpath 350 if len(bits) == 1: 351 return s 352 res = [bits[0]] 353 append = res.append 354 for item in bits[1:]: 283 355 try: 284 res[i] = _hextochr[item[:2]] + item[2:] 356 append(_hextochr[item[:2]]) 357 append(item[2:]) 285 358 except KeyError: 286 res[i] = '%' + item 287 except UnicodeDecodeError: 288 res[i] = unichr(int(item[:2], 16)) + item[2:] 289 return "".join(res) 359 append('%') 360 append(item) 361 return ''.join(res) 290 362 291 363 def parse_qs(qs, keep_blank_values=0, strict_parsing=0): … … 294 366 Arguments: 295 367 296 qs: URL-encoded query string to be parsed368 qs: percent-encoded query string to be parsed 297 369 298 370 keep_blank_values: flag indicating whether blank values in 299 URLencoded queries should be treated as blank strings.371 percent-encoded queries should be treated as blank strings. 300 372 A true value indicates that blanks should be retained as 301 373 blank strings. The default false value indicates that … … 320 392 Arguments: 321 393 322 qs: URL-encoded query string to be parsed394 qs: percent-encoded query string to be parsed 323 395 324 396 keep_blank_values: flag indicating whether blank values in 325 URLencoded queries should be treated as blank strings. A397 percent-encoded queries should be treated as blank strings. A 326 398 true value indicates that blanks should be retained as blank 327 399 strings. 
The default false value indicates that blank values … … 354 426 355 427 return r 356 357 358 test_input = """359 http://a/b/c/d360 361 g:h = <URL:g:h>362 http:g = <URL:http://a/b/c/g>363 http: = <URL:http://a/b/c/d>364 g = <URL:http://a/b/c/g>365 ./g = <URL:http://a/b/c/g>366 g/ = <URL:http://a/b/c/g/>367 /g = <URL:http://a/g>368 //g = <URL:http://g>369 ?y = <URL:http://a/b/c/d?y>370 g?y = <URL:http://a/b/c/g?y>371 g?y/./x = <URL:http://a/b/c/g?y/./x>372 . = <URL:http://a/b/c/>373 ./ = <URL:http://a/b/c/>374 .. = <URL:http://a/b/>375 ../ = <URL:http://a/b/>376 ../g = <URL:http://a/b/g>377 ../.. = <URL:http://a/>378 ../../g = <URL:http://a/g>379 ../../../g = <URL:http://a/../g>380 ./../g = <URL:http://a/b/g>381 ./g/. = <URL:http://a/b/c/g/>382 /./g = <URL:http://a/./g>383 g/./h = <URL:http://a/b/c/g/h>384 g/../h = <URL:http://a/b/c/h>385 http:g = <URL:http://a/b/c/g>386 http: = <URL:http://a/b/c/d>387 http:?y = <URL:http://a/b/c/d?y>388 http:g?y = <URL:http://a/b/c/g?y>389 http:g?y/./x = <URL:http://a/b/c/g?y/./x>390 """391 392 def test():393 import sys394 base = ''395 if sys.argv[1:]:396 fn = sys.argv[1]397 if fn == '-':398 fp = sys.stdin399 else:400 fp = open(fn)401 else:402 try:403 from cStringIO import StringIO404 except ImportError:405 from StringIO import StringIO406 fp = StringIO(test_input)407 for line in fp:408 words = line.split()409 if not words:410 continue411 url = words[0]412 parts = urlparse(url)413 print '%-10s : %s' % (url, parts)414 abs = urljoin(base, url)415 if not base:416 base = abs417 wrapped = '<URL:%s>' % abs418 print '%-10s = %s' % (url, wrapped)419 if len(words) == 3 and words[1] == '=':420 if wrapped != words[2]:421 print 'EXPECTED', words[2], '!!!!!!!!!!'422 423 if __name__ == '__main__':424 test()
Note:
See TracChangeset
for help on using the changeset viewer.