Ignore:
Timestamp:
Mar 19, 2014, 11:31:01 PM (11 years ago)
Author:
dmik
Message:

python: Merge vendor 2.7.6 to trunk.

Location:
python/trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • python/trunk

  • python/trunk/Lib/urlparse.py

    r2 r391  
    11"""Parse (absolute and relative) URLs.
    22
    3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
    4 UC Irvine, June 1995.
     3urlparse module is based upon the following RFC specifications.
     4
     5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
     6and L.  Masinter, January 2005.
     7
     8RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
     9and L. Masinter, December 1999.
     10
     11RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax" by T.
     12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
     13
     14RFC 2368: "The mailto URL scheme", by P. Hoffman, L. Masinter, J. Zawinski, July 1998.
     15
     16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
     171995.
     18
     19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
     20McCahill, December 1994
     21
     22RFC 3986 is considered the current standard and any future changes to
     23urlparse module should conform with it.  The urlparse module is
     24currently not entirely compliant with this RFC due to de facto
     25scenarios for parsing, and for backward compatibility purposes, some
     26parsing quirks from older RFCs are retained. The testcases in
     27test_urlparse.py provide a good indicator of parsing behavior.
     28
    529"""
     30
     31import re
    632
    733__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
     
    1137uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
    1238                 'wais', 'file', 'https', 'shttp', 'mms',
    13                  'prospero', 'rtsp', 'rtspu', '', 'sftp']
     39                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
     40                 'svn', 'svn+ssh']
    1441uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
    1542               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    1643               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    17                'svn', 'svn+ssh', 'sftp']
     44               'svn', 'svn+ssh', 'sftp','nfs','git', 'git+ssh']
     45uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
     46               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
     47               'mms', '', 'sftp', 'tel']
     48
     49# These are not actually used anymore, but should stay for backwards
     50# compatibility.  (They are undocumented, but have a public-looking name.)
    1851non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
    1952                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
    20 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
    21                'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    22                'mms', '', 'sftp']
    2353uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
    2454              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
     
    6595    @property
    6696    def hostname(self):
    67         netloc = self.netloc
    68         if "@" in netloc:
    69             netloc = netloc.rsplit("@", 1)[1]
    70         if ":" in netloc:
    71             netloc = netloc.split(":", 1)[0]
    72         return netloc.lower() or None
     97        netloc = self.netloc.split('@')[-1]
     98        if '[' in netloc and ']' in netloc:
     99            return netloc.split(']')[0][1:].lower()
     100        elif ':' in netloc:
     101            return netloc.split(':')[0].lower()
     102        elif netloc == '':
     103            return None
     104        else:
     105            return netloc.lower()
    73106
    74107    @property
    75108    def port(self):
    76         netloc = self.netloc
    77         if "@" in netloc:
    78             netloc = netloc.rsplit("@", 1)[1]
    79         if ":" in netloc:
    80             port = netloc.split(":", 1)[1]
    81             return int(port, 10)
     109        netloc = self.netloc.split('@')[-1].split(']')[-1]
     110        if ':' in netloc:
     111            port = netloc.split(':')[1]
     112            port = int(port, 10)
     113            # verify legal port
     114            if (0 <= port <= 65535):
     115                return port
    82116        return None
    83117
     
    152186            if url[:2] == '//':
    153187                netloc, url = _splitnetloc(url, 2)
     188                if (('[' in netloc and ']' not in netloc) or
     189                        (']' in netloc and '[' not in netloc)):
     190                    raise ValueError("Invalid IPv6 URL")
    154191            if allow_fragments and '#' in url:
    155192                url, fragment = url.split('#', 1)
     
    163200                break
    164201        else:
    165             scheme, url = url[:i].lower(), url[i+1:]
     202            # make sure "url" is not actually a port number (in which case
     203            # "scheme" is really part of the path)
     204            rest = url[i+1:]
     205            if not rest or any(c not in '0123456789' for c in rest):
     206                # not a port number
     207                scheme, url = url[:i].lower(), rest
    166208
    167209    if url[:2] == '//':
    168210        netloc, url = _splitnetloc(url, 2)
    169     if allow_fragments and scheme in uses_fragment and '#' in url:
     211        if (('[' in netloc and ']' not in netloc) or
     212                (']' in netloc and '[' not in netloc)):
     213            raise ValueError("Invalid IPv6 URL")
     214    if allow_fragments and '#' in url:
    170215        url, fragment = url.split('#', 1)
    171     if scheme in uses_query and '?' in url:
     216    if '?' in url:
    172217        url, query = url.split('?', 1)
    173218    v = SplitResult(scheme, netloc, url, query, fragment)
     
    186231
    187232def urlunsplit(data):
     233    """Combine the elements of a tuple as returned by urlsplit() into a
     234    complete URL as a string. The data argument can be any five-item iterable.
     235    This may result in a slightly different, but equivalent URL, if the URL that
     236    was parsed originally had unnecessary delimiters (for example, a ? with an
     237    empty query; the RFC states that these are equivalent)."""
    188238    scheme, netloc, url, query, fragment = data
    189239    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
     
    219269        return urlunparse((scheme, netloc, path,
    220270                           params, query, fragment))
    221     if not path:
     271    if not path and not params:
    222272        path = bpath
    223         if not params:
    224             params = bparams
    225         else:
    226             path = path[:-1]
    227             return urlunparse((scheme, netloc, path,
    228                                 params, query, fragment))
     273        params = bparams
    229274        if not query:
    230275            query = bquery
     
    269314        return url, ''
    270315
     316try:
     317    unicode
     318except NameError:
     319    def _is_unicode(x):
     320        return 0
     321else:
     322    def _is_unicode(x):
     323        return isinstance(x, unicode)
     324
    271325# unquote method for parse_qs and parse_qsl
    272 # Cannot use directly from urllib as it would create circular reference.
    273 # urllib uses urlparse methods ( urljoin)
    274 
    275 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
    276 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
     326# Cannot use directly from urllib as it would create a circular reference
     327# because urllib uses urlparse methods (urljoin).  If you update this function,
     328# update it also in urllib.  This code duplication does not exist in Python 3.
     329
     330_hexdig = '0123456789ABCDEFabcdef'
     331_hextochr = dict((a+b, chr(int(a+b,16)))
     332                 for a in _hexdig for b in _hexdig)
     333_asciire = re.compile('([\x00-\x7f]+)')
    277334
    278335def unquote(s):
    279336    """unquote('abc%20def') -> 'abc def'."""
    280     res = s.split('%')
    281     for i in xrange(1, len(res)):
    282         item = res[i]
     337    if _is_unicode(s):
     338        if '%' not in s:
     339            return s
     340        bits = _asciire.split(s)
     341        res = [bits[0]]
     342        append = res.append
     343        for i in range(1, len(bits), 2):
     344            append(unquote(str(bits[i])).decode('latin1'))
     345            append(bits[i + 1])
     346        return ''.join(res)
     347
     348    bits = s.split('%')
     349    # fastpath
     350    if len(bits) == 1:
     351        return s
     352    res = [bits[0]]
     353    append = res.append
     354    for item in bits[1:]:
    283355        try:
    284             res[i] = _hextochr[item[:2]] + item[2:]
     356            append(_hextochr[item[:2]])
     357            append(item[2:])
    285358        except KeyError:
    286             res[i] = '%' + item
    287         except UnicodeDecodeError:
    288             res[i] = unichr(int(item[:2], 16)) + item[2:]
    289     return "".join(res)
     359            append('%')
     360            append(item)
     361    return ''.join(res)
    290362
    291363def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
     
    294366        Arguments:
    295367
    296         qs: URL-encoded query string to be parsed
     368        qs: percent-encoded query string to be parsed
    297369
    298370        keep_blank_values: flag indicating whether blank values in
    299             URL encoded queries should be treated as blank strings.
     371            percent-encoded queries should be treated as blank strings.
    300372            A true value indicates that blanks should be retained as
    301373            blank strings.  The default false value indicates that
     
    320392    Arguments:
    321393
    322     qs: URL-encoded query string to be parsed
     394    qs: percent-encoded query string to be parsed
    323395
    324396    keep_blank_values: flag indicating whether blank values in
    325         URL encoded queries should be treated as blank strings.  A
     397        percent-encoded queries should be treated as blank strings.  A
    326398        true value indicates that blanks should be retained as blank
    327399        strings.  The default false value indicates that blank values
     
    354426
    355427    return r
    356 
    357 
    358 test_input = """
    359       http://a/b/c/d
    360 
    361       g:h        = <URL:g:h>
    362       http:g     = <URL:http://a/b/c/g>
    363       http:      = <URL:http://a/b/c/d>
    364       g          = <URL:http://a/b/c/g>
    365       ./g        = <URL:http://a/b/c/g>
    366       g/         = <URL:http://a/b/c/g/>
    367       /g         = <URL:http://a/g>
    368       //g        = <URL:http://g>
    369       ?y         = <URL:http://a/b/c/d?y>
    370       g?y        = <URL:http://a/b/c/g?y>
    371       g?y/./x    = <URL:http://a/b/c/g?y/./x>
    372       .          = <URL:http://a/b/c/>
    373       ./         = <URL:http://a/b/c/>
    374       ..         = <URL:http://a/b/>
    375       ../        = <URL:http://a/b/>
    376       ../g       = <URL:http://a/b/g>
    377       ../..      = <URL:http://a/>
    378       ../../g    = <URL:http://a/g>
    379       ../../../g = <URL:http://a/../g>
    380       ./../g     = <URL:http://a/b/g>
    381       ./g/.      = <URL:http://a/b/c/g/>
    382       /./g       = <URL:http://a/./g>
    383       g/./h      = <URL:http://a/b/c/g/h>
    384       g/../h     = <URL:http://a/b/c/h>
    385       http:g     = <URL:http://a/b/c/g>
    386       http:      = <URL:http://a/b/c/d>
    387       http:?y         = <URL:http://a/b/c/d?y>
    388       http:g?y        = <URL:http://a/b/c/g?y>
    389       http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
    390 """
    391 
    392 def test():
    393     import sys
    394     base = ''
    395     if sys.argv[1:]:
    396         fn = sys.argv[1]
    397         if fn == '-':
    398             fp = sys.stdin
    399         else:
    400             fp = open(fn)
    401     else:
    402         try:
    403             from cStringIO import StringIO
    404         except ImportError:
    405             from StringIO import StringIO
    406         fp = StringIO(test_input)
    407     for line in fp:
    408         words = line.split()
    409         if not words:
    410             continue
    411         url = words[0]
    412         parts = urlparse(url)
    413         print '%-10s : %s' % (url, parts)
    414         abs = urljoin(base, url)
    415         if not base:
    416             base = abs
    417         wrapped = '<URL:%s>' % abs
    418         print '%-10s = %s' % (url, wrapped)
    419         if len(words) == 3 and words[1] == '=':
    420             if wrapped != words[2]:
    421                 print 'EXPECTED', words[2], '!!!!!!!!!!'
    422 
    423 if __name__ == '__main__':
    424     test()
Note: See TracChangeset for help on using the changeset viewer.