| 1 | # -*- coding: iso-8859-1 -*-
|
|---|
| 2 | """ Codec for the Punycode encoding, as specified in RFC 3492
|
|---|
| 3 |
|
|---|
| 4 | Written by Martin v. Löwis.
|
|---|
| 5 | """
|
|---|
| 6 |
|
|---|
| 7 | import codecs
|
|---|
| 8 |
|
|---|
| 9 | ##################### Encoding #####################################
|
|---|
| 10 |
|
|---|
def segregate(str):
    """3.1 Basic code point segregation

    Split ``str`` into its basic and extended characters.

    Returns a pair ``(base, extended)``:

    * ``base`` is the ASCII encoding of every character whose ordinal is
      below 128, kept in original order;
    * ``extended`` is a sorted list of the distinct characters whose
      ordinal is 128 or above.
    """
    base = [c for c in str if ord(c) < 128]
    # A set collapses repeated characters; sorted() returns a real list, so
    # this does not rely on dict.keys() being a sortable list.
    extended = sorted(set(c for c in str if ord(c) >= 128))
    return "".join(base).encode("ascii"), extended
|
|---|
| 23 |
|
|---|
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
|
|---|
| 31 |
|
|---|
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    Scanning starts at ``pos + 1``.  Every scanned character that compares
    less than ``char`` increments ``index``, so ``index`` counts positions
    as if only characters up to and including ``char`` were present.

    Returns ``(index + 1, pos)`` for the match, where ``pos`` is its
    position in the full string, or ``(-1, -1)`` when the end of the
    string is reached without a match.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            index += 1
        pos += 1
    return (-1, -1)
|
|---|
| 49 |
|
|---|
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Compute the list of integer deltas describing where each character of
    ``extended`` occurs in ``str``.  ``extended`` is processed in the order
    given; for every occurrence found by selective_find() one delta is
    appended to the result.
    """
    prev_code = 0x80
    prev_index = -1
    deltas = []
    for c in extended:
        code = ord(c)
        # Skipping from the previous code point to this one costs one full
        # pass over the (shorter) string per skipped code point.
        delta = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, c, -1, -1)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            delta = 0
            index, pos = selective_find(str, c, index, pos)
        prev_code = code

    return deltas
|
|---|
| 71 |
|
|---|
def T(j, bias):
    """Return the threshold for digit position j, clamped to [tmin, tmax]."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return min(26, max(1, 36 * (j + 1) - bias))
|
|---|
| 78 |
|
|---|
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"


def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer ``N`` as a list of single-character digits taken
    from ``digits``, using the per-position thresholds ``T(j, bias)``.
    """
    encoded = []
    j = 0
    t = T(j, bias)
    while N >= t:
        # Emit one non-final digit (value >= t) and carry the quotient on.
        N, remainder = divmod(N - t, 36 - t)
        encoded.append(digits[t + remainder])
        j += 1
        t = T(j, bias)
    # A digit below the threshold terminates the number.
    encoded.append(digits[N])
    return encoded
|
|---|
| 92 |
|
|---|
def adapt(delta, first, numchars):
    """Compute the next bias from the delta that was just coded.

    ``first`` selects the divisor applied to ``delta`` (700 when true,
    2 otherwise); ``numchars`` is the divisor used for the proportional
    increase that follows.
    """
    delta //= 700 if first else 2
    delta += delta // numchars
    # ((base - tmin) * tmax) // 2 == 455
    bias = 0
    while delta > 455:
        delta //= 35  # base - tmin
        bias += 36
    return bias + (36 * delta // (delta + 38))
|
|---|
| 106 |
|
|---|
| 107 |
|
|---|
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer and return
    the concatenated digits as one string.  The bias starts at 72 and is
    re-adapted after each delta; ``baselen`` is the number of basic
    characters already present.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    bias = 72
    pieces = []
    for count, delta in enumerate(deltas):
        pieces += generate_generalized_integer(delta, bias)
        bias = adapt(delta, count == 0, baselen + count + 1)
    return "".join(pieces)
|
|---|
| 118 |
|
|---|
def punycode_encode(text):
    """Return the Punycode encoding of text.

    The result is the basic (ASCII) characters, a "-" delimiter and the
    encoded deltas; when there are no basic characters only the encoded
    deltas are returned.
    """
    basic, non_basic = segregate(text)
    basic = basic.encode("ascii")
    encoded_deltas = generate_integers(len(basic),
                                       insertion_unsort(text, non_basic))
    if not basic:
        return encoded_deltas
    return basic + "-" + encoded_deltas
|
|---|
| 127 |
|
|---|
| 128 | ##################### Decoding #####################################
|
|---|
| 129 |
|
|---|
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one variable-length integer from ``extended`` starting at index
    ``extpos``.  Only the digits ``A``-``Z`` (values 0-25) and ``0``-``9``
    (values 26-35) are recognised, so callers must upper-case the input
    first.

    Returns ``(newpos, value)`` where ``newpos`` is the index just past the
    consumed digits.  When the input ends prematurely or contains an
    invalid digit, UnicodeError is raised if ``errors`` is ``"strict"``;
    otherwise ``(newpos, None)`` is returned.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            symbol = extended[extpos]
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        char = ord(symbol)
        extpos += 1
        if 0x41 <= char <= 0x5A:  # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:  # 0-9
            digit = char - 22  # 0x30-26
        elif errors == "strict":
            # Report the offending character itself.  extpos has already
            # been advanced, so extended[extpos] would name the *next*
            # character (or raise IndexError at the end of the input).
            raise UnicodeError("Invalid extended code point '%s'" % symbol)
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
|
|---|
| 158 |
|
|---|
| 159 |
|
|---|
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas in ``extended`` one at a time and insert the
    resulting characters into ``base``, returning the completed string.

    If a delta cannot be decoded and ``errors`` is not ``"strict"``, the
    string built so far is returned.  A code point above U+10FFFF raises
    UnicodeError in strict mode and is replaced by ``'?'`` otherwise.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta + 1
        # Every full pass over the current string advances the code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
|
|---|
| 184 |
|
|---|
def punycode_decode(text, errors):
    """Decode the Punycode string text and return the resulting string.

    The input is split at its last "-": the part before it holds the basic
    characters, the part after it the encoded deltas.  Without a "-" the
    whole input is treated as encoded deltas.
    """
    head, delimiter, tail = text.rpartition("-")
    # rpartition() yields ('', '', text) when there is no delimiter, which
    # is exactly the "no basic characters" case.
    basic = unicode(head, "ascii", errors)
    # The digit decoder only recognises upper-case letters.
    return insertion_sort(basic, tail.upper(), errors)
|
|---|
| 196 |
|
|---|
| 197 | ### Codec APIs
|
|---|
| 198 |
|
|---|
class Codec(codecs.Codec):
    """Stateless codec built on punycode_encode() and punycode_decode()."""

    def encode(self, input, errors='strict'):
        """Return ``(encoded, len(input))``; ``errors`` is not consulted."""
        res = punycode_encode(input)
        return res, len(input)

    def decode(self, input, errors='strict'):
        """Return ``(decoded, len(input))``.

        Raises UnicodeError when ``errors`` is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + errors)
        res = punycode_decode(input, errors)
        return res, len(input)
|
|---|
| 210 |
|
|---|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that encodes each input chunk on its own."""

    def encode(self, input, final=False):
        """Return the Punycode encoding of input; final is not consulted."""
        encoded = punycode_encode(input)
        return encoded
|
|---|
| 214 |
|
|---|
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that decodes each input chunk on its own."""

    def decode(self, input, final=False):
        """Return the decoded form of input; final is not consulted.

        Raises UnicodeError when ``self.errors`` is not one of 'strict',
        'replace' or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling " + self.errors)
        return punycode_decode(input, self.errors)
|
|---|
| 220 |
|
|---|
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from Codec."""
|
|---|
| 223 |
|
|---|
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from Codec."""
|
|---|
| 226 |
|
|---|
| 227 | ### encodings module API
|
|---|
| 228 |
|
|---|
def getregentry():
    """Return the codecs.CodecInfo entry that describes this codec."""
    encoder = Codec().encode
    decoder = Codec().decode
    info = codecs.CodecInfo(
        name='punycode',
        encode=encoder,
        decode=decoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
    )
    return info
|
|---|