1 | # -*- coding: iso-8859-1 -*-
|
---|
""" Codec for the Punycode encoding, as specified in RFC 3492

Written by Martin v. Löwis.
"""
6 |
|
---|
7 | import codecs
|
---|
8 |
|
---|
9 | ##################### Encoding #####################################
|
---|
10 |
|
---|
def segregate(str):
    """3.1 Basic code point segregation

    Split *str* into its basic (ASCII) and extended (non-ASCII) code points.

    Returns a pair ``(base, extended)``: ``base`` holds the characters of
    *str* with ordinal below 128, in their original order and encoded as
    ASCII; ``extended`` is a sorted list of the distinct characters with
    ordinal 128 or above.
    """
    base = []
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            extended.add(c)
    # sorted() accepts any iterable, so this does not rely on keys()
    # returning a list that can be sorted in place.
    return "".join(base).encode("ascii"), sorted(extended)
23 |
|
---|
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
31 |
|
---|
def selective_find(str, char, index, pos):
    """Find the next occurrence of char in str after position pos.

    Scanning starts at pos + 1.  Every character that compares less than
    char bumps index by one; characters greater than char are skipped
    without counting.  On a match the result is (index + 1, pos_of_match),
    i.e. the position of the match among the characters up to and
    including char, and its position in the full string.  If the end of
    str is reached without a match, (-1, -1) is returned.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            index += 1
        pos += 1
    return (-1, -1)
49 |
|
---|
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Walk the characters of *extended* in the order given and, for every
    occurrence of each one in *str*, emit one integer delta.  Returns the
    list of deltas.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for c in extended:
        code = ord(c)
        # Moving on to a higher code point: scale the jump by the number of
        # characters below it (plus one).
        delta = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, c, -1, -1)
        while index != -1:
            delta += index - prev_index
            deltas.append(delta - 1)
            prev_index = index
            # Further occurrences of the same character start from zero.
            delta = 0
            index, pos = selective_find(str, c, index, pos)
        prev_code = code
    return deltas
71 |
|
---|
def T(j, bias):
    """Return 36 * (j + 1) - bias, clamped to the range tmin..tmax."""
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return max(1, min(26, 36 * (j + 1) - bias))
78 |
|
---|
# Digit alphabet: values 0-25 map to a-z, values 26-35 map to 0-9.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a list of characters taken from ``digits``,
    using the per-position thresholds given by T(j, bias).
    """
    encoded = []
    j = 0
    t = T(j, bias)
    while N >= t:
        # Emit the low-order digit for this position, carry the rest on.
        N, remainder = divmod(N - t, 36 - t)
        encoded.append(digits[t + remainder])
        j += 1
        t = T(j, bias)
    # The final digit is the first one that falls below its threshold.
    encoded.append(digits[N])
    return encoded
92 |
|
---|
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    delta is the delta that was just processed, first tells whether it was
    the very first one, and numchars is the number of characters handled
    so far (the divisor used to rescale delta).
    """
    # The first delta is divided by 700, every later one by 2.
    delta //= (700 if first else 2)
    delta += delta // numchars
    divisions = 0
    # 455 == ((base - tmin) * tmax) // 2
    while delta > 455:
        delta //= 35  # base - tmin
        divisions += 36
    return divisions + (36 * delta // (delta + 38))
106 |
|
---|
107 |
|
---|
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer, adapting
    the bias after each one, and return the concatenated digits as a
    single string.  baselen is the number of basic characters.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    numchars = baselen
    first = True
    for delta in deltas:
        pieces += generate_generalized_integer(delta, bias)
        # One more character has been handled once this delta is emitted.
        numchars += 1
        bias = adapt(delta, first, numchars)
        first = False
    return "".join(pieces)
118 |
|
---|
def punycode_encode(text):
    """Encode text as Punycode.

    The result is the basic (ASCII) characters of text, followed by "-"
    and the encoded extended characters.  When text has no basic
    characters, only the encoded extended part is returned.
    """
    # segregate() already returns the basic part encoded as ASCII, so it is
    # not encoded a second time here.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
127 |
|
---|
128 | ##################### Decoding #####################################
|
---|
129 |
|
---|
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one generalized variable-length integer from extended, starting
    at index extpos.  Only the digits ``A``-``Z`` (values 0-25) and
    ``0``-``9`` (values 26-35) are accepted, so extended must already be
    upper-cased.

    Returns ``(newpos, value)`` where newpos is the index just past the last
    character consumed.  When errors is not "strict" and the input ends
    early or contains an invalid digit, value is None.

    Raises UnicodeError for truncated input or an invalid digit when errors
    is "strict".
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending character,
            # so report the one just before it.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos-1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
158 |
|
---|
159 |
|
---|
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Decode the deltas stored in extended one at a time and insert the
    resulting characters into base.  Returns the completed string, or the
    string built so far if a delta cannot be decoded.

    Raises UnicodeError when errors is "strict" and a decoded code point
    exceeds U+10FFFF.
    """
    code = 0x80
    pos = -1
    bias = 72
    extpos = 0
    total = len(extended)
    while extpos < total:
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # Decoding failed; synchronization is lost, so stop here.
            return base
        # The quotient moves on to a higher code point, the remainder is
        # the insertion position within the string built so far.
        wraps, pos = divmod(pos + delta + 1, len(base) + 1)
        code += wraps
        if code > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % code)
            code = ord('?')
        base = base[:pos] + unichr(code) + base[pos:]
        bias = adapt(delta, extpos == 0, len(base))
        extpos = newpos
    return base
184 |
|
---|
def punycode_decode(text, errors):
    """Decode Punycode text.

    Everything before the last "-" is the basic part; the remainder is the
    encoded extended part.  Without a "-", the whole of text is treated as
    the extended part.
    """
    dash = text.rfind("-")
    if dash != -1:
        base, extended = text[:dash], text[dash + 1:]
    else:
        base, extended = "", text
    # The digit decoder only recognizes upper-case letters.
    return insertion_sort(unicode(base, "ascii", errors),
                          extended.upper(), errors)
196 |
|
---|
197 | ### Codec APIs
|
---|
198 |
|
---|
class Codec(codecs.Codec):
    """Punycode encode/decode entry points for the codec registry."""

    def encode(self, input, errors='strict'):
        """Return (encoded, consumed length); errors is not consulted."""
        return punycode_encode(input), len(input)

    def decode(self, input, errors='strict'):
        """Return (decoded, consumed length).

        Raises UnicodeError for an error handler other than 'strict',
        'replace' or 'ignore'.
        """
        if errors in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, errors), len(input)
        raise UnicodeError("Unsupported error handling " + errors)
210 |
|
---|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder; each input chunk is encoded on its own."""

    def encode(self, input, final=False):
        # final is accepted for interface compatibility but not used.
        encoded = punycode_encode(input)
        return encoded
214 |
|
---|
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder; each input chunk is decoded on its own."""

    def decode(self, input, final=False):
        # final is accepted for interface compatibility but not used.
        handler = self.errors
        if handler in ('strict', 'replace', 'ignore'):
            return punycode_decode(input, handler)
        raise UnicodeError("Unsupported error handling " + handler)
220 |
|
---|
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining Codec with codecs.StreamWriter."""
223 |
|
---|
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader combining Codec with codecs.StreamReader."""
226 |
|
---|
227 | ### encodings module API
|
---|
228 |
|
---|
def getregentry():
    """Return the codecs.CodecInfo describing the 'punycode' codec."""
    # Separate Codec instances back the encode and decode entry points.
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='punycode',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )