1 | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
---|
2 |
|
---|
3 | import stringprep, re, codecs
|
---|
4 | from unicodedata import ucd_3_2_0 as unicodedata
|
---|
5 |
|
---|
# IDNA section 3.1: the code points that act as label separators --
# U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth
# full stop) and U+FF61 (halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, kept both as a byte string and as a
# Unicode string so either kind of label can be tested without conversion.
ace_prefix = "xn--"
uace_prefix = ace_prefix.decode("ascii")
12 |
|
---|
13 | # This assumes query strings, so AllowUnassigned is true
|
---|
def nameprep(label):
    """Apply the Nameprep profile of stringprep (RFC 3491) to one label.

    The label is assumed to be a query string, so AllowUnassigned is true
    and unassigned code points are not rejected.

    Args:
        label: Unicode string holding a single domain-name label.

    Returns:
        The mapped and NFKC-normalized label as a Unicode string.

    Raises:
        UnicodeError: if the prepared label contains a prohibited character
            or violates one of the bidirectional-text requirements.
    """
    # Map: characters in table B.1 are mapped to nothing (dropped); every
    # other character goes through the B.2 case-folding map.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The requirements below concern the label as a whole, so they are
    # checked once.  Re-running them for every RandALCat character gives
    # the same answer but costs quadratic time on long right-to-left labels.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61 |
|
---|
def ToASCII(label):
    """Convert a single label to its ASCII form (RFC 3490, ToASCII).

    UseSTD3ASCIIRules is treated as false, so step 3 of the procedure is
    a no-op here.

    Returns the ASCII label.  Raises UnicodeError when the resulting label
    is empty or longer than 63 characters, when a label that is still
    non-ASCII after nameprep already starts with the ACE prefix, or when
    nameprep itself rejects the label.
    """
    def checked(candidate):
        # Step 8: Check size
        if not 0 < len(candidate) < 64:
            raise UnicodeError("label empty or too long")
        return candidate

    # Step 1: try ASCII
    try:
        plain = label.encode("ascii")
    except UnicodeError:
        plain = None
    if plain is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        return checked(plain)

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        plain = prepared.encode("ascii")
    except UnicodeError:
        plain = None
    if plain is not None:
        # Skip to step 8.
        return checked(plain)

    # Step 5: Check ACE prefix
    if prepared.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    return checked(ace_prefix + prepared.encode("punycode"))
104 |
|
---|
def ToUnicode(label):
    """Convert a single label to its Unicode form (RFC 3490, ToUnicode).

    Returns a Unicode string.  Raises UnicodeError when the label is not
    ASCII even after nameprep, or when an ACE label does not encode back
    to itself through ToASCII.
    """
    # Step 1: Check for ASCII
    # A byte string is taken to be ASCII already; anything else is probed
    # by trying to encode it.
    needs_prep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_prep = True

    if needs_prep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if label.startswith(ace_prefix):
        # Step 4: Remove ACE prefix
        # Step 5: Decode using PUNYCODE
        decoded = label[len(ace_prefix):].decode("punycode")

        # Step 6: Apply ToASCII
        reencoded = ToASCII(decoded)

        # Step 7: Compare the result of step 6 with the one of step 3
        # reencoded will already be in lower case.
        if label.lower() != reencoded:
            raise UnicodeError("IDNA does not round-trip", label, reencoded)

        # Step 8: return the result of step 5
        return decoded

    # Not an ACE label: hand it back unchanged, as Unicode.
    return unicode(label, "ascii")
143 |
|
---|
144 | ### Codec APIs
|
---|
145 |
|
---|
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated domain names."""

    def encode(self, input, errors='strict'):
        """Convert a domain name to its ASCII form, label by label.

        Returns a pair (encoded name, number of input characters consumed).
        Raises UnicodeError for any error handler other than 'strict' and
        whenever a label cannot be converted.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        pieces = dots.split(input)
        # An empty last piece means the name ended with a separator; drop
        # it and re-attach a plain dot after the labels are converted.
        suffix = ''
        if pieces and not pieces[-1]:
            suffix = '.'
            pieces.pop()

        converted = [ToASCII(piece) for piece in pieces]
        # Join with U+002E
        return ".".join(converted) + suffix, len(input)

    def decode(self, input, errors='strict'):
        """Convert a domain name to Unicode, label by label.

        Returns a pair (decoded name, number of input items consumed).
        Raises UnicodeError for any error handler other than 'strict' and
        whenever a label cannot be converted.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            pieces = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its side effect: it raises if the bytes
            # are not ASCII.
            unicode(input, "ascii")
            pieces = input.split(".")

        suffix = u''
        if pieces and not pieces[-1]:
            suffix = u'.'
            pieces.pop()

        converted = [ToUnicode(piece) for piece in pieces]
        return u".".join(converted) + suffix, len(input)
196 |
|
---|
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Args:
            input: the text to encode (buffered remainder plus new data).
            errors: error handler name; only 'strict' is accepted.
            final: true when no more input will follow, in which case the
                last label is encoded even without a trailing separator.

        Returns:
            A pair (encoded byte string, number of input characters
            consumed).

        Raises:
            UnicodeError: for a non-strict error handler or when a label
                cannot be converted by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # This must be a byte string: starting from u'' would silently
        # promote the encoded result to unicode whenever no dot is appended.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            # One separator precedes every label but the first.  ToASCII
            # rejects empty labels, so size is non-zero from the second
            # label onwards.
            if size:
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
230 |
|
---|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        Args:
            input: the data to decode (buffered remainder plus new data).
            errors: error handler name; only 'strict' is accepted.
            final: true when no more input will follow, in which case the
                last label is decoded even without a trailing separator.

        Returns:
            A pair (decoded Unicode string, number of input items
            consumed).

        Raises:
            UnicodeError: for a non-strict error handler, for non-ASCII
                byte input, or when a label cannot be converted by
                ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its side effect: raises on non-ASCII bytes.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator in front of every label but the first.
            # This is keyed on position, not on the running size, because
            # ToUnicode accepts empty labels: with leading or consecutive
            # dots the running size can still be zero, and under-counting
            # would make the caller re-decode input that was already emitted.
            if index:
                size += 1
            size += len(label)

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
270 |
|
---|
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer whose encode/decode come from the IDNA Codec."""
273 |
|
---|
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader whose encode/decode come from the IDNA Codec."""
276 |
|
---|
277 | ### encodings module API
|
---|
278 |
|
---|
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec."""
    # The stateless entry points are bound methods of two separate,
    # throwaway Codec instances.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    entry = codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return entry