| 1 | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
|---|
| 2 |
|
|---|
| 3 | import stringprep, re, codecs
|
|---|
| 4 | from unicodedata import ucd_3_2_0 as unicodedata
|
|---|
| 5 |
|
|---|
# IDNA section 3.1
# Pattern matching any one of the four code points treated here as a label
# separator: U+002E, U+3002, U+FF0E and U+FF61.  It is used with .split()
# to cut a domain name into its labels.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
|
|---|
| 8 |
|
|---|
# IDNA section 5
# The ACE prefix that marks a Punycode-encoded label.  It is kept both as a
# byte string (compared against ASCII-encoded labels) and as a unicode
# string (compared against nameprepped unicode labels).
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
|
|---|
| 12 |
|
|---|
| 13 | # This assumes query strings, so AllowUnassigned is true
|
|---|
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single label.

    The label goes through four stages: mapping (characters of stringprep
    table B.1 are dropped, the rest are mapped through table B.2), NFKC
    normalization, a check against the prohibited-character tables, and
    the bidirectional-text checks.  Unassigned code points are not
    rejected, i.e. the label is treated as a query string.

    label -- the unicode label to prepare.

    Returns the prepared unicode label.
    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map: drop the "map to nothing" characters, map everything else.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if stringprep.in_table_c12(c) or \
           stringprep.in_table_c22(c) or \
           stringprep.in_table_c3(c) or \
           stringprep.in_table_c4(c) or \
           stringprep.in_table_c5(c) or \
           stringprep.in_table_c6(c) or \
           stringprep.in_table_c7(c) or \
           stringprep.in_table_c8(c) or \
           stringprep.in_table_c9(c):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  The tests below concern the label as a whole, so they
    # are run once if any RandALCat character is present rather than once
    # per such character.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
|
|---|
| 61 |
|
|---|
def ToASCII(label):
    """Convert one label to its ASCII form (the IDNA ToASCII operation).

    A label that is already pure ASCII is returned as is.  Otherwise it is
    nameprepped; if that makes it ASCII it is returned, else it is
    Punycode-encoded and given the ACE prefix.  UseSTD3ASCIIRules is
    treated as false, so no further character checks are made.

    label -- the label to convert.

    Returns the ASCII label as a byte string.
    Raises UnicodeError if the final label is empty or longer than 63
    characters, if a non-ASCII label already starts with the ACE prefix
    after nameprep, or if nameprep rejects the label.
    """
    # Step 1: try ASCII.  On success steps 2-7 are skipped.
    try:
        ascii_label = label.encode("ascii")
    except UnicodeError:
        ascii_label = None

    if ascii_label is None:
        # Step 2: nameprep
        prepared = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII again on the prepared label
        try:
            ascii_label = prepared.encode("ascii")
        except UnicodeError:
            ascii_label = None

        if ascii_label is None:
            # Step 5: Check ACE prefix
            if prepared.startswith(uace_prefix):
                raise UnicodeError("Label starts with ACE prefix")

            # Steps 6 and 7: Punycode-encode, then prepend the ACE prefix
            ascii_label = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size
    if len(ascii_label) < 1 or len(ascii_label) > 63:
        raise UnicodeError("label empty or too long")
    return ascii_label
|
|---|
| 104 |
|
|---|
def ToUnicode(label):
    """Convert one label to its unicode form (the IDNA ToUnicode operation).

    A label that does not carry the ACE prefix is returned unchanged as a
    unicode string.  A label that does is Punycode-decoded, and the result
    is verified by converting it back with ToASCII.

    label -- the label to convert, as a byte string or a unicode string.

    Returns the unicode label.
    Raises UnicodeError if a non-ASCII label is still non-ASCII after
    nameprep, if the Punycode payload cannot be decoded, or if the decoded
    label does not convert back to the original.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        pure_ascii = True
    else:
        try:
            label = label.encode("ascii")
            pure_ascii = True
        except UnicodeError:
            pure_ascii = False
    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix.  The prefix is matched without regard
    # to case, so "XN--" labels are decoded as well; step 7 already
    # compares case-insensitively for the same reason.
    if label[:len(ace_prefix)].lower() != ace_prefix:
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    label1 = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    result = label1.decode("punycode")

    # Step 6: Apply ToASCII
    label2 = ToASCII(result)

    # Step 7: Compare the result of step 6 with the one of step 3
    # label2 will already be in lower case.
    if label.lower() != label2:
        raise UnicodeError("IDNA does not round-trip", label, label2)

    # Step 8: return the result of step 5
    return result
|
|---|
| 143 |
|
|---|
| 144 | ### Codec APIs
|
|---|
| 145 |
|
|---|
class Codec(codecs.Codec):
    """Stateless IDNA encoder/decoder working on whole domain names."""

    def encode(self,input,errors='strict'):
        """Encode a domain name label by label with ToASCII.

        Only errors='strict' is accepted.  Returns a tuple of the encoded
        name and the length of the input.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        pieces = dots.split(input)

        # An empty last piece means the name ended in a separator; keep
        # that as a trailing dot instead of encoding an empty label.
        trailing_dot = ''
        if pieces and not pieces[-1]:
            trailing_dot = '.'
            pieces.pop()

        encoded = [ToASCII(piece) for piece in pieces]

        # Join with U+002E
        return ".".join(encoded)+trailing_dot, len(input)

    def decode(self,input,errors='strict'):
        """Decode a domain name label by label with ToUnicode.

        Only errors='strict' is accepted.  Returns a tuple of the decoded
        unicode name and the length of the input.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        if not isinstance(input, unicode):
            # Must be ASCII string; the conversion is done only for the
            # error it raises on non-ASCII data.
            input = str(input)
            unicode(input, "ascii")
            pieces = input.split(".")
        else:
            # IDNA allows decoding to operate on Unicode strings, too.
            pieces = dots.split(input)

        trailing_dot = u''
        if pieces and not pieces[-1]:
            trailing_dot = u'.'
            pieces.pop()

        decoded = [ToUnicode(piece) for piece in pieces]

        return u".".join(decoded)+trailing_dot, len(input)
|
|---|
| 196 |
|
|---|
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of input as possible.

        input  -- the text to encode.
        errors -- error handling scheme; only 'strict' is accepted.
        final  -- true if no more input will follow, in which case the
                  last label is encoded as well.

        Returns a tuple of the encoded byte string and the number of
        input characters it accounts for.
        Raises UnicodeError for any other error scheme or if a label
        cannot be converted by ToASCII.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Byte string, like the encoded labels it is appended to; a
        # unicode '' here would turn the whole result into unicode.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator before this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
|
|---|
| 230 |
|
|---|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of input as possible.

        input  -- the data to decode, as a byte string or unicode string.
        errors -- error handling scheme; only 'strict' is accepted.
        final  -- true if no more input will follow, in which case the
                  last label is decoded as well.

        Returns a tuple of the decoded unicode string and the number of
        input characters it accounts for.
        Raises UnicodeError for any other error scheme, for non-ASCII
        byte input, or if a label cannot be converted by ToUnicode.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion is done only for the
            # error it raises on non-ASCII data.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator in front of every label but the first.
            # This is decided by position, not by the running size, so
            # that an empty leading label does not hide its separator and
            # leave already-decoded input unconsumed.
            if index:
                size += 1
            size += len(label)

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
|
|---|
| 270 |
|
|---|
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer whose encode method comes from Codec."""
|
|---|
| 273 |
|
|---|
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader whose decode method comes from Codec."""
|
|---|
| 276 |
|
|---|
| 277 | ### encodings module API
|
|---|
| 278 |
|
|---|
def getregentry():
    """Return the codecs.CodecInfo record describing the 'idna' codec.

    The stateless encode and decode entries are bound methods of two
    separate Codec instances; the incremental and stream entries are the
    classes themselves.
    """
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
|
|---|