Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

idna.py@ 602

Last change on this file since 602 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 8.3 KB

Line
1	# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
2
3	import stringprep, re, codecs
4	from unicodedata import ucd_3_2_0 as unicodedata
5
# IDNA section 3.1: the code points accepted as label separators
# (U+002E full stop, U+3002 ideographic full stop, U+FF0E fullwidth
# full stop, U+FF61 halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as a byte string (prepended in
# ToASCII, tested in ToUnicode) and once as a Unicode string (tested
# against nameprep'ed labels in ToASCII).
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
12
13	# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Prepare a single label according to Nameprep (RFC 3491).

    The label is mapped (characters in stringprep table B.1 are dropped,
    the rest go through the B.2 mapping), normalized to NFKC, checked
    against the prohibited-character tables and finally checked against
    the bidirectional-text requirements.  Unassigned code points are not
    rejected (query-string behaviour, AllowUnassigned is true).

    label -- Unicode string holding one label.

    Returns the prepared Unicode label.
    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map: drop "map to nothing" characters, fold the rest.
    mapped = [stringprep.map_table_b2(c)
              for c in label
              if not stringprep.in_table_b1(c)]
    label = u"".join(mapped)

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  The flags are kept in a real list because the first
    # and last entries are indexed below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. The further tests only
        # depend on the label as a whole, so they are performed once:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61
def ToASCII(label):
    """Convert one label to its ASCII form (IDNA ToASCII operation).

    A label that is already ASCII is returned as encoded; otherwise it
    is nameprep'ed, and if still not ASCII it is Punycode-encoded and
    given the ACE prefix.  UseSTD3ASCIIRules is treated as false.

    Returns the ASCII label.
    Raises UnicodeError if the result is empty or 64 or more characters
    long, if the nameprep'ed label already starts with the ACE prefix,
    or if nameprep rejects the label.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None

    if encoded is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            encoded = label.encode("ascii")
        except UnicodeError:
            encoded = None

    if encoded is None:
        # Step 5: Check ACE prefix
        if label.startswith(uace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Step 6: Encode with PUNYCODE
        # Step 7: Prepend ACE prefix
        encoded = ace_prefix + label.encode("punycode")

    # Step 8: Check size (every path ends here)
    if not 0 < len(encoded) < 64:
        raise UnicodeError("label empty or too long")
    return encoded
104
def ToUnicode(label):
    """Convert one label to its Unicode form (IDNA ToUnicode operation).

    label -- a byte string, or a Unicode string (which is reduced to
             ASCII first, via nameprep if necessary).

    Returns the label as Unicode: unchanged text when there is no ACE
    prefix, otherwise the Punycode-decoded text.
    Raises UnicodeError if the label cannot be reduced to ASCII or if
    the decoded text does not convert back to the given label.
    """
    # Step 1: Check for ASCII
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    round_trip = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # round_trip will already be in lower case.
    if label.lower() != round_trip:
        raise UnicodeError("IDNA does not round-trip", label, round_trip)

    # Step 8: return the result of step 5
    return decoded
143
144	### Codec APIs
145
class Codec(codecs.Codec):
    """Stateless IDNA codec operating on complete dotted names."""

    def encode(self, input, errors='strict'):
        """Encode a dotted name label by label with ToASCII.

        Returns a (encoded name, number of input characters) tuple.
        Raises UnicodeError for any error mode other than 'strict' and
        for labels ToASCII rejects.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty final label means the name ended with a separator;
        # it is re-emitted as a plain dot instead of being converted.
        if labels and not labels[-1]:
            labels = labels[:-1]
            suffix = '.'
        else:
            suffix = ''

        converted = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(converted) + suffix, len(input)

    def decode(self, input, errors='strict'):
        """Decode a dotted name label by label with ToUnicode.

        Returns a (Unicode name, number of input items) tuple.
        Raises UnicodeError for any error mode other than 'strict', for
        non-ASCII byte input and for labels ToUnicode rejects.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Conversion is done only for its validation side effect.
            unicode(input, "ascii")
            labels = input.split(".")

        if labels and not labels[-1]:
            labels = labels[:-1]
            suffix = u'.'
        else:
            suffix = u''

        converted = [ToUnicode(label) for label in labels]
        return u".".join(converted) + suffix, len(input)
196
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of `input` as possible.

        input  -- the buffered, not yet encoded text.
        errors -- error mode; only 'strict' is accepted.
        final  -- true if no more input will follow, in which case the
                  last label is encoded as well.

        Returns a (encoded byte string, number of input characters
        consumed) tuple.  Raises UnicodeError for an unsupported error
        mode or for labels ToASCII rejects.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Kept as a byte string on every path so that the encoder output
        # is never silently promoted to a Unicode string.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator preceding this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
230
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of `input` as possible.

        input  -- the buffered, not yet decoded data (byte string or
                  Unicode string).
        errors -- error mode; only 'strict' is accepted.
        final  -- true if no more input will follow, in which case the
                  last label is decoded as well.

        Returns a (Unicode string, number of input items consumed)
        tuple.  Raises UnicodeError for an unsupported error mode, for
        non-ASCII byte input and for labels ToUnicode rejects.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Conversion is done only for its validation side effect.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = [ToUnicode(label) for label in labels]

        # Consumed input: every label, the separators between them and
        # the trailing separator.  Separators are counted from the label
        # count because ToUnicode accepts empty labels, so a running
        # "size is non-zero" test would miss the separator after empty
        # leading labels and report too little input as consumed.
        size = sum([len(label) for label in labels])
        if labels:
            size += len(labels) - 1

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
270
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer whose encode/decode come from the IDNA Codec."""
273
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader whose encode/decode come from the IDNA Codec."""
276
277	### encodings module API
278
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    # Each stateless entry point is bound to its own Codec instance.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/encodings/idna.py@ 602

Download in other formats: