Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

test_unicode.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 74.4 KB

Line
1	""" Test script for the Unicode implementation.
2
3	Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7	"""#"
8	import sys
9	import struct
10	import codecs
11	import unittest
12	from test import test_support, string_tests
13
# Decorator for tests that only make sense on wide (UCS-4) builds: the
# test is skipped whenever sys.maxunicode is 0xFFFF.
requires_wide_build = unittest.skipIf(
    sys.maxunicode == 0xFFFF, 'requires wide build')
17
# Error handling (bad decoder return)
def search_function(encoding):
    # Codec lookup hook that serves two deliberately broken codecs:
    # "test.unicode1" returns a bare int, "test.unicode2" returns a tuple
    # that holds no string.  Any other name is not handled (None).
    if encoding == "test.unicode1":
        def broken_encode(input, errors="strict"):
            return 42 # not a tuple
        def broken_decode(input, errors="strict"):
            return 42 # not a tuple
        return (broken_encode, broken_decode, None, None)

    if encoding == "test.unicode2":
        def wrong_type_encode(input, errors="strict"):
            return (42, 42) # no unicode
        def wrong_type_decode(input, errors="strict"):
            return (42, 42) # no unicode
        return (wrong_type_encode, wrong_type_decode, None, None)

    return None

codecs.register(search_function)
35
36	class UnicodeTest(
37	string_tests.CommonTest,
38	string_tests.MixinStrUnicodeUserStringTest,
39	string_tests.MixinStrUnicodeTest,
40	):
41	type2test = unicode
42
def assertEqual(self, first, second, msg=None):
    # strict assertEqual method: reject implicit bytes/unicode equality.
    # After the normal equality check, if either operand is unicode both
    # must be; otherwise, if either is str, both must be.
    super(UnicodeTest, self).assertEqual(first, second, msg)
    operands = (first, second)
    for strict_type in (unicode, str):
        if any(isinstance(operand, strict_type) for operand in operands):
            for operand in operands:
                self.assertIsInstance(operand, strict_type)
            break
52
def checkequalnofix(self, result, object, methodname, *args):
    # Call object.methodname(*args) and require a result that equals
    # `result` and has exactly the same type.
    realresult = getattr(object, methodname)(*args)
    self.assertEqual(realresult, result)
    self.assertTrue(type(realresult) is type(result))

    if realresult is not object:
        return

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    class usub(unicode):
        def __repr__(self):
            return 'usub(%r)' % unicode.__repr__(self)

    subclassed = usub(object)
    sub_result = getattr(subclassed, methodname)(*args)
    self.assertEqual(sub_result, result)
    self.assertTrue(subclassed is not sub_result)
70
def test_literals(self):
    # Equivalent escape spellings denote the same character.
    for short_form, long_form in ((u'\xff', u'\u00ff'),
                                  (u'\uffff', u'\U0000ffff')):
        self.assertEqual(short_form, long_form)

    # \U escapes beyond the Unicode range must not compile.
    out_of_range = (
        "u'\\Ufffffffe'",
        "u'\\Uffffffff'",
        "u'\\U%08x'" % 0x110000,
    )
    for literal in out_of_range:
        self.assertRaises(SyntaxError, eval, literal)
77
def test_repr(self):
    # Check repr() of unicode objects: backslash and quote escaping,
    # control characters, the complete Latin-1 range, and a long string
    # mixing non-BMP and BMP characters.  Nothing is checked when
    # sys.platform starts with 'java'.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    self.assertEqual(repr(u'abc'), "u'abc'")
    self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
    self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
    self.assertEqual(repr(u'\\c'), "u'\\\\c'")
    self.assertEqual(repr(u'\\'), "u'\\\\'")
    self.assertEqual(repr(u'\n'), "u'\\n'")
    self.assertEqual(repr(u'\r'), "u'\\r'")
    self.assertEqual(repr(u'\t'), "u'\\t'")
    self.assertEqual(repr(u'\b'), "u'\\x08'")
    self.assertEqual(repr(u"'\""), """u'\\'"'""")
    self.assertEqual(repr(u"'"), '''u"'"''')
    self.assertEqual(repr(u'"'), """u'"'""")

    # Expected repr of the 256 Latin-1 code points in order.  The '|'
    # must appear unescaped: writing '\|' would put a literal backslash
    # into the expected text and the comparison below could never pass.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = repr(u''.join(map(unichr, xrange(256))))
    self.assertEqual(testrepr, latin1repr)

    # Test repr works on wide unicode escapes without overflow.
    # Both sides are the same expression on purpose: the point is that
    # repr() completes, not what it returns.
    self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
                     repr(u"\U00010000" * 39 + u"\uffff" * 4096))
114
115
def test_count(self):
    # Run the shared count tests, then mix str and unicode between the
    # receiver and the argument.
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, subject, call_args in cases:
        self.checkequalnofix(expected, subject, 'count', *call_args)
128
def test_find(self):
    # unicode.find: first hit, hit after a start offset, a miss, and
    # the TypeError raised for a missing or non-string argument.
    haystack = u'abcdefghiabc'
    for expected, call_args in ((0, (u'abc',)),
                                (9, (u'abc', 1)),
                                (-1, (u'def', 4))):
        self.checkequalnofix(expected, haystack, 'find', *call_args)

    for bad_args in ((), (42,)):
        self.assertRaises(TypeError, u'hello'.find, *bad_args)
136
def test_rfind(self):
    # Shared rfind tests first, then receiver/argument of differing
    # string types.
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types
    mixed = (
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    )
    for expected, subject, needle in mixed:
        self.checkequalnofix(expected, subject, 'rfind', needle)
143
def test_index(self):
    # Shared index tests first, then every hit/miss case with the
    # receiver and the needle built from different string types.
    string_tests.CommonTest.test_index(self)

    # (expected position, needle text, extra positional arguments)
    hits = (
        (0, '', ()),
        (3, 'def', ()),
        (0, 'abc', ()),
        (9, 'abc', (1,)),
    )
    # (receiver text, needle text, extra positional arguments)
    misses = (
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )

    # check mixed argument types
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, needle, extra in hits:
            self.checkequalnofix(expected, subject_type('abcdefghiabc'),
                                 'index', needle_type(needle), *extra)
        for text, needle, extra in misses:
            self.assertRaises(ValueError, subject_type(text).index,
                              needle_type(needle), *extra)
156
def test_rindex(self):
    # Shared rindex tests first, then every hit/miss case with the
    # receiver and the needle built from different string types.
    string_tests.CommonTest.test_rindex(self)

    # (expected position, needle text, extra positional arguments)
    hits = (
        (12, '', ()),
        (3, 'def', ()),
        (9, 'abc', ()),
        (0, 'abc', (0, -1)),
    )
    # (receiver text, needle text, extra positional arguments)
    misses = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )

    # check mixed argument types
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, needle, extra in hits:
            self.checkequalnofix(expected, subject_type('abcdefghiabc'),
                                 'rindex', needle_type(needle), *extra)

        for text, needle, extra in misses:
            self.assertRaises(ValueError, subject_type(text).rindex,
                              needle_type(needle), *extra)
171
def test_translate(self):
    # unicode.translate with mappings to None (delete), to an ordinal,
    # and to unicode strings of length 0, 1 and more.
    a, b, c, z = (ord(letter) for letter in 'abcz')
    cases = (
        (u'bbbc', u'abababc', {a: None}),
        (u'iiic', u'abababc', {a: None, b: ord('i')}),
        (u'iiix', u'abababc', {a: None, b: ord('i'), c: u'x'}),
        (u'<i><i><i>c', u'abababc', {a: None, b: u'<i>'}),
        (u'c', u'abababc', {a: None, b: u''}),
        (u'xyyx', u'xzx', {z: u'yy'}),
    )
    for expected, text, table in cases:
        self.checkequalnofix(expected, text, 'translate', table)

    # No table at all, and a table that maps to a byte string.
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
182
def test_split(self):
    # Shared split tests first, then receiver/separator of differing
    # string types; the pieces are expected to be unicode.
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    mixed = (
        ([u'a', u'b', u'c', u'd'], u'a//b//c//d', '//'),
        ([u'a', u'b', u'c', u'd'], 'a//b//c//d', u'//'),
        ([u'endcase ', u''], u'endcase test', 'test'),
    )
    for expected, text, separator in mixed:
        self.checkequalnofix(expected, text, 'split', separator)
190
def test_join(self):
    # Shared join tests first, then joins where the separator and the
    # items mix str and unicode; the result is expected to be unicode.
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments
    mixed = (
        (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
        (u'abcd', u'', (u'a', u'b', u'c', u'd')),
        (u'w x y z', u' ', string_tests.Sequence('wxyz')),
        (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
        (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
        (u'abcd', '', (u'a', u'b', u'c', u'd')),
        (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
    )
    for expected, separator, items in mixed:
        self.checkequalnofix(expected, separator, 'join', items)
202
def test_strip(self):
    string_tests.CommonTest.test_strip(self)
    # Stripping with a non-ASCII byte string is expected to raise
    # UnicodeError.
    with self.assertRaises(UnicodeError):
        u"hello".strip("\xff")
206
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    byte_text = 'one!two!three!'
    self.checkequalnofix(u'one@two!three!', byte_text, 'replace', u'!', u'@', 1)
    # A non-string replacement is rejected.
    with self.assertRaises(TypeError):
        'replace'.replace(u"r", 42)
213
def test_comparison(self):
    # Equality and ordering between unicode and str operands in every
    # combination.  The UTF-16 code point order checks further down are
    # disabled by the `if 0:` guard.
    # NOTE(review): the listing lost its indentation; the extent of the
    # `if 0:` body below is reconstructed -- confirm against upstream.
    # Comparisons:
    self.assertTrue(u'abc' == 'abc')
    self.assertTrue('abc' == u'abc')
    self.assertTrue(u'abc' == u'abc')
    self.assertTrue(u'abcd' > 'abc')
    self.assertTrue('abcd' > u'abc')
    self.assertTrue(u'abcd' > u'abc')
    self.assertTrue(u'abc' < 'abcd')
    self.assertTrue('abc' < u'abcd')
    self.assertTrue(u'abc' < u'abcd')

    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assertTrue(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assertTrue(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        def test_lecmp(s, s2):
            self.assertTrue(s < s2)

        def test_fixup(s):
            # Compare s against surrogate pairs covering several high
            # and low surrogate values.
            s2 = u'\ud800\udc01'
            test_lecmp(s, s2)
            s2 = u'\ud900\udc01'
            test_lecmp(s, s2)
            s2 = u'\uda00\udc01'
            test_lecmp(s, s2)
            s2 = u'\udb00\udc01'
            test_lecmp(s, s2)
            s2 = u'\ud800\udd01'
            test_lecmp(s, s2)
            s2 = u'\ud900\udd01'
            test_lecmp(s, s2)
            s2 = u'\uda00\udd01'
            test_lecmp(s, s2)
            s2 = u'\udb00\udd01'
            test_lecmp(s, s2)
            s2 = u'\ud800\ude01'
            test_lecmp(s, s2)
            s2 = u'\ud900\ude01'
            test_lecmp(s, s2)
            s2 = u'\uda00\ude01'
            test_lecmp(s, s2)
            s2 = u'\udb00\ude01'
            test_lecmp(s, s2)
            s2 = u'\ud800\udfff'
            test_lecmp(s, s2)
            s2 = u'\ud900\udfff'
            test_lecmp(s, s2)
            s2 = u'\uda00\udfff'
            test_lecmp(s, s2)
            s2 = u'\udb00\udfff'
            test_lecmp(s, s2)

        test_fixup(u'\ue000')
        test_fixup(u'\uff61')

        # Surrogates on both sides, no fixup required
        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
278
def test_capitalize(self):
    # Shared capitalize tests first, then BMP characters with unusual
    # case properties.
    string_tests.CommonTest.test_capitalize(self)
    # (expected, input)
    cases = (
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
        (u'\u1ffc\u1ff3\u1ff3\u1ff3', u'\u1ff3\u1ff3\u1ffc\u1ffc'),
        # check with cased non-letter chars
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'),
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'),
        (u'\u2160\u2171\u2172', u'\u2160\u2161\u2162'),
        (u'\u2160\u2171\u2172', u'\u2170\u2171\u2172'),
        # check with Ll chars with no upper - nothing changes here
        (u'\u019b\u1d00\u1d86\u0221\u1fb7',
         u'\u019b\u1d00\u1d86\u0221\u1fb7'),
    )
    for expected, text in cases:
        self.checkequal(expected, text, 'capitalize')
297
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    # U+1FFC must not be reported as lowercase.
    probe = u'\u1FFc'
    self.checkequalnofix(False, probe, 'islower')
301
@requires_wide_build
def test_islower_non_bmp(self):
    # islower() on characters outside the BMP.
    cases = (
        (u'\U00010401', False),  # non-BMP, uppercase
        (u'\U00010427', False),
        (u'\U00010429', True),   # non-BMP, lowercase
        (u'\U0001044E', True),
        (u'\U0001F40D', False),  # non-BMP, non-cased
        (u'\U0001F46F', False),
    )
    for ch, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(ch.islower())
313
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # The U+1FFC check is not run when sys.platform starts with 'java'.
    on_java = sys.platform.startswith('java')
    if not on_java:
        self.checkequalnofix(False, u'\u1FFc', 'isupper')
318
@requires_wide_build
def test_isupper_non_bmp(self):
    # isupper() on characters outside the BMP.
    cases = (
        (u'\U00010401', True),   # non-BMP, uppercase
        (u'\U00010427', True),
        (u'\U00010429', False),  # non-BMP, lowercase
        (u'\U0001044E', False),
        (u'\U0001F40D', False),  # non-BMP, non-cased
        (u'\U0001F46F', False),
    )
    for ch, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(ch.isupper())
330
def test_istitle(self):
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    # U+1FFC alone, and as the first letter of a word, counts as title.
    for text in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, text, 'istitle')
335
@requires_wide_build
def test_istitle_non_bmp(self):
    # non-BMP, uppercase + lowercase
    titled = (u'\U00010401\U00010429', u'\U00010427\U0001044E')
    for text in titled:
        self.assertTrue(text.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    untitled = [u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F']
    for ch in untitled:
        self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
344
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # (expected, character)
    for expected, ch in ((True, u'\u2000'),
                         (True, u'\u200a'),
                         (False, u'\u2014')):
        self.checkequalnofix(expected, ch, 'isspace')
350
@requires_wide_build
def test_isspace_non_bmp(self):
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_spaces = [
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F',
    ]
    for ch in non_spaces:
        self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch))
357
@requires_wide_build
def test_isalnum_non_bmp(self):
    # Every listed non-BMP character must report as alphanumeric.
    alnum_chars = [
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107',
    ]
    for ch in alnum_chars:
        self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
363
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    # U+1FFC must be reported as alphabetic.
    probe = u'\u1FFc'
    self.checkequalnofix(True, probe, 'isalpha')
367
@requires_wide_build
def test_isalpha_non_bmp(self):
    # non-BMP, cased
    for ch in (u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E'):
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.isalpha())
378
def test_isdecimal(self):
    # (expected, text)
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),    # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),   # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal takes no argument
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
390
@requires_wide_build
def test_isdecimal_non_bmp(self):
    # isdecimal() on characters outside the BMP.
    not_decimal = [
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107',
    ]
    decimal = [u'\U0001D7F6', u'\U000104A0', u'\U000104A0']
    for ch in not_decimal:
        self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))
    for ch in decimal:
        self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
398
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # (expected, character)
    for expected, ch in ((True, u'\u2460'),
                         (False, u'\xbc'),
                         (True, u'\u0660')):
        self.checkequalnofix(expected, ch, 'isdigit')
404
@requires_wide_build
def test_isdigit_non_bmp(self):
    # isdigit() on characters outside the BMP.
    not_digits = [
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F', u'\U00011065',
    ]
    digits = [u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']
    for ch in not_digits:
        self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
412
def test_isnumeric(self):
    # (expected, text)
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric takes no argument
    self.assertRaises(TypeError, u"abc".isnumeric, 42)
424
@requires_wide_build
def test_isnumeric_non_bmp(self):
    # isnumeric() on characters outside the BMP.
    not_numeric = [
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F',
    ]
    numeric = [
        u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
        u'\U000104A0', u'\U0001F107',
    ]
    for ch in not_numeric:
        self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch))
    for ch in numeric:
        self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch))
433
@requires_wide_build
def test_surrogates(self):
    # this test actually passes on narrow too, but it's just by accident.
    # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
    # uppercase as 'X X'
    lower_with_surrogates = (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
                             u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a')
    upper_with_surrogates = (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
                             u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A')
    lone_surrogates = (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF')

    for text in lower_with_surrogates:
        self.assertTrue(text.islower())
        self.assertFalse(text.isupper())
        self.assertFalse(text.istitle())
    for text in upper_with_surrogates:
        self.assertFalse(text.islower())
        self.assertTrue(text.isupper())
        self.assertTrue(text.istitle())

    # Strings made only of surrogates satisfy no case predicate.
    for meth_name in ('islower', 'isupper', 'istitle'):
        predicate = getattr(unicode, meth_name)
        for text in lone_surrogates:
            self.assertFalse(predicate(text),
                             '%r.%s() is False' % (text, meth_name))

    # Any string containing a surrogate fails the character-class
    # predicates.
    for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
                      'isdecimal', 'isnumeric'):
        predicate = getattr(unicode, meth_name)
        for text in lone_surrogates + lower_with_surrogates:
            self.assertFalse(predicate(text),
                             '%r.%s() is False' % (text, meth_name))
462
463
@requires_wide_build
def test_lower(self):
    # Shared lower tests first, then a non-BMP upper/lower letter pair.
    string_tests.CommonTest.test_lower(self)
    UP, LOW = u'\U00010427', u'\U0001044F'
    cases = (
        (UP, LOW),
        (UP + UP, LOW + LOW),
        (UP + LOW, LOW + LOW),
        (u'X' + UP + u'x' + LOW, u'x' + LOW + u'x' + LOW),
    )
    for text, expected in cases:
        self.assertEqual(text.lower(), expected)
474
@requires_wide_build
def test_upper(self):
    # Shared upper tests first, then a non-BMP upper/lower letter pair.
    string_tests.CommonTest.test_upper(self)
    UP, LOW = u'\U00010427', u'\U0001044F'
    cases = (
        (LOW, UP),
        (LOW + LOW, UP + UP),
        (UP + LOW, UP + UP),
        (u'X' + UP + u'x' + LOW, u'X' + UP + u'X' + UP),
    )
    for text, expected in cases:
        self.assertEqual(text.upper(), expected)
485
@requires_wide_build
def test_capitalize_non_bmp(self):
    # capitalize() on a non-BMP upper/lower letter pair.
    #
    # This method must not be called test_capitalize: a second method of
    # that name in the same class body replaces the first, so the BMP
    # capitalize checks defined earlier would never run (and would be
    # skipped along with this one on narrow builds).  The shared
    # CommonTest.test_capitalize checks are invoked by that earlier
    # method, so they are not repeated here.
    UP, LOW = u'\U00010427', u'\U0001044F'
    cases = (
        (LOW, UP),
        (LOW + LOW, UP + LOW),
        (UP + LOW, UP + LOW),
        (LOW + UP, UP + LOW),
        (u'X' + UP + u'x' + LOW, u'X' + LOW + u'x' + LOW),
    )
    for text, expected in cases:
        self.assertEqual(text.capitalize(), expected)
498
@requires_wide_build
def test_title(self):
    # Shared title tests first, then a non-BMP upper/lower letter pair.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    UP, LOW = u'\U00010427', u'\U0001044F'
    titled_word = UP + LOW
    two_titled = titled_word + u' ' + titled_word
    mixed_word = u'X' + UP + u'x' + LOW
    mixed_titled = u'X' + LOW + u'x' + LOW
    cases = (
        (LOW, UP),
        (LOW + LOW, titled_word),
        (LOW + LOW + u' ' + LOW + LOW, two_titled),
        (UP + LOW + u' ' + UP + LOW, two_titled),
        (LOW + UP + u' ' + LOW + UP, two_titled),
        (mixed_word + u' ' + mixed_word, mixed_titled + u' ' + mixed_titled),
    )
    for text, expected in cases:
        self.assertEqual(text.title(), expected)
513
@requires_wide_build
def test_swapcase(self):
    # Shared swapcase tests first, then a non-BMP upper/lower letter pair.
    string_tests.CommonTest.test_swapcase(self)
    UP, LOW = u'\U00010427', u'\U0001044F'
    cases = (
        (LOW, UP),
        (UP, LOW),
        (LOW + LOW, UP + UP),
        (UP + LOW, LOW + UP),
        (LOW + UP, UP + LOW),
        (u'X' + UP + u'x' + LOW, u'x' + LOW + u'X' + UP),
    )
    for text, expected in cases:
        self.assertEqual(text.swapcase(), expected)
527
def test_contains(self):
    # Testing Unicode contains method
    def check(present, needle, container):
        # Dispatch to assertIn / assertNotIn according to `present`.
        if present:
            self.assertIn(needle, container)
        else:
            self.assertNotIn(needle, container)

    # (expected membership, needle, container)
    explicit = (
        (True, 'a', u'abdb'),
        (True, 'a', u'bdab'),
        (True, 'a', u'bdaba'),
        (True, 'a', u'bdba'),
        (True, 'a', u'bdba'),
        (True, u'a', u'bdba'),
        (False, u'a', u'bdb'),
        (False, u'a', 'bdb'),
        (True, u'a', 'bdba'),
        (True, u'a', ('a', 1, None)),
        (True, u'a', (1, None, 'a')),
        (True, u'a', (1, None, u'a')),
        (True, 'a', ('a', 1, None)),
        (True, 'a', (1, None, 'a')),
        (True, 'a', (1, None, u'a')),
        (False, 'a', ('x', 1, u'y')),
        (False, 'a', ('x', 1, None)),
        (False, u'abcd', u'abcxxxx'),
        (True, u'ab', u'abcd'),
        (True, 'ab', u'abc'),
        (True, u'ab', 'abc'),
        (True, u'ab', (1, None, u'ab')),
        (True, u'', u'abc'),
        (True, '', u'abc'),
    )
    for present, needle, container in explicit:
        check(present, needle, container)

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
    self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')

    # Each (needle, haystack) pair is tried as unicode-in-str,
    # str-in-unicode and unicode-in-unicode, in that order.
    type_combos = ((unicode, str), (str, unicode), (unicode, unicode))
    pairs = (
        (True, '', ''),
        (True, '', 'abc'),
        (False, '\0', 'abc'),
        (True, '\0', '\0abc'),
        (True, '\0', 'abc\0'),
        (True, 'a', '\0abc'),
        (True, 'asdf', 'asdf'),
        (False, 'asdf', 'asd'),
        (False, 'asdf', ''),
    )
    for present, needle, haystack in pairs:
        for needle_type, haystack_type in type_combos:
            check(present, needle_type(needle), haystack_type(haystack))

    # Missing operand, and an operand that is not a string.
    self.assertRaises(TypeError, u"abc".__contains__)
    self.assertRaises(TypeError, u"abc".__contains__, object())
591
def test_formatting(self):
    # %-formatting that involves unicode: unicode format strings, str
    # format strings with unicode arguments (which yield unicode), the
    # %c conversion, and '*' width/precision taken from the arguments.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads to five columns, so the values that format to four
    # characters are preceded by two spaces (separator + one pad).
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
    self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))

    for num in range(0x00,0x80):
        char = chr(num)
        self.assertEqual(u"%c" % char, unicode(char))
        self.assertEqual(u"%c" % num, unicode(char))
        self.assertTrue(char == u"%c" % char)
        self.assertTrue(char == u"%c" % num)
    # Issue 7649
    for num in range(0x80,0x100):
        uchar = unichr(num)
        self.assertEqual(uchar, u"%c" % num)   # works only with ints
        self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
        # the implicit decoding should fail for non-ascii chars
        self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
        self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # '*' reads the field width (and, after '.', the precision) from
    # the argument tuple; a negative width left-justifies.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
    # An object whose __str__ returns unicode makes a str format string
    # produce a unicode result.
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
646
@test_support.cpython_only
def test_formatting_huge_precision(self):
    # A precision one past INT_MAX must be rejected with ValueError.
    from _testcapi import INT_MAX
    oversized = u"%.{}f".format(INT_MAX + 1)
    self.assertRaises(ValueError, lambda: oversized % 2.34)
653
def test_formatting_huge_width(self):
    # A field width one past sys.maxsize must be rejected with ValueError.
    oversized = u"%{}f".format(sys.maxsize + 1)
    self.assertRaises(ValueError, lambda: oversized % 2.34)
658
def test_startswith_endswith_errors(self):
    # Both methods must raise UnicodeDecodeError for a non-ASCII byte
    # string, and a TypeError whose message names the accepted types
    # when given a list.
    for bound_method in (u'foo'.startswith, u'foo'.endswith):
        self.assertRaises(UnicodeDecodeError, bound_method, '\xff')
        with self.assertRaises(TypeError) as caught:
            bound_method(['f'])
        message = str(caught.exception)
        for type_name in ('unicode', 'str', 'tuple'):
            self.assertIn(type_name, message)
669
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    # should not format with a comma, but always with C locale
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
674
675	def test_constructor(self):
676	# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
677
678	self.assertEqual(
679	unicode(u'unicode remains unicode'),
680	u'unicode remains unicode'
681	)
682
683	class UnicodeSubclass(unicode):
684	pass
685
686	self.assertEqual(
687	unicode(UnicodeSubclass('unicode subclass becomes unicode')),
688	u'unicode subclass becomes unicode'
689	)
690
691	self.assertEqual(
692	unicode('strings are converted to unicode'),
693	u'strings are converted to unicode'
694	)
695
696	class UnicodeCompat:
697	def __init__(self, x):
698	self.x = x
699	def __unicode__(self):
700	return self.x
701
702	self.assertEqual(
703	unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
704	u'__unicode__ compatible objects are recognized')
705
706	class StringCompat:
707	def __init__(self, x):
708	self.x = x
709	def __str__(self):
710	return self.x
711
712	self.assertEqual(
713	unicode(StringCompat('__str__ compatible objects are recognized')),
714	u'__str__ compatible objects are recognized'
715	)
716
717	# unicode(obj) is compatible to str():
718
719	o = StringCompat('unicode(obj) is compatible to str()')
720	self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
721	self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
722
723	# %-formatting and .__unicode__()
724	self.assertEqual(u'%s' %
725	UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
726	u"u'%s' % obj uses obj.__unicode__()")
727	self.assertEqual(u'%s' %
728	UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
729	u"u'%s' % obj falls back to obj.__str__()")
730
731	for obj in (123, 123.45, 123L):
732	self.assertEqual(unicode(obj), unicode(str(obj)))
733
734	# unicode(obj, encoding, error) tests (this maps to
735	# PyUnicode_FromEncodedObject() at C level)
736
737	if not sys.platform.startswith('java'):
738	self.assertRaises(
739	TypeError,
740	unicode,
741	u'decoding unicode is not supported',
742	'utf-8',
743	'strict'
744	)
745
746	self.assertEqual(
747	unicode('strings are decoded to unicode', 'utf-8', 'strict'),
748	u'strings are decoded to unicode'
749	)
750
751	if not sys.platform.startswith('java'):
752	with test_support.check_py3k_warnings():
753	buf = buffer('character buffers are decoded to unicode')
754	self.assertEqual(
755	unicode(
756	buf,
757	'utf-8',
758	'strict'
759	),
760	u'character buffers are decoded to unicode'
761	)
762
763	self.assertRaises(TypeError, unicode, 42, 42, 42)
764
765	def test_codecs_utf7(self):
766	utfTests = [
767	(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
768	(u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
769	(u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
770	(u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
771	(u'+', '+-'),
772	(u'+-', '+--'),
773	(u'+?', '+-?'),
774	(u'\?', '+AFw?'),
775	(u'+?', '+-?'),
776	(ur'\\?', '+AFwAXA?'),
777	(ur'\\\?', '+AFwAXABc?'),
778	(ur'++--', '+-+---'),
779	(u'\U000abcde', '+2m/c3g-'), # surrogate pairs
780	(u'/', '/'),
781	]
782
783	for (x, y) in utfTests:
784	self.assertEqual(x.encode('utf-7'), y)
785
786	# Unpaired surrogates are passed through
787	self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
788	self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
789	self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
790	self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
791	self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
792	self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
793	self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
794	self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
795
796	self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
797	self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
798
799	# Direct encoded characters
800	set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
801	# Optional direct characters
802	set_o = '!"#$%&*;<=>@[]^_`{\|}'
803	for c in set_d:
804	self.assertEqual(c.encode('utf7'), c.encode('ascii'))
805	self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
806	self.assertTrue(c == c.encode('ascii').decode('utf7'))
807	for c in set_o:
808	self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
809	self.assertTrue(c == c.encode('ascii').decode('utf7'))
810
def test_codecs_utf8(self):
    # Encoder checks: empty string, a BMP character, surrogate pairs
    # (encoded as one 4-byte sequence) and lone surrogates (encoded as
    # 3-byte sequences).
    encode_cases = (
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
    )
    for text, encoded in encode_cases:
        self.assertEqual(text.encode('utf-8'), encoded)

    # The same surrogate pair repeated many times
    repeated_pair = u'\ud800\udc02' * 1000
    self.assertEqual(repeated_pair.encode('utf-8'),
                     '\xf0\x90\x80\x82' * 1000)

    # A longer passage mixing CJK and ASCII text
    passage = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    passage_utf8 = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(passage.encode('utf-8'), passage_utf8)

    # UTF-8 specific decoding tests
    decode_cases = (
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    )
    for encoded, text in decode_cases:
        self.assertEqual(unicode(encoded, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
849
def test_utf8_decode_valid_sequences(self):
    # Decode the first and last valid sequence of each encoded length,
    # then round-trip every code point of this build through UTF-8.
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # The upper bound is sys.maxunicode + 1 so that the highest code
    # point is included; xrange avoids building a list with one entry
    # per code point before the loop starts.
    for code in xrange(sys.maxunicode + 1):
        ch = unichr(code)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
868
def test_utf8_decode_invalid_sequences(self):
    # Every byte sequence built below must be rejected by the strict
    # UTF-8 decoder (except the surrogate range, which Python 2 accepts
    # and which is therefore checked for a successful round-trip).

    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = map(chr, range(0x80, 0xC0))
    # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
    invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
    # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
    invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
    # The last range starts at 0xF8: 0xF7 is already covered by
    # invalid_4B_seq_start_bytes and need not be tested twice.
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
    )

    for byte in invalid_start_bytes:
        self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                self.assertRaises(UnicodeDecodeError,
                                  (sb+cb1+'\x80'+cb3).decode, 'utf-8')

    # 3-byte sequences starting with 0xE0 whose second byte is below 0xA0
    for cb in map(chr, range(0x80, 0xA0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\xBF').decode, 'utf-8')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\x80').decode, 'utf-8')
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\xBF').decode, 'utf-8')
    # but since they are valid on Python 2 add a test for that:
    for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
                             map(unichr, range(0xd800, 0xe000, 64))):
        encoded = '\xED'+cb+'\x80'
        self.assertEqual(encoded.decode('utf-8'), surrogate)
        self.assertEqual(surrogate.encode('utf-8'), encoded)

    # 4-byte sequences starting with 0xF0 whose second byte is below 0x90
    for cb in map(chr, range(0x80, 0x90)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
    # 4-byte sequences starting with 0xF4 whose second byte is 0x90 or above
    for cb in map(chr, range(0x90, 0xC0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
924
def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # Each entry pairs an invalid byte sequence with the text expected
    # from decoding it with the 'replace' error handler.  "cb" in the
    # comments stands for "continuation byte".
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD), # continuation byte
        ('\x80\x80', FFFD*2), # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD), # only the start byte
        ('\xc2\xc2', FFFD*2), # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
        ('\xc2\x41', FFFD+'A'), # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD), # only the start byte
        ('\xe1\xe1', FFFD*2), # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
        ('\xe1\x80', FFFD), # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'), # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD), # only the start byte
        ('\xf1\xf1', FFFD*2), # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
        ('\xf1\x80', FFFD), # only 1 continuation byte
        ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # invalid cb, valid cb, invalid cb
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD), # only the start byte
        ('\xf5\xf5', FFFD*2), # 2 start bytes
        ('\xf5\x80', FFFD*2), # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD), # only the start byte
        ('\xf8\xf8', FFFD*2), # 2 start bytes
        ('\xf8\x80', FFFD*2), # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD), # only the start byte
        ('\xfc\xfc', FFFD*2), # 2 start bytes
        ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # start byte + 5 continuation bytes
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # strict must reject the sequence outright
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        # replace substitutes U+FFFD, with and without trailing valid data
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # ignore yields the same text with the replacement characters dropped
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))
1013
def test_codecs_idna(self):
    # Test whether trailing dot is preserved
    hostname = u"www.python.org."
    encoded = hostname.encode("idna")
    self.assertEqual(encoded, "www.python.org.")
1017
def test_codecs_errors(self):
    # Error handling (encoding)
    text = u'Andr\202 x'
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), "Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), "Andr? x")
    # positional and keyword argument spellings must agree
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    raw = 'Andr\202 x'
    self.assertRaises(UnicodeError, unicode, raw, 'ascii')
    self.assertRaises(UnicodeError, unicode, raw, 'ascii', 'strict')
    self.assertEqual(unicode(raw, 'ascii', 'ignore'), u"Andr x")
    self.assertEqual(unicode(raw, 'ascii', 'replace'), u'Andr\uFFFD x')
    # positional and keyword argument spellings must agree
    plain = u'abcde'
    self.assertEqual(plain.decode('ascii', 'ignore'),
                     plain.decode('ascii', errors='ignore'))
    self.assertEqual(plain.decode('ascii', 'replace'),
                     plain.decode(encoding='ascii', errors='replace'))

    # Error handling (unknown character names)
    unknown_name = "\\N{foo}xx"
    self.assertEqual(unknown_name.decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Codecs registered at module level that return malformed results
    greeting = "hello"
    self.assertRaises(TypeError, greeting.decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, greeting, "test.unicode2")
    ugreeting = u"hello"
    self.assertRaises(TypeError, ugreeting.encode, "test.unicode1")
    self.assertRaises(TypeError, ugreeting.encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, ugreeting.encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
1063
def test_codecs(self):
    # Encoding
    word = u'hello'
    for encoding in ('ascii', 'utf-7', 'utf-8', 'utf8'):
        self.assertEqual(word.encode(encoding), 'hello')
    self.assertEqual(word.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
    self.assertEqual(word.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
    self.assertEqual(word.encode('latin-1'), 'hello')

    # Roundtrip safety for BMP: each row gives how many leading code
    # points to try and which codecs must round-trip them
    # (first 1024 chars, first 256 chars, first 128 chars).
    bmp_roundtrips = (
        (1024, ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                'utf-16-be', 'raw_unicode_escape',
                'unicode_escape', 'unicode_internal')),
        (256, ('latin-1',)),
        (128, ('ascii',)),
    )
    for limit, encodings in bmp_roundtrips:
        for code in xrange(limit):
            char = unichr(code)
            for encoding in encodings:
                self.assertEqual(unicode(char.encode(encoding), encoding),
                                 char)

    # Roundtrip safety for non-BMP (just a few chars)
    astral = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                     #'raw_unicode_escape',
                     'unicode_escape', 'unicode_internal'):
        self.assertEqual(unicode(astral.encode(encoding), encoding), astral)

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    code_points = range(0, 0xd800) + range(0xe000, 0x10000)
    ucs2_text = u''.join(unichr(code) for code in code_points)
    for encoding in ('utf-8',):
        self.assertEqual(unicode(ucs2_text.encode(encoding), encoding),
                         ucs2_text)
1108
def test_codecs_charmap(self):
    # Decode a run of bytes with each charmap codec and encode it
    # again; the result must equal the original bytes.

    # 0-127
    low_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'

    )
    low_bytes = ''.join(chr(value) for value in xrange(128))
    for codec in low_codecs:
        self.assertEqual(unicode(low_bytes, codec).encode(codec), low_bytes)

    # 128-255
    high_codecs = (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',

    )
    high_bytes = ''.join(chr(value) for value in xrange(128, 256))
    for codec in high_codecs:
        self.assertEqual(unicode(high_bytes, codec).encode(codec), high_bytes)
1162
1163	def test_concatenation(self):
1164	self.assertEqual((u"abc" u"def"), u"abcdef")
1165	self.assertEqual(("abc" u"def"), u"abcdef")
1166	self.assertEqual((u"abc" "def"), u"abcdef")
1167	self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
1168	self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
1169
def test_printing(self):
    # Print unicode and str values, alone and mixed, with and without a
    # trailing comma, to a sink that discards its input.  Nothing is
    # asserted: the statements only have to complete without raising.
    class NullWriter:
        def write(self, data):
            pass

    sink = NullWriter()
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
1185
1186	def test_ucs4(self):
1187	x = u'\U00100000'
1188	y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1189	self.assertEqual(x, y)
1190
1191	y = r'\U00100000'
1192	x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1193	self.assertEqual(x, y)
1194	y = r'\U00010000'
1195	x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1196	self.assertEqual(x, y)
1197
1198	try:
1199	'\U11111111'.decode("raw-unicode-escape")
1200	except UnicodeDecodeError as e:
1201	self.assertEqual(e.start, 0)
1202	self.assertEqual(e.end, 10)
1203	else:
1204	self.fail("Should have raised UnicodeDecodeError")
1205
def test_conversion(self):
    # Make sure __unicode__() works properly

    # old-style class with only __str__
    class Foo0:
        def __str__(self):
            return "foo"

    # old-style class with __unicode__
    class Foo1:
        def __unicode__(self):
            return u"foo"

    # new-style class with __unicode__
    class Foo2(object):
        def __unicode__(self):
            return u"foo"

    # __unicode__ returning a str
    class Foo3(object):
        def __unicode__(self):
            return "foo"

    # str subclass whose __unicode__ returns a str
    class Foo4(str):
        def __unicode__(self):
            return "foo"

    # unicode subclass whose __unicode__ returns a str
    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    # str subclass defining both conversions
    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    # unicode subclass defining both conversions
    class Foo7(unicode):
        def __str__(self):
            return "foos"
        def __unicode__(self):
            return u"foou"

    # unicode subclass that doubles its content and converts to itself
    class Foo8(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)
        def __unicode__(self):
            return self

    # unicode subclass whose two conversions disagree
    class Foo9(unicode):
        def __str__(self):
            return "string"
        def __unicode__(self):
            return "not unicode"

    conversions = (
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    )
    for instance, expected in conversions:
        self.assertEqual(unicode(instance), expected)

    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1268
1269	def test_unicode_repr(self):
1270	class s1:
1271	def __repr__(self):
1272	return '\\n'
1273
1274	class s2:
1275	def __repr__(self):
1276	return u'\\n'
1277
1278	self.assertEqual(repr(s1()), '\\n')
1279	self.assertEqual(repr(s2()), '\\n')
1280
def test_expandtabs_overflows_gracefully(self):
    # This test only affects 32-bit platforms because expandtabs can only take
    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
    # to take a 64-bit long, this test should apply to all platforms.
    small_maxint = sys.maxint <= (1 << 32)
    if small_maxint and struct.calcsize('P') == 4:
        self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
1288
1289	def test__format__(self):
1290	def test(value, format, expected):
1291	# test both with and without the trailing 's'
1292	self.assertEqual(value.__format__(format), expected)
1293	self.assertEqual(value.__format__(format + u's'), expected)
1294
1295	test(u'', u'', u'')
1296	test(u'abc', u'', u'abc')
1297	test(u'abc', u'.3', u'abc')
1298	test(u'ab', u'.3', u'ab')
1299	test(u'abcdef', u'.3', u'abc')
1300	test(u'abcdef', u'.0', u'')
1301	test(u'abc', u'3.3', u'abc')
1302	test(u'abc', u'2.3', u'abc')
1303	test(u'abc', u'2.2', u'ab')
1304	test(u'abc', u'3.2', u'ab ')
1305	test(u'result', u'x<0', u'result')
1306	test(u'result', u'x<5', u'result')
1307	test(u'result', u'x<6', u'result')
1308	test(u'result', u'x<7', u'resultx')
1309	test(u'result', u'x<8', u'resultxx')
1310	test(u'result', u' <7', u'result ')
1311	test(u'result', u'<7', u'result ')
1312	test(u'result', u'>7', u' result')
1313	test(u'result', u'>8', u' result')
1314	test(u'result', u'^8', u' result ')
1315	test(u'result', u'^9', u' result ')
1316	test(u'result', u'^10', u' result ')
1317	test(u'a', u'10000', u'a' + u' ' * 9999)
1318	test(u'', u'10000', u' ' * 10000)
1319	test(u'', u'10000000', u' ' * 10000000)
1320
1321	# test mixing unicode and str
1322	self.assertEqual(u'abc'.__format__('s'), u'abc')
1323	self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1324
def test_format(self):
    # Exercise unicode.format(): literal text and brace escaping, field
    # lookup by index/attribute/key, format specs, !r/!s conversions,
    # user-defined __format__, and the errors raised for malformed
    # format strings.  Expected padding is written as explicit
    # repetitions wherever more than one pad character is involved, so
    # that the width of each expected string is unambiguous.
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
    self.assertEqual(u'{0:>8s}'.format(u'result'), u' ' * 2 + u'result')
    self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
    # when centering, an odd amount of padding puts the extra pad
    # character on the right
    self.assertEqual(u'{0:^9s}'.format(u'result'), u' result' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(u'result'),
                     u' ' * 2 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    msg = 'object.__format__ with a non-empty format string is deprecated'
    with test_support.check_warnings((msg, PendingDeprecationWarning)):
        # 'E(data)' is 7 characters wide: centering in 10 leaves one
        # pad character on the left and two on the right
        self.assertEqual(u'{0:^10}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:^10s}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                    month=8,
                                                    day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5),
                     u'hello' + u' ' * 5)
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'),
                     u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(IndexError, u"{:}".format)
    self.assertRaises(IndexError, u"{:s}".format)
    self.assertRaises(IndexError, u"{}".format)
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    #  will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    #  will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1570
1571	def test_format_huge_precision(self):
1572	format_string = u".{}f".format(sys.maxsize + 1)
1573	with self.assertRaises(ValueError):
1574	result = format(2.34, format_string)
1575
1576	def test_format_huge_width(self):
1577	format_string = u"{}f".format(sys.maxsize + 1)
1578	with self.assertRaises(ValueError):
1579	result = format(2.34, format_string)
1580
1581	def test_format_huge_item_number(self):
1582	format_string = u"{{{}:.6f}}".format(sys.maxsize + 1)
1583	with self.assertRaises(ValueError):
1584	result = format_string.format(2.34)
1585
1586	def test_format_auto_numbering(self):
1587	class C:
1588	def __init__(self, x=100):
1589	self._x = x
1590	def __format__(self, spec):
1591	return spec
1592
1593	self.assertEqual(u'{}'.format(10), u'10')
1594	self.assertEqual(u'{:5}'.format('s'), u's ')
1595	self.assertEqual(u'{!r}'.format('s'), u"'s'")
1596	self.assertEqual(u'{._x}'.format(C(10)), u'10')
1597	self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
1598	self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
1599	self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')
1600
1601	self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a x b')
1602	self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')
1603
1604	# can't mix and match numbering and auto-numbering
1605	self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
1606	self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
1607	self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
1608	self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)
1609
1610	# can mix and match auto-numbering and named
1611	self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
1612	self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
1613	self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
1614	self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1615
1616	def test_raiseMemError(self):
1617	# Ensure that the freelist contains a consistent object, even
1618	# when a string allocation fails with a MemoryError.
1619	# This used to crash the interpreter,
1620	# or leak references when the number was smaller.
1621	charwidth = 4 if sys.maxunicode >= 0x10000 else 2
1622	# Note: sys.maxsize is half of the actual max allocation because of
1623	# the signedness of Py_ssize_t.
1624	alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
1625	self.assertRaises(MemoryError, alloc)
1626	self.assertRaises(MemoryError, alloc)
1627
def test_format_subclass(self):
    # A unicode subclass that overrides __unicode__: both %-formatting
    # and str.format() are expected to yield the overridden text.
    class U(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'

    instance = U(u'xxx')
    percent_formatted = "%s" % instance
    brace_formatted = "{}".format(instance)
    self.assertEqual(percent_formatted, u'__unicode__ overridden')
    self.assertEqual(brace_formatted, '__unicode__ overridden')
1635
1636	def test_encode_decimal(self):
1637	from _testcapi import unicode_encodedecimal
1638	self.assertEqual(unicode_encodedecimal(u'123'),
1639	b'123')
1640	self.assertEqual(unicode_encodedecimal(u'\u0663.\u0661\u0664'),
1641	b'3.14')
1642	self.assertEqual(unicode_encodedecimal(u"\N{EM SPACE}3.14\N{EN SPACE}"),
1643	b' 3.14 ')
1644	self.assertRaises(UnicodeEncodeError,
1645	unicode_encodedecimal, u"123\u20ac", "strict")
1646	self.assertEqual(unicode_encodedecimal(u"123\u20ac", "replace"),
1647	b'123?')
1648	self.assertEqual(unicode_encodedecimal(u"123\u20ac", "ignore"),
1649	b'123')
1650	self.assertEqual(unicode_encodedecimal(u"123\u20ac", "xmlcharrefreplace"),
1651	b'123€')
1652	self.assertEqual(unicode_encodedecimal(u"123\u20ac", "backslashreplace"),
1653	b'123\\u20ac')
1654	self.assertEqual(unicode_encodedecimal(u"123\u20ac\N{EM SPACE}", "replace"),
1655	b'123? ')
1656	self.assertEqual(unicode_encodedecimal(u"123\u20ac\u20ac", "replace"),
1657	b'123??')
1658	self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
1659	b'123?0')
1660
1661	def test_encode_decimal_with_surrogates(self):
1662	from _testcapi import unicode_encodedecimal
1663	tests = [(u'\U0001f49d', '💝'),
1664	(u'\ud83d', '&#55357;'),
1665	(u'\udc9d', '&#56477;'),
1666	]
1667	if u'\ud83d\udc9d' != u'\U0001f49d':
1668	tests += [(u'\ud83d\udc9d', '&#55357;&#56477;')]
1669	for s, exp in tests:
1670	self.assertEqual(
1671	unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"),
1672	'123' + exp)
1673
def test_main():
    # Hand this module's name to test_support.run_unittest().
    this_module = __name__
    test_support.run_unittest(this_module)


# Run the tests when the file is executed as a script.
if __name__ == "__main__":
    test_main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/test/test_unicode.py

Download in other formats: