source: python/trunk/Lib/test/test_unicode.py

Last change on this file was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 74.4 KB
Line 
1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""#"
8import sys
9import struct
10import codecs
11import unittest
12from test import test_support, string_tests
13
# Decorator that skips a test on narrow builds, i.e. when sys.maxunicode
# is 0xFFFF.
requires_wide_build = unittest.skipIf(
    sys.maxunicode == 0xFFFF,
    'requires wide build',
)
17
# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that supplies two deliberately broken codecs.

    "test.unicode1" resolves to an encoder/decoder pair returning a bare
    integer instead of a tuple; "test.unicode2" resolves to a pair that
    returns a tuple holding integers only.  Every other name yields None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple

    def decode1(input, errors="strict"):
        return 42 # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42) # no unicode

    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None


codecs.register(search_function)
35
36class UnicodeTest(
37 string_tests.CommonTest,
38 string_tests.MixinStrUnicodeUserStringTest,
39 string_tests.MixinStrUnicodeTest,
40 ):
41 type2test = unicode
42
def assertEqual(self, first, second, msg=None):
    """Strict assertEqual: reject implicit bytes/unicode equality.

    The regular equality check runs first.  Afterwards, if either operand
    is unicode both must be unicode; otherwise, if either operand is str
    both must be str.
    """
    super(UnicodeTest, self).assertEqual(first, second, msg)
    operands = (first, second)
    for string_type in (unicode, str):
        if any(isinstance(value, string_type) for value in operands):
            for value in operands:
                self.assertIsInstance(value, string_type)
            # Only the first matching string type is enforced.
            break
52
def checkequalnofix(self, result, object, methodname, *args):
    """Call object.methodname(*args) and compare the outcome with result.

    Both the value and the exact type of the outcome must match.  When the
    call hands back the receiver itself, it is repeated on an instance of a
    unicode subclass, which must not be returned as-is.
    """
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assertTrue(type(actual) is type(result))

    if actual is not object:
        return

    # The original was returned: make sure that this doesn't happen
    # with subclasses.
    class usub(unicode):
        def __repr__(self):
            return 'usub(%r)' % unicode.__repr__(self)

    receiver = usub(object)
    actual = getattr(receiver, methodname)(*args)
    self.assertEqual(actual, result)
    self.assertTrue(receiver is not actual)
70
def test_literals(self):
    """Check equivalent escape spellings and rejected wide escapes."""
    equivalent_spellings = (
        (u'\xff', u'\u00ff'),
        (u'\uffff', u'\U0000ffff'),
    )
    for short_form, long_form in equivalent_spellings:
        self.assertEqual(short_form, long_form)

    # Evaluating any of these literals is expected to raise SyntaxError.
    rejected_sources = (
        "u'\\Ufffffffe'",
        "u'\\Uffffffff'",
        "u'\\U%08x'" % 0x110000,
    )
    for source in rejected_sources:
        self.assertRaises(SyntaxError, eval, source)
77
def test_repr(self):
    """Check repr() of unicode objects; does nothing on Java platforms."""
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    simple_cases = (
        (u'abc', "u'abc'"),
        (u'ab\\c', "u'ab\\\\c'"),
        (u'ab\\', "u'ab\\\\'"),
        (u'\\c', "u'\\\\c'"),
        (u'\\', "u'\\\\'"),
        (u'\n', "u'\\n'"),
        (u'\r', "u'\\r'"),
        (u'\t', "u'\\t'"),
        (u'\b', "u'\\x08'"),
        (u"'\"", """u'\\'"'"""),
        (u"'\"", """u'\\'"'"""),
        (u"'", '''u"'"'''),
        (u'"', """u'"'"""),
    )
    for value, expected in simple_cases:
        self.assertEqual(repr(value), expected)

    # Expected repr of the 256 code points U+0000..U+00FF joined together.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    latin1_text = u''.join(unichr(code) for code in xrange(256))
    self.assertEqual(repr(latin1_text), latin1repr)

    # Test repr works on wide unicode escapes without overflow.
    wide_text = u"\U00010000" * 39 + u"\uffff" * 4096
    self.assertEqual(repr(wide_text), repr(wide_text))
114
115
def test_count(self):
    """Run the shared count tests, then mix str and unicode operands."""
    string_tests.CommonTest.test_count(self)
    # check mixed argument types: (expected, receiver, count arguments)
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, receiver, arguments in mixed_cases:
        self.checkequalnofix(expected, receiver, 'count', *arguments)
128
def test_find(self):
    """Run the shared find tests, then unicode-specific find checks.

    This method overrides the inherited test of the same name, so the
    shared checks only run if they are invoked explicitly, as the sibling
    count/rfind/index/rindex overrides in this class do.
    """
    # NOTE(review): assumes string_tests.CommonTest defines test_find, as
    # it does for the count/rfind/index/rindex tests -- confirm.
    string_tests.CommonTest.test_find(self)

    self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
    self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
    self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)

    # A missing or non-string argument must raise TypeError.
    self.assertRaises(TypeError, u'hello'.find)
    self.assertRaises(TypeError, u'hello'.find, 42)
136
def test_rfind(self):
    """Run the shared rfind tests, then mix str and unicode operands."""
    string_tests.CommonTest.test_rfind(self)
    # check mixed argument types: (expected, receiver, substring)
    mixed_cases = (
        (9, 'abcdefghiabc', u'abc'),
        (12, 'abcdefghiabc', u''),
        (12, u'abcdefghiabc', ''),
    )
    for expected, receiver, substring in mixed_cases:
        self.checkequalnofix(expected, receiver, 'rfind', substring)
143
def test_index(self):
    """Run the shared index tests, then mix str and unicode operands."""
    string_tests.CommonTest.test_index(self)

    # (expected, receiver text, substring, extra positional arguments)
    found_cases = (
        (0, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (0, 'abcdefghiabc', 'abc', ()),
        (9, 'abcdefghiabc', 'abc', (1,)),
    )
    # (receiver text, substring, extra positional arguments)
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('abcdefghiab', 'abc', (1,)),
        ('abcdefghi', 'ghi', (8,)),
        ('abcdefghi', 'ghi', (-1,)),
    )

    # check mixed argument types
    for t1, t2 in ((str, unicode), (unicode, str)):
        for expected, text, sub, extra in found_cases:
            self.checkequalnofix(expected, t1(text), 'index', t2(sub), *extra)
        for text, sub, extra in missing_cases:
            self.assertRaises(ValueError, t1(text).index, t2(sub), *extra)
156
def test_rindex(self):
    """Run the shared rindex tests, then mix str and unicode operands."""
    string_tests.CommonTest.test_rindex(self)

    # (expected, receiver text, substring, extra positional arguments)
    found_cases = (
        (12, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (9, 'abcdefghiabc', 'abc', ()),
        (0, 'abcdefghiabc', 'abc', (0, -1)),
    )
    # (receiver text, substring, extra positional arguments)
    missing_cases = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )

    # check mixed argument types
    for t1, t2 in ((str, unicode), (unicode, str)):
        for expected, text, sub, extra in found_cases:
            self.checkequalnofix(expected, t1(text), 'rindex', t2(sub), *extra)
        for text, sub, extra in missing_cases:
            self.assertRaises(ValueError, t1(text).rindex, t2(sub), *extra)
171
def test_translate(self):
    """Check unicode.translate() with deleting and replacing tables."""
    # (expected, receiver, translation table); None deletes the character.
    cases = (
        (u'bbbc', u'abababc', {ord('a'):None}),
        (u'iiic', u'abababc', {ord('a'):None, ord('b'):ord('i')}),
        (u'iiix', u'abababc',
         {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'}),
        (u'<i><i><i>c', u'abababc', {ord('a'):None, ord('b'):u'<i>'}),
        (u'c', u'abababc', {ord('a'):None, ord('b'):u''}),
        (u'xyyx', u'xzx', {ord('z'):u'yy'}),
    )
    for expected, receiver, table in cases:
        self.checkequalnofix(expected, receiver, 'translate', table)

    # A missing table and a str (non-unicode) replacement are both errors.
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
182
def test_split(self):
    """Run the shared split tests, then mix str and unicode operands."""
    string_tests.CommonTest.test_split(self)

    # Mixed arguments: (expected, receiver, separator)
    letters = [u'a', u'b', u'c', u'd']
    mixed_cases = (
        (letters, u'a//b//c//d', '//'),
        (letters, 'a//b//c//d', u'//'),
        ([u'endcase ', u''], u'endcase test', 'test'),
    )
    for expected, receiver, separator in mixed_cases:
        self.checkequalnofix(expected, receiver, 'split', separator)
190
def test_join(self):
    """Run the shared join tests, then mix str and unicode operands."""
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments: (expected, separator, items to join)
    unicode_letters = (u'a', u'b', u'c', u'd')
    mixed_cases = (
        (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
        (u'abcd', u'', unicode_letters),
        (u'w x y z', u' ', string_tests.Sequence('wxyz')),
        (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
        (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
        (u'abcd', '', unicode_letters),
        (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
    )
    for expected, separator, items in mixed_cases:
        self.checkequalnofix(expected, separator, 'join', items)
202
def test_strip(self):
    """Run the shared strip tests and check a non-ASCII str argument."""
    string_tests.CommonTest.test_strip(self)
    # Stripping with a non-ASCII byte string is expected to raise
    # UnicodeError.
    non_ascii_chars = "\xff"
    self.assertRaises(UnicodeError, u"hello".strip, non_ascii_chars)
206
def test_replace(self):
    """Run the shared replace tests, then call str.replace with unicode."""
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    replace_args = (u'!', u'@', 1)
    self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace',
                         *replace_args)
    # An integer replacement value is rejected with TypeError.
    self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
213
def test_comparison(self):
    """Compare str and unicode operands in every pairing."""
    # Comparisons:
    equal_pairs = ((u'abc', 'abc'), ('abc', u'abc'), (u'abc', u'abc'))
    for left, right in equal_pairs:
        self.assertTrue(left == right)

    greater_pairs = ((u'abcd', 'abc'), ('abcd', u'abc'), (u'abcd', u'abc'))
    for left, right in greater_pairs:
        self.assertTrue(left > right)

    less_pairs = ((u'abc', 'abcd'), ('abc', u'abcd'), (u'abc', u'abcd'))
    for left, right in less_pairs:
        self.assertTrue(left < right)

    # Deliberately disabled block; kept for reference only.
    if 0:
        # Move these tests to a Unicode collation module test...
        # Testing UTF-16 code point order comparisons...

        # No surrogates, no fixup required.
        self.assertTrue(u'\u0061' < u'\u20ac')
        # Non surrogate below surrogate value, no fixup required
        self.assertTrue(u'\u0061' < u'\ud800\udc02')

        # Non surrogate above surrogate value, fixup required
        surrogate_pairs = (
            u'\ud800\udc01', u'\ud900\udc01', u'\uda00\udc01', u'\udb00\udc01',
            u'\ud800\udd01', u'\ud900\udd01', u'\uda00\udd01', u'\udb00\udd01',
            u'\ud800\ude01', u'\ud900\ude01', u'\uda00\ude01', u'\udb00\ude01',
            u'\ud800\udfff', u'\ud900\udfff', u'\uda00\udfff', u'\udb00\udfff',
        )
        for s in (u'\ue000', u'\uff61'):
            for s2 in surrogate_pairs:
                self.assertTrue(s < s2)

        # Surrogates on both sides, no fixup required
        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
278
def test_capitalize(self):
    """Run the shared capitalize tests, then BMP-specific cases."""
    string_tests.CommonTest.test_capitalize(self)

    # (expected, receiver)
    cases = (
        # check that titlecased chars are lowered correctly
        # \u1ffc is the titlecased char
        (u'\u1ffc\u1ff3\u1ff3\u1ff3', u'\u1ff3\u1ff3\u1ffc\u1ffc'),
        # check with cased non-letter chars
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3'),
        (u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
         u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd'),
        (u'\u2160\u2171\u2172', u'\u2160\u2161\u2162'),
        (u'\u2160\u2171\u2172', u'\u2170\u2171\u2172'),
        # check with Ll chars with no upper - nothing changes here
        (u'\u019b\u1d00\u1d86\u0221\u1fb7',
         u'\u019b\u1d00\u1d86\u0221\u1fb7'),
    )
    for expected, receiver in cases:
        self.checkequal(expected, receiver, 'capitalize')
297
def test_islower(self):
    """Run the shared islower tests, then check U+1FFC is not lowercase."""
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    sample = u'\u1FFc'
    self.checkequalnofix(False, sample, 'islower')
301
@requires_wide_build
def test_islower_non_bmp(self):
    """Check islower() on characters outside the BMP."""
    # non-BMP, uppercase
    for ch in (u'\U00010401', u'\U00010427'):
        self.assertFalse(ch.islower())
    # non-BMP, lowercase
    for ch in (u'\U00010429', u'\U0001044E'):
        self.assertTrue(ch.islower())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.islower())
313
def test_isupper(self):
    """Run the shared isupper tests, then check U+1FFC is not uppercase."""
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    # The extra check is not performed on Java platforms.
    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
318
@requires_wide_build
def test_isupper_non_bmp(self):
    """Check isupper() on characters outside the BMP."""
    # non-BMP, uppercase
    for ch in (u'\U00010401', u'\U00010427'):
        self.assertTrue(ch.isupper())
    # non-BMP, lowercase
    for ch in (u'\U00010429', u'\U0001044E'):
        self.assertFalse(ch.isupper())
    # non-BMP, non-cased
    for ch in (u'\U0001F40D', u'\U0001F46F'):
        self.assertFalse(ch.isupper())
330
def test_istitle(self):
    """Run the shared istitle tests, then cases involving U+1FFC."""
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for sample in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, sample, 'istitle')
335
@requires_wide_build
def test_istitle_non_bmp(self):
    """Check istitle() on characters outside the BMP."""
    # non-BMP, uppercase + lowercase
    titled_pairs = (u'\U00010401\U00010429', u'\U00010427\U0001044E')
    for pair in titled_pairs:
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    untitled = (u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F')
    for ch in untitled:
        self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
344
def test_isspace(self):
    """Run the shared isspace tests, then three unicode-only samples."""
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # (expected, receiver)
    samples = (
        (True, u'\u2000'),
        (True, u'\u200a'),
        (False, u'\u2014'),
    )
    for expected, receiver in samples:
        self.checkequalnofix(expected, receiver, 'isspace')
350
@requires_wide_build
def test_isspace_non_bmp(self):
    """Check that the sampled non-BMP characters are not whitespace."""
    # apparently there are no non-BMP spaces chars in Unicode 6
    non_spaces = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F',
    )
    for ch in non_spaces:
        self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch))
357
@requires_wide_build
def test_isalnum_non_bmp(self):
    """Check that the sampled non-BMP characters are alphanumeric."""
    alnum_chars = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107',
    )
    for ch in alnum_chars:
        self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
363
def test_isalpha(self):
    """Run the shared isalpha tests, then check U+1FFC is alphabetic."""
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    sample = u'\u1FFc'
    self.checkequalnofix(True, sample, 'isalpha')
367
@requires_wide_build
def test_isalpha_non_bmp(self):
    """Check isalpha() on characters outside the BMP."""
    # non-BMP, cased
    cased = (u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E')
    for ch in cased:
        self.assertTrue(ch.isalpha())
    # non-BMP, non-cased
    non_cased = (u'\U0001F40D', u'\U0001F46F')
    for ch in non_cased:
        self.assertFalse(ch.isalpha())
378
def test_isdecimal(self):
    """Check unicode.isdecimal() on BMP samples and a bad call."""
    # (expected, receiver)
    samples = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),    # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),   # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, receiver in samples:
        self.checkequalnofix(expected, receiver, 'isdecimal')

    # isdecimal() takes no argument.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
390
@requires_wide_build
def test_isdecimal_non_bmp(self):
    """Check isdecimal() on characters outside the BMP."""
    non_decimals = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107',
    )
    for ch in non_decimals:
        self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))

    decimals = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0')
    for ch in decimals:
        self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
398
def test_isdigit(self):
    """Run the shared isdigit tests, then three unicode-only samples."""
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # (expected, receiver)
    samples = (
        (True, u'\u2460'),
        (False, u'\xbc'),
        (True, u'\u0660'),
    )
    for expected, receiver in samples:
        self.checkequalnofix(expected, receiver, 'isdigit')
404
@requires_wide_build
def test_isdigit_non_bmp(self):
    """Check isdigit() on characters outside the BMP."""
    non_digits = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F', u'\U00011065',
    )
    for ch in non_digits:
        self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))

    digits = (u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107')
    for ch in digits:
        self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
412
def test_isnumeric(self):
    """Check unicode.isnumeric() on BMP samples and a bad call."""
    # (expected, receiver)
    samples = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, receiver in samples:
        self.checkequalnofix(expected, receiver, 'isnumeric')

    # isnumeric() takes no argument.
    self.assertRaises(TypeError, u"abc".isnumeric, 42)
424
@requires_wide_build
def test_isnumeric_non_bmp(self):
    """Check isnumeric() on characters outside the BMP."""
    non_numerics = (
        u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
        u'\U0001F40D', u'\U0001F46F',
    )
    for ch in non_numerics:
        self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch))

    numerics = (
        u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
        u'\U000104A0', u'\U0001F107',
    )
    for ch in numerics:
        self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch))
433
@requires_wide_build
def test_surrogates(self):
    """Check the is*() predicates on strings containing lone surrogates."""
    # this test actually passes on narrow too, but it's just by accident.
    # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
    # uppercase as 'X X'
    lower_with_surrogates = (
        u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
        u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a',
    )
    upper_with_surrogates = (
        u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
        u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A',
    )
    only_surrogates = (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF')

    for s in lower_with_surrogates:
        self.assertTrue(s.islower())
        self.assertFalse(s.isupper())
        self.assertFalse(s.istitle())
    for s in upper_with_surrogates:
        self.assertFalse(s.islower())
        self.assertTrue(s.isupper())
        self.assertTrue(s.istitle())

    # Strings made only of surrogates have no case at all.
    for meth_name in ('islower', 'isupper', 'istitle'):
        meth = getattr(unicode, meth_name)
        for s in only_surrogates:
            self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))

    # Any string containing a surrogate fails the character-class checks.
    class_checks = ('isalpha', 'isalnum', 'isdigit', 'isspace',
                    'isdecimal', 'isnumeric')
    for meth_name in class_checks:
        meth = getattr(unicode, meth_name)
        for s in only_surrogates + lower_with_surrogates:
            self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
462
463
@requires_wide_build
def test_lower(self):
    """Run the shared lower tests, then non-BMP lowercasing cases."""
    string_tests.CommonTest.test_lower(self)
    # (receiver, expected)
    cases = (
        (u'\U00010427', u'\U0001044F'),
        (u'\U00010427\U00010427', u'\U0001044F\U0001044F'),
        (u'\U00010427\U0001044F', u'\U0001044F\U0001044F'),
        (u'X\U00010427x\U0001044F', u'x\U0001044Fx\U0001044F'),
    )
    for receiver, expected in cases:
        self.assertEqual(receiver.lower(), expected)
474
@requires_wide_build
def test_upper(self):
    """Run the shared upper tests, then non-BMP uppercasing cases."""
    string_tests.CommonTest.test_upper(self)
    # (receiver, expected)
    cases = (
        (u'\U0001044F', u'\U00010427'),
        (u'\U0001044F\U0001044F', u'\U00010427\U00010427'),
        (u'\U00010427\U0001044F', u'\U00010427\U00010427'),
        (u'X\U00010427x\U0001044F', u'X\U00010427X\U00010427'),
    )
    for receiver, expected in cases:
        self.assertEqual(receiver.upper(), expected)
485
@requires_wide_build
def test_capitalize_non_bmp(self):
    """Check capitalize() on strings containing non-BMP cased letters.

    This method used to be named test_capitalize, the same name as an
    earlier method of this class.  A later definition with the same name
    replaces the earlier one in the class body, so the earlier assertions
    never ran.  The distinct name follows the other *_non_bmp tests; the
    shared capitalize checks are left to test_capitalize, which calls them.
    """
    # (receiver, expected): U+1044F and U+10427 are the lower/upper forms
    # of the same letter according to the expectations below.
    cases = (
        (u'\U0001044F', u'\U00010427'),
        (u'\U0001044F\U0001044F', u'\U00010427\U0001044F'),
        (u'\U00010427\U0001044F', u'\U00010427\U0001044F'),
        (u'\U0001044F\U00010427', u'\U00010427\U0001044F'),
        (u'X\U00010427x\U0001044F', u'X\U0001044Fx\U0001044F'),
    )
    for receiver, expected in cases:
        self.assertEqual(receiver.capitalize(), expected)
498
@requires_wide_build
def test_title(self):
    """Run the shared title tests, then non-BMP titlecasing cases."""
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    # (receiver, expected)
    cases = (
        (u'\U0001044F', u'\U00010427'),
        (u'\U0001044F\U0001044F', u'\U00010427\U0001044F'),
        (u'\U0001044F\U0001044F \U0001044F\U0001044F',
         u'\U00010427\U0001044F \U00010427\U0001044F'),
        (u'\U00010427\U0001044F \U00010427\U0001044F',
         u'\U00010427\U0001044F \U00010427\U0001044F'),
        (u'\U0001044F\U00010427 \U0001044F\U00010427',
         u'\U00010427\U0001044F \U00010427\U0001044F'),
        (u'X\U00010427x\U0001044F X\U00010427x\U0001044F',
         u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F'),
    )
    for receiver, expected in cases:
        self.assertEqual(receiver.title(), expected)
513
@requires_wide_build
def test_swapcase(self):
    """Run the shared swapcase tests, then non-BMP case-swapping cases."""
    string_tests.CommonTest.test_swapcase(self)
    # (receiver, expected)
    cases = (
        (u'\U0001044F', u'\U00010427'),
        (u'\U00010427', u'\U0001044F'),
        (u'\U0001044F\U0001044F', u'\U00010427\U00010427'),
        (u'\U00010427\U0001044F', u'\U0001044F\U00010427'),
        (u'\U0001044F\U00010427', u'\U00010427\U0001044F'),
        (u'X\U00010427x\U0001044F', u'x\U0001044FX\U00010427'),
    )
    for receiver, expected in cases:
        self.assertEqual(receiver.swapcase(), expected)
527
def test_contains(self):
    """Check the `in` operator across str, unicode and tuple operands."""
    # Testing Unicode contains method
    present, absent = self.assertIn, self.assertNotIn

    # (assertion, member, container), executed in the listed order.
    first_batch = (
        (present, 'a', u'abdb'),
        (present, 'a', u'bdab'),
        (present, 'a', u'bdaba'),
        (present, 'a', u'bdba'),
        (present, 'a', u'bdba'),
        (present, u'a', u'bdba'),
        (absent, u'a', u'bdb'),
        (absent, u'a', 'bdb'),
        (present, u'a', 'bdba'),
        (present, u'a', ('a',1,None)),
        (present, u'a', (1,None,'a')),
        (present, u'a', (1,None,u'a')),
        (present, 'a', ('a',1,None)),
        (present, 'a', (1,None,'a')),
        (present, 'a', (1,None,u'a')),
        (absent, 'a', ('x',1,u'y')),
        (absent, 'a', ('x',1,None)),
        (absent, u'abcd', u'abcxxxx'),
        (present, u'ab', u'abcd'),
        (present, 'ab', u'abc'),
        (present, u'ab', 'abc'),
        (present, u'ab', (1,None,u'ab')),
        (present, u'', u'abc'),
        (present, '', u'abc'),
    )
    for check, member, container in first_batch:
        check(member, container)

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
    self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')

    second_batch = (
        (present, u'', ''),
        (present, '', u''),
        (present, u'', u''),
        (present, u'', 'abc'),
        (present, '', u'abc'),
        (present, u'', u'abc'),
        (absent, u'\0', 'abc'),
        (absent, '\0', u'abc'),
        (absent, u'\0', u'abc'),
        (present, u'\0', '\0abc'),
        (present, '\0', u'\0abc'),
        (present, u'\0', u'\0abc'),
        (present, u'\0', 'abc\0'),
        (present, '\0', u'abc\0'),
        (present, u'\0', u'abc\0'),
        (present, u'a', '\0abc'),
        (present, 'a', u'\0abc'),
        (present, u'a', u'\0abc'),
        (present, u'asdf', 'asdf'),
        (present, 'asdf', u'asdf'),
        (present, u'asdf', u'asdf'),
        (absent, u'asdf', 'asd'),
        (absent, 'asdf', u'asd'),
        (absent, u'asdf', u'asd'),
        (absent, u'asdf', ''),
        (absent, 'asdf', u''),
        (absent, u'asdf', u''),
    )
    for check, member, container in second_batch:
        check(member, container)

    # A missing or non-string operand must raise TypeError.
    self.assertRaises(TypeError, u"abc".__contains__)
    self.assertRaises(TypeError, u"abc".__contains__, object())
591
def test_formatting(self):
    """Exercise %-formatting with unicode format strings and arguments.

    Runs the shared formatting tests first, then checks unicode format
    strings, the %c conversion, and str format strings whose result
    becomes unicode because an argument is unicode.

    The expected values of the width-padded conversions (%5.2f, %*s and
    %*.*s) carry the blanks needed to reach the requested field width.
    """
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')

    # %5.2f pads to at least five characters, so the four-character
    # results ("3.00", "3.50", "3.57") are preceded by one extra blank.
    template = u"%s, %s, %i, %f, %5.2f"
    self.assertEqual(template % (u"abc", "abc", 1, 2, 3),
                     u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(template % (u"abc", "abc", 1, -2, 3),
                     u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(template % (u"abc", "abc", -1, -2, 3.5),
                     u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(template % (u"abc", "abc", -1, -2, 3.57),
                     u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(template % (u"abc", "abc", -1, -2, 1003.57),
                     u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
    self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))

    for num in range(0x00,0x80):
        char = chr(num)
        self.assertEqual(u"%c" % char, unicode(char))
        self.assertEqual(u"%c" % num, unicode(char))
        self.assertTrue(char == u"%c" % char)
        self.assertTrue(char == u"%c" % num)
    # Issue 7649
    for num in range(0x80,0x100):
        uchar = unichr(num)
        self.assertEqual(uchar, u"%c" % num)   # works only with ints
        self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
        # the implicit decoding should fail for non-ascii chars
        self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
        self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # A width of 5 pads the result with blanks up to five characters
    # (on the right for a negative width); the precision truncates first.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')

    # An object whose __str__ returns unicode yields a unicode result.
    class Wrapper:
        def __str__(self):
            return u'\u1234'
    self.assertEqual('%s' % Wrapper(), u'\u1234')
646
@test_support.cpython_only
def test_formatting_huge_precision(self):
    """A precision of INT_MAX + 1 must make %-formatting raise ValueError."""
    from _testcapi import INT_MAX
    oversized_precision = INT_MAX + 1
    format_string = u"%.{}f".format(oversized_precision)
    with self.assertRaises(ValueError):
        format_string % 2.34
653
def test_formatting_huge_width(self):
    """A width of sys.maxsize + 1 must make %-formatting raise ValueError."""
    oversized_width = sys.maxsize + 1
    format_string = u"%{}f".format(oversized_width)
    with self.assertRaises(ValueError):
        format_string % 2.34
658
def test_startswith_endswith_errors(self):
    """Check the errors raised by startswith()/endswith() on bad input."""
    for meth_name in ('startswith', 'endswith'):
        meth = getattr(u'foo', meth_name)
        # A non-ASCII byte string cannot be decoded implicitly.
        with self.assertRaises(UnicodeDecodeError):
            meth('\xff')
        # A list is rejected; the message must name the accepted types.
        with self.assertRaises(TypeError) as cm:
            meth(['f'])
        exc = str(cm.exception)
        for type_name in ('unicode', 'str', 'tuple'):
            self.assertIn(type_name, exc)
669
@test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_format_float(self):
    """Float %-formatting uses a dot even under the listed locales."""
    # should not format with a comma, but always with C locale
    formatted = u'%.1f' % 1.0
    self.assertEqual(u'1.0', formatted)
674
675 def test_constructor(self):
676 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
677
678 self.assertEqual(
679 unicode(u'unicode remains unicode'),
680 u'unicode remains unicode'
681 )
682
683 class UnicodeSubclass(unicode):
684 pass
685
686 self.assertEqual(
687 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
688 u'unicode subclass becomes unicode'
689 )
690
691 self.assertEqual(
692 unicode('strings are converted to unicode'),
693 u'strings are converted to unicode'
694 )
695
696 class UnicodeCompat:
697 def __init__(self, x):
698 self.x = x
699 def __unicode__(self):
700 return self.x
701
702 self.assertEqual(
703 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
704 u'__unicode__ compatible objects are recognized')
705
706 class StringCompat:
707 def __init__(self, x):
708 self.x = x
709 def __str__(self):
710 return self.x
711
712 self.assertEqual(
713 unicode(StringCompat('__str__ compatible objects are recognized')),
714 u'__str__ compatible objects are recognized'
715 )
716
717 # unicode(obj) is compatible to str():
718
719 o = StringCompat('unicode(obj) is compatible to str()')
720 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
721 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
722
723 # %-formatting and .__unicode__()
724 self.assertEqual(u'%s' %
725 UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
726 u"u'%s' % obj uses obj.__unicode__()")
727 self.assertEqual(u'%s' %
728 UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
729 u"u'%s' % obj falls back to obj.__str__()")
730
731 for obj in (123, 123.45, 123L):
732 self.assertEqual(unicode(obj), unicode(str(obj)))
733
734 # unicode(obj, encoding, error) tests (this maps to
735 # PyUnicode_FromEncodedObject() at C level)
736
737 if not sys.platform.startswith('java'):
738 self.assertRaises(
739 TypeError,
740 unicode,
741 u'decoding unicode is not supported',
742 'utf-8',
743 'strict'
744 )
745
746 self.assertEqual(
747 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
748 u'strings are decoded to unicode'
749 )
750
751 if not sys.platform.startswith('java'):
752 with test_support.check_py3k_warnings():
753 buf = buffer('character buffers are decoded to unicode')
754 self.assertEqual(
755 unicode(
756 buf,
757 'utf-8',
758 'strict'
759 ),
760 u'character buffers are decoded to unicode'
761 )
762
763 self.assertRaises(TypeError, unicode, 42, 42, 42)
764
765 def test_codecs_utf7(self):
766 utfTests = [
767 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
768 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
769 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
770 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
771 (u'+', '+-'),
772 (u'+-', '+--'),
773 (u'+?', '+-?'),
774 (u'\?', '+AFw?'),
775 (u'+?', '+-?'),
776 (ur'\\?', '+AFwAXA?'),
777 (ur'\\\?', '+AFwAXABc?'),
778 (ur'++--', '+-+---'),
779 (u'\U000abcde', '+2m/c3g-'), # surrogate pairs
780 (u'/', '/'),
781 ]
782
783 for (x, y) in utfTests:
784 self.assertEqual(x.encode('utf-7'), y)
785
786 # Unpaired surrogates are passed through
787 self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
788 self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
789 self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
790 self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
791 self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
792 self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
793 self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
794 self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
795
796 self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
797 self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
798
799 # Direct encoded characters
800 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
801 # Optional direct characters
802 set_o = '!"#$%&*;<=>@[]^_`{|}'
803 for c in set_d:
804 self.assertEqual(c.encode('utf7'), c.encode('ascii'))
805 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
806 self.assertTrue(c == c.encode('ascii').decode('utf7'))
807 for c in set_o:
808 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
809 self.assertTrue(c == c.encode('ascii').decode('utf7'))
810
def test_codecs_utf8(self):
    # Known-answer tests for the UTF-8 codec, first encoding then decoding.

    # (unicode input, expected UTF-8 bytes)
    encode_cases = (
        (u'', ''),
        (u'\u20ac', '\xe2\x82\xac'),
        (u'\ud800\udc02', '\xf0\x90\x80\x82'),
        (u'\ud84d\udc56', '\xf0\xa3\x91\x96'),
        (u'\ud800', '\xed\xa0\x80'),
        (u'\udc00', '\xed\xb0\x80'),
        (u'\ud800\udc02' * 1000, '\xf0\x90\x80\x82' * 1000),
    )
    for text, raw in encode_cases:
        self.assertEqual(text.encode('utf-8'), raw)

    # A longer sample mixing 3-byte characters with plain ASCII.
    long_text = (
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'
    )
    long_raw = (
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )
    self.assertEqual(long_text.encode('utf-8'), long_raw)

    # UTF-8 specific decoding tests: (UTF-8 bytes, expected unicode)
    decode_cases = (
        ('\xf0\xa3\x91\x96', u'\U00023456'),
        ('\xf0\x90\x80\x82', u'\U00010002'),
        ('\xe2\x82\xac', u'\u20ac'),
    )
    for raw, text in decode_cases:
        self.assertEqual(unicode(raw, 'utf-8'), text)

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
849
def test_utf8_decode_valid_sequences(self):
    # Boundary values of every UTF-8 sequence length must decode to the
    # expected code point, and every code point must round-trip.
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # Round-trip every code point up to and including sys.maxunicode.
    # xrange keeps this lazy instead of materialising two full lists, and
    # the "+ 1" makes sure the very last code point is covered as well.
    for code_point in xrange(sys.maxunicode + 1):
        ch = unichr(code_point)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
868
def test_utf8_decode_invalid_sequences(self):
    # Bytes and byte sequences that are not well-formed UTF-8 must raise
    # UnicodeDecodeError when decoded with the default (strict) handler.

    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = map(chr, range(0x80, 0xC0))
    # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
    invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
    # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
    invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
    # The last range starts at 0xF8 so that it does not overlap the
    # 0xF5-0xF7 range above (each byte value is checked exactly once).
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
    )

    for byte in invalid_start_bytes:
        self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                self.assertRaises(UnicodeDecodeError,
                                  (sb+cb1+'\x80'+cb3).decode, 'utf-8')

    # 0xE0 followed by 0x80-0x9F is rejected
    for cb in map(chr, range(0x80, 0xA0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\xBF').decode, 'utf-8')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\x80').decode, 'utf-8')
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\xBF').decode, 'utf-8')
    # but since they are valid on Python 2 add a test for that:
    for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
                             map(unichr, range(0xd800, 0xe000, 64))):
        encoded = '\xED'+cb+'\x80'
        self.assertEqual(encoded.decode('utf-8'), surrogate)
        self.assertEqual(surrogate.encode('utf-8'), encoded)

    # 0xF0 followed by 0x80-0x8F is rejected
    for cb in map(chr, range(0x80, 0x90)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
    # 0xF4 followed by 0x90-0xBF is rejected
    for cb in map(chr, range(0x90, 0xC0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
924
def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # Each entry is (invalid byte string, expected result with 'replace').
    # "cb" in the comments below stands for "continuation byte".
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD), # continuation byte
        ('\x80\x80', FFFD*2), # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD), # only the start byte
        ('\xc2\xc2', FFFD*2), # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
        ('\xc2\x41', FFFD+'A'), # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD), # only the start byte
        ('\xe1\xe1', FFFD*2), # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
        ('\xe1\x80', FFFD), # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'), # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD), # only the start byte
        ('\xf1\xf1', FFFD*2), # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
        ('\xf1\x80', FFFD), # only 1 continuation byte
        ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD), # only the start byte
        ('\xf5\xf5', FFFD*2), # 2 start bytes
        ('\xf5\x80', FFFD*2), # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD), # only the start byte
        ('\xf8\xf8', FFFD*2), # 2 start bytes
        ('\xf8\x80', FFFD*2), # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD), # only the start byte
        ('\xfc\xfc', FFFD*2), # 2 start bytes
        ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # strict: every sequence in the table is invalid
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        # replace: one U+FFFD per invalid unit, valid bytes are kept
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        # a valid byte after the sequence must not be swallowed
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # ignore: same as replace with the U+FFFD characters removed
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))
1013
def test_codecs_idna(self):
    # The idna codec must keep the trailing dot of a host name.
    dotted_name = u"www.python.org."
    encoded_name = dotted_name.encode("idna")
    self.assertEqual(encoded_name, "www.python.org.")
1017
def test_codecs_errors(self):
    # Error handling of the codec machinery: error handler names, codecs
    # returning garbage, wrong argument types.

    # Error handling (encoding)
    text = u'Andr\202 x'
    self.assertRaises(UnicodeError, text.encode, 'ascii')
    self.assertRaises(UnicodeError, text.encode, 'ascii', 'strict')
    self.assertEqual(text.encode('ascii', 'ignore'), "Andr x")
    self.assertEqual(text.encode('ascii', 'replace'), "Andr? x")
    # positional and keyword spellings must agree
    self.assertEqual(text.encode('ascii', 'replace'),
                     text.encode('ascii', errors='replace'))
    self.assertEqual(text.encode('ascii', 'ignore'),
                     text.encode(encoding='ascii', errors='ignore'))

    # Error handling (decoding)
    raw = 'Andr\202 x'
    self.assertRaises(UnicodeError, unicode, raw, 'ascii')
    self.assertRaises(UnicodeError, unicode, raw, 'ascii', 'strict')
    self.assertEqual(unicode(raw, 'ascii', 'ignore'), u"Andr x")
    self.assertEqual(unicode(raw, 'ascii', 'replace'), u'Andr\uFFFD x')
    plain = u'abcde'
    self.assertEqual(plain.decode('ascii', 'ignore'),
                     plain.decode('ascii', errors='ignore'))
    self.assertEqual(plain.decode('ascii', 'replace'),
                     plain.decode(encoding='ascii', errors='replace'))

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # The test.unicode1 / test.unicode2 codecs return malformed results.
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
1063
def test_codecs(self):
    # Basic encoding results plus encode/decode round trips for several
    # codecs over different code point ranges.

    def check_roundtrip(text, encodings):
        # Encoding and then decoding with the same codec must be lossless.
        for name in encodings:
            self.assertEqual(unicode(text.encode(name), name), text)

    # Encoding
    hello_cases = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for name, expected in hello_cases:
        self.assertEqual(u'hello'.encode(name), expected)

    # Roundtrip safety for BMP (just the first 1024 chars)
    unicode_codecs = ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
                      'utf-16-be', 'raw_unicode_escape',
                      'unicode_escape', 'unicode_internal')
    for code_point in xrange(1024):
        check_roundtrip(unichr(code_point), unicode_codecs)

    # Roundtrip safety for BMP (just the first 256 chars)
    for code_point in xrange(256):
        check_roundtrip(unichr(code_point), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    for code_point in xrange(128):
        check_roundtrip(unichr(code_point), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    astral = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    check_roundtrip(astral, ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                             #'raw_unicode_escape',
                             'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    non_surrogates = range(0, 0xd800) + range(0xe000, 0x10000)
    ucs2_text = u''.join(unichr(code_point) for code_point in non_surrogates)
    check_roundtrip(ucs2_text, ('utf-8',))
1108
def test_codecs_charmap(self):
    # Decode/encode round trips for the charmap-based codecs, separately
    # for the byte ranges 0-127 and 128-255.

    def check_roundtrip(raw, encodings):
        # Decoding and re-encoding must give back the original bytes.
        for name in encodings:
            self.assertEqual(unicode(raw, name).encode(name), raw)

    # 0-127
    low_half = ''.join(chr(byte) for byte in xrange(128))
    check_roundtrip(low_half, (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'

    ))

    # 128-255
    high_half = ''.join(chr(byte) for byte in xrange(128, 256))
    check_roundtrip(high_half, (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',

    ))
1162
def test_concatenation(self):
    # Adjacent string literals are concatenated by the compiler; when any
    # of them is a unicode literal the result must compare equal to (and,
    # through the strict assertEqual of this class, be) unicode.
    two_part_literals = (
        (u"abc" u"def"),
        ("abc" u"def"),
        (u"abc" "def"),
    )
    for combined in two_part_literals:
        self.assertEqual(combined, u"abcdef")

    three_part_literals = (
        (u"abc" u"def" "ghi"),
        ("abc" "def" u"ghi"),
    )
    for combined in three_part_literals:
        self.assertEqual(combined, u"abcdefghi")
1169
def test_printing(self):
    # Smoke test: the print statement must accept unicode arguments (alone
    # or mixed with str) when writing to a file-like object.  Nothing is
    # asserted about the output; the test passes if no exception is raised.
    class BitBucket:
        # Minimal file-like object whose write() discards its argument.
        def write(self, text):
            pass

    out = BitBucket()
    print >>out, u'abc'
    print >>out, u'abc', u'def'
    print >>out, u'abc', 'def'
    print >>out, 'abc', u'def'
    print >>out, u'abc\n'
    # The statements with a trailing comma suppress the final newline; the
    # repetitions below are intentional, do not collapse or reorder them.
    print >>out, u'abc\n',
    print >>out, u'abc\n',
    print >>out, u'def\n'
    print >>out, u'def\n'
1185
1186 def test_ucs4(self):
1187 x = u'\U00100000'
1188 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1189 self.assertEqual(x, y)
1190
1191 y = r'\U00100000'
1192 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1193 self.assertEqual(x, y)
1194 y = r'\U00010000'
1195 x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1196 self.assertEqual(x, y)
1197
1198 try:
1199 '\U11111111'.decode("raw-unicode-escape")
1200 except UnicodeDecodeError as e:
1201 self.assertEqual(e.start, 0)
1202 self.assertEqual(e.end, 10)
1203 else:
1204 self.fail("Should have raised UnicodeDecodeError")
1205
def test_conversion(self):
    # Make sure __unicode__() works properly
    # Foo0-Foo9 cover old-style and new-style classes as well as str and
    # unicode subclasses, with different mixes of __str__ / __unicode__.

    class Foo0:
        def __str__(self):
            return "foo"

    class Foo1:
        def __unicode__(self):
            return u"foo"

    class Foo2(object):
        def __unicode__(self):
            return u"foo"

    class Foo3(object):
        def __unicode__(self):
            return "foo"

    class Foo4(str):
        def __unicode__(self):
            return "foo"

    class Foo5(unicode):
        def __unicode__(self):
            return "foo"

    class Foo6(str):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    class Foo7(unicode):
        def __str__(self):
            return "foos"

        def __unicode__(self):
            return u"foou"

    class Foo8(unicode):
        def __new__(cls, content=""):
            return unicode.__new__(cls, 2*content)

        def __unicode__(self):
            return self

    class Foo9(unicode):
        def __str__(self):
            return "string"

        def __unicode__(self):
            return "not unicode"

    # (object, expected result of unicode(object))
    conversions = [
        (Foo0(), u"foo"),
        (Foo1(), u"foo"),
        (Foo2(), u"foo"),
        (Foo3(), u"foo"),
        (Foo4("bar"), u"foo"),
        (Foo5("bar"), u"foo"),
        (Foo6("bar"), u"foou"),
        (Foo7("bar"), u"foou"),
        (Foo8("foo"), u"foofoo"),
    ]
    for instance, expected in conversions:
        self.assertEqual(unicode(instance), expected)

    # Foo9: str() and unicode() go through different special methods.
    self.assertEqual(str(Foo9("foo")), "string")
    self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1268
def test_unicode_repr(self):
    # repr() must return the same str whether __repr__ hands back a str
    # or a unicode object.
    class StrRepr:
        def __repr__(self):
            return '\\n'

    class UnicodeRepr:
        def __repr__(self):
            return u'\\n'

    for instance in (StrRepr(), UnicodeRepr()):
        self.assertEqual(repr(instance), '\\n')
1280
def test_expandtabs_overflows_gracefully(self):
    # This test only affects 32-bit platforms because expandtabs can only take
    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
    # to take a 64-bit long, this test should apply to all platforms.
    if sys.maxint > (1 << 32) or struct.calcsize('P') != 4:
        # Report a skip instead of silently returning, so that the test
        # does not show up as "passed" where it did not check anything.
        self.skipTest('only applies to 32-bit platforms')
    self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
1288
def test__format__(self):
    # unicode.__format__ with precision, width, fill and alignment.
    # Padding longer than one space is written as ``u' ' * n`` so that the
    # expected width stays unambiguous.

    def test(value, spec, expected):
        # test both with and without the trailing 's'
        self.assertEqual(value.__format__(spec), expected)
        self.assertEqual(value.__format__(spec + u's'), expected)

    test(u'', u'', u'')
    test(u'abc', u'', u'abc')
    # precision truncates
    test(u'abc', u'.3', u'abc')
    test(u'ab', u'.3', u'ab')
    test(u'abcdef', u'.3', u'abc')
    test(u'abcdef', u'.0', u'')
    # width and precision combined
    test(u'abc', u'3.3', u'abc')
    test(u'abc', u'2.3', u'abc')
    test(u'abc', u'2.2', u'ab')
    test(u'abc', u'3.2', u'ab ')
    # explicit fill character
    test(u'result', u'x<0', u'result')
    test(u'result', u'x<5', u'result')
    test(u'result', u'x<6', u'result')
    test(u'result', u'x<7', u'resultx')
    test(u'result', u'x<8', u'resultxx')
    # left / right alignment with the default fill
    test(u'result', u' <7', u'result ')
    test(u'result', u'<7', u'result ')
    test(u'result', u'>7', u' result')
    test(u'result', u'>8', u' ' * 2 + u'result')
    # centering: an odd amount of padding puts the extra space on the right
    test(u'result', u'^8', u' result ')
    test(u'result', u'^9', u' result' + u' ' * 2)
    test(u'result', u'^10', u' ' * 2 + u'result' + u' ' * 2)
    # large widths
    test(u'a', u'10000', u'a' + u' ' * 9999)
    test(u'', u'10000', u' ' * 10000)
    test(u'', u'10000000', u' ' * 10000000)

    # test mixing unicode and str
    self.assertEqual(u'abc'.__format__('s'), u'abc')
    self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1324
def test_format(self):
    # Exercise unicode.format(): literal text and brace escaping, field
    # lookup by index / attribute / item, format specs, !r and !s
    # conversions, user-defined __format__ methods and the errors raised
    # for malformed format strings.
    # Padding longer than one space is written as ``u' ' * n`` so that the
    # expected width stays unambiguous.
    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'a'.format(), u'a')
    self.assertEqual(u'ab'.format(), u'ab')
    self.assertEqual(u'a{{'.format(), u'a{')
    self.assertEqual(u'a}}'.format(), u'a}')
    self.assertEqual(u'{{b'.format(), u'{b')
    self.assertEqual(u'}}b'.format(), u'}b')
    self.assertEqual(u'a{{b'.format(), u'a{b')

    # examples from the PEP:
    import datetime
    self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
    self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                     u"My name is Fred")
    self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                     u"My name is Fred :-{}")

    # datetime.__format__ doesn't work with unicode
    #d = datetime.date(2007, 8, 18)
    #self.assertEqual("The year is {0.year}".format(d),
    #                 "The year is 2007")

    # classes we'll use for testing
    class C:
        def __init__(self, x=100):
            self._x = x
        def __format__(self, spec):
            return spec

    class D:
        def __init__(self, x):
            self.x = x
        def __format__(self, spec):
            return str(self.x)

    # class with __str__, but no __format__
    class E:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u'E(' + self.x + u')'

    # class with __repr__, but no __format__ or __str__
    class F:
        def __init__(self, x):
            self.x = x
        def __repr__(self):
            return u'F(' + self.x + u')'

    # class with __format__ that forwards to string, for some format_spec's
    class G:
        def __init__(self, x):
            self.x = x
        def __str__(self):
            return u"string is " + self.x
        def __format__(self, format_spec):
            if format_spec == 'd':
                return u'G(' + self.x + u')'
            return object.__format__(self, format_spec)

    # class that returns a bad type from __format__
    class H:
        def __format__(self, format_spec):
            return 1.0

    class I(datetime.date):
        def __format__(self, format_spec):
            return self.strftime(format_spec)

    class J(int):
        def __format__(self, format_spec):
            return int.__format__(self * 2, format_spec)


    self.assertEqual(u''.format(), u'')
    self.assertEqual(u'abc'.format(), u'abc')
    self.assertEqual(u'{0}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
    self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
    self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
    self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
    self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
    self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
    self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
    self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
    self.assertEqual(u'{0}'.format(-15), u'-15')
    self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
    self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
    self.assertEqual(u'{{'.format(), u'{')
    self.assertEqual(u'}}'.format(), u'}')
    self.assertEqual(u'{{}}'.format(), u'{}')
    self.assertEqual(u'{{x}}'.format(), u'{x}')
    self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
    self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
    self.assertEqual(u'}}{{'.format(), u'}{')
    self.assertEqual(u'}}x{{'.format(), u'}x{')

    # weird field names
    self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
    self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

    self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
    self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
    self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
    self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
    self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
    self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
    self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

    # strings
    self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
    self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
    self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
    self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
    self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
    self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
    self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
    self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
    self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
    self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
    self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
    self.assertEqual(u'{0:>8s}'.format(u'result'), u' ' * 2 + u'result')
    # centering: an odd amount of padding puts the extra space on the right
    self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
    self.assertEqual(u'{0:^9s}'.format(u'result'), u' result' + u' ' * 2)
    self.assertEqual(u'{0:^10s}'.format(u'result'),
                     u' ' * 2 + u'result' + u' ' * 2)
    self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
    self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
    self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

    # format specifiers for user defined type
    self.assertEqual(u'{0:abc}'.format(C()), u'abc')

    # !r and !s coercions
    self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
    self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello' + u' ' * 10)
    self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
    self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

    # test fallback to object.__format__
    self.assertEqual(u'{0}'.format({}), u'{}')
    self.assertEqual(u'{0}'.format([]), u'[]')
    self.assertEqual(u'{0}'.format([1]), u'[1]')
    self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
    self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
    self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

    msg = 'object.__format__ with a non-empty format string is deprecated'
    with test_support.check_warnings((msg, PendingDeprecationWarning)):
        # 'E(data)' is 7 characters wide: centred in 10 columns it gets
        # 1 space on the left and 2 on the right.
        self.assertEqual(u'{0:^10}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:^10s}'.format(E(u'data')),
                         u' E(data)' + u' ' * 2)
        self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')

    self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                    month=8,
                                                    day=27)),
                     u"date: 2007-08-27")

    # test deriving from a builtin type and overriding __format__
    self.assertEqual(u"{0}".format(J(10)), u"20")


    # string format specifiers
    self.assertEqual(u'{0:}'.format('a'), u'a')

    # computed format specifiers
    self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
    self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5),
                     u'hello' + u' ' * 5)
    self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'),
                     u'hello' + u' ' * 5)

    # test various errors
    self.assertRaises(ValueError, u'{'.format)
    self.assertRaises(ValueError, u'}'.format)
    self.assertRaises(ValueError, u'a{'.format)
    self.assertRaises(ValueError, u'a}'.format)
    self.assertRaises(ValueError, u'{a'.format)
    self.assertRaises(ValueError, u'}a'.format)
    self.assertRaises(IndexError, u'{0}'.format)
    self.assertRaises(IndexError, u'{1}'.format, u'abc')
    self.assertRaises(KeyError, u'{x}'.format)
    self.assertRaises(ValueError, u"}{".format)
    self.assertRaises(ValueError, u"{".format)
    self.assertRaises(ValueError, u"}".format)
    self.assertRaises(ValueError, u"abc{0:{}".format)
    self.assertRaises(ValueError, u"{0".format)
    self.assertRaises(IndexError, u"{0.}".format)
    self.assertRaises(ValueError, u"{0.}".format, 0)
    self.assertRaises(IndexError, u"{0[}".format)
    self.assertRaises(ValueError, u"{0[}".format, [])
    self.assertRaises(KeyError, u"{0]}".format)
    self.assertRaises(ValueError, u"{0.[]}".format, 0)
    self.assertRaises(ValueError, u"{0..foo}".format, 0)
    self.assertRaises(ValueError, u"{0[0}".format, 0)
    self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
    self.assertRaises(KeyError, u"{c]}".format)
    self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
    self.assertRaises(ValueError, u"{0}}".format, 0)
    self.assertRaises(KeyError, u"{foo}".format, bar=3)
    self.assertRaises(ValueError, u"{0!x}".format, 3)
    self.assertRaises(ValueError, u"{0!}".format, 0)
    self.assertRaises(ValueError, u"{0!rs}".format, 0)
    self.assertRaises(ValueError, u"{!}".format)
    self.assertRaises(IndexError, u"{:}".format)
    self.assertRaises(IndexError, u"{:s}".format)
    self.assertRaises(IndexError, u"{}".format)
    big = u"23098475029384702983476098230754973209482573"
    self.assertRaises(ValueError, (u"{" + big + u"}").format)
    self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

    # issue 6089
    self.assertRaises(ValueError, u"{0[0]x}".format, [None])
    self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

    # can't have a replacement on the field name portion
    self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

    # exceed maximum recursion depth
    self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
    self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                      0, 1, 2, 3, 4, 5, 6, 7)

    # string format spec errors
    self.assertRaises(ValueError, u"{0:-s}".format, u'')
    self.assertRaises(ValueError, format, u"", u"-")
    self.assertRaises(ValueError, u"{0:=s}".format, u'')

    # test combining string and unicode
    self.assertEqual(u"foo{0}".format('bar'), u'foobar')
    # This will try to convert the argument from unicode to str, which
    #  will succeed
    self.assertEqual("foo{0}".format(u'bar'), 'foobar')
    # This will try to convert the argument from unicode to str, which
    #  will fail
    self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1570
def test_format_huge_precision(self):
    # A precision larger than sys.maxsize must be rejected with ValueError.
    format_string = u".{}f".format(sys.maxsize + 1)
    with self.assertRaises(ValueError):
        format(2.34, format_string)
1575
1576 def test_format_huge_width(self):
1577 format_string = u"{}f".format(sys.maxsize + 1)
1578 with self.assertRaises(ValueError):
1579 result = format(2.34, format_string)
1580
1581 def test_format_huge_item_number(self):
1582 format_string = u"{{{}:.6f}}".format(sys.maxsize + 1)
1583 with self.assertRaises(ValueError):
1584 result = format_string.format(2.34)
1585
1586 def test_format_auto_numbering(self):
1587 class C:
1588 def __init__(self, x=100):
1589 self._x = x
1590 def __format__(self, spec):
1591 return spec
1592
1593 self.assertEqual(u'{}'.format(10), u'10')
1594 self.assertEqual(u'{:5}'.format('s'), u's ')
1595 self.assertEqual(u'{!r}'.format('s'), u"'s'")
1596 self.assertEqual(u'{._x}'.format(C(10)), u'10')
1597 self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
1598 self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
1599 self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')
1600
1601 self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a x b')
1602 self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')
1603
1604 # can't mix and match numbering and auto-numbering
1605 self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
1606 self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
1607 self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
1608 self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)
1609
1610 # can mix and match auto-numbering and named
1611 self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
1612 self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
1613 self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
1614 self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1615
1616 def test_raiseMemError(self):
1617 # Ensure that the freelist contains a consistent object, even
1618 # when a string allocation fails with a MemoryError.
1619 # This used to crash the interpreter,
1620 # or leak references when the number was smaller.
1621 charwidth = 4 if sys.maxunicode >= 0x10000 else 2
1622 # Note: sys.maxsize is half of the actual max allocation because of
1623 # the signedness of Py_ssize_t.
1624 alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
1625 self.assertRaises(MemoryError, alloc)
1626 self.assertRaises(MemoryError, alloc)
1627
def test_format_subclass(self):
    # Both %-formatting and str.format() are expected to produce the
    # value returned by a unicode subclass's overridden __unicode__.
    class OverridingUnicode(unicode):
        def __unicode__(self):
            return u'__unicode__ overridden'

    instance = OverridingUnicode(u'xxx')

    percent_formatted = "%s" % instance
    self.assertEqual(percent_formatted, u'__unicode__ overridden')

    brace_formatted = "{}".format(instance)
    self.assertEqual(brace_formatted, '__unicode__ overridden')
1635
def test_encode_decimal(self):
    from _testcapi import unicode_encodedecimal as encode

    # Called without an error handler: non-ASCII digits and spaces are
    # expected to come back as their ASCII counterparts.
    plain_cases = [
        (u'123', b'123'),
        (u'\u0663.\u0661\u0664', b'3.14'),
        (u"\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 '),
    ]
    for text, expected in plain_cases:
        self.assertEqual(encode(text), expected)

    # A character that cannot be encoded is an error under "strict".
    self.assertRaises(UnicodeEncodeError,
                      encode, u"123\u20ac", "strict")

    # Every other error handler substitutes something for the
    # unencodable character(s) and keeps going.
    handler_cases = [
        (u"123\u20ac", "replace", b'123?'),
        (u"123\u20ac", "ignore", b'123'),
        (u"123\u20ac", "xmlcharrefreplace", b'123&#8364;'),
        (u"123\u20ac", "backslashreplace", b'123\\u20ac'),
        (u"123\u20ac\N{EM SPACE}", "replace", b'123? '),
        (u"123\u20ac\u20ac", "replace", b'123??'),
        (u"123\u20ac\u0660", "replace", b'123?0'),
    ]
    for text, errors, expected in handler_cases:
        self.assertEqual(encode(text, errors), expected)
1660
def test_encode_decimal_with_surrogates(self):
    from _testcapi import unicode_encodedecimal

    # (unencodable suffix, expected XML character references)
    cases = [
        (u'\U0001f49d', '&#128157;'),
        (u'\ud83d', '&#55357;'),
        (u'\udc9d', '&#56477;'),
    ]
    # Only when the explicit surrogate pair is a different string from
    # the single astral character is the pair checked as its own case.
    if u'\ud83d\udc9d' != u'\U0001f49d':
        cases.append((u'\ud83d\udc9d', '&#55357;&#56477;'))

    for suffix, expected_refs in cases:
        encoded = unicode_encodedecimal(u"123" + suffix,
                                        "xmlcharrefreplace")
        self.assertEqual(encoded, '123' + expected_refs)
1673
def test_main():
    # Hand this module's name to test_support.run_unittest().
    this_module = __name__
    test_support.run_unittest(this_module)


# Allow the file to be executed directly as a script.
if __name__ == "__main__":
    test_main()
Note: See TracBrowser for help on using the repository browser.