Changeset 391 for python/trunk/Lib/test/test_unicode.py
- Timestamp:
- Mar 19, 2014, 11:31:01 PM (11 years ago)
- Location:
- python/trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
python/trunk
-
Property svn:mergeinfo
set to
/python/vendor/Python-2.7.6 merged eligible /python/vendor/current merged eligible
-
Property svn:mergeinfo
set to
-
python/trunk/Lib/test/test_unicode.py
r2 r391 1 # -*- coding: iso-8859-1 -*-2 1 """ Test script for the Unicode implementation. 3 2 … … 7 6 8 7 """#" 9 import sys, struct, codecs 8 import sys 9 import struct 10 import codecs 11 import unittest 10 12 from test import test_support, string_tests 13 14 # decorator to skip tests on narrow builds 15 requires_wide_build = unittest.skipIf(sys.maxunicode == 65535, 16 'requires wide build') 11 17 12 18 # Error handling (bad decoder return) … … 35 41 type2test = unicode 36 42 43 def assertEqual(self, first, second, msg=None): 44 # strict assertEqual method: reject implicit bytes/unicode equality 45 super(UnicodeTest, self).assertEqual(first, second, msg) 46 if isinstance(first, unicode) or isinstance(second, unicode): 47 self.assertIsInstance(first, unicode) 48 self.assertIsInstance(second, unicode) 49 elif isinstance(first, str) or isinstance(second, str): 50 self.assertIsInstance(first, str) 51 self.assertIsInstance(second, str) 52 37 53 def checkequalnofix(self, result, object, methodname, *args): 38 54 method = getattr(object, methodname) 39 55 realresult = method(*args) 40 56 self.assertEqual(realresult, result) 41 self.assert _(type(realresult) is type(result))57 self.assertTrue(type(realresult) is type(result)) 42 58 43 59 # if the original is returned make sure that … … 51 67 realresult = method(*args) 52 68 self.assertEqual(realresult, result) 53 self.assert _(object is not realresult)69 self.assertTrue(object is not realresult) 54 70 55 71 def test_literals(self): … … 198 214 def test_comparison(self): 199 215 # Comparisons: 200 self.assert Equal(u'abc','abc')201 self.assert Equal('abc',u'abc')202 self.assert Equal(u'abc',u'abc')203 self.assert _(u'abcd' > 'abc')204 self.assert _('abcd' > u'abc')205 self.assert _(u'abcd' > u'abc')206 self.assert _(u'abc' < 'abcd')207 self.assert _('abc' < u'abcd')208 self.assert _(u'abc' < u'abcd')216 self.assertTrue(u'abc' == 'abc') 217 self.assertTrue('abc' == u'abc') 218 self.assertTrue(u'abc' == u'abc') 219 
self.assertTrue(u'abcd' > 'abc') 220 self.assertTrue('abcd' > u'abc') 221 self.assertTrue(u'abcd' > u'abc') 222 self.assertTrue(u'abc' < 'abcd') 223 self.assertTrue('abc' < u'abcd') 224 self.assertTrue(u'abc' < u'abcd') 209 225 210 226 if 0: … … 213 229 214 230 # No surrogates, no fixup required. 215 self.assert _(u'\u0061' < u'\u20ac')231 self.assertTrue(u'\u0061' < u'\u20ac') 216 232 # Non surrogate below surrogate value, no fixup required 217 self.assert _(u'\u0061' < u'\ud800\udc02')233 self.assertTrue(u'\u0061' < u'\ud800\udc02') 218 234 219 235 # Non surrogate above surrogate value, fixup required 220 236 def test_lecmp(s, s2): 221 self.assert _(s < s2)237 self.assertTrue(s < s2) 222 238 223 239 def test_fixup(s): … … 259 275 260 276 # Surrogates on both sides, no fixup required 261 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56') 277 self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56') 278 279 def test_capitalize(self): 280 string_tests.CommonTest.test_capitalize(self) 281 # check that titlecased chars are lowered correctly 282 # \u1ffc is the titlecased char 283 self.checkequal(u'\u1ffc\u1ff3\u1ff3\u1ff3', 284 u'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize') 285 # check with cased non-letter chars 286 self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', 287 u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3', 'capitalize') 288 self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd', 289 u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd', 'capitalize') 290 self.checkequal(u'\u2160\u2171\u2172', 291 u'\u2160\u2161\u2162', 'capitalize') 292 self.checkequal(u'\u2160\u2171\u2172', 293 u'\u2170\u2171\u2172', 'capitalize') 294 # check with Ll chars with no upper - nothing changes here 295 self.checkequal(u'\u019b\u1d00\u1d86\u0221\u1fb7', 296 u'\u019b\u1d00\u1d86\u0221\u1fb7', 'capitalize') 262 297 263 298 def test_islower(self): 264 299 string_tests.MixinStrUnicodeUserStringTest.test_islower(self) 265 300 self.checkequalnofix(False, u'\u1FFc', 'islower') 301 302 @requires_wide_build 303 def 
test_islower_non_bmp(self): 304 # non-BMP, uppercase 305 self.assertFalse(u'\U00010401'.islower()) 306 self.assertFalse(u'\U00010427'.islower()) 307 # non-BMP, lowercase 308 self.assertTrue(u'\U00010429'.islower()) 309 self.assertTrue(u'\U0001044E'.islower()) 310 # non-BMP, non-cased 311 self.assertFalse(u'\U0001F40D'.islower()) 312 self.assertFalse(u'\U0001F46F'.islower()) 266 313 267 314 def test_isupper(self): … … 270 317 self.checkequalnofix(False, u'\u1FFc', 'isupper') 271 318 319 @requires_wide_build 320 def test_isupper_non_bmp(self): 321 # non-BMP, uppercase 322 self.assertTrue(u'\U00010401'.isupper()) 323 self.assertTrue(u'\U00010427'.isupper()) 324 # non-BMP, lowercase 325 self.assertFalse(u'\U00010429'.isupper()) 326 self.assertFalse(u'\U0001044E'.isupper()) 327 # non-BMP, non-cased 328 self.assertFalse(u'\U0001F40D'.isupper()) 329 self.assertFalse(u'\U0001F46F'.isupper()) 330 272 331 def test_istitle(self): 273 string_tests.MixinStrUnicodeUserStringTest.test_ title(self)332 string_tests.MixinStrUnicodeUserStringTest.test_istitle(self) 274 333 self.checkequalnofix(True, u'\u1FFc', 'istitle') 275 334 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle') 335 336 @requires_wide_build 337 def test_istitle_non_bmp(self): 338 # non-BMP, uppercase + lowercase 339 self.assertTrue(u'\U00010401\U00010429'.istitle()) 340 self.assertTrue(u'\U00010427\U0001044E'.istitle()) 341 # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6 342 for ch in [u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F']: 343 self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch)) 276 344 277 345 def test_isspace(self): … … 281 349 self.checkequalnofix(False, u'\u2014', 'isspace') 282 350 351 @requires_wide_build 352 def test_isspace_non_bmp(self): 353 # apparently there are no non-BMP spaces chars in Unicode 6 354 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 355 u'\U0001F40D', u'\U0001F46F']: 356 
self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch)) 357 358 @requires_wide_build 359 def test_isalnum_non_bmp(self): 360 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 361 u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']: 362 self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch)) 363 283 364 def test_isalpha(self): 284 365 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self) 285 366 self.checkequalnofix(True, u'\u1FFc', 'isalpha') 367 368 @requires_wide_build 369 def test_isalpha_non_bmp(self): 370 # non-BMP, cased 371 self.assertTrue(u'\U00010401'.isalpha()) 372 self.assertTrue(u'\U00010427'.isalpha()) 373 self.assertTrue(u'\U00010429'.isalpha()) 374 self.assertTrue(u'\U0001044E'.isalpha()) 375 # non-BMP, non-cased 376 self.assertFalse(u'\U0001F40D'.isalpha()) 377 self.assertFalse(u'\U0001F46F'.isalpha()) 286 378 287 379 def test_isdecimal(self): … … 297 389 self.checkraises(TypeError, 'abc', 'isdecimal', 42) 298 390 391 @requires_wide_build 392 def test_isdecimal_non_bmp(self): 393 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 394 u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107']: 395 self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch)) 396 for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0']: 397 self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch)) 398 299 399 def test_isdigit(self): 300 400 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self) … … 302 402 self.checkequalnofix(False, u'\xbc', 'isdigit') 303 403 self.checkequalnofix(True, u'\u0660', 'isdigit') 404 405 @requires_wide_build 406 def test_isdigit_non_bmp(self): 407 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 408 u'\U0001F40D', u'\U0001F46F', u'\U00011065']: 409 self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch)) 410 for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']: 411 self.assertTrue(ch.isdigit(), '{!r} is a 
digit.'.format(ch)) 304 412 305 413 def test_isnumeric(self): … … 315 423 self.assertRaises(TypeError, u"abc".isnumeric, 42) 316 424 425 @requires_wide_build 426 def test_isnumeric_non_bmp(self): 427 for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E', 428 u'\U0001F40D', u'\U0001F46F']: 429 self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch)) 430 for ch in [u'\U00010107', u'\U0001D7F6', u'\U00023b1b', 431 u'\U000104A0', u'\U0001F107']: 432 self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch)) 433 434 @requires_wide_build 435 def test_surrogates(self): 436 # this test actually passes on narrow too, but it's just by accident. 437 # Surrogates are seen as non-cased chars, so u'X\uD800X' is as 438 # uppercase as 'X X' 439 for s in (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800', 440 u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'): 441 self.assertTrue(s.islower()) 442 self.assertFalse(s.isupper()) 443 self.assertFalse(s.istitle()) 444 for s in (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800', 445 u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A'): 446 self.assertFalse(s.islower()) 447 self.assertTrue(s.isupper()) 448 self.assertTrue(s.istitle()) 449 450 for meth_name in ('islower', 'isupper', 'istitle'): 451 meth = getattr(unicode, meth_name) 452 for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF'): 453 self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name)) 454 455 for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace', 456 'isdecimal', 'isnumeric'): 457 meth = getattr(unicode, meth_name) 458 for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF', 459 u'a\uD800b\uDFFF', u'a\uDFFFb\uD800', 460 u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'): 461 self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name)) 462 463 464 @requires_wide_build 465 def test_lower(self): 466 string_tests.CommonTest.test_lower(self) 467 self.assertEqual(u'\U00010427'.lower(), u'\U0001044F') 468 self.assertEqual(u'\U00010427\U00010427'.lower(), 469 
u'\U0001044F\U0001044F') 470 self.assertEqual(u'\U00010427\U0001044F'.lower(), 471 u'\U0001044F\U0001044F') 472 self.assertEqual(u'X\U00010427x\U0001044F'.lower(), 473 u'x\U0001044Fx\U0001044F') 474 475 @requires_wide_build 476 def test_upper(self): 477 string_tests.CommonTest.test_upper(self) 478 self.assertEqual(u'\U0001044F'.upper(), u'\U00010427') 479 self.assertEqual(u'\U0001044F\U0001044F'.upper(), 480 u'\U00010427\U00010427') 481 self.assertEqual(u'\U00010427\U0001044F'.upper(), 482 u'\U00010427\U00010427') 483 self.assertEqual(u'X\U00010427x\U0001044F'.upper(), 484 u'X\U00010427X\U00010427') 485 486 @requires_wide_build 487 def test_capitalize(self): 488 string_tests.CommonTest.test_capitalize(self) 489 self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427') 490 self.assertEqual(u'\U0001044F\U0001044F'.capitalize(), 491 u'\U00010427\U0001044F') 492 self.assertEqual(u'\U00010427\U0001044F'.capitalize(), 493 u'\U00010427\U0001044F') 494 self.assertEqual(u'\U0001044F\U00010427'.capitalize(), 495 u'\U00010427\U0001044F') 496 self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(), 497 u'X\U0001044Fx\U0001044F') 498 499 @requires_wide_build 500 def test_title(self): 501 string_tests.MixinStrUnicodeUserStringTest.test_title(self) 502 self.assertEqual(u'\U0001044F'.title(), u'\U00010427') 503 self.assertEqual(u'\U0001044F\U0001044F'.title(), 504 u'\U00010427\U0001044F') 505 self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(), 506 u'\U00010427\U0001044F \U00010427\U0001044F') 507 self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(), 508 u'\U00010427\U0001044F \U00010427\U0001044F') 509 self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(), 510 u'\U00010427\U0001044F \U00010427\U0001044F') 511 self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 512 u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') 513 514 @requires_wide_build 515 def test_swapcase(self): 516 
string_tests.CommonTest.test_swapcase(self) 517 self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427') 518 self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F') 519 self.assertEqual(u'\U0001044F\U0001044F'.swapcase(), 520 u'\U00010427\U00010427') 521 self.assertEqual(u'\U00010427\U0001044F'.swapcase(), 522 u'\U0001044F\U00010427') 523 self.assertEqual(u'\U0001044F\U00010427'.swapcase(), 524 u'\U00010427\U0001044F') 525 self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(), 526 u'x\U0001044FX\U00010427') 527 317 528 def test_contains(self): 318 529 # Testing Unicode contains method 319 self.assert _('a' inu'abdb')320 self.assert _('a' inu'bdab')321 self.assert _('a' inu'bdaba')322 self.assert _('a' inu'bdba')323 self.assert _('a' inu'bdba')324 self.assert _(u'a' inu'bdba')325 self.assert _(u'a' not inu'bdb')326 self.assert _(u'a' not in'bdb')327 self.assert _(u'a' in'bdba')328 self.assert _(u'a' in('a',1,None))329 self.assert _(u'a' in(1,None,'a'))330 self.assert _(u'a' in(1,None,u'a'))331 self.assert _('a' in('a',1,None))332 self.assert _('a' in(1,None,'a'))333 self.assert _('a' in(1,None,u'a'))334 self.assert _('a' not in('x',1,u'y'))335 self.assert _('a' not in('x',1,None))336 self.assert _(u'abcd' not inu'abcxxxx')337 self.assert _(u'ab' inu'abcd')338 self.assert _('ab' inu'abc')339 self.assert _(u'ab' in'abc')340 self.assert _(u'ab' in(1,None,u'ab'))341 self.assert _(u'' inu'abc')342 self.assert _('' inu'abc')530 self.assertIn('a', u'abdb') 531 self.assertIn('a', u'bdab') 532 self.assertIn('a', u'bdaba') 533 self.assertIn('a', u'bdba') 534 self.assertIn('a', u'bdba') 535 self.assertIn(u'a', u'bdba') 536 self.assertNotIn(u'a', u'bdb') 537 self.assertNotIn(u'a', 'bdb') 538 self.assertIn(u'a', 'bdba') 539 self.assertIn(u'a', ('a',1,None)) 540 self.assertIn(u'a', (1,None,'a')) 541 self.assertIn(u'a', (1,None,u'a')) 542 self.assertIn('a', ('a',1,None)) 543 self.assertIn('a', (1,None,'a')) 544 self.assertIn('a', (1,None,u'a')) 545 self.assertNotIn('a', 
('x',1,u'y')) 546 self.assertNotIn('a', ('x',1,None)) 547 self.assertNotIn(u'abcd', u'abcxxxx') 548 self.assertIn(u'ab', u'abcd') 549 self.assertIn('ab', u'abc') 550 self.assertIn(u'ab', 'abc') 551 self.assertIn(u'ab', (1,None,u'ab')) 552 self.assertIn(u'', u'abc') 553 self.assertIn('', u'abc') 343 554 344 555 # If the following fails either 345 556 # the contains operator does not propagate UnicodeErrors or 346 557 # someone has changed the default encoding 347 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2') 348 349 self.assert_(u'' in '') 350 self.assert_('' in u'') 351 self.assert_(u'' in u'') 352 self.assert_(u'' in 'abc') 353 self.assert_('' in u'abc') 354 self.assert_(u'' in u'abc') 355 self.assert_(u'\0' not in 'abc') 356 self.assert_('\0' not in u'abc') 357 self.assert_(u'\0' not in u'abc') 358 self.assert_(u'\0' in '\0abc') 359 self.assert_('\0' in u'\0abc') 360 self.assert_(u'\0' in u'\0abc') 361 self.assert_(u'\0' in 'abc\0') 362 self.assert_('\0' in u'abc\0') 363 self.assert_(u'\0' in u'abc\0') 364 self.assert_(u'a' in '\0abc') 365 self.assert_('a' in u'\0abc') 366 self.assert_(u'a' in u'\0abc') 367 self.assert_(u'asdf' in 'asdf') 368 self.assert_('asdf' in u'asdf') 369 self.assert_(u'asdf' in u'asdf') 370 self.assert_(u'asdf' not in 'asd') 371 self.assert_('asdf' not in u'asd') 372 self.assert_(u'asdf' not in u'asd') 373 self.assert_(u'asdf' not in '') 374 self.assert_('asdf' not in u'') 375 self.assert_(u'asdf' not in u'') 558 self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2') 559 self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2') 560 561 self.assertIn(u'', '') 562 self.assertIn('', u'') 563 self.assertIn(u'', u'') 564 self.assertIn(u'', 'abc') 565 self.assertIn('', u'abc') 566 self.assertIn(u'', u'abc') 567 self.assertNotIn(u'\0', 'abc') 568 self.assertNotIn('\0', u'abc') 569 self.assertNotIn(u'\0', u'abc') 570 self.assertIn(u'\0', '\0abc') 571 self.assertIn('\0', u'\0abc') 572 
self.assertIn(u'\0', u'\0abc') 573 self.assertIn(u'\0', 'abc\0') 574 self.assertIn('\0', u'abc\0') 575 self.assertIn(u'\0', u'abc\0') 576 self.assertIn(u'a', '\0abc') 577 self.assertIn('a', u'\0abc') 578 self.assertIn(u'a', u'\0abc') 579 self.assertIn(u'asdf', 'asdf') 580 self.assertIn('asdf', u'asdf') 581 self.assertIn(u'asdf', u'asdf') 582 self.assertNotIn(u'asdf', 'asd') 583 self.assertNotIn('asdf', u'asd') 584 self.assertNotIn(u'asdf', u'asd') 585 self.assertNotIn(u'asdf', '') 586 self.assertNotIn('asdf', u'') 587 self.assertNotIn(u'asdf', u'') 376 588 377 589 self.assertRaises(TypeError, u"abc".__contains__) 590 self.assertRaises(TypeError, u"abc".__contains__, object()) 378 591 379 592 def test_formatting(self): … … 393 606 self.assertEqual(u'%c' % 0x1234, u'\u1234') 394 607 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,)) 608 self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3)) 395 609 396 610 for num in range(0x00,0x80): 397 611 char = chr(num) 398 self.assertEqual(u"%c" % char, char) 399 self.assertEqual(u"%c" % num, char) 612 self.assertEqual(u"%c" % char, unicode(char)) 613 self.assertEqual(u"%c" % num, unicode(char)) 614 self.assertTrue(char == u"%c" % char) 615 self.assertTrue(char == u"%c" % num) 400 616 # Issue 7649 401 617 for num in range(0x80,0x100): … … 429 645 self.assertEqual('%s' % Wrapper(), u'\u1234') 430 646 647 @test_support.cpython_only 648 def test_formatting_huge_precision(self): 649 from _testcapi import INT_MAX 650 format_string = u"%.{}f".format(INT_MAX + 1) 651 with self.assertRaises(ValueError): 652 result = format_string % 2.34 653 654 def test_formatting_huge_width(self): 655 format_string = u"%{}f".format(sys.maxsize + 1) 656 with self.assertRaises(ValueError): 657 result = format_string % 2.34 658 659 def test_startswith_endswith_errors(self): 660 for meth in (u'foo'.startswith, u'foo'.endswith): 661 with self.assertRaises(UnicodeDecodeError): 662 meth('\xff') 663 with self.assertRaises(TypeError) 
as cm: 664 meth(['f']) 665 exc = str(cm.exception) 666 self.assertIn('unicode', exc) 667 self.assertIn('str', exc) 668 self.assertIn('tuple', exc) 669 431 670 @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR') 432 671 def test_format_float(self): … … 511 750 512 751 if not sys.platform.startswith('java'): 752 with test_support.check_py3k_warnings(): 753 buf = buffer('character buffers are decoded to unicode') 513 754 self.assertEqual( 514 755 unicode( 515 buf fer('character buffers are decoded to unicode'),756 buf, 516 757 'utf-8', 517 758 'strict' … … 535 776 (ur'\\?', '+AFwAXA?'), 536 777 (ur'\\\?', '+AFwAXABc?'), 537 (ur'++--', '+-+---') 778 (ur'++--', '+-+---'), 779 (u'\U000abcde', '+2m/c3g-'), # surrogate pairs 780 (u'/', '/'), 538 781 ] 539 782 … … 541 784 self.assertEqual(x.encode('utf-7'), y) 542 785 543 # surrogates not supported 544 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7') 545 546 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd') 547 548 # Issue #2242: crash on some Windows/MSVC versions 549 self.assertRaises(UnicodeDecodeError, '+\xc1'.decode, 'utf-7') 786 # Unpaired surrogates are passed through 787 self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-') 788 self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x') 789 self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-') 790 self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x') 791 self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801') 792 self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x') 793 self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01') 794 self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x') 795 796 self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-') 797 self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde') 798 799 # Direct encoded characters 800 set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" 
801 # Optional direct characters 802 set_o = '!"#$%&*;<=>@[]^_`{|}' 803 for c in set_d: 804 self.assertEqual(c.encode('utf7'), c.encode('ascii')) 805 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c)) 806 self.assertTrue(c == c.encode('ascii').decode('utf7')) 807 for c in set_o: 808 self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c)) 809 self.assertTrue(c == c.encode('ascii').decode('utf7')) 550 810 551 811 def test_codecs_utf8(self): … … 580 840 581 841 # UTF-8 specific decoding tests 582 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' 583 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' 584 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' 842 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456') 843 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002') 844 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac') 585 845 586 846 # Other possible utf-8 test cases: 587 847 # * strict decoding testing for all of the 588 848 # UTF8_ERROR cases in PyUnicode_DecodeUTF8 849 850 def test_utf8_decode_valid_sequences(self): 851 sequences = [ 852 # single byte 853 ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'), 854 # 2 bytes 855 ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'), 856 # 3 bytes 857 ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'), 858 ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'), 859 # 4 bytes 860 ('\xF0\x90\x80\x80', u'\U00010000'), 861 ('\xf4\x8f\xbf\xbf', u'\U0010FFFF') 862 ] 863 for seq, res in sequences: 864 self.assertEqual(seq.decode('utf-8'), res) 865 866 for ch in map(unichr, range(0, sys.maxunicode)): 867 self.assertEqual(ch, ch.encode('utf-8').decode('utf-8')) 868 869 def test_utf8_decode_invalid_sequences(self): 870 # continuation bytes in a sequence of 2, 3, or 4 bytes 871 continuation_bytes = map(chr, range(0x80, 0xC0)) 872 # start bytes of a 2-byte sequence equivalent to codepoints < 0x7F 873 
invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2)) 874 # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF 875 invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8)) 876 invalid_start_bytes = ( 877 continuation_bytes + invalid_2B_seq_start_bytes + 878 invalid_4B_seq_start_bytes + map(chr, range(0xF7, 0x100)) 879 ) 880 881 for byte in invalid_start_bytes: 882 self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8') 883 884 for sb in invalid_2B_seq_start_bytes: 885 for cb in continuation_bytes: 886 self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8') 887 888 for sb in invalid_4B_seq_start_bytes: 889 for cb1 in continuation_bytes[:3]: 890 for cb3 in continuation_bytes[:3]: 891 self.assertRaises(UnicodeDecodeError, 892 (sb+cb1+'\x80'+cb3).decode, 'utf-8') 893 894 for cb in map(chr, range(0x80, 0xA0)): 895 self.assertRaises(UnicodeDecodeError, 896 ('\xE0'+cb+'\x80').decode, 'utf-8') 897 self.assertRaises(UnicodeDecodeError, 898 ('\xE0'+cb+'\xBF').decode, 'utf-8') 899 # XXX: surrogates shouldn't be valid UTF-8! 
900 # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 901 # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 902 #for cb in map(chr, range(0xA0, 0xC0)): 903 #self.assertRaises(UnicodeDecodeError, 904 #('\xED'+cb+'\x80').decode, 'utf-8') 905 #self.assertRaises(UnicodeDecodeError, 906 #('\xED'+cb+'\xBF').decode, 'utf-8') 907 # but since they are valid on Python 2 add a test for that: 908 for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)), 909 map(unichr, range(0xd800, 0xe000, 64))): 910 encoded = '\xED'+cb+'\x80' 911 self.assertEqual(encoded.decode('utf-8'), surrogate) 912 self.assertEqual(surrogate.encode('utf-8'), encoded) 913 914 for cb in map(chr, range(0x80, 0x90)): 915 self.assertRaises(UnicodeDecodeError, 916 ('\xF0'+cb+'\x80\x80').decode, 'utf-8') 917 self.assertRaises(UnicodeDecodeError, 918 ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8') 919 for cb in map(chr, range(0x90, 0xC0)): 920 self.assertRaises(UnicodeDecodeError, 921 ('\xF4'+cb+'\x80\x80').decode, 'utf-8') 922 self.assertRaises(UnicodeDecodeError, 923 ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8') 924 925 def test_issue8271(self): 926 # Issue #8271: during the decoding of an invalid UTF-8 byte sequence, 927 # only the start byte and the continuation byte(s) are now considered 928 # invalid, instead of the number of bytes specified by the start byte. 929 # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95, 930 # table 3-8, Row 2) for more information about the algorithm used. 
931 FFFD = u'\ufffd' 932 sequences = [ 933 # invalid start bytes 934 ('\x80', FFFD), # continuation byte 935 ('\x80\x80', FFFD*2), # 2 continuation bytes 936 ('\xc0', FFFD), 937 ('\xc0\xc0', FFFD*2), 938 ('\xc1', FFFD), 939 ('\xc1\xc0', FFFD*2), 940 ('\xc0\xc1', FFFD*2), 941 # with start byte of a 2-byte sequence 942 ('\xc2', FFFD), # only the start byte 943 ('\xc2\xc2', FFFD*2), # 2 start bytes 944 ('\xc2\xc2\xc2', FFFD*3), # 2 start bytes 945 ('\xc2\x41', FFFD+'A'), # invalid continuation byte 946 # with start byte of a 3-byte sequence 947 ('\xe1', FFFD), # only the start byte 948 ('\xe1\xe1', FFFD*2), # 2 start bytes 949 ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes 950 ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes 951 ('\xe1\x80', FFFD), # only 1 continuation byte 952 ('\xe1\x41', FFFD+'A'), # invalid continuation byte 953 ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb 954 ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes 955 ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte 956 ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid 957 ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid 958 # with start byte of a 4-byte sequence 959 ('\xf1', FFFD), # only the start byte 960 ('\xf1\xf1', FFFD*2), # 2 start bytes 961 ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes 962 ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes 963 ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes 964 ('\xf1\x80', FFFD), # only 1 continuation bytes 965 ('\xf1\x80\x80', FFFD), # only 2 continuation bytes 966 ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid 967 ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid 968 ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid 969 ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid 970 ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid 971 ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid 972 ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 
1 valid cb and 1 invalid 973 ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD), 974 ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2), 975 ('\xf1\xf1\x80\x41', FFFD*2+'A'), 976 ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2), 977 # with invalid start byte of a 4-byte sequence (rfc2279) 978 ('\xf5', FFFD), # only the start byte 979 ('\xf5\xf5', FFFD*2), # 2 start bytes 980 ('\xf5\x80', FFFD*2), # only 1 continuation byte 981 ('\xf5\x80\x80', FFFD*3), # only 2 continuation byte 982 ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes 983 ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid 984 ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD), 985 ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'), 986 # with invalid start byte of a 5-byte sequence (rfc2279) 987 ('\xf8', FFFD), # only the start byte 988 ('\xf8\xf8', FFFD*2), # 2 start bytes 989 ('\xf8\x80', FFFD*2), # only one continuation byte 990 ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid 991 ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes 992 # with invalid start byte of a 6-byte sequence (rfc2279) 993 ('\xfc', FFFD), # only the start byte 994 ('\xfc\xfc', FFFD*2), # 2 start bytes 995 ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes 996 ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes 997 # invalid start byte 998 ('\xfe', FFFD), 999 ('\xfe\x80\x80', FFFD*3), 1000 # other sequences 1001 ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'), 1002 ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'), 1003 ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'), 1004 ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64', 1005 u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'), 1006 ] 1007 for n, (seq, res) in enumerate(sequences): 1008 self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict') 1009 self.assertEqual(seq.decode('utf-8', 'replace'), res) 1010 self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b') 1011 self.assertEqual(seq.decode('utf-8', 'ignore'), 1012 res.replace(u'\uFFFD', '')) 589 1013 590 
1014 def test_codecs_idna(self): … … 598 1022 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x") 599 1023 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x") 1024 self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'), 1025 u'Andr\202 x'.encode('ascii', errors='replace')) 1026 self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'), 1027 u'Andr\202 x'.encode(encoding='ascii', errors='ignore')) 600 1028 601 1029 # Error handling (decoding) … … 604 1032 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x") 605 1033 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x') 1034 self.assertEqual(u'abcde'.decode('ascii', 'ignore'), 1035 u'abcde'.decode('ascii', errors='ignore')) 1036 self.assertEqual(u'abcde'.decode('ascii', 'replace'), 1037 u'abcde'.decode(encoding='ascii', errors='replace')) 606 1038 607 1039 # Error handling (unknown character names) … … 680 1112 for encoding in ( 681 1113 'cp037', 'cp1026', 682 'cp437', 'cp500', 'cp7 37', 'cp775', 'cp850',683 'cp852', 'cp855', 'cp8 60', 'cp861', 'cp862',1114 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 1115 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 684 1116 'cp863', 'cp865', 'cp866', 685 1117 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', … … 708 1140 for encoding in ( 709 1141 'cp037', 'cp1026', 710 'cp437', 'cp500', 'cp7 37', 'cp775', 'cp850',711 'cp852', 'cp855', 'cp8 60', 'cp861', 'cp862',1142 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 1143 'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862', 712 1144 'cp863', 'cp865', 'cp866', 713 1145 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', … … 1030 1462 self.assertEqual(u'{0:abc}'.format(C()), u'abc') 1031 1463 1032 # !r and !s coer sions1464 # !r and !s coercions 1033 1465 self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello') 1034 1466 self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello') … … 1044 1476 self.assertEqual(u'{0}'.format([1]), u'[1]') 
1045 1477 self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)') 1046 self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data) ')1047 self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data) ')1048 1478 self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)') 1049 self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')1050 1479 self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data') 1480 1481 msg = 'object.__format__ with a non-empty format string is deprecated' 1482 with test_support.check_warnings((msg, PendingDeprecationWarning)): 1483 self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data) ') 1484 self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data) ') 1485 self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data') 1051 1486 1052 1487 self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007, … … 1101 1536 self.assertRaises(ValueError, u"{0!rs}".format, 0) 1102 1537 self.assertRaises(ValueError, u"{!}".format) 1103 self.assertRaises(ValueError, u"{:}".format) 1104 self.assertRaises(ValueError, u"{:s}".format) 1105 self.assertRaises(ValueError, u"{}".format) 1538 self.assertRaises(IndexError, u"{:}".format) 1539 self.assertRaises(IndexError, u"{:s}".format) 1540 self.assertRaises(IndexError, u"{}".format) 1541 big = u"23098475029384702983476098230754973209482573" 1542 self.assertRaises(ValueError, (u"{" + big + u"}").format) 1543 self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0]) 1106 1544 1107 1545 # issue 6089 … … 1130 1568 # will fail 1131 1569 self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar') 1570 1571 def test_format_huge_precision(self): 1572 format_string = u".{}f".format(sys.maxsize + 1) 1573 with self.assertRaises(ValueError): 1574 result = format(2.34, format_string) 1575 1576 def test_format_huge_width(self): 1577 format_string = u"{}f".format(sys.maxsize + 1) 1578 with self.assertRaises(ValueError): 1579 result = format(2.34, format_string) 1580 1581 def 
test_format_huge_item_number(self): 1582 format_string = u"{{{}:.6f}}".format(sys.maxsize + 1) 1583 with self.assertRaises(ValueError): 1584 result = format_string.format(2.34) 1585 1586 def test_format_auto_numbering(self): 1587 class C: 1588 def __init__(self, x=100): 1589 self._x = x 1590 def __format__(self, spec): 1591 return spec 1592 1593 self.assertEqual(u'{}'.format(10), u'10') 1594 self.assertEqual(u'{:5}'.format('s'), u's    ') 1595 self.assertEqual(u'{!r}'.format('s'), u"'s'") 1596 self.assertEqual(u'{._x}'.format(C(10)), u'10') 1597 self.assertEqual(u'{[1]}'.format([1, 2]), u'2') 1598 self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4') 1599 self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c') 1600 1601 self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a    x     b') 1602 self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b') 1603 1604 # can't mix and match numbering and auto-numbering 1605 self.assertRaises(ValueError, u'{}{1}'.format, 1, 2) 1606 self.assertRaises(ValueError, u'{1}{}'.format, 1, 2) 1607 self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2) 1608 self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2) 1609 1610 # can mix and match auto-numbering and named 1611 self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4') 1612 self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test') 1613 self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3') 1614 self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g') 1132 1615 1133 1616 def test_raiseMemError(self): … … 1143 1626 self.assertRaises(MemoryError, alloc) 1144 1627 1628 def test_format_subclass(self): 1629 class U(unicode): 1630 def __unicode__(self): 1631 return u'__unicode__ overridden' 1632 u = U(u'xxx') 1633 self.assertEqual("%s" % u, u'__unicode__ overridden') 1634 self.assertEqual("{}".format(u), '__unicode__ overridden') 1635 1636 def test_encode_decimal(self): 1637 from _testcapi import unicode_encodedecimal 1638 self.assertEqual(unicode_encodedecimal(u'123'), 1639 b'123') 
1640 self.assertEqual(unicode_encodedecimal(u'\u0663.\u0661\u0664'), 1641 b'3.14') 1642 self.assertEqual(unicode_encodedecimal(u"\N{EM SPACE}3.14\N{EN SPACE}"), 1643 b' 3.14 ') 1644 self.assertRaises(UnicodeEncodeError, 1645 unicode_encodedecimal, u"123\u20ac", "strict") 1646 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "replace"), 1647 b'123?') 1648 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "ignore"), 1649 b'123') 1650 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "xmlcharrefreplace"), 1651 b'123&#8364;') 1652 self.assertEqual(unicode_encodedecimal(u"123\u20ac", "backslashreplace"), 1653 b'123\\u20ac') 1654 self.assertEqual(unicode_encodedecimal(u"123\u20ac\N{EM SPACE}", "replace"), 1655 b'123? ') 1656 self.assertEqual(unicode_encodedecimal(u"123\u20ac\u20ac", "replace"), 1657 b'123??') 1658 self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"), 1659 b'123?0') 1660 1661 def test_encode_decimal_with_surrogates(self): 1662 from _testcapi import unicode_encodedecimal 1663 tests = [(u'\U0001f49d', '&#128157;'), 1664 (u'\ud83d', '&#55357;'), 1665 (u'\udc9d', '&#56733;'), 1666 ] 1667 if u'\ud83d\udc9d' != u'\U0001f49d': 1668 tests += [(u'\ud83d\udc9d', '&#55357;&#56733;')] 1669 for s, exp in tests: 1670 self.assertEqual( 1671 unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"), 1672 '123' + exp) 1673 1145 1674 def test_main(): 1146 1675 test_support.run_unittest(__name__)
Note:
See TracChangeset
for help on using the changeset viewer.