""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9 |
|
---|
10 | import unittest
|
---|
11 | import sys
|
---|
12 | import _testcapi
|
---|
13 |
|
---|
14 | from test import test_support
|
---|
15 |
|
---|
16 | class UnicodeNamesTest(unittest.TestCase):
|
---|
17 |
|
---|
18 | def checkletter(self, name, code):
|
---|
19 | # Helper that put all \N escapes inside eval'd raw strings,
|
---|
20 | # to make sure this script runs even if the compiler
|
---|
21 | # chokes on \N escapes
|
---|
22 | res = eval(ur'u"\N{%s}"' % name)
|
---|
23 | self.assertEqual(res, code)
|
---|
24 | return res
|
---|
25 |
|
---|
26 | def test_general(self):
|
---|
27 | # General and case insensitivity test:
|
---|
28 | chars = [
|
---|
29 | "LATIN CAPITAL LETTER T",
|
---|
30 | "LATIN SMALL LETTER H",
|
---|
31 | "LATIN SMALL LETTER E",
|
---|
32 | "SPACE",
|
---|
33 | "LATIN SMALL LETTER R",
|
---|
34 | "LATIN CAPITAL LETTER E",
|
---|
35 | "LATIN SMALL LETTER D",
|
---|
36 | "SPACE",
|
---|
37 | "LATIN SMALL LETTER f",
|
---|
38 | "LATIN CAPITAL LeTtEr o",
|
---|
39 | "LATIN SMaLl LETTER x",
|
---|
40 | "SPACE",
|
---|
41 | "LATIN SMALL LETTER A",
|
---|
42 | "LATIN SMALL LETTER T",
|
---|
43 | "LATIN SMALL LETTER E",
|
---|
44 | "SPACE",
|
---|
45 | "LATIN SMALL LETTER T",
|
---|
46 | "LATIN SMALL LETTER H",
|
---|
47 | "LATIN SMALL LETTER E",
|
---|
48 | "SpAcE",
|
---|
49 | "LATIN SMALL LETTER S",
|
---|
50 | "LATIN SMALL LETTER H",
|
---|
51 | "LATIN small LETTER e",
|
---|
52 | "LATIN small LETTER e",
|
---|
53 | "LATIN SMALL LETTER P",
|
---|
54 | "FULL STOP"
|
---|
55 | ]
|
---|
56 | string = u"The rEd fOx ate the sheep."
|
---|
57 |
|
---|
58 | self.assertEqual(
|
---|
59 | u"".join([self.checkletter(*args) for args in zip(chars, string)]),
|
---|
60 | string
|
---|
61 | )
|
---|
62 |
|
---|
63 | def test_ascii_letters(self):
|
---|
64 | import unicodedata
|
---|
65 |
|
---|
66 | for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
|
---|
67 | name = "LATIN SMALL LETTER %s" % char.upper()
|
---|
68 | code = unicodedata.lookup(name)
|
---|
69 | self.assertEqual(unicodedata.name(code), name)
|
---|
70 |
|
---|
71 | def test_hangul_syllables(self):
|
---|
72 | self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
|
---|
73 | self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
|
---|
74 | self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
|
---|
75 | self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
|
---|
76 | self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
|
---|
77 | self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
|
---|
78 | self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
|
---|
79 | self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
|
---|
80 | self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
|
---|
81 | self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
|
---|
82 | self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
|
---|
83 | self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
|
---|
84 | self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
|
---|
85 |
|
---|
86 | import unicodedata
|
---|
87 | self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
|
---|
88 |
|
---|
89 | def test_cjk_unified_ideographs(self):
|
---|
90 | self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
|
---|
91 | self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
|
---|
92 | self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
|
---|
93 | self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
|
---|
94 | self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
|
---|
95 | self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
|
---|
96 |
|
---|
97 | def test_bmp_characters(self):
|
---|
98 | import unicodedata
|
---|
99 | count = 0
|
---|
100 | for code in xrange(0x10000):
|
---|
101 | char = unichr(code)
|
---|
102 | name = unicodedata.name(char, None)
|
---|
103 | if name is not None:
|
---|
104 | self.assertEqual(unicodedata.lookup(name), char)
|
---|
105 | count += 1
|
---|
106 |
|
---|
107 | def test_misc_symbols(self):
|
---|
108 | self.checkletter("PILCROW SIGN", u"\u00b6")
|
---|
109 | self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
|
---|
110 | self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
|
---|
111 | self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
|
---|
112 |
|
---|
113 | def test_errors(self):
|
---|
114 | import unicodedata
|
---|
115 | self.assertRaises(TypeError, unicodedata.name)
|
---|
116 | self.assertRaises(TypeError, unicodedata.name, u'xx')
|
---|
117 | self.assertRaises(TypeError, unicodedata.lookup)
|
---|
118 | self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
|
---|
119 |
|
---|
120 | def test_strict_eror_handling(self):
|
---|
121 | # bogus character name
|
---|
122 | self.assertRaises(
|
---|
123 | UnicodeError,
|
---|
124 | unicode, "\\N{blah}", 'unicode-escape', 'strict'
|
---|
125 | )
|
---|
126 | # long bogus character name
|
---|
127 | self.assertRaises(
|
---|
128 | UnicodeError,
|
---|
129 | unicode, "\\N{%s}" % ("x" * 100000), 'unicode-escape', 'strict'
|
---|
130 | )
|
---|
131 | # missing closing brace
|
---|
132 | self.assertRaises(
|
---|
133 | UnicodeError,
|
---|
134 | unicode, "\\N{SPACE", 'unicode-escape', 'strict'
|
---|
135 | )
|
---|
136 | # missing opening brace
|
---|
137 | self.assertRaises(
|
---|
138 | UnicodeError,
|
---|
139 | unicode, "\\NSPACE", 'unicode-escape', 'strict'
|
---|
140 | )
|
---|
141 |
|
---|
142 | @unittest.skipUnless(_testcapi.INT_MAX < _testcapi.PY_SSIZE_T_MAX,
|
---|
143 | "needs UINT_MAX < SIZE_MAX")
|
---|
144 | @unittest.skipUnless(_testcapi.UINT_MAX < sys.maxint,
|
---|
145 | "needs UINT_MAX < sys.maxint")
|
---|
146 | @test_support.bigmemtest(minsize=_testcapi.UINT_MAX + 1,
|
---|
147 | memuse=2 + 4 // len(u'\U00010000'))
|
---|
148 | def test_issue16335(self, size):
|
---|
149 | func = self.test_issue16335
|
---|
150 | if size < func.minsize:
|
---|
151 | raise unittest.SkipTest("not enough memory: %.1fG minimum needed" %
|
---|
152 | (func.minsize * func.memuse / float(1024**3),))
|
---|
153 | # very very long bogus character name
|
---|
154 | x = b'\\N{SPACE' + b'x' * int(_testcapi.UINT_MAX + 1) + b'}'
|
---|
155 | self.assertEqual(len(x), len(b'\\N{SPACE}') +
|
---|
156 | (_testcapi.UINT_MAX + 1))
|
---|
157 | self.assertRaisesRegexp(UnicodeError,
|
---|
158 | 'unknown Unicode character name',
|
---|
159 | x.decode, 'unicode-escape'
|
---|
160 | )
|
---|
161 |
|
---|
162 |
|
---|
def test_main():
    """Entry point: run all UnicodeNamesTest cases via test_support."""
    case_classes = (UnicodeNamesTest,)
    test_support.run_unittest(*case_classes)
165 |
|
---|
if __name__ == "__main__":
    # Executed directly as a script rather than imported: run the suite.
    test_main()