Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

test_unicodedata.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 11.8 KB

Line
1	""" Test script for the unicodedata module.
2
3	Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7	"""
8
9	import sys
10	import unittest
11	import hashlib
12	import subprocess
13	import test.test_support
14
# Codec used to turn the joined unicode method results into bytes before
# they are fed to the hash in UnicodeMethodsTest.
encoding = "utf-8"
16
17
18	### Run tests
19
20	class UnicodeMethodsTest(unittest.TestCase):
21
22	# update this, if the database changes
23	expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
24
25	def test_method_checksum(self):
26	h = hashlib.sha1()
27	for i in range(0x10000):
28	char = unichr(i)
29	data = [
30	# Predicates (single char)
31	u"01"[char.isalnum()],
32	u"01"[char.isalpha()],
33	u"01"[char.isdecimal()],
34	u"01"[char.isdigit()],
35	u"01"[char.islower()],
36	u"01"[char.isnumeric()],
37	u"01"[char.isspace()],
38	u"01"[char.istitle()],
39	u"01"[char.isupper()],
40
41	# Predicates (multiple chars)
42	u"01"[(char + u'abc').isalnum()],
43	u"01"[(char + u'abc').isalpha()],
44	u"01"[(char + u'123').isdecimal()],
45	u"01"[(char + u'123').isdigit()],
46	u"01"[(char + u'abc').islower()],
47	u"01"[(char + u'123').isnumeric()],
48	u"01"[(char + u' \t').isspace()],
49	u"01"[(char + u'abc').istitle()],
50	u"01"[(char + u'ABC').isupper()],
51
52	# Mappings (single char)
53	char.lower(),
54	char.upper(),
55	char.title(),
56
57	# Mappings (multiple chars)
58	(char + u'abc').lower(),
59	(char + u'ABC').upper(),
60	(char + u'abc').title(),
61	(char + u'ABC').title(),
62
63	]
64	h.update(u''.join(data).encode(encoding))
65	result = h.hexdigest()
66	self.assertEqual(result, self.expectedchecksum)
67
class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture exposing the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import is done here rather than at module level: if
        # unicodedata is not available this raises an ImportError, but the
        # test cases that do not need it will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the per-test reference created by setUp().
        delattr(self, 'db')
78
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the lookup functions of the unicodedata module (``self.db``)."""

    # update this, if the database changes
    expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'

    def test_function_checksum(self):
        # Hash the database properties of every code point below 0x10000
        # and compare the digest against ``expectedchecksum``.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Properties
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        # Known values, including the explicit default for characters
        # that have no digit value.
        self.assertEqual(self.db.digit(u'A', None), None)
        self.assertEqual(self.db.digit(u'9'), 9)
        self.assertEqual(self.db.digit(u'\u215b', None), None)
        self.assertEqual(self.db.digit(u'\u2468'), 9)
        self.assertEqual(self.db.digit(u'\U00020000', None), None)

        # Missing argument, multi-character string, and no value/no default.
        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, u'xx')
        self.assertRaises(ValueError, self.db.digit, u'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric(u'A', None), None)
        self.assertEqual(self.db.numeric(u'9'), 9)
        self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
        self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
        self.assertEqual(self.db.numeric(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, u'xx')
        self.assertRaises(ValueError, self.db.numeric, u'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal(u'A', None), None)
        self.assertEqual(self.db.decimal(u'9'), 9)
        self.assertEqual(self.db.decimal(u'\u215b', None), None)
        self.assertEqual(self.db.decimal(u'\u2468', None), None)
        self.assertEqual(self.db.decimal(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, u'xx')
        self.assertRaises(ValueError, self.db.decimal, u'x')

    def test_category(self):
        self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
        self.assertEqual(self.db.category(u'a'), 'Ll')
        self.assertEqual(self.db.category(u'A'), 'Lu')
        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, u'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(u' '), 'WS')
        self.assertEqual(self.db.bidirectional(u'A'), 'L')
        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, u'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition(u'\uFFFE'), '')
        self.assertEqual(self.db.decomposition(u'\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, u'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
        self.assertEqual(self.db.mirrored(u'a'), 0)
        self.assertEqual(self.db.mirrored(u'\u2201'), 1)
        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, u'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining(u'\uFFFE'), 0)
        self.assertEqual(self.db.combining(u'a'), 0)
        self.assertEqual(self.db.combining(u'\u20e1'), 230)
        self.assertEqual(self.db.combining(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, u'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
        self.assertEqual(self.db.normalize('NFKC', u''), u'')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        #
        # Every fragment must be a unicode literal.  The last one used to
        # be spelled 'u\u0938...' (prefix inside the quotes), i.e. a byte
        # string whose \u escapes are not decoded, so that sample never
        # contained the intended characters.
        composed = (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161",
                    u'Li\u030dt-s\u1e73\u0301',
                    u'\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + u'\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    u'\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + u'\u0938\u094d\u0924\u093e\u0928')
        # Each sample is expected to be unchanged by NFC normalization.
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        # Crash reported in #10254
        a = u'C\u0338' * 20 + u'C\u0327'
        b = u'C\u0338' * 20 + u'\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        # Byte string, empty string and multi-character string are rejected.
        self.assertRaises(TypeError, eaw, 'a')
        self.assertRaises(TypeError, eaw, u'')
        self.assertRaises(TypeError, eaw, u'ra')
        self.assertEqual(eaw(u'\x1e'), 'N')
        self.assertEqual(eaw(u'\x20'), 'Na')
        self.assertEqual(eaw(u'\uC894'), 'W')
        self.assertEqual(eaw(u'\uFF66'), 'H')
        self.assertEqual(eaw(u'\uFF1F'), 'F')
        self.assertEqual(eaw(u'\u2010'), 'A')
        self.assertEqual(eaw(u'\U00020000'), 'W')
219
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for individual unicodedata / unicode method issues."""

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        # The backslash is doubled so that the child process receives a
        # literal \N{...} escape to evaluate.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("u'\\N{SOFT HYPHEN}'")"""
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() drains and closes the pipe while waiting; calling
        # wait() first can block forever once the pipe buffer fills up.
        _, stderr = popen.communicate()
        self.assertEqual(popen.returncode, 1)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, stderr)

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored(u"\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored(u"\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual(u"a".upper(), u'A')
        self.assertEqual(u"\u1d79".upper(), u'\ua77d')
        self.assertEqual(u".".upper(), u".")

    def test_bug_5828(self):
        self.assertEqual(u"\u1d79".lower(), u"\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        # xrange avoids materialising a list of sys.maxunicode+1 integers.
        self.assertEqual(
            [
                c for c in xrange(sys.maxunicode + 1)
                if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual(u"\u01c4".title(), u"\u01c5")
        self.assertEqual(u"\u01c5".title(), u"\u01c5")
        self.assertEqual(u"\u01c6".title(), u"\u01c5")

    def test_linebreak_7643(self):
        # Appending u'A' yields two lines exactly when the leading
        # character is treated as a line boundary by splitlines().
        for i in range(0x10000):
            lines = (unichr(i) + u'A').splitlines()
            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
308
def test_main():
    # Run every test case class of this module through the test_support
    # runner, in the order listed.
    suites = (
        UnicodeMiscTest,
        UnicodeMethodsTest,
        UnicodeFunctionsTest,
    )
    test.test_support.run_unittest(*suites)


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    test_main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/test/test_unicodedata.py

Download in other formats: