source: python/trunk/Lib/test/test_codecs.py

Last change on this file was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

from test import test_support
import unittest
import codecs
import locale
import sys, StringIO, _testcapi

def coding_checker(self, coder):
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check

class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        self._buffer = ""

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        else:
            s = self._buffer[:size]
            self._buffer = self._buffer[size:]
            return s

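# Illustrative sketch (editor's addition, not part of the vendored file):
# how the Queue above is used -- bytes written at one end come back out the
# other, in order, which lets the tests feed a StreamReader byte by byte.
_q = Queue()
_q.write("spam")
assert _q.read(2) == "sp"   # a sized read consumes exactly two bytes
assert _q.read() == "am"    # an unsized read drains the remaining buffer
del _q
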
class ReadTest(unittest.TestCase):
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")

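# Illustrative sketch (editor's addition, not part of the vendored file):
# the buffering behaviour check_partial() exercises, shown with one concrete
# codec. An incremental decoder holds on to an incomplete multi-byte sequence
# and only emits a character once its final byte arrives.
_d = codecs.getincrementaldecoder("utf-16-le")()
assert _d.decode("a") == u""       # first byte of a two-byte code unit
assert _d.decode("\x00") == u"a"   # second byte completes u"a"
del _d
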
class UTF32Test(ReadTest):
    encoding = "utf-32"

    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])

class UTF32LETest(ReadTest):
    encoding = "utf-32-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])

class UTF32BETest(ReadTest):
    encoding = "utf-32-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])


class UTF16Test(ReadTest):
    encoding = "utf-16"

    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

class UTF16LETest(ReadTest):
    encoding = "utf-16-le"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

class UTF16BETest(ReadTest):
    encoding = "utf-16-be"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

class UTF8Test(ReadTest):
    encoding = "utf-8"

    def test_partial(self):
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

class UTF7Test(ReadTest):
    encoding = "utf-7"

    def test_partial(self):
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
        tests = [
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')

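# Illustrative sketch (editor's addition, not part of the vendored file):
# UTF-7 carries non-ASCII text as modified base64 between '+' and '-', and
# "+-" stands for a literal '+'; that is why the partial results above stall
# while the decoder is inside a shifted sequence.
assert u"\u20ac".encode("utf-7") == "+IKw-"
assert "+IKw-".decode("utf-7") == u"\u20ac"
assert "a+-b".decode("utf-7") == u"a+b"
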
class UTF16ExTest(unittest.TestCase):

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)

class ReadBufferTest(unittest.TestCase):

    def test_array(self):
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)

class CharBufferTest(unittest.TestCase):

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)

class UTF8SigTest(ReadTest):
    encoding = "utf-8-sig"

    def test_partial(self):
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

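# Illustrative sketch (editor's addition, not part of the vendored file):
# "utf-8-sig" writes the UTF-8 BOM on encode and strips at most one leading
# BOM on decode; any further BOM is treated as an ordinary U+FEFF character.
assert u"spam".encode("utf-8-sig") == codecs.BOM_UTF8 + "spam"
assert (codecs.BOM_UTF8 + "spam").decode("utf-8-sig") == u"spam"
assert (codecs.BOM_UTF8 * 2 + "spam").decode("utf-8-sig") == u"\ufeffspam"
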
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(""), ("", 0))

    def test_raw(self):
        decode = codecs.escape_decode
        for b in range(256):
            b = chr(b)
            if b != '\\':
                self.assertEqual(decode(b + '0'), (b + '0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", br"[\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\8]", br"[\8]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\X41]", br"[\X41]")
        check(br"[\x410]", b"[A0]")
        for b in range(256):
            b = chr(b)
            if b not in '\n"\'\\abtnvfr01234567x':
                check('\\' + b, '\\' + b)

    def test_errors(self):
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))

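# Illustrative sketch (editor's addition, not part of the vendored file):
# escape_decode() resolves backslash escapes in a byte string and returns the
# pair (decoded_bytes, bytes_consumed) that coding_checker() asserts on.
assert codecs.escape_decode(r"\x41\t") == ("A\t", 6)
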
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        f = StringIO.StringIO()
        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
        f2.write(u"a")
        f2.close()
        # Python used to crash on this at exit because of a refcount
        # bug in _codecsmodule.c

# From RFC 3492
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

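# Illustrative sketch (editor's addition, not part of the vendored file):
# the punycode codec under test passes basic code points through unchanged
# and appends the encoded non-ASCII insertions after the last '-' delimiter.
assert u"b\xfccher".encode("punycode") == "bcher-kva"
assert "bcher-kva".decode("punycode") == u"b\xfccher"
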
for i in punycode_testcases:
    if len(i) != 2:
        print repr(i)

class PunycodeTest(unittest.TestCase):
    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(uni.encode("punycode").lower(), puny.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))

class UnicodeInternalTest(unittest.TestCase):
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)

# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]


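# Illustrative sketch (editor's addition, not part of the vendored file):
# nameprep() case-folds and normalizes one label of a host name and raises
# UnicodeError for prohibited code points, which is what the table above
# encodes case by case.
from encodings.idna import nameprep as _nameprep
assert _nameprep(u"CAFE") == u"cafe"   # matches test 3.2 above
del _nameprep
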
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception, e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))

class IDNACodecTest(unittest.TestCase):
    def test_builtin_decode(self):
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        import StringIO
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam", ), u"")
        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")

class CodecsModuleTest(unittest.TestCase):

    def test_decode(self):
        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
                         u'\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.decode)
        self.assertEqual(codecs.decode('abc'), u'abc')
        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')

    def test_encode(self):
        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
                         '\xe4\xf6\xfc')
        self.assertRaises(TypeError, codecs.encode)
        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
        self.assertEqual(codecs.encode(u'abc'), 'abc')
        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')

    def test_register(self):
        self.assertRaises(TypeError, codecs.register)
        self.assertRaises(TypeError, codecs.register, 42)

    def test_lookup(self):
        self.assertRaises(TypeError, codecs.lookup)
        self.assertRaises(LookupError, codecs.lookup, "__spam__")
        self.assertRaises(LookupError, codecs.lookup, " ")

    def test_getencoder(self):
        self.assertRaises(TypeError, codecs.getencoder)
        self.assertRaises(LookupError, codecs.getencoder, "__spam__")

    def test_getdecoder(self):
        self.assertRaises(TypeError, codecs.getdecoder)
        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")

    def test_getreader(self):
        self.assertRaises(TypeError, codecs.getreader)
        self.assertRaises(LookupError, codecs.getreader, "__spam__")

    def test_getwriter(self):
        self.assertRaises(TypeError, codecs.getwriter)
        self.assertRaises(LookupError, codecs.getwriter, "__spam__")

    def test_lookup_issue1813(self):
        # Issue #1813: under Turkish locales, lookup of some codecs failed
        # because 'I' is lowercased as a dotless "i"
        oldlocale = locale.getlocale(locale.LC_CTYPE)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
        try:
            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs Turkish locale')
        c = codecs.lookup('ASCII')
        self.assertEqual(c.name, 'ascii')

class StreamReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        f = self.reader(self.stream)
        self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])

class EncodedFileTest(unittest.TestCase):

    def test_basic(self):
        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
        self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')

        f = StringIO.StringIO()
        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
        ef.write('\xc3\xbc')
        self.assertEqual(f.getvalue(), '\xfc')

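# Illustrative sketch (editor's addition, not part of the vendored file):
# EncodedFile transcodes between the data encoding (second argument, what
# read() returns and write() accepts) and the file encoding (third argument,
# what the underlying stream stores).
_f = codecs.EncodedFile(StringIO.StringIO("\xc3\xbc"), "latin-1", "utf-8")
assert _f.read() == "\xfc"   # UTF-8 bytes for u"\xfc" come back as latin-1
del _f
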
class Str2StrTest(unittest.TestCase):

    def test_read(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.read()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)

    def test_readline(self):
        sin = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
        sout = reader.readline()
        self.assertEqual(sout, "\x80")
        self.assertIsInstance(sout, str)

all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#   "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")

class BasicUnicodeTest(unittest.TestCase):
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                    cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    encodedresult = "".join(encoder.encode(c) for c in s)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    encodedresult = "".join(cencoder.encode(c) for c in s)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                    decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

class BasicStrTest(unittest.TestCase):
    def test_basics(self):
        s = "abc123"
        for encoding in all_string_encodings:
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

class CharmapTest(unittest.TestCase):
    def test_decode_with_string_map(self):
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: u'\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )

        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        self.assertRaises(TypeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )


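# Illustrative sketch (editor's addition, not part of the vendored file):
# the mapping argument of charmap_decode() may be a unicode string indexed by
# byte value, an int->unicode dict, or an int->int dict; bytes missing from
# the mapping fall back to the error handler.
assert codecs.charmap_decode("\x00\x01", "strict", u"ab") == (u"ab", 2)
assert codecs.charmap_decode("\x00\x01", "strict", {0: u"a", 1: u"b"}) == (u"ab", 2)
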
class WithStmtTest(unittest.TestCase):
    def test_encodedfile(self):
        f = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        f = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(f, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), u"\xfc")


class UnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != ord('\\'):
                self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != ord('\\'):
                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check(u'\t', r'\t')
        check(u'\n', r'\n')
        check(u'\r', r'\r')
        check(u'\\', r'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(unichr(b), '\\x%02x' % b)
        for b in range(127, 256):
            check(unichr(b), '\\x%02x' % b)
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check("[\\\n]", u"[]")
        check(r'[\"]', u'["]')
        check(r"[\']", u"[']")
        check(r"[\\]", ur"[\]")
        check(r"[\a]", u"[\x07]")
        check(r"[\b]", u"[\x08]")
        check(r"[\t]", u"[\x09]")
        check(r"[\n]", u"[\x0a]")
        check(r"[\v]", u"[\x0b]")
        check(r"[\f]", u"[\x0c]")
        check(r"[\r]", u"[\x0d]")
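        # Octal escapes take at most three digits; any following digit is
        # passed through, and \8 is not an escape at all.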
        check(r"[\7]", u"[\x07]")
        check(r"[\8]", ur"[\8]")
        check(r"[\78]", u"[\x078]")
        check(r"[\41]", u"[!]")
        check(r"[\418]", u"[!8]")
        check(r"[\101]", u"[A]")
        check(r"[\1010]", u"[A0]")
        check(r"[\x41]", u"[A]")
        check(r"[\x410]", u"[A0]")
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")
        for b in range(256):
            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
                check('\\' + chr(b), u'\\' + unichr(b))

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
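        # Truncated \x, \u and \U escapes must raise under "strict" and be
        # skipped or replaced under "ignore"/"replace".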
        for c, d in ('x', 2), ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))


class RawUnicodeEscapeTest(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        encode = codecs.raw_unicode_escape_encode
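        # Latin-1 range characters are emitted as raw bytes, not escaped.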
        for b in range(256):
            self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        for b in range(256):
            if chr(b) not in 'uU':
                check(u'\\' + unichr(b), '\\' + chr(b))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if chr(b) not in 'uU':
                check('\\' + chr(b), u'\\' + unichr(b))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        for c, d in ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))


class BomTest(unittest.TestCase):
    def test_seek0(self):
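        # Codecs with a BOM must emit it exactly once: on the first write,
        # and again only after an explicit seek(0).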
        data = u"1234567890"
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check that the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() to a position
            # other than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # to a position other than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)


def test_main():
    test_support.run_unittest(
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )


if __name__ == "__main__":
    test_main()