1 | from test import test_support
|
---|
2 | import unittest
|
---|
3 | import codecs
|
---|
4 | import locale
|
---|
5 | import sys, StringIO, _testcapi
|
---|
6 |
|
---|
def coding_checker(self, coder):
    """Return a closure that asserts ``coder(input) == (expect, len(input))``.

    `self` is a unittest.TestCase (supplies assertEqual); `coder` is a
    decode/encode callable returning the usual (result, consumed) pair.
    """
    def check(input, expect):
        got = coder(input)
        self.assertEqual(got, (expect, len(input)))
    return check
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """

    def __init__(self):
        # Pending, not-yet-read data.
        self._buffer = ""

    def write(self, chars):
        # Append incoming data to the pending buffer.
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains everything; otherwise hand back at most
        # `size` characters and keep the remainder queued.
        if size < 0:
            pending, self._buffer = self._buffer, ""
            return pending
        pending, self._buffer = self._buffer[:size], self._buffer[size:]
        return pending
class ReadTest(unittest.TestCase):
    """Shared decoding tests, driven by a subclass-provided ``encoding``.

    Subclasses set the class attribute ``encoding`` (e.g. "utf-16") and
    inherit these tests, which exercise the codec's StreamReader,
    incremental decoder and iterdecode() behavior.
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # readline() must split on \n, \r\n, \r and \u2028, both with and
        # without line ends kept, independent of the internal read size.
        def getreader(input):
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read line by line and join with "|" so the split points are
            # visible in the comparison below.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            vw.append((i*200)*u"\3042" + lineend)
            vwo.append((i*200)*u"\3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        # Regression test: iterating a StreamReader over this realistic
        # multi-line document must yield exactly the original lines.
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # Feed a reader through a writer incrementally and check that
        # readline() handles a \r that may or may not be followed by \n.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Regression test: a long line must not confuse the reader's
        # internal buffering across readline() calls.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Companion regression test: several lines around the buffer
        # boundary must all be returned intact and in order.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
class UTF32Test(ReadTest):
    """Tests for the BOM-autodetecting "utf-32" codec."""
    encoding = "utf-32"

    # "spamspam" encoded with exactly one leading BOM, little/big endian.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # Two writes through the same StreamWriter must emit a single BOM.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # An invalid BOM must raise UnicodeError, not be decoded silently.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per input byte (4 bytes of BOM, then
        # 4 bytes per code point).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated sequence with 'replace'/'ignore' must not raise.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
class UTF32LETest(ReadTest):
    """Tests for the fixed-endianness "utf-32-le" codec (no BOM handling)."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected partial result per input byte (4 bytes per code point).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Little-endian byte order, no BOM.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
class UTF32BETest(ReadTest):
    """Tests for the fixed-endianness "utf-32-be" codec (no BOM handling)."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected partial result per input byte (4 bytes per code point).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Big-endian byte order, no BOM.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
class UTF16Test(ReadTest):
    """Tests for the BOM-autodetecting "utf-16" codec."""
    encoding = "utf-16"

    # "spamspam" encoded with exactly one leading BOM, little/big endian.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two writes through the same StreamWriter must emit a single BOM.
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # An invalid BOM must raise UnicodeError, not be decoded silently.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per input byte (2 bytes of BOM, then
        # 2 bytes per BMP code point, 4 for the surrogate pair).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A truncated sequence with 'replace'/'ignore' must not raise.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
class UTF16LETest(ReadTest):
    """Tests for the fixed-endianness "utf-16-le" codec (no BOM handling)."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected partial result per input byte (2 bytes per BMP code
        # point, 4 for the surrogate pair).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each malformed input: strict must raise, replace must substitute
        # U+FFFD for the bad sequence as in `expected`.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
class UTF16BETest(ReadTest):
    """Tests for the fixed-endianness "utf-16-be" codec (no BOM handling)."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected partial result per input byte (2 bytes per BMP code
        # point, 4 for the surrogate pair).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each malformed input: strict must raise, replace must substitute
        # U+FFFD for the bad sequence as in `expected`.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
class UTF8Test(ReadTest):
    """Tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected partial result per input byte (1-4 bytes per code
        # point depending on its value).
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
class UTF7Test(ReadTest):
    """Tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # "a+-b": '+-' is the UTF-7 escape for a literal '+'.
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
        # Each malformed input: strict must raise, replace must substitute
        # U+FFFD for the bad base64 section as in `expected`.
        tests = [
            ('a\xffb', u'a\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Non-BMP characters are encoded as a base64 surrogate pair.
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() entry point."""

    def test_errors(self):
        # A lone byte is an incomplete code unit under strict handling.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without arguments must raise TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode() (read-buffer protocol input)."""

    def test_array(self):
        # Any read-buffer object is accepted, not just strings.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument must raise TypeError.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
class CharBufferTest(unittest.TestCase):
    """Tests for codecs.charbuffer_encode() (char-buffer protocol input)."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # Missing argument and a non-buffer argument must raise TypeError.
        self.assertRaises(TypeError, codecs.charbuffer_encode)
        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
class UTF8SigTest(ReadTest):
    """Tests for "utf-8-sig": UTF-8 that skips one leading BOM on decode."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # The leading BOM is consumed silently; a second BOM is real data.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
                u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # A BOM produced by encoding must round-trip away on decode.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Reading a BOM-prefixed stream in various chunk sizes must always
        # reproduce the original string without the BOM.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Same as test_stream_bom, but the stream carries no BOM at all.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
|
---|
764 | def test_empty(self):
|
---|
765 | self.assertEqual(codecs.escape_decode(""), ("", 0))
|
---|
766 |
|
---|
767 | def test_raw(self):
|
---|
768 | decode = codecs.escape_decode
|
---|
769 | for b in range(256):
|
---|
770 | b = chr(b)
|
---|
771 | if b != '\\':
|
---|
772 | self.assertEqual(decode(b + '0'), (b + '0', 2))
|
---|
773 |
|
---|
774 | def test_escape(self):
|
---|
775 | decode = codecs.escape_decode
|
---|
776 | check = coding_checker(self, decode)
|
---|
777 | check(b"[\\\n]", b"[]")
|
---|
778 | check(br'[\"]', b'["]')
|
---|
779 | check(br"[\']", b"[']")
|
---|
780 | check(br"[\\]", br"[\]")
|
---|
781 | check(br"[\a]", b"[\x07]")
|
---|
782 | check(br"[\b]", b"[\x08]")
|
---|
783 | check(br"[\t]", b"[\x09]")
|
---|
784 | check(br"[\n]", b"[\x0a]")
|
---|
785 | check(br"[\v]", b"[\x0b]")
|
---|
786 | check(br"[\f]", b"[\x0c]")
|
---|
787 | check(br"[\r]", b"[\x0d]")
|
---|
788 | check(br"[\7]", b"[\x07]")
|
---|
789 | check(br"[\8]", br"[\8]")
|
---|
790 | check(br"[\78]", b"[\x078]")
|
---|
791 | check(br"[\41]", b"[!]")
|
---|
792 | check(br"[\418]", b"[!8]")
|
---|
793 | check(br"[\101]", b"[A]")
|
---|
794 | check(br"[\1010]", b"[A0]")
|
---|
795 | check(br"[\501]", b"[A]")
|
---|
796 | check(br"[\x41]", b"[A]")
|
---|
797 | check(br"[\X41]", br"[\X41]")
|
---|
798 | check(br"[\x410]", b"[A0]")
|
---|
799 | for b in range(256):
|
---|
800 | b = chr(b)
|
---|
801 | if b not in '\n"\'\\abtnvfr01234567x':
|
---|
802 | check('\\' + b, '\\' + b)
|
---|
803 |
|
---|
804 | def test_errors(self):
|
---|
805 | decode = codecs.escape_decode
|
---|
806 | self.assertRaises(ValueError, decode, br"\x")
|
---|
807 | self.assertRaises(ValueError, decode, br"[\x]")
|
---|
808 | self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
|
---|
809 | self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
|
---|
810 | self.assertRaises(ValueError, decode, br"\x0")
|
---|
811 | self.assertRaises(ValueError, decode, br"[\x0]")
|
---|
812 | self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
|
---|
813 | self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
|
---|
814 |
|
---|
class RecodingTest(unittest.TestCase):
    """Regression test for writing through codecs.EncodedFile."""

    def test_recoding(self):
        # Encode through an EncodedFile and close it.  Python used to
        # crash on this at exit because of a refcount bug in
        # _codecsmodule.c.
        backing = StringIO.StringIO()
        recoder = codecs.EncodedFile(backing, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
# From RFC 3492
# Each entry is a (unicode, punycode) pair taken from the RFC's sample
# strings (sections A through S); the second element is the expected
# Punycode encoding of the first.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]
# Sanity check run at import time: every entry in the table above must be
# a (unicode, punycode) pair; any malformed entry is printed so a bad edit
# to the table is noticed immediately.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample strings through the punycode codec."""

    def test_encode(self):
        for text, expected in punycode_testcases:
            # Compare case-insensitively: some of the RFC's expected
            # encodings use upper case while our encoder emits only lower
            # case, and lowercasing only one side would be insufficient
            # because some inputs themselves contain upper-case letters.
            self.assertEqual(text.encode("punycode").lower(),
                             expected.lower())

    def test_decode(self):
        for text, expected in punycode_testcases:
            self.assertEqual(text, expected.decode("punycode"))
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the "unicode_internal" codec (raw in-memory representation)."""

    def test_bug1251300(self):
        """Decoding with unicode_internal used to not correctly handle
        "code points" above 0x10ffff on UCS-4 builds."""
        if sys.maxunicode > 0xffff:
            # Valid 4-byte sequences and their decoded values (big-endian
            # byte order; reversed below for little-endian platforms).
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            # Inputs that must fail: values above 0x10ffff and inputs whose
            # length is not a multiple of four.
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                    "unicode_internal")

    def test_decode_error_attributes(self):
        """A decode failure must report the correct encoding name, object
        and the exact start/end offsets of the offending bytes."""
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        """A registered "ignore" error handler must skip the bad bytes and
        still report the full input length as consumed."""
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            # Splice four invalid bytes between the encodings of "a" and "b".
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739: the encoder must report the number of characters
        # consumed, not the number of bytes produced.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected) pair of UTF-8 byte strings.  An
# expected value of None means nameprep must reject the input; an input of
# None marks a vector that is deliberately skipped on this build.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        """Run the libidn nameprep vectors against encodings.idna.nameprep.

        Vectors with a None input are skipped; vectors with a None
        expected value must raise UnicodeError.  Failures are re-raised
        with the vector number so the offending case is identifiable.
        """
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Skipped
                continue
            # The Unicode strings are given in UTF-8
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception,e:
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec (RFC 3490 internationalized domain names)."""

    def test_builtin_decode(self):
        """ASCII names pass through; ACE ("xn--") labels decode to Unicode."""
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        """Pure-ASCII names encode unchanged; non-ASCII labels get the
        "xn--" ACE prefix."""
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        """An exhausted idna stream reader returns the empty string."""
        # NOTE: the redundant local "import StringIO" was removed; the
        # module is already imported at the top of the file.
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        """iterdecode() and the incremental decoder buffer partial labels
        until a dot (or end of input) makes the label complete."""
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        # Nothing is emitted until the label's trailing dot arrives.
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        # final=True flushes the buffered trailing label.
        self.assertEqual(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        """iterencode() and the incremental encoder mirror the decoder's
        label-at-a-time buffering behaviour."""
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
1253 | class CodecsModuleTest(unittest.TestCase):
|
---|
1254 |
|
---|
1255 | def test_decode(self):
|
---|
1256 | self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
|
---|
1257 | u'\xe4\xf6\xfc')
|
---|
1258 | self.assertRaises(TypeError, codecs.decode)
|
---|
1259 | self.assertEqual(codecs.decode('abc'), u'abc')
|
---|
1260 | self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
|
---|
1261 |
|
---|
1262 | def test_encode(self):
|
---|
1263 | self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
|
---|
1264 | '\xe4\xf6\xfc')
|
---|
1265 | self.assertRaises(TypeError, codecs.encode)
|
---|
1266 | self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
|
---|
1267 | self.assertEqual(codecs.encode(u'abc'), 'abc')
|
---|
1268 | self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
|
---|
1269 |
|
---|
1270 | def test_register(self):
|
---|
1271 | self.assertRaises(TypeError, codecs.register)
|
---|
1272 | self.assertRaises(TypeError, codecs.register, 42)
|
---|
1273 |
|
---|
1274 | def test_lookup(self):
|
---|
1275 | self.assertRaises(TypeError, codecs.lookup)
|
---|
1276 | self.assertRaises(LookupError, codecs.lookup, "__spam__")
|
---|
1277 | self.assertRaises(LookupError, codecs.lookup, " ")
|
---|
1278 |
|
---|
1279 | def test_getencoder(self):
|
---|
1280 | self.assertRaises(TypeError, codecs.getencoder)
|
---|
1281 | self.assertRaises(LookupError, codecs.getencoder, "__spam__")
|
---|
1282 |
|
---|
1283 | def test_getdecoder(self):
|
---|
1284 | self.assertRaises(TypeError, codecs.getdecoder)
|
---|
1285 | self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
|
---|
1286 |
|
---|
1287 | def test_getreader(self):
|
---|
1288 | self.assertRaises(TypeError, codecs.getreader)
|
---|
1289 | self.assertRaises(LookupError, codecs.getreader, "__spam__")
|
---|
1290 |
|
---|
1291 | def test_getwriter(self):
|
---|
1292 | self.assertRaises(TypeError, codecs.getwriter)
|
---|
1293 | self.assertRaises(LookupError, codecs.getwriter, "__spam__")
|
---|
1294 |
|
---|
1295 | def test_lookup_issue1813(self):
|
---|
1296 | # Issue #1813: under Turkish locales, lookup of some codecs failed
|
---|
1297 | # because 'I' is lowercased as a dotless "i"
|
---|
1298 | oldlocale = locale.getlocale(locale.LC_CTYPE)
|
---|
1299 | self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
|
---|
1300 | try:
|
---|
1301 | locale.setlocale(locale.LC_CTYPE, 'tr_TR')
|
---|
1302 | except locale.Error:
|
---|
1303 | # Unsupported locale on this system
|
---|
1304 | self.skipTest('test needs Turkish locale')
|
---|
1305 | c = codecs.lookup('ASCII')
|
---|
1306 | self.assertEqual(c.name, 'ascii')
|
---|
1307 |
|
---|
class StreamReaderTest(unittest.TestCase):
    """Sanity checks for stream readers returned by codecs.getreader()."""

    def setUp(self):
        # The byte string is the UTF-8 encoding of u'\ud55c\n\uae00',
        # as asserted by test_readlines below.
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        lines = self.reader(self.stream).readlines()
        self.assertEqual(lines, [u'\ud55c\n', u'\uae00'])
class EncodedFileTest(unittest.TestCase):
    """Tests for codecs.EncodedFile, which transcodes on read and write."""

    def test_basic(self):
        # Reading transcodes from the file encoding (utf-8) to the data
        # encoding (utf-16-le).
        source = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        wrapped = codecs.EncodedFile(source, 'utf-16-le', 'utf-8')
        self.assertEqual(wrapped.read(), '\\\xd5\n\x00\x00\xae')

        # Writing transcodes from the data encoding (utf-8) to the file
        # encoding (latin1).
        sink = StringIO.StringIO()
        wrapped = codecs.EncodedFile(sink, 'utf-8', 'latin1')
        wrapped.write('\xc3\xbc')
        self.assertEqual(sink.getvalue(), '\xfc')
class Str2StrTest(unittest.TestCase):
    """str-to-str codecs must yield str (not unicode) from stream reads."""

    def test_read(self):
        encoded = "\x80".encode("base64_codec")
        stream = codecs.getreader("base64_codec")(StringIO.StringIO(encoded))
        decoded = stream.read()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)

    def test_readline(self):
        encoded = "\x80".encode("base64_codec")
        stream = codecs.getreader("base64_codec")(StringIO.StringIO(encoded))
        decoded = stream.readline()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)
# Encodings exercised by the generic round-trip tests below
# (BasicUnicodeTest); each must be able to encode u"abc123".
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
# "mbcs" exists only on Windows builds; add it when available.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copy (not alias) the list: the compression codecs appended below break
# streams but their incremental coders are handled separately.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# The compression codecs are only tested when their backing modules are
# importable on this build.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1497 | class BasicUnicodeTest(unittest.TestCase):
|
---|
1498 | def test_basics(self):
|
---|
1499 | s = u"abc123" # all codecs should be able to encode these
|
---|
1500 | for encoding in all_unicode_encodings:
|
---|
1501 | name = codecs.lookup(encoding).name
|
---|
1502 | if encoding.endswith("_codec"):
|
---|
1503 | name += "_codec"
|
---|
1504 | elif encoding == "latin_1":
|
---|
1505 | name = "latin_1"
|
---|
1506 | self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
|
---|
1507 | (bytes, size) = codecs.getencoder(encoding)(s)
|
---|
1508 | self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
|
---|
1509 | (chars, size) = codecs.getdecoder(encoding)(bytes)
|
---|
1510 | self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
|
---|
1511 |
|
---|
1512 | if encoding not in broken_unicode_with_streams:
|
---|
1513 | # check stream reader/writer
|
---|
1514 | q = Queue()
|
---|
1515 | writer = codecs.getwriter(encoding)(q)
|
---|
1516 | encodedresult = ""
|
---|
1517 | for c in s:
|
---|
1518 | writer.write(c)
|
---|
1519 | encodedresult += q.read()
|
---|
1520 | q = Queue()
|
---|
1521 | reader = codecs.getreader(encoding)(q)
|
---|
1522 | decodedresult = u""
|
---|
1523 | for c in encodedresult:
|
---|
1524 | q.write(c)
|
---|
1525 | decodedresult += reader.read()
|
---|
1526 | self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
|
---|
1527 |
|
---|
1528 | if encoding not in broken_incremental_coders:
|
---|
1529 | # check incremental decoder/encoder (fetched via the Python
|
---|
1530 | # and C API) and iterencode()/iterdecode()
|
---|
1531 | try:
|
---|
1532 | encoder = codecs.getincrementalencoder(encoding)()
|
---|
1533 | cencoder = _testcapi.codec_incrementalencoder(encoding)
|
---|
1534 | except LookupError: # no IncrementalEncoder
|
---|
1535 | pass
|
---|
1536 | else:
|
---|
1537 | # check incremental decoder/encoder
|
---|
1538 | encodedresult = ""
|
---|
1539 | for c in s:
|
---|
1540 | encodedresult += encoder.encode(c)
|
---|
1541 | encodedresult += encoder.encode(u"", True)
|
---|
1542 | decoder = codecs.getincrementaldecoder(encoding)()
|
---|
1543 | decodedresult = u""
|
---|
1544 | for c in encodedresult:
|
---|
1545 | decodedresult += decoder.decode(c)
|
---|
1546 | decodedresult += decoder.decode("", True)
|
---|
1547 | self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
|
---|
1548 |
|
---|
1549 | # check C API
|
---|
1550 | encodedresult = ""
|
---|
1551 | for c in s:
|
---|
1552 | encodedresult += cencoder.encode(c)
|
---|
1553 | encodedresult += cencoder.encode(u"", True)
|
---|
1554 | cdecoder = _testcapi.codec_incrementaldecoder(encoding)
|
---|
1555 | decodedresult = u""
|
---|
1556 | for c in encodedresult:
|
---|
1557 | decodedresult += cdecoder.decode(c)
|
---|
1558 | decodedresult += cdecoder.decode("", True)
|
---|
1559 | self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
|
---|
1560 |
|
---|
1561 | # check iterencode()/iterdecode()
|
---|
1562 | result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
|
---|
1563 | self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))
|
---|
1564 |
|
---|
1565 | # check iterencode()/iterdecode() with empty string
|
---|
1566 | result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
|
---|
1567 | self.assertEqual(result, u"")
|
---|
1568 |
|
---|
1569 | if encoding not in only_strict_mode:
|
---|
1570 | # check incremental decoder/encoder with errors argument
|
---|
1571 | try:
|
---|
1572 | encoder = codecs.getincrementalencoder(encoding)("ignore")
|
---|
1573 | cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
|
---|
1574 | except LookupError: # no IncrementalEncoder
|
---|
1575 | pass
|
---|
1576 | else:
|
---|
1577 | encodedresult = "".join(encoder.encode(c) for c in s)
|
---|
1578 | decoder = codecs.getincrementaldecoder(encoding)("ignore")
|
---|
1579 | decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
|
---|
1580 | self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
|
---|
1581 |
|
---|
1582 | encodedresult = "".join(cencoder.encode(c) for c in s)
|
---|
1583 | cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
|
---|
1584 | decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
|
---|
1585 | self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))
|
---|
1586 |
|
---|
1587 | def test_seek(self):
|
---|
1588 | # all codecs should be able to encode these
|
---|
1589 | s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
|
---|
1590 | for encoding in all_unicode_encodings:
|
---|
1591 | if encoding == "idna": # FIXME: See SF bug #1163178
|
---|
1592 | continue
|
---|
1593 | if encoding in broken_unicode_with_streams:
|
---|
1594 | continue
|
---|
1595 | reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
|
---|
1596 | for t in xrange(5):
|
---|
1597 | # Test that calling seek resets the internal codec state and buffers
|
---|
1598 | reader.seek(0, 0)
|
---|
1599 | line = reader.readline()
|
---|
1600 | self.assertEqual(s[:len(line)], line)
|
---|
1601 |
|
---|
1602 | def test_bad_decode_args(self):
|
---|
1603 | for encoding in all_unicode_encodings:
|
---|
1604 | decoder = codecs.getdecoder(encoding)
|
---|
1605 | self.assertRaises(TypeError, decoder)
|
---|
1606 | if encoding not in ("idna", "punycode"):
|
---|
1607 | self.assertRaises(TypeError, decoder, 42)
|
---|
1608 |
|
---|
1609 | def test_bad_encode_args(self):
|
---|
1610 | for encoding in all_unicode_encodings:
|
---|
1611 | encoder = codecs.getencoder(encoding)
|
---|
1612 | self.assertRaises(TypeError, encoder)
|
---|
1613 |
|
---|
1614 | def test_encoding_map_type_initialized(self):
|
---|
1615 | from encodings import cp1140
|
---|
1616 | # This used to crash, we are only verifying there's no crash.
|
---|
1617 | table_type = type(cp1140.encoding_table)
|
---|
1618 | self.assertEqual(table_type, table_type)
|
---|
1619 |
|
---|
class BasicStrTest(unittest.TestCase):
    """Round-trip sanity checks for the str-to-str codecs."""

    def test_basics(self):
        """encode() then decode() must reproduce the input exactly.

        Also checks that the encoder reports having consumed the whole
        input string.
        """
        s = "abc123"
        for encoding in all_string_encodings:
            # 'encoded' (not 'bytes') to avoid shadowing the builtin.
            (encoded, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s))
            (chars, size) = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))
1628 |
|
---|
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode with its three mapping flavours:
    a string map, an int->unicode-string map and an int->int map.
    """

    def test_decode_with_string_map(self):
        """A unicode-string map: each byte value indexes into the string."""
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        # Byte 0x02 falls outside the two-character map -> decode error.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        # U+FFFE in the map marks an undefined mapping -> decode error.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe"
        )

        # With "replace", unmapped bytes become U+FFFD ...
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        # ... and with "ignore" they are silently dropped.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        # Empty map + "ignore": every byte is consumed, nothing is produced.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        """A dict map to unicode strings (may be empty or multi-char)."""
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        # Multi-character replacement strings are concatenated.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        # Astral (non-BMP) targets are allowed.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        # An empty string target drops the byte without error.
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        # A byte missing from the map is a decode error in strict mode.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b'}
        )

        # None explicitly marks an undefined mapping.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: u'a', 1: u'b', 2: u'\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )

        # Empty dict + "ignore": all bytes consumed, nothing produced.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        """A dict map from byte values to Unicode code points (ints)."""
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        # A target beyond U+10FFFF is a TypeError, not a decode error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b},
        )

        # 0xFFFE marks an undefined mapping, like u'\ufffe' above.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, "\x00\x01\x02", "strict",
            {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode("\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )
1811 |
|
---|
1812 |
|
---|
class WithStmtTest(unittest.TestCase):
    """Check that codec wrapper objects work as context managers."""

    def test_encodedfile(self):
        """EncodedFile transcodes UTF-8 input to latin-1 inside ``with``."""
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), "\xfc")

    def test_streamreaderwriter(self):
        """StreamReaderWriter decodes UTF-8 input inside ``with``."""
        raw = StringIO.StringIO("\xc3\xbc")
        info = codecs.lookup("utf-8")
        srw = codecs.StreamReaderWriter(raw, info.streamreader,
                                        info.streamwriter, 'strict')
        with srw:
            self.assertEqual(srw.read(), u"\xfc")
1825 |
|
---|
1826 |
|
---|
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the unicode-escape codec (codecs.unicode_escape_*)."""

    def test_empty(self):
        """Empty input encodes/decodes to empty output with 0 consumed."""
        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        """Printable ASCII (except the backslash) passes through unescaped."""
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != ord('\\'):
                self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        """Any byte other than the backslash decodes to itself."""
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != ord('\\'):
                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        """Non-printable and non-ASCII characters get escape sequences."""
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        # Whitespace controls use their mnemonic escapes.
        check(u'\t', r'\t')
        check(u'\n', r'\n')
        check(u'\r', r'\r')
        check(u'\\', r'\\')
        # Other C0 controls and bytes >= 127 become \xNN escapes.
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(unichr(b), '\\x%02x' % b)
        for b in range(127, 256):
            check(unichr(b), '\\x%02x' % b)
        # BMP and astral characters use \uNNNN / \UNNNNNNNN.
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """All recognized escape forms decode to the expected characters."""
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        # Backslash-newline is a line continuation: both are swallowed.
        check("[\\\n]", u"[]")
        check(r'[\"]', u'["]')
        check(r"[\']", u"[']")
        check(r"[\\]", ur"[\]")
        check(r"[\a]", u"[\x07]")
        check(r"[\b]", u"[\x08]")
        check(r"[\t]", u"[\x09]")
        check(r"[\n]", u"[\x0a]")
        check(r"[\v]", u"[\x0b]")
        check(r"[\f]", u"[\x0c]")
        check(r"[\r]", u"[\x0d]")
        # Octal escapes take at most three octal digits; any following
        # character (e.g. '8') is literal.
        check(r"[\7]", u"[\x07]")
        check(r"[\8]", ur"[\8]")
        check(r"[\78]", u"[\x078]")
        check(r"[\41]", u"[!]")
        check(r"[\418]", u"[!8]")
        check(r"[\101]", u"[A]")
        check(r"[\1010]", u"[A0]")
        # \x takes exactly two hex digits.
        check(r"[\x41]", u"[A]")
        check(r"[\x410]", u"[A0]")
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")
        # Unrecognized escapes keep both the backslash and the character.
        for b in range(256):
            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
                check('\\' + chr(b), u'\\' + unichr(b))

    def test_decode_errors(self):
        """Truncated or out-of-range escapes trigger the error handlers."""
        decode = codecs.unicode_escape_decode
        # Truncated \x, \u and \U escapes raise in strict mode and are
        # dropped/replaced by the "ignore"/"replace" handlers.
        for c, d in ('x', 2), ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # Code points beyond U+10FFFF are invalid.
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1903 |
|
---|
1904 |
|
---|
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for the raw-unicode-escape codec: only the \\uNNNN and
    \\UNNNNNNNN escapes are significant; every other backslash is literal.
    """

    def test_empty(self):
        """Empty input encodes/decodes to empty output with 0 consumed."""
        self.assertEqual(codecs.raw_unicode_escape_encode(u""), ("", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(""), (u"", 0))

    def test_raw_encode(self):
        """Every character in the 0-255 range passes through unescaped."""
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(unichr(b)), (chr(b), 1))

    def test_raw_decode(self):
        """Any single byte decodes to itself."""
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))

    def test_escape_encode(self):
        """Backslash pairs other than \\u/\\U are emitted as-is."""
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        for b in range(256):
            if chr(b) not in 'uU':
                check(u'\\' + unichr(b), '\\' + chr(b))
        # Characters outside latin-1 use \uNNNN / \UNNNNNNNN.
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """Backslash pairs other than \\u/\\U decode literally."""
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if chr(b) not in 'uU':
                check('\\' + chr(b), u'\\' + unichr(b))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        """Truncated or out-of-range \\u/\\U escapes trigger error handlers."""
        decode = codecs.raw_unicode_escape_decode
        # Truncated escapes raise in strict mode and are dropped/replaced
        # by the "ignore"/"replace" handlers.
        for c, d in ('u', 4), ('U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  "\\" + c + "0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[\\" + c + "0"*i + "]")
                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        # Code points beyond U+10FFFF are invalid.
        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
1953 |
|
---|
1954 |
|
---|
class BomTest(unittest.TestCase):
    """BOM handling of UTF-16/32 codec files across seek() operations.

    NOTE(review): each sub-check depends on the exact write/seek order
    shown; reordering statements would test something different.
    """

    def test_seek0(self):
        data = u"1234567890"
        # All encodings exercised here; the BOM-writing variants are the
        # interesting cases, the -le/-be forms act as controls.
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
2010 |
|
---|
2011 |
|
---|
def test_main():
    """Run every TestCase in this module via regrtest's helper."""
    test_cases = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
    )
    test_support.run_unittest(*test_cases)
2044 |
|
---|
2045 |
|
---|
if __name__ == "__main__":
    # Allow running this test file directly from the command line.
    test_main()