/* ------------------------------------------------------------------------

   _codecs -- Provides access to the codec registry and the builtin
              codecs.

   This module should never be imported directly. The standard library
   module "codecs" wraps this builtin module for use within Python.

   The codec registry is accessible via:

     register(search_function) -> None

     lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)

   The builtin Unicode codecs use the following interface:

     <encoding>_encode(Unicode_object[,errors='strict']) ->
         (string object, bytes consumed)

     <encoding>_decode(char_buffer_obj[,errors='strict']) ->
        (Unicode object, bytes consumed)

   <encoding>_encode() interfaces also accept non-Unicode objects as
   input. The objects are then converted to Unicode using
   PyUnicode_FromObject() prior to applying the conversion.

   These <encoding>s are available: utf_8, unicode_escape,
   raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
   mbcs (on win32).


Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
|
---|
37 |
|
---|
38 | #define PY_SSIZE_T_CLEAN
|
---|
39 | #include "Python.h"
|
---|
40 |
|
---|
41 | /* --- Registry ----------------------------------------------------------- */
|
---|
42 |
|
---|
43 | PyDoc_STRVAR(register__doc__,
|
---|
44 | "register(search_function)\n\
|
---|
45 | \n\
|
---|
46 | Register a codec search function. Search functions are expected to take\n\
|
---|
47 | one argument, the encoding name in all lower case letters, and return\n\
|
---|
48 | a tuple of functions (encoder, decoder, stream_reader, stream_writer).");
|
---|
49 |
|
---|
50 | static
|
---|
51 | PyObject *codec_register(PyObject *self, PyObject *search_function)
|
---|
52 | {
|
---|
53 | if (PyCodec_Register(search_function))
|
---|
54 | return NULL;
|
---|
55 |
|
---|
56 | Py_RETURN_NONE;
|
---|
57 | }
|
---|
58 |
|
---|
59 | PyDoc_STRVAR(lookup__doc__,
|
---|
60 | "lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)\n\
|
---|
61 | \n\
|
---|
62 | Looks up a codec tuple in the Python codec registry and returns\n\
|
---|
63 | a tuple of functions.");
|
---|
64 |
|
---|
65 | static
|
---|
66 | PyObject *codec_lookup(PyObject *self, PyObject *args)
|
---|
67 | {
|
---|
68 | char *encoding;
|
---|
69 |
|
---|
70 | if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
|
---|
71 | return NULL;
|
---|
72 |
|
---|
73 | return _PyCodec_Lookup(encoding);
|
---|
74 | }
|
---|
75 |
|
---|
76 | PyDoc_STRVAR(encode__doc__,
|
---|
77 | "encode(obj, [encoding[,errors]]) -> object\n\
|
---|
78 | \n\
|
---|
79 | Encodes obj using the codec registered for encoding. encoding defaults\n\
|
---|
80 | to the default encoding. errors may be given to set a different error\n\
|
---|
81 | handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
---|
82 | a ValueError. Other possible values are 'ignore', 'replace' and\n\
|
---|
83 | 'xmlcharrefreplace' as well as any other name registered with\n\
|
---|
84 | codecs.register_error that can handle ValueErrors.");
|
---|
85 |
|
---|
86 | static PyObject *
|
---|
87 | codec_encode(PyObject *self, PyObject *args)
|
---|
88 | {
|
---|
89 | const char *encoding = NULL;
|
---|
90 | const char *errors = NULL;
|
---|
91 | PyObject *v;
|
---|
92 |
|
---|
93 | if (!PyArg_ParseTuple(args, "O|ss:encode", &v, &encoding, &errors))
|
---|
94 | return NULL;
|
---|
95 |
|
---|
96 | #ifdef Py_USING_UNICODE
|
---|
97 | if (encoding == NULL)
|
---|
98 | encoding = PyUnicode_GetDefaultEncoding();
|
---|
99 | #else
|
---|
100 | if (encoding == NULL) {
|
---|
101 | PyErr_SetString(PyExc_ValueError, "no encoding specified");
|
---|
102 | return NULL;
|
---|
103 | }
|
---|
104 | #endif
|
---|
105 |
|
---|
106 | /* Encode via the codec registry */
|
---|
107 | return PyCodec_Encode(v, encoding, errors);
|
---|
108 | }
|
---|
109 |
|
---|
110 | PyDoc_STRVAR(decode__doc__,
|
---|
111 | "decode(obj, [encoding[,errors]]) -> object\n\
|
---|
112 | \n\
|
---|
113 | Decodes obj using the codec registered for encoding. encoding defaults\n\
|
---|
114 | to the default encoding. errors may be given to set a different error\n\
|
---|
115 | handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
---|
116 | a ValueError. Other possible values are 'ignore' and 'replace'\n\
|
---|
117 | as well as any other name registerd with codecs.register_error that is\n\
|
---|
118 | able to handle ValueErrors.");
|
---|
119 |
|
---|
120 | static PyObject *
|
---|
121 | codec_decode(PyObject *self, PyObject *args)
|
---|
122 | {
|
---|
123 | const char *encoding = NULL;
|
---|
124 | const char *errors = NULL;
|
---|
125 | PyObject *v;
|
---|
126 |
|
---|
127 | if (!PyArg_ParseTuple(args, "O|ss:decode", &v, &encoding, &errors))
|
---|
128 | return NULL;
|
---|
129 |
|
---|
130 | #ifdef Py_USING_UNICODE
|
---|
131 | if (encoding == NULL)
|
---|
132 | encoding = PyUnicode_GetDefaultEncoding();
|
---|
133 | #else
|
---|
134 | if (encoding == NULL) {
|
---|
135 | PyErr_SetString(PyExc_ValueError, "no encoding specified");
|
---|
136 | return NULL;
|
---|
137 | }
|
---|
138 | #endif
|
---|
139 |
|
---|
140 | /* Decode via the codec registry */
|
---|
141 | return PyCodec_Decode(v, encoding, errors);
|
---|
142 | }
|
---|
143 |
|
---|
144 | /* --- Helpers ------------------------------------------------------------ */
|
---|
145 |
|
---|
146 | static
|
---|
147 | PyObject *codec_tuple(PyObject *unicode,
|
---|
148 | Py_ssize_t len)
|
---|
149 | {
|
---|
150 | PyObject *v;
|
---|
151 | if (unicode == NULL)
|
---|
152 | return NULL;
|
---|
153 | v = Py_BuildValue("On", unicode, len);
|
---|
154 | Py_DECREF(unicode);
|
---|
155 | return v;
|
---|
156 | }
|
---|
157 |
|
---|
158 | /* --- String codecs ------------------------------------------------------ */
|
---|
159 | static PyObject *
|
---|
160 | escape_decode(PyObject *self,
|
---|
161 | PyObject *args)
|
---|
162 | {
|
---|
163 | const char *errors = NULL;
|
---|
164 | const char *data;
|
---|
165 | Py_ssize_t size;
|
---|
166 |
|
---|
167 | if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
|
---|
168 | &data, &size, &errors))
|
---|
169 | return NULL;
|
---|
170 | return codec_tuple(PyString_DecodeEscape(data, size, errors, 0, NULL),
|
---|
171 | size);
|
---|
172 | }
|
---|
173 |
|
---|
174 | static PyObject *
|
---|
175 | escape_encode(PyObject *self,
|
---|
176 | PyObject *args)
|
---|
177 | {
|
---|
178 | PyObject *str;
|
---|
179 | const char *errors = NULL;
|
---|
180 | char *buf;
|
---|
181 | Py_ssize_t len;
|
---|
182 |
|
---|
183 | if (!PyArg_ParseTuple(args, "O!|z:escape_encode",
|
---|
184 | &PyString_Type, &str, &errors))
|
---|
185 | return NULL;
|
---|
186 |
|
---|
187 | str = PyString_Repr(str, 0);
|
---|
188 | if (!str)
|
---|
189 | return NULL;
|
---|
190 |
|
---|
191 | /* The string will be quoted. Unquote, similar to unicode-escape. */
|
---|
192 | buf = PyString_AS_STRING (str);
|
---|
193 | len = PyString_GET_SIZE (str);
|
---|
194 | memmove(buf, buf+1, len-2);
|
---|
195 | if (_PyString_Resize(&str, len-2) < 0)
|
---|
196 | return NULL;
|
---|
197 |
|
---|
198 | return codec_tuple(str, PyString_Size(str));
|
---|
199 | }
|
---|
200 |
|
---|
201 | #ifdef Py_USING_UNICODE
|
---|
202 | /* --- Decoder ------------------------------------------------------------ */
|
---|
203 |
|
---|
204 | static PyObject *
|
---|
205 | unicode_internal_decode(PyObject *self,
|
---|
206 | PyObject *args)
|
---|
207 | {
|
---|
208 | PyObject *obj;
|
---|
209 | const char *errors = NULL;
|
---|
210 | const char *data;
|
---|
211 | Py_ssize_t size;
|
---|
212 |
|
---|
213 | if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
|
---|
214 | &obj, &errors))
|
---|
215 | return NULL;
|
---|
216 |
|
---|
217 | if (PyUnicode_Check(obj)) {
|
---|
218 | Py_INCREF(obj);
|
---|
219 | return codec_tuple(obj, PyUnicode_GET_SIZE(obj));
|
---|
220 | }
|
---|
221 | else {
|
---|
222 | if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
|
---|
223 | return NULL;
|
---|
224 |
|
---|
225 | return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
|
---|
226 | size);
|
---|
227 | }
|
---|
228 | }
|
---|
229 |
|
---|
230 | static PyObject *
|
---|
231 | utf_7_decode(PyObject *self,
|
---|
232 | PyObject *args)
|
---|
233 | {
|
---|
234 | const char *data;
|
---|
235 | Py_ssize_t size;
|
---|
236 | const char *errors = NULL;
|
---|
237 |
|
---|
238 | if (!PyArg_ParseTuple(args, "t#|z:utf_7_decode",
|
---|
239 | &data, &size, &errors))
|
---|
240 | return NULL;
|
---|
241 |
|
---|
242 | return codec_tuple(PyUnicode_DecodeUTF7(data, size, errors),
|
---|
243 | size);
|
---|
244 | }
|
---|
245 |
|
---|
246 | static PyObject *
|
---|
247 | utf_8_decode(PyObject *self,
|
---|
248 | PyObject *args)
|
---|
249 | {
|
---|
250 | const char *data;
|
---|
251 | Py_ssize_t size;
|
---|
252 | const char *errors = NULL;
|
---|
253 | int final = 0;
|
---|
254 | Py_ssize_t consumed;
|
---|
255 | PyObject *decoded = NULL;
|
---|
256 |
|
---|
257 | if (!PyArg_ParseTuple(args, "t#|zi:utf_8_decode",
|
---|
258 | &data, &size, &errors, &final))
|
---|
259 | return NULL;
|
---|
260 | if (size < 0) {
|
---|
261 | PyErr_SetString(PyExc_ValueError, "negative argument");
|
---|
262 | return 0;
|
---|
263 | }
|
---|
264 | consumed = size;
|
---|
265 |
|
---|
266 | decoded = PyUnicode_DecodeUTF8Stateful(data, size, errors,
|
---|
267 | final ? NULL : &consumed);
|
---|
268 | if (decoded == NULL)
|
---|
269 | return NULL;
|
---|
270 | return codec_tuple(decoded, consumed);
|
---|
271 | }
|
---|
272 |
|
---|
273 | static PyObject *
|
---|
274 | utf_16_decode(PyObject *self,
|
---|
275 | PyObject *args)
|
---|
276 | {
|
---|
277 | const char *data;
|
---|
278 | Py_ssize_t size;
|
---|
279 | const char *errors = NULL;
|
---|
280 | int byteorder = 0;
|
---|
281 | int final = 0;
|
---|
282 | Py_ssize_t consumed;
|
---|
283 | PyObject *decoded;
|
---|
284 |
|
---|
285 | if (!PyArg_ParseTuple(args, "t#|zi:utf_16_decode",
|
---|
286 | &data, &size, &errors, &final))
|
---|
287 | return NULL;
|
---|
288 | if (size < 0) {
|
---|
289 | PyErr_SetString(PyExc_ValueError, "negative argument");
|
---|
290 | return 0;
|
---|
291 | }
|
---|
292 | consumed = size; /* This is overwritten unless final is true. */
|
---|
293 | decoded = PyUnicode_DecodeUTF16Stateful(data, size, errors, &byteorder,
|
---|
294 | final ? NULL : &consumed);
|
---|
295 | if (decoded == NULL)
|
---|
296 | return NULL;
|
---|
297 | return codec_tuple(decoded, consumed);
|
---|
298 | }
|
---|
299 |
|
---|
300 | static PyObject *
|
---|
301 | utf_16_le_decode(PyObject *self,
|
---|
302 | PyObject *args)
|
---|
303 | {
|
---|
304 | const char *data;
|
---|
305 | Py_ssize_t size;
|
---|
306 | const char *errors = NULL;
|
---|
307 | int byteorder = -1;
|
---|
308 | int final = 0;
|
---|
309 | Py_ssize_t consumed;
|
---|
310 | PyObject *decoded = NULL;
|
---|
311 |
|
---|
312 | if (!PyArg_ParseTuple(args, "t#|zi:utf_16_le_decode",
|
---|
313 | &data, &size, &errors, &final))
|
---|
314 | return NULL;
|
---|
315 |
|
---|
316 | if (size < 0) {
|
---|
317 | PyErr_SetString(PyExc_ValueError, "negative argument");
|
---|
318 | return 0;
|
---|
319 | }
|
---|
320 | consumed = size; /* This is overwritten unless final is true. */
|
---|
321 | decoded = PyUnicode_DecodeUTF16Stateful(data, size, errors,
|
---|
322 | &byteorder, final ? NULL : &consumed);
|
---|
323 | if (decoded == NULL)
|
---|
324 | return NULL;
|
---|
325 | return codec_tuple(decoded, consumed);
|
---|
326 |
|
---|
327 | }
|
---|
328 |
|
---|
329 | static PyObject *
|
---|
330 | utf_16_be_decode(PyObject *self,
|
---|
331 | PyObject *args)
|
---|
332 | {
|
---|
333 | const char *data;
|
---|
334 | Py_ssize_t size;
|
---|
335 | const char *errors = NULL;
|
---|
336 | int byteorder = 1;
|
---|
337 | int final = 0;
|
---|
338 | Py_ssize_t consumed;
|
---|
339 | PyObject *decoded = NULL;
|
---|
340 |
|
---|
341 | if (!PyArg_ParseTuple(args, "t#|zi:utf_16_be_decode",
|
---|
342 | &data, &size, &errors, &final))
|
---|
343 | return NULL;
|
---|
344 | if (size < 0) {
|
---|
345 | PyErr_SetString(PyExc_ValueError, "negative argument");
|
---|
346 | return 0;
|
---|
347 | }
|
---|
348 | consumed = size; /* This is overwritten unless final is true. */
|
---|
349 | decoded = PyUnicode_DecodeUTF16Stateful(data, size, errors,
|
---|
350 | &byteorder, final ? NULL : &consumed);
|
---|
351 | if (decoded == NULL)
|
---|
352 | return NULL;
|
---|
353 | return codec_tuple(decoded, consumed);
|
---|
354 | }
|
---|
355 |
|
---|
356 | /* This non-standard version also provides access to the byteorder
|
---|
357 | parameter of the builtin UTF-16 codec.
|
---|
358 |
|
---|
359 | It returns a tuple (unicode, bytesread, byteorder) with byteorder
|
---|
360 | being the value in effect at the end of data.
|
---|
361 |
|
---|
362 | */
|
---|
363 |
|
---|
364 | static PyObject *
|
---|
365 | utf_16_ex_decode(PyObject *self,
|
---|
366 | PyObject *args)
|
---|
367 | {
|
---|
368 | const char *data;
|
---|
369 | Py_ssize_t size;
|
---|
370 | const char *errors = NULL;
|
---|
371 | int byteorder = 0;
|
---|
372 | PyObject *unicode, *tuple;
|
---|
373 | int final = 0;
|
---|
374 | Py_ssize_t consumed;
|
---|
375 |
|
---|
376 | if (!PyArg_ParseTuple(args, "t#|zii:utf_16_ex_decode",
|
---|
377 | &data, &size, &errors, &byteorder, &final))
|
---|
378 | return NULL;
|
---|
379 | if (size < 0) {
|
---|
380 | PyErr_SetString(PyExc_ValueError, "negative argument");
|
---|
381 | return 0;
|
---|
382 | }
|
---|
383 | consumed = size; /* This is overwritten unless final is true. */
|
---|
384 | unicode = PyUnicode_DecodeUTF16Stateful(data, size, errors, &byteorder,
|
---|
385 | final ? NULL : &consumed);
|
---|
386 | if (unicode == NULL)
|
---|
387 | return NULL;
|
---|
388 | tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
|
---|
389 | Py_DECREF(unicode);
|
---|
390 | return tuple;
|
---|
391 | }
|
---|
392 |
|
---|
393 | static PyObject *
|
---|
394 | unicode_escape_decode(PyObject *self,
|
---|
395 | PyObject *args)
|
---|
396 | {
|
---|
397 | const char *data;
|
---|
398 | Py_ssize_t size;
|
---|
399 | const char *errors = NULL;
|
---|
400 |
|
---|
401 | if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode",
|
---|
402 | &data, &size, &errors))
|
---|
403 | return NULL;
|
---|
404 |
|
---|
405 | return codec_tuple(PyUnicode_DecodeUnicodeEscape(data, size, errors),
|
---|
406 | size);
|
---|
407 | }
|
---|
408 |
|
---|
409 | static PyObject *
|
---|
410 | raw_unicode_escape_decode(PyObject *self,
|
---|
411 | PyObject *args)
|
---|
412 | {
|
---|
413 | const char *data;
|
---|
414 | Py_ssize_t size;
|
---|
415 | const char *errors = NULL;
|
---|
416 |
|
---|
417 | if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode",
|
---|
418 | &data, &size, &errors))
|
---|
419 | return NULL;
|
---|
420 |
|
---|
421 | return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data, size, errors),
|
---|
422 | size);
|
---|
423 | }
|
---|
424 |
|
---|
425 | static PyObject *
|
---|
426 | latin_1_decode(PyObject *self,
|
---|
427 | PyObject *args)
|
---|
428 | {
|
---|
429 | const char *data;
|
---|
430 | Py_ssize_t size;
|
---|
431 | const char *errors = NULL;
|
---|
432 |
|
---|
433 | if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode",
|
---|
434 | &data, &size, &errors))
|
---|
435 | return NULL;
|
---|
436 |
|
---|
437 | return codec_tuple(PyUnicode_DecodeLatin1(data, size, errors),
|
---|
438 | size);
|
---|
439 | }
|
---|
440 |
|
---|
441 | static PyObject *
|
---|
442 | ascii_decode(PyObject *self,
|
---|
443 | PyObject *args)
|
---|
444 | {
|
---|
445 | const char *data;
|
---|
446 | Py_ssize_t size;
|
---|
447 | const char *errors = NULL;
|
---|
448 |
|
---|
449 | if (!PyArg_ParseTuple(args, "t#|z:ascii_decode",
|
---|
450 | &data, &size, &errors))
|
---|
451 | return NULL;
|
---|
452 |
|
---|
453 | return codec_tuple(PyUnicode_DecodeASCII(data, size, errors),
|
---|
454 | size);
|
---|
455 | }
|
---|
456 |
|
---|
457 | static PyObject *
|
---|
458 | charmap_decode(PyObject *self,
|
---|
459 | PyObject *args)
|
---|
460 | {
|
---|
461 | const char *data;
|
---|
462 | Py_ssize_t size;
|
---|
463 | const char *errors = NULL;
|
---|
464 | PyObject *mapping = NULL;
|
---|
465 |
|
---|
466 | if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode",
|
---|
467 | &data, &size, &errors, &mapping))
|
---|
468 | return NULL;
|
---|
469 | if (mapping == Py_None)
|
---|
470 | mapping = NULL;
|
---|
471 |
|
---|
472 | return codec_tuple(PyUnicode_DecodeCharmap(data, size, mapping, errors),
|
---|
473 | size);
|
---|
474 | }
|
---|
475 |
|
---|
476 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
|
---|
477 |
|
---|
478 | static PyObject *
|
---|
479 | mbcs_decode(PyObject *self,
|
---|
480 | PyObject *args)
|
---|
481 | {
|
---|
482 | const char *data;
|
---|
483 | Py_ssize_t size, consumed;
|
---|
484 | const char *errors = NULL;
|
---|
485 | int final = 0;
|
---|
486 | PyObject *decoded;
|
---|
487 |
|
---|
488 | if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
|
---|
489 | &data, &size, &errors, &final))
|
---|
490 | return NULL;
|
---|
491 |
|
---|
492 | decoded = PyUnicode_DecodeMBCSStateful(
|
---|
493 | data, size, errors, final ? NULL : &consumed);
|
---|
494 | if (!decoded)
|
---|
495 | return NULL;
|
---|
496 | return codec_tuple(decoded, final ? size : consumed);
|
---|
497 | }
|
---|
498 |
|
---|
499 | #endif /* MS_WINDOWS */
|
---|
500 |
|
---|
501 | /* --- Encoder ------------------------------------------------------------ */
|
---|
502 |
|
---|
503 | static PyObject *
|
---|
504 | readbuffer_encode(PyObject *self,
|
---|
505 | PyObject *args)
|
---|
506 | {
|
---|
507 | const char *data;
|
---|
508 | Py_ssize_t size;
|
---|
509 | const char *errors = NULL;
|
---|
510 |
|
---|
511 | if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
|
---|
512 | &data, &size, &errors))
|
---|
513 | return NULL;
|
---|
514 |
|
---|
515 | return codec_tuple(PyString_FromStringAndSize(data, size),
|
---|
516 | size);
|
---|
517 | }
|
---|
518 |
|
---|
519 | static PyObject *
|
---|
520 | charbuffer_encode(PyObject *self,
|
---|
521 | PyObject *args)
|
---|
522 | {
|
---|
523 | const char *data;
|
---|
524 | Py_ssize_t size;
|
---|
525 | const char *errors = NULL;
|
---|
526 |
|
---|
527 | if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
|
---|
528 | &data, &size, &errors))
|
---|
529 | return NULL;
|
---|
530 |
|
---|
531 | return codec_tuple(PyString_FromStringAndSize(data, size),
|
---|
532 | size);
|
---|
533 | }
|
---|
534 |
|
---|
535 | static PyObject *
|
---|
536 | unicode_internal_encode(PyObject *self,
|
---|
537 | PyObject *args)
|
---|
538 | {
|
---|
539 | PyObject *obj;
|
---|
540 | const char *errors = NULL;
|
---|
541 | const char *data;
|
---|
542 | Py_ssize_t size;
|
---|
543 |
|
---|
544 | if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
|
---|
545 | &obj, &errors))
|
---|
546 | return NULL;
|
---|
547 |
|
---|
548 | if (PyUnicode_Check(obj)) {
|
---|
549 | data = PyUnicode_AS_DATA(obj);
|
---|
550 | size = PyUnicode_GET_DATA_SIZE(obj);
|
---|
551 | return codec_tuple(PyString_FromStringAndSize(data, size),
|
---|
552 | size);
|
---|
553 | }
|
---|
554 | else {
|
---|
555 | if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
|
---|
556 | return NULL;
|
---|
557 | return codec_tuple(PyString_FromStringAndSize(data, size),
|
---|
558 | size);
|
---|
559 | }
|
---|
560 | }
|
---|
561 |
|
---|
562 | static PyObject *
|
---|
563 | utf_7_encode(PyObject *self,
|
---|
564 | PyObject *args)
|
---|
565 | {
|
---|
566 | PyObject *str, *v;
|
---|
567 | const char *errors = NULL;
|
---|
568 |
|
---|
569 | if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
|
---|
570 | &str, &errors))
|
---|
571 | return NULL;
|
---|
572 |
|
---|
573 | str = PyUnicode_FromObject(str);
|
---|
574 | if (str == NULL)
|
---|
575 | return NULL;
|
---|
576 | v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
|
---|
577 | PyUnicode_GET_SIZE(str),
|
---|
578 | 0,
|
---|
579 | 0,
|
---|
580 | errors),
|
---|
581 | PyUnicode_GET_SIZE(str));
|
---|
582 | Py_DECREF(str);
|
---|
583 | return v;
|
---|
584 | }
|
---|
585 |
|
---|
586 | static PyObject *
|
---|
587 | utf_8_encode(PyObject *self,
|
---|
588 | PyObject *args)
|
---|
589 | {
|
---|
590 | PyObject *str, *v;
|
---|
591 | const char *errors = NULL;
|
---|
592 |
|
---|
593 | if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
|
---|
594 | &str, &errors))
|
---|
595 | return NULL;
|
---|
596 |
|
---|
597 | str = PyUnicode_FromObject(str);
|
---|
598 | if (str == NULL)
|
---|
599 | return NULL;
|
---|
600 | v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
|
---|
601 | PyUnicode_GET_SIZE(str),
|
---|
602 | errors),
|
---|
603 | PyUnicode_GET_SIZE(str));
|
---|
604 | Py_DECREF(str);
|
---|
605 | return v;
|
---|
606 | }
|
---|
607 |
|
---|
608 | /* This version provides access to the byteorder parameter of the
|
---|
609 | builtin UTF-16 codecs as optional third argument. It defaults to 0
|
---|
610 | which means: use the native byte order and prepend the data with a
|
---|
611 | BOM mark.
|
---|
612 |
|
---|
613 | */
|
---|
614 |
|
---|
615 | static PyObject *
|
---|
616 | utf_16_encode(PyObject *self,
|
---|
617 | PyObject *args)
|
---|
618 | {
|
---|
619 | PyObject *str, *v;
|
---|
620 | const char *errors = NULL;
|
---|
621 | int byteorder = 0;
|
---|
622 |
|
---|
623 | if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
|
---|
624 | &str, &errors, &byteorder))
|
---|
625 | return NULL;
|
---|
626 |
|
---|
627 | str = PyUnicode_FromObject(str);
|
---|
628 | if (str == NULL)
|
---|
629 | return NULL;
|
---|
630 | v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
|
---|
631 | PyUnicode_GET_SIZE(str),
|
---|
632 | errors,
|
---|
633 | byteorder),
|
---|
634 | PyUnicode_GET_SIZE(str));
|
---|
635 | Py_DECREF(str);
|
---|
636 | return v;
|
---|
637 | }
|
---|
638 |
|
---|
639 | static PyObject *
|
---|
640 | utf_16_le_encode(PyObject *self,
|
---|
641 | PyObject *args)
|
---|
642 | {
|
---|
643 | PyObject *str, *v;
|
---|
644 | const char *errors = NULL;
|
---|
645 |
|
---|
646 | if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
|
---|
647 | &str, &errors))
|
---|
648 | return NULL;
|
---|
649 |
|
---|
650 | str = PyUnicode_FromObject(str);
|
---|
651 | if (str == NULL)
|
---|
652 | return NULL;
|
---|
653 | v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
|
---|
654 | PyUnicode_GET_SIZE(str),
|
---|
655 | errors,
|
---|
656 | -1),
|
---|
657 | PyUnicode_GET_SIZE(str));
|
---|
658 | Py_DECREF(str);
|
---|
659 | return v;
|
---|
660 | }
|
---|
661 |
|
---|
662 | static PyObject *
|
---|
663 | utf_16_be_encode(PyObject *self,
|
---|
664 | PyObject *args)
|
---|
665 | {
|
---|
666 | PyObject *str, *v;
|
---|
667 | const char *errors = NULL;
|
---|
668 |
|
---|
669 | if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
|
---|
670 | &str, &errors))
|
---|
671 | return NULL;
|
---|
672 |
|
---|
673 | str = PyUnicode_FromObject(str);
|
---|
674 | if (str == NULL)
|
---|
675 | return NULL;
|
---|
676 | v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
|
---|
677 | PyUnicode_GET_SIZE(str),
|
---|
678 | errors,
|
---|
679 | +1),
|
---|
680 | PyUnicode_GET_SIZE(str));
|
---|
681 | Py_DECREF(str);
|
---|
682 | return v;
|
---|
683 | }
|
---|
684 |
|
---|
685 | static PyObject *
|
---|
686 | unicode_escape_encode(PyObject *self,
|
---|
687 | PyObject *args)
|
---|
688 | {
|
---|
689 | PyObject *str, *v;
|
---|
690 | const char *errors = NULL;
|
---|
691 |
|
---|
692 | if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
|
---|
693 | &str, &errors))
|
---|
694 | return NULL;
|
---|
695 |
|
---|
696 | str = PyUnicode_FromObject(str);
|
---|
697 | if (str == NULL)
|
---|
698 | return NULL;
|
---|
699 | v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str),
|
---|
700 | PyUnicode_GET_SIZE(str)),
|
---|
701 | PyUnicode_GET_SIZE(str));
|
---|
702 | Py_DECREF(str);
|
---|
703 | return v;
|
---|
704 | }
|
---|
705 |
|
---|
706 | static PyObject *
|
---|
707 | raw_unicode_escape_encode(PyObject *self,
|
---|
708 | PyObject *args)
|
---|
709 | {
|
---|
710 | PyObject *str, *v;
|
---|
711 | const char *errors = NULL;
|
---|
712 |
|
---|
713 | if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
|
---|
714 | &str, &errors))
|
---|
715 | return NULL;
|
---|
716 |
|
---|
717 | str = PyUnicode_FromObject(str);
|
---|
718 | if (str == NULL)
|
---|
719 | return NULL;
|
---|
720 | v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
|
---|
721 | PyUnicode_AS_UNICODE(str),
|
---|
722 | PyUnicode_GET_SIZE(str)),
|
---|
723 | PyUnicode_GET_SIZE(str));
|
---|
724 | Py_DECREF(str);
|
---|
725 | return v;
|
---|
726 | }
|
---|
727 |
|
---|
728 | static PyObject *
|
---|
729 | latin_1_encode(PyObject *self,
|
---|
730 | PyObject *args)
|
---|
731 | {
|
---|
732 | PyObject *str, *v;
|
---|
733 | const char *errors = NULL;
|
---|
734 |
|
---|
735 | if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
|
---|
736 | &str, &errors))
|
---|
737 | return NULL;
|
---|
738 |
|
---|
739 | str = PyUnicode_FromObject(str);
|
---|
740 | if (str == NULL)
|
---|
741 | return NULL;
|
---|
742 | v = codec_tuple(PyUnicode_EncodeLatin1(
|
---|
743 | PyUnicode_AS_UNICODE(str),
|
---|
744 | PyUnicode_GET_SIZE(str),
|
---|
745 | errors),
|
---|
746 | PyUnicode_GET_SIZE(str));
|
---|
747 | Py_DECREF(str);
|
---|
748 | return v;
|
---|
749 | }
|
---|
750 |
|
---|
751 | static PyObject *
|
---|
752 | ascii_encode(PyObject *self,
|
---|
753 | PyObject *args)
|
---|
754 | {
|
---|
755 | PyObject *str, *v;
|
---|
756 | const char *errors = NULL;
|
---|
757 |
|
---|
758 | if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
|
---|
759 | &str, &errors))
|
---|
760 | return NULL;
|
---|
761 |
|
---|
762 | str = PyUnicode_FromObject(str);
|
---|
763 | if (str == NULL)
|
---|
764 | return NULL;
|
---|
765 | v = codec_tuple(PyUnicode_EncodeASCII(
|
---|
766 | PyUnicode_AS_UNICODE(str),
|
---|
767 | PyUnicode_GET_SIZE(str),
|
---|
768 | errors),
|
---|
769 | PyUnicode_GET_SIZE(str));
|
---|
770 | Py_DECREF(str);
|
---|
771 | return v;
|
---|
772 | }
|
---|
773 |
|
---|
774 | static PyObject *
|
---|
775 | charmap_encode(PyObject *self,
|
---|
776 | PyObject *args)
|
---|
777 | {
|
---|
778 | PyObject *str, *v;
|
---|
779 | const char *errors = NULL;
|
---|
780 | PyObject *mapping = NULL;
|
---|
781 |
|
---|
782 | if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
|
---|
783 | &str, &errors, &mapping))
|
---|
784 | return NULL;
|
---|
785 | if (mapping == Py_None)
|
---|
786 | mapping = NULL;
|
---|
787 |
|
---|
788 | str = PyUnicode_FromObject(str);
|
---|
789 | if (str == NULL)
|
---|
790 | return NULL;
|
---|
791 | v = codec_tuple(PyUnicode_EncodeCharmap(
|
---|
792 | PyUnicode_AS_UNICODE(str),
|
---|
793 | PyUnicode_GET_SIZE(str),
|
---|
794 | mapping,
|
---|
795 | errors),
|
---|
796 | PyUnicode_GET_SIZE(str));
|
---|
797 | Py_DECREF(str);
|
---|
798 | return v;
|
---|
799 | }
|
---|
800 |
|
---|
801 | static PyObject*
|
---|
802 | charmap_build(PyObject *self, PyObject *args)
|
---|
803 | {
|
---|
804 | PyObject *map;
|
---|
805 | if (!PyArg_ParseTuple(args, "U:charmap_build", &map))
|
---|
806 | return NULL;
|
---|
807 | return PyUnicode_BuildEncodingMap(map);
|
---|
808 | }
|
---|
809 |
|
---|
810 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
|
---|
811 |
|
---|
812 | static PyObject *
|
---|
813 | mbcs_encode(PyObject *self,
|
---|
814 | PyObject *args)
|
---|
815 | {
|
---|
816 | PyObject *str, *v;
|
---|
817 | const char *errors = NULL;
|
---|
818 |
|
---|
819 | if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
|
---|
820 | &str, &errors))
|
---|
821 | return NULL;
|
---|
822 |
|
---|
823 | str = PyUnicode_FromObject(str);
|
---|
824 | if (str == NULL)
|
---|
825 | return NULL;
|
---|
826 | v = codec_tuple(PyUnicode_EncodeMBCS(
|
---|
827 | PyUnicode_AS_UNICODE(str),
|
---|
828 | PyUnicode_GET_SIZE(str),
|
---|
829 | errors),
|
---|
830 | PyUnicode_GET_SIZE(str));
|
---|
831 | Py_DECREF(str);
|
---|
832 | return v;
|
---|
833 | }
|
---|
834 |
|
---|
835 | #endif /* MS_WINDOWS */
|
---|
836 | #endif /* Py_USING_UNICODE */
|
---|
837 |
|
---|
838 | /* --- Error handler registry --------------------------------------------- */
|
---|
839 |
|
---|
840 | PyDoc_STRVAR(register_error__doc__,
|
---|
841 | "register_error(errors, handler)\n\
|
---|
842 | \n\
|
---|
843 | Register the specified error handler under the name\n\
|
---|
844 | errors. handler must be a callable object, that\n\
|
---|
845 | will be called with an exception instance containing\n\
|
---|
846 | information about the location of the encoding/decoding\n\
|
---|
847 | error and must return a (replacement, new position) tuple.");
|
---|
848 |
|
---|
849 | static PyObject *register_error(PyObject *self, PyObject *args)
|
---|
850 | {
|
---|
851 | const char *name;
|
---|
852 | PyObject *handler;
|
---|
853 |
|
---|
854 | if (!PyArg_ParseTuple(args, "sO:register_error",
|
---|
855 | &name, &handler))
|
---|
856 | return NULL;
|
---|
857 | if (PyCodec_RegisterError(name, handler))
|
---|
858 | return NULL;
|
---|
859 | Py_RETURN_NONE;
|
---|
860 | }
|
---|
861 |
|
---|
862 | PyDoc_STRVAR(lookup_error__doc__,
|
---|
863 | "lookup_error(errors) -> handler\n\
|
---|
864 | \n\
|
---|
865 | Return the error handler for the specified error handling name\n\
|
---|
866 | or raise a LookupError, if no handler exists under this name.");
|
---|
867 |
|
---|
868 | static PyObject *lookup_error(PyObject *self, PyObject *args)
|
---|
869 | {
|
---|
870 | const char *name;
|
---|
871 |
|
---|
872 | if (!PyArg_ParseTuple(args, "s:lookup_error",
|
---|
873 | &name))
|
---|
874 | return NULL;
|
---|
875 | return PyCodec_LookupError(name);
|
---|
876 | }
|
---|
877 |
|
---|
878 | /* --- Module API --------------------------------------------------------- */
|
---|
879 |
|
---|
880 | static PyMethodDef _codecs_functions[] = {
|
---|
881 | {"register", codec_register, METH_O,
|
---|
882 | register__doc__},
|
---|
883 | {"lookup", codec_lookup, METH_VARARGS,
|
---|
884 | lookup__doc__},
|
---|
885 | {"encode", codec_encode, METH_VARARGS,
|
---|
886 | encode__doc__},
|
---|
887 | {"decode", codec_decode, METH_VARARGS,
|
---|
888 | decode__doc__},
|
---|
889 | {"escape_encode", escape_encode, METH_VARARGS},
|
---|
890 | {"escape_decode", escape_decode, METH_VARARGS},
|
---|
891 | #ifdef Py_USING_UNICODE
|
---|
892 | {"utf_8_encode", utf_8_encode, METH_VARARGS},
|
---|
893 | {"utf_8_decode", utf_8_decode, METH_VARARGS},
|
---|
894 | {"utf_7_encode", utf_7_encode, METH_VARARGS},
|
---|
895 | {"utf_7_decode", utf_7_decode, METH_VARARGS},
|
---|
896 | {"utf_16_encode", utf_16_encode, METH_VARARGS},
|
---|
897 | {"utf_16_le_encode", utf_16_le_encode, METH_VARARGS},
|
---|
898 | {"utf_16_be_encode", utf_16_be_encode, METH_VARARGS},
|
---|
899 | {"utf_16_decode", utf_16_decode, METH_VARARGS},
|
---|
900 | {"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
|
---|
901 | {"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
|
---|
902 | {"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
|
---|
903 | {"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
|
---|
904 | {"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
|
---|
905 | {"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
|
---|
906 | {"unicode_internal_decode", unicode_internal_decode, METH_VARARGS},
|
---|
907 | {"raw_unicode_escape_encode", raw_unicode_escape_encode, METH_VARARGS},
|
---|
908 | {"raw_unicode_escape_decode", raw_unicode_escape_decode, METH_VARARGS},
|
---|
909 | {"latin_1_encode", latin_1_encode, METH_VARARGS},
|
---|
910 | {"latin_1_decode", latin_1_decode, METH_VARARGS},
|
---|
911 | {"ascii_encode", ascii_encode, METH_VARARGS},
|
---|
912 | {"ascii_decode", ascii_decode, METH_VARARGS},
|
---|
913 | {"charmap_encode", charmap_encode, METH_VARARGS},
|
---|
914 | {"charmap_decode", charmap_decode, METH_VARARGS},
|
---|
915 | {"charmap_build", charmap_build, METH_VARARGS},
|
---|
916 | {"readbuffer_encode", readbuffer_encode, METH_VARARGS},
|
---|
917 | {"charbuffer_encode", charbuffer_encode, METH_VARARGS},
|
---|
918 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
|
---|
919 | {"mbcs_encode", mbcs_encode, METH_VARARGS},
|
---|
920 | {"mbcs_decode", mbcs_decode, METH_VARARGS},
|
---|
921 | #endif
|
---|
922 | #endif /* Py_USING_UNICODE */
|
---|
923 | {"register_error", register_error, METH_VARARGS,
|
---|
924 | register_error__doc__},
|
---|
925 | {"lookup_error", lookup_error, METH_VARARGS,
|
---|
926 | lookup_error__doc__},
|
---|
927 | {NULL, NULL} /* sentinel */
|
---|
928 | };
|
---|
929 |
|
---|
930 | PyMODINIT_FUNC
|
---|
931 | init_codecs(void)
|
---|
932 | {
|
---|
933 | Py_InitModule("_codecs", _codecs_functions);
|
---|
934 | }
|
---|