1 | /* ------------------------------------------------------------------------
|
---|
2 |
|
---|
3 | Python Codec Registry and support functions
|
---|
4 |
|
---|
5 | Written by Marc-Andre Lemburg (mal@lemburg.com).
|
---|
6 |
|
---|
7 | Copyright (c) Corporation for National Research Initiatives.
|
---|
8 |
|
---|
9 | ------------------------------------------------------------------------ */
|
---|
10 |
|
---|
11 | #include "Python.h"
|
---|
12 | #include <ctype.h>
|
---|
13 |
|
---|
14 | /* --- Codec Registry ----------------------------------------------------- */
|
---|
15 |
|
---|
16 | /* Import the standard encodings package which will register the first
|
---|
17 | codec search function.
|
---|
18 |
|
---|
19 | This is done in a lazy way so that the Unicode implementation does
|
---|
20 | not downgrade startup time of scripts not needing it.
|
---|
21 |
|
---|
22 | ImportErrors are silently ignored by this function. Only one try is
|
---|
23 | made.
|
---|
24 |
|
---|
25 | */
|
---|
26 |
|
---|
27 | static int _PyCodecRegistry_Init(void); /* Forward */
|
---|
28 |
|
---|
29 | int PyCodec_Register(PyObject *search_function)
|
---|
30 | {
|
---|
31 | PyInterpreterState *interp = PyThreadState_GET()->interp;
|
---|
32 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
---|
33 | goto onError;
|
---|
34 | if (search_function == NULL) {
|
---|
35 | PyErr_BadArgument();
|
---|
36 | goto onError;
|
---|
37 | }
|
---|
38 | if (!PyCallable_Check(search_function)) {
|
---|
39 | PyErr_SetString(PyExc_TypeError, "argument must be callable");
|
---|
40 | goto onError;
|
---|
41 | }
|
---|
42 | return PyList_Append(interp->codec_search_path, search_function);
|
---|
43 |
|
---|
44 | onError:
|
---|
45 | return -1;
|
---|
46 | }
|
---|
47 |
|
---|
48 | /* Convert a string to a normalized Python string: all characters are
|
---|
49 | converted to lower case, spaces are replaced with underscores. */
|
---|
50 |
|
---|
51 | static
|
---|
52 | PyObject *normalizestring(const char *string)
|
---|
53 | {
|
---|
54 | register size_t i;
|
---|
55 | size_t len = strlen(string);
|
---|
56 | char *p;
|
---|
57 | PyObject *v;
|
---|
58 |
|
---|
59 | if (len > PY_SSIZE_T_MAX) {
|
---|
60 | PyErr_SetString(PyExc_OverflowError, "string is too large");
|
---|
61 | return NULL;
|
---|
62 | }
|
---|
63 |
|
---|
64 | v = PyString_FromStringAndSize(NULL, len);
|
---|
65 | if (v == NULL)
|
---|
66 | return NULL;
|
---|
67 | p = PyString_AS_STRING(v);
|
---|
68 | for (i = 0; i < len; i++) {
|
---|
69 | register char ch = string[i];
|
---|
70 | if (ch == ' ')
|
---|
71 | ch = '-';
|
---|
72 | else
|
---|
73 | ch = Py_TOLOWER(Py_CHARMASK(ch));
|
---|
74 | p[i] = ch;
|
---|
75 | }
|
---|
76 | return v;
|
---|
77 | }
|
---|
78 |
|
---|
79 | /* Lookup the given encoding and return a tuple providing the codec
|
---|
80 | facilities.
|
---|
81 |
|
---|
82 | The encoding string is looked up converted to all lower-case
|
---|
83 | characters. This makes encodings looked up through this mechanism
|
---|
84 | effectively case-insensitive.
|
---|
85 |
|
---|
86 | If no codec is found, a LookupError is set and NULL returned.
|
---|
87 |
|
---|
88 | As side effect, this tries to load the encodings package, if not
|
---|
89 | yet done. This is part of the lazy load strategy for the encodings
|
---|
90 | package.
|
---|
91 |
|
---|
92 | */
|
---|
93 |
|
---|
94 | PyObject *_PyCodec_Lookup(const char *encoding)
|
---|
95 | {
|
---|
96 | PyInterpreterState *interp;
|
---|
97 | PyObject *result, *args = NULL, *v;
|
---|
98 | Py_ssize_t i, len;
|
---|
99 |
|
---|
100 | if (encoding == NULL) {
|
---|
101 | PyErr_BadArgument();
|
---|
102 | goto onError;
|
---|
103 | }
|
---|
104 |
|
---|
105 | interp = PyThreadState_GET()->interp;
|
---|
106 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
---|
107 | goto onError;
|
---|
108 |
|
---|
109 | /* Convert the encoding to a normalized Python string: all
|
---|
110 | characters are converted to lower case, spaces and hyphens are
|
---|
111 | replaced with underscores. */
|
---|
112 | v = normalizestring(encoding);
|
---|
113 | if (v == NULL)
|
---|
114 | goto onError;
|
---|
115 | PyString_InternInPlace(&v);
|
---|
116 |
|
---|
117 | /* First, try to lookup the name in the registry dictionary */
|
---|
118 | result = PyDict_GetItem(interp->codec_search_cache, v);
|
---|
119 | if (result != NULL) {
|
---|
120 | Py_INCREF(result);
|
---|
121 | Py_DECREF(v);
|
---|
122 | return result;
|
---|
123 | }
|
---|
124 |
|
---|
125 | /* Next, scan the search functions in order of registration */
|
---|
126 | args = PyTuple_New(1);
|
---|
127 | if (args == NULL)
|
---|
128 | goto onError;
|
---|
129 | PyTuple_SET_ITEM(args,0,v);
|
---|
130 |
|
---|
131 | len = PyList_Size(interp->codec_search_path);
|
---|
132 | if (len < 0)
|
---|
133 | goto onError;
|
---|
134 | if (len == 0) {
|
---|
135 | PyErr_SetString(PyExc_LookupError,
|
---|
136 | "no codec search functions registered: "
|
---|
137 | "can't find encoding");
|
---|
138 | goto onError;
|
---|
139 | }
|
---|
140 |
|
---|
141 | for (i = 0; i < len; i++) {
|
---|
142 | PyObject *func;
|
---|
143 |
|
---|
144 | func = PyList_GetItem(interp->codec_search_path, i);
|
---|
145 | if (func == NULL)
|
---|
146 | goto onError;
|
---|
147 | result = PyEval_CallObject(func, args);
|
---|
148 | if (result == NULL)
|
---|
149 | goto onError;
|
---|
150 | if (result == Py_None) {
|
---|
151 | Py_DECREF(result);
|
---|
152 | continue;
|
---|
153 | }
|
---|
154 | if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
|
---|
155 | PyErr_SetString(PyExc_TypeError,
|
---|
156 | "codec search functions must return 4-tuples");
|
---|
157 | Py_DECREF(result);
|
---|
158 | goto onError;
|
---|
159 | }
|
---|
160 | break;
|
---|
161 | }
|
---|
162 | if (i == len) {
|
---|
163 | /* XXX Perhaps we should cache misses too ? */
|
---|
164 | PyErr_Format(PyExc_LookupError,
|
---|
165 | "unknown encoding: %s", encoding);
|
---|
166 | goto onError;
|
---|
167 | }
|
---|
168 |
|
---|
169 | /* Cache and return the result */
|
---|
170 | PyDict_SetItem(interp->codec_search_cache, v, result);
|
---|
171 | Py_DECREF(args);
|
---|
172 | return result;
|
---|
173 |
|
---|
174 | onError:
|
---|
175 | Py_XDECREF(args);
|
---|
176 | return NULL;
|
---|
177 | }
|
---|
178 |
|
---|
179 | static
|
---|
180 | PyObject *args_tuple(PyObject *object,
|
---|
181 | const char *errors)
|
---|
182 | {
|
---|
183 | PyObject *args;
|
---|
184 |
|
---|
185 | args = PyTuple_New(1 + (errors != NULL));
|
---|
186 | if (args == NULL)
|
---|
187 | return NULL;
|
---|
188 | Py_INCREF(object);
|
---|
189 | PyTuple_SET_ITEM(args,0,object);
|
---|
190 | if (errors) {
|
---|
191 | PyObject *v;
|
---|
192 |
|
---|
193 | v = PyString_FromString(errors);
|
---|
194 | if (v == NULL) {
|
---|
195 | Py_DECREF(args);
|
---|
196 | return NULL;
|
---|
197 | }
|
---|
198 | PyTuple_SET_ITEM(args, 1, v);
|
---|
199 | }
|
---|
200 | return args;
|
---|
201 | }
|
---|
202 |
|
---|
203 | /* Helper function to get a codec item */
|
---|
204 |
|
---|
205 | static
|
---|
206 | PyObject *codec_getitem(const char *encoding, int index)
|
---|
207 | {
|
---|
208 | PyObject *codecs;
|
---|
209 | PyObject *v;
|
---|
210 |
|
---|
211 | codecs = _PyCodec_Lookup(encoding);
|
---|
212 | if (codecs == NULL)
|
---|
213 | return NULL;
|
---|
214 | v = PyTuple_GET_ITEM(codecs, index);
|
---|
215 | Py_DECREF(codecs);
|
---|
216 | Py_INCREF(v);
|
---|
217 | return v;
|
---|
218 | }
|
---|
219 |
|
---|
220 | /* Helper function to create an incremental codec. */
|
---|
221 |
|
---|
222 | static
|
---|
223 | PyObject *codec_getincrementalcodec(const char *encoding,
|
---|
224 | const char *errors,
|
---|
225 | const char *attrname)
|
---|
226 | {
|
---|
227 | PyObject *codecs, *ret, *inccodec;
|
---|
228 |
|
---|
229 | codecs = _PyCodec_Lookup(encoding);
|
---|
230 | if (codecs == NULL)
|
---|
231 | return NULL;
|
---|
232 | inccodec = PyObject_GetAttrString(codecs, attrname);
|
---|
233 | Py_DECREF(codecs);
|
---|
234 | if (inccodec == NULL)
|
---|
235 | return NULL;
|
---|
236 | if (errors)
|
---|
237 | ret = PyObject_CallFunction(inccodec, "s", errors);
|
---|
238 | else
|
---|
239 | ret = PyObject_CallFunction(inccodec, NULL);
|
---|
240 | Py_DECREF(inccodec);
|
---|
241 | return ret;
|
---|
242 | }
|
---|
243 |
|
---|
244 | /* Helper function to create a stream codec. */
|
---|
245 |
|
---|
246 | static
|
---|
247 | PyObject *codec_getstreamcodec(const char *encoding,
|
---|
248 | PyObject *stream,
|
---|
249 | const char *errors,
|
---|
250 | const int index)
|
---|
251 | {
|
---|
252 | PyObject *codecs, *streamcodec, *codeccls;
|
---|
253 |
|
---|
254 | codecs = _PyCodec_Lookup(encoding);
|
---|
255 | if (codecs == NULL)
|
---|
256 | return NULL;
|
---|
257 |
|
---|
258 | codeccls = PyTuple_GET_ITEM(codecs, index);
|
---|
259 | if (errors != NULL)
|
---|
260 | streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
|
---|
261 | else
|
---|
262 | streamcodec = PyObject_CallFunction(codeccls, "O", stream);
|
---|
263 | Py_DECREF(codecs);
|
---|
264 | return streamcodec;
|
---|
265 | }
|
---|
266 |
|
---|
267 | /* Convenience APIs to query the Codec registry.
|
---|
268 |
|
---|
269 | All APIs return a codec object with incremented refcount.
|
---|
270 |
|
---|
271 | */
|
---|
272 |
|
---|
273 | PyObject *PyCodec_Encoder(const char *encoding)
|
---|
274 | {
|
---|
275 | return codec_getitem(encoding, 0);
|
---|
276 | }
|
---|
277 |
|
---|
278 | PyObject *PyCodec_Decoder(const char *encoding)
|
---|
279 | {
|
---|
280 | return codec_getitem(encoding, 1);
|
---|
281 | }
|
---|
282 |
|
---|
283 | PyObject *PyCodec_IncrementalEncoder(const char *encoding,
|
---|
284 | const char *errors)
|
---|
285 | {
|
---|
286 | return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
|
---|
287 | }
|
---|
288 |
|
---|
289 | PyObject *PyCodec_IncrementalDecoder(const char *encoding,
|
---|
290 | const char *errors)
|
---|
291 | {
|
---|
292 | return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
|
---|
293 | }
|
---|
294 |
|
---|
295 | PyObject *PyCodec_StreamReader(const char *encoding,
|
---|
296 | PyObject *stream,
|
---|
297 | const char *errors)
|
---|
298 | {
|
---|
299 | return codec_getstreamcodec(encoding, stream, errors, 2);
|
---|
300 | }
|
---|
301 |
|
---|
302 | PyObject *PyCodec_StreamWriter(const char *encoding,
|
---|
303 | PyObject *stream,
|
---|
304 | const char *errors)
|
---|
305 | {
|
---|
306 | return codec_getstreamcodec(encoding, stream, errors, 3);
|
---|
307 | }
|
---|
308 |
|
---|
309 | /* Encode an object (e.g. an Unicode object) using the given encoding
|
---|
310 | and return the resulting encoded object (usually a Python string).
|
---|
311 |
|
---|
312 | errors is passed to the encoder factory as argument if non-NULL. */
|
---|
313 |
|
---|
314 | PyObject *PyCodec_Encode(PyObject *object,
|
---|
315 | const char *encoding,
|
---|
316 | const char *errors)
|
---|
317 | {
|
---|
318 | PyObject *encoder = NULL;
|
---|
319 | PyObject *args = NULL, *result = NULL;
|
---|
320 | PyObject *v;
|
---|
321 |
|
---|
322 | encoder = PyCodec_Encoder(encoding);
|
---|
323 | if (encoder == NULL)
|
---|
324 | goto onError;
|
---|
325 |
|
---|
326 | args = args_tuple(object, errors);
|
---|
327 | if (args == NULL)
|
---|
328 | goto onError;
|
---|
329 |
|
---|
330 | result = PyEval_CallObject(encoder,args);
|
---|
331 | if (result == NULL)
|
---|
332 | goto onError;
|
---|
333 |
|
---|
334 | if (!PyTuple_Check(result) ||
|
---|
335 | PyTuple_GET_SIZE(result) != 2) {
|
---|
336 | PyErr_SetString(PyExc_TypeError,
|
---|
337 | "encoder must return a tuple (object,integer)");
|
---|
338 | goto onError;
|
---|
339 | }
|
---|
340 | v = PyTuple_GET_ITEM(result,0);
|
---|
341 | Py_INCREF(v);
|
---|
342 | /* We don't check or use the second (integer) entry. */
|
---|
343 |
|
---|
344 | Py_DECREF(args);
|
---|
345 | Py_DECREF(encoder);
|
---|
346 | Py_DECREF(result);
|
---|
347 | return v;
|
---|
348 |
|
---|
349 | onError:
|
---|
350 | Py_XDECREF(result);
|
---|
351 | Py_XDECREF(args);
|
---|
352 | Py_XDECREF(encoder);
|
---|
353 | return NULL;
|
---|
354 | }
|
---|
355 |
|
---|
356 | /* Decode an object (usually a Python string) using the given encoding
|
---|
357 | and return an equivalent object (e.g. an Unicode object).
|
---|
358 |
|
---|
359 | errors is passed to the decoder factory as argument if non-NULL. */
|
---|
360 |
|
---|
361 | PyObject *PyCodec_Decode(PyObject *object,
|
---|
362 | const char *encoding,
|
---|
363 | const char *errors)
|
---|
364 | {
|
---|
365 | PyObject *decoder = NULL;
|
---|
366 | PyObject *args = NULL, *result = NULL;
|
---|
367 | PyObject *v;
|
---|
368 |
|
---|
369 | decoder = PyCodec_Decoder(encoding);
|
---|
370 | if (decoder == NULL)
|
---|
371 | goto onError;
|
---|
372 |
|
---|
373 | args = args_tuple(object, errors);
|
---|
374 | if (args == NULL)
|
---|
375 | goto onError;
|
---|
376 |
|
---|
377 | result = PyEval_CallObject(decoder,args);
|
---|
378 | if (result == NULL)
|
---|
379 | goto onError;
|
---|
380 | if (!PyTuple_Check(result) ||
|
---|
381 | PyTuple_GET_SIZE(result) != 2) {
|
---|
382 | PyErr_SetString(PyExc_TypeError,
|
---|
383 | "decoder must return a tuple (object,integer)");
|
---|
384 | goto onError;
|
---|
385 | }
|
---|
386 | v = PyTuple_GET_ITEM(result,0);
|
---|
387 | Py_INCREF(v);
|
---|
388 | /* We don't check or use the second (integer) entry. */
|
---|
389 |
|
---|
390 | Py_DECREF(args);
|
---|
391 | Py_DECREF(decoder);
|
---|
392 | Py_DECREF(result);
|
---|
393 | return v;
|
---|
394 |
|
---|
395 | onError:
|
---|
396 | Py_XDECREF(args);
|
---|
397 | Py_XDECREF(decoder);
|
---|
398 | Py_XDECREF(result);
|
---|
399 | return NULL;
|
---|
400 | }
|
---|
401 |
|
---|
402 | /* Register the error handling callback function error under the name
|
---|
403 | name. This function will be called by the codec when it encounters
|
---|
404 | an unencodable characters/undecodable bytes and doesn't know the
|
---|
405 | callback name, when name is specified as the error parameter
|
---|
406 | in the call to the encode/decode function.
|
---|
407 | Return 0 on success, -1 on error */
|
---|
408 | int PyCodec_RegisterError(const char *name, PyObject *error)
|
---|
409 | {
|
---|
410 | PyInterpreterState *interp = PyThreadState_GET()->interp;
|
---|
411 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
---|
412 | return -1;
|
---|
413 | if (!PyCallable_Check(error)) {
|
---|
414 | PyErr_SetString(PyExc_TypeError, "handler must be callable");
|
---|
415 | return -1;
|
---|
416 | }
|
---|
417 | return PyDict_SetItemString(interp->codec_error_registry,
|
---|
418 | (char *)name, error);
|
---|
419 | }
|
---|
420 |
|
---|
421 | /* Lookup the error handling callback function registered under the
|
---|
422 | name error. As a special case NULL can be passed, in which case
|
---|
423 | the error handling callback for strict encoding will be returned. */
|
---|
424 | PyObject *PyCodec_LookupError(const char *name)
|
---|
425 | {
|
---|
426 | PyObject *handler = NULL;
|
---|
427 |
|
---|
428 | PyInterpreterState *interp = PyThreadState_GET()->interp;
|
---|
429 | if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
|
---|
430 | return NULL;
|
---|
431 |
|
---|
432 | if (name==NULL)
|
---|
433 | name = "strict";
|
---|
434 | handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
|
---|
435 | if (!handler)
|
---|
436 | PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
|
---|
437 | else
|
---|
438 | Py_INCREF(handler);
|
---|
439 | return handler;
|
---|
440 | }
|
---|
441 |
|
---|
442 | static void wrong_exception_type(PyObject *exc)
|
---|
443 | {
|
---|
444 | PyObject *type = PyObject_GetAttrString(exc, "__class__");
|
---|
445 | if (type != NULL) {
|
---|
446 | PyObject *name = PyObject_GetAttrString(type, "__name__");
|
---|
447 | Py_DECREF(type);
|
---|
448 | if (name != NULL) {
|
---|
449 | PyObject *string = PyObject_Str(name);
|
---|
450 | Py_DECREF(name);
|
---|
451 | if (string != NULL) {
|
---|
452 | PyErr_Format(PyExc_TypeError,
|
---|
453 | "don't know how to handle %.400s in error callback",
|
---|
454 | PyString_AS_STRING(string));
|
---|
455 | Py_DECREF(string);
|
---|
456 | }
|
---|
457 | }
|
---|
458 | }
|
---|
459 | }
|
---|
460 |
|
---|
461 | PyObject *PyCodec_StrictErrors(PyObject *exc)
|
---|
462 | {
|
---|
463 | if (PyExceptionInstance_Check(exc))
|
---|
464 | PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
|
---|
465 | else
|
---|
466 | PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
|
---|
467 | return NULL;
|
---|
468 | }
|
---|
469 |
|
---|
470 |
|
---|
471 | #ifdef Py_USING_UNICODE
|
---|
472 | PyObject *PyCodec_IgnoreErrors(PyObject *exc)
|
---|
473 | {
|
---|
474 | Py_ssize_t end;
|
---|
475 | if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
---|
476 | if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
---|
477 | return NULL;
|
---|
478 | }
|
---|
479 | else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
|
---|
480 | if (PyUnicodeDecodeError_GetEnd(exc, &end))
|
---|
481 | return NULL;
|
---|
482 | }
|
---|
483 | else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
|
---|
484 | if (PyUnicodeTranslateError_GetEnd(exc, &end))
|
---|
485 | return NULL;
|
---|
486 | }
|
---|
487 | else {
|
---|
488 | wrong_exception_type(exc);
|
---|
489 | return NULL;
|
---|
490 | }
|
---|
491 | /* ouch: passing NULL, 0, pos gives None instead of u'' */
|
---|
492 | return Py_BuildValue("(u#n)", &end, 0, end);
|
---|
493 | }
|
---|
494 |
|
---|
495 |
|
---|
496 | PyObject *PyCodec_ReplaceErrors(PyObject *exc)
|
---|
497 | {
|
---|
498 | PyObject *restuple;
|
---|
499 | Py_ssize_t start;
|
---|
500 | Py_ssize_t end;
|
---|
501 | Py_ssize_t i;
|
---|
502 |
|
---|
503 | if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
---|
504 | PyObject *res;
|
---|
505 | Py_UNICODE *p;
|
---|
506 | if (PyUnicodeEncodeError_GetStart(exc, &start))
|
---|
507 | return NULL;
|
---|
508 | if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
---|
509 | return NULL;
|
---|
510 | res = PyUnicode_FromUnicode(NULL, end-start);
|
---|
511 | if (res == NULL)
|
---|
512 | return NULL;
|
---|
513 | for (p = PyUnicode_AS_UNICODE(res), i = start;
|
---|
514 | i<end; ++p, ++i)
|
---|
515 | *p = '?';
|
---|
516 | restuple = Py_BuildValue("(On)", res, end);
|
---|
517 | Py_DECREF(res);
|
---|
518 | return restuple;
|
---|
519 | }
|
---|
520 | else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
|
---|
521 | Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
|
---|
522 | if (PyUnicodeDecodeError_GetEnd(exc, &end))
|
---|
523 | return NULL;
|
---|
524 | return Py_BuildValue("(u#n)", &res, (Py_ssize_t)1, end);
|
---|
525 | }
|
---|
526 | else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
|
---|
527 | PyObject *res;
|
---|
528 | Py_UNICODE *p;
|
---|
529 | if (PyUnicodeTranslateError_GetStart(exc, &start))
|
---|
530 | return NULL;
|
---|
531 | if (PyUnicodeTranslateError_GetEnd(exc, &end))
|
---|
532 | return NULL;
|
---|
533 | res = PyUnicode_FromUnicode(NULL, end-start);
|
---|
534 | if (res == NULL)
|
---|
535 | return NULL;
|
---|
536 | for (p = PyUnicode_AS_UNICODE(res), i = start;
|
---|
537 | i<end; ++p, ++i)
|
---|
538 | *p = Py_UNICODE_REPLACEMENT_CHARACTER;
|
---|
539 | restuple = Py_BuildValue("(On)", res, end);
|
---|
540 | Py_DECREF(res);
|
---|
541 | return restuple;
|
---|
542 | }
|
---|
543 | else {
|
---|
544 | wrong_exception_type(exc);
|
---|
545 | return NULL;
|
---|
546 | }
|
---|
547 | }
|
---|
548 |
|
---|
549 | PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
|
---|
550 | {
|
---|
551 | if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
---|
552 | PyObject *restuple;
|
---|
553 | PyObject *object;
|
---|
554 | Py_ssize_t start;
|
---|
555 | Py_ssize_t end;
|
---|
556 | PyObject *res;
|
---|
557 | Py_UNICODE *p;
|
---|
558 | Py_UNICODE *startp;
|
---|
559 | Py_UNICODE *e;
|
---|
560 | Py_UNICODE *outp;
|
---|
561 | int ressize;
|
---|
562 | if (PyUnicodeEncodeError_GetStart(exc, &start))
|
---|
563 | return NULL;
|
---|
564 | if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
---|
565 | return NULL;
|
---|
566 | if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
---|
567 | return NULL;
|
---|
568 | startp = PyUnicode_AS_UNICODE(object);
|
---|
569 | e = startp + end;
|
---|
570 | for (p = startp+start, ressize = 0; p < e;) {
|
---|
571 | Py_UCS4 ch = *p++;
|
---|
572 | #ifndef Py_UNICODE_WIDE
|
---|
573 | if ((0xD800 <= ch && ch <= 0xDBFF) &&
|
---|
574 | (p < e) &&
|
---|
575 | (0xDC00 <= *p && *p <= 0xDFFF)) {
|
---|
576 | ch = ((((ch & 0x03FF) << 10) |
|
---|
577 | ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
|
---|
578 | }
|
---|
579 | #endif
|
---|
580 | if (ch < 10)
|
---|
581 | ressize += 2+1+1;
|
---|
582 | else if (ch < 100)
|
---|
583 | ressize += 2+2+1;
|
---|
584 | else if (ch < 1000)
|
---|
585 | ressize += 2+3+1;
|
---|
586 | else if (ch < 10000)
|
---|
587 | ressize += 2+4+1;
|
---|
588 | else if (ch < 100000)
|
---|
589 | ressize += 2+5+1;
|
---|
590 | else if (ch < 1000000)
|
---|
591 | ressize += 2+6+1;
|
---|
592 | else
|
---|
593 | ressize += 2+7+1;
|
---|
594 | }
|
---|
595 | /* allocate replacement */
|
---|
596 | res = PyUnicode_FromUnicode(NULL, ressize);
|
---|
597 | if (res == NULL) {
|
---|
598 | Py_DECREF(object);
|
---|
599 | return NULL;
|
---|
600 | }
|
---|
601 | /* generate replacement */
|
---|
602 | for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
|
---|
603 | int digits;
|
---|
604 | int base;
|
---|
605 | Py_UCS4 ch = *p++;
|
---|
606 | #ifndef Py_UNICODE_WIDE
|
---|
607 | if ((0xD800 <= ch && ch <= 0xDBFF) &&
|
---|
608 | (p < startp+end) &&
|
---|
609 | (0xDC00 <= *p && *p <= 0xDFFF)) {
|
---|
610 | ch = ((((ch & 0x03FF) << 10) |
|
---|
611 | ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
|
---|
612 | }
|
---|
613 | #endif
|
---|
614 | *outp++ = '&';
|
---|
615 | *outp++ = '#';
|
---|
616 | if (ch < 10) {
|
---|
617 | digits = 1;
|
---|
618 | base = 1;
|
---|
619 | }
|
---|
620 | else if (ch < 100) {
|
---|
621 | digits = 2;
|
---|
622 | base = 10;
|
---|
623 | }
|
---|
624 | else if (ch < 1000) {
|
---|
625 | digits = 3;
|
---|
626 | base = 100;
|
---|
627 | }
|
---|
628 | else if (ch < 10000) {
|
---|
629 | digits = 4;
|
---|
630 | base = 1000;
|
---|
631 | }
|
---|
632 | else if (ch < 100000) {
|
---|
633 | digits = 5;
|
---|
634 | base = 10000;
|
---|
635 | }
|
---|
636 | else if (ch < 1000000) {
|
---|
637 | digits = 6;
|
---|
638 | base = 100000;
|
---|
639 | }
|
---|
640 | else {
|
---|
641 | digits = 7;
|
---|
642 | base = 1000000;
|
---|
643 | }
|
---|
644 | while (digits-->0) {
|
---|
645 | *outp++ = '0' + ch/base;
|
---|
646 | ch %= base;
|
---|
647 | base /= 10;
|
---|
648 | }
|
---|
649 | *outp++ = ';';
|
---|
650 | }
|
---|
651 | restuple = Py_BuildValue("(On)", res, end);
|
---|
652 | Py_DECREF(res);
|
---|
653 | Py_DECREF(object);
|
---|
654 | return restuple;
|
---|
655 | }
|
---|
656 | else {
|
---|
657 | wrong_exception_type(exc);
|
---|
658 | return NULL;
|
---|
659 | }
|
---|
660 | }
|
---|
661 |
|
---|
662 | static Py_UNICODE hexdigits[] = {
|
---|
663 | '0', '1', '2', '3', '4', '5', '6', '7',
|
---|
664 | '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
|
---|
665 | };
|
---|
666 |
|
---|
667 | PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
---|
668 | {
|
---|
669 | if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
---|
670 | PyObject *restuple;
|
---|
671 | PyObject *object;
|
---|
672 | Py_ssize_t start;
|
---|
673 | Py_ssize_t end;
|
---|
674 | PyObject *res;
|
---|
675 | Py_UNICODE *p;
|
---|
676 | Py_UNICODE *startp;
|
---|
677 | Py_UNICODE *outp;
|
---|
678 | int ressize;
|
---|
679 | if (PyUnicodeEncodeError_GetStart(exc, &start))
|
---|
680 | return NULL;
|
---|
681 | if (PyUnicodeEncodeError_GetEnd(exc, &end))
|
---|
682 | return NULL;
|
---|
683 | if (!(object = PyUnicodeEncodeError_GetObject(exc)))
|
---|
684 | return NULL;
|
---|
685 | startp = PyUnicode_AS_UNICODE(object);
|
---|
686 | for (p = startp+start, ressize = 0; p < startp+end; ++p) {
|
---|
687 | #ifdef Py_UNICODE_WIDE
|
---|
688 | if (*p >= 0x00010000)
|
---|
689 | ressize += 1+1+8;
|
---|
690 | else
|
---|
691 | #endif
|
---|
692 | if (*p >= 0x100) {
|
---|
693 | ressize += 1+1+4;
|
---|
694 | }
|
---|
695 | else
|
---|
696 | ressize += 1+1+2;
|
---|
697 | }
|
---|
698 | res = PyUnicode_FromUnicode(NULL, ressize);
|
---|
699 | if (res==NULL)
|
---|
700 | return NULL;
|
---|
701 | for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
|
---|
702 | p < startp+end; ++p) {
|
---|
703 | Py_UNICODE c = *p;
|
---|
704 | *outp++ = '\\';
|
---|
705 | #ifdef Py_UNICODE_WIDE
|
---|
706 | if (c >= 0x00010000) {
|
---|
707 | *outp++ = 'U';
|
---|
708 | *outp++ = hexdigits[(c>>28)&0xf];
|
---|
709 | *outp++ = hexdigits[(c>>24)&0xf];
|
---|
710 | *outp++ = hexdigits[(c>>20)&0xf];
|
---|
711 | *outp++ = hexdigits[(c>>16)&0xf];
|
---|
712 | *outp++ = hexdigits[(c>>12)&0xf];
|
---|
713 | *outp++ = hexdigits[(c>>8)&0xf];
|
---|
714 | }
|
---|
715 | else
|
---|
716 | #endif
|
---|
717 | if (c >= 0x100) {
|
---|
718 | *outp++ = 'u';
|
---|
719 | *outp++ = hexdigits[(c>>12)&0xf];
|
---|
720 | *outp++ = hexdigits[(c>>8)&0xf];
|
---|
721 | }
|
---|
722 | else
|
---|
723 | *outp++ = 'x';
|
---|
724 | *outp++ = hexdigits[(c>>4)&0xf];
|
---|
725 | *outp++ = hexdigits[c&0xf];
|
---|
726 | }
|
---|
727 |
|
---|
728 | restuple = Py_BuildValue("(On)", res, end);
|
---|
729 | Py_DECREF(res);
|
---|
730 | Py_DECREF(object);
|
---|
731 | return restuple;
|
---|
732 | }
|
---|
733 | else {
|
---|
734 | wrong_exception_type(exc);
|
---|
735 | return NULL;
|
---|
736 | }
|
---|
737 | }
|
---|
738 | #endif
|
---|
739 |
|
---|
740 | static PyObject *strict_errors(PyObject *self, PyObject *exc)
|
---|
741 | {
|
---|
742 | return PyCodec_StrictErrors(exc);
|
---|
743 | }
|
---|
744 |
|
---|
745 |
|
---|
746 | #ifdef Py_USING_UNICODE
|
---|
747 | static PyObject *ignore_errors(PyObject *self, PyObject *exc)
|
---|
748 | {
|
---|
749 | return PyCodec_IgnoreErrors(exc);
|
---|
750 | }
|
---|
751 |
|
---|
752 |
|
---|
753 | static PyObject *replace_errors(PyObject *self, PyObject *exc)
|
---|
754 | {
|
---|
755 | return PyCodec_ReplaceErrors(exc);
|
---|
756 | }
|
---|
757 |
|
---|
758 |
|
---|
759 | static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
|
---|
760 | {
|
---|
761 | return PyCodec_XMLCharRefReplaceErrors(exc);
|
---|
762 | }
|
---|
763 |
|
---|
764 |
|
---|
765 | static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
|
---|
766 | {
|
---|
767 | return PyCodec_BackslashReplaceErrors(exc);
|
---|
768 | }
|
---|
769 | #endif
|
---|
770 |
|
---|
771 | static int _PyCodecRegistry_Init(void)
|
---|
772 | {
|
---|
773 | static struct {
|
---|
774 | char *name;
|
---|
775 | PyMethodDef def;
|
---|
776 | } methods[] =
|
---|
777 | {
|
---|
778 | {
|
---|
779 | "strict",
|
---|
780 | {
|
---|
781 | "strict_errors",
|
---|
782 | strict_errors,
|
---|
783 | METH_O,
|
---|
784 | PyDoc_STR("Implements the 'strict' error handling, which "
|
---|
785 | "raises a UnicodeError on coding errors.")
|
---|
786 | }
|
---|
787 | },
|
---|
788 | #ifdef Py_USING_UNICODE
|
---|
789 | {
|
---|
790 | "ignore",
|
---|
791 | {
|
---|
792 | "ignore_errors",
|
---|
793 | ignore_errors,
|
---|
794 | METH_O,
|
---|
795 | PyDoc_STR("Implements the 'ignore' error handling, which "
|
---|
796 | "ignores malformed data and continues.")
|
---|
797 | }
|
---|
798 | },
|
---|
799 | {
|
---|
800 | "replace",
|
---|
801 | {
|
---|
802 | "replace_errors",
|
---|
803 | replace_errors,
|
---|
804 | METH_O,
|
---|
805 | PyDoc_STR("Implements the 'replace' error handling, which "
|
---|
806 | "replaces malformed data with a replacement marker.")
|
---|
807 | }
|
---|
808 | },
|
---|
809 | {
|
---|
810 | "xmlcharrefreplace",
|
---|
811 | {
|
---|
812 | "xmlcharrefreplace_errors",
|
---|
813 | xmlcharrefreplace_errors,
|
---|
814 | METH_O,
|
---|
815 | PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
|
---|
816 | "which replaces an unencodable character with the "
|
---|
817 | "appropriate XML character reference.")
|
---|
818 | }
|
---|
819 | },
|
---|
820 | {
|
---|
821 | "backslashreplace",
|
---|
822 | {
|
---|
823 | "backslashreplace_errors",
|
---|
824 | backslashreplace_errors,
|
---|
825 | METH_O,
|
---|
826 | PyDoc_STR("Implements the 'backslashreplace' error handling, "
|
---|
827 | "which replaces an unencodable character with a "
|
---|
828 | "backslashed escape sequence.")
|
---|
829 | }
|
---|
830 | }
|
---|
831 | #endif
|
---|
832 | };
|
---|
833 |
|
---|
834 | PyInterpreterState *interp = PyThreadState_GET()->interp;
|
---|
835 | PyObject *mod;
|
---|
836 | unsigned i;
|
---|
837 |
|
---|
838 | if (interp->codec_search_path != NULL)
|
---|
839 | return 0;
|
---|
840 |
|
---|
841 | interp->codec_search_path = PyList_New(0);
|
---|
842 | interp->codec_search_cache = PyDict_New();
|
---|
843 | interp->codec_error_registry = PyDict_New();
|
---|
844 |
|
---|
845 | if (interp->codec_error_registry) {
|
---|
846 | for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
|
---|
847 | PyObject *func = PyCFunction_New(&methods[i].def, NULL);
|
---|
848 | int res;
|
---|
849 | if (!func)
|
---|
850 | Py_FatalError("can't initialize codec error registry");
|
---|
851 | res = PyCodec_RegisterError(methods[i].name, func);
|
---|
852 | Py_DECREF(func);
|
---|
853 | if (res)
|
---|
854 | Py_FatalError("can't initialize codec error registry");
|
---|
855 | }
|
---|
856 | }
|
---|
857 |
|
---|
858 | if (interp->codec_search_path == NULL ||
|
---|
859 | interp->codec_search_cache == NULL ||
|
---|
860 | interp->codec_error_registry == NULL)
|
---|
861 | Py_FatalError("can't initialize codec registry");
|
---|
862 |
|
---|
863 | mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
|
---|
864 | if (mod == NULL) {
|
---|
865 | if (PyErr_ExceptionMatches(PyExc_ImportError)) {
|
---|
866 | /* Ignore ImportErrors... this is done so that
|
---|
867 | distributions can disable the encodings package. Note
|
---|
868 | that other errors are not masked, e.g. SystemErrors
|
---|
869 | raised to inform the user of an error in the Python
|
---|
870 | configuration are still reported back to the user. */
|
---|
871 | PyErr_Clear();
|
---|
872 | return 0;
|
---|
873 | }
|
---|
874 | return -1;
|
---|
875 | }
|
---|
876 | Py_DECREF(mod);
|
---|
877 | return 0;
|
---|
878 | }
|
---|