1 | /*
|
---|
2 |
|
---|
3 | Unicode implementation based on original code by Fredrik Lundh,
|
---|
4 | modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
|
---|
5 | Unicode Integration Proposal (see file Misc/unicode.txt).
|
---|
6 |
|
---|
7 | Major speed upgrades to the method implementations at the Reykjavik
|
---|
8 | NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
|
---|
9 |
|
---|
10 | Copyright (c) Corporation for National Research Initiatives.
|
---|
11 |
|
---|
12 | --------------------------------------------------------------------
|
---|
13 | The original string type implementation is:
|
---|
14 |
|
---|
15 | Copyright (c) 1999 by Secret Labs AB
|
---|
16 | Copyright (c) 1999 by Fredrik Lundh
|
---|
17 |
|
---|
18 | By obtaining, using, and/or copying this software and/or its
|
---|
19 | associated documentation, you agree that you have read, understood,
|
---|
20 | and will comply with the following terms and conditions:
|
---|
21 |
|
---|
22 | Permission to use, copy, modify, and distribute this software and its
|
---|
23 | associated documentation for any purpose and without fee is hereby
|
---|
24 | granted, provided that the above copyright notice appears in all
|
---|
25 | copies, and that both that copyright notice and this permission notice
|
---|
26 | appear in supporting documentation, and that the name of Secret Labs
|
---|
27 | AB or the author not be used in advertising or publicity pertaining to
|
---|
28 | distribution of the software without specific, written prior
|
---|
29 | permission.
|
---|
30 |
|
---|
31 | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
|
---|
32 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
---|
33 | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
|
---|
34 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
---|
35 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
---|
36 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
---|
37 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
---|
38 | --------------------------------------------------------------------
|
---|
39 |
|
---|
40 | */
|
---|
41 |
|
---|
42 | #define PY_SSIZE_T_CLEAN
|
---|
43 | #include "Python.h"
|
---|
44 |
|
---|
45 | #include "unicodeobject.h"
|
---|
46 | #include "ucnhash.h"
|
---|
47 |
|
---|
48 | #ifdef MS_WINDOWS
|
---|
49 | #include <windows.h>
|
---|
50 | #endif
|
---|
51 |
|
---|
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   Objects that land on the free list keep their character buffer
   allocated as long as their length is below this limit; this saves
   malloc() round trips for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively disables the feature.

   Note: this feature is experimental!  If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82 |
|
---|
83 | /* --- Globals ------------------------------------------------------------
|
---|
84 |
|
---|
85 | The globals are initialized by the _PyUnicode_Init() API and should
|
---|
86 | not be used before calling that API.
|
---|
87 |
|
---|
88 | */
|
---|
89 |
|
---|
90 |
|
---|
91 | #ifdef __cplusplus
|
---|
92 | extern "C" {
|
---|
93 | #endif
|
---|
94 |
|
---|
95 | /* Free list for Unicode objects */
|
---|
96 | static PyUnicodeObject *unicode_freelist;
|
---|
97 | static int unicode_freelist_size;
|
---|
98 |
|
---|
99 | /* The empty Unicode object is shared to improve performance. */
|
---|
100 | static PyUnicodeObject *unicode_empty;
|
---|
101 |
|
---|
102 | /* Single character Unicode strings in the Latin-1 range are being
|
---|
103 | shared as well. */
|
---|
104 | static PyUnicodeObject *unicode_latin1[256];
|
---|
105 |
|
---|
106 | /* Default encoding to use and assume when NULL is passed as encoding
|
---|
107 | parameter; it is initialized by _PyUnicode_Init().
|
---|
108 |
|
---|
109 | Always use the PyUnicode_SetDefaultEncoding() and
|
---|
110 | PyUnicode_GetDefaultEncoding() APIs to access this global.
|
---|
111 |
|
---|
112 | */
|
---|
113 | static char unicode_default_encoding[100];
|
---|
114 |
|
---|
115 | Py_UNICODE
|
---|
116 | PyUnicode_GetMax(void)
|
---|
117 | {
|
---|
118 | #ifdef Py_UNICODE_WIDE
|
---|
119 | return 0x10FFFF;
|
---|
120 | #else
|
---|
121 | /* This is actually an illegal character, so it should
|
---|
122 | not be passed to unichr. */
|
---|
123 | return 0xFFFF;
|
---|
124 | #endif
|
---|
125 | }
|
---|
126 |
|
---|
127 | /* --- Bloom Filters ----------------------------------------------------- */
|
---|
128 |
|
---|
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the least
   significant 5 bits of each Unicode character select the bit index. */

/* The linebreak mask is set up by the Unicode init code below. */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by `ch` is set in `mask`.
   The shift is done on an unsigned long constant: shifting a plain int
   `1` left by 31 is undefined behaviour.  `mask` is parenthesized so
   that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter followed by the exact linebreak test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143 |
|
---|
144 | Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
|
---|
145 | {
|
---|
146 | /* calculate simple bloom-style bitmask for a given unicode string */
|
---|
147 |
|
---|
148 | long mask;
|
---|
149 | Py_ssize_t i;
|
---|
150 |
|
---|
151 | mask = 0;
|
---|
152 | for (i = 0; i < len; i++)
|
---|
153 | mask |= (1 << (ptr[i] & 0x1F));
|
---|
154 |
|
---|
155 | return mask;
|
---|
156 | }
|
---|
157 |
|
---|
158 | Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
|
---|
159 | {
|
---|
160 | Py_ssize_t i;
|
---|
161 |
|
---|
162 | for (i = 0; i < setlen; i++)
|
---|
163 | if (set[i] == chr)
|
---|
164 | return 1;
|
---|
165 |
|
---|
166 | return 0;
|
---|
167 | }
|
---|
168 |
|
---|
/* Membership test with a bloom pre-filter: the exact (linear) search in
   `set` is only performed when the mask says `chr` might be present.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171 |
|
---|
172 | /* --- Unicode Object ----------------------------------------------------- */
|
---|
173 |
|
---|
174 | static
|
---|
175 | int unicode_resize(register PyUnicodeObject *unicode,
|
---|
176 | Py_ssize_t length)
|
---|
177 | {
|
---|
178 | void *oldstr;
|
---|
179 |
|
---|
180 | /* Shortcut if there's nothing much to do. */
|
---|
181 | if (unicode->length == length)
|
---|
182 | goto reset;
|
---|
183 |
|
---|
184 | /* Resizing shared object (unicode_empty or single character
|
---|
185 | objects) in-place is not allowed. Use PyUnicode_Resize()
|
---|
186 | instead ! */
|
---|
187 |
|
---|
188 | if (unicode == unicode_empty ||
|
---|
189 | (unicode->length == 1 &&
|
---|
190 | unicode->str[0] < 256U &&
|
---|
191 | unicode_latin1[unicode->str[0]] == unicode)) {
|
---|
192 | PyErr_SetString(PyExc_SystemError,
|
---|
193 | "can't resize shared unicode objects");
|
---|
194 | return -1;
|
---|
195 | }
|
---|
196 |
|
---|
197 | /* We allocate one more byte to make sure the string is Ux0000 terminated.
|
---|
198 | The overallocation is also used by fastsearch, which assumes that it's
|
---|
199 | safe to look at str[length] (without making any assumptions about what
|
---|
200 | it contains). */
|
---|
201 |
|
---|
202 | oldstr = unicode->str;
|
---|
203 | PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
|
---|
204 | if (!unicode->str) {
|
---|
205 | unicode->str = (Py_UNICODE *)oldstr;
|
---|
206 | PyErr_NoMemory();
|
---|
207 | return -1;
|
---|
208 | }
|
---|
209 | unicode->str[length] = 0;
|
---|
210 | unicode->length = length;
|
---|
211 |
|
---|
212 | reset:
|
---|
213 | /* Reset the object caches */
|
---|
214 | if (unicode->defenc) {
|
---|
215 | Py_DECREF(unicode->defenc);
|
---|
216 | unicode->defenc = NULL;
|
---|
217 | }
|
---|
218 | unicode->hash = -1;
|
---|
219 |
|
---|
220 | return 0;
|
---|
221 | }
|
---|
222 |
|
---|
223 | /* We allocate one more byte to make sure the string is
|
---|
224 | Ux0000 terminated -- XXX is this needed ?
|
---|
225 |
|
---|
226 | XXX This allocator could further be enhanced by assuring that the
|
---|
227 | free list never reduces its size below 1.
|
---|
228 |
|
---|
229 | */
|
---|
230 |
|
---|
231 | static
|
---|
232 | PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
|
---|
233 | {
|
---|
234 | register PyUnicodeObject *unicode;
|
---|
235 |
|
---|
236 | /* Optimization for empty strings */
|
---|
237 | if (length == 0 && unicode_empty != NULL) {
|
---|
238 | Py_INCREF(unicode_empty);
|
---|
239 | return unicode_empty;
|
---|
240 | }
|
---|
241 |
|
---|
242 | /* Unicode freelist & memory allocation */
|
---|
243 | if (unicode_freelist) {
|
---|
244 | unicode = unicode_freelist;
|
---|
245 | unicode_freelist = *(PyUnicodeObject **)unicode;
|
---|
246 | unicode_freelist_size--;
|
---|
247 | if (unicode->str) {
|
---|
248 | /* Keep-Alive optimization: we only upsize the buffer,
|
---|
249 | never downsize it. */
|
---|
250 | if ((unicode->length < length) &&
|
---|
251 | unicode_resize(unicode, length) < 0) {
|
---|
252 | PyMem_DEL(unicode->str);
|
---|
253 | goto onError;
|
---|
254 | }
|
---|
255 | }
|
---|
256 | else {
|
---|
257 | unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
|
---|
258 | }
|
---|
259 | PyObject_INIT(unicode, &PyUnicode_Type);
|
---|
260 | }
|
---|
261 | else {
|
---|
262 | unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
|
---|
263 | if (unicode == NULL)
|
---|
264 | return NULL;
|
---|
265 | unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
|
---|
266 | }
|
---|
267 |
|
---|
268 | if (!unicode->str) {
|
---|
269 | PyErr_NoMemory();
|
---|
270 | goto onError;
|
---|
271 | }
|
---|
272 | /* Initialize the first element to guard against cases where
|
---|
273 | * the caller fails before initializing str -- unicode_resize()
|
---|
274 | * reads str[0], and the Keep-Alive optimization can keep memory
|
---|
275 | * allocated for str alive across a call to unicode_dealloc(unicode).
|
---|
276 | * We don't want unicode_resize to read uninitialized memory in
|
---|
277 | * that case.
|
---|
278 | */
|
---|
279 | unicode->str[0] = 0;
|
---|
280 | unicode->str[length] = 0;
|
---|
281 | unicode->length = length;
|
---|
282 | unicode->hash = -1;
|
---|
283 | unicode->defenc = NULL;
|
---|
284 | return unicode;
|
---|
285 |
|
---|
286 | onError:
|
---|
287 | _Py_ForgetReference((PyObject *)unicode);
|
---|
288 | PyObject_Del(unicode);
|
---|
289 | return NULL;
|
---|
290 | }
|
---|
291 |
|
---|
292 | static
|
---|
293 | void unicode_dealloc(register PyUnicodeObject *unicode)
|
---|
294 | {
|
---|
295 | if (PyUnicode_CheckExact(unicode) &&
|
---|
296 | unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
|
---|
297 | /* Keep-Alive optimization */
|
---|
298 | if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
|
---|
299 | PyMem_DEL(unicode->str);
|
---|
300 | unicode->str = NULL;
|
---|
301 | unicode->length = 0;
|
---|
302 | }
|
---|
303 | if (unicode->defenc) {
|
---|
304 | Py_DECREF(unicode->defenc);
|
---|
305 | unicode->defenc = NULL;
|
---|
306 | }
|
---|
307 | /* Add to free list */
|
---|
308 | *(PyUnicodeObject **)unicode = unicode_freelist;
|
---|
309 | unicode_freelist = unicode;
|
---|
310 | unicode_freelist_size++;
|
---|
311 | }
|
---|
312 | else {
|
---|
313 | PyMem_DEL(unicode->str);
|
---|
314 | Py_XDECREF(unicode->defenc);
|
---|
315 | unicode->ob_type->tp_free((PyObject *)unicode);
|
---|
316 | }
|
---|
317 | }
|
---|
318 |
|
---|
319 | int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
|
---|
320 | {
|
---|
321 | register PyUnicodeObject *v;
|
---|
322 |
|
---|
323 | /* Argument checks */
|
---|
324 | if (unicode == NULL) {
|
---|
325 | PyErr_BadInternalCall();
|
---|
326 | return -1;
|
---|
327 | }
|
---|
328 | v = (PyUnicodeObject *)*unicode;
|
---|
329 | if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
|
---|
330 | PyErr_BadInternalCall();
|
---|
331 | return -1;
|
---|
332 | }
|
---|
333 |
|
---|
334 | /* Resizing unicode_empty and single character objects is not
|
---|
335 | possible since these are being shared. We simply return a fresh
|
---|
336 | copy with the same Unicode content. */
|
---|
337 | if (v->length != length &&
|
---|
338 | (v == unicode_empty || v->length == 1)) {
|
---|
339 | PyUnicodeObject *w = _PyUnicode_New(length);
|
---|
340 | if (w == NULL)
|
---|
341 | return -1;
|
---|
342 | Py_UNICODE_COPY(w->str, v->str,
|
---|
343 | length < v->length ? length : v->length);
|
---|
344 | Py_DECREF(*unicode);
|
---|
345 | *unicode = (PyObject *)w;
|
---|
346 | return 0;
|
---|
347 | }
|
---|
348 |
|
---|
349 | /* Note that we don't have to modify *unicode for unshared Unicode
|
---|
350 | objects, since we can modify them in-place. */
|
---|
351 | return unicode_resize(v, length);
|
---|
352 | }
|
---|
353 |
|
---|
/* File-internal convenience wrapper (unicodeobject.c only!): accepts the
   address of a PyUnicodeObject* variable and forwards it, cast to
   PyObject **, to PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
357 |
|
---|
358 | PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
|
---|
359 | Py_ssize_t size)
|
---|
360 | {
|
---|
361 | PyUnicodeObject *unicode;
|
---|
362 |
|
---|
363 | /* If the Unicode data is known at construction time, we can apply
|
---|
364 | some optimizations which share commonly used objects. */
|
---|
365 | if (u != NULL) {
|
---|
366 |
|
---|
367 | /* Optimization for empty strings */
|
---|
368 | if (size == 0 && unicode_empty != NULL) {
|
---|
369 | Py_INCREF(unicode_empty);
|
---|
370 | return (PyObject *)unicode_empty;
|
---|
371 | }
|
---|
372 |
|
---|
373 | /* Single character Unicode objects in the Latin-1 range are
|
---|
374 | shared when using this constructor */
|
---|
375 | if (size == 1 && *u < 256) {
|
---|
376 | unicode = unicode_latin1[*u];
|
---|
377 | if (!unicode) {
|
---|
378 | unicode = _PyUnicode_New(1);
|
---|
379 | if (!unicode)
|
---|
380 | return NULL;
|
---|
381 | unicode->str[0] = *u;
|
---|
382 | unicode_latin1[*u] = unicode;
|
---|
383 | }
|
---|
384 | Py_INCREF(unicode);
|
---|
385 | return (PyObject *)unicode;
|
---|
386 | }
|
---|
387 | }
|
---|
388 |
|
---|
389 | unicode = _PyUnicode_New(size);
|
---|
390 | if (!unicode)
|
---|
391 | return NULL;
|
---|
392 |
|
---|
393 | /* Copy the Unicode data into the new object */
|
---|
394 | if (u != NULL)
|
---|
395 | Py_UNICODE_COPY(unicode->str, u, size);
|
---|
396 |
|
---|
397 | return (PyObject *)unicode;
|
---|
398 | }
|
---|
399 |
|
---|
400 | #ifdef HAVE_WCHAR_H
|
---|
401 |
|
---|
402 | PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
|
---|
403 | Py_ssize_t size)
|
---|
404 | {
|
---|
405 | PyUnicodeObject *unicode;
|
---|
406 |
|
---|
407 | if (w == NULL) {
|
---|
408 | PyErr_BadInternalCall();
|
---|
409 | return NULL;
|
---|
410 | }
|
---|
411 |
|
---|
412 | unicode = _PyUnicode_New(size);
|
---|
413 | if (!unicode)
|
---|
414 | return NULL;
|
---|
415 |
|
---|
416 | /* Copy the wchar_t data into the new object */
|
---|
417 | #ifdef HAVE_USABLE_WCHAR_T
|
---|
418 | memcpy(unicode->str, w, size * sizeof(wchar_t));
|
---|
419 | #else
|
---|
420 | {
|
---|
421 | register Py_UNICODE *u;
|
---|
422 | register Py_ssize_t i;
|
---|
423 | u = PyUnicode_AS_UNICODE(unicode);
|
---|
424 | for (i = size; i > 0; i--)
|
---|
425 | *u++ = *w++;
|
---|
426 | }
|
---|
427 | #endif
|
---|
428 |
|
---|
429 | return (PyObject *)unicode;
|
---|
430 | }
|
---|
431 |
|
---|
432 | Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
|
---|
433 | wchar_t *w,
|
---|
434 | Py_ssize_t size)
|
---|
435 | {
|
---|
436 | if (unicode == NULL) {
|
---|
437 | PyErr_BadInternalCall();
|
---|
438 | return -1;
|
---|
439 | }
|
---|
440 |
|
---|
441 | /* If possible, try to copy the 0-termination as well */
|
---|
442 | if (size > PyUnicode_GET_SIZE(unicode))
|
---|
443 | size = PyUnicode_GET_SIZE(unicode) + 1;
|
---|
444 |
|
---|
445 | #ifdef HAVE_USABLE_WCHAR_T
|
---|
446 | memcpy(w, unicode->str, size * sizeof(wchar_t));
|
---|
447 | #else
|
---|
448 | {
|
---|
449 | register Py_UNICODE *u;
|
---|
450 | register Py_ssize_t i;
|
---|
451 | u = PyUnicode_AS_UNICODE(unicode);
|
---|
452 | for (i = size; i > 0; i--)
|
---|
453 | *w++ = *u++;
|
---|
454 | }
|
---|
455 | #endif
|
---|
456 |
|
---|
457 | if (size > PyUnicode_GET_SIZE(unicode))
|
---|
458 | return PyUnicode_GET_SIZE(unicode);
|
---|
459 | else
|
---|
460 | return size;
|
---|
461 | }
|
---|
462 |
|
---|
463 | #endif
|
---|
464 |
|
---|
465 | PyObject *PyUnicode_FromOrdinal(int ordinal)
|
---|
466 | {
|
---|
467 | Py_UNICODE s[1];
|
---|
468 |
|
---|
469 | #ifdef Py_UNICODE_WIDE
|
---|
470 | if (ordinal < 0 || ordinal > 0x10ffff) {
|
---|
471 | PyErr_SetString(PyExc_ValueError,
|
---|
472 | "unichr() arg not in range(0x110000) "
|
---|
473 | "(wide Python build)");
|
---|
474 | return NULL;
|
---|
475 | }
|
---|
476 | #else
|
---|
477 | if (ordinal < 0 || ordinal > 0xffff) {
|
---|
478 | PyErr_SetString(PyExc_ValueError,
|
---|
479 | "unichr() arg not in range(0x10000) "
|
---|
480 | "(narrow Python build)");
|
---|
481 | return NULL;
|
---|
482 | }
|
---|
483 | #endif
|
---|
484 |
|
---|
485 | s[0] = (Py_UNICODE)ordinal;
|
---|
486 | return PyUnicode_FromUnicode(s, 1);
|
---|
487 | }
|
---|
488 |
|
---|
489 | PyObject *PyUnicode_FromObject(register PyObject *obj)
|
---|
490 | {
|
---|
491 | /* XXX Perhaps we should make this API an alias of
|
---|
492 | PyObject_Unicode() instead ?! */
|
---|
493 | if (PyUnicode_CheckExact(obj)) {
|
---|
494 | Py_INCREF(obj);
|
---|
495 | return obj;
|
---|
496 | }
|
---|
497 | if (PyUnicode_Check(obj)) {
|
---|
498 | /* For a Unicode subtype that's not a Unicode object,
|
---|
499 | return a true Unicode object with the same data. */
|
---|
500 | return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
|
---|
501 | PyUnicode_GET_SIZE(obj));
|
---|
502 | }
|
---|
503 | return PyUnicode_FromEncodedObject(obj, NULL, "strict");
|
---|
504 | }
|
---|
505 |
|
---|
506 | PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
|
---|
507 | const char *encoding,
|
---|
508 | const char *errors)
|
---|
509 | {
|
---|
510 | const char *s = NULL;
|
---|
511 | Py_ssize_t len;
|
---|
512 | PyObject *v;
|
---|
513 |
|
---|
514 | if (obj == NULL) {
|
---|
515 | PyErr_BadInternalCall();
|
---|
516 | return NULL;
|
---|
517 | }
|
---|
518 |
|
---|
519 | #if 0
|
---|
520 | /* For b/w compatibility we also accept Unicode objects provided
|
---|
521 | that no encodings is given and then redirect to
|
---|
522 | PyObject_Unicode() which then applies the additional logic for
|
---|
523 | Unicode subclasses.
|
---|
524 |
|
---|
525 | NOTE: This API should really only be used for object which
|
---|
526 | represent *encoded* Unicode !
|
---|
527 |
|
---|
528 | */
|
---|
529 | if (PyUnicode_Check(obj)) {
|
---|
530 | if (encoding) {
|
---|
531 | PyErr_SetString(PyExc_TypeError,
|
---|
532 | "decoding Unicode is not supported");
|
---|
533 | return NULL;
|
---|
534 | }
|
---|
535 | return PyObject_Unicode(obj);
|
---|
536 | }
|
---|
537 | #else
|
---|
538 | if (PyUnicode_Check(obj)) {
|
---|
539 | PyErr_SetString(PyExc_TypeError,
|
---|
540 | "decoding Unicode is not supported");
|
---|
541 | return NULL;
|
---|
542 | }
|
---|
543 | #endif
|
---|
544 |
|
---|
545 | /* Coerce object */
|
---|
546 | if (PyString_Check(obj)) {
|
---|
547 | s = PyString_AS_STRING(obj);
|
---|
548 | len = PyString_GET_SIZE(obj);
|
---|
549 | }
|
---|
550 | else if (PyObject_AsCharBuffer(obj, &s, &len)) {
|
---|
551 | /* Overwrite the error message with something more useful in
|
---|
552 | case of a TypeError. */
|
---|
553 | if (PyErr_ExceptionMatches(PyExc_TypeError))
|
---|
554 | PyErr_Format(PyExc_TypeError,
|
---|
555 | "coercing to Unicode: need string or buffer, "
|
---|
556 | "%.80s found",
|
---|
557 | obj->ob_type->tp_name);
|
---|
558 | goto onError;
|
---|
559 | }
|
---|
560 |
|
---|
561 | /* Convert to Unicode */
|
---|
562 | if (len == 0) {
|
---|
563 | Py_INCREF(unicode_empty);
|
---|
564 | v = (PyObject *)unicode_empty;
|
---|
565 | }
|
---|
566 | else
|
---|
567 | v = PyUnicode_Decode(s, len, encoding, errors);
|
---|
568 |
|
---|
569 | return v;
|
---|
570 |
|
---|
571 | onError:
|
---|
572 | return NULL;
|
---|
573 | }
|
---|
574 |
|
---|
575 | PyObject *PyUnicode_Decode(const char *s,
|
---|
576 | Py_ssize_t size,
|
---|
577 | const char *encoding,
|
---|
578 | const char *errors)
|
---|
579 | {
|
---|
580 | PyObject *buffer = NULL, *unicode;
|
---|
581 |
|
---|
582 | if (encoding == NULL)
|
---|
583 | encoding = PyUnicode_GetDefaultEncoding();
|
---|
584 |
|
---|
585 | /* Shortcuts for common default encodings */
|
---|
586 | if (strcmp(encoding, "utf-8") == 0)
|
---|
587 | return PyUnicode_DecodeUTF8(s, size, errors);
|
---|
588 | else if (strcmp(encoding, "latin-1") == 0)
|
---|
589 | return PyUnicode_DecodeLatin1(s, size, errors);
|
---|
590 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
|
---|
591 | else if (strcmp(encoding, "mbcs") == 0)
|
---|
592 | return PyUnicode_DecodeMBCS(s, size, errors);
|
---|
593 | #endif
|
---|
594 | else if (strcmp(encoding, "ascii") == 0)
|
---|
595 | return PyUnicode_DecodeASCII(s, size, errors);
|
---|
596 |
|
---|
597 | /* Decode via the codec registry */
|
---|
598 | buffer = PyBuffer_FromMemory((void *)s, size);
|
---|
599 | if (buffer == NULL)
|
---|
600 | goto onError;
|
---|
601 | unicode = PyCodec_Decode(buffer, encoding, errors);
|
---|
602 | if (unicode == NULL)
|
---|
603 | goto onError;
|
---|
604 | if (!PyUnicode_Check(unicode)) {
|
---|
605 | PyErr_Format(PyExc_TypeError,
|
---|
606 | "decoder did not return an unicode object (type=%.400s)",
|
---|
607 | unicode->ob_type->tp_name);
|
---|
608 | Py_DECREF(unicode);
|
---|
609 | goto onError;
|
---|
610 | }
|
---|
611 | Py_DECREF(buffer);
|
---|
612 | return unicode;
|
---|
613 |
|
---|
614 | onError:
|
---|
615 | Py_XDECREF(buffer);
|
---|
616 | return NULL;
|
---|
617 | }
|
---|
618 |
|
---|
619 | PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
|
---|
620 | const char *encoding,
|
---|
621 | const char *errors)
|
---|
622 | {
|
---|
623 | PyObject *v;
|
---|
624 |
|
---|
625 | if (!PyUnicode_Check(unicode)) {
|
---|
626 | PyErr_BadArgument();
|
---|
627 | goto onError;
|
---|
628 | }
|
---|
629 |
|
---|
630 | if (encoding == NULL)
|
---|
631 | encoding = PyUnicode_GetDefaultEncoding();
|
---|
632 |
|
---|
633 | /* Decode via the codec registry */
|
---|
634 | v = PyCodec_Decode(unicode, encoding, errors);
|
---|
635 | if (v == NULL)
|
---|
636 | goto onError;
|
---|
637 | return v;
|
---|
638 |
|
---|
639 | onError:
|
---|
640 | return NULL;
|
---|
641 | }
|
---|
642 |
|
---|
643 | PyObject *PyUnicode_Encode(const Py_UNICODE *s,
|
---|
644 | Py_ssize_t size,
|
---|
645 | const char *encoding,
|
---|
646 | const char *errors)
|
---|
647 | {
|
---|
648 | PyObject *v, *unicode;
|
---|
649 |
|
---|
650 | unicode = PyUnicode_FromUnicode(s, size);
|
---|
651 | if (unicode == NULL)
|
---|
652 | return NULL;
|
---|
653 | v = PyUnicode_AsEncodedString(unicode, encoding, errors);
|
---|
654 | Py_DECREF(unicode);
|
---|
655 | return v;
|
---|
656 | }
|
---|
657 |
|
---|
658 | PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
|
---|
659 | const char *encoding,
|
---|
660 | const char *errors)
|
---|
661 | {
|
---|
662 | PyObject *v;
|
---|
663 |
|
---|
664 | if (!PyUnicode_Check(unicode)) {
|
---|
665 | PyErr_BadArgument();
|
---|
666 | goto onError;
|
---|
667 | }
|
---|
668 |
|
---|
669 | if (encoding == NULL)
|
---|
670 | encoding = PyUnicode_GetDefaultEncoding();
|
---|
671 |
|
---|
672 | /* Encode via the codec registry */
|
---|
673 | v = PyCodec_Encode(unicode, encoding, errors);
|
---|
674 | if (v == NULL)
|
---|
675 | goto onError;
|
---|
676 | return v;
|
---|
677 |
|
---|
678 | onError:
|
---|
679 | return NULL;
|
---|
680 | }
|
---|
681 |
|
---|
682 | PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
|
---|
683 | const char *encoding,
|
---|
684 | const char *errors)
|
---|
685 | {
|
---|
686 | PyObject *v;
|
---|
687 |
|
---|
688 | if (!PyUnicode_Check(unicode)) {
|
---|
689 | PyErr_BadArgument();
|
---|
690 | goto onError;
|
---|
691 | }
|
---|
692 |
|
---|
693 | if (encoding == NULL)
|
---|
694 | encoding = PyUnicode_GetDefaultEncoding();
|
---|
695 |
|
---|
696 | /* Shortcuts for common default encodings */
|
---|
697 | if (errors == NULL) {
|
---|
698 | if (strcmp(encoding, "utf-8") == 0)
|
---|
699 | return PyUnicode_AsUTF8String(unicode);
|
---|
700 | else if (strcmp(encoding, "latin-1") == 0)
|
---|
701 | return PyUnicode_AsLatin1String(unicode);
|
---|
702 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
|
---|
703 | else if (strcmp(encoding, "mbcs") == 0)
|
---|
704 | return PyUnicode_AsMBCSString(unicode);
|
---|
705 | #endif
|
---|
706 | else if (strcmp(encoding, "ascii") == 0)
|
---|
707 | return PyUnicode_AsASCIIString(unicode);
|
---|
708 | }
|
---|
709 |
|
---|
710 | /* Encode via the codec registry */
|
---|
711 | v = PyCodec_Encode(unicode, encoding, errors);
|
---|
712 | if (v == NULL)
|
---|
713 | goto onError;
|
---|
714 | if (!PyString_Check(v)) {
|
---|
715 | PyErr_Format(PyExc_TypeError,
|
---|
716 | "encoder did not return a string object (type=%.400s)",
|
---|
717 | v->ob_type->tp_name);
|
---|
718 | Py_DECREF(v);
|
---|
719 | goto onError;
|
---|
720 | }
|
---|
721 | return v;
|
---|
722 |
|
---|
723 | onError:
|
---|
724 | return NULL;
|
---|
725 | }
|
---|
726 |
|
---|
727 | PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
|
---|
728 | const char *errors)
|
---|
729 | {
|
---|
730 | PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
|
---|
731 |
|
---|
732 | if (v)
|
---|
733 | return v;
|
---|
734 | v = PyUnicode_AsEncodedString(unicode, NULL, errors);
|
---|
735 | if (v && errors == NULL)
|
---|
736 | ((PyUnicodeObject *)unicode)->defenc = v;
|
---|
737 | return v;
|
---|
738 | }
|
---|
739 |
|
---|
740 | Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
|
---|
741 | {
|
---|
742 | if (!PyUnicode_Check(unicode)) {
|
---|
743 | PyErr_BadArgument();
|
---|
744 | goto onError;
|
---|
745 | }
|
---|
746 | return PyUnicode_AS_UNICODE(unicode);
|
---|
747 |
|
---|
748 | onError:
|
---|
749 | return NULL;
|
---|
750 | }
|
---|
751 |
|
---|
752 | Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
|
---|
753 | {
|
---|
754 | if (!PyUnicode_Check(unicode)) {
|
---|
755 | PyErr_BadArgument();
|
---|
756 | goto onError;
|
---|
757 | }
|
---|
758 | return PyUnicode_GET_SIZE(unicode);
|
---|
759 |
|
---|
760 | onError:
|
---|
761 | return -1;
|
---|
762 | }
|
---|
763 |
|
---|
764 | const char *PyUnicode_GetDefaultEncoding(void)
|
---|
765 | {
|
---|
766 | return unicode_default_encoding;
|
---|
767 | }
|
---|
768 |
|
---|
769 | int PyUnicode_SetDefaultEncoding(const char *encoding)
|
---|
770 | {
|
---|
771 | PyObject *v;
|
---|
772 |
|
---|
773 | /* Make sure the encoding is valid. As side effect, this also
|
---|
774 | loads the encoding into the codec registry cache. */
|
---|
775 | v = _PyCodec_Lookup(encoding);
|
---|
776 | if (v == NULL)
|
---|
777 | goto onError;
|
---|
778 | Py_DECREF(v);
|
---|
779 | strncpy(unicode_default_encoding,
|
---|
780 | encoding,
|
---|
781 | sizeof(unicode_default_encoding));
|
---|
782 | return 0;
|
---|
783 |
|
---|
784 | onError:
|
---|
785 | return -1;
|
---|
786 | }
|
---|
787 |
|
---|
788 | /* error handling callback helper:
|
---|
789 | build arguments, call the callback and check the arguments,
|
---|
790 | if no exception occurred, copy the replacement to the output
|
---|
791 | and adjust various state variables.
|
---|
792 | return 0 on success, -1 on error
|
---|
793 | */
|
---|
794 |
|
---|
795 | static
|
---|
796 | int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
|
---|
797 | const char *encoding, const char *reason,
|
---|
798 | const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
|
---|
799 | PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
|
---|
800 | {
|
---|
801 | static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
|
---|
802 |
|
---|
803 | PyObject *restuple = NULL;
|
---|
804 | PyObject *repunicode = NULL;
|
---|
805 | Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
|
---|
806 | Py_ssize_t requiredsize;
|
---|
807 | Py_ssize_t newpos;
|
---|
808 | Py_UNICODE *repptr;
|
---|
809 | Py_ssize_t repsize;
|
---|
810 | int res = -1;
|
---|
811 |
|
---|
812 | if (*errorHandler == NULL) {
|
---|
813 | *errorHandler = PyCodec_LookupError(errors);
|
---|
814 | if (*errorHandler == NULL)
|
---|
815 | goto onError;
|
---|
816 | }
|
---|
817 |
|
---|
818 | if (*exceptionObject == NULL) {
|
---|
819 | *exceptionObject = PyUnicodeDecodeError_Create(
|
---|
820 | encoding, input, insize, *startinpos, *endinpos, reason);
|
---|
821 | if (*exceptionObject == NULL)
|
---|
822 | goto onError;
|
---|
823 | }
|
---|
824 | else {
|
---|
825 | if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
|
---|
826 | goto onError;
|
---|
827 | if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
|
---|
828 | goto onError;
|
---|
829 | if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
|
---|
830 | goto onError;
|
---|
831 | }
|
---|
832 |
|
---|
833 | restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
|
---|
834 | if (restuple == NULL)
|
---|
835 | goto onError;
|
---|
836 | if (!PyTuple_Check(restuple)) {
|
---|
837 | PyErr_Format(PyExc_TypeError, &argparse[4]);
|
---|
838 | goto onError;
|
---|
839 | }
|
---|
840 | if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
|
---|
841 | goto onError;
|
---|
842 | if (newpos<0)
|
---|
843 | newpos = insize+newpos;
|
---|
844 | if (newpos<0 || newpos>insize) {
|
---|
845 | PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
|
---|
846 | goto onError;
|
---|
847 | }
|
---|
848 |
|
---|
849 | /* need more space? (at least enough for what we
|
---|
850 | have+the replacement+the rest of the string (starting
|
---|
851 | at the new input position), so we won't have to check space
|
---|
852 | when there are no errors in the rest of the string) */
|
---|
853 | repptr = PyUnicode_AS_UNICODE(repunicode);
|
---|
854 | repsize = PyUnicode_GET_SIZE(repunicode);
|
---|
855 | requiredsize = *outpos + repsize + insize-newpos;
|
---|
856 | if (requiredsize > outsize) {
|
---|
857 | if (requiredsize<2*outsize)
|
---|
858 | requiredsize = 2*outsize;
|
---|
859 | if (PyUnicode_Resize(output, requiredsize) < 0)
|
---|
860 | goto onError;
|
---|
861 | *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
|
---|
862 | }
|
---|
863 | *endinpos = newpos;
|
---|
864 | *inptr = input + newpos;
|
---|
865 | Py_UNICODE_COPY(*outptr, repptr, repsize);
|
---|
866 | *outptr += repsize;
|
---|
867 | *outpos += repsize;
|
---|
868 | /* we made it! */
|
---|
869 | res = 0;
|
---|
870 |
|
---|
871 | onError:
|
---|
872 | Py_XDECREF(restuple);
|
---|
873 | return res;
|
---|
874 | }
|
---|
875 |
|
---|
876 | /* --- UTF-7 Codec -------------------------------------------------------- */
|
---|
877 |
|
---|
878 | /* see RFC2152 for details */
|
---|
879 |
|
---|
/* Per-character classification of the 128 ASCII code points, consulted by
   the UTF-7 codec (see RFC 2152) to decide whether a character may be
   written directly or has to go into a base64 shift sequence:

       0 - not special
       1 - special, i.e. cannot be directly encoded
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is indexed by the character code; one row per 16 codes. */
static
char utf7_special[128] = {
    /* 0x00-0x0f: controls; TAB (9), LF (10) and CR (13) are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
|
---|
898 |
|
---|
/* Helper macros of the UTF-7 codec.  They are only meant for the UTF-7
   decoder and encoder that follow and are #undef'ed after them. */

/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly.  Codes above 127 and code 0 are always special; for the rest
   the utf7_special table decides, with class 2 (whitespace) and class 3
   (RFC2152 Set O) only counting as special when the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc warnings
   about the comparison always being false; since utf7_special[0] is 1,
   we can safely make that one comparison true. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is one of the 64 base64 digits.  The ranges are
   spelled out instead of calling isalnum(): the argument may be a
   Py_UNICODE value outside the unsigned char range (undefined behaviour
   for the <ctype.h> functions), and isalnum() is locale dependent, so it
   could accept non-ASCII letters for which UB64() returns garbage. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in the bit
   buffer ch, write the topmost six as a base64 digit to *out++ and
   decrease bits accordingly.  out and bits are modified in place. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending in
   the bit buffer ch, extract the topmost 16 as one code unit.

   - If surrogate is set, an error has already been generated for the
     first half of a pair, so this unit is dropped without checking it and
     the flag is cleared.
   - A unit in 0xDC00..0xDFFF is treated as part of a surrogate pair,
     which cannot be represented in a 16-bit character: the flag is set
     and control jumps to the utf7Error label with errmsg filled in.
   - Any other unit is stored to *out++.

   The macro relies on the expanding function to provide the errmsg
   variable and the utf7Error label. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
|
---|
941 |
|
---|
942 | PyObject *PyUnicode_DecodeUTF7(const char *s,
|
---|
943 | Py_ssize_t size,
|
---|
944 | const char *errors)
|
---|
945 | {
|
---|
946 | const char *starts = s;
|
---|
947 | Py_ssize_t startinpos;
|
---|
948 | Py_ssize_t endinpos;
|
---|
949 | Py_ssize_t outpos;
|
---|
950 | const char *e;
|
---|
951 | PyUnicodeObject *unicode;
|
---|
952 | Py_UNICODE *p;
|
---|
953 | const char *errmsg = "";
|
---|
954 | int inShift = 0;
|
---|
955 | unsigned int bitsleft = 0;
|
---|
956 | unsigned long charsleft = 0;
|
---|
957 | int surrogate = 0;
|
---|
958 | PyObject *errorHandler = NULL;
|
---|
959 | PyObject *exc = NULL;
|
---|
960 |
|
---|
961 | unicode = _PyUnicode_New(size);
|
---|
962 | if (!unicode)
|
---|
963 | return NULL;
|
---|
964 | if (size == 0)
|
---|
965 | return (PyObject *)unicode;
|
---|
966 |
|
---|
967 | p = unicode->str;
|
---|
968 | e = s + size;
|
---|
969 |
|
---|
970 | while (s < e) {
|
---|
971 | Py_UNICODE ch;
|
---|
972 | restart:
|
---|
973 | ch = *s;
|
---|
974 |
|
---|
975 | if (inShift) {
|
---|
976 | if ((ch == '-') || !B64CHAR(ch)) {
|
---|
977 | inShift = 0;
|
---|
978 | s++;
|
---|
979 |
|
---|
980 | /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
|
---|
981 | if (bitsleft >= 6) {
|
---|
982 | /* The shift sequence has a partial character in it. If
|
---|
983 | bitsleft < 6 then we could just classify it as padding
|
---|
984 | but that is not the case here */
|
---|
985 |
|
---|
986 | errmsg = "partial character in shift sequence";
|
---|
987 | goto utf7Error;
|
---|
988 | }
|
---|
989 | /* According to RFC2152 the remaining bits should be zero. We
|
---|
990 | choose to signal an error/insert a replacement character
|
---|
991 | here so indicate the potential of a misencoded character. */
|
---|
992 |
|
---|
993 | /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
|
---|
994 | if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
|
---|
995 | errmsg = "non-zero padding bits in shift sequence";
|
---|
996 | goto utf7Error;
|
---|
997 | }
|
---|
998 |
|
---|
999 | if (ch == '-') {
|
---|
1000 | if ((s < e) && (*(s) == '-')) {
|
---|
1001 | *p++ = '-';
|
---|
1002 | inShift = 1;
|
---|
1003 | }
|
---|
1004 | } else if (SPECIAL(ch,0,0)) {
|
---|
1005 | errmsg = "unexpected special character";
|
---|
1006 | goto utf7Error;
|
---|
1007 | } else {
|
---|
1008 | *p++ = ch;
|
---|
1009 | }
|
---|
1010 | } else {
|
---|
1011 | charsleft = (charsleft << 6) | UB64(ch);
|
---|
1012 | bitsleft += 6;
|
---|
1013 | s++;
|
---|
1014 | /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
|
---|
1015 | }
|
---|
1016 | }
|
---|
1017 | else if ( ch == '+' ) {
|
---|
1018 | startinpos = s-starts;
|
---|
1019 | s++;
|
---|
1020 | if (s < e && *s == '-') {
|
---|
1021 | s++;
|
---|
1022 | *p++ = '+';
|
---|
1023 | } else
|
---|
1024 | {
|
---|
1025 | inShift = 1;
|
---|
1026 | bitsleft = 0;
|
---|
1027 | }
|
---|
1028 | }
|
---|
1029 | else if (SPECIAL(ch,0,0)) {
|
---|
1030 | errmsg = "unexpected special character";
|
---|
1031 | s++;
|
---|
1032 | goto utf7Error;
|
---|
1033 | }
|
---|
1034 | else {
|
---|
1035 | *p++ = ch;
|
---|
1036 | s++;
|
---|
1037 | }
|
---|
1038 | continue;
|
---|
1039 | utf7Error:
|
---|
1040 | outpos = p-PyUnicode_AS_UNICODE(unicode);
|
---|
1041 | endinpos = s-starts;
|
---|
1042 | if (unicode_decode_call_errorhandler(
|
---|
1043 | errors, &errorHandler,
|
---|
1044 | "utf7", errmsg,
|
---|
1045 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1046 | (PyObject **)&unicode, &outpos, &p))
|
---|
1047 | goto onError;
|
---|
1048 | }
|
---|
1049 |
|
---|
1050 | if (inShift) {
|
---|
1051 | outpos = p-PyUnicode_AS_UNICODE(unicode);
|
---|
1052 | endinpos = size;
|
---|
1053 | if (unicode_decode_call_errorhandler(
|
---|
1054 | errors, &errorHandler,
|
---|
1055 | "utf7", "unterminated shift sequence",
|
---|
1056 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1057 | (PyObject **)&unicode, &outpos, &p))
|
---|
1058 | goto onError;
|
---|
1059 | if (s < e)
|
---|
1060 | goto restart;
|
---|
1061 | }
|
---|
1062 |
|
---|
1063 | if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
|
---|
1064 | goto onError;
|
---|
1065 |
|
---|
1066 | Py_XDECREF(errorHandler);
|
---|
1067 | Py_XDECREF(exc);
|
---|
1068 | return (PyObject *)unicode;
|
---|
1069 |
|
---|
1070 | onError:
|
---|
1071 | Py_XDECREF(errorHandler);
|
---|
1072 | Py_XDECREF(exc);
|
---|
1073 | Py_DECREF(unicode);
|
---|
1074 | return NULL;
|
---|
1075 | }
|
---|
1076 |
|
---|
1077 |
|
---|
1078 | PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
|
---|
1079 | Py_ssize_t size,
|
---|
1080 | int encodeSetO,
|
---|
1081 | int encodeWhiteSpace,
|
---|
1082 | const char *errors)
|
---|
1083 | {
|
---|
1084 | PyObject *v;
|
---|
1085 | /* It might be possible to tighten this worst case */
|
---|
1086 | Py_ssize_t cbAllocated = 5 * size;
|
---|
1087 | int inShift = 0;
|
---|
1088 | Py_ssize_t i = 0;
|
---|
1089 | unsigned int bitsleft = 0;
|
---|
1090 | unsigned long charsleft = 0;
|
---|
1091 | char * out;
|
---|
1092 | char * start;
|
---|
1093 |
|
---|
1094 | if (size == 0)
|
---|
1095 | return PyString_FromStringAndSize(NULL, 0);
|
---|
1096 |
|
---|
1097 | v = PyString_FromStringAndSize(NULL, cbAllocated);
|
---|
1098 | if (v == NULL)
|
---|
1099 | return NULL;
|
---|
1100 |
|
---|
1101 | start = out = PyString_AS_STRING(v);
|
---|
1102 | for (;i < size; ++i) {
|
---|
1103 | Py_UNICODE ch = s[i];
|
---|
1104 |
|
---|
1105 | if (!inShift) {
|
---|
1106 | if (ch == '+') {
|
---|
1107 | *out++ = '+';
|
---|
1108 | *out++ = '-';
|
---|
1109 | } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
|
---|
1110 | charsleft = ch;
|
---|
1111 | bitsleft = 16;
|
---|
1112 | *out++ = '+';
|
---|
1113 | /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
|
---|
1114 | inShift = bitsleft > 0;
|
---|
1115 | } else {
|
---|
1116 | *out++ = (char) ch;
|
---|
1117 | }
|
---|
1118 | } else {
|
---|
1119 | if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
|
---|
1120 | *out++ = B64(charsleft << (6-bitsleft));
|
---|
1121 | charsleft = 0;
|
---|
1122 | bitsleft = 0;
|
---|
1123 | /* Characters not in the BASE64 set implicitly unshift the sequence
|
---|
1124 | so no '-' is required, except if the character is itself a '-' */
|
---|
1125 | if (B64CHAR(ch) || ch == '-') {
|
---|
1126 | *out++ = '-';
|
---|
1127 | }
|
---|
1128 | inShift = 0;
|
---|
1129 | *out++ = (char) ch;
|
---|
1130 | } else {
|
---|
1131 | bitsleft += 16;
|
---|
1132 | charsleft = (charsleft << 16) | ch;
|
---|
1133 | /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
|
---|
1134 |
|
---|
1135 | /* If the next character is special then we dont' need to terminate
|
---|
1136 | the shift sequence. If the next character is not a BASE64 character
|
---|
1137 | or '-' then the shift sequence will be terminated implicitly and we
|
---|
1138 | don't have to insert a '-'. */
|
---|
1139 |
|
---|
1140 | if (bitsleft == 0) {
|
---|
1141 | if (i + 1 < size) {
|
---|
1142 | Py_UNICODE ch2 = s[i+1];
|
---|
1143 |
|
---|
1144 | if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
|
---|
1145 |
|
---|
1146 | } else if (B64CHAR(ch2) || ch2 == '-') {
|
---|
1147 | *out++ = '-';
|
---|
1148 | inShift = 0;
|
---|
1149 | } else {
|
---|
1150 | inShift = 0;
|
---|
1151 | }
|
---|
1152 |
|
---|
1153 | }
|
---|
1154 | else {
|
---|
1155 | *out++ = '-';
|
---|
1156 | inShift = 0;
|
---|
1157 | }
|
---|
1158 | }
|
---|
1159 | }
|
---|
1160 | }
|
---|
1161 | }
|
---|
1162 | if (bitsleft) {
|
---|
1163 | *out++= B64(charsleft << (6-bitsleft) );
|
---|
1164 | *out++ = '-';
|
---|
1165 | }
|
---|
1166 |
|
---|
1167 | _PyString_Resize(&v, out - start);
|
---|
1168 | return v;
|
---|
1169 | }
|
---|
1170 |
|
---|
1171 | #undef SPECIAL
|
---|
1172 | #undef B64
|
---|
1173 | #undef B64CHAR
|
---|
1174 | #undef UB64
|
---|
1175 | #undef ENCODE
|
---|
1176 | #undef DECODE
|
---|
1177 |
|
---|
1178 | /* --- UTF-8 Codec -------------------------------------------------------- */
|
---|
1179 |
|
---|
/* Maps the first byte of a UTF-8 sequence to the total length of that
   sequence in bytes.  Zero means the byte is an illegal prefix.  See
   RFC 2279 for details.  One row per 16 byte values. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xbf: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0-0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0-0xf7: four, 0xf8-0xfb: five, 0xfc-0xfd: six, 0xfe-0xff: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
|
---|
1201 |
|
---|
1202 | PyObject *PyUnicode_DecodeUTF8(const char *s,
|
---|
1203 | Py_ssize_t size,
|
---|
1204 | const char *errors)
|
---|
1205 | {
|
---|
1206 | return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
---|
1207 | }
|
---|
1208 |
|
---|
1209 | PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
|
---|
1210 | Py_ssize_t size,
|
---|
1211 | const char *errors,
|
---|
1212 | Py_ssize_t *consumed)
|
---|
1213 | {
|
---|
1214 | const char *starts = s;
|
---|
1215 | int n;
|
---|
1216 | Py_ssize_t startinpos;
|
---|
1217 | Py_ssize_t endinpos;
|
---|
1218 | Py_ssize_t outpos;
|
---|
1219 | const char *e;
|
---|
1220 | PyUnicodeObject *unicode;
|
---|
1221 | Py_UNICODE *p;
|
---|
1222 | const char *errmsg = "";
|
---|
1223 | PyObject *errorHandler = NULL;
|
---|
1224 | PyObject *exc = NULL;
|
---|
1225 |
|
---|
1226 | /* Note: size will always be longer than the resulting Unicode
|
---|
1227 | character count */
|
---|
1228 | unicode = _PyUnicode_New(size);
|
---|
1229 | if (!unicode)
|
---|
1230 | return NULL;
|
---|
1231 | if (size == 0) {
|
---|
1232 | if (consumed)
|
---|
1233 | *consumed = 0;
|
---|
1234 | return (PyObject *)unicode;
|
---|
1235 | }
|
---|
1236 |
|
---|
1237 | /* Unpack UTF-8 encoded data */
|
---|
1238 | p = unicode->str;
|
---|
1239 | e = s + size;
|
---|
1240 |
|
---|
1241 | while (s < e) {
|
---|
1242 | Py_UCS4 ch = (unsigned char)*s;
|
---|
1243 |
|
---|
1244 | if (ch < 0x80) {
|
---|
1245 | *p++ = (Py_UNICODE)ch;
|
---|
1246 | s++;
|
---|
1247 | continue;
|
---|
1248 | }
|
---|
1249 |
|
---|
1250 | n = utf8_code_length[ch];
|
---|
1251 |
|
---|
1252 | if (s + n > e) {
|
---|
1253 | if (consumed)
|
---|
1254 | break;
|
---|
1255 | else {
|
---|
1256 | errmsg = "unexpected end of data";
|
---|
1257 | startinpos = s-starts;
|
---|
1258 | endinpos = size;
|
---|
1259 | goto utf8Error;
|
---|
1260 | }
|
---|
1261 | }
|
---|
1262 |
|
---|
1263 | switch (n) {
|
---|
1264 |
|
---|
1265 | case 0:
|
---|
1266 | errmsg = "unexpected code byte";
|
---|
1267 | startinpos = s-starts;
|
---|
1268 | endinpos = startinpos+1;
|
---|
1269 | goto utf8Error;
|
---|
1270 |
|
---|
1271 | case 1:
|
---|
1272 | errmsg = "internal error";
|
---|
1273 | startinpos = s-starts;
|
---|
1274 | endinpos = startinpos+1;
|
---|
1275 | goto utf8Error;
|
---|
1276 |
|
---|
1277 | case 2:
|
---|
1278 | if ((s[1] & 0xc0) != 0x80) {
|
---|
1279 | errmsg = "invalid data";
|
---|
1280 | startinpos = s-starts;
|
---|
1281 | endinpos = startinpos+2;
|
---|
1282 | goto utf8Error;
|
---|
1283 | }
|
---|
1284 | ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
|
---|
1285 | if (ch < 0x80) {
|
---|
1286 | startinpos = s-starts;
|
---|
1287 | endinpos = startinpos+2;
|
---|
1288 | errmsg = "illegal encoding";
|
---|
1289 | goto utf8Error;
|
---|
1290 | }
|
---|
1291 | else
|
---|
1292 | *p++ = (Py_UNICODE)ch;
|
---|
1293 | break;
|
---|
1294 |
|
---|
1295 | case 3:
|
---|
1296 | if ((s[1] & 0xc0) != 0x80 ||
|
---|
1297 | (s[2] & 0xc0) != 0x80) {
|
---|
1298 | errmsg = "invalid data";
|
---|
1299 | startinpos = s-starts;
|
---|
1300 | endinpos = startinpos+3;
|
---|
1301 | goto utf8Error;
|
---|
1302 | }
|
---|
1303 | ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
---|
1304 | if (ch < 0x0800) {
|
---|
1305 | /* Note: UTF-8 encodings of surrogates are considered
|
---|
1306 | legal UTF-8 sequences;
|
---|
1307 |
|
---|
1308 | XXX For wide builds (UCS-4) we should probably try
|
---|
1309 | to recombine the surrogates into a single code
|
---|
1310 | unit.
|
---|
1311 | */
|
---|
1312 | errmsg = "illegal encoding";
|
---|
1313 | startinpos = s-starts;
|
---|
1314 | endinpos = startinpos+3;
|
---|
1315 | goto utf8Error;
|
---|
1316 | }
|
---|
1317 | else
|
---|
1318 | *p++ = (Py_UNICODE)ch;
|
---|
1319 | break;
|
---|
1320 |
|
---|
1321 | case 4:
|
---|
1322 | if ((s[1] & 0xc0) != 0x80 ||
|
---|
1323 | (s[2] & 0xc0) != 0x80 ||
|
---|
1324 | (s[3] & 0xc0) != 0x80) {
|
---|
1325 | errmsg = "invalid data";
|
---|
1326 | startinpos = s-starts;
|
---|
1327 | endinpos = startinpos+4;
|
---|
1328 | goto utf8Error;
|
---|
1329 | }
|
---|
1330 | ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
---|
1331 | ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
---|
1332 | /* validate and convert to UTF-16 */
|
---|
1333 | if ((ch < 0x10000) /* minimum value allowed for 4
|
---|
1334 | byte encoding */
|
---|
1335 | || (ch > 0x10ffff)) /* maximum value allowed for
|
---|
1336 | UTF-16 */
|
---|
1337 | {
|
---|
1338 | errmsg = "illegal encoding";
|
---|
1339 | startinpos = s-starts;
|
---|
1340 | endinpos = startinpos+4;
|
---|
1341 | goto utf8Error;
|
---|
1342 | }
|
---|
1343 | #ifdef Py_UNICODE_WIDE
|
---|
1344 | *p++ = (Py_UNICODE)ch;
|
---|
1345 | #else
|
---|
1346 | /* compute and append the two surrogates: */
|
---|
1347 |
|
---|
1348 | /* translate from 10000..10FFFF to 0..FFFF */
|
---|
1349 | ch -= 0x10000;
|
---|
1350 |
|
---|
1351 | /* high surrogate = top 10 bits added to D800 */
|
---|
1352 | *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
|
---|
1353 |
|
---|
1354 | /* low surrogate = bottom 10 bits added to DC00 */
|
---|
1355 | *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
|
---|
1356 | #endif
|
---|
1357 | break;
|
---|
1358 |
|
---|
1359 | default:
|
---|
1360 | /* Other sizes are only needed for UCS-4 */
|
---|
1361 | errmsg = "unsupported Unicode code range";
|
---|
1362 | startinpos = s-starts;
|
---|
1363 | endinpos = startinpos+n;
|
---|
1364 | goto utf8Error;
|
---|
1365 | }
|
---|
1366 | s += n;
|
---|
1367 | continue;
|
---|
1368 |
|
---|
1369 | utf8Error:
|
---|
1370 | outpos = p-PyUnicode_AS_UNICODE(unicode);
|
---|
1371 | if (unicode_decode_call_errorhandler(
|
---|
1372 | errors, &errorHandler,
|
---|
1373 | "utf8", errmsg,
|
---|
1374 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1375 | (PyObject **)&unicode, &outpos, &p))
|
---|
1376 | goto onError;
|
---|
1377 | }
|
---|
1378 | if (consumed)
|
---|
1379 | *consumed = s-starts;
|
---|
1380 |
|
---|
1381 | /* Adjust length */
|
---|
1382 | if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
---|
1383 | goto onError;
|
---|
1384 |
|
---|
1385 | Py_XDECREF(errorHandler);
|
---|
1386 | Py_XDECREF(exc);
|
---|
1387 | return (PyObject *)unicode;
|
---|
1388 |
|
---|
1389 | onError:
|
---|
1390 | Py_XDECREF(errorHandler);
|
---|
1391 | Py_XDECREF(exc);
|
---|
1392 | Py_DECREF(unicode);
|
---|
1393 | return NULL;
|
---|
1394 | }
|
---|
1395 |
|
---|
1396 | /* Allocation strategy: if the string is short, convert into a stack buffer
|
---|
1397 | and allocate exactly as much space needed at the end. Else allocate the
|
---|
1398 | maximum possible needed (4 result bytes per Unicode character), and return
|
---|
1399 | the excess memory at the end.
|
---|
1400 | */
|
---|
1401 | PyObject *
|
---|
1402 | PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
---|
1403 | Py_ssize_t size,
|
---|
1404 | const char *errors)
|
---|
1405 | {
|
---|
1406 | #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
|
---|
1407 |
|
---|
1408 | Py_ssize_t i; /* index into s of next input byte */
|
---|
1409 | PyObject *v; /* result string object */
|
---|
1410 | char *p; /* next free byte in output buffer */
|
---|
1411 | Py_ssize_t nallocated; /* number of result bytes allocated */
|
---|
1412 | Py_ssize_t nneeded; /* number of result bytes needed */
|
---|
1413 | char stackbuf[MAX_SHORT_UNICHARS * 4];
|
---|
1414 |
|
---|
1415 | assert(s != NULL);
|
---|
1416 | assert(size >= 0);
|
---|
1417 |
|
---|
1418 | if (size <= MAX_SHORT_UNICHARS) {
|
---|
1419 | /* Write into the stack buffer; nallocated can't overflow.
|
---|
1420 | * At the end, we'll allocate exactly as much heap space as it
|
---|
1421 | * turns out we need.
|
---|
1422 | */
|
---|
1423 | nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
|
---|
1424 | v = NULL; /* will allocate after we're done */
|
---|
1425 | p = stackbuf;
|
---|
1426 | }
|
---|
1427 | else {
|
---|
1428 | /* Overallocate on the heap, and give the excess back at the end. */
|
---|
1429 | nallocated = size * 4;
|
---|
1430 | if (nallocated / 4 != size) /* overflow! */
|
---|
1431 | return PyErr_NoMemory();
|
---|
1432 | v = PyString_FromStringAndSize(NULL, nallocated);
|
---|
1433 | if (v == NULL)
|
---|
1434 | return NULL;
|
---|
1435 | p = PyString_AS_STRING(v);
|
---|
1436 | }
|
---|
1437 |
|
---|
1438 | for (i = 0; i < size;) {
|
---|
1439 | Py_UCS4 ch = s[i++];
|
---|
1440 |
|
---|
1441 | if (ch < 0x80)
|
---|
1442 | /* Encode ASCII */
|
---|
1443 | *p++ = (char) ch;
|
---|
1444 |
|
---|
1445 | else if (ch < 0x0800) {
|
---|
1446 | /* Encode Latin-1 */
|
---|
1447 | *p++ = (char)(0xc0 | (ch >> 6));
|
---|
1448 | *p++ = (char)(0x80 | (ch & 0x3f));
|
---|
1449 | }
|
---|
1450 | else {
|
---|
1451 | /* Encode UCS2 Unicode ordinals */
|
---|
1452 | if (ch < 0x10000) {
|
---|
1453 | /* Special case: check for high surrogate */
|
---|
1454 | if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
|
---|
1455 | Py_UCS4 ch2 = s[i];
|
---|
1456 | /* Check for low surrogate and combine the two to
|
---|
1457 | form a UCS4 value */
|
---|
1458 | if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
---|
1459 | ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
|
---|
1460 | i++;
|
---|
1461 | goto encodeUCS4;
|
---|
1462 | }
|
---|
1463 | /* Fall through: handles isolated high surrogates */
|
---|
1464 | }
|
---|
1465 | *p++ = (char)(0xe0 | (ch >> 12));
|
---|
1466 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
---|
1467 | *p++ = (char)(0x80 | (ch & 0x3f));
|
---|
1468 | continue;
|
---|
1469 | }
|
---|
1470 | encodeUCS4:
|
---|
1471 | /* Encode UCS4 Unicode ordinals */
|
---|
1472 | *p++ = (char)(0xf0 | (ch >> 18));
|
---|
1473 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
---|
1474 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
---|
1475 | *p++ = (char)(0x80 | (ch & 0x3f));
|
---|
1476 | }
|
---|
1477 | }
|
---|
1478 |
|
---|
1479 | if (v == NULL) {
|
---|
1480 | /* This was stack allocated. */
|
---|
1481 | nneeded = p - stackbuf;
|
---|
1482 | assert(nneeded <= nallocated);
|
---|
1483 | v = PyString_FromStringAndSize(stackbuf, nneeded);
|
---|
1484 | }
|
---|
1485 | else {
|
---|
1486 | /* Cut back to size actually needed. */
|
---|
1487 | nneeded = p - PyString_AS_STRING(v);
|
---|
1488 | assert(nneeded <= nallocated);
|
---|
1489 | _PyString_Resize(&v, nneeded);
|
---|
1490 | }
|
---|
1491 | return v;
|
---|
1492 |
|
---|
1493 | #undef MAX_SHORT_UNICHARS
|
---|
1494 | }
|
---|
1495 |
|
---|
1496 | PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
---|
1497 | {
|
---|
1498 | if (!PyUnicode_Check(unicode)) {
|
---|
1499 | PyErr_BadArgument();
|
---|
1500 | return NULL;
|
---|
1501 | }
|
---|
1502 | return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
|
---|
1503 | PyUnicode_GET_SIZE(unicode),
|
---|
1504 | NULL);
|
---|
1505 | }
|
---|
1506 |
|
---|
1507 | /* --- UTF-16 Codec ------------------------------------------------------- */
|
---|
1508 |
|
---|
1509 | PyObject *
|
---|
1510 | PyUnicode_DecodeUTF16(const char *s,
|
---|
1511 | Py_ssize_t size,
|
---|
1512 | const char *errors,
|
---|
1513 | int *byteorder)
|
---|
1514 | {
|
---|
1515 | return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
|
---|
1516 | }
|
---|
1517 |
|
---|
1518 | PyObject *
|
---|
1519 | PyUnicode_DecodeUTF16Stateful(const char *s,
|
---|
1520 | Py_ssize_t size,
|
---|
1521 | const char *errors,
|
---|
1522 | int *byteorder,
|
---|
1523 | Py_ssize_t *consumed)
|
---|
1524 | {
|
---|
1525 | const char *starts = s;
|
---|
1526 | Py_ssize_t startinpos;
|
---|
1527 | Py_ssize_t endinpos;
|
---|
1528 | Py_ssize_t outpos;
|
---|
1529 | PyUnicodeObject *unicode;
|
---|
1530 | Py_UNICODE *p;
|
---|
1531 | const unsigned char *q, *e;
|
---|
1532 | int bo = 0; /* assume native ordering by default */
|
---|
1533 | const char *errmsg = "";
|
---|
1534 | /* Offsets from q for retrieving byte pairs in the right order. */
|
---|
1535 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
---|
1536 | int ihi = 1, ilo = 0;
|
---|
1537 | #else
|
---|
1538 | int ihi = 0, ilo = 1;
|
---|
1539 | #endif
|
---|
1540 | PyObject *errorHandler = NULL;
|
---|
1541 | PyObject *exc = NULL;
|
---|
1542 |
|
---|
1543 | /* Note: size will always be longer than the resulting Unicode
|
---|
1544 | character count */
|
---|
1545 | unicode = _PyUnicode_New(size);
|
---|
1546 | if (!unicode)
|
---|
1547 | return NULL;
|
---|
1548 | if (size == 0)
|
---|
1549 | return (PyObject *)unicode;
|
---|
1550 |
|
---|
1551 | /* Unpack UTF-16 encoded data */
|
---|
1552 | p = unicode->str;
|
---|
1553 | q = (unsigned char *)s;
|
---|
1554 | e = q + size;
|
---|
1555 |
|
---|
1556 | if (byteorder)
|
---|
1557 | bo = *byteorder;
|
---|
1558 |
|
---|
1559 | /* Check for BOM marks (U+FEFF) in the input and adjust current
|
---|
1560 | byte order setting accordingly. In native mode, the leading BOM
|
---|
1561 | mark is skipped, in all other modes, it is copied to the output
|
---|
1562 | stream as-is (giving a ZWNBSP character). */
|
---|
1563 | if (bo == 0) {
|
---|
1564 | if (size >= 2) {
|
---|
1565 | const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
|
---|
1566 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
---|
1567 | if (bom == 0xFEFF) {
|
---|
1568 | q += 2;
|
---|
1569 | bo = -1;
|
---|
1570 | }
|
---|
1571 | else if (bom == 0xFFFE) {
|
---|
1572 | q += 2;
|
---|
1573 | bo = 1;
|
---|
1574 | }
|
---|
1575 | #else
|
---|
1576 | if (bom == 0xFEFF) {
|
---|
1577 | q += 2;
|
---|
1578 | bo = 1;
|
---|
1579 | }
|
---|
1580 | else if (bom == 0xFFFE) {
|
---|
1581 | q += 2;
|
---|
1582 | bo = -1;
|
---|
1583 | }
|
---|
1584 | #endif
|
---|
1585 | }
|
---|
1586 | }
|
---|
1587 |
|
---|
1588 | if (bo == -1) {
|
---|
1589 | /* force LE */
|
---|
1590 | ihi = 1;
|
---|
1591 | ilo = 0;
|
---|
1592 | }
|
---|
1593 | else if (bo == 1) {
|
---|
1594 | /* force BE */
|
---|
1595 | ihi = 0;
|
---|
1596 | ilo = 1;
|
---|
1597 | }
|
---|
1598 |
|
---|
1599 | while (q < e) {
|
---|
1600 | Py_UNICODE ch;
|
---|
1601 | /* remaining bytes at the end? (size should be even) */
|
---|
1602 | if (e-q<2) {
|
---|
1603 | if (consumed)
|
---|
1604 | break;
|
---|
1605 | errmsg = "truncated data";
|
---|
1606 | startinpos = ((const char *)q)-starts;
|
---|
1607 | endinpos = ((const char *)e)-starts;
|
---|
1608 | goto utf16Error;
|
---|
1609 | /* The remaining input chars are ignored if the callback
|
---|
1610 | chooses to skip the input */
|
---|
1611 | }
|
---|
1612 | ch = (q[ihi] << 8) | q[ilo];
|
---|
1613 |
|
---|
1614 | q += 2;
|
---|
1615 |
|
---|
1616 | if (ch < 0xD800 || ch > 0xDFFF) {
|
---|
1617 | *p++ = ch;
|
---|
1618 | continue;
|
---|
1619 | }
|
---|
1620 |
|
---|
1621 | /* UTF-16 code pair: */
|
---|
1622 | if (q >= e) {
|
---|
1623 | errmsg = "unexpected end of data";
|
---|
1624 | startinpos = (((const char *)q)-2)-starts;
|
---|
1625 | endinpos = ((const char *)e)-starts;
|
---|
1626 | goto utf16Error;
|
---|
1627 | }
|
---|
1628 | if (0xD800 <= ch && ch <= 0xDBFF) {
|
---|
1629 | Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
|
---|
1630 | q += 2;
|
---|
1631 | if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
---|
1632 | #ifndef Py_UNICODE_WIDE
|
---|
1633 | *p++ = ch;
|
---|
1634 | *p++ = ch2;
|
---|
1635 | #else
|
---|
1636 | *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
|
---|
1637 | #endif
|
---|
1638 | continue;
|
---|
1639 | }
|
---|
1640 | else {
|
---|
1641 | errmsg = "illegal UTF-16 surrogate";
|
---|
1642 | startinpos = (((const char *)q)-4)-starts;
|
---|
1643 | endinpos = startinpos+2;
|
---|
1644 | goto utf16Error;
|
---|
1645 | }
|
---|
1646 |
|
---|
1647 | }
|
---|
1648 | errmsg = "illegal encoding";
|
---|
1649 | startinpos = (((const char *)q)-2)-starts;
|
---|
1650 | endinpos = startinpos+2;
|
---|
1651 | /* Fall through to report the error */
|
---|
1652 |
|
---|
1653 | utf16Error:
|
---|
1654 | outpos = p-PyUnicode_AS_UNICODE(unicode);
|
---|
1655 | if (unicode_decode_call_errorhandler(
|
---|
1656 | errors, &errorHandler,
|
---|
1657 | "utf16", errmsg,
|
---|
1658 | starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
|
---|
1659 | (PyObject **)&unicode, &outpos, &p))
|
---|
1660 | goto onError;
|
---|
1661 | }
|
---|
1662 |
|
---|
1663 | if (byteorder)
|
---|
1664 | *byteorder = bo;
|
---|
1665 |
|
---|
1666 | if (consumed)
|
---|
1667 | *consumed = (const char *)q-starts;
|
---|
1668 |
|
---|
1669 | /* Adjust length */
|
---|
1670 | if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
---|
1671 | goto onError;
|
---|
1672 |
|
---|
1673 | Py_XDECREF(errorHandler);
|
---|
1674 | Py_XDECREF(exc);
|
---|
1675 | return (PyObject *)unicode;
|
---|
1676 |
|
---|
1677 | onError:
|
---|
1678 | Py_DECREF(unicode);
|
---|
1679 | Py_XDECREF(errorHandler);
|
---|
1680 | Py_XDECREF(exc);
|
---|
1681 | return NULL;
|
---|
1682 | }
|
---|
1683 |
|
---|
1684 | PyObject *
|
---|
1685 | PyUnicode_EncodeUTF16(const Py_UNICODE *s,
|
---|
1686 | Py_ssize_t size,
|
---|
1687 | const char *errors,
|
---|
1688 | int byteorder)
|
---|
1689 | {
|
---|
1690 | PyObject *v;
|
---|
1691 | unsigned char *p;
|
---|
1692 | #ifdef Py_UNICODE_WIDE
|
---|
1693 | int i, pairs;
|
---|
1694 | #else
|
---|
1695 | const int pairs = 0;
|
---|
1696 | #endif
|
---|
1697 | /* Offsets from p for storing byte pairs in the right order. */
|
---|
1698 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
---|
1699 | int ihi = 1, ilo = 0;
|
---|
1700 | #else
|
---|
1701 | int ihi = 0, ilo = 1;
|
---|
1702 | #endif
|
---|
1703 |
|
---|
1704 | #define STORECHAR(CH) \
|
---|
1705 | do { \
|
---|
1706 | p[ihi] = ((CH) >> 8) & 0xff; \
|
---|
1707 | p[ilo] = (CH) & 0xff; \
|
---|
1708 | p += 2; \
|
---|
1709 | } while(0)
|
---|
1710 |
|
---|
1711 | #ifdef Py_UNICODE_WIDE
|
---|
1712 | for (i = pairs = 0; i < size; i++)
|
---|
1713 | if (s[i] >= 0x10000)
|
---|
1714 | pairs++;
|
---|
1715 | #endif
|
---|
1716 | v = PyString_FromStringAndSize(NULL,
|
---|
1717 | 2 * (size + pairs + (byteorder == 0)));
|
---|
1718 | if (v == NULL)
|
---|
1719 | return NULL;
|
---|
1720 |
|
---|
1721 | p = (unsigned char *)PyString_AS_STRING(v);
|
---|
1722 | if (byteorder == 0)
|
---|
1723 | STORECHAR(0xFEFF);
|
---|
1724 | if (size == 0)
|
---|
1725 | return v;
|
---|
1726 |
|
---|
1727 | if (byteorder == -1) {
|
---|
1728 | /* force LE */
|
---|
1729 | ihi = 1;
|
---|
1730 | ilo = 0;
|
---|
1731 | }
|
---|
1732 | else if (byteorder == 1) {
|
---|
1733 | /* force BE */
|
---|
1734 | ihi = 0;
|
---|
1735 | ilo = 1;
|
---|
1736 | }
|
---|
1737 |
|
---|
1738 | while (size-- > 0) {
|
---|
1739 | Py_UNICODE ch = *s++;
|
---|
1740 | Py_UNICODE ch2 = 0;
|
---|
1741 | #ifdef Py_UNICODE_WIDE
|
---|
1742 | if (ch >= 0x10000) {
|
---|
1743 | ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
|
---|
1744 | ch = 0xD800 | ((ch-0x10000) >> 10);
|
---|
1745 | }
|
---|
1746 | #endif
|
---|
1747 | STORECHAR(ch);
|
---|
1748 | if (ch2)
|
---|
1749 | STORECHAR(ch2);
|
---|
1750 | }
|
---|
1751 | return v;
|
---|
1752 | #undef STORECHAR
|
---|
1753 | }
|
---|
1754 |
|
---|
1755 | PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
|
---|
1756 | {
|
---|
1757 | if (!PyUnicode_Check(unicode)) {
|
---|
1758 | PyErr_BadArgument();
|
---|
1759 | return NULL;
|
---|
1760 | }
|
---|
1761 | return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
|
---|
1762 | PyUnicode_GET_SIZE(unicode),
|
---|
1763 | NULL,
|
---|
1764 | 0);
|
---|
1765 | }
|
---|
1766 |
|
---|
1767 | /* --- Unicode Escape Codec ----------------------------------------------- */
|
---|
1768 |
|
---|
1769 | static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
|
---|
1770 |
|
---|
1771 | PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
---|
1772 | Py_ssize_t size,
|
---|
1773 | const char *errors)
|
---|
1774 | {
|
---|
1775 | const char *starts = s;
|
---|
1776 | Py_ssize_t startinpos;
|
---|
1777 | Py_ssize_t endinpos;
|
---|
1778 | Py_ssize_t outpos;
|
---|
1779 | int i;
|
---|
1780 | PyUnicodeObject *v;
|
---|
1781 | Py_UNICODE *p;
|
---|
1782 | const char *end;
|
---|
1783 | char* message;
|
---|
1784 | Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
|
---|
1785 | PyObject *errorHandler = NULL;
|
---|
1786 | PyObject *exc = NULL;
|
---|
1787 |
|
---|
1788 | /* Escaped strings will always be longer than the resulting
|
---|
1789 | Unicode string, so we start with size here and then reduce the
|
---|
1790 | length after conversion to the true value.
|
---|
1791 | (but if the error callback returns a long replacement string
|
---|
1792 | we'll have to allocate more space) */
|
---|
1793 | v = _PyUnicode_New(size);
|
---|
1794 | if (v == NULL)
|
---|
1795 | goto onError;
|
---|
1796 | if (size == 0)
|
---|
1797 | return (PyObject *)v;
|
---|
1798 |
|
---|
1799 | p = PyUnicode_AS_UNICODE(v);
|
---|
1800 | end = s + size;
|
---|
1801 |
|
---|
1802 | while (s < end) {
|
---|
1803 | unsigned char c;
|
---|
1804 | Py_UNICODE x;
|
---|
1805 | int digits;
|
---|
1806 |
|
---|
1807 | /* Non-escape characters are interpreted as Unicode ordinals */
|
---|
1808 | if (*s != '\\') {
|
---|
1809 | *p++ = (unsigned char) *s++;
|
---|
1810 | continue;
|
---|
1811 | }
|
---|
1812 |
|
---|
1813 | startinpos = s-starts;
|
---|
1814 | /* \ - Escapes */
|
---|
1815 | s++;
|
---|
1816 | switch (*s++) {
|
---|
1817 |
|
---|
1818 | /* \x escapes */
|
---|
1819 | case '\n': break;
|
---|
1820 | case '\\': *p++ = '\\'; break;
|
---|
1821 | case '\'': *p++ = '\''; break;
|
---|
1822 | case '\"': *p++ = '\"'; break;
|
---|
1823 | case 'b': *p++ = '\b'; break;
|
---|
1824 | case 'f': *p++ = '\014'; break; /* FF */
|
---|
1825 | case 't': *p++ = '\t'; break;
|
---|
1826 | case 'n': *p++ = '\n'; break;
|
---|
1827 | case 'r': *p++ = '\r'; break;
|
---|
1828 | case 'v': *p++ = '\013'; break; /* VT */
|
---|
1829 | case 'a': *p++ = '\007'; break; /* BEL, not classic C */
|
---|
1830 |
|
---|
1831 | /* \OOO (octal) escapes */
|
---|
1832 | case '0': case '1': case '2': case '3':
|
---|
1833 | case '4': case '5': case '6': case '7':
|
---|
1834 | x = s[-1] - '0';
|
---|
1835 | if ('0' <= *s && *s <= '7') {
|
---|
1836 | x = (x<<3) + *s++ - '0';
|
---|
1837 | if ('0' <= *s && *s <= '7')
|
---|
1838 | x = (x<<3) + *s++ - '0';
|
---|
1839 | }
|
---|
1840 | *p++ = x;
|
---|
1841 | break;
|
---|
1842 |
|
---|
1843 | /* hex escapes */
|
---|
1844 | /* \xXX */
|
---|
1845 | case 'x':
|
---|
1846 | digits = 2;
|
---|
1847 | message = "truncated \\xXX escape";
|
---|
1848 | goto hexescape;
|
---|
1849 |
|
---|
1850 | /* \uXXXX */
|
---|
1851 | case 'u':
|
---|
1852 | digits = 4;
|
---|
1853 | message = "truncated \\uXXXX escape";
|
---|
1854 | goto hexescape;
|
---|
1855 |
|
---|
1856 | /* \UXXXXXXXX */
|
---|
1857 | case 'U':
|
---|
1858 | digits = 8;
|
---|
1859 | message = "truncated \\UXXXXXXXX escape";
|
---|
1860 | hexescape:
|
---|
1861 | chr = 0;
|
---|
1862 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
1863 | if (s+digits>end) {
|
---|
1864 | endinpos = size;
|
---|
1865 | if (unicode_decode_call_errorhandler(
|
---|
1866 | errors, &errorHandler,
|
---|
1867 | "unicodeescape", "end of string in escape sequence",
|
---|
1868 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1869 | (PyObject **)&v, &outpos, &p))
|
---|
1870 | goto onError;
|
---|
1871 | goto nextByte;
|
---|
1872 | }
|
---|
1873 | for (i = 0; i < digits; ++i) {
|
---|
1874 | c = (unsigned char) s[i];
|
---|
1875 | if (!isxdigit(c)) {
|
---|
1876 | endinpos = (s+i+1)-starts;
|
---|
1877 | if (unicode_decode_call_errorhandler(
|
---|
1878 | errors, &errorHandler,
|
---|
1879 | "unicodeescape", message,
|
---|
1880 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1881 | (PyObject **)&v, &outpos, &p))
|
---|
1882 | goto onError;
|
---|
1883 | goto nextByte;
|
---|
1884 | }
|
---|
1885 | chr = (chr<<4) & ~0xF;
|
---|
1886 | if (c >= '0' && c <= '9')
|
---|
1887 | chr += c - '0';
|
---|
1888 | else if (c >= 'a' && c <= 'f')
|
---|
1889 | chr += 10 + c - 'a';
|
---|
1890 | else
|
---|
1891 | chr += 10 + c - 'A';
|
---|
1892 | }
|
---|
1893 | s += i;
|
---|
1894 | if (chr == 0xffffffff && PyErr_Occurred())
|
---|
1895 | /* _decoding_error will have already written into the
|
---|
1896 | target buffer. */
|
---|
1897 | break;
|
---|
1898 | store:
|
---|
1899 | /* when we get here, chr is a 32-bit unicode character */
|
---|
1900 | if (chr <= 0xffff)
|
---|
1901 | /* UCS-2 character */
|
---|
1902 | *p++ = (Py_UNICODE) chr;
|
---|
1903 | else if (chr <= 0x10ffff) {
|
---|
1904 | /* UCS-4 character. Either store directly, or as
|
---|
1905 | surrogate pair. */
|
---|
1906 | #ifdef Py_UNICODE_WIDE
|
---|
1907 | *p++ = chr;
|
---|
1908 | #else
|
---|
1909 | chr -= 0x10000L;
|
---|
1910 | *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
|
---|
1911 | *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
|
---|
1912 | #endif
|
---|
1913 | } else {
|
---|
1914 | endinpos = s-starts;
|
---|
1915 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
1916 | if (unicode_decode_call_errorhandler(
|
---|
1917 | errors, &errorHandler,
|
---|
1918 | "unicodeescape", "illegal Unicode character",
|
---|
1919 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1920 | (PyObject **)&v, &outpos, &p))
|
---|
1921 | goto onError;
|
---|
1922 | }
|
---|
1923 | break;
|
---|
1924 |
|
---|
1925 | /* \N{name} */
|
---|
1926 | case 'N':
|
---|
1927 | message = "malformed \\N character escape";
|
---|
1928 | if (ucnhash_CAPI == NULL) {
|
---|
1929 | /* load the unicode data module */
|
---|
1930 | PyObject *m, *api;
|
---|
1931 | m = PyImport_ImportModule("unicodedata");
|
---|
1932 | if (m == NULL)
|
---|
1933 | goto ucnhashError;
|
---|
1934 | api = PyObject_GetAttrString(m, "ucnhash_CAPI");
|
---|
1935 | Py_DECREF(m);
|
---|
1936 | if (api == NULL)
|
---|
1937 | goto ucnhashError;
|
---|
1938 | ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
|
---|
1939 | Py_DECREF(api);
|
---|
1940 | if (ucnhash_CAPI == NULL)
|
---|
1941 | goto ucnhashError;
|
---|
1942 | }
|
---|
1943 | if (*s == '{') {
|
---|
1944 | const char *start = s+1;
|
---|
1945 | /* look for the closing brace */
|
---|
1946 | while (*s != '}' && s < end)
|
---|
1947 | s++;
|
---|
1948 | if (s > start && s < end && *s == '}') {
|
---|
1949 | /* found a name. look it up in the unicode database */
|
---|
1950 | message = "unknown Unicode character name";
|
---|
1951 | s++;
|
---|
1952 | if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
|
---|
1953 | goto store;
|
---|
1954 | }
|
---|
1955 | }
|
---|
1956 | endinpos = s-starts;
|
---|
1957 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
1958 | if (unicode_decode_call_errorhandler(
|
---|
1959 | errors, &errorHandler,
|
---|
1960 | "unicodeescape", message,
|
---|
1961 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1962 | (PyObject **)&v, &outpos, &p))
|
---|
1963 | goto onError;
|
---|
1964 | break;
|
---|
1965 |
|
---|
1966 | default:
|
---|
1967 | if (s > end) {
|
---|
1968 | message = "\\ at end of string";
|
---|
1969 | s--;
|
---|
1970 | endinpos = s-starts;
|
---|
1971 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
1972 | if (unicode_decode_call_errorhandler(
|
---|
1973 | errors, &errorHandler,
|
---|
1974 | "unicodeescape", message,
|
---|
1975 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
1976 | (PyObject **)&v, &outpos, &p))
|
---|
1977 | goto onError;
|
---|
1978 | }
|
---|
1979 | else {
|
---|
1980 | *p++ = '\\';
|
---|
1981 | *p++ = (unsigned char)s[-1];
|
---|
1982 | }
|
---|
1983 | break;
|
---|
1984 | }
|
---|
1985 | nextByte:
|
---|
1986 | ;
|
---|
1987 | }
|
---|
1988 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
|
---|
1989 | goto onError;
|
---|
1990 | Py_XDECREF(errorHandler);
|
---|
1991 | Py_XDECREF(exc);
|
---|
1992 | return (PyObject *)v;
|
---|
1993 |
|
---|
1994 | ucnhashError:
|
---|
1995 | PyErr_SetString(
|
---|
1996 | PyExc_UnicodeError,
|
---|
1997 | "\\N escapes not supported (can't load unicodedata module)"
|
---|
1998 | );
|
---|
1999 | Py_XDECREF(v);
|
---|
2000 | Py_XDECREF(errorHandler);
|
---|
2001 | Py_XDECREF(exc);
|
---|
2002 | return NULL;
|
---|
2003 |
|
---|
2004 | onError:
|
---|
2005 | Py_XDECREF(v);
|
---|
2006 | Py_XDECREF(errorHandler);
|
---|
2007 | Py_XDECREF(exc);
|
---|
2008 | return NULL;
|
---|
2009 | }
|
---|
2010 |
|
---|
2011 | /* Return a Unicode-Escape string version of the Unicode object.
|
---|
2012 |
|
---|
2013 | If quotes is true, the string is enclosed in u"" or u'' quotes as
|
---|
2014 | appropriate.
|
---|
2015 |
|
---|
2016 | */
|
---|
2017 |
|
---|
2018 | Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
|
---|
2019 | Py_ssize_t size,
|
---|
2020 | Py_UNICODE ch)
|
---|
2021 | {
|
---|
2022 | /* like wcschr, but doesn't stop at NULL characters */
|
---|
2023 |
|
---|
2024 | while (size-- > 0) {
|
---|
2025 | if (*s == ch)
|
---|
2026 | return s;
|
---|
2027 | s++;
|
---|
2028 | }
|
---|
2029 |
|
---|
2030 | return NULL;
|
---|
2031 | }
|
---|
2032 |
|
---|
2033 | static
|
---|
2034 | PyObject *unicodeescape_string(const Py_UNICODE *s,
|
---|
2035 | Py_ssize_t size,
|
---|
2036 | int quotes)
|
---|
2037 | {
|
---|
2038 | PyObject *repr;
|
---|
2039 | char *p;
|
---|
2040 |
|
---|
2041 | static const char *hexdigit = "0123456789abcdef";
|
---|
2042 |
|
---|
2043 | /* Initial allocation is based on the longest-possible unichr
|
---|
2044 | escape.
|
---|
2045 |
|
---|
2046 | In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
|
---|
2047 | unichr, so in this case it's the longest unichr escape. In
|
---|
2048 | narrow (UTF-16) builds this is five chars per source unichr
|
---|
2049 | since there are two unichrs in the surrogate pair, so in narrow
|
---|
2050 | (UTF-16) builds it's not the longest unichr escape.
|
---|
2051 |
|
---|
2052 | In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
|
---|
2053 | so in the narrow (UTF-16) build case it's the longest unichr
|
---|
2054 | escape.
|
---|
2055 | */
|
---|
2056 |
|
---|
2057 | repr = PyString_FromStringAndSize(NULL,
|
---|
2058 | 2
|
---|
2059 | #ifdef Py_UNICODE_WIDE
|
---|
2060 | + 10*size
|
---|
2061 | #else
|
---|
2062 | + 6*size
|
---|
2063 | #endif
|
---|
2064 | + 1);
|
---|
2065 | if (repr == NULL)
|
---|
2066 | return NULL;
|
---|
2067 |
|
---|
2068 | p = PyString_AS_STRING(repr);
|
---|
2069 |
|
---|
2070 | if (quotes) {
|
---|
2071 | *p++ = 'u';
|
---|
2072 | *p++ = (findchar(s, size, '\'') &&
|
---|
2073 | !findchar(s, size, '"')) ? '"' : '\'';
|
---|
2074 | }
|
---|
2075 | while (size-- > 0) {
|
---|
2076 | Py_UNICODE ch = *s++;
|
---|
2077 |
|
---|
2078 | /* Escape quotes and backslashes */
|
---|
2079 | if ((quotes &&
|
---|
2080 | ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
|
---|
2081 | *p++ = '\\';
|
---|
2082 | *p++ = (char) ch;
|
---|
2083 | continue;
|
---|
2084 | }
|
---|
2085 |
|
---|
2086 | #ifdef Py_UNICODE_WIDE
|
---|
2087 | /* Map 21-bit characters to '\U00xxxxxx' */
|
---|
2088 | else if (ch >= 0x10000) {
|
---|
2089 | *p++ = '\\';
|
---|
2090 | *p++ = 'U';
|
---|
2091 | *p++ = hexdigit[(ch >> 28) & 0x0000000F];
|
---|
2092 | *p++ = hexdigit[(ch >> 24) & 0x0000000F];
|
---|
2093 | *p++ = hexdigit[(ch >> 20) & 0x0000000F];
|
---|
2094 | *p++ = hexdigit[(ch >> 16) & 0x0000000F];
|
---|
2095 | *p++ = hexdigit[(ch >> 12) & 0x0000000F];
|
---|
2096 | *p++ = hexdigit[(ch >> 8) & 0x0000000F];
|
---|
2097 | *p++ = hexdigit[(ch >> 4) & 0x0000000F];
|
---|
2098 | *p++ = hexdigit[ch & 0x0000000F];
|
---|
2099 | continue;
|
---|
2100 | }
|
---|
2101 | #else
|
---|
2102 | /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
|
---|
2103 | else if (ch >= 0xD800 && ch < 0xDC00) {
|
---|
2104 | Py_UNICODE ch2;
|
---|
2105 | Py_UCS4 ucs;
|
---|
2106 |
|
---|
2107 | ch2 = *s++;
|
---|
2108 | size--;
|
---|
2109 | if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
|
---|
2110 | ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
|
---|
2111 | *p++ = '\\';
|
---|
2112 | *p++ = 'U';
|
---|
2113 | *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
|
---|
2114 | *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
|
---|
2115 | *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
|
---|
2116 | *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
|
---|
2117 | *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
|
---|
2118 | *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
|
---|
2119 | *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
|
---|
2120 | *p++ = hexdigit[ucs & 0x0000000F];
|
---|
2121 | continue;
|
---|
2122 | }
|
---|
2123 | /* Fall through: isolated surrogates are copied as-is */
|
---|
2124 | s--;
|
---|
2125 | size++;
|
---|
2126 | }
|
---|
2127 | #endif
|
---|
2128 |
|
---|
2129 | /* Map 16-bit characters to '\uxxxx' */
|
---|
2130 | if (ch >= 256) {
|
---|
2131 | *p++ = '\\';
|
---|
2132 | *p++ = 'u';
|
---|
2133 | *p++ = hexdigit[(ch >> 12) & 0x000F];
|
---|
2134 | *p++ = hexdigit[(ch >> 8) & 0x000F];
|
---|
2135 | *p++ = hexdigit[(ch >> 4) & 0x000F];
|
---|
2136 | *p++ = hexdigit[ch & 0x000F];
|
---|
2137 | }
|
---|
2138 |
|
---|
2139 | /* Map special whitespace to '\t', \n', '\r' */
|
---|
2140 | else if (ch == '\t') {
|
---|
2141 | *p++ = '\\';
|
---|
2142 | *p++ = 't';
|
---|
2143 | }
|
---|
2144 | else if (ch == '\n') {
|
---|
2145 | *p++ = '\\';
|
---|
2146 | *p++ = 'n';
|
---|
2147 | }
|
---|
2148 | else if (ch == '\r') {
|
---|
2149 | *p++ = '\\';
|
---|
2150 | *p++ = 'r';
|
---|
2151 | }
|
---|
2152 |
|
---|
2153 | /* Map non-printable US ASCII to '\xhh' */
|
---|
2154 | else if (ch < ' ' || ch >= 0x7F) {
|
---|
2155 | *p++ = '\\';
|
---|
2156 | *p++ = 'x';
|
---|
2157 | *p++ = hexdigit[(ch >> 4) & 0x000F];
|
---|
2158 | *p++ = hexdigit[ch & 0x000F];
|
---|
2159 | }
|
---|
2160 |
|
---|
2161 | /* Copy everything else as-is */
|
---|
2162 | else
|
---|
2163 | *p++ = (char) ch;
|
---|
2164 | }
|
---|
2165 | if (quotes)
|
---|
2166 | *p++ = PyString_AS_STRING(repr)[1];
|
---|
2167 |
|
---|
2168 | *p = '\0';
|
---|
2169 | _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
|
---|
2170 | return repr;
|
---|
2171 | }
|
---|
2172 |
|
---|
2173 | PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
|
---|
2174 | Py_ssize_t size)
|
---|
2175 | {
|
---|
2176 | return unicodeescape_string(s, size, 0);
|
---|
2177 | }
|
---|
2178 |
|
---|
2179 | PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
---|
2180 | {
|
---|
2181 | if (!PyUnicode_Check(unicode)) {
|
---|
2182 | PyErr_BadArgument();
|
---|
2183 | return NULL;
|
---|
2184 | }
|
---|
2185 | return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
|
---|
2186 | PyUnicode_GET_SIZE(unicode));
|
---|
2187 | }
|
---|
2188 |
|
---|
2189 | /* --- Raw Unicode Escape Codec ------------------------------------------- */
|
---|
2190 |
|
---|
2191 | PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
---|
2192 | Py_ssize_t size,
|
---|
2193 | const char *errors)
|
---|
2194 | {
|
---|
2195 | const char *starts = s;
|
---|
2196 | Py_ssize_t startinpos;
|
---|
2197 | Py_ssize_t endinpos;
|
---|
2198 | Py_ssize_t outpos;
|
---|
2199 | PyUnicodeObject *v;
|
---|
2200 | Py_UNICODE *p;
|
---|
2201 | const char *end;
|
---|
2202 | const char *bs;
|
---|
2203 | PyObject *errorHandler = NULL;
|
---|
2204 | PyObject *exc = NULL;
|
---|
2205 |
|
---|
2206 | /* Escaped strings will always be longer than the resulting
|
---|
2207 | Unicode string, so we start with size here and then reduce the
|
---|
2208 | length after conversion to the true value. (But decoding error
|
---|
2209 | handler might have to resize the string) */
|
---|
2210 | v = _PyUnicode_New(size);
|
---|
2211 | if (v == NULL)
|
---|
2212 | goto onError;
|
---|
2213 | if (size == 0)
|
---|
2214 | return (PyObject *)v;
|
---|
2215 | p = PyUnicode_AS_UNICODE(v);
|
---|
2216 | end = s + size;
|
---|
2217 | while (s < end) {
|
---|
2218 | unsigned char c;
|
---|
2219 | Py_UCS4 x;
|
---|
2220 | int i;
|
---|
2221 | int count;
|
---|
2222 |
|
---|
2223 | /* Non-escape characters are interpreted as Unicode ordinals */
|
---|
2224 | if (*s != '\\') {
|
---|
2225 | *p++ = (unsigned char)*s++;
|
---|
2226 | continue;
|
---|
2227 | }
|
---|
2228 | startinpos = s-starts;
|
---|
2229 |
|
---|
2230 | /* \u-escapes are only interpreted iff the number of leading
|
---|
2231 | backslashes if odd */
|
---|
2232 | bs = s;
|
---|
2233 | for (;s < end;) {
|
---|
2234 | if (*s != '\\')
|
---|
2235 | break;
|
---|
2236 | *p++ = (unsigned char)*s++;
|
---|
2237 | }
|
---|
2238 | if (((s - bs) & 1) == 0 ||
|
---|
2239 | s >= end ||
|
---|
2240 | (*s != 'u' && *s != 'U')) {
|
---|
2241 | continue;
|
---|
2242 | }
|
---|
2243 | p--;
|
---|
2244 | count = *s=='u' ? 4 : 8;
|
---|
2245 | s++;
|
---|
2246 |
|
---|
2247 | /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
|
---|
2248 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
2249 | for (x = 0, i = 0; i < count; ++i, ++s) {
|
---|
2250 | c = (unsigned char)*s;
|
---|
2251 | if (!isxdigit(c)) {
|
---|
2252 | endinpos = s-starts;
|
---|
2253 | if (unicode_decode_call_errorhandler(
|
---|
2254 | errors, &errorHandler,
|
---|
2255 | "rawunicodeescape", "truncated \\uXXXX",
|
---|
2256 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
2257 | (PyObject **)&v, &outpos, &p))
|
---|
2258 | goto onError;
|
---|
2259 | goto nextByte;
|
---|
2260 | }
|
---|
2261 | x = (x<<4) & ~0xF;
|
---|
2262 | if (c >= '0' && c <= '9')
|
---|
2263 | x += c - '0';
|
---|
2264 | else if (c >= 'a' && c <= 'f')
|
---|
2265 | x += 10 + c - 'a';
|
---|
2266 | else
|
---|
2267 | x += 10 + c - 'A';
|
---|
2268 | }
|
---|
2269 | #ifndef Py_UNICODE_WIDE
|
---|
2270 | if (x > 0x10000) {
|
---|
2271 | if (unicode_decode_call_errorhandler(
|
---|
2272 | errors, &errorHandler,
|
---|
2273 | "rawunicodeescape", "\\Uxxxxxxxx out of range",
|
---|
2274 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
2275 | (PyObject **)&v, &outpos, &p))
|
---|
2276 | goto onError;
|
---|
2277 | }
|
---|
2278 | #endif
|
---|
2279 | *p++ = x;
|
---|
2280 | nextByte:
|
---|
2281 | ;
|
---|
2282 | }
|
---|
2283 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
|
---|
2284 | goto onError;
|
---|
2285 | Py_XDECREF(errorHandler);
|
---|
2286 | Py_XDECREF(exc);
|
---|
2287 | return (PyObject *)v;
|
---|
2288 |
|
---|
2289 | onError:
|
---|
2290 | Py_XDECREF(v);
|
---|
2291 | Py_XDECREF(errorHandler);
|
---|
2292 | Py_XDECREF(exc);
|
---|
2293 | return NULL;
|
---|
2294 | }
|
---|
2295 |
|
---|
2296 | PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
|
---|
2297 | Py_ssize_t size)
|
---|
2298 | {
|
---|
2299 | PyObject *repr;
|
---|
2300 | char *p;
|
---|
2301 | char *q;
|
---|
2302 |
|
---|
2303 | static const char *hexdigit = "0123456789abcdef";
|
---|
2304 |
|
---|
2305 | #ifdef Py_UNICODE_WIDE
|
---|
2306 | repr = PyString_FromStringAndSize(NULL, 10 * size);
|
---|
2307 | #else
|
---|
2308 | repr = PyString_FromStringAndSize(NULL, 6 * size);
|
---|
2309 | #endif
|
---|
2310 | if (repr == NULL)
|
---|
2311 | return NULL;
|
---|
2312 | if (size == 0)
|
---|
2313 | return repr;
|
---|
2314 |
|
---|
2315 | p = q = PyString_AS_STRING(repr);
|
---|
2316 | while (size-- > 0) {
|
---|
2317 | Py_UNICODE ch = *s++;
|
---|
2318 | #ifdef Py_UNICODE_WIDE
|
---|
2319 | /* Map 32-bit characters to '\Uxxxxxxxx' */
|
---|
2320 | if (ch >= 0x10000) {
|
---|
2321 | *p++ = '\\';
|
---|
2322 | *p++ = 'U';
|
---|
2323 | *p++ = hexdigit[(ch >> 28) & 0xf];
|
---|
2324 | *p++ = hexdigit[(ch >> 24) & 0xf];
|
---|
2325 | *p++ = hexdigit[(ch >> 20) & 0xf];
|
---|
2326 | *p++ = hexdigit[(ch >> 16) & 0xf];
|
---|
2327 | *p++ = hexdigit[(ch >> 12) & 0xf];
|
---|
2328 | *p++ = hexdigit[(ch >> 8) & 0xf];
|
---|
2329 | *p++ = hexdigit[(ch >> 4) & 0xf];
|
---|
2330 | *p++ = hexdigit[ch & 15];
|
---|
2331 | }
|
---|
2332 | else
|
---|
2333 | #endif
|
---|
2334 | /* Map 16-bit characters to '\uxxxx' */
|
---|
2335 | if (ch >= 256) {
|
---|
2336 | *p++ = '\\';
|
---|
2337 | *p++ = 'u';
|
---|
2338 | *p++ = hexdigit[(ch >> 12) & 0xf];
|
---|
2339 | *p++ = hexdigit[(ch >> 8) & 0xf];
|
---|
2340 | *p++ = hexdigit[(ch >> 4) & 0xf];
|
---|
2341 | *p++ = hexdigit[ch & 15];
|
---|
2342 | }
|
---|
2343 | /* Copy everything else as-is */
|
---|
2344 | else
|
---|
2345 | *p++ = (char) ch;
|
---|
2346 | }
|
---|
2347 | *p = '\0';
|
---|
2348 | _PyString_Resize(&repr, p - q);
|
---|
2349 | return repr;
|
---|
2350 | }
|
---|
2351 |
|
---|
2352 | PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
---|
2353 | {
|
---|
2354 | if (!PyUnicode_Check(unicode)) {
|
---|
2355 | PyErr_BadArgument();
|
---|
2356 | return NULL;
|
---|
2357 | }
|
---|
2358 | return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
|
---|
2359 | PyUnicode_GET_SIZE(unicode));
|
---|
2360 | }
|
---|
2361 |
|
---|
2362 | /* --- Unicode Internal Codec ------------------------------------------- */
|
---|
2363 |
|
---|
2364 | PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
|
---|
2365 | Py_ssize_t size,
|
---|
2366 | const char *errors)
|
---|
2367 | {
|
---|
2368 | const char *starts = s;
|
---|
2369 | Py_ssize_t startinpos;
|
---|
2370 | Py_ssize_t endinpos;
|
---|
2371 | Py_ssize_t outpos;
|
---|
2372 | PyUnicodeObject *v;
|
---|
2373 | Py_UNICODE *p;
|
---|
2374 | const char *end;
|
---|
2375 | const char *reason;
|
---|
2376 | PyObject *errorHandler = NULL;
|
---|
2377 | PyObject *exc = NULL;
|
---|
2378 |
|
---|
2379 | #ifdef Py_UNICODE_WIDE
|
---|
2380 | Py_UNICODE unimax = PyUnicode_GetMax();
|
---|
2381 | #endif
|
---|
2382 |
|
---|
2383 | v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
|
---|
2384 | if (v == NULL)
|
---|
2385 | goto onError;
|
---|
2386 | if (PyUnicode_GetSize((PyObject *)v) == 0)
|
---|
2387 | return (PyObject *)v;
|
---|
2388 | p = PyUnicode_AS_UNICODE(v);
|
---|
2389 | end = s + size;
|
---|
2390 |
|
---|
2391 | while (s < end) {
|
---|
2392 | memcpy(p, s, sizeof(Py_UNICODE));
|
---|
2393 | /* We have to sanity check the raw data, otherwise doom looms for
|
---|
2394 | some malformed UCS-4 data. */
|
---|
2395 | if (
|
---|
2396 | #ifdef Py_UNICODE_WIDE
|
---|
2397 | *p > unimax || *p < 0 ||
|
---|
2398 | #endif
|
---|
2399 | end-s < Py_UNICODE_SIZE
|
---|
2400 | )
|
---|
2401 | {
|
---|
2402 | startinpos = s - starts;
|
---|
2403 | if (end-s < Py_UNICODE_SIZE) {
|
---|
2404 | endinpos = end-starts;
|
---|
2405 | reason = "truncated input";
|
---|
2406 | }
|
---|
2407 | else {
|
---|
2408 | endinpos = s - starts + Py_UNICODE_SIZE;
|
---|
2409 | reason = "illegal code point (> 0x10FFFF)";
|
---|
2410 | }
|
---|
2411 | outpos = p - PyUnicode_AS_UNICODE(v);
|
---|
2412 | if (unicode_decode_call_errorhandler(
|
---|
2413 | errors, &errorHandler,
|
---|
2414 | "unicode_internal", reason,
|
---|
2415 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
2416 | (PyObject **)&v, &outpos, &p)) {
|
---|
2417 | goto onError;
|
---|
2418 | }
|
---|
2419 | }
|
---|
2420 | else {
|
---|
2421 | p++;
|
---|
2422 | s += Py_UNICODE_SIZE;
|
---|
2423 | }
|
---|
2424 | }
|
---|
2425 |
|
---|
2426 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
|
---|
2427 | goto onError;
|
---|
2428 | Py_XDECREF(errorHandler);
|
---|
2429 | Py_XDECREF(exc);
|
---|
2430 | return (PyObject *)v;
|
---|
2431 |
|
---|
2432 | onError:
|
---|
2433 | Py_XDECREF(v);
|
---|
2434 | Py_XDECREF(errorHandler);
|
---|
2435 | Py_XDECREF(exc);
|
---|
2436 | return NULL;
|
---|
2437 | }
|
---|
2438 |
|
---|
2439 | /* --- Latin-1 Codec ------------------------------------------------------ */
|
---|
2440 |
|
---|
2441 | PyObject *PyUnicode_DecodeLatin1(const char *s,
|
---|
2442 | Py_ssize_t size,
|
---|
2443 | const char *errors)
|
---|
2444 | {
|
---|
2445 | PyUnicodeObject *v;
|
---|
2446 | Py_UNICODE *p;
|
---|
2447 |
|
---|
2448 | /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
|
---|
2449 | if (size == 1) {
|
---|
2450 | Py_UNICODE r = *(unsigned char*)s;
|
---|
2451 | return PyUnicode_FromUnicode(&r, 1);
|
---|
2452 | }
|
---|
2453 |
|
---|
2454 | v = _PyUnicode_New(size);
|
---|
2455 | if (v == NULL)
|
---|
2456 | goto onError;
|
---|
2457 | if (size == 0)
|
---|
2458 | return (PyObject *)v;
|
---|
2459 | p = PyUnicode_AS_UNICODE(v);
|
---|
2460 | while (size-- > 0)
|
---|
2461 | *p++ = (unsigned char)*s++;
|
---|
2462 | return (PyObject *)v;
|
---|
2463 |
|
---|
2464 | onError:
|
---|
2465 | Py_XDECREF(v);
|
---|
2466 | return NULL;
|
---|
2467 | }
|
---|
2468 |
|
---|
2469 | /* create or adjust a UnicodeEncodeError */
|
---|
2470 | static void make_encode_exception(PyObject **exceptionObject,
|
---|
2471 | const char *encoding,
|
---|
2472 | const Py_UNICODE *unicode, Py_ssize_t size,
|
---|
2473 | Py_ssize_t startpos, Py_ssize_t endpos,
|
---|
2474 | const char *reason)
|
---|
2475 | {
|
---|
2476 | if (*exceptionObject == NULL) {
|
---|
2477 | *exceptionObject = PyUnicodeEncodeError_Create(
|
---|
2478 | encoding, unicode, size, startpos, endpos, reason);
|
---|
2479 | }
|
---|
2480 | else {
|
---|
2481 | if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
|
---|
2482 | goto onError;
|
---|
2483 | if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
|
---|
2484 | goto onError;
|
---|
2485 | if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
|
---|
2486 | goto onError;
|
---|
2487 | return;
|
---|
2488 | onError:
|
---|
2489 | Py_DECREF(*exceptionObject);
|
---|
2490 | *exceptionObject = NULL;
|
---|
2491 | }
|
---|
2492 | }
|
---|
2493 |
|
---|
2494 | /* raises a UnicodeEncodeError */
|
---|
2495 | static void raise_encode_exception(PyObject **exceptionObject,
|
---|
2496 | const char *encoding,
|
---|
2497 | const Py_UNICODE *unicode, Py_ssize_t size,
|
---|
2498 | Py_ssize_t startpos, Py_ssize_t endpos,
|
---|
2499 | const char *reason)
|
---|
2500 | {
|
---|
2501 | make_encode_exception(exceptionObject,
|
---|
2502 | encoding, unicode, size, startpos, endpos, reason);
|
---|
2503 | if (*exceptionObject != NULL)
|
---|
2504 | PyCodec_StrictErrors(*exceptionObject);
|
---|
2505 | }
|
---|
2506 |
|
---|
2507 | /* error handling callback helper:
|
---|
2508 | build arguments, call the callback and check the arguments,
|
---|
2509 | put the result into newpos and return the replacement string, which
|
---|
2510 | has to be freed by the caller */
|
---|
2511 | static PyObject *unicode_encode_call_errorhandler(const char *errors,
|
---|
2512 | PyObject **errorHandler,
|
---|
2513 | const char *encoding, const char *reason,
|
---|
2514 | const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
|
---|
2515 | Py_ssize_t startpos, Py_ssize_t endpos,
|
---|
2516 | Py_ssize_t *newpos)
|
---|
2517 | {
|
---|
2518 | static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
|
---|
2519 |
|
---|
2520 | PyObject *restuple;
|
---|
2521 | PyObject *resunicode;
|
---|
2522 |
|
---|
2523 | if (*errorHandler == NULL) {
|
---|
2524 | *errorHandler = PyCodec_LookupError(errors);
|
---|
2525 | if (*errorHandler == NULL)
|
---|
2526 | return NULL;
|
---|
2527 | }
|
---|
2528 |
|
---|
2529 | make_encode_exception(exceptionObject,
|
---|
2530 | encoding, unicode, size, startpos, endpos, reason);
|
---|
2531 | if (*exceptionObject == NULL)
|
---|
2532 | return NULL;
|
---|
2533 |
|
---|
2534 | restuple = PyObject_CallFunctionObjArgs(
|
---|
2535 | *errorHandler, *exceptionObject, NULL);
|
---|
2536 | if (restuple == NULL)
|
---|
2537 | return NULL;
|
---|
2538 | if (!PyTuple_Check(restuple)) {
|
---|
2539 | PyErr_Format(PyExc_TypeError, &argparse[4]);
|
---|
2540 | Py_DECREF(restuple);
|
---|
2541 | return NULL;
|
---|
2542 | }
|
---|
2543 | if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
|
---|
2544 | &resunicode, newpos)) {
|
---|
2545 | Py_DECREF(restuple);
|
---|
2546 | return NULL;
|
---|
2547 | }
|
---|
2548 | if (*newpos<0)
|
---|
2549 | *newpos = size+*newpos;
|
---|
2550 | if (*newpos<0 || *newpos>size) {
|
---|
2551 | PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
|
---|
2552 | Py_DECREF(restuple);
|
---|
2553 | return NULL;
|
---|
2554 | }
|
---|
2555 | Py_INCREF(resunicode);
|
---|
2556 | Py_DECREF(restuple);
|
---|
2557 | return resunicode;
|
---|
2558 | }
|
---|
2559 |
|
---|
2560 | static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
|
---|
2561 | Py_ssize_t size,
|
---|
2562 | const char *errors,
|
---|
2563 | int limit)
|
---|
2564 | {
|
---|
2565 | /* output object */
|
---|
2566 | PyObject *res;
|
---|
2567 | /* pointers to the beginning and end+1 of input */
|
---|
2568 | const Py_UNICODE *startp = p;
|
---|
2569 | const Py_UNICODE *endp = p + size;
|
---|
2570 | /* pointer to the beginning of the unencodable characters */
|
---|
2571 | /* const Py_UNICODE *badp = NULL; */
|
---|
2572 | /* pointer into the output */
|
---|
2573 | char *str;
|
---|
2574 | /* current output position */
|
---|
2575 | Py_ssize_t respos = 0;
|
---|
2576 | Py_ssize_t ressize;
|
---|
2577 | const char *encoding = (limit == 256) ? "latin-1" : "ascii";
|
---|
2578 | const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
|
---|
2579 | PyObject *errorHandler = NULL;
|
---|
2580 | PyObject *exc = NULL;
|
---|
2581 | /* the following variable is used for caching string comparisons
|
---|
2582 | * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
|
---|
2583 | int known_errorHandler = -1;
|
---|
2584 |
|
---|
2585 | /* allocate enough for a simple encoding without
|
---|
2586 | replacements, if we need more, we'll resize */
|
---|
2587 | res = PyString_FromStringAndSize(NULL, size);
|
---|
2588 | if (res == NULL)
|
---|
2589 | goto onError;
|
---|
2590 | if (size == 0)
|
---|
2591 | return res;
|
---|
2592 | str = PyString_AS_STRING(res);
|
---|
2593 | ressize = size;
|
---|
2594 |
|
---|
2595 | while (p<endp) {
|
---|
2596 | Py_UNICODE c = *p;
|
---|
2597 |
|
---|
2598 | /* can we encode this? */
|
---|
2599 | if (c<limit) {
|
---|
2600 | /* no overflow check, because we know that the space is enough */
|
---|
2601 | *str++ = (char)c;
|
---|
2602 | ++p;
|
---|
2603 | }
|
---|
2604 | else {
|
---|
2605 | Py_ssize_t unicodepos = p-startp;
|
---|
2606 | Py_ssize_t requiredsize;
|
---|
2607 | PyObject *repunicode;
|
---|
2608 | Py_ssize_t repsize;
|
---|
2609 | Py_ssize_t newpos;
|
---|
2610 | Py_ssize_t respos;
|
---|
2611 | Py_UNICODE *uni2;
|
---|
2612 | /* startpos for collecting unencodable chars */
|
---|
2613 | const Py_UNICODE *collstart = p;
|
---|
2614 | const Py_UNICODE *collend = p;
|
---|
2615 | /* find all unecodable characters */
|
---|
2616 | while ((collend < endp) && ((*collend)>=limit))
|
---|
2617 | ++collend;
|
---|
2618 | /* cache callback name lookup (if not done yet, i.e. it's the first error) */
|
---|
2619 | if (known_errorHandler==-1) {
|
---|
2620 | if ((errors==NULL) || (!strcmp(errors, "strict")))
|
---|
2621 | known_errorHandler = 1;
|
---|
2622 | else if (!strcmp(errors, "replace"))
|
---|
2623 | known_errorHandler = 2;
|
---|
2624 | else if (!strcmp(errors, "ignore"))
|
---|
2625 | known_errorHandler = 3;
|
---|
2626 | else if (!strcmp(errors, "xmlcharrefreplace"))
|
---|
2627 | known_errorHandler = 4;
|
---|
2628 | else
|
---|
2629 | known_errorHandler = 0;
|
---|
2630 | }
|
---|
2631 | switch (known_errorHandler) {
|
---|
2632 | case 1: /* strict */
|
---|
2633 | raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
|
---|
2634 | goto onError;
|
---|
2635 | case 2: /* replace */
|
---|
2636 | while (collstart++<collend)
|
---|
2637 | *str++ = '?'; /* fall through */
|
---|
2638 | case 3: /* ignore */
|
---|
2639 | p = collend;
|
---|
2640 | break;
|
---|
2641 | case 4: /* xmlcharrefreplace */
|
---|
2642 | respos = str-PyString_AS_STRING(res);
|
---|
2643 | /* determine replacement size (temporarily (mis)uses p) */
|
---|
2644 | for (p = collstart, repsize = 0; p < collend; ++p) {
|
---|
2645 | if (*p<10)
|
---|
2646 | repsize += 2+1+1;
|
---|
2647 | else if (*p<100)
|
---|
2648 | repsize += 2+2+1;
|
---|
2649 | else if (*p<1000)
|
---|
2650 | repsize += 2+3+1;
|
---|
2651 | else if (*p<10000)
|
---|
2652 | repsize += 2+4+1;
|
---|
2653 | #ifndef Py_UNICODE_WIDE
|
---|
2654 | else
|
---|
2655 | repsize += 2+5+1;
|
---|
2656 | #else
|
---|
2657 | else if (*p<100000)
|
---|
2658 | repsize += 2+5+1;
|
---|
2659 | else if (*p<1000000)
|
---|
2660 | repsize += 2+6+1;
|
---|
2661 | else
|
---|
2662 | repsize += 2+7+1;
|
---|
2663 | #endif
|
---|
2664 | }
|
---|
2665 | requiredsize = respos+repsize+(endp-collend);
|
---|
2666 | if (requiredsize > ressize) {
|
---|
2667 | if (requiredsize<2*ressize)
|
---|
2668 | requiredsize = 2*ressize;
|
---|
2669 | if (_PyString_Resize(&res, requiredsize))
|
---|
2670 | goto onError;
|
---|
2671 | str = PyString_AS_STRING(res) + respos;
|
---|
2672 | ressize = requiredsize;
|
---|
2673 | }
|
---|
2674 | /* generate replacement (temporarily (mis)uses p) */
|
---|
2675 | for (p = collstart; p < collend; ++p) {
|
---|
2676 | str += sprintf(str, "&#%d;", (int)*p);
|
---|
2677 | }
|
---|
2678 | p = collend;
|
---|
2679 | break;
|
---|
2680 | default:
|
---|
2681 | repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
|
---|
2682 | encoding, reason, startp, size, &exc,
|
---|
2683 | collstart-startp, collend-startp, &newpos);
|
---|
2684 | if (repunicode == NULL)
|
---|
2685 | goto onError;
|
---|
2686 | /* need more space? (at least enough for what we
|
---|
2687 | have+the replacement+the rest of the string, so
|
---|
2688 | we won't have to check space for encodable characters) */
|
---|
2689 | respos = str-PyString_AS_STRING(res);
|
---|
2690 | repsize = PyUnicode_GET_SIZE(repunicode);
|
---|
2691 | requiredsize = respos+repsize+(endp-collend);
|
---|
2692 | if (requiredsize > ressize) {
|
---|
2693 | if (requiredsize<2*ressize)
|
---|
2694 | requiredsize = 2*ressize;
|
---|
2695 | if (_PyString_Resize(&res, requiredsize)) {
|
---|
2696 | Py_DECREF(repunicode);
|
---|
2697 | goto onError;
|
---|
2698 | }
|
---|
2699 | str = PyString_AS_STRING(res) + respos;
|
---|
2700 | ressize = requiredsize;
|
---|
2701 | }
|
---|
2702 | /* check if there is anything unencodable in the replacement
|
---|
2703 | and copy it to the output */
|
---|
2704 | for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
|
---|
2705 | c = *uni2;
|
---|
2706 | if (c >= limit) {
|
---|
2707 | raise_encode_exception(&exc, encoding, startp, size,
|
---|
2708 | unicodepos, unicodepos+1, reason);
|
---|
2709 | Py_DECREF(repunicode);
|
---|
2710 | goto onError;
|
---|
2711 | }
|
---|
2712 | *str = (char)c;
|
---|
2713 | }
|
---|
2714 | p = startp + newpos;
|
---|
2715 | Py_DECREF(repunicode);
|
---|
2716 | }
|
---|
2717 | }
|
---|
2718 | }
|
---|
2719 | /* Resize if we allocated to much */
|
---|
2720 | respos = str-PyString_AS_STRING(res);
|
---|
2721 | if (respos<ressize)
|
---|
2722 | /* If this falls res will be NULL */
|
---|
2723 | _PyString_Resize(&res, respos);
|
---|
2724 | Py_XDECREF(errorHandler);
|
---|
2725 | Py_XDECREF(exc);
|
---|
2726 | return res;
|
---|
2727 |
|
---|
2728 | onError:
|
---|
2729 | Py_XDECREF(res);
|
---|
2730 | Py_XDECREF(errorHandler);
|
---|
2731 | Py_XDECREF(exc);
|
---|
2732 | return NULL;
|
---|
2733 | }
|
---|
2734 |
|
---|
2735 | PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
|
---|
2736 | Py_ssize_t size,
|
---|
2737 | const char *errors)
|
---|
2738 | {
|
---|
2739 | return unicode_encode_ucs1(p, size, errors, 256);
|
---|
2740 | }
|
---|
2741 |
|
---|
2742 | PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
|
---|
2743 | {
|
---|
2744 | if (!PyUnicode_Check(unicode)) {
|
---|
2745 | PyErr_BadArgument();
|
---|
2746 | return NULL;
|
---|
2747 | }
|
---|
2748 | return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
|
---|
2749 | PyUnicode_GET_SIZE(unicode),
|
---|
2750 | NULL);
|
---|
2751 | }
|
---|
2752 |
|
---|
2753 | /* --- 7-bit ASCII Codec -------------------------------------------------- */
|
---|
2754 |
|
---|
2755 | PyObject *PyUnicode_DecodeASCII(const char *s,
|
---|
2756 | Py_ssize_t size,
|
---|
2757 | const char *errors)
|
---|
2758 | {
|
---|
2759 | const char *starts = s;
|
---|
2760 | PyUnicodeObject *v;
|
---|
2761 | Py_UNICODE *p;
|
---|
2762 | Py_ssize_t startinpos;
|
---|
2763 | Py_ssize_t endinpos;
|
---|
2764 | Py_ssize_t outpos;
|
---|
2765 | const char *e;
|
---|
2766 | PyObject *errorHandler = NULL;
|
---|
2767 | PyObject *exc = NULL;
|
---|
2768 |
|
---|
2769 | /* ASCII is equivalent to the first 128 ordinals in Unicode. */
|
---|
2770 | if (size == 1 && *(unsigned char*)s < 128) {
|
---|
2771 | Py_UNICODE r = *(unsigned char*)s;
|
---|
2772 | return PyUnicode_FromUnicode(&r, 1);
|
---|
2773 | }
|
---|
2774 |
|
---|
2775 | v = _PyUnicode_New(size);
|
---|
2776 | if (v == NULL)
|
---|
2777 | goto onError;
|
---|
2778 | if (size == 0)
|
---|
2779 | return (PyObject *)v;
|
---|
2780 | p = PyUnicode_AS_UNICODE(v);
|
---|
2781 | e = s + size;
|
---|
2782 | while (s < e) {
|
---|
2783 | register unsigned char c = (unsigned char)*s;
|
---|
2784 | if (c < 128) {
|
---|
2785 | *p++ = c;
|
---|
2786 | ++s;
|
---|
2787 | }
|
---|
2788 | else {
|
---|
2789 | startinpos = s-starts;
|
---|
2790 | endinpos = startinpos + 1;
|
---|
2791 | outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
|
---|
2792 | if (unicode_decode_call_errorhandler(
|
---|
2793 | errors, &errorHandler,
|
---|
2794 | "ascii", "ordinal not in range(128)",
|
---|
2795 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
2796 | (PyObject **)&v, &outpos, &p))
|
---|
2797 | goto onError;
|
---|
2798 | }
|
---|
2799 | }
|
---|
2800 | if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
|
---|
2801 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
|
---|
2802 | goto onError;
|
---|
2803 | Py_XDECREF(errorHandler);
|
---|
2804 | Py_XDECREF(exc);
|
---|
2805 | return (PyObject *)v;
|
---|
2806 |
|
---|
2807 | onError:
|
---|
2808 | Py_XDECREF(v);
|
---|
2809 | Py_XDECREF(errorHandler);
|
---|
2810 | Py_XDECREF(exc);
|
---|
2811 | return NULL;
|
---|
2812 | }
|
---|
2813 |
|
---|
2814 | PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
|
---|
2815 | Py_ssize_t size,
|
---|
2816 | const char *errors)
|
---|
2817 | {
|
---|
2818 | return unicode_encode_ucs1(p, size, errors, 128);
|
---|
2819 | }
|
---|
2820 |
|
---|
2821 | PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
|
---|
2822 | {
|
---|
2823 | if (!PyUnicode_Check(unicode)) {
|
---|
2824 | PyErr_BadArgument();
|
---|
2825 | return NULL;
|
---|
2826 | }
|
---|
2827 | return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
|
---|
2828 | PyUnicode_GET_SIZE(unicode),
|
---|
2829 | NULL);
|
---|
2830 | }
|
---|
2831 |
|
---|
2832 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
|
---|
2833 |
|
---|
2834 | /* --- MBCS codecs for Windows -------------------------------------------- */
|
---|
2835 |
|
---|
2836 | #if SIZEOF_INT < SIZEOF_SSIZE_T
|
---|
2837 | #define NEED_RETRY
|
---|
2838 | #endif
|
---|
2839 |
|
---|
2840 | /* XXX This code is limited to "true" double-byte encodings, as
|
---|
2841 | a) it assumes an incomplete character consists of a single byte, and
|
---|
2842 | b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
|
---|
2843 | encodings, see IsDBCSLeadByteEx documentation. */
|
---|
2844 |
|
---|
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   The byte itself must satisfy IsDBCSLeadByte().  It is then accepted
   when CharPrev() cannot step back (prev == curr), when the previous
   character does not start with a lead byte, or when the previous
   character is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
|
---|
2855 |
|
---|
2856 | /*
|
---|
2857 | * Decode MBCS string into unicode object. If 'final' is set, converts
|
---|
2858 | * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
|
---|
2859 | */
|
---|
2860 | static int decode_mbcs(PyUnicodeObject **v,
|
---|
2861 | const char *s, /* MBCS string */
|
---|
2862 | int size, /* sizeof MBCS string */
|
---|
2863 | int final)
|
---|
2864 | {
|
---|
2865 | Py_UNICODE *p;
|
---|
2866 | Py_ssize_t n = 0;
|
---|
2867 | int usize = 0;
|
---|
2868 |
|
---|
2869 | assert(size >= 0);
|
---|
2870 |
|
---|
2871 | /* Skip trailing lead-byte unless 'final' is set */
|
---|
2872 | if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
|
---|
2873 | --size;
|
---|
2874 |
|
---|
2875 | /* First get the size of the result */
|
---|
2876 | if (size > 0) {
|
---|
2877 | usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
|
---|
2878 | if (usize == 0) {
|
---|
2879 | PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
---|
2880 | return -1;
|
---|
2881 | }
|
---|
2882 | }
|
---|
2883 |
|
---|
2884 | if (*v == NULL) {
|
---|
2885 | /* Create unicode object */
|
---|
2886 | *v = _PyUnicode_New(usize);
|
---|
2887 | if (*v == NULL)
|
---|
2888 | return -1;
|
---|
2889 | }
|
---|
2890 | else {
|
---|
2891 | /* Extend unicode object */
|
---|
2892 | n = PyUnicode_GET_SIZE(*v);
|
---|
2893 | if (_PyUnicode_Resize(v, n + usize) < 0)
|
---|
2894 | return -1;
|
---|
2895 | }
|
---|
2896 |
|
---|
2897 | /* Do the conversion */
|
---|
2898 | if (size > 0) {
|
---|
2899 | p = PyUnicode_AS_UNICODE(*v) + n;
|
---|
2900 | if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
|
---|
2901 | PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
---|
2902 | return -1;
|
---|
2903 | }
|
---|
2904 | }
|
---|
2905 |
|
---|
2906 | return size;
|
---|
2907 | }
|
---|
2908 |
|
---|
2909 | PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
|
---|
2910 | Py_ssize_t size,
|
---|
2911 | const char *errors,
|
---|
2912 | Py_ssize_t *consumed)
|
---|
2913 | {
|
---|
2914 | PyUnicodeObject *v = NULL;
|
---|
2915 | int done;
|
---|
2916 |
|
---|
2917 | if (consumed)
|
---|
2918 | *consumed = 0;
|
---|
2919 |
|
---|
2920 | #ifdef NEED_RETRY
|
---|
2921 | retry:
|
---|
2922 | if (size > INT_MAX)
|
---|
2923 | done = decode_mbcs(&v, s, INT_MAX, 0);
|
---|
2924 | else
|
---|
2925 | #endif
|
---|
2926 | done = decode_mbcs(&v, s, (int)size, !consumed);
|
---|
2927 |
|
---|
2928 | if (done < 0) {
|
---|
2929 | Py_XDECREF(v);
|
---|
2930 | return NULL;
|
---|
2931 | }
|
---|
2932 |
|
---|
2933 | if (consumed)
|
---|
2934 | *consumed += done;
|
---|
2935 |
|
---|
2936 | #ifdef NEED_RETRY
|
---|
2937 | if (size > INT_MAX) {
|
---|
2938 | s += done;
|
---|
2939 | size -= done;
|
---|
2940 | goto retry;
|
---|
2941 | }
|
---|
2942 | #endif
|
---|
2943 |
|
---|
2944 | return (PyObject *)v;
|
---|
2945 | }
|
---|
2946 |
|
---|
2947 | PyObject *PyUnicode_DecodeMBCS(const char *s,
|
---|
2948 | Py_ssize_t size,
|
---|
2949 | const char *errors)
|
---|
2950 | {
|
---|
2951 | return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
|
---|
2952 | }
|
---|
2953 |
|
---|
2954 | /*
|
---|
2955 | * Convert unicode into string object (MBCS).
|
---|
2956 | * Returns 0 if succeed, -1 otherwise.
|
---|
2957 | */
|
---|
2958 | static int encode_mbcs(PyObject **repr,
|
---|
2959 | const Py_UNICODE *p, /* unicode */
|
---|
2960 | int size) /* size of unicode */
|
---|
2961 | {
|
---|
2962 | int mbcssize = 0;
|
---|
2963 | Py_ssize_t n = 0;
|
---|
2964 |
|
---|
2965 | assert(size >= 0);
|
---|
2966 |
|
---|
2967 | /* First get the size of the result */
|
---|
2968 | if (size > 0) {
|
---|
2969 | mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
|
---|
2970 | if (mbcssize == 0) {
|
---|
2971 | PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
---|
2972 | return -1;
|
---|
2973 | }
|
---|
2974 | }
|
---|
2975 |
|
---|
2976 | if (*repr == NULL) {
|
---|
2977 | /* Create string object */
|
---|
2978 | *repr = PyString_FromStringAndSize(NULL, mbcssize);
|
---|
2979 | if (*repr == NULL)
|
---|
2980 | return -1;
|
---|
2981 | }
|
---|
2982 | else {
|
---|
2983 | /* Extend string object */
|
---|
2984 | n = PyString_Size(*repr);
|
---|
2985 | if (_PyString_Resize(repr, n + mbcssize) < 0)
|
---|
2986 | return -1;
|
---|
2987 | }
|
---|
2988 |
|
---|
2989 | /* Do the conversion */
|
---|
2990 | if (size > 0) {
|
---|
2991 | char *s = PyString_AS_STRING(*repr) + n;
|
---|
2992 | if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
|
---|
2993 | PyErr_SetFromWindowsErrWithFilename(0, NULL);
|
---|
2994 | return -1;
|
---|
2995 | }
|
---|
2996 | }
|
---|
2997 |
|
---|
2998 | return 0;
|
---|
2999 | }
|
---|
3000 |
|
---|
3001 | PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
|
---|
3002 | Py_ssize_t size,
|
---|
3003 | const char *errors)
|
---|
3004 | {
|
---|
3005 | PyObject *repr = NULL;
|
---|
3006 | int ret;
|
---|
3007 |
|
---|
3008 | #ifdef NEED_RETRY
|
---|
3009 | retry:
|
---|
3010 | if (size > INT_MAX)
|
---|
3011 | ret = encode_mbcs(&repr, p, INT_MAX);
|
---|
3012 | else
|
---|
3013 | #endif
|
---|
3014 | ret = encode_mbcs(&repr, p, (int)size);
|
---|
3015 |
|
---|
3016 | if (ret < 0) {
|
---|
3017 | Py_XDECREF(repr);
|
---|
3018 | return NULL;
|
---|
3019 | }
|
---|
3020 |
|
---|
3021 | #ifdef NEED_RETRY
|
---|
3022 | if (size > INT_MAX) {
|
---|
3023 | p += INT_MAX;
|
---|
3024 | size -= INT_MAX;
|
---|
3025 | goto retry;
|
---|
3026 | }
|
---|
3027 | #endif
|
---|
3028 |
|
---|
3029 | return repr;
|
---|
3030 | }
|
---|
3031 |
|
---|
3032 | PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
|
---|
3033 | {
|
---|
3034 | if (!PyUnicode_Check(unicode)) {
|
---|
3035 | PyErr_BadArgument();
|
---|
3036 | return NULL;
|
---|
3037 | }
|
---|
3038 | return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
|
---|
3039 | PyUnicode_GET_SIZE(unicode),
|
---|
3040 | NULL);
|
---|
3041 | }
|
---|
3042 |
|
---|
3043 | #undef NEED_RETRY
|
---|
3044 |
|
---|
3045 | #endif /* MS_WINDOWS */
|
---|
3046 |
|
---|
3047 | /* --- Character Mapping Codec -------------------------------------------- */
|
---|
3048 |
|
---|
3049 | PyObject *PyUnicode_DecodeCharmap(const char *s,
|
---|
3050 | Py_ssize_t size,
|
---|
3051 | PyObject *mapping,
|
---|
3052 | const char *errors)
|
---|
3053 | {
|
---|
3054 | const char *starts = s;
|
---|
3055 | Py_ssize_t startinpos;
|
---|
3056 | Py_ssize_t endinpos;
|
---|
3057 | Py_ssize_t outpos;
|
---|
3058 | const char *e;
|
---|
3059 | PyUnicodeObject *v;
|
---|
3060 | Py_UNICODE *p;
|
---|
3061 | Py_ssize_t extrachars = 0;
|
---|
3062 | PyObject *errorHandler = NULL;
|
---|
3063 | PyObject *exc = NULL;
|
---|
3064 | Py_UNICODE *mapstring = NULL;
|
---|
3065 | Py_ssize_t maplen = 0;
|
---|
3066 |
|
---|
3067 | /* Default to Latin-1 */
|
---|
3068 | if (mapping == NULL)
|
---|
3069 | return PyUnicode_DecodeLatin1(s, size, errors);
|
---|
3070 |
|
---|
3071 | v = _PyUnicode_New(size);
|
---|
3072 | if (v == NULL)
|
---|
3073 | goto onError;
|
---|
3074 | if (size == 0)
|
---|
3075 | return (PyObject *)v;
|
---|
3076 | p = PyUnicode_AS_UNICODE(v);
|
---|
3077 | e = s + size;
|
---|
3078 | if (PyUnicode_CheckExact(mapping)) {
|
---|
3079 | mapstring = PyUnicode_AS_UNICODE(mapping);
|
---|
3080 | maplen = PyUnicode_GET_SIZE(mapping);
|
---|
3081 | while (s < e) {
|
---|
3082 | unsigned char ch = *s;
|
---|
3083 | Py_UNICODE x = 0xfffe; /* illegal value */
|
---|
3084 |
|
---|
3085 | if (ch < maplen)
|
---|
3086 | x = mapstring[ch];
|
---|
3087 |
|
---|
3088 | if (x == 0xfffe) {
|
---|
3089 | /* undefined mapping */
|
---|
3090 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
3091 | startinpos = s-starts;
|
---|
3092 | endinpos = startinpos+1;
|
---|
3093 | if (unicode_decode_call_errorhandler(
|
---|
3094 | errors, &errorHandler,
|
---|
3095 | "charmap", "character maps to <undefined>",
|
---|
3096 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
3097 | (PyObject **)&v, &outpos, &p)) {
|
---|
3098 | goto onError;
|
---|
3099 | }
|
---|
3100 | continue;
|
---|
3101 | }
|
---|
3102 | *p++ = x;
|
---|
3103 | ++s;
|
---|
3104 | }
|
---|
3105 | }
|
---|
3106 | else {
|
---|
3107 | while (s < e) {
|
---|
3108 | unsigned char ch = *s;
|
---|
3109 | PyObject *w, *x;
|
---|
3110 |
|
---|
3111 | /* Get mapping (char ordinal -> integer, Unicode char or None) */
|
---|
3112 | w = PyInt_FromLong((long)ch);
|
---|
3113 | if (w == NULL)
|
---|
3114 | goto onError;
|
---|
3115 | x = PyObject_GetItem(mapping, w);
|
---|
3116 | Py_DECREF(w);
|
---|
3117 | if (x == NULL) {
|
---|
3118 | if (PyErr_ExceptionMatches(PyExc_LookupError)) {
|
---|
3119 | /* No mapping found means: mapping is undefined. */
|
---|
3120 | PyErr_Clear();
|
---|
3121 | x = Py_None;
|
---|
3122 | Py_INCREF(x);
|
---|
3123 | } else
|
---|
3124 | goto onError;
|
---|
3125 | }
|
---|
3126 |
|
---|
3127 | /* Apply mapping */
|
---|
3128 | if (PyInt_Check(x)) {
|
---|
3129 | long value = PyInt_AS_LONG(x);
|
---|
3130 | if (value < 0 || value > 65535) {
|
---|
3131 | PyErr_SetString(PyExc_TypeError,
|
---|
3132 | "character mapping must be in range(65536)");
|
---|
3133 | Py_DECREF(x);
|
---|
3134 | goto onError;
|
---|
3135 | }
|
---|
3136 | *p++ = (Py_UNICODE)value;
|
---|
3137 | }
|
---|
3138 | else if (x == Py_None) {
|
---|
3139 | /* undefined mapping */
|
---|
3140 | outpos = p-PyUnicode_AS_UNICODE(v);
|
---|
3141 | startinpos = s-starts;
|
---|
3142 | endinpos = startinpos+1;
|
---|
3143 | if (unicode_decode_call_errorhandler(
|
---|
3144 | errors, &errorHandler,
|
---|
3145 | "charmap", "character maps to <undefined>",
|
---|
3146 | starts, size, &startinpos, &endinpos, &exc, &s,
|
---|
3147 | (PyObject **)&v, &outpos, &p)) {
|
---|
3148 | Py_DECREF(x);
|
---|
3149 | goto onError;
|
---|
3150 | }
|
---|
3151 | Py_DECREF(x);
|
---|
3152 | continue;
|
---|
3153 | }
|
---|
3154 | else if (PyUnicode_Check(x)) {
|
---|
3155 | Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
|
---|
3156 |
|
---|
3157 | if (targetsize == 1)
|
---|
3158 | /* 1-1 mapping */
|
---|
3159 | *p++ = *PyUnicode_AS_UNICODE(x);
|
---|
3160 |
|
---|
3161 | else if (targetsize > 1) {
|
---|
3162 | /* 1-n mapping */
|
---|
3163 | if (targetsize > extrachars) {
|
---|
3164 | /* resize first */
|
---|
3165 | Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
|
---|
3166 | Py_ssize_t needed = (targetsize - extrachars) + \
|
---|
3167 | (targetsize << 2);
|
---|
3168 | extrachars += needed;
|
---|
3169 | if (_PyUnicode_Resize(&v,
|
---|
3170 | PyUnicode_GET_SIZE(v) + needed) < 0) {
|
---|
3171 | Py_DECREF(x);
|
---|
3172 | goto onError;
|
---|
3173 | }
|
---|
3174 | p = PyUnicode_AS_UNICODE(v) + oldpos;
|
---|
3175 | }
|
---|
3176 | Py_UNICODE_COPY(p,
|
---|
3177 | PyUnicode_AS_UNICODE(x),
|
---|
3178 | targetsize);
|
---|
3179 | p += targetsize;
|
---|
3180 | extrachars -= targetsize;
|
---|
3181 | }
|
---|
3182 | /* 1-0 mapping: skip the character */
|
---|
3183 | }
|
---|
3184 | else {
|
---|
3185 | /* wrong return value */
|
---|
3186 | PyErr_SetString(PyExc_TypeError,
|
---|
3187 | "character mapping must return integer, None or unicode");
|
---|
3188 | Py_DECREF(x);
|
---|
3189 | goto onError;
|
---|
3190 | }
|
---|
3191 | Py_DECREF(x);
|
---|
3192 | ++s;
|
---|
3193 | }
|
---|
3194 | }
|
---|
3195 | if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
|
---|
3196 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
|
---|
3197 | goto onError;
|
---|
3198 | Py_XDECREF(errorHandler);
|
---|
3199 | Py_XDECREF(exc);
|
---|
3200 | return (PyObject *)v;
|
---|
3201 |
|
---|
3202 | onError:
|
---|
3203 | Py_XDECREF(errorHandler);
|
---|
3204 | Py_XDECREF(exc);
|
---|
3205 | Py_XDECREF(v);
|
---|
3206 | return NULL;
|
---|
3207 | }
|
---|
3208 |
|
---|
3209 | /* Charmap encoding: the lookup table */
|
---|
3210 |
|
---|
3211 | struct encoding_map{
|
---|
3212 | PyObject_HEAD
|
---|
3213 | unsigned char level1[32];
|
---|
3214 | int count2, count3;
|
---|
3215 | unsigned char level23[1];
|
---|
3216 | };
|
---|
3217 |
|
---|
3218 | static PyObject*
|
---|
3219 | encoding_map_size(PyObject *obj, PyObject* args)
|
---|
3220 | {
|
---|
3221 | struct encoding_map *map = (struct encoding_map*)obj;
|
---|
3222 | return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
|
---|
3223 | 128*map->count3);
|
---|
3224 | }
|
---|
3225 |
|
---|
3226 | static PyMethodDef encoding_map_methods[] = {
|
---|
3227 | {"size", encoding_map_size, METH_NOARGS,
|
---|
3228 | PyDoc_STR("Return the size (in bytes) of this object") },
|
---|
3229 | { 0 }
|
---|
3230 | };
|
---|
3231 |
|
---|
3232 | static void
|
---|
3233 | encoding_map_dealloc(PyObject* o)
|
---|
3234 | {
|
---|
3235 | PyObject_FREE(o);
|
---|
3236 | }
|
---|
3237 |
|
---|
3238 | static PyTypeObject EncodingMapType = {
|
---|
3239 | PyObject_HEAD_INIT(NULL)
|
---|
3240 | 0, /*ob_size*/
|
---|
3241 | "EncodingMap", /*tp_name*/
|
---|
3242 | sizeof(struct encoding_map), /*tp_basicsize*/
|
---|
3243 | 0, /*tp_itemsize*/
|
---|
3244 | /* methods */
|
---|
3245 | encoding_map_dealloc, /*tp_dealloc*/
|
---|
3246 | 0, /*tp_print*/
|
---|
3247 | 0, /*tp_getattr*/
|
---|
3248 | 0, /*tp_setattr*/
|
---|
3249 | 0, /*tp_compare*/
|
---|
3250 | 0, /*tp_repr*/
|
---|
3251 | 0, /*tp_as_number*/
|
---|
3252 | 0, /*tp_as_sequence*/
|
---|
3253 | 0, /*tp_as_mapping*/
|
---|
3254 | 0, /*tp_hash*/
|
---|
3255 | 0, /*tp_call*/
|
---|
3256 | 0, /*tp_str*/
|
---|
3257 | 0, /*tp_getattro*/
|
---|
3258 | 0, /*tp_setattro*/
|
---|
3259 | 0, /*tp_as_buffer*/
|
---|
3260 | Py_TPFLAGS_DEFAULT, /*tp_flags*/
|
---|
3261 | 0, /*tp_doc*/
|
---|
3262 | 0, /*tp_traverse*/
|
---|
3263 | 0, /*tp_clear*/
|
---|
3264 | 0, /*tp_richcompare*/
|
---|
3265 | 0, /*tp_weaklistoffset*/
|
---|
3266 | 0, /*tp_iter*/
|
---|
3267 | 0, /*tp_iternext*/
|
---|
3268 | encoding_map_methods, /*tp_methods*/
|
---|
3269 | 0, /*tp_members*/
|
---|
3270 | 0, /*tp_getset*/
|
---|
3271 | 0, /*tp_base*/
|
---|
3272 | 0, /*tp_dict*/
|
---|
3273 | 0, /*tp_descr_get*/
|
---|
3274 | 0, /*tp_descr_set*/
|
---|
3275 | 0, /*tp_dictoffset*/
|
---|
3276 | 0, /*tp_init*/
|
---|
3277 | 0, /*tp_alloc*/
|
---|
3278 | 0, /*tp_new*/
|
---|
3279 | 0, /*tp_free*/
|
---|
3280 | 0, /*tp_is_gc*/
|
---|
3281 | };
|
---|
3282 |
|
---|
3283 | PyObject*
|
---|
3284 | PyUnicode_BuildEncodingMap(PyObject* string)
|
---|
3285 | {
|
---|
3286 | Py_UNICODE *decode;
|
---|
3287 | PyObject *result;
|
---|
3288 | struct encoding_map *mresult;
|
---|
3289 | int i;
|
---|
3290 | int need_dict = 0;
|
---|
3291 | unsigned char level1[32];
|
---|
3292 | unsigned char level2[512];
|
---|
3293 | unsigned char *mlevel1, *mlevel2, *mlevel3;
|
---|
3294 | int count2 = 0, count3 = 0;
|
---|
3295 |
|
---|
3296 | if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
|
---|
3297 | PyErr_BadArgument();
|
---|
3298 | return NULL;
|
---|
3299 | }
|
---|
3300 | decode = PyUnicode_AS_UNICODE(string);
|
---|
3301 | memset(level1, 0xFF, sizeof level1);
|
---|
3302 | memset(level2, 0xFF, sizeof level2);
|
---|
3303 |
|
---|
3304 | /* If there isn't a one-to-one mapping of NULL to \0,
|
---|
3305 | or if there are non-BMP characters, we need to use
|
---|
3306 | a mapping dictionary. */
|
---|
3307 | if (decode[0] != 0)
|
---|
3308 | need_dict = 1;
|
---|
3309 | for (i = 1; i < 256; i++) {
|
---|
3310 | int l1, l2;
|
---|
3311 | if (decode[i] == 0
|
---|
3312 | #ifdef Py_UNICODE_WIDE
|
---|
3313 | || decode[i] > 0xFFFF
|
---|
3314 | #endif
|
---|
3315 | ) {
|
---|
3316 | need_dict = 1;
|
---|
3317 | break;
|
---|
3318 | }
|
---|
3319 | if (decode[i] == 0xFFFE)
|
---|
3320 | /* unmapped character */
|
---|
3321 | continue;
|
---|
3322 | l1 = decode[i] >> 11;
|
---|
3323 | l2 = decode[i] >> 7;
|
---|
3324 | if (level1[l1] == 0xFF)
|
---|
3325 | level1[l1] = count2++;
|
---|
3326 | if (level2[l2] == 0xFF)
|
---|
3327 | level2[l2] = count3++;
|
---|
3328 | }
|
---|
3329 |
|
---|
3330 | if (count2 >= 0xFF || count3 >= 0xFF)
|
---|
3331 | need_dict = 1;
|
---|
3332 |
|
---|
3333 | if (need_dict) {
|
---|
3334 | PyObject *result = PyDict_New();
|
---|
3335 | PyObject *key, *value;
|
---|
3336 | if (!result)
|
---|
3337 | return NULL;
|
---|
3338 | for (i = 0; i < 256; i++) {
|
---|
3339 | key = value = NULL;
|
---|
3340 | key = PyInt_FromLong(decode[i]);
|
---|
3341 | value = PyInt_FromLong(i);
|
---|
3342 | if (!key || !value)
|
---|
3343 | goto failed1;
|
---|
3344 | if (PyDict_SetItem(result, key, value) == -1)
|
---|
3345 | goto failed1;
|
---|
3346 | Py_DECREF(key);
|
---|
3347 | Py_DECREF(value);
|
---|
3348 | }
|
---|
3349 | return result;
|
---|
3350 | failed1:
|
---|
3351 | Py_XDECREF(key);
|
---|
3352 | Py_XDECREF(value);
|
---|
3353 | Py_DECREF(result);
|
---|
3354 | return NULL;
|
---|
3355 | }
|
---|
3356 |
|
---|
3357 | /* Create a three-level trie */
|
---|
3358 | result = PyObject_MALLOC(sizeof(struct encoding_map) +
|
---|
3359 | 16*count2 + 128*count3 - 1);
|
---|
3360 | if (!result)
|
---|
3361 | return PyErr_NoMemory();
|
---|
3362 | PyObject_Init(result, &EncodingMapType);
|
---|
3363 | mresult = (struct encoding_map*)result;
|
---|
3364 | mresult->count2 = count2;
|
---|
3365 | mresult->count3 = count3;
|
---|
3366 | mlevel1 = mresult->level1;
|
---|
3367 | mlevel2 = mresult->level23;
|
---|
3368 | mlevel3 = mresult->level23 + 16*count2;
|
---|
3369 | memcpy(mlevel1, level1, 32);
|
---|
3370 | memset(mlevel2, 0xFF, 16*count2);
|
---|
3371 | memset(mlevel3, 0, 128*count3);
|
---|
3372 | count3 = 0;
|
---|
3373 | for (i = 1; i < 256; i++) {
|
---|
3374 | int o1, o2, o3, i2, i3;
|
---|
3375 | if (decode[i] == 0xFFFE)
|
---|
3376 | /* unmapped character */
|
---|
3377 | continue;
|
---|
3378 | o1 = decode[i]>>11;
|
---|
3379 | o2 = (decode[i]>>7) & 0xF;
|
---|
3380 | i2 = 16*mlevel1[o1] + o2;
|
---|
3381 | if (mlevel2[i2] == 0xFF)
|
---|
3382 | mlevel2[i2] = count3++;
|
---|
3383 | o3 = decode[i] & 0x7F;
|
---|
3384 | i3 = 128*mlevel2[i2] + o3;
|
---|
3385 | mlevel3[i3] = i;
|
---|
3386 | }
|
---|
3387 | return result;
|
---|
3388 | }
|
---|
3389 |
|
---|
3390 | static int
|
---|
3391 | encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
|
---|
3392 | {
|
---|
3393 | struct encoding_map *map = (struct encoding_map*)mapping;
|
---|
3394 | int l1 = c>>11;
|
---|
3395 | int l2 = (c>>7) & 0xF;
|
---|
3396 | int l3 = c & 0x7F;
|
---|
3397 | int i;
|
---|
3398 |
|
---|
3399 | #ifdef Py_UNICODE_WIDE
|
---|
3400 | if (c > 0xFFFF) {
|
---|
3401 | return -1;
|
---|
3402 | }
|
---|
3403 | #endif
|
---|
3404 | if (c == 0)
|
---|
3405 | return 0;
|
---|
3406 | /* level 1*/
|
---|
3407 | i = map->level1[l1];
|
---|
3408 | if (i == 0xFF) {
|
---|
3409 | return -1;
|
---|
3410 | }
|
---|
3411 | /* level 2*/
|
---|
3412 | i = map->level23[16*i+l2];
|
---|
3413 | if (i == 0xFF) {
|
---|
3414 | return -1;
|
---|
3415 | }
|
---|
3416 | /* level 3 */
|
---|
3417 | i = map->level23[16*map->count2 + 128*i + l3];
|
---|
3418 | if (i == 0) {
|
---|
3419 | return -1;
|
---|
3420 | }
|
---|
3421 | return i;
|
---|
3422 | }
|
---|
3423 |
|
---|
3424 | /* Lookup the character ch in the mapping. If the character
|
---|
3425 | can't be found, Py_None is returned (or NULL, if another
|
---|
3426 | error occurred). */
|
---|
3427 | static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
|
---|
3428 | {
|
---|
3429 | PyObject *w = PyInt_FromLong((long)c);
|
---|
3430 | PyObject *x;
|
---|
3431 |
|
---|
3432 | if (w == NULL)
|
---|
3433 | return NULL;
|
---|
3434 | x = PyObject_GetItem(mapping, w);
|
---|
3435 | Py_DECREF(w);
|
---|
3436 | if (x == NULL) {
|
---|
3437 | if (PyErr_ExceptionMatches(PyExc_LookupError)) {
|
---|
3438 | /* No mapping found means: mapping is undefined. */
|
---|
3439 | PyErr_Clear();
|
---|
3440 | x = Py_None;
|
---|
3441 | Py_INCREF(x);
|
---|
3442 | return x;
|
---|
3443 | } else
|
---|
3444 | return NULL;
|
---|
3445 | }
|
---|
3446 | else if (x == Py_None)
|
---|
3447 | return x;
|
---|
3448 | else if (PyInt_Check(x)) {
|
---|
3449 | long value = PyInt_AS_LONG(x);
|
---|
3450 | if (value < 0 || value > 255) {
|
---|
3451 | PyErr_SetString(PyExc_TypeError,
|
---|
3452 | "character mapping must be in range(256)");
|
---|
3453 | Py_DECREF(x);
|
---|
3454 | return NULL;
|
---|
3455 | }
|
---|
3456 | return x;
|
---|
3457 | }
|
---|
3458 | else if (PyString_Check(x))
|
---|
3459 | return x;
|
---|
3460 | else {
|
---|
3461 | /* wrong return value */
|
---|
3462 | PyErr_SetString(PyExc_TypeError,
|
---|
3463 | "character mapping must return integer, None or str");
|
---|
3464 | Py_DECREF(x);
|
---|
3465 | return NULL;
|
---|
3466 | }
|
---|
3467 | }
|
---|
3468 |
|
---|
3469 | static int
|
---|
3470 | charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
|
---|
3471 | {
|
---|
3472 | Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
|
---|
3473 | /* exponentially overallocate to minimize reallocations */
|
---|
3474 | if (requiredsize < 2*outsize)
|
---|
3475 | requiredsize = 2*outsize;
|
---|
3476 | if (_PyString_Resize(outobj, requiredsize)) {
|
---|
3477 | return 0;
|
---|
3478 | }
|
---|
3479 | return 1;
|
---|
3480 | }
|
---|
3481 |
|
---|
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
|
---|
3485 | /* lookup the character, put the result in the output string and adjust
|
---|
3486 | various state variables. Reallocate the output string if not enough
|
---|
3487 | space is available. Return a new reference to the object that
|
---|
3488 | was put in the output buffer, or Py_None, if the mapping was undefined
|
---|
3489 | (in which case no character was written) or NULL, if a
|
---|
3490 | reallocation error occurred. The caller must decref the result */
|
---|
3491 | static
|
---|
3492 | charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
|
---|
3493 | PyObject **outobj, Py_ssize_t *outpos)
|
---|
3494 | {
|
---|
3495 | PyObject *rep;
|
---|
3496 | char *outstart;
|
---|
3497 | Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
|
---|
3498 |
|
---|
3499 | if (mapping->ob_type == &EncodingMapType) {
|
---|
3500 | int res = encoding_map_lookup(c, mapping);
|
---|
3501 | Py_ssize_t requiredsize = *outpos+1;
|
---|
3502 | if (res == -1)
|
---|
3503 | return enc_FAILED;
|
---|
3504 | if (outsize<requiredsize)
|
---|
3505 | if (!charmapencode_resize(outobj, outpos, requiredsize))
|
---|
3506 | return enc_EXCEPTION;
|
---|
3507 | outstart = PyString_AS_STRING(*outobj);
|
---|
3508 | outstart[(*outpos)++] = (char)res;
|
---|
3509 | return enc_SUCCESS;
|
---|
3510 | }
|
---|
3511 |
|
---|
3512 | rep = charmapencode_lookup(c, mapping);
|
---|
3513 | if (rep==NULL)
|
---|
3514 | return enc_EXCEPTION;
|
---|
3515 | else if (rep==Py_None) {
|
---|
3516 | Py_DECREF(rep);
|
---|
3517 | return enc_FAILED;
|
---|
3518 | } else {
|
---|
3519 | if (PyInt_Check(rep)) {
|
---|
3520 | Py_ssize_t requiredsize = *outpos+1;
|
---|
3521 | if (outsize<requiredsize)
|
---|
3522 | if (!charmapencode_resize(outobj, outpos, requiredsize)) {
|
---|
3523 | Py_DECREF(rep);
|
---|
3524 | return enc_EXCEPTION;
|
---|
3525 | }
|
---|
3526 | outstart = PyString_AS_STRING(*outobj);
|
---|
3527 | outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
|
---|
3528 | }
|
---|
3529 | else {
|
---|
3530 | const char *repchars = PyString_AS_STRING(rep);
|
---|
3531 | Py_ssize_t repsize = PyString_GET_SIZE(rep);
|
---|
3532 | Py_ssize_t requiredsize = *outpos+repsize;
|
---|
3533 | if (outsize<requiredsize)
|
---|
3534 | if (!charmapencode_resize(outobj, outpos, requiredsize)) {
|
---|
3535 | Py_DECREF(rep);
|
---|
3536 | return enc_EXCEPTION;
|
---|
3537 | }
|
---|
3538 | outstart = PyString_AS_STRING(*outobj);
|
---|
3539 | memcpy(outstart + *outpos, repchars, repsize);
|
---|
3540 | *outpos += repsize;
|
---|
3541 | }
|
---|
3542 | }
|
---|
3543 | Py_DECREF(rep);
|
---|
3544 | return enc_SUCCESS;
|
---|
3545 | }
|
---|
3546 |
|
---|
3547 | /* handle an error in PyUnicode_EncodeCharmap
|
---|
3548 | Return 0 on success, -1 on error */
|
---|
3549 | static
|
---|
3550 | int charmap_encoding_error(
|
---|
3551 | const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
|
---|
3552 | PyObject **exceptionObject,
|
---|
3553 | int *known_errorHandler, PyObject **errorHandler, const char *errors,
|
---|
3554 | PyObject **res, Py_ssize_t *respos)
|
---|
3555 | {
|
---|
3556 | PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
|
---|
3557 | Py_ssize_t repsize;
|
---|
3558 | Py_ssize_t newpos;
|
---|
3559 | Py_UNICODE *uni2;
|
---|
3560 | /* startpos for collecting unencodable chars */
|
---|
3561 | Py_ssize_t collstartpos = *inpos;
|
---|
3562 | Py_ssize_t collendpos = *inpos+1;
|
---|
3563 | Py_ssize_t collpos;
|
---|
3564 | char *encoding = "charmap";
|
---|
3565 | char *reason = "character maps to <undefined>";
|
---|
3566 | charmapencode_result x;
|
---|
3567 |
|
---|
3568 | /* find all unencodable characters */
|
---|
3569 | while (collendpos < size) {
|
---|
3570 | PyObject *rep;
|
---|
3571 | if (mapping->ob_type == &EncodingMapType) {
|
---|
3572 | int res = encoding_map_lookup(p[collendpos], mapping);
|
---|
3573 | if (res != -1)
|
---|
3574 | break;
|
---|
3575 | ++collendpos;
|
---|
3576 | continue;
|
---|
3577 | }
|
---|
3578 |
|
---|
3579 | rep = charmapencode_lookup(p[collendpos], mapping);
|
---|
3580 | if (rep==NULL)
|
---|
3581 | return -1;
|
---|
3582 | else if (rep!=Py_None) {
|
---|
3583 | Py_DECREF(rep);
|
---|
3584 | break;
|
---|
3585 | }
|
---|
3586 | Py_DECREF(rep);
|
---|
3587 | ++collendpos;
|
---|
3588 | }
|
---|
3589 | /* cache callback name lookup
|
---|
3590 | * (if not done yet, i.e. it's the first error) */
|
---|
3591 | if (*known_errorHandler==-1) {
|
---|
3592 | if ((errors==NULL) || (!strcmp(errors, "strict")))
|
---|
3593 | *known_errorHandler = 1;
|
---|
3594 | else if (!strcmp(errors, "replace"))
|
---|
3595 | *known_errorHandler = 2;
|
---|
3596 | else if (!strcmp(errors, "ignore"))
|
---|
3597 | *known_errorHandler = 3;
|
---|
3598 | else if (!strcmp(errors, "xmlcharrefreplace"))
|
---|
3599 | *known_errorHandler = 4;
|
---|
3600 | else
|
---|
3601 | *known_errorHandler = 0;
|
---|
3602 | }
|
---|
3603 | switch (*known_errorHandler) {
|
---|
3604 | case 1: /* strict */
|
---|
3605 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
---|
3606 | return -1;
|
---|
3607 | case 2: /* replace */
|
---|
3608 | for (collpos = collstartpos; collpos<collendpos; ++collpos) {
|
---|
3609 | x = charmapencode_output('?', mapping, res, respos);
|
---|
3610 | if (x==enc_EXCEPTION) {
|
---|
3611 | return -1;
|
---|
3612 | }
|
---|
3613 | else if (x==enc_FAILED) {
|
---|
3614 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
---|
3615 | return -1;
|
---|
3616 | }
|
---|
3617 | }
|
---|
3618 | /* fall through */
|
---|
3619 | case 3: /* ignore */
|
---|
3620 | *inpos = collendpos;
|
---|
3621 | break;
|
---|
3622 | case 4: /* xmlcharrefreplace */
|
---|
3623 | /* generate replacement (temporarily (mis)uses p) */
|
---|
3624 | for (collpos = collstartpos; collpos < collendpos; ++collpos) {
|
---|
3625 | char buffer[2+29+1+1];
|
---|
3626 | char *cp;
|
---|
3627 | sprintf(buffer, "&#%d;", (int)p[collpos]);
|
---|
3628 | for (cp = buffer; *cp; ++cp) {
|
---|
3629 | x = charmapencode_output(*cp, mapping, res, respos);
|
---|
3630 | if (x==enc_EXCEPTION)
|
---|
3631 | return -1;
|
---|
3632 | else if (x==enc_FAILED) {
|
---|
3633 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
---|
3634 | return -1;
|
---|
3635 | }
|
---|
3636 | }
|
---|
3637 | }
|
---|
3638 | *inpos = collendpos;
|
---|
3639 | break;
|
---|
3640 | default:
|
---|
3641 | repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
|
---|
3642 | encoding, reason, p, size, exceptionObject,
|
---|
3643 | collstartpos, collendpos, &newpos);
|
---|
3644 | if (repunicode == NULL)
|
---|
3645 | return -1;
|
---|
3646 | /* generate replacement */
|
---|
3647 | repsize = PyUnicode_GET_SIZE(repunicode);
|
---|
3648 | for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
---|
3649 | x = charmapencode_output(*uni2, mapping, res, respos);
|
---|
3650 | if (x==enc_EXCEPTION) {
|
---|
3651 | return -1;
|
---|
3652 | }
|
---|
3653 | else if (x==enc_FAILED) {
|
---|
3654 | Py_DECREF(repunicode);
|
---|
3655 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
|
---|
3656 | return -1;
|
---|
3657 | }
|
---|
3658 | }
|
---|
3659 | *inpos = newpos;
|
---|
3660 | Py_DECREF(repunicode);
|
---|
3661 | }
|
---|
3662 | return 0;
|
---|
3663 | }
|
---|
3664 |
|
---|
3665 | PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
|
---|
3666 | Py_ssize_t size,
|
---|
3667 | PyObject *mapping,
|
---|
3668 | const char *errors)
|
---|
3669 | {
|
---|
3670 | /* output object */
|
---|
3671 | PyObject *res = NULL;
|
---|
3672 | /* current input position */
|
---|
3673 | Py_ssize_t inpos = 0;
|
---|
3674 | /* current output position */
|
---|
3675 | Py_ssize_t respos = 0;
|
---|
3676 | PyObject *errorHandler = NULL;
|
---|
3677 | PyObject *exc = NULL;
|
---|
3678 | /* the following variable is used for caching string comparisons
|
---|
3679 | * -1=not initialized, 0=unknown, 1=strict, 2=replace,
|
---|
3680 | * 3=ignore, 4=xmlcharrefreplace */
|
---|
3681 | int known_errorHandler = -1;
|
---|
3682 |
|
---|
3683 | /* Default to Latin-1 */
|
---|
3684 | if (mapping == NULL)
|
---|
3685 | return PyUnicode_EncodeLatin1(p, size, errors);
|
---|
3686 |
|
---|
3687 | /* allocate enough for a simple encoding without
|
---|
3688 | replacements, if we need more, we'll resize */
|
---|
3689 | res = PyString_FromStringAndSize(NULL, size);
|
---|
3690 | if (res == NULL)
|
---|
3691 | goto onError;
|
---|
3692 | if (size == 0)
|
---|
3693 | return res;
|
---|
3694 |
|
---|
3695 | while (inpos<size) {
|
---|
3696 | /* try to encode it */
|
---|
3697 | charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
|
---|
3698 | if (x==enc_EXCEPTION) /* error */
|
---|
3699 | goto onError;
|
---|
3700 | if (x==enc_FAILED) { /* unencodable character */
|
---|
3701 | if (charmap_encoding_error(p, size, &inpos, mapping,
|
---|
3702 | &exc,
|
---|
3703 | &known_errorHandler, &errorHandler, errors,
|
---|
3704 | &res, &respos)) {
|
---|
3705 | goto onError;
|
---|
3706 | }
|
---|
3707 | }
|
---|
3708 | else
|
---|
3709 | /* done with this character => adjust input position */
|
---|
3710 | ++inpos;
|
---|
3711 | }
|
---|
3712 |
|
---|
3713 | /* Resize if we allocated to much */
|
---|
3714 | if (respos<PyString_GET_SIZE(res)) {
|
---|
3715 | if (_PyString_Resize(&res, respos))
|
---|
3716 | goto onError;
|
---|
3717 | }
|
---|
3718 | Py_XDECREF(exc);
|
---|
3719 | Py_XDECREF(errorHandler);
|
---|
3720 | return res;
|
---|
3721 |
|
---|
3722 | onError:
|
---|
3723 | Py_XDECREF(res);
|
---|
3724 | Py_XDECREF(exc);
|
---|
3725 | Py_XDECREF(errorHandler);
|
---|
3726 | return NULL;
|
---|
3727 | }
|
---|
3728 |
|
---|
3729 | PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
|
---|
3730 | PyObject *mapping)
|
---|
3731 | {
|
---|
3732 | if (!PyUnicode_Check(unicode) || mapping == NULL) {
|
---|
3733 | PyErr_BadArgument();
|
---|
3734 | return NULL;
|
---|
3735 | }
|
---|
3736 | return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
|
---|
3737 | PyUnicode_GET_SIZE(unicode),
|
---|
3738 | mapping,
|
---|
3739 | NULL);
|
---|
3740 | }
|
---|
3741 |
|
---|
3742 | /* create or adjust a UnicodeTranslateError */
|
---|
3743 | static void make_translate_exception(PyObject **exceptionObject,
|
---|
3744 | const Py_UNICODE *unicode, Py_ssize_t size,
|
---|
3745 | Py_ssize_t startpos, Py_ssize_t endpos,
|
---|
3746 | const char *reason)
|
---|
3747 | {
|
---|
3748 | if (*exceptionObject == NULL) {
|
---|
3749 | *exceptionObject = PyUnicodeTranslateError_Create(
|
---|
3750 | unicode, size, startpos, endpos, reason);
|
---|
3751 | }
|
---|
3752 | else {
|
---|
3753 | if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
|
---|
3754 | goto onError;
|
---|
3755 | if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
|
---|
3756 | goto onError;
|
---|
3757 | if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
|
---|
3758 | goto onError;
|
---|
3759 | return;
|
---|
3760 | onError:
|
---|
3761 | Py_DECREF(*exceptionObject);
|
---|
3762 | *exceptionObject = NULL;
|
---|
3763 | }
|
---|
3764 | }
|
---|
3765 |
|
---|
3766 | /* raises a UnicodeTranslateError */
|
---|
3767 | static void raise_translate_exception(PyObject **exceptionObject,
|
---|
3768 | const Py_UNICODE *unicode, Py_ssize_t size,
|
---|
3769 | Py_ssize_t startpos, Py_ssize_t endpos,
|
---|
3770 | const char *reason)
|
---|
3771 | {
|
---|
3772 | make_translate_exception(exceptionObject,
|
---|
3773 | unicode, size, startpos, endpos, reason);
|
---|
3774 | if (*exceptionObject != NULL)
|
---|
3775 | PyCodec_StrictErrors(*exceptionObject);
|
---|
3776 | }
|
---|
3777 |
|
---|
3778 | /* error handling callback helper:
|
---|
3779 | build arguments, call the callback and check the arguments,
|
---|
3780 | put the result into newpos and return the replacement string, which
|
---|
3781 | has to be freed by the caller */
|
---|
3782 | static PyObject *unicode_translate_call_errorhandler(const char *errors,
|
---|
3783 | PyObject **errorHandler,
|
---|
3784 | const char *reason,
|
---|
3785 | const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
|
---|
3786 | Py_ssize_t startpos, Py_ssize_t endpos,
|
---|
3787 | Py_ssize_t *newpos)
|
---|
3788 | {
|
---|
3789 | static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
|
---|
3790 |
|
---|
3791 | Py_ssize_t i_newpos;
|
---|
3792 | PyObject *restuple;
|
---|
3793 | PyObject *resunicode;
|
---|
3794 |
|
---|
3795 | if (*errorHandler == NULL) {
|
---|
3796 | *errorHandler = PyCodec_LookupError(errors);
|
---|
3797 | if (*errorHandler == NULL)
|
---|
3798 | return NULL;
|
---|
3799 | }
|
---|
3800 |
|
---|
3801 | make_translate_exception(exceptionObject,
|
---|
3802 | unicode, size, startpos, endpos, reason);
|
---|
3803 | if (*exceptionObject == NULL)
|
---|
3804 | return NULL;
|
---|
3805 |
|
---|
3806 | restuple = PyObject_CallFunctionObjArgs(
|
---|
3807 | *errorHandler, *exceptionObject, NULL);
|
---|
3808 | if (restuple == NULL)
|
---|
3809 | return NULL;
|
---|
3810 | if (!PyTuple_Check(restuple)) {
|
---|
3811 | PyErr_Format(PyExc_TypeError, &argparse[4]);
|
---|
3812 | Py_DECREF(restuple);
|
---|
3813 | return NULL;
|
---|
3814 | }
|
---|
3815 | if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
|
---|
3816 | &resunicode, &i_newpos)) {
|
---|
3817 | Py_DECREF(restuple);
|
---|
3818 | return NULL;
|
---|
3819 | }
|
---|
3820 | if (i_newpos<0)
|
---|
3821 | *newpos = size+i_newpos;
|
---|
3822 | else
|
---|
3823 | *newpos = i_newpos;
|
---|
3824 | if (*newpos<0 || *newpos>size) {
|
---|
3825 | PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
|
---|
3826 | Py_DECREF(restuple);
|
---|
3827 | return NULL;
|
---|
3828 | }
|
---|
3829 | Py_INCREF(resunicode);
|
---|
3830 | Py_DECREF(restuple);
|
---|
3831 | return resunicode;
|
---|
3832 | }
|
---|
3833 |
|
---|
3834 | /* Lookup the character ch in the mapping and put the result in result,
|
---|
3835 | which must be decrefed by the caller.
|
---|
3836 | Return 0 on success, -1 on error */
|
---|
3837 | static
|
---|
3838 | int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
|
---|
3839 | {
|
---|
3840 | PyObject *w = PyInt_FromLong((long)c);
|
---|
3841 | PyObject *x;
|
---|
3842 |
|
---|
3843 | if (w == NULL)
|
---|
3844 | return -1;
|
---|
3845 | x = PyObject_GetItem(mapping, w);
|
---|
3846 | Py_DECREF(w);
|
---|
3847 | if (x == NULL) {
|
---|
3848 | if (PyErr_ExceptionMatches(PyExc_LookupError)) {
|
---|
3849 | /* No mapping found means: use 1:1 mapping. */
|
---|
3850 | PyErr_Clear();
|
---|
3851 | *result = NULL;
|
---|
3852 | return 0;
|
---|
3853 | } else
|
---|
3854 | return -1;
|
---|
3855 | }
|
---|
3856 | else if (x == Py_None) {
|
---|
3857 | *result = x;
|
---|
3858 | return 0;
|
---|
3859 | }
|
---|
3860 | else if (PyInt_Check(x)) {
|
---|
3861 | long value = PyInt_AS_LONG(x);
|
---|
3862 | long max = PyUnicode_GetMax();
|
---|
3863 | if (value < 0 || value > max) {
|
---|
3864 | PyErr_Format(PyExc_TypeError,
|
---|
3865 | "character mapping must be in range(0x%lx)", max+1);
|
---|
3866 | Py_DECREF(x);
|
---|
3867 | return -1;
|
---|
3868 | }
|
---|
3869 | *result = x;
|
---|
3870 | return 0;
|
---|
3871 | }
|
---|
3872 | else if (PyUnicode_Check(x)) {
|
---|
3873 | *result = x;
|
---|
3874 | return 0;
|
---|
3875 | }
|
---|
3876 | else {
|
---|
3877 | /* wrong return value */
|
---|
3878 | PyErr_SetString(PyExc_TypeError,
|
---|
3879 | "character mapping must return integer, None or unicode");
|
---|
3880 | Py_DECREF(x);
|
---|
3881 | return -1;
|
---|
3882 | }
|
---|
3883 | }
|
---|
3884 | /* ensure that *outobj is at least requiredsize characters long,
|
---|
3885 | if not reallocate and adjust various state variables.
|
---|
3886 | Return 0 on success, -1 on error */
|
---|
3887 | static
|
---|
3888 | int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
|
---|
3889 | Py_ssize_t requiredsize)
|
---|
3890 | {
|
---|
3891 | Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
|
---|
3892 | if (requiredsize > oldsize) {
|
---|
3893 | /* remember old output position */
|
---|
3894 | Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
|
---|
3895 | /* exponentially overallocate to minimize reallocations */
|
---|
3896 | if (requiredsize < 2 * oldsize)
|
---|
3897 | requiredsize = 2 * oldsize;
|
---|
3898 | if (_PyUnicode_Resize(outobj, requiredsize) < 0)
|
---|
3899 | return -1;
|
---|
3900 | *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
|
---|
3901 | }
|
---|
3902 | return 0;
|
---|
3903 | }
|
---|
3904 | /* lookup the character, put the result in the output string and adjust
|
---|
3905 | various state variables. Return a new reference to the object that
|
---|
3906 | was put in the output buffer in *result, or Py_None, if the mapping was
|
---|
3907 | undefined (in which case no character was written).
|
---|
3908 | The called must decref result.
|
---|
3909 | Return 0 on success, -1 on error. */
|
---|
3910 | static
|
---|
3911 | int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
|
---|
3912 | Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
|
---|
3913 | PyObject **res)
|
---|
3914 | {
|
---|
3915 | if (charmaptranslate_lookup(*curinp, mapping, res))
|
---|
3916 | return -1;
|
---|
3917 | if (*res==NULL) {
|
---|
3918 | /* not found => default to 1:1 mapping */
|
---|
3919 | *(*outp)++ = *curinp;
|
---|
3920 | }
|
---|
3921 | else if (*res==Py_None)
|
---|
3922 | ;
|
---|
3923 | else if (PyInt_Check(*res)) {
|
---|
3924 | /* no overflow check, because we know that the space is enough */
|
---|
3925 | *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
|
---|
3926 | }
|
---|
3927 | else if (PyUnicode_Check(*res)) {
|
---|
3928 | Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
|
---|
3929 | if (repsize==1) {
|
---|
3930 | /* no overflow check, because we know that the space is enough */
|
---|
3931 | *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
|
---|
3932 | }
|
---|
3933 | else if (repsize!=0) {
|
---|
3934 | /* more than one character */
|
---|
3935 | Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
|
---|
3936 | (insize - (curinp-startinp)) +
|
---|
3937 | repsize - 1;
|
---|
3938 | if (charmaptranslate_makespace(outobj, outp, requiredsize))
|
---|
3939 | return -1;
|
---|
3940 | memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
|
---|
3941 | *outp += repsize;
|
---|
3942 | }
|
---|
3943 | }
|
---|
3944 | else
|
---|
3945 | return -1;
|
---|
3946 | return 0;
|
---|
3947 | }
|
---|
3948 |
|
---|
3949 | PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
|
---|
3950 | Py_ssize_t size,
|
---|
3951 | PyObject *mapping,
|
---|
3952 | const char *errors)
|
---|
3953 | {
|
---|
3954 | /* output object */
|
---|
3955 | PyObject *res = NULL;
|
---|
3956 | /* pointers to the beginning and end+1 of input */
|
---|
3957 | const Py_UNICODE *startp = p;
|
---|
3958 | const Py_UNICODE *endp = p + size;
|
---|
3959 | /* pointer into the output */
|
---|
3960 | Py_UNICODE *str;
|
---|
3961 | /* current output position */
|
---|
3962 | Py_ssize_t respos = 0;
|
---|
3963 | char *reason = "character maps to <undefined>";
|
---|
3964 | PyObject *errorHandler = NULL;
|
---|
3965 | PyObject *exc = NULL;
|
---|
3966 | /* the following variable is used for caching string comparisons
|
---|
3967 | * -1=not initialized, 0=unknown, 1=strict, 2=replace,
|
---|
3968 | * 3=ignore, 4=xmlcharrefreplace */
|
---|
3969 | int known_errorHandler = -1;
|
---|
3970 |
|
---|
3971 | if (mapping == NULL) {
|
---|
3972 | PyErr_BadArgument();
|
---|
3973 | return NULL;
|
---|
3974 | }
|
---|
3975 |
|
---|
3976 | /* allocate enough for a simple 1:1 translation without
|
---|
3977 | replacements, if we need more, we'll resize */
|
---|
3978 | res = PyUnicode_FromUnicode(NULL, size);
|
---|
3979 | if (res == NULL)
|
---|
3980 | goto onError;
|
---|
3981 | if (size == 0)
|
---|
3982 | return res;
|
---|
3983 | str = PyUnicode_AS_UNICODE(res);
|
---|
3984 |
|
---|
3985 | while (p<endp) {
|
---|
3986 | /* try to encode it */
|
---|
3987 | PyObject *x = NULL;
|
---|
3988 | if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
|
---|
3989 | Py_XDECREF(x);
|
---|
3990 | goto onError;
|
---|
3991 | }
|
---|
3992 | Py_XDECREF(x);
|
---|
3993 | if (x!=Py_None) /* it worked => adjust input pointer */
|
---|
3994 | ++p;
|
---|
3995 | else { /* untranslatable character */
|
---|
3996 | PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
|
---|
3997 | Py_ssize_t repsize;
|
---|
3998 | Py_ssize_t newpos;
|
---|
3999 | Py_UNICODE *uni2;
|
---|
4000 | /* startpos for collecting untranslatable chars */
|
---|
4001 | const Py_UNICODE *collstart = p;
|
---|
4002 | const Py_UNICODE *collend = p+1;
|
---|
4003 | const Py_UNICODE *coll;
|
---|
4004 |
|
---|
4005 | /* find all untranslatable characters */
|
---|
4006 | while (collend < endp) {
|
---|
4007 | if (charmaptranslate_lookup(*collend, mapping, &x))
|
---|
4008 | goto onError;
|
---|
4009 | Py_XDECREF(x);
|
---|
4010 | if (x!=Py_None)
|
---|
4011 | break;
|
---|
4012 | ++collend;
|
---|
4013 | }
|
---|
4014 | /* cache callback name lookup
|
---|
4015 | * (if not done yet, i.e. it's the first error) */
|
---|
4016 | if (known_errorHandler==-1) {
|
---|
4017 | if ((errors==NULL) || (!strcmp(errors, "strict")))
|
---|
4018 | known_errorHandler = 1;
|
---|
4019 | else if (!strcmp(errors, "replace"))
|
---|
4020 | known_errorHandler = 2;
|
---|
4021 | else if (!strcmp(errors, "ignore"))
|
---|
4022 | known_errorHandler = 3;
|
---|
4023 | else if (!strcmp(errors, "xmlcharrefreplace"))
|
---|
4024 | known_errorHandler = 4;
|
---|
4025 | else
|
---|
4026 | known_errorHandler = 0;
|
---|
4027 | }
|
---|
4028 | switch (known_errorHandler) {
|
---|
4029 | case 1: /* strict */
|
---|
4030 | raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
|
---|
4031 | goto onError;
|
---|
4032 | case 2: /* replace */
|
---|
4033 | /* No need to check for space, this is a 1:1 replacement */
|
---|
4034 | for (coll = collstart; coll<collend; ++coll)
|
---|
4035 | *str++ = '?';
|
---|
4036 | /* fall through */
|
---|
4037 | case 3: /* ignore */
|
---|
4038 | p = collend;
|
---|
4039 | break;
|
---|
4040 | case 4: /* xmlcharrefreplace */
|
---|
4041 | /* generate replacement (temporarily (mis)uses p) */
|
---|
4042 | for (p = collstart; p < collend; ++p) {
|
---|
4043 | char buffer[2+29+1+1];
|
---|
4044 | char *cp;
|
---|
4045 | sprintf(buffer, "&#%d;", (int)*p);
|
---|
4046 | if (charmaptranslate_makespace(&res, &str,
|
---|
4047 | (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
|
---|
4048 | goto onError;
|
---|
4049 | for (cp = buffer; *cp; ++cp)
|
---|
4050 | *str++ = *cp;
|
---|
4051 | }
|
---|
4052 | p = collend;
|
---|
4053 | break;
|
---|
4054 | default:
|
---|
4055 | repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
|
---|
4056 | reason, startp, size, &exc,
|
---|
4057 | collstart-startp, collend-startp, &newpos);
|
---|
4058 | if (repunicode == NULL)
|
---|
4059 | goto onError;
|
---|
4060 | /* generate replacement */
|
---|
4061 | repsize = PyUnicode_GET_SIZE(repunicode);
|
---|
4062 | if (charmaptranslate_makespace(&res, &str,
|
---|
4063 | (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
|
---|
4064 | Py_DECREF(repunicode);
|
---|
4065 | goto onError;
|
---|
4066 | }
|
---|
4067 | for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
|
---|
4068 | *str++ = *uni2;
|
---|
4069 | p = startp + newpos;
|
---|
4070 | Py_DECREF(repunicode);
|
---|
4071 | }
|
---|
4072 | }
|
---|
4073 | }
|
---|
4074 | /* Resize if we allocated to much */
|
---|
4075 | respos = str-PyUnicode_AS_UNICODE(res);
|
---|
4076 | if (respos<PyUnicode_GET_SIZE(res)) {
|
---|
4077 | if (_PyUnicode_Resize(&res, respos) < 0)
|
---|
4078 | goto onError;
|
---|
4079 | }
|
---|
4080 | Py_XDECREF(exc);
|
---|
4081 | Py_XDECREF(errorHandler);
|
---|
4082 | return res;
|
---|
4083 |
|
---|
4084 | onError:
|
---|
4085 | Py_XDECREF(res);
|
---|
4086 | Py_XDECREF(exc);
|
---|
4087 | Py_XDECREF(errorHandler);
|
---|
4088 | return NULL;
|
---|
4089 | }
|
---|
4090 |
|
---|
4091 | PyObject *PyUnicode_Translate(PyObject *str,
|
---|
4092 | PyObject *mapping,
|
---|
4093 | const char *errors)
|
---|
4094 | {
|
---|
4095 | PyObject *result;
|
---|
4096 |
|
---|
4097 | str = PyUnicode_FromObject(str);
|
---|
4098 | if (str == NULL)
|
---|
4099 | goto onError;
|
---|
4100 | result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
|
---|
4101 | PyUnicode_GET_SIZE(str),
|
---|
4102 | mapping,
|
---|
4103 | errors);
|
---|
4104 | Py_DECREF(str);
|
---|
4105 | return result;
|
---|
4106 |
|
---|
4107 | onError:
|
---|
4108 | Py_XDECREF(str);
|
---|
4109 | return NULL;
|
---|
4110 | }
|
---|
4111 |
|
---|
4112 | /* --- Decimal Encoder ---------------------------------------------------- */
|
---|
4113 |
|
---|
4114 | int PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
---|
4115 | Py_ssize_t length,
|
---|
4116 | char *output,
|
---|
4117 | const char *errors)
|
---|
4118 | {
|
---|
4119 | Py_UNICODE *p, *end;
|
---|
4120 | PyObject *errorHandler = NULL;
|
---|
4121 | PyObject *exc = NULL;
|
---|
4122 | const char *encoding = "decimal";
|
---|
4123 | const char *reason = "invalid decimal Unicode string";
|
---|
4124 | /* the following variable is used for caching string comparisons
|
---|
4125 | * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
|
---|
4126 | int known_errorHandler = -1;
|
---|
4127 |
|
---|
4128 | if (output == NULL) {
|
---|
4129 | PyErr_BadArgument();
|
---|
4130 | return -1;
|
---|
4131 | }
|
---|
4132 |
|
---|
4133 | p = s;
|
---|
4134 | end = s + length;
|
---|
4135 | while (p < end) {
|
---|
4136 | register Py_UNICODE ch = *p;
|
---|
4137 | int decimal;
|
---|
4138 | PyObject *repunicode;
|
---|
4139 | Py_ssize_t repsize;
|
---|
4140 | Py_ssize_t newpos;
|
---|
4141 | Py_UNICODE *uni2;
|
---|
4142 | Py_UNICODE *collstart;
|
---|
4143 | Py_UNICODE *collend;
|
---|
4144 |
|
---|
4145 | if (Py_UNICODE_ISSPACE(ch)) {
|
---|
4146 | *output++ = ' ';
|
---|
4147 | ++p;
|
---|
4148 | continue;
|
---|
4149 | }
|
---|
4150 | decimal = Py_UNICODE_TODECIMAL(ch);
|
---|
4151 | if (decimal >= 0) {
|
---|
4152 | *output++ = '0' + decimal;
|
---|
4153 | ++p;
|
---|
4154 | continue;
|
---|
4155 | }
|
---|
4156 | if (0 < ch && ch < 256) {
|
---|
4157 | *output++ = (char)ch;
|
---|
4158 | ++p;
|
---|
4159 | continue;
|
---|
4160 | }
|
---|
4161 | /* All other characters are considered unencodable */
|
---|
4162 | collstart = p;
|
---|
4163 | collend = p+1;
|
---|
4164 | while (collend < end) {
|
---|
4165 | if ((0 < *collend && *collend < 256) ||
|
---|
4166 | !Py_UNICODE_ISSPACE(*collend) ||
|
---|
4167 | Py_UNICODE_TODECIMAL(*collend))
|
---|
4168 | break;
|
---|
4169 | }
|
---|
4170 | /* cache callback name lookup
|
---|
4171 | * (if not done yet, i.e. it's the first error) */
|
---|
4172 | if (known_errorHandler==-1) {
|
---|
4173 | if ((errors==NULL) || (!strcmp(errors, "strict")))
|
---|
4174 | known_errorHandler = 1;
|
---|
4175 | else if (!strcmp(errors, "replace"))
|
---|
4176 | known_errorHandler = 2;
|
---|
4177 | else if (!strcmp(errors, "ignore"))
|
---|
4178 | known_errorHandler = 3;
|
---|
4179 | else if (!strcmp(errors, "xmlcharrefreplace"))
|
---|
4180 | known_errorHandler = 4;
|
---|
4181 | else
|
---|
4182 | known_errorHandler = 0;
|
---|
4183 | }
|
---|
4184 | switch (known_errorHandler) {
|
---|
4185 | case 1: /* strict */
|
---|
4186 | raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
|
---|
4187 | goto onError;
|
---|
4188 | case 2: /* replace */
|
---|
4189 | for (p = collstart; p < collend; ++p)
|
---|
4190 | *output++ = '?';
|
---|
4191 | /* fall through */
|
---|
4192 | case 3: /* ignore */
|
---|
4193 | p = collend;
|
---|
4194 | break;
|
---|
4195 | case 4: /* xmlcharrefreplace */
|
---|
4196 | /* generate replacement (temporarily (mis)uses p) */
|
---|
4197 | for (p = collstart; p < collend; ++p)
|
---|
4198 | output += sprintf(output, "&#%d;", (int)*p);
|
---|
4199 | p = collend;
|
---|
4200 | break;
|
---|
4201 | default:
|
---|
4202 | repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
|
---|
4203 | encoding, reason, s, length, &exc,
|
---|
4204 | collstart-s, collend-s, &newpos);
|
---|
4205 | if (repunicode == NULL)
|
---|
4206 | goto onError;
|
---|
4207 | /* generate replacement */
|
---|
4208 | repsize = PyUnicode_GET_SIZE(repunicode);
|
---|
4209 | for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
---|
4210 | Py_UNICODE ch = *uni2;
|
---|
4211 | if (Py_UNICODE_ISSPACE(ch))
|
---|
4212 | *output++ = ' ';
|
---|
4213 | else {
|
---|
4214 | decimal = Py_UNICODE_TODECIMAL(ch);
|
---|
4215 | if (decimal >= 0)
|
---|
4216 | *output++ = '0' + decimal;
|
---|
4217 | else if (0 < ch && ch < 256)
|
---|
4218 | *output++ = (char)ch;
|
---|
4219 | else {
|
---|
4220 | Py_DECREF(repunicode);
|
---|
4221 | raise_encode_exception(&exc, encoding,
|
---|
4222 | s, length, collstart-s, collend-s, reason);
|
---|
4223 | goto onError;
|
---|
4224 | }
|
---|
4225 | }
|
---|
4226 | }
|
---|
4227 | p = s + newpos;
|
---|
4228 | Py_DECREF(repunicode);
|
---|
4229 | }
|
---|
4230 | }
|
---|
4231 | /* 0-terminate the output string */
|
---|
4232 | *output++ = '\0';
|
---|
4233 | Py_XDECREF(exc);
|
---|
4234 | Py_XDECREF(errorHandler);
|
---|
4235 | return 0;
|
---|
4236 |
|
---|
4237 | onError:
|
---|
4238 | Py_XDECREF(exc);
|
---|
4239 | Py_XDECREF(errorHandler);
|
---|
4240 | return -1;
|
---|
4241 | }
|
---|
4242 |
|
---|
4243 | /* --- Helpers ------------------------------------------------------------ */
|
---|
4244 |
|
---|
4245 | #define STRINGLIB_CHAR Py_UNICODE
|
---|
4246 |
|
---|
4247 | #define STRINGLIB_LEN PyUnicode_GET_SIZE
|
---|
4248 | #define STRINGLIB_NEW PyUnicode_FromUnicode
|
---|
4249 | #define STRINGLIB_STR PyUnicode_AS_UNICODE
|
---|
4250 |
|
---|
4251 | Py_LOCAL_INLINE(int)
|
---|
4252 | STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
|
---|
4253 | {
|
---|
4254 | if (str[0] != other[0])
|
---|
4255 | return 1;
|
---|
4256 | return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
|
---|
4257 | }
|
---|
4258 |
|
---|
4259 | #define STRINGLIB_EMPTY unicode_empty
|
---|
4260 |
|
---|
4261 | #include "stringlib/fastsearch.h"
|
---|
4262 |
|
---|
4263 | #include "stringlib/count.h"
|
---|
4264 | #include "stringlib/find.h"
|
---|
4265 | #include "stringlib/partition.h"
|
---|
4266 |
|
---|
/* Helper macro that normalises the local slice bounds `start` and
   `end` against (obj)->length: a negative bound has the length added
   and is then floored at 0; `end` is additionally capped at the
   length.  Expands to plain statements operating on the caller's
   `start` and `end` variables. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4279 |
|
---|
4280 | Py_ssize_t PyUnicode_Count(PyObject *str,
|
---|
4281 | PyObject *substr,
|
---|
4282 | Py_ssize_t start,
|
---|
4283 | Py_ssize_t end)
|
---|
4284 | {
|
---|
4285 | Py_ssize_t result;
|
---|
4286 | PyUnicodeObject* str_obj;
|
---|
4287 | PyUnicodeObject* sub_obj;
|
---|
4288 |
|
---|
4289 | str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
|
---|
4290 | if (!str_obj)
|
---|
4291 | return -1;
|
---|
4292 | sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
|
---|
4293 | if (!sub_obj) {
|
---|
4294 | Py_DECREF(str_obj);
|
---|
4295 | return -1;
|
---|
4296 | }
|
---|
4297 |
|
---|
4298 | FIX_START_END(str_obj);
|
---|
4299 |
|
---|
4300 | result = stringlib_count(
|
---|
4301 | str_obj->str + start, end - start, sub_obj->str, sub_obj->length
|
---|
4302 | );
|
---|
4303 |
|
---|
4304 | Py_DECREF(sub_obj);
|
---|
4305 | Py_DECREF(str_obj);
|
---|
4306 |
|
---|
4307 | return result;
|
---|
4308 | }
|
---|
4309 |
|
---|
4310 | Py_ssize_t PyUnicode_Find(PyObject *str,
|
---|
4311 | PyObject *sub,
|
---|
4312 | Py_ssize_t start,
|
---|
4313 | Py_ssize_t end,
|
---|
4314 | int direction)
|
---|
4315 | {
|
---|
4316 | Py_ssize_t result;
|
---|
4317 |
|
---|
4318 | str = PyUnicode_FromObject(str);
|
---|
4319 | if (!str)
|
---|
4320 | return -2;
|
---|
4321 | sub = PyUnicode_FromObject(sub);
|
---|
4322 | if (!sub) {
|
---|
4323 | Py_DECREF(str);
|
---|
4324 | return -2;
|
---|
4325 | }
|
---|
4326 |
|
---|
4327 | if (direction > 0)
|
---|
4328 | result = stringlib_find_slice(
|
---|
4329 | PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
|
---|
4330 | PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
|
---|
4331 | start, end
|
---|
4332 | );
|
---|
4333 | else
|
---|
4334 | result = stringlib_rfind_slice(
|
---|
4335 | PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
|
---|
4336 | PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
|
---|
4337 | start, end
|
---|
4338 | );
|
---|
4339 |
|
---|
4340 | Py_DECREF(str);
|
---|
4341 | Py_DECREF(sub);
|
---|
4342 |
|
---|
4343 | return result;
|
---|
4344 | }
|
---|
4345 |
|
---|
4346 | static
|
---|
4347 | int tailmatch(PyUnicodeObject *self,
|
---|
4348 | PyUnicodeObject *substring,
|
---|
4349 | Py_ssize_t start,
|
---|
4350 | Py_ssize_t end,
|
---|
4351 | int direction)
|
---|
4352 | {
|
---|
4353 | if (substring->length == 0)
|
---|
4354 | return 1;
|
---|
4355 |
|
---|
4356 | FIX_START_END(self);
|
---|
4357 |
|
---|
4358 | end -= substring->length;
|
---|
4359 | if (end < start)
|
---|
4360 | return 0;
|
---|
4361 |
|
---|
4362 | if (direction > 0) {
|
---|
4363 | if (Py_UNICODE_MATCH(self, end, substring))
|
---|
4364 | return 1;
|
---|
4365 | } else {
|
---|
4366 | if (Py_UNICODE_MATCH(self, start, substring))
|
---|
4367 | return 1;
|
---|
4368 | }
|
---|
4369 |
|
---|
4370 | return 0;
|
---|
4371 | }
|
---|
4372 |
|
---|
4373 | Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
|
---|
4374 | PyObject *substr,
|
---|
4375 | Py_ssize_t start,
|
---|
4376 | Py_ssize_t end,
|
---|
4377 | int direction)
|
---|
4378 | {
|
---|
4379 | Py_ssize_t result;
|
---|
4380 |
|
---|
4381 | str = PyUnicode_FromObject(str);
|
---|
4382 | if (str == NULL)
|
---|
4383 | return -1;
|
---|
4384 | substr = PyUnicode_FromObject(substr);
|
---|
4385 | if (substr == NULL) {
|
---|
4386 | Py_DECREF(str);
|
---|
4387 | return -1;
|
---|
4388 | }
|
---|
4389 |
|
---|
4390 | result = tailmatch((PyUnicodeObject *)str,
|
---|
4391 | (PyUnicodeObject *)substr,
|
---|
4392 | start, end, direction);
|
---|
4393 | Py_DECREF(str);
|
---|
4394 | Py_DECREF(substr);
|
---|
4395 | return result;
|
---|
4396 | }
|
---|
4397 |
|
---|
4398 | /* Apply fixfct filter to the Unicode object self and return a
|
---|
4399 | reference to the modified object */
|
---|
4400 |
|
---|
4401 | static
|
---|
4402 | PyObject *fixup(PyUnicodeObject *self,
|
---|
4403 | int (*fixfct)(PyUnicodeObject *s))
|
---|
4404 | {
|
---|
4405 |
|
---|
4406 | PyUnicodeObject *u;
|
---|
4407 |
|
---|
4408 | u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
|
---|
4409 | if (u == NULL)
|
---|
4410 | return NULL;
|
---|
4411 |
|
---|
4412 | Py_UNICODE_COPY(u->str, self->str, self->length);
|
---|
4413 |
|
---|
4414 | if (!fixfct(u) && PyUnicode_CheckExact(self)) {
|
---|
4415 | /* fixfct should return TRUE if it modified the buffer. If
|
---|
4416 | FALSE, return a reference to the original buffer instead
|
---|
4417 | (to save space, not time) */
|
---|
4418 | Py_INCREF(self);
|
---|
4419 | Py_DECREF(u);
|
---|
4420 | return (PyObject*) self;
|
---|
4421 | }
|
---|
4422 | return (PyObject*) u;
|
---|
4423 | }
|
---|
4424 |
|
---|
4425 | static
|
---|
4426 | int fixupper(PyUnicodeObject *self)
|
---|
4427 | {
|
---|
4428 | Py_ssize_t len = self->length;
|
---|
4429 | Py_UNICODE *s = self->str;
|
---|
4430 | int status = 0;
|
---|
4431 |
|
---|
4432 | while (len-- > 0) {
|
---|
4433 | register Py_UNICODE ch;
|
---|
4434 |
|
---|
4435 | ch = Py_UNICODE_TOUPPER(*s);
|
---|
4436 | if (ch != *s) {
|
---|
4437 | status = 1;
|
---|
4438 | *s = ch;
|
---|
4439 | }
|
---|
4440 | s++;
|
---|
4441 | }
|
---|
4442 |
|
---|
4443 | return status;
|
---|
4444 | }
|
---|
4445 |
|
---|
4446 | static
|
---|
4447 | int fixlower(PyUnicodeObject *self)
|
---|
4448 | {
|
---|
4449 | Py_ssize_t len = self->length;
|
---|
4450 | Py_UNICODE *s = self->str;
|
---|
4451 | int status = 0;
|
---|
4452 |
|
---|
4453 | while (len-- > 0) {
|
---|
4454 | register Py_UNICODE ch;
|
---|
4455 |
|
---|
4456 | ch = Py_UNICODE_TOLOWER(*s);
|
---|
4457 | if (ch != *s) {
|
---|
4458 | status = 1;
|
---|
4459 | *s = ch;
|
---|
4460 | }
|
---|
4461 | s++;
|
---|
4462 | }
|
---|
4463 |
|
---|
4464 | return status;
|
---|
4465 | }
|
---|
4466 |
|
---|
4467 | static
|
---|
4468 | int fixswapcase(PyUnicodeObject *self)
|
---|
4469 | {
|
---|
4470 | Py_ssize_t len = self->length;
|
---|
4471 | Py_UNICODE *s = self->str;
|
---|
4472 | int status = 0;
|
---|
4473 |
|
---|
4474 | while (len-- > 0) {
|
---|
4475 | if (Py_UNICODE_ISUPPER(*s)) {
|
---|
4476 | *s = Py_UNICODE_TOLOWER(*s);
|
---|
4477 | status = 1;
|
---|
4478 | } else if (Py_UNICODE_ISLOWER(*s)) {
|
---|
4479 | *s = Py_UNICODE_TOUPPER(*s);
|
---|
4480 | status = 1;
|
---|
4481 | }
|
---|
4482 | s++;
|
---|
4483 | }
|
---|
4484 |
|
---|
4485 | return status;
|
---|
4486 | }
|
---|
4487 |
|
---|
4488 | static
|
---|
4489 | int fixcapitalize(PyUnicodeObject *self)
|
---|
4490 | {
|
---|
4491 | Py_ssize_t len = self->length;
|
---|
4492 | Py_UNICODE *s = self->str;
|
---|
4493 | int status = 0;
|
---|
4494 |
|
---|
4495 | if (len == 0)
|
---|
4496 | return 0;
|
---|
4497 | if (Py_UNICODE_ISLOWER(*s)) {
|
---|
4498 | *s = Py_UNICODE_TOUPPER(*s);
|
---|
4499 | status = 1;
|
---|
4500 | }
|
---|
4501 | s++;
|
---|
4502 | while (--len > 0) {
|
---|
4503 | if (Py_UNICODE_ISUPPER(*s)) {
|
---|
4504 | *s = Py_UNICODE_TOLOWER(*s);
|
---|
4505 | status = 1;
|
---|
4506 | }
|
---|
4507 | s++;
|
---|
4508 | }
|
---|
4509 | return status;
|
---|
4510 | }
|
---|
4511 |
|
---|
4512 | static
|
---|
4513 | int fixtitle(PyUnicodeObject *self)
|
---|
4514 | {
|
---|
4515 | register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
4516 | register Py_UNICODE *e;
|
---|
4517 | int previous_is_cased;
|
---|
4518 |
|
---|
4519 | /* Shortcut for single character strings */
|
---|
4520 | if (PyUnicode_GET_SIZE(self) == 1) {
|
---|
4521 | Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
|
---|
4522 | if (*p != ch) {
|
---|
4523 | *p = ch;
|
---|
4524 | return 1;
|
---|
4525 | }
|
---|
4526 | else
|
---|
4527 | return 0;
|
---|
4528 | }
|
---|
4529 |
|
---|
4530 | e = p + PyUnicode_GET_SIZE(self);
|
---|
4531 | previous_is_cased = 0;
|
---|
4532 | for (; p < e; p++) {
|
---|
4533 | register const Py_UNICODE ch = *p;
|
---|
4534 |
|
---|
4535 | if (previous_is_cased)
|
---|
4536 | *p = Py_UNICODE_TOLOWER(ch);
|
---|
4537 | else
|
---|
4538 | *p = Py_UNICODE_TOTITLE(ch);
|
---|
4539 |
|
---|
4540 | if (Py_UNICODE_ISLOWER(ch) ||
|
---|
4541 | Py_UNICODE_ISUPPER(ch) ||
|
---|
4542 | Py_UNICODE_ISTITLE(ch))
|
---|
4543 | previous_is_cased = 1;
|
---|
4544 | else
|
---|
4545 | previous_is_cased = 0;
|
---|
4546 | }
|
---|
4547 | return 1;
|
---|
4548 | }
|
---|
4549 |
|
---|
4550 | PyObject *
|
---|
4551 | PyUnicode_Join(PyObject *separator, PyObject *seq)
|
---|
4552 | {
|
---|
4553 | PyObject *internal_separator = NULL;
|
---|
4554 | const Py_UNICODE blank = ' ';
|
---|
4555 | const Py_UNICODE *sep = ␣
|
---|
4556 | Py_ssize_t seplen = 1;
|
---|
4557 | PyUnicodeObject *res = NULL; /* the result */
|
---|
4558 | Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
|
---|
4559 | Py_ssize_t res_used; /* # used bytes */
|
---|
4560 | Py_UNICODE *res_p; /* pointer to free byte in res's string area */
|
---|
4561 | PyObject *fseq; /* PySequence_Fast(seq) */
|
---|
4562 | Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
|
---|
4563 | PyObject *item;
|
---|
4564 | Py_ssize_t i;
|
---|
4565 |
|
---|
4566 | fseq = PySequence_Fast(seq, "");
|
---|
4567 | if (fseq == NULL) {
|
---|
4568 | return NULL;
|
---|
4569 | }
|
---|
4570 |
|
---|
4571 | /* Grrrr. A codec may be invoked to convert str objects to
|
---|
4572 | * Unicode, and so it's possible to call back into Python code
|
---|
4573 | * during PyUnicode_FromObject(), and so it's possible for a sick
|
---|
4574 | * codec to change the size of fseq (if seq is a list). Therefore
|
---|
4575 | * we have to keep refetching the size -- can't assume seqlen
|
---|
4576 | * is invariant.
|
---|
4577 | */
|
---|
4578 | seqlen = PySequence_Fast_GET_SIZE(fseq);
|
---|
4579 | /* If empty sequence, return u"". */
|
---|
4580 | if (seqlen == 0) {
|
---|
4581 | res = _PyUnicode_New(0); /* empty sequence; return u"" */
|
---|
4582 | goto Done;
|
---|
4583 | }
|
---|
4584 | /* If singleton sequence with an exact Unicode, return that. */
|
---|
4585 | if (seqlen == 1) {
|
---|
4586 | item = PySequence_Fast_GET_ITEM(fseq, 0);
|
---|
4587 | if (PyUnicode_CheckExact(item)) {
|
---|
4588 | Py_INCREF(item);
|
---|
4589 | res = (PyUnicodeObject *)item;
|
---|
4590 | goto Done;
|
---|
4591 | }
|
---|
4592 | }
|
---|
4593 |
|
---|
4594 | /* At least two items to join, or one that isn't exact Unicode. */
|
---|
4595 | if (seqlen > 1) {
|
---|
4596 | /* Set up sep and seplen -- they're needed. */
|
---|
4597 | if (separator == NULL) {
|
---|
4598 | sep = ␣
|
---|
4599 | seplen = 1;
|
---|
4600 | }
|
---|
4601 | else {
|
---|
4602 | internal_separator = PyUnicode_FromObject(separator);
|
---|
4603 | if (internal_separator == NULL)
|
---|
4604 | goto onError;
|
---|
4605 | sep = PyUnicode_AS_UNICODE(internal_separator);
|
---|
4606 | seplen = PyUnicode_GET_SIZE(internal_separator);
|
---|
4607 | /* In case PyUnicode_FromObject() mutated seq. */
|
---|
4608 | seqlen = PySequence_Fast_GET_SIZE(fseq);
|
---|
4609 | }
|
---|
4610 | }
|
---|
4611 |
|
---|
4612 | /* Get space. */
|
---|
4613 | res = _PyUnicode_New(res_alloc);
|
---|
4614 | if (res == NULL)
|
---|
4615 | goto onError;
|
---|
4616 | res_p = PyUnicode_AS_UNICODE(res);
|
---|
4617 | res_used = 0;
|
---|
4618 |
|
---|
4619 | for (i = 0; i < seqlen; ++i) {
|
---|
4620 | Py_ssize_t itemlen;
|
---|
4621 | Py_ssize_t new_res_used;
|
---|
4622 |
|
---|
4623 | item = PySequence_Fast_GET_ITEM(fseq, i);
|
---|
4624 | /* Convert item to Unicode. */
|
---|
4625 | if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
|
---|
4626 | PyErr_Format(PyExc_TypeError,
|
---|
4627 | "sequence item %zd: expected string or Unicode,"
|
---|
4628 | " %.80s found",
|
---|
4629 | i, item->ob_type->tp_name);
|
---|
4630 | goto onError;
|
---|
4631 | }
|
---|
4632 | item = PyUnicode_FromObject(item);
|
---|
4633 | if (item == NULL)
|
---|
4634 | goto onError;
|
---|
4635 | /* We own a reference to item from here on. */
|
---|
4636 |
|
---|
4637 | /* In case PyUnicode_FromObject() mutated seq. */
|
---|
4638 | seqlen = PySequence_Fast_GET_SIZE(fseq);
|
---|
4639 |
|
---|
4640 | /* Make sure we have enough space for the separator and the item. */
|
---|
4641 | itemlen = PyUnicode_GET_SIZE(item);
|
---|
4642 | new_res_used = res_used + itemlen;
|
---|
4643 | if (new_res_used < 0)
|
---|
4644 | goto Overflow;
|
---|
4645 | if (i < seqlen - 1) {
|
---|
4646 | new_res_used += seplen;
|
---|
4647 | if (new_res_used < 0)
|
---|
4648 | goto Overflow;
|
---|
4649 | }
|
---|
4650 | if (new_res_used > res_alloc) {
|
---|
4651 | /* double allocated size until it's big enough */
|
---|
4652 | do {
|
---|
4653 | res_alloc += res_alloc;
|
---|
4654 | if (res_alloc <= 0)
|
---|
4655 | goto Overflow;
|
---|
4656 | } while (new_res_used > res_alloc);
|
---|
4657 | if (_PyUnicode_Resize(&res, res_alloc) < 0) {
|
---|
4658 | Py_DECREF(item);
|
---|
4659 | goto onError;
|
---|
4660 | }
|
---|
4661 | res_p = PyUnicode_AS_UNICODE(res) + res_used;
|
---|
4662 | }
|
---|
4663 |
|
---|
4664 | /* Copy item, and maybe the separator. */
|
---|
4665 | Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
|
---|
4666 | res_p += itemlen;
|
---|
4667 | if (i < seqlen - 1) {
|
---|
4668 | Py_UNICODE_COPY(res_p, sep, seplen);
|
---|
4669 | res_p += seplen;
|
---|
4670 | }
|
---|
4671 | Py_DECREF(item);
|
---|
4672 | res_used = new_res_used;
|
---|
4673 | }
|
---|
4674 |
|
---|
4675 | /* Shrink res to match the used area; this probably can't fail,
|
---|
4676 | * but it's cheap to check.
|
---|
4677 | */
|
---|
4678 | if (_PyUnicode_Resize(&res, res_used) < 0)
|
---|
4679 | goto onError;
|
---|
4680 |
|
---|
4681 | Done:
|
---|
4682 | Py_XDECREF(internal_separator);
|
---|
4683 | Py_DECREF(fseq);
|
---|
4684 | return (PyObject *)res;
|
---|
4685 |
|
---|
4686 | Overflow:
|
---|
4687 | PyErr_SetString(PyExc_OverflowError,
|
---|
4688 | "join() result is too long for a Python string");
|
---|
4689 | Py_DECREF(item);
|
---|
4690 | /* fall through */
|
---|
4691 |
|
---|
4692 | onError:
|
---|
4693 | Py_XDECREF(internal_separator);
|
---|
4694 | Py_DECREF(fseq);
|
---|
4695 | Py_XDECREF(res);
|
---|
4696 | return NULL;
|
---|
4697 | }
|
---|
4698 |
|
---|
4699 | static
|
---|
4700 | PyUnicodeObject *pad(PyUnicodeObject *self,
|
---|
4701 | Py_ssize_t left,
|
---|
4702 | Py_ssize_t right,
|
---|
4703 | Py_UNICODE fill)
|
---|
4704 | {
|
---|
4705 | PyUnicodeObject *u;
|
---|
4706 |
|
---|
4707 | if (left < 0)
|
---|
4708 | left = 0;
|
---|
4709 | if (right < 0)
|
---|
4710 | right = 0;
|
---|
4711 |
|
---|
4712 | if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
|
---|
4713 | Py_INCREF(self);
|
---|
4714 | return self;
|
---|
4715 | }
|
---|
4716 |
|
---|
4717 | u = _PyUnicode_New(left + self->length + right);
|
---|
4718 | if (u) {
|
---|
4719 | if (left)
|
---|
4720 | Py_UNICODE_FILL(u->str, fill, left);
|
---|
4721 | Py_UNICODE_COPY(u->str + left, self->str, self->length);
|
---|
4722 | if (right)
|
---|
4723 | Py_UNICODE_FILL(u->str + left + self->length, fill, right);
|
---|
4724 | }
|
---|
4725 |
|
---|
4726 | return u;
|
---|
4727 | }
|
---|
4728 |
|
---|
/* Append the slice data[left:right] to `list` as a new Unicode object.
   Relies on the expansion site providing a `PyObject *str` scratch
   variable, a `list` object and an `onError` label, which is jumped to
   if either the slice creation or the append fails.  Wrapped in
   do { } while (0) so that it behaves as a single statement; use it with
   a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed;                                        \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed = PyList_Append(list, str);                 \
        /* our reference is dropped whether or not the append worked */ \
        Py_DECREF(str);                                                 \
        if (split_append_failed)                                        \
            goto onError;                                               \
    } while (0)
|
---|
4739 |
|
---|
4740 | static
|
---|
4741 | PyObject *split_whitespace(PyUnicodeObject *self,
|
---|
4742 | PyObject *list,
|
---|
4743 | Py_ssize_t maxcount)
|
---|
4744 | {
|
---|
4745 | register Py_ssize_t i;
|
---|
4746 | register Py_ssize_t j;
|
---|
4747 | Py_ssize_t len = self->length;
|
---|
4748 | PyObject *str;
|
---|
4749 |
|
---|
4750 | for (i = j = 0; i < len; ) {
|
---|
4751 | /* find a token */
|
---|
4752 | while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
|
---|
4753 | i++;
|
---|
4754 | j = i;
|
---|
4755 | while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
|
---|
4756 | i++;
|
---|
4757 | if (j < i) {
|
---|
4758 | if (maxcount-- <= 0)
|
---|
4759 | break;
|
---|
4760 | SPLIT_APPEND(self->str, j, i);
|
---|
4761 | while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
|
---|
4762 | i++;
|
---|
4763 | j = i;
|
---|
4764 | }
|
---|
4765 | }
|
---|
4766 | if (j < len) {
|
---|
4767 | SPLIT_APPEND(self->str, j, len);
|
---|
4768 | }
|
---|
4769 | return list;
|
---|
4770 |
|
---|
4771 | onError:
|
---|
4772 | Py_DECREF(list);
|
---|
4773 | return NULL;
|
---|
4774 | }
|
---|
4775 |
|
---|
4776 | PyObject *PyUnicode_Splitlines(PyObject *string,
|
---|
4777 | int keepends)
|
---|
4778 | {
|
---|
4779 | register Py_ssize_t i;
|
---|
4780 | register Py_ssize_t j;
|
---|
4781 | Py_ssize_t len;
|
---|
4782 | PyObject *list;
|
---|
4783 | PyObject *str;
|
---|
4784 | Py_UNICODE *data;
|
---|
4785 |
|
---|
4786 | string = PyUnicode_FromObject(string);
|
---|
4787 | if (string == NULL)
|
---|
4788 | return NULL;
|
---|
4789 | data = PyUnicode_AS_UNICODE(string);
|
---|
4790 | len = PyUnicode_GET_SIZE(string);
|
---|
4791 |
|
---|
4792 | list = PyList_New(0);
|
---|
4793 | if (!list)
|
---|
4794 | goto onError;
|
---|
4795 |
|
---|
4796 | for (i = j = 0; i < len; ) {
|
---|
4797 | Py_ssize_t eol;
|
---|
4798 |
|
---|
4799 | /* Find a line and append it */
|
---|
4800 | while (i < len && !BLOOM_LINEBREAK(data[i]))
|
---|
4801 | i++;
|
---|
4802 |
|
---|
4803 | /* Skip the line break reading CRLF as one line break */
|
---|
4804 | eol = i;
|
---|
4805 | if (i < len) {
|
---|
4806 | if (data[i] == '\r' && i + 1 < len &&
|
---|
4807 | data[i+1] == '\n')
|
---|
4808 | i += 2;
|
---|
4809 | else
|
---|
4810 | i++;
|
---|
4811 | if (keepends)
|
---|
4812 | eol = i;
|
---|
4813 | }
|
---|
4814 | SPLIT_APPEND(data, j, eol);
|
---|
4815 | j = i;
|
---|
4816 | }
|
---|
4817 | if (j < len) {
|
---|
4818 | SPLIT_APPEND(data, j, len);
|
---|
4819 | }
|
---|
4820 |
|
---|
4821 | Py_DECREF(string);
|
---|
4822 | return list;
|
---|
4823 |
|
---|
4824 | onError:
|
---|
4825 | Py_XDECREF(list);
|
---|
4826 | Py_DECREF(string);
|
---|
4827 | return NULL;
|
---|
4828 | }
|
---|
4829 |
|
---|
4830 | static
|
---|
4831 | PyObject *split_char(PyUnicodeObject *self,
|
---|
4832 | PyObject *list,
|
---|
4833 | Py_UNICODE ch,
|
---|
4834 | Py_ssize_t maxcount)
|
---|
4835 | {
|
---|
4836 | register Py_ssize_t i;
|
---|
4837 | register Py_ssize_t j;
|
---|
4838 | Py_ssize_t len = self->length;
|
---|
4839 | PyObject *str;
|
---|
4840 |
|
---|
4841 | for (i = j = 0; i < len; ) {
|
---|
4842 | if (self->str[i] == ch) {
|
---|
4843 | if (maxcount-- <= 0)
|
---|
4844 | break;
|
---|
4845 | SPLIT_APPEND(self->str, j, i);
|
---|
4846 | i = j = i + 1;
|
---|
4847 | } else
|
---|
4848 | i++;
|
---|
4849 | }
|
---|
4850 | if (j <= len) {
|
---|
4851 | SPLIT_APPEND(self->str, j, len);
|
---|
4852 | }
|
---|
4853 | return list;
|
---|
4854 |
|
---|
4855 | onError:
|
---|
4856 | Py_DECREF(list);
|
---|
4857 | return NULL;
|
---|
4858 | }
|
---|
4859 |
|
---|
4860 | static
|
---|
4861 | PyObject *split_substring(PyUnicodeObject *self,
|
---|
4862 | PyObject *list,
|
---|
4863 | PyUnicodeObject *substring,
|
---|
4864 | Py_ssize_t maxcount)
|
---|
4865 | {
|
---|
4866 | register Py_ssize_t i;
|
---|
4867 | register Py_ssize_t j;
|
---|
4868 | Py_ssize_t len = self->length;
|
---|
4869 | Py_ssize_t sublen = substring->length;
|
---|
4870 | PyObject *str;
|
---|
4871 |
|
---|
4872 | for (i = j = 0; i <= len - sublen; ) {
|
---|
4873 | if (Py_UNICODE_MATCH(self, i, substring)) {
|
---|
4874 | if (maxcount-- <= 0)
|
---|
4875 | break;
|
---|
4876 | SPLIT_APPEND(self->str, j, i);
|
---|
4877 | i = j = i + sublen;
|
---|
4878 | } else
|
---|
4879 | i++;
|
---|
4880 | }
|
---|
4881 | if (j <= len) {
|
---|
4882 | SPLIT_APPEND(self->str, j, len);
|
---|
4883 | }
|
---|
4884 | return list;
|
---|
4885 |
|
---|
4886 | onError:
|
---|
4887 | Py_DECREF(list);
|
---|
4888 | return NULL;
|
---|
4889 | }
|
---|
4890 |
|
---|
4891 | static
|
---|
4892 | PyObject *rsplit_whitespace(PyUnicodeObject *self,
|
---|
4893 | PyObject *list,
|
---|
4894 | Py_ssize_t maxcount)
|
---|
4895 | {
|
---|
4896 | register Py_ssize_t i;
|
---|
4897 | register Py_ssize_t j;
|
---|
4898 | Py_ssize_t len = self->length;
|
---|
4899 | PyObject *str;
|
---|
4900 |
|
---|
4901 | for (i = j = len - 1; i >= 0; ) {
|
---|
4902 | /* find a token */
|
---|
4903 | while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
|
---|
4904 | i--;
|
---|
4905 | j = i;
|
---|
4906 | while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
|
---|
4907 | i--;
|
---|
4908 | if (j > i) {
|
---|
4909 | if (maxcount-- <= 0)
|
---|
4910 | break;
|
---|
4911 | SPLIT_APPEND(self->str, i + 1, j + 1);
|
---|
4912 | while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
|
---|
4913 | i--;
|
---|
4914 | j = i;
|
---|
4915 | }
|
---|
4916 | }
|
---|
4917 | if (j >= 0) {
|
---|
4918 | SPLIT_APPEND(self->str, 0, j + 1);
|
---|
4919 | }
|
---|
4920 | if (PyList_Reverse(list) < 0)
|
---|
4921 | goto onError;
|
---|
4922 | return list;
|
---|
4923 |
|
---|
4924 | onError:
|
---|
4925 | Py_DECREF(list);
|
---|
4926 | return NULL;
|
---|
4927 | }
|
---|
4928 |
|
---|
4929 | static
|
---|
4930 | PyObject *rsplit_char(PyUnicodeObject *self,
|
---|
4931 | PyObject *list,
|
---|
4932 | Py_UNICODE ch,
|
---|
4933 | Py_ssize_t maxcount)
|
---|
4934 | {
|
---|
4935 | register Py_ssize_t i;
|
---|
4936 | register Py_ssize_t j;
|
---|
4937 | Py_ssize_t len = self->length;
|
---|
4938 | PyObject *str;
|
---|
4939 |
|
---|
4940 | for (i = j = len - 1; i >= 0; ) {
|
---|
4941 | if (self->str[i] == ch) {
|
---|
4942 | if (maxcount-- <= 0)
|
---|
4943 | break;
|
---|
4944 | SPLIT_APPEND(self->str, i + 1, j + 1);
|
---|
4945 | j = i = i - 1;
|
---|
4946 | } else
|
---|
4947 | i--;
|
---|
4948 | }
|
---|
4949 | if (j >= -1) {
|
---|
4950 | SPLIT_APPEND(self->str, 0, j + 1);
|
---|
4951 | }
|
---|
4952 | if (PyList_Reverse(list) < 0)
|
---|
4953 | goto onError;
|
---|
4954 | return list;
|
---|
4955 |
|
---|
4956 | onError:
|
---|
4957 | Py_DECREF(list);
|
---|
4958 | return NULL;
|
---|
4959 | }
|
---|
4960 |
|
---|
4961 | static
|
---|
4962 | PyObject *rsplit_substring(PyUnicodeObject *self,
|
---|
4963 | PyObject *list,
|
---|
4964 | PyUnicodeObject *substring,
|
---|
4965 | Py_ssize_t maxcount)
|
---|
4966 | {
|
---|
4967 | register Py_ssize_t i;
|
---|
4968 | register Py_ssize_t j;
|
---|
4969 | Py_ssize_t len = self->length;
|
---|
4970 | Py_ssize_t sublen = substring->length;
|
---|
4971 | PyObject *str;
|
---|
4972 |
|
---|
4973 | for (i = len - sublen, j = len; i >= 0; ) {
|
---|
4974 | if (Py_UNICODE_MATCH(self, i, substring)) {
|
---|
4975 | if (maxcount-- <= 0)
|
---|
4976 | break;
|
---|
4977 | SPLIT_APPEND(self->str, i + sublen, j);
|
---|
4978 | j = i;
|
---|
4979 | i -= sublen;
|
---|
4980 | } else
|
---|
4981 | i--;
|
---|
4982 | }
|
---|
4983 | if (j >= 0) {
|
---|
4984 | SPLIT_APPEND(self->str, 0, j);
|
---|
4985 | }
|
---|
4986 | if (PyList_Reverse(list) < 0)
|
---|
4987 | goto onError;
|
---|
4988 | return list;
|
---|
4989 |
|
---|
4990 | onError:
|
---|
4991 | Py_DECREF(list);
|
---|
4992 | return NULL;
|
---|
4993 | }
|
---|
4994 |
|
---|
4995 | #undef SPLIT_APPEND
|
---|
4996 |
|
---|
4997 | static
|
---|
4998 | PyObject *split(PyUnicodeObject *self,
|
---|
4999 | PyUnicodeObject *substring,
|
---|
5000 | Py_ssize_t maxcount)
|
---|
5001 | {
|
---|
5002 | PyObject *list;
|
---|
5003 |
|
---|
5004 | if (maxcount < 0)
|
---|
5005 | maxcount = PY_SSIZE_T_MAX;
|
---|
5006 |
|
---|
5007 | list = PyList_New(0);
|
---|
5008 | if (!list)
|
---|
5009 | return NULL;
|
---|
5010 |
|
---|
5011 | if (substring == NULL)
|
---|
5012 | return split_whitespace(self,list,maxcount);
|
---|
5013 |
|
---|
5014 | else if (substring->length == 1)
|
---|
5015 | return split_char(self,list,substring->str[0],maxcount);
|
---|
5016 |
|
---|
5017 | else if (substring->length == 0) {
|
---|
5018 | Py_DECREF(list);
|
---|
5019 | PyErr_SetString(PyExc_ValueError, "empty separator");
|
---|
5020 | return NULL;
|
---|
5021 | }
|
---|
5022 | else
|
---|
5023 | return split_substring(self,list,substring,maxcount);
|
---|
5024 | }
|
---|
5025 |
|
---|
5026 | static
|
---|
5027 | PyObject *rsplit(PyUnicodeObject *self,
|
---|
5028 | PyUnicodeObject *substring,
|
---|
5029 | Py_ssize_t maxcount)
|
---|
5030 | {
|
---|
5031 | PyObject *list;
|
---|
5032 |
|
---|
5033 | if (maxcount < 0)
|
---|
5034 | maxcount = PY_SSIZE_T_MAX;
|
---|
5035 |
|
---|
5036 | list = PyList_New(0);
|
---|
5037 | if (!list)
|
---|
5038 | return NULL;
|
---|
5039 |
|
---|
5040 | if (substring == NULL)
|
---|
5041 | return rsplit_whitespace(self,list,maxcount);
|
---|
5042 |
|
---|
5043 | else if (substring->length == 1)
|
---|
5044 | return rsplit_char(self,list,substring->str[0],maxcount);
|
---|
5045 |
|
---|
5046 | else if (substring->length == 0) {
|
---|
5047 | Py_DECREF(list);
|
---|
5048 | PyErr_SetString(PyExc_ValueError, "empty separator");
|
---|
5049 | return NULL;
|
---|
5050 | }
|
---|
5051 | else
|
---|
5052 | return rsplit_substring(self,list,substring,maxcount);
|
---|
5053 | }
|
---|
5054 |
|
---|
5055 | static
|
---|
5056 | PyObject *replace(PyUnicodeObject *self,
|
---|
5057 | PyUnicodeObject *str1,
|
---|
5058 | PyUnicodeObject *str2,
|
---|
5059 | Py_ssize_t maxcount)
|
---|
5060 | {
|
---|
5061 | PyUnicodeObject *u;
|
---|
5062 |
|
---|
5063 | if (maxcount < 0)
|
---|
5064 | maxcount = PY_SSIZE_T_MAX;
|
---|
5065 |
|
---|
5066 | if (str1->length == str2->length) {
|
---|
5067 | /* same length */
|
---|
5068 | Py_ssize_t i;
|
---|
5069 | if (str1->length == 1) {
|
---|
5070 | /* replace characters */
|
---|
5071 | Py_UNICODE u1, u2;
|
---|
5072 | if (!findchar(self->str, self->length, str1->str[0]))
|
---|
5073 | goto nothing;
|
---|
5074 | u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
|
---|
5075 | if (!u)
|
---|
5076 | return NULL;
|
---|
5077 | Py_UNICODE_COPY(u->str, self->str, self->length);
|
---|
5078 | u1 = str1->str[0];
|
---|
5079 | u2 = str2->str[0];
|
---|
5080 | for (i = 0; i < u->length; i++)
|
---|
5081 | if (u->str[i] == u1) {
|
---|
5082 | if (--maxcount < 0)
|
---|
5083 | break;
|
---|
5084 | u->str[i] = u2;
|
---|
5085 | }
|
---|
5086 | } else {
|
---|
5087 | i = fastsearch(
|
---|
5088 | self->str, self->length, str1->str, str1->length, FAST_SEARCH
|
---|
5089 | );
|
---|
5090 | if (i < 0)
|
---|
5091 | goto nothing;
|
---|
5092 | u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
|
---|
5093 | if (!u)
|
---|
5094 | return NULL;
|
---|
5095 | Py_UNICODE_COPY(u->str, self->str, self->length);
|
---|
5096 | while (i <= self->length - str1->length)
|
---|
5097 | if (Py_UNICODE_MATCH(self, i, str1)) {
|
---|
5098 | if (--maxcount < 0)
|
---|
5099 | break;
|
---|
5100 | Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
|
---|
5101 | i += str1->length;
|
---|
5102 | } else
|
---|
5103 | i++;
|
---|
5104 | }
|
---|
5105 | } else {
|
---|
5106 |
|
---|
5107 | Py_ssize_t n, i, j, e;
|
---|
5108 | Py_ssize_t product, new_size, delta;
|
---|
5109 | Py_UNICODE *p;
|
---|
5110 |
|
---|
5111 | /* replace strings */
|
---|
5112 | n = stringlib_count(self->str, self->length, str1->str, str1->length);
|
---|
5113 | if (n > maxcount)
|
---|
5114 | n = maxcount;
|
---|
5115 | if (n == 0)
|
---|
5116 | goto nothing;
|
---|
5117 | /* new_size = self->length + n * (str2->length - str1->length)); */
|
---|
5118 | delta = (str2->length - str1->length);
|
---|
5119 | if (delta == 0) {
|
---|
5120 | new_size = self->length;
|
---|
5121 | } else {
|
---|
5122 | product = n * (str2->length - str1->length);
|
---|
5123 | if ((product / (str2->length - str1->length)) != n) {
|
---|
5124 | PyErr_SetString(PyExc_OverflowError,
|
---|
5125 | "replace string is too long");
|
---|
5126 | return NULL;
|
---|
5127 | }
|
---|
5128 | new_size = self->length + product;
|
---|
5129 | if (new_size < 0) {
|
---|
5130 | PyErr_SetString(PyExc_OverflowError,
|
---|
5131 | "replace string is too long");
|
---|
5132 | return NULL;
|
---|
5133 | }
|
---|
5134 | }
|
---|
5135 | u = _PyUnicode_New(new_size);
|
---|
5136 | if (!u)
|
---|
5137 | return NULL;
|
---|
5138 | i = 0;
|
---|
5139 | p = u->str;
|
---|
5140 | e = self->length - str1->length;
|
---|
5141 | if (str1->length > 0) {
|
---|
5142 | while (n-- > 0) {
|
---|
5143 | /* look for next match */
|
---|
5144 | j = i;
|
---|
5145 | while (j <= e) {
|
---|
5146 | if (Py_UNICODE_MATCH(self, j, str1))
|
---|
5147 | break;
|
---|
5148 | j++;
|
---|
5149 | }
|
---|
5150 | if (j > i) {
|
---|
5151 | if (j > e)
|
---|
5152 | break;
|
---|
5153 | /* copy unchanged part [i:j] */
|
---|
5154 | Py_UNICODE_COPY(p, self->str+i, j-i);
|
---|
5155 | p += j - i;
|
---|
5156 | }
|
---|
5157 | /* copy substitution string */
|
---|
5158 | if (str2->length > 0) {
|
---|
5159 | Py_UNICODE_COPY(p, str2->str, str2->length);
|
---|
5160 | p += str2->length;
|
---|
5161 | }
|
---|
5162 | i = j + str1->length;
|
---|
5163 | }
|
---|
5164 | if (i < self->length)
|
---|
5165 | /* copy tail [i:] */
|
---|
5166 | Py_UNICODE_COPY(p, self->str+i, self->length-i);
|
---|
5167 | } else {
|
---|
5168 | /* interleave */
|
---|
5169 | while (n > 0) {
|
---|
5170 | Py_UNICODE_COPY(p, str2->str, str2->length);
|
---|
5171 | p += str2->length;
|
---|
5172 | if (--n <= 0)
|
---|
5173 | break;
|
---|
5174 | *p++ = self->str[i++];
|
---|
5175 | }
|
---|
5176 | Py_UNICODE_COPY(p, self->str+i, self->length-i);
|
---|
5177 | }
|
---|
5178 | }
|
---|
5179 | return (PyObject *) u;
|
---|
5180 |
|
---|
5181 | nothing:
|
---|
5182 | /* nothing to replace; return original string (when possible) */
|
---|
5183 | if (PyUnicode_CheckExact(self)) {
|
---|
5184 | Py_INCREF(self);
|
---|
5185 | return (PyObject *) self;
|
---|
5186 | }
|
---|
5187 | return PyUnicode_FromUnicode(self->str, self->length);
|
---|
5188 | }
|
---|
5189 |
|
---|
5190 | /* --- Unicode Object Methods --------------------------------------------- */
|
---|
5191 |
|
---|
5192 | PyDoc_STRVAR(title__doc__,
|
---|
5193 | "S.title() -> unicode\n\
|
---|
5194 | \n\
|
---|
5195 | Return a titlecased version of S, i.e. words start with title case\n\
|
---|
5196 | characters, all remaining cased characters have lower case.");
|
---|
5197 |
|
---|
5198 | static PyObject*
|
---|
5199 | unicode_title(PyUnicodeObject *self)
|
---|
5200 | {
|
---|
5201 | return fixup(self, fixtitle);
|
---|
5202 | }
|
---|
5203 |
|
---|
5204 | PyDoc_STRVAR(capitalize__doc__,
|
---|
5205 | "S.capitalize() -> unicode\n\
|
---|
5206 | \n\
|
---|
5207 | Return a capitalized version of S, i.e. make the first character\n\
|
---|
5208 | have upper case.");
|
---|
5209 |
|
---|
5210 | static PyObject*
|
---|
5211 | unicode_capitalize(PyUnicodeObject *self)
|
---|
5212 | {
|
---|
5213 | return fixup(self, fixcapitalize);
|
---|
5214 | }
|
---|
5215 |
|
---|
#if 0
/* Disabled: capwords() is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word through fixup() and
   join the words again with the default separator.  Returns NULL on
   failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;       /* result is NULL, which is what we return */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

 onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
|
---|
5253 |
|
---|
5254 | /* Argument converter. Coerces to a single unicode character */
|
---|
5255 |
|
---|
5256 | static int
|
---|
5257 | convert_uc(PyObject *obj, void *addr)
|
---|
5258 | {
|
---|
5259 | Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
|
---|
5260 | PyObject *uniobj;
|
---|
5261 | Py_UNICODE *unistr;
|
---|
5262 |
|
---|
5263 | uniobj = PyUnicode_FromObject(obj);
|
---|
5264 | if (uniobj == NULL) {
|
---|
5265 | PyErr_SetString(PyExc_TypeError,
|
---|
5266 | "The fill character cannot be converted to Unicode");
|
---|
5267 | return 0;
|
---|
5268 | }
|
---|
5269 | if (PyUnicode_GET_SIZE(uniobj) != 1) {
|
---|
5270 | PyErr_SetString(PyExc_TypeError,
|
---|
5271 | "The fill character must be exactly one character long");
|
---|
5272 | Py_DECREF(uniobj);
|
---|
5273 | return 0;
|
---|
5274 | }
|
---|
5275 | unistr = PyUnicode_AS_UNICODE(uniobj);
|
---|
5276 | *fillcharloc = unistr[0];
|
---|
5277 | Py_DECREF(uniobj);
|
---|
5278 | return 1;
|
---|
5279 | }
|
---|
5280 |
|
---|
5281 | PyDoc_STRVAR(center__doc__,
|
---|
5282 | "S.center(width[, fillchar]) -> unicode\n\
|
---|
5283 | \n\
|
---|
5284 | Return S centered in a Unicode string of length width. Padding is\n\
|
---|
5285 | done using the specified fill character (default is a space)");
|
---|
5286 |
|
---|
5287 | static PyObject *
|
---|
5288 | unicode_center(PyUnicodeObject *self, PyObject *args)
|
---|
5289 | {
|
---|
5290 | Py_ssize_t marg, left;
|
---|
5291 | Py_ssize_t width;
|
---|
5292 | Py_UNICODE fillchar = ' ';
|
---|
5293 |
|
---|
5294 | if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
|
---|
5295 | return NULL;
|
---|
5296 |
|
---|
5297 | if (self->length >= width && PyUnicode_CheckExact(self)) {
|
---|
5298 | Py_INCREF(self);
|
---|
5299 | return (PyObject*) self;
|
---|
5300 | }
|
---|
5301 |
|
---|
5302 | marg = width - self->length;
|
---|
5303 | left = marg / 2 + (marg & width & 1);
|
---|
5304 |
|
---|
5305 | return (PyObject*) pad(self, left, marg - left, fillchar);
|
---|
5306 | }
|
---|
5307 |
|
---|
5308 | #if 0
|
---|
5309 |
|
---|
5310 | /* This code should go into some future Unicode collation support
|
---|
5311 | module. The basic comparison should compare ordinals on a naive
|
---|
5312 | basis (this is what Java does and thus JPython too). */
|
---|
5313 |
|
---|
5314 | /* speedy UTF-16 code point order comparison */
|
---|
5315 | /* gleaned from: */
|
---|
5316 | /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
|
---|
5317 |
|
---|
5318 | static short utf16Fixup[32] =
|
---|
5319 | {
|
---|
5320 | 0, 0, 0, 0, 0, 0, 0, 0,
|
---|
5321 | 0, 0, 0, 0, 0, 0, 0, 0,
|
---|
5322 | 0, 0, 0, 0, 0, 0, 0, 0,
|
---|
5323 | 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
|
---|
5324 | };
|
---|
5325 |
|
---|
5326 | static int
|
---|
5327 | unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
|
---|
5328 | {
|
---|
5329 | Py_ssize_t len1, len2;
|
---|
5330 |
|
---|
5331 | Py_UNICODE *s1 = str1->str;
|
---|
5332 | Py_UNICODE *s2 = str2->str;
|
---|
5333 |
|
---|
5334 | len1 = str1->length;
|
---|
5335 | len2 = str2->length;
|
---|
5336 |
|
---|
5337 | while (len1 > 0 && len2 > 0) {
|
---|
5338 | Py_UNICODE c1, c2;
|
---|
5339 |
|
---|
5340 | c1 = *s1++;
|
---|
5341 | c2 = *s2++;
|
---|
5342 |
|
---|
5343 | if (c1 > (1<<11) * 26)
|
---|
5344 | c1 += utf16Fixup[c1>>11];
|
---|
5345 | if (c2 > (1<<11) * 26)
|
---|
5346 | c2 += utf16Fixup[c2>>11];
|
---|
5347 | /* now c1 and c2 are in UTF-32-compatible order */
|
---|
5348 |
|
---|
5349 | if (c1 != c2)
|
---|
5350 | return (c1 < c2) ? -1 : 1;
|
---|
5351 |
|
---|
5352 | len1--; len2--;
|
---|
5353 | }
|
---|
5354 |
|
---|
5355 | return (len1 < len2) ? -1 : (len1 != len2);
|
---|
5356 | }
|
---|
5357 |
|
---|
5358 | #else
|
---|
5359 |
|
---|
5360 | static int
|
---|
5361 | unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
|
---|
5362 | {
|
---|
5363 | register Py_ssize_t len1, len2;
|
---|
5364 |
|
---|
5365 | Py_UNICODE *s1 = str1->str;
|
---|
5366 | Py_UNICODE *s2 = str2->str;
|
---|
5367 |
|
---|
5368 | len1 = str1->length;
|
---|
5369 | len2 = str2->length;
|
---|
5370 |
|
---|
5371 | while (len1 > 0 && len2 > 0) {
|
---|
5372 | Py_UNICODE c1, c2;
|
---|
5373 |
|
---|
5374 | c1 = *s1++;
|
---|
5375 | c2 = *s2++;
|
---|
5376 |
|
---|
5377 | if (c1 != c2)
|
---|
5378 | return (c1 < c2) ? -1 : 1;
|
---|
5379 |
|
---|
5380 | len1--; len2--;
|
---|
5381 | }
|
---|
5382 |
|
---|
5383 | return (len1 < len2) ? -1 : (len1 != len2);
|
---|
5384 | }
|
---|
5385 |
|
---|
5386 | #endif
|
---|
5387 |
|
---|
5388 | int PyUnicode_Compare(PyObject *left,
|
---|
5389 | PyObject *right)
|
---|
5390 | {
|
---|
5391 | PyUnicodeObject *u = NULL, *v = NULL;
|
---|
5392 | int result;
|
---|
5393 |
|
---|
5394 | /* Coerce the two arguments */
|
---|
5395 | u = (PyUnicodeObject *)PyUnicode_FromObject(left);
|
---|
5396 | if (u == NULL)
|
---|
5397 | goto onError;
|
---|
5398 | v = (PyUnicodeObject *)PyUnicode_FromObject(right);
|
---|
5399 | if (v == NULL)
|
---|
5400 | goto onError;
|
---|
5401 |
|
---|
5402 | /* Shortcut for empty or interned objects */
|
---|
5403 | if (v == u) {
|
---|
5404 | Py_DECREF(u);
|
---|
5405 | Py_DECREF(v);
|
---|
5406 | return 0;
|
---|
5407 | }
|
---|
5408 |
|
---|
5409 | result = unicode_compare(u, v);
|
---|
5410 |
|
---|
5411 | Py_DECREF(u);
|
---|
5412 | Py_DECREF(v);
|
---|
5413 | return result;
|
---|
5414 |
|
---|
5415 | onError:
|
---|
5416 | Py_XDECREF(u);
|
---|
5417 | Py_XDECREF(v);
|
---|
5418 | return -1;
|
---|
5419 | }
|
---|
5420 |
|
---|
5421 | PyObject *PyUnicode_RichCompare(PyObject *left,
|
---|
5422 | PyObject *right,
|
---|
5423 | int op)
|
---|
5424 | {
|
---|
5425 | int result;
|
---|
5426 |
|
---|
5427 | result = PyUnicode_Compare(left, right);
|
---|
5428 | if (result == -1 && PyErr_Occurred())
|
---|
5429 | goto onError;
|
---|
5430 |
|
---|
5431 | /* Convert the return value to a Boolean */
|
---|
5432 | switch (op) {
|
---|
5433 | case Py_EQ:
|
---|
5434 | result = (result == 0);
|
---|
5435 | break;
|
---|
5436 | case Py_NE:
|
---|
5437 | result = (result != 0);
|
---|
5438 | break;
|
---|
5439 | case Py_LE:
|
---|
5440 | result = (result <= 0);
|
---|
5441 | break;
|
---|
5442 | case Py_GE:
|
---|
5443 | result = (result >= 0);
|
---|
5444 | break;
|
---|
5445 | case Py_LT:
|
---|
5446 | result = (result == -1);
|
---|
5447 | break;
|
---|
5448 | case Py_GT:
|
---|
5449 | result = (result == 1);
|
---|
5450 | break;
|
---|
5451 | }
|
---|
5452 | return PyBool_FromLong(result);
|
---|
5453 |
|
---|
5454 | onError:
|
---|
5455 |
|
---|
5456 | /* Standard case
|
---|
5457 |
|
---|
5458 | Type errors mean that PyUnicode_FromObject() could not convert
|
---|
5459 | one of the arguments (usually the right hand side) to Unicode,
|
---|
5460 | ie. we can't handle the comparison request. However, it is
|
---|
5461 | possible that the other object knows a comparison method, which
|
---|
5462 | is why we return Py_NotImplemented to give the other object a
|
---|
5463 | chance.
|
---|
5464 |
|
---|
5465 | */
|
---|
5466 | if (PyErr_ExceptionMatches(PyExc_TypeError)) {
|
---|
5467 | PyErr_Clear();
|
---|
5468 | Py_INCREF(Py_NotImplemented);
|
---|
5469 | return Py_NotImplemented;
|
---|
5470 | }
|
---|
5471 | if (op != Py_EQ && op != Py_NE)
|
---|
5472 | return NULL;
|
---|
5473 |
|
---|
5474 | /* Equality comparison.
|
---|
5475 |
|
---|
5476 | This is a special case: we silence any PyExc_UnicodeDecodeError
|
---|
5477 | and instead turn it into a PyErr_UnicodeWarning.
|
---|
5478 |
|
---|
5479 | */
|
---|
5480 | if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
|
---|
5481 | return NULL;
|
---|
5482 | PyErr_Clear();
|
---|
5483 | if (PyErr_Warn(PyExc_UnicodeWarning,
|
---|
5484 | (op == Py_EQ) ?
|
---|
5485 | "Unicode equal comparison "
|
---|
5486 | "failed to convert both arguments to Unicode - "
|
---|
5487 | "interpreting them as being unequal" :
|
---|
5488 | "Unicode unequal comparison "
|
---|
5489 | "failed to convert both arguments to Unicode - "
|
---|
5490 | "interpreting them as being unequal"
|
---|
5491 | ) < 0)
|
---|
5492 | return NULL;
|
---|
5493 | result = (op == Py_NE);
|
---|
5494 | return PyBool_FromLong(result);
|
---|
5495 | }
|
---|
5496 |
|
---|
5497 | int PyUnicode_Contains(PyObject *container,
|
---|
5498 | PyObject *element)
|
---|
5499 | {
|
---|
5500 | PyObject *str, *sub;
|
---|
5501 | int result;
|
---|
5502 |
|
---|
5503 | /* Coerce the two arguments */
|
---|
5504 | sub = PyUnicode_FromObject(element);
|
---|
5505 | if (!sub) {
|
---|
5506 | PyErr_SetString(PyExc_TypeError,
|
---|
5507 | "'in <string>' requires string as left operand");
|
---|
5508 | return -1;
|
---|
5509 | }
|
---|
5510 |
|
---|
5511 | str = PyUnicode_FromObject(container);
|
---|
5512 | if (!str) {
|
---|
5513 | Py_DECREF(sub);
|
---|
5514 | return -1;
|
---|
5515 | }
|
---|
5516 |
|
---|
5517 | result = stringlib_contains_obj(str, sub);
|
---|
5518 |
|
---|
5519 | Py_DECREF(str);
|
---|
5520 | Py_DECREF(sub);
|
---|
5521 |
|
---|
5522 | return result;
|
---|
5523 | }
|
---|
5524 |
|
---|
5525 | /* Concat to string or Unicode object giving a new Unicode object. */
|
---|
5526 |
|
---|
5527 | PyObject *PyUnicode_Concat(PyObject *left,
|
---|
5528 | PyObject *right)
|
---|
5529 | {
|
---|
5530 | PyUnicodeObject *u = NULL, *v = NULL, *w;
|
---|
5531 |
|
---|
5532 | /* Coerce the two arguments */
|
---|
5533 | u = (PyUnicodeObject *)PyUnicode_FromObject(left);
|
---|
5534 | if (u == NULL)
|
---|
5535 | goto onError;
|
---|
5536 | v = (PyUnicodeObject *)PyUnicode_FromObject(right);
|
---|
5537 | if (v == NULL)
|
---|
5538 | goto onError;
|
---|
5539 |
|
---|
5540 | /* Shortcuts */
|
---|
5541 | if (v == unicode_empty) {
|
---|
5542 | Py_DECREF(v);
|
---|
5543 | return (PyObject *)u;
|
---|
5544 | }
|
---|
5545 | if (u == unicode_empty) {
|
---|
5546 | Py_DECREF(u);
|
---|
5547 | return (PyObject *)v;
|
---|
5548 | }
|
---|
5549 |
|
---|
5550 | /* Concat the two Unicode strings */
|
---|
5551 | w = _PyUnicode_New(u->length + v->length);
|
---|
5552 | if (w == NULL)
|
---|
5553 | goto onError;
|
---|
5554 | Py_UNICODE_COPY(w->str, u->str, u->length);
|
---|
5555 | Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
|
---|
5556 |
|
---|
5557 | Py_DECREF(u);
|
---|
5558 | Py_DECREF(v);
|
---|
5559 | return (PyObject *)w;
|
---|
5560 |
|
---|
5561 | onError:
|
---|
5562 | Py_XDECREF(u);
|
---|
5563 | Py_XDECREF(v);
|
---|
5564 | return NULL;
|
---|
5565 | }
|
---|
5566 |
|
---|
5567 | PyDoc_STRVAR(count__doc__,
|
---|
5568 | "S.count(sub[, start[, end]]) -> int\n\
|
---|
5569 | \n\
|
---|
5570 | Return the number of non-overlapping occurrences of substring sub in\n\
|
---|
5571 | Unicode string S[start:end]. Optional arguments start and end are\n\
|
---|
5572 | interpreted as in slice notation.");
|
---|
5573 |
|
---|
5574 | static PyObject *
|
---|
5575 | unicode_count(PyUnicodeObject *self, PyObject *args)
|
---|
5576 | {
|
---|
5577 | PyUnicodeObject *substring;
|
---|
5578 | Py_ssize_t start = 0;
|
---|
5579 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
5580 | PyObject *result;
|
---|
5581 |
|
---|
5582 | if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
|
---|
5583 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
5584 | return NULL;
|
---|
5585 |
|
---|
5586 | substring = (PyUnicodeObject *)PyUnicode_FromObject(
|
---|
5587 | (PyObject *)substring);
|
---|
5588 | if (substring == NULL)
|
---|
5589 | return NULL;
|
---|
5590 |
|
---|
5591 | FIX_START_END(self);
|
---|
5592 |
|
---|
5593 | result = PyInt_FromSsize_t(
|
---|
5594 | stringlib_count(self->str + start, end - start,
|
---|
5595 | substring->str, substring->length)
|
---|
5596 | );
|
---|
5597 |
|
---|
5598 | Py_DECREF(substring);
|
---|
5599 |
|
---|
5600 | return result;
|
---|
5601 | }
|
---|
5602 |
|
---|
5603 | PyDoc_STRVAR(encode__doc__,
|
---|
5604 | "S.encode([encoding[,errors]]) -> string or unicode\n\
|
---|
5605 | \n\
|
---|
5606 | Encodes S using the codec registered for encoding. encoding defaults\n\
|
---|
5607 | to the default encoding. errors may be given to set a different error\n\
|
---|
5608 | handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
---|
5609 | a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
|
---|
5610 | 'xmlcharrefreplace' as well as any other name registered with\n\
|
---|
5611 | codecs.register_error that can handle UnicodeEncodeErrors.");
|
---|
5612 |
|
---|
5613 | static PyObject *
|
---|
5614 | unicode_encode(PyUnicodeObject *self, PyObject *args)
|
---|
5615 | {
|
---|
5616 | char *encoding = NULL;
|
---|
5617 | char *errors = NULL;
|
---|
5618 | PyObject *v;
|
---|
5619 |
|
---|
5620 | if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
|
---|
5621 | return NULL;
|
---|
5622 | v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
|
---|
5623 | if (v == NULL)
|
---|
5624 | goto onError;
|
---|
5625 | if (!PyString_Check(v) && !PyUnicode_Check(v)) {
|
---|
5626 | PyErr_Format(PyExc_TypeError,
|
---|
5627 | "encoder did not return a string/unicode object "
|
---|
5628 | "(type=%.400s)",
|
---|
5629 | v->ob_type->tp_name);
|
---|
5630 | Py_DECREF(v);
|
---|
5631 | return NULL;
|
---|
5632 | }
|
---|
5633 | return v;
|
---|
5634 |
|
---|
5635 | onError:
|
---|
5636 | return NULL;
|
---|
5637 | }
|
---|
5638 |
|
---|
5639 | PyDoc_STRVAR(decode__doc__,
|
---|
5640 | "S.decode([encoding[,errors]]) -> string or unicode\n\
|
---|
5641 | \n\
|
---|
5642 | Decodes S using the codec registered for encoding. encoding defaults\n\
|
---|
5643 | to the default encoding. errors may be given to set a different error\n\
|
---|
5644 | handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
---|
5645 | a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
|
---|
5646 | as well as any other name registerd with codecs.register_error that is\n\
|
---|
5647 | able to handle UnicodeDecodeErrors.");
|
---|
5648 |
|
---|
5649 | static PyObject *
|
---|
5650 | unicode_decode(PyUnicodeObject *self, PyObject *args)
|
---|
5651 | {
|
---|
5652 | char *encoding = NULL;
|
---|
5653 | char *errors = NULL;
|
---|
5654 | PyObject *v;
|
---|
5655 |
|
---|
5656 | if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
|
---|
5657 | return NULL;
|
---|
5658 | v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
|
---|
5659 | if (v == NULL)
|
---|
5660 | goto onError;
|
---|
5661 | if (!PyString_Check(v) && !PyUnicode_Check(v)) {
|
---|
5662 | PyErr_Format(PyExc_TypeError,
|
---|
5663 | "decoder did not return a string/unicode object "
|
---|
5664 | "(type=%.400s)",
|
---|
5665 | v->ob_type->tp_name);
|
---|
5666 | Py_DECREF(v);
|
---|
5667 | return NULL;
|
---|
5668 | }
|
---|
5669 | return v;
|
---|
5670 |
|
---|
5671 | onError:
|
---|
5672 | return NULL;
|
---|
5673 | }
|
---|
5674 |
|
---|
5675 | PyDoc_STRVAR(expandtabs__doc__,
|
---|
5676 | "S.expandtabs([tabsize]) -> unicode\n\
|
---|
5677 | \n\
|
---|
5678 | Return a copy of S where all tab characters are expanded using spaces.\n\
|
---|
5679 | If tabsize is not given, a tab size of 8 characters is assumed.");
|
---|
5680 |
|
---|
5681 | static PyObject*
|
---|
5682 | unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
|
---|
5683 | {
|
---|
5684 | Py_UNICODE *e;
|
---|
5685 | Py_UNICODE *p;
|
---|
5686 | Py_UNICODE *q;
|
---|
5687 | Py_ssize_t i, j;
|
---|
5688 | PyUnicodeObject *u;
|
---|
5689 | int tabsize = 8;
|
---|
5690 |
|
---|
5691 | if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
|
---|
5692 | return NULL;
|
---|
5693 |
|
---|
5694 | /* First pass: determine size of output string */
|
---|
5695 | i = j = 0;
|
---|
5696 | e = self->str + self->length;
|
---|
5697 | for (p = self->str; p < e; p++)
|
---|
5698 | if (*p == '\t') {
|
---|
5699 | if (tabsize > 0)
|
---|
5700 | j += tabsize - (j % tabsize);
|
---|
5701 | }
|
---|
5702 | else {
|
---|
5703 | j++;
|
---|
5704 | if (*p == '\n' || *p == '\r') {
|
---|
5705 | i += j;
|
---|
5706 | j = 0;
|
---|
5707 | }
|
---|
5708 | }
|
---|
5709 |
|
---|
5710 | /* Second pass: create output string and fill it */
|
---|
5711 | u = _PyUnicode_New(i + j);
|
---|
5712 | if (!u)
|
---|
5713 | return NULL;
|
---|
5714 |
|
---|
5715 | j = 0;
|
---|
5716 | q = u->str;
|
---|
5717 |
|
---|
5718 | for (p = self->str; p < e; p++)
|
---|
5719 | if (*p == '\t') {
|
---|
5720 | if (tabsize > 0) {
|
---|
5721 | i = tabsize - (j % tabsize);
|
---|
5722 | j += i;
|
---|
5723 | while (i--)
|
---|
5724 | *q++ = ' ';
|
---|
5725 | }
|
---|
5726 | }
|
---|
5727 | else {
|
---|
5728 | j++;
|
---|
5729 | *q++ = *p;
|
---|
5730 | if (*p == '\n' || *p == '\r')
|
---|
5731 | j = 0;
|
---|
5732 | }
|
---|
5733 |
|
---|
5734 | return (PyObject*) u;
|
---|
5735 | }
|
---|
5736 |
|
---|
5737 | PyDoc_STRVAR(find__doc__,
|
---|
5738 | "S.find(sub [,start [,end]]) -> int\n\
|
---|
5739 | \n\
|
---|
5740 | Return the lowest index in S where substring sub is found,\n\
|
---|
5741 | such that sub is contained within s[start,end]. Optional\n\
|
---|
5742 | arguments start and end are interpreted as in slice notation.\n\
|
---|
5743 | \n\
|
---|
5744 | Return -1 on failure.");
|
---|
5745 |
|
---|
5746 | static PyObject *
|
---|
5747 | unicode_find(PyUnicodeObject *self, PyObject *args)
|
---|
5748 | {
|
---|
5749 | PyObject *substring;
|
---|
5750 | Py_ssize_t start = 0;
|
---|
5751 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
5752 | Py_ssize_t result;
|
---|
5753 |
|
---|
5754 | if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
|
---|
5755 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
5756 | return NULL;
|
---|
5757 | substring = PyUnicode_FromObject(substring);
|
---|
5758 | if (!substring)
|
---|
5759 | return NULL;
|
---|
5760 |
|
---|
5761 | result = stringlib_find_slice(
|
---|
5762 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
|
---|
5763 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
|
---|
5764 | start, end
|
---|
5765 | );
|
---|
5766 |
|
---|
5767 | Py_DECREF(substring);
|
---|
5768 |
|
---|
5769 | return PyInt_FromSsize_t(result);
|
---|
5770 | }
|
---|
5771 |
|
---|
5772 | static PyObject *
|
---|
5773 | unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
|
---|
5774 | {
|
---|
5775 | if (index < 0 || index >= self->length) {
|
---|
5776 | PyErr_SetString(PyExc_IndexError, "string index out of range");
|
---|
5777 | return NULL;
|
---|
5778 | }
|
---|
5779 |
|
---|
5780 | return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
|
---|
5781 | }
|
---|
5782 |
|
---|
5783 | static long
|
---|
5784 | unicode_hash(PyUnicodeObject *self)
|
---|
5785 | {
|
---|
5786 | /* Since Unicode objects compare equal to their ASCII string
|
---|
5787 | counterparts, they should use the individual character values
|
---|
5788 | as basis for their hash value. This is needed to assure that
|
---|
5789 | strings and Unicode objects behave in the same way as
|
---|
5790 | dictionary keys. */
|
---|
5791 |
|
---|
5792 | register Py_ssize_t len;
|
---|
5793 | register Py_UNICODE *p;
|
---|
5794 | register long x;
|
---|
5795 |
|
---|
5796 | if (self->hash != -1)
|
---|
5797 | return self->hash;
|
---|
5798 | len = PyUnicode_GET_SIZE(self);
|
---|
5799 | p = PyUnicode_AS_UNICODE(self);
|
---|
5800 | x = *p << 7;
|
---|
5801 | while (--len >= 0)
|
---|
5802 | x = (1000003*x) ^ *p++;
|
---|
5803 | x ^= PyUnicode_GET_SIZE(self);
|
---|
5804 | if (x == -1)
|
---|
5805 | x = -2;
|
---|
5806 | self->hash = x;
|
---|
5807 | return x;
|
---|
5808 | }
|
---|
5809 |
|
---|
5810 | PyDoc_STRVAR(index__doc__,
|
---|
5811 | "S.index(sub [,start [,end]]) -> int\n\
|
---|
5812 | \n\
|
---|
5813 | Like S.find() but raise ValueError when the substring is not found.");
|
---|
5814 |
|
---|
5815 | static PyObject *
|
---|
5816 | unicode_index(PyUnicodeObject *self, PyObject *args)
|
---|
5817 | {
|
---|
5818 | Py_ssize_t result;
|
---|
5819 | PyObject *substring;
|
---|
5820 | Py_ssize_t start = 0;
|
---|
5821 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
5822 |
|
---|
5823 | if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
|
---|
5824 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
5825 | return NULL;
|
---|
5826 | substring = PyUnicode_FromObject(substring);
|
---|
5827 | if (!substring)
|
---|
5828 | return NULL;
|
---|
5829 |
|
---|
5830 | result = stringlib_find_slice(
|
---|
5831 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
|
---|
5832 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
|
---|
5833 | start, end
|
---|
5834 | );
|
---|
5835 |
|
---|
5836 | Py_DECREF(substring);
|
---|
5837 |
|
---|
5838 | if (result < 0) {
|
---|
5839 | PyErr_SetString(PyExc_ValueError, "substring not found");
|
---|
5840 | return NULL;
|
---|
5841 | }
|
---|
5842 |
|
---|
5843 | return PyInt_FromSsize_t(result);
|
---|
5844 | }
|
---|
5845 |
|
---|
5846 | PyDoc_STRVAR(islower__doc__,
|
---|
5847 | "S.islower() -> bool\n\
|
---|
5848 | \n\
|
---|
5849 | Return True if all cased characters in S are lowercase and there is\n\
|
---|
5850 | at least one cased character in S, False otherwise.");
|
---|
5851 |
|
---|
5852 | static PyObject*
|
---|
5853 | unicode_islower(PyUnicodeObject *self)
|
---|
5854 | {
|
---|
5855 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
5856 | register const Py_UNICODE *e;
|
---|
5857 | int cased;
|
---|
5858 |
|
---|
5859 | /* Shortcut for single character strings */
|
---|
5860 | if (PyUnicode_GET_SIZE(self) == 1)
|
---|
5861 | return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
|
---|
5862 |
|
---|
5863 | /* Special case for empty strings */
|
---|
5864 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
5865 | return PyBool_FromLong(0);
|
---|
5866 |
|
---|
5867 | e = p + PyUnicode_GET_SIZE(self);
|
---|
5868 | cased = 0;
|
---|
5869 | for (; p < e; p++) {
|
---|
5870 | register const Py_UNICODE ch = *p;
|
---|
5871 |
|
---|
5872 | if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
|
---|
5873 | return PyBool_FromLong(0);
|
---|
5874 | else if (!cased && Py_UNICODE_ISLOWER(ch))
|
---|
5875 | cased = 1;
|
---|
5876 | }
|
---|
5877 | return PyBool_FromLong(cased);
|
---|
5878 | }
|
---|
5879 |
|
---|
5880 | PyDoc_STRVAR(isupper__doc__,
|
---|
5881 | "S.isupper() -> bool\n\
|
---|
5882 | \n\
|
---|
5883 | Return True if all cased characters in S are uppercase and there is\n\
|
---|
5884 | at least one cased character in S, False otherwise.");
|
---|
5885 |
|
---|
5886 | static PyObject*
|
---|
5887 | unicode_isupper(PyUnicodeObject *self)
|
---|
5888 | {
|
---|
5889 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
5890 | register const Py_UNICODE *e;
|
---|
5891 | int cased;
|
---|
5892 |
|
---|
5893 | /* Shortcut for single character strings */
|
---|
5894 | if (PyUnicode_GET_SIZE(self) == 1)
|
---|
5895 | return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
|
---|
5896 |
|
---|
5897 | /* Special case for empty strings */
|
---|
5898 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
5899 | return PyBool_FromLong(0);
|
---|
5900 |
|
---|
5901 | e = p + PyUnicode_GET_SIZE(self);
|
---|
5902 | cased = 0;
|
---|
5903 | for (; p < e; p++) {
|
---|
5904 | register const Py_UNICODE ch = *p;
|
---|
5905 |
|
---|
5906 | if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
|
---|
5907 | return PyBool_FromLong(0);
|
---|
5908 | else if (!cased && Py_UNICODE_ISUPPER(ch))
|
---|
5909 | cased = 1;
|
---|
5910 | }
|
---|
5911 | return PyBool_FromLong(cased);
|
---|
5912 | }
|
---|
5913 |
|
---|
5914 | PyDoc_STRVAR(istitle__doc__,
|
---|
5915 | "S.istitle() -> bool\n\
|
---|
5916 | \n\
|
---|
5917 | Return True if S is a titlecased string and there is at least one\n\
|
---|
5918 | character in S, i.e. upper- and titlecase characters may only\n\
|
---|
5919 | follow uncased characters and lowercase characters only cased ones.\n\
|
---|
5920 | Return False otherwise.");
|
---|
5921 |
|
---|
5922 | static PyObject*
|
---|
5923 | unicode_istitle(PyUnicodeObject *self)
|
---|
5924 | {
|
---|
5925 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
5926 | register const Py_UNICODE *e;
|
---|
5927 | int cased, previous_is_cased;
|
---|
5928 |
|
---|
5929 | /* Shortcut for single character strings */
|
---|
5930 | if (PyUnicode_GET_SIZE(self) == 1)
|
---|
5931 | return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
|
---|
5932 | (Py_UNICODE_ISUPPER(*p) != 0));
|
---|
5933 |
|
---|
5934 | /* Special case for empty strings */
|
---|
5935 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
5936 | return PyBool_FromLong(0);
|
---|
5937 |
|
---|
5938 | e = p + PyUnicode_GET_SIZE(self);
|
---|
5939 | cased = 0;
|
---|
5940 | previous_is_cased = 0;
|
---|
5941 | for (; p < e; p++) {
|
---|
5942 | register const Py_UNICODE ch = *p;
|
---|
5943 |
|
---|
5944 | if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
|
---|
5945 | if (previous_is_cased)
|
---|
5946 | return PyBool_FromLong(0);
|
---|
5947 | previous_is_cased = 1;
|
---|
5948 | cased = 1;
|
---|
5949 | }
|
---|
5950 | else if (Py_UNICODE_ISLOWER(ch)) {
|
---|
5951 | if (!previous_is_cased)
|
---|
5952 | return PyBool_FromLong(0);
|
---|
5953 | previous_is_cased = 1;
|
---|
5954 | cased = 1;
|
---|
5955 | }
|
---|
5956 | else
|
---|
5957 | previous_is_cased = 0;
|
---|
5958 | }
|
---|
5959 | return PyBool_FromLong(cased);
|
---|
5960 | }
|
---|
5961 |
|
---|
5962 | PyDoc_STRVAR(isspace__doc__,
|
---|
5963 | "S.isspace() -> bool\n\
|
---|
5964 | \n\
|
---|
5965 | Return True if all characters in S are whitespace\n\
|
---|
5966 | and there is at least one character in S, False otherwise.");
|
---|
5967 |
|
---|
5968 | static PyObject*
|
---|
5969 | unicode_isspace(PyUnicodeObject *self)
|
---|
5970 | {
|
---|
5971 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
5972 | register const Py_UNICODE *e;
|
---|
5973 |
|
---|
5974 | /* Shortcut for single character strings */
|
---|
5975 | if (PyUnicode_GET_SIZE(self) == 1 &&
|
---|
5976 | Py_UNICODE_ISSPACE(*p))
|
---|
5977 | return PyBool_FromLong(1);
|
---|
5978 |
|
---|
5979 | /* Special case for empty strings */
|
---|
5980 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
5981 | return PyBool_FromLong(0);
|
---|
5982 |
|
---|
5983 | e = p + PyUnicode_GET_SIZE(self);
|
---|
5984 | for (; p < e; p++) {
|
---|
5985 | if (!Py_UNICODE_ISSPACE(*p))
|
---|
5986 | return PyBool_FromLong(0);
|
---|
5987 | }
|
---|
5988 | return PyBool_FromLong(1);
|
---|
5989 | }
|
---|
5990 |
|
---|
5991 | PyDoc_STRVAR(isalpha__doc__,
|
---|
5992 | "S.isalpha() -> bool\n\
|
---|
5993 | \n\
|
---|
5994 | Return True if all characters in S are alphabetic\n\
|
---|
5995 | and there is at least one character in S, False otherwise.");
|
---|
5996 |
|
---|
5997 | static PyObject*
|
---|
5998 | unicode_isalpha(PyUnicodeObject *self)
|
---|
5999 | {
|
---|
6000 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
6001 | register const Py_UNICODE *e;
|
---|
6002 |
|
---|
6003 | /* Shortcut for single character strings */
|
---|
6004 | if (PyUnicode_GET_SIZE(self) == 1 &&
|
---|
6005 | Py_UNICODE_ISALPHA(*p))
|
---|
6006 | return PyBool_FromLong(1);
|
---|
6007 |
|
---|
6008 | /* Special case for empty strings */
|
---|
6009 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
6010 | return PyBool_FromLong(0);
|
---|
6011 |
|
---|
6012 | e = p + PyUnicode_GET_SIZE(self);
|
---|
6013 | for (; p < e; p++) {
|
---|
6014 | if (!Py_UNICODE_ISALPHA(*p))
|
---|
6015 | return PyBool_FromLong(0);
|
---|
6016 | }
|
---|
6017 | return PyBool_FromLong(1);
|
---|
6018 | }
|
---|
6019 |
|
---|
6020 | PyDoc_STRVAR(isalnum__doc__,
|
---|
6021 | "S.isalnum() -> bool\n\
|
---|
6022 | \n\
|
---|
6023 | Return True if all characters in S are alphanumeric\n\
|
---|
6024 | and there is at least one character in S, False otherwise.");
|
---|
6025 |
|
---|
6026 | static PyObject*
|
---|
6027 | unicode_isalnum(PyUnicodeObject *self)
|
---|
6028 | {
|
---|
6029 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
6030 | register const Py_UNICODE *e;
|
---|
6031 |
|
---|
6032 | /* Shortcut for single character strings */
|
---|
6033 | if (PyUnicode_GET_SIZE(self) == 1 &&
|
---|
6034 | Py_UNICODE_ISALNUM(*p))
|
---|
6035 | return PyBool_FromLong(1);
|
---|
6036 |
|
---|
6037 | /* Special case for empty strings */
|
---|
6038 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
6039 | return PyBool_FromLong(0);
|
---|
6040 |
|
---|
6041 | e = p + PyUnicode_GET_SIZE(self);
|
---|
6042 | for (; p < e; p++) {
|
---|
6043 | if (!Py_UNICODE_ISALNUM(*p))
|
---|
6044 | return PyBool_FromLong(0);
|
---|
6045 | }
|
---|
6046 | return PyBool_FromLong(1);
|
---|
6047 | }
|
---|
6048 |
|
---|
6049 | PyDoc_STRVAR(isdecimal__doc__,
|
---|
6050 | "S.isdecimal() -> bool\n\
|
---|
6051 | \n\
|
---|
6052 | Return True if there are only decimal characters in S,\n\
|
---|
6053 | False otherwise.");
|
---|
6054 |
|
---|
6055 | static PyObject*
|
---|
6056 | unicode_isdecimal(PyUnicodeObject *self)
|
---|
6057 | {
|
---|
6058 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
6059 | register const Py_UNICODE *e;
|
---|
6060 |
|
---|
6061 | /* Shortcut for single character strings */
|
---|
6062 | if (PyUnicode_GET_SIZE(self) == 1 &&
|
---|
6063 | Py_UNICODE_ISDECIMAL(*p))
|
---|
6064 | return PyBool_FromLong(1);
|
---|
6065 |
|
---|
6066 | /* Special case for empty strings */
|
---|
6067 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
6068 | return PyBool_FromLong(0);
|
---|
6069 |
|
---|
6070 | e = p + PyUnicode_GET_SIZE(self);
|
---|
6071 | for (; p < e; p++) {
|
---|
6072 | if (!Py_UNICODE_ISDECIMAL(*p))
|
---|
6073 | return PyBool_FromLong(0);
|
---|
6074 | }
|
---|
6075 | return PyBool_FromLong(1);
|
---|
6076 | }
|
---|
6077 |
|
---|
6078 | PyDoc_STRVAR(isdigit__doc__,
|
---|
6079 | "S.isdigit() -> bool\n\
|
---|
6080 | \n\
|
---|
6081 | Return True if all characters in S are digits\n\
|
---|
6082 | and there is at least one character in S, False otherwise.");
|
---|
6083 |
|
---|
6084 | static PyObject*
|
---|
6085 | unicode_isdigit(PyUnicodeObject *self)
|
---|
6086 | {
|
---|
6087 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
6088 | register const Py_UNICODE *e;
|
---|
6089 |
|
---|
6090 | /* Shortcut for single character strings */
|
---|
6091 | if (PyUnicode_GET_SIZE(self) == 1 &&
|
---|
6092 | Py_UNICODE_ISDIGIT(*p))
|
---|
6093 | return PyBool_FromLong(1);
|
---|
6094 |
|
---|
6095 | /* Special case for empty strings */
|
---|
6096 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
6097 | return PyBool_FromLong(0);
|
---|
6098 |
|
---|
6099 | e = p + PyUnicode_GET_SIZE(self);
|
---|
6100 | for (; p < e; p++) {
|
---|
6101 | if (!Py_UNICODE_ISDIGIT(*p))
|
---|
6102 | return PyBool_FromLong(0);
|
---|
6103 | }
|
---|
6104 | return PyBool_FromLong(1);
|
---|
6105 | }
|
---|
6106 |
|
---|
6107 | PyDoc_STRVAR(isnumeric__doc__,
|
---|
6108 | "S.isnumeric() -> bool\n\
|
---|
6109 | \n\
|
---|
6110 | Return True if there are only numeric characters in S,\n\
|
---|
6111 | False otherwise.");
|
---|
6112 |
|
---|
6113 | static PyObject*
|
---|
6114 | unicode_isnumeric(PyUnicodeObject *self)
|
---|
6115 | {
|
---|
6116 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
---|
6117 | register const Py_UNICODE *e;
|
---|
6118 |
|
---|
6119 | /* Shortcut for single character strings */
|
---|
6120 | if (PyUnicode_GET_SIZE(self) == 1 &&
|
---|
6121 | Py_UNICODE_ISNUMERIC(*p))
|
---|
6122 | return PyBool_FromLong(1);
|
---|
6123 |
|
---|
6124 | /* Special case for empty strings */
|
---|
6125 | if (PyUnicode_GET_SIZE(self) == 0)
|
---|
6126 | return PyBool_FromLong(0);
|
---|
6127 |
|
---|
6128 | e = p + PyUnicode_GET_SIZE(self);
|
---|
6129 | for (; p < e; p++) {
|
---|
6130 | if (!Py_UNICODE_ISNUMERIC(*p))
|
---|
6131 | return PyBool_FromLong(0);
|
---|
6132 | }
|
---|
6133 | return PyBool_FromLong(1);
|
---|
6134 | }
|
---|
6135 |
|
---|
6136 | PyDoc_STRVAR(join__doc__,
|
---|
6137 | "S.join(sequence) -> unicode\n\
|
---|
6138 | \n\
|
---|
6139 | Return a string which is the concatenation of the strings in the\n\
|
---|
6140 | sequence. The separator between elements is S.");
|
---|
6141 |
|
---|
6142 | static PyObject*
|
---|
6143 | unicode_join(PyObject *self, PyObject *data)
|
---|
6144 | {
|
---|
6145 | return PyUnicode_Join(self, data);
|
---|
6146 | }
|
---|
6147 |
|
---|
6148 | static Py_ssize_t
|
---|
6149 | unicode_length(PyUnicodeObject *self)
|
---|
6150 | {
|
---|
6151 | return self->length;
|
---|
6152 | }
|
---|
6153 |
|
---|
6154 | PyDoc_STRVAR(ljust__doc__,
|
---|
6155 | "S.ljust(width[, fillchar]) -> int\n\
|
---|
6156 | \n\
|
---|
6157 | Return S left justified in a Unicode string of length width. Padding is\n\
|
---|
6158 | done using the specified fill character (default is a space).");
|
---|
6159 |
|
---|
6160 | static PyObject *
|
---|
6161 | unicode_ljust(PyUnicodeObject *self, PyObject *args)
|
---|
6162 | {
|
---|
6163 | Py_ssize_t width;
|
---|
6164 | Py_UNICODE fillchar = ' ';
|
---|
6165 |
|
---|
6166 | if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
|
---|
6167 | return NULL;
|
---|
6168 |
|
---|
6169 | if (self->length >= width && PyUnicode_CheckExact(self)) {
|
---|
6170 | Py_INCREF(self);
|
---|
6171 | return (PyObject*) self;
|
---|
6172 | }
|
---|
6173 |
|
---|
6174 | return (PyObject*) pad(self, 0, width - self->length, fillchar);
|
---|
6175 | }
|
---|
6176 |
|
---|
6177 | PyDoc_STRVAR(lower__doc__,
|
---|
6178 | "S.lower() -> unicode\n\
|
---|
6179 | \n\
|
---|
6180 | Return a copy of the string S converted to lowercase.");
|
---|
6181 |
|
---|
6182 | static PyObject*
|
---|
6183 | unicode_lower(PyUnicodeObject *self)
|
---|
6184 | {
|
---|
6185 | return fixup(self, fixlower);
|
---|
6186 | }
|
---|
6187 |
|
---|
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
|
---|
6196 |
|
---|
6197 | /* externally visible for str.strip(unicode) */
|
---|
6198 | PyObject *
|
---|
6199 | _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
|
---|
6200 | {
|
---|
6201 | Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
|
---|
6202 | Py_ssize_t len = PyUnicode_GET_SIZE(self);
|
---|
6203 | Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
|
---|
6204 | Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
|
---|
6205 | Py_ssize_t i, j;
|
---|
6206 |
|
---|
6207 | BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
|
---|
6208 |
|
---|
6209 | i = 0;
|
---|
6210 | if (striptype != RIGHTSTRIP) {
|
---|
6211 | while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
|
---|
6212 | i++;
|
---|
6213 | }
|
---|
6214 | }
|
---|
6215 |
|
---|
6216 | j = len;
|
---|
6217 | if (striptype != LEFTSTRIP) {
|
---|
6218 | do {
|
---|
6219 | j--;
|
---|
6220 | } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
|
---|
6221 | j++;
|
---|
6222 | }
|
---|
6223 |
|
---|
6224 | if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
|
---|
6225 | Py_INCREF(self);
|
---|
6226 | return (PyObject*)self;
|
---|
6227 | }
|
---|
6228 | else
|
---|
6229 | return PyUnicode_FromUnicode(s+i, j-i);
|
---|
6230 | }
|
---|
6231 |
|
---|
6232 |
|
---|
6233 | static PyObject *
|
---|
6234 | do_strip(PyUnicodeObject *self, int striptype)
|
---|
6235 | {
|
---|
6236 | Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
|
---|
6237 | Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
|
---|
6238 |
|
---|
6239 | i = 0;
|
---|
6240 | if (striptype != RIGHTSTRIP) {
|
---|
6241 | while (i < len && Py_UNICODE_ISSPACE(s[i])) {
|
---|
6242 | i++;
|
---|
6243 | }
|
---|
6244 | }
|
---|
6245 |
|
---|
6246 | j = len;
|
---|
6247 | if (striptype != LEFTSTRIP) {
|
---|
6248 | do {
|
---|
6249 | j--;
|
---|
6250 | } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
|
---|
6251 | j++;
|
---|
6252 | }
|
---|
6253 |
|
---|
6254 | if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
|
---|
6255 | Py_INCREF(self);
|
---|
6256 | return (PyObject*)self;
|
---|
6257 | }
|
---|
6258 | else
|
---|
6259 | return PyUnicode_FromUnicode(s+i, j-i);
|
---|
6260 | }
|
---|
6261 |
|
---|
6262 |
|
---|
6263 | static PyObject *
|
---|
6264 | do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
|
---|
6265 | {
|
---|
6266 | PyObject *sep = NULL;
|
---|
6267 |
|
---|
6268 | if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
|
---|
6269 | return NULL;
|
---|
6270 |
|
---|
6271 | if (sep != NULL && sep != Py_None) {
|
---|
6272 | if (PyUnicode_Check(sep))
|
---|
6273 | return _PyUnicode_XStrip(self, striptype, sep);
|
---|
6274 | else if (PyString_Check(sep)) {
|
---|
6275 | PyObject *res;
|
---|
6276 | sep = PyUnicode_FromObject(sep);
|
---|
6277 | if (sep==NULL)
|
---|
6278 | return NULL;
|
---|
6279 | res = _PyUnicode_XStrip(self, striptype, sep);
|
---|
6280 | Py_DECREF(sep);
|
---|
6281 | return res;
|
---|
6282 | }
|
---|
6283 | else {
|
---|
6284 | PyErr_Format(PyExc_TypeError,
|
---|
6285 | "%s arg must be None, unicode or str",
|
---|
6286 | STRIPNAME(striptype));
|
---|
6287 | return NULL;
|
---|
6288 | }
|
---|
6289 | }
|
---|
6290 |
|
---|
6291 | return do_strip(self, striptype);
|
---|
6292 | }
|
---|
6293 |
|
---|
6294 |
|
---|
6295 | PyDoc_STRVAR(strip__doc__,
|
---|
6296 | "S.strip([chars]) -> unicode\n\
|
---|
6297 | \n\
|
---|
6298 | Return a copy of the string S with leading and trailing\n\
|
---|
6299 | whitespace removed.\n\
|
---|
6300 | If chars is given and not None, remove characters in chars instead.\n\
|
---|
6301 | If chars is a str, it will be converted to unicode before stripping");
|
---|
6302 |
|
---|
6303 | static PyObject *
|
---|
6304 | unicode_strip(PyUnicodeObject *self, PyObject *args)
|
---|
6305 | {
|
---|
6306 | if (PyTuple_GET_SIZE(args) == 0)
|
---|
6307 | return do_strip(self, BOTHSTRIP); /* Common case */
|
---|
6308 | else
|
---|
6309 | return do_argstrip(self, BOTHSTRIP, args);
|
---|
6310 | }
|
---|
6311 |
|
---|
6312 |
|
---|
6313 | PyDoc_STRVAR(lstrip__doc__,
|
---|
6314 | "S.lstrip([chars]) -> unicode\n\
|
---|
6315 | \n\
|
---|
6316 | Return a copy of the string S with leading whitespace removed.\n\
|
---|
6317 | If chars is given and not None, remove characters in chars instead.\n\
|
---|
6318 | If chars is a str, it will be converted to unicode before stripping");
|
---|
6319 |
|
---|
6320 | static PyObject *
|
---|
6321 | unicode_lstrip(PyUnicodeObject *self, PyObject *args)
|
---|
6322 | {
|
---|
6323 | if (PyTuple_GET_SIZE(args) == 0)
|
---|
6324 | return do_strip(self, LEFTSTRIP); /* Common case */
|
---|
6325 | else
|
---|
6326 | return do_argstrip(self, LEFTSTRIP, args);
|
---|
6327 | }
|
---|
6328 |
|
---|
6329 |
|
---|
6330 | PyDoc_STRVAR(rstrip__doc__,
|
---|
6331 | "S.rstrip([chars]) -> unicode\n\
|
---|
6332 | \n\
|
---|
6333 | Return a copy of the string S with trailing whitespace removed.\n\
|
---|
6334 | If chars is given and not None, remove characters in chars instead.\n\
|
---|
6335 | If chars is a str, it will be converted to unicode before stripping");
|
---|
6336 |
|
---|
6337 | static PyObject *
|
---|
6338 | unicode_rstrip(PyUnicodeObject *self, PyObject *args)
|
---|
6339 | {
|
---|
6340 | if (PyTuple_GET_SIZE(args) == 0)
|
---|
6341 | return do_strip(self, RIGHTSTRIP); /* Common case */
|
---|
6342 | else
|
---|
6343 | return do_argstrip(self, RIGHTSTRIP, args);
|
---|
6344 | }
|
---|
6345 |
|
---|
6346 |
|
---|
6347 | static PyObject*
|
---|
6348 | unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
|
---|
6349 | {
|
---|
6350 | PyUnicodeObject *u;
|
---|
6351 | Py_UNICODE *p;
|
---|
6352 | Py_ssize_t nchars;
|
---|
6353 | size_t nbytes;
|
---|
6354 |
|
---|
6355 | if (len < 0)
|
---|
6356 | len = 0;
|
---|
6357 |
|
---|
6358 | if (len == 1 && PyUnicode_CheckExact(str)) {
|
---|
6359 | /* no repeat, return original string */
|
---|
6360 | Py_INCREF(str);
|
---|
6361 | return (PyObject*) str;
|
---|
6362 | }
|
---|
6363 |
|
---|
6364 | /* ensure # of chars needed doesn't overflow int and # of bytes
|
---|
6365 | * needed doesn't overflow size_t
|
---|
6366 | */
|
---|
6367 | nchars = len * str->length;
|
---|
6368 | if (len && nchars / len != str->length) {
|
---|
6369 | PyErr_SetString(PyExc_OverflowError,
|
---|
6370 | "repeated string is too long");
|
---|
6371 | return NULL;
|
---|
6372 | }
|
---|
6373 | nbytes = (nchars + 1) * sizeof(Py_UNICODE);
|
---|
6374 | if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
|
---|
6375 | PyErr_SetString(PyExc_OverflowError,
|
---|
6376 | "repeated string is too long");
|
---|
6377 | return NULL;
|
---|
6378 | }
|
---|
6379 | u = _PyUnicode_New(nchars);
|
---|
6380 | if (!u)
|
---|
6381 | return NULL;
|
---|
6382 |
|
---|
6383 | p = u->str;
|
---|
6384 |
|
---|
6385 | if (str->length == 1 && len > 0) {
|
---|
6386 | Py_UNICODE_FILL(p, str->str[0], len);
|
---|
6387 | } else {
|
---|
6388 | Py_ssize_t done = 0; /* number of characters copied this far */
|
---|
6389 | if (done < nchars) {
|
---|
6390 | Py_UNICODE_COPY(p, str->str, str->length);
|
---|
6391 | done = str->length;
|
---|
6392 | }
|
---|
6393 | while (done < nchars) {
|
---|
6394 | int n = (done <= nchars-done) ? done : nchars-done;
|
---|
6395 | Py_UNICODE_COPY(p+done, p, n);
|
---|
6396 | done += n;
|
---|
6397 | }
|
---|
6398 | }
|
---|
6399 |
|
---|
6400 | return (PyObject*) u;
|
---|
6401 | }
|
---|
6402 |
|
---|
6403 | PyObject *PyUnicode_Replace(PyObject *obj,
|
---|
6404 | PyObject *subobj,
|
---|
6405 | PyObject *replobj,
|
---|
6406 | Py_ssize_t maxcount)
|
---|
6407 | {
|
---|
6408 | PyObject *self;
|
---|
6409 | PyObject *str1;
|
---|
6410 | PyObject *str2;
|
---|
6411 | PyObject *result;
|
---|
6412 |
|
---|
6413 | self = PyUnicode_FromObject(obj);
|
---|
6414 | if (self == NULL)
|
---|
6415 | return NULL;
|
---|
6416 | str1 = PyUnicode_FromObject(subobj);
|
---|
6417 | if (str1 == NULL) {
|
---|
6418 | Py_DECREF(self);
|
---|
6419 | return NULL;
|
---|
6420 | }
|
---|
6421 | str2 = PyUnicode_FromObject(replobj);
|
---|
6422 | if (str2 == NULL) {
|
---|
6423 | Py_DECREF(self);
|
---|
6424 | Py_DECREF(str1);
|
---|
6425 | return NULL;
|
---|
6426 | }
|
---|
6427 | result = replace((PyUnicodeObject *)self,
|
---|
6428 | (PyUnicodeObject *)str1,
|
---|
6429 | (PyUnicodeObject *)str2,
|
---|
6430 | maxcount);
|
---|
6431 | Py_DECREF(self);
|
---|
6432 | Py_DECREF(str1);
|
---|
6433 | Py_DECREF(str2);
|
---|
6434 | return result;
|
---|
6435 | }
|
---|
6436 |
|
---|
6437 | PyDoc_STRVAR(replace__doc__,
|
---|
6438 | "S.replace (old, new[, maxsplit]) -> unicode\n\
|
---|
6439 | \n\
|
---|
6440 | Return a copy of S with all occurrences of substring\n\
|
---|
6441 | old replaced by new. If the optional argument maxsplit is\n\
|
---|
6442 | given, only the first maxsplit occurrences are replaced.");
|
---|
6443 |
|
---|
6444 | static PyObject*
|
---|
6445 | unicode_replace(PyUnicodeObject *self, PyObject *args)
|
---|
6446 | {
|
---|
6447 | PyUnicodeObject *str1;
|
---|
6448 | PyUnicodeObject *str2;
|
---|
6449 | Py_ssize_t maxcount = -1;
|
---|
6450 | PyObject *result;
|
---|
6451 |
|
---|
6452 | if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
|
---|
6453 | return NULL;
|
---|
6454 | str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
|
---|
6455 | if (str1 == NULL)
|
---|
6456 | return NULL;
|
---|
6457 | str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
|
---|
6458 | if (str2 == NULL) {
|
---|
6459 | Py_DECREF(str1);
|
---|
6460 | return NULL;
|
---|
6461 | }
|
---|
6462 |
|
---|
6463 | result = replace(self, str1, str2, maxcount);
|
---|
6464 |
|
---|
6465 | Py_DECREF(str1);
|
---|
6466 | Py_DECREF(str2);
|
---|
6467 | return result;
|
---|
6468 | }
|
---|
6469 |
|
---|
6470 | static
|
---|
6471 | PyObject *unicode_repr(PyObject *unicode)
|
---|
6472 | {
|
---|
6473 | return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
|
---|
6474 | PyUnicode_GET_SIZE(unicode),
|
---|
6475 | 1);
|
---|
6476 | }
|
---|
6477 |
|
---|
6478 | PyDoc_STRVAR(rfind__doc__,
|
---|
6479 | "S.rfind(sub [,start [,end]]) -> int\n\
|
---|
6480 | \n\
|
---|
6481 | Return the highest index in S where substring sub is found,\n\
|
---|
6482 | such that sub is contained within s[start,end]. Optional\n\
|
---|
6483 | arguments start and end are interpreted as in slice notation.\n\
|
---|
6484 | \n\
|
---|
6485 | Return -1 on failure.");
|
---|
6486 |
|
---|
6487 | static PyObject *
|
---|
6488 | unicode_rfind(PyUnicodeObject *self, PyObject *args)
|
---|
6489 | {
|
---|
6490 | PyObject *substring;
|
---|
6491 | Py_ssize_t start = 0;
|
---|
6492 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
6493 | Py_ssize_t result;
|
---|
6494 |
|
---|
6495 | if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
|
---|
6496 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
6497 | return NULL;
|
---|
6498 | substring = PyUnicode_FromObject(substring);
|
---|
6499 | if (!substring)
|
---|
6500 | return NULL;
|
---|
6501 |
|
---|
6502 | result = stringlib_rfind_slice(
|
---|
6503 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
|
---|
6504 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
|
---|
6505 | start, end
|
---|
6506 | );
|
---|
6507 |
|
---|
6508 | Py_DECREF(substring);
|
---|
6509 |
|
---|
6510 | return PyInt_FromSsize_t(result);
|
---|
6511 | }
|
---|
6512 |
|
---|
6513 | PyDoc_STRVAR(rindex__doc__,
|
---|
6514 | "S.rindex(sub [,start [,end]]) -> int\n\
|
---|
6515 | \n\
|
---|
6516 | Like S.rfind() but raise ValueError when the substring is not found.");
|
---|
6517 |
|
---|
6518 | static PyObject *
|
---|
6519 | unicode_rindex(PyUnicodeObject *self, PyObject *args)
|
---|
6520 | {
|
---|
6521 | PyObject *substring;
|
---|
6522 | Py_ssize_t start = 0;
|
---|
6523 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
6524 | Py_ssize_t result;
|
---|
6525 |
|
---|
6526 | if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
|
---|
6527 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
6528 | return NULL;
|
---|
6529 | substring = PyUnicode_FromObject(substring);
|
---|
6530 | if (!substring)
|
---|
6531 | return NULL;
|
---|
6532 |
|
---|
6533 | result = stringlib_rfind_slice(
|
---|
6534 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
|
---|
6535 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
|
---|
6536 | start, end
|
---|
6537 | );
|
---|
6538 |
|
---|
6539 | Py_DECREF(substring);
|
---|
6540 |
|
---|
6541 | if (result < 0) {
|
---|
6542 | PyErr_SetString(PyExc_ValueError, "substring not found");
|
---|
6543 | return NULL;
|
---|
6544 | }
|
---|
6545 | return PyInt_FromSsize_t(result);
|
---|
6546 | }
|
---|
6547 |
|
---|
6548 | PyDoc_STRVAR(rjust__doc__,
|
---|
6549 | "S.rjust(width[, fillchar]) -> unicode\n\
|
---|
6550 | \n\
|
---|
6551 | Return S right justified in a Unicode string of length width. Padding is\n\
|
---|
6552 | done using the specified fill character (default is a space).");
|
---|
6553 |
|
---|
6554 | static PyObject *
|
---|
6555 | unicode_rjust(PyUnicodeObject *self, PyObject *args)
|
---|
6556 | {
|
---|
6557 | Py_ssize_t width;
|
---|
6558 | Py_UNICODE fillchar = ' ';
|
---|
6559 |
|
---|
6560 | if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
|
---|
6561 | return NULL;
|
---|
6562 |
|
---|
6563 | if (self->length >= width && PyUnicode_CheckExact(self)) {
|
---|
6564 | Py_INCREF(self);
|
---|
6565 | return (PyObject*) self;
|
---|
6566 | }
|
---|
6567 |
|
---|
6568 | return (PyObject*) pad(self, width - self->length, 0, fillchar);
|
---|
6569 | }
|
---|
6570 |
|
---|
6571 | static PyObject*
|
---|
6572 | unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
|
---|
6573 | {
|
---|
6574 | /* standard clamping */
|
---|
6575 | if (start < 0)
|
---|
6576 | start = 0;
|
---|
6577 | if (end < 0)
|
---|
6578 | end = 0;
|
---|
6579 | if (end > self->length)
|
---|
6580 | end = self->length;
|
---|
6581 | if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
|
---|
6582 | /* full slice, return original string */
|
---|
6583 | Py_INCREF(self);
|
---|
6584 | return (PyObject*) self;
|
---|
6585 | }
|
---|
6586 | if (start > end)
|
---|
6587 | start = end;
|
---|
6588 | /* copy slice */
|
---|
6589 | return (PyObject*) PyUnicode_FromUnicode(self->str + start,
|
---|
6590 | end - start);
|
---|
6591 | }
|
---|
6592 |
|
---|
6593 | PyObject *PyUnicode_Split(PyObject *s,
|
---|
6594 | PyObject *sep,
|
---|
6595 | Py_ssize_t maxsplit)
|
---|
6596 | {
|
---|
6597 | PyObject *result;
|
---|
6598 |
|
---|
6599 | s = PyUnicode_FromObject(s);
|
---|
6600 | if (s == NULL)
|
---|
6601 | return NULL;
|
---|
6602 | if (sep != NULL) {
|
---|
6603 | sep = PyUnicode_FromObject(sep);
|
---|
6604 | if (sep == NULL) {
|
---|
6605 | Py_DECREF(s);
|
---|
6606 | return NULL;
|
---|
6607 | }
|
---|
6608 | }
|
---|
6609 |
|
---|
6610 | result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
|
---|
6611 |
|
---|
6612 | Py_DECREF(s);
|
---|
6613 | Py_XDECREF(sep);
|
---|
6614 | return result;
|
---|
6615 | }
|
---|
6616 |
|
---|
6617 | PyDoc_STRVAR(split__doc__,
|
---|
6618 | "S.split([sep [,maxsplit]]) -> list of strings\n\
|
---|
6619 | \n\
|
---|
6620 | Return a list of the words in S, using sep as the\n\
|
---|
6621 | delimiter string. If maxsplit is given, at most maxsplit\n\
|
---|
6622 | splits are done. If sep is not specified or is None,\n\
|
---|
6623 | any whitespace string is a separator.");
|
---|
6624 |
|
---|
6625 | static PyObject*
|
---|
6626 | unicode_split(PyUnicodeObject *self, PyObject *args)
|
---|
6627 | {
|
---|
6628 | PyObject *substring = Py_None;
|
---|
6629 | Py_ssize_t maxcount = -1;
|
---|
6630 |
|
---|
6631 | if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
|
---|
6632 | return NULL;
|
---|
6633 |
|
---|
6634 | if (substring == Py_None)
|
---|
6635 | return split(self, NULL, maxcount);
|
---|
6636 | else if (PyUnicode_Check(substring))
|
---|
6637 | return split(self, (PyUnicodeObject *)substring, maxcount);
|
---|
6638 | else
|
---|
6639 | return PyUnicode_Split((PyObject *)self, substring, maxcount);
|
---|
6640 | }
|
---|
6641 |
|
---|
6642 | PyObject *
|
---|
6643 | PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
|
---|
6644 | {
|
---|
6645 | PyObject* str_obj;
|
---|
6646 | PyObject* sep_obj;
|
---|
6647 | PyObject* out;
|
---|
6648 |
|
---|
6649 | str_obj = PyUnicode_FromObject(str_in);
|
---|
6650 | if (!str_obj)
|
---|
6651 | return NULL;
|
---|
6652 | sep_obj = PyUnicode_FromObject(sep_in);
|
---|
6653 | if (!sep_obj) {
|
---|
6654 | Py_DECREF(str_obj);
|
---|
6655 | return NULL;
|
---|
6656 | }
|
---|
6657 |
|
---|
6658 | out = stringlib_partition(
|
---|
6659 | str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
|
---|
6660 | sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
|
---|
6661 | );
|
---|
6662 |
|
---|
6663 | Py_DECREF(sep_obj);
|
---|
6664 | Py_DECREF(str_obj);
|
---|
6665 |
|
---|
6666 | return out;
|
---|
6667 | }
|
---|
6668 |
|
---|
6669 |
|
---|
6670 | PyObject *
|
---|
6671 | PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
|
---|
6672 | {
|
---|
6673 | PyObject* str_obj;
|
---|
6674 | PyObject* sep_obj;
|
---|
6675 | PyObject* out;
|
---|
6676 |
|
---|
6677 | str_obj = PyUnicode_FromObject(str_in);
|
---|
6678 | if (!str_obj)
|
---|
6679 | return NULL;
|
---|
6680 | sep_obj = PyUnicode_FromObject(sep_in);
|
---|
6681 | if (!sep_obj) {
|
---|
6682 | Py_DECREF(str_obj);
|
---|
6683 | return NULL;
|
---|
6684 | }
|
---|
6685 |
|
---|
6686 | out = stringlib_rpartition(
|
---|
6687 | str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
|
---|
6688 | sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
|
---|
6689 | );
|
---|
6690 |
|
---|
6691 | Py_DECREF(sep_obj);
|
---|
6692 | Py_DECREF(str_obj);
|
---|
6693 |
|
---|
6694 | return out;
|
---|
6695 | }
|
---|
6696 |
|
---|
6697 | PyDoc_STRVAR(partition__doc__,
|
---|
6698 | "S.partition(sep) -> (head, sep, tail)\n\
|
---|
6699 | \n\
|
---|
6700 | Searches for the separator sep in S, and returns the part before it,\n\
|
---|
6701 | the separator itself, and the part after it. If the separator is not\n\
|
---|
6702 | found, returns S and two empty strings.");
|
---|
6703 |
|
---|
6704 | static PyObject*
|
---|
6705 | unicode_partition(PyUnicodeObject *self, PyObject *separator)
|
---|
6706 | {
|
---|
6707 | return PyUnicode_Partition((PyObject *)self, separator);
|
---|
6708 | }
|
---|
6709 |
|
---|
6710 | PyDoc_STRVAR(rpartition__doc__,
|
---|
6711 | "S.rpartition(sep) -> (tail, sep, head)\n\
|
---|
6712 | \n\
|
---|
6713 | Searches for the separator sep in S, starting at the end of S, and returns\n\
|
---|
6714 | the part before it, the separator itself, and the part after it. If the\n\
|
---|
6715 | separator is not found, returns two empty strings and S.");
|
---|
6716 |
|
---|
6717 | static PyObject*
|
---|
6718 | unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
|
---|
6719 | {
|
---|
6720 | return PyUnicode_RPartition((PyObject *)self, separator);
|
---|
6721 | }
|
---|
6722 |
|
---|
6723 | PyObject *PyUnicode_RSplit(PyObject *s,
|
---|
6724 | PyObject *sep,
|
---|
6725 | Py_ssize_t maxsplit)
|
---|
6726 | {
|
---|
6727 | PyObject *result;
|
---|
6728 |
|
---|
6729 | s = PyUnicode_FromObject(s);
|
---|
6730 | if (s == NULL)
|
---|
6731 | return NULL;
|
---|
6732 | if (sep != NULL) {
|
---|
6733 | sep = PyUnicode_FromObject(sep);
|
---|
6734 | if (sep == NULL) {
|
---|
6735 | Py_DECREF(s);
|
---|
6736 | return NULL;
|
---|
6737 | }
|
---|
6738 | }
|
---|
6739 |
|
---|
6740 | result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
|
---|
6741 |
|
---|
6742 | Py_DECREF(s);
|
---|
6743 | Py_XDECREF(sep);
|
---|
6744 | return result;
|
---|
6745 | }
|
---|
6746 |
|
---|
6747 | PyDoc_STRVAR(rsplit__doc__,
|
---|
6748 | "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
|
---|
6749 | \n\
|
---|
6750 | Return a list of the words in S, using sep as the\n\
|
---|
6751 | delimiter string, starting at the end of the string and\n\
|
---|
6752 | working to the front. If maxsplit is given, at most maxsplit\n\
|
---|
6753 | splits are done. If sep is not specified, any whitespace string\n\
|
---|
6754 | is a separator.");
|
---|
6755 |
|
---|
6756 | static PyObject*
|
---|
6757 | unicode_rsplit(PyUnicodeObject *self, PyObject *args)
|
---|
6758 | {
|
---|
6759 | PyObject *substring = Py_None;
|
---|
6760 | Py_ssize_t maxcount = -1;
|
---|
6761 |
|
---|
6762 | if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
|
---|
6763 | return NULL;
|
---|
6764 |
|
---|
6765 | if (substring == Py_None)
|
---|
6766 | return rsplit(self, NULL, maxcount);
|
---|
6767 | else if (PyUnicode_Check(substring))
|
---|
6768 | return rsplit(self, (PyUnicodeObject *)substring, maxcount);
|
---|
6769 | else
|
---|
6770 | return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
|
---|
6771 | }
|
---|
6772 |
|
---|
6773 | PyDoc_STRVAR(splitlines__doc__,
|
---|
6774 | "S.splitlines([keepends]]) -> list of strings\n\
|
---|
6775 | \n\
|
---|
6776 | Return a list of the lines in S, breaking at line boundaries.\n\
|
---|
6777 | Line breaks are not included in the resulting list unless keepends\n\
|
---|
6778 | is given and true.");
|
---|
6779 |
|
---|
6780 | static PyObject*
|
---|
6781 | unicode_splitlines(PyUnicodeObject *self, PyObject *args)
|
---|
6782 | {
|
---|
6783 | int keepends = 0;
|
---|
6784 |
|
---|
6785 | if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
|
---|
6786 | return NULL;
|
---|
6787 |
|
---|
6788 | return PyUnicode_Splitlines((PyObject *)self, keepends);
|
---|
6789 | }
|
---|
6790 |
|
---|
6791 | static
|
---|
6792 | PyObject *unicode_str(PyUnicodeObject *self)
|
---|
6793 | {
|
---|
6794 | return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
|
---|
6795 | }
|
---|
6796 |
|
---|
6797 | PyDoc_STRVAR(swapcase__doc__,
|
---|
6798 | "S.swapcase() -> unicode\n\
|
---|
6799 | \n\
|
---|
6800 | Return a copy of S with uppercase characters converted to lowercase\n\
|
---|
6801 | and vice versa.");
|
---|
6802 |
|
---|
6803 | static PyObject*
|
---|
6804 | unicode_swapcase(PyUnicodeObject *self)
|
---|
6805 | {
|
---|
6806 | return fixup(self, fixswapcase);
|
---|
6807 | }
|
---|
6808 |
|
---|
6809 | PyDoc_STRVAR(translate__doc__,
|
---|
6810 | "S.translate(table) -> unicode\n\
|
---|
6811 | \n\
|
---|
6812 | Return a copy of the string S, where all characters have been mapped\n\
|
---|
6813 | through the given translation table, which must be a mapping of\n\
|
---|
6814 | Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
|
---|
6815 | Unmapped characters are left untouched. Characters mapped to None\n\
|
---|
6816 | are deleted.");
|
---|
6817 |
|
---|
6818 | static PyObject*
|
---|
6819 | unicode_translate(PyUnicodeObject *self, PyObject *table)
|
---|
6820 | {
|
---|
6821 | return PyUnicode_TranslateCharmap(self->str,
|
---|
6822 | self->length,
|
---|
6823 | table,
|
---|
6824 | "ignore");
|
---|
6825 | }
|
---|
6826 |
|
---|
6827 | PyDoc_STRVAR(upper__doc__,
|
---|
6828 | "S.upper() -> unicode\n\
|
---|
6829 | \n\
|
---|
6830 | Return a copy of S converted to uppercase.");
|
---|
6831 |
|
---|
6832 | static PyObject*
|
---|
6833 | unicode_upper(PyUnicodeObject *self)
|
---|
6834 | {
|
---|
6835 | return fixup(self, fixupper);
|
---|
6836 | }
|
---|
6837 |
|
---|
6838 | PyDoc_STRVAR(zfill__doc__,
|
---|
6839 | "S.zfill(width) -> unicode\n\
|
---|
6840 | \n\
|
---|
6841 | Pad a numeric string x with zeros on the left, to fill a field\n\
|
---|
6842 | of the specified width. The string x is never truncated.");
|
---|
6843 |
|
---|
6844 | static PyObject *
|
---|
6845 | unicode_zfill(PyUnicodeObject *self, PyObject *args)
|
---|
6846 | {
|
---|
6847 | Py_ssize_t fill;
|
---|
6848 | PyUnicodeObject *u;
|
---|
6849 |
|
---|
6850 | Py_ssize_t width;
|
---|
6851 | if (!PyArg_ParseTuple(args, "n:zfill", &width))
|
---|
6852 | return NULL;
|
---|
6853 |
|
---|
6854 | if (self->length >= width) {
|
---|
6855 | if (PyUnicode_CheckExact(self)) {
|
---|
6856 | Py_INCREF(self);
|
---|
6857 | return (PyObject*) self;
|
---|
6858 | }
|
---|
6859 | else
|
---|
6860 | return PyUnicode_FromUnicode(
|
---|
6861 | PyUnicode_AS_UNICODE(self),
|
---|
6862 | PyUnicode_GET_SIZE(self)
|
---|
6863 | );
|
---|
6864 | }
|
---|
6865 |
|
---|
6866 | fill = width - self->length;
|
---|
6867 |
|
---|
6868 | u = pad(self, fill, 0, '0');
|
---|
6869 |
|
---|
6870 | if (u == NULL)
|
---|
6871 | return NULL;
|
---|
6872 |
|
---|
6873 | if (u->str[fill] == '+' || u->str[fill] == '-') {
|
---|
6874 | /* move sign to beginning of string */
|
---|
6875 | u->str[0] = u->str[fill];
|
---|
6876 | u->str[fill] = '0';
|
---|
6877 | }
|
---|
6878 |
|
---|
6879 | return (PyObject*) u;
|
---|
6880 | }
|
---|
6881 |
|
---|
6882 | #if 0
|
---|
6883 | static PyObject*
|
---|
6884 | unicode_freelistsize(PyUnicodeObject *self)
|
---|
6885 | {
|
---|
6886 | return PyInt_FromLong(unicode_freelist_size);
|
---|
6887 | }
|
---|
6888 | #endif
|
---|
6889 |
|
---|
6890 | PyDoc_STRVAR(startswith__doc__,
|
---|
6891 | "S.startswith(prefix[, start[, end]]) -> bool\n\
|
---|
6892 | \n\
|
---|
6893 | Return True if S starts with the specified prefix, False otherwise.\n\
|
---|
6894 | With optional start, test S beginning at that position.\n\
|
---|
6895 | With optional end, stop comparing S at that position.\n\
|
---|
6896 | prefix can also be a tuple of strings to try.");
|
---|
6897 |
|
---|
6898 | static PyObject *
|
---|
6899 | unicode_startswith(PyUnicodeObject *self,
|
---|
6900 | PyObject *args)
|
---|
6901 | {
|
---|
6902 | PyObject *subobj;
|
---|
6903 | PyUnicodeObject *substring;
|
---|
6904 | Py_ssize_t start = 0;
|
---|
6905 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
6906 | int result;
|
---|
6907 |
|
---|
6908 | if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
|
---|
6909 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
6910 | return NULL;
|
---|
6911 | if (PyTuple_Check(subobj)) {
|
---|
6912 | Py_ssize_t i;
|
---|
6913 | for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
|
---|
6914 | substring = (PyUnicodeObject *)PyUnicode_FromObject(
|
---|
6915 | PyTuple_GET_ITEM(subobj, i));
|
---|
6916 | if (substring == NULL)
|
---|
6917 | return NULL;
|
---|
6918 | result = tailmatch(self, substring, start, end, -1);
|
---|
6919 | Py_DECREF(substring);
|
---|
6920 | if (result) {
|
---|
6921 | Py_RETURN_TRUE;
|
---|
6922 | }
|
---|
6923 | }
|
---|
6924 | /* nothing matched */
|
---|
6925 | Py_RETURN_FALSE;
|
---|
6926 | }
|
---|
6927 | substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
|
---|
6928 | if (substring == NULL)
|
---|
6929 | return NULL;
|
---|
6930 | result = tailmatch(self, substring, start, end, -1);
|
---|
6931 | Py_DECREF(substring);
|
---|
6932 | return PyBool_FromLong(result);
|
---|
6933 | }
|
---|
6934 |
|
---|
6935 |
|
---|
6936 | PyDoc_STRVAR(endswith__doc__,
|
---|
6937 | "S.endswith(suffix[, start[, end]]) -> bool\n\
|
---|
6938 | \n\
|
---|
6939 | Return True if S ends with the specified suffix, False otherwise.\n\
|
---|
6940 | With optional start, test S beginning at that position.\n\
|
---|
6941 | With optional end, stop comparing S at that position.\n\
|
---|
6942 | suffix can also be a tuple of strings to try.");
|
---|
6943 |
|
---|
6944 | static PyObject *
|
---|
6945 | unicode_endswith(PyUnicodeObject *self,
|
---|
6946 | PyObject *args)
|
---|
6947 | {
|
---|
6948 | PyObject *subobj;
|
---|
6949 | PyUnicodeObject *substring;
|
---|
6950 | Py_ssize_t start = 0;
|
---|
6951 | Py_ssize_t end = PY_SSIZE_T_MAX;
|
---|
6952 | int result;
|
---|
6953 |
|
---|
6954 | if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
|
---|
6955 | _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
|
---|
6956 | return NULL;
|
---|
6957 | if (PyTuple_Check(subobj)) {
|
---|
6958 | Py_ssize_t i;
|
---|
6959 | for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
|
---|
6960 | substring = (PyUnicodeObject *)PyUnicode_FromObject(
|
---|
6961 | PyTuple_GET_ITEM(subobj, i));
|
---|
6962 | if (substring == NULL)
|
---|
6963 | return NULL;
|
---|
6964 | result = tailmatch(self, substring, start, end, +1);
|
---|
6965 | Py_DECREF(substring);
|
---|
6966 | if (result) {
|
---|
6967 | Py_RETURN_TRUE;
|
---|
6968 | }
|
---|
6969 | }
|
---|
6970 | Py_RETURN_FALSE;
|
---|
6971 | }
|
---|
6972 | substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
|
---|
6973 | if (substring == NULL)
|
---|
6974 | return NULL;
|
---|
6975 |
|
---|
6976 | result = tailmatch(self, substring, start, end, +1);
|
---|
6977 | Py_DECREF(substring);
|
---|
6978 | return PyBool_FromLong(result);
|
---|
6979 | }
|
---|
6980 |
|
---|
6981 |
|
---|
6982 |
|
---|
6983 | static PyObject *
|
---|
6984 | unicode_getnewargs(PyUnicodeObject *v)
|
---|
6985 | {
|
---|
6986 | return Py_BuildValue("(u#)", v->str, v->length);
|
---|
6987 | }
|
---|
6988 |
|
---|
6989 |
|
---|
6990 | static PyMethodDef unicode_methods[] = {
|
---|
6991 |
|
---|
6992 | /* Order is according to common usage: often used methods should
|
---|
6993 | appear first, since lookup is done sequentially. */
|
---|
6994 |
|
---|
6995 | {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
|
---|
6996 | {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
|
---|
6997 | {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
|
---|
6998 | {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
|
---|
6999 | {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
|
---|
7000 | {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
|
---|
7001 | {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
|
---|
7002 | {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
|
---|
7003 | {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
|
---|
7004 | {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
|
---|
7005 | {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
|
---|
7006 | {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
|
---|
7007 | {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
|
---|
7008 | {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
|
---|
7009 | {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
|
---|
7010 | {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
|
---|
7011 | {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
|
---|
7012 | /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
|
---|
7013 | {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
|
---|
7014 | {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
|
---|
7015 | {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
|
---|
7016 | {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
|
---|
7017 | {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
|
---|
7018 | {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
|
---|
7019 | {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
|
---|
7020 | {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
|
---|
7021 | {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
|
---|
7022 | {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
|
---|
7023 | {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
|
---|
7024 | {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
|
---|
7025 | {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
|
---|
7026 | {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
|
---|
7027 | {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
|
---|
7028 | {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
|
---|
7029 | {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
|
---|
7030 | {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
|
---|
7031 | {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
|
---|
7032 | {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
---|
7033 | {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
---|
7034 | {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
---|
7035 | #if 0
|
---|
7036 | {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
|
---|
7037 | #endif
|
---|
7038 |
|
---|
7039 | #if 0
|
---|
7040 | /* This one is just used for debugging the implementation. */
|
---|
7041 | {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
|
---|
7042 | #endif
|
---|
7043 |
|
---|
7044 | {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
|
---|
7045 | {NULL, NULL}
|
---|
7046 | };
|
---|
7047 |
|
---|
7048 | static PyObject *
|
---|
7049 | unicode_mod(PyObject *v, PyObject *w)
|
---|
7050 | {
|
---|
7051 | if (!PyUnicode_Check(v)) {
|
---|
7052 | Py_INCREF(Py_NotImplemented);
|
---|
7053 | return Py_NotImplemented;
|
---|
7054 | }
|
---|
7055 | return PyUnicode_Format(v, w);
|
---|
7056 | }
|
---|
7057 |
|
---|
7058 | static PyNumberMethods unicode_as_number = {
|
---|
7059 | 0, /*nb_add*/
|
---|
7060 | 0, /*nb_subtract*/
|
---|
7061 | 0, /*nb_multiply*/
|
---|
7062 | 0, /*nb_divide*/
|
---|
7063 | unicode_mod, /*nb_remainder*/
|
---|
7064 | };
|
---|
7065 |
|
---|
7066 | static PySequenceMethods unicode_as_sequence = {
|
---|
7067 | (lenfunc) unicode_length, /* sq_length */
|
---|
7068 | PyUnicode_Concat, /* sq_concat */
|
---|
7069 | (ssizeargfunc) unicode_repeat, /* sq_repeat */
|
---|
7070 | (ssizeargfunc) unicode_getitem, /* sq_item */
|
---|
7071 | (ssizessizeargfunc) unicode_slice, /* sq_slice */
|
---|
7072 | 0, /* sq_ass_item */
|
---|
7073 | 0, /* sq_ass_slice */
|
---|
7074 | PyUnicode_Contains, /* sq_contains */
|
---|
7075 | };
|
---|
7076 |
|
---|
7077 | static PyObject*
|
---|
7078 | unicode_subscript(PyUnicodeObject* self, PyObject* item)
|
---|
7079 | {
|
---|
7080 | if (PyIndex_Check(item)) {
|
---|
7081 | Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
|
---|
7082 | if (i == -1 && PyErr_Occurred())
|
---|
7083 | return NULL;
|
---|
7084 | if (i < 0)
|
---|
7085 | i += PyUnicode_GET_SIZE(self);
|
---|
7086 | return unicode_getitem(self, i);
|
---|
7087 | } else if (PySlice_Check(item)) {
|
---|
7088 | Py_ssize_t start, stop, step, slicelength, cur, i;
|
---|
7089 | Py_UNICODE* source_buf;
|
---|
7090 | Py_UNICODE* result_buf;
|
---|
7091 | PyObject* result;
|
---|
7092 |
|
---|
7093 | if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
|
---|
7094 | &start, &stop, &step, &slicelength) < 0) {
|
---|
7095 | return NULL;
|
---|
7096 | }
|
---|
7097 |
|
---|
7098 | if (slicelength <= 0) {
|
---|
7099 | return PyUnicode_FromUnicode(NULL, 0);
|
---|
7100 | } else {
|
---|
7101 | source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
|
---|
7102 | result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
|
---|
7103 | sizeof(Py_UNICODE));
|
---|
7104 |
|
---|
7105 | if (result_buf == NULL)
|
---|
7106 | return PyErr_NoMemory();
|
---|
7107 |
|
---|
7108 | for (cur = start, i = 0; i < slicelength; cur += step, i++) {
|
---|
7109 | result_buf[i] = source_buf[cur];
|
---|
7110 | }
|
---|
7111 |
|
---|
7112 | result = PyUnicode_FromUnicode(result_buf, slicelength);
|
---|
7113 | PyMem_FREE(result_buf);
|
---|
7114 | return result;
|
---|
7115 | }
|
---|
7116 | } else {
|
---|
7117 | PyErr_SetString(PyExc_TypeError, "string indices must be integers");
|
---|
7118 | return NULL;
|
---|
7119 | }
|
---|
7120 | }
|
---|
7121 |
|
---|
7122 | static PyMappingMethods unicode_as_mapping = {
|
---|
7123 | (lenfunc)unicode_length, /* mp_length */
|
---|
7124 | (binaryfunc)unicode_subscript, /* mp_subscript */
|
---|
7125 | (objobjargproc)0, /* mp_ass_subscript */
|
---|
7126 | };
|
---|
7127 |
|
---|
7128 | static Py_ssize_t
|
---|
7129 | unicode_buffer_getreadbuf(PyUnicodeObject *self,
|
---|
7130 | Py_ssize_t index,
|
---|
7131 | const void **ptr)
|
---|
7132 | {
|
---|
7133 | if (index != 0) {
|
---|
7134 | PyErr_SetString(PyExc_SystemError,
|
---|
7135 | "accessing non-existent unicode segment");
|
---|
7136 | return -1;
|
---|
7137 | }
|
---|
7138 | *ptr = (void *) self->str;
|
---|
7139 | return PyUnicode_GET_DATA_SIZE(self);
|
---|
7140 | }
|
---|
7141 |
|
---|
7142 | static Py_ssize_t
|
---|
7143 | unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
|
---|
7144 | const void **ptr)
|
---|
7145 | {
|
---|
7146 | PyErr_SetString(PyExc_TypeError,
|
---|
7147 | "cannot use unicode as modifiable buffer");
|
---|
7148 | return -1;
|
---|
7149 | }
|
---|
7150 |
|
---|
7151 | static int
|
---|
7152 | unicode_buffer_getsegcount(PyUnicodeObject *self,
|
---|
7153 | Py_ssize_t *lenp)
|
---|
7154 | {
|
---|
7155 | if (lenp)
|
---|
7156 | *lenp = PyUnicode_GET_DATA_SIZE(self);
|
---|
7157 | return 1;
|
---|
7158 | }
|
---|
7159 |
|
---|
7160 | static Py_ssize_t
|
---|
7161 | unicode_buffer_getcharbuf(PyUnicodeObject *self,
|
---|
7162 | Py_ssize_t index,
|
---|
7163 | const void **ptr)
|
---|
7164 | {
|
---|
7165 | PyObject *str;
|
---|
7166 |
|
---|
7167 | if (index != 0) {
|
---|
7168 | PyErr_SetString(PyExc_SystemError,
|
---|
7169 | "accessing non-existent unicode segment");
|
---|
7170 | return -1;
|
---|
7171 | }
|
---|
7172 | str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
|
---|
7173 | if (str == NULL)
|
---|
7174 | return -1;
|
---|
7175 | *ptr = (void *) PyString_AS_STRING(str);
|
---|
7176 | return PyString_GET_SIZE(str);
|
---|
7177 | }
|
---|
7178 |
|
---|
7179 | /* Helpers for PyUnicode_Format() */
|
---|
7180 |
|
---|
7181 | static PyObject *
|
---|
7182 | getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
|
---|
7183 | {
|
---|
7184 | Py_ssize_t argidx = *p_argidx;
|
---|
7185 | if (argidx < arglen) {
|
---|
7186 | (*p_argidx)++;
|
---|
7187 | if (arglen < 0)
|
---|
7188 | return args;
|
---|
7189 | else
|
---|
7190 | return PyTuple_GetItem(args, argidx);
|
---|
7191 | }
|
---|
7192 | PyErr_SetString(PyExc_TypeError,
|
---|
7193 | "not enough arguments for format string");
|
---|
7194 | return NULL;
|
---|
7195 | }
|
---|
7196 |
|
---|
/* Flag bits for a conversion specifier; PyUnicode_Format() sets each one
   when it sees the character noted beside it. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
|
---|
7202 |
|
---|
7203 | static Py_ssize_t
|
---|
7204 | strtounicode(Py_UNICODE *buffer, const char *charbuffer)
|
---|
7205 | {
|
---|
7206 | register Py_ssize_t i;
|
---|
7207 | Py_ssize_t len = strlen(charbuffer);
|
---|
7208 | for (i = len - 1; i >= 0; i--)
|
---|
7209 | buffer[i] = (Py_UNICODE) charbuffer[i];
|
---|
7210 |
|
---|
7211 | return len;
|
---|
7212 | }
|
---|
7213 |
|
---|
7214 | static int
|
---|
7215 | doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
|
---|
7216 | {
|
---|
7217 | Py_ssize_t result;
|
---|
7218 |
|
---|
7219 | PyOS_ascii_formatd((char *)buffer, len, format, x);
|
---|
7220 | result = strtounicode(buffer, (char *)buffer);
|
---|
7221 | return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
|
---|
7222 | }
|
---|
7223 |
|
---|
7224 | static int
|
---|
7225 | longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
|
---|
7226 | {
|
---|
7227 | Py_ssize_t result;
|
---|
7228 |
|
---|
7229 | PyOS_snprintf((char *)buffer, len, format, x);
|
---|
7230 | result = strtounicode(buffer, (char *)buffer);
|
---|
7231 | return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
|
---|
7232 | }
|
---|
7233 |
|
---|
7234 | /* XXX To save some code duplication, formatfloat/long/int could have been
|
---|
7235 | shared with stringobject.c, converting from 8-bit to Unicode after the
|
---|
7236 | formatting is done. */
|
---|
7237 |
|
---|
7238 | static int
|
---|
7239 | formatfloat(Py_UNICODE *buf,
|
---|
7240 | size_t buflen,
|
---|
7241 | int flags,
|
---|
7242 | int prec,
|
---|
7243 | int type,
|
---|
7244 | PyObject *v)
|
---|
7245 | {
|
---|
7246 | /* fmt = '%#.' + `prec` + `type`
|
---|
7247 | worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
|
---|
7248 | char fmt[20];
|
---|
7249 | double x;
|
---|
7250 |
|
---|
7251 | x = PyFloat_AsDouble(v);
|
---|
7252 | if (x == -1.0 && PyErr_Occurred())
|
---|
7253 | return -1;
|
---|
7254 | if (prec < 0)
|
---|
7255 | prec = 6;
|
---|
7256 | if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
|
---|
7257 | type = 'g';
|
---|
7258 | /* Worst case length calc to ensure no buffer overrun:
|
---|
7259 |
|
---|
7260 | 'g' formats:
|
---|
7261 | fmt = %#.<prec>g
|
---|
7262 | buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
|
---|
7263 | for any double rep.)
|
---|
7264 | len = 1 + prec + 1 + 2 + 5 = 9 + prec
|
---|
7265 |
|
---|
7266 | 'f' formats:
|
---|
7267 | buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
|
---|
7268 | len = 1 + 50 + 1 + prec = 52 + prec
|
---|
7269 |
|
---|
7270 | If prec=0 the effective precision is 1 (the leading digit is
|
---|
7271 | always given), therefore increase the length by one.
|
---|
7272 |
|
---|
7273 | */
|
---|
7274 | if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
|
---|
7275 | (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
|
---|
7276 | PyErr_SetString(PyExc_OverflowError,
|
---|
7277 | "formatted float is too long (precision too large?)");
|
---|
7278 | return -1;
|
---|
7279 | }
|
---|
7280 | PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
|
---|
7281 | (flags&F_ALT) ? "#" : "",
|
---|
7282 | prec, type);
|
---|
7283 | return doubletounicode(buf, buflen, fmt, x);
|
---|
7284 | }
|
---|
7285 |
|
---|
7286 | static PyObject*
|
---|
7287 | formatlong(PyObject *val, int flags, int prec, int type)
|
---|
7288 | {
|
---|
7289 | char *buf;
|
---|
7290 | int i, len;
|
---|
7291 | PyObject *str; /* temporary string object. */
|
---|
7292 | PyUnicodeObject *result;
|
---|
7293 |
|
---|
7294 | str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
|
---|
7295 | if (!str)
|
---|
7296 | return NULL;
|
---|
7297 | result = _PyUnicode_New(len);
|
---|
7298 | if (!result) {
|
---|
7299 | Py_DECREF(str);
|
---|
7300 | return NULL;
|
---|
7301 | }
|
---|
7302 | for (i = 0; i < len; i++)
|
---|
7303 | result->str[i] = buf[i];
|
---|
7304 | result->str[len] = 0;
|
---|
7305 | Py_DECREF(str);
|
---|
7306 | return (PyObject*)result;
|
---|
7307 | }
|
---|
7308 |
|
---|
7309 | static int
|
---|
7310 | formatint(Py_UNICODE *buf,
|
---|
7311 | size_t buflen,
|
---|
7312 | int flags,
|
---|
7313 | int prec,
|
---|
7314 | int type,
|
---|
7315 | PyObject *v)
|
---|
7316 | {
|
---|
7317 | /* fmt = '%#.' + `prec` + 'l' + `type`
|
---|
7318 | * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
|
---|
7319 | * + 1 + 1
|
---|
7320 | * = 24
|
---|
7321 | */
|
---|
7322 | char fmt[64]; /* plenty big enough! */
|
---|
7323 | char *sign;
|
---|
7324 | long x;
|
---|
7325 |
|
---|
7326 | x = PyInt_AsLong(v);
|
---|
7327 | if (x == -1 && PyErr_Occurred())
|
---|
7328 | return -1;
|
---|
7329 | if (x < 0 && type == 'u') {
|
---|
7330 | type = 'd';
|
---|
7331 | }
|
---|
7332 | if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
|
---|
7333 | sign = "-";
|
---|
7334 | else
|
---|
7335 | sign = "";
|
---|
7336 | if (prec < 0)
|
---|
7337 | prec = 1;
|
---|
7338 |
|
---|
7339 | /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
|
---|
7340 | * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
|
---|
7341 | */
|
---|
7342 | if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
|
---|
7343 | PyErr_SetString(PyExc_OverflowError,
|
---|
7344 | "formatted integer is too long (precision too large?)");
|
---|
7345 | return -1;
|
---|
7346 | }
|
---|
7347 |
|
---|
7348 | if ((flags & F_ALT) &&
|
---|
7349 | (type == 'x' || type == 'X')) {
|
---|
7350 | /* When converting under %#x or %#X, there are a number
|
---|
7351 | * of issues that cause pain:
|
---|
7352 | * - when 0 is being converted, the C standard leaves off
|
---|
7353 | * the '0x' or '0X', which is inconsistent with other
|
---|
7354 | * %#x/%#X conversions and inconsistent with Python's
|
---|
7355 | * hex() function
|
---|
7356 | * - there are platforms that violate the standard and
|
---|
7357 | * convert 0 with the '0x' or '0X'
|
---|
7358 | * (Metrowerks, Compaq Tru64)
|
---|
7359 | * - there are platforms that give '0x' when converting
|
---|
7360 | * under %#X, but convert 0 in accordance with the
|
---|
7361 | * standard (OS/2 EMX)
|
---|
7362 | *
|
---|
7363 | * We can achieve the desired consistency by inserting our
|
---|
7364 | * own '0x' or '0X' prefix, and substituting %x/%X in place
|
---|
7365 | * of %#x/%#X.
|
---|
7366 | *
|
---|
7367 | * Note that this is the same approach as used in
|
---|
7368 | * formatint() in stringobject.c
|
---|
7369 | */
|
---|
7370 | PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
|
---|
7371 | sign, type, prec, type);
|
---|
7372 | }
|
---|
7373 | else {
|
---|
7374 | PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
|
---|
7375 | sign, (flags&F_ALT) ? "#" : "",
|
---|
7376 | prec, type);
|
---|
7377 | }
|
---|
7378 | if (sign[0])
|
---|
7379 | return longtounicode(buf, buflen, fmt, -x);
|
---|
7380 | else
|
---|
7381 | return longtounicode(buf, buflen, fmt, x);
|
---|
7382 | }
|
---|
7383 |
|
---|
7384 | static int
|
---|
7385 | formatchar(Py_UNICODE *buf,
|
---|
7386 | size_t buflen,
|
---|
7387 | PyObject *v)
|
---|
7388 | {
|
---|
7389 | /* presume that the buffer is at least 2 characters long */
|
---|
7390 | if (PyUnicode_Check(v)) {
|
---|
7391 | if (PyUnicode_GET_SIZE(v) != 1)
|
---|
7392 | goto onError;
|
---|
7393 | buf[0] = PyUnicode_AS_UNICODE(v)[0];
|
---|
7394 | }
|
---|
7395 |
|
---|
7396 | else if (PyString_Check(v)) {
|
---|
7397 | if (PyString_GET_SIZE(v) != 1)
|
---|
7398 | goto onError;
|
---|
7399 | buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
|
---|
7400 | }
|
---|
7401 |
|
---|
7402 | else {
|
---|
7403 | /* Integer input truncated to a character */
|
---|
7404 | long x;
|
---|
7405 | x = PyInt_AsLong(v);
|
---|
7406 | if (x == -1 && PyErr_Occurred())
|
---|
7407 | goto onError;
|
---|
7408 | #ifdef Py_UNICODE_WIDE
|
---|
7409 | if (x < 0 || x > 0x10ffff) {
|
---|
7410 | PyErr_SetString(PyExc_OverflowError,
|
---|
7411 | "%c arg not in range(0x110000) "
|
---|
7412 | "(wide Python build)");
|
---|
7413 | return -1;
|
---|
7414 | }
|
---|
7415 | #else
|
---|
7416 | if (x < 0 || x > 0xffff) {
|
---|
7417 | PyErr_SetString(PyExc_OverflowError,
|
---|
7418 | "%c arg not in range(0x10000) "
|
---|
7419 | "(narrow Python build)");
|
---|
7420 | return -1;
|
---|
7421 | }
|
---|
7422 | #endif
|
---|
7423 | buf[0] = (Py_UNICODE) x;
|
---|
7424 | }
|
---|
7425 | buf[1] = '\0';
|
---|
7426 | return 1;
|
---|
7427 |
|
---|
7428 | onError:
|
---|
7429 | PyErr_SetString(PyExc_TypeError,
|
---|
7430 | "%c requires int or char");
|
---|
7431 | return -1;
|
---|
7432 | }
|
---|
7433 |
|
---|
7434 | /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
|
---|
7435 |
|
---|
7436 | FORMATBUFLEN is the length of the buffer in which the floats, ints, &
|
---|
7437 | chars are formatted. XXX This is a magic number. Each formatting
|
---|
7438 | routine does bounds checking to ensure no overflow, but a better
|
---|
7439 | solution may be to malloc a buffer of appropriate size for each
|
---|
7440 | format. For now, the current solution is sufficient.
|
---|
7441 | */
|
---|
7442 | #define FORMATBUFLEN (size_t)120
|
---|
7443 |
|
---|
7444 | PyObject *PyUnicode_Format(PyObject *format,
|
---|
7445 | PyObject *args)
|
---|
7446 | {
|
---|
7447 | Py_UNICODE *fmt, *res;
|
---|
7448 | Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
|
---|
7449 | int args_owned = 0;
|
---|
7450 | PyUnicodeObject *result = NULL;
|
---|
7451 | PyObject *dict = NULL;
|
---|
7452 | PyObject *uformat;
|
---|
7453 |
|
---|
7454 | if (format == NULL || args == NULL) {
|
---|
7455 | PyErr_BadInternalCall();
|
---|
7456 | return NULL;
|
---|
7457 | }
|
---|
7458 | uformat = PyUnicode_FromObject(format);
|
---|
7459 | if (uformat == NULL)
|
---|
7460 | return NULL;
|
---|
7461 | fmt = PyUnicode_AS_UNICODE(uformat);
|
---|
7462 | fmtcnt = PyUnicode_GET_SIZE(uformat);
|
---|
7463 |
|
---|
7464 | reslen = rescnt = fmtcnt + 100;
|
---|
7465 | result = _PyUnicode_New(reslen);
|
---|
7466 | if (result == NULL)
|
---|
7467 | goto onError;
|
---|
7468 | res = PyUnicode_AS_UNICODE(result);
|
---|
7469 |
|
---|
7470 | if (PyTuple_Check(args)) {
|
---|
7471 | arglen = PyTuple_Size(args);
|
---|
7472 | argidx = 0;
|
---|
7473 | }
|
---|
7474 | else {
|
---|
7475 | arglen = -1;
|
---|
7476 | argidx = -2;
|
---|
7477 | }
|
---|
7478 | if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
|
---|
7479 | !PyObject_TypeCheck(args, &PyBaseString_Type))
|
---|
7480 | dict = args;
|
---|
7481 |
|
---|
7482 | while (--fmtcnt >= 0) {
|
---|
7483 | if (*fmt != '%') {
|
---|
7484 | if (--rescnt < 0) {
|
---|
7485 | rescnt = fmtcnt + 100;
|
---|
7486 | reslen += rescnt;
|
---|
7487 | if (_PyUnicode_Resize(&result, reslen) < 0)
|
---|
7488 | goto onError;
|
---|
7489 | res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
|
---|
7490 | --rescnt;
|
---|
7491 | }
|
---|
7492 | *res++ = *fmt++;
|
---|
7493 | }
|
---|
7494 | else {
|
---|
7495 | /* Got a format specifier */
|
---|
7496 | int flags = 0;
|
---|
7497 | Py_ssize_t width = -1;
|
---|
7498 | int prec = -1;
|
---|
7499 | Py_UNICODE c = '\0';
|
---|
7500 | Py_UNICODE fill;
|
---|
7501 | PyObject *v = NULL;
|
---|
7502 | PyObject *temp = NULL;
|
---|
7503 | Py_UNICODE *pbuf;
|
---|
7504 | Py_UNICODE sign;
|
---|
7505 | Py_ssize_t len;
|
---|
7506 | Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
|
---|
7507 |
|
---|
7508 | fmt++;
|
---|
7509 | if (*fmt == '(') {
|
---|
7510 | Py_UNICODE *keystart;
|
---|
7511 | Py_ssize_t keylen;
|
---|
7512 | PyObject *key;
|
---|
7513 | int pcount = 1;
|
---|
7514 |
|
---|
7515 | if (dict == NULL) {
|
---|
7516 | PyErr_SetString(PyExc_TypeError,
|
---|
7517 | "format requires a mapping");
|
---|
7518 | goto onError;
|
---|
7519 | }
|
---|
7520 | ++fmt;
|
---|
7521 | --fmtcnt;
|
---|
7522 | keystart = fmt;
|
---|
7523 | /* Skip over balanced parentheses */
|
---|
7524 | while (pcount > 0 && --fmtcnt >= 0) {
|
---|
7525 | if (*fmt == ')')
|
---|
7526 | --pcount;
|
---|
7527 | else if (*fmt == '(')
|
---|
7528 | ++pcount;
|
---|
7529 | fmt++;
|
---|
7530 | }
|
---|
7531 | keylen = fmt - keystart - 1;
|
---|
7532 | if (fmtcnt < 0 || pcount > 0) {
|
---|
7533 | PyErr_SetString(PyExc_ValueError,
|
---|
7534 | "incomplete format key");
|
---|
7535 | goto onError;
|
---|
7536 | }
|
---|
7537 | #if 0
|
---|
7538 | /* keys are converted to strings using UTF-8 and
|
---|
7539 | then looked up since Python uses strings to hold
|
---|
7540 | variables names etc. in its namespaces and we
|
---|
7541 | wouldn't want to break common idioms. */
|
---|
7542 | key = PyUnicode_EncodeUTF8(keystart,
|
---|
7543 | keylen,
|
---|
7544 | NULL);
|
---|
7545 | #else
|
---|
7546 | key = PyUnicode_FromUnicode(keystart, keylen);
|
---|
7547 | #endif
|
---|
7548 | if (key == NULL)
|
---|
7549 | goto onError;
|
---|
7550 | if (args_owned) {
|
---|
7551 | Py_DECREF(args);
|
---|
7552 | args_owned = 0;
|
---|
7553 | }
|
---|
7554 | args = PyObject_GetItem(dict, key);
|
---|
7555 | Py_DECREF(key);
|
---|
7556 | if (args == NULL) {
|
---|
7557 | goto onError;
|
---|
7558 | }
|
---|
7559 | args_owned = 1;
|
---|
7560 | arglen = -1;
|
---|
7561 | argidx = -2;
|
---|
7562 | }
|
---|
7563 | while (--fmtcnt >= 0) {
|
---|
7564 | switch (c = *fmt++) {
|
---|
7565 | case '-': flags |= F_LJUST; continue;
|
---|
7566 | case '+': flags |= F_SIGN; continue;
|
---|
7567 | case ' ': flags |= F_BLANK; continue;
|
---|
7568 | case '#': flags |= F_ALT; continue;
|
---|
7569 | case '0': flags |= F_ZERO; continue;
|
---|
7570 | }
|
---|
7571 | break;
|
---|
7572 | }
|
---|
7573 | if (c == '*') {
|
---|
7574 | v = getnextarg(args, arglen, &argidx);
|
---|
7575 | if (v == NULL)
|
---|
7576 | goto onError;
|
---|
7577 | if (!PyInt_Check(v)) {
|
---|
7578 | PyErr_SetString(PyExc_TypeError,
|
---|
7579 | "* wants int");
|
---|
7580 | goto onError;
|
---|
7581 | }
|
---|
7582 | width = PyInt_AsLong(v);
|
---|
7583 | if (width < 0) {
|
---|
7584 | flags |= F_LJUST;
|
---|
7585 | width = -width;
|
---|
7586 | }
|
---|
7587 | if (--fmtcnt >= 0)
|
---|
7588 | c = *fmt++;
|
---|
7589 | }
|
---|
7590 | else if (c >= '0' && c <= '9') {
|
---|
7591 | width = c - '0';
|
---|
7592 | while (--fmtcnt >= 0) {
|
---|
7593 | c = *fmt++;
|
---|
7594 | if (c < '0' || c > '9')
|
---|
7595 | break;
|
---|
7596 | if ((width*10) / 10 != width) {
|
---|
7597 | PyErr_SetString(PyExc_ValueError,
|
---|
7598 | "width too big");
|
---|
7599 | goto onError;
|
---|
7600 | }
|
---|
7601 | width = width*10 + (c - '0');
|
---|
7602 | }
|
---|
7603 | }
|
---|
7604 | if (c == '.') {
|
---|
7605 | prec = 0;
|
---|
7606 | if (--fmtcnt >= 0)
|
---|
7607 | c = *fmt++;
|
---|
7608 | if (c == '*') {
|
---|
7609 | v = getnextarg(args, arglen, &argidx);
|
---|
7610 | if (v == NULL)
|
---|
7611 | goto onError;
|
---|
7612 | if (!PyInt_Check(v)) {
|
---|
7613 | PyErr_SetString(PyExc_TypeError,
|
---|
7614 | "* wants int");
|
---|
7615 | goto onError;
|
---|
7616 | }
|
---|
7617 | prec = PyInt_AsLong(v);
|
---|
7618 | if (prec < 0)
|
---|
7619 | prec = 0;
|
---|
7620 | if (--fmtcnt >= 0)
|
---|
7621 | c = *fmt++;
|
---|
7622 | }
|
---|
7623 | else if (c >= '0' && c <= '9') {
|
---|
7624 | prec = c - '0';
|
---|
7625 | while (--fmtcnt >= 0) {
|
---|
7626 | c = Py_CHARMASK(*fmt++);
|
---|
7627 | if (c < '0' || c > '9')
|
---|
7628 | break;
|
---|
7629 | if ((prec*10) / 10 != prec) {
|
---|
7630 | PyErr_SetString(PyExc_ValueError,
|
---|
7631 | "prec too big");
|
---|
7632 | goto onError;
|
---|
7633 | }
|
---|
7634 | prec = prec*10 + (c - '0');
|
---|
7635 | }
|
---|
7636 | }
|
---|
7637 | } /* prec */
|
---|
7638 | if (fmtcnt >= 0) {
|
---|
7639 | if (c == 'h' || c == 'l' || c == 'L') {
|
---|
7640 | if (--fmtcnt >= 0)
|
---|
7641 | c = *fmt++;
|
---|
7642 | }
|
---|
7643 | }
|
---|
7644 | if (fmtcnt < 0) {
|
---|
7645 | PyErr_SetString(PyExc_ValueError,
|
---|
7646 | "incomplete format");
|
---|
7647 | goto onError;
|
---|
7648 | }
|
---|
7649 | if (c != '%') {
|
---|
7650 | v = getnextarg(args, arglen, &argidx);
|
---|
7651 | if (v == NULL)
|
---|
7652 | goto onError;
|
---|
7653 | }
|
---|
7654 | sign = 0;
|
---|
7655 | fill = ' ';
|
---|
7656 | switch (c) {
|
---|
7657 |
|
---|
7658 | case '%':
|
---|
7659 | pbuf = formatbuf;
|
---|
7660 | /* presume that buffer length is at least 1 */
|
---|
7661 | pbuf[0] = '%';
|
---|
7662 | len = 1;
|
---|
7663 | break;
|
---|
7664 |
|
---|
7665 | case 's':
|
---|
7666 | case 'r':
|
---|
7667 | if (PyUnicode_Check(v) && c == 's') {
|
---|
7668 | temp = v;
|
---|
7669 | Py_INCREF(temp);
|
---|
7670 | }
|
---|
7671 | else {
|
---|
7672 | PyObject *unicode;
|
---|
7673 | if (c == 's')
|
---|
7674 | temp = PyObject_Unicode(v);
|
---|
7675 | else
|
---|
7676 | temp = PyObject_Repr(v);
|
---|
7677 | if (temp == NULL)
|
---|
7678 | goto onError;
|
---|
7679 | if (PyUnicode_Check(temp))
|
---|
7680 | /* nothing to do */;
|
---|
7681 | else if (PyString_Check(temp)) {
|
---|
7682 | /* convert to string to Unicode */
|
---|
7683 | unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
|
---|
7684 | PyString_GET_SIZE(temp),
|
---|
7685 | NULL,
|
---|
7686 | "strict");
|
---|
7687 | Py_DECREF(temp);
|
---|
7688 | temp = unicode;
|
---|
7689 | if (temp == NULL)
|
---|
7690 | goto onError;
|
---|
7691 | }
|
---|
7692 | else {
|
---|
7693 | Py_DECREF(temp);
|
---|
7694 | PyErr_SetString(PyExc_TypeError,
|
---|
7695 | "%s argument has non-string str()");
|
---|
7696 | goto onError;
|
---|
7697 | }
|
---|
7698 | }
|
---|
7699 | pbuf = PyUnicode_AS_UNICODE(temp);
|
---|
7700 | len = PyUnicode_GET_SIZE(temp);
|
---|
7701 | if (prec >= 0 && len > prec)
|
---|
7702 | len = prec;
|
---|
7703 | break;
|
---|
7704 |
|
---|
7705 | case 'i':
|
---|
7706 | case 'd':
|
---|
7707 | case 'u':
|
---|
7708 | case 'o':
|
---|
7709 | case 'x':
|
---|
7710 | case 'X':
|
---|
7711 | if (c == 'i')
|
---|
7712 | c = 'd';
|
---|
7713 | if (PyLong_Check(v)) {
|
---|
7714 | temp = formatlong(v, flags, prec, c);
|
---|
7715 | if (!temp)
|
---|
7716 | goto onError;
|
---|
7717 | pbuf = PyUnicode_AS_UNICODE(temp);
|
---|
7718 | len = PyUnicode_GET_SIZE(temp);
|
---|
7719 | sign = 1;
|
---|
7720 | }
|
---|
7721 | else {
|
---|
7722 | pbuf = formatbuf;
|
---|
7723 | len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
|
---|
7724 | flags, prec, c, v);
|
---|
7725 | if (len < 0)
|
---|
7726 | goto onError;
|
---|
7727 | sign = 1;
|
---|
7728 | }
|
---|
7729 | if (flags & F_ZERO)
|
---|
7730 | fill = '0';
|
---|
7731 | break;
|
---|
7732 |
|
---|
7733 | case 'e':
|
---|
7734 | case 'E':
|
---|
7735 | case 'f':
|
---|
7736 | case 'F':
|
---|
7737 | case 'g':
|
---|
7738 | case 'G':
|
---|
7739 | if (c == 'F')
|
---|
7740 | c = 'f';
|
---|
7741 | pbuf = formatbuf;
|
---|
7742 | len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
|
---|
7743 | flags, prec, c, v);
|
---|
7744 | if (len < 0)
|
---|
7745 | goto onError;
|
---|
7746 | sign = 1;
|
---|
7747 | if (flags & F_ZERO)
|
---|
7748 | fill = '0';
|
---|
7749 | break;
|
---|
7750 |
|
---|
7751 | case 'c':
|
---|
7752 | pbuf = formatbuf;
|
---|
7753 | len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
|
---|
7754 | if (len < 0)
|
---|
7755 | goto onError;
|
---|
7756 | break;
|
---|
7757 |
|
---|
7758 | default:
|
---|
7759 | PyErr_Format(PyExc_ValueError,
|
---|
7760 | "unsupported format character '%c' (0x%x) "
|
---|
7761 | "at index %i",
|
---|
7762 | (31<=c && c<=126) ? (char)c : '?',
|
---|
7763 | (int)c,
|
---|
7764 | (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
|
---|
7765 | goto onError;
|
---|
7766 | }
|
---|
7767 | if (sign) {
|
---|
7768 | if (*pbuf == '-' || *pbuf == '+') {
|
---|
7769 | sign = *pbuf++;
|
---|
7770 | len--;
|
---|
7771 | }
|
---|
7772 | else if (flags & F_SIGN)
|
---|
7773 | sign = '+';
|
---|
7774 | else if (flags & F_BLANK)
|
---|
7775 | sign = ' ';
|
---|
7776 | else
|
---|
7777 | sign = 0;
|
---|
7778 | }
|
---|
7779 | if (width < len)
|
---|
7780 | width = len;
|
---|
7781 | if (rescnt - (sign != 0) < width) {
|
---|
7782 | reslen -= rescnt;
|
---|
7783 | rescnt = width + fmtcnt + 100;
|
---|
7784 | reslen += rescnt;
|
---|
7785 | if (reslen < 0) {
|
---|
7786 | Py_XDECREF(temp);
|
---|
7787 | PyErr_NoMemory();
|
---|
7788 | goto onError;
|
---|
7789 | }
|
---|
7790 | if (_PyUnicode_Resize(&result, reslen) < 0) {
|
---|
7791 | Py_XDECREF(temp);
|
---|
7792 | goto onError;
|
---|
7793 | }
|
---|
7794 | res = PyUnicode_AS_UNICODE(result)
|
---|
7795 | + reslen - rescnt;
|
---|
7796 | }
|
---|
7797 | if (sign) {
|
---|
7798 | if (fill != ' ')
|
---|
7799 | *res++ = sign;
|
---|
7800 | rescnt--;
|
---|
7801 | if (width > len)
|
---|
7802 | width--;
|
---|
7803 | }
|
---|
7804 | if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
|
---|
7805 | assert(pbuf[0] == '0');
|
---|
7806 | assert(pbuf[1] == c);
|
---|
7807 | if (fill != ' ') {
|
---|
7808 | *res++ = *pbuf++;
|
---|
7809 | *res++ = *pbuf++;
|
---|
7810 | }
|
---|
7811 | rescnt -= 2;
|
---|
7812 | width -= 2;
|
---|
7813 | if (width < 0)
|
---|
7814 | width = 0;
|
---|
7815 | len -= 2;
|
---|
7816 | }
|
---|
7817 | if (width > len && !(flags & F_LJUST)) {
|
---|
7818 | do {
|
---|
7819 | --rescnt;
|
---|
7820 | *res++ = fill;
|
---|
7821 | } while (--width > len);
|
---|
7822 | }
|
---|
7823 | if (fill == ' ') {
|
---|
7824 | if (sign)
|
---|
7825 | *res++ = sign;
|
---|
7826 | if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
|
---|
7827 | assert(pbuf[0] == '0');
|
---|
7828 | assert(pbuf[1] == c);
|
---|
7829 | *res++ = *pbuf++;
|
---|
7830 | *res++ = *pbuf++;
|
---|
7831 | }
|
---|
7832 | }
|
---|
7833 | Py_UNICODE_COPY(res, pbuf, len);
|
---|
7834 | res += len;
|
---|
7835 | rescnt -= len;
|
---|
7836 | while (--width >= len) {
|
---|
7837 | --rescnt;
|
---|
7838 | *res++ = ' ';
|
---|
7839 | }
|
---|
7840 | if (dict && (argidx < arglen) && c != '%') {
|
---|
7841 | PyErr_SetString(PyExc_TypeError,
|
---|
7842 | "not all arguments converted during string formatting");
|
---|
7843 | Py_XDECREF(temp);
|
---|
7844 | goto onError;
|
---|
7845 | }
|
---|
7846 | Py_XDECREF(temp);
|
---|
7847 | } /* '%' */
|
---|
7848 | } /* until end */
|
---|
7849 | if (argidx < arglen && !dict) {
|
---|
7850 | PyErr_SetString(PyExc_TypeError,
|
---|
7851 | "not all arguments converted during string formatting");
|
---|
7852 | goto onError;
|
---|
7853 | }
|
---|
7854 |
|
---|
7855 | if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
|
---|
7856 | goto onError;
|
---|
7857 | if (args_owned) {
|
---|
7858 | Py_DECREF(args);
|
---|
7859 | }
|
---|
7860 | Py_DECREF(uformat);
|
---|
7861 | return (PyObject *)result;
|
---|
7862 |
|
---|
7863 | onError:
|
---|
7864 | Py_XDECREF(result);
|
---|
7865 | Py_DECREF(uformat);
|
---|
7866 | if (args_owned) {
|
---|
7867 | Py_DECREF(args);
|
---|
7868 | }
|
---|
7869 | return NULL;
|
---|
7870 | }
|
---|
7871 |
|
---|
7872 | static PyBufferProcs unicode_as_buffer = {
|
---|
7873 | (readbufferproc) unicode_buffer_getreadbuf,
|
---|
7874 | (writebufferproc) unicode_buffer_getwritebuf,
|
---|
7875 | (segcountproc) unicode_buffer_getsegcount,
|
---|
7876 | (charbufferproc) unicode_buffer_getcharbuf,
|
---|
7877 | };
|
---|
7878 |
|
---|
7879 | static PyObject *
|
---|
7880 | unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
|
---|
7881 |
|
---|
7882 | static PyObject *
|
---|
7883 | unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
---|
7884 | {
|
---|
7885 | PyObject *x = NULL;
|
---|
7886 | static char *kwlist[] = {"string", "encoding", "errors", 0};
|
---|
7887 | char *encoding = NULL;
|
---|
7888 | char *errors = NULL;
|
---|
7889 |
|
---|
7890 | if (type != &PyUnicode_Type)
|
---|
7891 | return unicode_subtype_new(type, args, kwds);
|
---|
7892 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
|
---|
7893 | kwlist, &x, &encoding, &errors))
|
---|
7894 | return NULL;
|
---|
7895 | if (x == NULL)
|
---|
7896 | return (PyObject *)_PyUnicode_New(0);
|
---|
7897 | if (encoding == NULL && errors == NULL)
|
---|
7898 | return PyObject_Unicode(x);
|
---|
7899 | else
|
---|
7900 | return PyUnicode_FromEncodedObject(x, encoding, errors);
|
---|
7901 | }
|
---|
7902 |
|
---|
7903 | static PyObject *
|
---|
7904 | unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
---|
7905 | {
|
---|
7906 | PyUnicodeObject *tmp, *pnew;
|
---|
7907 | Py_ssize_t n;
|
---|
7908 |
|
---|
7909 | assert(PyType_IsSubtype(type, &PyUnicode_Type));
|
---|
7910 | tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
|
---|
7911 | if (tmp == NULL)
|
---|
7912 | return NULL;
|
---|
7913 | assert(PyUnicode_Check(tmp));
|
---|
7914 | pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
|
---|
7915 | if (pnew == NULL) {
|
---|
7916 | Py_DECREF(tmp);
|
---|
7917 | return NULL;
|
---|
7918 | }
|
---|
7919 | pnew->str = PyMem_NEW(Py_UNICODE, n+1);
|
---|
7920 | if (pnew->str == NULL) {
|
---|
7921 | _Py_ForgetReference((PyObject *)pnew);
|
---|
7922 | PyObject_Del(pnew);
|
---|
7923 | Py_DECREF(tmp);
|
---|
7924 | return PyErr_NoMemory();
|
---|
7925 | }
|
---|
7926 | Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
|
---|
7927 | pnew->length = n;
|
---|
7928 | pnew->hash = tmp->hash;
|
---|
7929 | Py_DECREF(tmp);
|
---|
7930 | return (PyObject *)pnew;
|
---|
7931 | }
|
---|
7932 |
|
---|
7933 | PyDoc_STRVAR(unicode_doc,
|
---|
7934 | "unicode(string [, encoding[, errors]]) -> object\n\
|
---|
7935 | \n\
|
---|
7936 | Create a new Unicode object from the given encoded string.\n\
|
---|
7937 | encoding defaults to the current default string encoding.\n\
|
---|
7938 | errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
|
---|
7939 |
|
---|
7940 | PyTypeObject PyUnicode_Type = {
|
---|
7941 | PyObject_HEAD_INIT(&PyType_Type)
|
---|
7942 | 0, /* ob_size */
|
---|
7943 | "unicode", /* tp_name */
|
---|
7944 | sizeof(PyUnicodeObject), /* tp_size */
|
---|
7945 | 0, /* tp_itemsize */
|
---|
7946 | /* Slots */
|
---|
7947 | (destructor)unicode_dealloc, /* tp_dealloc */
|
---|
7948 | 0, /* tp_print */
|
---|
7949 | 0, /* tp_getattr */
|
---|
7950 | 0, /* tp_setattr */
|
---|
7951 | 0, /* tp_compare */
|
---|
7952 | unicode_repr, /* tp_repr */
|
---|
7953 | &unicode_as_number, /* tp_as_number */
|
---|
7954 | &unicode_as_sequence, /* tp_as_sequence */
|
---|
7955 | &unicode_as_mapping, /* tp_as_mapping */
|
---|
7956 | (hashfunc) unicode_hash, /* tp_hash*/
|
---|
7957 | 0, /* tp_call*/
|
---|
7958 | (reprfunc) unicode_str, /* tp_str */
|
---|
7959 | PyObject_GenericGetAttr, /* tp_getattro */
|
---|
7960 | 0, /* tp_setattro */
|
---|
7961 | &unicode_as_buffer, /* tp_as_buffer */
|
---|
7962 | Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
|
---|
7963 | Py_TPFLAGS_BASETYPE, /* tp_flags */
|
---|
7964 | unicode_doc, /* tp_doc */
|
---|
7965 | 0, /* tp_traverse */
|
---|
7966 | 0, /* tp_clear */
|
---|
7967 | PyUnicode_RichCompare, /* tp_richcompare */
|
---|
7968 | 0, /* tp_weaklistoffset */
|
---|
7969 | 0, /* tp_iter */
|
---|
7970 | 0, /* tp_iternext */
|
---|
7971 | unicode_methods, /* tp_methods */
|
---|
7972 | 0, /* tp_members */
|
---|
7973 | 0, /* tp_getset */
|
---|
7974 | &PyBaseString_Type, /* tp_base */
|
---|
7975 | 0, /* tp_dict */
|
---|
7976 | 0, /* tp_descr_get */
|
---|
7977 | 0, /* tp_descr_set */
|
---|
7978 | 0, /* tp_dictoffset */
|
---|
7979 | 0, /* tp_init */
|
---|
7980 | 0, /* tp_alloc */
|
---|
7981 | unicode_new, /* tp_new */
|
---|
7982 | PyObject_Del, /* tp_free */
|
---|
7983 | };
|
---|
7984 |
|
---|
7985 | /* Initialize the Unicode implementation */
|
---|
7986 |
|
---|
7987 | void _PyUnicode_Init(void)
|
---|
7988 | {
|
---|
7989 | int i;
|
---|
7990 |
|
---|
7991 | /* XXX - move this array to unicodectype.c ? */
|
---|
7992 | Py_UNICODE linebreak[] = {
|
---|
7993 | 0x000A, /* LINE FEED */
|
---|
7994 | 0x000D, /* CARRIAGE RETURN */
|
---|
7995 | 0x001C, /* FILE SEPARATOR */
|
---|
7996 | 0x001D, /* GROUP SEPARATOR */
|
---|
7997 | 0x001E, /* RECORD SEPARATOR */
|
---|
7998 | 0x0085, /* NEXT LINE */
|
---|
7999 | 0x2028, /* LINE SEPARATOR */
|
---|
8000 | 0x2029, /* PARAGRAPH SEPARATOR */
|
---|
8001 | };
|
---|
8002 |
|
---|
8003 | /* Init the implementation */
|
---|
8004 | unicode_freelist = NULL;
|
---|
8005 | unicode_freelist_size = 0;
|
---|
8006 | unicode_empty = _PyUnicode_New(0);
|
---|
8007 | if (!unicode_empty)
|
---|
8008 | return;
|
---|
8009 |
|
---|
8010 | strcpy(unicode_default_encoding, "ascii");
|
---|
8011 | for (i = 0; i < 256; i++)
|
---|
8012 | unicode_latin1[i] = NULL;
|
---|
8013 | if (PyType_Ready(&PyUnicode_Type) < 0)
|
---|
8014 | Py_FatalError("Can't initialize 'unicode'");
|
---|
8015 |
|
---|
8016 | /* initialize the linebreak bloom filter */
|
---|
8017 | bloom_linebreak = make_bloom_mask(
|
---|
8018 | linebreak, sizeof(linebreak) / sizeof(linebreak[0])
|
---|
8019 | );
|
---|
8020 |
|
---|
8021 | PyType_Ready(&EncodingMapType);
|
---|
8022 | }
|
---|
8023 |
|
---|
8024 | /* Finalize the Unicode implementation */
|
---|
8025 |
|
---|
8026 | void
|
---|
8027 | _PyUnicode_Fini(void)
|
---|
8028 | {
|
---|
8029 | PyUnicodeObject *u;
|
---|
8030 | int i;
|
---|
8031 |
|
---|
8032 | Py_XDECREF(unicode_empty);
|
---|
8033 | unicode_empty = NULL;
|
---|
8034 |
|
---|
8035 | for (i = 0; i < 256; i++) {
|
---|
8036 | if (unicode_latin1[i]) {
|
---|
8037 | Py_DECREF(unicode_latin1[i]);
|
---|
8038 | unicode_latin1[i] = NULL;
|
---|
8039 | }
|
---|
8040 | }
|
---|
8041 |
|
---|
8042 | for (u = unicode_freelist; u != NULL;) {
|
---|
8043 | PyUnicodeObject *v = u;
|
---|
8044 | u = *(PyUnicodeObject **)u;
|
---|
8045 | if (v->str)
|
---|
8046 | PyMem_DEL(v->str);
|
---|
8047 | Py_XDECREF(v->defenc);
|
---|
8048 | PyObject_Del(v);
|
---|
8049 | }
|
---|
8050 | unicode_freelist = NULL;
|
---|
8051 | unicode_freelist_size = 0;
|
---|
8052 | }
|
---|
8053 |
|
---|
8054 | #ifdef __cplusplus
|
---|
8055 | }
|
---|
8056 | #endif
|
---|
8057 |
|
---|
8058 |
|
---|
8059 | /*
|
---|
8060 | Local variables:
|
---|
8061 | c-basic-offset: 4
|
---|
8062 | indent-tabs-mode: nil
|
---|
8063 | End:
|
---|
8064 | */
|
---|