source: vendor/python/2.5/Objects/unicodeobject.c

Last change on this file was 3225, checked in by bird, 18 years ago

Python 2.5

File size: 210.3 KB
Line 
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#ifdef MS_WINDOWS
49#include <windows.h>
50#endif
51
/* Maximum number of Unicode objects that may sit on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects that go onto the free list keep their character buffer
   allocated as long as their length is below this limit, which saves
   a malloc() round trip for small Unicode objects.

   The worst case cost is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches: exactly one of the two macros gets defined,
   little endian being the default. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
83/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
89
90
91#ifdef __cplusplus
92extern "C" {
93#endif
94
95/* Free list for Unicode objects */
96static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
98
99/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
111
112*/
113static char unicode_default_encoding[100];
114
115Py_UNICODE
116PyUnicode_GetMax(void)
117{
118#ifdef Py_UNICODE_WIDE
119 return 0x10FFFF;
120#else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124#endif
125}
126
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is an unsigned long so that bit 31 can be
   selected without shifting into the sign bit of an int, and the mask
   argument is parenthesized so that compound expressions may be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-check against bloom_linebreak, followed by the exact
   Py_UNICODE_ISLINEBREAK() test only when the pre-check passes. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
145{
146 /* calculate simple bloom-style bitmask for a given unicode string */
147
148 long mask;
149 Py_ssize_t i;
150
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
154
155 return mask;
156}
157
158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
159{
160 Py_ssize_t i;
161
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
165
166 return 0;
167}
168
/* Membership test with a bloom pre-check: the exact (linear)
   unicode_member() scan only runs when the mask says chr may be present.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
172/* --- Unicode Object ----------------------------------------------------- */
173
174static
175int unicode_resize(register PyUnicodeObject *unicode,
176 Py_ssize_t length)
177{
178 void *oldstr;
179
180 /* Shortcut if there's nothing much to do. */
181 if (unicode->length == length)
182 goto reset;
183
184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
187
188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
191 unicode_latin1[unicode->str[0]] == unicode)) {
192 PyErr_SetString(PyExc_SystemError,
193 "can't resize shared unicode objects");
194 return -1;
195 }
196
197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
200 it contains). */
201
202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
205 unicode->str = (Py_UNICODE *)oldstr;
206 PyErr_NoMemory();
207 return -1;
208 }
209 unicode->str[length] = 0;
210 unicode->length = length;
211
212 reset:
213 /* Reset the object caches */
214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
217 }
218 unicode->hash = -1;
219
220 return 0;
221}
222
223/* We allocate one more byte to make sure the string is
224 Ux0000 terminated -- XXX is this needed ?
225
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
228
229*/
230
231static
232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
233{
234 register PyUnicodeObject *unicode;
235
236 /* Optimization for empty strings */
237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
240 }
241
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
245 unicode_freelist = *(PyUnicodeObject **)unicode;
246 unicode_freelist_size--;
247 if (unicode->str) {
248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
250 if ((unicode->length < length) &&
251 unicode_resize(unicode, length) < 0) {
252 PyMem_DEL(unicode->str);
253 goto onError;
254 }
255 }
256 else {
257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
258 }
259 PyObject_INIT(unicode, &PyUnicode_Type);
260 }
261 else {
262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266 }
267
268 if (!unicode->str) {
269 PyErr_NoMemory();
270 goto onError;
271 }
272 /* Initialize the first element to guard against cases where
273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
278 */
279 unicode->str[0] = 0;
280 unicode->str[length] = 0;
281 unicode->length = length;
282 unicode->hash = -1;
283 unicode->defenc = NULL;
284 return unicode;
285
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
288 PyObject_Del(unicode);
289 return NULL;
290}
291
292static
293void unicode_dealloc(register PyUnicodeObject *unicode)
294{
295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
299 PyMem_DEL(unicode->str);
300 unicode->str = NULL;
301 unicode->length = 0;
302 }
303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
306 }
307 /* Add to free list */
308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
311 }
312 else {
313 PyMem_DEL(unicode->str);
314 Py_XDECREF(unicode->defenc);
315 unicode->ob_type->tp_free((PyObject *)unicode);
316 }
317}
318
319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
320{
321 register PyUnicodeObject *v;
322
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
327 }
328 v = (PyUnicodeObject *)*unicode;
329 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
330 PyErr_BadInternalCall();
331 return -1;
332 }
333
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
337 if (v->length != length &&
338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
344 Py_DECREF(*unicode);
345 *unicode = (PyObject *)w;
346 return 0;
347 }
348
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !
   Same as PyUnicode_Resize(), but casts the address of a variable of
   any object pointer type to PyObject ** for the caller. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
357
358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
359 Py_ssize_t size)
360{
361 PyUnicodeObject *unicode;
362
363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
366
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
371 }
372
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
379 if (!unicode)
380 return NULL;
381 unicode->str[0] = *u;
382 unicode_latin1[*u] = unicode;
383 }
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
386 }
387 }
388
389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
392
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
395 Py_UNICODE_COPY(unicode->str, u, size);
396
397 return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
403 Py_ssize_t size)
404{
405 PyUnicodeObject *unicode;
406
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
411
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
415
416 /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
419#else
420 {
421 register Py_UNICODE *u;
422 register Py_ssize_t i;
423 u = PyUnicode_AS_UNICODE(unicode);
424 for (i = size; i > 0; i--)
425 *u++ = *w++;
426 }
427#endif
428
429 return (PyObject *)unicode;
430}
431
432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
435{
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
439 }
440
441 /* If possible, try to copy the 0-termination as well */
442 if (size > PyUnicode_GET_SIZE(unicode))
443 size = PyUnicode_GET_SIZE(unicode) + 1;
444
445#ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448 {
449 register Py_UNICODE *u;
450 register Py_ssize_t i;
451 u = PyUnicode_AS_UNICODE(unicode);
452 for (i = size; i > 0; i--)
453 *w++ = *u++;
454 }
455#endif
456
457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
460 return size;
461}
462
463#endif
464
465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
467 Py_UNICODE s[1];
468
469#ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
475 }
476#else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
482 }
483#endif
484
485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
487}
488
489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
496 }
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
502 }
503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
509{
510 const char *s = NULL;
511 Py_ssize_t len;
512 PyObject *v;
513
514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
517 }
518
519#if 0
520 /* For b/w compatibility we also accept Unicode objects provided
521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
524
525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
527
528 */
529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
533 return NULL;
534 }
535 return PyObject_Unicode(obj);
536 }
537#else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
542 }
543#endif
544
545 /* Coerce object */
546 if (PyString_Check(obj)) {
547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
549 }
550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
554 PyErr_Format(PyExc_TypeError,
555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
557 obj->ob_type->tp_name);
558 goto onError;
559 }
560
561 /* Convert to Unicode */
562 if (len == 0) {
563 Py_INCREF(unicode_empty);
564 v = (PyObject *)unicode_empty;
565 }
566 else
567 v = PyUnicode_Decode(s, len, encoding, errors);
568
569 return v;
570
571 onError:
572 return NULL;
573}
574
575PyObject *PyUnicode_Decode(const char *s,
576 Py_ssize_t size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *buffer = NULL, *unicode;
581
582 if (encoding == NULL)
583 encoding = PyUnicode_GetDefaultEncoding();
584
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
587 return PyUnicode_DecodeUTF8(s, size, errors);
588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
596
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
606 "decoder did not return an unicode object (type=%.400s)",
607 unicode->ob_type->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
610 }
611 Py_DECREF(buffer);
612 return unicode;
613
614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
617}
618
619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
622{
623 PyObject *v;
624
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
628 }
629
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
632
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
638
639 onError:
640 return NULL;
641}
642
643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
644 Py_ssize_t size,
645 const char *encoding,
646 const char *errors)
647{
648 PyObject *v, *unicode;
649
650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
656}
657
658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
661{
662 PyObject *v;
663
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
667 }
668
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
671
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
677
678 onError:
679 return NULL;
680}
681
682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
685{
686 PyObject *v;
687
688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
691 }
692
693 if (encoding == NULL)
694 encoding = PyUnicode_GetDefaultEncoding();
695
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
699 return PyUnicode_AsUTF8String(unicode);
700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705#endif
706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
708 }
709
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
716 "encoder did not return a string object (type=%.400s)",
717 v->ob_type->tp_name);
718 Py_DECREF(v);
719 goto onError;
720 }
721 return v;
722
723 onError:
724 return NULL;
725}
726
727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
729{
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
738}
739
740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
745 }
746 return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749 return NULL;
750}
751
752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
753{
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
757 }
758 return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761 return -1;
762}
763
764const char *PyUnicode_GetDefaultEncoding(void)
765{
766 return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771 PyObject *v;
772
773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
780 encoding,
781 sizeof(unicode_default_encoding));
782 return 0;
783
784 onError:
785 return -1;
786}
787
788/* error handling callback helper:
789 build arguments, call the callback and check the arguments,
790 if no exception occurred, copy the replacement to the output
791 and adjust various state variables.
792 return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
800{
801 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
802
803 PyObject *restuple = NULL;
804 PyObject *repunicode = NULL;
805 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806 Py_ssize_t requiredsize;
807 Py_ssize_t newpos;
808 Py_UNICODE *repptr;
809 Py_ssize_t repsize;
810 int res = -1;
811
812 if (*errorHandler == NULL) {
813 *errorHandler = PyCodec_LookupError(errors);
814 if (*errorHandler == NULL)
815 goto onError;
816 }
817
818 if (*exceptionObject == NULL) {
819 *exceptionObject = PyUnicodeDecodeError_Create(
820 encoding, input, insize, *startinpos, *endinpos, reason);
821 if (*exceptionObject == NULL)
822 goto onError;
823 }
824 else {
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826 goto onError;
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828 goto onError;
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830 goto onError;
831 }
832
833 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834 if (restuple == NULL)
835 goto onError;
836 if (!PyTuple_Check(restuple)) {
837 PyErr_Format(PyExc_TypeError, &argparse[4]);
838 goto onError;
839 }
840 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841 goto onError;
842 if (newpos<0)
843 newpos = insize+newpos;
844 if (newpos<0 || newpos>insize) {
845 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
846 goto onError;
847 }
848
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr = PyUnicode_AS_UNICODE(repunicode);
854 repsize = PyUnicode_GET_SIZE(repunicode);
855 requiredsize = *outpos + repsize + insize-newpos;
856 if (requiredsize > outsize) {
857 if (requiredsize<2*outsize)
858 requiredsize = 2*outsize;
859 if (PyUnicode_Resize(output, requiredsize) < 0)
860 goto onError;
861 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862 }
863 *endinpos = newpos;
864 *inptr = input + newpos;
865 Py_UNICODE_COPY(*outptr, repptr, repsize);
866 *outptr += repsize;
867 *outpos += repsize;
868 /* we made it! */
869 res = 0;
870
871 onError:
872 Py_XDECREF(restuple);
873 return res;
874}
875
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification of the 128 ASCII code points for UTF-7, i.e. whether
   a character is special (cannot be directly encoded):
     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* SPECIAL(c, encodeO, encodeWS): true when character c cannot be
   written directly in UTF-7, taking the two optional classes from
   utf7_special into account.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet.  The test is
   spelled out with explicit ASCII ranges instead of isalnum(): c may be
   a Py_UNICODE outside the range isalnum() is defined for, and the
   result must not depend on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* UB64(c): the 6-bit value of base64 alphabet character c; only
   meaningful when B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE: emit base64 characters to out while at least 6 bits are
   pending in ch; bits is updated in place. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE: emit 16-bit characters to out while at least 16 bits are
   pending in ch; bits and surrogate are updated in place.  Relies on an
   `errmsg` variable and a `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
941
942PyObject *PyUnicode_DecodeUTF7(const char *s,
943 Py_ssize_t size,
944 const char *errors)
945{
946 const char *starts = s;
947 Py_ssize_t startinpos;
948 Py_ssize_t endinpos;
949 Py_ssize_t outpos;
950 const char *e;
951 PyUnicodeObject *unicode;
952 Py_UNICODE *p;
953 const char *errmsg = "";
954 int inShift = 0;
955 unsigned int bitsleft = 0;
956 unsigned long charsleft = 0;
957 int surrogate = 0;
958 PyObject *errorHandler = NULL;
959 PyObject *exc = NULL;
960
961 unicode = _PyUnicode_New(size);
962 if (!unicode)
963 return NULL;
964 if (size == 0)
965 return (PyObject *)unicode;
966
967 p = unicode->str;
968 e = s + size;
969
970 while (s < e) {
971 Py_UNICODE ch;
972 restart:
973 ch = *s;
974
975 if (inShift) {
976 if ((ch == '-') || !B64CHAR(ch)) {
977 inShift = 0;
978 s++;
979
980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981 if (bitsleft >= 6) {
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
985
986 errmsg = "partial character in shift sequence";
987 goto utf7Error;
988 }
989 /* According to RFC2152 the remaining bits should be zero. We
990 choose to signal an error/insert a replacement character
991 here so indicate the potential of a misencoded character. */
992
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995 errmsg = "non-zero padding bits in shift sequence";
996 goto utf7Error;
997 }
998
999 if (ch == '-') {
1000 if ((s < e) && (*(s) == '-')) {
1001 *p++ = '-';
1002 inShift = 1;
1003 }
1004 } else if (SPECIAL(ch,0,0)) {
1005 errmsg = "unexpected special character";
1006 goto utf7Error;
1007 } else {
1008 *p++ = ch;
1009 }
1010 } else {
1011 charsleft = (charsleft << 6) | UB64(ch);
1012 bitsleft += 6;
1013 s++;
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015 }
1016 }
1017 else if ( ch == '+' ) {
1018 startinpos = s-starts;
1019 s++;
1020 if (s < e && *s == '-') {
1021 s++;
1022 *p++ = '+';
1023 } else
1024 {
1025 inShift = 1;
1026 bitsleft = 0;
1027 }
1028 }
1029 else if (SPECIAL(ch,0,0)) {
1030 errmsg = "unexpected special character";
1031 s++;
1032 goto utf7Error;
1033 }
1034 else {
1035 *p++ = ch;
1036 s++;
1037 }
1038 continue;
1039 utf7Error:
1040 outpos = p-PyUnicode_AS_UNICODE(unicode);
1041 endinpos = s-starts;
1042 if (unicode_decode_call_errorhandler(
1043 errors, &errorHandler,
1044 "utf7", errmsg,
1045 starts, size, &startinpos, &endinpos, &exc, &s,
1046 (PyObject **)&unicode, &outpos, &p))
1047 goto onError;
1048 }
1049
1050 if (inShift) {
1051 outpos = p-PyUnicode_AS_UNICODE(unicode);
1052 endinpos = size;
1053 if (unicode_decode_call_errorhandler(
1054 errors, &errorHandler,
1055 "utf7", "unterminated shift sequence",
1056 starts, size, &startinpos, &endinpos, &exc, &s,
1057 (PyObject **)&unicode, &outpos, &p))
1058 goto onError;
1059 if (s < e)
1060 goto restart;
1061 }
1062
1063 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064 goto onError;
1065
1066 Py_XDECREF(errorHandler);
1067 Py_XDECREF(exc);
1068 return (PyObject *)unicode;
1069
1070onError:
1071 Py_XDECREF(errorHandler);
1072 Py_XDECREF(exc);
1073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1079 Py_ssize_t size,
1080 int encodeSetO,
1081 int encodeWhiteSpace,
1082 const char *errors)
1083{
1084 PyObject *v;
1085 /* It might be possible to tighten this worst case */
1086 Py_ssize_t cbAllocated = 5 * size;
1087 int inShift = 0;
1088 Py_ssize_t i = 0;
1089 unsigned int bitsleft = 0;
1090 unsigned long charsleft = 0;
1091 char * out;
1092 char * start;
1093
1094 if (size == 0)
1095 return PyString_FromStringAndSize(NULL, 0);
1096
1097 v = PyString_FromStringAndSize(NULL, cbAllocated);
1098 if (v == NULL)
1099 return NULL;
1100
1101 start = out = PyString_AS_STRING(v);
1102 for (;i < size; ++i) {
1103 Py_UNICODE ch = s[i];
1104
1105 if (!inShift) {
1106 if (ch == '+') {
1107 *out++ = '+';
1108 *out++ = '-';
1109 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110 charsleft = ch;
1111 bitsleft = 16;
1112 *out++ = '+';
1113 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114 inShift = bitsleft > 0;
1115 } else {
1116 *out++ = (char) ch;
1117 }
1118 } else {
1119 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120 *out++ = B64(charsleft << (6-bitsleft));
1121 charsleft = 0;
1122 bitsleft = 0;
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch) || ch == '-') {
1126 *out++ = '-';
1127 }
1128 inShift = 0;
1129 *out++ = (char) ch;
1130 } else {
1131 bitsleft += 16;
1132 charsleft = (charsleft << 16) | ch;
1133 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135 /* If the next character is special then we dont' need to terminate
1136 the shift sequence. If the next character is not a BASE64 character
1137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1139
1140 if (bitsleft == 0) {
1141 if (i + 1 < size) {
1142 Py_UNICODE ch2 = s[i+1];
1143
1144 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1145
1146 } else if (B64CHAR(ch2) || ch2 == '-') {
1147 *out++ = '-';
1148 inShift = 0;
1149 } else {
1150 inShift = 0;
1151 }
1152
1153 }
1154 else {
1155 *out++ = '-';
1156 inShift = 0;
1157 }
1158 }
1159 }
1160 }
1161 }
1162 if (bitsleft) {
1163 *out++= B64(charsleft << (6-bitsleft) );
1164 *out++ = '-';
1165 }
1166
1167 _PyString_Resize(&v, out - start);
1168 return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
1178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Lookup table: UTF-8 lead byte -> total length in bytes of the sequence
   it introduces.  A value of zero marks a byte that cannot start a
   sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x20 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x30 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x40 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x50 - 0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x60 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x70 - 0x7F */
    /* 0x80 - 0xBF: never valid as the first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xB0 - 0xBF */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xD0 - 0xDF */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /* 0xE0 - 0xEF */
    /* 0xF0 - 0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0     /* 0xF0 - 0xFF */
};
1201
1202PyObject *PyUnicode_DecodeUTF8(const char *s,
1203 Py_ssize_t size,
1204 const char *errors)
1205{
1206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1210 Py_ssize_t size,
1211 const char *errors,
1212 Py_ssize_t *consumed)
1213{
1214 const char *starts = s;
1215 int n;
1216 Py_ssize_t startinpos;
1217 Py_ssize_t endinpos;
1218 Py_ssize_t outpos;
1219 const char *e;
1220 PyUnicodeObject *unicode;
1221 Py_UNICODE *p;
1222 const char *errmsg = "";
1223 PyObject *errorHandler = NULL;
1224 PyObject *exc = NULL;
1225
1226 /* Note: size will always be longer than the resulting Unicode
1227 character count */
1228 unicode = _PyUnicode_New(size);
1229 if (!unicode)
1230 return NULL;
1231 if (size == 0) {
1232 if (consumed)
1233 *consumed = 0;
1234 return (PyObject *)unicode;
1235 }
1236
1237 /* Unpack UTF-8 encoded data */
1238 p = unicode->str;
1239 e = s + size;
1240
1241 while (s < e) {
1242 Py_UCS4 ch = (unsigned char)*s;
1243
1244 if (ch < 0x80) {
1245 *p++ = (Py_UNICODE)ch;
1246 s++;
1247 continue;
1248 }
1249
1250 n = utf8_code_length[ch];
1251
1252 if (s + n > e) {
1253 if (consumed)
1254 break;
1255 else {
1256 errmsg = "unexpected end of data";
1257 startinpos = s-starts;
1258 endinpos = size;
1259 goto utf8Error;
1260 }
1261 }
1262
1263 switch (n) {
1264
1265 case 0:
1266 errmsg = "unexpected code byte";
1267 startinpos = s-starts;
1268 endinpos = startinpos+1;
1269 goto utf8Error;
1270
1271 case 1:
1272 errmsg = "internal error";
1273 startinpos = s-starts;
1274 endinpos = startinpos+1;
1275 goto utf8Error;
1276
1277 case 2:
1278 if ((s[1] & 0xc0) != 0x80) {
1279 errmsg = "invalid data";
1280 startinpos = s-starts;
1281 endinpos = startinpos+2;
1282 goto utf8Error;
1283 }
1284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1285 if (ch < 0x80) {
1286 startinpos = s-starts;
1287 endinpos = startinpos+2;
1288 errmsg = "illegal encoding";
1289 goto utf8Error;
1290 }
1291 else
1292 *p++ = (Py_UNICODE)ch;
1293 break;
1294
1295 case 3:
1296 if ((s[1] & 0xc0) != 0x80 ||
1297 (s[2] & 0xc0) != 0x80) {
1298 errmsg = "invalid data";
1299 startinpos = s-starts;
1300 endinpos = startinpos+3;
1301 goto utf8Error;
1302 }
1303 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1304 if (ch < 0x0800) {
1305 /* Note: UTF-8 encodings of surrogates are considered
1306 legal UTF-8 sequences;
1307
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1310 unit.
1311 */
1312 errmsg = "illegal encoding";
1313 startinpos = s-starts;
1314 endinpos = startinpos+3;
1315 goto utf8Error;
1316 }
1317 else
1318 *p++ = (Py_UNICODE)ch;
1319 break;
1320
1321 case 4:
1322 if ((s[1] & 0xc0) != 0x80 ||
1323 (s[2] & 0xc0) != 0x80 ||
1324 (s[3] & 0xc0) != 0x80) {
1325 errmsg = "invalid data";
1326 startinpos = s-starts;
1327 endinpos = startinpos+4;
1328 goto utf8Error;
1329 }
1330 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
1333 if ((ch < 0x10000) /* minimum value allowed for 4
1334 byte encoding */
1335 || (ch > 0x10ffff)) /* maximum value allowed for
1336 UTF-16 */
1337 {
1338 errmsg = "illegal encoding";
1339 startinpos = s-starts;
1340 endinpos = startinpos+4;
1341 goto utf8Error;
1342 }
1343#ifdef Py_UNICODE_WIDE
1344 *p++ = (Py_UNICODE)ch;
1345#else
1346 /* compute and append the two surrogates: */
1347
1348 /* translate from 10000..10FFFF to 0..FFFF */
1349 ch -= 0x10000;
1350
1351 /* high surrogate = top 10 bits added to D800 */
1352 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1353
1354 /* low surrogate = bottom 10 bits added to DC00 */
1355 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1356#endif
1357 break;
1358
1359 default:
1360 /* Other sizes are only needed for UCS-4 */
1361 errmsg = "unsupported Unicode code range";
1362 startinpos = s-starts;
1363 endinpos = startinpos+n;
1364 goto utf8Error;
1365 }
1366 s += n;
1367 continue;
1368
1369 utf8Error:
1370 outpos = p-PyUnicode_AS_UNICODE(unicode);
1371 if (unicode_decode_call_errorhandler(
1372 errors, &errorHandler,
1373 "utf8", errmsg,
1374 starts, size, &startinpos, &endinpos, &exc, &s,
1375 (PyObject **)&unicode, &outpos, &p))
1376 goto onError;
1377 }
1378 if (consumed)
1379 *consumed = s-starts;
1380
1381 /* Adjust length */
1382 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1383 goto onError;
1384
1385 Py_XDECREF(errorHandler);
1386 Py_XDECREF(exc);
1387 return (PyObject *)unicode;
1388
1389onError:
1390 Py_XDECREF(errorHandler);
1391 Py_XDECREF(exc);
1392 Py_DECREF(unicode);
1393 return NULL;
1394}
1395
1396/* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
1400*/
1401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403 Py_ssize_t size,
1404 const char *errors)
1405{
1406#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1407
1408 Py_ssize_t i; /* index into s of next input byte */
1409 PyObject *v; /* result string object */
1410 char *p; /* next free byte in output buffer */
1411 Py_ssize_t nallocated; /* number of result bytes allocated */
1412 Py_ssize_t nneeded; /* number of result bytes needed */
1413 char stackbuf[MAX_SHORT_UNICHARS * 4];
1414
1415 assert(s != NULL);
1416 assert(size >= 0);
1417
1418 if (size <= MAX_SHORT_UNICHARS) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1422 */
1423 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424 v = NULL; /* will allocate after we're done */
1425 p = stackbuf;
1426 }
1427 else {
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated = size * 4;
1430 if (nallocated / 4 != size) /* overflow! */
1431 return PyErr_NoMemory();
1432 v = PyString_FromStringAndSize(NULL, nallocated);
1433 if (v == NULL)
1434 return NULL;
1435 p = PyString_AS_STRING(v);
1436 }
1437
1438 for (i = 0; i < size;) {
1439 Py_UCS4 ch = s[i++];
1440
1441 if (ch < 0x80)
1442 /* Encode ASCII */
1443 *p++ = (char) ch;
1444
1445 else if (ch < 0x0800) {
1446 /* Encode Latin-1 */
1447 *p++ = (char)(0xc0 | (ch >> 6));
1448 *p++ = (char)(0x80 | (ch & 0x3f));
1449 }
1450 else {
1451 /* Encode UCS2 Unicode ordinals */
1452 if (ch < 0x10000) {
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455 Py_UCS4 ch2 = s[i];
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1460 i++;
1461 goto encodeUCS4;
1462 }
1463 /* Fall through: handles isolated high surrogates */
1464 }
1465 *p++ = (char)(0xe0 | (ch >> 12));
1466 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467 *p++ = (char)(0x80 | (ch & 0x3f));
1468 continue;
1469 }
1470encodeUCS4:
1471 /* Encode UCS4 Unicode ordinals */
1472 *p++ = (char)(0xf0 | (ch >> 18));
1473 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475 *p++ = (char)(0x80 | (ch & 0x3f));
1476 }
1477 }
1478
1479 if (v == NULL) {
1480 /* This was stack allocated. */
1481 nneeded = p - stackbuf;
1482 assert(nneeded <= nallocated);
1483 v = PyString_FromStringAndSize(stackbuf, nneeded);
1484 }
1485 else {
1486 /* Cut back to size actually needed. */
1487 nneeded = p - PyString_AS_STRING(v);
1488 assert(nneeded <= nallocated);
1489 _PyString_Resize(&v, nneeded);
1490 }
1491 return v;
1492
1493#undef MAX_SHORT_UNICHARS
1494}
1495
1496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_BadArgument();
1500 return NULL;
1501 }
1502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503 PyUnicode_GET_SIZE(unicode),
1504 NULL);
1505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
1509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
1511 Py_ssize_t size,
1512 const char *errors,
1513 int *byteorder)
1514{
1515 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
1520 Py_ssize_t size,
1521 const char *errors,
1522 int *byteorder,
1523 Py_ssize_t *consumed)
1524{
1525 const char *starts = s;
1526 Py_ssize_t startinpos;
1527 Py_ssize_t endinpos;
1528 Py_ssize_t outpos;
1529 PyUnicodeObject *unicode;
1530 Py_UNICODE *p;
1531 const unsigned char *q, *e;
1532 int bo = 0; /* assume native ordering by default */
1533 const char *errmsg = "";
1534 /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi = 1, ilo = 0;
1537#else
1538 int ihi = 0, ilo = 1;
1539#endif
1540 PyObject *errorHandler = NULL;
1541 PyObject *exc = NULL;
1542
1543 /* Note: size will always be longer than the resulting Unicode
1544 character count */
1545 unicode = _PyUnicode_New(size);
1546 if (!unicode)
1547 return NULL;
1548 if (size == 0)
1549 return (PyObject *)unicode;
1550
1551 /* Unpack UTF-16 encoded data */
1552 p = unicode->str;
1553 q = (unsigned char *)s;
1554 e = q + size;
1555
1556 if (byteorder)
1557 bo = *byteorder;
1558
1559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1563 if (bo == 0) {
1564 if (size >= 2) {
1565 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567 if (bom == 0xFEFF) {
1568 q += 2;
1569 bo = -1;
1570 }
1571 else if (bom == 0xFFFE) {
1572 q += 2;
1573 bo = 1;
1574 }
1575#else
1576 if (bom == 0xFEFF) {
1577 q += 2;
1578 bo = 1;
1579 }
1580 else if (bom == 0xFFFE) {
1581 q += 2;
1582 bo = -1;
1583 }
1584#endif
1585 }
1586 }
1587
1588 if (bo == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (bo == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
1599 while (q < e) {
1600 Py_UNICODE ch;
1601 /* remaining bytes at the end? (size should be even) */
1602 if (e-q<2) {
1603 if (consumed)
1604 break;
1605 errmsg = "truncated data";
1606 startinpos = ((const char *)q)-starts;
1607 endinpos = ((const char *)e)-starts;
1608 goto utf16Error;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1611 }
1612 ch = (q[ihi] << 8) | q[ilo];
1613
1614 q += 2;
1615
1616 if (ch < 0xD800 || ch > 0xDFFF) {
1617 *p++ = ch;
1618 continue;
1619 }
1620
1621 /* UTF-16 code pair: */
1622 if (q >= e) {
1623 errmsg = "unexpected end of data";
1624 startinpos = (((const char *)q)-2)-starts;
1625 endinpos = ((const char *)e)-starts;
1626 goto utf16Error;
1627 }
1628 if (0xD800 <= ch && ch <= 0xDBFF) {
1629 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630 q += 2;
1631 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632#ifndef Py_UNICODE_WIDE
1633 *p++ = ch;
1634 *p++ = ch2;
1635#else
1636 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1637#endif
1638 continue;
1639 }
1640 else {
1641 errmsg = "illegal UTF-16 surrogate";
1642 startinpos = (((const char *)q)-4)-starts;
1643 endinpos = startinpos+2;
1644 goto utf16Error;
1645 }
1646
1647 }
1648 errmsg = "illegal encoding";
1649 startinpos = (((const char *)q)-2)-starts;
1650 endinpos = startinpos+2;
1651 /* Fall through to report the error */
1652
1653 utf16Error:
1654 outpos = p-PyUnicode_AS_UNICODE(unicode);
1655 if (unicode_decode_call_errorhandler(
1656 errors, &errorHandler,
1657 "utf16", errmsg,
1658 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659 (PyObject **)&unicode, &outpos, &p))
1660 goto onError;
1661 }
1662
1663 if (byteorder)
1664 *byteorder = bo;
1665
1666 if (consumed)
1667 *consumed = (const char *)q-starts;
1668
1669 /* Adjust length */
1670 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671 goto onError;
1672
1673 Py_XDECREF(errorHandler);
1674 Py_XDECREF(exc);
1675 return (PyObject *)unicode;
1676
1677onError:
1678 Py_DECREF(unicode);
1679 Py_XDECREF(errorHandler);
1680 Py_XDECREF(exc);
1681 return NULL;
1682}
1683
1684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686 Py_ssize_t size,
1687 const char *errors,
1688 int byteorder)
1689{
1690 PyObject *v;
1691 unsigned char *p;
1692#ifdef Py_UNICODE_WIDE
1693 int i, pairs;
1694#else
1695 const int pairs = 0;
1696#endif
1697 /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi = 1, ilo = 0;
1700#else
1701 int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH) \
1705 do { \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1708 p += 2; \
1709 } while(0)
1710
1711#ifdef Py_UNICODE_WIDE
1712 for (i = pairs = 0; i < size; i++)
1713 if (s[i] >= 0x10000)
1714 pairs++;
1715#endif
1716 v = PyString_FromStringAndSize(NULL,
1717 2 * (size + pairs + (byteorder == 0)));
1718 if (v == NULL)
1719 return NULL;
1720
1721 p = (unsigned char *)PyString_AS_STRING(v);
1722 if (byteorder == 0)
1723 STORECHAR(0xFEFF);
1724 if (size == 0)
1725 return v;
1726
1727 if (byteorder == -1) {
1728 /* force LE */
1729 ihi = 1;
1730 ilo = 0;
1731 }
1732 else if (byteorder == 1) {
1733 /* force BE */
1734 ihi = 0;
1735 ilo = 1;
1736 }
1737
1738 while (size-- > 0) {
1739 Py_UNICODE ch = *s++;
1740 Py_UNICODE ch2 = 0;
1741#ifdef Py_UNICODE_WIDE
1742 if (ch >= 0x10000) {
1743 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744 ch = 0xD800 | ((ch-0x10000) >> 10);
1745 }
1746#endif
1747 STORECHAR(ch);
1748 if (ch2)
1749 STORECHAR(ch2);
1750 }
1751 return v;
1752#undef STORECHAR
1753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757 if (!PyUnicode_Check(unicode)) {
1758 PyErr_BadArgument();
1759 return NULL;
1760 }
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762 PyUnicode_GET_SIZE(unicode),
1763 NULL,
1764 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
1769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1770
1771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1772 Py_ssize_t size,
1773 const char *errors)
1774{
1775 const char *starts = s;
1776 Py_ssize_t startinpos;
1777 Py_ssize_t endinpos;
1778 Py_ssize_t outpos;
1779 int i;
1780 PyUnicodeObject *v;
1781 Py_UNICODE *p;
1782 const char *end;
1783 char* message;
1784 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785 PyObject *errorHandler = NULL;
1786 PyObject *exc = NULL;
1787
1788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
1790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
1793 v = _PyUnicode_New(size);
1794 if (v == NULL)
1795 goto onError;
1796 if (size == 0)
1797 return (PyObject *)v;
1798
1799 p = PyUnicode_AS_UNICODE(v);
1800 end = s + size;
1801
1802 while (s < end) {
1803 unsigned char c;
1804 Py_UNICODE x;
1805 int digits;
1806
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1808 if (*s != '\\') {
1809 *p++ = (unsigned char) *s++;
1810 continue;
1811 }
1812
1813 startinpos = s-starts;
1814 /* \ - Escapes */
1815 s++;
1816 switch (*s++) {
1817
1818 /* \x escapes */
1819 case '\n': break;
1820 case '\\': *p++ = '\\'; break;
1821 case '\'': *p++ = '\''; break;
1822 case '\"': *p++ = '\"'; break;
1823 case 'b': *p++ = '\b'; break;
1824 case 'f': *p++ = '\014'; break; /* FF */
1825 case 't': *p++ = '\t'; break;
1826 case 'n': *p++ = '\n'; break;
1827 case 'r': *p++ = '\r'; break;
1828 case 'v': *p++ = '\013'; break; /* VT */
1829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
1834 x = s[-1] - '0';
1835 if ('0' <= *s && *s <= '7') {
1836 x = (x<<3) + *s++ - '0';
1837 if ('0' <= *s && *s <= '7')
1838 x = (x<<3) + *s++ - '0';
1839 }
1840 *p++ = x;
1841 break;
1842
1843 /* hex escapes */
1844 /* \xXX */
1845 case 'x':
1846 digits = 2;
1847 message = "truncated \\xXX escape";
1848 goto hexescape;
1849
1850 /* \uXXXX */
1851 case 'u':
1852 digits = 4;
1853 message = "truncated \\uXXXX escape";
1854 goto hexescape;
1855
1856 /* \UXXXXXXXX */
1857 case 'U':
1858 digits = 8;
1859 message = "truncated \\UXXXXXXXX escape";
1860 hexescape:
1861 chr = 0;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (s+digits>end) {
1864 endinpos = size;
1865 if (unicode_decode_call_errorhandler(
1866 errors, &errorHandler,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts, size, &startinpos, &endinpos, &exc, &s,
1869 (PyObject **)&v, &outpos, &p))
1870 goto onError;
1871 goto nextByte;
1872 }
1873 for (i = 0; i < digits; ++i) {
1874 c = (unsigned char) s[i];
1875 if (!isxdigit(c)) {
1876 endinpos = (s+i+1)-starts;
1877 if (unicode_decode_call_errorhandler(
1878 errors, &errorHandler,
1879 "unicodeescape", message,
1880 starts, size, &startinpos, &endinpos, &exc, &s,
1881 (PyObject **)&v, &outpos, &p))
1882 goto onError;
1883 goto nextByte;
1884 }
1885 chr = (chr<<4) & ~0xF;
1886 if (c >= '0' && c <= '9')
1887 chr += c - '0';
1888 else if (c >= 'a' && c <= 'f')
1889 chr += 10 + c - 'a';
1890 else
1891 chr += 10 + c - 'A';
1892 }
1893 s += i;
1894 if (chr == 0xffffffff && PyErr_Occurred())
1895 /* _decoding_error will have already written into the
1896 target buffer. */
1897 break;
1898 store:
1899 /* when we get here, chr is a 32-bit unicode character */
1900 if (chr <= 0xffff)
1901 /* UCS-2 character */
1902 *p++ = (Py_UNICODE) chr;
1903 else if (chr <= 0x10ffff) {
1904 /* UCS-4 character. Either store directly, or as
1905 surrogate pair. */
1906#ifdef Py_UNICODE_WIDE
1907 *p++ = chr;
1908#else
1909 chr -= 0x10000L;
1910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912#endif
1913 } else {
1914 endinpos = s-starts;
1915 outpos = p-PyUnicode_AS_UNICODE(v);
1916 if (unicode_decode_call_errorhandler(
1917 errors, &errorHandler,
1918 "unicodeescape", "illegal Unicode character",
1919 starts, size, &startinpos, &endinpos, &exc, &s,
1920 (PyObject **)&v, &outpos, &p))
1921 goto onError;
1922 }
1923 break;
1924
1925 /* \N{name} */
1926 case 'N':
1927 message = "malformed \\N character escape";
1928 if (ucnhash_CAPI == NULL) {
1929 /* load the unicode data module */
1930 PyObject *m, *api;
1931 m = PyImport_ImportModule("unicodedata");
1932 if (m == NULL)
1933 goto ucnhashError;
1934 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935 Py_DECREF(m);
1936 if (api == NULL)
1937 goto ucnhashError;
1938 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939 Py_DECREF(api);
1940 if (ucnhash_CAPI == NULL)
1941 goto ucnhashError;
1942 }
1943 if (*s == '{') {
1944 const char *start = s+1;
1945 /* look for the closing brace */
1946 while (*s != '}' && s < end)
1947 s++;
1948 if (s > start && s < end && *s == '}') {
1949 /* found a name. look it up in the unicode database */
1950 message = "unknown Unicode character name";
1951 s++;
1952 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953 goto store;
1954 }
1955 }
1956 endinpos = s-starts;
1957 outpos = p-PyUnicode_AS_UNICODE(v);
1958 if (unicode_decode_call_errorhandler(
1959 errors, &errorHandler,
1960 "unicodeescape", message,
1961 starts, size, &startinpos, &endinpos, &exc, &s,
1962 (PyObject **)&v, &outpos, &p))
1963 goto onError;
1964 break;
1965
1966 default:
1967 if (s > end) {
1968 message = "\\ at end of string";
1969 s--;
1970 endinpos = s-starts;
1971 outpos = p-PyUnicode_AS_UNICODE(v);
1972 if (unicode_decode_call_errorhandler(
1973 errors, &errorHandler,
1974 "unicodeescape", message,
1975 starts, size, &startinpos, &endinpos, &exc, &s,
1976 (PyObject **)&v, &outpos, &p))
1977 goto onError;
1978 }
1979 else {
1980 *p++ = '\\';
1981 *p++ = (unsigned char)s[-1];
1982 }
1983 break;
1984 }
1985 nextByte:
1986 ;
1987 }
1988 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989 goto onError;
1990 Py_XDECREF(errorHandler);
1991 Py_XDECREF(exc);
1992 return (PyObject *)v;
1993
1994ucnhashError:
1995 PyErr_SetString(
1996 PyExc_UnicodeError,
1997 "\\N escapes not supported (can't load unicodedata module)"
1998 );
1999 Py_XDECREF(v);
2000 Py_XDECREF(errorHandler);
2001 Py_XDECREF(exc);
2002 return NULL;
2003
2004onError:
2005 Py_XDECREF(v);
2006 Py_XDECREF(errorHandler);
2007 Py_XDECREF(exc);
2008 return NULL;
2009}
2010
/* unicodeescape_string(), defined below after its findchar() helper,
   returns a Unicode-Escape string version of the given Unicode data.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2017
2018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019 Py_ssize_t size,
2020 Py_UNICODE ch)
2021{
2022 /* like wcschr, but doesn't stop at NULL characters */
2023
2024 while (size-- > 0) {
2025 if (*s == ch)
2026 return s;
2027 s++;
2028 }
2029
2030 return NULL;
2031}
2032
2033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
2035 Py_ssize_t size,
2036 int quotes)
2037{
2038 PyObject *repr;
2039 char *p;
2040
2041 static const char *hexdigit = "0123456789abcdef";
2042
2043 /* Initial allocation is based on the longest-possible unichr
2044 escape.
2045
2046 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2047 unichr, so in this case it's the longest unichr escape. In
2048 narrow (UTF-16) builds this is five chars per source unichr
2049 since there are two unichrs in the surrogate pair, so in narrow
2050 (UTF-16) builds it's not the longest unichr escape.
2051
2052 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2053 so in the narrow (UTF-16) build case it's the longest unichr
2054 escape.
2055 */
2056
2057 repr = PyString_FromStringAndSize(NULL,
2058 2
2059#ifdef Py_UNICODE_WIDE
2060 + 10*size
2061#else
2062 + 6*size
2063#endif
2064 + 1);
2065 if (repr == NULL)
2066 return NULL;
2067
2068 p = PyString_AS_STRING(repr);
2069
2070 if (quotes) {
2071 *p++ = 'u';
2072 *p++ = (findchar(s, size, '\'') &&
2073 !findchar(s, size, '"')) ? '"' : '\'';
2074 }
2075 while (size-- > 0) {
2076 Py_UNICODE ch = *s++;
2077
2078 /* Escape quotes and backslashes */
2079 if ((quotes &&
2080 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2081 *p++ = '\\';
2082 *p++ = (char) ch;
2083 continue;
2084 }
2085
2086#ifdef Py_UNICODE_WIDE
2087 /* Map 21-bit characters to '\U00xxxxxx' */
2088 else if (ch >= 0x10000) {
2089 *p++ = '\\';
2090 *p++ = 'U';
2091 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2092 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2093 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2094 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2095 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2096 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2097 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2098 *p++ = hexdigit[ch & 0x0000000F];
2099 continue;
2100 }
2101#else
2102 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2103 else if (ch >= 0xD800 && ch < 0xDC00) {
2104 Py_UNICODE ch2;
2105 Py_UCS4 ucs;
2106
2107 ch2 = *s++;
2108 size--;
2109 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2110 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2111 *p++ = '\\';
2112 *p++ = 'U';
2113 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2114 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2115 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2116 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2117 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2118 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2119 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2120 *p++ = hexdigit[ucs & 0x0000000F];
2121 continue;
2122 }
2123 /* Fall through: isolated surrogates are copied as-is */
2124 s--;
2125 size++;
2126 }
2127#endif
2128
2129 /* Map 16-bit characters to '\uxxxx' */
2130 if (ch >= 256) {
2131 *p++ = '\\';
2132 *p++ = 'u';
2133 *p++ = hexdigit[(ch >> 12) & 0x000F];
2134 *p++ = hexdigit[(ch >> 8) & 0x000F];
2135 *p++ = hexdigit[(ch >> 4) & 0x000F];
2136 *p++ = hexdigit[ch & 0x000F];
2137 }
2138
2139 /* Map special whitespace to '\t', \n', '\r' */
2140 else if (ch == '\t') {
2141 *p++ = '\\';
2142 *p++ = 't';
2143 }
2144 else if (ch == '\n') {
2145 *p++ = '\\';
2146 *p++ = 'n';
2147 }
2148 else if (ch == '\r') {
2149 *p++ = '\\';
2150 *p++ = 'r';
2151 }
2152
2153 /* Map non-printable US ASCII to '\xhh' */
2154 else if (ch < ' ' || ch >= 0x7F) {
2155 *p++ = '\\';
2156 *p++ = 'x';
2157 *p++ = hexdigit[(ch >> 4) & 0x000F];
2158 *p++ = hexdigit[ch & 0x000F];
2159 }
2160
2161 /* Copy everything else as-is */
2162 else
2163 *p++ = (char) ch;
2164 }
2165 if (quotes)
2166 *p++ = PyString_AS_STRING(repr)[1];
2167
2168 *p = '\0';
2169 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2170 return repr;
2171}
2172
2173PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2174 Py_ssize_t size)
2175{
2176 return unicodeescape_string(s, size, 0);
2177}
2178
2179PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2180{
2181 if (!PyUnicode_Check(unicode)) {
2182 PyErr_BadArgument();
2183 return NULL;
2184 }
2185 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2186 PyUnicode_GET_SIZE(unicode));
2187}
2188
2189/* --- Raw Unicode Escape Codec ------------------------------------------- */
2190
2191PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2192 Py_ssize_t size,
2193 const char *errors)
2194{
2195 const char *starts = s;
2196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
2199 PyUnicodeObject *v;
2200 Py_UNICODE *p;
2201 const char *end;
2202 const char *bs;
2203 PyObject *errorHandler = NULL;
2204 PyObject *exc = NULL;
2205
2206 /* Escaped strings will always be longer than the resulting
2207 Unicode string, so we start with size here and then reduce the
2208 length after conversion to the true value. (But decoding error
2209 handler might have to resize the string) */
2210 v = _PyUnicode_New(size);
2211 if (v == NULL)
2212 goto onError;
2213 if (size == 0)
2214 return (PyObject *)v;
2215 p = PyUnicode_AS_UNICODE(v);
2216 end = s + size;
2217 while (s < end) {
2218 unsigned char c;
2219 Py_UCS4 x;
2220 int i;
2221 int count;
2222
2223 /* Non-escape characters are interpreted as Unicode ordinals */
2224 if (*s != '\\') {
2225 *p++ = (unsigned char)*s++;
2226 continue;
2227 }
2228 startinpos = s-starts;
2229
2230 /* \u-escapes are only interpreted iff the number of leading
2231 backslashes if odd */
2232 bs = s;
2233 for (;s < end;) {
2234 if (*s != '\\')
2235 break;
2236 *p++ = (unsigned char)*s++;
2237 }
2238 if (((s - bs) & 1) == 0 ||
2239 s >= end ||
2240 (*s != 'u' && *s != 'U')) {
2241 continue;
2242 }
2243 p--;
2244 count = *s=='u' ? 4 : 8;
2245 s++;
2246
2247 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2248 outpos = p-PyUnicode_AS_UNICODE(v);
2249 for (x = 0, i = 0; i < count; ++i, ++s) {
2250 c = (unsigned char)*s;
2251 if (!isxdigit(c)) {
2252 endinpos = s-starts;
2253 if (unicode_decode_call_errorhandler(
2254 errors, &errorHandler,
2255 "rawunicodeescape", "truncated \\uXXXX",
2256 starts, size, &startinpos, &endinpos, &exc, &s,
2257 (PyObject **)&v, &outpos, &p))
2258 goto onError;
2259 goto nextByte;
2260 }
2261 x = (x<<4) & ~0xF;
2262 if (c >= '0' && c <= '9')
2263 x += c - '0';
2264 else if (c >= 'a' && c <= 'f')
2265 x += 10 + c - 'a';
2266 else
2267 x += 10 + c - 'A';
2268 }
2269#ifndef Py_UNICODE_WIDE
2270 if (x > 0x10000) {
2271 if (unicode_decode_call_errorhandler(
2272 errors, &errorHandler,
2273 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2274 starts, size, &startinpos, &endinpos, &exc, &s,
2275 (PyObject **)&v, &outpos, &p))
2276 goto onError;
2277 }
2278#endif
2279 *p++ = x;
2280 nextByte:
2281 ;
2282 }
2283 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2284 goto onError;
2285 Py_XDECREF(errorHandler);
2286 Py_XDECREF(exc);
2287 return (PyObject *)v;
2288
2289 onError:
2290 Py_XDECREF(v);
2291 Py_XDECREF(errorHandler);
2292 Py_XDECREF(exc);
2293 return NULL;
2294}
2295
2296PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2297 Py_ssize_t size)
2298{
2299 PyObject *repr;
2300 char *p;
2301 char *q;
2302
2303 static const char *hexdigit = "0123456789abcdef";
2304
2305#ifdef Py_UNICODE_WIDE
2306 repr = PyString_FromStringAndSize(NULL, 10 * size);
2307#else
2308 repr = PyString_FromStringAndSize(NULL, 6 * size);
2309#endif
2310 if (repr == NULL)
2311 return NULL;
2312 if (size == 0)
2313 return repr;
2314
2315 p = q = PyString_AS_STRING(repr);
2316 while (size-- > 0) {
2317 Py_UNICODE ch = *s++;
2318#ifdef Py_UNICODE_WIDE
2319 /* Map 32-bit characters to '\Uxxxxxxxx' */
2320 if (ch >= 0x10000) {
2321 *p++ = '\\';
2322 *p++ = 'U';
2323 *p++ = hexdigit[(ch >> 28) & 0xf];
2324 *p++ = hexdigit[(ch >> 24) & 0xf];
2325 *p++ = hexdigit[(ch >> 20) & 0xf];
2326 *p++ = hexdigit[(ch >> 16) & 0xf];
2327 *p++ = hexdigit[(ch >> 12) & 0xf];
2328 *p++ = hexdigit[(ch >> 8) & 0xf];
2329 *p++ = hexdigit[(ch >> 4) & 0xf];
2330 *p++ = hexdigit[ch & 15];
2331 }
2332 else
2333#endif
2334 /* Map 16-bit characters to '\uxxxx' */
2335 if (ch >= 256) {
2336 *p++ = '\\';
2337 *p++ = 'u';
2338 *p++ = hexdigit[(ch >> 12) & 0xf];
2339 *p++ = hexdigit[(ch >> 8) & 0xf];
2340 *p++ = hexdigit[(ch >> 4) & 0xf];
2341 *p++ = hexdigit[ch & 15];
2342 }
2343 /* Copy everything else as-is */
2344 else
2345 *p++ = (char) ch;
2346 }
2347 *p = '\0';
2348 _PyString_Resize(&repr, p - q);
2349 return repr;
2350}
2351
2352PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2353{
2354 if (!PyUnicode_Check(unicode)) {
2355 PyErr_BadArgument();
2356 return NULL;
2357 }
2358 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2359 PyUnicode_GET_SIZE(unicode));
2360}
2361
2362/* --- Unicode Internal Codec ------------------------------------------- */
2363
2364PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2365 Py_ssize_t size,
2366 const char *errors)
2367{
2368 const char *starts = s;
2369 Py_ssize_t startinpos;
2370 Py_ssize_t endinpos;
2371 Py_ssize_t outpos;
2372 PyUnicodeObject *v;
2373 Py_UNICODE *p;
2374 const char *end;
2375 const char *reason;
2376 PyObject *errorHandler = NULL;
2377 PyObject *exc = NULL;
2378
2379#ifdef Py_UNICODE_WIDE
2380 Py_UNICODE unimax = PyUnicode_GetMax();
2381#endif
2382
2383 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2384 if (v == NULL)
2385 goto onError;
2386 if (PyUnicode_GetSize((PyObject *)v) == 0)
2387 return (PyObject *)v;
2388 p = PyUnicode_AS_UNICODE(v);
2389 end = s + size;
2390
2391 while (s < end) {
2392 memcpy(p, s, sizeof(Py_UNICODE));
2393 /* We have to sanity check the raw data, otherwise doom looms for
2394 some malformed UCS-4 data. */
2395 if (
2396 #ifdef Py_UNICODE_WIDE
2397 *p > unimax || *p < 0 ||
2398 #endif
2399 end-s < Py_UNICODE_SIZE
2400 )
2401 {
2402 startinpos = s - starts;
2403 if (end-s < Py_UNICODE_SIZE) {
2404 endinpos = end-starts;
2405 reason = "truncated input";
2406 }
2407 else {
2408 endinpos = s - starts + Py_UNICODE_SIZE;
2409 reason = "illegal code point (> 0x10FFFF)";
2410 }
2411 outpos = p - PyUnicode_AS_UNICODE(v);
2412 if (unicode_decode_call_errorhandler(
2413 errors, &errorHandler,
2414 "unicode_internal", reason,
2415 starts, size, &startinpos, &endinpos, &exc, &s,
2416 (PyObject **)&v, &outpos, &p)) {
2417 goto onError;
2418 }
2419 }
2420 else {
2421 p++;
2422 s += Py_UNICODE_SIZE;
2423 }
2424 }
2425
2426 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2427 goto onError;
2428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
2430 return (PyObject *)v;
2431
2432 onError:
2433 Py_XDECREF(v);
2434 Py_XDECREF(errorHandler);
2435 Py_XDECREF(exc);
2436 return NULL;
2437}
2438
2439/* --- Latin-1 Codec ------------------------------------------------------ */
2440
2441PyObject *PyUnicode_DecodeLatin1(const char *s,
2442 Py_ssize_t size,
2443 const char *errors)
2444{
2445 PyUnicodeObject *v;
2446 Py_UNICODE *p;
2447
2448 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2449 if (size == 1) {
2450 Py_UNICODE r = *(unsigned char*)s;
2451 return PyUnicode_FromUnicode(&r, 1);
2452 }
2453
2454 v = _PyUnicode_New(size);
2455 if (v == NULL)
2456 goto onError;
2457 if (size == 0)
2458 return (PyObject *)v;
2459 p = PyUnicode_AS_UNICODE(v);
2460 while (size-- > 0)
2461 *p++ = (unsigned char)*s++;
2462 return (PyObject *)v;
2463
2464 onError:
2465 Py_XDECREF(v);
2466 return NULL;
2467}
2468
2469/* create or adjust a UnicodeEncodeError */
2470static void make_encode_exception(PyObject **exceptionObject,
2471 const char *encoding,
2472 const Py_UNICODE *unicode, Py_ssize_t size,
2473 Py_ssize_t startpos, Py_ssize_t endpos,
2474 const char *reason)
2475{
2476 if (*exceptionObject == NULL) {
2477 *exceptionObject = PyUnicodeEncodeError_Create(
2478 encoding, unicode, size, startpos, endpos, reason);
2479 }
2480 else {
2481 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2482 goto onError;
2483 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2484 goto onError;
2485 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2486 goto onError;
2487 return;
2488 onError:
2489 Py_DECREF(*exceptionObject);
2490 *exceptionObject = NULL;
2491 }
2492}
2493
2494/* raises a UnicodeEncodeError */
2495static void raise_encode_exception(PyObject **exceptionObject,
2496 const char *encoding,
2497 const Py_UNICODE *unicode, Py_ssize_t size,
2498 Py_ssize_t startpos, Py_ssize_t endpos,
2499 const char *reason)
2500{
2501 make_encode_exception(exceptionObject,
2502 encoding, unicode, size, startpos, endpos, reason);
2503 if (*exceptionObject != NULL)
2504 PyCodec_StrictErrors(*exceptionObject);
2505}
2506
2507/* error handling callback helper:
2508 build arguments, call the callback and check the arguments,
2509 put the result into newpos and return the replacement string, which
2510 has to be freed by the caller */
2511static PyObject *unicode_encode_call_errorhandler(const char *errors,
2512 PyObject **errorHandler,
2513 const char *encoding, const char *reason,
2514 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2515 Py_ssize_t startpos, Py_ssize_t endpos,
2516 Py_ssize_t *newpos)
2517{
2518 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2519
2520 PyObject *restuple;
2521 PyObject *resunicode;
2522
2523 if (*errorHandler == NULL) {
2524 *errorHandler = PyCodec_LookupError(errors);
2525 if (*errorHandler == NULL)
2526 return NULL;
2527 }
2528
2529 make_encode_exception(exceptionObject,
2530 encoding, unicode, size, startpos, endpos, reason);
2531 if (*exceptionObject == NULL)
2532 return NULL;
2533
2534 restuple = PyObject_CallFunctionObjArgs(
2535 *errorHandler, *exceptionObject, NULL);
2536 if (restuple == NULL)
2537 return NULL;
2538 if (!PyTuple_Check(restuple)) {
2539 PyErr_Format(PyExc_TypeError, &argparse[4]);
2540 Py_DECREF(restuple);
2541 return NULL;
2542 }
2543 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2544 &resunicode, newpos)) {
2545 Py_DECREF(restuple);
2546 return NULL;
2547 }
2548 if (*newpos<0)
2549 *newpos = size+*newpos;
2550 if (*newpos<0 || *newpos>size) {
2551 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2552 Py_DECREF(restuple);
2553 return NULL;
2554 }
2555 Py_INCREF(resunicode);
2556 Py_DECREF(restuple);
2557 return resunicode;
2558}
2559
2560static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2561 Py_ssize_t size,
2562 const char *errors,
2563 int limit)
2564{
2565 /* output object */
2566 PyObject *res;
2567 /* pointers to the beginning and end+1 of input */
2568 const Py_UNICODE *startp = p;
2569 const Py_UNICODE *endp = p + size;
2570 /* pointer to the beginning of the unencodable characters */
2571 /* const Py_UNICODE *badp = NULL; */
2572 /* pointer into the output */
2573 char *str;
2574 /* current output position */
2575 Py_ssize_t respos = 0;
2576 Py_ssize_t ressize;
2577 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2578 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2579 PyObject *errorHandler = NULL;
2580 PyObject *exc = NULL;
2581 /* the following variable is used for caching string comparisons
2582 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2583 int known_errorHandler = -1;
2584
2585 /* allocate enough for a simple encoding without
2586 replacements, if we need more, we'll resize */
2587 res = PyString_FromStringAndSize(NULL, size);
2588 if (res == NULL)
2589 goto onError;
2590 if (size == 0)
2591 return res;
2592 str = PyString_AS_STRING(res);
2593 ressize = size;
2594
2595 while (p<endp) {
2596 Py_UNICODE c = *p;
2597
2598 /* can we encode this? */
2599 if (c<limit) {
2600 /* no overflow check, because we know that the space is enough */
2601 *str++ = (char)c;
2602 ++p;
2603 }
2604 else {
2605 Py_ssize_t unicodepos = p-startp;
2606 Py_ssize_t requiredsize;
2607 PyObject *repunicode;
2608 Py_ssize_t repsize;
2609 Py_ssize_t newpos;
2610 Py_ssize_t respos;
2611 Py_UNICODE *uni2;
2612 /* startpos for collecting unencodable chars */
2613 const Py_UNICODE *collstart = p;
2614 const Py_UNICODE *collend = p;
2615 /* find all unecodable characters */
2616 while ((collend < endp) && ((*collend)>=limit))
2617 ++collend;
2618 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2619 if (known_errorHandler==-1) {
2620 if ((errors==NULL) || (!strcmp(errors, "strict")))
2621 known_errorHandler = 1;
2622 else if (!strcmp(errors, "replace"))
2623 known_errorHandler = 2;
2624 else if (!strcmp(errors, "ignore"))
2625 known_errorHandler = 3;
2626 else if (!strcmp(errors, "xmlcharrefreplace"))
2627 known_errorHandler = 4;
2628 else
2629 known_errorHandler = 0;
2630 }
2631 switch (known_errorHandler) {
2632 case 1: /* strict */
2633 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2634 goto onError;
2635 case 2: /* replace */
2636 while (collstart++<collend)
2637 *str++ = '?'; /* fall through */
2638 case 3: /* ignore */
2639 p = collend;
2640 break;
2641 case 4: /* xmlcharrefreplace */
2642 respos = str-PyString_AS_STRING(res);
2643 /* determine replacement size (temporarily (mis)uses p) */
2644 for (p = collstart, repsize = 0; p < collend; ++p) {
2645 if (*p<10)
2646 repsize += 2+1+1;
2647 else if (*p<100)
2648 repsize += 2+2+1;
2649 else if (*p<1000)
2650 repsize += 2+3+1;
2651 else if (*p<10000)
2652 repsize += 2+4+1;
2653#ifndef Py_UNICODE_WIDE
2654 else
2655 repsize += 2+5+1;
2656#else
2657 else if (*p<100000)
2658 repsize += 2+5+1;
2659 else if (*p<1000000)
2660 repsize += 2+6+1;
2661 else
2662 repsize += 2+7+1;
2663#endif
2664 }
2665 requiredsize = respos+repsize+(endp-collend);
2666 if (requiredsize > ressize) {
2667 if (requiredsize<2*ressize)
2668 requiredsize = 2*ressize;
2669 if (_PyString_Resize(&res, requiredsize))
2670 goto onError;
2671 str = PyString_AS_STRING(res) + respos;
2672 ressize = requiredsize;
2673 }
2674 /* generate replacement (temporarily (mis)uses p) */
2675 for (p = collstart; p < collend; ++p) {
2676 str += sprintf(str, "&#%d;", (int)*p);
2677 }
2678 p = collend;
2679 break;
2680 default:
2681 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2682 encoding, reason, startp, size, &exc,
2683 collstart-startp, collend-startp, &newpos);
2684 if (repunicode == NULL)
2685 goto onError;
2686 /* need more space? (at least enough for what we
2687 have+the replacement+the rest of the string, so
2688 we won't have to check space for encodable characters) */
2689 respos = str-PyString_AS_STRING(res);
2690 repsize = PyUnicode_GET_SIZE(repunicode);
2691 requiredsize = respos+repsize+(endp-collend);
2692 if (requiredsize > ressize) {
2693 if (requiredsize<2*ressize)
2694 requiredsize = 2*ressize;
2695 if (_PyString_Resize(&res, requiredsize)) {
2696 Py_DECREF(repunicode);
2697 goto onError;
2698 }
2699 str = PyString_AS_STRING(res) + respos;
2700 ressize = requiredsize;
2701 }
2702 /* check if there is anything unencodable in the replacement
2703 and copy it to the output */
2704 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2705 c = *uni2;
2706 if (c >= limit) {
2707 raise_encode_exception(&exc, encoding, startp, size,
2708 unicodepos, unicodepos+1, reason);
2709 Py_DECREF(repunicode);
2710 goto onError;
2711 }
2712 *str = (char)c;
2713 }
2714 p = startp + newpos;
2715 Py_DECREF(repunicode);
2716 }
2717 }
2718 }
2719 /* Resize if we allocated to much */
2720 respos = str-PyString_AS_STRING(res);
2721 if (respos<ressize)
2722 /* If this falls res will be NULL */
2723 _PyString_Resize(&res, respos);
2724 Py_XDECREF(errorHandler);
2725 Py_XDECREF(exc);
2726 return res;
2727
2728 onError:
2729 Py_XDECREF(res);
2730 Py_XDECREF(errorHandler);
2731 Py_XDECREF(exc);
2732 return NULL;
2733}
2734
2735PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2736 Py_ssize_t size,
2737 const char *errors)
2738{
2739 return unicode_encode_ucs1(p, size, errors, 256);
2740}
2741
2742PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2743{
2744 if (!PyUnicode_Check(unicode)) {
2745 PyErr_BadArgument();
2746 return NULL;
2747 }
2748 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2749 PyUnicode_GET_SIZE(unicode),
2750 NULL);
2751}
2752
2753/* --- 7-bit ASCII Codec -------------------------------------------------- */
2754
2755PyObject *PyUnicode_DecodeASCII(const char *s,
2756 Py_ssize_t size,
2757 const char *errors)
2758{
2759 const char *starts = s;
2760 PyUnicodeObject *v;
2761 Py_UNICODE *p;
2762 Py_ssize_t startinpos;
2763 Py_ssize_t endinpos;
2764 Py_ssize_t outpos;
2765 const char *e;
2766 PyObject *errorHandler = NULL;
2767 PyObject *exc = NULL;
2768
2769 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2770 if (size == 1 && *(unsigned char*)s < 128) {
2771 Py_UNICODE r = *(unsigned char*)s;
2772 return PyUnicode_FromUnicode(&r, 1);
2773 }
2774
2775 v = _PyUnicode_New(size);
2776 if (v == NULL)
2777 goto onError;
2778 if (size == 0)
2779 return (PyObject *)v;
2780 p = PyUnicode_AS_UNICODE(v);
2781 e = s + size;
2782 while (s < e) {
2783 register unsigned char c = (unsigned char)*s;
2784 if (c < 128) {
2785 *p++ = c;
2786 ++s;
2787 }
2788 else {
2789 startinpos = s-starts;
2790 endinpos = startinpos + 1;
2791 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2792 if (unicode_decode_call_errorhandler(
2793 errors, &errorHandler,
2794 "ascii", "ordinal not in range(128)",
2795 starts, size, &startinpos, &endinpos, &exc, &s,
2796 (PyObject **)&v, &outpos, &p))
2797 goto onError;
2798 }
2799 }
2800 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2801 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2802 goto onError;
2803 Py_XDECREF(errorHandler);
2804 Py_XDECREF(exc);
2805 return (PyObject *)v;
2806
2807 onError:
2808 Py_XDECREF(v);
2809 Py_XDECREF(errorHandler);
2810 Py_XDECREF(exc);
2811 return NULL;
2812}
2813
2814PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2815 Py_ssize_t size,
2816 const char *errors)
2817{
2818 return unicode_encode_ucs1(p, size, errors, 128);
2819}
2820
2821PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2822{
2823 if (!PyUnicode_Check(unicode)) {
2824 PyErr_BadArgument();
2825 return NULL;
2826 }
2827 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2828 PyUnicode_GET_SIZE(unicode),
2829 NULL);
2830}
2831
2832#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2833
2834/* --- MBCS codecs for Windows -------------------------------------------- */
2835
2836#if SIZEOF_INT < SIZEOF_SSIZE_T
2837#define NEED_RETRY
2838#endif
2839
2840/* XXX This code is limited to "true" double-byte encodings, as
2841 a) it assumes an incomplete character consists of a single byte, and
2842 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2843 encodings, see IsDBCSLeadByteEx documentation. */
2844
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the preceding character
   does not itself start with a lead byte, or when the preceding
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2855
2856/*
2857 * Decode MBCS string into unicode object. If 'final' is set, converts
2858 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2859 */
2860static int decode_mbcs(PyUnicodeObject **v,
2861 const char *s, /* MBCS string */
2862 int size, /* sizeof MBCS string */
2863 int final)
2864{
2865 Py_UNICODE *p;
2866 Py_ssize_t n = 0;
2867 int usize = 0;
2868
2869 assert(size >= 0);
2870
2871 /* Skip trailing lead-byte unless 'final' is set */
2872 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2873 --size;
2874
2875 /* First get the size of the result */
2876 if (size > 0) {
2877 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2878 if (usize == 0) {
2879 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2880 return -1;
2881 }
2882 }
2883
2884 if (*v == NULL) {
2885 /* Create unicode object */
2886 *v = _PyUnicode_New(usize);
2887 if (*v == NULL)
2888 return -1;
2889 }
2890 else {
2891 /* Extend unicode object */
2892 n = PyUnicode_GET_SIZE(*v);
2893 if (_PyUnicode_Resize(v, n + usize) < 0)
2894 return -1;
2895 }
2896
2897 /* Do the conversion */
2898 if (size > 0) {
2899 p = PyUnicode_AS_UNICODE(*v) + n;
2900 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2901 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2902 return -1;
2903 }
2904 }
2905
2906 return size;
2907}
2908
2909PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2910 Py_ssize_t size,
2911 const char *errors,
2912 Py_ssize_t *consumed)
2913{
2914 PyUnicodeObject *v = NULL;
2915 int done;
2916
2917 if (consumed)
2918 *consumed = 0;
2919
2920#ifdef NEED_RETRY
2921 retry:
2922 if (size > INT_MAX)
2923 done = decode_mbcs(&v, s, INT_MAX, 0);
2924 else
2925#endif
2926 done = decode_mbcs(&v, s, (int)size, !consumed);
2927
2928 if (done < 0) {
2929 Py_XDECREF(v);
2930 return NULL;
2931 }
2932
2933 if (consumed)
2934 *consumed += done;
2935
2936#ifdef NEED_RETRY
2937 if (size > INT_MAX) {
2938 s += done;
2939 size -= done;
2940 goto retry;
2941 }
2942#endif
2943
2944 return (PyObject *)v;
2945}
2946
2947PyObject *PyUnicode_DecodeMBCS(const char *s,
2948 Py_ssize_t size,
2949 const char *errors)
2950{
2951 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2952}
2953
2954/*
2955 * Convert unicode into string object (MBCS).
2956 * Returns 0 if succeed, -1 otherwise.
2957 */
2958static int encode_mbcs(PyObject **repr,
2959 const Py_UNICODE *p, /* unicode */
2960 int size) /* size of unicode */
2961{
2962 int mbcssize = 0;
2963 Py_ssize_t n = 0;
2964
2965 assert(size >= 0);
2966
2967 /* First get the size of the result */
2968 if (size > 0) {
2969 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2970 if (mbcssize == 0) {
2971 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2972 return -1;
2973 }
2974 }
2975
2976 if (*repr == NULL) {
2977 /* Create string object */
2978 *repr = PyString_FromStringAndSize(NULL, mbcssize);
2979 if (*repr == NULL)
2980 return -1;
2981 }
2982 else {
2983 /* Extend string object */
2984 n = PyString_Size(*repr);
2985 if (_PyString_Resize(repr, n + mbcssize) < 0)
2986 return -1;
2987 }
2988
2989 /* Do the conversion */
2990 if (size > 0) {
2991 char *s = PyString_AS_STRING(*repr) + n;
2992 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2993 PyErr_SetFromWindowsErrWithFilename(0, NULL);
2994 return -1;
2995 }
2996 }
2997
2998 return 0;
2999}
3000
3001PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3002 Py_ssize_t size,
3003 const char *errors)
3004{
3005 PyObject *repr = NULL;
3006 int ret;
3007
3008#ifdef NEED_RETRY
3009 retry:
3010 if (size > INT_MAX)
3011 ret = encode_mbcs(&repr, p, INT_MAX);
3012 else
3013#endif
3014 ret = encode_mbcs(&repr, p, (int)size);
3015
3016 if (ret < 0) {
3017 Py_XDECREF(repr);
3018 return NULL;
3019 }
3020
3021#ifdef NEED_RETRY
3022 if (size > INT_MAX) {
3023 p += INT_MAX;
3024 size -= INT_MAX;
3025 goto retry;
3026 }
3027#endif
3028
3029 return repr;
3030}
3031
3032PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3033{
3034 if (!PyUnicode_Check(unicode)) {
3035 PyErr_BadArgument();
3036 return NULL;
3037 }
3038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3039 PyUnicode_GET_SIZE(unicode),
3040 NULL);
3041}
3042
3043#undef NEED_RETRY
3044
3045#endif /* MS_WINDOWS */
3046
3047/* --- Character Mapping Codec -------------------------------------------- */
3048
3049PyObject *PyUnicode_DecodeCharmap(const char *s,
3050 Py_ssize_t size,
3051 PyObject *mapping,
3052 const char *errors)
3053{
3054 const char *starts = s;
3055 Py_ssize_t startinpos;
3056 Py_ssize_t endinpos;
3057 Py_ssize_t outpos;
3058 const char *e;
3059 PyUnicodeObject *v;
3060 Py_UNICODE *p;
3061 Py_ssize_t extrachars = 0;
3062 PyObject *errorHandler = NULL;
3063 PyObject *exc = NULL;
3064 Py_UNICODE *mapstring = NULL;
3065 Py_ssize_t maplen = 0;
3066
3067 /* Default to Latin-1 */
3068 if (mapping == NULL)
3069 return PyUnicode_DecodeLatin1(s, size, errors);
3070
3071 v = _PyUnicode_New(size);
3072 if (v == NULL)
3073 goto onError;
3074 if (size == 0)
3075 return (PyObject *)v;
3076 p = PyUnicode_AS_UNICODE(v);
3077 e = s + size;
3078 if (PyUnicode_CheckExact(mapping)) {
3079 mapstring = PyUnicode_AS_UNICODE(mapping);
3080 maplen = PyUnicode_GET_SIZE(mapping);
3081 while (s < e) {
3082 unsigned char ch = *s;
3083 Py_UNICODE x = 0xfffe; /* illegal value */
3084
3085 if (ch < maplen)
3086 x = mapstring[ch];
3087
3088 if (x == 0xfffe) {
3089 /* undefined mapping */
3090 outpos = p-PyUnicode_AS_UNICODE(v);
3091 startinpos = s-starts;
3092 endinpos = startinpos+1;
3093 if (unicode_decode_call_errorhandler(
3094 errors, &errorHandler,
3095 "charmap", "character maps to <undefined>",
3096 starts, size, &startinpos, &endinpos, &exc, &s,
3097 (PyObject **)&v, &outpos, &p)) {
3098 goto onError;
3099 }
3100 continue;
3101 }
3102 *p++ = x;
3103 ++s;
3104 }
3105 }
3106 else {
3107 while (s < e) {
3108 unsigned char ch = *s;
3109 PyObject *w, *x;
3110
3111 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3112 w = PyInt_FromLong((long)ch);
3113 if (w == NULL)
3114 goto onError;
3115 x = PyObject_GetItem(mapping, w);
3116 Py_DECREF(w);
3117 if (x == NULL) {
3118 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3119 /* No mapping found means: mapping is undefined. */
3120 PyErr_Clear();
3121 x = Py_None;
3122 Py_INCREF(x);
3123 } else
3124 goto onError;
3125 }
3126
3127 /* Apply mapping */
3128 if (PyInt_Check(x)) {
3129 long value = PyInt_AS_LONG(x);
3130 if (value < 0 || value > 65535) {
3131 PyErr_SetString(PyExc_TypeError,
3132 "character mapping must be in range(65536)");
3133 Py_DECREF(x);
3134 goto onError;
3135 }
3136 *p++ = (Py_UNICODE)value;
3137 }
3138 else if (x == Py_None) {
3139 /* undefined mapping */
3140 outpos = p-PyUnicode_AS_UNICODE(v);
3141 startinpos = s-starts;
3142 endinpos = startinpos+1;
3143 if (unicode_decode_call_errorhandler(
3144 errors, &errorHandler,
3145 "charmap", "character maps to <undefined>",
3146 starts, size, &startinpos, &endinpos, &exc, &s,
3147 (PyObject **)&v, &outpos, &p)) {
3148 Py_DECREF(x);
3149 goto onError;
3150 }
3151 Py_DECREF(x);
3152 continue;
3153 }
3154 else if (PyUnicode_Check(x)) {
3155 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3156
3157 if (targetsize == 1)
3158 /* 1-1 mapping */
3159 *p++ = *PyUnicode_AS_UNICODE(x);
3160
3161 else if (targetsize > 1) {
3162 /* 1-n mapping */
3163 if (targetsize > extrachars) {
3164 /* resize first */
3165 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3166 Py_ssize_t needed = (targetsize - extrachars) + \
3167 (targetsize << 2);
3168 extrachars += needed;
3169 if (_PyUnicode_Resize(&v,
3170 PyUnicode_GET_SIZE(v) + needed) < 0) {
3171 Py_DECREF(x);
3172 goto onError;
3173 }
3174 p = PyUnicode_AS_UNICODE(v) + oldpos;
3175 }
3176 Py_UNICODE_COPY(p,
3177 PyUnicode_AS_UNICODE(x),
3178 targetsize);
3179 p += targetsize;
3180 extrachars -= targetsize;
3181 }
3182 /* 1-0 mapping: skip the character */
3183 }
3184 else {
3185 /* wrong return value */
3186 PyErr_SetString(PyExc_TypeError,
3187 "character mapping must return integer, None or unicode");
3188 Py_DECREF(x);
3189 goto onError;
3190 }
3191 Py_DECREF(x);
3192 ++s;
3193 }
3194 }
3195 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3196 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3197 goto onError;
3198 Py_XDECREF(errorHandler);
3199 Py_XDECREF(exc);
3200 return (PyObject *)v;
3201
3202 onError:
3203 Py_XDECREF(errorHandler);
3204 Py_XDECREF(exc);
3205 Py_XDECREF(v);
3206 return NULL;
3207}
3208
3209/* Charmap encoding: the lookup table */
3210
3211struct encoding_map{
3212 PyObject_HEAD
3213 unsigned char level1[32];
3214 int count2, count3;
3215 unsigned char level23[1];
3216};
3217
3218static PyObject*
3219encoding_map_size(PyObject *obj, PyObject* args)
3220{
3221 struct encoding_map *map = (struct encoding_map*)obj;
3222 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3223 128*map->count3);
3224}
3225
3226static PyMethodDef encoding_map_methods[] = {
3227 {"size", encoding_map_size, METH_NOARGS,
3228 PyDoc_STR("Return the size (in bytes) of this object") },
3229 { 0 }
3230};
3231
3232static void
3233encoding_map_dealloc(PyObject* o)
3234{
3235 PyObject_FREE(o);
3236}
3237
3238static PyTypeObject EncodingMapType = {
3239 PyObject_HEAD_INIT(NULL)
3240 0, /*ob_size*/
3241 "EncodingMap", /*tp_name*/
3242 sizeof(struct encoding_map), /*tp_basicsize*/
3243 0, /*tp_itemsize*/
3244 /* methods */
3245 encoding_map_dealloc, /*tp_dealloc*/
3246 0, /*tp_print*/
3247 0, /*tp_getattr*/
3248 0, /*tp_setattr*/
3249 0, /*tp_compare*/
3250 0, /*tp_repr*/
3251 0, /*tp_as_number*/
3252 0, /*tp_as_sequence*/
3253 0, /*tp_as_mapping*/
3254 0, /*tp_hash*/
3255 0, /*tp_call*/
3256 0, /*tp_str*/
3257 0, /*tp_getattro*/
3258 0, /*tp_setattro*/
3259 0, /*tp_as_buffer*/
3260 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3261 0, /*tp_doc*/
3262 0, /*tp_traverse*/
3263 0, /*tp_clear*/
3264 0, /*tp_richcompare*/
3265 0, /*tp_weaklistoffset*/
3266 0, /*tp_iter*/
3267 0, /*tp_iternext*/
3268 encoding_map_methods, /*tp_methods*/
3269 0, /*tp_members*/
3270 0, /*tp_getset*/
3271 0, /*tp_base*/
3272 0, /*tp_dict*/
3273 0, /*tp_descr_get*/
3274 0, /*tp_descr_set*/
3275 0, /*tp_dictoffset*/
3276 0, /*tp_init*/
3277 0, /*tp_alloc*/
3278 0, /*tp_new*/
3279 0, /*tp_free*/
3280 0, /*tp_is_gc*/
3281};
3282
3283PyObject*
3284PyUnicode_BuildEncodingMap(PyObject* string)
3285{
3286 Py_UNICODE *decode;
3287 PyObject *result;
3288 struct encoding_map *mresult;
3289 int i;
3290 int need_dict = 0;
3291 unsigned char level1[32];
3292 unsigned char level2[512];
3293 unsigned char *mlevel1, *mlevel2, *mlevel3;
3294 int count2 = 0, count3 = 0;
3295
3296 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3297 PyErr_BadArgument();
3298 return NULL;
3299 }
3300 decode = PyUnicode_AS_UNICODE(string);
3301 memset(level1, 0xFF, sizeof level1);
3302 memset(level2, 0xFF, sizeof level2);
3303
3304 /* If there isn't a one-to-one mapping of NULL to \0,
3305 or if there are non-BMP characters, we need to use
3306 a mapping dictionary. */
3307 if (decode[0] != 0)
3308 need_dict = 1;
3309 for (i = 1; i < 256; i++) {
3310 int l1, l2;
3311 if (decode[i] == 0
3312 #ifdef Py_UNICODE_WIDE
3313 || decode[i] > 0xFFFF
3314 #endif
3315 ) {
3316 need_dict = 1;
3317 break;
3318 }
3319 if (decode[i] == 0xFFFE)
3320 /* unmapped character */
3321 continue;
3322 l1 = decode[i] >> 11;
3323 l2 = decode[i] >> 7;
3324 if (level1[l1] == 0xFF)
3325 level1[l1] = count2++;
3326 if (level2[l2] == 0xFF)
3327 level2[l2] = count3++;
3328 }
3329
3330 if (count2 >= 0xFF || count3 >= 0xFF)
3331 need_dict = 1;
3332
3333 if (need_dict) {
3334 PyObject *result = PyDict_New();
3335 PyObject *key, *value;
3336 if (!result)
3337 return NULL;
3338 for (i = 0; i < 256; i++) {
3339 key = value = NULL;
3340 key = PyInt_FromLong(decode[i]);
3341 value = PyInt_FromLong(i);
3342 if (!key || !value)
3343 goto failed1;
3344 if (PyDict_SetItem(result, key, value) == -1)
3345 goto failed1;
3346 Py_DECREF(key);
3347 Py_DECREF(value);
3348 }
3349 return result;
3350 failed1:
3351 Py_XDECREF(key);
3352 Py_XDECREF(value);
3353 Py_DECREF(result);
3354 return NULL;
3355 }
3356
3357 /* Create a three-level trie */
3358 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3359 16*count2 + 128*count3 - 1);
3360 if (!result)
3361 return PyErr_NoMemory();
3362 PyObject_Init(result, &EncodingMapType);
3363 mresult = (struct encoding_map*)result;
3364 mresult->count2 = count2;
3365 mresult->count3 = count3;
3366 mlevel1 = mresult->level1;
3367 mlevel2 = mresult->level23;
3368 mlevel3 = mresult->level23 + 16*count2;
3369 memcpy(mlevel1, level1, 32);
3370 memset(mlevel2, 0xFF, 16*count2);
3371 memset(mlevel3, 0, 128*count3);
3372 count3 = 0;
3373 for (i = 1; i < 256; i++) {
3374 int o1, o2, o3, i2, i3;
3375 if (decode[i] == 0xFFFE)
3376 /* unmapped character */
3377 continue;
3378 o1 = decode[i]>>11;
3379 o2 = (decode[i]>>7) & 0xF;
3380 i2 = 16*mlevel1[o1] + o2;
3381 if (mlevel2[i2] == 0xFF)
3382 mlevel2[i2] = count3++;
3383 o3 = decode[i] & 0x7F;
3384 i3 = 128*mlevel2[i2] + o3;
3385 mlevel3[i3] = i;
3386 }
3387 return result;
3388}
3389
3390static int
3391encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3392{
3393 struct encoding_map *map = (struct encoding_map*)mapping;
3394 int l1 = c>>11;
3395 int l2 = (c>>7) & 0xF;
3396 int l3 = c & 0x7F;
3397 int i;
3398
3399#ifdef Py_UNICODE_WIDE
3400 if (c > 0xFFFF) {
3401 return -1;
3402 }
3403#endif
3404 if (c == 0)
3405 return 0;
3406 /* level 1*/
3407 i = map->level1[l1];
3408 if (i == 0xFF) {
3409 return -1;
3410 }
3411 /* level 2*/
3412 i = map->level23[16*i+l2];
3413 if (i == 0xFF) {
3414 return -1;
3415 }
3416 /* level 3 */
3417 i = map->level23[16*map->count2 + 128*i + l3];
3418 if (i == 0) {
3419 return -1;
3420 }
3421 return i;
3422}
3423
3424/* Lookup the character ch in the mapping. If the character
3425 can't be found, Py_None is returned (or NULL, if another
3426 error occurred). */
3427static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3428{
3429 PyObject *w = PyInt_FromLong((long)c);
3430 PyObject *x;
3431
3432 if (w == NULL)
3433 return NULL;
3434 x = PyObject_GetItem(mapping, w);
3435 Py_DECREF(w);
3436 if (x == NULL) {
3437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3438 /* No mapping found means: mapping is undefined. */
3439 PyErr_Clear();
3440 x = Py_None;
3441 Py_INCREF(x);
3442 return x;
3443 } else
3444 return NULL;
3445 }
3446 else if (x == Py_None)
3447 return x;
3448 else if (PyInt_Check(x)) {
3449 long value = PyInt_AS_LONG(x);
3450 if (value < 0 || value > 255) {
3451 PyErr_SetString(PyExc_TypeError,
3452 "character mapping must be in range(256)");
3453 Py_DECREF(x);
3454 return NULL;
3455 }
3456 return x;
3457 }
3458 else if (PyString_Check(x))
3459 return x;
3460 else {
3461 /* wrong return value */
3462 PyErr_SetString(PyExc_TypeError,
3463 "character mapping must return integer, None or str");
3464 Py_DECREF(x);
3465 return NULL;
3466 }
3467}
3468
3469static int
3470charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3471{
3472 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3473 /* exponentially overallocate to minimize reallocations */
3474 if (requiredsize < 2*outsize)
3475 requiredsize = 2*outsize;
3476 if (_PyString_Resize(outobj, requiredsize)) {
3477 return 0;
3478 }
3479 return 1;
3480}
3481
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
3485/* lookup the character, put the result in the output string and adjust
3486 various state variables. Reallocate the output string if not enough
3487 space is available. Return a new reference to the object that
3488 was put in the output buffer, or Py_None, if the mapping was undefined
3489 (in which case no character was written) or NULL, if a
3490 reallocation error occurred. The caller must decref the result */
3491static
3492charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3493 PyObject **outobj, Py_ssize_t *outpos)
3494{
3495 PyObject *rep;
3496 char *outstart;
3497 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3498
3499 if (mapping->ob_type == &EncodingMapType) {
3500 int res = encoding_map_lookup(c, mapping);
3501 Py_ssize_t requiredsize = *outpos+1;
3502 if (res == -1)
3503 return enc_FAILED;
3504 if (outsize<requiredsize)
3505 if (!charmapencode_resize(outobj, outpos, requiredsize))
3506 return enc_EXCEPTION;
3507 outstart = PyString_AS_STRING(*outobj);
3508 outstart[(*outpos)++] = (char)res;
3509 return enc_SUCCESS;
3510 }
3511
3512 rep = charmapencode_lookup(c, mapping);
3513 if (rep==NULL)
3514 return enc_EXCEPTION;
3515 else if (rep==Py_None) {
3516 Py_DECREF(rep);
3517 return enc_FAILED;
3518 } else {
3519 if (PyInt_Check(rep)) {
3520 Py_ssize_t requiredsize = *outpos+1;
3521 if (outsize<requiredsize)
3522 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3523 Py_DECREF(rep);
3524 return enc_EXCEPTION;
3525 }
3526 outstart = PyString_AS_STRING(*outobj);
3527 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3528 }
3529 else {
3530 const char *repchars = PyString_AS_STRING(rep);
3531 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3532 Py_ssize_t requiredsize = *outpos+repsize;
3533 if (outsize<requiredsize)
3534 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3535 Py_DECREF(rep);
3536 return enc_EXCEPTION;
3537 }
3538 outstart = PyString_AS_STRING(*outobj);
3539 memcpy(outstart + *outpos, repchars, repsize);
3540 *outpos += repsize;
3541 }
3542 }
3543 Py_DECREF(rep);
3544 return enc_SUCCESS;
3545}
3546
3547/* handle an error in PyUnicode_EncodeCharmap
3548 Return 0 on success, -1 on error */
3549static
3550int charmap_encoding_error(
3551 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3552 PyObject **exceptionObject,
3553 int *known_errorHandler, PyObject **errorHandler, const char *errors,
3554 PyObject **res, Py_ssize_t *respos)
3555{
3556 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3557 Py_ssize_t repsize;
3558 Py_ssize_t newpos;
3559 Py_UNICODE *uni2;
3560 /* startpos for collecting unencodable chars */
3561 Py_ssize_t collstartpos = *inpos;
3562 Py_ssize_t collendpos = *inpos+1;
3563 Py_ssize_t collpos;
3564 char *encoding = "charmap";
3565 char *reason = "character maps to <undefined>";
3566 charmapencode_result x;
3567
3568 /* find all unencodable characters */
3569 while (collendpos < size) {
3570 PyObject *rep;
3571 if (mapping->ob_type == &EncodingMapType) {
3572 int res = encoding_map_lookup(p[collendpos], mapping);
3573 if (res != -1)
3574 break;
3575 ++collendpos;
3576 continue;
3577 }
3578
3579 rep = charmapencode_lookup(p[collendpos], mapping);
3580 if (rep==NULL)
3581 return -1;
3582 else if (rep!=Py_None) {
3583 Py_DECREF(rep);
3584 break;
3585 }
3586 Py_DECREF(rep);
3587 ++collendpos;
3588 }
3589 /* cache callback name lookup
3590 * (if not done yet, i.e. it's the first error) */
3591 if (*known_errorHandler==-1) {
3592 if ((errors==NULL) || (!strcmp(errors, "strict")))
3593 *known_errorHandler = 1;
3594 else if (!strcmp(errors, "replace"))
3595 *known_errorHandler = 2;
3596 else if (!strcmp(errors, "ignore"))
3597 *known_errorHandler = 3;
3598 else if (!strcmp(errors, "xmlcharrefreplace"))
3599 *known_errorHandler = 4;
3600 else
3601 *known_errorHandler = 0;
3602 }
3603 switch (*known_errorHandler) {
3604 case 1: /* strict */
3605 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3606 return -1;
3607 case 2: /* replace */
3608 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3609 x = charmapencode_output('?', mapping, res, respos);
3610 if (x==enc_EXCEPTION) {
3611 return -1;
3612 }
3613 else if (x==enc_FAILED) {
3614 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3615 return -1;
3616 }
3617 }
3618 /* fall through */
3619 case 3: /* ignore */
3620 *inpos = collendpos;
3621 break;
3622 case 4: /* xmlcharrefreplace */
3623 /* generate replacement (temporarily (mis)uses p) */
3624 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3625 char buffer[2+29+1+1];
3626 char *cp;
3627 sprintf(buffer, "&#%d;", (int)p[collpos]);
3628 for (cp = buffer; *cp; ++cp) {
3629 x = charmapencode_output(*cp, mapping, res, respos);
3630 if (x==enc_EXCEPTION)
3631 return -1;
3632 else if (x==enc_FAILED) {
3633 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3634 return -1;
3635 }
3636 }
3637 }
3638 *inpos = collendpos;
3639 break;
3640 default:
3641 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3642 encoding, reason, p, size, exceptionObject,
3643 collstartpos, collendpos, &newpos);
3644 if (repunicode == NULL)
3645 return -1;
3646 /* generate replacement */
3647 repsize = PyUnicode_GET_SIZE(repunicode);
3648 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3649 x = charmapencode_output(*uni2, mapping, res, respos);
3650 if (x==enc_EXCEPTION) {
3651 return -1;
3652 }
3653 else if (x==enc_FAILED) {
3654 Py_DECREF(repunicode);
3655 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3656 return -1;
3657 }
3658 }
3659 *inpos = newpos;
3660 Py_DECREF(repunicode);
3661 }
3662 return 0;
3663}
3664
3665PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3666 Py_ssize_t size,
3667 PyObject *mapping,
3668 const char *errors)
3669{
3670 /* output object */
3671 PyObject *res = NULL;
3672 /* current input position */
3673 Py_ssize_t inpos = 0;
3674 /* current output position */
3675 Py_ssize_t respos = 0;
3676 PyObject *errorHandler = NULL;
3677 PyObject *exc = NULL;
3678 /* the following variable is used for caching string comparisons
3679 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3680 * 3=ignore, 4=xmlcharrefreplace */
3681 int known_errorHandler = -1;
3682
3683 /* Default to Latin-1 */
3684 if (mapping == NULL)
3685 return PyUnicode_EncodeLatin1(p, size, errors);
3686
3687 /* allocate enough for a simple encoding without
3688 replacements, if we need more, we'll resize */
3689 res = PyString_FromStringAndSize(NULL, size);
3690 if (res == NULL)
3691 goto onError;
3692 if (size == 0)
3693 return res;
3694
3695 while (inpos<size) {
3696 /* try to encode it */
3697 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3698 if (x==enc_EXCEPTION) /* error */
3699 goto onError;
3700 if (x==enc_FAILED) { /* unencodable character */
3701 if (charmap_encoding_error(p, size, &inpos, mapping,
3702 &exc,
3703 &known_errorHandler, &errorHandler, errors,
3704 &res, &respos)) {
3705 goto onError;
3706 }
3707 }
3708 else
3709 /* done with this character => adjust input position */
3710 ++inpos;
3711 }
3712
3713 /* Resize if we allocated to much */
3714 if (respos<PyString_GET_SIZE(res)) {
3715 if (_PyString_Resize(&res, respos))
3716 goto onError;
3717 }
3718 Py_XDECREF(exc);
3719 Py_XDECREF(errorHandler);
3720 return res;
3721
3722 onError:
3723 Py_XDECREF(res);
3724 Py_XDECREF(exc);
3725 Py_XDECREF(errorHandler);
3726 return NULL;
3727}
3728
3729PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3730 PyObject *mapping)
3731{
3732 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3733 PyErr_BadArgument();
3734 return NULL;
3735 }
3736 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3737 PyUnicode_GET_SIZE(unicode),
3738 mapping,
3739 NULL);
3740}
3741
3742/* create or adjust a UnicodeTranslateError */
3743static void make_translate_exception(PyObject **exceptionObject,
3744 const Py_UNICODE *unicode, Py_ssize_t size,
3745 Py_ssize_t startpos, Py_ssize_t endpos,
3746 const char *reason)
3747{
3748 if (*exceptionObject == NULL) {
3749 *exceptionObject = PyUnicodeTranslateError_Create(
3750 unicode, size, startpos, endpos, reason);
3751 }
3752 else {
3753 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3754 goto onError;
3755 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3756 goto onError;
3757 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3758 goto onError;
3759 return;
3760 onError:
3761 Py_DECREF(*exceptionObject);
3762 *exceptionObject = NULL;
3763 }
3764}
3765
3766/* raises a UnicodeTranslateError */
3767static void raise_translate_exception(PyObject **exceptionObject,
3768 const Py_UNICODE *unicode, Py_ssize_t size,
3769 Py_ssize_t startpos, Py_ssize_t endpos,
3770 const char *reason)
3771{
3772 make_translate_exception(exceptionObject,
3773 unicode, size, startpos, endpos, reason);
3774 if (*exceptionObject != NULL)
3775 PyCodec_StrictErrors(*exceptionObject);
3776}
3777
3778/* error handling callback helper:
3779 build arguments, call the callback and check the arguments,
3780 put the result into newpos and return the replacement string, which
3781 has to be freed by the caller */
3782static PyObject *unicode_translate_call_errorhandler(const char *errors,
3783 PyObject **errorHandler,
3784 const char *reason,
3785 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3786 Py_ssize_t startpos, Py_ssize_t endpos,
3787 Py_ssize_t *newpos)
3788{
3789 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3790
3791 Py_ssize_t i_newpos;
3792 PyObject *restuple;
3793 PyObject *resunicode;
3794
3795 if (*errorHandler == NULL) {
3796 *errorHandler = PyCodec_LookupError(errors);
3797 if (*errorHandler == NULL)
3798 return NULL;
3799 }
3800
3801 make_translate_exception(exceptionObject,
3802 unicode, size, startpos, endpos, reason);
3803 if (*exceptionObject == NULL)
3804 return NULL;
3805
3806 restuple = PyObject_CallFunctionObjArgs(
3807 *errorHandler, *exceptionObject, NULL);
3808 if (restuple == NULL)
3809 return NULL;
3810 if (!PyTuple_Check(restuple)) {
3811 PyErr_Format(PyExc_TypeError, &argparse[4]);
3812 Py_DECREF(restuple);
3813 return NULL;
3814 }
3815 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3816 &resunicode, &i_newpos)) {
3817 Py_DECREF(restuple);
3818 return NULL;
3819 }
3820 if (i_newpos<0)
3821 *newpos = size+i_newpos;
3822 else
3823 *newpos = i_newpos;
3824 if (*newpos<0 || *newpos>size) {
3825 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3826 Py_DECREF(restuple);
3827 return NULL;
3828 }
3829 Py_INCREF(resunicode);
3830 Py_DECREF(restuple);
3831 return resunicode;
3832}
3833
3834/* Lookup the character ch in the mapping and put the result in result,
3835 which must be decrefed by the caller.
3836 Return 0 on success, -1 on error */
3837static
3838int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3839{
3840 PyObject *w = PyInt_FromLong((long)c);
3841 PyObject *x;
3842
3843 if (w == NULL)
3844 return -1;
3845 x = PyObject_GetItem(mapping, w);
3846 Py_DECREF(w);
3847 if (x == NULL) {
3848 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3849 /* No mapping found means: use 1:1 mapping. */
3850 PyErr_Clear();
3851 *result = NULL;
3852 return 0;
3853 } else
3854 return -1;
3855 }
3856 else if (x == Py_None) {
3857 *result = x;
3858 return 0;
3859 }
3860 else if (PyInt_Check(x)) {
3861 long value = PyInt_AS_LONG(x);
3862 long max = PyUnicode_GetMax();
3863 if (value < 0 || value > max) {
3864 PyErr_Format(PyExc_TypeError,
3865 "character mapping must be in range(0x%lx)", max+1);
3866 Py_DECREF(x);
3867 return -1;
3868 }
3869 *result = x;
3870 return 0;
3871 }
3872 else if (PyUnicode_Check(x)) {
3873 *result = x;
3874 return 0;
3875 }
3876 else {
3877 /* wrong return value */
3878 PyErr_SetString(PyExc_TypeError,
3879 "character mapping must return integer, None or unicode");
3880 Py_DECREF(x);
3881 return -1;
3882 }
3883}
3884/* ensure that *outobj is at least requiredsize characters long,
3885if not reallocate and adjust various state variables.
3886Return 0 on success, -1 on error */
3887static
3888int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3889 Py_ssize_t requiredsize)
3890{
3891 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3892 if (requiredsize > oldsize) {
3893 /* remember old output position */
3894 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3895 /* exponentially overallocate to minimize reallocations */
3896 if (requiredsize < 2 * oldsize)
3897 requiredsize = 2 * oldsize;
3898 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3899 return -1;
3900 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3901 }
3902 return 0;
3903}
3904/* lookup the character, put the result in the output string and adjust
3905 various state variables. Return a new reference to the object that
3906 was put in the output buffer in *result, or Py_None, if the mapping was
3907 undefined (in which case no character was written).
3908 The called must decref result.
3909 Return 0 on success, -1 on error. */
3910static
3911int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3912 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3913 PyObject **res)
3914{
3915 if (charmaptranslate_lookup(*curinp, mapping, res))
3916 return -1;
3917 if (*res==NULL) {
3918 /* not found => default to 1:1 mapping */
3919 *(*outp)++ = *curinp;
3920 }
3921 else if (*res==Py_None)
3922 ;
3923 else if (PyInt_Check(*res)) {
3924 /* no overflow check, because we know that the space is enough */
3925 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3926 }
3927 else if (PyUnicode_Check(*res)) {
3928 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3929 if (repsize==1) {
3930 /* no overflow check, because we know that the space is enough */
3931 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3932 }
3933 else if (repsize!=0) {
3934 /* more than one character */
3935 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3936 (insize - (curinp-startinp)) +
3937 repsize - 1;
3938 if (charmaptranslate_makespace(outobj, outp, requiredsize))
3939 return -1;
3940 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3941 *outp += repsize;
3942 }
3943 }
3944 else
3945 return -1;
3946 return 0;
3947}
3948
3949PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3950 Py_ssize_t size,
3951 PyObject *mapping,
3952 const char *errors)
3953{
3954 /* output object */
3955 PyObject *res = NULL;
3956 /* pointers to the beginning and end+1 of input */
3957 const Py_UNICODE *startp = p;
3958 const Py_UNICODE *endp = p + size;
3959 /* pointer into the output */
3960 Py_UNICODE *str;
3961 /* current output position */
3962 Py_ssize_t respos = 0;
3963 char *reason = "character maps to <undefined>";
3964 PyObject *errorHandler = NULL;
3965 PyObject *exc = NULL;
3966 /* the following variable is used for caching string comparisons
3967 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3968 * 3=ignore, 4=xmlcharrefreplace */
3969 int known_errorHandler = -1;
3970
3971 if (mapping == NULL) {
3972 PyErr_BadArgument();
3973 return NULL;
3974 }
3975
3976 /* allocate enough for a simple 1:1 translation without
3977 replacements, if we need more, we'll resize */
3978 res = PyUnicode_FromUnicode(NULL, size);
3979 if (res == NULL)
3980 goto onError;
3981 if (size == 0)
3982 return res;
3983 str = PyUnicode_AS_UNICODE(res);
3984
3985 while (p<endp) {
3986 /* try to encode it */
3987 PyObject *x = NULL;
3988 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3989 Py_XDECREF(x);
3990 goto onError;
3991 }
3992 Py_XDECREF(x);
3993 if (x!=Py_None) /* it worked => adjust input pointer */
3994 ++p;
3995 else { /* untranslatable character */
3996 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3997 Py_ssize_t repsize;
3998 Py_ssize_t newpos;
3999 Py_UNICODE *uni2;
4000 /* startpos for collecting untranslatable chars */
4001 const Py_UNICODE *collstart = p;
4002 const Py_UNICODE *collend = p+1;
4003 const Py_UNICODE *coll;
4004
4005 /* find all untranslatable characters */
4006 while (collend < endp) {
4007 if (charmaptranslate_lookup(*collend, mapping, &x))
4008 goto onError;
4009 Py_XDECREF(x);
4010 if (x!=Py_None)
4011 break;
4012 ++collend;
4013 }
4014 /* cache callback name lookup
4015 * (if not done yet, i.e. it's the first error) */
4016 if (known_errorHandler==-1) {
4017 if ((errors==NULL) || (!strcmp(errors, "strict")))
4018 known_errorHandler = 1;
4019 else if (!strcmp(errors, "replace"))
4020 known_errorHandler = 2;
4021 else if (!strcmp(errors, "ignore"))
4022 known_errorHandler = 3;
4023 else if (!strcmp(errors, "xmlcharrefreplace"))
4024 known_errorHandler = 4;
4025 else
4026 known_errorHandler = 0;
4027 }
4028 switch (known_errorHandler) {
4029 case 1: /* strict */
4030 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4031 goto onError;
4032 case 2: /* replace */
4033 /* No need to check for space, this is a 1:1 replacement */
4034 for (coll = collstart; coll<collend; ++coll)
4035 *str++ = '?';
4036 /* fall through */
4037 case 3: /* ignore */
4038 p = collend;
4039 break;
4040 case 4: /* xmlcharrefreplace */
4041 /* generate replacement (temporarily (mis)uses p) */
4042 for (p = collstart; p < collend; ++p) {
4043 char buffer[2+29+1+1];
4044 char *cp;
4045 sprintf(buffer, "&#%d;", (int)*p);
4046 if (charmaptranslate_makespace(&res, &str,
4047 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4048 goto onError;
4049 for (cp = buffer; *cp; ++cp)
4050 *str++ = *cp;
4051 }
4052 p = collend;
4053 break;
4054 default:
4055 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4056 reason, startp, size, &exc,
4057 collstart-startp, collend-startp, &newpos);
4058 if (repunicode == NULL)
4059 goto onError;
4060 /* generate replacement */
4061 repsize = PyUnicode_GET_SIZE(repunicode);
4062 if (charmaptranslate_makespace(&res, &str,
4063 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4064 Py_DECREF(repunicode);
4065 goto onError;
4066 }
4067 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4068 *str++ = *uni2;
4069 p = startp + newpos;
4070 Py_DECREF(repunicode);
4071 }
4072 }
4073 }
4074 /* Resize if we allocated to much */
4075 respos = str-PyUnicode_AS_UNICODE(res);
4076 if (respos<PyUnicode_GET_SIZE(res)) {
4077 if (_PyUnicode_Resize(&res, respos) < 0)
4078 goto onError;
4079 }
4080 Py_XDECREF(exc);
4081 Py_XDECREF(errorHandler);
4082 return res;
4083
4084 onError:
4085 Py_XDECREF(res);
4086 Py_XDECREF(exc);
4087 Py_XDECREF(errorHandler);
4088 return NULL;
4089}
4090
4091PyObject *PyUnicode_Translate(PyObject *str,
4092 PyObject *mapping,
4093 const char *errors)
4094{
4095 PyObject *result;
4096
4097 str = PyUnicode_FromObject(str);
4098 if (str == NULL)
4099 goto onError;
4100 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4101 PyUnicode_GET_SIZE(str),
4102 mapping,
4103 errors);
4104 Py_DECREF(str);
4105 return result;
4106
4107 onError:
4108 Py_XDECREF(str);
4109 return NULL;
4110}
4111
4112/* --- Decimal Encoder ---------------------------------------------------- */
4113
4114int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4115 Py_ssize_t length,
4116 char *output,
4117 const char *errors)
4118{
4119 Py_UNICODE *p, *end;
4120 PyObject *errorHandler = NULL;
4121 PyObject *exc = NULL;
4122 const char *encoding = "decimal";
4123 const char *reason = "invalid decimal Unicode string";
4124 /* the following variable is used for caching string comparisons
4125 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4126 int known_errorHandler = -1;
4127
4128 if (output == NULL) {
4129 PyErr_BadArgument();
4130 return -1;
4131 }
4132
4133 p = s;
4134 end = s + length;
4135 while (p < end) {
4136 register Py_UNICODE ch = *p;
4137 int decimal;
4138 PyObject *repunicode;
4139 Py_ssize_t repsize;
4140 Py_ssize_t newpos;
4141 Py_UNICODE *uni2;
4142 Py_UNICODE *collstart;
4143 Py_UNICODE *collend;
4144
4145 if (Py_UNICODE_ISSPACE(ch)) {
4146 *output++ = ' ';
4147 ++p;
4148 continue;
4149 }
4150 decimal = Py_UNICODE_TODECIMAL(ch);
4151 if (decimal >= 0) {
4152 *output++ = '0' + decimal;
4153 ++p;
4154 continue;
4155 }
4156 if (0 < ch && ch < 256) {
4157 *output++ = (char)ch;
4158 ++p;
4159 continue;
4160 }
4161 /* All other characters are considered unencodable */
4162 collstart = p;
4163 collend = p+1;
4164 while (collend < end) {
4165 if ((0 < *collend && *collend < 256) ||
4166 !Py_UNICODE_ISSPACE(*collend) ||
4167 Py_UNICODE_TODECIMAL(*collend))
4168 break;
4169 }
4170 /* cache callback name lookup
4171 * (if not done yet, i.e. it's the first error) */
4172 if (known_errorHandler==-1) {
4173 if ((errors==NULL) || (!strcmp(errors, "strict")))
4174 known_errorHandler = 1;
4175 else if (!strcmp(errors, "replace"))
4176 known_errorHandler = 2;
4177 else if (!strcmp(errors, "ignore"))
4178 known_errorHandler = 3;
4179 else if (!strcmp(errors, "xmlcharrefreplace"))
4180 known_errorHandler = 4;
4181 else
4182 known_errorHandler = 0;
4183 }
4184 switch (known_errorHandler) {
4185 case 1: /* strict */
4186 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4187 goto onError;
4188 case 2: /* replace */
4189 for (p = collstart; p < collend; ++p)
4190 *output++ = '?';
4191 /* fall through */
4192 case 3: /* ignore */
4193 p = collend;
4194 break;
4195 case 4: /* xmlcharrefreplace */
4196 /* generate replacement (temporarily (mis)uses p) */
4197 for (p = collstart; p < collend; ++p)
4198 output += sprintf(output, "&#%d;", (int)*p);
4199 p = collend;
4200 break;
4201 default:
4202 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4203 encoding, reason, s, length, &exc,
4204 collstart-s, collend-s, &newpos);
4205 if (repunicode == NULL)
4206 goto onError;
4207 /* generate replacement */
4208 repsize = PyUnicode_GET_SIZE(repunicode);
4209 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4210 Py_UNICODE ch = *uni2;
4211 if (Py_UNICODE_ISSPACE(ch))
4212 *output++ = ' ';
4213 else {
4214 decimal = Py_UNICODE_TODECIMAL(ch);
4215 if (decimal >= 0)
4216 *output++ = '0' + decimal;
4217 else if (0 < ch && ch < 256)
4218 *output++ = (char)ch;
4219 else {
4220 Py_DECREF(repunicode);
4221 raise_encode_exception(&exc, encoding,
4222 s, length, collstart-s, collend-s, reason);
4223 goto onError;
4224 }
4225 }
4226 }
4227 p = s + newpos;
4228 Py_DECREF(repunicode);
4229 }
4230 }
4231 /* 0-terminate the output string */
4232 *output++ = '\0';
4233 Py_XDECREF(exc);
4234 Py_XDECREF(errorHandler);
4235 return 0;
4236
4237 onError:
4238 Py_XDECREF(exc);
4239 Py_XDECREF(errorHandler);
4240 return -1;
4241}
4242
4243/* --- Helpers ------------------------------------------------------------ */
4244
4245#define STRINGLIB_CHAR Py_UNICODE
4246
4247#define STRINGLIB_LEN PyUnicode_GET_SIZE
4248#define STRINGLIB_NEW PyUnicode_FromUnicode
4249#define STRINGLIB_STR PyUnicode_AS_UNICODE
4250
4251Py_LOCAL_INLINE(int)
4252STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4253{
4254 if (str[0] != other[0])
4255 return 1;
4256 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4257}
4258
4259#define STRINGLIB_EMPTY unicode_empty
4260
4261#include "stringlib/fastsearch.h"
4262
4263#include "stringlib/count.h"
4264#include "stringlib/find.h"
4265#include "stringlib/partition.h"
4266
/* Helper macro to fix up start/end slice values against (obj)->length.

   It operates on variables named `start` and `end` that must be in scope
   at the point of use.  A negative start is made relative to the end of
   the object and then clamped to 0.  An end beyond the object's length is
   clamped to the length; a negative end is made relative to the end of
   the object and then clamped to 0.

   The body is wrapped in do { ... } while (0) so the macro expands to a
   single statement and stays correct under an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4279
4280Py_ssize_t PyUnicode_Count(PyObject *str,
4281 PyObject *substr,
4282 Py_ssize_t start,
4283 Py_ssize_t end)
4284{
4285 Py_ssize_t result;
4286 PyUnicodeObject* str_obj;
4287 PyUnicodeObject* sub_obj;
4288
4289 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4290 if (!str_obj)
4291 return -1;
4292 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4293 if (!sub_obj) {
4294 Py_DECREF(str_obj);
4295 return -1;
4296 }
4297
4298 FIX_START_END(str_obj);
4299
4300 result = stringlib_count(
4301 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4302 );
4303
4304 Py_DECREF(sub_obj);
4305 Py_DECREF(str_obj);
4306
4307 return result;
4308}
4309
4310Py_ssize_t PyUnicode_Find(PyObject *str,
4311 PyObject *sub,
4312 Py_ssize_t start,
4313 Py_ssize_t end,
4314 int direction)
4315{
4316 Py_ssize_t result;
4317
4318 str = PyUnicode_FromObject(str);
4319 if (!str)
4320 return -2;
4321 sub = PyUnicode_FromObject(sub);
4322 if (!sub) {
4323 Py_DECREF(str);
4324 return -2;
4325 }
4326
4327 if (direction > 0)
4328 result = stringlib_find_slice(
4329 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4330 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4331 start, end
4332 );
4333 else
4334 result = stringlib_rfind_slice(
4335 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337 start, end
4338 );
4339
4340 Py_DECREF(str);
4341 Py_DECREF(sub);
4342
4343 return result;
4344}
4345
4346static
4347int tailmatch(PyUnicodeObject *self,
4348 PyUnicodeObject *substring,
4349 Py_ssize_t start,
4350 Py_ssize_t end,
4351 int direction)
4352{
4353 if (substring->length == 0)
4354 return 1;
4355
4356 FIX_START_END(self);
4357
4358 end -= substring->length;
4359 if (end < start)
4360 return 0;
4361
4362 if (direction > 0) {
4363 if (Py_UNICODE_MATCH(self, end, substring))
4364 return 1;
4365 } else {
4366 if (Py_UNICODE_MATCH(self, start, substring))
4367 return 1;
4368 }
4369
4370 return 0;
4371}
4372
4373Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4374 PyObject *substr,
4375 Py_ssize_t start,
4376 Py_ssize_t end,
4377 int direction)
4378{
4379 Py_ssize_t result;
4380
4381 str = PyUnicode_FromObject(str);
4382 if (str == NULL)
4383 return -1;
4384 substr = PyUnicode_FromObject(substr);
4385 if (substr == NULL) {
4386 Py_DECREF(str);
4387 return -1;
4388 }
4389
4390 result = tailmatch((PyUnicodeObject *)str,
4391 (PyUnicodeObject *)substr,
4392 start, end, direction);
4393 Py_DECREF(str);
4394 Py_DECREF(substr);
4395 return result;
4396}
4397
4398/* Apply fixfct filter to the Unicode object self and return a
4399 reference to the modified object */
4400
4401static
4402PyObject *fixup(PyUnicodeObject *self,
4403 int (*fixfct)(PyUnicodeObject *s))
4404{
4405
4406 PyUnicodeObject *u;
4407
4408 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4409 if (u == NULL)
4410 return NULL;
4411
4412 Py_UNICODE_COPY(u->str, self->str, self->length);
4413
4414 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4415 /* fixfct should return TRUE if it modified the buffer. If
4416 FALSE, return a reference to the original buffer instead
4417 (to save space, not time) */
4418 Py_INCREF(self);
4419 Py_DECREF(u);
4420 return (PyObject*) self;
4421 }
4422 return (PyObject*) u;
4423}
4424
4425static
4426int fixupper(PyUnicodeObject *self)
4427{
4428 Py_ssize_t len = self->length;
4429 Py_UNICODE *s = self->str;
4430 int status = 0;
4431
4432 while (len-- > 0) {
4433 register Py_UNICODE ch;
4434
4435 ch = Py_UNICODE_TOUPPER(*s);
4436 if (ch != *s) {
4437 status = 1;
4438 *s = ch;
4439 }
4440 s++;
4441 }
4442
4443 return status;
4444}
4445
4446static
4447int fixlower(PyUnicodeObject *self)
4448{
4449 Py_ssize_t len = self->length;
4450 Py_UNICODE *s = self->str;
4451 int status = 0;
4452
4453 while (len-- > 0) {
4454 register Py_UNICODE ch;
4455
4456 ch = Py_UNICODE_TOLOWER(*s);
4457 if (ch != *s) {
4458 status = 1;
4459 *s = ch;
4460 }
4461 s++;
4462 }
4463
4464 return status;
4465}
4466
4467static
4468int fixswapcase(PyUnicodeObject *self)
4469{
4470 Py_ssize_t len = self->length;
4471 Py_UNICODE *s = self->str;
4472 int status = 0;
4473
4474 while (len-- > 0) {
4475 if (Py_UNICODE_ISUPPER(*s)) {
4476 *s = Py_UNICODE_TOLOWER(*s);
4477 status = 1;
4478 } else if (Py_UNICODE_ISLOWER(*s)) {
4479 *s = Py_UNICODE_TOUPPER(*s);
4480 status = 1;
4481 }
4482 s++;
4483 }
4484
4485 return status;
4486}
4487
4488static
4489int fixcapitalize(PyUnicodeObject *self)
4490{
4491 Py_ssize_t len = self->length;
4492 Py_UNICODE *s = self->str;
4493 int status = 0;
4494
4495 if (len == 0)
4496 return 0;
4497 if (Py_UNICODE_ISLOWER(*s)) {
4498 *s = Py_UNICODE_TOUPPER(*s);
4499 status = 1;
4500 }
4501 s++;
4502 while (--len > 0) {
4503 if (Py_UNICODE_ISUPPER(*s)) {
4504 *s = Py_UNICODE_TOLOWER(*s);
4505 status = 1;
4506 }
4507 s++;
4508 }
4509 return status;
4510}
4511
4512static
4513int fixtitle(PyUnicodeObject *self)
4514{
4515 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4516 register Py_UNICODE *e;
4517 int previous_is_cased;
4518
4519 /* Shortcut for single character strings */
4520 if (PyUnicode_GET_SIZE(self) == 1) {
4521 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4522 if (*p != ch) {
4523 *p = ch;
4524 return 1;
4525 }
4526 else
4527 return 0;
4528 }
4529
4530 e = p + PyUnicode_GET_SIZE(self);
4531 previous_is_cased = 0;
4532 for (; p < e; p++) {
4533 register const Py_UNICODE ch = *p;
4534
4535 if (previous_is_cased)
4536 *p = Py_UNICODE_TOLOWER(ch);
4537 else
4538 *p = Py_UNICODE_TOTITLE(ch);
4539
4540 if (Py_UNICODE_ISLOWER(ch) ||
4541 Py_UNICODE_ISUPPER(ch) ||
4542 Py_UNICODE_ISTITLE(ch))
4543 previous_is_cased = 1;
4544 else
4545 previous_is_cased = 0;
4546 }
4547 return 1;
4548}
4549
4550PyObject *
4551PyUnicode_Join(PyObject *separator, PyObject *seq)
4552{
4553 PyObject *internal_separator = NULL;
4554 const Py_UNICODE blank = ' ';
4555 const Py_UNICODE *sep = &blank;
4556 Py_ssize_t seplen = 1;
4557 PyUnicodeObject *res = NULL; /* the result */
4558 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4559 Py_ssize_t res_used; /* # used bytes */
4560 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4561 PyObject *fseq; /* PySequence_Fast(seq) */
4562 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
4563 PyObject *item;
4564 Py_ssize_t i;
4565
4566 fseq = PySequence_Fast(seq, "");
4567 if (fseq == NULL) {
4568 return NULL;
4569 }
4570
4571 /* Grrrr. A codec may be invoked to convert str objects to
4572 * Unicode, and so it's possible to call back into Python code
4573 * during PyUnicode_FromObject(), and so it's possible for a sick
4574 * codec to change the size of fseq (if seq is a list). Therefore
4575 * we have to keep refetching the size -- can't assume seqlen
4576 * is invariant.
4577 */
4578 seqlen = PySequence_Fast_GET_SIZE(fseq);
4579 /* If empty sequence, return u"". */
4580 if (seqlen == 0) {
4581 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4582 goto Done;
4583 }
4584 /* If singleton sequence with an exact Unicode, return that. */
4585 if (seqlen == 1) {
4586 item = PySequence_Fast_GET_ITEM(fseq, 0);
4587 if (PyUnicode_CheckExact(item)) {
4588 Py_INCREF(item);
4589 res = (PyUnicodeObject *)item;
4590 goto Done;
4591 }
4592 }
4593
4594 /* At least two items to join, or one that isn't exact Unicode. */
4595 if (seqlen > 1) {
4596 /* Set up sep and seplen -- they're needed. */
4597 if (separator == NULL) {
4598 sep = &blank;
4599 seplen = 1;
4600 }
4601 else {
4602 internal_separator = PyUnicode_FromObject(separator);
4603 if (internal_separator == NULL)
4604 goto onError;
4605 sep = PyUnicode_AS_UNICODE(internal_separator);
4606 seplen = PyUnicode_GET_SIZE(internal_separator);
4607 /* In case PyUnicode_FromObject() mutated seq. */
4608 seqlen = PySequence_Fast_GET_SIZE(fseq);
4609 }
4610 }
4611
4612 /* Get space. */
4613 res = _PyUnicode_New(res_alloc);
4614 if (res == NULL)
4615 goto onError;
4616 res_p = PyUnicode_AS_UNICODE(res);
4617 res_used = 0;
4618
4619 for (i = 0; i < seqlen; ++i) {
4620 Py_ssize_t itemlen;
4621 Py_ssize_t new_res_used;
4622
4623 item = PySequence_Fast_GET_ITEM(fseq, i);
4624 /* Convert item to Unicode. */
4625 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4626 PyErr_Format(PyExc_TypeError,
4627 "sequence item %zd: expected string or Unicode,"
4628 " %.80s found",
4629 i, item->ob_type->tp_name);
4630 goto onError;
4631 }
4632 item = PyUnicode_FromObject(item);
4633 if (item == NULL)
4634 goto onError;
4635 /* We own a reference to item from here on. */
4636
4637 /* In case PyUnicode_FromObject() mutated seq. */
4638 seqlen = PySequence_Fast_GET_SIZE(fseq);
4639
4640 /* Make sure we have enough space for the separator and the item. */
4641 itemlen = PyUnicode_GET_SIZE(item);
4642 new_res_used = res_used + itemlen;
4643 if (new_res_used < 0)
4644 goto Overflow;
4645 if (i < seqlen - 1) {
4646 new_res_used += seplen;
4647 if (new_res_used < 0)
4648 goto Overflow;
4649 }
4650 if (new_res_used > res_alloc) {
4651 /* double allocated size until it's big enough */
4652 do {
4653 res_alloc += res_alloc;
4654 if (res_alloc <= 0)
4655 goto Overflow;
4656 } while (new_res_used > res_alloc);
4657 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4658 Py_DECREF(item);
4659 goto onError;
4660 }
4661 res_p = PyUnicode_AS_UNICODE(res) + res_used;
4662 }
4663
4664 /* Copy item, and maybe the separator. */
4665 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4666 res_p += itemlen;
4667 if (i < seqlen - 1) {
4668 Py_UNICODE_COPY(res_p, sep, seplen);
4669 res_p += seplen;
4670 }
4671 Py_DECREF(item);
4672 res_used = new_res_used;
4673 }
4674
4675 /* Shrink res to match the used area; this probably can't fail,
4676 * but it's cheap to check.
4677 */
4678 if (_PyUnicode_Resize(&res, res_used) < 0)
4679 goto onError;
4680
4681 Done:
4682 Py_XDECREF(internal_separator);
4683 Py_DECREF(fseq);
4684 return (PyObject *)res;
4685
4686 Overflow:
4687 PyErr_SetString(PyExc_OverflowError,
4688 "join() result is too long for a Python string");
4689 Py_DECREF(item);
4690 /* fall through */
4691
4692 onError:
4693 Py_XDECREF(internal_separator);
4694 Py_DECREF(fseq);
4695 Py_XDECREF(res);
4696 return NULL;
4697}
4698
4699static
4700PyUnicodeObject *pad(PyUnicodeObject *self,
4701 Py_ssize_t left,
4702 Py_ssize_t right,
4703 Py_UNICODE fill)
4704{
4705 PyUnicodeObject *u;
4706
4707 if (left < 0)
4708 left = 0;
4709 if (right < 0)
4710 right = 0;
4711
4712 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4713 Py_INCREF(self);
4714 return self;
4715 }
4716
4717 u = _PyUnicode_New(left + self->length + right);
4718 if (u) {
4719 if (left)
4720 Py_UNICODE_FILL(u->str, fill, left);
4721 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4722 if (right)
4723 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4724 }
4725
4726 return u;
4727}
4728
/* Append the slice data[left:right] to `list` as a new unicode object.
   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label, which is jumped
   to on failure.  Wrapped in do/while(0) so the macro always expands to
   a single statement and cannot capture a following `else`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4739
4740static
4741PyObject *split_whitespace(PyUnicodeObject *self,
4742 PyObject *list,
4743 Py_ssize_t maxcount)
4744{
4745 register Py_ssize_t i;
4746 register Py_ssize_t j;
4747 Py_ssize_t len = self->length;
4748 PyObject *str;
4749
4750 for (i = j = 0; i < len; ) {
4751 /* find a token */
4752 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4753 i++;
4754 j = i;
4755 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4756 i++;
4757 if (j < i) {
4758 if (maxcount-- <= 0)
4759 break;
4760 SPLIT_APPEND(self->str, j, i);
4761 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4762 i++;
4763 j = i;
4764 }
4765 }
4766 if (j < len) {
4767 SPLIT_APPEND(self->str, j, len);
4768 }
4769 return list;
4770
4771 onError:
4772 Py_DECREF(list);
4773 return NULL;
4774}
4775
4776PyObject *PyUnicode_Splitlines(PyObject *string,
4777 int keepends)
4778{
4779 register Py_ssize_t i;
4780 register Py_ssize_t j;
4781 Py_ssize_t len;
4782 PyObject *list;
4783 PyObject *str;
4784 Py_UNICODE *data;
4785
4786 string = PyUnicode_FromObject(string);
4787 if (string == NULL)
4788 return NULL;
4789 data = PyUnicode_AS_UNICODE(string);
4790 len = PyUnicode_GET_SIZE(string);
4791
4792 list = PyList_New(0);
4793 if (!list)
4794 goto onError;
4795
4796 for (i = j = 0; i < len; ) {
4797 Py_ssize_t eol;
4798
4799 /* Find a line and append it */
4800 while (i < len && !BLOOM_LINEBREAK(data[i]))
4801 i++;
4802
4803 /* Skip the line break reading CRLF as one line break */
4804 eol = i;
4805 if (i < len) {
4806 if (data[i] == '\r' && i + 1 < len &&
4807 data[i+1] == '\n')
4808 i += 2;
4809 else
4810 i++;
4811 if (keepends)
4812 eol = i;
4813 }
4814 SPLIT_APPEND(data, j, eol);
4815 j = i;
4816 }
4817 if (j < len) {
4818 SPLIT_APPEND(data, j, len);
4819 }
4820
4821 Py_DECREF(string);
4822 return list;
4823
4824 onError:
4825 Py_XDECREF(list);
4826 Py_DECREF(string);
4827 return NULL;
4828}
4829
4830static
4831PyObject *split_char(PyUnicodeObject *self,
4832 PyObject *list,
4833 Py_UNICODE ch,
4834 Py_ssize_t maxcount)
4835{
4836 register Py_ssize_t i;
4837 register Py_ssize_t j;
4838 Py_ssize_t len = self->length;
4839 PyObject *str;
4840
4841 for (i = j = 0; i < len; ) {
4842 if (self->str[i] == ch) {
4843 if (maxcount-- <= 0)
4844 break;
4845 SPLIT_APPEND(self->str, j, i);
4846 i = j = i + 1;
4847 } else
4848 i++;
4849 }
4850 if (j <= len) {
4851 SPLIT_APPEND(self->str, j, len);
4852 }
4853 return list;
4854
4855 onError:
4856 Py_DECREF(list);
4857 return NULL;
4858}
4859
4860static
4861PyObject *split_substring(PyUnicodeObject *self,
4862 PyObject *list,
4863 PyUnicodeObject *substring,
4864 Py_ssize_t maxcount)
4865{
4866 register Py_ssize_t i;
4867 register Py_ssize_t j;
4868 Py_ssize_t len = self->length;
4869 Py_ssize_t sublen = substring->length;
4870 PyObject *str;
4871
4872 for (i = j = 0; i <= len - sublen; ) {
4873 if (Py_UNICODE_MATCH(self, i, substring)) {
4874 if (maxcount-- <= 0)
4875 break;
4876 SPLIT_APPEND(self->str, j, i);
4877 i = j = i + sublen;
4878 } else
4879 i++;
4880 }
4881 if (j <= len) {
4882 SPLIT_APPEND(self->str, j, len);
4883 }
4884 return list;
4885
4886 onError:
4887 Py_DECREF(list);
4888 return NULL;
4889}
4890
4891static
4892PyObject *rsplit_whitespace(PyUnicodeObject *self,
4893 PyObject *list,
4894 Py_ssize_t maxcount)
4895{
4896 register Py_ssize_t i;
4897 register Py_ssize_t j;
4898 Py_ssize_t len = self->length;
4899 PyObject *str;
4900
4901 for (i = j = len - 1; i >= 0; ) {
4902 /* find a token */
4903 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4904 i--;
4905 j = i;
4906 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4907 i--;
4908 if (j > i) {
4909 if (maxcount-- <= 0)
4910 break;
4911 SPLIT_APPEND(self->str, i + 1, j + 1);
4912 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4913 i--;
4914 j = i;
4915 }
4916 }
4917 if (j >= 0) {
4918 SPLIT_APPEND(self->str, 0, j + 1);
4919 }
4920 if (PyList_Reverse(list) < 0)
4921 goto onError;
4922 return list;
4923
4924 onError:
4925 Py_DECREF(list);
4926 return NULL;
4927}
4928
4929static
4930PyObject *rsplit_char(PyUnicodeObject *self,
4931 PyObject *list,
4932 Py_UNICODE ch,
4933 Py_ssize_t maxcount)
4934{
4935 register Py_ssize_t i;
4936 register Py_ssize_t j;
4937 Py_ssize_t len = self->length;
4938 PyObject *str;
4939
4940 for (i = j = len - 1; i >= 0; ) {
4941 if (self->str[i] == ch) {
4942 if (maxcount-- <= 0)
4943 break;
4944 SPLIT_APPEND(self->str, i + 1, j + 1);
4945 j = i = i - 1;
4946 } else
4947 i--;
4948 }
4949 if (j >= -1) {
4950 SPLIT_APPEND(self->str, 0, j + 1);
4951 }
4952 if (PyList_Reverse(list) < 0)
4953 goto onError;
4954 return list;
4955
4956 onError:
4957 Py_DECREF(list);
4958 return NULL;
4959}
4960
4961static
4962PyObject *rsplit_substring(PyUnicodeObject *self,
4963 PyObject *list,
4964 PyUnicodeObject *substring,
4965 Py_ssize_t maxcount)
4966{
4967 register Py_ssize_t i;
4968 register Py_ssize_t j;
4969 Py_ssize_t len = self->length;
4970 Py_ssize_t sublen = substring->length;
4971 PyObject *str;
4972
4973 for (i = len - sublen, j = len; i >= 0; ) {
4974 if (Py_UNICODE_MATCH(self, i, substring)) {
4975 if (maxcount-- <= 0)
4976 break;
4977 SPLIT_APPEND(self->str, i + sublen, j);
4978 j = i;
4979 i -= sublen;
4980 } else
4981 i--;
4982 }
4983 if (j >= 0) {
4984 SPLIT_APPEND(self->str, 0, j);
4985 }
4986 if (PyList_Reverse(list) < 0)
4987 goto onError;
4988 return list;
4989
4990 onError:
4991 Py_DECREF(list);
4992 return NULL;
4993}
4994
4995#undef SPLIT_APPEND
4996
4997static
4998PyObject *split(PyUnicodeObject *self,
4999 PyUnicodeObject *substring,
5000 Py_ssize_t maxcount)
5001{
5002 PyObject *list;
5003
5004 if (maxcount < 0)
5005 maxcount = PY_SSIZE_T_MAX;
5006
5007 list = PyList_New(0);
5008 if (!list)
5009 return NULL;
5010
5011 if (substring == NULL)
5012 return split_whitespace(self,list,maxcount);
5013
5014 else if (substring->length == 1)
5015 return split_char(self,list,substring->str[0],maxcount);
5016
5017 else if (substring->length == 0) {
5018 Py_DECREF(list);
5019 PyErr_SetString(PyExc_ValueError, "empty separator");
5020 return NULL;
5021 }
5022 else
5023 return split_substring(self,list,substring,maxcount);
5024}
5025
5026static
5027PyObject *rsplit(PyUnicodeObject *self,
5028 PyUnicodeObject *substring,
5029 Py_ssize_t maxcount)
5030{
5031 PyObject *list;
5032
5033 if (maxcount < 0)
5034 maxcount = PY_SSIZE_T_MAX;
5035
5036 list = PyList_New(0);
5037 if (!list)
5038 return NULL;
5039
5040 if (substring == NULL)
5041 return rsplit_whitespace(self,list,maxcount);
5042
5043 else if (substring->length == 1)
5044 return rsplit_char(self,list,substring->str[0],maxcount);
5045
5046 else if (substring->length == 0) {
5047 Py_DECREF(list);
5048 PyErr_SetString(PyExc_ValueError, "empty separator");
5049 return NULL;
5050 }
5051 else
5052 return rsplit_substring(self,list,substring,maxcount);
5053}
5054
5055static
5056PyObject *replace(PyUnicodeObject *self,
5057 PyUnicodeObject *str1,
5058 PyUnicodeObject *str2,
5059 Py_ssize_t maxcount)
5060{
5061 PyUnicodeObject *u;
5062
5063 if (maxcount < 0)
5064 maxcount = PY_SSIZE_T_MAX;
5065
5066 if (str1->length == str2->length) {
5067 /* same length */
5068 Py_ssize_t i;
5069 if (str1->length == 1) {
5070 /* replace characters */
5071 Py_UNICODE u1, u2;
5072 if (!findchar(self->str, self->length, str1->str[0]))
5073 goto nothing;
5074 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5075 if (!u)
5076 return NULL;
5077 Py_UNICODE_COPY(u->str, self->str, self->length);
5078 u1 = str1->str[0];
5079 u2 = str2->str[0];
5080 for (i = 0; i < u->length; i++)
5081 if (u->str[i] == u1) {
5082 if (--maxcount < 0)
5083 break;
5084 u->str[i] = u2;
5085 }
5086 } else {
5087 i = fastsearch(
5088 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5089 );
5090 if (i < 0)
5091 goto nothing;
5092 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5093 if (!u)
5094 return NULL;
5095 Py_UNICODE_COPY(u->str, self->str, self->length);
5096 while (i <= self->length - str1->length)
5097 if (Py_UNICODE_MATCH(self, i, str1)) {
5098 if (--maxcount < 0)
5099 break;
5100 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5101 i += str1->length;
5102 } else
5103 i++;
5104 }
5105 } else {
5106
5107 Py_ssize_t n, i, j, e;
5108 Py_ssize_t product, new_size, delta;
5109 Py_UNICODE *p;
5110
5111 /* replace strings */
5112 n = stringlib_count(self->str, self->length, str1->str, str1->length);
5113 if (n > maxcount)
5114 n = maxcount;
5115 if (n == 0)
5116 goto nothing;
5117 /* new_size = self->length + n * (str2->length - str1->length)); */
5118 delta = (str2->length - str1->length);
5119 if (delta == 0) {
5120 new_size = self->length;
5121 } else {
5122 product = n * (str2->length - str1->length);
5123 if ((product / (str2->length - str1->length)) != n) {
5124 PyErr_SetString(PyExc_OverflowError,
5125 "replace string is too long");
5126 return NULL;
5127 }
5128 new_size = self->length + product;
5129 if (new_size < 0) {
5130 PyErr_SetString(PyExc_OverflowError,
5131 "replace string is too long");
5132 return NULL;
5133 }
5134 }
5135 u = _PyUnicode_New(new_size);
5136 if (!u)
5137 return NULL;
5138 i = 0;
5139 p = u->str;
5140 e = self->length - str1->length;
5141 if (str1->length > 0) {
5142 while (n-- > 0) {
5143 /* look for next match */
5144 j = i;
5145 while (j <= e) {
5146 if (Py_UNICODE_MATCH(self, j, str1))
5147 break;
5148 j++;
5149 }
5150 if (j > i) {
5151 if (j > e)
5152 break;
5153 /* copy unchanged part [i:j] */
5154 Py_UNICODE_COPY(p, self->str+i, j-i);
5155 p += j - i;
5156 }
5157 /* copy substitution string */
5158 if (str2->length > 0) {
5159 Py_UNICODE_COPY(p, str2->str, str2->length);
5160 p += str2->length;
5161 }
5162 i = j + str1->length;
5163 }
5164 if (i < self->length)
5165 /* copy tail [i:] */
5166 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5167 } else {
5168 /* interleave */
5169 while (n > 0) {
5170 Py_UNICODE_COPY(p, str2->str, str2->length);
5171 p += str2->length;
5172 if (--n <= 0)
5173 break;
5174 *p++ = self->str[i++];
5175 }
5176 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5177 }
5178 }
5179 return (PyObject *) u;
5180
5181nothing:
5182 /* nothing to replace; return original string (when possible) */
5183 if (PyUnicode_CheckExact(self)) {
5184 Py_INCREF(self);
5185 return (PyObject *) self;
5186 }
5187 return PyUnicode_FromUnicode(self->str, self->length);
5188}
5189
5190/* --- Unicode Object Methods --------------------------------------------- */
5191
5192PyDoc_STRVAR(title__doc__,
5193"S.title() -> unicode\n\
5194\n\
5195Return a titlecased version of S, i.e. words start with title case\n\
5196characters, all remaining cased characters have lower case.");
5197
5198static PyObject*
5199unicode_title(PyUnicodeObject *self)
5200{
5201 return fixup(self, fixtitle);
5202}
5203
5204PyDoc_STRVAR(capitalize__doc__,
5205"S.capitalize() -> unicode\n\
5206\n\
5207Return a capitalized version of S, i.e. make the first character\n\
5208have upper case.");
5209
5210static PyObject*
5211unicode_capitalize(PyUnicodeObject *self)
5212{
5213 return fixup(self, fixcapitalize);
5214}
5215
5216#if 0
5217PyDoc_STRVAR(capwords__doc__,
5218"S.capwords() -> unicode\n\
5219\n\
5220Apply .capitalize() to all words in S and return the result with\n\
5221normalized whitespace (all whitespace strings are replaced by ' ').");
5222
5223static PyObject*
5224unicode_capwords(PyUnicodeObject *self)
5225{
5226 PyObject *list;
5227 PyObject *item;
5228 Py_ssize_t i;
5229
5230 /* Split into words */
5231 list = split(self, NULL, -1);
5232 if (!list)
5233 return NULL;
5234
5235 /* Capitalize each word */
5236 for (i = 0; i < PyList_GET_SIZE(list); i++) {
5237 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
5238 fixcapitalize);
5239 if (item == NULL)
5240 goto onError;
5241 Py_DECREF(PyList_GET_ITEM(list, i));
5242 PyList_SET_ITEM(list, i, item);
5243 }
5244
5245 /* Join the words to form a new string */
5246 item = PyUnicode_Join(NULL, list);
5247
5248onError:
5249 Py_DECREF(list);
5250 return (PyObject *)item;
5251}
5252#endif
5253
5254/* Argument converter. Coerces to a single unicode character */
5255
5256static int
5257convert_uc(PyObject *obj, void *addr)
5258{
5259 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5260 PyObject *uniobj;
5261 Py_UNICODE *unistr;
5262
5263 uniobj = PyUnicode_FromObject(obj);
5264 if (uniobj == NULL) {
5265 PyErr_SetString(PyExc_TypeError,
5266 "The fill character cannot be converted to Unicode");
5267 return 0;
5268 }
5269 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5270 PyErr_SetString(PyExc_TypeError,
5271 "The fill character must be exactly one character long");
5272 Py_DECREF(uniobj);
5273 return 0;
5274 }
5275 unistr = PyUnicode_AS_UNICODE(uniobj);
5276 *fillcharloc = unistr[0];
5277 Py_DECREF(uniobj);
5278 return 1;
5279}
5280
5281PyDoc_STRVAR(center__doc__,
5282"S.center(width[, fillchar]) -> unicode\n\
5283\n\
5284Return S centered in a Unicode string of length width. Padding is\n\
5285done using the specified fill character (default is a space)");
5286
5287static PyObject *
5288unicode_center(PyUnicodeObject *self, PyObject *args)
5289{
5290 Py_ssize_t marg, left;
5291 Py_ssize_t width;
5292 Py_UNICODE fillchar = ' ';
5293
5294 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5295 return NULL;
5296
5297 if (self->length >= width && PyUnicode_CheckExact(self)) {
5298 Py_INCREF(self);
5299 return (PyObject*) self;
5300 }
5301
5302 marg = width - self->length;
5303 left = marg / 2 + (marg & width & 1);
5304
5305 return (PyObject*) pad(self, left, marg - left, fillchar);
5306}
5307
5308#if 0
5309
5310/* This code should go into some future Unicode collation support
5311 module. The basic comparison should compare ordinals on a naive
5312 basis (this is what Java does and thus JPython too). */
5313
5314/* speedy UTF-16 code point order comparison */
5315/* gleaned from: */
5316/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5317
5318static short utf16Fixup[32] =
5319{
5320 0, 0, 0, 0, 0, 0, 0, 0,
5321 0, 0, 0, 0, 0, 0, 0, 0,
5322 0, 0, 0, 0, 0, 0, 0, 0,
5323 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5324};
5325
5326static int
5327unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5328{
5329 Py_ssize_t len1, len2;
5330
5331 Py_UNICODE *s1 = str1->str;
5332 Py_UNICODE *s2 = str2->str;
5333
5334 len1 = str1->length;
5335 len2 = str2->length;
5336
5337 while (len1 > 0 && len2 > 0) {
5338 Py_UNICODE c1, c2;
5339
5340 c1 = *s1++;
5341 c2 = *s2++;
5342
5343 if (c1 > (1<<11) * 26)
5344 c1 += utf16Fixup[c1>>11];
5345 if (c2 > (1<<11) * 26)
5346 c2 += utf16Fixup[c2>>11];
5347 /* now c1 and c2 are in UTF-32-compatible order */
5348
5349 if (c1 != c2)
5350 return (c1 < c2) ? -1 : 1;
5351
5352 len1--; len2--;
5353 }
5354
5355 return (len1 < len2) ? -1 : (len1 != len2);
5356}
5357
5358#else
5359
5360static int
5361unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5362{
5363 register Py_ssize_t len1, len2;
5364
5365 Py_UNICODE *s1 = str1->str;
5366 Py_UNICODE *s2 = str2->str;
5367
5368 len1 = str1->length;
5369 len2 = str2->length;
5370
5371 while (len1 > 0 && len2 > 0) {
5372 Py_UNICODE c1, c2;
5373
5374 c1 = *s1++;
5375 c2 = *s2++;
5376
5377 if (c1 != c2)
5378 return (c1 < c2) ? -1 : 1;
5379
5380 len1--; len2--;
5381 }
5382
5383 return (len1 < len2) ? -1 : (len1 != len2);
5384}
5385
5386#endif
5387
5388int PyUnicode_Compare(PyObject *left,
5389 PyObject *right)
5390{
5391 PyUnicodeObject *u = NULL, *v = NULL;
5392 int result;
5393
5394 /* Coerce the two arguments */
5395 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5396 if (u == NULL)
5397 goto onError;
5398 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5399 if (v == NULL)
5400 goto onError;
5401
5402 /* Shortcut for empty or interned objects */
5403 if (v == u) {
5404 Py_DECREF(u);
5405 Py_DECREF(v);
5406 return 0;
5407 }
5408
5409 result = unicode_compare(u, v);
5410
5411 Py_DECREF(u);
5412 Py_DECREF(v);
5413 return result;
5414
5415onError:
5416 Py_XDECREF(u);
5417 Py_XDECREF(v);
5418 return -1;
5419}
5420
5421PyObject *PyUnicode_RichCompare(PyObject *left,
5422 PyObject *right,
5423 int op)
5424{
5425 int result;
5426
5427 result = PyUnicode_Compare(left, right);
5428 if (result == -1 && PyErr_Occurred())
5429 goto onError;
5430
5431 /* Convert the return value to a Boolean */
5432 switch (op) {
5433 case Py_EQ:
5434 result = (result == 0);
5435 break;
5436 case Py_NE:
5437 result = (result != 0);
5438 break;
5439 case Py_LE:
5440 result = (result <= 0);
5441 break;
5442 case Py_GE:
5443 result = (result >= 0);
5444 break;
5445 case Py_LT:
5446 result = (result == -1);
5447 break;
5448 case Py_GT:
5449 result = (result == 1);
5450 break;
5451 }
5452 return PyBool_FromLong(result);
5453
5454 onError:
5455
5456 /* Standard case
5457
5458 Type errors mean that PyUnicode_FromObject() could not convert
5459 one of the arguments (usually the right hand side) to Unicode,
5460 ie. we can't handle the comparison request. However, it is
5461 possible that the other object knows a comparison method, which
5462 is why we return Py_NotImplemented to give the other object a
5463 chance.
5464
5465 */
5466 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5467 PyErr_Clear();
5468 Py_INCREF(Py_NotImplemented);
5469 return Py_NotImplemented;
5470 }
5471 if (op != Py_EQ && op != Py_NE)
5472 return NULL;
5473
5474 /* Equality comparison.
5475
5476 This is a special case: we silence any PyExc_UnicodeDecodeError
5477 and instead turn it into a PyErr_UnicodeWarning.
5478
5479 */
5480 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5481 return NULL;
5482 PyErr_Clear();
5483 if (PyErr_Warn(PyExc_UnicodeWarning,
5484 (op == Py_EQ) ?
5485 "Unicode equal comparison "
5486 "failed to convert both arguments to Unicode - "
5487 "interpreting them as being unequal" :
5488 "Unicode unequal comparison "
5489 "failed to convert both arguments to Unicode - "
5490 "interpreting them as being unequal"
5491 ) < 0)
5492 return NULL;
5493 result = (op == Py_NE);
5494 return PyBool_FromLong(result);
5495}
5496
5497int PyUnicode_Contains(PyObject *container,
5498 PyObject *element)
5499{
5500 PyObject *str, *sub;
5501 int result;
5502
5503 /* Coerce the two arguments */
5504 sub = PyUnicode_FromObject(element);
5505 if (!sub) {
5506 PyErr_SetString(PyExc_TypeError,
5507 "'in <string>' requires string as left operand");
5508 return -1;
5509 }
5510
5511 str = PyUnicode_FromObject(container);
5512 if (!str) {
5513 Py_DECREF(sub);
5514 return -1;
5515 }
5516
5517 result = stringlib_contains_obj(str, sub);
5518
5519 Py_DECREF(str);
5520 Py_DECREF(sub);
5521
5522 return result;
5523}
5524
5525/* Concat to string or Unicode object giving a new Unicode object. */
5526
5527PyObject *PyUnicode_Concat(PyObject *left,
5528 PyObject *right)
5529{
5530 PyUnicodeObject *u = NULL, *v = NULL, *w;
5531
5532 /* Coerce the two arguments */
5533 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5534 if (u == NULL)
5535 goto onError;
5536 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5537 if (v == NULL)
5538 goto onError;
5539
5540 /* Shortcuts */
5541 if (v == unicode_empty) {
5542 Py_DECREF(v);
5543 return (PyObject *)u;
5544 }
5545 if (u == unicode_empty) {
5546 Py_DECREF(u);
5547 return (PyObject *)v;
5548 }
5549
5550 /* Concat the two Unicode strings */
5551 w = _PyUnicode_New(u->length + v->length);
5552 if (w == NULL)
5553 goto onError;
5554 Py_UNICODE_COPY(w->str, u->str, u->length);
5555 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5556
5557 Py_DECREF(u);
5558 Py_DECREF(v);
5559 return (PyObject *)w;
5560
5561onError:
5562 Py_XDECREF(u);
5563 Py_XDECREF(v);
5564 return NULL;
5565}
5566
5567PyDoc_STRVAR(count__doc__,
5568"S.count(sub[, start[, end]]) -> int\n\
5569\n\
5570Return the number of non-overlapping occurrences of substring sub in\n\
5571Unicode string S[start:end]. Optional arguments start and end are\n\
5572interpreted as in slice notation.");
5573
5574static PyObject *
5575unicode_count(PyUnicodeObject *self, PyObject *args)
5576{
5577 PyUnicodeObject *substring;
5578 Py_ssize_t start = 0;
5579 Py_ssize_t end = PY_SSIZE_T_MAX;
5580 PyObject *result;
5581
5582 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5584 return NULL;
5585
5586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5587 (PyObject *)substring);
5588 if (substring == NULL)
5589 return NULL;
5590
5591 FIX_START_END(self);
5592
5593 result = PyInt_FromSsize_t(
5594 stringlib_count(self->str + start, end - start,
5595 substring->str, substring->length)
5596 );
5597
5598 Py_DECREF(substring);
5599
5600 return result;
5601}
5602
5603PyDoc_STRVAR(encode__doc__,
5604"S.encode([encoding[,errors]]) -> string or unicode\n\
5605\n\
5606Encodes S using the codec registered for encoding. encoding defaults\n\
5607to the default encoding. errors may be given to set a different error\n\
5608handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5609a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5610'xmlcharrefreplace' as well as any other name registered with\n\
5611codecs.register_error that can handle UnicodeEncodeErrors.");
5612
5613static PyObject *
5614unicode_encode(PyUnicodeObject *self, PyObject *args)
5615{
5616 char *encoding = NULL;
5617 char *errors = NULL;
5618 PyObject *v;
5619
5620 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5621 return NULL;
5622 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5623 if (v == NULL)
5624 goto onError;
5625 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5626 PyErr_Format(PyExc_TypeError,
5627 "encoder did not return a string/unicode object "
5628 "(type=%.400s)",
5629 v->ob_type->tp_name);
5630 Py_DECREF(v);
5631 return NULL;
5632 }
5633 return v;
5634
5635 onError:
5636 return NULL;
5637}
5638
5639PyDoc_STRVAR(decode__doc__,
5640"S.decode([encoding[,errors]]) -> string or unicode\n\
5641\n\
5642Decodes S using the codec registered for encoding. encoding defaults\n\
5643to the default encoding. errors may be given to set a different error\n\
5644handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5645a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5646as well as any other name registerd with codecs.register_error that is\n\
5647able to handle UnicodeDecodeErrors.");
5648
5649static PyObject *
5650unicode_decode(PyUnicodeObject *self, PyObject *args)
5651{
5652 char *encoding = NULL;
5653 char *errors = NULL;
5654 PyObject *v;
5655
5656 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5657 return NULL;
5658 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5659 if (v == NULL)
5660 goto onError;
5661 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5662 PyErr_Format(PyExc_TypeError,
5663 "decoder did not return a string/unicode object "
5664 "(type=%.400s)",
5665 v->ob_type->tp_name);
5666 Py_DECREF(v);
5667 return NULL;
5668 }
5669 return v;
5670
5671 onError:
5672 return NULL;
5673}
5674
5675PyDoc_STRVAR(expandtabs__doc__,
5676"S.expandtabs([tabsize]) -> unicode\n\
5677\n\
5678Return a copy of S where all tab characters are expanded using spaces.\n\
5679If tabsize is not given, a tab size of 8 characters is assumed.");
5680
5681static PyObject*
5682unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5683{
5684 Py_UNICODE *e;
5685 Py_UNICODE *p;
5686 Py_UNICODE *q;
5687 Py_ssize_t i, j;
5688 PyUnicodeObject *u;
5689 int tabsize = 8;
5690
5691 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5692 return NULL;
5693
5694 /* First pass: determine size of output string */
5695 i = j = 0;
5696 e = self->str + self->length;
5697 for (p = self->str; p < e; p++)
5698 if (*p == '\t') {
5699 if (tabsize > 0)
5700 j += tabsize - (j % tabsize);
5701 }
5702 else {
5703 j++;
5704 if (*p == '\n' || *p == '\r') {
5705 i += j;
5706 j = 0;
5707 }
5708 }
5709
5710 /* Second pass: create output string and fill it */
5711 u = _PyUnicode_New(i + j);
5712 if (!u)
5713 return NULL;
5714
5715 j = 0;
5716 q = u->str;
5717
5718 for (p = self->str; p < e; p++)
5719 if (*p == '\t') {
5720 if (tabsize > 0) {
5721 i = tabsize - (j % tabsize);
5722 j += i;
5723 while (i--)
5724 *q++ = ' ';
5725 }
5726 }
5727 else {
5728 j++;
5729 *q++ = *p;
5730 if (*p == '\n' || *p == '\r')
5731 j = 0;
5732 }
5733
5734 return (PyObject*) u;
5735}
5736
5737PyDoc_STRVAR(find__doc__,
5738"S.find(sub [,start [,end]]) -> int\n\
5739\n\
5740Return the lowest index in S where substring sub is found,\n\
5741such that sub is contained within s[start,end]. Optional\n\
5742arguments start and end are interpreted as in slice notation.\n\
5743\n\
5744Return -1 on failure.");
5745
5746static PyObject *
5747unicode_find(PyUnicodeObject *self, PyObject *args)
5748{
5749 PyObject *substring;
5750 Py_ssize_t start = 0;
5751 Py_ssize_t end = PY_SSIZE_T_MAX;
5752 Py_ssize_t result;
5753
5754 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5755 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5756 return NULL;
5757 substring = PyUnicode_FromObject(substring);
5758 if (!substring)
5759 return NULL;
5760
5761 result = stringlib_find_slice(
5762 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5763 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5764 start, end
5765 );
5766
5767 Py_DECREF(substring);
5768
5769 return PyInt_FromSsize_t(result);
5770}
5771
5772static PyObject *
5773unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5774{
5775 if (index < 0 || index >= self->length) {
5776 PyErr_SetString(PyExc_IndexError, "string index out of range");
5777 return NULL;
5778 }
5779
5780 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5781}
5782
5783static long
5784unicode_hash(PyUnicodeObject *self)
5785{
5786 /* Since Unicode objects compare equal to their ASCII string
5787 counterparts, they should use the individual character values
5788 as basis for their hash value. This is needed to assure that
5789 strings and Unicode objects behave in the same way as
5790 dictionary keys. */
5791
5792 register Py_ssize_t len;
5793 register Py_UNICODE *p;
5794 register long x;
5795
5796 if (self->hash != -1)
5797 return self->hash;
5798 len = PyUnicode_GET_SIZE(self);
5799 p = PyUnicode_AS_UNICODE(self);
5800 x = *p << 7;
5801 while (--len >= 0)
5802 x = (1000003*x) ^ *p++;
5803 x ^= PyUnicode_GET_SIZE(self);
5804 if (x == -1)
5805 x = -2;
5806 self->hash = x;
5807 return x;
5808}
5809
5810PyDoc_STRVAR(index__doc__,
5811"S.index(sub [,start [,end]]) -> int\n\
5812\n\
5813Like S.find() but raise ValueError when the substring is not found.");
5814
5815static PyObject *
5816unicode_index(PyUnicodeObject *self, PyObject *args)
5817{
5818 Py_ssize_t result;
5819 PyObject *substring;
5820 Py_ssize_t start = 0;
5821 Py_ssize_t end = PY_SSIZE_T_MAX;
5822
5823 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5824 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5825 return NULL;
5826 substring = PyUnicode_FromObject(substring);
5827 if (!substring)
5828 return NULL;
5829
5830 result = stringlib_find_slice(
5831 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5832 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5833 start, end
5834 );
5835
5836 Py_DECREF(substring);
5837
5838 if (result < 0) {
5839 PyErr_SetString(PyExc_ValueError, "substring not found");
5840 return NULL;
5841 }
5842
5843 return PyInt_FromSsize_t(result);
5844}
5845
5846PyDoc_STRVAR(islower__doc__,
5847"S.islower() -> bool\n\
5848\n\
5849Return True if all cased characters in S are lowercase and there is\n\
5850at least one cased character in S, False otherwise.");
5851
5852static PyObject*
5853unicode_islower(PyUnicodeObject *self)
5854{
5855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5856 register const Py_UNICODE *e;
5857 int cased;
5858
5859 /* Shortcut for single character strings */
5860 if (PyUnicode_GET_SIZE(self) == 1)
5861 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5862
5863 /* Special case for empty strings */
5864 if (PyUnicode_GET_SIZE(self) == 0)
5865 return PyBool_FromLong(0);
5866
5867 e = p + PyUnicode_GET_SIZE(self);
5868 cased = 0;
5869 for (; p < e; p++) {
5870 register const Py_UNICODE ch = *p;
5871
5872 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5873 return PyBool_FromLong(0);
5874 else if (!cased && Py_UNICODE_ISLOWER(ch))
5875 cased = 1;
5876 }
5877 return PyBool_FromLong(cased);
5878}
5879
5880PyDoc_STRVAR(isupper__doc__,
5881"S.isupper() -> bool\n\
5882\n\
5883Return True if all cased characters in S are uppercase and there is\n\
5884at least one cased character in S, False otherwise.");
5885
5886static PyObject*
5887unicode_isupper(PyUnicodeObject *self)
5888{
5889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5890 register const Py_UNICODE *e;
5891 int cased;
5892
5893 /* Shortcut for single character strings */
5894 if (PyUnicode_GET_SIZE(self) == 1)
5895 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5896
5897 /* Special case for empty strings */
5898 if (PyUnicode_GET_SIZE(self) == 0)
5899 return PyBool_FromLong(0);
5900
5901 e = p + PyUnicode_GET_SIZE(self);
5902 cased = 0;
5903 for (; p < e; p++) {
5904 register const Py_UNICODE ch = *p;
5905
5906 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5907 return PyBool_FromLong(0);
5908 else if (!cased && Py_UNICODE_ISUPPER(ch))
5909 cased = 1;
5910 }
5911 return PyBool_FromLong(cased);
5912}
5913
5914PyDoc_STRVAR(istitle__doc__,
5915"S.istitle() -> bool\n\
5916\n\
5917Return True if S is a titlecased string and there is at least one\n\
5918character in S, i.e. upper- and titlecase characters may only\n\
5919follow uncased characters and lowercase characters only cased ones.\n\
5920Return False otherwise.");
5921
5922static PyObject*
5923unicode_istitle(PyUnicodeObject *self)
5924{
5925 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5926 register const Py_UNICODE *e;
5927 int cased, previous_is_cased;
5928
5929 /* Shortcut for single character strings */
5930 if (PyUnicode_GET_SIZE(self) == 1)
5931 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5932 (Py_UNICODE_ISUPPER(*p) != 0));
5933
5934 /* Special case for empty strings */
5935 if (PyUnicode_GET_SIZE(self) == 0)
5936 return PyBool_FromLong(0);
5937
5938 e = p + PyUnicode_GET_SIZE(self);
5939 cased = 0;
5940 previous_is_cased = 0;
5941 for (; p < e; p++) {
5942 register const Py_UNICODE ch = *p;
5943
5944 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5945 if (previous_is_cased)
5946 return PyBool_FromLong(0);
5947 previous_is_cased = 1;
5948 cased = 1;
5949 }
5950 else if (Py_UNICODE_ISLOWER(ch)) {
5951 if (!previous_is_cased)
5952 return PyBool_FromLong(0);
5953 previous_is_cased = 1;
5954 cased = 1;
5955 }
5956 else
5957 previous_is_cased = 0;
5958 }
5959 return PyBool_FromLong(cased);
5960}
5961
5962PyDoc_STRVAR(isspace__doc__,
5963"S.isspace() -> bool\n\
5964\n\
5965Return True if all characters in S are whitespace\n\
5966and there is at least one character in S, False otherwise.");
5967
5968static PyObject*
5969unicode_isspace(PyUnicodeObject *self)
5970{
5971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5972 register const Py_UNICODE *e;
5973
5974 /* Shortcut for single character strings */
5975 if (PyUnicode_GET_SIZE(self) == 1 &&
5976 Py_UNICODE_ISSPACE(*p))
5977 return PyBool_FromLong(1);
5978
5979 /* Special case for empty strings */
5980 if (PyUnicode_GET_SIZE(self) == 0)
5981 return PyBool_FromLong(0);
5982
5983 e = p + PyUnicode_GET_SIZE(self);
5984 for (; p < e; p++) {
5985 if (!Py_UNICODE_ISSPACE(*p))
5986 return PyBool_FromLong(0);
5987 }
5988 return PyBool_FromLong(1);
5989}
5990
5991PyDoc_STRVAR(isalpha__doc__,
5992"S.isalpha() -> bool\n\
5993\n\
5994Return True if all characters in S are alphabetic\n\
5995and there is at least one character in S, False otherwise.");
5996
5997static PyObject*
5998unicode_isalpha(PyUnicodeObject *self)
5999{
6000 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6001 register const Py_UNICODE *e;
6002
6003 /* Shortcut for single character strings */
6004 if (PyUnicode_GET_SIZE(self) == 1 &&
6005 Py_UNICODE_ISALPHA(*p))
6006 return PyBool_FromLong(1);
6007
6008 /* Special case for empty strings */
6009 if (PyUnicode_GET_SIZE(self) == 0)
6010 return PyBool_FromLong(0);
6011
6012 e = p + PyUnicode_GET_SIZE(self);
6013 for (; p < e; p++) {
6014 if (!Py_UNICODE_ISALPHA(*p))
6015 return PyBool_FromLong(0);
6016 }
6017 return PyBool_FromLong(1);
6018}
6019
6020PyDoc_STRVAR(isalnum__doc__,
6021"S.isalnum() -> bool\n\
6022\n\
6023Return True if all characters in S are alphanumeric\n\
6024and there is at least one character in S, False otherwise.");
6025
6026static PyObject*
6027unicode_isalnum(PyUnicodeObject *self)
6028{
6029 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6030 register const Py_UNICODE *e;
6031
6032 /* Shortcut for single character strings */
6033 if (PyUnicode_GET_SIZE(self) == 1 &&
6034 Py_UNICODE_ISALNUM(*p))
6035 return PyBool_FromLong(1);
6036
6037 /* Special case for empty strings */
6038 if (PyUnicode_GET_SIZE(self) == 0)
6039 return PyBool_FromLong(0);
6040
6041 e = p + PyUnicode_GET_SIZE(self);
6042 for (; p < e; p++) {
6043 if (!Py_UNICODE_ISALNUM(*p))
6044 return PyBool_FromLong(0);
6045 }
6046 return PyBool_FromLong(1);
6047}
6048
6049PyDoc_STRVAR(isdecimal__doc__,
6050"S.isdecimal() -> bool\n\
6051\n\
6052Return True if there are only decimal characters in S,\n\
6053False otherwise.");
6054
6055static PyObject*
6056unicode_isdecimal(PyUnicodeObject *self)
6057{
6058 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6059 register const Py_UNICODE *e;
6060
6061 /* Shortcut for single character strings */
6062 if (PyUnicode_GET_SIZE(self) == 1 &&
6063 Py_UNICODE_ISDECIMAL(*p))
6064 return PyBool_FromLong(1);
6065
6066 /* Special case for empty strings */
6067 if (PyUnicode_GET_SIZE(self) == 0)
6068 return PyBool_FromLong(0);
6069
6070 e = p + PyUnicode_GET_SIZE(self);
6071 for (; p < e; p++) {
6072 if (!Py_UNICODE_ISDECIMAL(*p))
6073 return PyBool_FromLong(0);
6074 }
6075 return PyBool_FromLong(1);
6076}
6077
6078PyDoc_STRVAR(isdigit__doc__,
6079"S.isdigit() -> bool\n\
6080\n\
6081Return True if all characters in S are digits\n\
6082and there is at least one character in S, False otherwise.");
6083
6084static PyObject*
6085unicode_isdigit(PyUnicodeObject *self)
6086{
6087 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6088 register const Py_UNICODE *e;
6089
6090 /* Shortcut for single character strings */
6091 if (PyUnicode_GET_SIZE(self) == 1 &&
6092 Py_UNICODE_ISDIGIT(*p))
6093 return PyBool_FromLong(1);
6094
6095 /* Special case for empty strings */
6096 if (PyUnicode_GET_SIZE(self) == 0)
6097 return PyBool_FromLong(0);
6098
6099 e = p + PyUnicode_GET_SIZE(self);
6100 for (; p < e; p++) {
6101 if (!Py_UNICODE_ISDIGIT(*p))
6102 return PyBool_FromLong(0);
6103 }
6104 return PyBool_FromLong(1);
6105}
6106
6107PyDoc_STRVAR(isnumeric__doc__,
6108"S.isnumeric() -> bool\n\
6109\n\
6110Return True if there are only numeric characters in S,\n\
6111False otherwise.");
6112
6113static PyObject*
6114unicode_isnumeric(PyUnicodeObject *self)
6115{
6116 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6117 register const Py_UNICODE *e;
6118
6119 /* Shortcut for single character strings */
6120 if (PyUnicode_GET_SIZE(self) == 1 &&
6121 Py_UNICODE_ISNUMERIC(*p))
6122 return PyBool_FromLong(1);
6123
6124 /* Special case for empty strings */
6125 if (PyUnicode_GET_SIZE(self) == 0)
6126 return PyBool_FromLong(0);
6127
6128 e = p + PyUnicode_GET_SIZE(self);
6129 for (; p < e; p++) {
6130 if (!Py_UNICODE_ISNUMERIC(*p))
6131 return PyBool_FromLong(0);
6132 }
6133 return PyBool_FromLong(1);
6134}
6135
6136PyDoc_STRVAR(join__doc__,
6137"S.join(sequence) -> unicode\n\
6138\n\
6139Return a string which is the concatenation of the strings in the\n\
6140sequence. The separator between elements is S.");
6141
6142static PyObject*
6143unicode_join(PyObject *self, PyObject *data)
6144{
6145 return PyUnicode_Join(self, data);
6146}
6147
6148static Py_ssize_t
6149unicode_length(PyUnicodeObject *self)
6150{
6151 return self->length;
6152}
6153
6154PyDoc_STRVAR(ljust__doc__,
6155"S.ljust(width[, fillchar]) -> int\n\
6156\n\
6157Return S left justified in a Unicode string of length width. Padding is\n\
6158done using the specified fill character (default is a space).");
6159
6160static PyObject *
6161unicode_ljust(PyUnicodeObject *self, PyObject *args)
6162{
6163 Py_ssize_t width;
6164 Py_UNICODE fillchar = ' ';
6165
6166 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6167 return NULL;
6168
6169 if (self->length >= width && PyUnicode_CheckExact(self)) {
6170 Py_INCREF(self);
6171 return (PyObject*) self;
6172 }
6173
6174 return (PyObject*) pad(self, 0, width - self->length, fillchar);
6175}
6176
6177PyDoc_STRVAR(lower__doc__,
6178"S.lower() -> unicode\n\
6179\n\
6180Return a copy of the string S converted to lowercase.");
6181
6182static PyObject*
6183unicode_lower(PyUnicodeObject *self)
6184{
6185 return fixup(self, fixlower);
6186}
6187
/* Strip modes.  The numeric values double as indexes into stripformat. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Skip the three-character "|O:" prefix to obtain the bare method name. */
#define STRIPNAME(i) (stripformat[i]+3)
6196
6197/* externally visible for str.strip(unicode) */
6198PyObject *
6199_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6200{
6201 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6202 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6203 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6204 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6205 Py_ssize_t i, j;
6206
6207 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6208
6209 i = 0;
6210 if (striptype != RIGHTSTRIP) {
6211 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6212 i++;
6213 }
6214 }
6215
6216 j = len;
6217 if (striptype != LEFTSTRIP) {
6218 do {
6219 j--;
6220 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6221 j++;
6222 }
6223
6224 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6225 Py_INCREF(self);
6226 return (PyObject*)self;
6227 }
6228 else
6229 return PyUnicode_FromUnicode(s+i, j-i);
6230}
6231
6232
6233static PyObject *
6234do_strip(PyUnicodeObject *self, int striptype)
6235{
6236 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6237 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6238
6239 i = 0;
6240 if (striptype != RIGHTSTRIP) {
6241 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6242 i++;
6243 }
6244 }
6245
6246 j = len;
6247 if (striptype != LEFTSTRIP) {
6248 do {
6249 j--;
6250 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6251 j++;
6252 }
6253
6254 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6255 Py_INCREF(self);
6256 return (PyObject*)self;
6257 }
6258 else
6259 return PyUnicode_FromUnicode(s+i, j-i);
6260}
6261
6262
6263static PyObject *
6264do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6265{
6266 PyObject *sep = NULL;
6267
6268 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6269 return NULL;
6270
6271 if (sep != NULL && sep != Py_None) {
6272 if (PyUnicode_Check(sep))
6273 return _PyUnicode_XStrip(self, striptype, sep);
6274 else if (PyString_Check(sep)) {
6275 PyObject *res;
6276 sep = PyUnicode_FromObject(sep);
6277 if (sep==NULL)
6278 return NULL;
6279 res = _PyUnicode_XStrip(self, striptype, sep);
6280 Py_DECREF(sep);
6281 return res;
6282 }
6283 else {
6284 PyErr_Format(PyExc_TypeError,
6285 "%s arg must be None, unicode or str",
6286 STRIPNAME(striptype));
6287 return NULL;
6288 }
6289 }
6290
6291 return do_strip(self, striptype);
6292}
6293
6294
6295PyDoc_STRVAR(strip__doc__,
6296"S.strip([chars]) -> unicode\n\
6297\n\
6298Return a copy of the string S with leading and trailing\n\
6299whitespace removed.\n\
6300If chars is given and not None, remove characters in chars instead.\n\
6301If chars is a str, it will be converted to unicode before stripping");
6302
6303static PyObject *
6304unicode_strip(PyUnicodeObject *self, PyObject *args)
6305{
6306 if (PyTuple_GET_SIZE(args) == 0)
6307 return do_strip(self, BOTHSTRIP); /* Common case */
6308 else
6309 return do_argstrip(self, BOTHSTRIP, args);
6310}
6311
6312
6313PyDoc_STRVAR(lstrip__doc__,
6314"S.lstrip([chars]) -> unicode\n\
6315\n\
6316Return a copy of the string S with leading whitespace removed.\n\
6317If chars is given and not None, remove characters in chars instead.\n\
6318If chars is a str, it will be converted to unicode before stripping");
6319
6320static PyObject *
6321unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6322{
6323 if (PyTuple_GET_SIZE(args) == 0)
6324 return do_strip(self, LEFTSTRIP); /* Common case */
6325 else
6326 return do_argstrip(self, LEFTSTRIP, args);
6327}
6328
6329
6330PyDoc_STRVAR(rstrip__doc__,
6331"S.rstrip([chars]) -> unicode\n\
6332\n\
6333Return a copy of the string S with trailing whitespace removed.\n\
6334If chars is given and not None, remove characters in chars instead.\n\
6335If chars is a str, it will be converted to unicode before stripping");
6336
6337static PyObject *
6338unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6339{
6340 if (PyTuple_GET_SIZE(args) == 0)
6341 return do_strip(self, RIGHTSTRIP); /* Common case */
6342 else
6343 return do_argstrip(self, RIGHTSTRIP, args);
6344}
6345
6346
6347static PyObject*
6348unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6349{
6350 PyUnicodeObject *u;
6351 Py_UNICODE *p;
6352 Py_ssize_t nchars;
6353 size_t nbytes;
6354
6355 if (len < 0)
6356 len = 0;
6357
6358 if (len == 1 && PyUnicode_CheckExact(str)) {
6359 /* no repeat, return original string */
6360 Py_INCREF(str);
6361 return (PyObject*) str;
6362 }
6363
6364 /* ensure # of chars needed doesn't overflow int and # of bytes
6365 * needed doesn't overflow size_t
6366 */
6367 nchars = len * str->length;
6368 if (len && nchars / len != str->length) {
6369 PyErr_SetString(PyExc_OverflowError,
6370 "repeated string is too long");
6371 return NULL;
6372 }
6373 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6374 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6375 PyErr_SetString(PyExc_OverflowError,
6376 "repeated string is too long");
6377 return NULL;
6378 }
6379 u = _PyUnicode_New(nchars);
6380 if (!u)
6381 return NULL;
6382
6383 p = u->str;
6384
6385 if (str->length == 1 && len > 0) {
6386 Py_UNICODE_FILL(p, str->str[0], len);
6387 } else {
6388 Py_ssize_t done = 0; /* number of characters copied this far */
6389 if (done < nchars) {
6390 Py_UNICODE_COPY(p, str->str, str->length);
6391 done = str->length;
6392 }
6393 while (done < nchars) {
6394 int n = (done <= nchars-done) ? done : nchars-done;
6395 Py_UNICODE_COPY(p+done, p, n);
6396 done += n;
6397 }
6398 }
6399
6400 return (PyObject*) u;
6401}
6402
6403PyObject *PyUnicode_Replace(PyObject *obj,
6404 PyObject *subobj,
6405 PyObject *replobj,
6406 Py_ssize_t maxcount)
6407{
6408 PyObject *self;
6409 PyObject *str1;
6410 PyObject *str2;
6411 PyObject *result;
6412
6413 self = PyUnicode_FromObject(obj);
6414 if (self == NULL)
6415 return NULL;
6416 str1 = PyUnicode_FromObject(subobj);
6417 if (str1 == NULL) {
6418 Py_DECREF(self);
6419 return NULL;
6420 }
6421 str2 = PyUnicode_FromObject(replobj);
6422 if (str2 == NULL) {
6423 Py_DECREF(self);
6424 Py_DECREF(str1);
6425 return NULL;
6426 }
6427 result = replace((PyUnicodeObject *)self,
6428 (PyUnicodeObject *)str1,
6429 (PyUnicodeObject *)str2,
6430 maxcount);
6431 Py_DECREF(self);
6432 Py_DECREF(str1);
6433 Py_DECREF(str2);
6434 return result;
6435}
6436
6437PyDoc_STRVAR(replace__doc__,
6438"S.replace (old, new[, maxsplit]) -> unicode\n\
6439\n\
6440Return a copy of S with all occurrences of substring\n\
6441old replaced by new. If the optional argument maxsplit is\n\
6442given, only the first maxsplit occurrences are replaced.");
6443
6444static PyObject*
6445unicode_replace(PyUnicodeObject *self, PyObject *args)
6446{
6447 PyUnicodeObject *str1;
6448 PyUnicodeObject *str2;
6449 Py_ssize_t maxcount = -1;
6450 PyObject *result;
6451
6452 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6453 return NULL;
6454 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6455 if (str1 == NULL)
6456 return NULL;
6457 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6458 if (str2 == NULL) {
6459 Py_DECREF(str1);
6460 return NULL;
6461 }
6462
6463 result = replace(self, str1, str2, maxcount);
6464
6465 Py_DECREF(str1);
6466 Py_DECREF(str2);
6467 return result;
6468}
6469
6470static
6471PyObject *unicode_repr(PyObject *unicode)
6472{
6473 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6474 PyUnicode_GET_SIZE(unicode),
6475 1);
6476}
6477
6478PyDoc_STRVAR(rfind__doc__,
6479"S.rfind(sub [,start [,end]]) -> int\n\
6480\n\
6481Return the highest index in S where substring sub is found,\n\
6482such that sub is contained within s[start,end]. Optional\n\
6483arguments start and end are interpreted as in slice notation.\n\
6484\n\
6485Return -1 on failure.");
6486
6487static PyObject *
6488unicode_rfind(PyUnicodeObject *self, PyObject *args)
6489{
6490 PyObject *substring;
6491 Py_ssize_t start = 0;
6492 Py_ssize_t end = PY_SSIZE_T_MAX;
6493 Py_ssize_t result;
6494
6495 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6496 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6497 return NULL;
6498 substring = PyUnicode_FromObject(substring);
6499 if (!substring)
6500 return NULL;
6501
6502 result = stringlib_rfind_slice(
6503 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6504 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6505 start, end
6506 );
6507
6508 Py_DECREF(substring);
6509
6510 return PyInt_FromSsize_t(result);
6511}
6512
6513PyDoc_STRVAR(rindex__doc__,
6514"S.rindex(sub [,start [,end]]) -> int\n\
6515\n\
6516Like S.rfind() but raise ValueError when the substring is not found.");
6517
6518static PyObject *
6519unicode_rindex(PyUnicodeObject *self, PyObject *args)
6520{
6521 PyObject *substring;
6522 Py_ssize_t start = 0;
6523 Py_ssize_t end = PY_SSIZE_T_MAX;
6524 Py_ssize_t result;
6525
6526 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6527 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6528 return NULL;
6529 substring = PyUnicode_FromObject(substring);
6530 if (!substring)
6531 return NULL;
6532
6533 result = stringlib_rfind_slice(
6534 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6535 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6536 start, end
6537 );
6538
6539 Py_DECREF(substring);
6540
6541 if (result < 0) {
6542 PyErr_SetString(PyExc_ValueError, "substring not found");
6543 return NULL;
6544 }
6545 return PyInt_FromSsize_t(result);
6546}
6547
6548PyDoc_STRVAR(rjust__doc__,
6549"S.rjust(width[, fillchar]) -> unicode\n\
6550\n\
6551Return S right justified in a Unicode string of length width. Padding is\n\
6552done using the specified fill character (default is a space).");
6553
6554static PyObject *
6555unicode_rjust(PyUnicodeObject *self, PyObject *args)
6556{
6557 Py_ssize_t width;
6558 Py_UNICODE fillchar = ' ';
6559
6560 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6561 return NULL;
6562
6563 if (self->length >= width && PyUnicode_CheckExact(self)) {
6564 Py_INCREF(self);
6565 return (PyObject*) self;
6566 }
6567
6568 return (PyObject*) pad(self, width - self->length, 0, fillchar);
6569}
6570
6571static PyObject*
6572unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6573{
6574 /* standard clamping */
6575 if (start < 0)
6576 start = 0;
6577 if (end < 0)
6578 end = 0;
6579 if (end > self->length)
6580 end = self->length;
6581 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6582 /* full slice, return original string */
6583 Py_INCREF(self);
6584 return (PyObject*) self;
6585 }
6586 if (start > end)
6587 start = end;
6588 /* copy slice */
6589 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6590 end - start);
6591}
6592
6593PyObject *PyUnicode_Split(PyObject *s,
6594 PyObject *sep,
6595 Py_ssize_t maxsplit)
6596{
6597 PyObject *result;
6598
6599 s = PyUnicode_FromObject(s);
6600 if (s == NULL)
6601 return NULL;
6602 if (sep != NULL) {
6603 sep = PyUnicode_FromObject(sep);
6604 if (sep == NULL) {
6605 Py_DECREF(s);
6606 return NULL;
6607 }
6608 }
6609
6610 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6611
6612 Py_DECREF(s);
6613 Py_XDECREF(sep);
6614 return result;
6615}
6616
6617PyDoc_STRVAR(split__doc__,
6618"S.split([sep [,maxsplit]]) -> list of strings\n\
6619\n\
6620Return a list of the words in S, using sep as the\n\
6621delimiter string. If maxsplit is given, at most maxsplit\n\
6622splits are done. If sep is not specified or is None,\n\
6623any whitespace string is a separator.");
6624
6625static PyObject*
6626unicode_split(PyUnicodeObject *self, PyObject *args)
6627{
6628 PyObject *substring = Py_None;
6629 Py_ssize_t maxcount = -1;
6630
6631 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6632 return NULL;
6633
6634 if (substring == Py_None)
6635 return split(self, NULL, maxcount);
6636 else if (PyUnicode_Check(substring))
6637 return split(self, (PyUnicodeObject *)substring, maxcount);
6638 else
6639 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6640}
6641
6642PyObject *
6643PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6644{
6645 PyObject* str_obj;
6646 PyObject* sep_obj;
6647 PyObject* out;
6648
6649 str_obj = PyUnicode_FromObject(str_in);
6650 if (!str_obj)
6651 return NULL;
6652 sep_obj = PyUnicode_FromObject(sep_in);
6653 if (!sep_obj) {
6654 Py_DECREF(str_obj);
6655 return NULL;
6656 }
6657
6658 out = stringlib_partition(
6659 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6660 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6661 );
6662
6663 Py_DECREF(sep_obj);
6664 Py_DECREF(str_obj);
6665
6666 return out;
6667}
6668
6669
6670PyObject *
6671PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6672{
6673 PyObject* str_obj;
6674 PyObject* sep_obj;
6675 PyObject* out;
6676
6677 str_obj = PyUnicode_FromObject(str_in);
6678 if (!str_obj)
6679 return NULL;
6680 sep_obj = PyUnicode_FromObject(sep_in);
6681 if (!sep_obj) {
6682 Py_DECREF(str_obj);
6683 return NULL;
6684 }
6685
6686 out = stringlib_rpartition(
6687 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6688 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6689 );
6690
6691 Py_DECREF(sep_obj);
6692 Py_DECREF(str_obj);
6693
6694 return out;
6695}
6696
6697PyDoc_STRVAR(partition__doc__,
6698"S.partition(sep) -> (head, sep, tail)\n\
6699\n\
6700Searches for the separator sep in S, and returns the part before it,\n\
6701the separator itself, and the part after it. If the separator is not\n\
6702found, returns S and two empty strings.");
6703
6704static PyObject*
6705unicode_partition(PyUnicodeObject *self, PyObject *separator)
6706{
6707 return PyUnicode_Partition((PyObject *)self, separator);
6708}
6709
6710PyDoc_STRVAR(rpartition__doc__,
6711"S.rpartition(sep) -> (tail, sep, head)\n\
6712\n\
6713Searches for the separator sep in S, starting at the end of S, and returns\n\
6714the part before it, the separator itself, and the part after it. If the\n\
6715separator is not found, returns two empty strings and S.");
6716
6717static PyObject*
6718unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6719{
6720 return PyUnicode_RPartition((PyObject *)self, separator);
6721}
6722
6723PyObject *PyUnicode_RSplit(PyObject *s,
6724 PyObject *sep,
6725 Py_ssize_t maxsplit)
6726{
6727 PyObject *result;
6728
6729 s = PyUnicode_FromObject(s);
6730 if (s == NULL)
6731 return NULL;
6732 if (sep != NULL) {
6733 sep = PyUnicode_FromObject(sep);
6734 if (sep == NULL) {
6735 Py_DECREF(s);
6736 return NULL;
6737 }
6738 }
6739
6740 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6741
6742 Py_DECREF(s);
6743 Py_XDECREF(sep);
6744 return result;
6745}
6746
6747PyDoc_STRVAR(rsplit__doc__,
6748"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6749\n\
6750Return a list of the words in S, using sep as the\n\
6751delimiter string, starting at the end of the string and\n\
6752working to the front. If maxsplit is given, at most maxsplit\n\
6753splits are done. If sep is not specified, any whitespace string\n\
6754is a separator.");
6755
6756static PyObject*
6757unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6758{
6759 PyObject *substring = Py_None;
6760 Py_ssize_t maxcount = -1;
6761
6762 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
6763 return NULL;
6764
6765 if (substring == Py_None)
6766 return rsplit(self, NULL, maxcount);
6767 else if (PyUnicode_Check(substring))
6768 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6769 else
6770 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6771}
6772
6773PyDoc_STRVAR(splitlines__doc__,
6774"S.splitlines([keepends]]) -> list of strings\n\
6775\n\
6776Return a list of the lines in S, breaking at line boundaries.\n\
6777Line breaks are not included in the resulting list unless keepends\n\
6778is given and true.");
6779
6780static PyObject*
6781unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6782{
6783 int keepends = 0;
6784
6785 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6786 return NULL;
6787
6788 return PyUnicode_Splitlines((PyObject *)self, keepends);
6789}
6790
6791static
6792PyObject *unicode_str(PyUnicodeObject *self)
6793{
6794 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6795}
6796
6797PyDoc_STRVAR(swapcase__doc__,
6798"S.swapcase() -> unicode\n\
6799\n\
6800Return a copy of S with uppercase characters converted to lowercase\n\
6801and vice versa.");
6802
6803static PyObject*
6804unicode_swapcase(PyUnicodeObject *self)
6805{
6806 return fixup(self, fixswapcase);
6807}
6808
6809PyDoc_STRVAR(translate__doc__,
6810"S.translate(table) -> unicode\n\
6811\n\
6812Return a copy of the string S, where all characters have been mapped\n\
6813through the given translation table, which must be a mapping of\n\
6814Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6815Unmapped characters are left untouched. Characters mapped to None\n\
6816are deleted.");
6817
6818static PyObject*
6819unicode_translate(PyUnicodeObject *self, PyObject *table)
6820{
6821 return PyUnicode_TranslateCharmap(self->str,
6822 self->length,
6823 table,
6824 "ignore");
6825}
6826
6827PyDoc_STRVAR(upper__doc__,
6828"S.upper() -> unicode\n\
6829\n\
6830Return a copy of S converted to uppercase.");
6831
6832static PyObject*
6833unicode_upper(PyUnicodeObject *self)
6834{
6835 return fixup(self, fixupper);
6836}
6837
6838PyDoc_STRVAR(zfill__doc__,
6839"S.zfill(width) -> unicode\n\
6840\n\
6841Pad a numeric string x with zeros on the left, to fill a field\n\
6842of the specified width. The string x is never truncated.");
6843
6844static PyObject *
6845unicode_zfill(PyUnicodeObject *self, PyObject *args)
6846{
6847 Py_ssize_t fill;
6848 PyUnicodeObject *u;
6849
6850 Py_ssize_t width;
6851 if (!PyArg_ParseTuple(args, "n:zfill", &width))
6852 return NULL;
6853
6854 if (self->length >= width) {
6855 if (PyUnicode_CheckExact(self)) {
6856 Py_INCREF(self);
6857 return (PyObject*) self;
6858 }
6859 else
6860 return PyUnicode_FromUnicode(
6861 PyUnicode_AS_UNICODE(self),
6862 PyUnicode_GET_SIZE(self)
6863 );
6864 }
6865
6866 fill = width - self->length;
6867
6868 u = pad(self, fill, 0, '0');
6869
6870 if (u == NULL)
6871 return NULL;
6872
6873 if (u->str[fill] == '+' || u->str[fill] == '-') {
6874 /* move sign to beginning of string */
6875 u->str[0] = u->str[fill];
6876 u->str[fill] = '0';
6877 }
6878
6879 return (PyObject*) u;
6880}
6881
#if 0
/* Compiled out: reports unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6889
6890PyDoc_STRVAR(startswith__doc__,
6891"S.startswith(prefix[, start[, end]]) -> bool\n\
6892\n\
6893Return True if S starts with the specified prefix, False otherwise.\n\
6894With optional start, test S beginning at that position.\n\
6895With optional end, stop comparing S at that position.\n\
6896prefix can also be a tuple of strings to try.");
6897
6898static PyObject *
6899unicode_startswith(PyUnicodeObject *self,
6900 PyObject *args)
6901{
6902 PyObject *subobj;
6903 PyUnicodeObject *substring;
6904 Py_ssize_t start = 0;
6905 Py_ssize_t end = PY_SSIZE_T_MAX;
6906 int result;
6907
6908 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
6909 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6910 return NULL;
6911 if (PyTuple_Check(subobj)) {
6912 Py_ssize_t i;
6913 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6914 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6915 PyTuple_GET_ITEM(subobj, i));
6916 if (substring == NULL)
6917 return NULL;
6918 result = tailmatch(self, substring, start, end, -1);
6919 Py_DECREF(substring);
6920 if (result) {
6921 Py_RETURN_TRUE;
6922 }
6923 }
6924 /* nothing matched */
6925 Py_RETURN_FALSE;
6926 }
6927 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6928 if (substring == NULL)
6929 return NULL;
6930 result = tailmatch(self, substring, start, end, -1);
6931 Py_DECREF(substring);
6932 return PyBool_FromLong(result);
6933}
6934
6935
6936PyDoc_STRVAR(endswith__doc__,
6937"S.endswith(suffix[, start[, end]]) -> bool\n\
6938\n\
6939Return True if S ends with the specified suffix, False otherwise.\n\
6940With optional start, test S beginning at that position.\n\
6941With optional end, stop comparing S at that position.\n\
6942suffix can also be a tuple of strings to try.");
6943
6944static PyObject *
6945unicode_endswith(PyUnicodeObject *self,
6946 PyObject *args)
6947{
6948 PyObject *subobj;
6949 PyUnicodeObject *substring;
6950 Py_ssize_t start = 0;
6951 Py_ssize_t end = PY_SSIZE_T_MAX;
6952 int result;
6953
6954 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6955 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6956 return NULL;
6957 if (PyTuple_Check(subobj)) {
6958 Py_ssize_t i;
6959 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6960 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6961 PyTuple_GET_ITEM(subobj, i));
6962 if (substring == NULL)
6963 return NULL;
6964 result = tailmatch(self, substring, start, end, +1);
6965 Py_DECREF(substring);
6966 if (result) {
6967 Py_RETURN_TRUE;
6968 }
6969 }
6970 Py_RETURN_FALSE;
6971 }
6972 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6973 if (substring == NULL)
6974 return NULL;
6975
6976 result = tailmatch(self, substring, start, end, +1);
6977 Py_DECREF(substring);
6978 return PyBool_FromLong(result);
6979}
6980
6981
6982
6983static PyObject *
6984unicode_getnewargs(PyUnicodeObject *v)
6985{
6986 return Py_BuildValue("(u#)", v->str, v->length);
6987}
6988
6989
6990static PyMethodDef unicode_methods[] = {
6991
6992 /* Order is according to common usage: often used methods should
6993 appear first, since lookup is done sequentially. */
6994
6995 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6996 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6997 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
6998 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
6999 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7000 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7001 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7002 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7003 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7004 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7005 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7006 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7007 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7008 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7009 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7010 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7011 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7012/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7013 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7014 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7015 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7016 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7017 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7018 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7019 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7020 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7021 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7022 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7023 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7024 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7025 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7026 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7027 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7028 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7029 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7030 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7031 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7032 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7033 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7034 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7035#if 0
7036 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7037#endif
7038
7039#if 0
7040 /* This one is just used for debugging the implementation. */
7041 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
7042#endif
7043
7044 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7045 {NULL, NULL}
7046};
7047
7048static PyObject *
7049unicode_mod(PyObject *v, PyObject *w)
7057
7058static PyNumberMethods unicode_as_number = {
7059 0, /*nb_add*/
7060 0, /*nb_subtract*/
7061 0, /*nb_multiply*/
7062 0, /*nb_divide*/
7063 unicode_mod, /*nb_remainder*/
7064};
7065
/* Sequence protocol table for unicode objects.  The item/slice
   assignment slots are 0, i.e. no in-place modification is offered
   through this table. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,           /* sq_length */
    PyUnicode_Concat,                   /* sq_concat */
    (ssizeargfunc) unicode_repeat,      /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    PyUnicode_Contains,                 /* sq_contains */
};
7076
7077static PyObject*
7078unicode_subscript(PyUnicodeObject* self, PyObject* item)
7079{
7080 if (PyIndex_Check(item)) {
7081 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7082 if (i == -1 && PyErr_Occurred())
7083 return NULL;
7084 if (i < 0)
7085 i += PyUnicode_GET_SIZE(self);
7086 return unicode_getitem(self, i);
7087 } else if (PySlice_Check(item)) {
7088 Py_ssize_t start, stop, step, slicelength, cur, i;
7089 Py_UNICODE* source_buf;
7090 Py_UNICODE* result_buf;
7091 PyObject* result;
7092
7093 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7094 &start, &stop, &step, &slicelength) < 0) {
7095 return NULL;
7096 }
7097
7098 if (slicelength <= 0) {
7099 return PyUnicode_FromUnicode(NULL, 0);
7100 } else {
7101 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7102 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7103 sizeof(Py_UNICODE));
7104
7105 if (result_buf == NULL)
7106 return PyErr_NoMemory();
7107
7108 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7109 result_buf[i] = source_buf[cur];
7110 }
7111
7112 result = PyUnicode_FromUnicode(result_buf, slicelength);
7113 PyMem_FREE(result_buf);
7114 return result;
7115 }
7116 } else {
7117 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7118 return NULL;
7119 }
7120}
7121
/* Mapping protocol table for unicode objects.  Subscription goes through
   unicode_subscript (indices and slice objects); the assignment slot is
   0, so item assignment is not offered through this table. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
7127
7128static Py_ssize_t
7129unicode_buffer_getreadbuf(PyUnicodeObject *self,
7130 Py_ssize_t index,
7131 const void **ptr)
7132{
7133 if (index != 0) {
7134 PyErr_SetString(PyExc_SystemError,
7135 "accessing non-existent unicode segment");
7136 return -1;
7137 }
7138 *ptr = (void *) self->str;
7139 return PyUnicode_GET_DATA_SIZE(self);
7140}
7141
/* Write-buffer slot: no writable buffer is ever handed out.  All three
   arguments are ignored; the call always raises TypeError and
   returns -1. */
static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
                           const void **ptr)
{
    PyErr_SetString(PyExc_TypeError,
                    "cannot use unicode as modifiable buffer");
    return -1;
}
7150
7151static int
7152unicode_buffer_getsegcount(PyUnicodeObject *self,
7153 Py_ssize_t *lenp)
7154{
7155 if (lenp)
7156 *lenp = PyUnicode_GET_DATA_SIZE(self);
7157 return 1;
7158}
7159
7160static Py_ssize_t
7161unicode_buffer_getcharbuf(PyUnicodeObject *self,
7162 Py_ssize_t index,
7163 const void **ptr)
7164{
7165 PyObject *str;
7166
7167 if (index != 0) {
7168 PyErr_SetString(PyExc_SystemError,
7169 "accessing non-existent unicode segment");
7170 return -1;
7171 }
7172 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7173 if (str == NULL)
7174 return -1;
7175 *ptr = (void *) PyString_AS_STRING(str);
7176 return PyString_GET_SIZE(str);
7177}
7178
7179/* Helpers for PyUnicode_Format() */
7180
7181static PyObject *
7182getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7183{
7184 Py_ssize_t argidx = *p_argidx;
7185 if (argidx < arglen) {
7186 (*p_argidx)++;
7187 if (arglen < 0)
7188 return args;
7189 else
7190 return PyTuple_GetItem(args, argidx);
7191 }
7192 PyErr_SetString(PyExc_TypeError,
7193 "not enough arguments for format string");
7194 return NULL;
7195}
7196
/* Conversion flag bits.  PyUnicode_Format() ORs them into its `flags`
   variable while scanning the flag characters of a format specifier and
   passes that variable on to the format*() helpers. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
7202
7203static Py_ssize_t
7204strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7205{
7206 register Py_ssize_t i;
7207 Py_ssize_t len = strlen(charbuffer);
7208 for (i = len - 1; i >= 0; i--)
7209 buffer[i] = (Py_UNICODE) charbuffer[i];
7210
7211 return len;
7212}
7213
7214static int
7215doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7216{
7217 Py_ssize_t result;
7218
7219 PyOS_ascii_formatd((char *)buffer, len, format, x);
7220 result = strtounicode(buffer, (char *)buffer);
7221 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7222}
7223
7224static int
7225longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7226{
7227 Py_ssize_t result;
7228
7229 PyOS_snprintf((char *)buffer, len, format, x);
7230 result = strtounicode(buffer, (char *)buffer);
7231 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7232}
7233
7234/* XXX To save some code duplication, formatfloat/long/int could have been
7235 shared with stringobject.c, converting from 8-bit to Unicode after the
7236 formatting is done. */
7237
7238static int
7239formatfloat(Py_UNICODE *buf,
7240 size_t buflen,
7241 int flags,
7242 int prec,
7243 int type,
7244 PyObject *v)
7245{
7246 /* fmt = '%#.' + `prec` + `type`
7247 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7248 char fmt[20];
7249 double x;
7250
7251 x = PyFloat_AsDouble(v);
7252 if (x == -1.0 && PyErr_Occurred())
7253 return -1;
7254 if (prec < 0)
7255 prec = 6;
7256 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7257 type = 'g';
7258 /* Worst case length calc to ensure no buffer overrun:
7259
7260 'g' formats:
7261 fmt = %#.<prec>g
7262 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7263 for any double rep.)
7264 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7265
7266 'f' formats:
7267 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7268 len = 1 + 50 + 1 + prec = 52 + prec
7269
7270 If prec=0 the effective precision is 1 (the leading digit is
7271 always given), therefore increase the length by one.
7272
7273 */
7274 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7275 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7276 PyErr_SetString(PyExc_OverflowError,
7277 "formatted float is too long (precision too large?)");
7278 return -1;
7279 }
7280 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7281 (flags&F_ALT) ? "#" : "",
7282 prec, type);
7283 return doubletounicode(buf, buflen, fmt, x);
7284}
7285
7286static PyObject*
7287formatlong(PyObject *val, int flags, int prec, int type)
7288{
7289 char *buf;
7290 int i, len;
7291 PyObject *str; /* temporary string object. */
7292 PyUnicodeObject *result;
7293
7294 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7295 if (!str)
7296 return NULL;
7297 result = _PyUnicode_New(len);
7298 if (!result) {
7299 Py_DECREF(str);
7300 return NULL;
7301 }
7302 for (i = 0; i < len; i++)
7303 result->str[i] = buf[i];
7304 result->str[len] = 0;
7305 Py_DECREF(str);
7306 return (PyObject*)result;
7307}
7308
7309static int
7310formatint(Py_UNICODE *buf,
7311 size_t buflen,
7312 int flags,
7313 int prec,
7314 int type,
7315 PyObject *v)
7316{
7317 /* fmt = '%#.' + `prec` + 'l' + `type`
7318 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7319 * + 1 + 1
7320 * = 24
7321 */
7322 char fmt[64]; /* plenty big enough! */
7323 char *sign;
7324 long x;
7325
7326 x = PyInt_AsLong(v);
7327 if (x == -1 && PyErr_Occurred())
7328 return -1;
7329 if (x < 0 && type == 'u') {
7330 type = 'd';
7331 }
7332 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7333 sign = "-";
7334 else
7335 sign = "";
7336 if (prec < 0)
7337 prec = 1;
7338
7339 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7340 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7341 */
7342 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7343 PyErr_SetString(PyExc_OverflowError,
7344 "formatted integer is too long (precision too large?)");
7345 return -1;
7346 }
7347
7348 if ((flags & F_ALT) &&
7349 (type == 'x' || type == 'X')) {
7350 /* When converting under %#x or %#X, there are a number
7351 * of issues that cause pain:
7352 * - when 0 is being converted, the C standard leaves off
7353 * the '0x' or '0X', which is inconsistent with other
7354 * %#x/%#X conversions and inconsistent with Python's
7355 * hex() function
7356 * - there are platforms that violate the standard and
7357 * convert 0 with the '0x' or '0X'
7358 * (Metrowerks, Compaq Tru64)
7359 * - there are platforms that give '0x' when converting
7360 * under %#X, but convert 0 in accordance with the
7361 * standard (OS/2 EMX)
7362 *
7363 * We can achieve the desired consistency by inserting our
7364 * own '0x' or '0X' prefix, and substituting %x/%X in place
7365 * of %#x/%#X.
7366 *
7367 * Note that this is the same approach as used in
7368 * formatint() in stringobject.c
7369 */
7370 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7371 sign, type, prec, type);
7372 }
7373 else {
7374 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7375 sign, (flags&F_ALT) ? "#" : "",
7376 prec, type);
7377 }
7378 if (sign[0])
7379 return longtounicode(buf, buflen, fmt, -x);
7380 else
7381 return longtounicode(buf, buflen, fmt, x);
7382}
7383
7384static int
7385formatchar(Py_UNICODE *buf,
7386 size_t buflen,
7387 PyObject *v)
7388{
7389 /* presume that the buffer is at least 2 characters long */
7390 if (PyUnicode_Check(v)) {
7391 if (PyUnicode_GET_SIZE(v) != 1)
7392 goto onError;
7393 buf[0] = PyUnicode_AS_UNICODE(v)[0];
7394 }
7395
7396 else if (PyString_Check(v)) {
7397 if (PyString_GET_SIZE(v) != 1)
7398 goto onError;
7399 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7400 }
7401
7402 else {
7403 /* Integer input truncated to a character */
7404 long x;
7405 x = PyInt_AsLong(v);
7406 if (x == -1 && PyErr_Occurred())
7407 goto onError;
7408#ifdef Py_UNICODE_WIDE
7409 if (x < 0 || x > 0x10ffff) {
7410 PyErr_SetString(PyExc_OverflowError,
7411 "%c arg not in range(0x110000) "
7412 "(wide Python build)");
7413 return -1;
7414 }
7415#else
7416 if (x < 0 || x > 0xffff) {
7417 PyErr_SetString(PyExc_OverflowError,
7418 "%c arg not in range(0x10000) "
7419 "(narrow Python build)");
7420 return -1;
7421 }
7422#endif
7423 buf[0] = (Py_UNICODE) x;
7424 }
7425 buf[1] = '\0';
7426 return 1;
7427
7428 onError:
7429 PyErr_SetString(PyExc_TypeError,
7430 "%c requires int or char");
7431 return -1;
7432}
7433
7434/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7435
7436 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7437 chars are formatted. XXX This is a magic number. Each formatting
7438 routine does bounds checking to ensure no overflow, but a better
7439 solution may be to malloc a buffer of appropriate size for each
7440 format. For now, the current solution is sufficient.
7441*/
/* The expansion is fully parenthesized so the macro stays a single
   operand wherever it is used (e.g. after sizeof or next to a
   higher-precedence operator). */
#define FORMATBUFLEN ((size_t)120)
7443
7444PyObject *PyUnicode_Format(PyObject *format,
7445 PyObject *args)
7446{
7447 Py_UNICODE *fmt, *res;
7448 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7449 int args_owned = 0;
7450 PyUnicodeObject *result = NULL;
7451 PyObject *dict = NULL;
7452 PyObject *uformat;
7453
7454 if (format == NULL || args == NULL) {
7455 PyErr_BadInternalCall();
7456 return NULL;
7457 }
7458 uformat = PyUnicode_FromObject(format);
7459 if (uformat == NULL)
7460 return NULL;
7461 fmt = PyUnicode_AS_UNICODE(uformat);
7462 fmtcnt = PyUnicode_GET_SIZE(uformat);
7463
7464 reslen = rescnt = fmtcnt + 100;
7465 result = _PyUnicode_New(reslen);
7466 if (result == NULL)
7467 goto onError;
7468 res = PyUnicode_AS_UNICODE(result);
7469
7470 if (PyTuple_Check(args)) {
7471 arglen = PyTuple_Size(args);
7472 argidx = 0;
7473 }
7474 else {
7475 arglen = -1;
7476 argidx = -2;
7477 }
7478 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7479 !PyObject_TypeCheck(args, &PyBaseString_Type))
7480 dict = args;
7481
7482 while (--fmtcnt >= 0) {
7483 if (*fmt != '%') {
7484 if (--rescnt < 0) {
7485 rescnt = fmtcnt + 100;
7486 reslen += rescnt;
7487 if (_PyUnicode_Resize(&result, reslen) < 0)
7488 goto onError;
7489 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7490 --rescnt;
7491 }
7492 *res++ = *fmt++;
7493 }
7494 else {
7495 /* Got a format specifier */
7496 int flags = 0;
7497 Py_ssize_t width = -1;
7498 int prec = -1;
7499 Py_UNICODE c = '\0';
7500 Py_UNICODE fill;
7501 PyObject *v = NULL;
7502 PyObject *temp = NULL;
7503 Py_UNICODE *pbuf;
7504 Py_UNICODE sign;
7505 Py_ssize_t len;
7506 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7507
7508 fmt++;
7509 if (*fmt == '(') {
7510 Py_UNICODE *keystart;
7511 Py_ssize_t keylen;
7512 PyObject *key;
7513 int pcount = 1;
7514
7515 if (dict == NULL) {
7516 PyErr_SetString(PyExc_TypeError,
7517 "format requires a mapping");
7518 goto onError;
7519 }
7520 ++fmt;
7521 --fmtcnt;
7522 keystart = fmt;
7523 /* Skip over balanced parentheses */
7524 while (pcount > 0 && --fmtcnt >= 0) {
7525 if (*fmt == ')')
7526 --pcount;
7527 else if (*fmt == '(')
7528 ++pcount;
7529 fmt++;
7530 }
7531 keylen = fmt - keystart - 1;
7532 if (fmtcnt < 0 || pcount > 0) {
7533 PyErr_SetString(PyExc_ValueError,
7534 "incomplete format key");
7535 goto onError;
7536 }
7537#if 0
7538 /* keys are converted to strings using UTF-8 and
7539 then looked up since Python uses strings to hold
7540 variables names etc. in its namespaces and we
7541 wouldn't want to break common idioms. */
7542 key = PyUnicode_EncodeUTF8(keystart,
7543 keylen,
7544 NULL);
7545#else
7546 key = PyUnicode_FromUnicode(keystart, keylen);
7547#endif
7548 if (key == NULL)
7549 goto onError;
7550 if (args_owned) {
7551 Py_DECREF(args);
7552 args_owned = 0;
7553 }
7554 args = PyObject_GetItem(dict, key);
7555 Py_DECREF(key);
7556 if (args == NULL) {
7557 goto onError;
7558 }
7559 args_owned = 1;
7560 arglen = -1;
7561 argidx = -2;
7562 }
7563 while (--fmtcnt >= 0) {
7564 switch (c = *fmt++) {
7565 case '-': flags |= F_LJUST; continue;
7566 case '+': flags |= F_SIGN; continue;
7567 case ' ': flags |= F_BLANK; continue;
7568 case '#': flags |= F_ALT; continue;
7569 case '0': flags |= F_ZERO; continue;
7570 }
7571 break;
7572 }
7573 if (c == '*') {
7574 v = getnextarg(args, arglen, &argidx);
7575 if (v == NULL)
7576 goto onError;
7577 if (!PyInt_Check(v)) {
7578 PyErr_SetString(PyExc_TypeError,
7579 "* wants int");
7580 goto onError;
7581 }
7582 width = PyInt_AsLong(v);
7583 if (width < 0) {
7584 flags |= F_LJUST;
7585 width = -width;
7586 }
7587 if (--fmtcnt >= 0)
7588 c = *fmt++;
7589 }
7590 else if (c >= '0' && c <= '9') {
7591 width = c - '0';
7592 while (--fmtcnt >= 0) {
7593 c = *fmt++;
7594 if (c < '0' || c > '9')
7595 break;
7596 if ((width*10) / 10 != width) {
7597 PyErr_SetString(PyExc_ValueError,
7598 "width too big");
7599 goto onError;
7600 }
7601 width = width*10 + (c - '0');
7602 }
7603 }
7604 if (c == '.') {
7605 prec = 0;
7606 if (--fmtcnt >= 0)
7607 c = *fmt++;
7608 if (c == '*') {
7609 v = getnextarg(args, arglen, &argidx);
7610 if (v == NULL)
7611 goto onError;
7612 if (!PyInt_Check(v)) {
7613 PyErr_SetString(PyExc_TypeError,
7614 "* wants int");
7615 goto onError;
7616 }
7617 prec = PyInt_AsLong(v);
7618 if (prec < 0)
7619 prec = 0;
7620 if (--fmtcnt >= 0)
7621 c = *fmt++;
7622 }
7623 else if (c >= '0' && c <= '9') {
7624 prec = c - '0';
7625 while (--fmtcnt >= 0) {
7626 c = Py_CHARMASK(*fmt++);
7627 if (c < '0' || c > '9')
7628 break;
7629 if ((prec*10) / 10 != prec) {
7630 PyErr_SetString(PyExc_ValueError,
7631 "prec too big");
7632 goto onError;
7633 }
7634 prec = prec*10 + (c - '0');
7635 }
7636 }
7637 } /* prec */
7638 if (fmtcnt >= 0) {
7639 if (c == 'h' || c == 'l' || c == 'L') {
7640 if (--fmtcnt >= 0)
7641 c = *fmt++;
7642 }
7643 }
7644 if (fmtcnt < 0) {
7645 PyErr_SetString(PyExc_ValueError,
7646 "incomplete format");
7647 goto onError;
7648 }
7649 if (c != '%') {
7650 v = getnextarg(args, arglen, &argidx);
7651 if (v == NULL)
7652 goto onError;
7653 }
7654 sign = 0;
7655 fill = ' ';
7656 switch (c) {
7657
7658 case '%':
7659 pbuf = formatbuf;
7660 /* presume that buffer length is at least 1 */
7661 pbuf[0] = '%';
7662 len = 1;
7663 break;
7664
7665 case 's':
7666 case 'r':
7667 if (PyUnicode_Check(v) && c == 's') {
7668 temp = v;
7669 Py_INCREF(temp);
7670 }
7671 else {
7672 PyObject *unicode;
7673 if (c == 's')
7674 temp = PyObject_Unicode(v);
7675 else
7676 temp = PyObject_Repr(v);
7677 if (temp == NULL)
7678 goto onError;
7679 if (PyUnicode_Check(temp))
7680 /* nothing to do */;
7681 else if (PyString_Check(temp)) {
7682 /* convert to string to Unicode */
7683 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7684 PyString_GET_SIZE(temp),
7685 NULL,
7686 "strict");
7687 Py_DECREF(temp);
7688 temp = unicode;
7689 if (temp == NULL)
7690 goto onError;
7691 }
7692 else {
7693 Py_DECREF(temp);
7694 PyErr_SetString(PyExc_TypeError,
7695 "%s argument has non-string str()");
7696 goto onError;
7697 }
7698 }
7699 pbuf = PyUnicode_AS_UNICODE(temp);
7700 len = PyUnicode_GET_SIZE(temp);
7701 if (prec >= 0 && len > prec)
7702 len = prec;
7703 break;
7704
7705 case 'i':
7706 case 'd':
7707 case 'u':
7708 case 'o':
7709 case 'x':
7710 case 'X':
7711 if (c == 'i')
7712 c = 'd';
7713 if (PyLong_Check(v)) {
7714 temp = formatlong(v, flags, prec, c);
7715 if (!temp)
7716 goto onError;
7717 pbuf = PyUnicode_AS_UNICODE(temp);
7718 len = PyUnicode_GET_SIZE(temp);
7719 sign = 1;
7720 }
7721 else {
7722 pbuf = formatbuf;
7723 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7724 flags, prec, c, v);
7725 if (len < 0)
7726 goto onError;
7727 sign = 1;
7728 }
7729 if (flags & F_ZERO)
7730 fill = '0';
7731 break;
7732
7733 case 'e':
7734 case 'E':
7735 case 'f':
7736 case 'F':
7737 case 'g':
7738 case 'G':
7739 if (c == 'F')
7740 c = 'f';
7741 pbuf = formatbuf;
7742 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7743 flags, prec, c, v);
7744 if (len < 0)
7745 goto onError;
7746 sign = 1;
7747 if (flags & F_ZERO)
7748 fill = '0';
7749 break;
7750
7751 case 'c':
7752 pbuf = formatbuf;
7753 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7754 if (len < 0)
7755 goto onError;
7756 break;
7757
7758 default:
7759 PyErr_Format(PyExc_ValueError,
7760 "unsupported format character '%c' (0x%x) "
7761 "at index %i",
7762 (31<=c && c<=126) ? (char)c : '?',
7763 (int)c,
7764 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7765 goto onError;
7766 }
7767 if (sign) {
7768 if (*pbuf == '-' || *pbuf == '+') {
7769 sign = *pbuf++;
7770 len--;
7771 }
7772 else if (flags & F_SIGN)
7773 sign = '+';
7774 else if (flags & F_BLANK)
7775 sign = ' ';
7776 else
7777 sign = 0;
7778 }
7779 if (width < len)
7780 width = len;
7781 if (rescnt - (sign != 0) < width) {
7782 reslen -= rescnt;
7783 rescnt = width + fmtcnt + 100;
7784 reslen += rescnt;
7785 if (reslen < 0) {
7786 Py_XDECREF(temp);
7787 PyErr_NoMemory();
7788 goto onError;
7789 }
7790 if (_PyUnicode_Resize(&result, reslen) < 0) {
7791 Py_XDECREF(temp);
7792 goto onError;
7793 }
7794 res = PyUnicode_AS_UNICODE(result)
7795 + reslen - rescnt;
7796 }
7797 if (sign) {
7798 if (fill != ' ')
7799 *res++ = sign;
7800 rescnt--;
7801 if (width > len)
7802 width--;
7803 }
7804 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7805 assert(pbuf[0] == '0');
7806 assert(pbuf[1] == c);
7807 if (fill != ' ') {
7808 *res++ = *pbuf++;
7809 *res++ = *pbuf++;
7810 }
7811 rescnt -= 2;
7812 width -= 2;
7813 if (width < 0)
7814 width = 0;
7815 len -= 2;
7816 }
7817 if (width > len && !(flags & F_LJUST)) {
7818 do {
7819 --rescnt;
7820 *res++ = fill;
7821 } while (--width > len);
7822 }
7823 if (fill == ' ') {
7824 if (sign)
7825 *res++ = sign;
7826 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7827 assert(pbuf[0] == '0');
7828 assert(pbuf[1] == c);
7829 *res++ = *pbuf++;
7830 *res++ = *pbuf++;
7831 }
7832 }
7833 Py_UNICODE_COPY(res, pbuf, len);
7834 res += len;
7835 rescnt -= len;
7836 while (--width >= len) {
7837 --rescnt;
7838 *res++ = ' ';
7839 }
7840 if (dict && (argidx < arglen) && c != '%') {
7841 PyErr_SetString(PyExc_TypeError,
7842 "not all arguments converted during string formatting");
7843 Py_XDECREF(temp);
7844 goto onError;
7845 }
7846 Py_XDECREF(temp);
7847 } /* '%' */
7848 } /* until end */
7849 if (argidx < arglen && !dict) {
7850 PyErr_SetString(PyExc_TypeError,
7851 "not all arguments converted during string formatting");
7852 goto onError;
7853 }
7854
7855 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7856 goto onError;
7857 if (args_owned) {
7858 Py_DECREF(args);
7859 }
7860 Py_DECREF(uformat);
7861 return (PyObject *)result;
7862
7863 onError:
7864 Py_XDECREF(result);
7865 Py_DECREF(uformat);
7866 if (args_owned) {
7867 Py_DECREF(args);
7868 }
7869 return NULL;
7870}
7871
/* Buffer protocol table for unicode objects (referenced from the
   tp_as_buffer slot of PyUnicode_Type).  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer handlers
   defined above. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
7878
7879static PyObject *
7880unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7881
7882static PyObject *
7883unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7884{
7885 PyObject *x = NULL;
7886 static char *kwlist[] = {"string", "encoding", "errors", 0};
7887 char *encoding = NULL;
7888 char *errors = NULL;
7889
7890 if (type != &PyUnicode_Type)
7891 return unicode_subtype_new(type, args, kwds);
7892 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7893 kwlist, &x, &encoding, &errors))
7894 return NULL;
7895 if (x == NULL)
7896 return (PyObject *)_PyUnicode_New(0);
7897 if (encoding == NULL && errors == NULL)
7898 return PyObject_Unicode(x);
7899 else
7900 return PyUnicode_FromEncodedObject(x, encoding, errors);
7901}
7902
7903static PyObject *
7904unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7905{
7906 PyUnicodeObject *tmp, *pnew;
7907 Py_ssize_t n;
7908
7909 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7910 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7911 if (tmp == NULL)
7912 return NULL;
7913 assert(PyUnicode_Check(tmp));
7914 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7915 if (pnew == NULL) {
7916 Py_DECREF(tmp);
7917 return NULL;
7918 }
7919 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7920 if (pnew->str == NULL) {
7921 _Py_ForgetReference((PyObject *)pnew);
7922 PyObject_Del(pnew);
7923 Py_DECREF(tmp);
7924 return PyErr_NoMemory();
7925 }
7926 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7927 pnew->length = n;
7928 pnew->hash = tmp->hash;
7929 Py_DECREF(tmp);
7930 return (PyObject *)pnew;
7931}
7932
/* Docstring of the unicode type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7939
/* Type object for unicode.  It wires up the number (for %), sequence,
   mapping and buffer protocol tables defined in this file, uses
   unicode_new as constructor, inherits from PyBaseString_Type and may
   itself be subclassed (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7984
7985/* Initialize the Unicode implementation */
7986
7987void _PyUnicode_Init(void)
7988{
7989 int i;
7990
7991 /* XXX - move this array to unicodectype.c ? */
7992 Py_UNICODE linebreak[] = {
7993 0x000A, /* LINE FEED */
7994 0x000D, /* CARRIAGE RETURN */
7995 0x001C, /* FILE SEPARATOR */
7996 0x001D, /* GROUP SEPARATOR */
7997 0x001E, /* RECORD SEPARATOR */
7998 0x0085, /* NEXT LINE */
7999 0x2028, /* LINE SEPARATOR */
8000 0x2029, /* PARAGRAPH SEPARATOR */
8001 };
8002
8003 /* Init the implementation */
8004 unicode_freelist = NULL;
8005 unicode_freelist_size = 0;
8006 unicode_empty = _PyUnicode_New(0);
8007 if (!unicode_empty)
8008 return;
8009
8010 strcpy(unicode_default_encoding, "ascii");
8011 for (i = 0; i < 256; i++)
8012 unicode_latin1[i] = NULL;
8013 if (PyType_Ready(&PyUnicode_Type) < 0)
8014 Py_FatalError("Can't initialize 'unicode'");
8015
8016 /* initialize the linebreak bloom filter */
8017 bloom_linebreak = make_bloom_mask(
8018 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8019 );
8020
8021 PyType_Ready(&EncodingMapType);
8022}
8023
8024/* Finalize the Unicode implementation */
8025
8026void
8027_PyUnicode_Fini(void)
8028{
8029 PyUnicodeObject *u;
8030 int i;
8031
8032 Py_XDECREF(unicode_empty);
8033 unicode_empty = NULL;
8034
8035 for (i = 0; i < 256; i++) {
8036 if (unicode_latin1[i]) {
8037 Py_DECREF(unicode_latin1[i]);
8038 unicode_latin1[i] = NULL;
8039 }
8040 }
8041
8042 for (u = unicode_freelist; u != NULL;) {
8043 PyUnicodeObject *v = u;
8044 u = *(PyUnicodeObject **)u;
8045 if (v->str)
8046 PyMem_DEL(v->str);
8047 Py_XDECREF(v->defenc);
8048 PyObject_Del(v);
8049 }
8050 unicode_freelist = NULL;
8051 unicode_freelist_size = 0;
8052}
8053
8054#ifdef __cplusplus
8055}
8056#endif
8057
8058
8059/*
8060Local variables:
8061c-basic-offset: 4
8062indent-tabs-mode: nil
8063End:
8064*/
Note: See TracBrowser for help on using the repository browser.