Context Navigation

unicodeobject.c

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 210.3 KB

Line
1	/*
2
3	Unicode implementation based on original code by Fredrik Lundh,
4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5	Unicode Integration Proposal (see file Misc/unicode.txt).
6
7	Major speed upgrades to the method implementations at the Reykjavik
8	NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10	Copyright (c) Corporation for National Research Initiatives.
11
12	--------------------------------------------------------------------
13	The original string type implementation is:
14
15	Copyright (c) 1999 by Secret Labs AB
16	Copyright (c) 1999 by Fredrik Lundh
17
18	By obtaining, using, and/or copying this software and/or its
19	associated documentation, you agree that you have read, understood,
20	and will comply with the following terms and conditions:
21
22	Permission to use, copy, modify, and distribute this software and its
23	associated documentation for any purpose and without fee is hereby
24	granted, provided that the above copyright notice appears in all
25	copies, and that both that copyright notice and this permission notice
26	appear in supporting documentation, and that the name of Secret Labs
27	AB or the author not be used in advertising or publicity pertaining to
28	distribution of the software without specific, written prior
29	permission.
30
31	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38	--------------------------------------------------------------------
39
40	*/
41
42	#define PY_SSIZE_T_CLEAN
43	#include "Python.h"
44
45	#include "unicodeobject.h"
46	#include "ucnhash.h"
47
48	#ifdef MS_WINDOWS
49	#include <windows.h>
50	#endif
51
52	/* Limit for the Unicode object free list */
53
54	#define MAX_UNICODE_FREELIST_SIZE 1024
55
56	/* Limit for the Unicode object free list stay alive optimization.
57
58	The implementation will keep allocated Unicode memory intact for
59	all objects on the free list having a size less than this
60	limit. This reduces malloc() overhead for small Unicode objects.
61
62	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64	malloc()-overhead) bytes of unused garbage.
65
66	Setting the limit to 0 effectively turns the feature off.
67
68	Note: This is an experimental feature ! If you get core dumps when
69	using Unicode objects, turn this feature off.
70
71	*/
72
73	#define KEEPALIVE_SIZE_LIMIT 9
74
75	/* Endianness switches; defaults to little endian */
76
77	#ifdef WORDS_BIGENDIAN
78	# define BYTEORDER_IS_BIG_ENDIAN
79	#else
80	# define BYTEORDER_IS_LITTLE_ENDIAN
81	#endif
82
83	/* --- Globals ------------------------------------------------------------
84
85	The globals are initialized by the _PyUnicode_Init() API and should
86	not be used before calling that API.
87
88	*/
89
90
91	#ifdef __cplusplus
92	extern "C" {
93	#endif
94
95	/* Free list for Unicode objects */
96	static PyUnicodeObject *unicode_freelist;
97	static int unicode_freelist_size;
98
99	/* The empty Unicode object is shared to improve performance. */
100	static PyUnicodeObject *unicode_empty;
101
102	/* Single character Unicode strings in the Latin-1 range are being
103	shared as well. */
104	static PyUnicodeObject *unicode_latin1[256];
105
106	/* Default encoding to use and assume when NULL is passed as encoding
107	parameter; it is initialized by _PyUnicode_Init().
108
109	Always use the PyUnicode_SetDefaultEncoding() and
110	PyUnicode_GetDefaultEncoding() APIs to access this global.
111
112	*/
113	static char unicode_default_encoding[100];
114
115	Py_UNICODE
116	PyUnicode_GetMax(void)
117	{
118	#ifdef Py_UNICODE_WIDE
119	return 0x10FFFF;
120	#else
121	/* This is actually an illegal character, so it should
122	not be passed to unichr. */
123	return 0xFFFF;
124	#endif
125	}
126
127	/* --- Bloom Filters ----------------------------------------------------- */
128
129	/* stuff to implement simple "bloom filters" for Unicode characters.
130	to keep things simple, we use a single bitmask, using the least 5
131	bits from each unicode characters as the bit index. */
132
133	/* the linebreak mask is set up by Unicode_Init below */
134
/* Type of the single-word bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask covering the line break characters (see the note above about
   where it is set up). */
static BLOOM_MASK bloom_linebreak;

/* Test whether the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is done on an unsigned long constant: shifting a
   plain int `1` by 31 is undefined behaviour.  `mask` is parenthesized
   so that compound mask expressions are evaluated as a unit. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter through the bloom mask, followed by the exact test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
144	Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
145	{
146	/* calculate simple bloom-style bitmask for a given unicode string */
147
148	long mask;
149	Py_ssize_t i;
150
151	mask = 0;
152	for (i = 0; i < len; i++)
153	mask \|= (1 << (ptr[i] & 0x1F));
154
155	return mask;
156	}
157
158	Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
159	{
160	Py_ssize_t i;
161
162	for (i = 0; i < setlen; i++)
163	if (set[i] == chr)
164	return 1;
165
166	return 0;
167	}
168
169	#define BLOOM_MEMBER(mask, chr, set, setlen)\
170	BLOOM(mask, chr) && unicode_member(chr, set, setlen)
171
172	/* --- Unicode Object ----------------------------------------------------- */
173
174	static
175	int unicode_resize(register PyUnicodeObject *unicode,
176	Py_ssize_t length)
177	{
178	void *oldstr;
179
180	/* Shortcut if there's nothing much to do. */
181	if (unicode->length == length)
182	goto reset;
183
184	/* Resizing shared object (unicode_empty or single character
185	objects) in-place is not allowed. Use PyUnicode_Resize()
186	instead ! */
187
188	if (unicode == unicode_empty \|\|
189	(unicode->length == 1 &&
190	unicode->str[0] < 256U &&
191	unicode_latin1[unicode->str[0]] == unicode)) {
192	PyErr_SetString(PyExc_SystemError,
193	"can't resize shared unicode objects");
194	return -1;
195	}
196
197	/* We allocate one more byte to make sure the string is Ux0000 terminated.
198	The overallocation is also used by fastsearch, which assumes that it's
199	safe to look at str[length] (without making any assumptions about what
200	it contains). */
201
202	oldstr = unicode->str;
203	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204	if (!unicode->str) {
205	unicode->str = (Py_UNICODE *)oldstr;
206	PyErr_NoMemory();
207	return -1;
208	}
209	unicode->str[length] = 0;
210	unicode->length = length;
211
212	reset:
213	/* Reset the object caches */
214	if (unicode->defenc) {
215	Py_DECREF(unicode->defenc);
216	unicode->defenc = NULL;
217	}
218	unicode->hash = -1;
219
220	return 0;
221	}
222
223	/* We allocate one more byte to make sure the string is
224	Ux0000 terminated -- XXX is this needed ?
225
226	XXX This allocator could further be enhanced by assuring that the
227	free list never reduces its size below 1.
228
229	*/
230
231	static
232	PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
233	{
234	register PyUnicodeObject *unicode;
235
236	/* Optimization for empty strings */
237	if (length == 0 && unicode_empty != NULL) {
238	Py_INCREF(unicode_empty);
239	return unicode_empty;
240	}
241
242	/* Unicode freelist & memory allocation */
243	if (unicode_freelist) {
244	unicode = unicode_freelist;
245	unicode_freelist = (PyUnicodeObject *)unicode;
246	unicode_freelist_size--;
247	if (unicode->str) {
248	/* Keep-Alive optimization: we only upsize the buffer,
249	never downsize it. */
250	if ((unicode->length < length) &&
251	unicode_resize(unicode, length) < 0) {
252	PyMem_DEL(unicode->str);
253	goto onError;
254	}
255	}
256	else {
257	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
258	}
259	PyObject_INIT(unicode, &PyUnicode_Type);
260	}
261	else {
262	unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
263	if (unicode == NULL)
264	return NULL;
265	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266	}
267
268	if (!unicode->str) {
269	PyErr_NoMemory();
270	goto onError;
271	}
272	/* Initialize the first element to guard against cases where
273	* the caller fails before initializing str -- unicode_resize()
274	* reads str[0], and the Keep-Alive optimization can keep memory
275	* allocated for str alive across a call to unicode_dealloc(unicode).
276	* We don't want unicode_resize to read uninitialized memory in
277	* that case.
278	*/
279	unicode->str[0] = 0;
280	unicode->str[length] = 0;
281	unicode->length = length;
282	unicode->hash = -1;
283	unicode->defenc = NULL;
284	return unicode;
285
286	onError:
287	_Py_ForgetReference((PyObject *)unicode);
288	PyObject_Del(unicode);
289	return NULL;
290	}
291
292	static
293	void unicode_dealloc(register PyUnicodeObject *unicode)
294	{
295	if (PyUnicode_CheckExact(unicode) &&
296	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
297	/* Keep-Alive optimization */
298	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
299	PyMem_DEL(unicode->str);
300	unicode->str = NULL;
301	unicode->length = 0;
302	}
303	if (unicode->defenc) {
304	Py_DECREF(unicode->defenc);
305	unicode->defenc = NULL;
306	}
307	/* Add to free list */
308	(PyUnicodeObject *)unicode = unicode_freelist;
309	unicode_freelist = unicode;
310	unicode_freelist_size++;
311	}
312	else {
313	PyMem_DEL(unicode->str);
314	Py_XDECREF(unicode->defenc);
315	unicode->ob_type->tp_free((PyObject *)unicode);
316	}
317	}
318
319	int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
320	{
321	register PyUnicodeObject *v;
322
323	/* Argument checks */
324	if (unicode == NULL) {
325	PyErr_BadInternalCall();
326	return -1;
327	}
328	v = (PyUnicodeObject )unicode;
329	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1 \|\| length < 0) {
330	PyErr_BadInternalCall();
331	return -1;
332	}
333
334	/* Resizing unicode_empty and single character objects is not
335	possible since these are being shared. We simply return a fresh
336	copy with the same Unicode content. */
337	if (v->length != length &&
338	(v == unicode_empty \|\| v->length == 1)) {
339	PyUnicodeObject *w = _PyUnicode_New(length);
340	if (w == NULL)
341	return -1;
342	Py_UNICODE_COPY(w->str, v->str,
343	length < v->length ? length : v->length);
344	Py_DECREF(*unicode);
345	unicode = (PyObject )w;
346	return 0;
347	}
348
349	/* Note that we don't have to modify *unicode for unshared Unicode
350	objects, since we can modify them in-place. */
351	return unicode_resize(v, length);
352	}
353
354	/* Internal API for use in unicodeobject.c only ! */
355	#define _PyUnicode_Resize(unicodevar, length) \
356	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
358	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
359	Py_ssize_t size)
360	{
361	PyUnicodeObject *unicode;
362
363	/* If the Unicode data is known at construction time, we can apply
364	some optimizations which share commonly used objects. */
365	if (u != NULL) {
366
367	/* Optimization for empty strings */
368	if (size == 0 && unicode_empty != NULL) {
369	Py_INCREF(unicode_empty);
370	return (PyObject *)unicode_empty;
371	}
372
373	/* Single character Unicode objects in the Latin-1 range are
374	shared when using this constructor */
375	if (size == 1 && *u < 256) {
376	unicode = unicode_latin1[*u];
377	if (!unicode) {
378	unicode = _PyUnicode_New(1);
379	if (!unicode)
380	return NULL;
381	unicode->str[0] = *u;
382	unicode_latin1[*u] = unicode;
383	}
384	Py_INCREF(unicode);
385	return (PyObject *)unicode;
386	}
387	}
388
389	unicode = _PyUnicode_New(size);
390	if (!unicode)
391	return NULL;
392
393	/* Copy the Unicode data into the new object */
394	if (u != NULL)
395	Py_UNICODE_COPY(unicode->str, u, size);
396
397	return (PyObject *)unicode;
398	}
399
400	#ifdef HAVE_WCHAR_H
401
402	PyObject PyUnicode_FromWideChar(register const wchar_t w,
403	Py_ssize_t size)
404	{
405	PyUnicodeObject *unicode;
406
407	if (w == NULL) {
408	PyErr_BadInternalCall();
409	return NULL;
410	}
411
412	unicode = _PyUnicode_New(size);
413	if (!unicode)
414	return NULL;
415
416	/* Copy the wchar_t data into the new object */
417	#ifdef HAVE_USABLE_WCHAR_T
418	memcpy(unicode->str, w, size * sizeof(wchar_t));
419	#else
420	{
421	register Py_UNICODE *u;
422	register Py_ssize_t i;
423	u = PyUnicode_AS_UNICODE(unicode);
424	for (i = size; i > 0; i--)
425	u++ = w++;
426	}
427	#endif
428
429	return (PyObject *)unicode;
430	}
431
432	Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433	wchar_t *w,
434	Py_ssize_t size)
435	{
436	if (unicode == NULL) {
437	PyErr_BadInternalCall();
438	return -1;
439	}
440
441	/* If possible, try to copy the 0-termination as well */
442	if (size > PyUnicode_GET_SIZE(unicode))
443	size = PyUnicode_GET_SIZE(unicode) + 1;
444
445	#ifdef HAVE_USABLE_WCHAR_T
446	memcpy(w, unicode->str, size * sizeof(wchar_t));
447	#else
448	{
449	register Py_UNICODE *u;
450	register Py_ssize_t i;
451	u = PyUnicode_AS_UNICODE(unicode);
452	for (i = size; i > 0; i--)
453	w++ = u++;
454	}
455	#endif
456
457	if (size > PyUnicode_GET_SIZE(unicode))
458	return PyUnicode_GET_SIZE(unicode);
459	else
460	return size;
461	}
462
463	#endif
464
465	PyObject *PyUnicode_FromOrdinal(int ordinal)
466	{
467	Py_UNICODE s[1];
468
469	#ifdef Py_UNICODE_WIDE
470	if (ordinal < 0 \|\| ordinal > 0x10ffff) {
471	PyErr_SetString(PyExc_ValueError,
472	"unichr() arg not in range(0x110000) "
473	"(wide Python build)");
474	return NULL;
475	}
476	#else
477	if (ordinal < 0 \|\| ordinal > 0xffff) {
478	PyErr_SetString(PyExc_ValueError,
479	"unichr() arg not in range(0x10000) "
480	"(narrow Python build)");
481	return NULL;
482	}
483	#endif
484
485	s[0] = (Py_UNICODE)ordinal;
486	return PyUnicode_FromUnicode(s, 1);
487	}
488
489	PyObject PyUnicode_FromObject(register PyObject obj)
490	{
491	/* XXX Perhaps we should make this API an alias of
492	PyObject_Unicode() instead ?! */
493	if (PyUnicode_CheckExact(obj)) {
494	Py_INCREF(obj);
495	return obj;
496	}
497	if (PyUnicode_Check(obj)) {
498	/* For a Unicode subtype that's not a Unicode object,
499	return a true Unicode object with the same data. */
500	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501	PyUnicode_GET_SIZE(obj));
502	}
503	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504	}
505
506	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
507	const char *encoding,
508	const char *errors)
509	{
510	const char *s = NULL;
511	Py_ssize_t len;
512	PyObject *v;
513
514	if (obj == NULL) {
515	PyErr_BadInternalCall();
516	return NULL;
517	}
518
519	#if 0
520	/* For b/w compatibility we also accept Unicode objects provided
521	that no encodings is given and then redirect to
522	PyObject_Unicode() which then applies the additional logic for
523	Unicode subclasses.
524
525	NOTE: This API should really only be used for object which
526	represent encoded Unicode !
527
528	*/
529	if (PyUnicode_Check(obj)) {
530	if (encoding) {
531	PyErr_SetString(PyExc_TypeError,
532	"decoding Unicode is not supported");
533	return NULL;
534	}
535	return PyObject_Unicode(obj);
536	}
537	#else
538	if (PyUnicode_Check(obj)) {
539	PyErr_SetString(PyExc_TypeError,
540	"decoding Unicode is not supported");
541	return NULL;
542	}
543	#endif
544
545	/* Coerce object */
546	if (PyString_Check(obj)) {
547	s = PyString_AS_STRING(obj);
548	len = PyString_GET_SIZE(obj);
549	}
550	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551	/* Overwrite the error message with something more useful in
552	case of a TypeError. */
553	if (PyErr_ExceptionMatches(PyExc_TypeError))
554	PyErr_Format(PyExc_TypeError,
555	"coercing to Unicode: need string or buffer, "
556	"%.80s found",
557	obj->ob_type->tp_name);
558	goto onError;
559	}
560
561	/* Convert to Unicode */
562	if (len == 0) {
563	Py_INCREF(unicode_empty);
564	v = (PyObject *)unicode_empty;
565	}
566	else
567	v = PyUnicode_Decode(s, len, encoding, errors);
568
569	return v;
570
571	onError:
572	return NULL;
573	}
574
575	PyObject PyUnicode_Decode(const char s,
576	Py_ssize_t size,
577	const char *encoding,
578	const char *errors)
579	{
580	PyObject buffer = NULL, unicode;
581
582	if (encoding == NULL)
583	encoding = PyUnicode_GetDefaultEncoding();
584
585	/* Shortcuts for common default encodings */
586	if (strcmp(encoding, "utf-8") == 0)
587	return PyUnicode_DecodeUTF8(s, size, errors);
588	else if (strcmp(encoding, "latin-1") == 0)
589	return PyUnicode_DecodeLatin1(s, size, errors);
590	#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591	else if (strcmp(encoding, "mbcs") == 0)
592	return PyUnicode_DecodeMBCS(s, size, errors);
593	#endif
594	else if (strcmp(encoding, "ascii") == 0)
595	return PyUnicode_DecodeASCII(s, size, errors);
596
597	/* Decode via the codec registry */
598	buffer = PyBuffer_FromMemory((void *)s, size);
599	if (buffer == NULL)
600	goto onError;
601	unicode = PyCodec_Decode(buffer, encoding, errors);
602	if (unicode == NULL)
603	goto onError;
604	if (!PyUnicode_Check(unicode)) {
605	PyErr_Format(PyExc_TypeError,
606	"decoder did not return an unicode object (type=%.400s)",
607	unicode->ob_type->tp_name);
608	Py_DECREF(unicode);
609	goto onError;
610	}
611	Py_DECREF(buffer);
612	return unicode;
613
614	onError:
615	Py_XDECREF(buffer);
616	return NULL;
617	}
618
619	PyObject PyUnicode_AsDecodedObject(PyObject unicode,
620	const char *encoding,
621	const char *errors)
622	{
623	PyObject *v;
624
625	if (!PyUnicode_Check(unicode)) {
626	PyErr_BadArgument();
627	goto onError;
628	}
629
630	if (encoding == NULL)
631	encoding = PyUnicode_GetDefaultEncoding();
632
633	/* Decode via the codec registry */
634	v = PyCodec_Decode(unicode, encoding, errors);
635	if (v == NULL)
636	goto onError;
637	return v;
638
639	onError:
640	return NULL;
641	}
642
643	PyObject PyUnicode_Encode(const Py_UNICODE s,
644	Py_ssize_t size,
645	const char *encoding,
646	const char *errors)
647	{
648	PyObject v, unicode;
649
650	unicode = PyUnicode_FromUnicode(s, size);
651	if (unicode == NULL)
652	return NULL;
653	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654	Py_DECREF(unicode);
655	return v;
656	}
657
658	PyObject PyUnicode_AsEncodedObject(PyObject unicode,
659	const char *encoding,
660	const char *errors)
661	{
662	PyObject *v;
663
664	if (!PyUnicode_Check(unicode)) {
665	PyErr_BadArgument();
666	goto onError;
667	}
668
669	if (encoding == NULL)
670	encoding = PyUnicode_GetDefaultEncoding();
671
672	/* Encode via the codec registry */
673	v = PyCodec_Encode(unicode, encoding, errors);
674	if (v == NULL)
675	goto onError;
676	return v;
677
678	onError:
679	return NULL;
680	}
681
682	PyObject PyUnicode_AsEncodedString(PyObject unicode,
683	const char *encoding,
684	const char *errors)
685	{
686	PyObject *v;
687
688	if (!PyUnicode_Check(unicode)) {
689	PyErr_BadArgument();
690	goto onError;
691	}
692
693	if (encoding == NULL)
694	encoding = PyUnicode_GetDefaultEncoding();
695
696	/* Shortcuts for common default encodings */
697	if (errors == NULL) {
698	if (strcmp(encoding, "utf-8") == 0)
699	return PyUnicode_AsUTF8String(unicode);
700	else if (strcmp(encoding, "latin-1") == 0)
701	return PyUnicode_AsLatin1String(unicode);
702	#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703	else if (strcmp(encoding, "mbcs") == 0)
704	return PyUnicode_AsMBCSString(unicode);
705	#endif
706	else if (strcmp(encoding, "ascii") == 0)
707	return PyUnicode_AsASCIIString(unicode);
708	}
709
710	/* Encode via the codec registry */
711	v = PyCodec_Encode(unicode, encoding, errors);
712	if (v == NULL)
713	goto onError;
714	if (!PyString_Check(v)) {
715	PyErr_Format(PyExc_TypeError,
716	"encoder did not return a string object (type=%.400s)",
717	v->ob_type->tp_name);
718	Py_DECREF(v);
719	goto onError;
720	}
721	return v;
722
723	onError:
724	return NULL;
725	}
726
727	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
728	const char *errors)
729	{
730	PyObject v = ((PyUnicodeObject )unicode)->defenc;
731
732	if (v)
733	return v;
734	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735	if (v && errors == NULL)
736	((PyUnicodeObject *)unicode)->defenc = v;
737	return v;
738	}
739
740	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
741	{
742	if (!PyUnicode_Check(unicode)) {
743	PyErr_BadArgument();
744	goto onError;
745	}
746	return PyUnicode_AS_UNICODE(unicode);
747
748	onError:
749	return NULL;
750	}
751
752	Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
753	{
754	if (!PyUnicode_Check(unicode)) {
755	PyErr_BadArgument();
756	goto onError;
757	}
758	return PyUnicode_GET_SIZE(unicode);
759
760	onError:
761	return -1;
762	}
763
764	const char *PyUnicode_GetDefaultEncoding(void)
765	{
766	return unicode_default_encoding;
767	}
768
769	int PyUnicode_SetDefaultEncoding(const char *encoding)
770	{
771	PyObject *v;
772
773	/* Make sure the encoding is valid. As side effect, this also
774	loads the encoding into the codec registry cache. */
775	v = _PyCodec_Lookup(encoding);
776	if (v == NULL)
777	goto onError;
778	Py_DECREF(v);
779	strncpy(unicode_default_encoding,
780	encoding,
781	sizeof(unicode_default_encoding));
782	return 0;
783
784	onError:
785	return -1;
786	}
787
788	/* error handling callback helper:
789	build arguments, call the callback and check the arguments,
790	if no exception occurred, copy the replacement to the output
791	and adjust various state variables.
792	return 0 on success, -1 on error
793	*/
794
795	static
796	int unicode_decode_call_errorhandler(const char errors, PyObject *errorHandler,
797	const char encoding, const char reason,
798	const char input, Py_ssize_t insize, Py_ssize_t startinpos, Py_ssize_t endinpos, PyObject exceptionObject, const char *inptr,
799	PyObject *output, Py_ssize_t outpos, Py_UNICODE **outptr)
800	{
801	static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
802
803	PyObject *restuple = NULL;
804	PyObject *repunicode = NULL;
805	Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806	Py_ssize_t requiredsize;
807	Py_ssize_t newpos;
808	Py_UNICODE *repptr;
809	Py_ssize_t repsize;
810	int res = -1;
811
812	if (*errorHandler == NULL) {
813	*errorHandler = PyCodec_LookupError(errors);
814	if (*errorHandler == NULL)
815	goto onError;
816	}
817
818	if (*exceptionObject == NULL) {
819	*exceptionObject = PyUnicodeDecodeError_Create(
820	encoding, input, insize, startinpos, endinpos, reason);
821	if (*exceptionObject == NULL)
822	goto onError;
823	}
824	else {
825	if (PyUnicodeDecodeError_SetStart(exceptionObject, startinpos))
826	goto onError;
827	if (PyUnicodeDecodeError_SetEnd(exceptionObject, endinpos))
828	goto onError;
829	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830	goto onError;
831	}
832
833	restuple = PyObject_CallFunctionObjArgs(errorHandler, exceptionObject, NULL);
834	if (restuple == NULL)
835	goto onError;
836	if (!PyTuple_Check(restuple)) {
837	PyErr_Format(PyExc_TypeError, &argparse[4]);
838	goto onError;
839	}
840	if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841	goto onError;
842	if (newpos<0)
843	newpos = insize+newpos;
844	if (newpos<0 \|\| newpos>insize) {
845	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
846	goto onError;
847	}
848
849	/* need more space? (at least enough for what we
850	have+the replacement+the rest of the string (starting
851	at the new input position), so we won't have to check space
852	when there are no errors in the rest of the string) */
853	repptr = PyUnicode_AS_UNICODE(repunicode);
854	repsize = PyUnicode_GET_SIZE(repunicode);
855	requiredsize = *outpos + repsize + insize-newpos;
856	if (requiredsize > outsize) {
857	if (requiredsize<2*outsize)
858	requiredsize = 2*outsize;
859	if (PyUnicode_Resize(output, requiredsize) < 0)
860	goto onError;
861	outptr = PyUnicode_AS_UNICODE(output) + *outpos;
862	}
863	*endinpos = newpos;
864	*inptr = input + newpos;
865	Py_UNICODE_COPY(*outptr, repptr, repsize);
866	*outptr += repsize;
867	*outpos += repsize;
868	/* we made it! */
869	res = 0;
870
871	onError:
872	Py_XDECREF(restuple);
873	return res;
874	}
875
876	/* --- UTF-7 Codec -------------------------------------------------------- */
877
878	/* see RFC2152 for details */
879
/* Per-character classification of the 128 ASCII codes for the UTF-7
   codec.  A character is "special" when it cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
898
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True when `c` must be base64 encoded: anything outside ASCII, the
   always-special characters, and (optionally) whitespace and the
   RFC2152 Set O characters. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of `n` to its base64 character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True when `c` is one of the 64 base64 characters.  Explicit ASCII
   range checks are used instead of isalnum(): the argument can be a
   Py_UNICODE beyond the unsigned char range, for which isalnum() is
   undefined, and the result must not depend on the locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')
/* Map a base64 character back to its 6-bit value (ASCII assumed). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters while at least 6 bits are buffered in `ch`. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters while at least 16 bits are buffered in `ch`.
   Relies on `errmsg` and the label `utf7Error` in the expanding
   function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
941
942	PyObject PyUnicode_DecodeUTF7(const char s,
943	Py_ssize_t size,
944	const char *errors)
945	{
946	const char *starts = s;
947	Py_ssize_t startinpos;
948	Py_ssize_t endinpos;
949	Py_ssize_t outpos;
950	const char *e;
951	PyUnicodeObject *unicode;
952	Py_UNICODE *p;
953	const char *errmsg = "";
954	int inShift = 0;
955	unsigned int bitsleft = 0;
956	unsigned long charsleft = 0;
957	int surrogate = 0;
958	PyObject *errorHandler = NULL;
959	PyObject *exc = NULL;
960
961	unicode = _PyUnicode_New(size);
962	if (!unicode)
963	return NULL;
964	if (size == 0)
965	return (PyObject *)unicode;
966
967	p = unicode->str;
968	e = s + size;
969
970	while (s < e) {
971	Py_UNICODE ch;
972	restart:
973	ch = *s;
974
975	if (inShift) {
976	if ((ch == '-') \|\| !B64CHAR(ch)) {
977	inShift = 0;
978	s++;
979
980	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981	if (bitsleft >= 6) {
982	/* The shift sequence has a partial character in it. If
983	bitsleft < 6 then we could just classify it as padding
984	but that is not the case here */
985
986	errmsg = "partial character in shift sequence";
987	goto utf7Error;
988	}
989	/* According to RFC2152 the remaining bits should be zero. We
990	choose to signal an error/insert a replacement character
991	here so indicate the potential of a misencoded character. */
992
993	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995	errmsg = "non-zero padding bits in shift sequence";
996	goto utf7Error;
997	}
998
999	if (ch == '-') {
1000	if ((s < e) && (*(s) == '-')) {
1001	*p++ = '-';
1002	inShift = 1;
1003	}
1004	} else if (SPECIAL(ch,0,0)) {
1005	errmsg = "unexpected special character";
1006	goto utf7Error;
1007	} else {
1008	*p++ = ch;
1009	}
1010	} else {
1011	charsleft = (charsleft << 6) \| UB64(ch);
1012	bitsleft += 6;
1013	s++;
1014	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015	}
1016	}
1017	else if ( ch == '+' ) {
1018	startinpos = s-starts;
1019	s++;
1020	if (s < e && *s == '-') {
1021	s++;
1022	*p++ = '+';
1023	} else
1024	{
1025	inShift = 1;
1026	bitsleft = 0;
1027	}
1028	}
1029	else if (SPECIAL(ch,0,0)) {
1030	errmsg = "unexpected special character";
1031	s++;
1032	goto utf7Error;
1033	}
1034	else {
1035	*p++ = ch;
1036	s++;
1037	}
1038	continue;
1039	utf7Error:
1040	outpos = p-PyUnicode_AS_UNICODE(unicode);
1041	endinpos = s-starts;
1042	if (unicode_decode_call_errorhandler(
1043	errors, &errorHandler,
1044	"utf7", errmsg,
1045	starts, size, &startinpos, &endinpos, &exc, &s,
1046	(PyObject **)&unicode, &outpos, &p))
1047	goto onError;
1048	}
1049
1050	if (inShift) {
1051	outpos = p-PyUnicode_AS_UNICODE(unicode);
1052	endinpos = size;
1053	if (unicode_decode_call_errorhandler(
1054	errors, &errorHandler,
1055	"utf7", "unterminated shift sequence",
1056	starts, size, &startinpos, &endinpos, &exc, &s,
1057	(PyObject **)&unicode, &outpos, &p))
1058	goto onError;
1059	if (s < e)
1060	goto restart;
1061	}
1062
1063	if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064	goto onError;
1065
1066	Py_XDECREF(errorHandler);
1067	Py_XDECREF(exc);
1068	return (PyObject *)unicode;
1069
1070	onError:
1071	Py_XDECREF(errorHandler);
1072	Py_XDECREF(exc);
1073	Py_DECREF(unicode);
1074	return NULL;
1075	}
1076
1077
1078	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
1079	Py_ssize_t size,
1080	int encodeSetO,
1081	int encodeWhiteSpace,
1082	const char *errors)
1083	{
1084	PyObject *v;
1085	/* It might be possible to tighten this worst case */
1086	Py_ssize_t cbAllocated = 5 * size;
1087	int inShift = 0;
1088	Py_ssize_t i = 0;
1089	unsigned int bitsleft = 0;
1090	unsigned long charsleft = 0;
1091	char * out;
1092	char * start;
1093
1094	if (size == 0)
1095	return PyString_FromStringAndSize(NULL, 0);
1096
1097	v = PyString_FromStringAndSize(NULL, cbAllocated);
1098	if (v == NULL)
1099	return NULL;
1100
1101	start = out = PyString_AS_STRING(v);
1102	for (;i < size; ++i) {
1103	Py_UNICODE ch = s[i];
1104
1105	if (!inShift) {
1106	if (ch == '+') {
1107	*out++ = '+';
1108	*out++ = '-';
1109	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110	charsleft = ch;
1111	bitsleft = 16;
1112	*out++ = '+';
1113	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114	inShift = bitsleft > 0;
1115	} else {
1116	*out++ = (char) ch;
1117	}
1118	} else {
1119	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120	*out++ = B64(charsleft << (6-bitsleft));
1121	charsleft = 0;
1122	bitsleft = 0;
1123	/* Characters not in the BASE64 set implicitly unshift the sequence
1124	so no '-' is required, except if the character is itself a '-' */
1125	if (B64CHAR(ch) \|\| ch == '-') {
1126	*out++ = '-';
1127	}
1128	inShift = 0;
1129	*out++ = (char) ch;
1130	} else {
1131	bitsleft += 16;
1132	charsleft = (charsleft << 16) \| ch;
1133	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135	/* If the next character is special then we dont' need to terminate
1136	the shift sequence. If the next character is not a BASE64 character
1137	or '-' then the shift sequence will be terminated implicitly and we
1138	don't have to insert a '-'. */
1139
1140	if (bitsleft == 0) {
1141	if (i + 1 < size) {
1142	Py_UNICODE ch2 = s[i+1];
1143
1144	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1145
1146	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
1147	*out++ = '-';
1148	inShift = 0;
1149	} else {
1150	inShift = 0;
1151	}
1152
1153	}
1154	else {
1155	*out++ = '-';
1156	inShift = 0;
1157	}
1158	}
1159	}
1160	}
1161	}
1162	if (bitsleft) {
1163	*out++= B64(charsleft << (6-bitsleft) );
1164	*out++ = '-';
1165	}
1166
1167	_PyString_Resize(&v, out - start);
1168	return v;
1169	}
1170
1171	#undef SPECIAL
1172	#undef B64
1173	#undef B64CHAR
1174	#undef UB64
1175	#undef ENCODE
1176	#undef DECODE
1177
1178	/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero marks a byte that cannot start a sequence: 0x80..0xBF and
   0xFE/0xFF.  The lengths 5 and 6 follow the original UTF-8 definition
   in RFC 2279.  The table is read-only, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
1202	PyObject PyUnicode_DecodeUTF8(const char s,
1203	Py_ssize_t size,
1204	const char *errors)
1205	{
1206	return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207	}
1208
1209	PyObject PyUnicode_DecodeUTF8Stateful(const char s,
1210	Py_ssize_t size,
1211	const char *errors,
1212	Py_ssize_t *consumed)
1213	{
1214	const char *starts = s;
1215	int n;
1216	Py_ssize_t startinpos;
1217	Py_ssize_t endinpos;
1218	Py_ssize_t outpos;
1219	const char *e;
1220	PyUnicodeObject *unicode;
1221	Py_UNICODE *p;
1222	const char *errmsg = "";
1223	PyObject *errorHandler = NULL;
1224	PyObject *exc = NULL;
1225
1226	/* Note: size will always be longer than the resulting Unicode
1227	character count */
1228	unicode = _PyUnicode_New(size);
1229	if (!unicode)
1230	return NULL;
1231	if (size == 0) {
1232	if (consumed)
1233	*consumed = 0;
1234	return (PyObject *)unicode;
1235	}
1236
1237	/* Unpack UTF-8 encoded data */
1238	p = unicode->str;
1239	e = s + size;
1240
1241	while (s < e) {
1242	Py_UCS4 ch = (unsigned char)*s;
1243
1244	if (ch < 0x80) {
1245	*p++ = (Py_UNICODE)ch;
1246	s++;
1247	continue;
1248	}
1249
1250	n = utf8_code_length[ch];
1251
1252	if (s + n > e) {
1253	if (consumed)
1254	break;
1255	else {
1256	errmsg = "unexpected end of data";
1257	startinpos = s-starts;
1258	endinpos = size;
1259	goto utf8Error;
1260	}
1261	}
1262
1263	switch (n) {
1264
1265	case 0:
1266	errmsg = "unexpected code byte";
1267	startinpos = s-starts;
1268	endinpos = startinpos+1;
1269	goto utf8Error;
1270
1271	case 1:
1272	errmsg = "internal error";
1273	startinpos = s-starts;
1274	endinpos = startinpos+1;
1275	goto utf8Error;
1276
1277	case 2:
1278	if ((s[1] & 0xc0) != 0x80) {
1279	errmsg = "invalid data";
1280	startinpos = s-starts;
1281	endinpos = startinpos+2;
1282	goto utf8Error;
1283	}
1284	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1285	if (ch < 0x80) {
1286	startinpos = s-starts;
1287	endinpos = startinpos+2;
1288	errmsg = "illegal encoding";
1289	goto utf8Error;
1290	}
1291	else
1292	*p++ = (Py_UNICODE)ch;
1293	break;
1294
1295	case 3:
1296	if ((s[1] & 0xc0) != 0x80 \|\|
1297	(s[2] & 0xc0) != 0x80) {
1298	errmsg = "invalid data";
1299	startinpos = s-starts;
1300	endinpos = startinpos+3;
1301	goto utf8Error;
1302	}
1303	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1304	if (ch < 0x0800) {
1305	/* Note: UTF-8 encodings of surrogates are considered
1306	legal UTF-8 sequences;
1307
1308	XXX For wide builds (UCS-4) we should probably try
1309	to recombine the surrogates into a single code
1310	unit.
1311	*/
1312	errmsg = "illegal encoding";
1313	startinpos = s-starts;
1314	endinpos = startinpos+3;
1315	goto utf8Error;
1316	}
1317	else
1318	*p++ = (Py_UNICODE)ch;
1319	break;
1320
1321	case 4:
1322	if ((s[1] & 0xc0) != 0x80 \|\|
1323	(s[2] & 0xc0) != 0x80 \|\|
1324	(s[3] & 0xc0) != 0x80) {
1325	errmsg = "invalid data";
1326	startinpos = s-starts;
1327	endinpos = startinpos+4;
1328	goto utf8Error;
1329	}
1330	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332	/* validate and convert to UTF-16 */
1333	if ((ch < 0x10000) /* minimum value allowed for 4
1334	byte encoding */
1335	\|\| (ch > 0x10ffff)) /* maximum value allowed for
1336	UTF-16 */
1337	{
1338	errmsg = "illegal encoding";
1339	startinpos = s-starts;
1340	endinpos = startinpos+4;
1341	goto utf8Error;
1342	}
1343	#ifdef Py_UNICODE_WIDE
1344	*p++ = (Py_UNICODE)ch;
1345	#else
1346	/* compute and append the two surrogates: */
1347
1348	/* translate from 10000..10FFFF to 0..FFFF */
1349	ch -= 0x10000;
1350
1351	/* high surrogate = top 10 bits added to D800 */
1352	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1353
1354	/* low surrogate = bottom 10 bits added to DC00 */
1355	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1356	#endif
1357	break;
1358
1359	default:
1360	/* Other sizes are only needed for UCS-4 */
1361	errmsg = "unsupported Unicode code range";
1362	startinpos = s-starts;
1363	endinpos = startinpos+n;
1364	goto utf8Error;
1365	}
1366	s += n;
1367	continue;
1368
1369	utf8Error:
1370	outpos = p-PyUnicode_AS_UNICODE(unicode);
1371	if (unicode_decode_call_errorhandler(
1372	errors, &errorHandler,
1373	"utf8", errmsg,
1374	starts, size, &startinpos, &endinpos, &exc, &s,
1375	(PyObject **)&unicode, &outpos, &p))
1376	goto onError;
1377	}
1378	if (consumed)
1379	*consumed = s-starts;
1380
1381	/* Adjust length */
1382	if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1383	goto onError;
1384
1385	Py_XDECREF(errorHandler);
1386	Py_XDECREF(exc);
1387	return (PyObject *)unicode;
1388
1389	onError:
1390	Py_XDECREF(errorHandler);
1391	Py_XDECREF(exc);
1392	Py_DECREF(unicode);
1393	return NULL;
1394	}
1395
1396	/* Allocation strategy: if the string is short, convert into a stack buffer
1397	and allocate exactly as much space needed at the end. Else allocate the
1398	maximum possible needed (4 result bytes per Unicode character), and return
1399	the excess memory at the end.
1400	*/
1401	PyObject *
1402	PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403	Py_ssize_t size,
1404	const char *errors)
1405	{
1406	#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1407
1408	Py_ssize_t i; /* index into s of next input byte */
1409	PyObject v; / result string object */
1410	char p; / next free byte in output buffer */
1411	Py_ssize_t nallocated; /* number of result bytes allocated */
1412	Py_ssize_t nneeded; /* number of result bytes needed */
1413	char stackbuf[MAX_SHORT_UNICHARS * 4];
1414
1415	assert(s != NULL);
1416	assert(size >= 0);
1417
1418	if (size <= MAX_SHORT_UNICHARS) {
1419	/* Write into the stack buffer; nallocated can't overflow.
1420	* At the end, we'll allocate exactly as much heap space as it
1421	* turns out we need.
1422	*/
1423	nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424	v = NULL; /* will allocate after we're done */
1425	p = stackbuf;
1426	}
1427	else {
1428	/* Overallocate on the heap, and give the excess back at the end. */
1429	nallocated = size * 4;
1430	if (nallocated / 4 != size) /* overflow! */
1431	return PyErr_NoMemory();
1432	v = PyString_FromStringAndSize(NULL, nallocated);
1433	if (v == NULL)
1434	return NULL;
1435	p = PyString_AS_STRING(v);
1436	}
1437
1438	for (i = 0; i < size;) {
1439	Py_UCS4 ch = s[i++];
1440
1441	if (ch < 0x80)
1442	/* Encode ASCII */
1443	*p++ = (char) ch;
1444
1445	else if (ch < 0x0800) {
1446	/* Encode Latin-1 */
1447	*p++ = (char)(0xc0 \| (ch >> 6));
1448	*p++ = (char)(0x80 \| (ch & 0x3f));
1449	}
1450	else {
1451	/* Encode UCS2 Unicode ordinals */
1452	if (ch < 0x10000) {
1453	/* Special case: check for high surrogate */
1454	if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455	Py_UCS4 ch2 = s[i];
1456	/* Check for low surrogate and combine the two to
1457	form a UCS4 value */
1458	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459	ch = ((ch - 0xD800) << 10 \| (ch2 - 0xDC00)) + 0x10000;
1460	i++;
1461	goto encodeUCS4;
1462	}
1463	/* Fall through: handles isolated high surrogates */
1464	}
1465	*p++ = (char)(0xe0 \| (ch >> 12));
1466	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
1467	*p++ = (char)(0x80 \| (ch & 0x3f));
1468	continue;
1469	}
1470	encodeUCS4:
1471	/* Encode UCS4 Unicode ordinals */
1472	*p++ = (char)(0xf0 \| (ch >> 18));
1473	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
1474	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
1475	*p++ = (char)(0x80 \| (ch & 0x3f));
1476	}
1477	}
1478
1479	if (v == NULL) {
1480	/* This was stack allocated. */
1481	nneeded = p - stackbuf;
1482	assert(nneeded <= nallocated);
1483	v = PyString_FromStringAndSize(stackbuf, nneeded);
1484	}
1485	else {
1486	/* Cut back to size actually needed. */
1487	nneeded = p - PyString_AS_STRING(v);
1488	assert(nneeded <= nallocated);
1489	_PyString_Resize(&v, nneeded);
1490	}
1491	return v;
1492
1493	#undef MAX_SHORT_UNICHARS
1494	}
1495
1496	PyObject PyUnicode_AsUTF8String(PyObject unicode)
1497	{
1498	if (!PyUnicode_Check(unicode)) {
1499	PyErr_BadArgument();
1500	return NULL;
1501	}
1502	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503	PyUnicode_GET_SIZE(unicode),
1504	NULL);
1505	}
1506
1507	/* --- UTF-16 Codec ------------------------------------------------------- */
1508
1509	PyObject *
1510	PyUnicode_DecodeUTF16(const char *s,
1511	Py_ssize_t size,
1512	const char *errors,
1513	int *byteorder)
1514	{
1515	return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516	}
1517
1518	PyObject *
1519	PyUnicode_DecodeUTF16Stateful(const char *s,
1520	Py_ssize_t size,
1521	const char *errors,
1522	int *byteorder,
1523	Py_ssize_t *consumed)
1524	{
1525	const char *starts = s;
1526	Py_ssize_t startinpos;
1527	Py_ssize_t endinpos;
1528	Py_ssize_t outpos;
1529	PyUnicodeObject *unicode;
1530	Py_UNICODE *p;
1531	const unsigned char q, e;
1532	int bo = 0; /* assume native ordering by default */
1533	const char *errmsg = "";
1534	/* Offsets from q for retrieving byte pairs in the right order. */
1535	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536	int ihi = 1, ilo = 0;
1537	#else
1538	int ihi = 0, ilo = 1;
1539	#endif
1540	PyObject *errorHandler = NULL;
1541	PyObject *exc = NULL;
1542
1543	/* Note: size will always be longer than the resulting Unicode
1544	character count */
1545	unicode = _PyUnicode_New(size);
1546	if (!unicode)
1547	return NULL;
1548	if (size == 0)
1549	return (PyObject *)unicode;
1550
1551	/* Unpack UTF-16 encoded data */
1552	p = unicode->str;
1553	q = (unsigned char *)s;
1554	e = q + size;
1555
1556	if (byteorder)
1557	bo = *byteorder;
1558
1559	/* Check for BOM marks (U+FEFF) in the input and adjust current
1560	byte order setting accordingly. In native mode, the leading BOM
1561	mark is skipped, in all other modes, it is copied to the output
1562	stream as-is (giving a ZWNBSP character). */
1563	if (bo == 0) {
1564	if (size >= 2) {
1565	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
1566	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567	if (bom == 0xFEFF) {
1568	q += 2;
1569	bo = -1;
1570	}
1571	else if (bom == 0xFFFE) {
1572	q += 2;
1573	bo = 1;
1574	}
1575	#else
1576	if (bom == 0xFEFF) {
1577	q += 2;
1578	bo = 1;
1579	}
1580	else if (bom == 0xFFFE) {
1581	q += 2;
1582	bo = -1;
1583	}
1584	#endif
1585	}
1586	}
1587
1588	if (bo == -1) {
1589	/* force LE */
1590	ihi = 1;
1591	ilo = 0;
1592	}
1593	else if (bo == 1) {
1594	/* force BE */
1595	ihi = 0;
1596	ilo = 1;
1597	}
1598
1599	while (q < e) {
1600	Py_UNICODE ch;
1601	/* remaining bytes at the end? (size should be even) */
1602	if (e-q<2) {
1603	if (consumed)
1604	break;
1605	errmsg = "truncated data";
1606	startinpos = ((const char *)q)-starts;
1607	endinpos = ((const char *)e)-starts;
1608	goto utf16Error;
1609	/* The remaining input chars are ignored if the callback
1610	chooses to skip the input */
1611	}
1612	ch = (q[ihi] << 8) \| q[ilo];
1613
1614	q += 2;
1615
1616	if (ch < 0xD800 \|\| ch > 0xDFFF) {
1617	*p++ = ch;
1618	continue;
1619	}
1620
1621	/* UTF-16 code pair: */
1622	if (q >= e) {
1623	errmsg = "unexpected end of data";
1624	startinpos = (((const char *)q)-2)-starts;
1625	endinpos = ((const char *)e)-starts;
1626	goto utf16Error;
1627	}
1628	if (0xD800 <= ch && ch <= 0xDBFF) {
1629	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
1630	q += 2;
1631	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632	#ifndef Py_UNICODE_WIDE
1633	*p++ = ch;
1634	*p++ = ch2;
1635	#else
1636	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
1637	#endif
1638	continue;
1639	}
1640	else {
1641	errmsg = "illegal UTF-16 surrogate";
1642	startinpos = (((const char *)q)-4)-starts;
1643	endinpos = startinpos+2;
1644	goto utf16Error;
1645	}
1646
1647	}
1648	errmsg = "illegal encoding";
1649	startinpos = (((const char *)q)-2)-starts;
1650	endinpos = startinpos+2;
1651	/* Fall through to report the error */
1652
1653	utf16Error:
1654	outpos = p-PyUnicode_AS_UNICODE(unicode);
1655	if (unicode_decode_call_errorhandler(
1656	errors, &errorHandler,
1657	"utf16", errmsg,
1658	starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659	(PyObject **)&unicode, &outpos, &p))
1660	goto onError;
1661	}
1662
1663	if (byteorder)
1664	*byteorder = bo;
1665
1666	if (consumed)
1667	consumed = (const char )q-starts;
1668
1669	/* Adjust length */
1670	if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671	goto onError;
1672
1673	Py_XDECREF(errorHandler);
1674	Py_XDECREF(exc);
1675	return (PyObject *)unicode;
1676
1677	onError:
1678	Py_DECREF(unicode);
1679	Py_XDECREF(errorHandler);
1680	Py_XDECREF(exc);
1681	return NULL;
1682	}
1683
1684	PyObject *
1685	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686	Py_ssize_t size,
1687	const char *errors,
1688	int byteorder)
1689	{
1690	PyObject *v;
1691	unsigned char *p;
1692	#ifdef Py_UNICODE_WIDE
1693	int i, pairs;
1694	#else
1695	const int pairs = 0;
1696	#endif
1697	/* Offsets from p for storing byte pairs in the right order. */
1698	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699	int ihi = 1, ilo = 0;
1700	#else
1701	int ihi = 0, ilo = 1;
1702	#endif
1703
1704	#define STORECHAR(CH) \
1705	do { \
1706	p[ihi] = ((CH) >> 8) & 0xff; \
1707	p[ilo] = (CH) & 0xff; \
1708	p += 2; \
1709	} while(0)
1710
1711	#ifdef Py_UNICODE_WIDE
1712	for (i = pairs = 0; i < size; i++)
1713	if (s[i] >= 0x10000)
1714	pairs++;
1715	#endif
1716	v = PyString_FromStringAndSize(NULL,
1717	2 * (size + pairs + (byteorder == 0)));
1718	if (v == NULL)
1719	return NULL;
1720
1721	p = (unsigned char *)PyString_AS_STRING(v);
1722	if (byteorder == 0)
1723	STORECHAR(0xFEFF);
1724	if (size == 0)
1725	return v;
1726
1727	if (byteorder == -1) {
1728	/* force LE */
1729	ihi = 1;
1730	ilo = 0;
1731	}
1732	else if (byteorder == 1) {
1733	/* force BE */
1734	ihi = 0;
1735	ilo = 1;
1736	}
1737
1738	while (size-- > 0) {
1739	Py_UNICODE ch = *s++;
1740	Py_UNICODE ch2 = 0;
1741	#ifdef Py_UNICODE_WIDE
1742	if (ch >= 0x10000) {
1743	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
1744	ch = 0xD800 \| ((ch-0x10000) >> 10);
1745	}
1746	#endif
1747	STORECHAR(ch);
1748	if (ch2)
1749	STORECHAR(ch2);
1750	}
1751	return v;
1752	#undef STORECHAR
1753	}
1754
1755	PyObject PyUnicode_AsUTF16String(PyObject unicode)
1756	{
1757	if (!PyUnicode_Check(unicode)) {
1758	PyErr_BadArgument();
1759	return NULL;
1760	}
1761	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762	PyUnicode_GET_SIZE(unicode),
1763	NULL,
1764	0);
1765	}
1766
1767	/* --- Unicode Escape Codec ----------------------------------------------- */
1768
1769	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1770
1771	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
1772	Py_ssize_t size,
1773	const char *errors)
1774	{
1775	const char *starts = s;
1776	Py_ssize_t startinpos;
1777	Py_ssize_t endinpos;
1778	Py_ssize_t outpos;
1779	int i;
1780	PyUnicodeObject *v;
1781	Py_UNICODE *p;
1782	const char *end;
1783	char* message;
1784	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785	PyObject *errorHandler = NULL;
1786	PyObject *exc = NULL;
1787
1788	/* Escaped strings will always be longer than the resulting
1789	Unicode string, so we start with size here and then reduce the
1790	length after conversion to the true value.
1791	(but if the error callback returns a long replacement string
1792	we'll have to allocate more space) */
1793	v = _PyUnicode_New(size);
1794	if (v == NULL)
1795	goto onError;
1796	if (size == 0)
1797	return (PyObject *)v;
1798
1799	p = PyUnicode_AS_UNICODE(v);
1800	end = s + size;
1801
1802	while (s < end) {
1803	unsigned char c;
1804	Py_UNICODE x;
1805	int digits;
1806
1807	/* Non-escape characters are interpreted as Unicode ordinals */
1808	if (*s != '\\') {
1809	p++ = (unsigned char) s++;
1810	continue;
1811	}
1812
1813	startinpos = s-starts;
1814	/* \ - Escapes */
1815	s++;
1816	switch (*s++) {
1817
1818	/* \x escapes */
1819	case '\n': break;
1820	case '\\': *p++ = '\\'; break;
1821	case '\'': *p++ = '\''; break;
1822	case '\"': *p++ = '\"'; break;
1823	case 'b': *p++ = '\b'; break;
1824	case 'f': p++ = '\014'; break; / FF */
1825	case 't': *p++ = '\t'; break;
1826	case 'n': *p++ = '\n'; break;
1827	case 'r': *p++ = '\r'; break;
1828	case 'v': p++ = '\013'; break; / VT */
1829	case 'a': p++ = '\007'; break; / BEL, not classic C */
1830
1831	/* \OOO (octal) escapes */
1832	case '0': case '1': case '2': case '3':
1833	case '4': case '5': case '6': case '7':
1834	x = s[-1] - '0';
1835	if ('0' <= s && s <= '7') {
1836	x = (x<<3) + *s++ - '0';
1837	if ('0' <= s && s <= '7')
1838	x = (x<<3) + *s++ - '0';
1839	}
1840	*p++ = x;
1841	break;
1842
1843	/* hex escapes */
1844	/* \xXX */
1845	case 'x':
1846	digits = 2;
1847	message = "truncated \\xXX escape";
1848	goto hexescape;
1849
1850	/* \uXXXX */
1851	case 'u':
1852	digits = 4;
1853	message = "truncated \\uXXXX escape";
1854	goto hexescape;
1855
1856	/* \UXXXXXXXX */
1857	case 'U':
1858	digits = 8;
1859	message = "truncated \\UXXXXXXXX escape";
1860	hexescape:
1861	chr = 0;
1862	outpos = p-PyUnicode_AS_UNICODE(v);
1863	if (s+digits>end) {
1864	endinpos = size;
1865	if (unicode_decode_call_errorhandler(
1866	errors, &errorHandler,
1867	"unicodeescape", "end of string in escape sequence",
1868	starts, size, &startinpos, &endinpos, &exc, &s,
1869	(PyObject **)&v, &outpos, &p))
1870	goto onError;
1871	goto nextByte;
1872	}
1873	for (i = 0; i < digits; ++i) {
1874	c = (unsigned char) s[i];
1875	if (!isxdigit(c)) {
1876	endinpos = (s+i+1)-starts;
1877	if (unicode_decode_call_errorhandler(
1878	errors, &errorHandler,
1879	"unicodeescape", message,
1880	starts, size, &startinpos, &endinpos, &exc, &s,
1881	(PyObject **)&v, &outpos, &p))
1882	goto onError;
1883	goto nextByte;
1884	}
1885	chr = (chr<<4) & ~0xF;
1886	if (c >= '0' && c <= '9')
1887	chr += c - '0';
1888	else if (c >= 'a' && c <= 'f')
1889	chr += 10 + c - 'a';
1890	else
1891	chr += 10 + c - 'A';
1892	}
1893	s += i;
1894	if (chr == 0xffffffff && PyErr_Occurred())
1895	/* _decoding_error will have already written into the
1896	target buffer. */
1897	break;
1898	store:
1899	/* when we get here, chr is a 32-bit unicode character */
1900	if (chr <= 0xffff)
1901	/* UCS-2 character */
1902	*p++ = (Py_UNICODE) chr;
1903	else if (chr <= 0x10ffff) {
1904	/* UCS-4 character. Either store directly, or as
1905	surrogate pair. */
1906	#ifdef Py_UNICODE_WIDE
1907	*p++ = chr;
1908	#else
1909	chr -= 0x10000L;
1910	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912	#endif
1913	} else {
1914	endinpos = s-starts;
1915	outpos = p-PyUnicode_AS_UNICODE(v);
1916	if (unicode_decode_call_errorhandler(
1917	errors, &errorHandler,
1918	"unicodeescape", "illegal Unicode character",
1919	starts, size, &startinpos, &endinpos, &exc, &s,
1920	(PyObject **)&v, &outpos, &p))
1921	goto onError;
1922	}
1923	break;
1924
1925	/* \N{name} */
1926	case 'N':
1927	message = "malformed \\N character escape";
1928	if (ucnhash_CAPI == NULL) {
1929	/* load the unicode data module */
1930	PyObject m, api;
1931	m = PyImport_ImportModule("unicodedata");
1932	if (m == NULL)
1933	goto ucnhashError;
1934	api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935	Py_DECREF(m);
1936	if (api == NULL)
1937	goto ucnhashError;
1938	ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939	Py_DECREF(api);
1940	if (ucnhash_CAPI == NULL)
1941	goto ucnhashError;
1942	}
1943	if (*s == '{') {
1944	const char *start = s+1;
1945	/* look for the closing brace */
1946	while (*s != '}' && s < end)
1947	s++;
1948	if (s > start && s < end && *s == '}') {
1949	/* found a name. look it up in the unicode database */
1950	message = "unknown Unicode character name";
1951	s++;
1952	if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953	goto store;
1954	}
1955	}
1956	endinpos = s-starts;
1957	outpos = p-PyUnicode_AS_UNICODE(v);
1958	if (unicode_decode_call_errorhandler(
1959	errors, &errorHandler,
1960	"unicodeescape", message,
1961	starts, size, &startinpos, &endinpos, &exc, &s,
1962	(PyObject **)&v, &outpos, &p))
1963	goto onError;
1964	break;
1965
1966	default:
1967	if (s > end) {
1968	message = "\\ at end of string";
1969	s--;
1970	endinpos = s-starts;
1971	outpos = p-PyUnicode_AS_UNICODE(v);
1972	if (unicode_decode_call_errorhandler(
1973	errors, &errorHandler,
1974	"unicodeescape", message,
1975	starts, size, &startinpos, &endinpos, &exc, &s,
1976	(PyObject **)&v, &outpos, &p))
1977	goto onError;
1978	}
1979	else {
1980	*p++ = '\\';
1981	*p++ = (unsigned char)s[-1];
1982	}
1983	break;
1984	}
1985	nextByte:
1986	;
1987	}
1988	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989	goto onError;
1990	Py_XDECREF(errorHandler);
1991	Py_XDECREF(exc);
1992	return (PyObject *)v;
1993
1994	ucnhashError:
1995	PyErr_SetString(
1996	PyExc_UnicodeError,
1997	"\\N escapes not supported (can't load unicodedata module)"
1998	);
1999	Py_XDECREF(v);
2000	Py_XDECREF(errorHandler);
2001	Py_XDECREF(exc);
2002	return NULL;
2003
2004	onError:
2005	Py_XDECREF(v);
2006	Py_XDECREF(errorHandler);
2007	Py_XDECREF(exc);
2008	return NULL;
2009	}
2010
2011	/* Return a Unicode-Escape string version of the Unicode object.
2012
2013	If quotes is true, the string is enclosed in u"" or u'' quotes as
2014	appropriate.
2015
2016	*/
2017
2018	Py_LOCAL_INLINE(const Py_UNICODE ) findchar(const Py_UNICODE s,
2019	Py_ssize_t size,
2020	Py_UNICODE ch)
2021	{
2022	/* like wcschr, but doesn't stop at NULL characters */
2023
2024	while (size-- > 0) {
2025	if (*s == ch)
2026	return s;
2027	s++;
2028	}
2029
2030	return NULL;
2031	}
2032
2033	static
2034	PyObject unicodeescape_string(const Py_UNICODE s,
2035	Py_ssize_t size,
2036	int quotes)
2037	{
2038	PyObject *repr;
2039	char *p;
2040
2041	static const char *hexdigit = "0123456789abcdef";
2042
2043	/* Initial allocation is based on the longest-possible unichr
2044	escape.
2045
2046	In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2047	unichr, so in this case it's the longest unichr escape. In
2048	narrow (UTF-16) builds this is five chars per source unichr
2049	since there are two unichrs in the surrogate pair, so in narrow
2050	(UTF-16) builds it's not the longest unichr escape.
2051
2052	In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2053	so in the narrow (UTF-16) build case it's the longest unichr
2054	escape.
2055	*/
2056
2057	repr = PyString_FromStringAndSize(NULL,
2058	2
2059	#ifdef Py_UNICODE_WIDE
2060	+ 10*size
2061	#else
2062	+ 6*size
2063	#endif
2064	+ 1);
2065	if (repr == NULL)
2066	return NULL;
2067
2068	p = PyString_AS_STRING(repr);
2069
2070	if (quotes) {
2071	*p++ = 'u';
2072	*p++ = (findchar(s, size, '\'') &&
2073	!findchar(s, size, '"')) ? '"' : '\'';
2074	}
2075	while (size-- > 0) {
2076	Py_UNICODE ch = *s++;
2077
2078	/* Escape quotes and backslashes */
2079	if ((quotes &&
2080	ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) \|\| ch == '\\') {
2081	*p++ = '\\';
2082	*p++ = (char) ch;
2083	continue;
2084	}
2085
2086	#ifdef Py_UNICODE_WIDE
2087	/* Map 21-bit characters to '\U00xxxxxx' */
2088	else if (ch >= 0x10000) {
2089	*p++ = '\\';
2090	*p++ = 'U';
2091	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
2092	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
2093	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
2094	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
2095	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
2096	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
2097	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
2098	*p++ = hexdigit[ch & 0x0000000F];
2099	continue;
2100	}
2101	#else
2102	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2103	else if (ch >= 0xD800 && ch < 0xDC00) {
2104	Py_UNICODE ch2;
2105	Py_UCS4 ucs;
2106
2107	ch2 = *s++;
2108	size--;
2109	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2110	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
2111	*p++ = '\\';
2112	*p++ = 'U';
2113	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2114	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2115	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2116	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2117	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2118	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2119	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2120	*p++ = hexdigit[ucs & 0x0000000F];
2121	continue;
2122	}
2123	/* Fall through: isolated surrogates are copied as-is */
2124	s--;
2125	size++;
2126	}
2127	#endif
2128
2129	/* Map 16-bit characters to '\uxxxx' */
2130	if (ch >= 256) {
2131	*p++ = '\\';
2132	*p++ = 'u';
2133	*p++ = hexdigit[(ch >> 12) & 0x000F];
2134	*p++ = hexdigit[(ch >> 8) & 0x000F];
2135	*p++ = hexdigit[(ch >> 4) & 0x000F];
2136	*p++ = hexdigit[ch & 0x000F];
2137	}
2138
2139	/* Map special whitespace to '\t', \n', '\r' */
2140	else if (ch == '\t') {
2141	*p++ = '\\';
2142	*p++ = 't';
2143	}
2144	else if (ch == '\n') {
2145	*p++ = '\\';
2146	*p++ = 'n';
2147	}
2148	else if (ch == '\r') {
2149	*p++ = '\\';
2150	*p++ = 'r';
2151	}
2152
2153	/* Map non-printable US ASCII to '\xhh' */
2154	else if (ch < ' ' \|\| ch >= 0x7F) {
2155	*p++ = '\\';
2156	*p++ = 'x';
2157	*p++ = hexdigit[(ch >> 4) & 0x000F];
2158	*p++ = hexdigit[ch & 0x000F];
2159	}
2160
2161	/* Copy everything else as-is */
2162	else
2163	*p++ = (char) ch;
2164	}
2165	if (quotes)
2166	*p++ = PyString_AS_STRING(repr)[1];
2167
2168	*p = '\0';
2169	_PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2170	return repr;
2171	}
2172
2173	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
2174	Py_ssize_t size)
2175	{
2176	return unicodeescape_string(s, size, 0);
2177	}
2178
2179	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
2180	{
2181	if (!PyUnicode_Check(unicode)) {
2182	PyErr_BadArgument();
2183	return NULL;
2184	}
2185	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2186	PyUnicode_GET_SIZE(unicode));
2187	}
2188
2189	/* --- Raw Unicode Escape Codec ------------------------------------------- */
2190
2191	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
2192	Py_ssize_t size,
2193	const char *errors)
2194	{
2195	const char *starts = s;
2196	Py_ssize_t startinpos;
2197	Py_ssize_t endinpos;
2198	Py_ssize_t outpos;
2199	PyUnicodeObject *v;
2200	Py_UNICODE *p;
2201	const char *end;
2202	const char *bs;
2203	PyObject *errorHandler = NULL;
2204	PyObject *exc = NULL;
2205
2206	/* Escaped strings will always be longer than the resulting
2207	Unicode string, so we start with size here and then reduce the
2208	length after conversion to the true value. (But decoding error
2209	handler might have to resize the string) */
2210	v = _PyUnicode_New(size);
2211	if (v == NULL)
2212	goto onError;
2213	if (size == 0)
2214	return (PyObject *)v;
2215	p = PyUnicode_AS_UNICODE(v);
2216	end = s + size;
2217	while (s < end) {
2218	unsigned char c;
2219	Py_UCS4 x;
2220	int i;
2221	int count;
2222
2223	/* Non-escape characters are interpreted as Unicode ordinals */
2224	if (*s != '\\') {
2225	p++ = (unsigned char)s++;
2226	continue;
2227	}
2228	startinpos = s-starts;
2229
2230	/* \u-escapes are only interpreted iff the number of leading
2231	backslashes if odd */
2232	bs = s;
2233	for (;s < end;) {
2234	if (*s != '\\')
2235	break;
2236	p++ = (unsigned char)s++;
2237	}
2238	if (((s - bs) & 1) == 0 \|\|
2239	s >= end \|\|
2240	(s != 'u' && s != 'U')) {
2241	continue;
2242	}
2243	p--;
2244	count = *s=='u' ? 4 : 8;
2245	s++;
2246
2247	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2248	outpos = p-PyUnicode_AS_UNICODE(v);
2249	for (x = 0, i = 0; i < count; ++i, ++s) {
2250	c = (unsigned char)*s;
2251	if (!isxdigit(c)) {
2252	endinpos = s-starts;
2253	if (unicode_decode_call_errorhandler(
2254	errors, &errorHandler,
2255	"rawunicodeescape", "truncated \\uXXXX",
2256	starts, size, &startinpos, &endinpos, &exc, &s,
2257	(PyObject **)&v, &outpos, &p))
2258	goto onError;
2259	goto nextByte;
2260	}
2261	x = (x<<4) & ~0xF;
2262	if (c >= '0' && c <= '9')
2263	x += c - '0';
2264	else if (c >= 'a' && c <= 'f')
2265	x += 10 + c - 'a';
2266	else
2267	x += 10 + c - 'A';
2268	}
2269	#ifndef Py_UNICODE_WIDE
2270	if (x > 0x10000) {
2271	if (unicode_decode_call_errorhandler(
2272	errors, &errorHandler,
2273	"rawunicodeescape", "\\Uxxxxxxxx out of range",
2274	starts, size, &startinpos, &endinpos, &exc, &s,
2275	(PyObject **)&v, &outpos, &p))
2276	goto onError;
2277	}
2278	#endif
2279	*p++ = x;
2280	nextByte:
2281	;
2282	}
2283	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2284	goto onError;
2285	Py_XDECREF(errorHandler);
2286	Py_XDECREF(exc);
2287	return (PyObject *)v;
2288
2289	onError:
2290	Py_XDECREF(v);
2291	Py_XDECREF(errorHandler);
2292	Py_XDECREF(exc);
2293	return NULL;
2294	}
2295
2296	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
2297	Py_ssize_t size)
2298	{
2299	PyObject *repr;
2300	char *p;
2301	char *q;
2302
2303	static const char *hexdigit = "0123456789abcdef";
2304
2305	#ifdef Py_UNICODE_WIDE
2306	repr = PyString_FromStringAndSize(NULL, 10 * size);
2307	#else
2308	repr = PyString_FromStringAndSize(NULL, 6 * size);
2309	#endif
2310	if (repr == NULL)
2311	return NULL;
2312	if (size == 0)
2313	return repr;
2314
2315	p = q = PyString_AS_STRING(repr);
2316	while (size-- > 0) {
2317	Py_UNICODE ch = *s++;
2318	#ifdef Py_UNICODE_WIDE
2319	/* Map 32-bit characters to '\Uxxxxxxxx' */
2320	if (ch >= 0x10000) {
2321	*p++ = '\\';
2322	*p++ = 'U';
2323	*p++ = hexdigit[(ch >> 28) & 0xf];
2324	*p++ = hexdigit[(ch >> 24) & 0xf];
2325	*p++ = hexdigit[(ch >> 20) & 0xf];
2326	*p++ = hexdigit[(ch >> 16) & 0xf];
2327	*p++ = hexdigit[(ch >> 12) & 0xf];
2328	*p++ = hexdigit[(ch >> 8) & 0xf];
2329	*p++ = hexdigit[(ch >> 4) & 0xf];
2330	*p++ = hexdigit[ch & 15];
2331	}
2332	else
2333	#endif
2334	/* Map 16-bit characters to '\uxxxx' */
2335	if (ch >= 256) {
2336	*p++ = '\\';
2337	*p++ = 'u';
2338	*p++ = hexdigit[(ch >> 12) & 0xf];
2339	*p++ = hexdigit[(ch >> 8) & 0xf];
2340	*p++ = hexdigit[(ch >> 4) & 0xf];
2341	*p++ = hexdigit[ch & 15];
2342	}
2343	/* Copy everything else as-is */
2344	else
2345	*p++ = (char) ch;
2346	}
2347	*p = '\0';
2348	_PyString_Resize(&repr, p - q);
2349	return repr;
2350	}
2351
2352	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
2353	{
2354	if (!PyUnicode_Check(unicode)) {
2355	PyErr_BadArgument();
2356	return NULL;
2357	}
2358	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2359	PyUnicode_GET_SIZE(unicode));
2360	}
2361
2362	/* --- Unicode Internal Codec ------------------------------------------- */
2363
2364	PyObject _PyUnicode_DecodeUnicodeInternal(const char s,
2365	Py_ssize_t size,
2366	const char *errors)
2367	{
2368	const char *starts = s;
2369	Py_ssize_t startinpos;
2370	Py_ssize_t endinpos;
2371	Py_ssize_t outpos;
2372	PyUnicodeObject *v;
2373	Py_UNICODE *p;
2374	const char *end;
2375	const char *reason;
2376	PyObject *errorHandler = NULL;
2377	PyObject *exc = NULL;
2378
2379	#ifdef Py_UNICODE_WIDE
2380	Py_UNICODE unimax = PyUnicode_GetMax();
2381	#endif
2382
2383	v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2384	if (v == NULL)
2385	goto onError;
2386	if (PyUnicode_GetSize((PyObject *)v) == 0)
2387	return (PyObject *)v;
2388	p = PyUnicode_AS_UNICODE(v);
2389	end = s + size;
2390
2391	while (s < end) {
2392	memcpy(p, s, sizeof(Py_UNICODE));
2393	/* We have to sanity check the raw data, otherwise doom looms for
2394	some malformed UCS-4 data. */
2395	if (
2396	#ifdef Py_UNICODE_WIDE
2397	p > unimax \|\| p < 0 \|\|
2398	#endif
2399	end-s < Py_UNICODE_SIZE
2400	)
2401	{
2402	startinpos = s - starts;
2403	if (end-s < Py_UNICODE_SIZE) {
2404	endinpos = end-starts;
2405	reason = "truncated input";
2406	}
2407	else {
2408	endinpos = s - starts + Py_UNICODE_SIZE;
2409	reason = "illegal code point (> 0x10FFFF)";
2410	}
2411	outpos = p - PyUnicode_AS_UNICODE(v);
2412	if (unicode_decode_call_errorhandler(
2413	errors, &errorHandler,
2414	"unicode_internal", reason,
2415	starts, size, &startinpos, &endinpos, &exc, &s,
2416	(PyObject **)&v, &outpos, &p)) {
2417	goto onError;
2418	}
2419	}
2420	else {
2421	p++;
2422	s += Py_UNICODE_SIZE;
2423	}
2424	}
2425
2426	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2427	goto onError;
2428	Py_XDECREF(errorHandler);
2429	Py_XDECREF(exc);
2430	return (PyObject *)v;
2431
2432	onError:
2433	Py_XDECREF(v);
2434	Py_XDECREF(errorHandler);
2435	Py_XDECREF(exc);
2436	return NULL;
2437	}
2438
2439	/* --- Latin-1 Codec ------------------------------------------------------ */
2440
2441	PyObject PyUnicode_DecodeLatin1(const char s,
2442	Py_ssize_t size,
2443	const char *errors)
2444	{
2445	PyUnicodeObject *v;
2446	Py_UNICODE *p;
2447
2448	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2449	if (size == 1) {
2450	Py_UNICODE r = (unsigned char)s;
2451	return PyUnicode_FromUnicode(&r, 1);
2452	}
2453
2454	v = _PyUnicode_New(size);
2455	if (v == NULL)
2456	goto onError;
2457	if (size == 0)
2458	return (PyObject *)v;
2459	p = PyUnicode_AS_UNICODE(v);
2460	while (size-- > 0)
2461	p++ = (unsigned char)s++;
2462	return (PyObject *)v;
2463
2464	onError:
2465	Py_XDECREF(v);
2466	return NULL;
2467	}
2468
2469	/* create or adjust a UnicodeEncodeError */
2470	static void make_encode_exception(PyObject **exceptionObject,
2471	const char *encoding,
2472	const Py_UNICODE *unicode, Py_ssize_t size,
2473	Py_ssize_t startpos, Py_ssize_t endpos,
2474	const char *reason)
2475	{
2476	if (*exceptionObject == NULL) {
2477	*exceptionObject = PyUnicodeEncodeError_Create(
2478	encoding, unicode, size, startpos, endpos, reason);
2479	}
2480	else {
2481	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2482	goto onError;
2483	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2484	goto onError;
2485	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2486	goto onError;
2487	return;
2488	onError:
2489	Py_DECREF(*exceptionObject);
2490	*exceptionObject = NULL;
2491	}
2492	}
2493
2494	/* raises a UnicodeEncodeError */
2495	static void raise_encode_exception(PyObject **exceptionObject,
2496	const char *encoding,
2497	const Py_UNICODE *unicode, Py_ssize_t size,
2498	Py_ssize_t startpos, Py_ssize_t endpos,
2499	const char *reason)
2500	{
2501	make_encode_exception(exceptionObject,
2502	encoding, unicode, size, startpos, endpos, reason);
2503	if (*exceptionObject != NULL)
2504	PyCodec_StrictErrors(*exceptionObject);
2505	}
2506
2507	/* error handling callback helper:
2508	build arguments, call the callback and check the arguments,
2509	put the result into newpos and return the replacement string, which
2510	has to be freed by the caller */
2511	static PyObject unicode_encode_call_errorhandler(const char errors,
2512	PyObject **errorHandler,
2513	const char encoding, const char reason,
2514	const Py_UNICODE unicode, Py_ssize_t size, PyObject *exceptionObject,
2515	Py_ssize_t startpos, Py_ssize_t endpos,
2516	Py_ssize_t *newpos)
2517	{
2518	static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2519
2520	PyObject *restuple;
2521	PyObject *resunicode;
2522
2523	if (*errorHandler == NULL) {
2524	*errorHandler = PyCodec_LookupError(errors);
2525	if (*errorHandler == NULL)
2526	return NULL;
2527	}
2528
2529	make_encode_exception(exceptionObject,
2530	encoding, unicode, size, startpos, endpos, reason);
2531	if (*exceptionObject == NULL)
2532	return NULL;
2533
2534	restuple = PyObject_CallFunctionObjArgs(
2535	errorHandler, exceptionObject, NULL);
2536	if (restuple == NULL)
2537	return NULL;
2538	if (!PyTuple_Check(restuple)) {
2539	PyErr_Format(PyExc_TypeError, &argparse[4]);
2540	Py_DECREF(restuple);
2541	return NULL;
2542	}
2543	if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2544	&resunicode, newpos)) {
2545	Py_DECREF(restuple);
2546	return NULL;
2547	}
2548	if (*newpos<0)
2549	newpos = size+newpos;
2550	if (newpos<0 \|\| newpos>size) {
2551	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2552	Py_DECREF(restuple);
2553	return NULL;
2554	}
2555	Py_INCREF(resunicode);
2556	Py_DECREF(restuple);
2557	return resunicode;
2558	}
2559
2560	static PyObject unicode_encode_ucs1(const Py_UNICODE p,
2561	Py_ssize_t size,
2562	const char *errors,
2563	int limit)
2564	{
2565	/* output object */
2566	PyObject *res;
2567	/* pointers to the beginning and end+1 of input */
2568	const Py_UNICODE *startp = p;
2569	const Py_UNICODE *endp = p + size;
2570	/* pointer to the beginning of the unencodable characters */
2571	/* const Py_UNICODE badp = NULL; /
2572	/* pointer into the output */
2573	char *str;
2574	/* current output position */
2575	Py_ssize_t respos = 0;
2576	Py_ssize_t ressize;
2577	const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2578	const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2579	PyObject *errorHandler = NULL;
2580	PyObject *exc = NULL;
2581	/* the following variable is used for caching string comparisons
2582	* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2583	int known_errorHandler = -1;
2584
2585	/* allocate enough for a simple encoding without
2586	replacements, if we need more, we'll resize */
2587	res = PyString_FromStringAndSize(NULL, size);
2588	if (res == NULL)
2589	goto onError;
2590	if (size == 0)
2591	return res;
2592	str = PyString_AS_STRING(res);
2593	ressize = size;
2594
2595	while (p<endp) {
2596	Py_UNICODE c = *p;
2597
2598	/* can we encode this? */
2599	if (c<limit) {
2600	/* no overflow check, because we know that the space is enough */
2601	*str++ = (char)c;
2602	++p;
2603	}
2604	else {
2605	Py_ssize_t unicodepos = p-startp;
2606	Py_ssize_t requiredsize;
2607	PyObject *repunicode;
2608	Py_ssize_t repsize;
2609	Py_ssize_t newpos;
2610	Py_ssize_t respos;
2611	Py_UNICODE *uni2;
2612	/* startpos for collecting unencodable chars */
2613	const Py_UNICODE *collstart = p;
2614	const Py_UNICODE *collend = p;
2615	/* find all unecodable characters */
2616	while ((collend < endp) && ((*collend)>=limit))
2617	++collend;
2618	/* cache callback name lookup (if not done yet, i.e. it's the first error) */
2619	if (known_errorHandler==-1) {
2620	if ((errors==NULL) \|\| (!strcmp(errors, "strict")))
2621	known_errorHandler = 1;
2622	else if (!strcmp(errors, "replace"))
2623	known_errorHandler = 2;
2624	else if (!strcmp(errors, "ignore"))
2625	known_errorHandler = 3;
2626	else if (!strcmp(errors, "xmlcharrefreplace"))
2627	known_errorHandler = 4;
2628	else
2629	known_errorHandler = 0;
2630	}
2631	switch (known_errorHandler) {
2632	case 1: /* strict */
2633	raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2634	goto onError;
2635	case 2: /* replace */
2636	while (collstart++<collend)
2637	str++ = '?'; / fall through */
2638	case 3: /* ignore */
2639	p = collend;
2640	break;
2641	case 4: /* xmlcharrefreplace */
2642	respos = str-PyString_AS_STRING(res);
2643	/* determine replacement size (temporarily (mis)uses p) */
2644	for (p = collstart, repsize = 0; p < collend; ++p) {
2645	if (*p<10)
2646	repsize += 2+1+1;
2647	else if (*p<100)
2648	repsize += 2+2+1;
2649	else if (*p<1000)
2650	repsize += 2+3+1;
2651	else if (*p<10000)
2652	repsize += 2+4+1;
2653	#ifndef Py_UNICODE_WIDE
2654	else
2655	repsize += 2+5+1;
2656	#else
2657	else if (*p<100000)
2658	repsize += 2+5+1;
2659	else if (*p<1000000)
2660	repsize += 2+6+1;
2661	else
2662	repsize += 2+7+1;
2663	#endif
2664	}
2665	requiredsize = respos+repsize+(endp-collend);
2666	if (requiredsize > ressize) {
2667	if (requiredsize<2*ressize)
2668	requiredsize = 2*ressize;
2669	if (_PyString_Resize(&res, requiredsize))
2670	goto onError;
2671	str = PyString_AS_STRING(res) + respos;
2672	ressize = requiredsize;
2673	}
2674	/* generate replacement (temporarily (mis)uses p) */
2675	for (p = collstart; p < collend; ++p) {
2676	str += sprintf(str, "&#%d;", (int)*p);
2677	}
2678	p = collend;
2679	break;
2680	default:
2681	repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2682	encoding, reason, startp, size, &exc,
2683	collstart-startp, collend-startp, &newpos);
2684	if (repunicode == NULL)
2685	goto onError;
2686	/* need more space? (at least enough for what we
2687	have+the replacement+the rest of the string, so
2688	we won't have to check space for encodable characters) */
2689	respos = str-PyString_AS_STRING(res);
2690	repsize = PyUnicode_GET_SIZE(repunicode);
2691	requiredsize = respos+repsize+(endp-collend);
2692	if (requiredsize > ressize) {
2693	if (requiredsize<2*ressize)
2694	requiredsize = 2*ressize;
2695	if (_PyString_Resize(&res, requiredsize)) {
2696	Py_DECREF(repunicode);
2697	goto onError;
2698	}
2699	str = PyString_AS_STRING(res) + respos;
2700	ressize = requiredsize;
2701	}
2702	/* check if there is anything unencodable in the replacement
2703	and copy it to the output */
2704	for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2705	c = *uni2;
2706	if (c >= limit) {
2707	raise_encode_exception(&exc, encoding, startp, size,
2708	unicodepos, unicodepos+1, reason);
2709	Py_DECREF(repunicode);
2710	goto onError;
2711	}
2712	*str = (char)c;
2713	}
2714	p = startp + newpos;
2715	Py_DECREF(repunicode);
2716	}
2717	}
2718	}
2719	/* Resize if we allocated to much */
2720	respos = str-PyString_AS_STRING(res);
2721	if (respos<ressize)
2722	/* If this falls res will be NULL */
2723	_PyString_Resize(&res, respos);
2724	Py_XDECREF(errorHandler);
2725	Py_XDECREF(exc);
2726	return res;
2727
2728	onError:
2729	Py_XDECREF(res);
2730	Py_XDECREF(errorHandler);
2731	Py_XDECREF(exc);
2732	return NULL;
2733	}
2734
2735	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
2736	Py_ssize_t size,
2737	const char *errors)
2738	{
2739	return unicode_encode_ucs1(p, size, errors, 256);
2740	}
2741
2742	PyObject PyUnicode_AsLatin1String(PyObject unicode)
2743	{
2744	if (!PyUnicode_Check(unicode)) {
2745	PyErr_BadArgument();
2746	return NULL;
2747	}
2748	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2749	PyUnicode_GET_SIZE(unicode),
2750	NULL);
2751	}
2752
2753	/* --- 7-bit ASCII Codec -------------------------------------------------- */
2754
2755	PyObject PyUnicode_DecodeASCII(const char s,
2756	Py_ssize_t size,
2757	const char *errors)
2758	{
2759	const char *starts = s;
2760	PyUnicodeObject *v;
2761	Py_UNICODE *p;
2762	Py_ssize_t startinpos;
2763	Py_ssize_t endinpos;
2764	Py_ssize_t outpos;
2765	const char *e;
2766	PyObject *errorHandler = NULL;
2767	PyObject *exc = NULL;
2768
2769	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
2770	if (size == 1 && (unsigned char)s < 128) {
2771	Py_UNICODE r = (unsigned char)s;
2772	return PyUnicode_FromUnicode(&r, 1);
2773	}
2774
2775	v = _PyUnicode_New(size);
2776	if (v == NULL)
2777	goto onError;
2778	if (size == 0)
2779	return (PyObject *)v;
2780	p = PyUnicode_AS_UNICODE(v);
2781	e = s + size;
2782	while (s < e) {
2783	register unsigned char c = (unsigned char)*s;
2784	if (c < 128) {
2785	*p++ = c;
2786	++s;
2787	}
2788	else {
2789	startinpos = s-starts;
2790	endinpos = startinpos + 1;
2791	outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2792	if (unicode_decode_call_errorhandler(
2793	errors, &errorHandler,
2794	"ascii", "ordinal not in range(128)",
2795	starts, size, &startinpos, &endinpos, &exc, &s,
2796	(PyObject **)&v, &outpos, &p))
2797	goto onError;
2798	}
2799	}
2800	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2801	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2802	goto onError;
2803	Py_XDECREF(errorHandler);
2804	Py_XDECREF(exc);
2805	return (PyObject *)v;
2806
2807	onError:
2808	Py_XDECREF(v);
2809	Py_XDECREF(errorHandler);
2810	Py_XDECREF(exc);
2811	return NULL;
2812	}
2813
2814	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
2815	Py_ssize_t size,
2816	const char *errors)
2817	{
2818	return unicode_encode_ucs1(p, size, errors, 128);
2819	}
2820
2821	PyObject PyUnicode_AsASCIIString(PyObject unicode)
2822	{
2823	if (!PyUnicode_Check(unicode)) {
2824	PyErr_BadArgument();
2825	return NULL;
2826	}
2827	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2828	PyUnicode_GET_SIZE(unicode),
2829	NULL);
2830	}
2831
2832	#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2833
2834	/* --- MBCS codecs for Windows -------------------------------------------- */
2835
2836	#if SIZEOF_INT < SIZEOF_SSIZE_T
2837	#define NEED_RETRY
2838	#endif
2839
2840	/* XXX This code is limited to "true" double-byte encodings, as
2841	a) it assumes an incomplete character consists of a single byte, and
2842	b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2843	encodings, see IsDBCSLeadByteEx documentation. */
2844
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, 0 otherwise.

   IsDBCSLeadByte alone is not enough because a trail byte may have a
   lead-byte value, so the previous character boundary (CharPrev) is
   inspected as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;

    if (IsDBCSLeadByte(*curr)) {
        const char *prev = CharPrev(s, curr);
        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    }
    return 0;
}
2855
2856	/*
2857	* Decode MBCS string into unicode object. If 'final' is set, converts
2858	* trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2859	*/
2860	static int decode_mbcs(PyUnicodeObject **v,
2861	const char s, / MBCS string */
2862	int size, /* sizeof MBCS string */
2863	int final)
2864	{
2865	Py_UNICODE *p;
2866	Py_ssize_t n = 0;
2867	int usize = 0;
2868
2869	assert(size >= 0);
2870
2871	/* Skip trailing lead-byte unless 'final' is set */
2872	if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2873	--size;
2874
2875	/* First get the size of the result */
2876	if (size > 0) {
2877	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2878	if (usize == 0) {
2879	PyErr_SetFromWindowsErrWithFilename(0, NULL);
2880	return -1;
2881	}
2882	}
2883
2884	if (*v == NULL) {
2885	/* Create unicode object */
2886	*v = _PyUnicode_New(usize);
2887	if (*v == NULL)
2888	return -1;
2889	}
2890	else {
2891	/* Extend unicode object */
2892	n = PyUnicode_GET_SIZE(*v);
2893	if (_PyUnicode_Resize(v, n + usize) < 0)
2894	return -1;
2895	}
2896
2897	/* Do the conversion */
2898	if (size > 0) {
2899	p = PyUnicode_AS_UNICODE(*v) + n;
2900	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2901	PyErr_SetFromWindowsErrWithFilename(0, NULL);
2902	return -1;
2903	}
2904	}
2905
2906	return size;
2907	}
2908
2909	PyObject PyUnicode_DecodeMBCSStateful(const char s,
2910	Py_ssize_t size,
2911	const char *errors,
2912	Py_ssize_t *consumed)
2913	{
2914	PyUnicodeObject *v = NULL;
2915	int done;
2916
2917	if (consumed)
2918	*consumed = 0;
2919
2920	#ifdef NEED_RETRY
2921	retry:
2922	if (size > INT_MAX)
2923	done = decode_mbcs(&v, s, INT_MAX, 0);
2924	else
2925	#endif
2926	done = decode_mbcs(&v, s, (int)size, !consumed);
2927
2928	if (done < 0) {
2929	Py_XDECREF(v);
2930	return NULL;
2931	}
2932
2933	if (consumed)
2934	*consumed += done;
2935
2936	#ifdef NEED_RETRY
2937	if (size > INT_MAX) {
2938	s += done;
2939	size -= done;
2940	goto retry;
2941	}
2942	#endif
2943
2944	return (PyObject *)v;
2945	}
2946
2947	PyObject PyUnicode_DecodeMBCS(const char s,
2948	Py_ssize_t size,
2949	const char *errors)
2950	{
2951	return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2952	}
2953
2954	/*
2955	* Convert unicode into string object (MBCS).
2956	* Returns 0 if succeed, -1 otherwise.
2957	*/
2958	static int encode_mbcs(PyObject **repr,
2959	const Py_UNICODE p, / unicode */
2960	int size) /* size of unicode */
2961	{
2962	int mbcssize = 0;
2963	Py_ssize_t n = 0;
2964
2965	assert(size >= 0);
2966
2967	/* First get the size of the result */
2968	if (size > 0) {
2969	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2970	if (mbcssize == 0) {
2971	PyErr_SetFromWindowsErrWithFilename(0, NULL);
2972	return -1;
2973	}
2974	}
2975
2976	if (*repr == NULL) {
2977	/* Create string object */
2978	*repr = PyString_FromStringAndSize(NULL, mbcssize);
2979	if (*repr == NULL)
2980	return -1;
2981	}
2982	else {
2983	/* Extend string object */
2984	n = PyString_Size(*repr);
2985	if (_PyString_Resize(repr, n + mbcssize) < 0)
2986	return -1;
2987	}
2988
2989	/* Do the conversion */
2990	if (size > 0) {
2991	char s = PyString_AS_STRING(repr) + n;
2992	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2993	PyErr_SetFromWindowsErrWithFilename(0, NULL);
2994	return -1;
2995	}
2996	}
2997
2998	return 0;
2999	}
3000
3001	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
3002	Py_ssize_t size,
3003	const char *errors)
3004	{
3005	PyObject *repr = NULL;
3006	int ret;
3007
3008	#ifdef NEED_RETRY
3009	retry:
3010	if (size > INT_MAX)
3011	ret = encode_mbcs(&repr, p, INT_MAX);
3012	else
3013	#endif
3014	ret = encode_mbcs(&repr, p, (int)size);
3015
3016	if (ret < 0) {
3017	Py_XDECREF(repr);
3018	return NULL;
3019	}
3020
3021	#ifdef NEED_RETRY
3022	if (size > INT_MAX) {
3023	p += INT_MAX;
3024	size -= INT_MAX;
3025	goto retry;
3026	}
3027	#endif
3028
3029	return repr;
3030	}
3031
3032	PyObject PyUnicode_AsMBCSString(PyObject unicode)
3033	{
3034	if (!PyUnicode_Check(unicode)) {
3035	PyErr_BadArgument();
3036	return NULL;
3037	}
3038	return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3039	PyUnicode_GET_SIZE(unicode),
3040	NULL);
3041	}
3042
3043	#undef NEED_RETRY
3044
3045	#endif /* MS_WINDOWS */
3046
3047	/* --- Character Mapping Codec -------------------------------------------- */
3048
3049	PyObject PyUnicode_DecodeCharmap(const char s,
3050	Py_ssize_t size,
3051	PyObject *mapping,
3052	const char *errors)
3053	{
3054	const char *starts = s;
3055	Py_ssize_t startinpos;
3056	Py_ssize_t endinpos;
3057	Py_ssize_t outpos;
3058	const char *e;
3059	PyUnicodeObject *v;
3060	Py_UNICODE *p;
3061	Py_ssize_t extrachars = 0;
3062	PyObject *errorHandler = NULL;
3063	PyObject *exc = NULL;
3064	Py_UNICODE *mapstring = NULL;
3065	Py_ssize_t maplen = 0;
3066
3067	/* Default to Latin-1 */
3068	if (mapping == NULL)
3069	return PyUnicode_DecodeLatin1(s, size, errors);
3070
3071	v = _PyUnicode_New(size);
3072	if (v == NULL)
3073	goto onError;
3074	if (size == 0)
3075	return (PyObject *)v;
3076	p = PyUnicode_AS_UNICODE(v);
3077	e = s + size;
3078	if (PyUnicode_CheckExact(mapping)) {
3079	mapstring = PyUnicode_AS_UNICODE(mapping);
3080	maplen = PyUnicode_GET_SIZE(mapping);
3081	while (s < e) {
3082	unsigned char ch = *s;
3083	Py_UNICODE x = 0xfffe; /* illegal value */
3084
3085	if (ch < maplen)
3086	x = mapstring[ch];
3087
3088	if (x == 0xfffe) {
3089	/* undefined mapping */
3090	outpos = p-PyUnicode_AS_UNICODE(v);
3091	startinpos = s-starts;
3092	endinpos = startinpos+1;
3093	if (unicode_decode_call_errorhandler(
3094	errors, &errorHandler,
3095	"charmap", "character maps to <undefined>",
3096	starts, size, &startinpos, &endinpos, &exc, &s,
3097	(PyObject **)&v, &outpos, &p)) {
3098	goto onError;
3099	}
3100	continue;
3101	}
3102	*p++ = x;
3103	++s;
3104	}
3105	}
3106	else {
3107	while (s < e) {
3108	unsigned char ch = *s;
3109	PyObject w, x;
3110
3111	/* Get mapping (char ordinal -> integer, Unicode char or None) */
3112	w = PyInt_FromLong((long)ch);
3113	if (w == NULL)
3114	goto onError;
3115	x = PyObject_GetItem(mapping, w);
3116	Py_DECREF(w);
3117	if (x == NULL) {
3118	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3119	/* No mapping found means: mapping is undefined. */
3120	PyErr_Clear();
3121	x = Py_None;
3122	Py_INCREF(x);
3123	} else
3124	goto onError;
3125	}
3126
3127	/* Apply mapping */
3128	if (PyInt_Check(x)) {
3129	long value = PyInt_AS_LONG(x);
3130	if (value < 0 \|\| value > 65535) {
3131	PyErr_SetString(PyExc_TypeError,
3132	"character mapping must be in range(65536)");
3133	Py_DECREF(x);
3134	goto onError;
3135	}
3136	*p++ = (Py_UNICODE)value;
3137	}
3138	else if (x == Py_None) {
3139	/* undefined mapping */
3140	outpos = p-PyUnicode_AS_UNICODE(v);
3141	startinpos = s-starts;
3142	endinpos = startinpos+1;
3143	if (unicode_decode_call_errorhandler(
3144	errors, &errorHandler,
3145	"charmap", "character maps to <undefined>",
3146	starts, size, &startinpos, &endinpos, &exc, &s,
3147	(PyObject **)&v, &outpos, &p)) {
3148	Py_DECREF(x);
3149	goto onError;
3150	}
3151	Py_DECREF(x);
3152	continue;
3153	}
3154	else if (PyUnicode_Check(x)) {
3155	Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3156
3157	if (targetsize == 1)
3158	/* 1-1 mapping */
3159	p++ = PyUnicode_AS_UNICODE(x);
3160
3161	else if (targetsize > 1) {
3162	/* 1-n mapping */
3163	if (targetsize > extrachars) {
3164	/* resize first */
3165	Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3166	Py_ssize_t needed = (targetsize - extrachars) + \
3167	(targetsize << 2);
3168	extrachars += needed;
3169	if (_PyUnicode_Resize(&v,
3170	PyUnicode_GET_SIZE(v) + needed) < 0) {
3171	Py_DECREF(x);
3172	goto onError;
3173	}
3174	p = PyUnicode_AS_UNICODE(v) + oldpos;
3175	}
3176	Py_UNICODE_COPY(p,
3177	PyUnicode_AS_UNICODE(x),
3178	targetsize);
3179	p += targetsize;
3180	extrachars -= targetsize;
3181	}
3182	/* 1-0 mapping: skip the character */
3183	}
3184	else {
3185	/* wrong return value */
3186	PyErr_SetString(PyExc_TypeError,
3187	"character mapping must return integer, None or unicode");
3188	Py_DECREF(x);
3189	goto onError;
3190	}
3191	Py_DECREF(x);
3192	++s;
3193	}
3194	}
3195	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3196	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3197	goto onError;
3198	Py_XDECREF(errorHandler);
3199	Py_XDECREF(exc);
3200	return (PyObject *)v;
3201
3202	onError:
3203	Py_XDECREF(errorHandler);
3204	Py_XDECREF(exc);
3205	Py_XDECREF(v);
3206	return NULL;
3207	}
3208
3209	/* Charmap encoding: the lookup table */
3210
3211	struct encoding_map{
3212	PyObject_HEAD
3213	unsigned char level1[32];
3214	int count2, count3;
3215	unsigned char level23[1];
3216	};
3217
3218	static PyObject*
3219	encoding_map_size(PyObject obj, PyObject args)
3220	{
3221	struct encoding_map map = (struct encoding_map)obj;
3222	return PyInt_FromLong(sizeof(map) - 1 + 16map->count2 +
3223	128*map->count3);
3224	}
3225
3226	static PyMethodDef encoding_map_methods[] = {
3227	{"size", encoding_map_size, METH_NOARGS,
3228	PyDoc_STR("Return the size (in bytes) of this object") },
3229	{ 0 }
3230	};
3231
3232	static void
3233	encoding_map_dealloc(PyObject* o)
3234	{
3235	PyObject_FREE(o);
3236	}
3237
3238	static PyTypeObject EncodingMapType = {
3239	PyObject_HEAD_INIT(NULL)
3240	0, /ob_size/
3241	"EncodingMap", /tp_name/
3242	sizeof(struct encoding_map), /tp_basicsize/
3243	0, /tp_itemsize/
3244	/* methods */
3245	encoding_map_dealloc, /tp_dealloc/
3246	0, /tp_print/
3247	0, /tp_getattr/
3248	0, /tp_setattr/
3249	0, /tp_compare/
3250	0, /tp_repr/
3251	0, /tp_as_number/
3252	0, /tp_as_sequence/
3253	0, /tp_as_mapping/
3254	0, /tp_hash/
3255	0, /tp_call/
3256	0, /tp_str/
3257	0, /tp_getattro/
3258	0, /tp_setattro/
3259	0, /tp_as_buffer/
3260	Py_TPFLAGS_DEFAULT, /tp_flags/
3261	0, /tp_doc/
3262	0, /tp_traverse/
3263	0, /tp_clear/
3264	0, /tp_richcompare/
3265	0, /tp_weaklistoffset/
3266	0, /tp_iter/
3267	0, /tp_iternext/
3268	encoding_map_methods, /tp_methods/
3269	0, /tp_members/
3270	0, /tp_getset/
3271	0, /tp_base/
3272	0, /tp_dict/
3273	0, /tp_descr_get/
3274	0, /tp_descr_set/
3275	0, /tp_dictoffset/
3276	0, /tp_init/
3277	0, /tp_alloc/
3278	0, /tp_new/
3279	0, /tp_free/
3280	0, /tp_is_gc/
3281	};
3282
3283	PyObject*
3284	PyUnicode_BuildEncodingMap(PyObject* string)
3285	{
3286	Py_UNICODE *decode;
3287	PyObject *result;
3288	struct encoding_map *mresult;
3289	int i;
3290	int need_dict = 0;
3291	unsigned char level1[32];
3292	unsigned char level2[512];
3293	unsigned char mlevel1, mlevel2, *mlevel3;
3294	int count2 = 0, count3 = 0;
3295
3296	if (!PyUnicode_Check(string) \|\| PyUnicode_GetSize(string) != 256) {
3297	PyErr_BadArgument();
3298	return NULL;
3299	}
3300	decode = PyUnicode_AS_UNICODE(string);
3301	memset(level1, 0xFF, sizeof level1);
3302	memset(level2, 0xFF, sizeof level2);
3303
3304	/* If there isn't a one-to-one mapping of NULL to \0,
3305	or if there are non-BMP characters, we need to use
3306	a mapping dictionary. */
3307	if (decode[0] != 0)
3308	need_dict = 1;
3309	for (i = 1; i < 256; i++) {
3310	int l1, l2;
3311	if (decode[i] == 0
3312	#ifdef Py_UNICODE_WIDE
3313	\|\| decode[i] > 0xFFFF
3314	#endif
3315	) {
3316	need_dict = 1;
3317	break;
3318	}
3319	if (decode[i] == 0xFFFE)
3320	/* unmapped character */
3321	continue;
3322	l1 = decode[i] >> 11;
3323	l2 = decode[i] >> 7;
3324	if (level1[l1] == 0xFF)
3325	level1[l1] = count2++;
3326	if (level2[l2] == 0xFF)
3327	level2[l2] = count3++;
3328	}
3329
3330	if (count2 >= 0xFF \|\| count3 >= 0xFF)
3331	need_dict = 1;
3332
3333	if (need_dict) {
3334	PyObject *result = PyDict_New();
3335	PyObject key, value;
3336	if (!result)
3337	return NULL;
3338	for (i = 0; i < 256; i++) {
3339	key = value = NULL;
3340	key = PyInt_FromLong(decode[i]);
3341	value = PyInt_FromLong(i);
3342	if (!key \|\| !value)
3343	goto failed1;
3344	if (PyDict_SetItem(result, key, value) == -1)
3345	goto failed1;
3346	Py_DECREF(key);
3347	Py_DECREF(value);
3348	}
3349	return result;
3350	failed1:
3351	Py_XDECREF(key);
3352	Py_XDECREF(value);
3353	Py_DECREF(result);
3354	return NULL;
3355	}
3356
3357	/* Create a three-level trie */
3358	result = PyObject_MALLOC(sizeof(struct encoding_map) +
3359	16count2 + 128count3 - 1);
3360	if (!result)
3361	return PyErr_NoMemory();
3362	PyObject_Init(result, &EncodingMapType);
3363	mresult = (struct encoding_map*)result;
3364	mresult->count2 = count2;
3365	mresult->count3 = count3;
3366	mlevel1 = mresult->level1;
3367	mlevel2 = mresult->level23;
3368	mlevel3 = mresult->level23 + 16*count2;
3369	memcpy(mlevel1, level1, 32);
3370	memset(mlevel2, 0xFF, 16*count2);
3371	memset(mlevel3, 0, 128*count3);
3372	count3 = 0;
3373	for (i = 1; i < 256; i++) {
3374	int o1, o2, o3, i2, i3;
3375	if (decode[i] == 0xFFFE)
3376	/* unmapped character */
3377	continue;
3378	o1 = decode[i]>>11;
3379	o2 = (decode[i]>>7) & 0xF;
3380	i2 = 16*mlevel1[o1] + o2;
3381	if (mlevel2[i2] == 0xFF)
3382	mlevel2[i2] = count3++;
3383	o3 = decode[i] & 0x7F;
3384	i3 = 128*mlevel2[i2] + o3;
3385	mlevel3[i3] = i;
3386	}
3387	return result;
3388	}
3389
3390	static int
3391	encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3392	{
3393	struct encoding_map map = (struct encoding_map)mapping;
3394	int l1 = c>>11;
3395	int l2 = (c>>7) & 0xF;
3396	int l3 = c & 0x7F;
3397	int i;
3398
3399	#ifdef Py_UNICODE_WIDE
3400	if (c > 0xFFFF) {
3401	return -1;
3402	}
3403	#endif
3404	if (c == 0)
3405	return 0;
3406	/* level 1*/
3407	i = map->level1[l1];
3408	if (i == 0xFF) {
3409	return -1;
3410	}
3411	/* level 2*/
3412	i = map->level23[16*i+l2];
3413	if (i == 0xFF) {
3414	return -1;
3415	}
3416	/* level 3 */
3417	i = map->level23[16map->count2 + 128i + l3];
3418	if (i == 0) {
3419	return -1;
3420	}
3421	return i;
3422	}
3423
3424	/* Lookup the character ch in the mapping. If the character
3425	can't be found, Py_None is returned (or NULL, if another
3426	error occurred). */
3427	static PyObject charmapencode_lookup(Py_UNICODE c, PyObject mapping)
3428	{
3429	PyObject *w = PyInt_FromLong((long)c);
3430	PyObject *x;
3431
3432	if (w == NULL)
3433	return NULL;
3434	x = PyObject_GetItem(mapping, w);
3435	Py_DECREF(w);
3436	if (x == NULL) {
3437	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3438	/* No mapping found means: mapping is undefined. */
3439	PyErr_Clear();
3440	x = Py_None;
3441	Py_INCREF(x);
3442	return x;
3443	} else
3444	return NULL;
3445	}
3446	else if (x == Py_None)
3447	return x;
3448	else if (PyInt_Check(x)) {
3449	long value = PyInt_AS_LONG(x);
3450	if (value < 0 \|\| value > 255) {
3451	PyErr_SetString(PyExc_TypeError,
3452	"character mapping must be in range(256)");
3453	Py_DECREF(x);
3454	return NULL;
3455	}
3456	return x;
3457	}
3458	else if (PyString_Check(x))
3459	return x;
3460	else {
3461	/* wrong return value */
3462	PyErr_SetString(PyExc_TypeError,
3463	"character mapping must return integer, None or str");
3464	Py_DECREF(x);
3465	return NULL;
3466	}
3467	}
3468
3469	static int
3470	charmapencode_resize(PyObject *outobj, Py_ssize_t outpos, Py_ssize_t requiredsize)
3471	{
3472	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3473	/* exponentially overallocate to minimize reallocations */
3474	if (requiredsize < 2*outsize)
3475	requiredsize = 2*outsize;
3476	if (_PyString_Resize(outobj, requiredsize)) {
3477	return 0;
3478	}
3479	return 1;
3480	}
3481
/* Outcome of charmapencode_output. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or reallocation failed */
} charmapencode_result;
3485	/* lookup the character, put the result in the output string and adjust
3486	various state variables. Reallocate the output string if not enough
3487	space is available. Return a new reference to the object that
3488	was put in the output buffer, or Py_None, if the mapping was undefined
3489	(in which case no character was written) or NULL, if a
3490	reallocation error occurred. The caller must decref the result */
3491	static
3492	charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3493	PyObject *outobj, Py_ssize_t outpos)
3494	{
3495	PyObject *rep;
3496	char *outstart;
3497	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3498
3499	if (mapping->ob_type == &EncodingMapType) {
3500	int res = encoding_map_lookup(c, mapping);
3501	Py_ssize_t requiredsize = *outpos+1;
3502	if (res == -1)
3503	return enc_FAILED;
3504	if (outsize<requiredsize)
3505	if (!charmapencode_resize(outobj, outpos, requiredsize))
3506	return enc_EXCEPTION;
3507	outstart = PyString_AS_STRING(*outobj);
3508	outstart[(*outpos)++] = (char)res;
3509	return enc_SUCCESS;
3510	}
3511
3512	rep = charmapencode_lookup(c, mapping);
3513	if (rep==NULL)
3514	return enc_EXCEPTION;
3515	else if (rep==Py_None) {
3516	Py_DECREF(rep);
3517	return enc_FAILED;
3518	} else {
3519	if (PyInt_Check(rep)) {
3520	Py_ssize_t requiredsize = *outpos+1;
3521	if (outsize<requiredsize)
3522	if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3523	Py_DECREF(rep);
3524	return enc_EXCEPTION;
3525	}
3526	outstart = PyString_AS_STRING(*outobj);
3527	outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3528	}
3529	else {
3530	const char *repchars = PyString_AS_STRING(rep);
3531	Py_ssize_t repsize = PyString_GET_SIZE(rep);
3532	Py_ssize_t requiredsize = *outpos+repsize;
3533	if (outsize<requiredsize)
3534	if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3535	Py_DECREF(rep);
3536	return enc_EXCEPTION;
3537	}
3538	outstart = PyString_AS_STRING(*outobj);
3539	memcpy(outstart + *outpos, repchars, repsize);
3540	*outpos += repsize;
3541	}
3542	}
3543	Py_DECREF(rep);
3544	return enc_SUCCESS;
3545	}
3546
3547	/* handle an error in PyUnicode_EncodeCharmap
3548	Return 0 on success, -1 on error */
3549	static
3550	int charmap_encoding_error(
3551	const Py_UNICODE p, Py_ssize_t size, Py_ssize_t inpos, PyObject *mapping,
3552	PyObject **exceptionObject,
3553	int known_errorHandler, PyObject errorHandler, const char errors,
3554	PyObject *res, Py_ssize_t respos)
3555	{
3556	PyObject repunicode = NULL; / initialize to prevent gcc warning */
3557	Py_ssize_t repsize;
3558	Py_ssize_t newpos;
3559	Py_UNICODE *uni2;
3560	/* startpos for collecting unencodable chars */
3561	Py_ssize_t collstartpos = *inpos;
3562	Py_ssize_t collendpos = *inpos+1;
3563	Py_ssize_t collpos;
3564	char *encoding = "charmap";
3565	char *reason = "character maps to <undefined>";
3566	charmapencode_result x;
3567
3568	/* find all unencodable characters */
3569	while (collendpos < size) {
3570	PyObject *rep;
3571	if (mapping->ob_type == &EncodingMapType) {
3572	int res = encoding_map_lookup(p[collendpos], mapping);
3573	if (res != -1)
3574	break;
3575	++collendpos;
3576	continue;
3577	}
3578
3579	rep = charmapencode_lookup(p[collendpos], mapping);
3580	if (rep==NULL)
3581	return -1;
3582	else if (rep!=Py_None) {
3583	Py_DECREF(rep);
3584	break;
3585	}
3586	Py_DECREF(rep);
3587	++collendpos;
3588	}
3589	/* cache callback name lookup
3590	* (if not done yet, i.e. it's the first error) */
3591	if (*known_errorHandler==-1) {
3592	if ((errors==NULL) \|\| (!strcmp(errors, "strict")))
3593	*known_errorHandler = 1;
3594	else if (!strcmp(errors, "replace"))
3595	*known_errorHandler = 2;
3596	else if (!strcmp(errors, "ignore"))
3597	*known_errorHandler = 3;
3598	else if (!strcmp(errors, "xmlcharrefreplace"))
3599	*known_errorHandler = 4;
3600	else
3601	*known_errorHandler = 0;
3602	}
3603	switch (*known_errorHandler) {
3604	case 1: /* strict */
3605	raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3606	return -1;
3607	case 2: /* replace */
3608	for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3609	x = charmapencode_output('?', mapping, res, respos);
3610	if (x==enc_EXCEPTION) {
3611	return -1;
3612	}
3613	else if (x==enc_FAILED) {
3614	raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3615	return -1;
3616	}
3617	}
3618	/* fall through */
3619	case 3: /* ignore */
3620	*inpos = collendpos;
3621	break;
3622	case 4: /* xmlcharrefreplace */
3623	/* generate replacement (temporarily (mis)uses p) */
3624	for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3625	char buffer[2+29+1+1];
3626	char *cp;
3627	sprintf(buffer, "&#%d;", (int)p[collpos]);
3628	for (cp = buffer; *cp; ++cp) {
3629	x = charmapencode_output(*cp, mapping, res, respos);
3630	if (x==enc_EXCEPTION)
3631	return -1;
3632	else if (x==enc_FAILED) {
3633	raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3634	return -1;
3635	}
3636	}
3637	}
3638	*inpos = collendpos;
3639	break;
3640	default:
3641	repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3642	encoding, reason, p, size, exceptionObject,
3643	collstartpos, collendpos, &newpos);
3644	if (repunicode == NULL)
3645	return -1;
3646	/* generate replacement */
3647	repsize = PyUnicode_GET_SIZE(repunicode);
3648	for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3649	x = charmapencode_output(*uni2, mapping, res, respos);
3650	if (x==enc_EXCEPTION) {
3651	return -1;
3652	}
3653	else if (x==enc_FAILED) {
3654	Py_DECREF(repunicode);
3655	raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3656	return -1;
3657	}
3658	}
3659	*inpos = newpos;
3660	Py_DECREF(repunicode);
3661	}
3662	return 0;
3663	}
3664
3665	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
3666	Py_ssize_t size,
3667	PyObject *mapping,
3668	const char *errors)
3669	{
3670	/* output object */
3671	PyObject *res = NULL;
3672	/* current input position */
3673	Py_ssize_t inpos = 0;
3674	/* current output position */
3675	Py_ssize_t respos = 0;
3676	PyObject *errorHandler = NULL;
3677	PyObject *exc = NULL;
3678	/* the following variable is used for caching string comparisons
3679	* -1=not initialized, 0=unknown, 1=strict, 2=replace,
3680	* 3=ignore, 4=xmlcharrefreplace */
3681	int known_errorHandler = -1;
3682
3683	/* Default to Latin-1 */
3684	if (mapping == NULL)
3685	return PyUnicode_EncodeLatin1(p, size, errors);
3686
3687	/* allocate enough for a simple encoding without
3688	replacements, if we need more, we'll resize */
3689	res = PyString_FromStringAndSize(NULL, size);
3690	if (res == NULL)
3691	goto onError;
3692	if (size == 0)
3693	return res;
3694
3695	while (inpos<size) {
3696	/* try to encode it */
3697	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3698	if (x==enc_EXCEPTION) /* error */
3699	goto onError;
3700	if (x==enc_FAILED) { /* unencodable character */
3701	if (charmap_encoding_error(p, size, &inpos, mapping,
3702	&exc,
3703	&known_errorHandler, &errorHandler, errors,
3704	&res, &respos)) {
3705	goto onError;
3706	}
3707	}
3708	else
3709	/* done with this character => adjust input position */
3710	++inpos;
3711	}
3712
3713	/* Resize if we allocated to much */
3714	if (respos<PyString_GET_SIZE(res)) {
3715	if (_PyString_Resize(&res, respos))
3716	goto onError;
3717	}
3718	Py_XDECREF(exc);
3719	Py_XDECREF(errorHandler);
3720	return res;
3721
3722	onError:
3723	Py_XDECREF(res);
3724	Py_XDECREF(exc);
3725	Py_XDECREF(errorHandler);
3726	return NULL;
3727	}
3728
3729	PyObject PyUnicode_AsCharmapString(PyObject unicode,
3730	PyObject *mapping)
3731	{
3732	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
3733	PyErr_BadArgument();
3734	return NULL;
3735	}
3736	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3737	PyUnicode_GET_SIZE(unicode),
3738	mapping,
3739	NULL);
3740	}
3741
3742	/* create or adjust a UnicodeTranslateError */
3743	static void make_translate_exception(PyObject **exceptionObject,
3744	const Py_UNICODE *unicode, Py_ssize_t size,
3745	Py_ssize_t startpos, Py_ssize_t endpos,
3746	const char *reason)
3747	{
3748	if (*exceptionObject == NULL) {
3749	*exceptionObject = PyUnicodeTranslateError_Create(
3750	unicode, size, startpos, endpos, reason);
3751	}
3752	else {
3753	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3754	goto onError;
3755	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3756	goto onError;
3757	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3758	goto onError;
3759	return;
3760	onError:
3761	Py_DECREF(*exceptionObject);
3762	*exceptionObject = NULL;
3763	}
3764	}
3765
3766	/* raises a UnicodeTranslateError */
3767	static void raise_translate_exception(PyObject **exceptionObject,
3768	const Py_UNICODE *unicode, Py_ssize_t size,
3769	Py_ssize_t startpos, Py_ssize_t endpos,
3770	const char *reason)
3771	{
3772	make_translate_exception(exceptionObject,
3773	unicode, size, startpos, endpos, reason);
3774	if (*exceptionObject != NULL)
3775	PyCodec_StrictErrors(*exceptionObject);
3776	}
3777
3778	/* error handling callback helper:
3779	build arguments, call the callback and check the arguments,
3780	put the result into newpos and return the replacement string, which
3781	has to be freed by the caller */
3782	static PyObject unicode_translate_call_errorhandler(const char errors,
3783	PyObject **errorHandler,
3784	const char *reason,
3785	const Py_UNICODE unicode, Py_ssize_t size, PyObject *exceptionObject,
3786	Py_ssize_t startpos, Py_ssize_t endpos,
3787	Py_ssize_t *newpos)
3788	{
3789	static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3790
3791	Py_ssize_t i_newpos;
3792	PyObject *restuple;
3793	PyObject *resunicode;
3794
3795	if (*errorHandler == NULL) {
3796	*errorHandler = PyCodec_LookupError(errors);
3797	if (*errorHandler == NULL)
3798	return NULL;
3799	}
3800
3801	make_translate_exception(exceptionObject,
3802	unicode, size, startpos, endpos, reason);
3803	if (*exceptionObject == NULL)
3804	return NULL;
3805
3806	restuple = PyObject_CallFunctionObjArgs(
3807	errorHandler, exceptionObject, NULL);
3808	if (restuple == NULL)
3809	return NULL;
3810	if (!PyTuple_Check(restuple)) {
3811	PyErr_Format(PyExc_TypeError, &argparse[4]);
3812	Py_DECREF(restuple);
3813	return NULL;
3814	}
3815	if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3816	&resunicode, &i_newpos)) {
3817	Py_DECREF(restuple);
3818	return NULL;
3819	}
3820	if (i_newpos<0)
3821	*newpos = size+i_newpos;
3822	else
3823	*newpos = i_newpos;
3824	if (newpos<0 \|\| newpos>size) {
3825	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3826	Py_DECREF(restuple);
3827	return NULL;
3828	}
3829	Py_INCREF(resunicode);
3830	Py_DECREF(restuple);
3831	return resunicode;
3832	}
3833
3834	/* Lookup the character ch in the mapping and put the result in result,
3835	which must be decrefed by the caller.
3836	Return 0 on success, -1 on error */
3837	static
3838	int charmaptranslate_lookup(Py_UNICODE c, PyObject mapping, PyObject *result)
3839	{
3840	PyObject *w = PyInt_FromLong((long)c);
3841	PyObject *x;
3842
3843	if (w == NULL)
3844	return -1;
3845	x = PyObject_GetItem(mapping, w);
3846	Py_DECREF(w);
3847	if (x == NULL) {
3848	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3849	/* No mapping found means: use 1:1 mapping. */
3850	PyErr_Clear();
3851	*result = NULL;
3852	return 0;
3853	} else
3854	return -1;
3855	}
3856	else if (x == Py_None) {
3857	*result = x;
3858	return 0;
3859	}
3860	else if (PyInt_Check(x)) {
3861	long value = PyInt_AS_LONG(x);
3862	long max = PyUnicode_GetMax();
3863	if (value < 0 \|\| value > max) {
3864	PyErr_Format(PyExc_TypeError,
3865	"character mapping must be in range(0x%lx)", max+1);
3866	Py_DECREF(x);
3867	return -1;
3868	}
3869	*result = x;
3870	return 0;
3871	}
3872	else if (PyUnicode_Check(x)) {
3873	*result = x;
3874	return 0;
3875	}
3876	else {
3877	/* wrong return value */
3878	PyErr_SetString(PyExc_TypeError,
3879	"character mapping must return integer, None or unicode");
3880	Py_DECREF(x);
3881	return -1;
3882	}
3883	}
3884	/* ensure that *outobj is at least requiredsize characters long,
3885	if not reallocate and adjust various state variables.
3886	Return 0 on success, -1 on error */
3887	static
3888	int charmaptranslate_makespace(PyObject outobj, Py_UNICODE outp,
3889	Py_ssize_t requiredsize)
3890	{
3891	Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3892	if (requiredsize > oldsize) {
3893	/* remember old output position */
3894	Py_ssize_t outpos = outp-PyUnicode_AS_UNICODE(outobj);
3895	/* exponentially overallocate to minimize reallocations */
3896	if (requiredsize < 2 * oldsize)
3897	requiredsize = 2 * oldsize;
3898	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3899	return -1;
3900	outp = PyUnicode_AS_UNICODE(outobj) + outpos;
3901	}
3902	return 0;
3903	}
3904	/* lookup the character, put the result in the output string and adjust
3905	various state variables. Return a new reference to the object that
3906	was put in the output buffer in *result, or Py_None, if the mapping was
3907	undefined (in which case no character was written).
3908	The called must decref result.
3909	Return 0 on success, -1 on error. */
3910	static
3911	int charmaptranslate_output(const Py_UNICODE startinp, const Py_UNICODE curinp,
3912	Py_ssize_t insize, PyObject mapping, PyObject outobj, Py_UNICODE *outp,
3913	PyObject **res)
3914	{
3915	if (charmaptranslate_lookup(*curinp, mapping, res))
3916	return -1;
3917	if (*res==NULL) {
3918	/* not found => default to 1:1 mapping */
3919	(outp)++ = *curinp;
3920	}
3921	else if (*res==Py_None)
3922	;
3923	else if (PyInt_Check(*res)) {
3924	/* no overflow check, because we know that the space is enough */
3925	(outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3926	}
3927	else if (PyUnicode_Check(*res)) {
3928	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3929	if (repsize==1) {
3930	/* no overflow check, because we know that the space is enough */
3931	(outp)++ = PyUnicode_AS_UNICODE(res);
3932	}
3933	else if (repsize!=0) {
3934	/* more than one character */
3935	Py_ssize_t requiredsize = (outp-PyUnicode_AS_UNICODE(outobj)) +
3936	(insize - (curinp-startinp)) +
3937	repsize - 1;
3938	if (charmaptranslate_makespace(outobj, outp, requiredsize))
3939	return -1;
3940	memcpy(outp, PyUnicode_AS_UNICODE(res), sizeof(Py_UNICODE)*repsize);
3941	*outp += repsize;
3942	}
3943	}
3944	else
3945	return -1;
3946	return 0;
3947	}
3948
3949	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE p,
3950	Py_ssize_t size,
3951	PyObject *mapping,
3952	const char *errors)
3953	{
3954	/* output object */
3955	PyObject *res = NULL;
3956	/* pointers to the beginning and end+1 of input */
3957	const Py_UNICODE *startp = p;
3958	const Py_UNICODE *endp = p + size;
3959	/* pointer into the output */
3960	Py_UNICODE *str;
3961	/* current output position */
3962	Py_ssize_t respos = 0;
3963	char *reason = "character maps to <undefined>";
3964	PyObject *errorHandler = NULL;
3965	PyObject *exc = NULL;
3966	/* the following variable is used for caching string comparisons
3967	* -1=not initialized, 0=unknown, 1=strict, 2=replace,
3968	* 3=ignore, 4=xmlcharrefreplace */
3969	int known_errorHandler = -1;
3970
3971	if (mapping == NULL) {
3972	PyErr_BadArgument();
3973	return NULL;
3974	}
3975
3976	/* allocate enough for a simple 1:1 translation without
3977	replacements, if we need more, we'll resize */
3978	res = PyUnicode_FromUnicode(NULL, size);
3979	if (res == NULL)
3980	goto onError;
3981	if (size == 0)
3982	return res;
3983	str = PyUnicode_AS_UNICODE(res);
3984
3985	while (p<endp) {
3986	/* try to encode it */
3987	PyObject *x = NULL;
3988	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3989	Py_XDECREF(x);
3990	goto onError;
3991	}
3992	Py_XDECREF(x);
3993	if (x!=Py_None) /* it worked => adjust input pointer */
3994	++p;
3995	else { /* untranslatable character */
3996	PyObject repunicode = NULL; / initialize to prevent gcc warning */
3997	Py_ssize_t repsize;
3998	Py_ssize_t newpos;
3999	Py_UNICODE *uni2;
4000	/* startpos for collecting untranslatable chars */
4001	const Py_UNICODE *collstart = p;
4002	const Py_UNICODE *collend = p+1;
4003	const Py_UNICODE *coll;
4004
4005	/* find all untranslatable characters */
4006	while (collend < endp) {
4007	if (charmaptranslate_lookup(*collend, mapping, &x))
4008	goto onError;
4009	Py_XDECREF(x);
4010	if (x!=Py_None)
4011	break;
4012	++collend;
4013	}
4014	/* cache callback name lookup
4015	* (if not done yet, i.e. it's the first error) */
4016	if (known_errorHandler==-1) {
4017	if ((errors==NULL) \|\| (!strcmp(errors, "strict")))
4018	known_errorHandler = 1;
4019	else if (!strcmp(errors, "replace"))
4020	known_errorHandler = 2;
4021	else if (!strcmp(errors, "ignore"))
4022	known_errorHandler = 3;
4023	else if (!strcmp(errors, "xmlcharrefreplace"))
4024	known_errorHandler = 4;
4025	else
4026	known_errorHandler = 0;
4027	}
4028	switch (known_errorHandler) {
4029	case 1: /* strict */
4030	raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4031	goto onError;
4032	case 2: /* replace */
4033	/* No need to check for space, this is a 1:1 replacement */
4034	for (coll = collstart; coll<collend; ++coll)
4035	*str++ = '?';
4036	/* fall through */
4037	case 3: /* ignore */
4038	p = collend;
4039	break;
4040	case 4: /* xmlcharrefreplace */
4041	/* generate replacement (temporarily (mis)uses p) */
4042	for (p = collstart; p < collend; ++p) {
4043	char buffer[2+29+1+1];
4044	char *cp;
4045	sprintf(buffer, "&#%d;", (int)*p);
4046	if (charmaptranslate_makespace(&res, &str,
4047	(str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4048	goto onError;
4049	for (cp = buffer; *cp; ++cp)
4050	str++ = cp;
4051	}
4052	p = collend;
4053	break;
4054	default:
4055	repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4056	reason, startp, size, &exc,
4057	collstart-startp, collend-startp, &newpos);
4058	if (repunicode == NULL)
4059	goto onError;
4060	/* generate replacement */
4061	repsize = PyUnicode_GET_SIZE(repunicode);
4062	if (charmaptranslate_makespace(&res, &str,
4063	(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4064	Py_DECREF(repunicode);
4065	goto onError;
4066	}
4067	for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4068	str++ = uni2;
4069	p = startp + newpos;
4070	Py_DECREF(repunicode);
4071	}
4072	}
4073	}
4074	/* Resize if we allocated to much */
4075	respos = str-PyUnicode_AS_UNICODE(res);
4076	if (respos<PyUnicode_GET_SIZE(res)) {
4077	if (_PyUnicode_Resize(&res, respos) < 0)
4078	goto onError;
4079	}
4080	Py_XDECREF(exc);
4081	Py_XDECREF(errorHandler);
4082	return res;
4083
4084	onError:
4085	Py_XDECREF(res);
4086	Py_XDECREF(exc);
4087	Py_XDECREF(errorHandler);
4088	return NULL;
4089	}
4090
4091	PyObject PyUnicode_Translate(PyObject str,
4092	PyObject *mapping,
4093	const char *errors)
4094	{
4095	PyObject *result;
4096
4097	str = PyUnicode_FromObject(str);
4098	if (str == NULL)
4099	goto onError;
4100	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4101	PyUnicode_GET_SIZE(str),
4102	mapping,
4103	errors);
4104	Py_DECREF(str);
4105	return result;
4106
4107	onError:
4108	Py_XDECREF(str);
4109	return NULL;
4110	}
4111
4112	/* --- Decimal Encoder ---------------------------------------------------- */
4113
4114	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4115	Py_ssize_t length,
4116	char *output,
4117	const char *errors)
4118	{
4119	Py_UNICODE p, end;
4120	PyObject *errorHandler = NULL;
4121	PyObject *exc = NULL;
4122	const char *encoding = "decimal";
4123	const char *reason = "invalid decimal Unicode string";
4124	/* the following variable is used for caching string comparisons
4125	* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4126	int known_errorHandler = -1;
4127
4128	if (output == NULL) {
4129	PyErr_BadArgument();
4130	return -1;
4131	}
4132
4133	p = s;
4134	end = s + length;
4135	while (p < end) {
4136	register Py_UNICODE ch = *p;
4137	int decimal;
4138	PyObject *repunicode;
4139	Py_ssize_t repsize;
4140	Py_ssize_t newpos;
4141	Py_UNICODE *uni2;
4142	Py_UNICODE *collstart;
4143	Py_UNICODE *collend;
4144
4145	if (Py_UNICODE_ISSPACE(ch)) {
4146	*output++ = ' ';
4147	++p;
4148	continue;
4149	}
4150	decimal = Py_UNICODE_TODECIMAL(ch);
4151	if (decimal >= 0) {
4152	*output++ = '0' + decimal;
4153	++p;
4154	continue;
4155	}
4156	if (0 < ch && ch < 256) {
4157	*output++ = (char)ch;
4158	++p;
4159	continue;
4160	}
4161	/* All other characters are considered unencodable */
4162	collstart = p;
4163	collend = p+1;
4164	while (collend < end) {
4165	if ((0 < collend && collend < 256) \|\|
4166	!Py_UNICODE_ISSPACE(*collend) \|\|
4167	Py_UNICODE_TODECIMAL(*collend))
4168	break;
4169	}
4170	/* cache callback name lookup
4171	* (if not done yet, i.e. it's the first error) */
4172	if (known_errorHandler==-1) {
4173	if ((errors==NULL) \|\| (!strcmp(errors, "strict")))
4174	known_errorHandler = 1;
4175	else if (!strcmp(errors, "replace"))
4176	known_errorHandler = 2;
4177	else if (!strcmp(errors, "ignore"))
4178	known_errorHandler = 3;
4179	else if (!strcmp(errors, "xmlcharrefreplace"))
4180	known_errorHandler = 4;
4181	else
4182	known_errorHandler = 0;
4183	}
4184	switch (known_errorHandler) {
4185	case 1: /* strict */
4186	raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4187	goto onError;
4188	case 2: /* replace */
4189	for (p = collstart; p < collend; ++p)
4190	*output++ = '?';
4191	/* fall through */
4192	case 3: /* ignore */
4193	p = collend;
4194	break;
4195	case 4: /* xmlcharrefreplace */
4196	/* generate replacement (temporarily (mis)uses p) */
4197	for (p = collstart; p < collend; ++p)
4198	output += sprintf(output, "&#%d;", (int)*p);
4199	p = collend;
4200	break;
4201	default:
4202	repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4203	encoding, reason, s, length, &exc,
4204	collstart-s, collend-s, &newpos);
4205	if (repunicode == NULL)
4206	goto onError;
4207	/* generate replacement */
4208	repsize = PyUnicode_GET_SIZE(repunicode);
4209	for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4210	Py_UNICODE ch = *uni2;
4211	if (Py_UNICODE_ISSPACE(ch))
4212	*output++ = ' ';
4213	else {
4214	decimal = Py_UNICODE_TODECIMAL(ch);
4215	if (decimal >= 0)
4216	*output++ = '0' + decimal;
4217	else if (0 < ch && ch < 256)
4218	*output++ = (char)ch;
4219	else {
4220	Py_DECREF(repunicode);
4221	raise_encode_exception(&exc, encoding,
4222	s, length, collstart-s, collend-s, reason);
4223	goto onError;
4224	}
4225	}
4226	}
4227	p = s + newpos;
4228	Py_DECREF(repunicode);
4229	}
4230	}
4231	/* 0-terminate the output string */
4232	*output++ = '\0';
4233	Py_XDECREF(exc);
4234	Py_XDECREF(errorHandler);
4235	return 0;
4236
4237	onError:
4238	Py_XDECREF(exc);
4239	Py_XDECREF(errorHandler);
4240	return -1;
4241	}
4242
4243	/* --- Helpers ------------------------------------------------------------ */
4244
4245	#define STRINGLIB_CHAR Py_UNICODE
4246
4247	#define STRINGLIB_LEN PyUnicode_GET_SIZE
4248	#define STRINGLIB_NEW PyUnicode_FromUnicode
4249	#define STRINGLIB_STR PyUnicode_AS_UNICODE
4250
4251	Py_LOCAL_INLINE(int)
4252	STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4253	{
4254	if (str[0] != other[0])
4255	return 1;
4256	return memcmp((void) str, (void) other, len * sizeof(Py_UNICODE));
4257	}
4258
4259	#define STRINGLIB_EMPTY unicode_empty
4260
4261	#include "stringlib/fastsearch.h"
4262
4263	#include "stringlib/count.h"
4264	#include "stringlib/find.h"
4265	#include "stringlib/partition.h"
4266
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` that must be in scope at
   the point of use; obj must be a pointer to something with a `length`
   member.  A negative start is taken relative to the length and then
   clamped to 0; end is clamped to the length, and a negative end is
   taken relative to the length and then clamped to 0.

   Wrapped in do { } while (0) so that it behaves as a single statement,
   e.g. under an unbraced if/else.  Use it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4279
4280	Py_ssize_t PyUnicode_Count(PyObject *str,
4281	PyObject *substr,
4282	Py_ssize_t start,
4283	Py_ssize_t end)
4284	{
4285	Py_ssize_t result;
4286	PyUnicodeObject* str_obj;
4287	PyUnicodeObject* sub_obj;
4288
4289	str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4290	if (!str_obj)
4291	return -1;
4292	sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4293	if (!sub_obj) {
4294	Py_DECREF(str_obj);
4295	return -1;
4296	}
4297
4298	FIX_START_END(str_obj);
4299
4300	result = stringlib_count(
4301	str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4302	);
4303
4304	Py_DECREF(sub_obj);
4305	Py_DECREF(str_obj);
4306
4307	return result;
4308	}
4309
4310	Py_ssize_t PyUnicode_Find(PyObject *str,
4311	PyObject *sub,
4312	Py_ssize_t start,
4313	Py_ssize_t end,
4314	int direction)
4315	{
4316	Py_ssize_t result;
4317
4318	str = PyUnicode_FromObject(str);
4319	if (!str)
4320	return -2;
4321	sub = PyUnicode_FromObject(sub);
4322	if (!sub) {
4323	Py_DECREF(str);
4324	return -2;
4325	}
4326
4327	if (direction > 0)
4328	result = stringlib_find_slice(
4329	PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4330	PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4331	start, end
4332	);
4333	else
4334	result = stringlib_rfind_slice(
4335	PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336	PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337	start, end
4338	);
4339
4340	Py_DECREF(str);
4341	Py_DECREF(sub);
4342
4343	return result;
4344	}
4345
4346	static
4347	int tailmatch(PyUnicodeObject *self,
4348	PyUnicodeObject *substring,
4349	Py_ssize_t start,
4350	Py_ssize_t end,
4351	int direction)
4352	{
4353	if (substring->length == 0)
4354	return 1;
4355
4356	FIX_START_END(self);
4357
4358	end -= substring->length;
4359	if (end < start)
4360	return 0;
4361
4362	if (direction > 0) {
4363	if (Py_UNICODE_MATCH(self, end, substring))
4364	return 1;
4365	} else {
4366	if (Py_UNICODE_MATCH(self, start, substring))
4367	return 1;
4368	}
4369
4370	return 0;
4371	}
4372
4373	Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4374	PyObject *substr,
4375	Py_ssize_t start,
4376	Py_ssize_t end,
4377	int direction)
4378	{
4379	Py_ssize_t result;
4380
4381	str = PyUnicode_FromObject(str);
4382	if (str == NULL)
4383	return -1;
4384	substr = PyUnicode_FromObject(substr);
4385	if (substr == NULL) {
4386	Py_DECREF(str);
4387	return -1;
4388	}
4389
4390	result = tailmatch((PyUnicodeObject *)str,
4391	(PyUnicodeObject *)substr,
4392	start, end, direction);
4393	Py_DECREF(str);
4394	Py_DECREF(substr);
4395	return result;
4396	}
4397
4398	/* Apply fixfct filter to the Unicode object self and return a
4399	reference to the modified object */
4400
4401	static
4402	PyObject fixup(PyUnicodeObject self,
4403	int (fixfct)(PyUnicodeObject s))
4404	{
4405
4406	PyUnicodeObject *u;
4407
4408	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4409	if (u == NULL)
4410	return NULL;
4411
4412	Py_UNICODE_COPY(u->str, self->str, self->length);
4413
4414	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4415	/* fixfct should return TRUE if it modified the buffer. If
4416	FALSE, return a reference to the original buffer instead
4417	(to save space, not time) */
4418	Py_INCREF(self);
4419	Py_DECREF(u);
4420	return (PyObject*) self;
4421	}
4422	return (PyObject*) u;
4423	}
4424
4425	static
4426	int fixupper(PyUnicodeObject *self)
4427	{
4428	Py_ssize_t len = self->length;
4429	Py_UNICODE *s = self->str;
4430	int status = 0;
4431
4432	while (len-- > 0) {
4433	register Py_UNICODE ch;
4434
4435	ch = Py_UNICODE_TOUPPER(*s);
4436	if (ch != *s) {
4437	status = 1;
4438	*s = ch;
4439	}
4440	s++;
4441	}
4442
4443	return status;
4444	}
4445
4446	static
4447	int fixlower(PyUnicodeObject *self)
4448	{
4449	Py_ssize_t len = self->length;
4450	Py_UNICODE *s = self->str;
4451	int status = 0;
4452
4453	while (len-- > 0) {
4454	register Py_UNICODE ch;
4455
4456	ch = Py_UNICODE_TOLOWER(*s);
4457	if (ch != *s) {
4458	status = 1;
4459	*s = ch;
4460	}
4461	s++;
4462	}
4463
4464	return status;
4465	}
4466
4467	static
4468	int fixswapcase(PyUnicodeObject *self)
4469	{
4470	Py_ssize_t len = self->length;
4471	Py_UNICODE *s = self->str;
4472	int status = 0;
4473
4474	while (len-- > 0) {
4475	if (Py_UNICODE_ISUPPER(*s)) {
4476	s = Py_UNICODE_TOLOWER(s);
4477	status = 1;
4478	} else if (Py_UNICODE_ISLOWER(*s)) {
4479	s = Py_UNICODE_TOUPPER(s);
4480	status = 1;
4481	}
4482	s++;
4483	}
4484
4485	return status;
4486	}
4487
4488	static
4489	int fixcapitalize(PyUnicodeObject *self)
4490	{
4491	Py_ssize_t len = self->length;
4492	Py_UNICODE *s = self->str;
4493	int status = 0;
4494
4495	if (len == 0)
4496	return 0;
4497	if (Py_UNICODE_ISLOWER(*s)) {
4498	s = Py_UNICODE_TOUPPER(s);
4499	status = 1;
4500	}
4501	s++;
4502	while (--len > 0) {
4503	if (Py_UNICODE_ISUPPER(*s)) {
4504	s = Py_UNICODE_TOLOWER(s);
4505	status = 1;
4506	}
4507	s++;
4508	}
4509	return status;
4510	}
4511
4512	static
4513	int fixtitle(PyUnicodeObject *self)
4514	{
4515	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4516	register Py_UNICODE *e;
4517	int previous_is_cased;
4518
4519	/* Shortcut for single character strings */
4520	if (PyUnicode_GET_SIZE(self) == 1) {
4521	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4522	if (*p != ch) {
4523	*p = ch;
4524	return 1;
4525	}
4526	else
4527	return 0;
4528	}
4529
4530	e = p + PyUnicode_GET_SIZE(self);
4531	previous_is_cased = 0;
4532	for (; p < e; p++) {
4533	register const Py_UNICODE ch = *p;
4534
4535	if (previous_is_cased)
4536	*p = Py_UNICODE_TOLOWER(ch);
4537	else
4538	*p = Py_UNICODE_TOTITLE(ch);
4539
4540	if (Py_UNICODE_ISLOWER(ch) \|\|
4541	Py_UNICODE_ISUPPER(ch) \|\|
4542	Py_UNICODE_ISTITLE(ch))
4543	previous_is_cased = 1;
4544	else
4545	previous_is_cased = 0;
4546	}
4547	return 1;
4548	}
4549
4550	PyObject *
4551	PyUnicode_Join(PyObject separator, PyObject seq)
4552	{
4553	PyObject *internal_separator = NULL;
4554	const Py_UNICODE blank = ' ';
4555	const Py_UNICODE *sep = &blank;
4556	Py_ssize_t seplen = 1;
4557	PyUnicodeObject res = NULL; / the result */
4558	Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4559	Py_ssize_t res_used; /* # used bytes */
4560	Py_UNICODE res_p; / pointer to free byte in res's string area */
4561	PyObject fseq; / PySequence_Fast(seq) */
4562	Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
4563	PyObject *item;
4564	Py_ssize_t i;
4565
4566	fseq = PySequence_Fast(seq, "");
4567	if (fseq == NULL) {
4568	return NULL;
4569	}
4570
4571	/* Grrrr. A codec may be invoked to convert str objects to
4572	* Unicode, and so it's possible to call back into Python code
4573	* during PyUnicode_FromObject(), and so it's possible for a sick
4574	* codec to change the size of fseq (if seq is a list). Therefore
4575	* we have to keep refetching the size -- can't assume seqlen
4576	* is invariant.
4577	*/
4578	seqlen = PySequence_Fast_GET_SIZE(fseq);
4579	/* If empty sequence, return u"". */
4580	if (seqlen == 0) {
4581	res = _PyUnicode_New(0); /* empty sequence; return u"" */
4582	goto Done;
4583	}
4584	/* If singleton sequence with an exact Unicode, return that. */
4585	if (seqlen == 1) {
4586	item = PySequence_Fast_GET_ITEM(fseq, 0);
4587	if (PyUnicode_CheckExact(item)) {
4588	Py_INCREF(item);
4589	res = (PyUnicodeObject *)item;
4590	goto Done;
4591	}
4592	}
4593
4594	/* At least two items to join, or one that isn't exact Unicode. */
4595	if (seqlen > 1) {
4596	/* Set up sep and seplen -- they're needed. */
4597	if (separator == NULL) {
4598	sep = &blank;
4599	seplen = 1;
4600	}
4601	else {
4602	internal_separator = PyUnicode_FromObject(separator);
4603	if (internal_separator == NULL)
4604	goto onError;
4605	sep = PyUnicode_AS_UNICODE(internal_separator);
4606	seplen = PyUnicode_GET_SIZE(internal_separator);
4607	/* In case PyUnicode_FromObject() mutated seq. */
4608	seqlen = PySequence_Fast_GET_SIZE(fseq);
4609	}
4610	}
4611
4612	/* Get space. */
4613	res = _PyUnicode_New(res_alloc);
4614	if (res == NULL)
4615	goto onError;
4616	res_p = PyUnicode_AS_UNICODE(res);
4617	res_used = 0;
4618
4619	for (i = 0; i < seqlen; ++i) {
4620	Py_ssize_t itemlen;
4621	Py_ssize_t new_res_used;
4622
4623	item = PySequence_Fast_GET_ITEM(fseq, i);
4624	/* Convert item to Unicode. */
4625	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4626	PyErr_Format(PyExc_TypeError,
4627	"sequence item %zd: expected string or Unicode,"
4628	" %.80s found",
4629	i, item->ob_type->tp_name);
4630	goto onError;
4631	}
4632	item = PyUnicode_FromObject(item);
4633	if (item == NULL)
4634	goto onError;
4635	/* We own a reference to item from here on. */
4636
4637	/* In case PyUnicode_FromObject() mutated seq. */
4638	seqlen = PySequence_Fast_GET_SIZE(fseq);
4639
4640	/* Make sure we have enough space for the separator and the item. */
4641	itemlen = PyUnicode_GET_SIZE(item);
4642	new_res_used = res_used + itemlen;
4643	if (new_res_used < 0)
4644	goto Overflow;
4645	if (i < seqlen - 1) {
4646	new_res_used += seplen;
4647	if (new_res_used < 0)
4648	goto Overflow;
4649	}
4650	if (new_res_used > res_alloc) {
4651	/* double allocated size until it's big enough */
4652	do {
4653	res_alloc += res_alloc;
4654	if (res_alloc <= 0)
4655	goto Overflow;
4656	} while (new_res_used > res_alloc);
4657	if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4658	Py_DECREF(item);
4659	goto onError;
4660	}
4661	res_p = PyUnicode_AS_UNICODE(res) + res_used;
4662	}
4663
4664	/* Copy item, and maybe the separator. */
4665	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4666	res_p += itemlen;
4667	if (i < seqlen - 1) {
4668	Py_UNICODE_COPY(res_p, sep, seplen);
4669	res_p += seplen;
4670	}
4671	Py_DECREF(item);
4672	res_used = new_res_used;
4673	}
4674
4675	/* Shrink res to match the used area; this probably can't fail,
4676	* but it's cheap to check.
4677	*/
4678	if (_PyUnicode_Resize(&res, res_used) < 0)
4679	goto onError;
4680
4681	Done:
4682	Py_XDECREF(internal_separator);
4683	Py_DECREF(fseq);
4684	return (PyObject *)res;
4685
4686	Overflow:
4687	PyErr_SetString(PyExc_OverflowError,
4688	"join() result is too long for a Python string");
4689	Py_DECREF(item);
4690	/* fall through */
4691
4692	onError:
4693	Py_XDECREF(internal_separator);
4694	Py_DECREF(fseq);
4695	Py_XDECREF(res);
4696	return NULL;
4697	}
4698
4699	static
4700	PyUnicodeObject pad(PyUnicodeObject self,
4701	Py_ssize_t left,
4702	Py_ssize_t right,
4703	Py_UNICODE fill)
4704	{
4705	PyUnicodeObject *u;
4706
4707	if (left < 0)
4708	left = 0;
4709	if (right < 0)
4710	right = 0;
4711
4712	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4713	Py_INCREF(self);
4714	return self;
4715	}
4716
4717	u = _PyUnicode_New(left + self->length + right);
4718	if (u) {
4719	if (left)
4720	Py_UNICODE_FILL(u->str, fill, left);
4721	Py_UNICODE_COPY(u->str + left, self->str, self->length);
4722	if (right)
4723	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4724	}
4725
4726	return u;
4727	}
4728
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expansion uses names from the enclosing function: a `str`
   variable to hold the temporary object, a `list` to append to, and an
   `onError` label that is jumped to when object creation or the append
   fails.  The temporary reference is always released.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement, e.g. as the body of an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4739
4740	static
4741	PyObject split_whitespace(PyUnicodeObject self,
4742	PyObject *list,
4743	Py_ssize_t maxcount)
4744	{
4745	register Py_ssize_t i;
4746	register Py_ssize_t j;
4747	Py_ssize_t len = self->length;
4748	PyObject *str;
4749
4750	for (i = j = 0; i < len; ) {
4751	/* find a token */
4752	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4753	i++;
4754	j = i;
4755	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4756	i++;
4757	if (j < i) {
4758	if (maxcount-- <= 0)
4759	break;
4760	SPLIT_APPEND(self->str, j, i);
4761	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4762	i++;
4763	j = i;
4764	}
4765	}
4766	if (j < len) {
4767	SPLIT_APPEND(self->str, j, len);
4768	}
4769	return list;
4770
4771	onError:
4772	Py_DECREF(list);
4773	return NULL;
4774	}
4775
4776	PyObject PyUnicode_Splitlines(PyObject string,
4777	int keepends)
4778	{
4779	register Py_ssize_t i;
4780	register Py_ssize_t j;
4781	Py_ssize_t len;
4782	PyObject *list;
4783	PyObject *str;
4784	Py_UNICODE *data;
4785
4786	string = PyUnicode_FromObject(string);
4787	if (string == NULL)
4788	return NULL;
4789	data = PyUnicode_AS_UNICODE(string);
4790	len = PyUnicode_GET_SIZE(string);
4791
4792	list = PyList_New(0);
4793	if (!list)
4794	goto onError;
4795
4796	for (i = j = 0; i < len; ) {
4797	Py_ssize_t eol;
4798
4799	/* Find a line and append it */
4800	while (i < len && !BLOOM_LINEBREAK(data[i]))
4801	i++;
4802
4803	/* Skip the line break reading CRLF as one line break */
4804	eol = i;
4805	if (i < len) {
4806	if (data[i] == '\r' && i + 1 < len &&
4807	data[i+1] == '\n')
4808	i += 2;
4809	else
4810	i++;
4811	if (keepends)
4812	eol = i;
4813	}
4814	SPLIT_APPEND(data, j, eol);
4815	j = i;
4816	}
4817	if (j < len) {
4818	SPLIT_APPEND(data, j, len);
4819	}
4820
4821	Py_DECREF(string);
4822	return list;
4823
4824	onError:
4825	Py_XDECREF(list);
4826	Py_DECREF(string);
4827	return NULL;
4828	}
4829
4830	static
4831	PyObject split_char(PyUnicodeObject self,
4832	PyObject *list,
4833	Py_UNICODE ch,
4834	Py_ssize_t maxcount)
4835	{
4836	register Py_ssize_t i;
4837	register Py_ssize_t j;
4838	Py_ssize_t len = self->length;
4839	PyObject *str;
4840
4841	for (i = j = 0; i < len; ) {
4842	if (self->str[i] == ch) {
4843	if (maxcount-- <= 0)
4844	break;
4845	SPLIT_APPEND(self->str, j, i);
4846	i = j = i + 1;
4847	} else
4848	i++;
4849	}
4850	if (j <= len) {
4851	SPLIT_APPEND(self->str, j, len);
4852	}
4853	return list;
4854
4855	onError:
4856	Py_DECREF(list);
4857	return NULL;
4858	}
4859
4860	static
4861	PyObject split_substring(PyUnicodeObject self,
4862	PyObject *list,
4863	PyUnicodeObject *substring,
4864	Py_ssize_t maxcount)
4865	{
4866	register Py_ssize_t i;
4867	register Py_ssize_t j;
4868	Py_ssize_t len = self->length;
4869	Py_ssize_t sublen = substring->length;
4870	PyObject *str;
4871
4872	for (i = j = 0; i <= len - sublen; ) {
4873	if (Py_UNICODE_MATCH(self, i, substring)) {
4874	if (maxcount-- <= 0)
4875	break;
4876	SPLIT_APPEND(self->str, j, i);
4877	i = j = i + sublen;
4878	} else
4879	i++;
4880	}
4881	if (j <= len) {
4882	SPLIT_APPEND(self->str, j, len);
4883	}
4884	return list;
4885
4886	onError:
4887	Py_DECREF(list);
4888	return NULL;
4889	}
4890
4891	static
4892	PyObject rsplit_whitespace(PyUnicodeObject self,
4893	PyObject *list,
4894	Py_ssize_t maxcount)
4895	{
4896	register Py_ssize_t i;
4897	register Py_ssize_t j;
4898	Py_ssize_t len = self->length;
4899	PyObject *str;
4900
4901	for (i = j = len - 1; i >= 0; ) {
4902	/* find a token */
4903	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4904	i--;
4905	j = i;
4906	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4907	i--;
4908	if (j > i) {
4909	if (maxcount-- <= 0)
4910	break;
4911	SPLIT_APPEND(self->str, i + 1, j + 1);
4912	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4913	i--;
4914	j = i;
4915	}
4916	}
4917	if (j >= 0) {
4918	SPLIT_APPEND(self->str, 0, j + 1);
4919	}
4920	if (PyList_Reverse(list) < 0)
4921	goto onError;
4922	return list;
4923
4924	onError:
4925	Py_DECREF(list);
4926	return NULL;
4927	}
4928
4929	static
4930	PyObject rsplit_char(PyUnicodeObject self,
4931	PyObject *list,
4932	Py_UNICODE ch,
4933	Py_ssize_t maxcount)
4934	{
4935	register Py_ssize_t i;
4936	register Py_ssize_t j;
4937	Py_ssize_t len = self->length;
4938	PyObject *str;
4939
4940	for (i = j = len - 1; i >= 0; ) {
4941	if (self->str[i] == ch) {
4942	if (maxcount-- <= 0)
4943	break;
4944	SPLIT_APPEND(self->str, i + 1, j + 1);
4945	j = i = i - 1;
4946	} else
4947	i--;
4948	}
4949	if (j >= -1) {
4950	SPLIT_APPEND(self->str, 0, j + 1);
4951	}
4952	if (PyList_Reverse(list) < 0)
4953	goto onError;
4954	return list;
4955
4956	onError:
4957	Py_DECREF(list);
4958	return NULL;
4959	}
4960
4961	static
4962	PyObject rsplit_substring(PyUnicodeObject self,
4963	PyObject *list,
4964	PyUnicodeObject *substring,
4965	Py_ssize_t maxcount)
4966	{
4967	register Py_ssize_t i;
4968	register Py_ssize_t j;
4969	Py_ssize_t len = self->length;
4970	Py_ssize_t sublen = substring->length;
4971	PyObject *str;
4972
4973	for (i = len - sublen, j = len; i >= 0; ) {
4974	if (Py_UNICODE_MATCH(self, i, substring)) {
4975	if (maxcount-- <= 0)
4976	break;
4977	SPLIT_APPEND(self->str, i + sublen, j);
4978	j = i;
4979	i -= sublen;
4980	} else
4981	i--;
4982	}
4983	if (j >= 0) {
4984	SPLIT_APPEND(self->str, 0, j);
4985	}
4986	if (PyList_Reverse(list) < 0)
4987	goto onError;
4988	return list;
4989
4990	onError:
4991	Py_DECREF(list);
4992	return NULL;
4993	}
4994
4995	#undef SPLIT_APPEND
4996
4997	static
4998	PyObject split(PyUnicodeObject self,
4999	PyUnicodeObject *substring,
5000	Py_ssize_t maxcount)
5001	{
5002	PyObject *list;
5003
5004	if (maxcount < 0)
5005	maxcount = PY_SSIZE_T_MAX;
5006
5007	list = PyList_New(0);
5008	if (!list)
5009	return NULL;
5010
5011	if (substring == NULL)
5012	return split_whitespace(self,list,maxcount);
5013
5014	else if (substring->length == 1)
5015	return split_char(self,list,substring->str[0],maxcount);
5016
5017	else if (substring->length == 0) {
5018	Py_DECREF(list);
5019	PyErr_SetString(PyExc_ValueError, "empty separator");
5020	return NULL;
5021	}
5022	else
5023	return split_substring(self,list,substring,maxcount);
5024	}
5025
5026	static
5027	PyObject rsplit(PyUnicodeObject self,
5028	PyUnicodeObject *substring,
5029	Py_ssize_t maxcount)
5030	{
5031	PyObject *list;
5032
5033	if (maxcount < 0)
5034	maxcount = PY_SSIZE_T_MAX;
5035
5036	list = PyList_New(0);
5037	if (!list)
5038	return NULL;
5039
5040	if (substring == NULL)
5041	return rsplit_whitespace(self,list,maxcount);
5042
5043	else if (substring->length == 1)
5044	return rsplit_char(self,list,substring->str[0],maxcount);
5045
5046	else if (substring->length == 0) {
5047	Py_DECREF(list);
5048	PyErr_SetString(PyExc_ValueError, "empty separator");
5049	return NULL;
5050	}
5051	else
5052	return rsplit_substring(self,list,substring,maxcount);
5053	}
5054
5055	static
5056	PyObject replace(PyUnicodeObject self,
5057	PyUnicodeObject *str1,
5058	PyUnicodeObject *str2,
5059	Py_ssize_t maxcount)
5060	{
5061	PyUnicodeObject *u;
5062
5063	if (maxcount < 0)
5064	maxcount = PY_SSIZE_T_MAX;
5065
5066	if (str1->length == str2->length) {
5067	/* same length */
5068	Py_ssize_t i;
5069	if (str1->length == 1) {
5070	/* replace characters */
5071	Py_UNICODE u1, u2;
5072	if (!findchar(self->str, self->length, str1->str[0]))
5073	goto nothing;
5074	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5075	if (!u)
5076	return NULL;
5077	Py_UNICODE_COPY(u->str, self->str, self->length);
5078	u1 = str1->str[0];
5079	u2 = str2->str[0];
5080	for (i = 0; i < u->length; i++)
5081	if (u->str[i] == u1) {
5082	if (--maxcount < 0)
5083	break;
5084	u->str[i] = u2;
5085	}
5086	} else {
5087	i = fastsearch(
5088	self->str, self->length, str1->str, str1->length, FAST_SEARCH
5089	);
5090	if (i < 0)
5091	goto nothing;
5092	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5093	if (!u)
5094	return NULL;
5095	Py_UNICODE_COPY(u->str, self->str, self->length);
5096	while (i <= self->length - str1->length)
5097	if (Py_UNICODE_MATCH(self, i, str1)) {
5098	if (--maxcount < 0)
5099	break;
5100	Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5101	i += str1->length;
5102	} else
5103	i++;
5104	}
5105	} else {
5106
5107	Py_ssize_t n, i, j, e;
5108	Py_ssize_t product, new_size, delta;
5109	Py_UNICODE *p;
5110
5111	/* replace strings */
5112	n = stringlib_count(self->str, self->length, str1->str, str1->length);
5113	if (n > maxcount)
5114	n = maxcount;
5115	if (n == 0)
5116	goto nothing;
5117	/* new_size = self->length + n * (str2->length - str1->length)); */
5118	delta = (str2->length - str1->length);
5119	if (delta == 0) {
5120	new_size = self->length;
5121	} else {
5122	product = n * (str2->length - str1->length);
5123	if ((product / (str2->length - str1->length)) != n) {
5124	PyErr_SetString(PyExc_OverflowError,
5125	"replace string is too long");
5126	return NULL;
5127	}
5128	new_size = self->length + product;
5129	if (new_size < 0) {
5130	PyErr_SetString(PyExc_OverflowError,
5131	"replace string is too long");
5132	return NULL;
5133	}
5134	}
5135	u = _PyUnicode_New(new_size);
5136	if (!u)
5137	return NULL;
5138	i = 0;
5139	p = u->str;
5140	e = self->length - str1->length;
5141	if (str1->length > 0) {
5142	while (n-- > 0) {
5143	/* look for next match */
5144	j = i;
5145	while (j <= e) {
5146	if (Py_UNICODE_MATCH(self, j, str1))
5147	break;
5148	j++;
5149	}
5150	if (j > i) {
5151	if (j > e)
5152	break;
5153	/* copy unchanged part [i:j] */
5154	Py_UNICODE_COPY(p, self->str+i, j-i);
5155	p += j - i;
5156	}
5157	/* copy substitution string */
5158	if (str2->length > 0) {
5159	Py_UNICODE_COPY(p, str2->str, str2->length);
5160	p += str2->length;
5161	}
5162	i = j + str1->length;
5163	}
5164	if (i < self->length)
5165	/* copy tail [i:] */
5166	Py_UNICODE_COPY(p, self->str+i, self->length-i);
5167	} else {
5168	/* interleave */
5169	while (n > 0) {
5170	Py_UNICODE_COPY(p, str2->str, str2->length);
5171	p += str2->length;
5172	if (--n <= 0)
5173	break;
5174	*p++ = self->str[i++];
5175	}
5176	Py_UNICODE_COPY(p, self->str+i, self->length-i);
5177	}
5178	}
5179	return (PyObject *) u;
5180
5181	nothing:
5182	/* nothing to replace; return original string (when possible) */
5183	if (PyUnicode_CheckExact(self)) {
5184	Py_INCREF(self);
5185	return (PyObject *) self;
5186	}
5187	return PyUnicode_FromUnicode(self->str, self->length);
5188	}
5189
5190	/* --- Unicode Object Methods --------------------------------------------- */
5191
5192	PyDoc_STRVAR(title__doc__,
5193	"S.title() -> unicode\n\
5194	\n\
5195	Return a titlecased version of S, i.e. words start with title case\n\
5196	characters, all remaining cased characters have lower case.");
5197
5198	static PyObject*
5199	unicode_title(PyUnicodeObject *self)
5200	{
5201	return fixup(self, fixtitle);
5202	}
5203
5204	PyDoc_STRVAR(capitalize__doc__,
5205	"S.capitalize() -> unicode\n\
5206	\n\
5207	Return a capitalized version of S, i.e. make the first character\n\
5208	have upper case.");
5209
5210	static PyObject*
5211	unicode_capitalize(PyUnicodeObject *self)
5212	{
5213	return fixup(self, fixcapitalize);
5214	}
5215
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, replaces every
   word in the list by its fixcapitalize'd version and joins the words
   again with PyUnicode_Join(NULL, ...).  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5253
5254	/* Argument converter. Coerces to a single unicode character */
5255
5256	static int
5257	convert_uc(PyObject obj, void addr)
5258	{
5259	Py_UNICODE fillcharloc = (Py_UNICODE )addr;
5260	PyObject *uniobj;
5261	Py_UNICODE *unistr;
5262
5263	uniobj = PyUnicode_FromObject(obj);
5264	if (uniobj == NULL) {
5265	PyErr_SetString(PyExc_TypeError,
5266	"The fill character cannot be converted to Unicode");
5267	return 0;
5268	}
5269	if (PyUnicode_GET_SIZE(uniobj) != 1) {
5270	PyErr_SetString(PyExc_TypeError,
5271	"The fill character must be exactly one character long");
5272	Py_DECREF(uniobj);
5273	return 0;
5274	}
5275	unistr = PyUnicode_AS_UNICODE(uniobj);
5276	*fillcharloc = unistr[0];
5277	Py_DECREF(uniobj);
5278	return 1;
5279	}
5280
5281	PyDoc_STRVAR(center__doc__,
5282	"S.center(width[, fillchar]) -> unicode\n\
5283	\n\
5284	Return S centered in a Unicode string of length width. Padding is\n\
5285	done using the specified fill character (default is a space)");
5286
5287	static PyObject *
5288	unicode_center(PyUnicodeObject self, PyObject args)
5289	{
5290	Py_ssize_t marg, left;
5291	Py_ssize_t width;
5292	Py_UNICODE fillchar = ' ';
5293
5294	if (!PyArg_ParseTuple(args, "n\|O&:center", &width, convert_uc, &fillchar))
5295	return NULL;
5296
5297	if (self->length >= width && PyUnicode_CheckExact(self)) {
5298	Py_INCREF(self);
5299	return (PyObject*) self;
5300	}
5301
5302	marg = width - self->length;
5303	left = marg / 2 + (marg & width & 1);
5304
5305	return (PyObject*) pad(self, left, marg - left, fillchar);
5306	}
5307
5308	#if 0
5309
5310	/* This code should go into some future Unicode collation support
5311	module. The basic comparison should compare ordinals on a naive
5312	basis (this is what Java does and thus JPython too). */
5313
5314	/* speedy UTF-16 code point order comparison */
5315	/* gleaned from: */
5316	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5317
5318	static short utf16Fixup[32] =
5319	{
5320	0, 0, 0, 0, 0, 0, 0, 0,
5321	0, 0, 0, 0, 0, 0, 0, 0,
5322	0, 0, 0, 0, 0, 0, 0, 0,
5323	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5324	};
5325
5326	static int
5327	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
5328	{
5329	Py_ssize_t len1, len2;
5330
5331	Py_UNICODE *s1 = str1->str;
5332	Py_UNICODE *s2 = str2->str;
5333
5334	len1 = str1->length;
5335	len2 = str2->length;
5336
5337	while (len1 > 0 && len2 > 0) {
5338	Py_UNICODE c1, c2;
5339
5340	c1 = *s1++;
5341	c2 = *s2++;
5342
5343	if (c1 > (1<<11) * 26)
5344	c1 += utf16Fixup[c1>>11];
5345	if (c2 > (1<<11) * 26)
5346	c2 += utf16Fixup[c2>>11];
5347	/* now c1 and c2 are in UTF-32-compatible order */
5348
5349	if (c1 != c2)
5350	return (c1 < c2) ? -1 : 1;
5351
5352	len1--; len2--;
5353	}
5354
5355	return (len1 < len2) ? -1 : (len1 != len2);
5356	}
5357
5358	#else
5359
5360	static int
5361	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
5362	{
5363	register Py_ssize_t len1, len2;
5364
5365	Py_UNICODE *s1 = str1->str;
5366	Py_UNICODE *s2 = str2->str;
5367
5368	len1 = str1->length;
5369	len2 = str2->length;
5370
5371	while (len1 > 0 && len2 > 0) {
5372	Py_UNICODE c1, c2;
5373
5374	c1 = *s1++;
5375	c2 = *s2++;
5376
5377	if (c1 != c2)
5378	return (c1 < c2) ? -1 : 1;
5379
5380	len1--; len2--;
5381	}
5382
5383	return (len1 < len2) ? -1 : (len1 != len2);
5384	}
5385
5386	#endif
5387
5388	int PyUnicode_Compare(PyObject *left,
5389	PyObject *right)
5390	{
5391	PyUnicodeObject u = NULL, v = NULL;
5392	int result;
5393
5394	/* Coerce the two arguments */
5395	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5396	if (u == NULL)
5397	goto onError;
5398	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5399	if (v == NULL)
5400	goto onError;
5401
5402	/* Shortcut for empty or interned objects */
5403	if (v == u) {
5404	Py_DECREF(u);
5405	Py_DECREF(v);
5406	return 0;
5407	}
5408
5409	result = unicode_compare(u, v);
5410
5411	Py_DECREF(u);
5412	Py_DECREF(v);
5413	return result;
5414
5415	onError:
5416	Py_XDECREF(u);
5417	Py_XDECREF(v);
5418	return -1;
5419	}
5420
5421	PyObject PyUnicode_RichCompare(PyObject left,
5422	PyObject *right,
5423	int op)
5424	{
5425	int result;
5426
5427	result = PyUnicode_Compare(left, right);
5428	if (result == -1 && PyErr_Occurred())
5429	goto onError;
5430
5431	/* Convert the return value to a Boolean */
5432	switch (op) {
5433	case Py_EQ:
5434	result = (result == 0);
5435	break;
5436	case Py_NE:
5437	result = (result != 0);
5438	break;
5439	case Py_LE:
5440	result = (result <= 0);
5441	break;
5442	case Py_GE:
5443	result = (result >= 0);
5444	break;
5445	case Py_LT:
5446	result = (result == -1);
5447	break;
5448	case Py_GT:
5449	result = (result == 1);
5450	break;
5451	}
5452	return PyBool_FromLong(result);
5453
5454	onError:
5455
5456	/* Standard case
5457
5458	Type errors mean that PyUnicode_FromObject() could not convert
5459	one of the arguments (usually the right hand side) to Unicode,
5460	ie. we can't handle the comparison request. However, it is
5461	possible that the other object knows a comparison method, which
5462	is why we return Py_NotImplemented to give the other object a
5463	chance.
5464
5465	*/
5466	if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5467	PyErr_Clear();
5468	Py_INCREF(Py_NotImplemented);
5469	return Py_NotImplemented;
5470	}
5471	if (op != Py_EQ && op != Py_NE)
5472	return NULL;
5473
5474	/* Equality comparison.
5475
5476	This is a special case: we silence any PyExc_UnicodeDecodeError
5477	and instead turn it into a PyErr_UnicodeWarning.
5478
5479	*/
5480	if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5481	return NULL;
5482	PyErr_Clear();
5483	if (PyErr_Warn(PyExc_UnicodeWarning,
5484	(op == Py_EQ) ?
5485	"Unicode equal comparison "
5486	"failed to convert both arguments to Unicode - "
5487	"interpreting them as being unequal" :
5488	"Unicode unequal comparison "
5489	"failed to convert both arguments to Unicode - "
5490	"interpreting them as being unequal"
5491	) < 0)
5492	return NULL;
5493	result = (op == Py_NE);
5494	return PyBool_FromLong(result);
5495	}
5496
5497	int PyUnicode_Contains(PyObject *container,
5498	PyObject *element)
5499	{
5500	PyObject str, sub;
5501	int result;
5502
5503	/* Coerce the two arguments */
5504	sub = PyUnicode_FromObject(element);
5505	if (!sub) {
5506	PyErr_SetString(PyExc_TypeError,
5507	"'in <string>' requires string as left operand");
5508	return -1;
5509	}
5510
5511	str = PyUnicode_FromObject(container);
5512	if (!str) {
5513	Py_DECREF(sub);
5514	return -1;
5515	}
5516
5517	result = stringlib_contains_obj(str, sub);
5518
5519	Py_DECREF(str);
5520	Py_DECREF(sub);
5521
5522	return result;
5523	}
5524
5525	/* Concat to string or Unicode object giving a new Unicode object. */
5526
5527	PyObject PyUnicode_Concat(PyObject left,
5528	PyObject *right)
5529	{
5530	PyUnicodeObject u = NULL, v = NULL, *w;
5531
5532	/* Coerce the two arguments */
5533	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5534	if (u == NULL)
5535	goto onError;
5536	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5537	if (v == NULL)
5538	goto onError;
5539
5540	/* Shortcuts */
5541	if (v == unicode_empty) {
5542	Py_DECREF(v);
5543	return (PyObject *)u;
5544	}
5545	if (u == unicode_empty) {
5546	Py_DECREF(u);
5547	return (PyObject *)v;
5548	}
5549
5550	/* Concat the two Unicode strings */
5551	w = _PyUnicode_New(u->length + v->length);
5552	if (w == NULL)
5553	goto onError;
5554	Py_UNICODE_COPY(w->str, u->str, u->length);
5555	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5556
5557	Py_DECREF(u);
5558	Py_DECREF(v);
5559	return (PyObject *)w;
5560
5561	onError:
5562	Py_XDECREF(u);
5563	Py_XDECREF(v);
5564	return NULL;
5565	}
5566
5567	PyDoc_STRVAR(count__doc__,
5568	"S.count(sub[, start[, end]]) -> int\n\
5569	\n\
5570	Return the number of non-overlapping occurrences of substring sub in\n\
5571	Unicode string S[start:end]. Optional arguments start and end are\n\
5572	interpreted as in slice notation.");
5573
5574	static PyObject *
5575	unicode_count(PyUnicodeObject self, PyObject args)
5576	{
5577	PyUnicodeObject *substring;
5578	Py_ssize_t start = 0;
5579	Py_ssize_t end = PY_SSIZE_T_MAX;
5580	PyObject *result;
5581
5582	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
5583	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5584	return NULL;
5585
5586	substring = (PyUnicodeObject *)PyUnicode_FromObject(
5587	(PyObject *)substring);
5588	if (substring == NULL)
5589	return NULL;
5590
5591	FIX_START_END(self);
5592
5593	result = PyInt_FromSsize_t(
5594	stringlib_count(self->str + start, end - start,
5595	substring->str, substring->length)
5596	);
5597
5598	Py_DECREF(substring);
5599
5600	return result;
5601	}
5602
5603	PyDoc_STRVAR(encode__doc__,
5604	"S.encode([encoding[,errors]]) -> string or unicode\n\
5605	\n\
5606	Encodes S using the codec registered for encoding. encoding defaults\n\
5607	to the default encoding. errors may be given to set a different error\n\
5608	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5609	a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5610	'xmlcharrefreplace' as well as any other name registered with\n\
5611	codecs.register_error that can handle UnicodeEncodeErrors.");
5612
5613	static PyObject *
5614	unicode_encode(PyUnicodeObject self, PyObject args)
5615	{
5616	char *encoding = NULL;
5617	char *errors = NULL;
5618	PyObject *v;
5619
5620	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
5621	return NULL;
5622	v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5623	if (v == NULL)
5624	goto onError;
5625	if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5626	PyErr_Format(PyExc_TypeError,
5627	"encoder did not return a string/unicode object "
5628	"(type=%.400s)",
5629	v->ob_type->tp_name);
5630	Py_DECREF(v);
5631	return NULL;
5632	}
5633	return v;
5634
5635	onError:
5636	return NULL;
5637	}
5638
5639	PyDoc_STRVAR(decode__doc__,
5640	"S.decode([encoding[,errors]]) -> string or unicode\n\
5641	\n\
5642	Decodes S using the codec registered for encoding. encoding defaults\n\
5643	to the default encoding. errors may be given to set a different error\n\
5644	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5645	a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5646	as well as any other name registerd with codecs.register_error that is\n\
5647	able to handle UnicodeDecodeErrors.");
5648
5649	static PyObject *
5650	unicode_decode(PyUnicodeObject self, PyObject args)
5651	{
5652	char *encoding = NULL;
5653	char *errors = NULL;
5654	PyObject *v;
5655
5656	if (!PyArg_ParseTuple(args, "\|ss:decode", &encoding, &errors))
5657	return NULL;
5658	v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5659	if (v == NULL)
5660	goto onError;
5661	if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5662	PyErr_Format(PyExc_TypeError,
5663	"decoder did not return a string/unicode object "
5664	"(type=%.400s)",
5665	v->ob_type->tp_name);
5666	Py_DECREF(v);
5667	return NULL;
5668	}
5669	return v;
5670
5671	onError:
5672	return NULL;
5673	}
5674
5675	PyDoc_STRVAR(expandtabs__doc__,
5676	"S.expandtabs([tabsize]) -> unicode\n\
5677	\n\
5678	Return a copy of S where all tab characters are expanded using spaces.\n\
5679	If tabsize is not given, a tab size of 8 characters is assumed.");
5680
5681	static PyObject*
5682	unicode_expandtabs(PyUnicodeObject self, PyObject args)
5683	{
5684	Py_UNICODE *e;
5685	Py_UNICODE *p;
5686	Py_UNICODE *q;
5687	Py_ssize_t i, j;
5688	PyUnicodeObject *u;
5689	int tabsize = 8;
5690
5691	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
5692	return NULL;
5693
5694	/* First pass: determine size of output string */
5695	i = j = 0;
5696	e = self->str + self->length;
5697	for (p = self->str; p < e; p++)
5698	if (*p == '\t') {
5699	if (tabsize > 0)
5700	j += tabsize - (j % tabsize);
5701	}
5702	else {
5703	j++;
5704	if (p == '\n' \|\| p == '\r') {
5705	i += j;
5706	j = 0;
5707	}
5708	}
5709
5710	/* Second pass: create output string and fill it */
5711	u = _PyUnicode_New(i + j);
5712	if (!u)
5713	return NULL;
5714
5715	j = 0;
5716	q = u->str;
5717
5718	for (p = self->str; p < e; p++)
5719	if (*p == '\t') {
5720	if (tabsize > 0) {
5721	i = tabsize - (j % tabsize);
5722	j += i;
5723	while (i--)
5724	*q++ = ' ';
5725	}
5726	}
5727	else {
5728	j++;
5729	q++ = p;
5730	if (p == '\n' \|\| p == '\r')
5731	j = 0;
5732	}
5733
5734	return (PyObject*) u;
5735	}
5736
5737	PyDoc_STRVAR(find__doc__,
5738	"S.find(sub [,start [,end]]) -> int\n\
5739	\n\
5740	Return the lowest index in S where substring sub is found,\n\
5741	such that sub is contained within s[start,end]. Optional\n\
5742	arguments start and end are interpreted as in slice notation.\n\
5743	\n\
5744	Return -1 on failure.");
5745
5746	static PyObject *
5747	unicode_find(PyUnicodeObject self, PyObject args)
5748	{
5749	PyObject *substring;
5750	Py_ssize_t start = 0;
5751	Py_ssize_t end = PY_SSIZE_T_MAX;
5752	Py_ssize_t result;
5753
5754	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
5755	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5756	return NULL;
5757	substring = PyUnicode_FromObject(substring);
5758	if (!substring)
5759	return NULL;
5760
5761	result = stringlib_find_slice(
5762	PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5763	PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5764	start, end
5765	);
5766
5767	Py_DECREF(substring);
5768
5769	return PyInt_FromSsize_t(result);
5770	}
5771
5772	static PyObject *
5773	unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5774	{
5775	if (index < 0 \|\| index >= self->length) {
5776	PyErr_SetString(PyExc_IndexError, "string index out of range");
5777	return NULL;
5778	}
5779
5780	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5781	}
5782
5783	static long
5784	unicode_hash(PyUnicodeObject *self)
5785	{
5786	/* Since Unicode objects compare equal to their ASCII string
5787	counterparts, they should use the individual character values
5788	as basis for their hash value. This is needed to assure that
5789	strings and Unicode objects behave in the same way as
5790	dictionary keys. */
5791
5792	register Py_ssize_t len;
5793	register Py_UNICODE *p;
5794	register long x;
5795
5796	if (self->hash != -1)
5797	return self->hash;
5798	len = PyUnicode_GET_SIZE(self);
5799	p = PyUnicode_AS_UNICODE(self);
5800	x = *p << 7;
5801	while (--len >= 0)
5802	x = (1000003x) ^ p++;
5803	x ^= PyUnicode_GET_SIZE(self);
5804	if (x == -1)
5805	x = -2;
5806	self->hash = x;
5807	return x;
5808	}
5809
5810	PyDoc_STRVAR(index__doc__,
5811	"S.index(sub [,start [,end]]) -> int\n\
5812	\n\
5813	Like S.find() but raise ValueError when the substring is not found.");
5814
5815	static PyObject *
5816	unicode_index(PyUnicodeObject self, PyObject args)
5817	{
5818	Py_ssize_t result;
5819	PyObject *substring;
5820	Py_ssize_t start = 0;
5821	Py_ssize_t end = PY_SSIZE_T_MAX;
5822
5823	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
5824	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5825	return NULL;
5826	substring = PyUnicode_FromObject(substring);
5827	if (!substring)
5828	return NULL;
5829
5830	result = stringlib_find_slice(
5831	PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5832	PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5833	start, end
5834	);
5835
5836	Py_DECREF(substring);
5837
5838	if (result < 0) {
5839	PyErr_SetString(PyExc_ValueError, "substring not found");
5840	return NULL;
5841	}
5842
5843	return PyInt_FromSsize_t(result);
5844	}
5845
5846	PyDoc_STRVAR(islower__doc__,
5847	"S.islower() -> bool\n\
5848	\n\
5849	Return True if all cased characters in S are lowercase and there is\n\
5850	at least one cased character in S, False otherwise.");
5851
5852	static PyObject*
5853	unicode_islower(PyUnicodeObject *self)
5854	{
5855	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5856	register const Py_UNICODE *e;
5857	int cased;
5858
5859	/* Shortcut for single character strings */
5860	if (PyUnicode_GET_SIZE(self) == 1)
5861	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5862
5863	/* Special case for empty strings */
5864	if (PyUnicode_GET_SIZE(self) == 0)
5865	return PyBool_FromLong(0);
5866
5867	e = p + PyUnicode_GET_SIZE(self);
5868	cased = 0;
5869	for (; p < e; p++) {
5870	register const Py_UNICODE ch = *p;
5871
5872	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
5873	return PyBool_FromLong(0);
5874	else if (!cased && Py_UNICODE_ISLOWER(ch))
5875	cased = 1;
5876	}
5877	return PyBool_FromLong(cased);
5878	}
5879
5880	PyDoc_STRVAR(isupper__doc__,
5881	"S.isupper() -> bool\n\
5882	\n\
5883	Return True if all cased characters in S are uppercase and there is\n\
5884	at least one cased character in S, False otherwise.");
5885
5886	static PyObject*
5887	unicode_isupper(PyUnicodeObject *self)
5888	{
5889	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5890	register const Py_UNICODE *e;
5891	int cased;
5892
5893	/* Shortcut for single character strings */
5894	if (PyUnicode_GET_SIZE(self) == 1)
5895	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5896
5897	/* Special case for empty strings */
5898	if (PyUnicode_GET_SIZE(self) == 0)
5899	return PyBool_FromLong(0);
5900
5901	e = p + PyUnicode_GET_SIZE(self);
5902	cased = 0;
5903	for (; p < e; p++) {
5904	register const Py_UNICODE ch = *p;
5905
5906	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
5907	return PyBool_FromLong(0);
5908	else if (!cased && Py_UNICODE_ISUPPER(ch))
5909	cased = 1;
5910	}
5911	return PyBool_FromLong(cased);
5912	}
5913
5914	PyDoc_STRVAR(istitle__doc__,
5915	"S.istitle() -> bool\n\
5916	\n\
5917	Return True if S is a titlecased string and there is at least one\n\
5918	character in S, i.e. upper- and titlecase characters may only\n\
5919	follow uncased characters and lowercase characters only cased ones.\n\
5920	Return False otherwise.");
5921
5922	static PyObject*
5923	unicode_istitle(PyUnicodeObject *self)
5924	{
5925	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5926	register const Py_UNICODE *e;
5927	int cased, previous_is_cased;
5928
5929	/* Shortcut for single character strings */
5930	if (PyUnicode_GET_SIZE(self) == 1)
5931	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
5932	(Py_UNICODE_ISUPPER(*p) != 0));
5933
5934	/* Special case for empty strings */
5935	if (PyUnicode_GET_SIZE(self) == 0)
5936	return PyBool_FromLong(0);
5937
5938	e = p + PyUnicode_GET_SIZE(self);
5939	cased = 0;
5940	previous_is_cased = 0;
5941	for (; p < e; p++) {
5942	register const Py_UNICODE ch = *p;
5943
5944	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
5945	if (previous_is_cased)
5946	return PyBool_FromLong(0);
5947	previous_is_cased = 1;
5948	cased = 1;
5949	}
5950	else if (Py_UNICODE_ISLOWER(ch)) {
5951	if (!previous_is_cased)
5952	return PyBool_FromLong(0);
5953	previous_is_cased = 1;
5954	cased = 1;
5955	}
5956	else
5957	previous_is_cased = 0;
5958	}
5959	return PyBool_FromLong(cased);
5960	}
5961
5962	PyDoc_STRVAR(isspace__doc__,
5963	"S.isspace() -> bool\n\
5964	\n\
5965	Return True if all characters in S are whitespace\n\
5966	and there is at least one character in S, False otherwise.");
5967
5968	static PyObject*
5969	unicode_isspace(PyUnicodeObject *self)
5970	{
5971	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5972	register const Py_UNICODE *e;
5973
5974	/* Shortcut for single character strings */
5975	if (PyUnicode_GET_SIZE(self) == 1 &&
5976	Py_UNICODE_ISSPACE(*p))
5977	return PyBool_FromLong(1);
5978
5979	/* Special case for empty strings */
5980	if (PyUnicode_GET_SIZE(self) == 0)
5981	return PyBool_FromLong(0);
5982
5983	e = p + PyUnicode_GET_SIZE(self);
5984	for (; p < e; p++) {
5985	if (!Py_UNICODE_ISSPACE(*p))
5986	return PyBool_FromLong(0);
5987	}
5988	return PyBool_FromLong(1);
5989	}
5990
5991	PyDoc_STRVAR(isalpha__doc__,
5992	"S.isalpha() -> bool\n\
5993	\n\
5994	Return True if all characters in S are alphabetic\n\
5995	and there is at least one character in S, False otherwise.");
5996
5997	static PyObject*
5998	unicode_isalpha(PyUnicodeObject *self)
5999	{
6000	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6001	register const Py_UNICODE *e;
6002
6003	/* Shortcut for single character strings */
6004	if (PyUnicode_GET_SIZE(self) == 1 &&
6005	Py_UNICODE_ISALPHA(*p))
6006	return PyBool_FromLong(1);
6007
6008	/* Special case for empty strings */
6009	if (PyUnicode_GET_SIZE(self) == 0)
6010	return PyBool_FromLong(0);
6011
6012	e = p + PyUnicode_GET_SIZE(self);
6013	for (; p < e; p++) {
6014	if (!Py_UNICODE_ISALPHA(*p))
6015	return PyBool_FromLong(0);
6016	}
6017	return PyBool_FromLong(1);
6018	}
6019
6020	PyDoc_STRVAR(isalnum__doc__,
6021	"S.isalnum() -> bool\n\
6022	\n\
6023	Return True if all characters in S are alphanumeric\n\
6024	and there is at least one character in S, False otherwise.");
6025
6026	static PyObject*
6027	unicode_isalnum(PyUnicodeObject *self)
6028	{
6029	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6030	register const Py_UNICODE *e;
6031
6032	/* Shortcut for single character strings */
6033	if (PyUnicode_GET_SIZE(self) == 1 &&
6034	Py_UNICODE_ISALNUM(*p))
6035	return PyBool_FromLong(1);
6036
6037	/* Special case for empty strings */
6038	if (PyUnicode_GET_SIZE(self) == 0)
6039	return PyBool_FromLong(0);
6040
6041	e = p + PyUnicode_GET_SIZE(self);
6042	for (; p < e; p++) {
6043	if (!Py_UNICODE_ISALNUM(*p))
6044	return PyBool_FromLong(0);
6045	}
6046	return PyBool_FromLong(1);
6047	}
6048
6049	PyDoc_STRVAR(isdecimal__doc__,
6050	"S.isdecimal() -> bool\n\
6051	\n\
6052	Return True if there are only decimal characters in S,\n\
6053	False otherwise.");
6054
6055	static PyObject*
6056	unicode_isdecimal(PyUnicodeObject *self)
6057	{
6058	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6059	register const Py_UNICODE *e;
6060
6061	/* Shortcut for single character strings */
6062	if (PyUnicode_GET_SIZE(self) == 1 &&
6063	Py_UNICODE_ISDECIMAL(*p))
6064	return PyBool_FromLong(1);
6065
6066	/* Special case for empty strings */
6067	if (PyUnicode_GET_SIZE(self) == 0)
6068	return PyBool_FromLong(0);
6069
6070	e = p + PyUnicode_GET_SIZE(self);
6071	for (; p < e; p++) {
6072	if (!Py_UNICODE_ISDECIMAL(*p))
6073	return PyBool_FromLong(0);
6074	}
6075	return PyBool_FromLong(1);
6076	}
6077
6078	PyDoc_STRVAR(isdigit__doc__,
6079	"S.isdigit() -> bool\n\
6080	\n\
6081	Return True if all characters in S are digits\n\
6082	and there is at least one character in S, False otherwise.");
6083
6084	static PyObject*
6085	unicode_isdigit(PyUnicodeObject *self)
6086	{
6087	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6088	register const Py_UNICODE *e;
6089
6090	/* Shortcut for single character strings */
6091	if (PyUnicode_GET_SIZE(self) == 1 &&
6092	Py_UNICODE_ISDIGIT(*p))
6093	return PyBool_FromLong(1);
6094
6095	/* Special case for empty strings */
6096	if (PyUnicode_GET_SIZE(self) == 0)
6097	return PyBool_FromLong(0);
6098
6099	e = p + PyUnicode_GET_SIZE(self);
6100	for (; p < e; p++) {
6101	if (!Py_UNICODE_ISDIGIT(*p))
6102	return PyBool_FromLong(0);
6103	}
6104	return PyBool_FromLong(1);
6105	}
6106
6107	PyDoc_STRVAR(isnumeric__doc__,
6108	"S.isnumeric() -> bool\n\
6109	\n\
6110	Return True if there are only numeric characters in S,\n\
6111	False otherwise.");
6112
6113	static PyObject*
6114	unicode_isnumeric(PyUnicodeObject *self)
6115	{
6116	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6117	register const Py_UNICODE *e;
6118
6119	/* Shortcut for single character strings */
6120	if (PyUnicode_GET_SIZE(self) == 1 &&
6121	Py_UNICODE_ISNUMERIC(*p))
6122	return PyBool_FromLong(1);
6123
6124	/* Special case for empty strings */
6125	if (PyUnicode_GET_SIZE(self) == 0)
6126	return PyBool_FromLong(0);
6127
6128	e = p + PyUnicode_GET_SIZE(self);
6129	for (; p < e; p++) {
6130	if (!Py_UNICODE_ISNUMERIC(*p))
6131	return PyBool_FromLong(0);
6132	}
6133	return PyBool_FromLong(1);
6134	}
6135
6136	PyDoc_STRVAR(join__doc__,
6137	"S.join(sequence) -> unicode\n\
6138	\n\
6139	Return a string which is the concatenation of the strings in the\n\
6140	sequence. The separator between elements is S.");
6141
6142	static PyObject*
6143	unicode_join(PyObject self, PyObject data)
6144	{
6145	return PyUnicode_Join(self, data);
6146	}
6147
6148	static Py_ssize_t
6149	unicode_length(PyUnicodeObject *self)
6150	{
6151	return self->length;
6152	}
6153
6154	PyDoc_STRVAR(ljust__doc__,
6155	"S.ljust(width[, fillchar]) -> int\n\
6156	\n\
6157	Return S left justified in a Unicode string of length width. Padding is\n\
6158	done using the specified fill character (default is a space).");
6159
6160	static PyObject *
6161	unicode_ljust(PyUnicodeObject self, PyObject args)
6162	{
6163	Py_ssize_t width;
6164	Py_UNICODE fillchar = ' ';
6165
6166	if (!PyArg_ParseTuple(args, "n\|O&:ljust", &width, convert_uc, &fillchar))
6167	return NULL;
6168
6169	if (self->length >= width && PyUnicode_CheckExact(self)) {
6170	Py_INCREF(self);
6171	return (PyObject*) self;
6172	}
6173
6174	return (PyObject*) pad(self, 0, width - self->length, fillchar);
6175	}
6176
6177	PyDoc_STRVAR(lower__doc__,
6178	"S.lower() -> unicode\n\
6179	\n\
6180	Return a copy of the string S converted to lowercase.");
6181
6182	static PyObject*
6183	unicode_lower(PyUnicodeObject *self)
6184	{
6185	return fixup(self, fixlower);
6186	}
6187
/* Strip modes accepted by the strip helpers below; the values double as
   indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for error messages: skips the 3-character "|O:" prefix of
   the PyArg_ParseTuple format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6196
6197	/* externally visible for str.strip(unicode) */
6198	PyObject *
6199	_PyUnicode_XStrip(PyUnicodeObject self, int striptype, PyObject sepobj)
6200	{
6201	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6202	Py_ssize_t len = PyUnicode_GET_SIZE(self);
6203	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6204	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6205	Py_ssize_t i, j;
6206
6207	BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6208
6209	i = 0;
6210	if (striptype != RIGHTSTRIP) {
6211	while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6212	i++;
6213	}
6214	}
6215
6216	j = len;
6217	if (striptype != LEFTSTRIP) {
6218	do {
6219	j--;
6220	} while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6221	j++;
6222	}
6223
6224	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6225	Py_INCREF(self);
6226	return (PyObject*)self;
6227	}
6228	else
6229	return PyUnicode_FromUnicode(s+i, j-i);
6230	}
6231
6232
6233	static PyObject *
6234	do_strip(PyUnicodeObject *self, int striptype)
6235	{
6236	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6237	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6238
6239	i = 0;
6240	if (striptype != RIGHTSTRIP) {
6241	while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6242	i++;
6243	}
6244	}
6245
6246	j = len;
6247	if (striptype != LEFTSTRIP) {
6248	do {
6249	j--;
6250	} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6251	j++;
6252	}
6253
6254	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6255	Py_INCREF(self);
6256	return (PyObject*)self;
6257	}
6258	else
6259	return PyUnicode_FromUnicode(s+i, j-i);
6260	}
6261
6262
6263	static PyObject *
6264	do_argstrip(PyUnicodeObject self, int striptype, PyObject args)
6265	{
6266	PyObject *sep = NULL;
6267
6268	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6269	return NULL;
6270
6271	if (sep != NULL && sep != Py_None) {
6272	if (PyUnicode_Check(sep))
6273	return _PyUnicode_XStrip(self, striptype, sep);
6274	else if (PyString_Check(sep)) {
6275	PyObject *res;
6276	sep = PyUnicode_FromObject(sep);
6277	if (sep==NULL)
6278	return NULL;
6279	res = _PyUnicode_XStrip(self, striptype, sep);
6280	Py_DECREF(sep);
6281	return res;
6282	}
6283	else {
6284	PyErr_Format(PyExc_TypeError,
6285	"%s arg must be None, unicode or str",
6286	STRIPNAME(striptype));
6287	return NULL;
6288	}
6289	}
6290
6291	return do_strip(self, striptype);
6292	}
6293
6294
6295	PyDoc_STRVAR(strip__doc__,
6296	"S.strip([chars]) -> unicode\n\
6297	\n\
6298	Return a copy of the string S with leading and trailing\n\
6299	whitespace removed.\n\
6300	If chars is given and not None, remove characters in chars instead.\n\
6301	If chars is a str, it will be converted to unicode before stripping");
6302
6303	static PyObject *
6304	unicode_strip(PyUnicodeObject self, PyObject args)
6305	{
6306	if (PyTuple_GET_SIZE(args) == 0)
6307	return do_strip(self, BOTHSTRIP); /* Common case */
6308	else
6309	return do_argstrip(self, BOTHSTRIP, args);
6310	}
6311
6312
6313	PyDoc_STRVAR(lstrip__doc__,
6314	"S.lstrip([chars]) -> unicode\n\
6315	\n\
6316	Return a copy of the string S with leading whitespace removed.\n\
6317	If chars is given and not None, remove characters in chars instead.\n\
6318	If chars is a str, it will be converted to unicode before stripping");
6319
6320	static PyObject *
6321	unicode_lstrip(PyUnicodeObject self, PyObject args)
6322	{
6323	if (PyTuple_GET_SIZE(args) == 0)
6324	return do_strip(self, LEFTSTRIP); /* Common case */
6325	else
6326	return do_argstrip(self, LEFTSTRIP, args);
6327	}
6328
6329
6330	PyDoc_STRVAR(rstrip__doc__,
6331	"S.rstrip([chars]) -> unicode\n\
6332	\n\
6333	Return a copy of the string S with trailing whitespace removed.\n\
6334	If chars is given and not None, remove characters in chars instead.\n\
6335	If chars is a str, it will be converted to unicode before stripping");
6336
6337	static PyObject *
6338	unicode_rstrip(PyUnicodeObject self, PyObject args)
6339	{
6340	if (PyTuple_GET_SIZE(args) == 0)
6341	return do_strip(self, RIGHTSTRIP); /* Common case */
6342	else
6343	return do_argstrip(self, RIGHTSTRIP, args);
6344	}
6345
6346
6347	static PyObject*
6348	unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6349	{
6350	PyUnicodeObject *u;
6351	Py_UNICODE *p;
6352	Py_ssize_t nchars;
6353	size_t nbytes;
6354
6355	if (len < 0)
6356	len = 0;
6357
6358	if (len == 1 && PyUnicode_CheckExact(str)) {
6359	/* no repeat, return original string */
6360	Py_INCREF(str);
6361	return (PyObject*) str;
6362	}
6363
6364	/* ensure # of chars needed doesn't overflow int and # of bytes
6365	* needed doesn't overflow size_t
6366	*/
6367	nchars = len * str->length;
6368	if (len && nchars / len != str->length) {
6369	PyErr_SetString(PyExc_OverflowError,
6370	"repeated string is too long");
6371	return NULL;
6372	}
6373	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6374	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6375	PyErr_SetString(PyExc_OverflowError,
6376	"repeated string is too long");
6377	return NULL;
6378	}
6379	u = _PyUnicode_New(nchars);
6380	if (!u)
6381	return NULL;
6382
6383	p = u->str;
6384
6385	if (str->length == 1 && len > 0) {
6386	Py_UNICODE_FILL(p, str->str[0], len);
6387	} else {
6388	Py_ssize_t done = 0; /* number of characters copied this far */
6389	if (done < nchars) {
6390	Py_UNICODE_COPY(p, str->str, str->length);
6391	done = str->length;
6392	}
6393	while (done < nchars) {
6394	int n = (done <= nchars-done) ? done : nchars-done;
6395	Py_UNICODE_COPY(p+done, p, n);
6396	done += n;
6397	}
6398	}
6399
6400	return (PyObject*) u;
6401	}
6402
6403	PyObject PyUnicode_Replace(PyObject obj,
6404	PyObject *subobj,
6405	PyObject *replobj,
6406	Py_ssize_t maxcount)
6407	{
6408	PyObject *self;
6409	PyObject *str1;
6410	PyObject *str2;
6411	PyObject *result;
6412
6413	self = PyUnicode_FromObject(obj);
6414	if (self == NULL)
6415	return NULL;
6416	str1 = PyUnicode_FromObject(subobj);
6417	if (str1 == NULL) {
6418	Py_DECREF(self);
6419	return NULL;
6420	}
6421	str2 = PyUnicode_FromObject(replobj);
6422	if (str2 == NULL) {
6423	Py_DECREF(self);
6424	Py_DECREF(str1);
6425	return NULL;
6426	}
6427	result = replace((PyUnicodeObject *)self,
6428	(PyUnicodeObject *)str1,
6429	(PyUnicodeObject *)str2,
6430	maxcount);
6431	Py_DECREF(self);
6432	Py_DECREF(str1);
6433	Py_DECREF(str2);
6434	return result;
6435	}
6436
6437	PyDoc_STRVAR(replace__doc__,
6438	"S.replace (old, new[, maxsplit]) -> unicode\n\
6439	\n\
6440	Return a copy of S with all occurrences of substring\n\
6441	old replaced by new. If the optional argument maxsplit is\n\
6442	given, only the first maxsplit occurrences are replaced.");
6443
6444	static PyObject*
6445	unicode_replace(PyUnicodeObject self, PyObject args)
6446	{
6447	PyUnicodeObject *str1;
6448	PyUnicodeObject *str2;
6449	Py_ssize_t maxcount = -1;
6450	PyObject *result;
6451
6452	if (!PyArg_ParseTuple(args, "OO\|n:replace", &str1, &str2, &maxcount))
6453	return NULL;
6454	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
6455	if (str1 == NULL)
6456	return NULL;
6457	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
6458	if (str2 == NULL) {
6459	Py_DECREF(str1);
6460	return NULL;
6461	}
6462
6463	result = replace(self, str1, str2, maxcount);
6464
6465	Py_DECREF(str1);
6466	Py_DECREF(str2);
6467	return result;
6468	}
6469
6470	static
6471	PyObject unicode_repr(PyObject unicode)
6472	{
6473	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6474	PyUnicode_GET_SIZE(unicode),
6475	1);
6476	}
6477
6478	PyDoc_STRVAR(rfind__doc__,
6479	"S.rfind(sub [,start [,end]]) -> int\n\
6480	\n\
6481	Return the highest index in S where substring sub is found,\n\
6482	such that sub is contained within s[start,end]. Optional\n\
6483	arguments start and end are interpreted as in slice notation.\n\
6484	\n\
6485	Return -1 on failure.");
6486
6487	static PyObject *
6488	unicode_rfind(PyUnicodeObject self, PyObject args)
6489	{
6490	PyObject *substring;
6491	Py_ssize_t start = 0;
6492	Py_ssize_t end = PY_SSIZE_T_MAX;
6493	Py_ssize_t result;
6494
6495	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
6496	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6497	return NULL;
6498	substring = PyUnicode_FromObject(substring);
6499	if (!substring)
6500	return NULL;
6501
6502	result = stringlib_rfind_slice(
6503	PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6504	PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6505	start, end
6506	);
6507
6508	Py_DECREF(substring);
6509
6510	return PyInt_FromSsize_t(result);
6511	}
6512
6513	PyDoc_STRVAR(rindex__doc__,
6514	"S.rindex(sub [,start [,end]]) -> int\n\
6515	\n\
6516	Like S.rfind() but raise ValueError when the substring is not found.");
6517
6518	static PyObject *
6519	unicode_rindex(PyUnicodeObject self, PyObject args)
6520	{
6521	PyObject *substring;
6522	Py_ssize_t start = 0;
6523	Py_ssize_t end = PY_SSIZE_T_MAX;
6524	Py_ssize_t result;
6525
6526	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
6527	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6528	return NULL;
6529	substring = PyUnicode_FromObject(substring);
6530	if (!substring)
6531	return NULL;
6532
6533	result = stringlib_rfind_slice(
6534	PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6535	PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6536	start, end
6537	);
6538
6539	Py_DECREF(substring);
6540
6541	if (result < 0) {
6542	PyErr_SetString(PyExc_ValueError, "substring not found");
6543	return NULL;
6544	}
6545	return PyInt_FromSsize_t(result);
6546	}
6547
6548	PyDoc_STRVAR(rjust__doc__,
6549	"S.rjust(width[, fillchar]) -> unicode\n\
6550	\n\
6551	Return S right justified in a Unicode string of length width. Padding is\n\
6552	done using the specified fill character (default is a space).");
6553
6554	static PyObject *
6555	unicode_rjust(PyUnicodeObject self, PyObject args)
6556	{
6557	Py_ssize_t width;
6558	Py_UNICODE fillchar = ' ';
6559
6560	if (!PyArg_ParseTuple(args, "n\|O&:rjust", &width, convert_uc, &fillchar))
6561	return NULL;
6562
6563	if (self->length >= width && PyUnicode_CheckExact(self)) {
6564	Py_INCREF(self);
6565	return (PyObject*) self;
6566	}
6567
6568	return (PyObject*) pad(self, width - self->length, 0, fillchar);
6569	}
6570
6571	static PyObject*
6572	unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6573	{
6574	/* standard clamping */
6575	if (start < 0)
6576	start = 0;
6577	if (end < 0)
6578	end = 0;
6579	if (end > self->length)
6580	end = self->length;
6581	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6582	/* full slice, return original string */
6583	Py_INCREF(self);
6584	return (PyObject*) self;
6585	}
6586	if (start > end)
6587	start = end;
6588	/* copy slice */
6589	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6590	end - start);
6591	}
6592
6593	PyObject PyUnicode_Split(PyObject s,
6594	PyObject *sep,
6595	Py_ssize_t maxsplit)
6596	{
6597	PyObject *result;
6598
6599	s = PyUnicode_FromObject(s);
6600	if (s == NULL)
6601	return NULL;
6602	if (sep != NULL) {
6603	sep = PyUnicode_FromObject(sep);
6604	if (sep == NULL) {
6605	Py_DECREF(s);
6606	return NULL;
6607	}
6608	}
6609
6610	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
6611
6612	Py_DECREF(s);
6613	Py_XDECREF(sep);
6614	return result;
6615	}
6616
6617	PyDoc_STRVAR(split__doc__,
6618	"S.split([sep [,maxsplit]]) -> list of strings\n\
6619	\n\
6620	Return a list of the words in S, using sep as the\n\
6621	delimiter string. If maxsplit is given, at most maxsplit\n\
6622	splits are done. If sep is not specified or is None,\n\
6623	any whitespace string is a separator.");
6624
6625	static PyObject*
6626	unicode_split(PyUnicodeObject self, PyObject args)
6627	{
6628	PyObject *substring = Py_None;
6629	Py_ssize_t maxcount = -1;
6630
6631	if (!PyArg_ParseTuple(args, "\|On:split", &substring, &maxcount))
6632	return NULL;
6633
6634	if (substring == Py_None)
6635	return split(self, NULL, maxcount);
6636	else if (PyUnicode_Check(substring))
6637	return split(self, (PyUnicodeObject *)substring, maxcount);
6638	else
6639	return PyUnicode_Split((PyObject *)self, substring, maxcount);
6640	}
6641
6642	PyObject *
6643	PyUnicode_Partition(PyObject str_in, PyObject sep_in)
6644	{
6645	PyObject* str_obj;
6646	PyObject* sep_obj;
6647	PyObject* out;
6648
6649	str_obj = PyUnicode_FromObject(str_in);
6650	if (!str_obj)
6651	return NULL;
6652	sep_obj = PyUnicode_FromObject(sep_in);
6653	if (!sep_obj) {
6654	Py_DECREF(str_obj);
6655	return NULL;
6656	}
6657
6658	out = stringlib_partition(
6659	str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6660	sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6661	);
6662
6663	Py_DECREF(sep_obj);
6664	Py_DECREF(str_obj);
6665
6666	return out;
6667	}
6668
6669
6670	PyObject *
6671	PyUnicode_RPartition(PyObject str_in, PyObject sep_in)
6672	{
6673	PyObject* str_obj;
6674	PyObject* sep_obj;
6675	PyObject* out;
6676
6677	str_obj = PyUnicode_FromObject(str_in);
6678	if (!str_obj)
6679	return NULL;
6680	sep_obj = PyUnicode_FromObject(sep_in);
6681	if (!sep_obj) {
6682	Py_DECREF(str_obj);
6683	return NULL;
6684	}
6685
6686	out = stringlib_rpartition(
6687	str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6688	sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6689	);
6690
6691	Py_DECREF(sep_obj);
6692	Py_DECREF(str_obj);
6693
6694	return out;
6695	}
6696
6697	PyDoc_STRVAR(partition__doc__,
6698	"S.partition(sep) -> (head, sep, tail)\n\
6699	\n\
6700	Searches for the separator sep in S, and returns the part before it,\n\
6701	the separator itself, and the part after it. If the separator is not\n\
6702	found, returns S and two empty strings.");
6703
6704	static PyObject*
6705	unicode_partition(PyUnicodeObject self, PyObject separator)
6706	{
6707	return PyUnicode_Partition((PyObject *)self, separator);
6708	}
6709
6710	PyDoc_STRVAR(rpartition__doc__,
6711	"S.rpartition(sep) -> (tail, sep, head)\n\
6712	\n\
6713	Searches for the separator sep in S, starting at the end of S, and returns\n\
6714	the part before it, the separator itself, and the part after it. If the\n\
6715	separator is not found, returns two empty strings and S.");
6716
6717	static PyObject*
6718	unicode_rpartition(PyUnicodeObject self, PyObject separator)
6719	{
6720	return PyUnicode_RPartition((PyObject *)self, separator);
6721	}
6722
6723	PyObject PyUnicode_RSplit(PyObject s,
6724	PyObject *sep,
6725	Py_ssize_t maxsplit)
6726	{
6727	PyObject *result;
6728
6729	s = PyUnicode_FromObject(s);
6730	if (s == NULL)
6731	return NULL;
6732	if (sep != NULL) {
6733	sep = PyUnicode_FromObject(sep);
6734	if (sep == NULL) {
6735	Py_DECREF(s);
6736	return NULL;
6737	}
6738	}
6739
6740	result = rsplit((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
6741
6742	Py_DECREF(s);
6743	Py_XDECREF(sep);
6744	return result;
6745	}
6746
6747	PyDoc_STRVAR(rsplit__doc__,
6748	"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6749	\n\
6750	Return a list of the words in S, using sep as the\n\
6751	delimiter string, starting at the end of the string and\n\
6752	working to the front. If maxsplit is given, at most maxsplit\n\
6753	splits are done. If sep is not specified, any whitespace string\n\
6754	is a separator.");
6755
6756	static PyObject*
6757	unicode_rsplit(PyUnicodeObject self, PyObject args)
6758	{
6759	PyObject *substring = Py_None;
6760	Py_ssize_t maxcount = -1;
6761
6762	if (!PyArg_ParseTuple(args, "\|On:rsplit", &substring, &maxcount))
6763	return NULL;
6764
6765	if (substring == Py_None)
6766	return rsplit(self, NULL, maxcount);
6767	else if (PyUnicode_Check(substring))
6768	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6769	else
6770	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6771	}
6772
6773	PyDoc_STRVAR(splitlines__doc__,
6774	"S.splitlines([keepends]]) -> list of strings\n\
6775	\n\
6776	Return a list of the lines in S, breaking at line boundaries.\n\
6777	Line breaks are not included in the resulting list unless keepends\n\
6778	is given and true.");
6779
6780	static PyObject*
6781	unicode_splitlines(PyUnicodeObject self, PyObject args)
6782	{
6783	int keepends = 0;
6784
6785	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
6786	return NULL;
6787
6788	return PyUnicode_Splitlines((PyObject *)self, keepends);
6789	}
6790
6791	static
6792	PyObject unicode_str(PyUnicodeObject self)
6793	{
6794	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6795	}
6796
6797	PyDoc_STRVAR(swapcase__doc__,
6798	"S.swapcase() -> unicode\n\
6799	\n\
6800	Return a copy of S with uppercase characters converted to lowercase\n\
6801	and vice versa.");
6802
6803	static PyObject*
6804	unicode_swapcase(PyUnicodeObject *self)
6805	{
6806	return fixup(self, fixswapcase);
6807	}
6808
6809	PyDoc_STRVAR(translate__doc__,
6810	"S.translate(table) -> unicode\n\
6811	\n\
6812	Return a copy of the string S, where all characters have been mapped\n\
6813	through the given translation table, which must be a mapping of\n\
6814	Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6815	Unmapped characters are left untouched. Characters mapped to None\n\
6816	are deleted.");
6817
6818	static PyObject*
6819	unicode_translate(PyUnicodeObject self, PyObject table)
6820	{
6821	return PyUnicode_TranslateCharmap(self->str,
6822	self->length,
6823	table,
6824	"ignore");
6825	}
6826
6827	PyDoc_STRVAR(upper__doc__,
6828	"S.upper() -> unicode\n\
6829	\n\
6830	Return a copy of S converted to uppercase.");
6831
6832	static PyObject*
6833	unicode_upper(PyUnicodeObject *self)
6834	{
6835	return fixup(self, fixupper);
6836	}
6837
6838	PyDoc_STRVAR(zfill__doc__,
6839	"S.zfill(width) -> unicode\n\
6840	\n\
6841	Pad a numeric string x with zeros on the left, to fill a field\n\
6842	of the specified width. The string x is never truncated.");
6843
6844	static PyObject *
6845	unicode_zfill(PyUnicodeObject self, PyObject args)
6846	{
6847	Py_ssize_t fill;
6848	PyUnicodeObject *u;
6849
6850	Py_ssize_t width;
6851	if (!PyArg_ParseTuple(args, "n:zfill", &width))
6852	return NULL;
6853
6854	if (self->length >= width) {
6855	if (PyUnicode_CheckExact(self)) {
6856	Py_INCREF(self);
6857	return (PyObject*) self;
6858	}
6859	else
6860	return PyUnicode_FromUnicode(
6861	PyUnicode_AS_UNICODE(self),
6862	PyUnicode_GET_SIZE(self)
6863	);
6864	}
6865
6866	fill = width - self->length;
6867
6868	u = pad(self, fill, 0, '0');
6869
6870	if (u == NULL)
6871	return NULL;
6872
6873	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
6874	/* move sign to beginning of string */
6875	u->str[0] = u->str[fill];
6876	u->str[fill] = '0';
6877	}
6878
6879	return (PyObject*) u;
6880	}
6881
#if 0
/* Compiled out: reports the current value of unicode_freelist_size as a
   Python int (see the matching disabled entry in unicode_methods). */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    const long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6889
6890	PyDoc_STRVAR(startswith__doc__,
6891	"S.startswith(prefix[, start[, end]]) -> bool\n\
6892	\n\
6893	Return True if S starts with the specified prefix, False otherwise.\n\
6894	With optional start, test S beginning at that position.\n\
6895	With optional end, stop comparing S at that position.\n\
6896	prefix can also be a tuple of strings to try.");
6897
6898	static PyObject *
6899	unicode_startswith(PyUnicodeObject *self,
6900	PyObject *args)
6901	{
6902	PyObject *subobj;
6903	PyUnicodeObject *substring;
6904	Py_ssize_t start = 0;
6905	Py_ssize_t end = PY_SSIZE_T_MAX;
6906	int result;
6907
6908	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &subobj,
6909	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6910	return NULL;
6911	if (PyTuple_Check(subobj)) {
6912	Py_ssize_t i;
6913	for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6914	substring = (PyUnicodeObject *)PyUnicode_FromObject(
6915	PyTuple_GET_ITEM(subobj, i));
6916	if (substring == NULL)
6917	return NULL;
6918	result = tailmatch(self, substring, start, end, -1);
6919	Py_DECREF(substring);
6920	if (result) {
6921	Py_RETURN_TRUE;
6922	}
6923	}
6924	/* nothing matched */
6925	Py_RETURN_FALSE;
6926	}
6927	substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6928	if (substring == NULL)
6929	return NULL;
6930	result = tailmatch(self, substring, start, end, -1);
6931	Py_DECREF(substring);
6932	return PyBool_FromLong(result);
6933	}
6934
6935
6936	PyDoc_STRVAR(endswith__doc__,
6937	"S.endswith(suffix[, start[, end]]) -> bool\n\
6938	\n\
6939	Return True if S ends with the specified suffix, False otherwise.\n\
6940	With optional start, test S beginning at that position.\n\
6941	With optional end, stop comparing S at that position.\n\
6942	suffix can also be a tuple of strings to try.");
6943
6944	static PyObject *
6945	unicode_endswith(PyUnicodeObject *self,
6946	PyObject *args)
6947	{
6948	PyObject *subobj;
6949	PyUnicodeObject *substring;
6950	Py_ssize_t start = 0;
6951	Py_ssize_t end = PY_SSIZE_T_MAX;
6952	int result;
6953
6954	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &subobj,
6955	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6956	return NULL;
6957	if (PyTuple_Check(subobj)) {
6958	Py_ssize_t i;
6959	for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6960	substring = (PyUnicodeObject *)PyUnicode_FromObject(
6961	PyTuple_GET_ITEM(subobj, i));
6962	if (substring == NULL)
6963	return NULL;
6964	result = tailmatch(self, substring, start, end, +1);
6965	Py_DECREF(substring);
6966	if (result) {
6967	Py_RETURN_TRUE;
6968	}
6969	}
6970	Py_RETURN_FALSE;
6971	}
6972	substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6973	if (substring == NULL)
6974	return NULL;
6975
6976	result = tailmatch(self, substring, start, end, +1);
6977	Py_DECREF(substring);
6978	return PyBool_FromLong(result);
6979	}
6980
6981
6982
6983	static PyObject *
6984	unicode_getnewargs(PyUnicodeObject *v)
6985	{
6986	return Py_BuildValue("(u#)", v->str, v->length);
6987	}
6988
6989
6990	static PyMethodDef unicode_methods[] = {
6991
6992	/* Order is according to common usage: often used methods should
6993	appear first, since lookup is done sequentially. */
6994
6995	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6996	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6997	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
6998	{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
6999	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7000	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7001	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7002	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7003	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7004	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7005	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7006	{"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7007	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7008	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7009	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7010	{"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7011	{"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7012	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7013	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7014	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7015	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7016	{"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7017	{"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7018	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7019	{"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7020	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7021	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7022	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7023	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7024	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7025	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7026	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7027	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7028	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7029	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7030	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7031	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7032	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7033	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7034	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7035	#if 0
7036	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7037	#endif
7038
7039	#if 0
7040	/* This one is just used for debugging the implementation. */
7041	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
7042	#endif
7043
7044	{"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7045	{NULL, NULL}
7046	};
7047
7048	static PyObject *
7049	unicode_mod(PyObject v, PyObject w)
7050	{
7051	if (!PyUnicode_Check(v)) {
7052	Py_INCREF(Py_NotImplemented);
7053	return Py_NotImplemented;
7054	}
7055	return PyUnicode_Format(v, w);
7056	}
7057
7058	static PyNumberMethods unicode_as_number = {
7059	0, /nb_add/
7060	0, /nb_subtract/
7061	0, /nb_multiply/
7062	0, /nb_divide/
7063	unicode_mod, /nb_remainder/
7064	};
7065
7066	static PySequenceMethods unicode_as_sequence = {
7067	(lenfunc) unicode_length, /* sq_length */
7068	PyUnicode_Concat, /* sq_concat */
7069	(ssizeargfunc) unicode_repeat, /* sq_repeat */
7070	(ssizeargfunc) unicode_getitem, /* sq_item */
7071	(ssizessizeargfunc) unicode_slice, /* sq_slice */
7072	0, /* sq_ass_item */
7073	0, /* sq_ass_slice */
7074	PyUnicode_Contains, /* sq_contains */
7075	};
7076
7077	static PyObject*
7078	unicode_subscript(PyUnicodeObject* self, PyObject* item)
7079	{
7080	if (PyIndex_Check(item)) {
7081	Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7082	if (i == -1 && PyErr_Occurred())
7083	return NULL;
7084	if (i < 0)
7085	i += PyUnicode_GET_SIZE(self);
7086	return unicode_getitem(self, i);
7087	} else if (PySlice_Check(item)) {
7088	Py_ssize_t start, stop, step, slicelength, cur, i;
7089	Py_UNICODE* source_buf;
7090	Py_UNICODE* result_buf;
7091	PyObject* result;
7092
7093	if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7094	&start, &stop, &step, &slicelength) < 0) {
7095	return NULL;
7096	}
7097
7098	if (slicelength <= 0) {
7099	return PyUnicode_FromUnicode(NULL, 0);
7100	} else {
7101	source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7102	result_buf = (Py_UNICODE )PyMem_MALLOC(slicelength
7103	sizeof(Py_UNICODE));
7104
7105	if (result_buf == NULL)
7106	return PyErr_NoMemory();
7107
7108	for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7109	result_buf[i] = source_buf[cur];
7110	}
7111
7112	result = PyUnicode_FromUnicode(result_buf, slicelength);
7113	PyMem_FREE(result_buf);
7114	return result;
7115	}
7116	} else {
7117	PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7118	return NULL;
7119	}
7120	}
7121
7122	static PyMappingMethods unicode_as_mapping = {
7123	(lenfunc)unicode_length, /* mp_length */
7124	(binaryfunc)unicode_subscript, /* mp_subscript */
7125	(objobjargproc)0, /* mp_ass_subscript */
7126	};
7127
7128	static Py_ssize_t
7129	unicode_buffer_getreadbuf(PyUnicodeObject *self,
7130	Py_ssize_t index,
7131	const void **ptr)
7132	{
7133	if (index != 0) {
7134	PyErr_SetString(PyExc_SystemError,
7135	"accessing non-existent unicode segment");
7136	return -1;
7137	}
7138	ptr = (void ) self->str;
7139	return PyUnicode_GET_DATA_SIZE(self);
7140	}
7141
7142	static Py_ssize_t
7143	unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7144	const void **ptr)
7145	{
7146	PyErr_SetString(PyExc_TypeError,
7147	"cannot use unicode as modifiable buffer");
7148	return -1;
7149	}
7150
7151	static int
7152	unicode_buffer_getsegcount(PyUnicodeObject *self,
7153	Py_ssize_t *lenp)
7154	{
7155	if (lenp)
7156	*lenp = PyUnicode_GET_DATA_SIZE(self);
7157	return 1;
7158	}
7159
7160	static Py_ssize_t
7161	unicode_buffer_getcharbuf(PyUnicodeObject *self,
7162	Py_ssize_t index,
7163	const void **ptr)
7164	{
7165	PyObject *str;
7166
7167	if (index != 0) {
7168	PyErr_SetString(PyExc_SystemError,
7169	"accessing non-existent unicode segment");
7170	return -1;
7171	}
7172	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7173	if (str == NULL)
7174	return -1;
7175	ptr = (void ) PyString_AS_STRING(str);
7176	return PyString_GET_SIZE(str);
7177	}
7178
7179	/* Helpers for PyUnicode_Format() */
7180
7181	static PyObject *
7182	getnextarg(PyObject args, Py_ssize_t arglen, Py_ssize_t p_argidx)
7183	{
7184	Py_ssize_t argidx = *p_argidx;
7185	if (argidx < arglen) {
7186	(*p_argidx)++;
7187	if (arglen < 0)
7188	return args;
7189	else
7190	return PyTuple_GetItem(args, argidx);
7191	}
7192	PyErr_SetString(PyExc_TypeError,
7193	"not enough arguments for format string");
7194	return NULL;
7195	}
7196
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
7202
7203	static Py_ssize_t
7204	strtounicode(Py_UNICODE buffer, const char charbuffer)
7205	{
7206	register Py_ssize_t i;
7207	Py_ssize_t len = strlen(charbuffer);
7208	for (i = len - 1; i >= 0; i--)
7209	buffer[i] = (Py_UNICODE) charbuffer[i];
7210
7211	return len;
7212	}
7213
7214	static int
7215	doubletounicode(Py_UNICODE buffer, size_t len, const char format, double x)
7216	{
7217	Py_ssize_t result;
7218
7219	PyOS_ascii_formatd((char *)buffer, len, format, x);
7220	result = strtounicode(buffer, (char *)buffer);
7221	return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7222	}
7223
7224	static int
7225	longtounicode(Py_UNICODE buffer, size_t len, const char format, long x)
7226	{
7227	Py_ssize_t result;
7228
7229	PyOS_snprintf((char *)buffer, len, format, x);
7230	result = strtounicode(buffer, (char *)buffer);
7231	return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7232	}
7233
7234	/* XXX To save some code duplication, formatfloat/long/int could have been
7235	shared with stringobject.c, converting from 8-bit to Unicode after the
7236	formatting is done. */
7237
7238	static int
7239	formatfloat(Py_UNICODE *buf,
7240	size_t buflen,
7241	int flags,
7242	int prec,
7243	int type,
7244	PyObject *v)
7245	{
7246	/* fmt = '%#.' + `prec` + `type`
7247	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7248	char fmt[20];
7249	double x;
7250
7251	x = PyFloat_AsDouble(v);
7252	if (x == -1.0 && PyErr_Occurred())
7253	return -1;
7254	if (prec < 0)
7255	prec = 6;
7256	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7257	type = 'g';
7258	/* Worst case length calc to ensure no buffer overrun:
7259
7260	'g' formats:
7261	fmt = %#.<prec>g
7262	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7263	for any double rep.)
7264	len = 1 + prec + 1 + 2 + 5 = 9 + prec
7265
7266	'f' formats:
7267	buf = '-' + [0-9]x + '.' + [0-9]prec (with x < 50)
7268	len = 1 + 50 + 1 + prec = 52 + prec
7269
7270	If prec=0 the effective precision is 1 (the leading digit is
7271	always given), therefore increase the length by one.
7272
7273	*/
7274	if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) \|\|
7275	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7276	PyErr_SetString(PyExc_OverflowError,
7277	"formatted float is too long (precision too large?)");
7278	return -1;
7279	}
7280	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7281	(flags&F_ALT) ? "#" : "",
7282	prec, type);
7283	return doubletounicode(buf, buflen, fmt, x);
7284	}
7285
7286	static PyObject*
7287	formatlong(PyObject *val, int flags, int prec, int type)
7288	{
7289	char *buf;
7290	int i, len;
7291	PyObject str; / temporary string object. */
7292	PyUnicodeObject *result;
7293
7294	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7295	if (!str)
7296	return NULL;
7297	result = _PyUnicode_New(len);
7298	if (!result) {
7299	Py_DECREF(str);
7300	return NULL;
7301	}
7302	for (i = 0; i < len; i++)
7303	result->str[i] = buf[i];
7304	result->str[len] = 0;
7305	Py_DECREF(str);
7306	return (PyObject*)result;
7307	}
7308
7309	static int
7310	formatint(Py_UNICODE *buf,
7311	size_t buflen,
7312	int flags,
7313	int prec,
7314	int type,
7315	PyObject *v)
7316	{
7317	/* fmt = '%#.' + `prec` + 'l' + `type`
7318	* worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7319	* + 1 + 1
7320	* = 24
7321	*/
7322	char fmt[64]; /* plenty big enough! */
7323	char *sign;
7324	long x;
7325
7326	x = PyInt_AsLong(v);
7327	if (x == -1 && PyErr_Occurred())
7328	return -1;
7329	if (x < 0 && type == 'u') {
7330	type = 'd';
7331	}
7332	if (x < 0 && (type == 'x' \|\| type == 'X' \|\| type == 'o'))
7333	sign = "-";
7334	else
7335	sign = "";
7336	if (prec < 0)
7337	prec = 1;
7338
7339	/* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7340	* worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7341	*/
7342	if (buflen <= 14 \|\| buflen <= (size_t)3 + (size_t)prec) {
7343	PyErr_SetString(PyExc_OverflowError,
7344	"formatted integer is too long (precision too large?)");
7345	return -1;
7346	}
7347
7348	if ((flags & F_ALT) &&
7349	(type == 'x' \|\| type == 'X')) {
7350	/* When converting under %#x or %#X, there are a number
7351	* of issues that cause pain:
7352	* - when 0 is being converted, the C standard leaves off
7353	* the '0x' or '0X', which is inconsistent with other
7354	* %#x/%#X conversions and inconsistent with Python's
7355	* hex() function
7356	* - there are platforms that violate the standard and
7357	* convert 0 with the '0x' or '0X'
7358	* (Metrowerks, Compaq Tru64)
7359	* - there are platforms that give '0x' when converting
7360	* under %#X, but convert 0 in accordance with the
7361	* standard (OS/2 EMX)
7362	*
7363	* We can achieve the desired consistency by inserting our
7364	* own '0x' or '0X' prefix, and substituting %x/%X in place
7365	* of %#x/%#X.
7366	*
7367	* Note that this is the same approach as used in
7368	* formatint() in stringobject.c
7369	*/
7370	PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7371	sign, type, prec, type);
7372	}
7373	else {
7374	PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7375	sign, (flags&F_ALT) ? "#" : "",
7376	prec, type);
7377	}
7378	if (sign[0])
7379	return longtounicode(buf, buflen, fmt, -x);
7380	else
7381	return longtounicode(buf, buflen, fmt, x);
7382	}
7383
7384	static int
7385	formatchar(Py_UNICODE *buf,
7386	size_t buflen,
7387	PyObject *v)
7388	{
7389	/* presume that the buffer is at least 2 characters long */
7390	if (PyUnicode_Check(v)) {
7391	if (PyUnicode_GET_SIZE(v) != 1)
7392	goto onError;
7393	buf[0] = PyUnicode_AS_UNICODE(v)[0];
7394	}
7395
7396	else if (PyString_Check(v)) {
7397	if (PyString_GET_SIZE(v) != 1)
7398	goto onError;
7399	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7400	}
7401
7402	else {
7403	/* Integer input truncated to a character */
7404	long x;
7405	x = PyInt_AsLong(v);
7406	if (x == -1 && PyErr_Occurred())
7407	goto onError;
7408	#ifdef Py_UNICODE_WIDE
7409	if (x < 0 \|\| x > 0x10ffff) {
7410	PyErr_SetString(PyExc_OverflowError,
7411	"%c arg not in range(0x110000) "
7412	"(wide Python build)");
7413	return -1;
7414	}
7415	#else
7416	if (x < 0 \|\| x > 0xffff) {
7417	PyErr_SetString(PyExc_OverflowError,
7418	"%c arg not in range(0x10000) "
7419	"(narrow Python build)");
7420	return -1;
7421	}
7422	#endif
7423	buf[0] = (Py_UNICODE) x;
7424	}
7425	buf[1] = '\0';
7426	return 1;
7427
7428	onError:
7429	PyErr_SetString(PyExc_TypeError,
7430	"%c requires int or char");
7431	return -1;
7432	}
7433
7434	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7435
7436	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7437	chars are formatted. XXX This is a magic number. Each formatting
7438	routine does bounds checking to ensure no overflow, but a better
7439	solution may be to malloc a buffer of appropriate size for each
7440	format. For now, the current solution is sufficient.
7441	*/
/* Fully parenthesized so the cast cannot bind to neighbouring tokens at
   the point of expansion. */
#define FORMATBUFLEN ((size_t)120)
7443
7444	PyObject PyUnicode_Format(PyObject format,
7445	PyObject *args)
7446	{
7447	Py_UNICODE fmt, res;
7448	Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7449	int args_owned = 0;
7450	PyUnicodeObject *result = NULL;
7451	PyObject *dict = NULL;
7452	PyObject *uformat;
7453
7454	if (format == NULL \|\| args == NULL) {
7455	PyErr_BadInternalCall();
7456	return NULL;
7457	}
7458	uformat = PyUnicode_FromObject(format);
7459	if (uformat == NULL)
7460	return NULL;
7461	fmt = PyUnicode_AS_UNICODE(uformat);
7462	fmtcnt = PyUnicode_GET_SIZE(uformat);
7463
7464	reslen = rescnt = fmtcnt + 100;
7465	result = _PyUnicode_New(reslen);
7466	if (result == NULL)
7467	goto onError;
7468	res = PyUnicode_AS_UNICODE(result);
7469
7470	if (PyTuple_Check(args)) {
7471	arglen = PyTuple_Size(args);
7472	argidx = 0;
7473	}
7474	else {
7475	arglen = -1;
7476	argidx = -2;
7477	}
7478	if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7479	!PyObject_TypeCheck(args, &PyBaseString_Type))
7480	dict = args;
7481
7482	while (--fmtcnt >= 0) {
7483	if (*fmt != '%') {
7484	if (--rescnt < 0) {
7485	rescnt = fmtcnt + 100;
7486	reslen += rescnt;
7487	if (_PyUnicode_Resize(&result, reslen) < 0)
7488	goto onError;
7489	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7490	--rescnt;
7491	}
7492	res++ = fmt++;
7493	}
7494	else {
7495	/* Got a format specifier */
7496	int flags = 0;
7497	Py_ssize_t width = -1;
7498	int prec = -1;
7499	Py_UNICODE c = '\0';
7500	Py_UNICODE fill;
7501	PyObject *v = NULL;
7502	PyObject *temp = NULL;
7503	Py_UNICODE *pbuf;
7504	Py_UNICODE sign;
7505	Py_ssize_t len;
7506	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7507
7508	fmt++;
7509	if (*fmt == '(') {
7510	Py_UNICODE *keystart;
7511	Py_ssize_t keylen;
7512	PyObject *key;
7513	int pcount = 1;
7514
7515	if (dict == NULL) {
7516	PyErr_SetString(PyExc_TypeError,
7517	"format requires a mapping");
7518	goto onError;
7519	}
7520	++fmt;
7521	--fmtcnt;
7522	keystart = fmt;
7523	/* Skip over balanced parentheses */
7524	while (pcount > 0 && --fmtcnt >= 0) {
7525	if (*fmt == ')')
7526	--pcount;
7527	else if (*fmt == '(')
7528	++pcount;
7529	fmt++;
7530	}
7531	keylen = fmt - keystart - 1;
7532	if (fmtcnt < 0 \|\| pcount > 0) {
7533	PyErr_SetString(PyExc_ValueError,
7534	"incomplete format key");
7535	goto onError;
7536	}
7537	#if 0
7538	/* keys are converted to strings using UTF-8 and
7539	then looked up since Python uses strings to hold
7540	variables names etc. in its namespaces and we
7541	wouldn't want to break common idioms. */
7542	key = PyUnicode_EncodeUTF8(keystart,
7543	keylen,
7544	NULL);
7545	#else
7546	key = PyUnicode_FromUnicode(keystart, keylen);
7547	#endif
7548	if (key == NULL)
7549	goto onError;
7550	if (args_owned) {
7551	Py_DECREF(args);
7552	args_owned = 0;
7553	}
7554	args = PyObject_GetItem(dict, key);
7555	Py_DECREF(key);
7556	if (args == NULL) {
7557	goto onError;
7558	}
7559	args_owned = 1;
7560	arglen = -1;
7561	argidx = -2;
7562	}
7563	while (--fmtcnt >= 0) {
7564	switch (c = *fmt++) {
7565	case '-': flags \|= F_LJUST; continue;
7566	case '+': flags \|= F_SIGN; continue;
7567	case ' ': flags \|= F_BLANK; continue;
7568	case '#': flags \|= F_ALT; continue;
7569	case '0': flags \|= F_ZERO; continue;
7570	}
7571	break;
7572	}
7573	if (c == '*') {
7574	v = getnextarg(args, arglen, &argidx);
7575	if (v == NULL)
7576	goto onError;
7577	if (!PyInt_Check(v)) {
7578	PyErr_SetString(PyExc_TypeError,
7579	"* wants int");
7580	goto onError;
7581	}
7582	width = PyInt_AsLong(v);
7583	if (width < 0) {
7584	flags \|= F_LJUST;
7585	width = -width;
7586	}
7587	if (--fmtcnt >= 0)
7588	c = *fmt++;
7589	}
7590	else if (c >= '0' && c <= '9') {
7591	width = c - '0';
7592	while (--fmtcnt >= 0) {
7593	c = *fmt++;
7594	if (c < '0' \|\| c > '9')
7595	break;
7596	if ((width*10) / 10 != width) {
7597	PyErr_SetString(PyExc_ValueError,
7598	"width too big");
7599	goto onError;
7600	}
7601	width = width*10 + (c - '0');
7602	}
7603	}
7604	if (c == '.') {
7605	prec = 0;
7606	if (--fmtcnt >= 0)
7607	c = *fmt++;
7608	if (c == '*') {
7609	v = getnextarg(args, arglen, &argidx);
7610	if (v == NULL)
7611	goto onError;
7612	if (!PyInt_Check(v)) {
7613	PyErr_SetString(PyExc_TypeError,
7614	"* wants int");
7615	goto onError;
7616	}
7617	prec = PyInt_AsLong(v);
7618	if (prec < 0)
7619	prec = 0;
7620	if (--fmtcnt >= 0)
7621	c = *fmt++;
7622	}
7623	else if (c >= '0' && c <= '9') {
7624	prec = c - '0';
7625	while (--fmtcnt >= 0) {
7626	c = Py_CHARMASK(*fmt++);
7627	if (c < '0' \|\| c > '9')
7628	break;
7629	if ((prec*10) / 10 != prec) {
7630	PyErr_SetString(PyExc_ValueError,
7631	"prec too big");
7632	goto onError;
7633	}
7634	prec = prec*10 + (c - '0');
7635	}
7636	}
7637	} /* prec */
7638	if (fmtcnt >= 0) {
7639	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
7640	if (--fmtcnt >= 0)
7641	c = *fmt++;
7642	}
7643	}
7644	if (fmtcnt < 0) {
7645	PyErr_SetString(PyExc_ValueError,
7646	"incomplete format");
7647	goto onError;
7648	}
7649	if (c != '%') {
7650	v = getnextarg(args, arglen, &argidx);
7651	if (v == NULL)
7652	goto onError;
7653	}
7654	sign = 0;
7655	fill = ' ';
7656	switch (c) {
7657
7658	case '%':
7659	pbuf = formatbuf;
7660	/* presume that buffer length is at least 1 */
7661	pbuf[0] = '%';
7662	len = 1;
7663	break;
7664
7665	case 's':
7666	case 'r':
7667	if (PyUnicode_Check(v) && c == 's') {
7668	temp = v;
7669	Py_INCREF(temp);
7670	}
7671	else {
7672	PyObject *unicode;
7673	if (c == 's')
7674	temp = PyObject_Unicode(v);
7675	else
7676	temp = PyObject_Repr(v);
7677	if (temp == NULL)
7678	goto onError;
7679	if (PyUnicode_Check(temp))
7680	/* nothing to do */;
7681	else if (PyString_Check(temp)) {
7682	/* convert to string to Unicode */
7683	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7684	PyString_GET_SIZE(temp),
7685	NULL,
7686	"strict");
7687	Py_DECREF(temp);
7688	temp = unicode;
7689	if (temp == NULL)
7690	goto onError;
7691	}
7692	else {
7693	Py_DECREF(temp);
7694	PyErr_SetString(PyExc_TypeError,
7695	"%s argument has non-string str()");
7696	goto onError;
7697	}
7698	}
7699	pbuf = PyUnicode_AS_UNICODE(temp);
7700	len = PyUnicode_GET_SIZE(temp);
7701	if (prec >= 0 && len > prec)
7702	len = prec;
7703	break;
7704
7705	case 'i':
7706	case 'd':
7707	case 'u':
7708	case 'o':
7709	case 'x':
7710	case 'X':
7711	if (c == 'i')
7712	c = 'd';
7713	if (PyLong_Check(v)) {
7714	temp = formatlong(v, flags, prec, c);
7715	if (!temp)
7716	goto onError;
7717	pbuf = PyUnicode_AS_UNICODE(temp);
7718	len = PyUnicode_GET_SIZE(temp);
7719	sign = 1;
7720	}
7721	else {
7722	pbuf = formatbuf;
7723	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7724	flags, prec, c, v);
7725	if (len < 0)
7726	goto onError;
7727	sign = 1;
7728	}
7729	if (flags & F_ZERO)
7730	fill = '0';
7731	break;
7732
7733	case 'e':
7734	case 'E':
7735	case 'f':
7736	case 'F':
7737	case 'g':
7738	case 'G':
7739	if (c == 'F')
7740	c = 'f';
7741	pbuf = formatbuf;
7742	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7743	flags, prec, c, v);
7744	if (len < 0)
7745	goto onError;
7746	sign = 1;
7747	if (flags & F_ZERO)
7748	fill = '0';
7749	break;
7750
7751	case 'c':
7752	pbuf = formatbuf;
7753	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7754	if (len < 0)
7755	goto onError;
7756	break;
7757
7758	default:
7759	PyErr_Format(PyExc_ValueError,
7760	"unsupported format character '%c' (0x%x) "
7761	"at index %i",
7762	(31<=c && c<=126) ? (char)c : '?',
7763	(int)c,
7764	(int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7765	goto onError;
7766	}
7767	if (sign) {
7768	if (pbuf == '-' \|\| pbuf == '+') {
7769	sign = *pbuf++;
7770	len--;
7771	}
7772	else if (flags & F_SIGN)
7773	sign = '+';
7774	else if (flags & F_BLANK)
7775	sign = ' ';
7776	else
7777	sign = 0;
7778	}
7779	if (width < len)
7780	width = len;
7781	if (rescnt - (sign != 0) < width) {
7782	reslen -= rescnt;
7783	rescnt = width + fmtcnt + 100;
7784	reslen += rescnt;
7785	if (reslen < 0) {
7786	Py_XDECREF(temp);
7787	PyErr_NoMemory();
7788	goto onError;
7789	}
7790	if (_PyUnicode_Resize(&result, reslen) < 0) {
7791	Py_XDECREF(temp);
7792	goto onError;
7793	}
7794	res = PyUnicode_AS_UNICODE(result)
7795	+ reslen - rescnt;
7796	}
7797	if (sign) {
7798	if (fill != ' ')
7799	*res++ = sign;
7800	rescnt--;
7801	if (width > len)
7802	width--;
7803	}
7804	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
7805	assert(pbuf[0] == '0');
7806	assert(pbuf[1] == c);
7807	if (fill != ' ') {
7808	res++ = pbuf++;
7809	res++ = pbuf++;
7810	}
7811	rescnt -= 2;
7812	width -= 2;
7813	if (width < 0)
7814	width = 0;
7815	len -= 2;
7816	}
7817	if (width > len && !(flags & F_LJUST)) {
7818	do {
7819	--rescnt;
7820	*res++ = fill;
7821	} while (--width > len);
7822	}
7823	if (fill == ' ') {
7824	if (sign)
7825	*res++ = sign;
7826	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
7827	assert(pbuf[0] == '0');
7828	assert(pbuf[1] == c);
7829	res++ = pbuf++;
7830	res++ = pbuf++;
7831	}
7832	}
7833	Py_UNICODE_COPY(res, pbuf, len);
7834	res += len;
7835	rescnt -= len;
7836	while (--width >= len) {
7837	--rescnt;
7838	*res++ = ' ';
7839	}
7840	if (dict && (argidx < arglen) && c != '%') {
7841	PyErr_SetString(PyExc_TypeError,
7842	"not all arguments converted during string formatting");
7843	Py_XDECREF(temp);
7844	goto onError;
7845	}
7846	Py_XDECREF(temp);
7847	} /* '%' */
7848	} /* until end */
7849	if (argidx < arglen && !dict) {
7850	PyErr_SetString(PyExc_TypeError,
7851	"not all arguments converted during string formatting");
7852	goto onError;
7853	}
7854
7855	if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7856	goto onError;
7857	if (args_owned) {
7858	Py_DECREF(args);
7859	}
7860	Py_DECREF(uformat);
7861	return (PyObject *)result;
7862
7863	onError:
7864	Py_XDECREF(result);
7865	Py_DECREF(uformat);
7866	if (args_owned) {
7867	Py_DECREF(args);
7868	}
7869	return NULL;
7870	}
7871
7872	static PyBufferProcs unicode_as_buffer = {
7873	(readbufferproc) unicode_buffer_getreadbuf,
7874	(writebufferproc) unicode_buffer_getwritebuf,
7875	(segcountproc) unicode_buffer_getsegcount,
7876	(charbufferproc) unicode_buffer_getcharbuf,
7877	};
7878
7879	static PyObject *
7880	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
7881
7882	static PyObject *
7883	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
7884	{
7885	PyObject *x = NULL;
7886	static char *kwlist[] = {"string", "encoding", "errors", 0};
7887	char *encoding = NULL;
7888	char *errors = NULL;
7889
7890	if (type != &PyUnicode_Type)
7891	return unicode_subtype_new(type, args, kwds);
7892	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
7893	kwlist, &x, &encoding, &errors))
7894	return NULL;
7895	if (x == NULL)
7896	return (PyObject *)_PyUnicode_New(0);
7897	if (encoding == NULL && errors == NULL)
7898	return PyObject_Unicode(x);
7899	else
7900	return PyUnicode_FromEncodedObject(x, encoding, errors);
7901	}
7902
7903	static PyObject *
7904	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
7905	{
7906	PyUnicodeObject tmp, pnew;
7907	Py_ssize_t n;
7908
7909	assert(PyType_IsSubtype(type, &PyUnicode_Type));
7910	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7911	if (tmp == NULL)
7912	return NULL;
7913	assert(PyUnicode_Check(tmp));
7914	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7915	if (pnew == NULL) {
7916	Py_DECREF(tmp);
7917	return NULL;
7918	}
7919	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7920	if (pnew->str == NULL) {
7921	_Py_ForgetReference((PyObject *)pnew);
7922	PyObject_Del(pnew);
7923	Py_DECREF(tmp);
7924	return PyErr_NoMemory();
7925	}
7926	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7927	pnew->length = n;
7928	pnew->hash = tmp->hash;
7929	Py_DECREF(tmp);
7930	return (PyObject *)pnew;
7931	}
7932
7933	PyDoc_STRVAR(unicode_doc,
7934	"unicode(string [, encoding[, errors]]) -> object\n\
7935	\n\
7936	Create a new Unicode object from the given encoded string.\n\
7937	encoding defaults to the current default string encoding.\n\
7938	errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7939
7940	PyTypeObject PyUnicode_Type = {
7941	PyObject_HEAD_INIT(&PyType_Type)
7942	0, /* ob_size */
7943	"unicode", /* tp_name */
7944	sizeof(PyUnicodeObject), /* tp_size */
7945	0, /* tp_itemsize */
7946	/* Slots */
7947	(destructor)unicode_dealloc, /* tp_dealloc */
7948	0, /* tp_print */
7949	0, /* tp_getattr */
7950	0, /* tp_setattr */
7951	0, /* tp_compare */
7952	unicode_repr, /* tp_repr */
7953	&unicode_as_number, /* tp_as_number */
7954	&unicode_as_sequence, /* tp_as_sequence */
7955	&unicode_as_mapping, /* tp_as_mapping */
7956	(hashfunc) unicode_hash, /* tp_hash*/
7957	0, /* tp_call*/
7958	(reprfunc) unicode_str, /* tp_str */
7959	PyObject_GenericGetAttr, /* tp_getattro */
7960	0, /* tp_setattro */
7961	&unicode_as_buffer, /* tp_as_buffer */
7962	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_CHECKTYPES \|
7963	Py_TPFLAGS_BASETYPE, /* tp_flags */
7964	unicode_doc, /* tp_doc */
7965	0, /* tp_traverse */
7966	0, /* tp_clear */
7967	PyUnicode_RichCompare, /* tp_richcompare */
7968	0, /* tp_weaklistoffset */
7969	0, /* tp_iter */
7970	0, /* tp_iternext */
7971	unicode_methods, /* tp_methods */
7972	0, /* tp_members */
7973	0, /* tp_getset */
7974	&PyBaseString_Type, /* tp_base */
7975	0, /* tp_dict */
7976	0, /* tp_descr_get */
7977	0, /* tp_descr_set */
7978	0, /* tp_dictoffset */
7979	0, /* tp_init */
7980	0, /* tp_alloc */
7981	unicode_new, /* tp_new */
7982	PyObject_Del, /* tp_free */
7983	};
7984
7985	/* Initialize the Unicode implementation */
7986
7987	void _PyUnicode_Init(void)
7988	{
7989	int i;
7990
7991	/* XXX - move this array to unicodectype.c ? */
7992	Py_UNICODE linebreak[] = {
7993	0x000A, /* LINE FEED */
7994	0x000D, /* CARRIAGE RETURN */
7995	0x001C, /* FILE SEPARATOR */
7996	0x001D, /* GROUP SEPARATOR */
7997	0x001E, /* RECORD SEPARATOR */
7998	0x0085, /* NEXT LINE */
7999	0x2028, /* LINE SEPARATOR */
8000	0x2029, /* PARAGRAPH SEPARATOR */
8001	};
8002
8003	/* Init the implementation */
8004	unicode_freelist = NULL;
8005	unicode_freelist_size = 0;
8006	unicode_empty = _PyUnicode_New(0);
8007	if (!unicode_empty)
8008	return;
8009
8010	strcpy(unicode_default_encoding, "ascii");
8011	for (i = 0; i < 256; i++)
8012	unicode_latin1[i] = NULL;
8013	if (PyType_Ready(&PyUnicode_Type) < 0)
8014	Py_FatalError("Can't initialize 'unicode'");
8015
8016	/* initialize the linebreak bloom filter */
8017	bloom_linebreak = make_bloom_mask(
8018	linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8019	);
8020
8021	PyType_Ready(&EncodingMapType);
8022	}
8023
8024	/* Finalize the Unicode implementation */
8025
8026	void
8027	_PyUnicode_Fini(void)
8028	{
8029	PyUnicodeObject *u;
8030	int i;
8031
8032	Py_XDECREF(unicode_empty);
8033	unicode_empty = NULL;
8034
8035	for (i = 0; i < 256; i++) {
8036	if (unicode_latin1[i]) {
8037	Py_DECREF(unicode_latin1[i]);
8038	unicode_latin1[i] = NULL;
8039	}
8040	}
8041
8042	for (u = unicode_freelist; u != NULL;) {
8043	PyUnicodeObject *v = u;
8044	u = (PyUnicodeObject *)u;
8045	if (v->str)
8046	PyMem_DEL(v->str);
8047	Py_XDECREF(v->defenc);
8048	PyObject_Del(v);
8049	}
8050	unicode_freelist = NULL;
8051	unicode_freelist_size = 0;
8052	}
8053
8054	#ifdef __cplusplus
8055	}
8056	#endif
8057
8058
8059	/*
8060	Local variables:
8061	c-basic-offset: 4
8062	indent-tabs-mode: nil
8063	End:
8064	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/python/2.5/Objects/unicodeobject.c

Download in other formats: