Context Navigation

← Previous Change
Next Change →

unicodeobject.c

Timestamp:

Mar 19, 2014, 11:11:30 AM (11 years ago)

Author:

dmik

Message:

python: Update vendor to 2.7.6.

File:

: 1 edited

python/vendor/current/Objects/unicodeobject.c (modified) (112 diffs)

Legend:

: Unmodified
: Added
: Removed

python/vendor/current/Objects/unicodeobject.c

-              r2
+              r388
 /* --- Globals ------------------------------------------------------------
+   The globals are initialized by the _PyUnicode_Init() API and should
+   not be used before calling that API.
+NOTE: In the interpreter's initialization phase, some globals are currently
+      initialized dynamically as needed. In the process Unicode objects may
+      be created before the Unicode type is ready.
 */
 …
 /* Free list for Unicode objects */
 static PyUnicodeObject *free_list;
 static int numfree;
+static PyUnicodeObject *free_list = NULL;
+static int numfree = 0;
 /* The empty Unicode object is shared to improve performance. */
+static PyUnicodeObject *unicode_empty;
+static PyUnicodeObject *unicode_empty = NULL;
+#define _Py_RETURN_UNICODE_EMPTY()                      \
+    do {                                                \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        else {                                          \
+            unicode_empty = _PyUnicode_New(0);          \
+            if (unicode_empty != NULL)                  \
+                Py_INCREF(unicode_empty);               \
+        }                                               \
+        return (PyObject *)unicode_empty;               \
+    } while (0)
 /* Single character Unicode strings in the Latin-1 range are being
    shared as well. */
 static PyUnicodeObject *unicode_latin1[256];
+static PyUnicodeObject *unicode_latin1[256] = {NULL};
 /* Default encoding to use and assume when NULL is passed as encoding
 …
 */
 static char unicode_default_encoding[100];
+static char unicode_default_encoding[100 + 1] = "ascii";
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
 , 0, 0, 0, 0, 0, 0, 0,
 /*     case 0x0009: * HORIZONTAL TABULATION */
+/*     case 0x0009: * CHARACTER TABULATION */
 /*     case 0x000A: * LINE FEED */
 /*     case 0x000B: * VERTICAL TABULATION */
+/*     case 0x000B: * LINE TABULATION */
 /*     case 0x000C: * FORM FEED */
 /*     case 0x000D: * CARRIAGE RETURN */
 …
 , 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
+/*         0x000B, * LINE TABULATION */
+/*         0x000C, * FORM FEED */
 /*         0x000D, * CARRIAGE RETURN */
 , 0, 1, 0, 0, 1, 0, 0,
+, 0, 1, 1, 1, 1, 0, 0,
 , 0, 0, 0, 0, 0, 0, 0,
 /*         0x001C, * FILE SEPARATOR */
 …
 /* the linebreak mask is set up by Unicode_Init below */
+#if LONG_BIT >= 128
+#define BLOOM_WIDTH 128
+#elif LONG_BIT >= 64
+#define BLOOM_WIDTH 64
+#elif LONG_BIT >= 32
+#define BLOOM_WIDTH 32
+#else
+#error "LONG_BIT is smaller than 32"
+#endif
 #define BLOOM_MASK unsigned long
+static BLOOM_MASK bloom_linebreak;
+#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
+#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
+#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 #define BLOOM_LINEBREAK(ch)                                             \
 …
     /* calculate simple bloom-style bitmask for a given unicode string */
     long mask;
+    BLOOM_MASK mask;
     Py_ssize_t i;
     mask = 0;
     for (i = 0; i < len; i++)
         mask |= (1 << (ptr[i] & 0x1F));
+        BLOOM_ADD(mask, ptr[i]);
     return mask;
 …
     /* Reset the object caches */
     if (unicode->defenc) {
+        Py_DECREF(unicode->defenc);
+        unicode->defenc = NULL;
+        Py_CLEAR(unicode->defenc);
+    }
     unicode->hash = -1;
 …
 /* We allocate one more byte to make sure the string is
    Ux0000 terminated -- XXX is this needed ?
+   Ux0000 terminated; some code relies on that.
    XXX This allocator could further be enhanced by assuring that the
 …
+        }
         if (unicode->defenc) {
+            Py_DECREF(unicode->defenc);
+            unicode->defenc = NULL;
+            Py_CLEAR(unicode->defenc);
+        }
         /* Add to free list */
 …
         /* Optimization for empty strings */
+        if (size == 0 && unicode_empty != NULL) {
+            Py_INCREF(unicode_empty);
+            return (PyObject *)unicode_empty;
+        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
         /* Single character Unicode objects in the Latin-1 range are
 …
         /* Optimization for empty strings */
+        if (size == 0 && unicode_empty != NULL) {
+            Py_INCREF(unicode_empty);
+            return (PyObject *)unicode_empty;
+        }
+        if (size == 0)
+            _Py_RETURN_UNICODE_EMPTY();
         /* Single characters are shared when using this constructor.
 …
+}
+/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
+ * to by 'ptr', possibly combining surrogate pairs on narrow builds.
+ * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
+ * that should be returned and 'end' pointing to the end of the buffer.
+ * ('end' is used on narrow builds to detect a lone surrogate at the
+ * end of the buffer that should be returned unchanged.)
+ * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
+ * The type of the returned char is always Py_UCS4.
+ *
+ * Note: the macro advances ptr to next char, so it might have side-effects
+ *       (especially if used with other macros).
+ */
+/* helper macros used by _Py_UNICODE_NEXT */
+#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
+#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+/* Join two surrogate characters and return a single Py_UCS4 value. */
+#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
+    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
+      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
+#ifdef Py_UNICODE_WIDE
+#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
+#else
+#define _Py_UNICODE_NEXT(ptr, end)                                      \
+     (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) &&      \
+        _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                       \
+       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
+       (Py_UCS4)*(ptr)++)
+#endif
 #ifdef HAVE_WCHAR_H
+#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
+# define CONVERT_WCHAR_TO_SURROGATES
+#endif
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
+   to convert from UTF32 to UTF16.
+
+   Builds a new unicode object from the first 'size' elements of 'w'.
+   Each element greater than 0xFFFF is written as two Py_UNICODE units
+   (a 0xD800-based high half followed by a 0xDC00-based low half);
+   every other element is stored as a single unit.
+
+   Returns the new object cast to PyObject *, or NULL when 'w' is NULL
+   (after calling PyErr_BadInternalCall()) or when _PyUnicode_New()
+   returns NULL.
+
+   NOTE(review): elements are not checked against 0x10FFFF here, so
+   'ordinal >> 10' is not limited to 10 bits for larger inputs --
+   confirm whether callers guarantee the range. */
+PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
+                                 Py_ssize_t size)
+{
+    PyUnicodeObject *unicode;
+    register Py_ssize_t i;
+    Py_ssize_t alloc;
+    const wchar_t *orig_w;
+    if (w == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+    alloc = size;
+    orig_w = w;  /* remembered so 'w' can be rewound after the counting pass */
+    for (i = size; i > 0; i--) {  /* pass 1: count the output units needed */
+        if (*w > 0xFFFF)
+            alloc++;  /* this element will occupy two units instead of one */
+        w++;
+    }
+    w = orig_w;
+    unicode = _PyUnicode_New(alloc);
+    if (!unicode)
+        return NULL;
+    /* Copy the wchar_t data into the new object */
+    {
+        register Py_UNICODE *u;
+        u = PyUnicode_AS_UNICODE(unicode);
+        for (i = size; i > 0; i--) {  /* pass 2: one input element per iteration */
+            if (*w > 0xFFFF) {
+                wchar_t ordinal = *w++;
+                ordinal -= 0x10000;  /* offset from the first value above 0xFFFF */
+                *u++ = 0xD800 | (ordinal >> 10);  /* high half: bits above the low 10 */
+                *u++ = 0xDC00 | (ordinal & 0x3FF);  /* low half: low 10 bits */
+            }
+            else
+                *u++ = *w++;
+        }
+    }
+    return (PyObject *)unicode;
+}
+#else
 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 …
     return (PyObject *)unicode;
+}
+#endif /* CONVERT_WCHAR_TO_SURROGATES */
+#undef CONVERT_WCHAR_TO_SURROGATES
 static void
 …
             switch (*f) {
             case 'c':
+                (void)va_arg(count, int);
+            {
+                int ordinal = va_arg(count, int);
+#ifdef Py_UNICODE_WIDE
+                if (ordinal < 0 || ordinal > 0x10ffff) {
+                    PyErr_SetString(PyExc_OverflowError,
+                                    "%c arg not in range(0x110000) "
+                                    "(wide Python build)");
+                    goto fail;
+                }
+#else
+                if (ordinal < 0 || ordinal > 0xffff) {
+                    PyErr_SetString(PyExc_OverflowError,
+                                    "%c arg not in range(0x10000) "
+                                    "(narrow Python build)");
+                    goto fail;
+                }
+#endif
                 /* fall through... */
+            }
             case '%':
                 n++;
 …
+            {
                 /* UTF-8 */
                 unsigned char *s = va_arg(count, unsigned char*);
+                const char *s = va_arg(count, const char*);
                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
                 if (!str)
 …
     /* Convert to Unicode */
+    if (len == 0) {
+        Py_INCREF(unicode_empty);
+        v = (PyObject *)unicode_empty;
+    }
+    else
+        v = PyUnicode_Decode(s, len, encoding, errors);
+    if (len == 0)
+        _Py_RETURN_UNICODE_EMPTY();
+    v = PyUnicode_Decode(s, len, encoding, errors);
     return v;
 …
     strncpy(unicode_default_encoding,
             encoding,
             sizeof(unicode_default_encoding));
+            sizeof(unicode_default_encoding) - 1);
     return 0;
 …
 /* --- UTF-7 Codec -------------------------------------------------------- */
+/* see RFC2152 for details */
+/* See RFC2152 for details.  We encode conservatively and decode liberally. */
+/* Three simple macros defining base-64. */
+/* Is c a base-64 character? */
+#define IS_BASE64(c) \
+    (isalnum(c) || (c) == '+' || (c) == '/')
+/* given that c is a base-64 character, what is its base-64 value? */
+#define FROM_BASE64(c)                                                  \
+    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
+     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
+     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
+     (c) == '+' ? 62 : 63)
+/* What is the base-64 character of the bottom 6 bits of n? */
+#define TO_BASE64(n)  \
+    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
+/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
+ * decoded as itself.  We are permissive on decoding; the only ASCII
+ * byte not decoding to itself is the + which begins a base64
+ * string. */
+#define DECODE_DIRECT(c)                                \
+    ((c) <= 127 && (c) != '+')
+/* The UTF-7 encoder treats ASCII characters differently according to
+ * whether they are Set D, Set O, Whitespace, or special (i.e. none of
+ * the above).  See RFC2152.  This array identifies these different
+ * sets:
+ * 0 : "Set D"
+ *     alphanumeric and '(),-./:?
+ * 1 : "Set O"
+ *     !"#$%&*;<=>@[]^_`{|}
+ * 2 : "whitespace"
+ *     ht nl cr sp
+ * 3 : special (must be base64 encoded)
+ *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+ */
 static
+char utf7_special[128] = {
+    /* indicate whether a UTF-7 character is special i.e. cannot be directly
+       encoded:
+       0 - not special
+       1 - special
+       2 - whitespace (optional)
+       3 - RFC2152 Set O (optional) */
+, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+char utf7_category[128] = {
+/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
+,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
+/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
+,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
+,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
+/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
+,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
+/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
+,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
+,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
+/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
+,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
+,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
 };
+/* Note: The comparison (c) <= 0 is a trick to work-around gcc
+   warnings about the comparison always being false; since
+   utf7_special[0] is 1, we can safely make that one comparison
+   true  */
+#define SPECIAL(c, encodeO, encodeWS)                   \
+    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
+     (encodeWS && (utf7_special[(c)] == 2)) ||          \
+     (encodeO && (utf7_special[(c)] == 3)))
+#define B64(n)                                                          \
+    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
+#define B64CHAR(c)                              \
+    (isalnum(c) || (c) == '+' || (c) == '/')
+#define UB64(c)                                         \
+    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
+     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
+#define ENCODE(out, ch, bits)                   \
+    while (bits >= 6) {                         \
+        *out++ = B64(ch >> (bits-6));           \
+        bits -= 6;                              \
+    }
+#define DECODE(out, ch, bits, surrogate)                                \
+    while (bits >= 16) {                                                \
+        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
+        bits -= 16;                                                     \
+        if (surrogate) {                                                \
+            /* We have already generated an error for the high surrogate \
+               so let's not bother seeing if the low surrogate is correct or not */ \
+            surrogate = 0;                                              \
+        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
+            /* This is a surrogate pair. Unfortunately we can't represent \
+               it in a 16-bit character */                              \
+            surrogate = 1;                                              \
+            errmsg = "code pairs are not supported";                    \
+            goto utf7Error;                                             \
+        } else {                                                        \
+            *out++ = outCh;                                             \
+        }                                                               \
+    }
+/* ENCODE_DIRECT: this character should be encoded as itself.  The
+ * answer depends on whether we are encoding set O as itself, and also
+ * on whether we are encoding whitespace as itself.  RFC2152 makes it
+ * clear that the answers to these questions vary between
+ * applications, so this code needs to be flexible.  */
+#define ENCODE_DIRECT(c, directO, directWS)             \
+    ((c) < 128 && (c) > 0 &&                            \
+     ((utf7_category[(c)] == 0) ||                      \
+      (directWS && (utf7_category[(c)] == 2)) ||        \
+      (directO && (utf7_category[(c)] == 1))))
 PyObject *PyUnicode_DecodeUTF7(const char *s,
 …
     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
+}
+/* The decoder.  The only state we preserve is our read position,
+ * i.e. how many characters we have consumed.  So if we end in the
+ * middle of a shift sequence we have to back off the read position
+ * and the output to the beginning of the sequence, otherwise we lose
+ * all the shift state (seen bits, number of bits seen, high
+ * surrogate). */
 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
 …
     const char *errmsg = "";
     int inShift = 0;
+    unsigned int bitsleft = 0;
+    unsigned long charsleft = 0;
+    int surrogate = 0;
+    Py_UNICODE *shiftOutStart;
+    unsigned int base64bits = 0;
+    unsigned long base64buffer = 0;
+    Py_UNICODE surrogate = 0;
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
 …
     p = unicode->str;
+    shiftOutStart = p;
     e = s + size;
     while (s < e) {
+        Py_UNICODE ch;
+      restart:
+        ch = (unsigned char) *s;
+        if (inShift) {
+            if ((ch == '-') || !B64CHAR(ch)) {
+        Py_UNICODE ch = (unsigned char) *s;
+        if (inShift) { /* in a base-64 section */
+            if (IS_BASE64(ch)) { /* consume a base-64 character */
+                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
+                base64bits += 6;
+                s++;
+                if (base64bits >= 16) {
+                    /* we have enough bits for a UTF-16 value */
+                    Py_UNICODE outCh = (Py_UNICODE)
+                                       (base64buffer >> (base64bits-16));
+                    base64bits -= 16;
+                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
+                    assert(outCh <= 0xffff);
+                    if (surrogate) {
+                        /* expecting a second surrogate */
+                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+#ifdef Py_UNICODE_WIDE
+                            *p++ = (((surrogate & 0x3FF)<<10)
+                                    | (outCh & 0x3FF)) + 0x10000;
+#else
+                            *p++ = surrogate;
+                            *p++ = outCh;
+#endif
+                            surrogate = 0;
+                            continue;
+                        }
+                        else {
+                            *p++ = surrogate;
+                            surrogate = 0;
+                        }
+                    }
+                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+                        /* first surrogate */
+                        surrogate = outCh;
+                    }
+                    else {
+                        *p++ = outCh;
+                    }
+                }
+            }
+            else { /* now leaving a base-64 section */
                 inShift = 0;
                 s++;
+                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
+                if (bitsleft >= 6) {
+                    /* The shift sequence has a partial character in it. If
+                       bitsleft < 6 then we could just classify it as padding
+                       but that is not the case here */
+                    errmsg = "partial character in shift sequence";
+                    goto utf7Error;
+                if (surrogate) {
+                    *p++ = surrogate;
+                    surrogate = 0;
+                }
+                /* According to RFC2152 the remaining bits should be zero. We
+                   choose to signal an error/insert a replacement character
+                   here to indicate the potential of a misencoded character. */
+                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
+                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
+                    errmsg = "non-zero padding bits in shift sequence";
+                    goto utf7Error;
+                if (base64bits > 0) { /* left-over bits */
+                    if (base64bits >= 6) {
+                        /* We've seen at least one base-64 character */
+                        errmsg = "partial character in shift sequence";
+                        goto utf7Error;
+                    }
+                    else {
+                        /* Some bits remain; they should be zero */
+                        if (base64buffer != 0) {
+                            errmsg = "non-zero padding bits in shift sequence";
+                            goto utf7Error;
+                        }
+                    }
+                }
+                if (ch == '-') {
+                    if ((s < e) && (*(s) == '-')) {
+                        *p++ = '-';
+                        inShift = 1;
+                    }
+                } else if (SPECIAL(ch,0,0)) {
+                    errmsg = "unexpected special character";
+                    goto utf7Error;
+                } else  {
+                if (ch != '-') {
+                    /* '-' is absorbed; other terminating
+                       characters are preserved */
                     *p++ = ch;
+                }
-            } else {
-                charsleft = (charsleft << 6) | UB64(ch);
-                bitsleft += 6;
-                s++;
-                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
+            }
+        }
         else if ( ch == '+' ) {
             startinpos = s-starts;
             s++;
             if (s < e && *s == '-') {
+            s++; /* consume '+' */
+            if (s < e && *s == '-') { /* '+-' encodes '+' */
                 s++;
                 *p++ = '+';
             } else
+            {
+            }
+            else { /* begin base64-encoded section */
                 inShift = 1;
+                bitsleft = 0;
+            }
+        }
+        else if (SPECIAL(ch,0,0)) {
+            startinpos = s-starts;
+            errmsg = "unexpected special character";
+            s++;
+            goto utf7Error;
+        }
+        else {
+                shiftOutStart = p;
+                base64bits = 0;
+                base64buffer = 0;
+            }
+        }
+        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
             *p++ = ch;
             s++;
+        }
+        else {
+            startinpos = s-starts;
+            s++;
+            errmsg = "unexpected special character";
+            goto utf7Error;
+        }
         continue;
       utf7Error:
+utf7Error:
         outpos = p-PyUnicode_AS_UNICODE(unicode);
         endinpos = s-starts;
 …
+    }
+    if (inShift && !consumed) {
+        outpos = p-PyUnicode_AS_UNICODE(unicode);
+        endinpos = size;
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "utf7", "unterminated shift sequence",
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                &unicode, &outpos, &p))
+            goto onError;
+        if (s < e)
+            goto restart;
+    }
+    /* end of string */
+    if (inShift && !consumed) { /* in shift sequence, no more to follow */
+        /* if we're in an inconsistent state, that's an error */
+        if (surrogate ||
+                (base64bits >= 6) ||
+                (base64bits > 0 && base64buffer != 0)) {
+            outpos = p-PyUnicode_AS_UNICODE(unicode);
+            endinpos = size;
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "utf7", "unterminated shift sequence",
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    &unicode, &outpos, &p))
+                goto onError;
+        }
+    }
+    /* return state */
     if (consumed) {
+        if(inShift)
+        if (inShift) {
+            p = shiftOutStart; /* back off output */
             *consumed = startinpos;
+        else
+        }
+        else {
             *consumed = s-starts;
+        }
+    }
 …
 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                                Py_ssize_t size,
                                int encodeSetO,
                                int encodeWhiteSpace,
+                               int base64SetO,
+                               int base64WhiteSpace,
                                const char *errors)
+{
     PyObject *v;
     /* It might be possible to tighten this worst case */
     Py_ssize_t cbAllocated = 5 * size;
+    Py_ssize_t allocated = 8 * size;
     int inShift = 0;
     Py_ssize_t i = 0;
     unsigned int bitsleft = 0;
     unsigned long charsleft = 0;
+    unsigned int base64bits = 0;
+    unsigned long base64buffer = 0;
     char * out;
     char * start;
     if (cbAllocated / 5 != size)
+    if (allocated / 8 != size)
         return PyErr_NoMemory();
 …
         return PyString_FromStringAndSize(NULL, 0);
     v = PyString_FromStringAndSize(NULL, cbAllocated);
+    v = PyString_FromStringAndSize(NULL, allocated);
     if (v == NULL)
         return NULL;
 …
         Py_UNICODE ch = s[i];
+        if (!inShift) {
+        if (inShift) {
+            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+                /* shifting out */
+                if (base64bits) { /* output remaining bits */
+                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
+                    base64buffer = 0;
+                    base64bits = 0;
+                }
+                inShift = 0;
+                /* Characters not in the BASE64 set implicitly unshift the sequence
+                   so no '-' is required, except if the character is itself a '-' */
+                if (IS_BASE64(ch) || ch == '-') {
+                    *out++ = '-';
+                }
+                *out++ = (char) ch;
+            }
+            else {
+                goto encode_char;
+            }
+        }
+        else { /* not in a shift sequence */
             if (ch == '+') {
                 *out++ = '+';
+                *out++ = '-';
+            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
+                charsleft = ch;
+                bitsleft = 16;
+                        *out++ = '-';
+            }
+            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+                *out++ = (char) ch;
+            }
+            else {
                 *out++ = '+';
+                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
+                inShift = bitsleft > 0;
+            } else {
+                *out++ = (char) ch;
+            }
+        } else {
+            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
+                *out++ = B64(charsleft << (6-bitsleft));
+                charsleft = 0;
+                bitsleft = 0;
+                /* Characters not in the BASE64 set implicitly unshift the sequence
+                   so no '-' is required, except if the character is itself a '-' */
+                if (B64CHAR(ch) || ch == '-') {
+                    *out++ = '-';
+                }
+                inShift = 0;
+                *out++ = (char) ch;
+            } else {
+                bitsleft += 16;
+                charsleft = (charsleft << 16) | ch;
+                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
+                /* If the next character is special then we don't need to terminate
+                   the shift sequence. If the next character is not a BASE64 character
+                   or '-' then the shift sequence will be terminated implicitly and we
+                   don't have to insert a '-'. */
+                if (bitsleft == 0) {
+                    if (i + 1 < size) {
+                        Py_UNICODE ch2 = s[i+1];
+                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
+                        } else if (B64CHAR(ch2) || ch2 == '-') {
+                            *out++ = '-';
+                            inShift = 0;
+                        } else {
+                            inShift = 0;
+                        }
+                    }
+                    else {
+                        *out++ = '-';
+                        inShift = 0;
+                    }
+                }
+            }
+        }
+    }
+    if (bitsleft) {
+        *out++= B64(charsleft << (6-bitsleft) );
+                inShift = 1;
+                goto encode_char;
+            }
+        }
+        continue;
+encode_char:
+#ifdef Py_UNICODE_WIDE
+        if (ch >= 0x10000) {
+            /* code first surrogate */
+            base64bits += 16;
+            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+            while (base64bits >= 6) {
+                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+                base64bits -= 6;
+            }
+            /* prepare second surrogate */
+            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
+        }
+#endif
+        base64bits += 16;
+        base64buffer = (base64buffer << 16) | ch;
+        while (base64bits >= 6) {
+            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+            base64bits -= 6;
+        }
+    }
+    if (base64bits)
+        *out++= TO_BASE64(base64buffer << (6-base64bits) );
+    if (inShift)
         *out++ = '-';
+    }
     _PyString_Resize(&v, out - start);
+    if (_PyString_Resize(&v, out - start))
+        return NULL;
     return v;
+}
+#undef SPECIAL
+#undef B64
+#undef B64CHAR
+#undef UB64
+#undef ENCODE
+#undef DECODE
+#undef IS_BASE64
+#undef FROM_BASE64
+#undef TO_BASE64
+#undef DECODE_DIRECT
+#undef ENCODE_DIRECT
 /* --- UTF-8 Codec -------------------------------------------------------- */
 …
 static
 char utf8_code_length[256] = {
+    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
+       illegal prefix.  see RFC 2279 for details */
+    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
+       illegal prefix.  See RFC 3629 for details */
+, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 …
 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
+, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
+, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
+, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
+, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
+, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
 };
 …
     const char *starts = s;
     int n;
+    int k;
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
 …
                 errmsg = "unexpected end of data";
                 startinpos = s-starts;
+                endinpos = size;
+                endinpos = startinpos+1;
+                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
+                    endinpos++;
                 goto utf8Error;
+            }
 …
         case 0:
             errmsg = "unexpected code byte";
+            errmsg = "invalid start byte";
             startinpos = s-starts;
             endinpos = startinpos+1;
 …
         case 2:
             if ((s[1] & 0xc0) != 0x80) {
                 errmsg = "invalid data";
+                errmsg = "invalid continuation byte";
                 startinpos = s-starts;
                 endinpos = startinpos+2;
+                endinpos = startinpos + 1;
                 goto utf8Error;
+            }
             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            if (ch < 0x80) {
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            *p++ = (Py_UNICODE)ch;
+            break;
+        case 3:
+            /* XXX: surrogates shouldn't be valid UTF-8!
+               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
+               Uncomment the 2 lines below to make them invalid,
+               codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0)/* ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)*/) {
+                errmsg = "invalid continuation byte";
                 startinpos = s-starts;
+                endinpos = startinpos+2;
+                errmsg = "illegal encoding";
+                endinpos = startinpos + 1;
+                /* if s[1] first two bits are 1 and 0, then the invalid
+                   continuation byte is s[2], so increment endinpos by 1,
+                   if not, s[1] is invalid and endinpos doesn't need to
+                   be incremented. */
+                if ((s[1] & 0xC0) == 0x80)
+                    endinpos++;
                 goto utf8Error;
+            }
-            else
-                *p++ = (Py_UNICODE)ch;
-            break;
-        case 3:
-            if ((s[1] & 0xc0) != 0x80 ||
-                (s[2] & 0xc0) != 0x80) {
-                errmsg = "invalid data";
-                startinpos = s-starts;
-                endinpos = startinpos+3;
-                goto utf8Error;
+            }
             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            if (ch < 0x0800) {
+                /* Note: UTF-8 encodings of surrogates are considered
+                   legal UTF-8 sequences;
+                   XXX For wide builds (UCS-4) we should probably try
+                   to recombine the surrogates into a single code
+                   unit.
+                */
+                errmsg = "illegal encoding";
+                startinpos = s-starts;
+                endinpos = startinpos+3;
+                goto utf8Error;
+            }
+            else
+                *p++ = (Py_UNICODE)ch;
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            *p++ = (Py_UNICODE)ch;
             break;
 …
             if ((s[1] & 0xc0) != 0x80 ||
                 (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80) {
+                errmsg = "invalid data";
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                errmsg = "invalid continuation byte";
                 startinpos = s-starts;
+                endinpos = startinpos+4;
+                endinpos = startinpos + 1;
+                if ((s[1] & 0xC0) == 0x80) {
+                    endinpos++;
+                    if ((s[2] & 0xC0) == 0x80)
+                        endinpos++;
+                }
                 goto utf8Error;
+            }
             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            /* validate and convert to UTF-16 */
+            if ((ch < 0x10000)        /* minimum value allowed for 4
+                                         byte encoding */
+                || (ch > 0x10ffff))   /* maximum value allowed for
+                                         UTF-16 */
+            {
+                errmsg = "illegal encoding";
+                startinpos = s-starts;
+                endinpos = startinpos+4;
+                goto utf8Error;
+            }
+                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
 #ifdef Py_UNICODE_WIDE
             *p++ = (Py_UNICODE)ch;
 …
 #endif
             break;
-        default:
-            /* Other sizes are only needed for UCS-4 */
-            errmsg = "unsupported Unicode code range";
-            startinpos = s-starts;
-            endinpos = startinpos+n;
-            goto utf8Error;
+        }
         s += n;
 …
         nneeded = p - PyString_AS_STRING(v);
         assert(nneeded <= nallocated);
+        _PyString_Resize(&v, nneeded);
+        if (_PyString_Resize(&v, nneeded))
+            return NULL;
+    }
     return v;
 …
     Py_UNICODE *p;
 #ifndef Py_UNICODE_WIDE
+    int i, pairs;
+    int pairs = 0;
+    const unsigned char *qq;
 #else
     const int pairs = 0;
 …
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
+    /* On narrow builds we split characters outside the BMP into two
+       codepoints => count how much extra space we need. */
+#ifndef Py_UNICODE_WIDE
+    for (i = pairs = 0; i < size/4; i++)
+        if (((Py_UCS4 *)s)[i] >= 0x10000)
+            pairs++;
+#endif
+    /* This might be one too many, because of a BOM */
+    unicode = _PyUnicode_New((size+3)/4+pairs);
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return (PyObject *)unicode;
+    /* Unpack UTF-32 encoded data */
+    p = unicode->str;
     q = (unsigned char *)s;
     e = q + size;
 …
         iorder[3] = 0;
+    }
+    /* On narrow builds we split characters outside the BMP into two
+       codepoints => count how much extra space we need. */
+#ifndef Py_UNICODE_WIDE
+    for (qq = q; e - qq >= 4; qq += 4)
+        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
+            pairs++;
+#endif
+    /* This might be one too many, because of a BOM */
+    unicode = _PyUnicode_New((size+3)/4+pairs);
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return (PyObject *)unicode;
+    /* Unpack UTF-32 encoded data */
+    p = unicode->str;
     while (q < e) {
 …
         /* UTF-16 code pair: */
+        if (q >= e) {
+        if (e - q < 2) {
+            q -= 2;
+            if (consumed)
+                break;
             errmsg = "unexpected end of data";
             startinpos = (((const char *)q)-2)-starts;
+            startinpos = ((const char *)q)-starts;
             endinpos = ((const char *)e)-starts;
             goto utf16Error;
 …
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
-    int i;
     PyUnicodeObject *v;
     Py_UNICODE *p;
 …
         hexescape:
             chr = 0;
+            outpos = p-PyUnicode_AS_UNICODE(v);
+            if (s+digits>end) {
+                endinpos = size;
+                if (unicode_decode_call_errorhandler(
+                        errors, &errorHandler,
+                        "unicodeescape", "end of string in escape sequence",
+                        starts, size, &startinpos, &endinpos, &exc, &s,
+                        &v, &outpos, &p))
+                    goto onError;
+                goto nextByte;
+            }
+            for (i = 0; i < digits; ++i) {
+                c = (unsigned char) s[i];
+                if (!isxdigit(c)) {
+                    endinpos = (s+i+1)-starts;
+                    if (unicode_decode_call_errorhandler(
+                            errors, &errorHandler,
+                            "unicodeescape", message,
+                            starts, size, &startinpos, &endinpos, &exc, &s,
+                            &v, &outpos, &p))
+                        goto onError;
+                    goto nextByte;
+            if (end - s < digits) {
+                /* count only hex digits */
+                for (; s < end; ++s) {
+                    c = (unsigned char)*s;
+                    if (!Py_ISXDIGIT(c))
+                        goto error;
+                }
+                goto error;
+            }
+            for (; digits--; ++s) {
+                c = (unsigned char)*s;
+                if (!Py_ISXDIGIT(c))
+                    goto error;
                 chr = (chr<<4) & ~0xF;
                 if (c >= '0' && c <= '9')
 …
                     chr += 10 + c - 'A';
+            }
-            s += i;
             if (chr == 0xffffffff && PyErr_Occurred())
                 /* _decoding_error will have already written into the
 …
 #endif
             } else {
+                endinpos = s-starts;
+                outpos = p-PyUnicode_AS_UNICODE(v);
+                if (unicode_decode_call_errorhandler(
+                        errors, &errorHandler,
+                        "unicodeescape", "illegal Unicode character",
+                        starts, size, &startinpos, &endinpos, &exc, &s,
+                        &v, &outpos, &p))
+                    goto onError;
+                message = "illegal Unicode character";
+                goto error;
+            }
             break;
 …
             if (ucnhash_CAPI == NULL) {
                 /* load the unicode data module */
+                PyObject *m, *api;
+                m = PyImport_ImportModuleNoBlock("unicodedata");
+                if (m == NULL)
+                    goto ucnhashError;
+                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
+                Py_DECREF(m);
+                if (api == NULL)
+                    goto ucnhashError;
+                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
+                Py_DECREF(api);
+                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
                 if (ucnhash_CAPI == NULL)
                     goto ucnhashError;
 …
                     message = "unknown Unicode character name";
                     s++;
+                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
+                    if (s - start - 1 <= INT_MAX &&
+                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
                         goto store;
+                }
+            }
+            endinpos = s-starts;
+            outpos = p-PyUnicode_AS_UNICODE(v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicodeescape", message,
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    &v, &outpos, &p))
+                goto onError;
+            break;
+            goto error;
         default:
 …
                 message = "\\ at end of string";
                 s--;
+                endinpos = s-starts;
+                outpos = p-PyUnicode_AS_UNICODE(v);
+                if (unicode_decode_call_errorhandler(
+                        errors, &errorHandler,
+                        "unicodeescape", message,
+                        starts, size, &startinpos, &endinpos, &exc, &s,
+                        &v, &outpos, &p))
+                    goto onError;
+                goto error;
+            }
             else {
 …
             break;
+        }
+      nextByte:
+        ;
+        continue;
+      error:
+        endinpos = s-starts;
+        outpos = p-PyUnicode_AS_UNICODE(v);
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicodeescape", message,
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                &v, &outpos, &p))
+            goto onError;
+        continue;
+    }
     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
 …
     *p = '\0';
+    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
+    if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
+        return NULL;
     return repr;
+}
 …
+    }
     *p = '\0';
+    _PyString_Resize(&repr, p - q);
+    if (_PyString_Resize(&repr, p - q))
+        return NULL;
     return repr;
+}
 …
     while (s < end) {
+        if (end-s < Py_UNICODE_SIZE) {
+            endinpos = end-starts;
+            reason = "truncated input";
+            goto error;
+        }
         memcpy(p, s, sizeof(Py_UNICODE));
+#ifdef Py_UNICODE_WIDE
         /* We have to sanity check the raw data, otherwise doom looms for
            some malformed UCS-4 data. */
+        if (
+#ifdef Py_UNICODE_WIDE
+            *p > unimax || *p < 0 ||
+        if (*p > unimax || *p < 0) {
+            endinpos = s - starts + Py_UNICODE_SIZE;
+            reason = "illegal code point (> 0x10FFFF)";
+            goto error;
+        }
 #endif
+            end-s < Py_UNICODE_SIZE
+            )
+        {
+            startinpos = s - starts;
+            if (end-s < Py_UNICODE_SIZE) {
+                endinpos = end-starts;
+                reason = "truncated input";
+            }
+            else {
+                endinpos = s - starts + Py_UNICODE_SIZE;
+                reason = "illegal code point (> 0x10FFFF)";
+            }
+            outpos = p - PyUnicode_AS_UNICODE(v);
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "unicode_internal", reason,
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    &v, &outpos, &p)) {
+                goto onError;
+            }
+        }
+        else {
+            p++;
+            s += Py_UNICODE_SIZE;
+        p++;
+        s += Py_UNICODE_SIZE;
+        continue;
+  error:
+        startinpos = s - starts;
+        outpos = p - PyUnicode_AS_UNICODE(v);
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicode_internal", reason,
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                &v, &outpos, &p)) {
+            goto onError;
+        }
+    }
 …
                 respos = str-PyString_AS_STRING(res);
                 /* determine replacement size (temporarily (mis)uses p) */
+                for (p = collstart, repsize = 0; p < collend; ++p) {
+                    if (*p<10)
+                for (p = collstart, repsize = 0; p < collend;) {
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    if (ch < 10)
                         repsize += 2+1+1;
                     else if (*p<100)
+                    else if (ch < 100)
                         repsize += 2+2+1;
                     else if (*p<1000)
+                    else if (ch < 1000)
                         repsize += 2+3+1;
                     else if (*p<10000)
+                    else if (ch < 10000)
                         repsize += 2+4+1;
+#ifndef Py_UNICODE_WIDE
+                    else
+                    else if (ch < 100000)
                         repsize += 2+5+1;
+#else
+                    else if (*p<100000)
+                        repsize += 2+5+1;
+                    else if (*p<1000000)
+                    else if (ch < 1000000)
                         repsize += 2+6+1;
                     else
                         repsize += 2+7+1;
-#endif
+                }
                 requiredsize = respos+repsize+(endp-collend);
 …
+                }
                 /* generate replacement (temporarily (mis)uses p) */
+                for (p = collstart; p < collend; ++p) {
+                    str += sprintf(str, "&#%d;", (int)*p);
+                for (p = collstart; p < collend;) {
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    str += sprintf(str, "&#%d;", (int)ch);
+                }
                 p = collend;
 …
                 if (repunicode == NULL)
                     goto onError;
                 /* need more space? (at least enough for what we
                    have+the replacement+the rest of the string, so
                    we won't have to check space for encodable characters) */
+                /* need more space? (at least enough for what we have+the
+                   replacement+the rest of the string, so we won't have to
+                   check space for encodable characters) */
                 respos = str-PyString_AS_STRING(res);
                 repsize = PyUnicode_GET_SIZE(repunicode);
 …
                     /* No mapping found means: mapping is undefined. */
                     PyErr_Clear();
+                    x = Py_None;
+                    Py_INCREF(x);
+                    goto Undefined;
                 } else
                     goto onError;
 …
             /* Apply mapping */
+            if (x == Py_None)
+                goto Undefined;
             if (PyInt_Check(x)) {
                 long value = PyInt_AS_LONG(x);
+                if (value < 0 || value > 65535) {
+                if (value == 0xFFFE)
+                    goto Undefined;
+                if (value < 0 || value > 0x10FFFF) {
                     PyErr_SetString(PyExc_TypeError,
                                     "character mapping must be in range(65536)");
+                                    "character mapping must be in range(0x110000)");
                     Py_DECREF(x);
                     goto onError;
+                }
+#ifndef Py_UNICODE_WIDE
+                if (value > 0xFFFF) {
+                    /* see the code for 1-n mapping below */
+                    if (extrachars < 2) {
+                        /* resize first */
+                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
+                        Py_ssize_t needed = 10 - extrachars;
+                        extrachars += needed;
+                        /* XXX overflow detection missing */
+                        if (_PyUnicode_Resize(&v,
+                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
+                            Py_DECREF(x);
+                            goto onError;
+                        }
+                        p = PyUnicode_AS_UNICODE(v) + oldpos;
+                    }
+                    value -= 0x10000;
+                    *p++ = 0xD800 | (value >> 10);
+                    *p++ = 0xDC00 | (value & 0x3FF);
+                    extrachars -= 2;
+                }
+                else
+#endif
                 *p++ = (Py_UNICODE)value;
+            }
-            else if (x == Py_None) {
-                /* undefined mapping */
-                outpos = p-PyUnicode_AS_UNICODE(v);
-                startinpos = s-starts;
-                endinpos = startinpos+1;
-                if (unicode_decode_call_errorhandler(
-                        errors, &errorHandler,
-                        "charmap", "character maps to <undefined>",
-                        starts, size, &startinpos, &endinpos, &exc, &s,
-                        &v, &outpos, &p)) {
-                    Py_DECREF(x);
-                    goto onError;
+                }
-                Py_DECREF(x);
-                continue;
+            }
             else if (PyUnicode_Check(x)) {
                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
                 if (targetsize == 1)
+                if (targetsize == 1) {
                     /* 1-1 mapping */
+                    *p++ = *PyUnicode_AS_UNICODE(x);
+                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
+                    if (value == 0xFFFE)
+                        goto Undefined;
+                    *p++ = value;
+                }
                 else if (targetsize > 1) {
                     /* 1-n mapping */
 …
             Py_DECREF(x);
             ++s;
+            continue;
+Undefined:
+            /* undefined mapping */
+            Py_XDECREF(x);
+            outpos = p-PyUnicode_AS_UNICODE(v);
+            startinpos = s-starts;
+            endinpos = startinpos+1;
+            if (unicode_decode_call_errorhandler(
+                    errors, &errorHandler,
+                    "charmap", "character maps to <undefined>",
+                    starts, size, &startinpos, &endinpos, &exc, &s,
+                    &v, &outpos, &p)) {
+                goto onError;
+            }
+        }
+    }
 …
             return NULL;
         for (i = 0; i < 256; i++) {
             key = value = NULL;
+            value = NULL;
             key = PyInt_FromLong(decode[i]);
             value = PyInt_FromLong(i);
 …
         break;
     case 4: /* xmlcharrefreplace */
         /* generate replacement (temporarily (mis)uses p) */
         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+        /* generate replacement */
+        for (collpos = collstartpos; collpos < collendpos;) {
             char buffer[2+29+1+1];
             char *cp;
+            sprintf(buffer, "&#%d;", (int)p[collpos]);
+            Py_UCS4 ch = p[collpos++];
+#ifndef Py_UNICODE_WIDE
+            if ((0xD800 <= ch && ch <= 0xDBFF) &&
+                (collpos < collendpos) &&
+                (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
+                ch = ((((ch & 0x03FF) << 10) |
+                       ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
+            }
+#endif
+            sprintf(buffer, "&#%d;", (int)ch);
             for (cp = buffer; *cp; ++cp) {
                 x = charmapencode_output(*cp, mapping, res, respos);
 …
             case 4: /* xmlcharrefreplace */
                 /* generate replacement (temporarily (mis)uses p) */
                 for (p = collstart; p < collend; ++p) {
+                for (p = collstart; p < collend;) {
                     char buffer[2+29+1+1];
                     char *cp;
+                    sprintf(buffer, "&#%d;", (int)*p);
+                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                    sprintf(buffer, "&#%d;", (int)ch);
                     if (charmaptranslate_makespace(&res, &str,
                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
 …
         /* All other characters are considered unencodable */
         collstart = p;
+        collend = p+1;
+        while (collend < end) {
+        for (collend = p+1; collend < end; collend++) {
             if ((0 < *collend && *collend < 256) ||
                 !Py_UNICODE_ISSPACE(*collend) ||
                 Py_UNICODE_TODECIMAL(*collend))
+                Py_UNICODE_ISSPACE(*collend) ||
+<= Py_UNICODE_TODECIMAL(*collend))
                 break;
+        }
 …
         case 4: /* xmlcharrefreplace */
             /* generate replacement (temporarily (mis)uses p) */
+            for (p = collstart; p < collend; ++p)
+                output += sprintf(output, "&#%d;", (int)*p);
+            for (p = collstart; p < collend;) {
+                Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+                output += sprintf(output, "&#%d;", ch);
+            }
             p = collend;
             break;
 …
 #include "stringlib/unicodedefs.h"
-#define FROM_UNICODE
 #include "stringlib/fastsearch.h"
 …
 #include "stringlib/find.h"
 #include "stringlib/partition.h"
+#include "stringlib/split.h"
 /* helper macro to fixup start/end slice values.
    Works on the variables named start and end in the expanding scope:
    a negative start or end is offset by (obj)->length and then floored
    at 0, and end is capped at (obj)->length.  start is never capped.
    The expansion is a bare sequence of if statements (no do/while
    wrapper), so it must be used as a statement of its own. */
+#define FIX_START_END(obj)                      \
+    if (start < 0)                              \
+        start += (obj)->length;                 \
+    if (start < 0)                              \
+        start = 0;                              \
+    if (end > (obj)->length)                    \
+        end = (obj)->length;                    \
+    if (end < 0)                                \
+        end += (obj)->length;                   \
+    if (end < 0)                                \
+        end = 0;
+#define ADJUST_INDICES(start, end, len)  /* normalize slice bounds against len */ \
+    if (end > len)              /* end beyond len: cap it at len */ \
+        end = len;                              \
+    else if (end < 0) {         /* negative end is taken relative to len */ \
+        end += len;                             \
+        if (end < 0)            /* still negative after the offset: floor at 0 */ \
+            end = 0;                            \
+    }                                           \
+    if (start < 0) {            /* negative start is taken relative to len */ \
+        start += len;                           \
+        if (start < 0)          /* still negative after the offset: floor at 0 */ \
+            start = 0;                          \
+    }  /* start is not capped at len; arguments are expanded unparenthesized */
 Py_ssize_t PyUnicode_Count(PyObject *str,
 …
+    }
+    FIX_START_END(str_obj);
+    ADJUST_INDICES(start, end, str_obj->length);
     result = stringlib_count(
+        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
+        PY_SSIZE_T_MAX
         );
 …
         return 1;
+    FIX_START_END(self);
+    ADJUST_INDICES(start, end, self->length);
     end -= substring->length;
     if (end < start)
 …
     if (len == 0)
         return 0;
     if (Py_UNICODE_ISLOWER(*s)) {
+    if (!Py_UNICODE_ISUPPER(*s)) {
         *s = Py_UNICODE_TOUPPER(*s);
         status = 1;
 …
     s++;
     while (--len > 0) {
         if (Py_UNICODE_ISUPPER(*s)) {
+        if (!Py_UNICODE_ISLOWER(*s)) {
             *s = Py_UNICODE_TOLOWER(*s);
             status = 1;
 …
+}
+#define SPLIT_APPEND(data, left, right)  /* append data[left:right] to list */ \
+    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
+    if (!str)  /* creating the slice object failed */                   \
+        goto onError;                                                   \
+    if (PyList_Append(list, str)) {  /* append failed: drop str first */ \
+        Py_DECREF(str);                                                 \
+        goto onError;                                                   \
+    }                                                                   \
+    else  /* appended: release the reference held in str */             \
+        Py_DECREF(str);  /* expansion site must supply str, list and an onError label */
+static
+PyObject *split_whitespace(PyUnicodeObject *self,
+                           PyObject *list,
+                           Py_ssize_t maxcount)
+{
     /* Append the whitespace-delimited tokens of self to list, scanning
        left to right.  At most maxcount tokens are split off; whatever
        remains from the start of the next token to the end of the
        buffer is appended as one final item.
        Returns list on success.  If SPLIT_APPEND fails, one reference
        to list is dropped and NULL is returned. */
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len = self->length;
     /* str is the scratch variable assigned by SPLIT_APPEND */
+    PyObject *str;
+    register const Py_UNICODE *buf = self->str;
+    for (i = j = 0; i < len; ) {
+        /* find a token */
+        while (i < len && Py_UNICODE_ISSPACE(buf[i]))
+            i++;
+        j = i;
+        while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
+            i++;
+        if (j < i) {
             /* split budget used up: leave j at the token start so the
                remainder is appended after the loop */
+            if (maxcount-- <= 0)
+                break;
+            SPLIT_APPEND(buf, j, i);
             /* skip the separator run; if it reaches the end, j == len
                and no trailing item is added */
+            while (i < len && Py_UNICODE_ISSPACE(buf[i]))
+                i++;
+            j = i;
+        }
+    }
     /* unsplit remainder, only reached with j < len after the break above */
+    if (j < len) {
+        SPLIT_APPEND(buf, j, len);
+    }
+    return list;
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+PyObject *PyUnicode_Splitlines(PyObject *string,
+                               int keepends)
+{
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len;
+PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
+{
     PyObject *list;
-    PyObject *str;
-    Py_UNICODE *data;
     string = PyUnicode_FromObject(string);
     if (string == NULL)
         return NULL;
+    data = PyUnicode_AS_UNICODE(string);
+    len = PyUnicode_GET_SIZE(string);
+    list = PyList_New(0);
+    if (!list)
+        goto onError;
+    for (i = j = 0; i < len; ) {
+        Py_ssize_t eol;
+        /* Find a line and append it */
+        while (i < len && !BLOOM_LINEBREAK(data[i]))
+            i++;
+        /* Skip the line break reading CRLF as one line break */
+        eol = i;
+        if (i < len) {
+            if (data[i] == '\r' && i + 1 < len &&
+                data[i+1] == '\n')
+                i += 2;
+            else
+                i++;
+            if (keepends)
+                eol = i;
+        }
+        SPLIT_APPEND(data, j, eol);
+        j = i;
+    }
+    if (j < len) {
+        SPLIT_APPEND(data, j, len);
+    }
+    list = stringlib_splitlines(
+        (PyObject*) string, PyUnicode_AS_UNICODE(string),
+        PyUnicode_GET_SIZE(string), keepends);
     Py_DECREF(string);
     return list;
+  onError:
+    Py_XDECREF(list);
+    Py_DECREF(string);
+    return NULL;
+}
+static
+PyObject *split_char(PyUnicodeObject *self,
+                     PyObject *list,
+                     Py_UNICODE ch,
+                     Py_ssize_t maxcount)
+{
     /* Append to list the pieces of self separated by the single code
        unit ch, scanning left to right and splitting at most maxcount
        times.  The text after the last split (possibly empty) is always
        appended as the final item.
        Returns list on success.  If SPLIT_APPEND fails, one reference
        to list is dropped and NULL is returned. */
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len = self->length;
     /* str is the scratch variable assigned by SPLIT_APPEND */
+    PyObject *str;
+    register const Py_UNICODE *buf = self->str;
+    for (i = j = 0; i < len; ) {
+        if (buf[i] == ch) {
             /* split budget used up: stop, the rest goes out as one item */
+            if (maxcount-- <= 0)
+                break;
+            SPLIT_APPEND(buf, j, i);
             /* next piece starts right after the separator */
+            i = j = i + 1;
+        } else
+            i++;
+    }
     /* j == len here yields an empty trailing item */
+    if (j <= len) {
+        SPLIT_APPEND(buf, j, len);
+    }
+    return list;
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+static
+PyObject *split_substring(PyUnicodeObject *self,
+                          PyObject *list,
+                          PyUnicodeObject *substring,
+                          Py_ssize_t maxcount)
+{
     /* Append to list the pieces of self separated by occurrences of
        substring, scanning left to right and splitting at most maxcount
        times.  The text after the last split (possibly empty) is always
        appended as the final item.
        Returns list on success.  If SPLIT_APPEND fails, one reference
        to list is dropped and NULL is returned. */
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len = self->length;
+    Py_ssize_t sublen = substring->length;
     /* str is the scratch variable assigned by SPLIT_APPEND */
+    PyObject *str;
     /* only positions with at least sublen units remaining are tested */
+    for (i = j = 0; i <= len - sublen; ) {
+        if (Py_UNICODE_MATCH(self, i, substring)) {
             /* split budget used up: stop, the rest goes out as one item */
+            if (maxcount-- <= 0)
+                break;
+            SPLIT_APPEND(self->str, j, i);
             /* resume after the matched separator, so matches do not overlap */
+            i = j = i + sublen;
+        } else
+            i++;
+    }
     /* j == len here yields an empty trailing item */
+    if (j <= len) {
+        SPLIT_APPEND(self->str, j, len);
+    }
+    return list;
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+static
+PyObject *rsplit_whitespace(PyUnicodeObject *self,
+                            PyObject *list,
+                            Py_ssize_t maxcount)
+{
     /* Right-to-left counterpart of whitespace splitting: tokens are
        collected from the end of self, at most maxcount of them are
        split off, and whatever remains from index 0 up to the end of
        the next token is appended as one item.  Items are appended in
        reverse order and list is reversed in place before returning.
        Returns list on success.  If SPLIT_APPEND or PyList_Reverse
        fails, one reference to list is dropped and NULL is returned. */
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len = self->length;
     /* str is the scratch variable assigned by SPLIT_APPEND */
+    PyObject *str;
+    register const Py_UNICODE *buf = self->str;
+    for (i = j = len - 1; i >= 0; ) {
+        /* find a token */
+        while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
+            i--;
+        j = i;
+        while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
+            i--;
+        if (j > i) {
             /* split budget used up: leave j at the token's last index
                so the remaining prefix is appended after the loop */
+            if (maxcount-- <= 0)
+                break;
             /* token occupies indices i+1 .. j inclusive */
+            SPLIT_APPEND(buf, i + 1, j + 1);
             /* skip the separator run; if it reaches the start, j == -1
                and no leading item is added */
+            while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
+                i--;
+            j = i;
+        }
+    }
+    if (j >= 0) {
+        SPLIT_APPEND(buf, 0, j + 1);
+    }
     /* items were gathered back to front; restore left-to-right order */
+    if (PyList_Reverse(list) < 0)
+        goto onError;
+    return list;
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+static
+PyObject *rsplit_char(PyUnicodeObject *self,
+                      PyObject *list,
+                      Py_UNICODE ch,
+                      Py_ssize_t maxcount)
+{
     /* Right-to-left split of self on the single code unit ch, with at
        most maxcount splits.  The prefix left of the last split
        (possibly empty) is always appended.  Items are appended in
        reverse order and list is reversed in place before returning.
        Returns list on success.  If SPLIT_APPEND or PyList_Reverse
        fails, one reference to list is dropped and NULL is returned. */
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len = self->length;
     /* str is the scratch variable assigned by SPLIT_APPEND */
+    PyObject *str;
+    register const Py_UNICODE *buf = self->str;
     /* j is the last index of the piece currently being scanned */
+    for (i = j = len - 1; i >= 0; ) {
+        if (buf[i] == ch) {
             /* split budget used up: stop, the prefix goes out as one item */
+            if (maxcount-- <= 0)
+                break;
+            SPLIT_APPEND(buf, i + 1, j + 1);
             /* continue with the piece ending just left of the separator */
+            j = i = i - 1;
+        } else
+            i--;
+    }
     /* j == -1 here yields an empty leading item */
+    if (j >= -1) {
+        SPLIT_APPEND(buf, 0, j + 1);
+    }
     /* items were gathered back to front; restore left-to-right order */
+    if (PyList_Reverse(list) < 0)
+        goto onError;
+    return list;
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+static
+PyObject *rsplit_substring(PyUnicodeObject *self,
+                           PyObject *list,
+                           PyUnicodeObject *substring,
+                           Py_ssize_t maxcount)
+{
     /* Right-to-left split of self on occurrences of substring, with at
        most maxcount splits.  The prefix left of the last split
        (possibly empty) is always appended.  Items are appended in
        reverse order and list is reversed in place before returning.
        Returns list on success.  If SPLIT_APPEND or PyList_Reverse
        fails, one reference to list is dropped and NULL is returned. */
+    register Py_ssize_t i;
+    register Py_ssize_t j;
+    Py_ssize_t len = self->length;
+    Py_ssize_t sublen = substring->length;
     /* str is the scratch variable assigned by SPLIT_APPEND */
+    PyObject *str;
     /* i: candidate match start; j: exclusive end of the current piece */
+    for (i = len - sublen, j = len; i >= 0; ) {
+        if (Py_UNICODE_MATCH(self, i, substring)) {
             /* split budget used up: stop, the prefix goes out as one item */
+            if (maxcount-- <= 0)
+                break;
+            SPLIT_APPEND(self->str, i + sublen, j);
+            j = i;
             /* step back a full separator length, so matches do not overlap */
+            i -= sublen;
+        } else
+            i--;
+    }
+    if (j >= 0) {
+        SPLIT_APPEND(self->str, 0, j);
+    }
     /* items were gathered back to front; restore left-to-right order */
+    if (PyList_Reverse(list) < 0)
+        goto onError;
+    return list;
+  onError:
+    Py_DECREF(list);
+    return NULL;
+}
+#undef SPLIT_APPEND
+}
 static
 …
                 Py_ssize_t maxcount)
+{
-    PyObject *list;
     if (maxcount < 0)
         maxcount = PY_SSIZE_T_MAX;
-    list = PyList_New(0);
-    if (!list)
-        return NULL;
     if (substring == NULL)
+        return split_whitespace(self,list,maxcount);
+    else if (substring->length == 1)
+        return split_char(self,list,substring->str[0],maxcount);
+    else if (substring->length == 0) {
+        Py_DECREF(list);
+        PyErr_SetString(PyExc_ValueError, "empty separator");
+        return NULL;
+    }
+    else
+        return split_substring(self,list,substring,maxcount);
+        return stringlib_split_whitespace(
+            (PyObject*) self,  self->str, self->length, maxcount
+            );
+    return stringlib_split(
+        (PyObject*) self,  self->str, self->length,
+        substring->str, substring->length,
+        maxcount
+        );
+}
 …
                  Py_ssize_t maxcount)
+{
-    PyObject *list;
     if (maxcount < 0)
         maxcount = PY_SSIZE_T_MAX;
-    list = PyList_New(0);
-    if (!list)
-        return NULL;
     if (substring == NULL)
+        return rsplit_whitespace(self,list,maxcount);
+    else if (substring->length == 1)
+        return rsplit_char(self,list,substring->str[0],maxcount);
+    else if (substring->length == 0) {
+        Py_DECREF(list);
+        PyErr_SetString(PyExc_ValueError, "empty separator");
+        return NULL;
+    }
+    else
+        return rsplit_substring(self,list,substring,maxcount);
+        return stringlib_rsplit_whitespace(
+            (PyObject*) self,  self->str, self->length, maxcount
+            );
+    return stringlib_rsplit(
+        (PyObject*) self,  self->str, self->length,
+        substring->str, substring->length,
+        maxcount
+        );
+}
 …
     if (maxcount < 0)
         maxcount = PY_SSIZE_T_MAX;
+    else if (maxcount == 0 || self->length == 0)
+        goto nothing;
     if (str1->length == str2->length) {
+        Py_ssize_t i;
         /* same length */
+        Py_ssize_t i;
+        if (str1->length == 0)
+            goto nothing;
         if (str1->length == 1) {
             /* replace characters */
 …
+                }
         } else {
             i = fastsearch(
                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
+            i = stringlib_find(
+                self->str, self->length, str1->str, str1->length, 0
                 );
             if (i < 0)
 …
                 return NULL;
             Py_UNICODE_COPY(u->str, self->str, self->length);
+            while (i <= self->length - str1->length)
+                if (Py_UNICODE_MATCH(self, i, str1)) {
+                    if (--maxcount < 0)
+                        break;
+                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+                    i += str1->length;
+                } else
+                    i++;
+            /* change everything in-place, starting with this one */
+            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+            i += str1->length;
+            while ( --maxcount > 0) {
+                i = stringlib_find(self->str+i, self->length-i,
+                                   str1->str, str1->length,
+                                   i);
+                if (i == -1)
+                    break;
+                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+                i += str1->length;
+            }
+        }
     } else {
         Py_ssize_t n, i, j, e;
+        Py_ssize_t n, i, j;
         Py_ssize_t product, new_size, delta;
         Py_UNICODE *p;
         /* replace strings */
+        n = stringlib_count(self->str, self->length, str1->str, str1->length);
+        if (n > maxcount)
+            n = maxcount;
+        n = stringlib_count(self->str, self->length, str1->str, str1->length,
+                            maxcount);
         if (n == 0)
             goto nothing;
 …
         i = 0;
         p = u->str;
-        e = self->length - str1->length;
         if (str1->length > 0) {
             while (n-- > 0) {
                 /* look for next match */
+                j = i;
+                while (j <= e) {
+                    if (Py_UNICODE_MATCH(self, j, str1))
+                        break;
+                    j++;
+                }
+                if (j > i) {
+                    if (j > e)
+                        break;
+                j = stringlib_find(self->str+i, self->length-i,
+                                   str1->str, str1->length,
+                                   i);
+                if (j == -1)
+                    break;
+                else if (j > i) {
                     /* copy unchanged part [i:j] */
                     Py_UNICODE_COPY(p, self->str+i, j-i);
 …
 \n\
 Return a capitalized version of S, i.e. make the first character\n\
 have upper case.");
+have upper case and the rest lower case.");
 static PyObject*
 …
     sub = PyUnicode_FromObject(element);
     if (!sub) {
-        PyErr_SetString(PyExc_TypeError,
-                        "'in <string>' requires string as left operand");
         return -1;
+    }
 …
     PyObject *result;
+    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
+                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
+        return NULL;
+    substring = (PyUnicodeObject *)PyUnicode_FromObject(
+        (PyObject *)substring);
+    if (substring == NULL)
+        return NULL;
+    FIX_START_END(self);
+    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
+                                            &start, &end))
+        return NULL;
+    ADJUST_INDICES(start, end, self->length);
     result = PyInt_FromSsize_t(
         stringlib_count(self->str + start, end - start,
+                        substring->str, substring->length)
+                        substring->str, substring->length,
+                        PY_SSIZE_T_MAX)
         );
 …
 static PyObject *
+unicode_encode(PyUnicodeObject *self, PyObject *args)
+{
+unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
+{
+    static char *kwlist[] = {"encoding", "errors", 0};
     char *encoding = NULL;
     char *errors = NULL;
     PyObject *v;
+    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
+                                     kwlist, &encoding, &errors))
         return NULL;
     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
 …
 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
 as well as any other name registerd with codecs.register_error that is\n\
+as well as any other name registered with codecs.register_error that is\n\
 able to handle UnicodeDecodeErrors.");
 static PyObject *
+unicode_decode(PyUnicodeObject *self, PyObject *args)
+{
+unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
+{
+    static char *kwlist[] = {"encoding", "errors", 0};
     char *encoding = NULL;
     char *errors = NULL;
     PyObject *v;
+    if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
+                                     kwlist, &encoding, &errors))
         return NULL;
     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
 …
 \n\
 Return the lowest index in S where substring sub is found,\n\
 such that sub is contained within s[start:end].  Optional\n\
+such that sub is contained within S[start:end].  Optional\n\
 arguments start and end are interpreted as in slice notation.\n\
 \n\
 …
 unicode_find(PyUnicodeObject *self, PyObject *args)
+{
     PyObject *substring;
+    PyUnicodeObject *substring;
     Py_ssize_t start;
     Py_ssize_t end;
     Py_ssize_t result;
+    if (!_ParseTupleFinds(args, &substring, &start, &end))
+    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
+                                            &start, &end))
         return NULL;
 …
     register long x;
+#ifdef Py_DEBUG
+    assert(_Py_HashSecret_Initialized);
+#endif
     if (self->hash != -1)
         return self->hash;
     len = PyUnicode_GET_SIZE(self);
+    /*
+      We make the hash of the empty string be 0, rather than using
+      (prefix ^ suffix), since this slightly obfuscates the hash secret
+    */
+    if (len == 0) {
+        self->hash = 0;
+        return 0;
+    }
     p = PyUnicode_AS_UNICODE(self);
+    x = *p << 7;
+    x = _Py_HashSecret.prefix;
+    x ^= *p << 7;
     while (--len >= 0)
         x = (1000003*x) ^ *p++;
     x ^= PyUnicode_GET_SIZE(self);
+    x ^= _Py_HashSecret.suffix;
     if (x == -1)
         x = -2;
 …
+{
     Py_ssize_t result;
     PyObject *substring;
+    PyUnicodeObject *substring;
     Py_ssize_t start;
     Py_ssize_t end;
+    if (!_ParseTupleFinds(args, &substring, &start, &end))
+    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
+                                            &start, &end))
         return NULL;
 …
 PyDoc_STRVAR(replace__doc__,
              "S.replace (old, new[, count]) -> unicode\n\
+             "S.replace(old, new[, count]) -> unicode\n\
 \n\
 Return a copy of S with all occurrences of substring\n\
 …
 \n\
 Return the highest index in S where substring sub is found,\n\
 such that sub is contained within s[start:end].  Optional\n\
+such that sub is contained within S[start:end].  Optional\n\
 arguments start and end are interpreted as in slice notation.\n\
 \n\
 …
 unicode_rfind(PyUnicodeObject *self, PyObject *args)
+{
     PyObject *substring;
+    PyUnicodeObject *substring;
     Py_ssize_t start;
     Py_ssize_t end;
     Py_ssize_t result;
+    if (!_ParseTupleFinds(args, &substring, &start, &end))
+    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
+                                            &start, &end))
         return NULL;
 …
 unicode_rindex(PyUnicodeObject *self, PyObject *args)
+{
     PyObject *substring;
+    PyUnicodeObject *substring;
     Py_ssize_t start;
     Py_ssize_t end;
     Py_ssize_t result;
+    if (!_ParseTupleFinds(args, &substring, &start, &end))
+    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
+                                            &start, &end))
         return NULL;
 …
 PyDoc_STRVAR(splitlines__doc__,
              "S.splitlines([keepends]) -> list of strings\n\
+             "S.splitlines(keepends=False) -> list of strings\n\
 \n\
 Return a list of the lines in S, breaking at line boundaries.\n\
 …
     int result;
+    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
+                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
+    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
         return NULL;
     if (PyTuple_Check(subobj)) {
 …
+    }
     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
+    if (substring == NULL)
+        return NULL;
+    if (substring == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_TypeError))
+            PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
+                         "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
+        return NULL;
+    }
     result = tailmatch(self, substring, start, end, -1);
     Py_DECREF(substring);
 …
     int result;
+    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
+                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
+    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
         return NULL;
     if (PyTuple_Check(subobj)) {
 …
+    }
     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
+    if (substring == NULL)
+        return NULL;
+    if (substring == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_TypeError))
+            PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
+                         "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
+        return NULL;
+    }
     result = tailmatch(self, substring, start, end, +1);
     Py_DECREF(substring);
 …
              "S.format(*args, **kwargs) -> unicode\n\
 \n\
+");
+Return a formatted version of S, using substitutions from args and kwargs.\n\
+The substitutions are identified by braces ('{' and '}').");
 static PyObject *
 …
              "S.__format__(format_spec) -> unicode\n\
 \n\
 ");
+Return a formatted version of S as described by format_spec.");
 static PyObject *
 …
 static PyMethodDef unicode_methods[] = {
+    /* Order is according to common usage: often used methods should
+       appear first, since lookup is done sequentially. */
+    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
+    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
 …
     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
     {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
+    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
 …
 static int
-doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
+{
-    Py_ssize_t result;
-    PyOS_ascii_formatd((char *)buffer, len, format, x);
-    result = strtounicode(buffer, (char *)buffer);
-    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
+}
-static int
 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
+{
 …
    formatting is done. */
+static int
+formatfloat(Py_UNICODE *buf,
+            size_t buflen,
+            int flags,
+            int prec,
+            int type,
+            PyObject *v)
+{
+    /* fmt = '%#.' + `prec` + `type`
+       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
+    char fmt[20];
+/* Returns a new reference to a PyUnicode object, or NULL on failure. */
+static PyObject *
+formatfloat(PyObject *v, int flags, int prec, int type)
+{
+    char *p;
+    PyObject *result;
     double x;
     x = PyFloat_AsDouble(v);
     if (x == -1.0 && PyErr_Occurred())
+        return -1;
+        return NULL;
     if (prec < 0)
         prec = 6;
+#if SIZEOF_INT > 4
+    /* make sure that the decimal representation of precision really does
+       need at most 10 digits: platforms with sizeof(int) == 8 exist! */
+    if (prec > 0x7fffffff) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "outrageously large precision "
+                        "for formatted float");
+        return -1;
+    }
+#endif
+    if (type == 'f' && fabs(x) >= 1e50)
+        type = 'g';
+    /* Worst case length calc to ensure no buffer overrun:
+       'g' formats:
+       fmt = %#.<prec>g
+       buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
+       for any double rep.)
+       len = 1 + prec + 1 + 2 + 5 = 9 + prec
+       'f' formats:
+       buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
+       len = 1 + 50 + 1 + prec = 52 + prec
+       If prec=0 the effective precision is 1 (the leading digit is
+       always given), therefore increase the length by one.
+    */
+    if (((type == 'g' || type == 'G') &&
+         buflen <= (size_t)10 + (size_t)prec) ||
+        (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "formatted float is too long (precision too large?)");
+        return -1;
+    }
+    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
+                  (flags&F_ALT) ? "#" : "",
+                  prec, type);
+    return doubletounicode(buf, buflen, fmt, x);
+    p = PyOS_double_to_string(x, type, prec,
+                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
+    if (p == NULL)
+        return NULL;
+    result = PyUnicode_FromStringAndSize(p, strlen(p));
+    PyMem_Free(p);
+    return result;
+}
 …
 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
+   FORMATBUFLEN is the length of the buffer in which the ints &
    chars are formatted. XXX This is a magic number. Each formatting
    routine does bounds checking to ensure no overflow, but a better
 …
         argidx = -2;
+    }
     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
         !PyObject_TypeCheck(args, &PyBaseString_Type))
+    if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
+        !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
         dict = args;
 …
             Py_UNICODE sign;
             Py_ssize_t len;
             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
+            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
             fmt++;
 …
                     goto onError;
+                }
+                width = PyInt_AsLong(v);
+                width = PyInt_AsSsize_t(v);
+                if (width == -1 && PyErr_Occurred())
+                    goto onError;
                 if (width < 0) {
                     flags |= F_LJUST;
 …
                     if (c < '0' || c > '9')
                         break;
                     if ((width*10) / 10 != width) {
+                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
                         PyErr_SetString(PyExc_ValueError,
                                         "width too big");
 …
                         goto onError;
+                    }
+                    prec = PyInt_AsLong(v);
+                    prec = _PyInt_AsInt(v);
+                    if (prec == -1 && PyErr_Occurred())
+                        goto onError;
                     if (prec < 0)
                         prec = 0;
 …
                     prec = c - '0';
                     while (--fmtcnt >= 0) {
                         c = Py_CHARMASK(*fmt++);
+                        c = *fmt++;
                         if (c < '0' || c > '9')
                             break;
                         if ((prec*10) / 10 != prec) {
+                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
                             PyErr_SetString(PyExc_ValueError,
                                             "prec too big");
 …
             case 's':
             case 'r':
                 if (PyUnicode_Check(v) && c == 's') {
+                if (PyUnicode_CheckExact(v) && c == 's') {
                     temp = v;
                     Py_INCREF(temp);
 …
             case 'g':
             case 'G':
+                if (c == 'F')
+                    c = 'f';
+                pbuf = formatbuf;
+                len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
+                                  flags, prec, c, v);
+                if (len < 0)
+                temp = formatfloat(v, flags, prec, c);
+                if (temp == NULL)
                     goto onError;
+                pbuf = PyUnicode_AS_UNICODE(temp);
+                len = PyUnicode_GET_SIZE(temp);
                 sign = 1;
                 if (flags & F_ZERO)
 …
 PyDoc_STRVAR(unicode_doc,
+             "unicode(string [, encoding[, errors]]) -> object\n\
+             "unicode(object='') -> unicode object\n\
+unicode(string[, encoding[, errors]]) -> unicode object\n\
 \n\
 Create a new Unicode object from the given encoded string.\n\
 …
 void _PyUnicode_Init(void)
+{
-    int i;
     /* XXX - move this array to unicodectype.c ? */
     Py_UNICODE linebreak[] = {
 …
     /* Init the implementation */
+    free_list = NULL;
+    numfree = 0;
+    unicode_empty = _PyUnicode_New(0);
+    if (!unicode_empty)
+        return;
+    strcpy(unicode_default_encoding, "ascii");
+    for (i = 0; i < 256; i++)
+        unicode_latin1[i] = NULL;
+    if (!unicode_empty) {
+        unicode_empty = _PyUnicode_New(0);
+        if (!unicode_empty)
+            return;
+    }
     if (PyType_Ready(&PyUnicode_Type) < 0)
         Py_FatalError("Can't initialize 'unicode'");
 …
     PyType_Ready(&EncodingMapType);
+    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
+        Py_FatalError("Can't initialize field name iterator type");
+    if (PyType_Ready(&PyFormatterIter_Type) < 0)
+        Py_FatalError("Can't initialize formatter iter type");
+}
 …
     int i;
+    Py_XDECREF(unicode_empty);
+    unicode_empty = NULL;
+    for (i = 0; i < 256; i++) {
+        if (unicode_latin1[i]) {
+            Py_DECREF(unicode_latin1[i]);
+            unicode_latin1[i] = NULL;
+        }
+    }
+    Py_CLEAR(unicode_empty);
+    for (i = 0; i < 256; i++)
+        Py_CLEAR(unicode_latin1[i]);
     (void)PyUnicode_ClearFreeList();
+}
 …
+}
 #endif
-/*
-  Local variables:
-  c-basic-offset: 4
-  indent-tabs-mode: nil
-  End:
-*/

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 388 for python/vendor/current/Objects/unicodeobject.c

Legend:

python/vendor/current/Objects/unicodeobject.c

Download in other formats: