Changeset 388 for python/vendor/current/Objects/unicodeobject.c
- Timestamp:
- Mar 19, 2014, 11:11:30 AM (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
python/vendor/current/Objects/unicodeobject.c
r2 r388 83 83 /* --- Globals ------------------------------------------------------------ 84 84 85 The globals are initialized by the _PyUnicode_Init() API and should 86 not be used before calling that API. 85 NOTE: In the interpreter's initialization phase, some globals are currently 86 initialized dynamically as needed. In the process Unicode objects may 87 be created before the Unicode type is ready. 87 88 88 89 */ … … 94 95 95 96 /* Free list for Unicode objects */ 96 static PyUnicodeObject *free_list ;97 static int numfree ;97 static PyUnicodeObject *free_list = NULL; 98 static int numfree = 0; 98 99 99 100 /* The empty Unicode object is shared to improve performance. */ 100 static PyUnicodeObject *unicode_empty; 101 static PyUnicodeObject *unicode_empty = NULL; 102 103 #define _Py_RETURN_UNICODE_EMPTY() \ 104 do { \ 105 if (unicode_empty != NULL) \ 106 Py_INCREF(unicode_empty); \ 107 else { \ 108 unicode_empty = _PyUnicode_New(0); \ 109 if (unicode_empty != NULL) \ 110 Py_INCREF(unicode_empty); \ 111 } \ 112 return (PyObject *)unicode_empty; \ 113 } while (0) 101 114 102 115 /* Single character Unicode strings in the Latin-1 range are being 103 116 shared as well. */ 104 static PyUnicodeObject *unicode_latin1[256] ;117 static PyUnicodeObject *unicode_latin1[256] = {NULL}; 105 118 106 119 /* Default encoding to use and assume when NULL is passed as encoding … … 111 124 112 125 */ 113 static char unicode_default_encoding[100 ];126 static char unicode_default_encoding[100 + 1] = "ascii"; 114 127 115 128 /* Fast detection of the most frequent whitespace characters */ 116 129 const unsigned char _Py_ascii_whitespace[] = { 117 130 0, 0, 0, 0, 0, 0, 0, 0, 118 /* case 0x0009: * HORIZONTALTABULATION */131 /* case 0x0009: * CHARACTER TABULATION */ 119 132 /* case 0x000A: * LINE FEED */ 120 /* case 0x000B: * VERTICALTABULATION */133 /* case 0x000B: * LINE TABULATION */ 121 134 /* case 0x000C: * FORM FEED */ 122 135 /* case 0x000D: * CARRIAGE RETURN */ … … 148 161 0, 0, 0, 0, 0, 0, 0, 0, 149 162 /* 0x000A, * LINE FEED */ 163 /* 0x000B, * LINE TABULATION */ 164 /* 0x000C, * FORM FEED */ 150 165 /* 0x000D, * CARRIAGE RETURN */ 151 0, 0, 1, 0, 0, 1, 0, 0,166 0, 0, 1, 1, 1, 1, 0, 0, 152 167 0, 0, 0, 0, 0, 0, 0, 0, 153 168 /* 0x001C, * FILE SEPARATOR */ … … 191 206 /* the linebreak mask is set up by Unicode_Init below */ 192 207 208 #if LONG_BIT >= 128 209 #define BLOOM_WIDTH 128 210 #elif LONG_BIT >= 64 211 #define BLOOM_WIDTH 64 212 #elif LONG_BIT >= 32 213 #define BLOOM_WIDTH 32 214 #else 215 #error "LONG_BIT is smaller than 32" 216 #endif 217 193 218 #define BLOOM_MASK unsigned long 194 219 195 static BLOOM_MASK bloom_linebreak; 196 197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 221 222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 223 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 198 224 199 225 #define BLOOM_LINEBREAK(ch) \ … … 205 231 /* calculate simple bloom-style bitmask for a given unicode string */ 206 232 207 longmask;233 BLOOM_MASK mask; 208 234 Py_ssize_t i; 209 235 210 236 mask = 0; 211 237 for (i = 0; i < len; i++) 212 mask |= (1 << (ptr[i] & 0x1F));238 BLOOM_ADD(mask, ptr[i]); 213 239 214 240 return mask; … … 273 299 /* Reset the object caches */ 274 300 if (unicode->defenc) { 275 Py_DECREF(unicode->defenc); 276 unicode->defenc = NULL; 301 Py_CLEAR(unicode->defenc); 277 302 } 278 303 unicode->hash = -1; … … 282 307 283 308 /* We allocate one more byte to make sure the string is 284 Ux0000 terminated -- XXX is this needed ?309 Ux0000 terminated; some code relies on that. 285 310 286 311 XXX This allocator could further be enhanced by assuring that the … … 372 397 } 373 398 if (unicode->defenc) { 374 Py_DECREF(unicode->defenc); 375 unicode->defenc = NULL; 399 Py_CLEAR(unicode->defenc); 376 400 } 377 401 /* Add to free list */ … … 438 462 439 463 /* Optimization for empty strings */ 440 if (size == 0 && unicode_empty != NULL) { 441 Py_INCREF(unicode_empty); 442 return (PyObject *)unicode_empty; 443 } 464 if (size == 0) 465 _Py_RETURN_UNICODE_EMPTY(); 444 466 445 467 /* Single character Unicode objects in the Latin-1 range are … … 487 509 488 510 /* Optimization for empty strings */ 489 if (size == 0 && unicode_empty != NULL) { 490 Py_INCREF(unicode_empty); 491 return (PyObject *)unicode_empty; 492 } 511 if (size == 0) 512 _Py_RETURN_UNICODE_EMPTY(); 493 513 494 514 /* Single characters are shared when using this constructor. … … 528 548 } 529 549 550 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed 551 * by 'ptr', possibly combining surrogate pairs on narrow builds. 552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character 553 * that should be returned and 'end' pointing to the end of the buffer. 554 * ('end' is used on narrow builds to detect a lone surrogate at the 555 * end of the buffer that should be returned unchanged.) 556 * The ptr and end arguments should be side-effect free and ptr must an lvalue. 557 * The type of the returned char is always Py_UCS4. 558 * 559 * Note: the macro advances ptr to next char, so it might have side-effects 560 * (especially if used with other macros). 561 */ 562 563 /* helper macros used by _Py_UNICODE_NEXT */ 564 #define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) 565 #define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) 566 /* Join two surrogate characters and return a single Py_UCS4 value. */ 567 #define _Py_UNICODE_JOIN_SURROGATES(high, low) \ 568 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ 569 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) 570 571 #ifdef Py_UNICODE_WIDE 572 #define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ 573 #else 574 #define _Py_UNICODE_NEXT(ptr, end) \ 575 (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ 576 _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \ 577 ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ 578 (Py_UCS4)*(ptr)++) 579 #endif 580 530 581 #ifdef HAVE_WCHAR_H 582 583 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) 584 # define CONVERT_WCHAR_TO_SURROGATES 585 #endif 586 587 #ifdef CONVERT_WCHAR_TO_SURROGATES 588 589 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need 590 to convert from UTF32 to UTF16. */ 591 592 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 593 Py_ssize_t size) 594 { 595 PyUnicodeObject *unicode; 596 register Py_ssize_t i; 597 Py_ssize_t alloc; 598 const wchar_t *orig_w; 599 600 if (w == NULL) { 601 PyErr_BadInternalCall(); 602 return NULL; 603 } 604 605 alloc = size; 606 orig_w = w; 607 for (i = size; i > 0; i--) { 608 if (*w > 0xFFFF) 609 alloc++; 610 w++; 611 } 612 w = orig_w; 613 unicode = _PyUnicode_New(alloc); 614 if (!unicode) 615 return NULL; 616 617 /* Copy the wchar_t data into the new object */ 618 { 619 register Py_UNICODE *u; 620 u = PyUnicode_AS_UNICODE(unicode); 621 for (i = size; i > 0; i--) { 622 if (*w > 0xFFFF) { 623 wchar_t ordinal = *w++; 624 ordinal -= 0x10000; 625 *u++ = 0xD800 | (ordinal >> 10); 626 *u++ = 0xDC00 | (ordinal & 0x3FF); 627 } 628 else 629 *u++ = *w++; 630 } 631 } 632 return (PyObject *)unicode; 633 } 634 635 #else 531 636 532 637 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, … … 559 664 return (PyObject *)unicode; 560 665 } 666 667 #endif /* CONVERT_WCHAR_TO_SURROGATES */ 668 669 #undef CONVERT_WCHAR_TO_SURROGATES 561 670 562 671 static void … … 663 772 switch (*f) { 664 773 case 'c': 665 (void)va_arg(count, int); 774 { 775 int ordinal = va_arg(count, int); 776 #ifdef Py_UNICODE_WIDE 777 if (ordinal < 0 || ordinal > 0x10ffff) { 778 PyErr_SetString(PyExc_OverflowError, 779 "%c arg not in range(0x110000) " 780 "(wide Python build)"); 781 goto fail; 782 } 783 #else 784 if (ordinal < 0 || ordinal > 0xffff) { 785 PyErr_SetString(PyExc_OverflowError, 786 "%c arg not in range(0x10000) " 787 "(narrow Python build)"); 788 goto fail; 789 } 790 #endif 666 791 /* fall through... */ 792 } 667 793 case '%': 668 794 n++; … … 684 810 { 685 811 /* UTF-8 */ 686 unsigned char *s = va_arg(count, unsignedchar*);812 const char *s = va_arg(count, const char*); 687 813 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 688 814 if (!str) … … 1094 1220 1095 1221 /* Convert to Unicode */ 1096 if (len == 0) { 1097 Py_INCREF(unicode_empty); 1098 v = (PyObject *)unicode_empty; 1099 } 1100 else 1101 v = PyUnicode_Decode(s, len, encoding, errors); 1102 1222 if (len == 0) 1223 _Py_RETURN_UNICODE_EMPTY(); 1224 1225 v = PyUnicode_Decode(s, len, encoding, errors); 1103 1226 return v; 1104 1227 … … 1313 1436 strncpy(unicode_default_encoding, 1314 1437 encoding, 1315 sizeof(unicode_default_encoding) );1438 sizeof(unicode_default_encoding) - 1); 1316 1439 return 0; 1317 1440 … … 1411 1534 /* --- UTF-7 Codec -------------------------------------------------------- */ 1412 1535 1413 /* see RFC2152 for details */ 1536 /* See RFC2152 for details. We encode conservatively and decode liberally. */ 1537 1538 /* Three simple macros defining base-64. */ 1539 1540 /* Is c a base-64 character? */ 1541 1542 #define IS_BASE64(c) \ 1543 (isalnum(c) || (c) == '+' || (c) == '/') 1544 1545 /* given that c is a base-64 character, what is its base-64 value? */ 1546 1547 #define FROM_BASE64(c) \ 1548 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 1549 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 1550 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 1551 (c) == '+' ? 62 : 63) 1552 1553 /* What is the base-64 character of the bottom 6 bits of n? */ 1554 1555 #define TO_BASE64(n) \ 1556 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1557 1558 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 1559 * decoded as itself. We are permissive on decoding; the only ASCII 1560 * byte not decoding to itself is the + which begins a base64 1561 * string. */ 1562 1563 #define DECODE_DIRECT(c) \ 1564 ((c) <= 127 && (c) != '+') 1565 1566 /* The UTF-7 encoder treats ASCII characters differently according to 1567 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 1568 * the above). See RFC2152. This array identifies these different 1569 * sets: 1570 * 0 : "Set D" 1571 * alphanumeric and '(),-./:? 1572 * 1 : "Set O" 1573 * !"#$%&*;<=>@[]^_`{|} 1574 * 2 : "whitespace" 1575 * ht nl cr sp 1576 * 3 : special (must be base64 encoded) 1577 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 1578 */ 1414 1579 1415 1580 static 1416 char utf7_special[128] = { 1417 /* indicate whether a UTF-7 character is special i.e. cannot be directly 1418 encoded: 1419 0 - not special 1420 1 - special 1421 2 - whitespace (optional) 1422 3 - RFC2152 Set O (optional) */ 1423 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1424 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1425 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1426 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1427 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1428 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1429 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1430 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1431 1581 char utf7_category[128] = { 1582 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 1583 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 1584 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 1585 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1586 /* sp ! " # $ % & ' ( ) * + , - . / */ 1587 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 1588 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 1589 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1590 /* @ A B C D E F G H I J K L M N O */ 1591 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1592 /* P Q R S T U V W X Y Z [ \ ] ^ _ */ 1593 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 1594 /* ` a b c d e f g h i j k l m n o */ 1595 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1596 /* p q r s t u v w x y z { | } ~ del */ 1597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 1432 1598 }; 1433 1599 1434 /* Note: The comparison (c) <= 0 is a trick to work-around gcc 1435 warnings about the comparison always being false; since 1436 utf7_special[0] is 1, we can safely make that one comparison 1437 true */ 1438 1439 #define SPECIAL(c, encodeO, encodeWS) \ 1440 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1441 (encodeWS && (utf7_special[(c)] == 2)) || \ 1442 (encodeO && (utf7_special[(c)] == 3))) 1443 1444 #define B64(n) \ 1445 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1446 #define B64CHAR(c) \ 1447 (isalnum(c) || (c) == '+' || (c) == '/') 1448 #define UB64(c) \ 1449 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1450 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 1451 1452 #define ENCODE(out, ch, bits) \ 1453 while (bits >= 6) { \ 1454 *out++ = B64(ch >> (bits-6)); \ 1455 bits -= 6; \ 1456 } 1457 1458 #define DECODE(out, ch, bits, surrogate) \ 1459 while (bits >= 16) { \ 1460 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1461 bits -= 16; \ 1462 if (surrogate) { \ 1463 /* We have already generated an error for the high surrogate \ 1464 so let's not bother seeing if the low surrogate is correct or not */ \ 1465 surrogate = 0; \ 1466 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1467 /* This is a surrogate pair. Unfortunately we can't represent \ 1468 it in a 16-bit character */ \ 1469 surrogate = 1; \ 1470 errmsg = "code pairs are not supported"; \ 1471 goto utf7Error; \ 1472 } else { \ 1473 *out++ = outCh; \ 1474 } \ 1475 } 1600 /* ENCODE_DIRECT: this character should be encoded as itself. The 1601 * answer depends on whether we are encoding set O as itself, and also 1602 * on whether we are encoding whitespace as itself. RFC2152 makes it 1603 * clear that the answers to these questions vary between 1604 * applications, so this code needs to be flexible. */ 1605 1606 #define ENCODE_DIRECT(c, directO, directWS) \ 1607 ((c) < 128 && (c) > 0 && \ 1608 ((utf7_category[(c)] == 0) || \ 1609 (directWS && (utf7_category[(c)] == 2)) || \ 1610 (directO && (utf7_category[(c)] == 1)))) 1476 1611 1477 1612 PyObject *PyUnicode_DecodeUTF7(const char *s, … … 1481 1616 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 1482 1617 } 1618 1619 /* The decoder. The only state we preserve is our read position, 1620 * i.e. how many characters we have consumed. So if we end in the 1621 * middle of a shift sequence we have to back off the read position 1622 * and the output to the beginning of the sequence, otherwise we lose 1623 * all the shift state (seen bits, number of bits seen, high 1624 * surrogate). */ 1483 1625 1484 1626 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, … … 1496 1638 const char *errmsg = ""; 1497 1639 int inShift = 0; 1498 unsigned int bitsleft = 0; 1499 unsigned long charsleft = 0; 1500 int surrogate = 0; 1640 Py_UNICODE *shiftOutStart; 1641 unsigned int base64bits = 0; 1642 unsigned long base64buffer = 0; 1643 Py_UNICODE surrogate = 0; 1501 1644 PyObject *errorHandler = NULL; 1502 1645 PyObject *exc = NULL; … … 1512 1655 1513 1656 p = unicode->str; 1657 shiftOutStart = p; 1514 1658 e = s + size; 1515 1659 1516 1660 while (s < e) { 1517 Py_UNICODE ch; 1518 restart: 1519 ch = (unsigned char) *s; 1520 1521 if (inShift) { 1522 if ((ch == '-') || !B64CHAR(ch)) { 1661 Py_UNICODE ch = (unsigned char) *s; 1662 1663 if (inShift) { /* in a base-64 section */ 1664 if (IS_BASE64(ch)) { /* consume a base-64 character */ 1665 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 1666 base64bits += 6; 1667 s++; 1668 if (base64bits >= 16) { 1669 /* we have enough bits for a UTF-16 value */ 1670 Py_UNICODE outCh = (Py_UNICODE) 1671 (base64buffer >> (base64bits-16)); 1672 base64bits -= 16; 1673 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 1674 assert(outCh <= 0xffff); 1675 if (surrogate) { 1676 /* expecting a second surrogate */ 1677 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 1678 #ifdef Py_UNICODE_WIDE 1679 *p++ = (((surrogate & 0x3FF)<<10) 1680 | (outCh & 0x3FF)) + 0x10000; 1681 #else 1682 *p++ = surrogate; 1683 *p++ = outCh; 1684 #endif 1685 surrogate = 0; 1686 continue; 1687 } 1688 else { 1689 *p++ = surrogate; 1690 surrogate = 0; 1691 } 1692 } 1693 if (outCh >= 0xD800 && outCh <= 0xDBFF) { 1694 /* first surrogate */ 1695 surrogate = outCh; 1696 } 1697 else { 1698 *p++ = outCh; 1699 } 1700 } 1701 } 1702 else { /* now leaving a base-64 section */ 1523 1703 inShift = 0; 1524 1704 s++; 1525 1526 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1527 if (bitsleft >= 6) { 1528 /* The shift sequence has a partial character in it. If 1529 bitsleft < 6 then we could just classify it as padding 1530 but that is not the case here */ 1531 1532 errmsg = "partial character in shift sequence"; 1533 goto utf7Error; 1705 if (surrogate) { 1706 *p++ = surrogate; 1707 surrogate = 0; 1534 1708 } 1535 /* According to RFC2152 the remaining bits should be zero. We 1536 choose to signal an error/insert a replacement character 1537 here so indicate the potential of a misencoded character. */ 1538 1539 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1540 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1541 errmsg = "non-zero padding bits in shift sequence"; 1542 goto utf7Error; 1709 if (base64bits > 0) { /* left-over bits */ 1710 if (base64bits >= 6) { 1711 /* We've seen at least one base-64 character */ 1712 errmsg = "partial character in shift sequence"; 1713 goto utf7Error; 1714 } 1715 else { 1716 /* Some bits remain; they should be zero */ 1717 if (base64buffer != 0) { 1718 errmsg = "non-zero padding bits in shift sequence"; 1719 goto utf7Error; 1720 } 1721 } 1543 1722 } 1544 1545 if (ch == '-') { 1546 if ((s < e) && (*(s) == '-')) { 1547 *p++ = '-'; 1548 inShift = 1; 1549 } 1550 } else if (SPECIAL(ch,0,0)) { 1551 errmsg = "unexpected special character"; 1552 goto utf7Error; 1553 } else { 1723 if (ch != '-') { 1724 /* '-' is absorbed; other terminating 1725 characters are preserved */ 1554 1726 *p++ = ch; 1555 1727 } 1556 } else {1557 charsleft = (charsleft << 6) | UB64(ch);1558 bitsleft += 6;1559 s++;1560 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);1561 1728 } 1562 1729 } 1563 1730 else if ( ch == '+' ) { 1564 1731 startinpos = s-starts; 1565 s++; 1566 if (s < e && *s == '-') { 1732 s++; /* consume '+' */ 1733 if (s < e && *s == '-') { /* '+-' encodes '+' */ 1567 1734 s++; 1568 1735 *p++ = '+'; 1569 } else1570 {1736 } 1737 else { /* begin base64-encoded section */ 1571 1738 inShift = 1; 1572 bitsleft = 0; 1573 } 1574 } 1575 else if (SPECIAL(ch,0,0)) { 1576 startinpos = s-starts; 1577 errmsg = "unexpected special character"; 1578 s++; 1579 goto utf7Error; 1580 } 1581 else { 1739 shiftOutStart = p; 1740 base64bits = 0; 1741 base64buffer = 0; 1742 } 1743 } 1744 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 1582 1745 *p++ = ch; 1583 1746 s++; 1584 1747 } 1748 else { 1749 startinpos = s-starts; 1750 s++; 1751 errmsg = "unexpected special character"; 1752 goto utf7Error; 1753 } 1585 1754 continue; 1586 1755 utf7Error: 1587 1756 outpos = p-PyUnicode_AS_UNICODE(unicode); 1588 1757 endinpos = s-starts; … … 1595 1764 } 1596 1765 1597 if (inShift && !consumed) { 1598 outpos = p-PyUnicode_AS_UNICODE(unicode); 1599 endinpos = size; 1600 if (unicode_decode_call_errorhandler( 1601 errors, &errorHandler, 1602 "utf7", "unterminated shift sequence", 1603 starts, size, &startinpos, &endinpos, &exc, &s, 1604 &unicode, &outpos, &p)) 1605 goto onError; 1606 if (s < e) 1607 goto restart; 1608 } 1766 /* end of string */ 1767 1768 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 1769 /* if we're in an inconsistent state, that's an error */ 1770 if (surrogate || 1771 (base64bits >= 6) || 1772 (base64bits > 0 && base64buffer != 0)) { 1773 outpos = p-PyUnicode_AS_UNICODE(unicode); 1774 endinpos = size; 1775 if (unicode_decode_call_errorhandler( 1776 errors, &errorHandler, 1777 "utf7", "unterminated shift sequence", 1778 starts, size, &startinpos, &endinpos, &exc, &s, 1779 &unicode, &outpos, &p)) 1780 goto onError; 1781 } 1782 } 1783 1784 /* return state */ 1609 1785 if (consumed) { 1610 if(inShift) 1786 if (inShift) { 1787 p = shiftOutStart; /* back off output */ 1611 1788 *consumed = startinpos; 1612 else 1789 } 1790 else { 1613 1791 *consumed = s-starts; 1792 } 1614 1793 } 1615 1794 … … 1631 1810 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1632 1811 Py_ssize_t size, 1633 int encodeSetO,1634 int encodeWhiteSpace,1812 int base64SetO, 1813 int base64WhiteSpace, 1635 1814 const char *errors) 1636 1815 { 1637 1816 PyObject *v; 1638 1817 /* It might be possible to tighten this worst case */ 1639 Py_ssize_t cbAllocated = 5* size;1818 Py_ssize_t allocated = 8 * size; 1640 1819 int inShift = 0; 1641 1820 Py_ssize_t i = 0; 1642 unsigned int b itsleft= 0;1643 unsigned long charsleft= 0;1821 unsigned int base64bits = 0; 1822 unsigned long base64buffer = 0; 1644 1823 char * out; 1645 1824 char * start; 1646 1825 1647 if ( cbAllocated / 5!= size)1826 if (allocated / 8 != size) 1648 1827 return PyErr_NoMemory(); 1649 1828 … … 1651 1830 return PyString_FromStringAndSize(NULL, 0); 1652 1831 1653 v = PyString_FromStringAndSize(NULL, cbAllocated);1832 v = PyString_FromStringAndSize(NULL, allocated); 1654 1833 if (v == NULL) 1655 1834 return NULL; … … 1659 1838 Py_UNICODE ch = s[i]; 1660 1839 1661 if (!inShift) { 1840 if (inShift) { 1841 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1842 /* shifting out */ 1843 if (base64bits) { /* output remaining bits */ 1844 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 1845 base64buffer = 0; 1846 base64bits = 0; 1847 } 1848 inShift = 0; 1849 /* Characters not in the BASE64 set implicitly unshift the sequence 1850 so no '-' is required, except if the character is itself a '-' */ 1851 if (IS_BASE64(ch) || ch == '-') { 1852 *out++ = '-'; 1853 } 1854 *out++ = (char) ch; 1855 } 1856 else { 1857 goto encode_char; 1858 } 1859 } 1860 else { /* not in a shift sequence */ 1662 1861 if (ch == '+') { 1663 1862 *out++ = '+'; 1664 *out++ = '-'; 1665 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1666 charsleft = ch; 1667 bitsleft = 16; 1863 *out++ = '-'; 1864 } 1865 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 1866 *out++ = (char) ch; 1867 } 1868 else { 1668 1869 *out++ = '+'; 1669 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1670 inShift = bitsleft > 0; 1671 } else { 1672 *out++ = (char) ch; 1673 } 1674 } else { 1675 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1676 *out++ = B64(charsleft << (6-bitsleft)); 1677 charsleft = 0; 1678 bitsleft = 0; 1679 /* Characters not in the BASE64 set implicitly unshift the sequence 1680 so no '-' is required, except if the character is itself a '-' */ 1681 if (B64CHAR(ch) || ch == '-') { 1682 *out++ = '-'; 1683 } 1684 inShift = 0; 1685 *out++ = (char) ch; 1686 } else { 1687 bitsleft += 16; 1688 charsleft = (charsleft << 16) | ch; 1689 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1690 1691 /* If the next character is special then we don't need to terminate 1692 the shift sequence. If the next character is not a BASE64 character 1693 or '-' then the shift sequence will be terminated implicitly and we 1694 don't have to insert a '-'. */ 1695 1696 if (bitsleft == 0) { 1697 if (i + 1 < size) { 1698 Py_UNICODE ch2 = s[i+1]; 1699 1700 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1701 1702 } else if (B64CHAR(ch2) || ch2 == '-') { 1703 *out++ = '-'; 1704 inShift = 0; 1705 } else { 1706 inShift = 0; 1707 } 1708 1709 } 1710 else { 1711 *out++ = '-'; 1712 inShift = 0; 1713 } 1714 } 1715 } 1716 } 1717 } 1718 if (bitsleft) { 1719 *out++= B64(charsleft << (6-bitsleft) ); 1870 inShift = 1; 1871 goto encode_char; 1872 } 1873 } 1874 continue; 1875 encode_char: 1876 #ifdef Py_UNICODE_WIDE 1877 if (ch >= 0x10000) { 1878 /* code first surrogate */ 1879 base64bits += 16; 1880 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 1881 while (base64bits >= 6) { 1882 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1883 base64bits -= 6; 1884 } 1885 /* prepare second surrogate */ 1886 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 1887 } 1888 #endif 1889 base64bits += 16; 1890 base64buffer = (base64buffer << 16) | ch; 1891 while (base64bits >= 6) { 1892 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 1893 base64bits -= 6; 1894 } 1895 } 1896 if (base64bits) 1897 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 1898 if (inShift) 1720 1899 *out++ = '-'; 1721 } 1722 1723 _PyString_Resize(&v, out - start);1900 1901 if (_PyString_Resize(&v, out - start)) 1902 return NULL; 1724 1903 return v; 1725 1904 } 1726 1905 1727 #undef SPECIAL 1728 #undef B64 1729 #undef B64CHAR 1730 #undef UB64 1731 #undef ENCODE 1732 #undef DECODE 1906 #undef IS_BASE64 1907 #undef FROM_BASE64 1908 #undef TO_BASE64 1909 #undef DECODE_DIRECT 1910 #undef ENCODE_DIRECT 1733 1911 1734 1912 /* --- UTF-8 Codec -------------------------------------------------------- */ … … 1736 1914 static 1737 1915 char utf8_code_length[256] = { 1738 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1739 illegal prefix. see RFC 2279 for details */ 1916 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 1917 illegal prefix. See RFC 3629 for details */ 1918 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 1740 1919 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1741 1920 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, … … 1744 1923 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1745 1924 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1746 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1747 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1925 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 1926 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 1748 1927 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1749 1928 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1750 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1751 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1752 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1753 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1754 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1755 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1929 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 1930 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 1931 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 1932 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 1933 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 1756 1934 }; 1757 1935 … … 1770 1948 const char *starts = s; 1771 1949 int n; 1950 int k; 1772 1951 Py_ssize_t startinpos; 1773 1952 Py_ssize_t endinpos; … … 1812 1991 errmsg = "unexpected end of data"; 1813 1992 startinpos = s-starts; 1814 endinpos = size; 1993 endinpos = startinpos+1; 1994 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 1995 endinpos++; 1815 1996 goto utf8Error; 1816 1997 } … … 1820 2001 1821 2002 case 0: 1822 errmsg = " unexpected codebyte";2003 errmsg = "invalid start byte"; 1823 2004 startinpos = s-starts; 1824 2005 endinpos = startinpos+1; … … 1833 2014 case 2: 1834 2015 if ((s[1] & 0xc0) != 0x80) { 1835 errmsg = "invalid data";2016 errmsg = "invalid continuation byte"; 1836 2017 startinpos = s-starts; 1837 endinpos = startinpos +2;2018 endinpos = startinpos + 1; 1838 2019 goto utf8Error; 1839 2020 } 1840 2021 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1841 if (ch < 0x80) { 2022 assert ((ch > 0x007F) && (ch <= 0x07FF)); 2023 *p++ = (Py_UNICODE)ch; 2024 break; 2025 2026 case 3: 2027 /* XXX: surrogates shouldn't be valid UTF-8! 2028 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 2029 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 2030 Uncomment the 2 lines below to make them invalid, 2031 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */ 2032 if ((s[1] & 0xc0) != 0x80 || 2033 (s[2] & 0xc0) != 0x80 || 2034 ((unsigned char)s[0] == 0xE0 && 2035 (unsigned char)s[1] < 0xA0)/* || 2036 ((unsigned char)s[0] == 0xED && 2037 (unsigned char)s[1] > 0x9F)*/) { 2038 errmsg = "invalid continuation byte"; 1842 2039 startinpos = s-starts; 1843 endinpos = startinpos+2; 1844 errmsg = "illegal encoding"; 2040 endinpos = startinpos + 1; 2041 2042 /* if s[1] first two bits are 1 and 0, then the invalid 2043 continuation byte is s[2], so increment endinpos by 1, 2044 if not, s[1] is invalid and endinpos doesn't need to 2045 be incremented. */ 2046 if ((s[1] & 0xC0) == 0x80) 2047 endinpos++; 1845 2048 goto utf8Error; 1846 2049 } 1847 else1848 *p++ = (Py_UNICODE)ch;1849 break;1850 1851 case 3:1852 if ((s[1] & 0xc0) != 0x80 ||1853 (s[2] & 0xc0) != 0x80) {1854 errmsg = "invalid data";1855 startinpos = s-starts;1856 endinpos = startinpos+3;1857 goto utf8Error;1858 }1859 2050 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1860 if (ch < 0x0800) { 1861 /* Note: UTF-8 encodings of surrogates are considered 1862 legal UTF-8 sequences; 1863 1864 XXX For wide builds (UCS-4) we should probably try 1865 to recombine the surrogates into a single code 1866 unit. 1867 */ 1868 errmsg = "illegal encoding"; 1869 startinpos = s-starts; 1870 endinpos = startinpos+3; 1871 goto utf8Error; 1872 } 1873 else 1874 *p++ = (Py_UNICODE)ch; 2051 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 2052 *p++ = (Py_UNICODE)ch; 1875 2053 break; 1876 2054 … … 1878 2056 if ((s[1] & 0xc0) != 0x80 || 1879 2057 (s[2] & 0xc0) != 0x80 || 1880 (s[3] & 0xc0) != 0x80) { 1881 errmsg = "invalid data"; 2058 (s[3] & 0xc0) != 0x80 || 2059 ((unsigned char)s[0] == 0xF0 && 2060 (unsigned char)s[1] < 0x90) || 2061 ((unsigned char)s[0] == 0xF4 && 2062 (unsigned char)s[1] > 0x8F)) { 2063 errmsg = "invalid continuation byte"; 1882 2064 startinpos = s-starts; 1883 endinpos = startinpos+4; 2065 endinpos = startinpos + 1; 2066 if ((s[1] & 0xC0) == 0x80) { 2067 endinpos++; 2068 if ((s[2] & 0xC0) == 0x80) 2069 endinpos++; 2070 } 1884 2071 goto utf8Error; 1885 2072 } 1886 2073 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1887 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1888 /* validate and convert to UTF-16 */ 1889 if ((ch < 0x10000) /* minimum value allowed for 4 1890 byte encoding */ 1891 || (ch > 0x10ffff)) /* maximum value allowed for 1892 UTF-16 */ 1893 { 1894 errmsg = "illegal encoding"; 1895 startinpos = s-starts; 1896 endinpos = startinpos+4; 1897 goto utf8Error; 1898 } 2074 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 2075 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 2076 1899 2077 #ifdef Py_UNICODE_WIDE 1900 2078 *p++ = (Py_UNICODE)ch; … … 1912 2090 #endif 1913 2091 break; 1914 1915 default:1916 /* Other sizes are only needed for UCS-4 */1917 errmsg = "unsupported Unicode code range";1918 startinpos = s-starts;1919 endinpos = startinpos+n;1920 goto utf8Error;1921 2092 } 1922 2093 s += n; … … 2043 2214 nneeded = p - PyString_AS_STRING(v); 2044 2215 assert(nneeded <= nallocated); 2045 _PyString_Resize(&v, nneeded); 2216 if (_PyString_Resize(&v, nneeded)) 2217 return NULL; 2046 2218 } 2047 2219 return v; … … 2086 2258 Py_UNICODE *p; 2087 2259 #ifndef Py_UNICODE_WIDE 2088 int i, pairs; 2260 int pairs = 0; 2261 const unsigned char *qq; 2089 2262 #else 2090 2263 const int pairs = 0; … … 2101 2274 PyObject *errorHandler = NULL; 2102 2275 PyObject *exc = NULL; 2103 /* On narrow builds we split characters outside the BMP into two 2104 codepoints => count how much extra space we need. */ 2105 #ifndef Py_UNICODE_WIDE 2106 for (i = pairs = 0; i < size/4; i++) 2107 if (((Py_UCS4 *)s)[i] >= 0x10000) 2108 pairs++; 2109 #endif 2110 2111 /* This might be one to much, because of a BOM */ 2112 unicode = _PyUnicode_New((size+3)/4+pairs); 2113 if (!unicode) 2114 return NULL; 2115 if (size == 0) 2116 return (PyObject *)unicode; 2117 2118 /* Unpack UTF-32 encoded data */ 2119 p = unicode->str; 2276 2120 2277 q = (unsigned char *)s; 2121 2278 e = q + size; … … 2168 2325 iorder[3] = 0; 2169 2326 } 2327 2328 /* On narrow builds we split characters outside the BMP into two 2329 codepoints => count how much extra space we need. */ 2330 #ifndef Py_UNICODE_WIDE 2331 for (qq = q; e - qq >= 4; qq += 4) 2332 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 2333 pairs++; 2334 #endif 2335 2336 /* This might be one to much, because of a BOM */ 2337 unicode = _PyUnicode_New((size+3)/4+pairs); 2338 if (!unicode) 2339 return NULL; 2340 if (size == 0) 2341 return (PyObject *)unicode; 2342 2343 /* Unpack UTF-32 encoded data */ 2344 p = unicode->str; 2170 2345 2171 2346 while (q < e) { … … 2446 2621 2447 2622 /* UTF-16 code pair: */ 2448 if (q >= e) { 2623 if (e - q < 2) { 2624 q -= 2; 2625 if (consumed) 2626 break; 2449 2627 errmsg = "unexpected end of data"; 2450 startinpos = (( (const char *)q)-2)-starts;2628 startinpos = ((const char *)q)-starts; 2451 2629 endinpos = ((const char *)e)-starts; 2452 2630 goto utf16Error; … … 2611 2789 Py_ssize_t endinpos; 2612 2790 Py_ssize_t outpos; 2613 int i;2614 2791 PyUnicodeObject *v; 2615 2792 Py_UNICODE *p; … … 2697 2874 hexescape: 2698 2875 chr = 0; 2699 outpos = p-PyUnicode_AS_UNICODE(v); 2700 if (s+digits>end) { 2701 endinpos = size; 2702 if (unicode_decode_call_errorhandler( 2703 errors, &errorHandler, 2704 "unicodeescape", "end of string in escape sequence", 2705 starts, size, &startinpos, &endinpos, &exc, &s, 2706 &v, &outpos, &p)) 2707 goto onError; 2708 goto nextByte; 2709 } 2710 for (i = 0; i < digits; ++i) { 2711 c = (unsigned char) s[i]; 2712 if (!isxdigit(c)) { 2713 endinpos = (s+i+1)-starts; 2714 if (unicode_decode_call_errorhandler( 2715 errors, &errorHandler, 2716 "unicodeescape", message, 2717 starts, size, &startinpos, &endinpos, &exc, &s, 2718 &v, &outpos, &p)) 2719 goto onError; 2720 goto nextByte; 2876 if (end - s < digits) { 2877 /* count only hex digits */ 2878 for (; s < end; ++s) { 2879 c = (unsigned char)*s; 2880 if (!Py_ISXDIGIT(c)) 2881 goto error; 2721 2882 } 2883 goto error; 2884 } 2885 for (; digits--; ++s) { 2886 c = (unsigned char)*s; 2887 if (!Py_ISXDIGIT(c)) 2888 goto error; 2722 2889 chr = (chr<<4) & ~0xF; 2723 2890 if (c >= '0' && c <= '9') … … 2728 2895 chr += 10 + c - 'A'; 2729 2896 } 2730 s += i;2731 2897 if (chr == 0xffffffff && PyErr_Occurred()) 2732 2898 /* _decoding_error will have already written into the … … 2749 2915 #endif 2750 2916 } else { 2751 endinpos = s-starts; 2752 outpos = p-PyUnicode_AS_UNICODE(v); 2753 if (unicode_decode_call_errorhandler( 2754 errors, &errorHandler, 2755 "unicodeescape", "illegal Unicode character", 2756 starts, size, &startinpos, &endinpos, &exc, &s, 2757 &v, &outpos, &p)) 2758 goto onError; 2917 message = "illegal Unicode character"; 2918 goto error; 2759 2919 } 2760 2920 break; … … 2765 2925 if (ucnhash_CAPI == NULL) { 2766 2926 /* load the unicode data module */ 2767 PyObject *m, *api; 2768 m = PyImport_ImportModuleNoBlock("unicodedata"); 2769 if (m == NULL) 2770 goto ucnhashError; 2771 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2772 Py_DECREF(m); 2773 if (api == NULL) 2774 goto ucnhashError; 2775 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2776 Py_DECREF(api); 2927 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1); 2777 2928 if (ucnhash_CAPI == NULL) 2778 2929 goto ucnhashError; … … 2787 2938 message = "unknown Unicode character name"; 2788 2939 s++; 2789 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2940 if (s - start - 1 <= INT_MAX && 2941 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2790 2942 goto store; 2791 2943 } 2792 2944 } 2793 endinpos = s-starts; 2794 outpos = p-PyUnicode_AS_UNICODE(v); 2795 if (unicode_decode_call_errorhandler( 2796 errors, &errorHandler, 2797 "unicodeescape", message, 2798 starts, size, &startinpos, &endinpos, &exc, &s, 2799 &v, &outpos, &p)) 2800 goto onError; 2801 break; 2945 goto error; 2802 2946 2803 2947 default: … … 2805 2949 message = "\\ at end of string"; 2806 2950 s--; 2807 endinpos = s-starts; 2808 outpos = p-PyUnicode_AS_UNICODE(v); 2809 if (unicode_decode_call_errorhandler( 2810 errors, &errorHandler, 2811 "unicodeescape", message, 2812 starts, size, &startinpos, &endinpos, &exc, &s, 2813 &v, &outpos, &p)) 2814 goto onError; 2951 goto error; 2815 2952 } 2816 2953 else { … … 2820 2957 break; 2821 2958 } 2822 nextByte: 2823 ; 2959 continue; 2960 2961 error: 2962 endinpos = s-starts; 2963 outpos = p-PyUnicode_AS_UNICODE(v); 2964 if (unicode_decode_call_errorhandler( 2965 errors, &errorHandler, 2966 "unicodeescape", message, 2967 starts, size, &startinpos, &endinpos, &exc, &s, 2968 &v, &outpos, &p)) 2969 goto onError; 2970 continue; 2824 2971 } 2825 2972 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) … … 3012 3159 3013 3160 *p = '\0'; 3014 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 3161 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 3162 return NULL; 3015 3163 return repr; 3016 3164 } … … 3233 3381 } 3234 3382 *p = '\0'; 3235 _PyString_Resize(&repr, p - q); 3383 if (_PyString_Resize(&repr, p - q)) 3384 return NULL; 3236 3385 return repr; 3237 3386 } … … 3278 3427 3279 3428 while (s < end) { 3429 if (end-s < Py_UNICODE_SIZE) { 3430 endinpos = end-starts; 3431 reason = "truncated input"; 3432 goto error; 3433 } 3280 3434 memcpy(p, s, sizeof(Py_UNICODE)); 3435 #ifdef Py_UNICODE_WIDE 3281 3436 /* We have to sanity check the raw data, otherwise doom looms for 3282 3437 some malformed UCS-4 data. */ 3283 if ( 3284 #ifdef Py_UNICODE_WIDE 3285 *p > unimax || *p < 0 || 3438 if (*p > unimax || *p < 0) { 3439 endinpos = s - starts + Py_UNICODE_SIZE; 3440 reason = "illegal code point (> 0x10FFFF)"; 3441 goto error; 3442 } 3286 3443 #endif 3287 end-s < Py_UNICODE_SIZE 3288 ) 3289 { 3290 startinpos = s - starts; 3291 if (end-s < Py_UNICODE_SIZE) { 3292 endinpos = end-starts; 3293 reason = "truncated input"; 3294 } 3295 else { 3296 endinpos = s - starts + Py_UNICODE_SIZE; 3297 reason = "illegal code point (> 0x10FFFF)"; 3298 } 3299 outpos = p - PyUnicode_AS_UNICODE(v); 3300 if (unicode_decode_call_errorhandler( 3301 errors, &errorHandler, 3302 "unicode_internal", reason, 3303 starts, size, &startinpos, &endinpos, &exc, &s, 3304 &v, &outpos, &p)) { 3305 goto onError; 3306 } 3307 } 3308 else { 3309 p++; 3310 s += Py_UNICODE_SIZE; 3444 p++; 3445 s += Py_UNICODE_SIZE; 3446 continue; 3447 3448 error: 3449 startinpos = s - starts; 3450 outpos = p - PyUnicode_AS_UNICODE(v); 3451 if (unicode_decode_call_errorhandler( 3452 errors, &errorHandler, 3453 "unicode_internal", reason, 3454 starts, size, &startinpos, &endinpos, &exc, &s, 3455 &v, &outpos, &p)) { 3456 goto onError; 3311 3457 } 3312 3458 } … … 3530 3676 respos = str-PyString_AS_STRING(res); 3531 3677 /* determine replacement size (temporarily (mis)uses p) */ 3532 for (p = collstart, repsize = 0; p < collend; ++p) { 3533 if (*p<10) 3678 for (p = collstart, repsize = 0; p < collend;) { 3679 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 3680 if (ch < 10) 3534 3681 repsize += 2+1+1; 3535 else if ( *p<100)3682 else if (ch < 100) 3536 3683 repsize += 2+2+1; 3537 else if ( *p<1000)3684 else if (ch < 1000) 3538 3685 repsize += 2+3+1; 3539 else if ( *p<10000)3686 else if (ch < 10000) 3540 3687 repsize += 2+4+1; 3541 #ifndef Py_UNICODE_WIDE 3542 else 3688 else if (ch < 100000) 3543 3689 repsize += 2+5+1; 3544 #else 3545 else if (*p<100000) 3546 repsize += 2+5+1; 3547 else if (*p<1000000) 3690 else if (ch < 1000000) 3548 3691 repsize += 2+6+1; 3549 3692 else 3550 3693 repsize += 2+7+1; 3551 #endif3552 3694 } 3553 3695 requiredsize = respos+repsize+(endp-collend); … … 3561 3703 } 3562 3704 /* generate replacement (temporarily (mis)uses p) */ 3563 for (p = collstart; p < collend; ++p) { 3564 str += sprintf(str, "&#%d;", (int)*p); 3705 for (p = collstart; p < collend;) { 3706 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 3707 str += sprintf(str, "&#%d;", (int)ch); 3565 3708 } 3566 3709 p = collend; … … 3572 3715 if (repunicode == NULL) 3573 3716 goto onError; 3574 /* need more space? (at least enough for what we 3575 have+the replacement+the rest of the string, so3576 we won't have tocheck space for encodable characters) */3717 /* need more space? (at least enough for what we have+the 3718 replacement+the rest of the string, so we won't have to 3719 check space for encodable characters) */ 3577 3720 respos = str-PyString_AS_STRING(res); 3578 3721 repsize = PyUnicode_GET_SIZE(repunicode); … … 4007 4150 /* No mapping found means: mapping is undefined. */ 4008 4151 PyErr_Clear(); 4009 x = Py_None; 4010 Py_INCREF(x); 4152 goto Undefined; 4011 4153 } else 4012 4154 goto onError; … … 4014 4156 4015 4157 /* Apply mapping */ 4158 if (x == Py_None) 4159 goto Undefined; 4016 4160 if (PyInt_Check(x)) { 4017 4161 long value = PyInt_AS_LONG(x); 4018 if (value < 0 || value > 65535) { 4162 if (value == 0xFFFE) 4163 goto Undefined; 4164 if (value < 0 || value > 0x10FFFF) { 4019 4165 PyErr_SetString(PyExc_TypeError, 4020 "character mapping must be in range( 65536)");4166 "character mapping must be in range(0x110000)"); 4021 4167 Py_DECREF(x); 4022 4168 goto onError; 4023 4169 } 4170 4171 #ifndef Py_UNICODE_WIDE 4172 if (value > 0xFFFF) { 4173 /* see the code for 1-n mapping below */ 4174 if (extrachars < 2) { 4175 /* resize first */ 4176 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 4177 Py_ssize_t needed = 10 - extrachars; 4178 extrachars += needed; 4179 /* XXX overflow detection missing */ 4180 if (_PyUnicode_Resize(&v, 4181 PyUnicode_GET_SIZE(v) + needed) < 0) { 4182 Py_DECREF(x); 4183 goto onError; 4184 } 4185 p = PyUnicode_AS_UNICODE(v) + oldpos; 4186 } 4187 value -= 0x10000; 4188 *p++ = 0xD800 | (value >> 10); 4189 *p++ = 0xDC00 | (value & 0x3FF); 4190 extrachars -= 2; 4191 } 4192 else 4193 #endif 4024 4194 *p++ = (Py_UNICODE)value; 4025 }4026 else if (x == Py_None) {4027 /* undefined mapping */4028 outpos = p-PyUnicode_AS_UNICODE(v);4029 startinpos = s-starts;4030 endinpos = startinpos+1;4031 if (unicode_decode_call_errorhandler(4032 errors, &errorHandler,4033 "charmap", "character maps to <undefined>",4034 starts, size, &startinpos, &endinpos, &exc, &s,4035 &v, &outpos, &p)) {4036 Py_DECREF(x);4037 goto onError;4038 }4039 Py_DECREF(x);4040 continue;4041 4195 } 4042 4196 else if (PyUnicode_Check(x)) { 4043 4197 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 4044 4198 4045 if (targetsize == 1) 4199 if (targetsize == 1) { 4046 4200 /* 1-1 mapping */ 4047 *p++ = *PyUnicode_AS_UNICODE(x); 4048 4201 Py_UNICODE value = *PyUnicode_AS_UNICODE(x); 4202 if (value == 0xFFFE) 4203 goto Undefined; 4204 *p++ = value; 4205 } 4049 4206 else if (targetsize > 1) { 4050 4207 /* 1-n mapping */ … … 4080 4237 Py_DECREF(x); 4081 4238 ++s; 4239 continue; 4240 Undefined: 4241 /* undefined mapping */ 4242 Py_XDECREF(x); 4243 outpos = p-PyUnicode_AS_UNICODE(v); 4244 startinpos = s-starts; 4245 endinpos = startinpos+1; 4246 if (unicode_decode_call_errorhandler( 4247 errors, &errorHandler, 4248 "charmap", "character maps to <undefined>", 4249 starts, size, &startinpos, &endinpos, &exc, &s, 4250 &v, &outpos, &p)) { 4251 goto onError; 4252 } 4082 4253 } 4083 4254 } … … 4225 4396 return NULL; 4226 4397 for (i = 0; i < 256; i++) { 4227 key =value = NULL;4398 value = NULL; 4228 4399 key = PyInt_FromLong(decode[i]); 4229 4400 value = PyInt_FromLong(i); … … 4509 4680 break; 4510 4681 case 4: /* xmlcharrefreplace */ 4511 /* generate replacement (temporarily (mis)uses p)*/4512 for (collpos = collstartpos; collpos < collendpos; ++collpos) {4682 /* generate replacement */ 4683 for (collpos = collstartpos; collpos < collendpos;) { 4513 4684 char buffer[2+29+1+1]; 4514 4685 char *cp; 4515 sprintf(buffer, "&#%d;", (int)p[collpos]); 4686 Py_UCS4 ch = p[collpos++]; 4687 #ifndef Py_UNICODE_WIDE 4688 if ((0xD800 <= ch && ch <= 0xDBFF) && 4689 (collpos < collendpos) && 4690 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) { 4691 ch = ((((ch & 0x03FF) << 10) | 4692 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000); 4693 } 4694 #endif 4695 sprintf(buffer, "&#%d;", (int)ch); 4516 4696 for (cp = buffer; *cp; ++cp) { 4517 4697 x = charmapencode_output(*cp, mapping, res, respos); … … 4928 5108 case 4: /* xmlcharrefreplace */ 4929 5109 /* generate replacement (temporarily (mis)uses p) */ 4930 for (p = collstart; p < collend; ++p) {5110 for (p = collstart; p < collend;) { 4931 5111 char buffer[2+29+1+1]; 4932 5112 char *cp; 4933 sprintf(buffer, "&#%d;", (int)*p); 5113 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 5114 sprintf(buffer, "&#%d;", (int)ch); 4934 5115 if (charmaptranslate_makespace(&res, &str, 4935 5116 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) … … 5049 5230 /* All other characters are considered unencodable */ 5050 5231 collstart = p; 5051 collend = p+1; 5052 while (collend < end) { 5232 for (collend = p+1; collend < end; collend++) { 5053 5233 if ((0 < *collend && *collend < 256) || 5054 !Py_UNICODE_ISSPACE(*collend) ||5055 Py_UNICODE_TODECIMAL(*collend))5234 Py_UNICODE_ISSPACE(*collend) || 5235 0 <= Py_UNICODE_TODECIMAL(*collend)) 5056 5236 break; 5057 5237 } … … 5083 5263 case 4: /* xmlcharrefreplace */ 5084 5264 /* generate replacement (temporarily (mis)uses p) */ 5085 for (p = collstart; p < collend; ++p) 5086 output += sprintf(output, "&#%d;", (int)*p); 5265 for (p = collstart; p < collend;) { 5266 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); 5267 output += sprintf(output, "&#%d;", ch); 5268 } 5087 5269 p = collend; 5088 5270 break; … … 5132 5314 5133 5315 #include "stringlib/unicodedefs.h" 5134 5135 #define FROM_UNICODE5136 5137 5316 #include "stringlib/fastsearch.h" 5138 5317 … … 5140 5319 #include "stringlib/find.h" 5141 5320 #include "stringlib/partition.h" 5321 #include "stringlib/split.h" 5142 5322 5143 5323 /* helper macro to fixup start/end slice values */ 5144 #define FIX_START_END(obj) \ 5145 if (start < 0) \ 5146 start += (obj)->length; \ 5147 if (start < 0) \ 5148 start = 0; \ 5149 if (end > (obj)->length) \ 5150 end = (obj)->length; \ 5151 if (end < 0) \ 5152 end += (obj)->length; \ 5153 if (end < 0) \ 5154 end = 0; 5324 #define ADJUST_INDICES(start, end, len) \ 5325 if (end > len) \ 5326 end = len; \ 5327 else if (end < 0) { \ 5328 end += len; \ 5329 if (end < 0) \ 5330 end = 0; \ 5331 } \ 5332 if (start < 0) { \ 5333 start += len; \ 5334 if (start < 0) \ 5335 start = 0; \ 5336 } 5155 5337 5156 5338 Py_ssize_t PyUnicode_Count(PyObject *str, … … 5172 5354 } 5173 5355 5174 FIX_START_END(str_obj); 5175 5356 ADJUST_INDICES(start, end, str_obj->length); 5176 5357 result = stringlib_count( 5177 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 5358 str_obj->str + start, end - start, sub_obj->str, sub_obj->length, 5359 PY_SSIZE_T_MAX 5178 5360 ); 5179 5361 … … 5230 5412 return 1; 5231 5413 5232 FIX_START_END(self); 5233 5414 ADJUST_INDICES(start, end, self->length); 5234 5415 end -= substring->length; 5235 5416 if (end < start) … … 5371 5552 if (len == 0) 5372 5553 return 0; 5373 if ( Py_UNICODE_ISLOWER(*s)) {5554 if (!Py_UNICODE_ISUPPER(*s)) { 5374 5555 *s = Py_UNICODE_TOUPPER(*s); 5375 5556 status = 1; … … 5377 5558 s++; 5378 5559 while (--len > 0) { 5379 if ( Py_UNICODE_ISUPPER(*s)) {5560 if (!Py_UNICODE_ISLOWER(*s)) { 5380 5561 *s = Py_UNICODE_TOLOWER(*s); 5381 5562 status = 1; … … 5608 5789 } 5609 5790 5610 #define SPLIT_APPEND(data, left, right) \ 5611 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5612 if (!str) \ 5613 goto onError; \ 5614 if (PyList_Append(list, str)) { \ 5615 Py_DECREF(str); \ 5616 goto onError; \ 5617 } \ 5618 else \ 5619 Py_DECREF(str); 5620 5621 static 5622 PyObject *split_whitespace(PyUnicodeObject *self, 5623 PyObject *list, 5624 Py_ssize_t maxcount) 5625 { 5626 register Py_ssize_t i; 5627 register Py_ssize_t j; 5628 Py_ssize_t len = self->length; 5629 PyObject *str; 5630 register const Py_UNICODE *buf = self->str; 5631 5632 for (i = j = 0; i < len; ) { 5633 /* find a token */ 5634 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5635 i++; 5636 j = i; 5637 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) 5638 i++; 5639 if (j < i) { 5640 if (maxcount-- <= 0) 5641 break; 5642 SPLIT_APPEND(buf, j, i); 5643 while (i < len && Py_UNICODE_ISSPACE(buf[i])) 5644 i++; 5645 j = i; 5646 } 5647 } 5648 if (j < len) { 5649 SPLIT_APPEND(buf, j, len); 5650 } 5651 return list; 5652 5653 onError: 5654 Py_DECREF(list); 5655 return NULL; 5656 } 5657 5658 PyObject *PyUnicode_Splitlines(PyObject *string, 5659 int keepends) 5660 { 5661 register Py_ssize_t i; 5662 register Py_ssize_t j; 5663 Py_ssize_t len; 5791 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) 5792 { 5664 5793 PyObject *list; 5665 PyObject *str;5666 Py_UNICODE *data;5667 5794 5668 5795 string = PyUnicode_FromObject(string); 5669 5796 if (string == NULL) 5670 5797 return NULL; 5671 data = PyUnicode_AS_UNICODE(string); 5672 len = PyUnicode_GET_SIZE(string); 5673 5674 list = PyList_New(0); 5675 if (!list) 5676 goto onError; 5677 5678 for (i = j = 0; i < len; ) { 5679 Py_ssize_t eol; 5680 5681 /* Find a line and append it */ 5682 while (i < len && !BLOOM_LINEBREAK(data[i])) 5683 i++; 5684 5685 /* Skip the line break reading CRLF as one line break */ 5686 eol = i; 5687 if (i < len) { 5688 if (data[i] == '\r' && i + 1 < len && 5689 data[i+1] == '\n') 5690 i += 2; 5691 else 5692 i++; 5693 if (keepends) 5694 eol = i; 5695 } 5696 SPLIT_APPEND(data, j, eol); 5697 j = i; 5698 } 5699 if (j < len) { 5700 SPLIT_APPEND(data, j, len); 5701 } 5798 5799 list = stringlib_splitlines( 5800 (PyObject*) string, PyUnicode_AS_UNICODE(string), 5801 PyUnicode_GET_SIZE(string), keepends); 5702 5802 5703 5803 Py_DECREF(string); 5704 5804 return list; 5705 5706 onError: 5707 Py_XDECREF(list); 5708 Py_DECREF(string); 5709 return NULL; 5710 } 5711 5712 static 5713 PyObject *split_char(PyUnicodeObject *self, 5714 PyObject *list, 5715 Py_UNICODE ch, 5716 Py_ssize_t maxcount) 5717 { 5718 register Py_ssize_t i; 5719 register Py_ssize_t j; 5720 Py_ssize_t len = self->length; 5721 PyObject *str; 5722 register const Py_UNICODE *buf = self->str; 5723 5724 for (i = j = 0; i < len; ) { 5725 if (buf[i] == ch) { 5726 if (maxcount-- <= 0) 5727 break; 5728 SPLIT_APPEND(buf, j, i); 5729 i = j = i + 1; 5730 } else 5731 i++; 5732 } 5733 if (j <= len) { 5734 SPLIT_APPEND(buf, j, len); 5735 } 5736 return list; 5737 5738 onError: 5739 Py_DECREF(list); 5740 return NULL; 5741 } 5742 5743 static 5744 PyObject *split_substring(PyUnicodeObject *self, 5745 PyObject *list, 5746 PyUnicodeObject *substring, 5747 Py_ssize_t maxcount) 5748 { 5749 register Py_ssize_t i; 5750 register Py_ssize_t j; 5751 Py_ssize_t len = self->length; 5752 Py_ssize_t sublen = substring->length; 5753 PyObject *str; 5754 5755 for (i = j = 0; i <= len - sublen; ) { 5756 if (Py_UNICODE_MATCH(self, i, substring)) { 5757 if (maxcount-- <= 0) 5758 break; 5759 SPLIT_APPEND(self->str, j, i); 5760 i = j = i + sublen; 5761 } else 5762 i++; 5763 } 5764 if (j <= len) { 5765 SPLIT_APPEND(self->str, j, len); 5766 } 5767 return list; 5768 5769 onError: 5770 Py_DECREF(list); 5771 return NULL; 5772 } 5773 5774 static 5775 PyObject *rsplit_whitespace(PyUnicodeObject *self, 5776 PyObject *list, 5777 Py_ssize_t maxcount) 5778 { 5779 register Py_ssize_t i; 5780 register Py_ssize_t j; 5781 Py_ssize_t len = self->length; 5782 PyObject *str; 5783 register const Py_UNICODE *buf = self->str; 5784 5785 for (i = j = len - 1; i >= 0; ) { 5786 /* find a token */ 5787 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5788 i--; 5789 j = i; 5790 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) 5791 i--; 5792 if (j > i) { 5793 if (maxcount-- <= 0) 5794 break; 5795 SPLIT_APPEND(buf, i + 1, j + 1); 5796 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) 5797 i--; 5798 j = i; 5799 } 5800 } 5801 if (j >= 0) { 5802 SPLIT_APPEND(buf, 0, j + 1); 5803 } 5804 if (PyList_Reverse(list) < 0) 5805 goto onError; 5806 return list; 5807 5808 onError: 5809 Py_DECREF(list); 5810 return NULL; 5811 } 5812 5813 static 5814 PyObject *rsplit_char(PyUnicodeObject *self, 5815 PyObject *list, 5816 Py_UNICODE ch, 5817 Py_ssize_t maxcount) 5818 { 5819 register Py_ssize_t i; 5820 register Py_ssize_t j; 5821 Py_ssize_t len = self->length; 5822 PyObject *str; 5823 register const Py_UNICODE *buf = self->str; 5824 5825 for (i = j = len - 1; i >= 0; ) { 5826 if (buf[i] == ch) { 5827 if (maxcount-- <= 0) 5828 break; 5829 SPLIT_APPEND(buf, i + 1, j + 1); 5830 j = i = i - 1; 5831 } else 5832 i--; 5833 } 5834 if (j >= -1) { 5835 SPLIT_APPEND(buf, 0, j + 1); 5836 } 5837 if (PyList_Reverse(list) < 0) 5838 goto onError; 5839 return list; 5840 5841 onError: 5842 Py_DECREF(list); 5843 return NULL; 5844 } 5845 5846 static 5847 PyObject *rsplit_substring(PyUnicodeObject *self, 5848 PyObject *list, 5849 PyUnicodeObject *substring, 5850 Py_ssize_t maxcount) 5851 { 5852 register Py_ssize_t i; 5853 register Py_ssize_t j; 5854 Py_ssize_t len = self->length; 5855 Py_ssize_t sublen = substring->length; 5856 PyObject *str; 5857 5858 for (i = len - sublen, j = len; i >= 0; ) { 5859 if (Py_UNICODE_MATCH(self, i, substring)) { 5860 if (maxcount-- <= 0) 5861 break; 5862 SPLIT_APPEND(self->str, i + sublen, j); 5863 j = i; 5864 i -= sublen; 5865 } else 5866 i--; 5867 } 5868 if (j >= 0) { 5869 SPLIT_APPEND(self->str, 0, j); 5870 } 5871 if (PyList_Reverse(list) < 0) 5872 goto onError; 5873 return list; 5874 5875 onError: 5876 Py_DECREF(list); 5877 return NULL; 5878 } 5879 5880 #undef SPLIT_APPEND 5805 } 5881 5806 5882 5807 static … … 5885 5810 Py_ssize_t maxcount) 5886 5811 { 5887 PyObject *list;5888 5889 5812 if (maxcount < 0) 5890 5813 maxcount = PY_SSIZE_T_MAX; 5891 5814 5892 list = PyList_New(0);5893 if (!list)5894 return NULL;5895 5896 5815 if (substring == NULL) 5897 return split_whitespace(self,list,maxcount); 5898 5899 else if (substring->length == 1) 5900 return split_char(self,list,substring->str[0],maxcount); 5901 5902 else if (substring->length == 0) { 5903 Py_DECREF(list); 5904 PyErr_SetString(PyExc_ValueError, "empty separator"); 5905 return NULL; 5906 } 5907 else 5908 return split_substring(self,list,substring,maxcount); 5816 return stringlib_split_whitespace( 5817 (PyObject*) self, self->str, self->length, maxcount 5818 ); 5819 5820 return stringlib_split( 5821 (PyObject*) self, self->str, self->length, 5822 substring->str, substring->length, 5823 maxcount 5824 ); 5909 5825 } 5910 5826 … … 5914 5830 Py_ssize_t maxcount) 5915 5831 { 5916 PyObject *list;5917 5918 5832 if (maxcount < 0) 5919 5833 maxcount = PY_SSIZE_T_MAX; 5920 5834 5921 list = PyList_New(0);5922 if (!list)5923 return NULL;5924 5925 5835 if (substring == NULL) 5926 return rsplit_whitespace(self,list,maxcount); 5927 5928 else if (substring->length == 1) 5929 return rsplit_char(self,list,substring->str[0],maxcount); 5930 5931 else if (substring->length == 0) { 5932 Py_DECREF(list); 5933 PyErr_SetString(PyExc_ValueError, "empty separator"); 5934 return NULL; 5935 } 5936 else 5937 return rsplit_substring(self,list,substring,maxcount); 5836 return stringlib_rsplit_whitespace( 5837 (PyObject*) self, self->str, self->length, maxcount 5838 ); 5839 5840 return stringlib_rsplit( 5841 (PyObject*) self, self->str, self->length, 5842 substring->str, substring->length, 5843 maxcount 5844 ); 5938 5845 } 5939 5846 … … 5948 5855 if (maxcount < 0) 5949 5856 maxcount = PY_SSIZE_T_MAX; 5857 else if (maxcount == 0 || self->length == 0) 5858 goto nothing; 5950 5859 5951 5860 if (str1->length == str2->length) { 5861 Py_ssize_t i; 5952 5862 /* same length */ 5953 Py_ssize_t i; 5863 if (str1->length == 0) 5864 goto nothing; 5954 5865 if (str1->length == 1) { 5955 5866 /* replace characters */ … … 5970 5881 } 5971 5882 } else { 5972 i = fastsearch(5973 self->str, self->length, str1->str, str1->length, FAST_SEARCH5883 i = stringlib_find( 5884 self->str, self->length, str1->str, str1->length, 0 5974 5885 ); 5975 5886 if (i < 0) … … 5979 5890 return NULL; 5980 5891 Py_UNICODE_COPY(u->str, self->str, self->length); 5981 while (i <= self->length - str1->length) 5982 if (Py_UNICODE_MATCH(self, i, str1)) { 5983 if (--maxcount < 0) 5984 break; 5985 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5986 i += str1->length; 5987 } else 5988 i++; 5892 5893 /* change everything in-place, starting with this one */ 5894 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5895 i += str1->length; 5896 5897 while ( --maxcount > 0) { 5898 i = stringlib_find(self->str+i, self->length-i, 5899 str1->str, str1->length, 5900 i); 5901 if (i == -1) 5902 break; 5903 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5904 i += str1->length; 5905 } 5989 5906 } 5990 5907 } else { 5991 5908 5992 Py_ssize_t n, i, j , e;5909 Py_ssize_t n, i, j; 5993 5910 Py_ssize_t product, new_size, delta; 5994 5911 Py_UNICODE *p; 5995 5912 5996 5913 /* replace strings */ 5997 n = stringlib_count(self->str, self->length, str1->str, str1->length); 5998 if (n > maxcount) 5999 n = maxcount; 5914 n = stringlib_count(self->str, self->length, str1->str, str1->length, 5915 maxcount); 6000 5916 if (n == 0) 6001 5917 goto nothing; … … 6023 5939 i = 0; 6024 5940 p = u->str; 6025 e = self->length - str1->length;6026 5941 if (str1->length > 0) { 6027 5942 while (n-- > 0) { 6028 5943 /* look for next match */ 6029 j = i; 6030 while (j <= e) { 6031 if (Py_UNICODE_MATCH(self, j, str1)) 6032 break; 6033 j++; 6034 } 6035 if (j > i) { 6036 if (j > e) 6037 break; 5944 j = stringlib_find(self->str+i, self->length-i, 5945 str1->str, str1->length, 5946 i); 5947 if (j == -1) 5948 break; 5949 else if (j > i) { 6038 5950 /* copy unchanged part [i:j] */ 6039 5951 Py_UNICODE_COPY(p, self->str+i, j-i); … … 6091 6003 \n\ 6092 6004 Return a capitalized version of S, i.e. make the first character\n\ 6093 have upper case .");6005 have upper case and the rest lower case."); 6094 6006 6095 6007 static PyObject* … … 6389 6301 sub = PyUnicode_FromObject(element); 6390 6302 if (!sub) { 6391 PyErr_SetString(PyExc_TypeError,6392 "'in <string>' requires string as left operand");6393 6303 return -1; 6394 6304 } … … 6465 6375 PyObject *result; 6466 6376 6467 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6468 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6469 return NULL; 6470 6471 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6472 (PyObject *)substring); 6473 if (substring == NULL) 6474 return NULL; 6475 6476 FIX_START_END(self); 6477 6377 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 6378 &start, &end)) 6379 return NULL; 6380 6381 ADJUST_INDICES(start, end, self->length); 6478 6382 result = PyInt_FromSsize_t( 6479 6383 stringlib_count(self->str + start, end - start, 6480 substring->str, substring->length) 6384 substring->str, substring->length, 6385 PY_SSIZE_T_MAX) 6481 6386 ); 6482 6387 … … 6497 6402 6498 6403 static PyObject * 6499 unicode_encode(PyUnicodeObject *self, PyObject *args) 6500 { 6404 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 6405 { 6406 static char *kwlist[] = {"encoding", "errors", 0}; 6501 6407 char *encoding = NULL; 6502 6408 char *errors = NULL; 6503 6409 PyObject *v; 6504 6410 6505 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6411 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 6412 kwlist, &encoding, &errors)) 6506 6413 return NULL; 6507 6414 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); … … 6529 6436 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6530 6437 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ 6531 as well as any other name register d with codecs.register_error that is\n\6438 as well as any other name registered with codecs.register_error that is\n\ 6532 6439 able to handle UnicodeDecodeErrors."); 6533 6440 6534 6441 static PyObject * 6535 unicode_decode(PyUnicodeObject *self, PyObject *args) 6536 { 6442 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 6443 { 6444 static char *kwlist[] = {"encoding", "errors", 0}; 6537 6445 char *encoding = NULL; 6538 6446 char *errors = NULL; 6539 6447 PyObject *v; 6540 6448 6541 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) 6449 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", 6450 kwlist, &encoding, &errors)) 6542 6451 return NULL; 6543 6452 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); … … 6649 6558 \n\ 6650 6559 Return the lowest index in S where substring sub is found,\n\ 6651 such that sub is contained within s[start:end]. Optional\n\6560 such that sub is contained within S[start:end]. Optional\n\ 6652 6561 arguments start and end are interpreted as in slice notation.\n\ 6653 6562 \n\ … … 6657 6566 unicode_find(PyUnicodeObject *self, PyObject *args) 6658 6567 { 6659 Py Object *substring;6568 PyUnicodeObject *substring; 6660 6569 Py_ssize_t start; 6661 6570 Py_ssize_t end; 6662 6571 Py_ssize_t result; 6663 6572 6664 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6573 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 6574 &start, &end)) 6665 6575 return NULL; 6666 6576 … … 6700 6610 register long x; 6701 6611 6612 #ifdef Py_DEBUG 6613 assert(_Py_HashSecret_Initialized); 6614 #endif 6702 6615 if (self->hash != -1) 6703 6616 return self->hash; 6704 6617 len = PyUnicode_GET_SIZE(self); 6618 /* 6619 We make the hash of the empty string be 0, rather than using 6620 (prefix ^ suffix), since this slightly obfuscates the hash secret 6621 */ 6622 if (len == 0) { 6623 self->hash = 0; 6624 return 0; 6625 } 6705 6626 p = PyUnicode_AS_UNICODE(self); 6706 x = *p << 7; 6627 x = _Py_HashSecret.prefix; 6628 x ^= *p << 7; 6707 6629 while (--len >= 0) 6708 6630 x = (1000003*x) ^ *p++; 6709 6631 x ^= PyUnicode_GET_SIZE(self); 6632 x ^= _Py_HashSecret.suffix; 6710 6633 if (x == -1) 6711 6634 x = -2; … … 6723 6646 { 6724 6647 Py_ssize_t result; 6725 Py Object *substring;6648 PyUnicodeObject *substring; 6726 6649 Py_ssize_t start; 6727 6650 Py_ssize_t end; 6728 6651 6729 if (!_ParseTupleFinds(args, &substring, &start, &end)) 6652 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 6653 &start, &end)) 6730 6654 return NULL; 6731 6655 … … 7338 7262 7339 7263 PyDoc_STRVAR(replace__doc__, 7340 "S.replace 7264 "S.replace(old, new[, count]) -> unicode\n\ 7341 7265 \n\ 7342 7266 Return a copy of S with all occurrences of substring\n\ … … 7382 7306 \n\ 7383 7307 Return the highest index in S where substring sub is found,\n\ 7384 such that sub is contained within s[start:end]. Optional\n\7308 such that sub is contained within S[start:end]. Optional\n\ 7385 7309 arguments start and end are interpreted as in slice notation.\n\ 7386 7310 \n\ … … 7390 7314 unicode_rfind(PyUnicodeObject *self, PyObject *args) 7391 7315 { 7392 Py Object *substring;7316 PyUnicodeObject *substring; 7393 7317 Py_ssize_t start; 7394 7318 Py_ssize_t end; 7395 7319 Py_ssize_t result; 7396 7320 7397 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7321 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 7322 &start, &end)) 7398 7323 return NULL; 7399 7324 … … 7417 7342 unicode_rindex(PyUnicodeObject *self, PyObject *args) 7418 7343 { 7419 Py Object *substring;7344 PyUnicodeObject *substring; 7420 7345 Py_ssize_t start; 7421 7346 Py_ssize_t end; 7422 7347 Py_ssize_t result; 7423 7348 7424 if (!_ParseTupleFinds(args, &substring, &start, &end)) 7349 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 7350 &start, &end)) 7425 7351 return NULL; 7426 7352 … … 7667 7593 7668 7594 PyDoc_STRVAR(splitlines__doc__, 7669 "S.splitlines( [keepends]) -> list of strings\n\7595 "S.splitlines(keepends=False) -> list of strings\n\ 7670 7596 \n\ 7671 7597 Return a list of the lines in S, breaking at line boundaries.\n\ … … 7801 7727 int result; 7802 7728 7803 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 7804 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7729 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 7805 7730 return NULL; 7806 7731 if (PyTuple_Check(subobj)) { … … 7821 7746 } 7822 7747 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7823 if (substring == NULL) 7824 return NULL; 7748 if (substring == NULL) { 7749 if (PyErr_ExceptionMatches(PyExc_TypeError)) 7750 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, " 7751 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); 7752 return NULL; 7753 } 7825 7754 result = tailmatch(self, substring, start, end, -1); 7826 7755 Py_DECREF(substring); … … 7847 7776 int result; 7848 7777 7849 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 7850 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7778 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 7851 7779 return NULL; 7852 7780 if (PyTuple_Check(subobj)) { … … 7866 7794 } 7867 7795 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7868 if (substring == NULL) 7869 return NULL; 7870 7796 if (substring == NULL) { 7797 if (PyErr_ExceptionMatches(PyExc_TypeError)) 7798 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, " 7799 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); 7800 return NULL; 7801 } 7871 7802 result = tailmatch(self, substring, start, end, +1); 7872 7803 Py_DECREF(substring); … … 7881 7812 "S.format(*args, **kwargs) -> unicode\n\ 7882 7813 \n\ 7883 "); 7814 Return a formatted version of S, using substitutions from args and kwargs.\n\ 7815 The substitutions are identified by braces ('{' and '}')."); 7884 7816 7885 7817 static PyObject * … … 7915 7847 "S.__format__(format_spec) -> unicode\n\ 7916 7848 \n\ 7917 ");7849 Return a formatted version of S as described by format_spec."); 7918 7850 7919 7851 static PyObject * … … 7937 7869 7938 7870 static PyMethodDef unicode_methods[] = { 7939 7940 /* Order is according to common usage: often used methods should 7941 appear first, since lookup is done sequentially. */ 7942 7943 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 7871 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 7944 7872 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7945 7873 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, … … 7957 7885 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7958 7886 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 7959 {"decode", (PyCFunction) unicode_decode, METH_VARARGS , decode__doc__},7887 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__}, 7960 7888 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 7961 7889 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, … … 8172 8100 8173 8101 static int 8174 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)8175 {8176 Py_ssize_t result;8177 8178 PyOS_ascii_formatd((char *)buffer, len, format, x);8179 result = strtounicode(buffer, (char *)buffer);8180 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);8181 }8182 8183 static int8184 8102 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 8185 8103 { … … 8195 8113 formatting is done. */ 8196 8114 8197 static int 8198 formatfloat(Py_UNICODE *buf, 8199 size_t buflen, 8200 int flags, 8201 int prec, 8202 int type, 8203 PyObject *v) 8204 { 8205 /* fmt = '%#.' + `prec` + `type` 8206 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 8207 char fmt[20]; 8115 /* Returns a new reference to a PyUnicode object, or NULL on failure. */ 8116 8117 static PyObject * 8118 formatfloat(PyObject *v, int flags, int prec, int type) 8119 { 8120 char *p; 8121 PyObject *result; 8208 8122 double x; 8209 8123 8210 8124 x = PyFloat_AsDouble(v); 8211 8125 if (x == -1.0 && PyErr_Occurred()) 8212 return -1; 8126 return NULL; 8127 8213 8128 if (prec < 0) 8214 8129 prec = 6; 8215 #if SIZEOF_INT > 4 8216 /* make sure that the decimal representation of precision really does 8217 need at most 10 digits: platforms with sizeof(int) == 8 exist! */ 8218 if (prec > 0x7fffffff) { 8219 PyErr_SetString(PyExc_OverflowError, 8220 "outrageously large precision " 8221 "for formatted float"); 8222 return -1; 8223 } 8224 #endif 8225 8226 if (type == 'f' && fabs(x) >= 1e50) 8227 type = 'g'; 8228 /* Worst case length calc to ensure no buffer overrun: 8229 8230 'g' formats: 8231 fmt = %#.<prec>g 8232 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 8233 for any double rep.) 8234 len = 1 + prec + 1 + 2 + 5 = 9 + prec 8235 8236 'f' formats: 8237 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 8238 len = 1 + 50 + 1 + prec = 52 + prec 8239 8240 If prec=0 the effective precision is 1 (the leading digit is 8241 always given), therefore increase the length by one. 8242 8243 */ 8244 if (((type == 'g' || type == 'G') && 8245 buflen <= (size_t)10 + (size_t)prec) || 8246 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 8247 PyErr_SetString(PyExc_OverflowError, 8248 "formatted float is too long (precision too large?)"); 8249 return -1; 8250 } 8251 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 8252 (flags&F_ALT) ? "#" : "", 8253 prec, type); 8254 return doubletounicode(buf, buflen, fmt, x); 8130 8131 p = PyOS_double_to_string(x, type, prec, 8132 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 8133 if (p == NULL) 8134 return NULL; 8135 result = PyUnicode_FromStringAndSize(p, strlen(p)); 8136 PyMem_Free(p); 8137 return result; 8255 8138 } 8256 8139 … … 8422 8305 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8423 8306 8424 FORMATBUFLEN is the length of the buffer in which the floats, ints,&8307 FORMATBUFLEN is the length of the buffer in which the ints & 8425 8308 chars are formatted. XXX This is a magic number. Each formatting 8426 8309 routine does bounds checking to ensure no overflow, but a better … … 8464 8347 argidx = -2; 8465 8348 } 8466 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args)&&8467 !Py Object_TypeCheck(args, &PyBaseString_Type))8349 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript && 8350 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type)) 8468 8351 dict = args; 8469 8352 … … 8493 8376 Py_UNICODE sign; 8494 8377 Py_ssize_t len; 8495 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{ float,int,char}() */8378 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */ 8496 8379 8497 8380 fmt++; … … 8569 8452 goto onError; 8570 8453 } 8571 width = PyInt_AsLong(v); 8454 width = PyInt_AsSsize_t(v); 8455 if (width == -1 && PyErr_Occurred()) 8456 goto onError; 8572 8457 if (width < 0) { 8573 8458 flags |= F_LJUST; … … 8583 8468 if (c < '0' || c > '9') 8584 8469 break; 8585 if ( (width*10) / 10 != width) {8470 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { 8586 8471 PyErr_SetString(PyExc_ValueError, 8587 8472 "width too big"); … … 8604 8489 goto onError; 8605 8490 } 8606 prec = PyInt_AsLong(v); 8491 prec = _PyInt_AsInt(v); 8492 if (prec == -1 && PyErr_Occurred()) 8493 goto onError; 8607 8494 if (prec < 0) 8608 8495 prec = 0; … … 8613 8500 prec = c - '0'; 8614 8501 while (--fmtcnt >= 0) { 8615 c = Py_CHARMASK(*fmt++);8502 c = *fmt++; 8616 8503 if (c < '0' || c > '9') 8617 8504 break; 8618 if ( (prec*10) / 10 != prec) {8505 if (prec > (INT_MAX - ((int)c - '0')) / 10) { 8619 8506 PyErr_SetString(PyExc_ValueError, 8620 8507 "prec too big"); … … 8654 8541 case 's': 8655 8542 case 'r': 8656 if (PyUnicode_Check (v) && c == 's') {8543 if (PyUnicode_CheckExact(v) && c == 's') { 8657 8544 temp = v; 8658 8545 Py_INCREF(temp); … … 8754 8641 case 'g': 8755 8642 case 'G': 8756 if (c == 'F') 8757 c = 'f'; 8758 pbuf = formatbuf; 8759 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8760 flags, prec, c, v); 8761 if (len < 0) 8643 temp = formatfloat(v, flags, prec, c); 8644 if (temp == NULL) 8762 8645 goto onError; 8646 pbuf = PyUnicode_AS_UNICODE(temp); 8647 len = PyUnicode_GET_SIZE(temp); 8763 8648 sign = 1; 8764 8649 if (flags & F_ZERO) … … 8950 8835 8951 8836 PyDoc_STRVAR(unicode_doc, 8952 "unicode(string [, encoding[, errors]]) -> object\n\ 8837 "unicode(object='') -> unicode object\n\ 8838 unicode(string[, encoding[, errors]]) -> unicode object\n\ 8953 8839 \n\ 8954 8840 Create a new Unicode object from the given encoded string.\n\ … … 9004 8890 void _PyUnicode_Init(void) 9005 8891 { 9006 int i;9007 9008 8892 /* XXX - move this array to unicodectype.c ? */ 9009 8893 Py_UNICODE linebreak[] = { … … 9019 8903 9020 8904 /* Init the implementation */ 9021 free_list = NULL; 9022 numfree = 0; 9023 unicode_empty = _PyUnicode_New(0); 9024 if (!unicode_empty) 9025 return; 9026 9027 strcpy(unicode_default_encoding, "ascii"); 9028 for (i = 0; i < 256; i++) 9029 unicode_latin1[i] = NULL; 8905 if (!unicode_empty) { 8906 unicode_empty = _PyUnicode_New(0); 8907 if (!unicode_empty) 8908 return; 8909 } 8910 9030 8911 if (PyType_Ready(&PyUnicode_Type) < 0) 9031 8912 Py_FatalError("Can't initialize 'unicode'"); … … 9037 8918 9038 8919 PyType_Ready(&EncodingMapType); 8920 8921 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 8922 Py_FatalError("Can't initialize field name iterator type"); 8923 8924 if (PyType_Ready(&PyFormatterIter_Type) < 0) 8925 Py_FatalError("Can't initialize formatter iter type"); 9039 8926 } 9040 8927 … … 9066 8953 int i; 9067 8954 9068 Py_XDECREF(unicode_empty); 9069 unicode_empty = NULL; 9070 9071 for (i = 0; i < 256; i++) { 9072 if (unicode_latin1[i]) { 9073 Py_DECREF(unicode_latin1[i]); 9074 unicode_latin1[i] = NULL; 9075 } 9076 } 8955 Py_CLEAR(unicode_empty); 8956 8957 for (i = 0; i < 256; i++) 8958 Py_CLEAR(unicode_latin1[i]); 8959 9077 8960 (void)PyUnicode_ClearFreeList(); 9078 8961 } … … 9081 8964 } 9082 8965 #endif 9083 9084 9085 /*9086 Local variables:9087 c-basic-offset: 49088 indent-tabs-mode: nil9089 End:9090 */
Note:
See TracChangeset
for help on using the changeset viewer.