Changeset 391 for python/trunk/Modules/unicodedata.c
- Timestamp:
- Mar 19, 2014, 11:31:01 PM (11 years ago)
- Location:
- python/trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
python/trunk
-
Property svn:mergeinfo
set to
/python/vendor/Python-2.7.6 merged eligible /python/vendor/current merged eligible
-
Property svn:mergeinfo
set to
-
python/trunk/Modules/unicodedata.c
r2 r391 1 1 /* ------------------------------------------------------------------------ 2 2 3 unicodedata -- Provides access to the Unicode 5. 1data base.4 5 Data was extracted from the Unicode 5. 1UnicodeData.txt file.3 unicodedata -- Provides access to the Unicode 5.2 data base. 4 5 Data was extracted from the Unicode 5.2 UnicodeData.txt file. 6 6 7 7 Written by Marc-Andre Lemburg (mal@lemburg.com). … … 20 20 21 21 typedef struct { 22 const unsigned char category; /* index into 23 _PyUnicode_CategoryNames */ 24 const unsigned char combining; /* combining class value 0 - 255 */ 25 const unsigned char bidirectional; /* index into 26 _PyUnicode_BidirectionalNames */ 27 const unsigned char mirrored; /* true if mirrored in bidir mode */ 28 const unsigned char east_asian_width; /* index into 29 _PyUnicode_EastAsianWidth */ 22 const unsigned char category; /* index into 23 _PyUnicode_CategoryNames */ 24 const unsigned char combining; /* combining class value 0 - 255 */ 25 const unsigned char bidirectional; /* index into 26 _PyUnicode_BidirectionalNames */ 27 const unsigned char mirrored; /* true if mirrored in bidir mode */ 28 const unsigned char east_asian_width; /* index into 29 _PyUnicode_EastAsianWidth */ 30 const unsigned char normalization_quick_check; /* see is_normalized() */ 30 31 } _PyUnicode_DatabaseRecord; 31 32 … … 36 37 const unsigned char decimal_changed; 37 38 const unsigned char mirrored_changed; 38 const intnumeric_changed;39 const double numeric_changed; 39 40 } change_record; 40 41 … … 67 68 68 69 static PyMemberDef DB_members[] = { 69 70 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, 70 71 {NULL} 71 72 }; … … 78 79 Py_UCS4 (*normalization)(Py_UCS4)) 79 80 { 80 81 82 83 84 85 81 PreviousDBVersion *self; 82 self = PyObject_New(PreviousDBVersion, &UCD_Type); 83 if (self == NULL) 84 return NULL; 85 self->name = name; 86 self->getrecord = getrecord; 86 87 self->normalization = normalization; 87 88 return (PyObject*)self; 88 89 } 89 90 … … 94 95 95 96 if (PyUnicode_GET_SIZE(obj) == 1) 96 97 return *v; 97 98 #ifndef Py_UNICODE_WIDE 98 99 else if ((PyUnicode_GET_SIZE(obj) == 2) && 99 100 (0xD800 <= v[0] && v[0] <= 0xDBFF) && 100 101 (0xDC00 <= v[1] && v[1] <= 0xDFFF)) 101 102 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000; 102 103 #endif 103 104 PyErr_SetString(PyExc_TypeError, … … 136 137 have_old = 1; 137 138 rc = -1; 138 } 139 } 139 140 else if (old->decimal_changed != 0xFF) { 140 141 have_old = 1; … … 146 147 rc = Py_UNICODE_TODECIMAL(c); 147 148 if (rc < 0) { 148 149 150 149 if (defobj == NULL) { 150 PyErr_SetString(PyExc_ValueError, 151 "not a decimal"); 151 152 return NULL; 152 153 154 155 156 153 } 154 else { 155 Py_INCREF(defobj); 156 return defobj; 157 } 157 158 } 158 159 return PyInt_FromLong(rc); … … 181 182 rc = Py_UNICODE_TODIGIT(c); 182 183 if (rc < 0) { 183 184 184 if (defobj == NULL) { 185 PyErr_SetString(PyExc_ValueError, "not a digit"); 185 186 return NULL; 186 187 188 189 190 187 } 188 else { 189 Py_INCREF(defobj); 190 return defobj; 191 } 191 192 } 192 193 return PyInt_FromLong(rc); … … 221 222 have_old = 1; 222 223 rc = -1.0; 223 } 224 } 224 225 else if (old->decimal_changed != 0xFF) { 225 226 have_old = 1; … … 231 232 rc = Py_UNICODE_TONUMERIC(c); 232 233 if (rc == -1.0) { 233 234 235 236 237 238 239 240 234 if (defobj == NULL) { 235 PyErr_SetString(PyExc_ValueError, "not a numeric character"); 236 return NULL; 237 } 238 else { 239 Py_INCREF(defobj); 240 return defobj; 241 } 241 242 } 242 243 return PyFloat_FromDouble(rc); … … 257 258 258 259 if (!PyArg_ParseTuple(args, "O!:category", 259 260 260 &PyUnicode_Type, &v)) 261 return NULL; 261 262 c = getuchar(v); 262 263 if (c == (Py_UCS4)-1) … … 274 275 "bidirectional(unichr)\n\ 275 276 \n\ 276 Returns the bidirectional c ategoryassigned to the Unicode character\n\277 Returns the bidirectional class assigned to the Unicode character\n\ 277 278 unichr as string. If no such value is defined, an empty string is\n\ 278 279 returned."); … … 286 287 287 288 if (!PyArg_ParseTuple(args, "O!:bidirectional", 288 289 289 &PyUnicode_Type, &v)) 290 return NULL; 290 291 c = getuchar(v); 291 292 if (c == (Py_UCS4)-1) … … 317 318 318 319 if (!PyArg_ParseTuple(args, "O!:combining", 319 320 320 &PyUnicode_Type, &v)) 321 return NULL; 321 322 c = getuchar(v); 322 323 if (c == (Py_UCS4)-1) … … 346 347 347 348 if (!PyArg_ParseTuple(args, "O!:mirrored", 348 349 349 &PyUnicode_Type, &v)) 350 return NULL; 350 351 c = getuchar(v); 351 352 if (c == (Py_UCS4)-1) … … 376 377 377 378 if (!PyArg_ParseTuple(args, "O!:east_asian_width", 378 379 379 &PyUnicode_Type, &v)) 380 return NULL; 380 381 c = getuchar(v); 381 382 if (c == (Py_UCS4)-1) … … 407 408 408 409 if (!PyArg_ParseTuple(args, "O!:decomposition", 409 410 410 &PyUnicode_Type, &v)) 411 return NULL; 411 412 c = getuchar(v); 412 413 if (c == (Py_UCS4)-1) … … 454 455 i += strlen(decomp + i); 455 456 } 456 457 457 458 decomp[i] = '\0'; 458 459 … … 474 475 (code&((1<<DECOMP_SHIFT)-1))]; 475 476 } 476 477 477 478 /* high byte is number of hex bytes (usually one or two), low byte 478 479 is prefix code (from*/ … … 499 500 Py_UNICODE *i, *end, *o; 500 501 /* Longest decomposition in Unicode 3.2: U+FDFA */ 501 Py_UNICODE stack[20]; 502 Py_UNICODE stack[20]; 502 503 Py_ssize_t space, isize; 503 504 int index, prefix, count, stackptr; 504 505 unsigned char prev, cur; 505 506 506 507 stackptr = 0; 507 508 isize = PyUnicode_GET_SIZE(input); 508 /* Overallocate at most 10 characters. */509 /* Overallocate at most 10 characters. */ 509 510 space = (isize > 10 ? 10 : isize) + isize; 510 511 result = PyUnicode_FromUnicode(NULL, space); … … 520 521 Py_UNICODE code = stack[--stackptr]; 521 522 /* Hangul Decomposition adds three characters in 522 a single step, so we need at least that much room. */523 a single step, so we need at least that much room. */ 523 524 if (space < 3) { 524 525 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10; … … 640 641 end = i + PyUnicode_GET_SIZE(result); 641 642 o = PyUnicode_AS_UNICODE(result); 642 643 643 644 again: 644 645 while (i < end) { 645 646 for (index = 0; index < cskipped; index++) { 646 647 if (skipped[index] == i) { 647 /* *i character is skipped. 648 /* *i character is skipped. 648 649 Remove from list. */ 649 650 skipped[index] = skipped[cskipped-1]; … … 656 657 pairs, since we always have decomposed data. */ 657 658 if (LBase <= *i && *i < (LBase+LCount) && 658 i + 1 < end && 659 i + 1 < end && 659 660 VBase <= i[1] && i[1] <= (VBase+VCount)) { 660 661 int LIndex, VIndex; … … 682 683 while (i1 < end) { 683 684 int comb1 = _getrecord_ex(*i1)->combining; 684 if (comb1 && comb == comb1) { 685 /* Character is blocked. */ 686 i1++; 687 continue; 685 if (comb) { 686 if (comb1 == 0) 687 break; 688 if (comb >= comb1) { 689 /* Character is blocked. */ 690 i1++; 691 continue; 692 } 688 693 } 689 694 l = find_nfc_index(self, nfc_last, *i1); … … 705 710 if (code == 0) 706 711 goto not_combinable; 707 712 708 713 /* Replace the original character. */ 709 714 *i = code; 710 715 /* Mark the second character unused. */ 716 assert(cskipped < 20); 711 717 skipped[cskipped++] = i1; 712 718 i1++; … … 721 727 return result; 722 728 } 723 729 730 /* Return 1 if the input is certainly normalized, 0 if it might not be. */ 731 static int 732 is_normalized(PyObject *self, PyObject *input, int nfc, int k) 733 { 734 Py_UNICODE *i, *end; 735 unsigned char prev_combining = 0, quickcheck_mask; 736 737 /* An older version of the database is requested, quickchecks must be 738 disabled. */ 739 if (self != NULL) 740 return 0; 741 742 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, 743 as described in http://unicode.org/reports/tr15/#Annex8. */ 744 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); 745 746 i = PyUnicode_AS_UNICODE(input); 747 end = i + PyUnicode_GET_SIZE(input); 748 while (i < end) { 749 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++); 750 unsigned char combining = record->combining; 751 unsigned char quickcheck = record->normalization_quick_check; 752 753 if (quickcheck & quickcheck_mask) 754 return 0; /* this string might need normalization */ 755 if (combining && prev_combining > combining) 756 return 0; /* non-canonical sort order, not normalized */ 757 prev_combining = combining; 758 } 759 return 1; /* certainly normalized */ 760 } 761 724 762 PyDoc_STRVAR(unicodedata_normalize__doc__, 725 763 "normalize(form, unistr)\n\ … … 745 783 } 746 784 747 if (strcmp(form, "NFC") == 0) 785 if (strcmp(form, "NFC") == 0) { 786 if (is_normalized(self, input, 1, 0)) { 787 Py_INCREF(input); 788 return input; 789 } 748 790 return nfc_nfkc(self, input, 0); 749 if (strcmp(form, "NFKC") == 0) 791 } 792 if (strcmp(form, "NFKC") == 0) { 793 if (is_normalized(self, input, 1, 1)) { 794 Py_INCREF(input); 795 return input; 796 } 750 797 return nfc_nfkc(self, input, 1); 751 if (strcmp(form, "NFD") == 0) 798 } 799 if (strcmp(form, "NFD") == 0) { 800 if (is_normalized(self, input, 0, 0)) { 801 Py_INCREF(input); 802 return input; 803 } 752 804 return nfd_nfkd(self, input, 0); 753 if (strcmp(form, "NFKD") == 0) 805 } 806 if (strcmp(form, "NFKD") == 0) { 807 if (is_normalized(self, input, 0, 1)) { 808 Py_INCREF(input); 809 return input; 810 } 754 811 return nfd_nfkd(self, input, 1); 812 } 755 813 PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 756 814 return NULL; … … 773 831 unsigned long ix; 774 832 for (i = 0; i < len; i++) { 775 h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));833 h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i])); 776 834 ix = h & 0xff000000; 777 835 if (ix) … … 817 875 return ( 818 876 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ 819 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */ 820 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */ 877 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */ 878 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */ 879 (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */ 821 880 } 822 881 … … 837 896 /* unassigned */ 838 897 return 0; 839 } 898 } 840 899 } 841 900 842 901 if (SBase <= code && code < SBase+SCount) { 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 902 /* Hangul syllable. */ 903 int SIndex = code - SBase; 904 int L = SIndex / NCount; 905 int V = (SIndex % NCount) / TCount; 906 int T = SIndex % TCount; 907 908 if (buflen < 27) 909 /* Worst case: HANGUL SYLLABLE <10chars>. */ 910 return 0; 911 strcpy(buffer, "HANGUL SYLLABLE "); 912 buffer += 16; 913 strcpy(buffer, hangul_syllables[L][0]); 914 buffer += strlen(hangul_syllables[L][0]); 915 strcpy(buffer, hangul_syllables[V][1]); 916 buffer += strlen(hangul_syllables[V][1]); 917 strcpy(buffer, hangul_syllables[T][2]); 918 buffer += strlen(hangul_syllables[T][2]); 919 *buffer = '\0'; 920 return 1; 862 921 } 863 922 … … 920 979 return 0; 921 980 for (i = 0; i < namelen; i++) { 922 if ( toupper(Py_CHARMASK(name[i])) != buffer[i])981 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i]) 923 982 return 0; 924 983 } … … 926 985 } 927 986 928 static void 987 static void 929 988 find_syllable(const char *str, int *len, int *pos, int count, int column) 930 989 { … … 932 991 *len = -1; 933 992 for (i = 0; i < count; i++) { 934 935 936 937 938 939 940 941 993 char *s = hangul_syllables[i][column]; 994 len1 = strlen(s); 995 if (len1 <= *len) 996 continue; 997 if (strncmp(str, s, len1) == 0) { 998 *len = len1; 999 *pos = i; 1000 } 942 1001 } 943 1002 if (*len == -1) { 944 1003 *len = 0; 945 1004 } 946 1005 } … … 955 1014 /* Check for hangul syllables. */ 956 1015 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { 957 958 959 960 961 962 963 964 965 966 967 968 1016 int len, L = -1, V = -1, T = -1; 1017 const char *pos = name + 16; 1018 find_syllable(pos, &len, &L, LCount, 0); 1019 pos += len; 1020 find_syllable(pos, &len, &V, VCount, 1); 1021 pos += len; 1022 find_syllable(pos, &len, &T, TCount, 2); 1023 pos += len; 1024 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { 1025 *code = SBase + (L*VCount+V)*TCount + T; 1026 return 1; 1027 } 969 1028 /* Otherwise, it's an illegal syllable name. */ 970 1029 return 0; … … 1026 1085 } 1027 1086 1028 static const _PyUnicode_Name_CAPI hashAPI = 1087 static const _PyUnicode_Name_CAPI hashAPI = 1029 1088 { 1030 1089 sizeof(_PyUnicode_Name_CAPI), … … 1058 1117 1059 1118 if (!_getucname(self, c, name, sizeof(name))) { 1060 1061 1119 if (defobj == NULL) { 1120 PyErr_SetString(PyExc_ValueError, "no such name"); 1062 1121 return NULL; 1063 1064 1065 1066 1067 1122 } 1123 else { 1124 Py_INCREF(defobj); 1125 return defobj; 1126 } 1068 1127 } 1069 1128 … … 1103 1162 #endif 1104 1163 str[0] = (Py_UNICODE) code; 1105 return PyUnicode_FromUnicode(str, 1); 1164 return PyUnicode_FromUnicode(str, 1); 1106 1165 } 1107 1166 … … 1128 1187 {"normalize", unicodedata_normalize, METH_VARARGS, 1129 1188 unicodedata_normalize__doc__}, 1130 {NULL, NULL} 1189 {NULL, NULL} /* sentinel */ 1131 1190 }; 1132 1191 1133 1192 static PyTypeObject UCD_Type = { 1134 1135 1136 1137 "unicodedata.UCD",/*tp_name*/1138 sizeof(PreviousDBVersion),/*tp_basicsize*/1139 0,/*tp_itemsize*/1140 1141 1142 0,/*tp_print*/1143 1144 0,/*tp_setattr*/1145 0,/*tp_compare*/1146 0,/*tp_repr*/1147 0,/*tp_as_number*/1148 0,/*tp_as_sequence*/1149 0,/*tp_as_mapping*/1150 0,/*tp_hash*/1193 /* The ob_type field must be initialized in the module init function 1194 * to be portable to Windows without using C++. */ 1195 PyVarObject_HEAD_INIT(NULL, 0) 1196 "unicodedata.UCD", /*tp_name*/ 1197 sizeof(PreviousDBVersion), /*tp_basicsize*/ 1198 0, /*tp_itemsize*/ 1199 /* methods */ 1200 (destructor)PyObject_Del, /*tp_dealloc*/ 1201 0, /*tp_print*/ 1202 0, /*tp_getattr*/ 1203 0, /*tp_setattr*/ 1204 0, /*tp_compare*/ 1205 0, /*tp_repr*/ 1206 0, /*tp_as_number*/ 1207 0, /*tp_as_sequence*/ 1208 0, /*tp_as_mapping*/ 1209 0, /*tp_hash*/ 1151 1210 0, /*tp_call*/ 1152 1211 0, /*tp_str*/ … … 1181 1240 defines character properties for all Unicode characters. The data in\n\ 1182 1241 this database is based on the UnicodeData.txt file version\n\ 1183 5. 1.0 which is publically available from ftp://ftp.unicode.org/.\n\1242 5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\ 1184 1243 \n\ 1185 1244 The module uses the same names and symbols as defined by the\n\ 1186 UnicodeData File Format 5. 1.0 (see\n\1187 http://www.unicode.org/ Public/5.1.0/ucd/UCD.html).");1245 UnicodeData File Format 5.2.0 (see\n\ 1246 http://www.unicode.org/reports/tr44/tr44-4.html)."); 1188 1247 1189 1248 PyMODINIT_FUNC … … 1209 1268 1210 1269 /* Export C API */ 1211 v = PyC Object_FromVoidPtr((void *) &hashAPI, NULL);1270 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL); 1212 1271 if (v != NULL) 1213 1272 PyModule_AddObject(m, "ucnhash_CAPI", v); 1214 1273 } 1215 1274 1216 /* 1275 /* 1217 1276 Local variables: 1218 1277 c-basic-offset: 4
Note:
See TracChangeset
for help on using the changeset viewer.