Context Navigation

← Previous Change
Next Change →

unicodedata.c

Timestamp:

Mar 19, 2014, 11:31:01 PM (11 years ago)

Author:

dmik

Message:

python: Merge vendor 2.7.6 to trunk.

Location:

python/trunk

Files:

: 2 edited

. (modified) (1 prop)
Modules/unicodedata.c (modified) (41 diffs)

Legend:

: Unmodified
: Added
: Removed

python/trunk
- Property svn:mergeinfo set to
  /python/vendor/Python-2.7.6 merged eligible
  /python/vendor/current merged eligible

python/trunk/Modules/unicodedata.c

-              r2
+              r391
 /* ------------------------------------------------------------------------
    unicodedata -- Provides access to the Unicode 5.1 data base.
    Data was extracted from the Unicode 5.1 UnicodeData.txt file.
+   unicodedata -- Provides access to the Unicode 5.2 data base.
+   Data was extracted from the Unicode 5.2 UnicodeData.txt file.
    Written by Marc-Andre Lemburg (mal@lemburg.com).
 …
 typedef struct {
+    const unsigned char category;       /* index into
+                                           _PyUnicode_CategoryNames */
+    const unsigned char combining;      /* combining class value 0 - 255 */
+    const unsigned char bidirectional;  /* index into
+                                           _PyUnicode_BidirectionalNames */
+    const unsigned char mirrored;       /* true if mirrored in bidir mode */
+    const unsigned char east_asian_width;       /* index into
+                                                   _PyUnicode_EastAsianWidth */
+    const unsigned char category;       /* index into
+                                           _PyUnicode_CategoryNames */
+    const unsigned char combining;      /* combining class value 0 - 255 */
+    const unsigned char bidirectional;  /* index into
+                                           _PyUnicode_BidirectionalNames */
+    const unsigned char mirrored;       /* true if mirrored in bidir mode */
+    const unsigned char east_asian_width;       /* index into
+                                                   _PyUnicode_EastAsianWidth */
+    const unsigned char normalization_quick_check; /* see is_normalized() */
 } _PyUnicode_DatabaseRecord;
 …
     const unsigned char decimal_changed;
     const unsigned char mirrored_changed;
     const int numeric_changed;
+    const double numeric_changed;
 } change_record;
 …
 static PyMemberDef DB_members[] = {
         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
+        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
         {NULL}
 };
 …
                      Py_UCS4 (*normalization)(Py_UCS4))
+{
         PreviousDBVersion *self;
         self = PyObject_New(PreviousDBVersion, &UCD_Type);
         if (self == NULL)
                 return NULL;
         self->name = name;
         self->getrecord = getrecord;
+        PreviousDBVersion *self;
+        self = PyObject_New(PreviousDBVersion, &UCD_Type);
+        if (self == NULL)
+                return NULL;
+        self->name = name;
+        self->getrecord = getrecord;
         self->normalization = normalization;
         return (PyObject*)self;
+        return (PyObject*)self;
+}
 …
     if (PyUnicode_GET_SIZE(obj) == 1)
         return *v;
+        return *v;
 #ifndef Py_UNICODE_WIDE
     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
         return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
+        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
 #endif
     PyErr_SetString(PyExc_TypeError,
 …
             have_old = 1;
             rc = -1;
+        }
+        }
         else if (old->decimal_changed != 0xFF) {
             have_old = 1;
 …
         rc = Py_UNICODE_TODECIMAL(c);
     if (rc < 0) {
         if (defobj == NULL) {
             PyErr_SetString(PyExc_ValueError,
                             "not a decimal");
+        if (defobj == NULL) {
+            PyErr_SetString(PyExc_ValueError,
+                            "not a decimal");
             return NULL;
+        }
         else {
             Py_INCREF(defobj);
             return defobj;
+        }
+        }
+        else {
+            Py_INCREF(defobj);
+            return defobj;
+        }
+    }
     return PyInt_FromLong(rc);
 …
     rc = Py_UNICODE_TODIGIT(c);
     if (rc < 0) {
         if (defobj == NULL) {
             PyErr_SetString(PyExc_ValueError, "not a digit");
+        if (defobj == NULL) {
+            PyErr_SetString(PyExc_ValueError, "not a digit");
             return NULL;
+        }
         else {
             Py_INCREF(defobj);
             return defobj;
+        }
+        }
+        else {
+            Py_INCREF(defobj);
+            return defobj;
+        }
+    }
     return PyInt_FromLong(rc);
 …
             have_old = 1;
             rc = -1.0;
+        }
+        }
         else if (old->decimal_changed != 0xFF) {
             have_old = 1;
 …
         rc = Py_UNICODE_TONUMERIC(c);
     if (rc == -1.0) {
         if (defobj == NULL) {
             PyErr_SetString(PyExc_ValueError, "not a numeric character");
             return NULL;
+        }
         else {
             Py_INCREF(defobj);
             return defobj;
+        }
+        if (defobj == NULL) {
+            PyErr_SetString(PyExc_ValueError, "not a numeric character");
+            return NULL;
+        }
+        else {
+            Py_INCREF(defobj);
+            return defobj;
+        }
+    }
     return PyFloat_FromDouble(rc);
 …
     if (!PyArg_ParseTuple(args, "O!:category",
                           &PyUnicode_Type, &v))
         return NULL;
+                          &PyUnicode_Type, &v))
+        return NULL;
     c = getuchar(v);
     if (c == (Py_UCS4)-1)
 …
 "bidirectional(unichr)\n\
 \n\
 Returns the bidirectional category assigned to the Unicode character\n\
+Returns the bidirectional class assigned to the Unicode character\n\
 unichr as string. If no such value is defined, an empty string is\n\
 returned.");
 …
     if (!PyArg_ParseTuple(args, "O!:bidirectional",
                           &PyUnicode_Type, &v))
         return NULL;
+                          &PyUnicode_Type, &v))
+        return NULL;
     c = getuchar(v);
     if (c == (Py_UCS4)-1)
 …
     if (!PyArg_ParseTuple(args, "O!:combining",
                           &PyUnicode_Type, &v))
         return NULL;
+                          &PyUnicode_Type, &v))
+        return NULL;
     c = getuchar(v);
     if (c == (Py_UCS4)-1)
 …
     if (!PyArg_ParseTuple(args, "O!:mirrored",
                           &PyUnicode_Type, &v))
         return NULL;
+                          &PyUnicode_Type, &v))
+        return NULL;
     c = getuchar(v);
     if (c == (Py_UCS4)-1)
 …
     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                           &PyUnicode_Type, &v))
         return NULL;
+                          &PyUnicode_Type, &v))
+        return NULL;
     c = getuchar(v);
     if (c == (Py_UCS4)-1)
 …
     if (!PyArg_ParseTuple(args, "O!:decomposition",
                           &PyUnicode_Type, &v))
         return NULL;
+                          &PyUnicode_Type, &v))
+        return NULL;
     c = getuchar(v);
     if (c == (Py_UCS4)-1)
 …
         i += strlen(decomp + i);
+    }
     decomp[i] = '\0';
 …
                                (code&((1<<DECOMP_SHIFT)-1))];
+    }
     /* high byte is number of hex bytes (usually one or two), low byte
        is prefix code (from*/
 …
     Py_UNICODE *i, *end, *o;
     /* Longest decomposition in Unicode 3.2: U+FDFA */
     Py_UNICODE stack[20];
+    Py_UNICODE stack[20];
     Py_ssize_t space, isize;
     int index, prefix, count, stackptr;
     unsigned char prev, cur;
     stackptr = 0;
     isize = PyUnicode_GET_SIZE(input);
     /* Overallocate atmost 10 characters. */
+    /* Overallocate at most 10 characters. */
     space = (isize > 10 ? 10 : isize) + isize;
     result = PyUnicode_FromUnicode(NULL, space);
 …
             Py_UNICODE code = stack[--stackptr];
             /* Hangul Decomposition adds three characters in
                a single step, so we need atleast that much room. */
+               a single step, so we need at least that much room. */
             if (space < 3) {
                 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
 …
     end = i + PyUnicode_GET_SIZE(result);
     o = PyUnicode_AS_UNICODE(result);
   again:
     while (i < end) {
       for (index = 0; index < cskipped; index++) {
           if (skipped[index] == i) {
               /* *i character is skipped.
+              /* *i character is skipped.
                  Remove from list. */
               skipped[index] = skipped[cskipped-1];
 …
          pairs, since we always have decomposed data. */
       if (LBase <= *i && *i < (LBase+LCount) &&
           i + 1 < end &&
+          i + 1 < end &&
           VBase <= i[1] && i[1] <= (VBase+VCount)) {
           int LIndex, VIndex;
 …
       while (i1 < end) {
           int comb1 = _getrecord_ex(*i1)->combining;
+          if (comb1 && comb == comb1) {
+              /* Character is blocked. */
+              i1++;
+              continue;
+          if (comb) {
+              if (comb1 == 0)
+                  break;
+              if (comb >= comb1) {
+                  /* Character is blocked. */
+                  i1++;
+                  continue;
+              }
+          }
           l = find_nfc_index(self, nfc_last, *i1);
 …
           if (code == 0)
               goto not_combinable;
           /* Replace the original character. */
           *i = code;
           /* Mark the second character unused. */
+          assert(cskipped < 20);
           skipped[cskipped++] = i1;
           i1++;
 …
     return result;
+}
+/* Return 1 if the input is certainly normalized, 0 if it might not be.
+
+   self   -- when non-NULL the quick check is skipped and 0 is returned.
+   input  -- Unicode object whose Py_UNICODE units are scanned in order.
+   nfc, k -- select which two-bit field of normalization_quick_check is
+             tested; the field starts at bit (nfc ? 4 : 0) + (k ? 2 : 0).
+
+   A result of 0 only means this scan could not confirm the input is
+   normalized. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+    Py_UNICODE *i, *end;
+    unsigned char prev_combining = 0, quickcheck_mask;
+    /* An older version of the database is requested, quickchecks must be
+       disabled. */
+    if (self != NULL)
+        return 0;
+    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
+       as described in http://unicode.org/reports/tr15/#Annex8.
+       The mask covers both bits of the field, so the test in the loop
+       treats Maybe and No alike: any non-zero value gives up. */
+    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + PyUnicode_GET_SIZE(input);
+    while (i < end) {
+        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
+        unsigned char combining = record->combining;
+        unsigned char quickcheck = record->normalization_quick_check;
+        if (quickcheck & quickcheck_mask)
+            return 0; /* this string might need normalization */
+        if (combining && prev_combining > combining)
+            return 0; /* non-canonical sort order, not normalized */
+        prev_combining = combining; /* a 0 here restarts the ordering check */
+    }
+    return 1; /* certainly normalized */
+}
 PyDoc_STRVAR(unicodedata_normalize__doc__,
 "normalize(form, unistr)\n\
 …
+    }
+    if (strcmp(form, "NFC") == 0)
+    if (strcmp(form, "NFC") == 0) {
+        if (is_normalized(self, input, 1, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 0);
+    if (strcmp(form, "NFKC") == 0)
+    }
+    if (strcmp(form, "NFKC") == 0) {
+        if (is_normalized(self, input, 1, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 1);
+    if (strcmp(form, "NFD") == 0)
+    }
+    if (strcmp(form, "NFD") == 0) {
+        if (is_normalized(self, input, 0, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 0);
+    if (strcmp(form, "NFKD") == 0)
+    }
+    if (strcmp(form, "NFKD") == 0) {
+        if (is_normalized(self, input, 0, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 1);
+    }
     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     return NULL;
 …
     unsigned long ix;
     for (i = 0; i < len; i++) {
         h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
+        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
         ix = h & 0xff000000;
         if (ix)
 …
     return (
         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
+        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
+        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
+        (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
+        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
+        (0x2A700 <= code && code <= 0x2B734));  /* CJK Ideograph Extension C */
+}
 …
             /* unassigned */
             return 0;
+        }
+        }
+    }
     if (SBase <= code && code < SBase+SCount) {
         /* Hangul syllable. */
         int SIndex = code - SBase;
         int L = SIndex / NCount;
         int V = (SIndex % NCount) / TCount;
         int T = SIndex % TCount;
         if (buflen < 27)
             /* Worst case: HANGUL SYLLABLE <10chars>. */
             return 0;
         strcpy(buffer, "HANGUL SYLLABLE ");
         buffer += 16;
         strcpy(buffer, hangul_syllables[L][0]);
         buffer += strlen(hangul_syllables[L][0]);
         strcpy(buffer, hangul_syllables[V][1]);
         buffer += strlen(hangul_syllables[V][1]);
         strcpy(buffer, hangul_syllables[T][2]);
         buffer += strlen(hangul_syllables[T][2]);
         *buffer = '\0';
         return 1;
+        /* Hangul syllable. */
+        int SIndex = code - SBase;
+        int L = SIndex / NCount;
+        int V = (SIndex % NCount) / TCount;
+        int T = SIndex % TCount;
+        if (buflen < 27)
+            /* Worst case: HANGUL SYLLABLE <10chars>. */
+            return 0;
+        strcpy(buffer, "HANGUL SYLLABLE ");
+        buffer += 16;
+        strcpy(buffer, hangul_syllables[L][0]);
+        buffer += strlen(hangul_syllables[L][0]);
+        strcpy(buffer, hangul_syllables[V][1]);
+        buffer += strlen(hangul_syllables[V][1]);
+        strcpy(buffer, hangul_syllables[T][2]);
+        buffer += strlen(hangul_syllables[T][2]);
+        *buffer = '\0';
+        return 1;
+    }
 …
         return 0;
     for (i = 0; i < namelen; i++) {
         if (toupper(Py_CHARMASK(name[i])) != buffer[i])
+        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
             return 0;
+    }
 …
+}
 static void
+static void
 find_syllable(const char *str, int *len, int *pos, int count, int column)
+{
 …
     *len = -1;
     for (i = 0; i < count; i++) {
         char *s = hangul_syllables[i][column];
         len1 = strlen(s);
         if (len1 <= *len)
             continue;
         if (strncmp(str, s, len1) == 0) {
             *len = len1;
             *pos = i;
+        }
+        char *s = hangul_syllables[i][column];
+        len1 = strlen(s);
+        if (len1 <= *len)
+            continue;
+        if (strncmp(str, s, len1) == 0) {
+            *len = len1;
+            *pos = i;
+        }
+    }
     if (*len == -1) {
         *len = 0;
+        *len = 0;
+    }
+}
 …
     /* Check for hangul syllables. */
     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
         int len, L = -1, V = -1, T = -1;
         const char *pos = name + 16;
         find_syllable(pos, &len, &L, LCount, 0);
         pos += len;
         find_syllable(pos, &len, &V, VCount, 1);
         pos += len;
         find_syllable(pos, &len, &T, TCount, 2);
         pos += len;
         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
             *code = SBase + (L*VCount+V)*TCount + T;
             return 1;
+        }
+        int len, L = -1, V = -1, T = -1;
+        const char *pos = name + 16;
+        find_syllable(pos, &len, &L, LCount, 0);
+        pos += len;
+        find_syllable(pos, &len, &V, VCount, 1);
+        pos += len;
+        find_syllable(pos, &len, &T, TCount, 2);
+        pos += len;
+        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
+            *code = SBase + (L*VCount+V)*TCount + T;
+            return 1;
+        }
         /* Otherwise, it's an illegal syllable name. */
         return 0;
 …
+}
 static const _PyUnicode_Name_CAPI hashAPI =
+static const _PyUnicode_Name_CAPI hashAPI =
+{
     sizeof(_PyUnicode_Name_CAPI),
 …
     if (!_getucname(self, c, name, sizeof(name))) {
         if (defobj == NULL) {
             PyErr_SetString(PyExc_ValueError, "no such name");
+        if (defobj == NULL) {
+            PyErr_SetString(PyExc_ValueError, "no such name");
             return NULL;
+        }
         else {
             Py_INCREF(defobj);
             return defobj;
+        }
+        }
+        else {
+            Py_INCREF(defobj);
+            return defobj;
+        }
+    }
 …
 #endif
     str[0] = (Py_UNICODE) code;
     return PyUnicode_FromUnicode(str, 1);
+    return PyUnicode_FromUnicode(str, 1);
+}
 …
     {"normalize", unicodedata_normalize, METH_VARARGS,
                   unicodedata_normalize__doc__},
     {NULL, NULL}                /* sentinel */
+    {NULL, NULL}                /* sentinel */
 };
 static PyTypeObject UCD_Type = {
         /* The ob_type field must be initialized in the module init function
          * to be portable to Windows without using C++. */
         PyVarObject_HEAD_INIT(NULL, 0)
         "unicodedata.UCD",              /*tp_name*/
         sizeof(PreviousDBVersion),      /*tp_basicsize*/
 ,                      /*tp_itemsize*/
         /* methods */
         (destructor)PyObject_Del, /*tp_dealloc*/
 ,                      /*tp_print*/
 ,                      /*tp_getattr*/
 ,                      /*tp_setattr*/
 ,                      /*tp_compare*/
 ,                      /*tp_repr*/
 ,                      /*tp_as_number*/
 ,                      /*tp_as_sequence*/
 ,                      /*tp_as_mapping*/
 ,                      /*tp_hash*/
+        /* The ob_type field must be initialized in the module init function
+         * to be portable to Windows without using C++. */
+        PyVarObject_HEAD_INIT(NULL, 0)
+        "unicodedata.UCD",              /*tp_name*/
+        sizeof(PreviousDBVersion),      /*tp_basicsize*/
+,                      /*tp_itemsize*/
+        /* methods */
+        (destructor)PyObject_Del, /*tp_dealloc*/
+,                      /*tp_print*/
+,                      /*tp_getattr*/
+,                      /*tp_setattr*/
+,                      /*tp_compare*/
+,                      /*tp_repr*/
+,                      /*tp_as_number*/
+,                      /*tp_as_sequence*/
+,                      /*tp_as_mapping*/
+,                      /*tp_hash*/
 ,                      /*tp_call*/
 ,                      /*tp_str*/
 …
 defines character properties for all Unicode characters. The data in\n\
 this database is based on the UnicodeData.txt file version\n\
 .1.0 which is publically available from ftp://ftp.unicode.org/.\n\
+.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
 \n\
 The module uses the same names and symbols as defined by the\n\
 UnicodeData File Format 5.1.0 (see\n\
 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
+UnicodeData File Format 5.2.0 (see\n\
+http://www.unicode.org/reports/tr44/tr44-4.html).");
 PyMODINIT_FUNC
 …
     /* Export C API */
     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
+    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
     if (v != NULL)
         PyModule_AddObject(m, "ucnhash_CAPI", v);
+}
 /*
+/*
 Local variables:
 c-basic-offset: 4

Note: See TracChangeset for help on using the changeset viewer.

/python/vendor/Python-2.7.6	merged	eligible
/python/vendor/current	merged	eligible

Context Navigation

Changeset 391 for python/trunk/Modules/unicodedata.c

Legend:

python/trunk

python/trunk/Modules/unicodedata.c

Download in other formats: