source: python/trunk/Modules/unicodedata.c

Last change on this file was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 37.2 KB
RevLine 
[2]1/* ------------------------------------------------------------------------
2
[391]3 unicodedata -- Provides access to the Unicode 5.2 data base.
[2]4
[391]5 Data was extracted from the Unicode 5.2 UnicodeData.txt file.
[2]6
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
10
11 Copyright (c) Corporation for National Research Initiatives.
12
13 ------------------------------------------------------------------------ */
14
15#include "Python.h"
16#include "ucnhash.h"
17#include "structmember.h"
18
19/* character properties */
20
/* Property record for one class of characters.  _getrecord_ex() returns
   a pointer to one of these out of _PyUnicode_Database_Records. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidth */
    const unsigned char east_asian_width;
    /* quick-check bits; see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Differences of one character between the current database and an
   older one.  In the unsigned char fields the accessors in this file
   treat 0xFF as "unchanged"; category_changed == 0 is treated as
   "unassigned in the old version".

   NOTE: the sequence of fields should be the same as in
   merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const double numeric_changed;
} change_record;
41
42/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
46_getrecord_ex(Py_UCS4 code)
47{
48 int index;
49 if (code >= 0x110000)
50 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 }
55
56 return &_PyUnicode_Database_Records[index];
57}
58
59/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
67#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68
69static PyMemberDef DB_members[] = {
[391]70 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
[2]71 {NULL}
72};
73
74/* forward declaration */
75static PyTypeObject UCD_Type;
76
77static PyObject*
78new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79 Py_UCS4 (*normalization)(Py_UCS4))
80{
[391]81 PreviousDBVersion *self;
82 self = PyObject_New(PreviousDBVersion, &UCD_Type);
83 if (self == NULL)
84 return NULL;
85 self->name = name;
86 self->getrecord = getrecord;
[2]87 self->normalization = normalization;
[391]88 return (PyObject*)self;
[2]89}
90
91
92static Py_UCS4 getuchar(PyUnicodeObject *obj)
93{
94 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95
96 if (PyUnicode_GET_SIZE(obj) == 1)
[391]97 return *v;
[2]98#ifndef Py_UNICODE_WIDE
99 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
100 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
101 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
[391]102 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
[2]103#endif
104 PyErr_SetString(PyExc_TypeError,
105 "need a single Unicode character as parameter");
106 return (Py_UCS4)-1;
107}
108
109/* --- Module API --------------------------------------------------------- */
110
111PyDoc_STRVAR(unicodedata_decimal__doc__,
112"decimal(unichr[, default])\n\
113\n\
114Returns the decimal value assigned to the Unicode character unichr\n\
115as integer. If no such value is defined, default is returned, or, if\n\
116not given, ValueError is raised.");
117
118static PyObject *
119unicodedata_decimal(PyObject *self, PyObject *args)
120{
121 PyUnicodeObject *v;
122 PyObject *defobj = NULL;
123 int have_old = 0;
124 long rc;
125 Py_UCS4 c;
126
127 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
128 return NULL;
129 c = getuchar(v);
130 if (c == (Py_UCS4)-1)
131 return NULL;
132
133 if (self) {
134 const change_record *old = get_old_record(self, c);
135 if (old->category_changed == 0) {
136 /* unassigned */
137 have_old = 1;
138 rc = -1;
[391]139 }
[2]140 else if (old->decimal_changed != 0xFF) {
141 have_old = 1;
142 rc = old->decimal_changed;
143 }
144 }
145
146 if (!have_old)
147 rc = Py_UNICODE_TODECIMAL(c);
148 if (rc < 0) {
[391]149 if (defobj == NULL) {
150 PyErr_SetString(PyExc_ValueError,
151 "not a decimal");
[2]152 return NULL;
[391]153 }
154 else {
155 Py_INCREF(defobj);
156 return defobj;
157 }
[2]158 }
159 return PyInt_FromLong(rc);
160}
161
162PyDoc_STRVAR(unicodedata_digit__doc__,
163"digit(unichr[, default])\n\
164\n\
165Returns the digit value assigned to the Unicode character unichr as\n\
166integer. If no such value is defined, default is returned, or, if\n\
167not given, ValueError is raised.");
168
169static PyObject *
170unicodedata_digit(PyObject *self, PyObject *args)
171{
172 PyUnicodeObject *v;
173 PyObject *defobj = NULL;
174 long rc;
175 Py_UCS4 c;
176
177 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
178 return NULL;
179 c = getuchar(v);
180 if (c == (Py_UCS4)-1)
181 return NULL;
182 rc = Py_UNICODE_TODIGIT(c);
183 if (rc < 0) {
[391]184 if (defobj == NULL) {
185 PyErr_SetString(PyExc_ValueError, "not a digit");
[2]186 return NULL;
[391]187 }
188 else {
189 Py_INCREF(defobj);
190 return defobj;
191 }
[2]192 }
193 return PyInt_FromLong(rc);
194}
195
196PyDoc_STRVAR(unicodedata_numeric__doc__,
197"numeric(unichr[, default])\n\
198\n\
199Returns the numeric value assigned to the Unicode character unichr\n\
200as float. If no such value is defined, default is returned, or, if\n\
201not given, ValueError is raised.");
202
203static PyObject *
204unicodedata_numeric(PyObject *self, PyObject *args)
205{
206 PyUnicodeObject *v;
207 PyObject *defobj = NULL;
208 int have_old = 0;
209 double rc;
210 Py_UCS4 c;
211
212 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
213 return NULL;
214 c = getuchar(v);
215 if (c == (Py_UCS4)-1)
216 return NULL;
217
218 if (self) {
219 const change_record *old = get_old_record(self, c);
220 if (old->category_changed == 0) {
221 /* unassigned */
222 have_old = 1;
223 rc = -1.0;
[391]224 }
[2]225 else if (old->decimal_changed != 0xFF) {
226 have_old = 1;
227 rc = old->decimal_changed;
228 }
229 }
230
231 if (!have_old)
232 rc = Py_UNICODE_TONUMERIC(c);
233 if (rc == -1.0) {
[391]234 if (defobj == NULL) {
235 PyErr_SetString(PyExc_ValueError, "not a numeric character");
236 return NULL;
237 }
238 else {
239 Py_INCREF(defobj);
240 return defobj;
241 }
[2]242 }
243 return PyFloat_FromDouble(rc);
244}
245
246PyDoc_STRVAR(unicodedata_category__doc__,
247"category(unichr)\n\
248\n\
249Returns the general category assigned to the Unicode character\n\
250unichr as string.");
251
252static PyObject *
253unicodedata_category(PyObject *self, PyObject *args)
254{
255 PyUnicodeObject *v;
256 int index;
257 Py_UCS4 c;
258
259 if (!PyArg_ParseTuple(args, "O!:category",
[391]260 &PyUnicode_Type, &v))
261 return NULL;
[2]262 c = getuchar(v);
263 if (c == (Py_UCS4)-1)
264 return NULL;
265 index = (int) _getrecord_ex(c)->category;
266 if (self) {
267 const change_record *old = get_old_record(self, c);
268 if (old->category_changed != 0xFF)
269 index = old->category_changed;
270 }
271 return PyString_FromString(_PyUnicode_CategoryNames[index]);
272}
273
274PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275"bidirectional(unichr)\n\
276\n\
[391]277Returns the bidirectional class assigned to the Unicode character\n\
[2]278unichr as string. If no such value is defined, an empty string is\n\
279returned.");
280
281static PyObject *
282unicodedata_bidirectional(PyObject *self, PyObject *args)
283{
284 PyUnicodeObject *v;
285 int index;
286 Py_UCS4 c;
287
288 if (!PyArg_ParseTuple(args, "O!:bidirectional",
[391]289 &PyUnicode_Type, &v))
290 return NULL;
[2]291 c = getuchar(v);
292 if (c == (Py_UCS4)-1)
293 return NULL;
294 index = (int) _getrecord_ex(c)->bidirectional;
295 if (self) {
296 const change_record *old = get_old_record(self, c);
297 if (old->category_changed == 0)
298 index = 0; /* unassigned */
299 else if (old->bidir_changed != 0xFF)
300 index = old->bidir_changed;
301 }
302 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
303}
304
305PyDoc_STRVAR(unicodedata_combining__doc__,
306"combining(unichr)\n\
307\n\
308Returns the canonical combining class assigned to the Unicode\n\
309character unichr as integer. Returns 0 if no combining class is\n\
310defined.");
311
312static PyObject *
313unicodedata_combining(PyObject *self, PyObject *args)
314{
315 PyUnicodeObject *v;
316 int index;
317 Py_UCS4 c;
318
319 if (!PyArg_ParseTuple(args, "O!:combining",
[391]320 &PyUnicode_Type, &v))
321 return NULL;
[2]322 c = getuchar(v);
323 if (c == (Py_UCS4)-1)
324 return NULL;
325 index = (int) _getrecord_ex(c)->combining;
326 if (self) {
327 const change_record *old = get_old_record(self, c);
328 if (old->category_changed == 0)
329 index = 0; /* unassigned */
330 }
331 return PyInt_FromLong(index);
332}
333
334PyDoc_STRVAR(unicodedata_mirrored__doc__,
335"mirrored(unichr)\n\
336\n\
337Returns the mirrored property assigned to the Unicode character\n\
338unichr as integer. Returns 1 if the character has been identified as\n\
339a \"mirrored\" character in bidirectional text, 0 otherwise.");
340
341static PyObject *
342unicodedata_mirrored(PyObject *self, PyObject *args)
343{
344 PyUnicodeObject *v;
345 int index;
346 Py_UCS4 c;
347
348 if (!PyArg_ParseTuple(args, "O!:mirrored",
[391]349 &PyUnicode_Type, &v))
350 return NULL;
[2]351 c = getuchar(v);
352 if (c == (Py_UCS4)-1)
353 return NULL;
354 index = (int) _getrecord_ex(c)->mirrored;
355 if (self) {
356 const change_record *old = get_old_record(self, c);
357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
361 }
362 return PyInt_FromLong(index);
363}
364
365PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366"east_asian_width(unichr)\n\
367\n\
368Returns the east asian width assigned to the Unicode character\n\
369unichr as string.");
370
371static PyObject *
372unicodedata_east_asian_width(PyObject *self, PyObject *args)
373{
374 PyUnicodeObject *v;
375 int index;
376 Py_UCS4 c;
377
378 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
[391]379 &PyUnicode_Type, &v))
380 return NULL;
[2]381 c = getuchar(v);
382 if (c == (Py_UCS4)-1)
383 return NULL;
384 index = (int) _getrecord_ex(c)->east_asian_width;
385 if (self) {
386 const change_record *old = get_old_record(self, c);
387 if (old->category_changed == 0)
388 index = 0; /* unassigned */
389 }
390 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
391}
392
393PyDoc_STRVAR(unicodedata_decomposition__doc__,
394"decomposition(unichr)\n\
395\n\
396Returns the character decomposition mapping assigned to the Unicode\n\
397character unichr as string. An empty string is returned in case no\n\
398such mapping is defined.");
399
400static PyObject *
401unicodedata_decomposition(PyObject *self, PyObject *args)
402{
403 PyUnicodeObject *v;
404 char decomp[256];
405 int code, index, count, i;
406 unsigned int prefix_index;
407 Py_UCS4 c;
408
409 if (!PyArg_ParseTuple(args, "O!:decomposition",
[391]410 &PyUnicode_Type, &v))
411 return NULL;
[2]412 c = getuchar(v);
413 if (c == (Py_UCS4)-1)
414 return NULL;
415
416 code = (int)c;
417
418 if (self) {
419 const change_record *old = get_old_record(self, c);
420 if (old->category_changed == 0)
421 return PyString_FromString(""); /* unassigned */
422 }
423
424 if (code < 0 || code >= 0x110000)
425 index = 0;
426 else {
427 index = decomp_index1[(code>>DECOMP_SHIFT)];
428 index = decomp_index2[(index<<DECOMP_SHIFT)+
429 (code&((1<<DECOMP_SHIFT)-1))];
430 }
431
432 /* high byte is number of hex bytes (usually one or two), low byte
433 is prefix code (from*/
434 count = decomp_data[index] >> 8;
435
436 /* XXX: could allocate the PyString up front instead
437 (strlen(prefix) + 5 * count + 1 bytes) */
438
439 /* Based on how index is calculated above and decomp_data is generated
440 from Tools/unicode/makeunicodedata.py, it should not be possible
441 to overflow decomp_prefix. */
442 prefix_index = decomp_data[index] & 255;
443 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444
445 /* copy prefix */
446 i = strlen(decomp_prefix[prefix_index]);
447 memcpy(decomp, decomp_prefix[prefix_index], i);
448
449 while (count-- > 0) {
450 if (i)
451 decomp[i++] = ' ';
452 assert((size_t)i < sizeof(decomp));
453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454 decomp_data[++index]);
455 i += strlen(decomp + i);
456 }
[391]457
[2]458 decomp[i] = '\0';
459
460 return PyString_FromString(decomp);
461}
462
463static void
464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
465{
466 if (code >= 0x110000) {
467 *index = 0;
468 } else if (self && get_old_record(self, code)->category_changed==0) {
469 /* unassigned in old version */
470 *index = 0;
471 }
472 else {
473 *index = decomp_index1[(code>>DECOMP_SHIFT)];
474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475 (code&((1<<DECOMP_SHIFT)-1))];
476 }
[391]477
[2]478 /* high byte is number of hex bytes (usually one or two), low byte
479 is prefix code (from*/
480 *count = decomp_data[*index] >> 8;
481 *prefix = decomp_data[*index] & 255;
482
483 (*index)++;
484}
485
486#define SBase 0xAC00
487#define LBase 0x1100
488#define VBase 0x1161
489#define TBase 0x11A7
490#define LCount 19
491#define VCount 21
492#define TCount 28
493#define NCount (VCount*TCount)
494#define SCount (LCount*NCount)
495
496static PyObject*
497nfd_nfkd(PyObject *self, PyObject *input, int k)
498{
499 PyObject *result;
500 Py_UNICODE *i, *end, *o;
501 /* Longest decomposition in Unicode 3.2: U+FDFA */
[391]502 Py_UNICODE stack[20];
[2]503 Py_ssize_t space, isize;
504 int index, prefix, count, stackptr;
505 unsigned char prev, cur;
[391]506
[2]507 stackptr = 0;
508 isize = PyUnicode_GET_SIZE(input);
[391]509 /* Overallocate at most 10 characters. */
[2]510 space = (isize > 10 ? 10 : isize) + isize;
511 result = PyUnicode_FromUnicode(NULL, space);
512 if (!result)
513 return NULL;
514 i = PyUnicode_AS_UNICODE(input);
515 end = i + isize;
516 o = PyUnicode_AS_UNICODE(result);
517
518 while (i < end) {
519 stack[stackptr++] = *i++;
520 while(stackptr) {
521 Py_UNICODE code = stack[--stackptr];
522 /* Hangul Decomposition adds three characters in
[391]523 a single step, so we need at least that much room. */
[2]524 if (space < 3) {
525 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
526 space += 10;
527 if (PyUnicode_Resize(&result, newsize) == -1)
528 return NULL;
529 o = PyUnicode_AS_UNICODE(result) + newsize - space;
530 }
531 /* Hangul Decomposition. */
532 if (SBase <= code && code < (SBase+SCount)) {
533 int SIndex = code - SBase;
534 int L = LBase + SIndex / NCount;
535 int V = VBase + (SIndex % NCount) / TCount;
536 int T = TBase + SIndex % TCount;
537 *o++ = L;
538 *o++ = V;
539 space -= 2;
540 if (T != TBase) {
541 *o++ = T;
542 space --;
543 }
544 continue;
545 }
546 /* normalization changes */
547 if (self) {
548 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
549 if (value != 0) {
550 stack[stackptr++] = value;
551 continue;
552 }
553 }
554
555 /* Other decompositions. */
556 get_decomp_record(self, code, &index, &prefix, &count);
557
558 /* Copy character if it is not decomposable, or has a
559 compatibility decomposition, but we do NFD. */
560 if (!count || (prefix && !k)) {
561 *o++ = code;
562 space--;
563 continue;
564 }
565 /* Copy decomposition onto the stack, in reverse
566 order. */
567 while(count) {
568 code = decomp_data[index + (--count)];
569 stack[stackptr++] = code;
570 }
571 }
572 }
573
574 /* Drop overallocation. Cannot fail. */
575 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
576
577 /* Sort canonically. */
578 i = PyUnicode_AS_UNICODE(result);
579 prev = _getrecord_ex(*i)->combining;
580 end = i + PyUnicode_GET_SIZE(result);
581 for (i++; i < end; i++) {
582 cur = _getrecord_ex(*i)->combining;
583 if (prev == 0 || cur == 0 || prev <= cur) {
584 prev = cur;
585 continue;
586 }
587 /* Non-canonical order. Need to switch *i with previous. */
588 o = i - 1;
589 while (1) {
590 Py_UNICODE tmp = o[1];
591 o[1] = o[0];
592 o[0] = tmp;
593 o--;
594 if (o < PyUnicode_AS_UNICODE(result))
595 break;
596 prev = _getrecord_ex(*o)->combining;
597 if (prev == 0 || prev <= cur)
598 break;
599 }
600 prev = _getrecord_ex(*i)->combining;
601 }
602 return result;
603}
604
605static int
606find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
607{
608 int index;
609 for (index = 0; nfc[index].start; index++) {
610 int start = nfc[index].start;
611 if (code < start)
612 return -1;
613 if (code <= start + nfc[index].count) {
614 int delta = code - start;
615 return nfc[index].index + delta;
616 }
617 }
618 return -1;
619}
620
621static PyObject*
622nfc_nfkc(PyObject *self, PyObject *input, int k)
623{
624 PyObject *result;
625 Py_UNICODE *i, *i1, *o, *end;
626 int f,l,index,index1,comb;
627 Py_UNICODE code;
628 Py_UNICODE *skipped[20];
629 int cskipped = 0;
630
631 result = nfd_nfkd(self, input, k);
632 if (!result)
633 return NULL;
634
635 /* We are going to modify result in-place.
636 If nfd_nfkd is changed to sometimes return the input,
637 this code needs to be reviewed. */
638 assert(result != input);
639
640 i = PyUnicode_AS_UNICODE(result);
641 end = i + PyUnicode_GET_SIZE(result);
642 o = PyUnicode_AS_UNICODE(result);
[391]643
[2]644 again:
645 while (i < end) {
646 for (index = 0; index < cskipped; index++) {
647 if (skipped[index] == i) {
[391]648 /* *i character is skipped.
[2]649 Remove from list. */
650 skipped[index] = skipped[cskipped-1];
651 cskipped--;
652 i++;
653 goto again; /* continue while */
654 }
655 }
656 /* Hangul Composition. We don't need to check for <LV,T>
657 pairs, since we always have decomposed data. */
658 if (LBase <= *i && *i < (LBase+LCount) &&
[391]659 i + 1 < end &&
[2]660 VBase <= i[1] && i[1] <= (VBase+VCount)) {
661 int LIndex, VIndex;
662 LIndex = i[0] - LBase;
663 VIndex = i[1] - VBase;
664 code = SBase + (LIndex*VCount+VIndex)*TCount;
665 i+=2;
666 if (i < end &&
667 TBase <= *i && *i <= (TBase+TCount)) {
668 code += *i-TBase;
669 i++;
670 }
671 *o++ = code;
672 continue;
673 }
674
675 f = find_nfc_index(self, nfc_first, *i);
676 if (f == -1) {
677 *o++ = *i++;
678 continue;
679 }
680 /* Find next unblocked character. */
681 i1 = i+1;
682 comb = 0;
683 while (i1 < end) {
684 int comb1 = _getrecord_ex(*i1)->combining;
[391]685 if (comb) {
686 if (comb1 == 0)
687 break;
688 if (comb >= comb1) {
689 /* Character is blocked. */
690 i1++;
691 continue;
692 }
[2]693 }
694 l = find_nfc_index(self, nfc_last, *i1);
695 /* *i1 cannot be combined with *i. If *i1
696 is a starter, we don't need to look further.
697 Otherwise, record the combining class. */
698 if (l == -1) {
699 not_combinable:
700 if (comb1 == 0)
701 break;
702 comb = comb1;
703 i1++;
704 continue;
705 }
706 index = f*TOTAL_LAST + l;
707 index1 = comp_index[index >> COMP_SHIFT];
708 code = comp_data[(index1<<COMP_SHIFT)+
709 (index&((1<<COMP_SHIFT)-1))];
710 if (code == 0)
711 goto not_combinable;
[391]712
[2]713 /* Replace the original character. */
714 *i = code;
715 /* Mark the second character unused. */
[391]716 assert(cskipped < 20);
[2]717 skipped[cskipped++] = i1;
718 i1++;
719 f = find_nfc_index(self, nfc_first, *i);
720 if (f == -1)
721 break;
722 }
723 *o++ = *i++;
724 }
725 if (o != end)
726 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
727 return result;
728}
[391]729
730/* Return 1 if the input is certainly normalized, 0 if it might not be. */
731static int
732is_normalized(PyObject *self, PyObject *input, int nfc, int k)
733{
734 Py_UNICODE *i, *end;
735 unsigned char prev_combining = 0, quickcheck_mask;
736
737 /* An older version of the database is requested, quickchecks must be
738 disabled. */
739 if (self != NULL)
740 return 0;
741
742 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
743 as described in http://unicode.org/reports/tr15/#Annex8. */
744 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
745
746 i = PyUnicode_AS_UNICODE(input);
747 end = i + PyUnicode_GET_SIZE(input);
748 while (i < end) {
749 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
750 unsigned char combining = record->combining;
751 unsigned char quickcheck = record->normalization_quick_check;
752
753 if (quickcheck & quickcheck_mask)
754 return 0; /* this string might need normalization */
755 if (combining && prev_combining > combining)
756 return 0; /* non-canonical sort order, not normalized */
757 prev_combining = combining;
758 }
759 return 1; /* certainly normalized */
760}
761
[2]762PyDoc_STRVAR(unicodedata_normalize__doc__,
763"normalize(form, unistr)\n\
764\n\
765Return the normal form 'form' for the Unicode string unistr. Valid\n\
766values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
767
768static PyObject*
769unicodedata_normalize(PyObject *self, PyObject *args)
770{
771 char *form;
772 PyObject *input;
773
774 if(!PyArg_ParseTuple(args, "sO!:normalize",
775 &form, &PyUnicode_Type, &input))
776 return NULL;
777
778 if (PyUnicode_GetSize(input) == 0) {
779 /* Special case empty input strings, since resizing
780 them later would cause internal errors. */
781 Py_INCREF(input);
782 return input;
783 }
784
[391]785 if (strcmp(form, "NFC") == 0) {
786 if (is_normalized(self, input, 1, 0)) {
787 Py_INCREF(input);
788 return input;
789 }
[2]790 return nfc_nfkc(self, input, 0);
[391]791 }
792 if (strcmp(form, "NFKC") == 0) {
793 if (is_normalized(self, input, 1, 1)) {
794 Py_INCREF(input);
795 return input;
796 }
[2]797 return nfc_nfkc(self, input, 1);
[391]798 }
799 if (strcmp(form, "NFD") == 0) {
800 if (is_normalized(self, input, 0, 0)) {
801 Py_INCREF(input);
802 return input;
803 }
[2]804 return nfd_nfkd(self, input, 0);
[391]805 }
806 if (strcmp(form, "NFKD") == 0) {
807 if (is_normalized(self, input, 0, 1)) {
808 Py_INCREF(input);
809 return input;
810 }
[2]811 return nfd_nfkd(self, input, 1);
[391]812 }
[2]813 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
814 return NULL;
815}
816
817/* -------------------------------------------------------------------- */
818/* unicode character name tables */
819
820/* data file generated by Tools/unicode/makeunicodedata.py */
821#include "unicodename_db.h"
822
823/* -------------------------------------------------------------------- */
824/* database code (cut and pasted from the unidb package) */
825
/* Case-insensitive hash over the first `len` bytes of `s`: each byte is
   upper-cased and mixed in with multiplier `scale`, and the value is
   folded back into 24 bits whenever bits 24-31 become set. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long h = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        high = h & 0xff000000;
        if (high != 0)
            h = (h ^ ((high>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return h;
}
840
841static char *hangul_syllables[][3] = {
842 { "G", "A", "" },
843 { "GG", "AE", "G" },
844 { "N", "YA", "GG" },
845 { "D", "YAE", "GS" },
846 { "DD", "EO", "N", },
847 { "R", "E", "NJ" },
848 { "M", "YEO", "NH" },
849 { "B", "YE", "D" },
850 { "BB", "O", "L" },
851 { "S", "WA", "LG" },
852 { "SS", "WAE", "LM" },
853 { "", "OE", "LB" },
854 { "J", "YO", "LS" },
855 { "JJ", "U", "LT" },
856 { "C", "WEO", "LP" },
857 { "K", "WE", "LH" },
858 { "T", "WI", "M" },
859 { "P", "YU", "B" },
860 { "H", "EU", "BS" },
861 { 0, "YI", "S" },
862 { 0, "I", "SS" },
863 { 0, 0, "NG" },
864 { 0, 0, "J" },
865 { 0, 0, "C" },
866 { 0, 0, "K" },
867 { 0, 0, "T" },
868 { 0, 0, "P" },
869 { 0, 0, "H" }
870};
871
872static int
873is_unified_ideograph(Py_UCS4 code)
874{
875 return (
876 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
[391]877 (0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph, Unicode 5.2 */
878 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
879 (0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */
[2]880}
881
882static int
883_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
884{
885 int offset;
886 int i;
887 int word;
888 unsigned char* w;
889
890 if (code >= 0x110000)
891 return 0;
892
893 if (self) {
894 const change_record *old = get_old_record(self, code);
895 if (old->category_changed == 0) {
896 /* unassigned */
897 return 0;
[391]898 }
[2]899 }
900
901 if (SBase <= code && code < SBase+SCount) {
[391]902 /* Hangul syllable. */
903 int SIndex = code - SBase;
904 int L = SIndex / NCount;
905 int V = (SIndex % NCount) / TCount;
906 int T = SIndex % TCount;
[2]907
[391]908 if (buflen < 27)
909 /* Worst case: HANGUL SYLLABLE <10chars>. */
910 return 0;
911 strcpy(buffer, "HANGUL SYLLABLE ");
912 buffer += 16;
913 strcpy(buffer, hangul_syllables[L][0]);
914 buffer += strlen(hangul_syllables[L][0]);
915 strcpy(buffer, hangul_syllables[V][1]);
916 buffer += strlen(hangul_syllables[V][1]);
917 strcpy(buffer, hangul_syllables[T][2]);
918 buffer += strlen(hangul_syllables[T][2]);
919 *buffer = '\0';
920 return 1;
[2]921 }
922
923 if (is_unified_ideograph(code)) {
924 if (buflen < 28)
925 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
926 return 0;
927 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
928 return 1;
929 }
930
931 /* get offset into phrasebook */
932 offset = phrasebook_offset1[(code>>phrasebook_shift)];
933 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
934 (code&((1<<phrasebook_shift)-1))];
935 if (!offset)
936 return 0;
937
938 i = 0;
939
940 for (;;) {
941 /* get word index */
942 word = phrasebook[offset] - phrasebook_short;
943 if (word >= 0) {
944 word = (word << 8) + phrasebook[offset+1];
945 offset += 2;
946 } else
947 word = phrasebook[offset++];
948 if (i) {
949 if (i > buflen)
950 return 0; /* buffer overflow */
951 buffer[i++] = ' ';
952 }
953 /* copy word string from lexicon. the last character in the
954 word has bit 7 set. the last word in a string ends with
955 0x80 */
956 w = lexicon + lexicon_offset[word];
957 while (*w < 128) {
958 if (i >= buflen)
959 return 0; /* buffer overflow */
960 buffer[i++] = *w++;
961 }
962 if (i >= buflen)
963 return 0; /* buffer overflow */
964 buffer[i++] = *w & 127;
965 if (*w == 128)
966 break; /* end of word */
967 }
968
969 return 1;
970}
971
972static int
973_cmpname(PyObject *self, int code, const char* name, int namelen)
974{
975 /* check if code corresponds to the given name */
976 int i;
977 char buffer[NAME_MAXLEN];
978 if (!_getucname(self, code, buffer, sizeof(buffer)))
979 return 0;
980 for (i = 0; i < namelen; i++) {
[391]981 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
[2]982 return 0;
983 }
984 return buffer[namelen] == '\0';
985}
986
[391]987static void
[2]988find_syllable(const char *str, int *len, int *pos, int count, int column)
989{
990 int i, len1;
991 *len = -1;
992 for (i = 0; i < count; i++) {
[391]993 char *s = hangul_syllables[i][column];
994 len1 = strlen(s);
995 if (len1 <= *len)
996 continue;
997 if (strncmp(str, s, len1) == 0) {
998 *len = len1;
999 *pos = i;
1000 }
[2]1001 }
1002 if (*len == -1) {
[391]1003 *len = 0;
[2]1004 }
1005}
1006
1007static int
1008_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1009{
1010 unsigned int h, v;
1011 unsigned int mask = code_size-1;
1012 unsigned int i, incr;
1013
1014 /* Check for hangul syllables. */
1015 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
[391]1016 int len, L = -1, V = -1, T = -1;
1017 const char *pos = name + 16;
1018 find_syllable(pos, &len, &L, LCount, 0);
1019 pos += len;
1020 find_syllable(pos, &len, &V, VCount, 1);
1021 pos += len;
1022 find_syllable(pos, &len, &T, TCount, 2);
1023 pos += len;
1024 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1025 *code = SBase + (L*VCount+V)*TCount + T;
1026 return 1;
1027 }
[2]1028 /* Otherwise, it's an illegal syllable name. */
1029 return 0;
1030 }
1031
1032 /* Check for unified ideographs. */
1033 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1034 /* Four or five hexdigits must follow. */
1035 v = 0;
1036 name += 22;
1037 namelen -= 22;
1038 if (namelen != 4 && namelen != 5)
1039 return 0;
1040 while (namelen--) {
1041 v *= 16;
1042 if (*name >= '0' && *name <= '9')
1043 v += *name - '0';
1044 else if (*name >= 'A' && *name <= 'F')
1045 v += *name - 'A' + 10;
1046 else
1047 return 0;
1048 name++;
1049 }
1050 if (!is_unified_ideograph(v))
1051 return 0;
1052 *code = v;
1053 return 1;
1054 }
1055
1056 /* the following is the same as python's dictionary lookup, with
1057 only minor changes. see the makeunicodedata script for more
1058 details */
1059
1060 h = (unsigned int) _gethash(name, namelen, code_magic);
1061 i = (~h) & mask;
1062 v = code_hash[i];
1063 if (!v)
1064 return 0;
1065 if (_cmpname(self, v, name, namelen)) {
1066 *code = v;
1067 return 1;
1068 }
1069 incr = (h ^ (h >> 3)) & mask;
1070 if (!incr)
1071 incr = mask;
1072 for (;;) {
1073 i = (i + incr) & mask;
1074 v = code_hash[i];
1075 if (!v)
1076 return 0;
1077 if (_cmpname(self, v, name, namelen)) {
1078 *code = v;
1079 return 1;
1080 }
1081 incr = incr << 1;
1082 if (incr > mask)
1083 incr = incr ^ code_poly;
1084 }
1085}
1086
[391]1087static const _PyUnicode_Name_CAPI hashAPI =
[2]1088{
1089 sizeof(_PyUnicode_Name_CAPI),
1090 _getucname,
1091 _getcode
1092};
1093
1094/* -------------------------------------------------------------------- */
1095/* Python bindings */
1096
1097PyDoc_STRVAR(unicodedata_name__doc__,
1098"name(unichr[, default])\n\
1099Returns the name assigned to the Unicode character unichr as a\n\
1100string. If no name is defined, default is returned, or, if not\n\
1101given, ValueError is raised.");
1102
1103static PyObject *
1104unicodedata_name(PyObject* self, PyObject* args)
1105{
1106 char name[NAME_MAXLEN];
1107 Py_UCS4 c;
1108
1109 PyUnicodeObject* v;
1110 PyObject* defobj = NULL;
1111 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1112 return NULL;
1113
1114 c = getuchar(v);
1115 if (c == (Py_UCS4)-1)
1116 return NULL;
1117
1118 if (!_getucname(self, c, name, sizeof(name))) {
[391]1119 if (defobj == NULL) {
1120 PyErr_SetString(PyExc_ValueError, "no such name");
[2]1121 return NULL;
[391]1122 }
1123 else {
1124 Py_INCREF(defobj);
1125 return defobj;
1126 }
[2]1127 }
1128
1129 return Py_BuildValue("s", name);
1130}
1131
1132PyDoc_STRVAR(unicodedata_lookup__doc__,
1133"lookup(name)\n\
1134\n\
1135Look up character by name. If a character with the\n\
1136given name is found, return the corresponding Unicode\n\
1137character. If not found, KeyError is raised.");
1138
1139static PyObject *
1140unicodedata_lookup(PyObject* self, PyObject* args)
1141{
1142 Py_UCS4 code;
1143 Py_UNICODE str[2];
1144
1145 char* name;
1146 int namelen;
1147 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1148 return NULL;
1149
1150 if (!_getcode(self, name, namelen, &code)) {
1151 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1152 name);
1153 return NULL;
1154 }
1155
1156#ifndef Py_UNICODE_WIDE
1157 if (code >= 0x10000) {
1158 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1159 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1160 return PyUnicode_FromUnicode(str, 2);
1161 }
1162#endif
1163 str[0] = (Py_UNICODE) code;
[391]1164 return PyUnicode_FromUnicode(str, 1);
[2]1165}
1166
1167/* XXX Add doc strings. */
1168
1169static PyMethodDef unicodedata_functions[] = {
1170 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1171 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1172 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1173 {"category", unicodedata_category, METH_VARARGS,
1174 unicodedata_category__doc__},
1175 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1176 unicodedata_bidirectional__doc__},
1177 {"combining", unicodedata_combining, METH_VARARGS,
1178 unicodedata_combining__doc__},
1179 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1180 unicodedata_mirrored__doc__},
1181 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1182 unicodedata_east_asian_width__doc__},
1183 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1184 unicodedata_decomposition__doc__},
1185 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1186 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1187 {"normalize", unicodedata_normalize, METH_VARARGS,
1188 unicodedata_normalize__doc__},
[391]1189 {NULL, NULL} /* sentinel */
[2]1190};
1191
1192static PyTypeObject UCD_Type = {
[391]1193 /* The ob_type field must be initialized in the module init function
1194 * to be portable to Windows without using C++. */
1195 PyVarObject_HEAD_INIT(NULL, 0)
1196 "unicodedata.UCD", /*tp_name*/
1197 sizeof(PreviousDBVersion), /*tp_basicsize*/
1198 0, /*tp_itemsize*/
1199 /* methods */
1200 (destructor)PyObject_Del, /*tp_dealloc*/
1201 0, /*tp_print*/
1202 0, /*tp_getattr*/
1203 0, /*tp_setattr*/
1204 0, /*tp_compare*/
1205 0, /*tp_repr*/
1206 0, /*tp_as_number*/
1207 0, /*tp_as_sequence*/
1208 0, /*tp_as_mapping*/
1209 0, /*tp_hash*/
[2]1210 0, /*tp_call*/
1211 0, /*tp_str*/
1212 PyObject_GenericGetAttr,/*tp_getattro*/
1213 0, /*tp_setattro*/
1214 0, /*tp_as_buffer*/
1215 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1216 0, /*tp_doc*/
1217 0, /*tp_traverse*/
1218 0, /*tp_clear*/
1219 0, /*tp_richcompare*/
1220 0, /*tp_weaklistoffset*/
1221 0, /*tp_iter*/
1222 0, /*tp_iternext*/
1223 unicodedata_functions, /*tp_methods*/
1224 DB_members, /*tp_members*/
1225 0, /*tp_getset*/
1226 0, /*tp_base*/
1227 0, /*tp_dict*/
1228 0, /*tp_descr_get*/
1229 0, /*tp_descr_set*/
1230 0, /*tp_dictoffset*/
1231 0, /*tp_init*/
1232 0, /*tp_alloc*/
1233 0, /*tp_new*/
1234 0, /*tp_free*/
1235 0, /*tp_is_gc*/
1236};
1237
1238PyDoc_STRVAR(unicodedata_docstring,
1239"This module provides access to the Unicode Character Database which\n\
1240defines character properties for all Unicode characters. The data in\n\
1241this database is based on the UnicodeData.txt file version\n\
[391]12425.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
[2]1243\n\
1244The module uses the same names and symbols as defined by the\n\
[391]1245UnicodeData File Format 5.2.0 (see\n\
1246http://www.unicode.org/reports/tr44/tr44-4.html).");
[2]1247
1248PyMODINIT_FUNC
1249initunicodedata(void)
1250{
1251 PyObject *m, *v;
1252
1253 Py_TYPE(&UCD_Type) = &PyType_Type;
1254
1255 m = Py_InitModule3(
1256 "unicodedata", unicodedata_functions, unicodedata_docstring);
1257 if (!m)
1258 return;
1259
1260 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1261 Py_INCREF(&UCD_Type);
1262 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1263
1264 /* Previous versions */
1265 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1266 if (v != NULL)
1267 PyModule_AddObject(m, "ucd_3_2_0", v);
1268
1269 /* Export C API */
[391]1270 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
[2]1271 if (v != NULL)
1272 PyModule_AddObject(m, "ucnhash_CAPI", v);
1273}
1274
[391]1275/*
[2]1276Local variables:
1277c-basic-offset: 4
1278indent-tabs-mode: nil
1279End:
1280*/
Note: See TracBrowser for help on using the repository browser.