source: python/vendor/Python-2.6.5/Modules/unicodedata.c

Last change on this file: changeset 2, checked in by Yuri Dario, 15 years ago

Initial import for vendor code.

  • Property svn:eol-style set to native
File size: 34.1 KB
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 data base.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;          /* index into
                                              _PyUnicode_CategoryNames */
    const unsigned char combining;         /* combining class value 0 - 255 */
    const unsigned char bidirectional;     /* index into
                                              _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;          /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;  /* index into
                                              _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
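
/* The lookup above is a two-level trie generated by makeunicodedata.py:
   index1[] maps the high bits of the code point to a block, and index2[]
   maps that block plus the low SHIFT bits to a record index.  Illustrative
   example (assuming SHIFT were 7): U+00E9 would use index1[0x00E9 >> 7],
   then index2[(block << 7) + (0x00E9 & 0x7F)].  The actual SHIFT value is
   whatever makeunicodedata.py emitted into unicodedata_db.h. */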

/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}
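
/* Objects created by new_previous_version() share UCD_Type with the module
   itself, so they expose the same methods defined below.  When such an
   object is passed as "self", the implementations consult its getrecord()
   and normalization() callbacks first, so answers reflect the older Unicode
   version (e.g. the ucd_3_2_0 object registered in initunicodedata()). */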


static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
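
/* On narrow builds a character outside the BMP arrives as a UTF-16
   surrogate pair, which the branch above reassembles.  Worked example:
   U+1D11E is stored as 0xD834 0xDD1E, and
   ((0xD834 & 0x3FF) << 10 | (0xDD1E & 0x3FF)) + 0x10000 == 0x1D11E. */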

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
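
/* Illustrative Python-level behaviour of the three accessors above
   (values per Unicode 5.1):
       unicodedata.decimal(u'9')     -> 9
       unicodedata.digit(u'\xb2')    -> 2     (SUPERSCRIPT TWO)
       unicodedata.numeric(u'\xbd')  -> 0.5   (VULGAR FRACTION ONE HALF)
   Calling the same methods on ucd_3_2_0 answers against the Unicode 3.2
   delta records instead. */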

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (an index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
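
/* The returned string is the raw UnicodeData.txt decomposition field:
   canonical mappings carry no tag, compatibility mappings keep theirs.
   Illustrative examples (per Unicode 5.1):
       decomposition(u'\xe9')    -> "0065 0301"
       decomposition(u'\ufb01')  -> "<compat> 0066 0069"
       decomposition(u'a')       -> "" */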

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (an index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
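
/* Each decomposition is stored in decomp_data as one packed header word,
   (count << 8) | prefix, followed by "count" code points.  On return,
   *index points at the first code point of that sequence. */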

#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
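
/* These constants implement the standard Hangul syllable arithmetic:
   a precomposed syllable S satisfies
       S = SBase + (LIndex*VCount + VIndex)*TCount + TIndex.
   Worked example: U+AC01 (HANGUL SYLLABLE GAG) = 0xAC00 + (0*21 + 0)*28 + 1. */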

static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
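
/* nfd_nfkd() works in two passes: first every input character is pushed on
   a small stack and fully decomposed (Hangul algorithmically, everything
   else via decomp_data), with "k" selecting whether compatibility mappings
   are expanded (NFKD) or copied through (NFD); second, runs of combining
   marks are put into canonical order by their combining classes. */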

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
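
/* Composition runs over the canonically ordered output of nfd_nfkd():
   Hangul <L,V[,T]> sequences are recombined arithmetically, and all other
   pairs are looked up through the nfc_first/nfc_last reindex tables and the
   comp_data trie, skipping blocked combining marks as the canonical
   composition algorithm requires. */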

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
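
/* Illustrative Python-level usage:
       unicodedata.normalize('NFC',  u'e\u0301')  -> u'\xe9'
       unicodedata.normalize('NFD',  u'\xe9')     -> u'e\u0301'
       unicodedata.normalize('NFKD', u'\ufb01')   -> u'fi'
   ucd_3_2_0.normalize(...) produces the same forms under Unicode 3.2. */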

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static char *hangul_syllables[][3] = {
    { "G",  "A",   "" },
    { "GG", "AE",  "G" },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D" },
    { "BB", "O",   "L" },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M" },
    { "P",  "YU",  "B" },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S" },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J" },
    { 0,    0,     "C" },
    { 0,    0,     "K" },
    { 0,    0,     "T" },
    { 0,    0,     "P" },
    { 0,    0,     "H" }
};

static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) ||  /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)); /* CJK Ideograph Extension B */
}

static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
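
/* Apart from the two algorithmic families handled above (Hangul syllables
   and CJK unified ideographs), names are stored compressed: the phrasebook
   holds, per character, a sequence of word indices into lexicon_offset/
   lexicon; the last byte of each word has bit 7 set, and a bare 0x80 ends
   the name.  _getucname() walks that sequence and rejoins the words with
   spaces. */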

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
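
/* The fallback path is an open-addressing probe over code_hash, modelled on
   CPython's dictionary lookup: the first slot is (~h) & mask and the probe
   increment is derived from the hash and then stepped through the code_poly
   polynomial.  _cmpname() regenerates each candidate's name via _getucname()
   instead of storing the names a second time. */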

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
     unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
     unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
     unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
     unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
     unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
     unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
     unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");

PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/