Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

unicodedata.c@ 281

Last change on this file since 281 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 34.1 KB

Line
1	/* ------------------------------------------------------------------------
2
3	unicodedata -- Provides access to the Unicode 5.1 data base.
4
5	Data was extracted from the Unicode 5.1 UnicodeData.txt file.
6
7	Written by Marc-Andre Lemburg (mal@lemburg.com).
8	Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9	Modified by Martin v. Löwis (martin@v.loewis.de)
10
11	Copyright (c) Corporation for National Research Initiatives.
12
13	------------------------------------------------------------------------ */
14
15	#include "Python.h"
16	#include "ucnhash.h"
17	#include "structmember.h"
18
19	/* character properties */
20
/* Per-character property record; _getrecord_ex() returns a pointer to
   one of these from the generated _PyUnicode_Database_Records table. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidth */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Delta describing how one character differed in an older database
   version.  The property accessors in this file treat 0xFF in a
   *_changed byte as "no change" and category_changed == 0 as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
40
41	/* data file generated by Tools/unicode/makeunicodedata.py */
42	#include "unicodedata_db.h"
43
44	static const _PyUnicode_DatabaseRecord*
45	_getrecord_ex(Py_UCS4 code)
46	{
47	int index;
48	if (code >= 0x110000)
49	index = 0;
50	else {
51	index = index1[(code>>SHIFT)];
52	index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
53	}
54
55	return &_PyUnicode_Database_Records[index];
56	}
57
58	/* ------------- Previous-version API ------------------------------------- */
59	typedef struct previous_version {
60	PyObject_HEAD
61	const char *name;
62	const change_record* (*getrecord)(Py_UCS4);
63	Py_UCS4 (*normalization)(Py_UCS4);
64	} PreviousDBVersion;
65
66	#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
67
68	static PyMemberDef DB_members[] = {
69	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
70	{NULL}
71	};
72
73	/* forward declaration */
74	static PyTypeObject UCD_Type;
75
76	static PyObject*
77	new_previous_version(const charname, const change_record (*getrecord)(Py_UCS4),
78	Py_UCS4 (*normalization)(Py_UCS4))
79	{
80	PreviousDBVersion *self;
81	self = PyObject_New(PreviousDBVersion, &UCD_Type);
82	if (self == NULL)
83	return NULL;
84	self->name = name;
85	self->getrecord = getrecord;
86	self->normalization = normalization;
87	return (PyObject*)self;
88	}
89
90
91	static Py_UCS4 getuchar(PyUnicodeObject *obj)
92	{
93	Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
94
95	if (PyUnicode_GET_SIZE(obj) == 1)
96	return *v;
97	#ifndef Py_UNICODE_WIDE
98	else if ((PyUnicode_GET_SIZE(obj) == 2) &&
99	(0xD800 <= v[0] && v[0] <= 0xDBFF) &&
100	(0xDC00 <= v[1] && v[1] <= 0xDFFF))
101	return (((v[0] & 0x3FF)<<10) \| (v[1] & 0x3FF)) + 0x10000;
102	#endif
103	PyErr_SetString(PyExc_TypeError,
104	"need a single Unicode character as parameter");
105	return (Py_UCS4)-1;
106	}
107
108	/* --- Module API --------------------------------------------------------- */
109
110	PyDoc_STRVAR(unicodedata_decimal__doc__,
111	"decimal(unichr[, default])\n\
112	\n\
113	Returns the decimal value assigned to the Unicode character unichr\n\
114	as integer. If no such value is defined, default is returned, or, if\n\
115	not given, ValueError is raised.");
116
117	static PyObject *
118	unicodedata_decimal(PyObject self, PyObject args)
119	{
120	PyUnicodeObject *v;
121	PyObject *defobj = NULL;
122	int have_old = 0;
123	long rc;
124	Py_UCS4 c;
125
126	if (!PyArg_ParseTuple(args, "O!\|O:decimal", &PyUnicode_Type, &v, &defobj))
127	return NULL;
128	c = getuchar(v);
129	if (c == (Py_UCS4)-1)
130	return NULL;
131
132	if (self) {
133	const change_record *old = get_old_record(self, c);
134	if (old->category_changed == 0) {
135	/* unassigned */
136	have_old = 1;
137	rc = -1;
138	}
139	else if (old->decimal_changed != 0xFF) {
140	have_old = 1;
141	rc = old->decimal_changed;
142	}
143	}
144
145	if (!have_old)
146	rc = Py_UNICODE_TODECIMAL(c);
147	if (rc < 0) {
148	if (defobj == NULL) {
149	PyErr_SetString(PyExc_ValueError,
150	"not a decimal");
151	return NULL;
152	}
153	else {
154	Py_INCREF(defobj);
155	return defobj;
156	}
157	}
158	return PyInt_FromLong(rc);
159	}
160
161	PyDoc_STRVAR(unicodedata_digit__doc__,
162	"digit(unichr[, default])\n\
163	\n\
164	Returns the digit value assigned to the Unicode character unichr as\n\
165	integer. If no such value is defined, default is returned, or, if\n\
166	not given, ValueError is raised.");
167
168	static PyObject *
169	unicodedata_digit(PyObject self, PyObject args)
170	{
171	PyUnicodeObject *v;
172	PyObject *defobj = NULL;
173	long rc;
174	Py_UCS4 c;
175
176	if (!PyArg_ParseTuple(args, "O!\|O:digit", &PyUnicode_Type, &v, &defobj))
177	return NULL;
178	c = getuchar(v);
179	if (c == (Py_UCS4)-1)
180	return NULL;
181	rc = Py_UNICODE_TODIGIT(c);
182	if (rc < 0) {
183	if (defobj == NULL) {
184	PyErr_SetString(PyExc_ValueError, "not a digit");
185	return NULL;
186	}
187	else {
188	Py_INCREF(defobj);
189	return defobj;
190	}
191	}
192	return PyInt_FromLong(rc);
193	}
194
195	PyDoc_STRVAR(unicodedata_numeric__doc__,
196	"numeric(unichr[, default])\n\
197	\n\
198	Returns the numeric value assigned to the Unicode character unichr\n\
199	as float. If no such value is defined, default is returned, or, if\n\
200	not given, ValueError is raised.");
201
202	static PyObject *
203	unicodedata_numeric(PyObject self, PyObject args)
204	{
205	PyUnicodeObject *v;
206	PyObject *defobj = NULL;
207	int have_old = 0;
208	double rc;
209	Py_UCS4 c;
210
211	if (!PyArg_ParseTuple(args, "O!\|O:numeric", &PyUnicode_Type, &v, &defobj))
212	return NULL;
213	c = getuchar(v);
214	if (c == (Py_UCS4)-1)
215	return NULL;
216
217	if (self) {
218	const change_record *old = get_old_record(self, c);
219	if (old->category_changed == 0) {
220	/* unassigned */
221	have_old = 1;
222	rc = -1.0;
223	}
224	else if (old->decimal_changed != 0xFF) {
225	have_old = 1;
226	rc = old->decimal_changed;
227	}
228	}
229
230	if (!have_old)
231	rc = Py_UNICODE_TONUMERIC(c);
232	if (rc == -1.0) {
233	if (defobj == NULL) {
234	PyErr_SetString(PyExc_ValueError, "not a numeric character");
235	return NULL;
236	}
237	else {
238	Py_INCREF(defobj);
239	return defobj;
240	}
241	}
242	return PyFloat_FromDouble(rc);
243	}
244
245	PyDoc_STRVAR(unicodedata_category__doc__,
246	"category(unichr)\n\
247	\n\
248	Returns the general category assigned to the Unicode character\n\
249	unichr as string.");
250
251	static PyObject *
252	unicodedata_category(PyObject self, PyObject args)
253	{
254	PyUnicodeObject *v;
255	int index;
256	Py_UCS4 c;
257
258	if (!PyArg_ParseTuple(args, "O!:category",
259	&PyUnicode_Type, &v))
260	return NULL;
261	c = getuchar(v);
262	if (c == (Py_UCS4)-1)
263	return NULL;
264	index = (int) _getrecord_ex(c)->category;
265	if (self) {
266	const change_record *old = get_old_record(self, c);
267	if (old->category_changed != 0xFF)
268	index = old->category_changed;
269	}
270	return PyString_FromString(_PyUnicode_CategoryNames[index]);
271	}
272
273	PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274	"bidirectional(unichr)\n\
275	\n\
276	Returns the bidirectional category assigned to the Unicode character\n\
277	unichr as string. If no such value is defined, an empty string is\n\
278	returned.");
279
280	static PyObject *
281	unicodedata_bidirectional(PyObject self, PyObject args)
282	{
283	PyUnicodeObject *v;
284	int index;
285	Py_UCS4 c;
286
287	if (!PyArg_ParseTuple(args, "O!:bidirectional",
288	&PyUnicode_Type, &v))
289	return NULL;
290	c = getuchar(v);
291	if (c == (Py_UCS4)-1)
292	return NULL;
293	index = (int) _getrecord_ex(c)->bidirectional;
294	if (self) {
295	const change_record *old = get_old_record(self, c);
296	if (old->category_changed == 0)
297	index = 0; /* unassigned */
298	else if (old->bidir_changed != 0xFF)
299	index = old->bidir_changed;
300	}
301	return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
302	}
303
304	PyDoc_STRVAR(unicodedata_combining__doc__,
305	"combining(unichr)\n\
306	\n\
307	Returns the canonical combining class assigned to the Unicode\n\
308	character unichr as integer. Returns 0 if no combining class is\n\
309	defined.");
310
311	static PyObject *
312	unicodedata_combining(PyObject self, PyObject args)
313	{
314	PyUnicodeObject *v;
315	int index;
316	Py_UCS4 c;
317
318	if (!PyArg_ParseTuple(args, "O!:combining",
319	&PyUnicode_Type, &v))
320	return NULL;
321	c = getuchar(v);
322	if (c == (Py_UCS4)-1)
323	return NULL;
324	index = (int) _getrecord_ex(c)->combining;
325	if (self) {
326	const change_record *old = get_old_record(self, c);
327	if (old->category_changed == 0)
328	index = 0; /* unassigned */
329	}
330	return PyInt_FromLong(index);
331	}
332
333	PyDoc_STRVAR(unicodedata_mirrored__doc__,
334	"mirrored(unichr)\n\
335	\n\
336	Returns the mirrored property assigned to the Unicode character\n\
337	unichr as integer. Returns 1 if the character has been identified as\n\
338	a \"mirrored\" character in bidirectional text, 0 otherwise.");
339
340	static PyObject *
341	unicodedata_mirrored(PyObject self, PyObject args)
342	{
343	PyUnicodeObject *v;
344	int index;
345	Py_UCS4 c;
346
347	if (!PyArg_ParseTuple(args, "O!:mirrored",
348	&PyUnicode_Type, &v))
349	return NULL;
350	c = getuchar(v);
351	if (c == (Py_UCS4)-1)
352	return NULL;
353	index = (int) _getrecord_ex(c)->mirrored;
354	if (self) {
355	const change_record *old = get_old_record(self, c);
356	if (old->category_changed == 0)
357	index = 0; /* unassigned */
358	else if (old->mirrored_changed != 0xFF)
359	index = old->mirrored_changed;
360	}
361	return PyInt_FromLong(index);
362	}
363
364	PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365	"east_asian_width(unichr)\n\
366	\n\
367	Returns the east asian width assigned to the Unicode character\n\
368	unichr as string.");
369
370	static PyObject *
371	unicodedata_east_asian_width(PyObject self, PyObject args)
372	{
373	PyUnicodeObject *v;
374	int index;
375	Py_UCS4 c;
376
377	if (!PyArg_ParseTuple(args, "O!:east_asian_width",
378	&PyUnicode_Type, &v))
379	return NULL;
380	c = getuchar(v);
381	if (c == (Py_UCS4)-1)
382	return NULL;
383	index = (int) _getrecord_ex(c)->east_asian_width;
384	if (self) {
385	const change_record *old = get_old_record(self, c);
386	if (old->category_changed == 0)
387	index = 0; /* unassigned */
388	}
389	return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
390	}
391
392	PyDoc_STRVAR(unicodedata_decomposition__doc__,
393	"decomposition(unichr)\n\
394	\n\
395	Returns the character decomposition mapping assigned to the Unicode\n\
396	character unichr as string. An empty string is returned in case no\n\
397	such mapping is defined.");
398
399	static PyObject *
400	unicodedata_decomposition(PyObject self, PyObject args)
401	{
402	PyUnicodeObject *v;
403	char decomp[256];
404	int code, index, count, i;
405	unsigned int prefix_index;
406	Py_UCS4 c;
407
408	if (!PyArg_ParseTuple(args, "O!:decomposition",
409	&PyUnicode_Type, &v))
410	return NULL;
411	c = getuchar(v);
412	if (c == (Py_UCS4)-1)
413	return NULL;
414
415	code = (int)c;
416
417	if (self) {
418	const change_record *old = get_old_record(self, c);
419	if (old->category_changed == 0)
420	return PyString_FromString(""); /* unassigned */
421	}
422
423	if (code < 0 \|\| code >= 0x110000)
424	index = 0;
425	else {
426	index = decomp_index1[(code>>DECOMP_SHIFT)];
427	index = decomp_index2[(index<<DECOMP_SHIFT)+
428	(code&((1<<DECOMP_SHIFT)-1))];
429	}
430
431	/* high byte is number of hex bytes (usually one or two), low byte
432	is prefix code (from*/
433	count = decomp_data[index] >> 8;
434
435	/* XXX: could allocate the PyString up front instead
436	(strlen(prefix) + 5 * count + 1 bytes) */
437
438	/* Based on how index is calculated above and decomp_data is generated
439	from Tools/unicode/makeunicodedata.py, it should not be possible
440	to overflow decomp_prefix. */
441	prefix_index = decomp_data[index] & 255;
442	assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
443
444	/* copy prefix */
445	i = strlen(decomp_prefix[prefix_index]);
446	memcpy(decomp, decomp_prefix[prefix_index], i);
447
448	while (count-- > 0) {
449	if (i)
450	decomp[i++] = ' ';
451	assert((size_t)i < sizeof(decomp));
452	PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453	decomp_data[++index]);
454	i += strlen(decomp + i);
455	}
456
457	decomp[i] = '\0';
458
459	return PyString_FromString(decomp);
460	}
461
462	static void
463	get_decomp_record(PyObject self, Py_UCS4 code, int index, int prefix, int count)
464	{
465	if (code >= 0x110000) {
466	*index = 0;
467	} else if (self && get_old_record(self, code)->category_changed==0) {
468	/* unassigned in old version */
469	*index = 0;
470	}
471	else {
472	*index = decomp_index1[(code>>DECOMP_SHIFT)];
473	index = decomp_index2[(index<<DECOMP_SHIFT)+
474	(code&((1<<DECOMP_SHIFT)-1))];
475	}
476
477	/* high byte is number of hex bytes (usually one or two), low byte
478	is prefix code (from*/
479	count = decomp_data[index] >> 8;
480	prefix = decomp_data[index] & 255;
481
482	(*index)++;
483	}
484
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00          /* first precomposed syllable */
#define LBase   0x1100          /* first leading consonant */
#define VBase   0x1161          /* first vowel */
#define TBase   0x11A7          /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
494
495	static PyObject*
496	nfd_nfkd(PyObject self, PyObject input, int k)
497	{
498	PyObject *result;
499	Py_UNICODE i, end, *o;
500	/* Longest decomposition in Unicode 3.2: U+FDFA */
501	Py_UNICODE stack[20];
502	Py_ssize_t space, isize;
503	int index, prefix, count, stackptr;
504	unsigned char prev, cur;
505
506	stackptr = 0;
507	isize = PyUnicode_GET_SIZE(input);
508	/* Overallocate atmost 10 characters. */
509	space = (isize > 10 ? 10 : isize) + isize;
510	result = PyUnicode_FromUnicode(NULL, space);
511	if (!result)
512	return NULL;
513	i = PyUnicode_AS_UNICODE(input);
514	end = i + isize;
515	o = PyUnicode_AS_UNICODE(result);
516
517	while (i < end) {
518	stack[stackptr++] = *i++;
519	while(stackptr) {
520	Py_UNICODE code = stack[--stackptr];
521	/* Hangul Decomposition adds three characters in
522	a single step, so we need atleast that much room. */
523	if (space < 3) {
524	Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
525	space += 10;
526	if (PyUnicode_Resize(&result, newsize) == -1)
527	return NULL;
528	o = PyUnicode_AS_UNICODE(result) + newsize - space;
529	}
530	/* Hangul Decomposition. */
531	if (SBase <= code && code < (SBase+SCount)) {
532	int SIndex = code - SBase;
533	int L = LBase + SIndex / NCount;
534	int V = VBase + (SIndex % NCount) / TCount;
535	int T = TBase + SIndex % TCount;
536	*o++ = L;
537	*o++ = V;
538	space -= 2;
539	if (T != TBase) {
540	*o++ = T;
541	space --;
542	}
543	continue;
544	}
545	/* normalization changes */
546	if (self) {
547	Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
548	if (value != 0) {
549	stack[stackptr++] = value;
550	continue;
551	}
552	}
553
554	/* Other decompositions. */
555	get_decomp_record(self, code, &index, &prefix, &count);
556
557	/* Copy character if it is not decomposable, or has a
558	compatibility decomposition, but we do NFD. */
559	if (!count \|\| (prefix && !k)) {
560	*o++ = code;
561	space--;
562	continue;
563	}
564	/* Copy decomposition onto the stack, in reverse
565	order. */
566	while(count) {
567	code = decomp_data[index + (--count)];
568	stack[stackptr++] = code;
569	}
570	}
571	}
572
573	/* Drop overallocation. Cannot fail. */
574	PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
575
576	/* Sort canonically. */
577	i = PyUnicode_AS_UNICODE(result);
578	prev = _getrecord_ex(*i)->combining;
579	end = i + PyUnicode_GET_SIZE(result);
580	for (i++; i < end; i++) {
581	cur = _getrecord_ex(*i)->combining;
582	if (prev == 0 \|\| cur == 0 \|\| prev <= cur) {
583	prev = cur;
584	continue;
585	}
586	/* Non-canonical order. Need to switch i with previous. /
587	o = i - 1;
588	while (1) {
589	Py_UNICODE tmp = o[1];
590	o[1] = o[0];
591	o[0] = tmp;
592	o--;
593	if (o < PyUnicode_AS_UNICODE(result))
594	break;
595	prev = _getrecord_ex(*o)->combining;
596	if (prev == 0 \|\| prev <= cur)
597	break;
598	}
599	prev = _getrecord_ex(*i)->combining;
600	}
601	return result;
602	}
603
604	static int
605	find_nfc_index(PyObject self, struct reindex nfc, Py_UNICODE code)
606	{
607	int index;
608	for (index = 0; nfc[index].start; index++) {
609	int start = nfc[index].start;
610	if (code < start)
611	return -1;
612	if (code <= start + nfc[index].count) {
613	int delta = code - start;
614	return nfc[index].index + delta;
615	}
616	}
617	return -1;
618	}
619
620	static PyObject*
621	nfc_nfkc(PyObject self, PyObject input, int k)
622	{
623	PyObject *result;
624	Py_UNICODE i, i1, o, end;
625	int f,l,index,index1,comb;
626	Py_UNICODE code;
627	Py_UNICODE *skipped[20];
628	int cskipped = 0;
629
630	result = nfd_nfkd(self, input, k);
631	if (!result)
632	return NULL;
633
634	/* We are going to modify result in-place.
635	If nfd_nfkd is changed to sometimes return the input,
636	this code needs to be reviewed. */
637	assert(result != input);
638
639	i = PyUnicode_AS_UNICODE(result);
640	end = i + PyUnicode_GET_SIZE(result);
641	o = PyUnicode_AS_UNICODE(result);
642
643	again:
644	while (i < end) {
645	for (index = 0; index < cskipped; index++) {
646	if (skipped[index] == i) {
647	/* *i character is skipped.
648	Remove from list. */
649	skipped[index] = skipped[cskipped-1];
650	cskipped--;
651	i++;
652	goto again; /* continue while */
653	}
654	}
655	/* Hangul Composition. We don't need to check for <LV,T>
656	pairs, since we always have decomposed data. */
657	if (LBase <= i && i < (LBase+LCount) &&
658	i + 1 < end &&
659	VBase <= i[1] && i[1] <= (VBase+VCount)) {
660	int LIndex, VIndex;
661	LIndex = i[0] - LBase;
662	VIndex = i[1] - VBase;
663	code = SBase + (LIndexVCount+VIndex)TCount;
664	i+=2;
665	if (i < end &&
666	TBase <= i && i <= (TBase+TCount)) {
667	code += *i-TBase;
668	i++;
669	}
670	*o++ = code;
671	continue;
672	}
673
674	f = find_nfc_index(self, nfc_first, *i);
675	if (f == -1) {
676	o++ = i++;
677	continue;
678	}
679	/* Find next unblocked character. */
680	i1 = i+1;
681	comb = 0;
682	while (i1 < end) {
683	int comb1 = _getrecord_ex(*i1)->combining;
684	if (comb1 && comb == comb1) {
685	/* Character is blocked. */
686	i1++;
687	continue;
688	}
689	l = find_nfc_index(self, nfc_last, *i1);
690	/* i1 cannot be combined with i. If *i1
691	is a starter, we don't need to look further.
692	Otherwise, record the combining class. */
693	if (l == -1) {
694	not_combinable:
695	if (comb1 == 0)
696	break;
697	comb = comb1;
698	i1++;
699	continue;
700	}
701	index = f*TOTAL_LAST + l;
702	index1 = comp_index[index >> COMP_SHIFT];
703	code = comp_data[(index1<<COMP_SHIFT)+
704	(index&((1<<COMP_SHIFT)-1))];
705	if (code == 0)
706	goto not_combinable;
707
708	/* Replace the original character. */
709	*i = code;
710	/* Mark the second character unused. */
711	skipped[cskipped++] = i1;
712	i1++;
713	f = find_nfc_index(self, nfc_first, *i);
714	if (f == -1)
715	break;
716	}
717	o++ = i++;
718	}
719	if (o != end)
720	PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
721	return result;
722	}
723
724	PyDoc_STRVAR(unicodedata_normalize__doc__,
725	"normalize(form, unistr)\n\
726	\n\
727	Return the normal form 'form' for the Unicode string unistr. Valid\n\
728	values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
729
730	static PyObject*
731	unicodedata_normalize(PyObject self, PyObject args)
732	{
733	char *form;
734	PyObject *input;
735
736	if(!PyArg_ParseTuple(args, "sO!:normalize",
737	&form, &PyUnicode_Type, &input))
738	return NULL;
739
740	if (PyUnicode_GetSize(input) == 0) {
741	/* Special case empty input strings, since resizing
742	them later would cause internal errors. */
743	Py_INCREF(input);
744	return input;
745	}
746
747	if (strcmp(form, "NFC") == 0)
748	return nfc_nfkc(self, input, 0);
749	if (strcmp(form, "NFKC") == 0)
750	return nfc_nfkc(self, input, 1);
751	if (strcmp(form, "NFD") == 0)
752	return nfd_nfkd(self, input, 0);
753	if (strcmp(form, "NFKD") == 0)
754	return nfd_nfkd(self, input, 1);
755	PyErr_SetString(PyExc_ValueError, "invalid normalization form");
756	return NULL;
757	}
758
759	/* -------------------------------------------------------------------- */
760	/* unicode character name tables */
761
762	/* data file generated by Tools/unicode/makeunicodedata.py */
763	#include "unicodename_db.h"
764
765	/* -------------------------------------------------------------------- */
766	/* database code (cut and pasted from the unidb package) */
767
/* Hash the first `len` bytes of `s`, case-insensitively via toupper().
   `scale` is the multiplier applied before each byte is added.
   Whenever bits 24..31 of the running value become set, they are
   folded into the low byte and the value is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long overflow;

        /* the unsigned char cast yields the same value as
           Py_CHARMASK, keeping the toupper() argument non-negative */
        hash = (hash * scale) + (unsigned char) toupper((unsigned char) s[pos]);
        overflow = hash & 0xff000000;
        if (overflow != 0)
            hash = (hash ^ ((overflow>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
782
/* Name fragments used to build and parse Hangul syllable names.
   Column 0: leading consonants, column 1: vowels, column 2: trailing
   consonants.  A 0 entry means the column has no row at that index. */
static char *hangul_syllables[][3] = {
    { "G",   "A",    ""   },
    { "GG",  "AE",   "G"  },
    { "N",   "YA",   "GG" },
    { "D",   "YAE",  "GS" },
    { "DD",  "EO",   "N"  },
    { "R",   "E",    "NJ" },
    { "M",   "YEO",  "NH" },
    { "B",   "YE",   "D"  },
    { "BB",  "O",    "L"  },
    { "S",   "WA",   "LG" },
    { "SS",  "WAE",  "LM" },
    { "",    "OE",   "LB" },
    { "J",   "YO",   "LS" },
    { "JJ",  "U",    "LT" },
    { "C",   "WEO",  "LP" },
    { "K",   "WE",   "LH" },
    { "T",   "WI",   "M"  },
    { "P",   "YU",   "B"  },
    { "H",   "EU",   "BS" },
    { 0,     "YI",   "S"  },
    { 0,     "I",    "SS" },
    { 0,     0,      "NG" },
    { 0,     0,      "J"  },
    { 0,     0,      "C"  },
    { 0,     0,      "K"  },
    { 0,     0,      "T"  },
    { 0,     0,      "P"  },
    { 0,     0,      "H"  }
};
813
814	static int
815	is_unified_ideograph(Py_UCS4 code)
816	{
817	return (
818	(0x3400 <= code && code <= 0x4DB5) \|\| /* CJK Ideograph Extension A */
819	(0x4E00 <= code && code <= 0x9FBB) \|\| /* CJK Ideograph */
820	(0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
821	}
822
823	static int
824	_getucname(PyObject self, Py_UCS4 code, char buffer, int buflen)
825	{
826	int offset;
827	int i;
828	int word;
829	unsigned char* w;
830
831	if (code >= 0x110000)
832	return 0;
833
834	if (self) {
835	const change_record *old = get_old_record(self, code);
836	if (old->category_changed == 0) {
837	/* unassigned */
838	return 0;
839	}
840	}
841
842	if (SBase <= code && code < SBase+SCount) {
843	/* Hangul syllable. */
844	int SIndex = code - SBase;
845	int L = SIndex / NCount;
846	int V = (SIndex % NCount) / TCount;
847	int T = SIndex % TCount;
848
849	if (buflen < 27)
850	/* Worst case: HANGUL SYLLABLE <10chars>. */
851	return 0;
852	strcpy(buffer, "HANGUL SYLLABLE ");
853	buffer += 16;
854	strcpy(buffer, hangul_syllables[L][0]);
855	buffer += strlen(hangul_syllables[L][0]);
856	strcpy(buffer, hangul_syllables[V][1]);
857	buffer += strlen(hangul_syllables[V][1]);
858	strcpy(buffer, hangul_syllables[T][2]);
859	buffer += strlen(hangul_syllables[T][2]);
860	*buffer = '\0';
861	return 1;
862	}
863
864	if (is_unified_ideograph(code)) {
865	if (buflen < 28)
866	/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
867	return 0;
868	sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
869	return 1;
870	}
871
872	/* get offset into phrasebook */
873	offset = phrasebook_offset1[(code>>phrasebook_shift)];
874	offset = phrasebook_offset2[(offset<<phrasebook_shift) +
875	(code&((1<<phrasebook_shift)-1))];
876	if (!offset)
877	return 0;
878
879	i = 0;
880
881	for (;;) {
882	/* get word index */
883	word = phrasebook[offset] - phrasebook_short;
884	if (word >= 0) {
885	word = (word << 8) + phrasebook[offset+1];
886	offset += 2;
887	} else
888	word = phrasebook[offset++];
889	if (i) {
890	if (i > buflen)
891	return 0; /* buffer overflow */
892	buffer[i++] = ' ';
893	}
894	/* copy word string from lexicon. the last character in the
895	word has bit 7 set. the last word in a string ends with
896	0x80 */
897	w = lexicon + lexicon_offset[word];
898	while (*w < 128) {
899	if (i >= buflen)
900	return 0; /* buffer overflow */
901	buffer[i++] = *w++;
902	}
903	if (i >= buflen)
904	return 0; /* buffer overflow */
905	buffer[i++] = *w & 127;
906	if (*w == 128)
907	break; /* end of word */
908	}
909
910	return 1;
911	}
912
913	static int
914	_cmpname(PyObject self, int code, const char name, int namelen)
915	{
916	/* check if code corresponds to the given name */
917	int i;
918	char buffer[NAME_MAXLEN];
919	if (!_getucname(self, code, buffer, sizeof(buffer)))
920	return 0;
921	for (i = 0; i < namelen; i++) {
922	if (toupper(Py_CHARMASK(name[i])) != buffer[i])
923	return 0;
924	}
925	return buffer[namelen] == '\0';
926	}
927
928	static void
929	find_syllable(const char str, int len, int *pos, int count, int column)
930	{
931	int i, len1;
932	*len = -1;
933	for (i = 0; i < count; i++) {
934	char *s = hangul_syllables[i][column];
935	len1 = strlen(s);
936	if (len1 <= *len)
937	continue;
938	if (strncmp(str, s, len1) == 0) {
939	*len = len1;
940	*pos = i;
941	}
942	}
943	if (*len == -1) {
944	*len = 0;
945	}
946	}
947
948	static int
949	_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
950	{
951	unsigned int h, v;
952	unsigned int mask = code_size-1;
953	unsigned int i, incr;
954
955	/* Check for hangul syllables. */
956	if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
957	int len, L = -1, V = -1, T = -1;
958	const char *pos = name + 16;
959	find_syllable(pos, &len, &L, LCount, 0);
960	pos += len;
961	find_syllable(pos, &len, &V, VCount, 1);
962	pos += len;
963	find_syllable(pos, &len, &T, TCount, 2);
964	pos += len;
965	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
966	code = SBase + (LVCount+V)*TCount + T;
967	return 1;
968	}
969	/* Otherwise, it's an illegal syllable name. */
970	return 0;
971	}
972
973	/* Check for unified ideographs. */
974	if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
975	/* Four or five hexdigits must follow. */
976	v = 0;
977	name += 22;
978	namelen -= 22;
979	if (namelen != 4 && namelen != 5)
980	return 0;
981	while (namelen--) {
982	v *= 16;
983	if (name >= '0' && name <= '9')
984	v += *name - '0';
985	else if (name >= 'A' && name <= 'F')
986	v += *name - 'A' + 10;
987	else
988	return 0;
989	name++;
990	}
991	if (!is_unified_ideograph(v))
992	return 0;
993	*code = v;
994	return 1;
995	}
996
997	/* the following is the same as python's dictionary lookup, with
998	only minor changes. see the makeunicodedata script for more
999	details */
1000
1001	h = (unsigned int) _gethash(name, namelen, code_magic);
1002	i = (~h) & mask;
1003	v = code_hash[i];
1004	if (!v)
1005	return 0;
1006	if (_cmpname(self, v, name, namelen)) {
1007	*code = v;
1008	return 1;
1009	}
1010	incr = (h ^ (h >> 3)) & mask;
1011	if (!incr)
1012	incr = mask;
1013	for (;;) {
1014	i = (i + incr) & mask;
1015	v = code_hash[i];
1016	if (!v)
1017	return 0;
1018	if (_cmpname(self, v, name, namelen)) {
1019	*code = v;
1020	return 1;
1021	}
1022	incr = incr << 1;
1023	if (incr > mask)
1024	incr = incr ^ code_poly;
1025	}
1026	}
1027
1028	static const _PyUnicode_Name_CAPI hashAPI =
1029	{
1030	sizeof(_PyUnicode_Name_CAPI),
1031	_getucname,
1032	_getcode
1033	};
1034
1035	/* -------------------------------------------------------------------- */
1036	/* Python bindings */
1037
1038	PyDoc_STRVAR(unicodedata_name__doc__,
1039	"name(unichr[, default])\n\
1040	Returns the name assigned to the Unicode character unichr as a\n\
1041	string. If no name is defined, default is returned, or, if not\n\
1042	given, ValueError is raised.");
1043
1044	static PyObject *
1045	unicodedata_name(PyObject* self, PyObject* args)
1046	{
1047	char name[NAME_MAXLEN];
1048	Py_UCS4 c;
1049
1050	PyUnicodeObject* v;
1051	PyObject* defobj = NULL;
1052	if (!PyArg_ParseTuple(args, "O!\|O:name", &PyUnicode_Type, &v, &defobj))
1053	return NULL;
1054
1055	c = getuchar(v);
1056	if (c == (Py_UCS4)-1)
1057	return NULL;
1058
1059	if (!_getucname(self, c, name, sizeof(name))) {
1060	if (defobj == NULL) {
1061	PyErr_SetString(PyExc_ValueError, "no such name");
1062	return NULL;
1063	}
1064	else {
1065	Py_INCREF(defobj);
1066	return defobj;
1067	}
1068	}
1069
1070	return Py_BuildValue("s", name);
1071	}
1072
1073	PyDoc_STRVAR(unicodedata_lookup__doc__,
1074	"lookup(name)\n\
1075	\n\
1076	Look up character by name. If a character with the\n\
1077	given name is found, return the corresponding Unicode\n\
1078	character. If not found, KeyError is raised.");
1079
1080	static PyObject *
1081	unicodedata_lookup(PyObject* self, PyObject* args)
1082	{
1083	Py_UCS4 code;
1084	Py_UNICODE str[2];
1085
1086	char* name;
1087	int namelen;
1088	if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1089	return NULL;
1090
1091	if (!_getcode(self, name, namelen, &code)) {
1092	PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1093	name);
1094	return NULL;
1095	}
1096
1097	#ifndef Py_UNICODE_WIDE
1098	if (code >= 0x10000) {
1099	str[0] = 0xd800 + ((code - 0x10000) >> 10);
1100	str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1101	return PyUnicode_FromUnicode(str, 2);
1102	}
1103	#endif
1104	str[0] = (Py_UNICODE) code;
1105	return PyUnicode_FromUnicode(str, 1);
1106	}
1107
1108	/* XXX Add doc strings. */
1109
1110	static PyMethodDef unicodedata_functions[] = {
1111	{"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1112	{"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1113	{"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1114	{"category", unicodedata_category, METH_VARARGS,
1115	unicodedata_category__doc__},
1116	{"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1117	unicodedata_bidirectional__doc__},
1118	{"combining", unicodedata_combining, METH_VARARGS,
1119	unicodedata_combining__doc__},
1120	{"mirrored", unicodedata_mirrored, METH_VARARGS,
1121	unicodedata_mirrored__doc__},
1122	{"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1123	unicodedata_east_asian_width__doc__},
1124	{"decomposition", unicodedata_decomposition, METH_VARARGS,
1125	unicodedata_decomposition__doc__},
1126	{"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1127	{"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1128	{"normalize", unicodedata_normalize, METH_VARARGS,
1129	unicodedata_normalize__doc__},
1130	{NULL, NULL} /* sentinel */
1131	};
1132
1133	static PyTypeObject UCD_Type = {
1134	/* The ob_type field must be initialized in the module init function
1135	* to be portable to Windows without using C++. */
1136	PyVarObject_HEAD_INIT(NULL, 0)
1137	"unicodedata.UCD", /tp_name/
1138	sizeof(PreviousDBVersion), /tp_basicsize/
1139	0, /tp_itemsize/
1140	/* methods */
1141	(destructor)PyObject_Del, /tp_dealloc/
1142	0, /tp_print/
1143	0, /tp_getattr/
1144	0, /tp_setattr/
1145	0, /tp_compare/
1146	0, /tp_repr/
1147	0, /tp_as_number/
1148	0, /tp_as_sequence/
1149	0, /tp_as_mapping/
1150	0, /tp_hash/
1151	0, /tp_call/
1152	0, /tp_str/
1153	PyObject_GenericGetAttr,/tp_getattro/
1154	0, /tp_setattro/
1155	0, /tp_as_buffer/
1156	Py_TPFLAGS_DEFAULT, /tp_flags/
1157	0, /tp_doc/
1158	0, /tp_traverse/
1159	0, /tp_clear/
1160	0, /tp_richcompare/
1161	0, /tp_weaklistoffset/
1162	0, /tp_iter/
1163	0, /tp_iternext/
1164	unicodedata_functions, /tp_methods/
1165	DB_members, /tp_members/
1166	0, /tp_getset/
1167	0, /tp_base/
1168	0, /tp_dict/
1169	0, /tp_descr_get/
1170	0, /tp_descr_set/
1171	0, /tp_dictoffset/
1172	0, /tp_init/
1173	0, /tp_alloc/
1174	0, /tp_new/
1175	0, /tp_free/
1176	0, /tp_is_gc/
1177	};
1178
1179	PyDoc_STRVAR(unicodedata_docstring,
1180	"This module provides access to the Unicode Character Database which\n\
1181	defines character properties for all Unicode characters. The data in\n\
1182	this database is based on the UnicodeData.txt file version\n\
1183	5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1184	\n\
1185	The module uses the same names and symbols as defined by the\n\
1186	UnicodeData File Format 5.1.0 (see\n\
1187	http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1188
1189	PyMODINIT_FUNC
1190	initunicodedata(void)
1191	{
1192	PyObject m, v;
1193
1194	Py_TYPE(&UCD_Type) = &PyType_Type;
1195
1196	m = Py_InitModule3(
1197	"unicodedata", unicodedata_functions, unicodedata_docstring);
1198	if (!m)
1199	return;
1200
1201	PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1202	Py_INCREF(&UCD_Type);
1203	PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1204
1205	/* Previous versions */
1206	v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1207	if (v != NULL)
1208	PyModule_AddObject(m, "ucd_3_2_0", v);
1209
1210	/* Export C API */
1211	v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1212	if (v != NULL)
1213	PyModule_AddObject(m, "ucnhash_CAPI", v);
1214	}
1215
1216	/*
1217	Local variables:
1218	c-basic-offset: 4
1219	indent-tabs-mode: nil
1220	End:
1221	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Modules/unicodedata.c@ 281

Download in other formats: