Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

unicodedata.c

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 37.2 KB

Rev	Line
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.2 data base.

   Data was extracted from the Unicode 5.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
	14
	15	#include "Python.h"
	16	#include "ucnhash.h"
	17	#include "structmember.h"
	18
	19	/* character properties */
	20
	21	typedef struct {
[391]	22	const unsigned char category; /* index into
	23	_PyUnicode_CategoryNames */
	24	const unsigned char combining; /* combining class value 0 - 255 */
	25	const unsigned char bidirectional; /* index into
	26	_PyUnicode_BidirectionalNames */
	27	const unsigned char mirrored; /* true if mirrored in bidir mode */
	28	const unsigned char east_asian_width; /* index into
	29	_PyUnicode_EastAsianWidth */
	30	const unsigned char normalization_quick_check; /* see is_normalized() */
[2]	31	} _PyUnicode_DatabaseRecord;
	32
	33	typedef struct change_record {
	34	/* sequence of fields should be the same as in merge_old_version */
	35	const unsigned char bidir_changed;
	36	const unsigned char category_changed;
	37	const unsigned char decimal_changed;
	38	const unsigned char mirrored_changed;
[391]	39	const double numeric_changed;
[2]	40	} change_record;
	41
	42	/* data file generated by Tools/unicode/makeunicodedata.py */
	43	#include "unicodedata_db.h"
	44
	45	static const _PyUnicode_DatabaseRecord*
	46	_getrecord_ex(Py_UCS4 code)
	47	{
	48	int index;
	49	if (code >= 0x110000)
	50	index = 0;
	51	else {
	52	index = index1[(code>>SHIFT)];
	53	index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
	54	}
	55
	56	return &_PyUnicode_Database_Records[index];
	57	}
	58
	59	/* ------------- Previous-version API ------------------------------------- */
	60	typedef struct previous_version {
	61	PyObject_HEAD
	62	const char *name;
	63	const change_record* (*getrecord)(Py_UCS4);
	64	Py_UCS4 (*normalization)(Py_UCS4);
	65	} PreviousDBVersion;
	66
	67	#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
	68
	69	static PyMemberDef DB_members[] = {
[391]	70	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
[2]	71	{NULL}
	72	};
	73
	74	/* forward declaration */
	75	static PyTypeObject UCD_Type;
	76
	77	static PyObject*
	78	new_previous_version(const charname, const change_record (*getrecord)(Py_UCS4),
	79	Py_UCS4 (*normalization)(Py_UCS4))
	80	{
[391]	81	PreviousDBVersion *self;
	82	self = PyObject_New(PreviousDBVersion, &UCD_Type);
	83	if (self == NULL)
	84	return NULL;
	85	self->name = name;
	86	self->getrecord = getrecord;
[2]	87	self->normalization = normalization;
[391]	88	return (PyObject*)self;
[2]	89	}
	90
	91
	92	static Py_UCS4 getuchar(PyUnicodeObject *obj)
	93	{
	94	Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
	95
	96	if (PyUnicode_GET_SIZE(obj) == 1)
[391]	97	return *v;
[2]	98	#ifndef Py_UNICODE_WIDE
	99	else if ((PyUnicode_GET_SIZE(obj) == 2) &&
	100	(0xD800 <= v[0] && v[0] <= 0xDBFF) &&
	101	(0xDC00 <= v[1] && v[1] <= 0xDFFF))
[391]	102	return (((v[0] & 0x3FF)<<10) \| (v[1] & 0x3FF)) + 0x10000;
[2]	103	#endif
	104	PyErr_SetString(PyExc_TypeError,
	105	"need a single Unicode character as parameter");
	106	return (Py_UCS4)-1;
	107	}
	108
	109	/* --- Module API --------------------------------------------------------- */
	110
	111	PyDoc_STRVAR(unicodedata_decimal__doc__,
	112	"decimal(unichr[, default])\n\
	113	\n\
	114	Returns the decimal value assigned to the Unicode character unichr\n\
	115	as integer. If no such value is defined, default is returned, or, if\n\
	116	not given, ValueError is raised.");
	117
	118	static PyObject *
	119	unicodedata_decimal(PyObject self, PyObject args)
	120	{
	121	PyUnicodeObject *v;
	122	PyObject *defobj = NULL;
	123	int have_old = 0;
	124	long rc;
	125	Py_UCS4 c;
	126
	127	if (!PyArg_ParseTuple(args, "O!\|O:decimal", &PyUnicode_Type, &v, &defobj))
	128	return NULL;
	129	c = getuchar(v);
	130	if (c == (Py_UCS4)-1)
	131	return NULL;
	132
	133	if (self) {
	134	const change_record *old = get_old_record(self, c);
	135	if (old->category_changed == 0) {
	136	/* unassigned */
	137	have_old = 1;
	138	rc = -1;
[391]	139	}
[2]	140	else if (old->decimal_changed != 0xFF) {
	141	have_old = 1;
	142	rc = old->decimal_changed;
	143	}
	144	}
	145
	146	if (!have_old)
	147	rc = Py_UNICODE_TODECIMAL(c);
	148	if (rc < 0) {
[391]	149	if (defobj == NULL) {
	150	PyErr_SetString(PyExc_ValueError,
	151	"not a decimal");
[2]	152	return NULL;
[391]	153	}
	154	else {
	155	Py_INCREF(defobj);
	156	return defobj;
	157	}
[2]	158	}
	159	return PyInt_FromLong(rc);
	160	}
	161
	162	PyDoc_STRVAR(unicodedata_digit__doc__,
	163	"digit(unichr[, default])\n\
	164	\n\
	165	Returns the digit value assigned to the Unicode character unichr as\n\
	166	integer. If no such value is defined, default is returned, or, if\n\
	167	not given, ValueError is raised.");
	168
	169	static PyObject *
	170	unicodedata_digit(PyObject self, PyObject args)
	171	{
	172	PyUnicodeObject *v;
	173	PyObject *defobj = NULL;
	174	long rc;
	175	Py_UCS4 c;
	176
	177	if (!PyArg_ParseTuple(args, "O!\|O:digit", &PyUnicode_Type, &v, &defobj))
	178	return NULL;
	179	c = getuchar(v);
	180	if (c == (Py_UCS4)-1)
	181	return NULL;
	182	rc = Py_UNICODE_TODIGIT(c);
	183	if (rc < 0) {
[391]	184	if (defobj == NULL) {
	185	PyErr_SetString(PyExc_ValueError, "not a digit");
[2]	186	return NULL;
[391]	187	}
	188	else {
	189	Py_INCREF(defobj);
	190	return defobj;
	191	}
[2]	192	}
	193	return PyInt_FromLong(rc);
	194	}
	195
	196	PyDoc_STRVAR(unicodedata_numeric__doc__,
	197	"numeric(unichr[, default])\n\
	198	\n\
	199	Returns the numeric value assigned to the Unicode character unichr\n\
	200	as float. If no such value is defined, default is returned, or, if\n\
	201	not given, ValueError is raised.");
	202
	203	static PyObject *
	204	unicodedata_numeric(PyObject self, PyObject args)
	205	{
	206	PyUnicodeObject *v;
	207	PyObject *defobj = NULL;
	208	int have_old = 0;
	209	double rc;
	210	Py_UCS4 c;
	211
	212	if (!PyArg_ParseTuple(args, "O!\|O:numeric", &PyUnicode_Type, &v, &defobj))
	213	return NULL;
	214	c = getuchar(v);
	215	if (c == (Py_UCS4)-1)
	216	return NULL;
	217
	218	if (self) {
	219	const change_record *old = get_old_record(self, c);
	220	if (old->category_changed == 0) {
	221	/* unassigned */
	222	have_old = 1;
	223	rc = -1.0;
[391]	224	}
[2]	225	else if (old->decimal_changed != 0xFF) {
	226	have_old = 1;
	227	rc = old->decimal_changed;
	228	}
	229	}
	230
	231	if (!have_old)
	232	rc = Py_UNICODE_TONUMERIC(c);
	233	if (rc == -1.0) {
[391]	234	if (defobj == NULL) {
	235	PyErr_SetString(PyExc_ValueError, "not a numeric character");
	236	return NULL;
	237	}
	238	else {
	239	Py_INCREF(defobj);
	240	return defobj;
	241	}
[2]	242	}
	243	return PyFloat_FromDouble(rc);
	244	}
	245
	246	PyDoc_STRVAR(unicodedata_category__doc__,
	247	"category(unichr)\n\
	248	\n\
	249	Returns the general category assigned to the Unicode character\n\
	250	unichr as string.");
	251
	252	static PyObject *
	253	unicodedata_category(PyObject self, PyObject args)
	254	{
	255	PyUnicodeObject *v;
	256	int index;
	257	Py_UCS4 c;
	258
	259	if (!PyArg_ParseTuple(args, "O!:category",
[391]	260	&PyUnicode_Type, &v))
	261	return NULL;
[2]	262	c = getuchar(v);
	263	if (c == (Py_UCS4)-1)
	264	return NULL;
	265	index = (int) _getrecord_ex(c)->category;
	266	if (self) {
	267	const change_record *old = get_old_record(self, c);
	268	if (old->category_changed != 0xFF)
	269	index = old->category_changed;
	270	}
	271	return PyString_FromString(_PyUnicode_CategoryNames[index]);
	272	}
	273
	274	PyDoc_STRVAR(unicodedata_bidirectional__doc__,
	275	"bidirectional(unichr)\n\
	276	\n\
[391]	277	Returns the bidirectional class assigned to the Unicode character\n\
[2]	278	unichr as string. If no such value is defined, an empty string is\n\
	279	returned.");
	280
	281	static PyObject *
	282	unicodedata_bidirectional(PyObject self, PyObject args)
	283	{
	284	PyUnicodeObject *v;
	285	int index;
	286	Py_UCS4 c;
	287
	288	if (!PyArg_ParseTuple(args, "O!:bidirectional",
[391]	289	&PyUnicode_Type, &v))
	290	return NULL;
[2]	291	c = getuchar(v);
	292	if (c == (Py_UCS4)-1)
	293	return NULL;
	294	index = (int) _getrecord_ex(c)->bidirectional;
	295	if (self) {
	296	const change_record *old = get_old_record(self, c);
	297	if (old->category_changed == 0)
	298	index = 0; /* unassigned */
	299	else if (old->bidir_changed != 0xFF)
	300	index = old->bidir_changed;
	301	}
	302	return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
	303	}
	304
	305	PyDoc_STRVAR(unicodedata_combining__doc__,
	306	"combining(unichr)\n\
	307	\n\
	308	Returns the canonical combining class assigned to the Unicode\n\
	309	character unichr as integer. Returns 0 if no combining class is\n\
	310	defined.");
	311
	312	static PyObject *
	313	unicodedata_combining(PyObject self, PyObject args)
	314	{
	315	PyUnicodeObject *v;
	316	int index;
	317	Py_UCS4 c;
	318
	319	if (!PyArg_ParseTuple(args, "O!:combining",
[391]	320	&PyUnicode_Type, &v))
	321	return NULL;
[2]	322	c = getuchar(v);
	323	if (c == (Py_UCS4)-1)
	324	return NULL;
	325	index = (int) _getrecord_ex(c)->combining;
	326	if (self) {
	327	const change_record *old = get_old_record(self, c);
	328	if (old->category_changed == 0)
	329	index = 0; /* unassigned */
	330	}
	331	return PyInt_FromLong(index);
	332	}
	333
	334	PyDoc_STRVAR(unicodedata_mirrored__doc__,
	335	"mirrored(unichr)\n\
	336	\n\
	337	Returns the mirrored property assigned to the Unicode character\n\
	338	unichr as integer. Returns 1 if the character has been identified as\n\
	339	a \"mirrored\" character in bidirectional text, 0 otherwise.");
	340
	341	static PyObject *
	342	unicodedata_mirrored(PyObject self, PyObject args)
	343	{
	344	PyUnicodeObject *v;
	345	int index;
	346	Py_UCS4 c;
	347
	348	if (!PyArg_ParseTuple(args, "O!:mirrored",
[391]	349	&PyUnicode_Type, &v))
	350	return NULL;
[2]	351	c = getuchar(v);
	352	if (c == (Py_UCS4)-1)
	353	return NULL;
	354	index = (int) _getrecord_ex(c)->mirrored;
	355	if (self) {
	356	const change_record *old = get_old_record(self, c);
	357	if (old->category_changed == 0)
	358	index = 0; /* unassigned */
	359	else if (old->mirrored_changed != 0xFF)
	360	index = old->mirrored_changed;
	361	}
	362	return PyInt_FromLong(index);
	363	}
	364
	365	PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
	366	"east_asian_width(unichr)\n\
	367	\n\
	368	Returns the east asian width assigned to the Unicode character\n\
	369	unichr as string.");
	370
	371	static PyObject *
	372	unicodedata_east_asian_width(PyObject self, PyObject args)
	373	{
	374	PyUnicodeObject *v;
	375	int index;
	376	Py_UCS4 c;
	377
	378	if (!PyArg_ParseTuple(args, "O!:east_asian_width",
[391]	379	&PyUnicode_Type, &v))
	380	return NULL;
[2]	381	c = getuchar(v);
	382	if (c == (Py_UCS4)-1)
	383	return NULL;
	384	index = (int) _getrecord_ex(c)->east_asian_width;
	385	if (self) {
	386	const change_record *old = get_old_record(self, c);
	387	if (old->category_changed == 0)
	388	index = 0; /* unassigned */
	389	}
	390	return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
	391	}
	392
	393	PyDoc_STRVAR(unicodedata_decomposition__doc__,
	394	"decomposition(unichr)\n\
	395	\n\
	396	Returns the character decomposition mapping assigned to the Unicode\n\
	397	character unichr as string. An empty string is returned in case no\n\
	398	such mapping is defined.");
	399
	400	static PyObject *
	401	unicodedata_decomposition(PyObject self, PyObject args)
	402	{
	403	PyUnicodeObject *v;
	404	char decomp[256];
	405	int code, index, count, i;
	406	unsigned int prefix_index;
	407	Py_UCS4 c;
	408
	409	if (!PyArg_ParseTuple(args, "O!:decomposition",
[391]	410	&PyUnicode_Type, &v))
	411	return NULL;
[2]	412	c = getuchar(v);
	413	if (c == (Py_UCS4)-1)
	414	return NULL;
	415
	416	code = (int)c;
	417
	418	if (self) {
	419	const change_record *old = get_old_record(self, c);
	420	if (old->category_changed == 0)
	421	return PyString_FromString(""); /* unassigned */
	422	}
	423
	424	if (code < 0 \|\| code >= 0x110000)
	425	index = 0;
	426	else {
	427	index = decomp_index1[(code>>DECOMP_SHIFT)];
	428	index = decomp_index2[(index<<DECOMP_SHIFT)+
	429	(code&((1<<DECOMP_SHIFT)-1))];
	430	}
	431
	432	/* high byte is number of hex bytes (usually one or two), low byte
	433	is prefix code (from*/
	434	count = decomp_data[index] >> 8;
	435
	436	/* XXX: could allocate the PyString up front instead
	437	(strlen(prefix) + 5 * count + 1 bytes) */
	438
	439	/* Based on how index is calculated above and decomp_data is generated
	440	from Tools/unicode/makeunicodedata.py, it should not be possible
	441	to overflow decomp_prefix. */
	442	prefix_index = decomp_data[index] & 255;
	443	assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
	444
	445	/* copy prefix */
	446	i = strlen(decomp_prefix[prefix_index]);
	447	memcpy(decomp, decomp_prefix[prefix_index], i);
	448
	449	while (count-- > 0) {
	450	if (i)
	451	decomp[i++] = ' ';
	452	assert((size_t)i < sizeof(decomp));
	453	PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
	454	decomp_data[++index]);
	455	i += strlen(decomp + i);
	456	}
[391]	457
[2]	458	decomp[i] = '\0';
	459
	460	return PyString_FromString(decomp);
	461	}
	462
	463	static void
	464	get_decomp_record(PyObject self, Py_UCS4 code, int index, int prefix, int count)
	465	{
	466	if (code >= 0x110000) {
	467	*index = 0;
	468	} else if (self && get_old_record(self, code)->category_changed==0) {
	469	/* unassigned in old version */
	470	*index = 0;
	471	}
	472	else {
	473	*index = decomp_index1[(code>>DECOMP_SHIFT)];
	474	index = decomp_index2[(index<<DECOMP_SHIFT)+
	475	(code&((1<<DECOMP_SHIFT)-1))];
	476	}
[391]	477
[2]	478	/* high byte is number of hex bytes (usually one or two), low byte
	479	is prefix code (from*/
	480	count = decomp_data[index] >> 8;
	481	prefix = decomp_data[index] & 255;
	482
	483	(*index)++;
	484	}
	485
	486	#define SBase 0xAC00
	487	#define LBase 0x1100
	488	#define VBase 0x1161
	489	#define TBase 0x11A7
	490	#define LCount 19
	491	#define VCount 21
	492	#define TCount 28
	493	#define NCount (VCount*TCount)
	494	#define SCount (LCount*NCount)
	495
	496	static PyObject*
	497	nfd_nfkd(PyObject self, PyObject input, int k)
	498	{
	499	PyObject *result;
	500	Py_UNICODE i, end, *o;
	501	/* Longest decomposition in Unicode 3.2: U+FDFA */
[391]	502	Py_UNICODE stack[20];
[2]	503	Py_ssize_t space, isize;
	504	int index, prefix, count, stackptr;
	505	unsigned char prev, cur;
[391]	506
[2]	507	stackptr = 0;
	508	isize = PyUnicode_GET_SIZE(input);
[391]	509	/* Overallocate at most 10 characters. */
[2]	510	space = (isize > 10 ? 10 : isize) + isize;
	511	result = PyUnicode_FromUnicode(NULL, space);
	512	if (!result)
	513	return NULL;
	514	i = PyUnicode_AS_UNICODE(input);
	515	end = i + isize;
	516	o = PyUnicode_AS_UNICODE(result);
	517
	518	while (i < end) {
	519	stack[stackptr++] = *i++;
	520	while(stackptr) {
	521	Py_UNICODE code = stack[--stackptr];
	522	/* Hangul Decomposition adds three characters in
[391]	523	a single step, so we need at least that much room. */
[2]	524	if (space < 3) {
	525	Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
	526	space += 10;
	527	if (PyUnicode_Resize(&result, newsize) == -1)
	528	return NULL;
	529	o = PyUnicode_AS_UNICODE(result) + newsize - space;
	530	}
	531	/* Hangul Decomposition. */
	532	if (SBase <= code && code < (SBase+SCount)) {
	533	int SIndex = code - SBase;
	534	int L = LBase + SIndex / NCount;
	535	int V = VBase + (SIndex % NCount) / TCount;
	536	int T = TBase + SIndex % TCount;
	537	*o++ = L;
	538	*o++ = V;
	539	space -= 2;
	540	if (T != TBase) {
	541	*o++ = T;
	542	space --;
	543	}
	544	continue;
	545	}
	546	/* normalization changes */
	547	if (self) {
	548	Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
	549	if (value != 0) {
	550	stack[stackptr++] = value;
	551	continue;
	552	}
	553	}
	554
	555	/* Other decompositions. */
	556	get_decomp_record(self, code, &index, &prefix, &count);
	557
	558	/* Copy character if it is not decomposable, or has a
	559	compatibility decomposition, but we do NFD. */
	560	if (!count \|\| (prefix && !k)) {
	561	*o++ = code;
	562	space--;
	563	continue;
	564	}
	565	/* Copy decomposition onto the stack, in reverse
	566	order. */
	567	while(count) {
	568	code = decomp_data[index + (--count)];
	569	stack[stackptr++] = code;
	570	}
	571	}
	572	}
	573
	574	/* Drop overallocation. Cannot fail. */
	575	PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
	576
	577	/* Sort canonically. */
	578	i = PyUnicode_AS_UNICODE(result);
	579	prev = _getrecord_ex(*i)->combining;
	580	end = i + PyUnicode_GET_SIZE(result);
	581	for (i++; i < end; i++) {
	582	cur = _getrecord_ex(*i)->combining;
	583	if (prev == 0 \|\| cur == 0 \|\| prev <= cur) {
	584	prev = cur;
	585	continue;
	586	}
	587	/* Non-canonical order. Need to switch i with previous. /
	588	o = i - 1;
	589	while (1) {
	590	Py_UNICODE tmp = o[1];
	591	o[1] = o[0];
	592	o[0] = tmp;
	593	o--;
	594	if (o < PyUnicode_AS_UNICODE(result))
	595	break;
	596	prev = _getrecord_ex(*o)->combining;
	597	if (prev == 0 \|\| prev <= cur)
	598	break;
	599	}
	600	prev = _getrecord_ex(*i)->combining;
	601	}
	602	return result;
	603	}
	604
	605	static int
	606	find_nfc_index(PyObject self, struct reindex nfc, Py_UNICODE code)
	607	{
	608	int index;
	609	for (index = 0; nfc[index].start; index++) {
	610	int start = nfc[index].start;
	611	if (code < start)
	612	return -1;
	613	if (code <= start + nfc[index].count) {
	614	int delta = code - start;
	615	return nfc[index].index + delta;
	616	}
	617	}
	618	return -1;
	619	}
	620
	621	static PyObject*
	622	nfc_nfkc(PyObject self, PyObject input, int k)
	623	{
	624	PyObject *result;
	625	Py_UNICODE i, i1, o, end;
	626	int f,l,index,index1,comb;
	627	Py_UNICODE code;
	628	Py_UNICODE *skipped[20];
	629	int cskipped = 0;
	630
	631	result = nfd_nfkd(self, input, k);
	632	if (!result)
	633	return NULL;
	634
	635	/* We are going to modify result in-place.
	636	If nfd_nfkd is changed to sometimes return the input,
	637	this code needs to be reviewed. */
	638	assert(result != input);
	639
	640	i = PyUnicode_AS_UNICODE(result);
	641	end = i + PyUnicode_GET_SIZE(result);
	642	o = PyUnicode_AS_UNICODE(result);
[391]	643
[2]	644	again:
	645	while (i < end) {
	646	for (index = 0; index < cskipped; index++) {
	647	if (skipped[index] == i) {
[391]	648	/* *i character is skipped.
[2]	649	Remove from list. */
	650	skipped[index] = skipped[cskipped-1];
	651	cskipped--;
	652	i++;
	653	goto again; /* continue while */
	654	}
	655	}
	656	/* Hangul Composition. We don't need to check for <LV,T>
	657	pairs, since we always have decomposed data. */
	658	if (LBase <= i && i < (LBase+LCount) &&
[391]	659	i + 1 < end &&
[2]	660	VBase <= i[1] && i[1] <= (VBase+VCount)) {
	661	int LIndex, VIndex;
	662	LIndex = i[0] - LBase;
	663	VIndex = i[1] - VBase;
	664	code = SBase + (LIndexVCount+VIndex)TCount;
	665	i+=2;
	666	if (i < end &&
	667	TBase <= i && i <= (TBase+TCount)) {
	668	code += *i-TBase;
	669	i++;
	670	}
	671	*o++ = code;
	672	continue;
	673	}
	674
	675	f = find_nfc_index(self, nfc_first, *i);
	676	if (f == -1) {
	677	o++ = i++;
	678	continue;
	679	}
	680	/* Find next unblocked character. */
	681	i1 = i+1;
	682	comb = 0;
	683	while (i1 < end) {
	684	int comb1 = _getrecord_ex(*i1)->combining;
[391]	685	if (comb) {
	686	if (comb1 == 0)
	687	break;
	688	if (comb >= comb1) {
	689	/* Character is blocked. */
	690	i1++;
	691	continue;
	692	}
[2]	693	}
	694	l = find_nfc_index(self, nfc_last, *i1);
	695	/* i1 cannot be combined with i. If *i1
	696	is a starter, we don't need to look further.
	697	Otherwise, record the combining class. */
	698	if (l == -1) {
	699	not_combinable:
	700	if (comb1 == 0)
	701	break;
	702	comb = comb1;
	703	i1++;
	704	continue;
	705	}
	706	index = f*TOTAL_LAST + l;
	707	index1 = comp_index[index >> COMP_SHIFT];
	708	code = comp_data[(index1<<COMP_SHIFT)+
	709	(index&((1<<COMP_SHIFT)-1))];
	710	if (code == 0)
	711	goto not_combinable;
[391]	712
[2]	713	/* Replace the original character. */
	714	*i = code;
	715	/* Mark the second character unused. */
[391]	716	assert(cskipped < 20);
[2]	717	skipped[cskipped++] = i1;
	718	i1++;
	719	f = find_nfc_index(self, nfc_first, *i);
	720	if (f == -1)
	721	break;
	722	}
	723	o++ = i++;
	724	}
	725	if (o != end)
	726	PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
	727	return result;
	728	}
[391]	729
	730	/* Return 1 if the input is certainly normalized, 0 if it might not be. */
	731	static int
	732	is_normalized(PyObject self, PyObject input, int nfc, int k)
	733	{
	734	Py_UNICODE i, end;
	735	unsigned char prev_combining = 0, quickcheck_mask;
	736
	737	/* An older version of the database is requested, quickchecks must be
	738	disabled. */
	739	if (self != NULL)
	740	return 0;
	741
	742	/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
	743	as described in http://unicode.org/reports/tr15/#Annex8. */
	744	quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
	745
	746	i = PyUnicode_AS_UNICODE(input);
	747	end = i + PyUnicode_GET_SIZE(input);
	748	while (i < end) {
	749	const _PyUnicode_DatabaseRecord record = _getrecord_ex(i++);
	750	unsigned char combining = record->combining;
	751	unsigned char quickcheck = record->normalization_quick_check;
	752
	753	if (quickcheck & quickcheck_mask)
	754	return 0; /* this string might need normalization */
	755	if (combining && prev_combining > combining)
	756	return 0; /* non-canonical sort order, not normalized */
	757	prev_combining = combining;
	758	}
	759	return 1; /* certainly normalized */
	760	}
	761
[2]	762	PyDoc_STRVAR(unicodedata_normalize__doc__,
	763	"normalize(form, unistr)\n\
	764	\n\
	765	Return the normal form 'form' for the Unicode string unistr. Valid\n\
	766	values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
	767
	768	static PyObject*
	769	unicodedata_normalize(PyObject self, PyObject args)
	770	{
	771	char *form;
	772	PyObject *input;
	773
	774	if(!PyArg_ParseTuple(args, "sO!:normalize",
	775	&form, &PyUnicode_Type, &input))
	776	return NULL;
	777
	778	if (PyUnicode_GetSize(input) == 0) {
	779	/* Special case empty input strings, since resizing
	780	them later would cause internal errors. */
	781	Py_INCREF(input);
	782	return input;
	783	}
	784
[391]	785	if (strcmp(form, "NFC") == 0) {
	786	if (is_normalized(self, input, 1, 0)) {
	787	Py_INCREF(input);
	788	return input;
	789	}
[2]	790	return nfc_nfkc(self, input, 0);
[391]	791	}
	792	if (strcmp(form, "NFKC") == 0) {
	793	if (is_normalized(self, input, 1, 1)) {
	794	Py_INCREF(input);
	795	return input;
	796	}
[2]	797	return nfc_nfkc(self, input, 1);
[391]	798	}
	799	if (strcmp(form, "NFD") == 0) {
	800	if (is_normalized(self, input, 0, 0)) {
	801	Py_INCREF(input);
	802	return input;
	803	}
[2]	804	return nfd_nfkd(self, input, 0);
[391]	805	}
	806	if (strcmp(form, "NFKD") == 0) {
	807	if (is_normalized(self, input, 0, 1)) {
	808	Py_INCREF(input);
	809	return input;
	810	}
[2]	811	return nfd_nfkd(self, input, 1);
[391]	812	}
[2]	813	PyErr_SetString(PyExc_ValueError, "invalid normalization form");
	814	return NULL;
	815	}
	816
	817	/* -------------------------------------------------------------------- */
	818	/* unicode character name tables */
	819
	820	/* data file generated by Tools/unicode/makeunicodedata.py */
	821	#include "unicodename_db.h"
	822
	823	/* -------------------------------------------------------------------- */
	824	/* database code (cut and pasted from the unidb package) */
	825
	826	static unsigned long
	827	_gethash(const char *s, int len, int scale)
	828	{
	829	int i;
	830	unsigned long h = 0;
	831	unsigned long ix;
	832	for (i = 0; i < len; i++) {
[391]	833	h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
[2]	834	ix = h & 0xff000000;
	835	if (ix)
	836	h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
	837	}
	838	return h;
	839	}
	840
	841	static char *hangul_syllables[][3] = {
	842	{ "G", "A", "" },
	843	{ "GG", "AE", "G" },
	844	{ "N", "YA", "GG" },
	845	{ "D", "YAE", "GS" },
	846	{ "DD", "EO", "N", },
	847	{ "R", "E", "NJ" },
	848	{ "M", "YEO", "NH" },
	849	{ "B", "YE", "D" },
	850	{ "BB", "O", "L" },
	851	{ "S", "WA", "LG" },
	852	{ "SS", "WAE", "LM" },
	853	{ "", "OE", "LB" },
	854	{ "J", "YO", "LS" },
	855	{ "JJ", "U", "LT" },
	856	{ "C", "WEO", "LP" },
	857	{ "K", "WE", "LH" },
	858	{ "T", "WI", "M" },
	859	{ "P", "YU", "B" },
	860	{ "H", "EU", "BS" },
	861	{ 0, "YI", "S" },
	862	{ 0, "I", "SS" },
	863	{ 0, 0, "NG" },
	864	{ 0, 0, "J" },
	865	{ 0, 0, "C" },
	866	{ 0, 0, "K" },
	867	{ 0, 0, "T" },
	868	{ 0, 0, "P" },
	869	{ 0, 0, "H" }
	870	};
	871
	872	static int
	873	is_unified_ideograph(Py_UCS4 code)
	874	{
	875	return (
	876	(0x3400 <= code && code <= 0x4DB5) \|\| /* CJK Ideograph Extension A */
[391]	877	(0x4E00 <= code && code <= 0x9FCB) \|\| /* CJK Ideograph, Unicode 5.2 */
	878	(0x20000 <= code && code <= 0x2A6D6) \|\| /* CJK Ideograph Extension B */
	879	(0x2A700 <= code && code <= 0x2B734)); /* CJK Ideograph Extension C */
[2]	880	}
	881
	882	static int
	883	_getucname(PyObject self, Py_UCS4 code, char buffer, int buflen)
	884	{
	885	int offset;
	886	int i;
	887	int word;
	888	unsigned char* w;
	889
	890	if (code >= 0x110000)
	891	return 0;
	892
	893	if (self) {
	894	const change_record *old = get_old_record(self, code);
	895	if (old->category_changed == 0) {
	896	/* unassigned */
	897	return 0;
[391]	898	}
[2]	899	}
	900
	901	if (SBase <= code && code < SBase+SCount) {
[391]	902	/* Hangul syllable. */
	903	int SIndex = code - SBase;
	904	int L = SIndex / NCount;
	905	int V = (SIndex % NCount) / TCount;
	906	int T = SIndex % TCount;
[2]	907
[391]	908	if (buflen < 27)
	909	/* Worst case: HANGUL SYLLABLE <10chars>. */
	910	return 0;
	911	strcpy(buffer, "HANGUL SYLLABLE ");
	912	buffer += 16;
	913	strcpy(buffer, hangul_syllables[L][0]);
	914	buffer += strlen(hangul_syllables[L][0]);
	915	strcpy(buffer, hangul_syllables[V][1]);
	916	buffer += strlen(hangul_syllables[V][1]);
	917	strcpy(buffer, hangul_syllables[T][2]);
	918	buffer += strlen(hangul_syllables[T][2]);
	919	*buffer = '\0';
	920	return 1;
[2]	921	}
	922
	923	if (is_unified_ideograph(code)) {
	924	if (buflen < 28)
	925	/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
	926	return 0;
	927	sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
	928	return 1;
	929	}
	930
	931	/* get offset into phrasebook */
	932	offset = phrasebook_offset1[(code>>phrasebook_shift)];
	933	offset = phrasebook_offset2[(offset<<phrasebook_shift) +
	934	(code&((1<<phrasebook_shift)-1))];
	935	if (!offset)
	936	return 0;
	937
	938	i = 0;
	939
	940	for (;;) {
	941	/* get word index */
	942	word = phrasebook[offset] - phrasebook_short;
	943	if (word >= 0) {
	944	word = (word << 8) + phrasebook[offset+1];
	945	offset += 2;
	946	} else
	947	word = phrasebook[offset++];
	948	if (i) {
	949	if (i > buflen)
	950	return 0; /* buffer overflow */
	951	buffer[i++] = ' ';
	952	}
	953	/* copy word string from lexicon. the last character in the
	954	word has bit 7 set. the last word in a string ends with
	955	0x80 */
	956	w = lexicon + lexicon_offset[word];
	957	while (*w < 128) {
	958	if (i >= buflen)
	959	return 0; /* buffer overflow */
	960	buffer[i++] = *w++;
	961	}
	962	if (i >= buflen)
	963	return 0; /* buffer overflow */
	964	buffer[i++] = *w & 127;
	965	if (*w == 128)
	966	break; /* end of word */
	967	}
	968
	969	return 1;
	970	}
	971
	972	static int
	973	_cmpname(PyObject self, int code, const char name, int namelen)
	974	{
	975	/* check if code corresponds to the given name */
	976	int i;
	977	char buffer[NAME_MAXLEN];
	978	if (!_getucname(self, code, buffer, sizeof(buffer)))
	979	return 0;
	980	for (i = 0; i < namelen; i++) {
[391]	981	if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
[2]	982	return 0;
	983	}
	984	return buffer[namelen] == '\0';
	985	}
	986
[391]	987	static void
[2]	988	find_syllable(const char str, int len, int *pos, int count, int column)
	989	{
	990	int i, len1;
	991	*len = -1;
	992	for (i = 0; i < count; i++) {
[391]	993	char *s = hangul_syllables[i][column];
	994	len1 = strlen(s);
	995	if (len1 <= *len)
	996	continue;
	997	if (strncmp(str, s, len1) == 0) {
	998	*len = len1;
	999	*pos = i;
	1000	}
[2]	1001	}
	1002	if (*len == -1) {
[391]	1003	*len = 0;
[2]	1004	}
	1005	}
	1006
	1007	static int
	1008	_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
	1009	{
	1010	unsigned int h, v;
	1011	unsigned int mask = code_size-1;
	1012	unsigned int i, incr;
	1013
	1014	/* Check for hangul syllables. */
	1015	if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
[391]	1016	int len, L = -1, V = -1, T = -1;
	1017	const char *pos = name + 16;
	1018	find_syllable(pos, &len, &L, LCount, 0);
	1019	pos += len;
	1020	find_syllable(pos, &len, &V, VCount, 1);
	1021	pos += len;
	1022	find_syllable(pos, &len, &T, TCount, 2);
	1023	pos += len;
	1024	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
	1025	code = SBase + (LVCount+V)*TCount + T;
	1026	return 1;
	1027	}
[2]	1028	/* Otherwise, it's an illegal syllable name. */
	1029	return 0;
	1030	}
	1031
	1032	/* Check for unified ideographs. */
	1033	if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
	1034	/* Four or five hexdigits must follow. */
	1035	v = 0;
	1036	name += 22;
	1037	namelen -= 22;
	1038	if (namelen != 4 && namelen != 5)
	1039	return 0;
	1040	while (namelen--) {
	1041	v *= 16;
	1042	if (name >= '0' && name <= '9')
	1043	v += *name - '0';
	1044	else if (name >= 'A' && name <= 'F')
	1045	v += *name - 'A' + 10;
	1046	else
	1047	return 0;
	1048	name++;
	1049	}
	1050	if (!is_unified_ideograph(v))
	1051	return 0;
	1052	*code = v;
	1053	return 1;
	1054	}
	1055
	1056	/* the following is the same as python's dictionary lookup, with
	1057	only minor changes. see the makeunicodedata script for more
	1058	details */
	1059
	1060	h = (unsigned int) _gethash(name, namelen, code_magic);
	1061	i = (~h) & mask;
	1062	v = code_hash[i];
	1063	if (!v)
	1064	return 0;
	1065	if (_cmpname(self, v, name, namelen)) {
	1066	*code = v;
	1067	return 1;
	1068	}
	1069	incr = (h ^ (h >> 3)) & mask;
	1070	if (!incr)
	1071	incr = mask;
	1072	for (;;) {
	1073	i = (i + incr) & mask;
	1074	v = code_hash[i];
	1075	if (!v)
	1076	return 0;
	1077	if (_cmpname(self, v, name, namelen)) {
	1078	*code = v;
	1079	return 1;
	1080	}
	1081	incr = incr << 1;
	1082	if (incr > mask)
	1083	incr = incr ^ code_poly;
	1084	}
	1085	}
	1086
[391]	1087	static const _PyUnicode_Name_CAPI hashAPI =
[2]	1088	{
	1089	sizeof(_PyUnicode_Name_CAPI),
	1090	_getucname,
	1091	_getcode
	1092	};
	1093
	1094	/* -------------------------------------------------------------------- */
	1095	/* Python bindings */
	1096
	1097	PyDoc_STRVAR(unicodedata_name__doc__,
	1098	"name(unichr[, default])\n\
	1099	Returns the name assigned to the Unicode character unichr as a\n\
	1100	string. If no name is defined, default is returned, or, if not\n\
	1101	given, ValueError is raised.");
	1102
	1103	static PyObject *
	1104	unicodedata_name(PyObject* self, PyObject* args)
	1105	{
	1106	char name[NAME_MAXLEN];
	1107	Py_UCS4 c;
	1108
	1109	PyUnicodeObject* v;
	1110	PyObject* defobj = NULL;
	1111	if (!PyArg_ParseTuple(args, "O!\|O:name", &PyUnicode_Type, &v, &defobj))
	1112	return NULL;
	1113
	1114	c = getuchar(v);
	1115	if (c == (Py_UCS4)-1)
	1116	return NULL;
	1117
	1118	if (!_getucname(self, c, name, sizeof(name))) {
[391]	1119	if (defobj == NULL) {
	1120	PyErr_SetString(PyExc_ValueError, "no such name");
[2]	1121	return NULL;
[391]	1122	}
	1123	else {
	1124	Py_INCREF(defobj);
	1125	return defobj;
	1126	}
[2]	1127	}
	1128
	1129	return Py_BuildValue("s", name);
	1130	}
	1131
	1132	PyDoc_STRVAR(unicodedata_lookup__doc__,
	1133	"lookup(name)\n\
	1134	\n\
	1135	Look up character by name. If a character with the\n\
	1136	given name is found, return the corresponding Unicode\n\
	1137	character. If not found, KeyError is raised.");
	1138
	1139	static PyObject *
	1140	unicodedata_lookup(PyObject* self, PyObject* args)
	1141	{
	1142	Py_UCS4 code;
	1143	Py_UNICODE str[2];
	1144
	1145	char* name;
	1146	int namelen;
	1147	if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
	1148	return NULL;
	1149
	1150	if (!_getcode(self, name, namelen, &code)) {
	1151	PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
	1152	name);
	1153	return NULL;
	1154	}
	1155
	1156	#ifndef Py_UNICODE_WIDE
	1157	if (code >= 0x10000) {
	1158	str[0] = 0xd800 + ((code - 0x10000) >> 10);
	1159	str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
	1160	return PyUnicode_FromUnicode(str, 2);
	1161	}
	1162	#endif
	1163	str[0] = (Py_UNICODE) code;
[391]	1164	return PyUnicode_FromUnicode(str, 1);
[2]	1165	}
	1166
	1167	/* XXX Add doc strings. */
	1168
	1169	static PyMethodDef unicodedata_functions[] = {
	1170	{"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
	1171	{"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
	1172	{"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
	1173	{"category", unicodedata_category, METH_VARARGS,
	1174	unicodedata_category__doc__},
	1175	{"bidirectional", unicodedata_bidirectional, METH_VARARGS,
	1176	unicodedata_bidirectional__doc__},
	1177	{"combining", unicodedata_combining, METH_VARARGS,
	1178	unicodedata_combining__doc__},
	1179	{"mirrored", unicodedata_mirrored, METH_VARARGS,
	1180	unicodedata_mirrored__doc__},
	1181	{"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
	1182	unicodedata_east_asian_width__doc__},
	1183	{"decomposition", unicodedata_decomposition, METH_VARARGS,
	1184	unicodedata_decomposition__doc__},
	1185	{"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
	1186	{"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
	1187	{"normalize", unicodedata_normalize, METH_VARARGS,
	1188	unicodedata_normalize__doc__},
[391]	1189	{NULL, NULL} /* sentinel */
[2]	1190	};
	1191
	1192	static PyTypeObject UCD_Type = {
[391]	1193	/* The ob_type field must be initialized in the module init function
	1194	* to be portable to Windows without using C++. */
	1195	PyVarObject_HEAD_INIT(NULL, 0)
	1196	"unicodedata.UCD", /tp_name/
	1197	sizeof(PreviousDBVersion), /tp_basicsize/
	1198	0, /tp_itemsize/
	1199	/* methods */
	1200	(destructor)PyObject_Del, /tp_dealloc/
	1201	0, /tp_print/
	1202	0, /tp_getattr/
	1203	0, /tp_setattr/
	1204	0, /tp_compare/
	1205	0, /tp_repr/
	1206	0, /tp_as_number/
	1207	0, /tp_as_sequence/
	1208	0, /tp_as_mapping/
	1209	0, /tp_hash/
[2]	1210	0, /tp_call/
	1211	0, /tp_str/
	1212	PyObject_GenericGetAttr,/tp_getattro/
	1213	0, /tp_setattro/
	1214	0, /tp_as_buffer/
	1215	Py_TPFLAGS_DEFAULT, /tp_flags/
	1216	0, /tp_doc/
	1217	0, /tp_traverse/
	1218	0, /tp_clear/
	1219	0, /tp_richcompare/
	1220	0, /tp_weaklistoffset/
	1221	0, /tp_iter/
	1222	0, /tp_iternext/
	1223	unicodedata_functions, /tp_methods/
	1224	DB_members, /tp_members/
	1225	0, /tp_getset/
	1226	0, /tp_base/
	1227	0, /tp_dict/
	1228	0, /tp_descr_get/
	1229	0, /tp_descr_set/
	1230	0, /tp_dictoffset/
	1231	0, /tp_init/
	1232	0, /tp_alloc/
	1233	0, /tp_new/
	1234	0, /tp_free/
	1235	0, /tp_is_gc/
	1236	};
	1237
	1238	PyDoc_STRVAR(unicodedata_docstring,
	1239	"This module provides access to the Unicode Character Database which\n\
	1240	defines character properties for all Unicode characters. The data in\n\
	1241	this database is based on the UnicodeData.txt file version\n\
[391]	1242	5.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
[2]	1243	\n\
	1244	The module uses the same names and symbols as defined by the\n\
[391]	1245	UnicodeData File Format 5.2.0 (see\n\
	1246	http://www.unicode.org/reports/tr44/tr44-4.html).");
[2]	1247
	1248	PyMODINIT_FUNC
	1249	initunicodedata(void)
	1250	{
	1251	PyObject m, v;
	1252
	1253	Py_TYPE(&UCD_Type) = &PyType_Type;
	1254
	1255	m = Py_InitModule3(
	1256	"unicodedata", unicodedata_functions, unicodedata_docstring);
	1257	if (!m)
	1258	return;
	1259
	1260	PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
	1261	Py_INCREF(&UCD_Type);
	1262	PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
	1263
	1264	/* Previous versions */
	1265	v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
	1266	if (v != NULL)
	1267	PyModule_AddObject(m, "ucd_3_2_0", v);
	1268
	1269	/* Export C API */
[391]	1270	v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
[2]	1271	if (v != NULL)
	1272	PyModule_AddObject(m, "ucnhash_CAPI", v);
	1273	}
	1274
[391]	1275	/*
[2]	1276	Local variables:
	1277	c-basic-offset: 4
	1278	indent-tabs-mode: nil
	1279	End:
	1280	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Modules/unicodedata.c

Download in other formats: