Context Navigation

libunicodedata.tex

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 5.6 KB

Line
1	\section{\module{unicodedata} ---
2	Unicode Database}
3
4	\declaremodule{standard}{unicodedata}
5	\modulesynopsis{Access the Unicode Database.}
6	\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
7	\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
8	\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
9
10	\index{Unicode}
11	\index{character}
12	\indexii{Unicode}{database}
13
14	This module provides access to the Unicode Character Database which
15	defines character properties for all Unicode characters. The data in
16	this database is based on the \file{UnicodeData.txt} file version
17	4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
18
19	The module uses the same names and symbols as defined by the
20	UnicodeData File Format 4.1.0 (see
21	\url{http://www.unicode.org/Public/4.1.0/ucd/UCD.html}). It
22	defines the following functions:
23
24	\begin{funcdesc}{lookup}{name}
25	Look up character by name. If a character with the
26	given name is found, return the corresponding Unicode
27	character. If not found, \exception{KeyError} is raised.
28	\end{funcdesc}
29
30	\begin{funcdesc}{name}{unichr\optional{, default}}
31	Returns the name assigned to the Unicode character
32	\var{unichr} as a string. If no name is defined,
33	\var{default} is returned, or, if not given,
34	\exception{ValueError} is raised.
35	\end{funcdesc}
36
37	\begin{funcdesc}{decimal}{unichr\optional{, default}}
38	Returns the decimal value assigned to the Unicode character
39	\var{unichr} as an integer. If no such value is defined,
40	\var{default} is returned, or, if not given,
41	\exception{ValueError} is raised.
42	\end{funcdesc}
43
44	\begin{funcdesc}{digit}{unichr\optional{, default}}
45	Returns the digit value assigned to the Unicode character
46	\var{unichr} as an integer. If no such value is defined,
47	\var{default} is returned, or, if not given,
48	\exception{ValueError} is raised.
49	\end{funcdesc}
50
51	\begin{funcdesc}{numeric}{unichr\optional{, default}}
52	Returns the numeric value assigned to the Unicode character
53	\var{unichr} as a float. If no such value is defined, \var{default} is
54	returned, or, if not given, \exception{ValueError} is raised.
55	\end{funcdesc}
56
57	\begin{funcdesc}{category}{unichr}
58	Returns the general category assigned to the Unicode character
59	\var{unichr} as a string.
60	\end{funcdesc}
61
62	\begin{funcdesc}{bidirectional}{unichr}
63	Returns the bidirectional category assigned to the Unicode character
64	\var{unichr} as a string. If no such value is defined, an empty string
65	is returned.
66	\end{funcdesc}
67
68	\begin{funcdesc}{combining}{unichr}
69	Returns the canonical combining class assigned to the Unicode
70	character \var{unichr} as an integer. Returns \code{0} if no combining
71	class is defined.
72	\end{funcdesc}
73
74	\begin{funcdesc}{east_asian_width}{unichr}
75	Returns the East Asian width assigned to the Unicode character
76	\var{unichr} as a string.
77	\versionadded{2.4}
78	\end{funcdesc}
79
80	\begin{funcdesc}{mirrored}{unichr}
81	Returns the mirrored property assigned to the Unicode character
82	\var{unichr} as an integer. Returns \code{1} if the character has been
83	identified as a ``mirrored'' character in bidirectional text,
84	\code{0} otherwise.
85	\end{funcdesc}
86
87	\begin{funcdesc}{decomposition}{unichr}
88	Returns the character decomposition mapping assigned to the Unicode
89	character \var{unichr} as a string. An empty string is returned in case
90	no such mapping is defined.
91	\end{funcdesc}
92
93	\begin{funcdesc}{normalize}{form, unistr}
94
95	Return the normal form \var{form} for the Unicode string \var{unistr}.
96	Valid values for \var{form} are \code{'NFC'}, \code{'NFKC'}, \code{'NFD'}, and \code{'NFKD'}.
97
98	The Unicode standard defines various normalization forms of a Unicode
99	string, based on the definition of canonical equivalence and
100	compatibility equivalence. In Unicode, several characters can be
101	expressed in various ways. For example, the character U+00C7 (LATIN
102	CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
103	U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
104
105	For each character, there are two normal forms: normal form C and
106	normal form D. Normal form D (NFD) is also known as canonical
107	decomposition, and translates each character into its decomposed form.
108	Normal form C (NFC) first applies a canonical decomposition, then
109	composes pre-combined characters again.
110
111	In addition to these two forms, there are two additional normal forms
112	based on compatibility equivalence. In Unicode, certain characters are
113	supported which normally would be unified with other characters. For
114	example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
115	(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
116	compatibility with existing character sets (e.g.\ gb2312).
117
118	The normal form KD (NFKD) will apply the compatibility decomposition,
119	i.e.\ replace all compatibility characters with their equivalents. The
120	normal form KC (NFKC) first applies the compatibility decomposition,
121	followed by the canonical composition.
122
123	\versionadded{2.3}
124	\end{funcdesc}
125
126	In addition, the module exposes the following constants:
127
128	\begin{datadesc}{unidata_version}
129	The version of the Unicode database used in this module.
130
131	\versionadded{2.3}
132	\end{datadesc}
133
134	\begin{datadesc}{ucd_3_2_0}
135	This is an object that has the same methods as the entire
136	module, but uses the Unicode database version 3.2 instead,
137	for applications that require this specific version of
138	the Unicode database (such as IDNA).
139
140	\versionadded{2.5}
141	\end{datadesc}
142
143	Examples:
144
145	\begin{verbatim}
146	>>> unicodedata.lookup('LEFT CURLY BRACKET')
147	u'{'
148	>>> unicodedata.name(u'/')
149	'SOLIDUS'
150	>>> unicodedata.decimal(u'9')
151	9
152	>>> unicodedata.decimal(u'a')
153	Traceback (most recent call last):
154	  File "<stdin>", line 1, in ?
155	ValueError: not a decimal
156	>>> unicodedata.category(u'A') # 'L'etter, 'u'ppercase
157	'Lu'
158	>>> unicodedata.bidirectional(u'\u0660') # 'A'rabic, 'N'umber
159	'AN'
160	\end{verbatim}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/python/2.5/Doc/lib/libunicodedata.tex

Download in other formats: