Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

nfkc.c@ 59

Last change on this file since 59 was 2, checked in by dmik, 19 years ago
Imported original Psi 0.10 sources from Affinix
File size: 25.3 KB

Line
1	/* nfkc.c Unicode normalization utilities.
2	* Copyright (C) 2002, 2003 Simon Josefsson
3	*
4	* This file is part of GNU Libidn.
5	*
6	* GNU Libidn is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License as published by the Free Software Foundation; either
9	* version 2.1 of the License, or (at your option) any later version.
10	*
11	* GNU Libidn is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with GNU Libidn; if not, write to the Free Software
18	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19	*
20	*/
21
22	#include "internal.h"
23
24	/* This file contains functions from GLIB, including gutf8.c and
25	* gunidecomp.c, all licensed under LGPL and copyright held by:
26	*
27	* Copyright (C) 1999, 2000 Tom Tromey
28	* Copyright 2000 Red Hat, Inc.
29	*/
30
/* Hacks to make syncing with GLIB code easier: the GLib type and helper
 * names used by the imported code are mapped onto plain C (and Libidn
 * "my_*") equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 my_int16_t
#define guint16 my_uint16_t
#define gunichar my_uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
#define GError void
/* Error reporting is compiled out: the arguments are discarded. */
#define g_set_error(a,b,c,d) 0
/* Allocate uninitialized room for n_structs objects of struct_type.
 * The result comes straight from malloc() and may therefore be NULL. */
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#  define G_STMT_START (void)(
#  define G_STMT_END )
#else
#  if (defined (sun) || defined (__sun__))
#    define G_STMT_START if (1)
#    define G_STMT_END else (void)0
#  else
#    define G_STMT_START do
#    define G_STMT_END while (0)
#  endif
#endif
/* Precondition checks are compiled out: neither expr nor val is
 * evaluated, the macro expands to an empty statement. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
/* Element count of a true array (not of a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
66
67	/* Code from GLIB gunicode.h starts here. */
68
/* Normalization forms accepted by g_utf8_normalize().  Every GLib-style
 * name is paired with a Unicode-style alias carrying the same value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
81
82	/* Code from GLIB gutf8.c starts here. */
83
/* Classify the lead byte Char of a UTF-8 sequence.
 *
 * Sets Len to the sequence length in bytes (1..6) and Mask to the bit
 * mask selecting the payload bits of the lead byte.  For a byte that
 * cannot start a sequence, Len is set to -1 and Mask is left untouched.
 *
 * Char is evaluated several times, so it must be free of side effects.
 * The body is wrapped in do/while(0) so the macro behaves as a single
 * statement (e.g. as the body of an if/else). */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  do									      \
    {									      \
      if ((Char) < 128)							      \
	{								      \
	  (Len) = 1;							      \
	  (Mask) = 0x7f;						      \
	}								      \
      else if (((Char) & 0xe0) == 0xc0)					      \
	{								      \
	  (Len) = 2;							      \
	  (Mask) = 0x1f;						      \
	}								      \
      else if (((Char) & 0xf0) == 0xe0)					      \
	{								      \
	  (Len) = 3;							      \
	  (Mask) = 0x0f;						      \
	}								      \
      else if (((Char) & 0xf8) == 0xf0)					      \
	{								      \
	  (Len) = 4;							      \
	  (Mask) = 0x07;						      \
	}								      \
      else if (((Char) & 0xfc) == 0xf8)					      \
	{								      \
	  (Len) = 5;							      \
	  (Mask) = 0x03;						      \
	}								      \
      else if (((Char) & 0xfe) == 0xfc)					      \
	{								      \
	  (Len) = 6;							      \
	  (Mask) = 0x01;						      \
	}								      \
      else								      \
	(Len) = -1;							      \
    }									      \
  while (0)
117
/* Number of bytes (1..6) needed to encode the code point Char in
 * UTF-8.  Char is evaluated more than once. */
#define UTF8_LENGTH(Char)		\
  ((Char) <= 0x7f ? 1			\
   : (Char) <= 0x7ff ? 2		\
   : (Char) <= 0xffff ? 3		\
   : (Char) <= 0x1fffff ? 4		\
   : (Char) <= 0x3ffffff ? 5 : 6)
124
125
/* Decode one UTF-8 sequence of Len bytes starting at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte (as produced by
 * UTF8_COMPUTE); Count is a scratch loop variable.  If a byte after the
 * first is not a continuation byte (10xxxxxx), Result is set to -1 and
 * decoding stops.
 *
 * Wrapped in do/while(0) so the macro behaves as a single statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  do									      \
    {									      \
      (Result) = (Chars)[0] & (Mask);					      \
      for ((Count) = 1; (Count) < (Len); ++(Count))			      \
	{								      \
	  if (((Chars)[(Count)] & 0xc0) != 0x80)			      \
	    {								      \
	      (Result) = -1;						      \
	      break;							      \
	    }								      \
	  (Result) <<= 6;						      \
	  (Result) |= ((Chars)[(Count)] & 0x3f);			      \
	}								      \
    }									      \
  while (0)
138
/* True when Char is below 0x110000, is not in the surrogate block
 * 0xD800..0xDFFF, is not in 0xFDD0..0xFDEF, and does not have 0xFFFE or
 * 0xFFFF as its low 16 bits.  Char is evaluated more than once. */
#define UNICODE_VALID(Char)			\
  ((Char) < 0x110000 &&				\
   (((Char) & 0xFFFFF800) != 0xD800) &&		\
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&	\
   ((Char) & 0xFFFE) != 0xFFFE)
144
145
146	static const gchar utf8_skip_data[256] = {
147	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148	1, 1, 1, 1, 1, 1, 1,
149	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
150	1, 1, 1, 1, 1, 1, 1,
151	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
152	1, 1, 1, 1, 1, 1, 1,
153	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154	1, 1, 1, 1, 1, 1, 1,
155	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156	1, 1, 1, 1, 1, 1, 1,
157	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158	1, 1, 1, 1, 1, 1, 1,
159	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160	2, 2, 2, 2, 2, 2, 2,
161	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
162	5, 5, 5, 6, 6, 1, 1
163	};
164
165	const gchar *const g_utf8_skip = utf8_skip_data;
166
167	#define g_utf8_next_char(p) (char )((p) + g_utf8_skip[(guchar *)(p)])
168
169	/**
170	* g_utf8_strlen:
171	* @p: pointer to the start of a UTF-8 encoded string.
172	* @max: the maximum number of bytes to examine. If @max
173	* is less than 0, then the string is assumed to be
174	* nul-terminated. If @max is 0, @p will not be examined and
175	* may be %NULL.
176	*
177	* Returns the length of the string in characters.
178	*
179	* Return value: the length of the string in characters
180	**/
181	static glong
182	g_utf8_strlen (const gchar * p, gssize max)
183	{
184	glong len = 0;
185	const gchar *start = p;
186	g_return_val_if_fail (p != NULL \|\| max == 0, 0);
187
188	if (max < 0)
189	{
190	while (*p)
191	{
192	p = g_utf8_next_char (p);
193	++len;
194	}
195	}
196	else
197	{
198	if (max == 0 \|\| !*p)
199	return 0;
200
201	p = g_utf8_next_char (p);
202
203	while (p - start < max && *p)
204	{
205	++len;
206	p = g_utf8_next_char (p);
207	}
208
209	/* only do the last len increment if we got a complete
210	* char (don't count partial chars)
211	*/
212	if (p - start == max)
213	++len;
214	}
215
216	return len;
217	}
218
219	/**
220	* g_utf8_get_char:
221	* @p: a pointer to Unicode character encoded as UTF-8
222	*
223	* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
224	* If @p does not point to a valid UTF-8 encoded character, results are
225	* undefined. If you are not sure that the bytes are complete
226	* valid Unicode characters, you should use g_utf8_get_char_validated()
227	* instead.
228	*
229	* Return value: the resulting character
230	**/
231	static gunichar
232	g_utf8_get_char (const gchar * p)
233	{
234	int i, mask = 0, len;
235	gunichar result;
236	unsigned char c = (unsigned char) *p;
237
238	UTF8_COMPUTE (c, mask, len);
239	if (len == -1)
240	return (gunichar) - 1;
241	UTF8_GET (result, p, i, mask, len);
242
243	return result;
244	}
245
246	/**
247	* g_unichar_to_utf8:
248	* @c: a ISO10646 character code
249	* @outbuf: output buffer, must have at least 6 bytes of space.
250	* If %NULL, the length will be computed and returned
251	* and nothing will be written to @outbuf.
252	*
253	* Converts a single character to UTF-8.
254	*
255	* Return value: number of bytes written
256	**/
257	static int
258	g_unichar_to_utf8 (gunichar c, gchar * outbuf)
259	{
260	guint len = 0;
261	int first;
262	int i;
263
264	if (c < 0x80)
265	{
266	first = 0;
267	len = 1;
268	}
269	else if (c < 0x800)
270	{
271	first = 0xc0;
272	len = 2;
273	}
274	else if (c < 0x10000)
275	{
276	first = 0xe0;
277	len = 3;
278	}
279	else if (c < 0x200000)
280	{
281	first = 0xf0;
282	len = 4;
283	}
284	else if (c < 0x4000000)
285	{
286	first = 0xf8;
287	len = 5;
288	}
289	else
290	{
291	first = 0xfc;
292	len = 6;
293	}
294
295	if (outbuf)
296	{
297	for (i = len - 1; i > 0; --i)
298	{
299	outbuf[i] = (c & 0x3f) \| 0x80;
300	c >>= 6;
301	}
302	outbuf[0] = c \| first;
303	}
304
305	return len;
306	}
307
308	/**
309	* g_utf8_to_ucs4_fast:
310	* @str: a UTF-8 encoded string
311	* @len: the maximum length of @str to use. If @len < 0, then
312	* the string is nul-terminated.
313	* @items_written: location to store the number of characters in the
314	* result, or %NULL.
315	*
316	* Convert a string from UTF-8 to a 32-bit fixed width
317	* representation as UCS-4, assuming valid UTF-8 input.
318	* This function is roughly twice as fast as g_utf8_to_ucs4()
319	* but does no error checking on the input.
320	*
321	* Return value: a pointer to a newly allocated UCS-4 string.
322	* This value must be freed with g_free().
323	**/
324	static gunichar *
325	g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
326	{
327	gint j, charlen;
328	gunichar *result;
329	gint n_chars, i;
330	const gchar *p;
331
332	g_return_val_if_fail (str != NULL, NULL);
333
334	p = str;
335	n_chars = 0;
336	if (len < 0)
337	{
338	while (*p)
339	{
340	p = g_utf8_next_char (p);
341	++n_chars;
342	}
343	}
344	else
345	{
346	while (p < str + len && *p)
347	{
348	p = g_utf8_next_char (p);
349	++n_chars;
350	}
351	}
352
353	result = g_new (gunichar, n_chars + 1);
354
355	p = str;
356	for (i = 0; i < n_chars; i++)
357	{
358	gunichar wc = ((unsigned char *) p)[0];
359
360	if (wc < 0x80)
361	{
362	result[i] = wc;
363	p++;
364	}
365	else
366	{
367	if (wc < 0xe0)
368	{
369	charlen = 2;
370	wc &= 0x1f;
371	}
372	else if (wc < 0xf0)
373	{
374	charlen = 3;
375	wc &= 0x0f;
376	}
377	else if (wc < 0xf8)
378	{
379	charlen = 4;
380	wc &= 0x07;
381	}
382	else if (wc < 0xfc)
383	{
384	charlen = 5;
385	wc &= 0x03;
386	}
387	else
388	{
389	charlen = 6;
390	wc &= 0x01;
391	}
392
393	for (j = 1; j < charlen; j++)
394	{
395	wc <<= 6;
396	wc \|= ((unsigned char *) p)[j] & 0x3f;
397	}
398
399	result[i] = wc;
400	p += charlen;
401	}
402	}
403	result[i] = 0;
404
405	if (items_written)
406	*items_written = i;
407
408	return result;
409	}
410
411	/**
412	* g_ucs4_to_utf8:
413	* @str: a UCS-4 encoded string
414	* @len: the maximum length of @str to use. If @len < 0, then
415	* the string is terminated with a 0 character.
416	* @items_read: location to store number of characters read read, or %NULL.
417	* @items_written: location to store number of bytes written or %NULL.
418	* The value here stored does not include the trailing 0
419	* byte.
420	* @error: location to store the error occuring, or %NULL to ignore
421	* errors. Any of the errors in #GConvertError other than
422	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
423	*
424	* Convert a string from a 32-bit fixed width representation as UCS-4.
425	* to UTF-8. The result will be terminated with a 0 byte.
426	*
427	* Return value: a pointer to a newly allocated UTF-8 string.
428	* This value must be freed with g_free(). If an
429	* error occurs, %NULL will be returned and
430	* @error set.
431	**/
432	static gchar *
433	g_ucs4_to_utf8 (const gunichar * str,
434	glong len,
435	glong * items_read, glong * items_written, GError ** error)
436	{
437	gint result_length;
438	gchar *result = NULL;
439	gchar *p;
440	gint i;
441
442	result_length = 0;
443	for (i = 0; len < 0 \|\| i < len; i++)
444	{
445	if (!str[i])
446	break;
447
448	if (str[i] >= 0x80000000)
449	{
450	if (items_read)
451	*items_read = i;
452
453	/*g_set_error (error, G_CONVERT_ERROR,
454	G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
455	_("Character out of range for UTF-8"));*/
456	goto err_out;
457	}
458
459	result_length += UTF8_LENGTH (str[i]);
460	}
461
462	result = g_malloc (result_length + 1);
463	p = result;
464
465	i = 0;
466	while (p < result + result_length)
467	p += g_unichar_to_utf8 (str[i++], p);
468
469	*p = '\0';
470
471	if (items_written)
472	*items_written = p - result;
473
474	err_out:
475	if (items_read)
476	*items_read = i;
477
478	return result;
479	}
480
481	/* Code from GLIB gunidecomp.c starts here. */
482
483	#include "gunidecomp.h"
484	#include "gunicomp.h"
485
/* Two-level lookup shared by both table halves.  A page entry at or
 * above G_UNICODE_MAX_TABLE_INDEX yields the entry minus that constant
 * directly; any other entry selects a row of cclass_data that is then
 * indexed by Char. */
#define CC_LOOKUP(Table, Page, Char)				\
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX)			\
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX)		\
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char: part 1 covers code points up to
 * G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
 * G_UNICODE_LAST_CHAR; everything else is class 0. */
#define COMBINING_CLASS(Char)						\
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)				\
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)				\
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)		\
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)		\
      : 0))
502
/* Constants for Hangul syllable [de]composition. */
enum
{
  SBase = 0xAC00,		/* first precomposed syllable */
  LBase = 0x1100,		/* first leading consonant */
  VBase = 0x1161,		/* first vowel */
  TBase = 0x11A7,		/* one below the first trailing consonant */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,	/* syllables per leading consonant */
  SCount = LCount * NCount	/* total precomposed syllables */
};
513
514	/**
515	* g_unicode_canonical_ordering:
516	* @string: a UCS-4 encoded string.
517	* @len: the maximum length of @string to use.
518	*
519	* Computes the canonical ordering of a string in-place.
520	* This rearranges decomposed characters in the string
521	* according to their combining classes. See the Unicode
522	* manual for more information.
523	**/
524	static void
525	g_unicode_canonical_ordering (gunichar * string, gsize len)
526	{
527	gsize i;
528	int swap = 1;
529
530	while (swap)
531	{
532	int last;
533	swap = 0;
534	last = COMBINING_CLASS (string[0]);
535	for (i = 0; i < len - 1; ++i)
536	{
537	int next = COMBINING_CLASS (string[i + 1]);
538	if (next != 0 && last > next)
539	{
540	gsize j;
541	/* Percolate item leftward through string. */
542	for (j = i + 1; j > 0; --j)
543	{
544	gunichar t;
545	if (COMBINING_CLASS (string[j - 1]) <= next)
546	break;
547	t = string[j];
548	string[j] = string[j - 1];
549	string[j - 1] = t;
550	swap = 1;
551	}
552	/* We're re-entering the loop looking at the old
553	character again. */
554	next = last;
555	}
556	last = next;
557	}
558	}
559	}
560
561	/* http://www.unicode.org/unicode/reports/tr15/#Hangul
562	* r should be null or have sufficient space. Calling with r == NULL will
563	* only calculate the result_len; however, a buffer with space for three
564	* characters will always be big enough. */
565	static void
566	decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
567	{
568	gint SIndex = s - SBase;
569
570	/* not a hangul syllable */
571	if (SIndex < 0 \|\| SIndex >= SCount)
572	{
573	if (r)
574	r[0] = s;
575	*result_len = 1;
576	}
577	else
578	{
579	gunichar L = LBase + SIndex / NCount;
580	gunichar V = VBase + (SIndex % NCount) / TCount;
581	gunichar T = TBase + SIndex % TCount;
582
583	if (r)
584	{
585	r[0] = L;
586	r[1] = V;
587	}
588
589	if (T != TBase)
590	{
591	if (r)
592	r[2] = T;
593	*result_len = 3;
594	}
595	else
596	*result_len = 2;
597	}
598	}
599
600	/* returns a pointer to a null-terminated UTF-8 string */
601	static const gchar *
602	find_decomposition (gunichar ch, gboolean compat)
603	{
604	int start = 0;
605	int end = G_N_ELEMENTS (decomp_table);
606
607	if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
608	{
609	while (TRUE)
610	{
611	int half = (start + end) / 2;
612	if (ch == decomp_table[half].ch)
613	{
614	int offset;
615
616	if (compat)
617	{
618	offset = decomp_table[half].compat_offset;
619	if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
620	offset = decomp_table[half].canon_offset;
621	}
622	else
623	{
624	offset = decomp_table[half].canon_offset;
625	if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
626	return NULL;
627	}
628
629	return &(decomp_expansion_string[offset]);
630	}
631	else if (half == start)
632	break;
633	else if (ch > decomp_table[half].ch)
634	start = half;
635	else
636	end = half;
637	}
638	}
639
640	return NULL;
641	}
642
643	/* L,V => LV and LV,T => LVT */
644	static gboolean
645	combine_hangul (gunichar a, gunichar b, gunichar * result)
646	{
647	gint LIndex = a - LBase;
648	gint SIndex = a - SBase;
649
650	gint VIndex = b - VBase;
651	gint TIndex = b - TBase;
652
653	if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
654	{
655	result = SBase + (LIndex VCount + VIndex) * TCount;
656	return TRUE;
657	}
658	else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
659	&& 0 <= TIndex && TIndex <= TCount)
660	{
661	*result = a + TIndex;
662	return TRUE;
663	}
664
665	return FALSE;
666	}
667
/* Two-level lookup in the composition tables: a page entry below
 * G_UNICODE_MAX_TABLE_INDEX selects a row of compose_data indexed by
 * Char; any other entry yields the entry minus that constant. */
#define CI(Page, Char)						\
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX)		\
   ? (compose_data[compose_table[Page]][Char])			\
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Composition index of Char; 0 for characters whose page lies beyond
 * COMPOSE_TABLE_LAST. */
#define COMPOSE_INDEX(Char)					\
  ((((Char) >> 8) <= (COMPOSE_TABLE_LAST))			\
   ? CI ((Char) >> 8, (Char) & 0xff)				\
   : 0)
675
676	static gboolean
677	combine (gunichar a, gunichar b, gunichar * result)
678	{
679	gushort index_a, index_b;
680
681	if (combine_hangul (a, b, result))
682	return TRUE;
683
684	index_a = COMPOSE_INDEX (a);
685
686	if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
687	{
688	if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
689	{
690	*result =
691	compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
692	return TRUE;
693	}
694	else
695	return FALSE;
696	}
697
698	index_b = COMPOSE_INDEX (b);
699
700	if (index_b >= COMPOSE_SECOND_SINGLE_START)
701	{
702	if (a ==
703	compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
704	{
705	*result =
706	compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
707	return TRUE;
708	}
709	else
710	return FALSE;
711	}
712
713	if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
714	&& index_b >= COMPOSE_SECOND_START
715	&& index_b < COMPOSE_SECOND_SINGLE_START)
716	{
717	gunichar res =
718	compose_array[index_a - COMPOSE_FIRST_START][index_b -
719	COMPOSE_SECOND_START];
720
721	if (res)
722	{
723	*result = res;
724	return TRUE;
725	}
726	}
727
728	return FALSE;
729	}
730
731	static gunichar *
732	_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
733	{
734	gsize n_wc;
735	gunichar *wc_buffer;
736	const char *p;
737	gsize last_start;
738	gboolean do_compat = (mode == G_NORMALIZE_NFKC \|\| mode == G_NORMALIZE_NFKD);
739	gboolean do_compose = (mode == G_NORMALIZE_NFC \|\| mode == G_NORMALIZE_NFKC);
740
741	n_wc = 0;
742	p = str;
743	while ((max_len < 0 \|\| p < str + max_len) && *p)
744	{
745	const gchar *decomp;
746	gunichar wc = g_utf8_get_char (p);
747
748	if (wc >= 0xac00 && wc <= 0xd7af)
749	{
750	gsize result_len;
751	decompose_hangul (wc, NULL, &result_len);
752	n_wc += result_len;
753	}
754	else
755	{
756	decomp = find_decomposition (wc, do_compat);
757
758	if (decomp)
759	n_wc += g_utf8_strlen (decomp, -1);
760	else
761	n_wc++;
762	}
763
764	p = g_utf8_next_char (p);
765	}
766
767	wc_buffer = g_new (gunichar, n_wc + 1);
768
769	last_start = 0;
770	n_wc = 0;
771	p = str;
772	while ((max_len < 0 \|\| p < str + max_len) && *p)
773	{
774	gunichar wc = g_utf8_get_char (p);
775	const gchar *decomp;
776	int cc;
777	gsize old_n_wc = n_wc;
778
779	if (wc >= 0xac00 && wc <= 0xd7af)
780	{
781	gsize result_len;
782	decompose_hangul (wc, wc_buffer + n_wc, &result_len);
783	n_wc += result_len;
784	}
785	else
786	{
787	decomp = find_decomposition (wc, do_compat);
788
789	if (decomp)
790	{
791	const char *pd;
792	for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
793	wc_buffer[n_wc++] = g_utf8_get_char (pd);
794	}
795	else
796	wc_buffer[n_wc++] = wc;
797	}
798
799	if (n_wc > 0)
800	{
801	cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
802
803	if (cc == 0)
804	{
805	g_unicode_canonical_ordering (wc_buffer + last_start,
806	n_wc - last_start);
807	last_start = old_n_wc;
808	}
809	}
810
811	p = g_utf8_next_char (p);
812	}
813
814	if (n_wc > 0)
815	{
816	g_unicode_canonical_ordering (wc_buffer + last_start,
817	n_wc - last_start);
818	last_start = n_wc;
819	}
820
821	wc_buffer[n_wc] = 0;
822
823	/* All decomposed and reordered */
824
825	if (do_compose && n_wc > 0)
826	{
827	gsize i, j;
828	int last_cc = 0;
829	last_start = 0;
830
831	for (i = 0; i < n_wc; i++)
832	{
833	int cc = COMBINING_CLASS (wc_buffer[i]);
834
835	if (i > 0 &&
836	(last_cc == 0 \|\| last_cc != cc) &&
837	combine (wc_buffer[last_start], wc_buffer[i],
838	&wc_buffer[last_start]))
839	{
840	for (j = i + 1; j < n_wc; j++)
841	wc_buffer[j - 1] = wc_buffer[j];
842	n_wc--;
843	i--;
844
845	if (i == last_start)
846	last_cc = 0;
847	else
848	last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
849
850	continue;
851	}
852
853	if (cc == 0)
854	last_start = i;
855
856	last_cc = cc;
857	}
858	}
859
860	wc_buffer[n_wc] = 0;
861
862	return wc_buffer;
863	}
864
865	/**
866	* g_utf8_normalize:
867	* @str: a UTF-8 encoded string.
868	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
869	* @mode: the type of normalization to perform.
870	*
871	* Converts a string into canonical form, standardizing
872	* such issues as whether a character with an accent
873	* is represented as a base character and combining
874	* accent or as a single precomposed character. You
875	* should generally call g_utf8_normalize() before
876	* comparing two Unicode strings.
877	*
878	* The normalization mode %G_NORMALIZE_DEFAULT only
879	* standardizes differences that do not affect the
880	* text content, such as the above-mentioned accent
881	* representation. %G_NORMALIZE_ALL also standardizes
882	* the "compatibility" characters in Unicode, such
883	* as SUPERSCRIPT THREE to the standard forms
884	* (in this case DIGIT THREE). Formatting information
885	* may be lost but for most text operations such
886	* characters should be considered the same.
887	* For example, g_utf8_collate() normalizes
888	* with %G_NORMALIZE_ALL as its first step.
889	*
890	* %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
891	* are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
892	* but returned a result with composed forms rather
893	* than a maximally decomposed form. This is often
894	* useful if you intend to convert the string to
895	* a legacy encoding or pass it to a system with
896	* less capable Unicode handling.
897	*
898	* Return value: a newly allocated string, that is the
899	* normalized form of @str.
900	**/
901	static gchar *
902	g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
903	{
904	gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
905	gchar *result;
906
907	result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
908	g_free (result_wc);
909
910	return result;
911	}
912
913	/* Public Libidn API starts here. */
914
915	/**
916	* stringprep_utf8_to_unichar:
917	* @p: a pointer to Unicode character encoded as UTF-8
918	*
919	* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
920	* If @p does not point to a valid UTF-8 encoded character, results are
921	* undefined. If you are not sure that the bytes are complete
922	* valid Unicode characters, you should use g_utf8_get_char_validated()
923	* instead.
924	*
925	* Return value: the resulting character
926	**/
927	my_uint32_t
928	stringprep_utf8_to_unichar (const char *p)
929	{
930	return g_utf8_get_char (p);
931	}
932
933	/**
934	* stringprep_unichar_to_utf8:
935	* @c: a ISO10646 character code
936	* @outbuf: output buffer, must have at least 6 bytes of space.
937	* If %NULL, the length will be computed and returned
938	* and nothing will be written to @outbuf.
939	*
940	* Converts a single character to UTF-8.
941	*
942	* Return value: number of bytes written
943	**/
944	int
945	stringprep_unichar_to_utf8 (my_uint32_t c, char *outbuf)
946	{
947	return g_unichar_to_utf8 (c, outbuf);
948	}
949
950	/**
951	* stringprep_utf8_to_ucs4:
952	* @str: a UTF-8 encoded string
953	* @len: the maximum length of @str to use. If @len < 0, then
954	* the string is nul-terminated.
955	* @items_written: location to store the number of characters in the
956	* result, or %NULL.
957	*
958	* Convert a string from UTF-8 to a 32-bit fixed width
959	* representation as UCS-4, assuming valid UTF-8 input.
960	* This function does no error checking on the input.
961	*
962	* Return value: a pointer to a newly allocated UCS-4 string.
963	* This value must be freed with free().
964	**/
965	my_uint32_t *
966	stringprep_utf8_to_ucs4 (const char str, ssize_t len, size_t items_written)
967	{
968	return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
969	}
970
971	/**
972	* stringprep_ucs4_to_utf8:
973	* @str: a UCS-4 encoded string
974	* @len: the maximum length of @str to use. If @len < 0, then
975	* the string is terminated with a 0 character.
976	* @items_read: location to store number of characters read read, or %NULL.
977	* @items_written: location to store number of bytes written or %NULL.
978	* The value here stored does not include the trailing 0
979	* byte.
980	*
981	* Convert a string from a 32-bit fixed width representation as UCS-4.
982	* to UTF-8. The result will be terminated with a 0 byte.
983	*
984	* Return value: a pointer to a newly allocated UTF-8 string.
985	* This value must be freed with free(). If an
986	* error occurs, %NULL will be returned and
987	* @error set.
988	**/
989	char *
990	stringprep_ucs4_to_utf8 (const my_uint32_t * str, ssize_t len,
991	size_t * items_read, size_t * items_written)
992	{
993	return g_ucs4_to_utf8 (str, len, (glong *) items_read,
994	(glong *) items_written, NULL);
995	}
996
997	/**
998	* stringprep_utf8_nfkc_normalize:
999	* @str: a UTF-8 encoded string.
1000	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1001	*
1002	* Converts a string into canonical form, standardizing
1003	* such issues as whether a character with an accent
1004	* is represented as a base character and combining
1005	* accent or as a single precomposed character.
1006	*
1007	* The normalization mode is NFKC (ALL COMPOSE). It standardizes
1008	* differences that do not affect the text content, such as the
1009	* above-mentioned accent representation. It standardizes the
1010	* "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1011	* the standard forms (in this case DIGIT THREE). Formatting
1012	* information may be lost but for most text operations such
1013	* characters should be considered the same. It returns a result with
1014	* composed forms rather than a maximally decomposed form.
1015	*
1016	* Return value: a newly allocated string, that is the
1017	* NFKC normalized form of @str.
1018	**/
1019	char *
1020	stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1021	{
1022	return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1023	}
1024
1025	/**
1026	* stringprep_ucs4_nfkc_normalize:
1027	* @str: a Unicode string.
1028	* @len: length of @str array, or -1 if @str is nul-terminated.
1029	*
1030	* Converts UCS4 string into UTF-8 and runs
1031	* stringprep_utf8_nfkc_normalize().
1032	*
1033	* Return value: a newly allocated Unicode string, that is the NFKC
1034	* normalized form of @str.
1035	**/
1036	my_uint32_t *
1037	stringprep_ucs4_nfkc_normalize (my_uint32_t * str, ssize_t len)
1038	{
1039	char *p;
1040	my_uint32_t *result_wc;
1041
1042	p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1043	result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1044	free (p);
1045
1046	return result_wc;
1047	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: psi/trunk/iris/libidn/nfkc.c@ 59

Download in other formats: