Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

CUnicode.cpp@ 3848

Visit:

Last change on this file since 3848 was 2749, checked in by bird, 19 years ago
synergy v1.3.1 sources (zip).
File size: 16.0 KB

Line
1	/*
2	* synergy -- mouse and keyboard sharing utility
3	* Copyright (C) 2002 Chris Schoeneman
4	*
5	* This package is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU General Public License
7	* found in the file COPYING that should have accompanied this file.
8	*
9	* This package is distributed in the hope that it will be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	* GNU General Public License for more details.
13	*/
14
15	#include "CUnicode.h"
16	#include "CArch.h"
17	#include <string.h>
18
19	//
20	// local utility functions
21	//
22
23	inline
24	static
25	UInt16
26	decode16(const UInt8* n, bool byteSwapped)
27	{
28	union x16 {
29	UInt8 n8[2];
30	UInt16 n16;
31	} c;
32	if (byteSwapped) {
33	c.n8[0] = n[1];
34	c.n8[1] = n[0];
35	}
36	else {
37	c.n8[0] = n[0];
38	c.n8[1] = n[1];
39	}
40	return c.n16;
41	}
42
43	inline
44	static
45	UInt32
46	decode32(const UInt8* n, bool byteSwapped)
47	{
48	union x32 {
49	UInt8 n8[4];
50	UInt32 n32;
51	} c;
52	if (byteSwapped) {
53	c.n8[0] = n[3];
54	c.n8[1] = n[2];
55	c.n8[2] = n[1];
56	c.n8[3] = n[0];
57	}
58	else {
59	c.n8[0] = n[0];
60	c.n8[1] = n[1];
61	c.n8[2] = n[2];
62	c.n8[3] = n[3];
63	}
64	return c.n32;
65	}
66
inline
static
void
resetError(bool* errors)
{
	// the error flag is optional; nothing to do if the caller
	// didn't supply one.
	if (errors == NULL) {
		return;
	}
	*errors = false;
}
76
inline
static
void
setError(bool* errors)
{
	// the error flag is optional; nothing to do if the caller
	// didn't supply one.
	if (errors == NULL) {
		return;
	}
	*errors = true;
}
86
87
88	//
89	// CUnicode
90	//
91
92	UInt32 CUnicode::s_invalid = 0x0000ffff;
93	UInt32 CUnicode::s_replacement = 0x0000fffd;
94
95	bool
96	CUnicode::isUTF8(const CString& src)
97	{
98	// convert and test each character
99	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
100	for (UInt32 n = src.size(); n > 0; ) {
101	if (fromUTF8(data, n) == s_invalid) {
102	return false;
103	}
104	}
105	return true;
106	}
107
108	CString
109	CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
110	{
111	// default to success
112	resetError(errors);
113
114	// get size of input string and reserve some space in output
115	UInt32 n = src.size();
116	CString dst;
117	dst.reserve(2 * n);
118
119	// convert each character
120	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
121	while (n > 0) {
122	UInt32 c = fromUTF8(data, n);
123	if (c == s_invalid) {
124	c = s_replacement;
125	}
126	else if (c >= 0x00010000) {
127	setError(errors);
128	c = s_replacement;
129	}
130	UInt16 ucs2 = static_cast<UInt16>(c);
131	dst.append(reinterpret_cast<const char*>(&ucs2), 2);
132	}
133
134	return dst;
135	}
136
137	CString
138	CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
139	{
140	// default to success
141	resetError(errors);
142
143	// get size of input string and reserve some space in output
144	UInt32 n = src.size();
145	CString dst;
146	dst.reserve(4 * n);
147
148	// convert each character
149	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
150	while (n > 0) {
151	UInt32 c = fromUTF8(data, n);
152	if (c == s_invalid) {
153	c = s_replacement;
154	}
155	dst.append(reinterpret_cast<const char*>(&c), 4);
156	}
157
158	return dst;
159	}
160
161	CString
162	CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
163	{
164	// default to success
165	resetError(errors);
166
167	// get size of input string and reserve some space in output
168	UInt32 n = src.size();
169	CString dst;
170	dst.reserve(2 * n);
171
172	// convert each character
173	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
174	while (n > 0) {
175	UInt32 c = fromUTF8(data, n);
176	if (c == s_invalid) {
177	c = s_replacement;
178	}
179	else if (c >= 0x00110000) {
180	setError(errors);
181	c = s_replacement;
182	}
183	if (c < 0x00010000) {
184	UInt16 ucs2 = static_cast<UInt16>(c);
185	dst.append(reinterpret_cast<const char*>(&ucs2), 2);
186	}
187	else {
188	c -= 0x00010000;
189	UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
190	UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
191	dst.append(reinterpret_cast<const char*>(&utf16h), 2);
192	dst.append(reinterpret_cast<const char*>(&utf16l), 2);
193	}
194	}
195
196	return dst;
197	}
198
199	CString
200	CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
201	{
202	// default to success
203	resetError(errors);
204
205	// get size of input string and reserve some space in output
206	UInt32 n = src.size();
207	CString dst;
208	dst.reserve(4 * n);
209
210	// convert each character
211	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
212	while (n > 0) {
213	UInt32 c = fromUTF8(data, n);
214	if (c == s_invalid) {
215	c = s_replacement;
216	}
217	else if (c >= 0x00110000) {
218	setError(errors);
219	c = s_replacement;
220	}
221	dst.append(reinterpret_cast<const char*>(&c), 4);
222	}
223
224	return dst;
225	}
226
227	CString
228	CUnicode::UTF8ToText(const CString& src, bool* errors)
229	{
230	// default to success
231	resetError(errors);
232
233	// convert to wide char
234	UInt32 size;
235	wchar_t* tmp = UTF8ToWideChar(src, size, errors);
236
237	// convert string to multibyte
238	int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
239	char* mbs = new char[len + 1];
240	ARCH->convStringWCToMB(mbs, tmp, size, errors);
241	CString text(mbs, len);
242
243	// clean up
244	delete[] mbs;
245	delete[] tmp;
246
247	return text;
248	}
249
250	CString
251	CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
252	{
253	// default to success
254	resetError(errors);
255
256	// convert
257	UInt32 n = src.size() >> 1;
258	return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
259	}
260
261	CString
262	CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
263	{
264	// default to success
265	resetError(errors);
266
267	// convert
268	UInt32 n = src.size() >> 2;
269	return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
270	}
271
272	CString
273	CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
274	{
275	// default to success
276	resetError(errors);
277
278	// convert
279	UInt32 n = src.size() >> 1;
280	return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
281	}
282
283	CString
284	CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
285	{
286	// default to success
287	resetError(errors);
288
289	// convert
290	UInt32 n = src.size() >> 2;
291	return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
292	}
293
294	CString
295	CUnicode::textToUTF8(const CString& src, bool* errors)
296	{
297	// default to success
298	resetError(errors);
299
300	// convert string to wide characters
301	UInt32 n = src.size();
302	int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
303	wchar_t* wcs = new wchar_t[len + 1];
304	ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
305
306	// convert to UTF8
307	CString utf8 = wideCharToUTF8(wcs, len, errors);
308
309	// clean up
310	delete[] wcs;
311
312	return utf8;
313	}
314
315	wchar_t*
316	CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
317	{
318	// convert to platform's wide character encoding
319	CString tmp;
320	switch (ARCH->getWideCharEncoding()) {
321	case IArchString::kUCS2:
322	tmp = UTF8ToUCS2(src, errors);
323	size = tmp.size() >> 1;
324	break;
325
326	case IArchString::kUCS4:
327	tmp = UTF8ToUCS4(src, errors);
328	size = tmp.size() >> 2;
329	break;
330
331	case IArchString::kUTF16:
332	tmp = UTF8ToUTF16(src, errors);
333	size = tmp.size() >> 1;
334	break;
335
336	case IArchString::kUTF32:
337	tmp = UTF8ToUTF32(src, errors);
338	size = tmp.size() >> 2;
339	break;
340
341	default:
342	assert(0 && "unknown wide character encoding");
343	}
344
345	// copy to a wchar_t array
346	wchar_t* dst = new wchar_t[size];
347	::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
348	return dst;
349	}
350
351	CString
352	CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
353	{
354	// convert from platform's wide character encoding.
355	// note -- this must include a wide nul character (independent of
356	// the CString's nul character).
357	switch (ARCH->getWideCharEncoding()) {
358	case IArchString::kUCS2:
359	return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
360
361	case IArchString::kUCS4:
362	return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
363
364	case IArchString::kUTF16:
365	return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
366
367	case IArchString::kUTF32:
368	return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
369
370	default:
371	assert(0 && "unknown wide character encoding");
372	return CString();
373	}
374	}
375
376	CString
377	CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
378	{
379	// make some space
380	CString dst;
381	dst.reserve(n);
382
383	// check if first character is 0xfffe or 0xfeff
384	bool byteSwapped = false;
385	if (n >= 1) {
386	switch (decode16(data, false)) {
387	case 0x0000feff:
388	data += 2;
389	--n;
390	break;
391
392	case 0x0000fffe:
393	byteSwapped = true;
394	data += 2;
395	--n;
396	break;
397
398	default:
399	break;
400	}
401	}
402
403	// convert each character
404	for (; n > 0; data += 2, --n) {
405	UInt32 c = decode16(data, byteSwapped);
406	toUTF8(dst, c, errors);
407	}
408
409	return dst;
410	}
411
412	CString
413	CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
414	{
415	// make some space
416	CString dst;
417	dst.reserve(n);
418
419	// check if first character is 0xfffe or 0xfeff
420	bool byteSwapped = false;
421	if (n >= 1) {
422	switch (decode32(data, false)) {
423	case 0x0000feff:
424	data += 4;
425	--n;
426	break;
427
428	case 0x0000fffe:
429	byteSwapped = true;
430	data += 4;
431	--n;
432	break;
433
434	default:
435	break;
436	}
437	}
438
439	// convert each character
440	for (; n > 0; data += 4, --n) {
441	UInt32 c = decode32(data, byteSwapped);
442	toUTF8(dst, c, errors);
443	}
444
445	return dst;
446	}
447
448	CString
449	CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
450	{
451	// make some space
452	CString dst;
453	dst.reserve(n);
454
455	// check if first character is 0xfffe or 0xfeff
456	bool byteSwapped = false;
457	if (n >= 1) {
458	switch (decode16(data, false)) {
459	case 0x0000feff:
460	data += 2;
461	--n;
462	break;
463
464	case 0x0000fffe:
465	byteSwapped = true;
466	data += 2;
467	--n;
468	break;
469
470	default:
471	break;
472	}
473	}
474
475	// convert each character
476	for (; n > 0; data += 2, --n) {
477	UInt32 c = decode16(data, byteSwapped);
478	if (c < 0x0000d800 \|\| c > 0x0000dfff) {
479	toUTF8(dst, c, errors);
480	}
481	else if (n == 1) {
482	// error -- missing second word
483	setError(errors);
484	toUTF8(dst, s_replacement, NULL);
485	}
486	else if (c >= 0x0000d800 && c <= 0x0000dbff) {
487	UInt32 c2 = decode16(data, byteSwapped);
488	data += 2;
489	--n;
490	if (c2 < 0x0000dc00 \|\| c2 > 0x0000dfff) {
491	// error -- [d800,dbff] not followed by [dc00,dfff]
492	setError(errors);
493	toUTF8(dst, s_replacement, NULL);
494	}
495	else {
496	c = (((c - 0x0000d800) << 10) \| (c2 - 0x0000dc00)) + 0x00010000;
497	toUTF8(dst, c, errors);
498	}
499	}
500	else {
501	// error -- [dc00,dfff] without leading [d800,dbff]
502	setError(errors);
503	toUTF8(dst, s_replacement, NULL);
504	}
505	}
506
507	return dst;
508	}
509
510	CString
511	CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
512	{
513	// make some space
514	CString dst;
515	dst.reserve(n);
516
517	// check if first character is 0xfffe or 0xfeff
518	bool byteSwapped = false;
519	if (n >= 1) {
520	switch (decode32(data, false)) {
521	case 0x0000feff:
522	data += 4;
523	--n;
524	break;
525
526	case 0x0000fffe:
527	byteSwapped = true;
528	data += 4;
529	--n;
530	break;
531
532	default:
533	break;
534	}
535	}
536
537	// convert each character
538	for (; n > 0; data += 4, --n) {
539	UInt32 c = decode32(data, byteSwapped);
540	if (c >= 0x00110000) {
541	setError(errors);
542	c = s_replacement;
543	}
544	toUTF8(dst, c, errors);
545	}
546
547	return dst;
548	}
549
550	UInt32
551	CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
552	{
553	assert(data != NULL);
554	assert(n != 0);
555
556	// compute character encoding length, checking for overlong
557	// sequences (i.e. characters that don't use the shortest
558	// possible encoding).
559	UInt32 size;
560	if (data[0] < 0x80) {
561	// 0xxxxxxx
562	size = 1;
563	}
564	else if (data[0] < 0xc0) {
565	// 10xxxxxx -- in the middle of a multibyte character. counts
566	// as one invalid character.
567	--n;
568	++data;
569	return s_invalid;
570	}
571	else if (data[0] < 0xe0) {
572	// 110xxxxx
573	size = 2;
574	}
575	else if (data[0] < 0xf0) {
576	// 1110xxxx
577	size = 3;
578	}
579	else if (data[0] < 0xf8) {
580	// 11110xxx
581	size = 4;
582	}
583	else if (data[0] < 0xfc) {
584	// 111110xx
585	size = 5;
586	}
587	else if (data[0] < 0xfe) {
588	// 1111110x
589	size = 6;
590	}
591	else {
592	// invalid sequence. dunno how many bytes to skip so skip one.
593	--n;
594	++data;
595	return s_invalid;
596	}
597
598	// make sure we have enough data
599	if (size > n) {
600	data += n;
601	n = 0;
602	return s_invalid;
603	}
604
605	// extract character
606	UInt32 c;
607	switch (size) {
608	case 1:
609	c = static_cast<UInt32>(data[0]);
610	break;
611
612	case 2:
613	c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) \|
614	((static_cast<UInt32>(data[1]) & 0x3f) );
615	break;
616
617	case 3:
618	c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) \|
619	((static_cast<UInt32>(data[1]) & 0x3f) << 6) \|
620	((static_cast<UInt32>(data[2]) & 0x3f) );
621	break;
622
623	case 4:
624	c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) \|
625	((static_cast<UInt32>(data[1]) & 0x3f) << 12) \|
626	((static_cast<UInt32>(data[1]) & 0x3f) << 6) \|
627	((static_cast<UInt32>(data[1]) & 0x3f) );
628	break;
629
630	case 5:
631	c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) \|
632	((static_cast<UInt32>(data[1]) & 0x3f) << 18) \|
633	((static_cast<UInt32>(data[1]) & 0x3f) << 12) \|
634	((static_cast<UInt32>(data[1]) & 0x3f) << 6) \|
635	((static_cast<UInt32>(data[1]) & 0x3f) );
636	break;
637
638	case 6:
639	c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) \|
640	((static_cast<UInt32>(data[1]) & 0x3f) << 24) \|
641	((static_cast<UInt32>(data[1]) & 0x3f) << 18) \|
642	((static_cast<UInt32>(data[1]) & 0x3f) << 12) \|
643	((static_cast<UInt32>(data[1]) & 0x3f) << 6) \|
644	((static_cast<UInt32>(data[1]) & 0x3f) );
645	break;
646
647	default:
648	assert(0 && "invalid size");
649	return s_invalid;
650	}
651
652	// check that all bytes after the first have the pattern 10xxxxxx.
653	// truncated sequences are treated as a single malformed character.
654	bool truncated = false;
655	switch (size) {
656	case 6:
657	if ((data[5] & 0xc0) != 0x80) {
658	truncated = true;
659	size = 5;
660	}
661	// fall through
662
663	case 5:
664	if ((data[4] & 0xc0) != 0x80) {
665	truncated = true;
666	size = 4;
667	}
668	// fall through
669
670	case 4:
671	if ((data[3] & 0xc0) != 0x80) {
672	truncated = true;
673	size = 3;
674	}
675	// fall through
676
677	case 3:
678	if ((data[2] & 0xc0) != 0x80) {
679	truncated = true;
680	size = 2;
681	}
682	// fall through
683
684	case 2:
685	if ((data[1] & 0xc0) != 0x80) {
686	truncated = true;
687	size = 1;
688	}
689	}
690
691	// update parameters
692	data += size;
693	n -= size;
694
695	// invalid if sequence was truncated
696	if (truncated) {
697	return s_invalid;
698	}
699
700	// check for characters that didn't use the smallest possible encoding
701	static UInt32 s_minChar[] = {
702	0,
703	0x00000000,
704	0x00000080,
705	0x00000800,
706	0x00010000,
707	0x00200000,
708	0x04000000
709	};
710	if (c < s_minChar[size]) {
711	return s_invalid;
712	}
713
714	// check for characters not in ISO-10646
715	if (c >= 0x0000d800 && c <= 0x0000dfff) {
716	return s_invalid;
717	}
718	if (c >= 0x0000fffe && c <= 0x0000ffff) {
719	return s_invalid;
720	}
721
722	return c;
723	}
724
725	void
726	CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
727	{
728	UInt8 data[6];
729
730	// handle characters outside the valid range
731	if ((c >= 0x0000d800 && c <= 0x0000dfff) \|\| c >= 0x80000000) {
732	setError(errors);
733	c = s_replacement;
734	}
735
736	// convert to UTF-8
737	if (c < 0x00000080) {
738	data[0] = static_cast<UInt8>(c);
739	dst.append(reinterpret_cast<char*>(data), 1);
740	}
741	else if (c < 0x00000800) {
742	data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
743	data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
744	dst.append(reinterpret_cast<char*>(data), 2);
745	}
746	else if (c < 0x00010000) {
747	data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
748	data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
749	data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
750	dst.append(reinterpret_cast<char*>(data), 3);
751	}
752	else if (c < 0x00200000) {
753	data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
754	data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
755	data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
756	data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
757	dst.append(reinterpret_cast<char*>(data), 4);
758	}
759	else if (c < 0x04000000) {
760	data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
761	data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
762	data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
763	data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
764	data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
765	dst.append(reinterpret_cast<char*>(data), 5);
766	}
767	else if (c < 0x80000000) {
768	data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
769	data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
770	data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
771	data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
772	data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
773	data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
774	dst.append(reinterpret_cast<char*>(data), 6);
775	}
776	else {
777	assert(0 && "character out of range");
778	}
779	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/synergy/lib/base/CUnicode.cpp@ 3848

Download in other formats: