source: trunk/synergy/lib/base/CUnicode.cpp@ 3885

Last change on this file since 3885 was 2749, checked in by bird, 19 years ago

synergy v1.3.1 sources (zip).

File size: 16.0 KB
Line 
1/*
2 * synergy -- mouse and keyboard sharing utility
3 * Copyright (C) 2002 Chris Schoeneman
4 *
5 * This package is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * found in the file COPYING that should have accompanied this file.
8 *
9 * This package is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14
15#include "CUnicode.h"
16#include "CArch.h"
17#include <string.h>
18
19//
20// local utility functions
21//
22
23inline
24static
25UInt16
26decode16(const UInt8* n, bool byteSwapped)
27{
28 union x16 {
29 UInt8 n8[2];
30 UInt16 n16;
31 } c;
32 if (byteSwapped) {
33 c.n8[0] = n[1];
34 c.n8[1] = n[0];
35 }
36 else {
37 c.n8[0] = n[0];
38 c.n8[1] = n[1];
39 }
40 return c.n16;
41}
42
43inline
44static
45UInt32
46decode32(const UInt8* n, bool byteSwapped)
47{
48 union x32 {
49 UInt8 n8[4];
50 UInt32 n32;
51 } c;
52 if (byteSwapped) {
53 c.n8[0] = n[3];
54 c.n8[1] = n[2];
55 c.n8[2] = n[1];
56 c.n8[3] = n[0];
57 }
58 else {
59 c.n8[0] = n[0];
60 c.n8[1] = n[1];
61 c.n8[2] = n[2];
62 c.n8[3] = n[3];
63 }
64 return c.n32;
65}
66
inline
static
void
resetError(bool* errors)
{
	// clear the caller's error flag.  a NULL pointer means the caller
	// isn't interested in errors, so there's nothing to do.
	if (errors == NULL) {
		return;
	}
	*errors = false;
}
76
inline
static
void
setError(bool* errors)
{
	// raise the caller's error flag.  a NULL pointer means the caller
	// isn't interested in errors, so there's nothing to do.
	if (errors == NULL) {
		return;
	}
	*errors = true;
}
86
87
88//
89// CUnicode
90//
91
92UInt32 CUnicode::s_invalid = 0x0000ffff;
93UInt32 CUnicode::s_replacement = 0x0000fffd;
94
95bool
96CUnicode::isUTF8(const CString& src)
97{
98 // convert and test each character
99 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
100 for (UInt32 n = src.size(); n > 0; ) {
101 if (fromUTF8(data, n) == s_invalid) {
102 return false;
103 }
104 }
105 return true;
106}
107
108CString
109CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
110{
111 // default to success
112 resetError(errors);
113
114 // get size of input string and reserve some space in output
115 UInt32 n = src.size();
116 CString dst;
117 dst.reserve(2 * n);
118
119 // convert each character
120 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
121 while (n > 0) {
122 UInt32 c = fromUTF8(data, n);
123 if (c == s_invalid) {
124 c = s_replacement;
125 }
126 else if (c >= 0x00010000) {
127 setError(errors);
128 c = s_replacement;
129 }
130 UInt16 ucs2 = static_cast<UInt16>(c);
131 dst.append(reinterpret_cast<const char*>(&ucs2), 2);
132 }
133
134 return dst;
135}
136
137CString
138CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
139{
140 // default to success
141 resetError(errors);
142
143 // get size of input string and reserve some space in output
144 UInt32 n = src.size();
145 CString dst;
146 dst.reserve(4 * n);
147
148 // convert each character
149 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
150 while (n > 0) {
151 UInt32 c = fromUTF8(data, n);
152 if (c == s_invalid) {
153 c = s_replacement;
154 }
155 dst.append(reinterpret_cast<const char*>(&c), 4);
156 }
157
158 return dst;
159}
160
161CString
162CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
163{
164 // default to success
165 resetError(errors);
166
167 // get size of input string and reserve some space in output
168 UInt32 n = src.size();
169 CString dst;
170 dst.reserve(2 * n);
171
172 // convert each character
173 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
174 while (n > 0) {
175 UInt32 c = fromUTF8(data, n);
176 if (c == s_invalid) {
177 c = s_replacement;
178 }
179 else if (c >= 0x00110000) {
180 setError(errors);
181 c = s_replacement;
182 }
183 if (c < 0x00010000) {
184 UInt16 ucs2 = static_cast<UInt16>(c);
185 dst.append(reinterpret_cast<const char*>(&ucs2), 2);
186 }
187 else {
188 c -= 0x00010000;
189 UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
190 UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
191 dst.append(reinterpret_cast<const char*>(&utf16h), 2);
192 dst.append(reinterpret_cast<const char*>(&utf16l), 2);
193 }
194 }
195
196 return dst;
197}
198
199CString
200CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
201{
202 // default to success
203 resetError(errors);
204
205 // get size of input string and reserve some space in output
206 UInt32 n = src.size();
207 CString dst;
208 dst.reserve(4 * n);
209
210 // convert each character
211 const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
212 while (n > 0) {
213 UInt32 c = fromUTF8(data, n);
214 if (c == s_invalid) {
215 c = s_replacement;
216 }
217 else if (c >= 0x00110000) {
218 setError(errors);
219 c = s_replacement;
220 }
221 dst.append(reinterpret_cast<const char*>(&c), 4);
222 }
223
224 return dst;
225}
226
227CString
228CUnicode::UTF8ToText(const CString& src, bool* errors)
229{
230 // default to success
231 resetError(errors);
232
233 // convert to wide char
234 UInt32 size;
235 wchar_t* tmp = UTF8ToWideChar(src, size, errors);
236
237 // convert string to multibyte
238 int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
239 char* mbs = new char[len + 1];
240 ARCH->convStringWCToMB(mbs, tmp, size, errors);
241 CString text(mbs, len);
242
243 // clean up
244 delete[] mbs;
245 delete[] tmp;
246
247 return text;
248}
249
250CString
251CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
252{
253 // default to success
254 resetError(errors);
255
256 // convert
257 UInt32 n = src.size() >> 1;
258 return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
259}
260
261CString
262CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
263{
264 // default to success
265 resetError(errors);
266
267 // convert
268 UInt32 n = src.size() >> 2;
269 return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
270}
271
272CString
273CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
274{
275 // default to success
276 resetError(errors);
277
278 // convert
279 UInt32 n = src.size() >> 1;
280 return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
281}
282
283CString
284CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
285{
286 // default to success
287 resetError(errors);
288
289 // convert
290 UInt32 n = src.size() >> 2;
291 return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
292}
293
294CString
295CUnicode::textToUTF8(const CString& src, bool* errors)
296{
297 // default to success
298 resetError(errors);
299
300 // convert string to wide characters
301 UInt32 n = src.size();
302 int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
303 wchar_t* wcs = new wchar_t[len + 1];
304 ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
305
306 // convert to UTF8
307 CString utf8 = wideCharToUTF8(wcs, len, errors);
308
309 // clean up
310 delete[] wcs;
311
312 return utf8;
313}
314
315wchar_t*
316CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
317{
318 // convert to platform's wide character encoding
319 CString tmp;
320 switch (ARCH->getWideCharEncoding()) {
321 case IArchString::kUCS2:
322 tmp = UTF8ToUCS2(src, errors);
323 size = tmp.size() >> 1;
324 break;
325
326 case IArchString::kUCS4:
327 tmp = UTF8ToUCS4(src, errors);
328 size = tmp.size() >> 2;
329 break;
330
331 case IArchString::kUTF16:
332 tmp = UTF8ToUTF16(src, errors);
333 size = tmp.size() >> 1;
334 break;
335
336 case IArchString::kUTF32:
337 tmp = UTF8ToUTF32(src, errors);
338 size = tmp.size() >> 2;
339 break;
340
341 default:
342 assert(0 && "unknown wide character encoding");
343 }
344
345 // copy to a wchar_t array
346 wchar_t* dst = new wchar_t[size];
347 ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
348 return dst;
349}
350
351CString
352CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
353{
354 // convert from platform's wide character encoding.
355 // note -- this must include a wide nul character (independent of
356 // the CString's nul character).
357 switch (ARCH->getWideCharEncoding()) {
358 case IArchString::kUCS2:
359 return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
360
361 case IArchString::kUCS4:
362 return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
363
364 case IArchString::kUTF16:
365 return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
366
367 case IArchString::kUTF32:
368 return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
369
370 default:
371 assert(0 && "unknown wide character encoding");
372 return CString();
373 }
374}
375
376CString
377CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
378{
379 // make some space
380 CString dst;
381 dst.reserve(n);
382
383 // check if first character is 0xfffe or 0xfeff
384 bool byteSwapped = false;
385 if (n >= 1) {
386 switch (decode16(data, false)) {
387 case 0x0000feff:
388 data += 2;
389 --n;
390 break;
391
392 case 0x0000fffe:
393 byteSwapped = true;
394 data += 2;
395 --n;
396 break;
397
398 default:
399 break;
400 }
401 }
402
403 // convert each character
404 for (; n > 0; data += 2, --n) {
405 UInt32 c = decode16(data, byteSwapped);
406 toUTF8(dst, c, errors);
407 }
408
409 return dst;
410}
411
412CString
413CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
414{
415 // make some space
416 CString dst;
417 dst.reserve(n);
418
419 // check if first character is 0xfffe or 0xfeff
420 bool byteSwapped = false;
421 if (n >= 1) {
422 switch (decode32(data, false)) {
423 case 0x0000feff:
424 data += 4;
425 --n;
426 break;
427
428 case 0x0000fffe:
429 byteSwapped = true;
430 data += 4;
431 --n;
432 break;
433
434 default:
435 break;
436 }
437 }
438
439 // convert each character
440 for (; n > 0; data += 4, --n) {
441 UInt32 c = decode32(data, byteSwapped);
442 toUTF8(dst, c, errors);
443 }
444
445 return dst;
446}
447
448CString
449CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
450{
451 // make some space
452 CString dst;
453 dst.reserve(n);
454
455 // check if first character is 0xfffe or 0xfeff
456 bool byteSwapped = false;
457 if (n >= 1) {
458 switch (decode16(data, false)) {
459 case 0x0000feff:
460 data += 2;
461 --n;
462 break;
463
464 case 0x0000fffe:
465 byteSwapped = true;
466 data += 2;
467 --n;
468 break;
469
470 default:
471 break;
472 }
473 }
474
475 // convert each character
476 for (; n > 0; data += 2, --n) {
477 UInt32 c = decode16(data, byteSwapped);
478 if (c < 0x0000d800 || c > 0x0000dfff) {
479 toUTF8(dst, c, errors);
480 }
481 else if (n == 1) {
482 // error -- missing second word
483 setError(errors);
484 toUTF8(dst, s_replacement, NULL);
485 }
486 else if (c >= 0x0000d800 && c <= 0x0000dbff) {
487 UInt32 c2 = decode16(data, byteSwapped);
488 data += 2;
489 --n;
490 if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
491 // error -- [d800,dbff] not followed by [dc00,dfff]
492 setError(errors);
493 toUTF8(dst, s_replacement, NULL);
494 }
495 else {
496 c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
497 toUTF8(dst, c, errors);
498 }
499 }
500 else {
501 // error -- [dc00,dfff] without leading [d800,dbff]
502 setError(errors);
503 toUTF8(dst, s_replacement, NULL);
504 }
505 }
506
507 return dst;
508}
509
510CString
511CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
512{
513 // make some space
514 CString dst;
515 dst.reserve(n);
516
517 // check if first character is 0xfffe or 0xfeff
518 bool byteSwapped = false;
519 if (n >= 1) {
520 switch (decode32(data, false)) {
521 case 0x0000feff:
522 data += 4;
523 --n;
524 break;
525
526 case 0x0000fffe:
527 byteSwapped = true;
528 data += 4;
529 --n;
530 break;
531
532 default:
533 break;
534 }
535 }
536
537 // convert each character
538 for (; n > 0; data += 4, --n) {
539 UInt32 c = decode32(data, byteSwapped);
540 if (c >= 0x00110000) {
541 setError(errors);
542 c = s_replacement;
543 }
544 toUTF8(dst, c, errors);
545 }
546
547 return dst;
548}
549
550UInt32
551CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
552{
553 assert(data != NULL);
554 assert(n != 0);
555
556 // compute character encoding length, checking for overlong
557 // sequences (i.e. characters that don't use the shortest
558 // possible encoding).
559 UInt32 size;
560 if (data[0] < 0x80) {
561 // 0xxxxxxx
562 size = 1;
563 }
564 else if (data[0] < 0xc0) {
565 // 10xxxxxx -- in the middle of a multibyte character. counts
566 // as one invalid character.
567 --n;
568 ++data;
569 return s_invalid;
570 }
571 else if (data[0] < 0xe0) {
572 // 110xxxxx
573 size = 2;
574 }
575 else if (data[0] < 0xf0) {
576 // 1110xxxx
577 size = 3;
578 }
579 else if (data[0] < 0xf8) {
580 // 11110xxx
581 size = 4;
582 }
583 else if (data[0] < 0xfc) {
584 // 111110xx
585 size = 5;
586 }
587 else if (data[0] < 0xfe) {
588 // 1111110x
589 size = 6;
590 }
591 else {
592 // invalid sequence. dunno how many bytes to skip so skip one.
593 --n;
594 ++data;
595 return s_invalid;
596 }
597
598 // make sure we have enough data
599 if (size > n) {
600 data += n;
601 n = 0;
602 return s_invalid;
603 }
604
605 // extract character
606 UInt32 c;
607 switch (size) {
608 case 1:
609 c = static_cast<UInt32>(data[0]);
610 break;
611
612 case 2:
613 c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) |
614 ((static_cast<UInt32>(data[1]) & 0x3f) );
615 break;
616
617 case 3:
618 c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) |
619 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
620 ((static_cast<UInt32>(data[2]) & 0x3f) );
621 break;
622
623 case 4:
624 c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) |
625 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
626 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
627 ((static_cast<UInt32>(data[1]) & 0x3f) );
628 break;
629
630 case 5:
631 c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) |
632 ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
633 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
634 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
635 ((static_cast<UInt32>(data[1]) & 0x3f) );
636 break;
637
638 case 6:
639 c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) |
640 ((static_cast<UInt32>(data[1]) & 0x3f) << 24) |
641 ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
642 ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
643 ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
644 ((static_cast<UInt32>(data[1]) & 0x3f) );
645 break;
646
647 default:
648 assert(0 && "invalid size");
649 return s_invalid;
650 }
651
652 // check that all bytes after the first have the pattern 10xxxxxx.
653 // truncated sequences are treated as a single malformed character.
654 bool truncated = false;
655 switch (size) {
656 case 6:
657 if ((data[5] & 0xc0) != 0x80) {
658 truncated = true;
659 size = 5;
660 }
661 // fall through
662
663 case 5:
664 if ((data[4] & 0xc0) != 0x80) {
665 truncated = true;
666 size = 4;
667 }
668 // fall through
669
670 case 4:
671 if ((data[3] & 0xc0) != 0x80) {
672 truncated = true;
673 size = 3;
674 }
675 // fall through
676
677 case 3:
678 if ((data[2] & 0xc0) != 0x80) {
679 truncated = true;
680 size = 2;
681 }
682 // fall through
683
684 case 2:
685 if ((data[1] & 0xc0) != 0x80) {
686 truncated = true;
687 size = 1;
688 }
689 }
690
691 // update parameters
692 data += size;
693 n -= size;
694
695 // invalid if sequence was truncated
696 if (truncated) {
697 return s_invalid;
698 }
699
700 // check for characters that didn't use the smallest possible encoding
701 static UInt32 s_minChar[] = {
702 0,
703 0x00000000,
704 0x00000080,
705 0x00000800,
706 0x00010000,
707 0x00200000,
708 0x04000000
709 };
710 if (c < s_minChar[size]) {
711 return s_invalid;
712 }
713
714 // check for characters not in ISO-10646
715 if (c >= 0x0000d800 && c <= 0x0000dfff) {
716 return s_invalid;
717 }
718 if (c >= 0x0000fffe && c <= 0x0000ffff) {
719 return s_invalid;
720 }
721
722 return c;
723}
724
725void
726CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
727{
728 UInt8 data[6];
729
730 // handle characters outside the valid range
731 if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
732 setError(errors);
733 c = s_replacement;
734 }
735
736 // convert to UTF-8
737 if (c < 0x00000080) {
738 data[0] = static_cast<UInt8>(c);
739 dst.append(reinterpret_cast<char*>(data), 1);
740 }
741 else if (c < 0x00000800) {
742 data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
743 data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
744 dst.append(reinterpret_cast<char*>(data), 2);
745 }
746 else if (c < 0x00010000) {
747 data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
748 data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
749 data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
750 dst.append(reinterpret_cast<char*>(data), 3);
751 }
752 else if (c < 0x00200000) {
753 data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
754 data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
755 data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
756 data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
757 dst.append(reinterpret_cast<char*>(data), 4);
758 }
759 else if (c < 0x04000000) {
760 data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
761 data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
762 data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
763 data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
764 data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
765 dst.append(reinterpret_cast<char*>(data), 5);
766 }
767 else if (c < 0x80000000) {
768 data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
769 data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
770 data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
771 data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
772 data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
773 data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
774 dst.append(reinterpret_cast<char*>(data), 6);
775 }
776 else {
777 assert(0 && "character out of range");
778 }
779}
Note: See TracBrowser for help on using the repository browser.