1 | /*
|
---|
2 | * synergy -- mouse and keyboard sharing utility
|
---|
3 | * Copyright (C) 2002 Chris Schoeneman
|
---|
4 | *
|
---|
5 | * This package is free software; you can redistribute it and/or
|
---|
6 | * modify it under the terms of the GNU General Public License
|
---|
7 | * found in the file COPYING that should have accompanied this file.
|
---|
8 | *
|
---|
9 | * This package is distributed in the hope that it will be useful,
|
---|
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
12 | * GNU General Public License for more details.
|
---|
13 | */
|
---|
14 |
|
---|
15 | #include "CUnicode.h"
|
---|
16 | #include "CArch.h"
|
---|
17 | #include <string.h>
|
---|
18 |
|
---|
19 | //
|
---|
20 | // local utility functions
|
---|
21 | //
|
---|
22 |
|
---|
23 | inline
|
---|
24 | static
|
---|
25 | UInt16
|
---|
26 | decode16(const UInt8* n, bool byteSwapped)
|
---|
27 | {
|
---|
28 | union x16 {
|
---|
29 | UInt8 n8[2];
|
---|
30 | UInt16 n16;
|
---|
31 | } c;
|
---|
32 | if (byteSwapped) {
|
---|
33 | c.n8[0] = n[1];
|
---|
34 | c.n8[1] = n[0];
|
---|
35 | }
|
---|
36 | else {
|
---|
37 | c.n8[0] = n[0];
|
---|
38 | c.n8[1] = n[1];
|
---|
39 | }
|
---|
40 | return c.n16;
|
---|
41 | }
|
---|
42 |
|
---|
43 | inline
|
---|
44 | static
|
---|
45 | UInt32
|
---|
46 | decode32(const UInt8* n, bool byteSwapped)
|
---|
47 | {
|
---|
48 | union x32 {
|
---|
49 | UInt8 n8[4];
|
---|
50 | UInt32 n32;
|
---|
51 | } c;
|
---|
52 | if (byteSwapped) {
|
---|
53 | c.n8[0] = n[3];
|
---|
54 | c.n8[1] = n[2];
|
---|
55 | c.n8[2] = n[1];
|
---|
56 | c.n8[3] = n[0];
|
---|
57 | }
|
---|
58 | else {
|
---|
59 | c.n8[0] = n[0];
|
---|
60 | c.n8[1] = n[1];
|
---|
61 | c.n8[2] = n[2];
|
---|
62 | c.n8[3] = n[3];
|
---|
63 | }
|
---|
64 | return c.n32;
|
---|
65 | }
|
---|
66 |
|
---|
// clear the caller's error flag.  a NULL pointer means the caller
// doesn't want error reporting and is ignored.
inline
static
void
resetError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = false;
}
|
---|
76 |
|
---|
// raise the caller's error flag.  a NULL pointer means the caller
// doesn't want error reporting and is ignored.
inline
static
void
setError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = true;
}
|
---|
86 |
|
---|
87 |
|
---|
88 | //
|
---|
89 | // CUnicode
|
---|
90 | //
|
---|
91 |
|
---|
92 | UInt32 CUnicode::s_invalid = 0x0000ffff;
|
---|
93 | UInt32 CUnicode::s_replacement = 0x0000fffd;
|
---|
94 |
|
---|
95 | bool
|
---|
96 | CUnicode::isUTF8(const CString& src)
|
---|
97 | {
|
---|
98 | // convert and test each character
|
---|
99 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
---|
100 | for (UInt32 n = src.size(); n > 0; ) {
|
---|
101 | if (fromUTF8(data, n) == s_invalid) {
|
---|
102 | return false;
|
---|
103 | }
|
---|
104 | }
|
---|
105 | return true;
|
---|
106 | }
|
---|
107 |
|
---|
108 | CString
|
---|
109 | CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
|
---|
110 | {
|
---|
111 | // default to success
|
---|
112 | resetError(errors);
|
---|
113 |
|
---|
114 | // get size of input string and reserve some space in output
|
---|
115 | UInt32 n = src.size();
|
---|
116 | CString dst;
|
---|
117 | dst.reserve(2 * n);
|
---|
118 |
|
---|
119 | // convert each character
|
---|
120 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
---|
121 | while (n > 0) {
|
---|
122 | UInt32 c = fromUTF8(data, n);
|
---|
123 | if (c == s_invalid) {
|
---|
124 | c = s_replacement;
|
---|
125 | }
|
---|
126 | else if (c >= 0x00010000) {
|
---|
127 | setError(errors);
|
---|
128 | c = s_replacement;
|
---|
129 | }
|
---|
130 | UInt16 ucs2 = static_cast<UInt16>(c);
|
---|
131 | dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
---|
132 | }
|
---|
133 |
|
---|
134 | return dst;
|
---|
135 | }
|
---|
136 |
|
---|
137 | CString
|
---|
138 | CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
|
---|
139 | {
|
---|
140 | // default to success
|
---|
141 | resetError(errors);
|
---|
142 |
|
---|
143 | // get size of input string and reserve some space in output
|
---|
144 | UInt32 n = src.size();
|
---|
145 | CString dst;
|
---|
146 | dst.reserve(4 * n);
|
---|
147 |
|
---|
148 | // convert each character
|
---|
149 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
---|
150 | while (n > 0) {
|
---|
151 | UInt32 c = fromUTF8(data, n);
|
---|
152 | if (c == s_invalid) {
|
---|
153 | c = s_replacement;
|
---|
154 | }
|
---|
155 | dst.append(reinterpret_cast<const char*>(&c), 4);
|
---|
156 | }
|
---|
157 |
|
---|
158 | return dst;
|
---|
159 | }
|
---|
160 |
|
---|
161 | CString
|
---|
162 | CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
|
---|
163 | {
|
---|
164 | // default to success
|
---|
165 | resetError(errors);
|
---|
166 |
|
---|
167 | // get size of input string and reserve some space in output
|
---|
168 | UInt32 n = src.size();
|
---|
169 | CString dst;
|
---|
170 | dst.reserve(2 * n);
|
---|
171 |
|
---|
172 | // convert each character
|
---|
173 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
---|
174 | while (n > 0) {
|
---|
175 | UInt32 c = fromUTF8(data, n);
|
---|
176 | if (c == s_invalid) {
|
---|
177 | c = s_replacement;
|
---|
178 | }
|
---|
179 | else if (c >= 0x00110000) {
|
---|
180 | setError(errors);
|
---|
181 | c = s_replacement;
|
---|
182 | }
|
---|
183 | if (c < 0x00010000) {
|
---|
184 | UInt16 ucs2 = static_cast<UInt16>(c);
|
---|
185 | dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
---|
186 | }
|
---|
187 | else {
|
---|
188 | c -= 0x00010000;
|
---|
189 | UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
|
---|
190 | UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
|
---|
191 | dst.append(reinterpret_cast<const char*>(&utf16h), 2);
|
---|
192 | dst.append(reinterpret_cast<const char*>(&utf16l), 2);
|
---|
193 | }
|
---|
194 | }
|
---|
195 |
|
---|
196 | return dst;
|
---|
197 | }
|
---|
198 |
|
---|
199 | CString
|
---|
200 | CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
|
---|
201 | {
|
---|
202 | // default to success
|
---|
203 | resetError(errors);
|
---|
204 |
|
---|
205 | // get size of input string and reserve some space in output
|
---|
206 | UInt32 n = src.size();
|
---|
207 | CString dst;
|
---|
208 | dst.reserve(4 * n);
|
---|
209 |
|
---|
210 | // convert each character
|
---|
211 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
---|
212 | while (n > 0) {
|
---|
213 | UInt32 c = fromUTF8(data, n);
|
---|
214 | if (c == s_invalid) {
|
---|
215 | c = s_replacement;
|
---|
216 | }
|
---|
217 | else if (c >= 0x00110000) {
|
---|
218 | setError(errors);
|
---|
219 | c = s_replacement;
|
---|
220 | }
|
---|
221 | dst.append(reinterpret_cast<const char*>(&c), 4);
|
---|
222 | }
|
---|
223 |
|
---|
224 | return dst;
|
---|
225 | }
|
---|
226 |
|
---|
227 | CString
|
---|
228 | CUnicode::UTF8ToText(const CString& src, bool* errors)
|
---|
229 | {
|
---|
230 | // default to success
|
---|
231 | resetError(errors);
|
---|
232 |
|
---|
233 | // convert to wide char
|
---|
234 | UInt32 size;
|
---|
235 | wchar_t* tmp = UTF8ToWideChar(src, size, errors);
|
---|
236 |
|
---|
237 | // convert string to multibyte
|
---|
238 | int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
|
---|
239 | char* mbs = new char[len + 1];
|
---|
240 | ARCH->convStringWCToMB(mbs, tmp, size, errors);
|
---|
241 | CString text(mbs, len);
|
---|
242 |
|
---|
243 | // clean up
|
---|
244 | delete[] mbs;
|
---|
245 | delete[] tmp;
|
---|
246 |
|
---|
247 | return text;
|
---|
248 | }
|
---|
249 |
|
---|
250 | CString
|
---|
251 | CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
|
---|
252 | {
|
---|
253 | // default to success
|
---|
254 | resetError(errors);
|
---|
255 |
|
---|
256 | // convert
|
---|
257 | UInt32 n = src.size() >> 1;
|
---|
258 | return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
---|
259 | }
|
---|
260 |
|
---|
261 | CString
|
---|
262 | CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
|
---|
263 | {
|
---|
264 | // default to success
|
---|
265 | resetError(errors);
|
---|
266 |
|
---|
267 | // convert
|
---|
268 | UInt32 n = src.size() >> 2;
|
---|
269 | return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
---|
270 | }
|
---|
271 |
|
---|
272 | CString
|
---|
273 | CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
|
---|
274 | {
|
---|
275 | // default to success
|
---|
276 | resetError(errors);
|
---|
277 |
|
---|
278 | // convert
|
---|
279 | UInt32 n = src.size() >> 1;
|
---|
280 | return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
---|
281 | }
|
---|
282 |
|
---|
283 | CString
|
---|
284 | CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
|
---|
285 | {
|
---|
286 | // default to success
|
---|
287 | resetError(errors);
|
---|
288 |
|
---|
289 | // convert
|
---|
290 | UInt32 n = src.size() >> 2;
|
---|
291 | return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
---|
292 | }
|
---|
293 |
|
---|
294 | CString
|
---|
295 | CUnicode::textToUTF8(const CString& src, bool* errors)
|
---|
296 | {
|
---|
297 | // default to success
|
---|
298 | resetError(errors);
|
---|
299 |
|
---|
300 | // convert string to wide characters
|
---|
301 | UInt32 n = src.size();
|
---|
302 | int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
|
---|
303 | wchar_t* wcs = new wchar_t[len + 1];
|
---|
304 | ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
|
---|
305 |
|
---|
306 | // convert to UTF8
|
---|
307 | CString utf8 = wideCharToUTF8(wcs, len, errors);
|
---|
308 |
|
---|
309 | // clean up
|
---|
310 | delete[] wcs;
|
---|
311 |
|
---|
312 | return utf8;
|
---|
313 | }
|
---|
314 |
|
---|
315 | wchar_t*
|
---|
316 | CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
|
---|
317 | {
|
---|
318 | // convert to platform's wide character encoding
|
---|
319 | CString tmp;
|
---|
320 | switch (ARCH->getWideCharEncoding()) {
|
---|
321 | case IArchString::kUCS2:
|
---|
322 | tmp = UTF8ToUCS2(src, errors);
|
---|
323 | size = tmp.size() >> 1;
|
---|
324 | break;
|
---|
325 |
|
---|
326 | case IArchString::kUCS4:
|
---|
327 | tmp = UTF8ToUCS4(src, errors);
|
---|
328 | size = tmp.size() >> 2;
|
---|
329 | break;
|
---|
330 |
|
---|
331 | case IArchString::kUTF16:
|
---|
332 | tmp = UTF8ToUTF16(src, errors);
|
---|
333 | size = tmp.size() >> 1;
|
---|
334 | break;
|
---|
335 |
|
---|
336 | case IArchString::kUTF32:
|
---|
337 | tmp = UTF8ToUTF32(src, errors);
|
---|
338 | size = tmp.size() >> 2;
|
---|
339 | break;
|
---|
340 |
|
---|
341 | default:
|
---|
342 | assert(0 && "unknown wide character encoding");
|
---|
343 | }
|
---|
344 |
|
---|
345 | // copy to a wchar_t array
|
---|
346 | wchar_t* dst = new wchar_t[size];
|
---|
347 | ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
|
---|
348 | return dst;
|
---|
349 | }
|
---|
350 |
|
---|
351 | CString
|
---|
352 | CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
|
---|
353 | {
|
---|
354 | // convert from platform's wide character encoding.
|
---|
355 | // note -- this must include a wide nul character (independent of
|
---|
356 | // the CString's nul character).
|
---|
357 | switch (ARCH->getWideCharEncoding()) {
|
---|
358 | case IArchString::kUCS2:
|
---|
359 | return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
---|
360 |
|
---|
361 | case IArchString::kUCS4:
|
---|
362 | return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
---|
363 |
|
---|
364 | case IArchString::kUTF16:
|
---|
365 | return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
---|
366 |
|
---|
367 | case IArchString::kUTF32:
|
---|
368 | return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
---|
369 |
|
---|
370 | default:
|
---|
371 | assert(0 && "unknown wide character encoding");
|
---|
372 | return CString();
|
---|
373 | }
|
---|
374 | }
|
---|
375 |
|
---|
376 | CString
|
---|
377 | CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
---|
378 | {
|
---|
379 | // make some space
|
---|
380 | CString dst;
|
---|
381 | dst.reserve(n);
|
---|
382 |
|
---|
383 | // check if first character is 0xfffe or 0xfeff
|
---|
384 | bool byteSwapped = false;
|
---|
385 | if (n >= 1) {
|
---|
386 | switch (decode16(data, false)) {
|
---|
387 | case 0x0000feff:
|
---|
388 | data += 2;
|
---|
389 | --n;
|
---|
390 | break;
|
---|
391 |
|
---|
392 | case 0x0000fffe:
|
---|
393 | byteSwapped = true;
|
---|
394 | data += 2;
|
---|
395 | --n;
|
---|
396 | break;
|
---|
397 |
|
---|
398 | default:
|
---|
399 | break;
|
---|
400 | }
|
---|
401 | }
|
---|
402 |
|
---|
403 | // convert each character
|
---|
404 | for (; n > 0; data += 2, --n) {
|
---|
405 | UInt32 c = decode16(data, byteSwapped);
|
---|
406 | toUTF8(dst, c, errors);
|
---|
407 | }
|
---|
408 |
|
---|
409 | return dst;
|
---|
410 | }
|
---|
411 |
|
---|
412 | CString
|
---|
413 | CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
---|
414 | {
|
---|
415 | // make some space
|
---|
416 | CString dst;
|
---|
417 | dst.reserve(n);
|
---|
418 |
|
---|
419 | // check if first character is 0xfffe or 0xfeff
|
---|
420 | bool byteSwapped = false;
|
---|
421 | if (n >= 1) {
|
---|
422 | switch (decode32(data, false)) {
|
---|
423 | case 0x0000feff:
|
---|
424 | data += 4;
|
---|
425 | --n;
|
---|
426 | break;
|
---|
427 |
|
---|
428 | case 0x0000fffe:
|
---|
429 | byteSwapped = true;
|
---|
430 | data += 4;
|
---|
431 | --n;
|
---|
432 | break;
|
---|
433 |
|
---|
434 | default:
|
---|
435 | break;
|
---|
436 | }
|
---|
437 | }
|
---|
438 |
|
---|
439 | // convert each character
|
---|
440 | for (; n > 0; data += 4, --n) {
|
---|
441 | UInt32 c = decode32(data, byteSwapped);
|
---|
442 | toUTF8(dst, c, errors);
|
---|
443 | }
|
---|
444 |
|
---|
445 | return dst;
|
---|
446 | }
|
---|
447 |
|
---|
448 | CString
|
---|
449 | CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
---|
450 | {
|
---|
451 | // make some space
|
---|
452 | CString dst;
|
---|
453 | dst.reserve(n);
|
---|
454 |
|
---|
455 | // check if first character is 0xfffe or 0xfeff
|
---|
456 | bool byteSwapped = false;
|
---|
457 | if (n >= 1) {
|
---|
458 | switch (decode16(data, false)) {
|
---|
459 | case 0x0000feff:
|
---|
460 | data += 2;
|
---|
461 | --n;
|
---|
462 | break;
|
---|
463 |
|
---|
464 | case 0x0000fffe:
|
---|
465 | byteSwapped = true;
|
---|
466 | data += 2;
|
---|
467 | --n;
|
---|
468 | break;
|
---|
469 |
|
---|
470 | default:
|
---|
471 | break;
|
---|
472 | }
|
---|
473 | }
|
---|
474 |
|
---|
475 | // convert each character
|
---|
476 | for (; n > 0; data += 2, --n) {
|
---|
477 | UInt32 c = decode16(data, byteSwapped);
|
---|
478 | if (c < 0x0000d800 || c > 0x0000dfff) {
|
---|
479 | toUTF8(dst, c, errors);
|
---|
480 | }
|
---|
481 | else if (n == 1) {
|
---|
482 | // error -- missing second word
|
---|
483 | setError(errors);
|
---|
484 | toUTF8(dst, s_replacement, NULL);
|
---|
485 | }
|
---|
486 | else if (c >= 0x0000d800 && c <= 0x0000dbff) {
|
---|
487 | UInt32 c2 = decode16(data, byteSwapped);
|
---|
488 | data += 2;
|
---|
489 | --n;
|
---|
490 | if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
|
---|
491 | // error -- [d800,dbff] not followed by [dc00,dfff]
|
---|
492 | setError(errors);
|
---|
493 | toUTF8(dst, s_replacement, NULL);
|
---|
494 | }
|
---|
495 | else {
|
---|
496 | c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
|
---|
497 | toUTF8(dst, c, errors);
|
---|
498 | }
|
---|
499 | }
|
---|
500 | else {
|
---|
501 | // error -- [dc00,dfff] without leading [d800,dbff]
|
---|
502 | setError(errors);
|
---|
503 | toUTF8(dst, s_replacement, NULL);
|
---|
504 | }
|
---|
505 | }
|
---|
506 |
|
---|
507 | return dst;
|
---|
508 | }
|
---|
509 |
|
---|
510 | CString
|
---|
511 | CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
---|
512 | {
|
---|
513 | // make some space
|
---|
514 | CString dst;
|
---|
515 | dst.reserve(n);
|
---|
516 |
|
---|
517 | // check if first character is 0xfffe or 0xfeff
|
---|
518 | bool byteSwapped = false;
|
---|
519 | if (n >= 1) {
|
---|
520 | switch (decode32(data, false)) {
|
---|
521 | case 0x0000feff:
|
---|
522 | data += 4;
|
---|
523 | --n;
|
---|
524 | break;
|
---|
525 |
|
---|
526 | case 0x0000fffe:
|
---|
527 | byteSwapped = true;
|
---|
528 | data += 4;
|
---|
529 | --n;
|
---|
530 | break;
|
---|
531 |
|
---|
532 | default:
|
---|
533 | break;
|
---|
534 | }
|
---|
535 | }
|
---|
536 |
|
---|
537 | // convert each character
|
---|
538 | for (; n > 0; data += 4, --n) {
|
---|
539 | UInt32 c = decode32(data, byteSwapped);
|
---|
540 | if (c >= 0x00110000) {
|
---|
541 | setError(errors);
|
---|
542 | c = s_replacement;
|
---|
543 | }
|
---|
544 | toUTF8(dst, c, errors);
|
---|
545 | }
|
---|
546 |
|
---|
547 | return dst;
|
---|
548 | }
|
---|
549 |
|
---|
550 | UInt32
|
---|
551 | CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
|
---|
552 | {
|
---|
553 | assert(data != NULL);
|
---|
554 | assert(n != 0);
|
---|
555 |
|
---|
556 | // compute character encoding length, checking for overlong
|
---|
557 | // sequences (i.e. characters that don't use the shortest
|
---|
558 | // possible encoding).
|
---|
559 | UInt32 size;
|
---|
560 | if (data[0] < 0x80) {
|
---|
561 | // 0xxxxxxx
|
---|
562 | size = 1;
|
---|
563 | }
|
---|
564 | else if (data[0] < 0xc0) {
|
---|
565 | // 10xxxxxx -- in the middle of a multibyte character. counts
|
---|
566 | // as one invalid character.
|
---|
567 | --n;
|
---|
568 | ++data;
|
---|
569 | return s_invalid;
|
---|
570 | }
|
---|
571 | else if (data[0] < 0xe0) {
|
---|
572 | // 110xxxxx
|
---|
573 | size = 2;
|
---|
574 | }
|
---|
575 | else if (data[0] < 0xf0) {
|
---|
576 | // 1110xxxx
|
---|
577 | size = 3;
|
---|
578 | }
|
---|
579 | else if (data[0] < 0xf8) {
|
---|
580 | // 11110xxx
|
---|
581 | size = 4;
|
---|
582 | }
|
---|
583 | else if (data[0] < 0xfc) {
|
---|
584 | // 111110xx
|
---|
585 | size = 5;
|
---|
586 | }
|
---|
587 | else if (data[0] < 0xfe) {
|
---|
588 | // 1111110x
|
---|
589 | size = 6;
|
---|
590 | }
|
---|
591 | else {
|
---|
592 | // invalid sequence. dunno how many bytes to skip so skip one.
|
---|
593 | --n;
|
---|
594 | ++data;
|
---|
595 | return s_invalid;
|
---|
596 | }
|
---|
597 |
|
---|
598 | // make sure we have enough data
|
---|
599 | if (size > n) {
|
---|
600 | data += n;
|
---|
601 | n = 0;
|
---|
602 | return s_invalid;
|
---|
603 | }
|
---|
604 |
|
---|
605 | // extract character
|
---|
606 | UInt32 c;
|
---|
607 | switch (size) {
|
---|
608 | case 1:
|
---|
609 | c = static_cast<UInt32>(data[0]);
|
---|
610 | break;
|
---|
611 |
|
---|
612 | case 2:
|
---|
613 | c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) |
|
---|
614 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
---|
615 | break;
|
---|
616 |
|
---|
617 | case 3:
|
---|
618 | c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) |
|
---|
619 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
---|
620 | ((static_cast<UInt32>(data[2]) & 0x3f) );
|
---|
621 | break;
|
---|
622 |
|
---|
623 | case 4:
|
---|
624 | c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) |
|
---|
625 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
|
---|
626 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
---|
627 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
---|
628 | break;
|
---|
629 |
|
---|
630 | case 5:
|
---|
631 | c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) |
|
---|
632 | ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
|
---|
633 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
|
---|
634 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
---|
635 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
---|
636 | break;
|
---|
637 |
|
---|
638 | case 6:
|
---|
639 | c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) |
|
---|
640 | ((static_cast<UInt32>(data[1]) & 0x3f) << 24) |
|
---|
641 | ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
|
---|
642 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
|
---|
643 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
---|
644 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
---|
645 | break;
|
---|
646 |
|
---|
647 | default:
|
---|
648 | assert(0 && "invalid size");
|
---|
649 | return s_invalid;
|
---|
650 | }
|
---|
651 |
|
---|
652 | // check that all bytes after the first have the pattern 10xxxxxx.
|
---|
653 | // truncated sequences are treated as a single malformed character.
|
---|
654 | bool truncated = false;
|
---|
655 | switch (size) {
|
---|
656 | case 6:
|
---|
657 | if ((data[5] & 0xc0) != 0x80) {
|
---|
658 | truncated = true;
|
---|
659 | size = 5;
|
---|
660 | }
|
---|
661 | // fall through
|
---|
662 |
|
---|
663 | case 5:
|
---|
664 | if ((data[4] & 0xc0) != 0x80) {
|
---|
665 | truncated = true;
|
---|
666 | size = 4;
|
---|
667 | }
|
---|
668 | // fall through
|
---|
669 |
|
---|
670 | case 4:
|
---|
671 | if ((data[3] & 0xc0) != 0x80) {
|
---|
672 | truncated = true;
|
---|
673 | size = 3;
|
---|
674 | }
|
---|
675 | // fall through
|
---|
676 |
|
---|
677 | case 3:
|
---|
678 | if ((data[2] & 0xc0) != 0x80) {
|
---|
679 | truncated = true;
|
---|
680 | size = 2;
|
---|
681 | }
|
---|
682 | // fall through
|
---|
683 |
|
---|
684 | case 2:
|
---|
685 | if ((data[1] & 0xc0) != 0x80) {
|
---|
686 | truncated = true;
|
---|
687 | size = 1;
|
---|
688 | }
|
---|
689 | }
|
---|
690 |
|
---|
691 | // update parameters
|
---|
692 | data += size;
|
---|
693 | n -= size;
|
---|
694 |
|
---|
695 | // invalid if sequence was truncated
|
---|
696 | if (truncated) {
|
---|
697 | return s_invalid;
|
---|
698 | }
|
---|
699 |
|
---|
700 | // check for characters that didn't use the smallest possible encoding
|
---|
701 | static UInt32 s_minChar[] = {
|
---|
702 | 0,
|
---|
703 | 0x00000000,
|
---|
704 | 0x00000080,
|
---|
705 | 0x00000800,
|
---|
706 | 0x00010000,
|
---|
707 | 0x00200000,
|
---|
708 | 0x04000000
|
---|
709 | };
|
---|
710 | if (c < s_minChar[size]) {
|
---|
711 | return s_invalid;
|
---|
712 | }
|
---|
713 |
|
---|
714 | // check for characters not in ISO-10646
|
---|
715 | if (c >= 0x0000d800 && c <= 0x0000dfff) {
|
---|
716 | return s_invalid;
|
---|
717 | }
|
---|
718 | if (c >= 0x0000fffe && c <= 0x0000ffff) {
|
---|
719 | return s_invalid;
|
---|
720 | }
|
---|
721 |
|
---|
722 | return c;
|
---|
723 | }
|
---|
724 |
|
---|
725 | void
|
---|
726 | CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
|
---|
727 | {
|
---|
728 | UInt8 data[6];
|
---|
729 |
|
---|
730 | // handle characters outside the valid range
|
---|
731 | if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
|
---|
732 | setError(errors);
|
---|
733 | c = s_replacement;
|
---|
734 | }
|
---|
735 |
|
---|
736 | // convert to UTF-8
|
---|
737 | if (c < 0x00000080) {
|
---|
738 | data[0] = static_cast<UInt8>(c);
|
---|
739 | dst.append(reinterpret_cast<char*>(data), 1);
|
---|
740 | }
|
---|
741 | else if (c < 0x00000800) {
|
---|
742 | data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
|
---|
743 | data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
---|
744 | dst.append(reinterpret_cast<char*>(data), 2);
|
---|
745 | }
|
---|
746 | else if (c < 0x00010000) {
|
---|
747 | data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
|
---|
748 | data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
---|
749 | data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
---|
750 | dst.append(reinterpret_cast<char*>(data), 3);
|
---|
751 | }
|
---|
752 | else if (c < 0x00200000) {
|
---|
753 | data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
|
---|
754 | data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
---|
755 | data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
---|
756 | data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
---|
757 | dst.append(reinterpret_cast<char*>(data), 4);
|
---|
758 | }
|
---|
759 | else if (c < 0x04000000) {
|
---|
760 | data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
|
---|
761 | data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
|
---|
762 | data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
---|
763 | data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
---|
764 | data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
---|
765 | dst.append(reinterpret_cast<char*>(data), 5);
|
---|
766 | }
|
---|
767 | else if (c < 0x80000000) {
|
---|
768 | data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
|
---|
769 | data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
|
---|
770 | data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
|
---|
771 | data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
---|
772 | data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
---|
773 | data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
---|
774 | dst.append(reinterpret_cast<char*>(data), 6);
|
---|
775 | }
|
---|
776 | else {
|
---|
777 | assert(0 && "character out of range");
|
---|
778 | }
|
---|
779 | }
|
---|