| 1 | /*
|
|---|
| 2 | * synergy -- mouse and keyboard sharing utility
|
|---|
| 3 | * Copyright (C) 2002 Chris Schoeneman
|
|---|
| 4 | *
|
|---|
| 5 | * This package is free software; you can redistribute it and/or
|
|---|
| 6 | * modify it under the terms of the GNU General Public License
|
|---|
| 7 | * found in the file COPYING that should have accompanied this file.
|
|---|
| 8 | *
|
|---|
| 9 | * This package is distributed in the hope that it will be useful,
|
|---|
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 12 | * GNU General Public License for more details.
|
|---|
| 13 | */
|
|---|
| 14 |
|
|---|
| 15 | #include "CUnicode.h"
|
|---|
| 16 | #include "CArch.h"
|
|---|
| 17 | #include <string.h>
|
|---|
| 18 |
|
|---|
| 19 | //
|
|---|
| 20 | // local utility functions
|
|---|
| 21 | //
|
|---|
| 22 |
|
|---|
| 23 | inline
|
|---|
| 24 | static
|
|---|
| 25 | UInt16
|
|---|
| 26 | decode16(const UInt8* n, bool byteSwapped)
|
|---|
| 27 | {
|
|---|
| 28 | union x16 {
|
|---|
| 29 | UInt8 n8[2];
|
|---|
| 30 | UInt16 n16;
|
|---|
| 31 | } c;
|
|---|
| 32 | if (byteSwapped) {
|
|---|
| 33 | c.n8[0] = n[1];
|
|---|
| 34 | c.n8[1] = n[0];
|
|---|
| 35 | }
|
|---|
| 36 | else {
|
|---|
| 37 | c.n8[0] = n[0];
|
|---|
| 38 | c.n8[1] = n[1];
|
|---|
| 39 | }
|
|---|
| 40 | return c.n16;
|
|---|
| 41 | }
|
|---|
| 42 |
|
|---|
| 43 | inline
|
|---|
| 44 | static
|
|---|
| 45 | UInt32
|
|---|
| 46 | decode32(const UInt8* n, bool byteSwapped)
|
|---|
| 47 | {
|
|---|
| 48 | union x32 {
|
|---|
| 49 | UInt8 n8[4];
|
|---|
| 50 | UInt32 n32;
|
|---|
| 51 | } c;
|
|---|
| 52 | if (byteSwapped) {
|
|---|
| 53 | c.n8[0] = n[3];
|
|---|
| 54 | c.n8[1] = n[2];
|
|---|
| 55 | c.n8[2] = n[1];
|
|---|
| 56 | c.n8[3] = n[0];
|
|---|
| 57 | }
|
|---|
| 58 | else {
|
|---|
| 59 | c.n8[0] = n[0];
|
|---|
| 60 | c.n8[1] = n[1];
|
|---|
| 61 | c.n8[2] = n[2];
|
|---|
| 62 | c.n8[3] = n[3];
|
|---|
| 63 | }
|
|---|
| 64 | return c.n32;
|
|---|
| 65 | }
|
|---|
| 66 |
|
|---|
// Clear the caller's error flag.  A NULL \c errors means the caller
// is not interested in error reporting, so nothing is written.
inline
static
void
resetError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = false;
}
|
|---|
| 76 |
|
|---|
// Raise the caller's error flag.  A NULL \c errors means the caller
// is not interested in error reporting, so nothing is written.
inline
static
void
setError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = true;
}
|
|---|
| 86 |
|
|---|
| 87 |
|
|---|
| 88 | //
|
|---|
| 89 | // CUnicode
|
|---|
| 90 | //
|
|---|
| 91 |
|
|---|
| 92 | UInt32 CUnicode::s_invalid = 0x0000ffff;
|
|---|
| 93 | UInt32 CUnicode::s_replacement = 0x0000fffd;
|
|---|
| 94 |
|
|---|
| 95 | bool
|
|---|
| 96 | CUnicode::isUTF8(const CString& src)
|
|---|
| 97 | {
|
|---|
| 98 | // convert and test each character
|
|---|
| 99 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
|---|
| 100 | for (UInt32 n = src.size(); n > 0; ) {
|
|---|
| 101 | if (fromUTF8(data, n) == s_invalid) {
|
|---|
| 102 | return false;
|
|---|
| 103 | }
|
|---|
| 104 | }
|
|---|
| 105 | return true;
|
|---|
| 106 | }
|
|---|
| 107 |
|
|---|
| 108 | CString
|
|---|
| 109 | CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
|
|---|
| 110 | {
|
|---|
| 111 | // default to success
|
|---|
| 112 | resetError(errors);
|
|---|
| 113 |
|
|---|
| 114 | // get size of input string and reserve some space in output
|
|---|
| 115 | UInt32 n = src.size();
|
|---|
| 116 | CString dst;
|
|---|
| 117 | dst.reserve(2 * n);
|
|---|
| 118 |
|
|---|
| 119 | // convert each character
|
|---|
| 120 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
|---|
| 121 | while (n > 0) {
|
|---|
| 122 | UInt32 c = fromUTF8(data, n);
|
|---|
| 123 | if (c == s_invalid) {
|
|---|
| 124 | c = s_replacement;
|
|---|
| 125 | }
|
|---|
| 126 | else if (c >= 0x00010000) {
|
|---|
| 127 | setError(errors);
|
|---|
| 128 | c = s_replacement;
|
|---|
| 129 | }
|
|---|
| 130 | UInt16 ucs2 = static_cast<UInt16>(c);
|
|---|
| 131 | dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
|---|
| 132 | }
|
|---|
| 133 |
|
|---|
| 134 | return dst;
|
|---|
| 135 | }
|
|---|
| 136 |
|
|---|
| 137 | CString
|
|---|
| 138 | CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
|
|---|
| 139 | {
|
|---|
| 140 | // default to success
|
|---|
| 141 | resetError(errors);
|
|---|
| 142 |
|
|---|
| 143 | // get size of input string and reserve some space in output
|
|---|
| 144 | UInt32 n = src.size();
|
|---|
| 145 | CString dst;
|
|---|
| 146 | dst.reserve(4 * n);
|
|---|
| 147 |
|
|---|
| 148 | // convert each character
|
|---|
| 149 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
|---|
| 150 | while (n > 0) {
|
|---|
| 151 | UInt32 c = fromUTF8(data, n);
|
|---|
| 152 | if (c == s_invalid) {
|
|---|
| 153 | c = s_replacement;
|
|---|
| 154 | }
|
|---|
| 155 | dst.append(reinterpret_cast<const char*>(&c), 4);
|
|---|
| 156 | }
|
|---|
| 157 |
|
|---|
| 158 | return dst;
|
|---|
| 159 | }
|
|---|
| 160 |
|
|---|
| 161 | CString
|
|---|
| 162 | CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
|
|---|
| 163 | {
|
|---|
| 164 | // default to success
|
|---|
| 165 | resetError(errors);
|
|---|
| 166 |
|
|---|
| 167 | // get size of input string and reserve some space in output
|
|---|
| 168 | UInt32 n = src.size();
|
|---|
| 169 | CString dst;
|
|---|
| 170 | dst.reserve(2 * n);
|
|---|
| 171 |
|
|---|
| 172 | // convert each character
|
|---|
| 173 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
|---|
| 174 | while (n > 0) {
|
|---|
| 175 | UInt32 c = fromUTF8(data, n);
|
|---|
| 176 | if (c == s_invalid) {
|
|---|
| 177 | c = s_replacement;
|
|---|
| 178 | }
|
|---|
| 179 | else if (c >= 0x00110000) {
|
|---|
| 180 | setError(errors);
|
|---|
| 181 | c = s_replacement;
|
|---|
| 182 | }
|
|---|
| 183 | if (c < 0x00010000) {
|
|---|
| 184 | UInt16 ucs2 = static_cast<UInt16>(c);
|
|---|
| 185 | dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
|---|
| 186 | }
|
|---|
| 187 | else {
|
|---|
| 188 | c -= 0x00010000;
|
|---|
| 189 | UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800);
|
|---|
| 190 | UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00);
|
|---|
| 191 | dst.append(reinterpret_cast<const char*>(&utf16h), 2);
|
|---|
| 192 | dst.append(reinterpret_cast<const char*>(&utf16l), 2);
|
|---|
| 193 | }
|
|---|
| 194 | }
|
|---|
| 195 |
|
|---|
| 196 | return dst;
|
|---|
| 197 | }
|
|---|
| 198 |
|
|---|
| 199 | CString
|
|---|
| 200 | CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
|
|---|
| 201 | {
|
|---|
| 202 | // default to success
|
|---|
| 203 | resetError(errors);
|
|---|
| 204 |
|
|---|
| 205 | // get size of input string and reserve some space in output
|
|---|
| 206 | UInt32 n = src.size();
|
|---|
| 207 | CString dst;
|
|---|
| 208 | dst.reserve(4 * n);
|
|---|
| 209 |
|
|---|
| 210 | // convert each character
|
|---|
| 211 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
|---|
| 212 | while (n > 0) {
|
|---|
| 213 | UInt32 c = fromUTF8(data, n);
|
|---|
| 214 | if (c == s_invalid) {
|
|---|
| 215 | c = s_replacement;
|
|---|
| 216 | }
|
|---|
| 217 | else if (c >= 0x00110000) {
|
|---|
| 218 | setError(errors);
|
|---|
| 219 | c = s_replacement;
|
|---|
| 220 | }
|
|---|
| 221 | dst.append(reinterpret_cast<const char*>(&c), 4);
|
|---|
| 222 | }
|
|---|
| 223 |
|
|---|
| 224 | return dst;
|
|---|
| 225 | }
|
|---|
| 226 |
|
|---|
| 227 | CString
|
|---|
| 228 | CUnicode::UTF8ToText(const CString& src, bool* errors)
|
|---|
| 229 | {
|
|---|
| 230 | // default to success
|
|---|
| 231 | resetError(errors);
|
|---|
| 232 |
|
|---|
| 233 | // convert to wide char
|
|---|
| 234 | UInt32 size;
|
|---|
| 235 | wchar_t* tmp = UTF8ToWideChar(src, size, errors);
|
|---|
| 236 |
|
|---|
| 237 | // convert string to multibyte
|
|---|
| 238 | int len = ARCH->convStringWCToMB(NULL, tmp, size, errors);
|
|---|
| 239 | char* mbs = new char[len + 1];
|
|---|
| 240 | ARCH->convStringWCToMB(mbs, tmp, size, errors);
|
|---|
| 241 | CString text(mbs, len);
|
|---|
| 242 |
|
|---|
| 243 | // clean up
|
|---|
| 244 | delete[] mbs;
|
|---|
| 245 | delete[] tmp;
|
|---|
| 246 |
|
|---|
| 247 | return text;
|
|---|
| 248 | }
|
|---|
| 249 |
|
|---|
| 250 | CString
|
|---|
| 251 | CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
|
|---|
| 252 | {
|
|---|
| 253 | // default to success
|
|---|
| 254 | resetError(errors);
|
|---|
| 255 |
|
|---|
| 256 | // convert
|
|---|
| 257 | UInt32 n = src.size() >> 1;
|
|---|
| 258 | return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
|---|
| 259 | }
|
|---|
| 260 |
|
|---|
| 261 | CString
|
|---|
| 262 | CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
|
|---|
| 263 | {
|
|---|
| 264 | // default to success
|
|---|
| 265 | resetError(errors);
|
|---|
| 266 |
|
|---|
| 267 | // convert
|
|---|
| 268 | UInt32 n = src.size() >> 2;
|
|---|
| 269 | return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
|---|
| 270 | }
|
|---|
| 271 |
|
|---|
| 272 | CString
|
|---|
| 273 | CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
|
|---|
| 274 | {
|
|---|
| 275 | // default to success
|
|---|
| 276 | resetError(errors);
|
|---|
| 277 |
|
|---|
| 278 | // convert
|
|---|
| 279 | UInt32 n = src.size() >> 1;
|
|---|
| 280 | return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
|---|
| 281 | }
|
|---|
| 282 |
|
|---|
| 283 | CString
|
|---|
| 284 | CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
|
|---|
| 285 | {
|
|---|
| 286 | // default to success
|
|---|
| 287 | resetError(errors);
|
|---|
| 288 |
|
|---|
| 289 | // convert
|
|---|
| 290 | UInt32 n = src.size() >> 2;
|
|---|
| 291 | return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
|
|---|
| 292 | }
|
|---|
| 293 |
|
|---|
| 294 | CString
|
|---|
| 295 | CUnicode::textToUTF8(const CString& src, bool* errors)
|
|---|
| 296 | {
|
|---|
| 297 | // default to success
|
|---|
| 298 | resetError(errors);
|
|---|
| 299 |
|
|---|
| 300 | // convert string to wide characters
|
|---|
| 301 | UInt32 n = src.size();
|
|---|
| 302 | int len = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors);
|
|---|
| 303 | wchar_t* wcs = new wchar_t[len + 1];
|
|---|
| 304 | ARCH->convStringMBToWC(wcs, src.c_str(), n, errors);
|
|---|
| 305 |
|
|---|
| 306 | // convert to UTF8
|
|---|
| 307 | CString utf8 = wideCharToUTF8(wcs, len, errors);
|
|---|
| 308 |
|
|---|
| 309 | // clean up
|
|---|
| 310 | delete[] wcs;
|
|---|
| 311 |
|
|---|
| 312 | return utf8;
|
|---|
| 313 | }
|
|---|
| 314 |
|
|---|
| 315 | wchar_t*
|
|---|
| 316 | CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors)
|
|---|
| 317 | {
|
|---|
| 318 | // convert to platform's wide character encoding
|
|---|
| 319 | CString tmp;
|
|---|
| 320 | switch (ARCH->getWideCharEncoding()) {
|
|---|
| 321 | case IArchString::kUCS2:
|
|---|
| 322 | tmp = UTF8ToUCS2(src, errors);
|
|---|
| 323 | size = tmp.size() >> 1;
|
|---|
| 324 | break;
|
|---|
| 325 |
|
|---|
| 326 | case IArchString::kUCS4:
|
|---|
| 327 | tmp = UTF8ToUCS4(src, errors);
|
|---|
| 328 | size = tmp.size() >> 2;
|
|---|
| 329 | break;
|
|---|
| 330 |
|
|---|
| 331 | case IArchString::kUTF16:
|
|---|
| 332 | tmp = UTF8ToUTF16(src, errors);
|
|---|
| 333 | size = tmp.size() >> 1;
|
|---|
| 334 | break;
|
|---|
| 335 |
|
|---|
| 336 | case IArchString::kUTF32:
|
|---|
| 337 | tmp = UTF8ToUTF32(src, errors);
|
|---|
| 338 | size = tmp.size() >> 2;
|
|---|
| 339 | break;
|
|---|
| 340 |
|
|---|
| 341 | default:
|
|---|
| 342 | assert(0 && "unknown wide character encoding");
|
|---|
| 343 | }
|
|---|
| 344 |
|
|---|
| 345 | // copy to a wchar_t array
|
|---|
| 346 | wchar_t* dst = new wchar_t[size];
|
|---|
| 347 | ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size);
|
|---|
| 348 | return dst;
|
|---|
| 349 | }
|
|---|
| 350 |
|
|---|
| 351 | CString
|
|---|
| 352 | CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors)
|
|---|
| 353 | {
|
|---|
| 354 | // convert from platform's wide character encoding.
|
|---|
| 355 | // note -- this must include a wide nul character (independent of
|
|---|
| 356 | // the CString's nul character).
|
|---|
| 357 | switch (ARCH->getWideCharEncoding()) {
|
|---|
| 358 | case IArchString::kUCS2:
|
|---|
| 359 | return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
|---|
| 360 |
|
|---|
| 361 | case IArchString::kUCS4:
|
|---|
| 362 | return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
|---|
| 363 |
|
|---|
| 364 | case IArchString::kUTF16:
|
|---|
| 365 | return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
|---|
| 366 |
|
|---|
| 367 | case IArchString::kUTF32:
|
|---|
| 368 | return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors);
|
|---|
| 369 |
|
|---|
| 370 | default:
|
|---|
| 371 | assert(0 && "unknown wide character encoding");
|
|---|
| 372 | return CString();
|
|---|
| 373 | }
|
|---|
| 374 | }
|
|---|
| 375 |
|
|---|
| 376 | CString
|
|---|
| 377 | CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
|---|
| 378 | {
|
|---|
| 379 | // make some space
|
|---|
| 380 | CString dst;
|
|---|
| 381 | dst.reserve(n);
|
|---|
| 382 |
|
|---|
| 383 | // check if first character is 0xfffe or 0xfeff
|
|---|
| 384 | bool byteSwapped = false;
|
|---|
| 385 | if (n >= 1) {
|
|---|
| 386 | switch (decode16(data, false)) {
|
|---|
| 387 | case 0x0000feff:
|
|---|
| 388 | data += 2;
|
|---|
| 389 | --n;
|
|---|
| 390 | break;
|
|---|
| 391 |
|
|---|
| 392 | case 0x0000fffe:
|
|---|
| 393 | byteSwapped = true;
|
|---|
| 394 | data += 2;
|
|---|
| 395 | --n;
|
|---|
| 396 | break;
|
|---|
| 397 |
|
|---|
| 398 | default:
|
|---|
| 399 | break;
|
|---|
| 400 | }
|
|---|
| 401 | }
|
|---|
| 402 |
|
|---|
| 403 | // convert each character
|
|---|
| 404 | for (; n > 0; data += 2, --n) {
|
|---|
| 405 | UInt32 c = decode16(data, byteSwapped);
|
|---|
| 406 | toUTF8(dst, c, errors);
|
|---|
| 407 | }
|
|---|
| 408 |
|
|---|
| 409 | return dst;
|
|---|
| 410 | }
|
|---|
| 411 |
|
|---|
| 412 | CString
|
|---|
| 413 | CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
|---|
| 414 | {
|
|---|
| 415 | // make some space
|
|---|
| 416 | CString dst;
|
|---|
| 417 | dst.reserve(n);
|
|---|
| 418 |
|
|---|
| 419 | // check if first character is 0xfffe or 0xfeff
|
|---|
| 420 | bool byteSwapped = false;
|
|---|
| 421 | if (n >= 1) {
|
|---|
| 422 | switch (decode32(data, false)) {
|
|---|
| 423 | case 0x0000feff:
|
|---|
| 424 | data += 4;
|
|---|
| 425 | --n;
|
|---|
| 426 | break;
|
|---|
| 427 |
|
|---|
| 428 | case 0x0000fffe:
|
|---|
| 429 | byteSwapped = true;
|
|---|
| 430 | data += 4;
|
|---|
| 431 | --n;
|
|---|
| 432 | break;
|
|---|
| 433 |
|
|---|
| 434 | default:
|
|---|
| 435 | break;
|
|---|
| 436 | }
|
|---|
| 437 | }
|
|---|
| 438 |
|
|---|
| 439 | // convert each character
|
|---|
| 440 | for (; n > 0; data += 4, --n) {
|
|---|
| 441 | UInt32 c = decode32(data, byteSwapped);
|
|---|
| 442 | toUTF8(dst, c, errors);
|
|---|
| 443 | }
|
|---|
| 444 |
|
|---|
| 445 | return dst;
|
|---|
| 446 | }
|
|---|
| 447 |
|
|---|
| 448 | CString
|
|---|
| 449 | CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
|---|
| 450 | {
|
|---|
| 451 | // make some space
|
|---|
| 452 | CString dst;
|
|---|
| 453 | dst.reserve(n);
|
|---|
| 454 |
|
|---|
| 455 | // check if first character is 0xfffe or 0xfeff
|
|---|
| 456 | bool byteSwapped = false;
|
|---|
| 457 | if (n >= 1) {
|
|---|
| 458 | switch (decode16(data, false)) {
|
|---|
| 459 | case 0x0000feff:
|
|---|
| 460 | data += 2;
|
|---|
| 461 | --n;
|
|---|
| 462 | break;
|
|---|
| 463 |
|
|---|
| 464 | case 0x0000fffe:
|
|---|
| 465 | byteSwapped = true;
|
|---|
| 466 | data += 2;
|
|---|
| 467 | --n;
|
|---|
| 468 | break;
|
|---|
| 469 |
|
|---|
| 470 | default:
|
|---|
| 471 | break;
|
|---|
| 472 | }
|
|---|
| 473 | }
|
|---|
| 474 |
|
|---|
| 475 | // convert each character
|
|---|
| 476 | for (; n > 0; data += 2, --n) {
|
|---|
| 477 | UInt32 c = decode16(data, byteSwapped);
|
|---|
| 478 | if (c < 0x0000d800 || c > 0x0000dfff) {
|
|---|
| 479 | toUTF8(dst, c, errors);
|
|---|
| 480 | }
|
|---|
| 481 | else if (n == 1) {
|
|---|
| 482 | // error -- missing second word
|
|---|
| 483 | setError(errors);
|
|---|
| 484 | toUTF8(dst, s_replacement, NULL);
|
|---|
| 485 | }
|
|---|
| 486 | else if (c >= 0x0000d800 && c <= 0x0000dbff) {
|
|---|
| 487 | UInt32 c2 = decode16(data, byteSwapped);
|
|---|
| 488 | data += 2;
|
|---|
| 489 | --n;
|
|---|
| 490 | if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
|
|---|
| 491 | // error -- [d800,dbff] not followed by [dc00,dfff]
|
|---|
| 492 | setError(errors);
|
|---|
| 493 | toUTF8(dst, s_replacement, NULL);
|
|---|
| 494 | }
|
|---|
| 495 | else {
|
|---|
| 496 | c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
|
|---|
| 497 | toUTF8(dst, c, errors);
|
|---|
| 498 | }
|
|---|
| 499 | }
|
|---|
| 500 | else {
|
|---|
| 501 | // error -- [dc00,dfff] without leading [d800,dbff]
|
|---|
| 502 | setError(errors);
|
|---|
| 503 | toUTF8(dst, s_replacement, NULL);
|
|---|
| 504 | }
|
|---|
| 505 | }
|
|---|
| 506 |
|
|---|
| 507 | return dst;
|
|---|
| 508 | }
|
|---|
| 509 |
|
|---|
| 510 | CString
|
|---|
| 511 | CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
|
|---|
| 512 | {
|
|---|
| 513 | // make some space
|
|---|
| 514 | CString dst;
|
|---|
| 515 | dst.reserve(n);
|
|---|
| 516 |
|
|---|
| 517 | // check if first character is 0xfffe or 0xfeff
|
|---|
| 518 | bool byteSwapped = false;
|
|---|
| 519 | if (n >= 1) {
|
|---|
| 520 | switch (decode32(data, false)) {
|
|---|
| 521 | case 0x0000feff:
|
|---|
| 522 | data += 4;
|
|---|
| 523 | --n;
|
|---|
| 524 | break;
|
|---|
| 525 |
|
|---|
| 526 | case 0x0000fffe:
|
|---|
| 527 | byteSwapped = true;
|
|---|
| 528 | data += 4;
|
|---|
| 529 | --n;
|
|---|
| 530 | break;
|
|---|
| 531 |
|
|---|
| 532 | default:
|
|---|
| 533 | break;
|
|---|
| 534 | }
|
|---|
| 535 | }
|
|---|
| 536 |
|
|---|
| 537 | // convert each character
|
|---|
| 538 | for (; n > 0; data += 4, --n) {
|
|---|
| 539 | UInt32 c = decode32(data, byteSwapped);
|
|---|
| 540 | if (c >= 0x00110000) {
|
|---|
| 541 | setError(errors);
|
|---|
| 542 | c = s_replacement;
|
|---|
| 543 | }
|
|---|
| 544 | toUTF8(dst, c, errors);
|
|---|
| 545 | }
|
|---|
| 546 |
|
|---|
| 547 | return dst;
|
|---|
| 548 | }
|
|---|
| 549 |
|
|---|
| 550 | UInt32
|
|---|
| 551 | CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
|
|---|
| 552 | {
|
|---|
| 553 | assert(data != NULL);
|
|---|
| 554 | assert(n != 0);
|
|---|
| 555 |
|
|---|
| 556 | // compute character encoding length, checking for overlong
|
|---|
| 557 | // sequences (i.e. characters that don't use the shortest
|
|---|
| 558 | // possible encoding).
|
|---|
| 559 | UInt32 size;
|
|---|
| 560 | if (data[0] < 0x80) {
|
|---|
| 561 | // 0xxxxxxx
|
|---|
| 562 | size = 1;
|
|---|
| 563 | }
|
|---|
| 564 | else if (data[0] < 0xc0) {
|
|---|
| 565 | // 10xxxxxx -- in the middle of a multibyte character. counts
|
|---|
| 566 | // as one invalid character.
|
|---|
| 567 | --n;
|
|---|
| 568 | ++data;
|
|---|
| 569 | return s_invalid;
|
|---|
| 570 | }
|
|---|
| 571 | else if (data[0] < 0xe0) {
|
|---|
| 572 | // 110xxxxx
|
|---|
| 573 | size = 2;
|
|---|
| 574 | }
|
|---|
| 575 | else if (data[0] < 0xf0) {
|
|---|
| 576 | // 1110xxxx
|
|---|
| 577 | size = 3;
|
|---|
| 578 | }
|
|---|
| 579 | else if (data[0] < 0xf8) {
|
|---|
| 580 | // 11110xxx
|
|---|
| 581 | size = 4;
|
|---|
| 582 | }
|
|---|
| 583 | else if (data[0] < 0xfc) {
|
|---|
| 584 | // 111110xx
|
|---|
| 585 | size = 5;
|
|---|
| 586 | }
|
|---|
| 587 | else if (data[0] < 0xfe) {
|
|---|
| 588 | // 1111110x
|
|---|
| 589 | size = 6;
|
|---|
| 590 | }
|
|---|
| 591 | else {
|
|---|
| 592 | // invalid sequence. dunno how many bytes to skip so skip one.
|
|---|
| 593 | --n;
|
|---|
| 594 | ++data;
|
|---|
| 595 | return s_invalid;
|
|---|
| 596 | }
|
|---|
| 597 |
|
|---|
| 598 | // make sure we have enough data
|
|---|
| 599 | if (size > n) {
|
|---|
| 600 | data += n;
|
|---|
| 601 | n = 0;
|
|---|
| 602 | return s_invalid;
|
|---|
| 603 | }
|
|---|
| 604 |
|
|---|
| 605 | // extract character
|
|---|
| 606 | UInt32 c;
|
|---|
| 607 | switch (size) {
|
|---|
| 608 | case 1:
|
|---|
| 609 | c = static_cast<UInt32>(data[0]);
|
|---|
| 610 | break;
|
|---|
| 611 |
|
|---|
| 612 | case 2:
|
|---|
| 613 | c = ((static_cast<UInt32>(data[0]) & 0x1f) << 6) |
|
|---|
| 614 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
|---|
| 615 | break;
|
|---|
| 616 |
|
|---|
| 617 | case 3:
|
|---|
| 618 | c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) |
|
|---|
| 619 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
|---|
| 620 | ((static_cast<UInt32>(data[2]) & 0x3f) );
|
|---|
| 621 | break;
|
|---|
| 622 |
|
|---|
| 623 | case 4:
|
|---|
| 624 | c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) |
|
|---|
| 625 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
|
|---|
| 626 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
|---|
| 627 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
|---|
| 628 | break;
|
|---|
| 629 |
|
|---|
| 630 | case 5:
|
|---|
| 631 | c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) |
|
|---|
| 632 | ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
|
|---|
| 633 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
|
|---|
| 634 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
|---|
| 635 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
|---|
| 636 | break;
|
|---|
| 637 |
|
|---|
| 638 | case 6:
|
|---|
| 639 | c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) |
|
|---|
| 640 | ((static_cast<UInt32>(data[1]) & 0x3f) << 24) |
|
|---|
| 641 | ((static_cast<UInt32>(data[1]) & 0x3f) << 18) |
|
|---|
| 642 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) |
|
|---|
| 643 | ((static_cast<UInt32>(data[1]) & 0x3f) << 6) |
|
|---|
| 644 | ((static_cast<UInt32>(data[1]) & 0x3f) );
|
|---|
| 645 | break;
|
|---|
| 646 |
|
|---|
| 647 | default:
|
|---|
| 648 | assert(0 && "invalid size");
|
|---|
| 649 | return s_invalid;
|
|---|
| 650 | }
|
|---|
| 651 |
|
|---|
| 652 | // check that all bytes after the first have the pattern 10xxxxxx.
|
|---|
| 653 | // truncated sequences are treated as a single malformed character.
|
|---|
| 654 | bool truncated = false;
|
|---|
| 655 | switch (size) {
|
|---|
| 656 | case 6:
|
|---|
| 657 | if ((data[5] & 0xc0) != 0x80) {
|
|---|
| 658 | truncated = true;
|
|---|
| 659 | size = 5;
|
|---|
| 660 | }
|
|---|
| 661 | // fall through
|
|---|
| 662 |
|
|---|
| 663 | case 5:
|
|---|
| 664 | if ((data[4] & 0xc0) != 0x80) {
|
|---|
| 665 | truncated = true;
|
|---|
| 666 | size = 4;
|
|---|
| 667 | }
|
|---|
| 668 | // fall through
|
|---|
| 669 |
|
|---|
| 670 | case 4:
|
|---|
| 671 | if ((data[3] & 0xc0) != 0x80) {
|
|---|
| 672 | truncated = true;
|
|---|
| 673 | size = 3;
|
|---|
| 674 | }
|
|---|
| 675 | // fall through
|
|---|
| 676 |
|
|---|
| 677 | case 3:
|
|---|
| 678 | if ((data[2] & 0xc0) != 0x80) {
|
|---|
| 679 | truncated = true;
|
|---|
| 680 | size = 2;
|
|---|
| 681 | }
|
|---|
| 682 | // fall through
|
|---|
| 683 |
|
|---|
| 684 | case 2:
|
|---|
| 685 | if ((data[1] & 0xc0) != 0x80) {
|
|---|
| 686 | truncated = true;
|
|---|
| 687 | size = 1;
|
|---|
| 688 | }
|
|---|
| 689 | }
|
|---|
| 690 |
|
|---|
| 691 | // update parameters
|
|---|
| 692 | data += size;
|
|---|
| 693 | n -= size;
|
|---|
| 694 |
|
|---|
| 695 | // invalid if sequence was truncated
|
|---|
| 696 | if (truncated) {
|
|---|
| 697 | return s_invalid;
|
|---|
| 698 | }
|
|---|
| 699 |
|
|---|
| 700 | // check for characters that didn't use the smallest possible encoding
|
|---|
| 701 | static UInt32 s_minChar[] = {
|
|---|
| 702 | 0,
|
|---|
| 703 | 0x00000000,
|
|---|
| 704 | 0x00000080,
|
|---|
| 705 | 0x00000800,
|
|---|
| 706 | 0x00010000,
|
|---|
| 707 | 0x00200000,
|
|---|
| 708 | 0x04000000
|
|---|
| 709 | };
|
|---|
| 710 | if (c < s_minChar[size]) {
|
|---|
| 711 | return s_invalid;
|
|---|
| 712 | }
|
|---|
| 713 |
|
|---|
| 714 | // check for characters not in ISO-10646
|
|---|
| 715 | if (c >= 0x0000d800 && c <= 0x0000dfff) {
|
|---|
| 716 | return s_invalid;
|
|---|
| 717 | }
|
|---|
| 718 | if (c >= 0x0000fffe && c <= 0x0000ffff) {
|
|---|
| 719 | return s_invalid;
|
|---|
| 720 | }
|
|---|
| 721 |
|
|---|
| 722 | return c;
|
|---|
| 723 | }
|
|---|
| 724 |
|
|---|
| 725 | void
|
|---|
| 726 | CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
|
|---|
| 727 | {
|
|---|
| 728 | UInt8 data[6];
|
|---|
| 729 |
|
|---|
| 730 | // handle characters outside the valid range
|
|---|
| 731 | if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
|
|---|
| 732 | setError(errors);
|
|---|
| 733 | c = s_replacement;
|
|---|
| 734 | }
|
|---|
| 735 |
|
|---|
| 736 | // convert to UTF-8
|
|---|
| 737 | if (c < 0x00000080) {
|
|---|
| 738 | data[0] = static_cast<UInt8>(c);
|
|---|
| 739 | dst.append(reinterpret_cast<char*>(data), 1);
|
|---|
| 740 | }
|
|---|
| 741 | else if (c < 0x00000800) {
|
|---|
| 742 | data[0] = static_cast<UInt8>(((c >> 6) & 0x0000001f) + 0xc0);
|
|---|
| 743 | data[1] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
|---|
| 744 | dst.append(reinterpret_cast<char*>(data), 2);
|
|---|
| 745 | }
|
|---|
| 746 | else if (c < 0x00010000) {
|
|---|
| 747 | data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0);
|
|---|
| 748 | data[1] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|---|
| 749 | data[2] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
|---|
| 750 | dst.append(reinterpret_cast<char*>(data), 3);
|
|---|
| 751 | }
|
|---|
| 752 | else if (c < 0x00200000) {
|
|---|
| 753 | data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0);
|
|---|
| 754 | data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
|---|
| 755 | data[2] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|---|
| 756 | data[3] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
|---|
| 757 | dst.append(reinterpret_cast<char*>(data), 4);
|
|---|
| 758 | }
|
|---|
| 759 | else if (c < 0x04000000) {
|
|---|
| 760 | data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8);
|
|---|
| 761 | data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
|
|---|
| 762 | data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
|---|
| 763 | data[3] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|---|
| 764 | data[4] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
|---|
| 765 | dst.append(reinterpret_cast<char*>(data), 5);
|
|---|
| 766 | }
|
|---|
| 767 | else if (c < 0x80000000) {
|
|---|
| 768 | data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc);
|
|---|
| 769 | data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80);
|
|---|
| 770 | data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80);
|
|---|
| 771 | data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80);
|
|---|
| 772 | data[4] = static_cast<UInt8>(((c >> 6) & 0x0000003f) + 0x80);
|
|---|
| 773 | data[5] = static_cast<UInt8>((c & 0x0000003f) + 0x80);
|
|---|
| 774 | dst.append(reinterpret_cast<char*>(data), 6);
|
|---|
| 775 | }
|
|---|
| 776 | else {
|
|---|
| 777 | assert(0 && "character out of range");
|
|---|
| 778 | }
|
|---|
| 779 | }
|
|---|