| 1 | /* | 
|---|
| 2 | * synergy -- mouse and keyboard sharing utility | 
|---|
| 3 | * Copyright (C) 2002 Chris Schoeneman | 
|---|
| 4 | * | 
|---|
| 5 | * This package is free software; you can redistribute it and/or | 
|---|
| 6 | * modify it under the terms of the GNU General Public License | 
|---|
| 7 | * found in the file COPYING that should have accompanied this file. | 
|---|
| 8 | * | 
|---|
| 9 | * This package is distributed in the hope that it will be useful, | 
|---|
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|---|
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|---|
| 12 | * GNU General Public License for more details. | 
|---|
| 13 | */ | 
|---|
| 14 |  | 
|---|
| 15 | #include "CUnicode.h" | 
|---|
| 16 | #include "CArch.h" | 
|---|
| 17 | #include <string.h> | 
|---|
| 18 |  | 
|---|
| 19 | // | 
|---|
| 20 | // local utility functions | 
|---|
| 21 | // | 
|---|
| 22 |  | 
|---|
| 23 | inline | 
|---|
| 24 | static | 
|---|
| 25 | UInt16 | 
|---|
| 26 | decode16(const UInt8* n, bool byteSwapped) | 
|---|
| 27 | { | 
|---|
| 28 | union x16 { | 
|---|
| 29 | UInt8   n8[2]; | 
|---|
| 30 | UInt16  n16; | 
|---|
| 31 | } c; | 
|---|
| 32 | if (byteSwapped) { | 
|---|
| 33 | c.n8[0] = n[1]; | 
|---|
| 34 | c.n8[1] = n[0]; | 
|---|
| 35 | } | 
|---|
| 36 | else { | 
|---|
| 37 | c.n8[0] = n[0]; | 
|---|
| 38 | c.n8[1] = n[1]; | 
|---|
| 39 | } | 
|---|
| 40 | return c.n16; | 
|---|
| 41 | } | 
|---|
| 42 |  | 
|---|
| 43 | inline | 
|---|
| 44 | static | 
|---|
| 45 | UInt32 | 
|---|
| 46 | decode32(const UInt8* n, bool byteSwapped) | 
|---|
| 47 | { | 
|---|
| 48 | union x32 { | 
|---|
| 49 | UInt8   n8[4]; | 
|---|
| 50 | UInt32  n32; | 
|---|
| 51 | } c; | 
|---|
| 52 | if (byteSwapped) { | 
|---|
| 53 | c.n8[0] = n[3]; | 
|---|
| 54 | c.n8[1] = n[2]; | 
|---|
| 55 | c.n8[2] = n[1]; | 
|---|
| 56 | c.n8[3] = n[0]; | 
|---|
| 57 | } | 
|---|
| 58 | else { | 
|---|
| 59 | c.n8[0] = n[0]; | 
|---|
| 60 | c.n8[1] = n[1]; | 
|---|
| 61 | c.n8[2] = n[2]; | 
|---|
| 62 | c.n8[3] = n[3]; | 
|---|
| 63 | } | 
|---|
| 64 | return c.n32; | 
|---|
| 65 | } | 
|---|
| 66 |  | 
|---|
// clear the caller's error flag.  \c errors may be NULL, in which case
// nothing happens.
inline
static
void
resetError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = false;
}
|---|
| 76 |  | 
|---|
// raise the caller's error flag.  \c errors may be NULL, in which case
// nothing happens.
inline
static
void
setError(bool* errors)
{
	if (errors == NULL) {
		return;
	}
	*errors = true;
}
|---|
| 86 |  | 
|---|
| 87 |  | 
|---|
| 88 | // | 
|---|
| 89 | // CUnicode | 
|---|
| 90 | // | 
|---|
| 91 |  | 
|---|
| 92 | UInt32                                  CUnicode::s_invalid     = 0x0000ffff; | 
|---|
| 93 | UInt32                                  CUnicode::s_replacement = 0x0000fffd; | 
|---|
| 94 |  | 
|---|
| 95 | bool | 
|---|
| 96 | CUnicode::isUTF8(const CString& src) | 
|---|
| 97 | { | 
|---|
| 98 | // convert and test each character | 
|---|
| 99 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); | 
|---|
| 100 | for (UInt32 n = src.size(); n > 0; ) { | 
|---|
| 101 | if (fromUTF8(data, n) == s_invalid) { | 
|---|
| 102 | return false; | 
|---|
| 103 | } | 
|---|
| 104 | } | 
|---|
| 105 | return true; | 
|---|
| 106 | } | 
|---|
| 107 |  | 
|---|
| 108 | CString | 
|---|
| 109 | CUnicode::UTF8ToUCS2(const CString& src, bool* errors) | 
|---|
| 110 | { | 
|---|
| 111 | // default to success | 
|---|
| 112 | resetError(errors); | 
|---|
| 113 |  | 
|---|
| 114 | // get size of input string and reserve some space in output | 
|---|
| 115 | UInt32 n = src.size(); | 
|---|
| 116 | CString dst; | 
|---|
| 117 | dst.reserve(2 * n); | 
|---|
| 118 |  | 
|---|
| 119 | // convert each character | 
|---|
| 120 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); | 
|---|
| 121 | while (n > 0) { | 
|---|
| 122 | UInt32 c = fromUTF8(data, n); | 
|---|
| 123 | if (c == s_invalid) { | 
|---|
| 124 | c = s_replacement; | 
|---|
| 125 | } | 
|---|
| 126 | else if (c >= 0x00010000) { | 
|---|
| 127 | setError(errors); | 
|---|
| 128 | c = s_replacement; | 
|---|
| 129 | } | 
|---|
| 130 | UInt16 ucs2 = static_cast<UInt16>(c); | 
|---|
| 131 | dst.append(reinterpret_cast<const char*>(&ucs2), 2); | 
|---|
| 132 | } | 
|---|
| 133 |  | 
|---|
| 134 | return dst; | 
|---|
| 135 | } | 
|---|
| 136 |  | 
|---|
| 137 | CString | 
|---|
| 138 | CUnicode::UTF8ToUCS4(const CString& src, bool* errors) | 
|---|
| 139 | { | 
|---|
| 140 | // default to success | 
|---|
| 141 | resetError(errors); | 
|---|
| 142 |  | 
|---|
| 143 | // get size of input string and reserve some space in output | 
|---|
| 144 | UInt32 n = src.size(); | 
|---|
| 145 | CString dst; | 
|---|
| 146 | dst.reserve(4 * n); | 
|---|
| 147 |  | 
|---|
| 148 | // convert each character | 
|---|
| 149 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); | 
|---|
| 150 | while (n > 0) { | 
|---|
| 151 | UInt32 c = fromUTF8(data, n); | 
|---|
| 152 | if (c == s_invalid) { | 
|---|
| 153 | c = s_replacement; | 
|---|
| 154 | } | 
|---|
| 155 | dst.append(reinterpret_cast<const char*>(&c), 4); | 
|---|
| 156 | } | 
|---|
| 157 |  | 
|---|
| 158 | return dst; | 
|---|
| 159 | } | 
|---|
| 160 |  | 
|---|
| 161 | CString | 
|---|
| 162 | CUnicode::UTF8ToUTF16(const CString& src, bool* errors) | 
|---|
| 163 | { | 
|---|
| 164 | // default to success | 
|---|
| 165 | resetError(errors); | 
|---|
| 166 |  | 
|---|
| 167 | // get size of input string and reserve some space in output | 
|---|
| 168 | UInt32 n = src.size(); | 
|---|
| 169 | CString dst; | 
|---|
| 170 | dst.reserve(2 * n); | 
|---|
| 171 |  | 
|---|
| 172 | // convert each character | 
|---|
| 173 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); | 
|---|
| 174 | while (n > 0) { | 
|---|
| 175 | UInt32 c = fromUTF8(data, n); | 
|---|
| 176 | if (c == s_invalid) { | 
|---|
| 177 | c = s_replacement; | 
|---|
| 178 | } | 
|---|
| 179 | else if (c >= 0x00110000) { | 
|---|
| 180 | setError(errors); | 
|---|
| 181 | c = s_replacement; | 
|---|
| 182 | } | 
|---|
| 183 | if (c < 0x00010000) { | 
|---|
| 184 | UInt16 ucs2 = static_cast<UInt16>(c); | 
|---|
| 185 | dst.append(reinterpret_cast<const char*>(&ucs2), 2); | 
|---|
| 186 | } | 
|---|
| 187 | else { | 
|---|
| 188 | c -= 0x00010000; | 
|---|
| 189 | UInt16 utf16h = static_cast<UInt16>((c >> 10) + 0xd800); | 
|---|
| 190 | UInt16 utf16l = static_cast<UInt16>((c & 0x03ff) + 0xdc00); | 
|---|
| 191 | dst.append(reinterpret_cast<const char*>(&utf16h), 2); | 
|---|
| 192 | dst.append(reinterpret_cast<const char*>(&utf16l), 2); | 
|---|
| 193 | } | 
|---|
| 194 | } | 
|---|
| 195 |  | 
|---|
| 196 | return dst; | 
|---|
| 197 | } | 
|---|
| 198 |  | 
|---|
| 199 | CString | 
|---|
| 200 | CUnicode::UTF8ToUTF32(const CString& src, bool* errors) | 
|---|
| 201 | { | 
|---|
| 202 | // default to success | 
|---|
| 203 | resetError(errors); | 
|---|
| 204 |  | 
|---|
| 205 | // get size of input string and reserve some space in output | 
|---|
| 206 | UInt32 n = src.size(); | 
|---|
| 207 | CString dst; | 
|---|
| 208 | dst.reserve(4 * n); | 
|---|
| 209 |  | 
|---|
| 210 | // convert each character | 
|---|
| 211 | const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); | 
|---|
| 212 | while (n > 0) { | 
|---|
| 213 | UInt32 c = fromUTF8(data, n); | 
|---|
| 214 | if (c == s_invalid) { | 
|---|
| 215 | c = s_replacement; | 
|---|
| 216 | } | 
|---|
| 217 | else if (c >= 0x00110000) { | 
|---|
| 218 | setError(errors); | 
|---|
| 219 | c = s_replacement; | 
|---|
| 220 | } | 
|---|
| 221 | dst.append(reinterpret_cast<const char*>(&c), 4); | 
|---|
| 222 | } | 
|---|
| 223 |  | 
|---|
| 224 | return dst; | 
|---|
| 225 | } | 
|---|
| 226 |  | 
|---|
| 227 | CString | 
|---|
| 228 | CUnicode::UTF8ToText(const CString& src, bool* errors) | 
|---|
| 229 | { | 
|---|
| 230 | // default to success | 
|---|
| 231 | resetError(errors); | 
|---|
| 232 |  | 
|---|
| 233 | // convert to wide char | 
|---|
| 234 | UInt32 size; | 
|---|
| 235 | wchar_t* tmp = UTF8ToWideChar(src, size, errors); | 
|---|
| 236 |  | 
|---|
| 237 | // convert string to multibyte | 
|---|
| 238 | int len   = ARCH->convStringWCToMB(NULL, tmp, size, errors); | 
|---|
| 239 | char* mbs = new char[len + 1]; | 
|---|
| 240 | ARCH->convStringWCToMB(mbs, tmp, size, errors); | 
|---|
| 241 | CString text(mbs, len); | 
|---|
| 242 |  | 
|---|
| 243 | // clean up | 
|---|
| 244 | delete[] mbs; | 
|---|
| 245 | delete[] tmp; | 
|---|
| 246 |  | 
|---|
| 247 | return text; | 
|---|
| 248 | } | 
|---|
| 249 |  | 
|---|
| 250 | CString | 
|---|
| 251 | CUnicode::UCS2ToUTF8(const CString& src, bool* errors) | 
|---|
| 252 | { | 
|---|
| 253 | // default to success | 
|---|
| 254 | resetError(errors); | 
|---|
| 255 |  | 
|---|
| 256 | // convert | 
|---|
| 257 | UInt32 n = src.size() >> 1; | 
|---|
| 258 | return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); | 
|---|
| 259 | } | 
|---|
| 260 |  | 
|---|
| 261 | CString | 
|---|
| 262 | CUnicode::UCS4ToUTF8(const CString& src, bool* errors) | 
|---|
| 263 | { | 
|---|
| 264 | // default to success | 
|---|
| 265 | resetError(errors); | 
|---|
| 266 |  | 
|---|
| 267 | // convert | 
|---|
| 268 | UInt32 n = src.size() >> 2; | 
|---|
| 269 | return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); | 
|---|
| 270 | } | 
|---|
| 271 |  | 
|---|
| 272 | CString | 
|---|
| 273 | CUnicode::UTF16ToUTF8(const CString& src, bool* errors) | 
|---|
| 274 | { | 
|---|
| 275 | // default to success | 
|---|
| 276 | resetError(errors); | 
|---|
| 277 |  | 
|---|
| 278 | // convert | 
|---|
| 279 | UInt32 n = src.size() >> 1; | 
|---|
| 280 | return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); | 
|---|
| 281 | } | 
|---|
| 282 |  | 
|---|
| 283 | CString | 
|---|
| 284 | CUnicode::UTF32ToUTF8(const CString& src, bool* errors) | 
|---|
| 285 | { | 
|---|
| 286 | // default to success | 
|---|
| 287 | resetError(errors); | 
|---|
| 288 |  | 
|---|
| 289 | // convert | 
|---|
| 290 | UInt32 n = src.size() >> 2; | 
|---|
| 291 | return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors); | 
|---|
| 292 | } | 
|---|
| 293 |  | 
|---|
| 294 | CString | 
|---|
| 295 | CUnicode::textToUTF8(const CString& src, bool* errors) | 
|---|
| 296 | { | 
|---|
| 297 | // default to success | 
|---|
| 298 | resetError(errors); | 
|---|
| 299 |  | 
|---|
| 300 | // convert string to wide characters | 
|---|
| 301 | UInt32 n     = src.size(); | 
|---|
| 302 | int len      = ARCH->convStringMBToWC(NULL, src.c_str(), n, errors); | 
|---|
| 303 | wchar_t* wcs = new wchar_t[len + 1]; | 
|---|
| 304 | ARCH->convStringMBToWC(wcs, src.c_str(), n, errors); | 
|---|
| 305 |  | 
|---|
| 306 | // convert to UTF8 | 
|---|
| 307 | CString utf8 = wideCharToUTF8(wcs, len, errors); | 
|---|
| 308 |  | 
|---|
| 309 | // clean up | 
|---|
| 310 | delete[] wcs; | 
|---|
| 311 |  | 
|---|
| 312 | return utf8; | 
|---|
| 313 | } | 
|---|
| 314 |  | 
|---|
| 315 | wchar_t* | 
|---|
| 316 | CUnicode::UTF8ToWideChar(const CString& src, UInt32& size, bool* errors) | 
|---|
| 317 | { | 
|---|
| 318 | // convert to platform's wide character encoding | 
|---|
| 319 | CString tmp; | 
|---|
| 320 | switch (ARCH->getWideCharEncoding()) { | 
|---|
| 321 | case IArchString::kUCS2: | 
|---|
| 322 | tmp = UTF8ToUCS2(src, errors); | 
|---|
| 323 | size = tmp.size() >> 1; | 
|---|
| 324 | break; | 
|---|
| 325 |  | 
|---|
| 326 | case IArchString::kUCS4: | 
|---|
| 327 | tmp = UTF8ToUCS4(src, errors); | 
|---|
| 328 | size = tmp.size() >> 2; | 
|---|
| 329 | break; | 
|---|
| 330 |  | 
|---|
| 331 | case IArchString::kUTF16: | 
|---|
| 332 | tmp = UTF8ToUTF16(src, errors); | 
|---|
| 333 | size = tmp.size() >> 1; | 
|---|
| 334 | break; | 
|---|
| 335 |  | 
|---|
| 336 | case IArchString::kUTF32: | 
|---|
| 337 | tmp = UTF8ToUTF32(src, errors); | 
|---|
| 338 | size = tmp.size() >> 2; | 
|---|
| 339 | break; | 
|---|
| 340 |  | 
|---|
| 341 | default: | 
|---|
| 342 | assert(0 && "unknown wide character encoding"); | 
|---|
| 343 | } | 
|---|
| 344 |  | 
|---|
| 345 | // copy to a wchar_t array | 
|---|
| 346 | wchar_t* dst = new wchar_t[size]; | 
|---|
| 347 | ::memcpy(dst, tmp.data(), sizeof(wchar_t) * size); | 
|---|
| 348 | return dst; | 
|---|
| 349 | } | 
|---|
| 350 |  | 
|---|
| 351 | CString | 
|---|
| 352 | CUnicode::wideCharToUTF8(const wchar_t* src, UInt32 size, bool* errors) | 
|---|
| 353 | { | 
|---|
| 354 | // convert from platform's wide character encoding. | 
|---|
| 355 | // note -- this must include a wide nul character (independent of | 
|---|
| 356 | // the CString's nul character). | 
|---|
| 357 | switch (ARCH->getWideCharEncoding()) { | 
|---|
| 358 | case IArchString::kUCS2: | 
|---|
| 359 | return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); | 
|---|
| 360 |  | 
|---|
| 361 | case IArchString::kUCS4: | 
|---|
| 362 | return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); | 
|---|
| 363 |  | 
|---|
| 364 | case IArchString::kUTF16: | 
|---|
| 365 | return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); | 
|---|
| 366 |  | 
|---|
| 367 | case IArchString::kUTF32: | 
|---|
| 368 | return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src), size, errors); | 
|---|
| 369 |  | 
|---|
| 370 | default: | 
|---|
| 371 | assert(0 && "unknown wide character encoding"); | 
|---|
| 372 | return CString(); | 
|---|
| 373 | } | 
|---|
| 374 | } | 
|---|
| 375 |  | 
|---|
| 376 | CString | 
|---|
| 377 | CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors) | 
|---|
| 378 | { | 
|---|
| 379 | // make some space | 
|---|
| 380 | CString dst; | 
|---|
| 381 | dst.reserve(n); | 
|---|
| 382 |  | 
|---|
| 383 | // check if first character is 0xfffe or 0xfeff | 
|---|
| 384 | bool byteSwapped = false; | 
|---|
| 385 | if (n >= 1) { | 
|---|
| 386 | switch (decode16(data, false)) { | 
|---|
| 387 | case 0x0000feff: | 
|---|
| 388 | data += 2; | 
|---|
| 389 | --n; | 
|---|
| 390 | break; | 
|---|
| 391 |  | 
|---|
| 392 | case 0x0000fffe: | 
|---|
| 393 | byteSwapped = true; | 
|---|
| 394 | data += 2; | 
|---|
| 395 | --n; | 
|---|
| 396 | break; | 
|---|
| 397 |  | 
|---|
| 398 | default: | 
|---|
| 399 | break; | 
|---|
| 400 | } | 
|---|
| 401 | } | 
|---|
| 402 |  | 
|---|
| 403 | // convert each character | 
|---|
| 404 | for (; n > 0; data += 2, --n) { | 
|---|
| 405 | UInt32 c = decode16(data, byteSwapped); | 
|---|
| 406 | toUTF8(dst, c, errors); | 
|---|
| 407 | } | 
|---|
| 408 |  | 
|---|
| 409 | return dst; | 
|---|
| 410 | } | 
|---|
| 411 |  | 
|---|
| 412 | CString | 
|---|
| 413 | CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors) | 
|---|
| 414 | { | 
|---|
| 415 | // make some space | 
|---|
| 416 | CString dst; | 
|---|
| 417 | dst.reserve(n); | 
|---|
| 418 |  | 
|---|
| 419 | // check if first character is 0xfffe or 0xfeff | 
|---|
| 420 | bool byteSwapped = false; | 
|---|
| 421 | if (n >= 1) { | 
|---|
| 422 | switch (decode32(data, false)) { | 
|---|
| 423 | case 0x0000feff: | 
|---|
| 424 | data += 4; | 
|---|
| 425 | --n; | 
|---|
| 426 | break; | 
|---|
| 427 |  | 
|---|
| 428 | case 0x0000fffe: | 
|---|
| 429 | byteSwapped = true; | 
|---|
| 430 | data += 4; | 
|---|
| 431 | --n; | 
|---|
| 432 | break; | 
|---|
| 433 |  | 
|---|
| 434 | default: | 
|---|
| 435 | break; | 
|---|
| 436 | } | 
|---|
| 437 | } | 
|---|
| 438 |  | 
|---|
| 439 | // convert each character | 
|---|
| 440 | for (; n > 0; data += 4, --n) { | 
|---|
| 441 | UInt32 c = decode32(data, byteSwapped); | 
|---|
| 442 | toUTF8(dst, c, errors); | 
|---|
| 443 | } | 
|---|
| 444 |  | 
|---|
| 445 | return dst; | 
|---|
| 446 | } | 
|---|
| 447 |  | 
|---|
| 448 | CString | 
|---|
| 449 | CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors) | 
|---|
| 450 | { | 
|---|
| 451 | // make some space | 
|---|
| 452 | CString dst; | 
|---|
| 453 | dst.reserve(n); | 
|---|
| 454 |  | 
|---|
| 455 | // check if first character is 0xfffe or 0xfeff | 
|---|
| 456 | bool byteSwapped = false; | 
|---|
| 457 | if (n >= 1) { | 
|---|
| 458 | switch (decode16(data, false)) { | 
|---|
| 459 | case 0x0000feff: | 
|---|
| 460 | data += 2; | 
|---|
| 461 | --n; | 
|---|
| 462 | break; | 
|---|
| 463 |  | 
|---|
| 464 | case 0x0000fffe: | 
|---|
| 465 | byteSwapped = true; | 
|---|
| 466 | data += 2; | 
|---|
| 467 | --n; | 
|---|
| 468 | break; | 
|---|
| 469 |  | 
|---|
| 470 | default: | 
|---|
| 471 | break; | 
|---|
| 472 | } | 
|---|
| 473 | } | 
|---|
| 474 |  | 
|---|
| 475 | // convert each character | 
|---|
| 476 | for (; n > 0; data += 2, --n) { | 
|---|
| 477 | UInt32 c = decode16(data, byteSwapped); | 
|---|
| 478 | if (c < 0x0000d800 || c > 0x0000dfff) { | 
|---|
| 479 | toUTF8(dst, c, errors); | 
|---|
| 480 | } | 
|---|
| 481 | else if (n == 1) { | 
|---|
| 482 | // error -- missing second word | 
|---|
| 483 | setError(errors); | 
|---|
| 484 | toUTF8(dst, s_replacement, NULL); | 
|---|
| 485 | } | 
|---|
| 486 | else if (c >= 0x0000d800 && c <= 0x0000dbff) { | 
|---|
| 487 | UInt32 c2 = decode16(data, byteSwapped); | 
|---|
| 488 | data += 2; | 
|---|
| 489 | --n; | 
|---|
| 490 | if (c2 < 0x0000dc00 || c2 > 0x0000dfff) { | 
|---|
| 491 | // error -- [d800,dbff] not followed by [dc00,dfff] | 
|---|
| 492 | setError(errors); | 
|---|
| 493 | toUTF8(dst, s_replacement, NULL); | 
|---|
| 494 | } | 
|---|
| 495 | else { | 
|---|
| 496 | c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000; | 
|---|
| 497 | toUTF8(dst, c, errors); | 
|---|
| 498 | } | 
|---|
| 499 | } | 
|---|
| 500 | else { | 
|---|
| 501 | // error -- [dc00,dfff] without leading [d800,dbff] | 
|---|
| 502 | setError(errors); | 
|---|
| 503 | toUTF8(dst, s_replacement, NULL); | 
|---|
| 504 | } | 
|---|
| 505 | } | 
|---|
| 506 |  | 
|---|
| 507 | return dst; | 
|---|
| 508 | } | 
|---|
| 509 |  | 
|---|
| 510 | CString | 
|---|
| 511 | CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors) | 
|---|
| 512 | { | 
|---|
| 513 | // make some space | 
|---|
| 514 | CString dst; | 
|---|
| 515 | dst.reserve(n); | 
|---|
| 516 |  | 
|---|
| 517 | // check if first character is 0xfffe or 0xfeff | 
|---|
| 518 | bool byteSwapped = false; | 
|---|
| 519 | if (n >= 1) { | 
|---|
| 520 | switch (decode32(data, false)) { | 
|---|
| 521 | case 0x0000feff: | 
|---|
| 522 | data += 4; | 
|---|
| 523 | --n; | 
|---|
| 524 | break; | 
|---|
| 525 |  | 
|---|
| 526 | case 0x0000fffe: | 
|---|
| 527 | byteSwapped = true; | 
|---|
| 528 | data += 4; | 
|---|
| 529 | --n; | 
|---|
| 530 | break; | 
|---|
| 531 |  | 
|---|
| 532 | default: | 
|---|
| 533 | break; | 
|---|
| 534 | } | 
|---|
| 535 | } | 
|---|
| 536 |  | 
|---|
| 537 | // convert each character | 
|---|
| 538 | for (; n > 0; data += 4, --n) { | 
|---|
| 539 | UInt32 c = decode32(data, byteSwapped); | 
|---|
| 540 | if (c >= 0x00110000) { | 
|---|
| 541 | setError(errors); | 
|---|
| 542 | c = s_replacement; | 
|---|
| 543 | } | 
|---|
| 544 | toUTF8(dst, c, errors); | 
|---|
| 545 | } | 
|---|
| 546 |  | 
|---|
| 547 | return dst; | 
|---|
| 548 | } | 
|---|
| 549 |  | 
|---|
| 550 | UInt32 | 
|---|
| 551 | CUnicode::fromUTF8(const UInt8*& data, UInt32& n) | 
|---|
| 552 | { | 
|---|
| 553 | assert(data != NULL); | 
|---|
| 554 | assert(n    != 0); | 
|---|
| 555 |  | 
|---|
| 556 | // compute character encoding length, checking for overlong | 
|---|
| 557 | // sequences (i.e. characters that don't use the shortest | 
|---|
| 558 | // possible encoding). | 
|---|
| 559 | UInt32 size; | 
|---|
| 560 | if (data[0] < 0x80) { | 
|---|
| 561 | // 0xxxxxxx | 
|---|
| 562 | size = 1; | 
|---|
| 563 | } | 
|---|
| 564 | else if (data[0] < 0xc0) { | 
|---|
| 565 | // 10xxxxxx -- in the middle of a multibyte character.  counts | 
|---|
| 566 | // as one invalid character. | 
|---|
| 567 | --n; | 
|---|
| 568 | ++data; | 
|---|
| 569 | return s_invalid; | 
|---|
| 570 | } | 
|---|
| 571 | else if (data[0] < 0xe0) { | 
|---|
| 572 | // 110xxxxx | 
|---|
| 573 | size = 2; | 
|---|
| 574 | } | 
|---|
| 575 | else if (data[0] < 0xf0) { | 
|---|
| 576 | // 1110xxxx | 
|---|
| 577 | size = 3; | 
|---|
| 578 | } | 
|---|
| 579 | else if (data[0] < 0xf8) { | 
|---|
| 580 | // 11110xxx | 
|---|
| 581 | size = 4; | 
|---|
| 582 | } | 
|---|
| 583 | else if (data[0] < 0xfc) { | 
|---|
| 584 | // 111110xx | 
|---|
| 585 | size = 5; | 
|---|
| 586 | } | 
|---|
| 587 | else if (data[0] < 0xfe) { | 
|---|
| 588 | // 1111110x | 
|---|
| 589 | size = 6; | 
|---|
| 590 | } | 
|---|
| 591 | else { | 
|---|
| 592 | // invalid sequence.  dunno how many bytes to skip so skip one. | 
|---|
| 593 | --n; | 
|---|
| 594 | ++data; | 
|---|
| 595 | return s_invalid; | 
|---|
| 596 | } | 
|---|
| 597 |  | 
|---|
| 598 | // make sure we have enough data | 
|---|
| 599 | if (size > n) { | 
|---|
| 600 | data += n; | 
|---|
| 601 | n     = 0; | 
|---|
| 602 | return s_invalid; | 
|---|
| 603 | } | 
|---|
| 604 |  | 
|---|
| 605 | // extract character | 
|---|
| 606 | UInt32 c; | 
|---|
| 607 | switch (size) { | 
|---|
| 608 | case 1: | 
|---|
| 609 | c = static_cast<UInt32>(data[0]); | 
|---|
| 610 | break; | 
|---|
| 611 |  | 
|---|
| 612 | case 2: | 
|---|
| 613 | c = ((static_cast<UInt32>(data[0]) & 0x1f) <<  6) | | 
|---|
| 614 | ((static_cast<UInt32>(data[1]) & 0x3f)      ); | 
|---|
| 615 | break; | 
|---|
| 616 |  | 
|---|
| 617 | case 3: | 
|---|
| 618 | c = ((static_cast<UInt32>(data[0]) & 0x0f) << 12) | | 
|---|
| 619 | ((static_cast<UInt32>(data[1]) & 0x3f) <<  6) | | 
|---|
| 620 | ((static_cast<UInt32>(data[2]) & 0x3f)      ); | 
|---|
| 621 | break; | 
|---|
| 622 |  | 
|---|
| 623 | case 4: | 
|---|
| 624 | c = ((static_cast<UInt32>(data[0]) & 0x07) << 18) | | 
|---|
| 625 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) | | 
|---|
| 626 | ((static_cast<UInt32>(data[1]) & 0x3f) <<  6) | | 
|---|
| 627 | ((static_cast<UInt32>(data[1]) & 0x3f)      ); | 
|---|
| 628 | break; | 
|---|
| 629 |  | 
|---|
| 630 | case 5: | 
|---|
| 631 | c = ((static_cast<UInt32>(data[0]) & 0x03) << 24) | | 
|---|
| 632 | ((static_cast<UInt32>(data[1]) & 0x3f) << 18) | | 
|---|
| 633 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) | | 
|---|
| 634 | ((static_cast<UInt32>(data[1]) & 0x3f) <<  6) | | 
|---|
| 635 | ((static_cast<UInt32>(data[1]) & 0x3f)      ); | 
|---|
| 636 | break; | 
|---|
| 637 |  | 
|---|
| 638 | case 6: | 
|---|
| 639 | c = ((static_cast<UInt32>(data[0]) & 0x01) << 30) | | 
|---|
| 640 | ((static_cast<UInt32>(data[1]) & 0x3f) << 24) | | 
|---|
| 641 | ((static_cast<UInt32>(data[1]) & 0x3f) << 18) | | 
|---|
| 642 | ((static_cast<UInt32>(data[1]) & 0x3f) << 12) | | 
|---|
| 643 | ((static_cast<UInt32>(data[1]) & 0x3f) <<  6) | | 
|---|
| 644 | ((static_cast<UInt32>(data[1]) & 0x3f)      ); | 
|---|
| 645 | break; | 
|---|
| 646 |  | 
|---|
| 647 | default: | 
|---|
| 648 | assert(0 && "invalid size"); | 
|---|
| 649 | return s_invalid; | 
|---|
| 650 | } | 
|---|
| 651 |  | 
|---|
| 652 | // check that all bytes after the first have the pattern 10xxxxxx. | 
|---|
| 653 | // truncated sequences are treated as a single malformed character. | 
|---|
| 654 | bool truncated = false; | 
|---|
| 655 | switch (size) { | 
|---|
| 656 | case 6: | 
|---|
| 657 | if ((data[5] & 0xc0) != 0x80) { | 
|---|
| 658 | truncated = true; | 
|---|
| 659 | size = 5; | 
|---|
| 660 | } | 
|---|
| 661 | // fall through | 
|---|
| 662 |  | 
|---|
| 663 | case 5: | 
|---|
| 664 | if ((data[4] & 0xc0) != 0x80) { | 
|---|
| 665 | truncated = true; | 
|---|
| 666 | size = 4; | 
|---|
| 667 | } | 
|---|
| 668 | // fall through | 
|---|
| 669 |  | 
|---|
| 670 | case 4: | 
|---|
| 671 | if ((data[3] & 0xc0) != 0x80) { | 
|---|
| 672 | truncated = true; | 
|---|
| 673 | size = 3; | 
|---|
| 674 | } | 
|---|
| 675 | // fall through | 
|---|
| 676 |  | 
|---|
| 677 | case 3: | 
|---|
| 678 | if ((data[2] & 0xc0) != 0x80) { | 
|---|
| 679 | truncated = true; | 
|---|
| 680 | size = 2; | 
|---|
| 681 | } | 
|---|
| 682 | // fall through | 
|---|
| 683 |  | 
|---|
| 684 | case 2: | 
|---|
| 685 | if ((data[1] & 0xc0) != 0x80) { | 
|---|
| 686 | truncated = true; | 
|---|
| 687 | size = 1; | 
|---|
| 688 | } | 
|---|
| 689 | } | 
|---|
| 690 |  | 
|---|
| 691 | // update parameters | 
|---|
| 692 | data += size; | 
|---|
| 693 | n    -= size; | 
|---|
| 694 |  | 
|---|
| 695 | // invalid if sequence was truncated | 
|---|
| 696 | if (truncated) { | 
|---|
| 697 | return s_invalid; | 
|---|
| 698 | } | 
|---|
| 699 |  | 
|---|
| 700 | // check for characters that didn't use the smallest possible encoding | 
|---|
| 701 | static UInt32 s_minChar[] = { | 
|---|
| 702 | 0, | 
|---|
| 703 | 0x00000000, | 
|---|
| 704 | 0x00000080, | 
|---|
| 705 | 0x00000800, | 
|---|
| 706 | 0x00010000, | 
|---|
| 707 | 0x00200000, | 
|---|
| 708 | 0x04000000 | 
|---|
| 709 | }; | 
|---|
| 710 | if (c < s_minChar[size]) { | 
|---|
| 711 | return s_invalid; | 
|---|
| 712 | } | 
|---|
| 713 |  | 
|---|
| 714 | // check for characters not in ISO-10646 | 
|---|
| 715 | if (c >= 0x0000d800 && c <= 0x0000dfff) { | 
|---|
| 716 | return s_invalid; | 
|---|
| 717 | } | 
|---|
| 718 | if (c >= 0x0000fffe && c <= 0x0000ffff) { | 
|---|
| 719 | return s_invalid; | 
|---|
| 720 | } | 
|---|
| 721 |  | 
|---|
| 722 | return c; | 
|---|
| 723 | } | 
|---|
| 724 |  | 
|---|
| 725 | void | 
|---|
| 726 | CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors) | 
|---|
| 727 | { | 
|---|
| 728 | UInt8 data[6]; | 
|---|
| 729 |  | 
|---|
| 730 | // handle characters outside the valid range | 
|---|
| 731 | if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) { | 
|---|
| 732 | setError(errors); | 
|---|
| 733 | c = s_replacement; | 
|---|
| 734 | } | 
|---|
| 735 |  | 
|---|
| 736 | // convert to UTF-8 | 
|---|
| 737 | if (c < 0x00000080) { | 
|---|
| 738 | data[0] = static_cast<UInt8>(c); | 
|---|
| 739 | dst.append(reinterpret_cast<char*>(data), 1); | 
|---|
| 740 | } | 
|---|
| 741 | else if (c < 0x00000800) { | 
|---|
| 742 | data[0] = static_cast<UInt8>(((c >>  6) & 0x0000001f) + 0xc0); | 
|---|
| 743 | data[1] = static_cast<UInt8>((c         & 0x0000003f) + 0x80); | 
|---|
| 744 | dst.append(reinterpret_cast<char*>(data), 2); | 
|---|
| 745 | } | 
|---|
| 746 | else if (c < 0x00010000) { | 
|---|
| 747 | data[0] = static_cast<UInt8>(((c >> 12) & 0x0000000f) + 0xe0); | 
|---|
| 748 | data[1] = static_cast<UInt8>(((c >>  6) & 0x0000003f) + 0x80); | 
|---|
| 749 | data[2] = static_cast<UInt8>((c         & 0x0000003f) + 0x80); | 
|---|
| 750 | dst.append(reinterpret_cast<char*>(data), 3); | 
|---|
| 751 | } | 
|---|
| 752 | else if (c < 0x00200000) { | 
|---|
| 753 | data[0] = static_cast<UInt8>(((c >> 18) & 0x00000007) + 0xf0); | 
|---|
| 754 | data[1] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80); | 
|---|
| 755 | data[2] = static_cast<UInt8>(((c >>  6) & 0x0000003f) + 0x80); | 
|---|
| 756 | data[3] = static_cast<UInt8>((c         & 0x0000003f) + 0x80); | 
|---|
| 757 | dst.append(reinterpret_cast<char*>(data), 4); | 
|---|
| 758 | } | 
|---|
| 759 | else if (c < 0x04000000) { | 
|---|
| 760 | data[0] = static_cast<UInt8>(((c >> 24) & 0x00000003) + 0xf8); | 
|---|
| 761 | data[1] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80); | 
|---|
| 762 | data[2] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80); | 
|---|
| 763 | data[3] = static_cast<UInt8>(((c >>  6) & 0x0000003f) + 0x80); | 
|---|
| 764 | data[4] = static_cast<UInt8>((c         & 0x0000003f) + 0x80); | 
|---|
| 765 | dst.append(reinterpret_cast<char*>(data), 5); | 
|---|
| 766 | } | 
|---|
| 767 | else if (c < 0x80000000) { | 
|---|
| 768 | data[0] = static_cast<UInt8>(((c >> 30) & 0x00000001) + 0xfc); | 
|---|
| 769 | data[1] = static_cast<UInt8>(((c >> 24) & 0x0000003f) + 0x80); | 
|---|
| 770 | data[2] = static_cast<UInt8>(((c >> 18) & 0x0000003f) + 0x80); | 
|---|
| 771 | data[3] = static_cast<UInt8>(((c >> 12) & 0x0000003f) + 0x80); | 
|---|
| 772 | data[4] = static_cast<UInt8>(((c >>  6) & 0x0000003f) + 0x80); | 
|---|
| 773 | data[5] = static_cast<UInt8>((c         & 0x0000003f) + 0x80); | 
|---|
| 774 | dst.append(reinterpret_cast<char*>(data), 6); | 
|---|
| 775 | } | 
|---|
| 776 | else { | 
|---|
| 777 | assert(0 && "character out of range"); | 
|---|
| 778 | } | 
|---|
| 779 | } | 
|---|