source: vendor/current/lib/util/charset/iconv.c

Last change on this file was 988, checked in by Silvan Scherrer, 9 years ago

Samba Server: update vendor to version 4.4.3

File size: 20.3 KB
Line 
1/*
2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include "includes.h"
22#include "../lib/util/dlinklist.h"
23#include "system/iconv.h"
24#include "system/filesys.h"
25#include "charset_proto.h"
26
27#ifdef strcasecmp
28#undef strcasecmp
29#endif
30
31/**
32 * @file
33 *
34 * @brief Samba wrapper/stub for iconv character set conversion.
35 *
36 * iconv is the XPG2 interface for converting between character
37 * encodings. This file provides a Samba wrapper around it, and also
38 * a simple reimplementation that is used if the system does not
39 * implement iconv.
40 *
41 * Samba only works with encodings that are supersets of ASCII: ascii
42 * characters like whitespace can be tested for directly, multibyte
43 * sequences start with a byte with the high bit set, and strings are
44 * terminated by a nul byte.
45 *
46 * Note that the only function provided by iconv is conversion between
47 * characters. It doesn't directly support operations like
48 * uppercasing or comparison. We have to convert to UTF-16LE and
49 * compare there.
50 *
51 * @sa Samba Developers Guide
52 **/
53
/*
 * Forward declarations of the conversion handlers implemented further
 * down in this file. They all share one iconv()-style signature so that
 * they can be stored in the builtin_functions table.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf,
				size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
65
66static const struct charset_functions builtin_functions[] = {
67 /* windows is closest to UTF-16 */
68 {"UCS-2LE", iconv_copy, iconv_copy},
69 {"UTF-16LE", iconv_copy, iconv_copy},
70 {"UCS-2BE", iconv_swab, iconv_swab},
71 {"UTF-16BE", iconv_swab, iconv_swab},
72
73 /* we include the UTF-8 alias to cope with differing locale settings */
74 {"UTF8", utf8_pull, utf8_push},
75 {"UTF-8", utf8_pull, utf8_push},
76
77 /* this handles the munging needed for String2Key */
78 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy, true},
79
80 {"ASCII", ascii_pull, ascii_push},
81 {"646", ascii_pull, ascii_push},
82 {"ISO-8859-1", latin1_pull, latin1_push},
83#ifdef DEVELOPER
84 {"WEIRD", weird_pull, weird_push, true},
85#endif
86#ifdef DARWINOS
87 {"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push, true},
88#endif
89 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push, true}
90
91};
92
93#ifdef HAVE_NATIVE_ICONV
94/* if there was an error then reset the internal state,
95 this ensures that we don't have a shift state remaining for
96 character sets like SJIS */
97static size_t sys_iconv(void *cd,
98 const char **inbuf, size_t *inbytesleft,
99 char **outbuf, size_t *outbytesleft)
100{
101 size_t ret = iconv((iconv_t)cd,
102 discard_const_p(char *, inbuf), inbytesleft,
103 outbuf, outbytesleft);
104 if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
105 return ret;
106}
107#endif
108
109/**
110 * This is a simple portable iconv() implementaion.
111 *
112 * It only knows about a very small number of character sets - just
113 * enough that Samba works on systems that don't have iconv.
114 **/
115_PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
116 const char **inbuf, size_t *inbytesleft,
117 char **outbuf, size_t *outbytesleft)
118{
119 /* in many cases we can go direct */
120 if (cd->direct) {
121 return cd->direct(cd->cd_direct,
122 inbuf, inbytesleft, outbuf, outbytesleft);
123 }
124
125 /* otherwise we have to do it chunks at a time */
126 {
127#ifndef SMB_ICONV_BUFSIZE
128#define SMB_ICONV_BUFSIZE 2048
129#endif
130 size_t bufsize;
131 char cvtbuf[SMB_ICONV_BUFSIZE];
132
133 while (*inbytesleft > 0) {
134 char *bufp1 = cvtbuf;
135 const char *bufp2 = cvtbuf;
136 int saved_errno = errno;
137 bool pull_failed = false;
138 bufsize = SMB_ICONV_BUFSIZE;
139
140 if (cd->pull(cd->cd_pull,
141 inbuf, inbytesleft, &bufp1, &bufsize) == -1
142 && errno != E2BIG) {
143 saved_errno = errno;
144 pull_failed = true;
145 }
146
147 bufsize = SMB_ICONV_BUFSIZE - bufsize;
148
149 if (cd->push(cd->cd_push,
150 &bufp2, &bufsize,
151 outbuf, outbytesleft) == -1) {
152 return -1;
153 } else if (pull_failed) {
154 /* We want the pull errno if possible */
155 errno = saved_errno;
156 return -1;
157 }
158 }
159 }
160
161 return 0;
162}
163
/*
 * Tell whether a charset name is one of the two names this file treats
 * as little-endian UTF-16: "UCS-2LE" or "UTF-16LE", compared without
 * regard to case.
 */
static bool is_utf16(const char *name)
{
	static const char *const utf16le_names[] = { "UCS-2LE", "UTF-16LE" };
	size_t i;

	for (i = 0; i < sizeof(utf16le_names) / sizeof(utf16le_names[0]); i++) {
		if (strcasecmp(name, utf16le_names[i]) == 0) {
			return true;
		}
	}

	return false;
}
169
170static int smb_iconv_t_destructor(smb_iconv_t hwd)
171{
172#ifdef HAVE_NATIVE_ICONV
173 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
174 iconv_close(hwd->cd_pull);
175 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
176 iconv_close(hwd->cd_push);
177 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
178 iconv_close(hwd->cd_direct);
179#endif
180
181 return 0;
182}
183
184_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
185 const char *fromcode, bool use_builtin_handlers)
186{
187 smb_iconv_t ret;
188 const struct charset_functions *from=NULL, *to=NULL;
189 int i;
190
191 ret = (smb_iconv_t)talloc_named(mem_ctx,
192 sizeof(*ret),
193 "iconv(%s,%s)", tocode, fromcode);
194 if (!ret) {
195 errno = ENOMEM;
196 return (smb_iconv_t)-1;
197 }
198 memset(ret, 0, sizeof(*ret));
199 talloc_set_destructor(ret, smb_iconv_t_destructor);
200
201 /* check for the simplest null conversion */
202 if (strcmp(fromcode, tocode) == 0) {
203 ret->direct = iconv_copy;
204 return ret;
205 }
206
207 /* check if we have a builtin function for this conversion */
208 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
209 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
210 if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
211 from = &builtin_functions[i];
212 }
213 }
214 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
215 if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
216 to = &builtin_functions[i];
217 }
218 }
219 }
220
221#ifdef HAVE_NATIVE_ICONV
222 /* the from and to variables indicate a samba module or
223 * internal conversion, ret->pull and ret->push are
224 * initialised only in this block for iconv based
225 * conversions */
226
227 if (from == NULL) {
228 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
229 if (ret->cd_pull == (iconv_t)-1)
230 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
231 if (ret->cd_pull != (iconv_t)-1) {
232 ret->pull = sys_iconv;
233 }
234 }
235
236 if (to == NULL) {
237 ret->cd_push = iconv_open(tocode, "UTF-16LE");
238 if (ret->cd_push == (iconv_t)-1)
239 ret->cd_push = iconv_open(tocode, "UCS-2LE");
240 if (ret->cd_push != (iconv_t)-1) {
241 ret->push = sys_iconv;
242 }
243 }
244#endif
245
246 if (ret->pull == NULL && from == NULL) {
247 goto failed;
248 }
249
250 if (ret->push == NULL && to == NULL) {
251 goto failed;
252 }
253
254 /* check for conversion to/from ucs2 */
255 if (is_utf16(fromcode) && to) {
256 ret->direct = to->push;
257 return ret;
258 }
259 if (is_utf16(tocode) && from) {
260 ret->direct = from->pull;
261 return ret;
262 }
263
264#ifdef HAVE_NATIVE_ICONV
265 if (is_utf16(fromcode)) {
266 ret->direct = sys_iconv;
267 ret->cd_direct = ret->cd_push;
268 ret->cd_push = NULL;
269 return ret;
270 }
271 if (is_utf16(tocode)) {
272 ret->direct = sys_iconv;
273 ret->cd_direct = ret->cd_pull;
274 ret->cd_pull = NULL;
275 return ret;
276 }
277#endif
278
279 /* the general case has to go via a buffer */
280 if (!ret->pull) ret->pull = from->pull;
281 if (!ret->push) ret->push = to->push;
282 return ret;
283
284failed:
285 talloc_free(ret);
286 errno = EINVAL;
287 return (smb_iconv_t)-1;
288}
289
290/*
291 simple iconv_open() wrapper
292 */
293_PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
294{
295 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
296}
297
298/*
299 simple iconv_close() wrapper
300*/
301_PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
302{
303 talloc_free(cd);
304 return 0;
305}
306
307
308/**********************************************************************
309 the following functions implement the builtin character sets in Samba
310 and also the "test" character sets that are designed to test
311 multi-byte character set support for english users
312***********************************************************************/
313
/*
  this takes an ASCII sequence and produces a UTF16 sequence

  Each input byte becomes one two-byte unit: the byte itself followed by
  a zero byte. A byte with the high bit set is not ASCII and stops the
  conversion with EILSEQ. If input remains when the output has less than
  two bytes of room, the call fails with E2BIG.

  Returns 0 on success and (size_t)-1 on failure; the in/out pointers and
  byte counts are advanced past everything that was converted.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	for (; src_left >= 1 && dst_left >= 2;
	     src += 1, src_left -= 1, dst += 2, dst_left -= 2) {
		if ((src[0] & 0x7F) != src[0]) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			status = (size_t)-1;
			break;
		}
		dst[0] = src[0];
		dst[1] = 0;
	}

	if (status == 0 && src_left > 0) {
		errno = E2BIG;
		status = (size_t)-1;
	}

	*inbuf = src;
	*inbytesleft = src_left;
	*outbuf = dst;
	*outbytesleft = dst_left;

	return status;
}
346
/*
  this takes a UTF16 sequence and produces an ASCII sequence

  A two-byte unit is accepted only if its second byte is zero and its
  first byte has the high bit clear; the first byte is then copied to the
  output. Anything else fails with EILSEQ. A single trailing input byte
  fails with EINVAL, and input left over because the output is full fails
  with E2BIG.

  Returns 0 on success and (size_t)-1 on failure; the in/out pointers and
  byte counts are advanced past everything that was converted.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	while (src_left >= 2 && dst_left >= 1) {
		const char low = src[0];

		if (src[1] != 0 || (low & 0x7F) != low) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			status = (size_t)-1;
			break;
		}

		*dst = low;
		dst += 1;
		dst_left -= 1;
		src += 2;
		src_left -= 2;
	}

	if (status == 0) {
		if (src_left == 1) {
			errno = EINVAL;
			status = (size_t)-1;
		} else if (src_left > 1) {
			errno = E2BIG;
			status = (size_t)-1;
		}
	}

	*inbuf = src;
	*inbytesleft = src_left;
	*outbuf = dst;
	*outbytesleft = dst_left;

	return status;
}
385
/*
  this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence

  The first 256 codepoints of latin1 match the first 256 codepoints of
  unicode, so every input byte is stored as the first byte of a two-byte
  unit whose second byte is zero.

  Returns 0 when all input was converted, or (size_t)-1 with errno E2BIG
  when input remains but the output has less than two bytes of room.
 */
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	for (; src_left >= 1 && dst_left >= 2;
	     src += 1, src_left -= 1, dst += 2, dst_left -= 2) {
		dst[0] = src[0];
		dst[1] = 0;
	}

	if (src_left > 0) {
		errno = E2BIG;
		status = (size_t)-1;
	}

	*inbuf = src;
	*inbytesleft = src_left;
	*outbuf = dst;
	*outbytesleft = dst_left;

	return status;
}
412
/*
  this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence

  The first 256 codepoints of latin1 match the first 256 codepoints of
  unicode, so a two-byte unit whose second byte is zero is converted by
  copying its first byte. Any other unit fails with EILSEQ.

  The unit is validated before anything is stored, so a rejected unit
  leaves the output buffer untouched (the same order ascii_push uses).

  A single trailing input byte fails with EINVAL; input left over because
  the output is full fails with E2BIG.

  Returns 0 on success and (size_t)-1 on failure.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		if ((*inbuf)[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return (size_t)-1;
		}
		(*outbuf)[0] = (*inbuf)[0];
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
450
/*
 * Value of one hexadecimal digit (either case), or -1 if ch is not one.
 */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
 * this takes a UCS2-HEX sequence and produces a UTF16 sequence
 *
 * Every byte other than '@' is copied into the first byte of a two-byte
 * unit whose second byte is zero. '@' introduces an escape: it must be
 * followed by exactly four hexadecimal digits, which give the 16-bit
 * unit, stored low byte first.
 *
 * The digits are parsed by hand rather than with sscanf(): the input is
 * a counted buffer that need not be NUL-terminated, and "%04x" would
 * also skip leading whitespace and accept fewer than four digits or a
 * sign.
 *
 * Errors (return (size_t)-1):
 *   EINVAL  an '@' with fewer than four bytes after it
 *   EILSEQ  an '@' not followed by four hexadecimal digits
 *   E2BIG   input remains but the output has less than two bytes of room
 *
 * Returns 0 when all input was converted.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* plain byte: passed through unchanged */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return (size_t)-1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return (size_t)-1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
493
/*
 * this takes a UTF16 sequence and produces a UCS2-HEX sequence
 *
 * A two-byte unit whose second byte is zero and whose first byte has the
 * high bit clear and is not '@' is written as that single byte. Every
 * other unit is written as '@' followed by the unit's value, read low
 * byte first, as four lower-case hexadecimal digits.
 *
 * Errors (return (size_t)-1):
 *   E2BIG   not enough output room for the next unit
 *   EINVAL  a single trailing input byte
 *
 * Returns 0 when all input was converted.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		const bool plain = unit[1] == 0 &&
				   (unit[0] & 0x80) == 0 &&
				   unit[0] != '@';
		size_t produced;

		if (plain) {
			(*outbuf)[0] = unit[0];
			produced = 1;
		} else {
			char hex[6];
			unsigned int value;

			if (*outbytesleft < 5) {
				errno = E2BIG;
				return (size_t)-1;
			}
			/* 16-bit unit, low byte first */
			value = (unsigned char)unit[0] |
				((unsigned int)(unsigned char)unit[1] << 8);
			snprintf(hex, sizeof(hex), "@%04x", value);
			memcpy(*outbuf, hex, 5);
			produced = 5;
		}

		(*inbuf)  += 2;
		(*inbytesleft)  -= 2;
		(*outbuf) += produced;
		(*outbytesleft) -= produced;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
534
/*
 * Copy bytes from the input to the output, swapping the two bytes of
 * every 16-bit unit (UTF-16BE <-> UTF-16LE).
 *
 * The number of bytes handled is the smaller of the two byte counts. It
 * is kept in a size_t: an int would truncate large counts. The swap is
 * done pairwise by hand instead of with swab(), whose length parameter
 * is signed and whose behaviour is undefined when the buffers overlap;
 * both bytes of a pair are read before either is written, so converting
 * a buffer in place works.
 *
 * If that number is odd, the final output byte is set to zero and the
 * odd input byte is still consumed.
 *
 * Returns 0 when all input was consumed, or (size_t)-1 with errno E2BIG
 * when input remains because the output is full.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const size_t n = (*inbytesleft < *outbytesleft)
				? *inbytesleft : *outbytesleft;
	const size_t paired = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < paired; i += 2) {
		const char first = (*inbuf)[i];
		const char second = (*inbuf)[i + 1];

		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}
	if (n & 1) {
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
559
560
/*
 * Copy bytes from the input to the output unchanged.
 *
 * The number of bytes copied is the smaller of the two byte counts. It
 * is kept in a size_t: an int would truncate large counts. memmove() is
 * used, so the two buffers may overlap.
 *
 * Returns 0 when all input was consumed, or (size_t)-1 with errno E2BIG
 * when input remains because the output is full.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const size_t n = (*inbytesleft < *outbytesleft)
				? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
582
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One to four byte sequences are decoded; a code point of 0x10000 or more
  is written as a surrogate pair. Output units are stored low byte first.

  Four-byte sequences that decode to more than 0x10ffff are rejected with
  EILSEQ: they lie outside the range a surrogate pair can express, and
  encoding them anyway would emit a malformed pair.

  Four-byte sequences that are not minimally packed (value below 0x10000)
  are still accepted and written as a single unit. Two and three byte
  sequences are not checked for minimal packing at all.

  Errors (return (size_t)-1):
    EILSEQ  truncated sequence, bad continuation byte, value > 0x10ffff
    EINVAL  lead byte that starts none of the handled sequence lengths
    E2BIG   input remains but the output is full

  Returns 0 when all input was converted. In every case the in/out
  pointers and byte counts are left just past the last sequence that was
  converted completely.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = (size_t)-1;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* plain ASCII */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto out;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto out;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence: 21 payload bits */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto out;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint > 0x10ffff) {
				/* cannot be expressed as a surrogate pair */
				errno = EILSEQ;
				goto out;
			}
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto out;
			}

			/* high surrogate: 0xd800 | top ten bits */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			/* low surrogate: 0xdc00 | bottom ten bits */
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto out;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto out;
	}

	ret = 0;

out:
	/* success and failure both report how far the conversion got */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf  = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
702
703
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Input units are read low byte first. Units below 0x80 become one byte,
  units below 0x800 two bytes, other non-surrogate units three bytes, and
  a high surrogate followed by a low surrogate becomes four bytes.

  Errors (return (size_t)-1):
    E2BIG   not enough output room for the next character
    EINVAL  a surrogate with fewer than four input bytes left, or a
            single trailing input byte
    EILSEQ  a low surrogate that comes first, or a high surrogate not
            followed by a low surrogate

  Returns 0 when all input was converted. In every case the in/out
  pointers and byte counts are left just past the last character that was
  converted completely.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	while (src_left >= 2 && dst_left >= 1) {
		const unsigned int unit = src[0] | (src[1] << 8);
		unsigned int low;
		unsigned int scalar;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = unit;
			src += 2;
			src_left -= 2;
			dst += 1;
			dst_left -= 1;
			continue;
		}

		if (unit < 0x800) {
			/* next simplest case */
			if (dst_left < 2) {
				errno = E2BIG;
				status = (size_t)-1;
				break;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 2;
			dst_left -= 2;
			continue;
		}

		if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			errno = (src_left < 4) ? EINVAL : EILSEQ;
			status = (size_t)-1;
			break;
		}

		if ((unit & 0xfc00) != 0xd800) {
			/* not a surrogate: three bytes */
			if (dst_left < 3) {
				errno = E2BIG;
				status = (size_t)-1;
				break;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 3;
			dst_left -= 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (src_left < 4) {
			errno = EINVAL;
			status = (size_t)-1;
			break;
		}
		low = src[2] | (src[3] << 8);
		if ((low & 0xfc00) != 0xdc00) {
			errno = EILSEQ;
			status = (size_t)-1;
			break;
		}
		scalar = 0x10000 + (((unit & 0x3ff) << 10) | (low & 0x3ff));

		if (dst_left < 4) {
			errno = E2BIG;
			status = (size_t)-1;
			break;
		}
		dst[0] = 0xf0 | (scalar >> 18);
		dst[1] = 0x80 | ((scalar >> 12) & 0x3f);
		dst[2] = 0x80 | ((scalar >> 6) & 0x3f);
		dst[3] = 0x80 | (scalar & 0x3f);
		src += 4;
		src_left -= 4;
		dst += 4;
		dst_left -= 4;
	}

	if (status == 0) {
		if (src_left == 1) {
			errno = EINVAL;
			status = (size_t)-1;
		} else if (src_left > 1) {
			errno = E2BIG;
			status = (size_t)-1;
		}
	}

	/* success and failure both report how far the conversion got */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;

	return status;
}
820
821
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

  The rules are:

    1) any 0x0000 unit is mapped to 0x0001

    2) a high surrogate (0xD800 - 0xDBFF) that is not immediately
       followed by a low surrogate (0xDC00 - 0xDFFF) is replaced by
       0xFFFD

    3) the same for any low surrogate that was not preceded by a high
       surrogate

  A well-formed surrogate pair is copied through unchanged, as is every
  other unit. Units are read and written low byte first.

  Errors (return (size_t)-1):
    E2BIG   input remains but the output is full
    EINVAL  a single trailing input byte

  Returns 0 when all input was converted. In every case the in/out
  pointers and byte counts are left just past the last unit that was
  converted.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	size_t status = 0;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);
		const bool is_high = (unit & 0xfc00) == 0xd800;

		if (is_high && src_left >= 4) {
			const unsigned int next = src[2] | (src[3] << 8);

			if ((next & 0xfc00) == 0xdc00) {
				/* a proper pair: pass both units through */
				if (dst_left < 4) {
					errno = E2BIG;
					status = (size_t)-1;
					break;
				}
				memcpy(dst, src, 4);
				src += 4;
				src_left -= 4;
				dst += 4;
				dst_left -= 4;
				continue;
			}
		}

		if (unit == 0) {
			unit = 1;
		} else if ((unit & 0xf800) == 0xd800) {
			/* any surrogate that reaches this point is unpaired */
			unit = 0xfffd;
		}

		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;
		src += 2;
		src_left -= 2;
		dst += 2;
		dst_left -= 2;
	}

	if (status == 0) {
		if (src_left == 1) {
			errno = EINVAL;
			status = (size_t)-1;
		} else if (src_left > 1) {
			errno = E2BIG;
			status = (size_t)-1;
		}
	}

	/* success and failure both report how far the conversion got */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;

	return status;
}
918
919
920
Note: See TracBrowser for help on using the repository browser.