source: vendor/3.6.23/lib/util/charset/iconv.c

Last change on this file was 746, checked in by Silvan Scherrer, 13 years ago

Samba Server: updated vendor to 3.6.9

File size: 20.2 KB
Line 
1/*
2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include "includes.h"
22#include "../lib/util/dlinklist.h"
23#include "system/iconv.h"
24#include "system/filesys.h"
25
26#ifdef strcasecmp
27#undef strcasecmp
28#endif
29
30#ifdef static_decl_charset
31static_decl_charset;
32#endif
33
34/**
35 * @file
36 *
37 * @brief Samba wrapper/stub for iconv character set conversion.
38 *
39 * iconv is the XPG2 interface for converting between character
40 * encodings. This file provides a Samba wrapper around it, and also
41 * a simple reimplementation that is used if the system does not
42 * implement iconv.
43 *
44 * Samba only works with encodings that are supersets of ASCII: ascii
45 * characters like whitespace can be tested for directly, multibyte
46 * sequences start with a byte with the high bit set, and strings are
47 * terminated by a nul byte.
48 *
49 * Note that the only function provided by iconv is conversion between
50 * characters. It doesn't directly support operations like
51 * uppercasing or comparison. We have to convert to UTF-16LE and
52 * compare there.
53 *
54 * @sa Samba Developers Guide
55 **/
56
57static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *);
58static size_t ascii_push (void *,const char **, size_t *, char **, size_t *);
59static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
60static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *);
61static size_t utf8_push (void *,const char **, size_t *, char **, size_t *);
62static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
63static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
64static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
65static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *);
66static size_t iconv_swab (void *,const char **, size_t *, char **, size_t *);
67
68static const struct charset_functions builtin_functions[] = {
69 /* windows is closest to UTF-16 */
70 {"UCS-2LE", iconv_copy, iconv_copy},
71 {"UTF-16LE", iconv_copy, iconv_copy},
72 {"UCS-2BE", iconv_swab, iconv_swab},
73 {"UTF-16BE", iconv_swab, iconv_swab},
74
75 /* we include the UTF-8 alias to cope with differing locale settings */
76 {"UTF8", utf8_pull, utf8_push},
77 {"UTF-8", utf8_pull, utf8_push},
78
79 /* this handles the munging needed for String2Key */
80 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy},
81
82 {"ASCII", ascii_pull, ascii_push},
83 {"646", ascii_pull, ascii_push},
84 {"ISO-8859-1", ascii_pull, latin1_push},
85 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
86};
87
88static struct charset_functions *charsets = NULL;
89
90static struct charset_functions *find_charset_functions(const char *name)
91{
92 struct charset_functions *c;
93
94 /* Check whether we already have this charset... */
95 for (c = charsets; c != NULL; c = c->next) {
96 if(strcasecmp(c->name, name) == 0) {
97 return c;
98 }
99 }
100
101 return NULL;
102}
103
104bool smb_register_charset(const struct charset_functions *funcs_in)
105{
106 struct charset_functions *funcs;
107
108 DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
109 /* Check whether we already have this charset... */
110 if (find_charset_functions(funcs_in->name)) {
111 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
112 return false;
113 }
114
115 funcs = talloc(NULL, struct charset_functions);
116 if (!funcs) {
117 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
118 return false;
119 }
120 *funcs = *funcs_in;
121
122 funcs->next = funcs->prev = NULL;
123 DEBUG(5, ("Registered charset %s\n", funcs->name));
124 DLIST_ADD(charsets, funcs);
125 return true;
126}
127
/**
 * Run the build-time charset initialisers once.
 *
 * When the build defines static_init_charset, it is expanded on the first
 * call only; later calls return immediately.  Without that macro this
 * function does nothing.
 */
static void lazy_initialize_iconv(void)
{
#ifdef static_init_charset
	static bool initialized;

	if (initialized) {
		return;
	}

	static_init_charset;
	initialized = true;
#endif
}
139
#ifdef HAVE_NATIVE_ICONV
/**
 * Adapter that gives the system iconv() the converter signature used in
 * this file.
 *
 * If the conversion reports an error, the descriptor is reset to its
 * initial state so that no shift state is left behind for character sets
 * like SJIS.  The errno value of the failed conversion is preserved across
 * that reset, because callers inspect errno to tell E2BIG from real errors.
 *
 * @param cd            iconv descriptor, passed as an opaque pointer
 * @param inbuf         input cursor, advanced by iconv()
 * @param inbytesleft   remaining input bytes, updated by iconv()
 * @param outbuf        output cursor, advanced by iconv()
 * @param outbytesleft  remaining output bytes, updated by iconv()
 * @return whatever iconv() returned; (size_t)-1 on error with errno set
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);

	if (ret == (size_t)-1) {
		/* the reset call must not hide why the conversion failed */
		int saved_errno = errno;

		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}

	return ret;
}
#endif
155
156/**
157 * This is a simple portable iconv() implementaion.
158 *
159 * It only knows about a very small number of character sets - just
160 * enough that Samba works on systems that don't have iconv.
161 **/
162_PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
163 const char **inbuf, size_t *inbytesleft,
164 char **outbuf, size_t *outbytesleft)
165{
166 char cvtbuf[2048];
167 size_t bufsize;
168
169 /* in many cases we can go direct */
170 if (cd->direct) {
171 return cd->direct(cd->cd_direct,
172 inbuf, inbytesleft, outbuf, outbytesleft);
173 }
174
175
176 /* otherwise we have to do it chunks at a time */
177 while (*inbytesleft > 0) {
178 char *bufp1 = cvtbuf;
179 const char *bufp2 = cvtbuf;
180
181 bufsize = sizeof(cvtbuf);
182
183 if (cd->pull(cd->cd_pull,
184 inbuf, inbytesleft, &bufp1, &bufsize) == -1
185 && errno != E2BIG) return -1;
186
187 bufsize = sizeof(cvtbuf) - bufsize;
188
189 if (cd->push(cd->cd_push,
190 &bufp2, &bufsize,
191 outbuf, outbytesleft) == -1) return -1;
192 }
193
194 return 0;
195}
196
/**
 * Tell whether @p name is one of the two names treated here as
 * little-endian UTF-16 ("UCS-2LE" or "UTF-16LE"), ignoring case.
 */
static bool is_utf16(const char *name)
{
	if (strcasecmp(name, "UCS-2LE") == 0) {
		return true;
	}

	return strcasecmp(name, "UTF-16LE") == 0;
}
202
203static int smb_iconv_t_destructor(smb_iconv_t hwd)
204{
205#ifdef HAVE_NATIVE_ICONV
206 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
207 iconv_close(hwd->cd_pull);
208 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
209 iconv_close(hwd->cd_push);
210 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
211 iconv_close(hwd->cd_direct);
212#endif
213
214 return 0;
215}
216
217_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
218 const char *fromcode, bool native_iconv)
219{
220 smb_iconv_t ret;
221 const struct charset_functions *from=NULL, *to=NULL;
222 int i;
223
224 lazy_initialize_iconv();
225
226 ret = (smb_iconv_t)talloc_named(mem_ctx,
227 sizeof(*ret),
228 "iconv(%s,%s)", tocode, fromcode);
229 if (!ret) {
230 errno = ENOMEM;
231 return (smb_iconv_t)-1;
232 }
233 memset(ret, 0, sizeof(*ret));
234 talloc_set_destructor(ret, smb_iconv_t_destructor);
235
236 /* check for the simplest null conversion */
237 if (strcmp(fromcode, tocode) == 0) {
238 ret->direct = iconv_copy;
239 return ret;
240 }
241
242 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
243 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
244 from = &builtin_functions[i];
245 }
246 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
247 to = &builtin_functions[i];
248 }
249 }
250
251 if (from == NULL) {
252 for (from=charsets; from; from=from->next) {
253 if (strcasecmp(from->name, fromcode) == 0) break;
254 }
255 }
256
257 if (to == NULL) {
258 for (to=charsets; to; to=to->next) {
259 if (strcasecmp(to->name, tocode) == 0) break;
260 }
261 }
262
263#ifdef HAVE_NATIVE_ICONV
264 if ((!from || !to) && !native_iconv) {
265 goto failed;
266 }
267 if (!from) {
268 ret->pull = sys_iconv;
269 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
270 if (ret->cd_pull == (iconv_t)-1)
271 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
272 if (ret->cd_pull == (iconv_t)-1) goto failed;
273 }
274
275 if (!to) {
276 ret->push = sys_iconv;
277 ret->cd_push = iconv_open(tocode, "UTF-16LE");
278 if (ret->cd_push == (iconv_t)-1)
279 ret->cd_push = iconv_open(tocode, "UCS-2LE");
280 if (ret->cd_push == (iconv_t)-1) goto failed;
281 }
282#else
283 if (!from || !to) {
284 goto failed;
285 }
286#endif
287
288 /* check for conversion to/from ucs2 */
289 if (is_utf16(fromcode) && to) {
290 ret->direct = to->push;
291 return ret;
292 }
293 if (is_utf16(tocode) && from) {
294 ret->direct = from->pull;
295 return ret;
296 }
297
298#ifdef HAVE_NATIVE_ICONV
299 if (is_utf16(fromcode)) {
300 ret->direct = sys_iconv;
301 ret->cd_direct = ret->cd_push;
302 ret->cd_push = NULL;
303 return ret;
304 }
305 if (is_utf16(tocode)) {
306 ret->direct = sys_iconv;
307 ret->cd_direct = ret->cd_pull;
308 ret->cd_pull = NULL;
309 return ret;
310 }
311#endif
312
313 /* the general case has to go via a buffer */
314 if (!ret->pull) ret->pull = from->pull;
315 if (!ret->push) ret->push = to->push;
316 return ret;
317
318failed:
319 talloc_free(ret);
320 errno = EINVAL;
321 return (smb_iconv_t)-1;
322}
323
324/*
325 simple iconv_open() wrapper
326 */
327_PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
328{
329 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
330}
331
332/*
333 simple iconv_close() wrapper
334*/
335_PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
336{
337 talloc_free(cd);
338 return 0;
339}
340
341
342/**********************************************************************
343 the following functions implement the builtin character sets in Samba
344 and also the "test" character sets that are designed to test
345 multi-byte character set support for english users
346***********************************************************************/
/**
 * Widen single-byte input to two-byte little-endian units.
 *
 * Each input byte is copied to the low byte of the output unit and the
 * high byte is set to zero.
 *
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when input remains because the output ran out of room
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;

	for (; src_left >= 1 && dst_left >= 2; src_left -= 1, dst_left -= 2) {
		dst[0] = src[0];
		dst[1] = 0;
		src += 1;
		dst += 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
366
/**
 * Narrow two-byte little-endian units to 7-bit single bytes.
 *
 * The low byte of every unit is written with its top bit cleared.  Units
 * whose high byte is non-zero are still written that way, but are counted.
 *
 * @return the number of units that had a non-zero high byte; (size_t)-1
 *         with errno = EINVAL when a single trailing input byte is left, or
 *         errno = E2BIG when whole units remain because the output is full
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int lossy = 0;

	while (src_left >= 2 && dst_left >= 1) {
		*dst = src[0] & 0x7F;
		if (src[1] != 0) {
			lossy++;
		}
		src += 2;
		dst += 1;
		src_left -= 2;
		dst_left -= 1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left == 1) {
		errno = EINVAL;
		return -1;
	}
	if (src_left > 1) {
		errno = E2BIG;
		return -1;
	}

	return lossy;
}
393
/**
 * Narrow two-byte little-endian units to single bytes, keeping all eight
 * bits of the low byte.
 *
 * Units whose high byte is non-zero are still written as their low byte,
 * but are counted.
 *
 * @return the number of units that had a non-zero high byte; (size_t)-1
 *         with errno = EINVAL when a single trailing input byte is left, or
 *         errno = E2BIG when whole units remain because the output is full
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int lossy = 0;

	while (src_left >= 2 && dst_left >= 1) {
		*dst = src[0];
		if (src[1] != 0) {
			lossy++;
		}
		src += 2;
		dst += 1;
		src_left -= 2;
		dst_left -= 1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left == 1) {
		errno = EINVAL;
		return -1;
	}
	if (src_left > 1) {
		errno = E2BIG;
		return -1;
	}

	return lossy;
}
420
/**
 * Decode the "UCS2-HEX" form into two-byte little-endian units.
 *
 * Any byte other than '@' is widened to a unit with a zero high byte.  An
 * '@' must be followed by exactly four hexadecimal digits, which give the
 * 16-bit value of the unit.
 *
 * The digits are parsed by hand rather than with sscanf(): the input is
 * not guaranteed to be NUL-terminated, and "%x" would skip whitespace and
 * accept signs, "0x" prefixes or fewer than four digits, which could read
 * past the five-byte escape and mis-decode the following text.
 *
 * @return 0 when all input was consumed; (size_t)-1 with errno = EINVAL
 *         when an escape is cut short by the end of the input, EILSEQ when
 *         an escape contains a non-hex character, or E2BIG when input
 *         remains because the output ran out of room
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character: '@' plus four digits */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			unsigned char ch = (unsigned char)(*inbuf)[i];
			unsigned int digit;

			if (ch >= '0' && ch <= '9') {
				digit = ch - '0';
			} else if (ch >= 'a' && ch <= 'f') {
				digit = ch - 'a' + 10;
			} else if (ch >= 'A' && ch <= 'F') {
				digit = ch - 'A' + 10;
			} else {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | digit;
		}

		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
463
/**
 * Encode two-byte units into the "UCS2-HEX" form.
 *
 * A unit whose second byte is zero and whose first byte is 7-bit and not
 * '@' is written as that single byte.  Every other unit is written as '@'
 * followed by four lower-case hex digits of SVAL(*inbuf, 0).
 *
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when the output has no room for the next unit, or errno = EINVAL
 *         when a single trailing input byte is left
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char first = (*inbuf)[0];
		const char second = (*inbuf)[1];
		char hexbuf[6];
		int is_plain = (second == 0) &&
			       ((first & 0x80) == 0) &&
			       (first != '@');

		if (is_plain) {
			/* passes through unchanged as one byte */
			**outbuf = first;
			*inbuf += 2;
			*outbuf += 1;
			*inbytesleft -= 2;
			*outbytesleft -= 1;
			continue;
		}

		/* the escape needs five output bytes */
		if (*outbytesleft < 5) {
			errno = E2BIG;
			return -1;
		}

		snprintf(hexbuf, sizeof(hexbuf), "@%04x", SVAL(*inbuf, 0));
		memcpy(*outbuf, hexbuf, 5);
		*inbuf += 2;
		*outbuf += 5;
		*inbytesleft -= 2;
		*outbytesleft -= 5;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}
	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
504
/**
 * Copy input to output, exchanging the two bytes of every pair.
 *
 * As many bytes as fit in both buffers are processed.  If that count is
 * odd, the final output byte is set to zero instead of being copied.
 *
 * The count is kept in a size_t (an int would truncate large lengths),
 * and the swap is done with an explicit loop that is safe when input and
 * output are the same buffer, which swab() does not guarantee.
 *
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when input remains because the output was smaller
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const size_t n = (*inbytesleft < *outbytesleft) ?
			 *inbytesleft : *outbytesleft;
	const size_t paired = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < paired; i += 2) {
		/* read both bytes before writing so in-place swaps work */
		const char first = (*inbuf)[i];
		const char second = (*inbuf)[i + 1];

		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}

	if (n & 1) {
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
529
530
/**
 * Copy bytes from input to output unchanged.
 *
 * As many bytes as fit in both buffers are copied with memmove(), so the
 * buffers may overlap.  The count is kept in a size_t; an int would
 * truncate lengths above INT_MAX and could turn into a huge or negative
 * copy size.
 *
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when input remains because the output was smaller
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const size_t n = (*inbytesleft < *outbytesleft) ?
			 *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
552
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  Output is little-endian: the low byte of each 16-bit unit comes first.
  One-, two- and three-byte sequences yield one unit; four-byte sequences
  yield one unit when the value is below 0x10000 (non-minimal encoding is
  tolerated) and a surrogate pair otherwise.

  Four-byte sequences that encode a value above 0x10FFFF are rejected with
  EILSEQ: such values have no surrogate-pair form, and encoding them
  anyway would emit a low-surrogate unit in the high-surrogate position.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with errno =
  EILSEQ for a malformed, truncated or out-of-range sequence, EINVAL for a
  lead byte that starts none of the handled sequence lengths, or E2BIG
  when the output ran out of room.  The in/out cursors and counters are
  updated to the point reached in every case.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* plain 7-bit byte */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two-byte sequence: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three-byte sequence: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four-byte sequence: 21 payload bits */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10FFFF) {
				/* not representable as a surrogate pair */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate: 0xD800 | top ten bits */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			/* low surrogate: 0xDC00 | bottom ten bits */
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return (size_t)-1;
}
672
673
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Input units are little-endian (low byte first).  Depending on its value
  a unit becomes one, two or three output bytes; a high surrogate followed
  by a low surrogate becomes four output bytes.

  Returns 0 when all input was consumed.  Returns (size_t)-1 with errno =
  E2BIG when the output ran out of room, EINVAL when the input ends in the
  middle of a unit or surrogate pair, or EILSEQ when surrogates are
  mis-ordered.  The in/out cursors and counters are updated to the point
  reached in every case.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = 0;

	while (src_left >= 2 && dst_left >= 1) {
		const uint8_t lo = src[0];
		const uint8_t hi = src[1];
		unsigned int cp;

		/* below 0x80: one byte */
		if (hi == 0 && (lo & 0x80) == 0) {
			dst[0] = lo;
			src += 2;
			dst += 1;
			src_left -= 2;
			dst_left -= 1;
			continue;
		}

		/* below 0x800: two bytes */
		if ((hi & 0xf8) == 0) {
			if (dst_left < 2) {
				errno = E2BIG;
				goto fail;
			}
			dst[0] = 0xc0 | (lo >> 6) | (hi << 2);
			dst[1] = 0x80 | (lo & 0x3f);
			src += 2;
			dst += 2;
			src_left -= 2;
			dst_left -= 2;
			continue;
		}

		/* a low surrogate must never come first */
		if ((hi & 0xfc) == 0xdc) {
			errno = (src_left < 4) ? EINVAL : EILSEQ;
			goto fail;
		}

		/* any other non-surrogate unit: three bytes */
		if ((hi & 0xfc) != 0xd8) {
			cp = lo | (hi << 8);
			if (dst_left < 3) {
				errno = E2BIG;
				goto fail;
			}
			dst[0] = 0xe0 | (cp >> 12);
			dst[1] = 0x80 | ((cp >> 6) & 0x3f);
			dst[2] = 0x80 | (cp & 0x3f);
			src += 2;
			dst += 3;
			src_left -= 2;
			dst_left -= 3;
			continue;
		}

		/* high surrogate: the low surrogate has to follow */
		if (src_left < 4) {
			errno = EINVAL;
			goto fail;
		}
		if ((src[3] & 0xfc) != 0xdc) {
			errno = EILSEQ;
			goto fail;
		}

		cp = 0x10000 + (src[2] | ((src[3] & 0x3) << 8) |
				(lo << 10) | ((hi & 0x3) << 18));

		if (dst_left < 4) {
			errno = E2BIG;
			goto fail;
		}
		dst[0] = 0xf0 | (cp >> 18);
		dst[1] = 0x80 | ((cp >> 12) & 0x3f);
		dst[2] = 0x80 | ((cp >> 6) & 0x3f);
		dst[3] = 0x80 | (cp & 0x3f);
		src += 4;
		dst += 4;
		src_left -= 4;
		dst_left -= 4;
	}

	if (src_left == 1) {
		errno = EINVAL;
		goto fail;
	}
	if (src_left > 1) {
		errno = E2BIG;
		goto fail;
	}
	goto done;

fail:
	result = (size_t)-1;
done:
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
790
791
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  Units are little-endian on both sides.  Returns 0 when all input was
  consumed, or (size_t)-1 with errno = E2BIG when the output ran out of
  room and errno = EINVAL when a single trailing input byte is left.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = 0;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);

		/* rule 1 */
		if (unit == 0) {
			unit = 1;
		}

		if ((unit & 0xfc00) == 0xd800) {
			/* a high surrogate: is it followed by a low one? */
			int paired = 0;

			if (src_left >= 4) {
				unsigned int next = src[2] | (src[3] << 8);

				paired = ((next & 0xfc00) == 0xdc00);
			}

			if (paired) {
				/* well-formed pair: copy both units verbatim */
				if (dst_left < 4) {
					errno = E2BIG;
					goto fail;
				}
				memcpy(dst, src, 4);
				src += 4;
				dst += 4;
				src_left -= 4;
				dst_left -= 4;
				continue;
			}

			/* rule 2: unpaired high surrogate */
			unit = 0xfffd;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* rule 3: low surrogate without a preceding high one */
			unit = 0xfffd;
		}

		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;
		src += 2;
		dst += 2;
		src_left -= 2;
		dst_left -= 2;
	}

	if (src_left == 1) {
		errno = EINVAL;
		goto fail;
	}
	if (src_left > 1) {
		errno = E2BIG;
		goto fail;
	}
	goto done;

fail:
	result = (size_t)-1;
done:
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
888
889
890
Note: See TracBrowser for help on using the repository browser.