source: vendor/3.6.0/lib/util/charset/iconv.c

Last change on this file was 740, checked in by Silvan Scherrer, 13 years ago

Samba Server: update vendor to 3.6.0

File size: 20.2 KB
Line 
1/*
2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include "includes.h"
22#include "../lib/util/dlinklist.h"
23#include "system/iconv.h"
24#include "system/filesys.h"
25
26#ifdef strcasecmp
27#undef strcasecmp
28#endif
29
30#ifdef static_decl_charset
31static_decl_charset;
32#endif
33
34/**
35 * @file
36 *
37 * @brief Samba wrapper/stub for iconv character set conversion.
38 *
39 * iconv is the XPG2 interface for converting between character
40 * encodings. This file provides a Samba wrapper around it, and also
41 * a simple reimplementation that is used if the system does not
42 * implement iconv.
43 *
44 * Samba only works with encodings that are supersets of ASCII: ascii
45 * characters like whitespace can be tested for directly, multibyte
46 * sequences start with a byte with the high bit set, and strings are
47 * terminated by a nul byte.
48 *
49 * Note that the only function provided by iconv is conversion between
50 * characters. It doesn't directly support operations like
51 * uppercasing or comparison. We have to convert to UTF-16LE and
52 * compare there.
53 *
54 * @sa Samba Developers Guide
55 **/
56
/*
 * Forward declarations of the built-in converters defined further down in
 * this file.  They all share one signature so that they can be stored in
 * the builtin_functions table below.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
67
68static const struct charset_functions builtin_functions[] = {
69 /* windows is closest to UTF-16 */
70 {"UCS-2LE", iconv_copy, iconv_copy},
71 {"UTF-16LE", iconv_copy, iconv_copy},
72 {"UCS-2BE", iconv_swab, iconv_swab},
73 {"UTF-16BE", iconv_swab, iconv_swab},
74
75 /* we include the UTF-8 alias to cope with differing locale settings */
76 {"UTF8", utf8_pull, utf8_push},
77 {"UTF-8", utf8_pull, utf8_push},
78
79 /* this handles the munging needed for String2Key */
80 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy},
81
82 {"ASCII", ascii_pull, ascii_push},
83 {"646", ascii_pull, ascii_push},
84 {"ISO-8859-1", ascii_pull, latin1_push},
85 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
86};
87
88static struct charset_functions *charsets = NULL;
89
90static struct charset_functions *find_charset_functions(const char *name)
91{
92 struct charset_functions *c;
93
94 /* Check whether we already have this charset... */
95 for (c = charsets; c != NULL; c = c->next) {
96 if(strcasecmp(c->name, name) == 0) {
97 return c;
98 }
99 c = c->next;
100 }
101
102 return NULL;
103}
104
105bool smb_register_charset(const struct charset_functions *funcs_in)
106{
107 struct charset_functions *funcs;
108
109 DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
110 /* Check whether we already have this charset... */
111 if (find_charset_functions(funcs_in->name)) {
112 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
113 return false;
114 }
115
116 funcs = talloc(NULL, struct charset_functions);
117 if (!funcs) {
118 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
119 return false;
120 }
121 *funcs = *funcs_in;
122
123 funcs->next = funcs->prev = NULL;
124 DEBUG(5, ("Registered charset %s\n", funcs->name));
125 DLIST_ADD(charsets, funcs);
126 return true;
127}
128
/*
 * Run the build-time charset initialisation (static_init_charset) the first
 * time it is needed.  Does nothing when static_init_charset is not defined.
 */
static void lazy_initialize_iconv(void)
{
	static bool initialized;

#ifdef static_init_charset
	if (initialized) {
		return;
	}
	static_init_charset;
	initialized = true;
#endif
}
140
#ifdef HAVE_NATIVE_ICONV
/**
 * Wrapper around the system iconv() with the converter signature used in
 * this file.
 *
 * If there was an error then the internal state of the descriptor is reset;
 * this ensures that we don't have a shift state remaining for character
 * sets like SJIS.
 *
 * @param cd  the iconv_t descriptor, passed as an opaque pointer.
 * @return the value returned by iconv(); on (size_t)-1 errno still holds
 *         the error from the failed conversion.
 **/
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);
	if (ret == (size_t)-1) {
		/*
		 * Callers inspect errno (smb_iconv() tests for E2BIG), so the
		 * reset call must not be allowed to overwrite it.
		 */
		int saved_errno = errno;
		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
156
157/**
158 * This is a simple portable iconv() implementaion.
159 *
160 * It only knows about a very small number of character sets - just
161 * enough that Samba works on systems that don't have iconv.
162 **/
163_PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
164 const char **inbuf, size_t *inbytesleft,
165 char **outbuf, size_t *outbytesleft)
166{
167 char cvtbuf[2048];
168 size_t bufsize;
169
170 /* in many cases we can go direct */
171 if (cd->direct) {
172 return cd->direct(cd->cd_direct,
173 inbuf, inbytesleft, outbuf, outbytesleft);
174 }
175
176
177 /* otherwise we have to do it chunks at a time */
178 while (*inbytesleft > 0) {
179 char *bufp1 = cvtbuf;
180 const char *bufp2 = cvtbuf;
181
182 bufsize = sizeof(cvtbuf);
183
184 if (cd->pull(cd->cd_pull,
185 inbuf, inbytesleft, &bufp1, &bufsize) == -1
186 && errno != E2BIG) return -1;
187
188 bufsize = sizeof(cvtbuf) - bufsize;
189
190 if (cd->push(cd->cd_push,
191 &bufp2, &bufsize,
192 outbuf, outbytesleft) == -1) return -1;
193 }
194
195 return 0;
196}
197
/*
 * Return true if `name` is one of the names this file treats as
 * little-endian UTF-16 ("UCS-2LE" or "UTF-16LE", compared without regard
 * to case).
 */
static bool is_utf16(const char *name)
{
	static const char *const aliases[] = { "UCS-2LE", "UTF-16LE" };
	size_t k;

	for (k = 0; k < sizeof(aliases) / sizeof(aliases[0]); k++) {
		if (strcasecmp(name, aliases[k]) == 0) {
			return true;
		}
	}
	return false;
}
203
204static int smb_iconv_t_destructor(smb_iconv_t hwd)
205{
206#ifdef HAVE_NATIVE_ICONV
207 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
208 iconv_close(hwd->cd_pull);
209 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
210 iconv_close(hwd->cd_push);
211 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
212 iconv_close(hwd->cd_direct);
213#endif
214
215 return 0;
216}
217
218_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
219 const char *fromcode, bool native_iconv)
220{
221 smb_iconv_t ret;
222 const struct charset_functions *from=NULL, *to=NULL;
223 int i;
224
225 lazy_initialize_iconv();
226
227 ret = (smb_iconv_t)talloc_named(mem_ctx,
228 sizeof(*ret),
229 "iconv(%s,%s)", tocode, fromcode);
230 if (!ret) {
231 errno = ENOMEM;
232 return (smb_iconv_t)-1;
233 }
234 memset(ret, 0, sizeof(*ret));
235 talloc_set_destructor(ret, smb_iconv_t_destructor);
236
237 /* check for the simplest null conversion */
238 if (strcmp(fromcode, tocode) == 0) {
239 ret->direct = iconv_copy;
240 return ret;
241 }
242
243 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
244 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
245 from = &builtin_functions[i];
246 }
247 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
248 to = &builtin_functions[i];
249 }
250 }
251
252 if (from == NULL) {
253 for (from=charsets; from; from=from->next) {
254 if (strcasecmp(from->name, fromcode) == 0) break;
255 }
256 }
257
258 if (to == NULL) {
259 for (to=charsets; to; to=to->next) {
260 if (strcasecmp(to->name, tocode) == 0) break;
261 }
262 }
263
264#ifdef HAVE_NATIVE_ICONV
265 if ((!from || !to) && !native_iconv) {
266 goto failed;
267 }
268 if (!from) {
269 ret->pull = sys_iconv;
270 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
271 if (ret->cd_pull == (iconv_t)-1)
272 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
273 if (ret->cd_pull == (iconv_t)-1) goto failed;
274 }
275
276 if (!to) {
277 ret->push = sys_iconv;
278 ret->cd_push = iconv_open(tocode, "UTF-16LE");
279 if (ret->cd_push == (iconv_t)-1)
280 ret->cd_push = iconv_open(tocode, "UCS-2LE");
281 if (ret->cd_push == (iconv_t)-1) goto failed;
282 }
283#else
284 if (!from || !to) {
285 goto failed;
286 }
287#endif
288
289 /* check for conversion to/from ucs2 */
290 if (is_utf16(fromcode) && to) {
291 ret->direct = to->push;
292 return ret;
293 }
294 if (is_utf16(tocode) && from) {
295 ret->direct = from->pull;
296 return ret;
297 }
298
299#ifdef HAVE_NATIVE_ICONV
300 if (is_utf16(fromcode)) {
301 ret->direct = sys_iconv;
302 ret->cd_direct = ret->cd_push;
303 ret->cd_push = NULL;
304 return ret;
305 }
306 if (is_utf16(tocode)) {
307 ret->direct = sys_iconv;
308 ret->cd_direct = ret->cd_pull;
309 ret->cd_pull = NULL;
310 return ret;
311 }
312#endif
313
314 /* the general case has to go via a buffer */
315 if (!ret->pull) ret->pull = from->pull;
316 if (!ret->push) ret->push = to->push;
317 return ret;
318
319failed:
320 talloc_free(ret);
321 errno = EINVAL;
322 return (smb_iconv_t)-1;
323}
324
325/*
326 simple iconv_open() wrapper
327 */
328_PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
329{
330 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
331}
332
333/*
334 simple iconv_close() wrapper
335*/
336_PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
337{
338 talloc_free(cd);
339 return 0;
340}
341
342
343/**********************************************************************
344 the following functions implement the builtin character sets in Samba
345 and also the "test" character sets that are designed to test
346 multi-byte character set support for english users
347***********************************************************************/
/*
 * Widen single-byte input to two-byte units: each input byte is copied to
 * the low byte of the output unit and the high byte is set to zero.
 *
 * Returns 0 when all input was consumed, or (size_t)-1 with errno = E2BIG
 * when the output ran out of room first.  The buffer pointers and byte
 * counts are advanced past whatever was converted.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;

	for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
		*dst++ = *src++;
		*dst++ = 0;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left > 0) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return 0;
}
367
/*
 * Narrow two-byte units to 7-bit output: the low byte of each unit is
 * masked with 0x7F and emitted; the high byte is dropped.
 *
 * Returns the number of units whose high byte was non-zero, or (size_t)-1
 * with errno = EINVAL (a single trailing input byte is left) or E2BIG
 * (output full with at least one whole unit still pending).
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	int ir_count = 0;

	for (; in_left >= 2 && out_left >= 1; in_left -= 2, out_left -= 1) {
		*dst++ = src[0] & 0x7F;
		if (src[1] != 0) {
			ir_count++;
		}
		src += 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (in_left > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return ir_count;
}
394
/*
 * Narrow two-byte units to single bytes: the low byte of each unit is
 * emitted unchanged and the high byte is dropped.
 *
 * Returns the number of units whose high byte was non-zero, or (size_t)-1
 * with errno = EINVAL (a single trailing input byte is left) or E2BIG
 * (output full with at least one whole unit still pending).
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	int ir_count = 0;

	for (; in_left >= 2 && out_left >= 1; in_left -= 2, out_left -= 1) {
		*dst++ = src[0];
		if (src[1] != 0) {
			ir_count++;
		}
		src += 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left == 1) {
		errno = EINVAL;
		return (size_t)-1;
	}
	if (in_left > 1) {
		errno = E2BIG;
		return (size_t)-1;
	}

	return ir_count;
}
421
/* Value of one hexadecimal digit (either case), or -1 if `ch` is not one. */
static int ucs2hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
 * Decode the "UCS2-HEX" form into two-byte little-endian units.
 *
 * Any byte other than '@' is widened to one unit (high byte zero).  An '@'
 * introduces an escape of exactly four hexadecimal digits giving the unit's
 * value.
 *
 * The escape is parsed by hand rather than with sscanf(): the input is a
 * counted buffer that need not be NUL-terminated, and "%04x" would also
 * accept fewer than four digits, leading whitespace, a sign or a "0x"
 * prefix while the code still consumed five bytes.
 *
 * Returns 0 on success, or (size_t)-1 with errno set to
 *   EINVAL  - an escape is cut short by the end of the input,
 *   EILSEQ  - an escape contains a non-hexadecimal character,
 *   E2BIG   - the output buffer is full.
 * On error the pointers and counts reflect what was converted before the
 * failure.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* plain byte: widen to one unit */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character: '@' plus four digits */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit((*inbuf)[i]);
			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v&0xff;
		(*outbuf)[1] = v>>8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
464
/*
 * Encode two-byte units into the "UCS2-HEX" form.
 *
 * A unit whose second byte is zero and whose first byte has the top bit
 * clear and is not '@' is emitted as that single byte.  Every other unit is
 * emitted as '@' followed by four lower-case hex digits of SVAL(*inbuf, 0).
 *
 * Returns 0 on success, or (size_t)-1 with errno = E2BIG (not enough output
 * room) or EINVAL (a single trailing input byte is left).
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		char escaped[6];
		size_t produced;

		if (unit[1] == 0 && (unit[0] & 0x80) == 0 && unit[0] != '@') {
			/* passes through as a single byte */
			(*outbuf)[0] = unit[0];
			produced = 1;
		} else {
			if (*outbytesleft < 5) {
				errno = E2BIG;
				return -1;
			}
			snprintf(escaped, 6, "@%04x", SVAL(*inbuf, 0));
			memcpy(*outbuf, escaped, 5);
			produced = 5;
		}

		(*inbytesleft)  -= 2;
		(*outbytesleft) -= produced;
		(*inbuf)  += 2;
		(*outbuf) += produced;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
505
/*
 * Copy input to output while swapping the two bytes of every pair.
 *
 * n = min(input bytes, output bytes) bytes are processed; if n is odd the
 * final output byte is set to zero.  The count is kept in a size_t: the
 * previous `int` truncated the minimum of two size_t values, which for
 * lengths above INT_MAX produced a negative count and corrupted the
 * pointer/length bookkeeping.
 *
 * Returns 0 when all input was consumed, or (size_t)-1 with errno = E2BIG
 * when input remains because the output was too small.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n;

	n = MIN(*inbytesleft, *outbytesleft);

	/* swab() works on whole pairs only */
	swab(*inbuf, *outbuf, (n & ~(size_t)1));
	if (n & 1) {
		(*outbuf)[n-1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
530
531
/*
 * Identity conversion: copy min(input bytes, output bytes) bytes from the
 * input to the output.  memmove() is used, so the buffers may overlap.
 *
 * The count is kept in a size_t: the previous `int` truncated the minimum
 * of two size_t values, which for lengths above INT_MAX produced a negative
 * count and a wildly wrong memmove() length.
 *
 * Returns 0 when all input was consumed, or (size_t)-1 with errno = E2BIG
 * when input remains because the output was too small.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
553
/*
  this takes a UTF8 sequence and produces a UTF16 (little-endian) sequence

  One- to four-byte UTF-8 sequences are decoded.  Four-byte sequences that
  encode a value below 0x10000 are accepted and packed into a single unit;
  values from 0x10000 to 0x10FFFF are written as a surrogate pair.  Values
  above 0x10FFFF cannot be represented in UTF-16 (the high surrogate
  computation would overflow into the low surrogate range) and are rejected.

  Returns 0 on success or (size_t)-1 with errno set to
    EILSEQ - truncated or malformed multi-byte sequence, or a value above
             0x10FFFF,
    EINVAL - a lead byte that starts none of the handled sequence forms,
    E2BIG  - not enough room in the output buffer.
  In both cases the pointers and counts are updated to describe what was
  converted; on error *inbuf is left at the start of the offending sequence.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence: 21 payload bits */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10FFFF) {
				/* beyond the range a surrogate pair can
				   express */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate (0xD800 + top 10 bits), then low
			   surrogate (0xDC00 + bottom 10 bits) */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return -1;
}
673
674
/*
  this takes a UTF16 (little-endian) sequence and produces a UTF8 sequence

  Each 16-bit unit is encoded as one, two or three bytes; a high surrogate
  followed by a low surrogate is combined and encoded as four bytes.

  Returns 0 on success or (size_t)-1 with errno set to
    E2BIG  - not enough room in the output buffer,
    EINVAL - input ends with a single byte, or with a surrogate that has
             fewer than four input bytes left,
    EILSEQ - a low surrogate comes first, or a high surrogate is not
             followed by a low surrogate.
  In both cases the pointers and counts are updated to describe what was
  converted.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = 0;

	while (in_left >= 2 && out_left >= 1) {
		unsigned int unit = src[0] | (src[1] << 8);
		unsigned int codepoint;
		unsigned int low;

		if (unit < 0x80) {
			/* simplest case: one byte */
			dst[0] = unit;
			in_left  -= 2;
			out_left -= 1;
			src += 2;
			dst += 1;
			continue;
		}

		if (unit < 0x800) {
			/* next simplest case: two bytes */
			if (out_left < 2) {
				errno = E2BIG;
				result = (size_t)-1;
				break;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			in_left  -= 2;
			out_left -= 2;
			src += 2;
			dst += 2;
			continue;
		}

		if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			errno = (in_left < 4) ? EINVAL : EILSEQ;
			result = (size_t)-1;
			break;
		}

		if ((unit & 0xfc00) != 0xd800) {
			/* ordinary unit: three bytes */
			if (out_left < 3) {
				errno = E2BIG;
				result = (size_t)-1;
				break;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			in_left  -= 2;
			out_left -= 3;
			src += 2;
			dst += 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (in_left < 4) {
			errno = EINVAL;
			result = (size_t)-1;
			break;
		}
		if ((src[3] & 0xfc) != 0xdc) {
			errno = EILSEQ;
			result = (size_t)-1;
			break;
		}
		low = src[2] | (src[3] << 8);
		codepoint = 0x10000 + (((unit & 0x3ff) << 10) | (low & 0x3ff));

		if (out_left < 4) {
			errno = E2BIG;
			result = (size_t)-1;
			break;
		}
		dst[0] = 0xf0 | (codepoint >> 18);
		dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
		dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
		dst[3] = 0x80 | (codepoint & 0x3f);

		in_left  -= 4;
		out_left -= 4;
		src += 4;
		dst += 4;
	}

	if (result == 0) {
		/* the loop ran to completion: classify any leftover input */
		if (in_left == 1) {
			errno = EINVAL;
			result = (size_t)-1;
		} else if (in_left > 1) {
			errno = E2BIG;
			result = (size_t)-1;
		}
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;

	return result;
}
791
792
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  Returns 0 on success or (size_t)-1 with errno set to E2BIG (output too
  small) or EINVAL (input ends with a single byte).  In both cases the
  pointers and counts are updated to describe what was converted.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = 0;

	while (in_left >= 2 && out_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);

		if ((unit & 0xfc00) == 0xd800 && in_left >= 4) {
			unsigned int next = src[2] | (src[3] << 8);

			if ((next & 0xfc00) == 0xdc00) {
				/* well-formed surrogate pair: copy it
				   through unchanged */
				if (out_left < 4) {
					errno = E2BIG;
					result = (size_t)-1;
					break;
				}
				memcpy(dst, src, 4);
				in_left  -= 4;
				out_left -= 4;
				src += 4;
				dst += 4;
				continue;
			}
		}

		if (unit == 0) {
			unit = 1;
		} else if ((unit & 0xf800) == 0xd800) {
			/* any surrogate that reaches this point is unpaired:
			   a high one without a following low one, or a low
			   one without a preceding high one */
			unit = 0xfffd;
		}

		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;

		in_left  -= 2;
		out_left -= 2;
		src += 2;
		dst += 2;
	}

	if (result == 0) {
		/* the loop ran to completion: classify any leftover input */
		if (in_left == 1) {
			errno = EINVAL;
			result = (size_t)-1;
		} else if (in_left > 1) {
			errno = E2BIG;
			result = (size_t)-1;
		}
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;

	return result;
}
889
890
891
Note: See TracBrowser for help on using the repository browser.