source: trunk/server/lib/util/charset/iconv.c

Last change on this file was 752, checked in by Silvan Scherrer, 13 years ago

Samba Server: updated trunk to 3.6.9 2nd part

File size: 20.9 KB
Line 
1/*
2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include "includes.h"
22#include "../lib/util/dlinklist.h"
23#include "system/iconv.h"
24#include "system/filesys.h"
25
26#ifdef strcasecmp
27#undef strcasecmp
28#endif
29
30#ifdef static_decl_charset
31static_decl_charset;
32#endif
33
34/**
35 * @file
36 *
37 * @brief Samba wrapper/stub for iconv character set conversion.
38 *
39 * iconv is the XPG2 interface for converting between character
40 * encodings. This file provides a Samba wrapper around it, and also
41 * a simple reimplementation that is used if the system does not
42 * implement iconv.
43 *
44 * Samba only works with encodings that are supersets of ASCII: ascii
45 * characters like whitespace can be tested for directly, multibyte
46 * sequences start with a byte with the high bit set, and strings are
47 * terminated by a nul byte.
48 *
49 * Note that the only function provided by iconv is conversion between
50 * characters. It doesn't directly support operations like
51 * uppercasing or comparison. We have to convert to UTF-16LE and
52 * compare there.
53 *
54 * @sa Samba Developers Guide
55 **/
56
/*
 * Forward declarations of the builtin converters defined further down.
 * All of them share the iconv()-like calling convention: the buffer
 * pointers and remaining byte counts are updated in place.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
67
68static const struct charset_functions builtin_functions[] = {
69 /* windows is closest to UTF-16 */
70 {"UCS-2LE", iconv_copy, iconv_copy},
71 {"UTF-16LE", iconv_copy, iconv_copy},
72 {"UCS-2BE", iconv_swab, iconv_swab},
73 {"UTF-16BE", iconv_swab, iconv_swab},
74
75 /* we include the UTF-8 alias to cope with differing locale settings */
76 {"UTF8", utf8_pull, utf8_push},
77 {"UTF-8", utf8_pull, utf8_push},
78
79 /* this handles the munging needed for String2Key */
80 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy},
81
82 {"ASCII", ascii_pull, ascii_push},
83 {"646", ascii_pull, ascii_push},
84 {"ISO-8859-1", ascii_pull, latin1_push},
85 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
86};
87
88static struct charset_functions *charsets = NULL;
89
90static struct charset_functions *find_charset_functions(const char *name)
91{
92 struct charset_functions *c;
93
94 /* Check whether we already have this charset... */
95 for (c = charsets; c != NULL; c = c->next) {
96 if(strcasecmp(c->name, name) == 0) {
97 return c;
98 }
99 }
100
101 return NULL;
102}
103
104bool smb_register_charset(const struct charset_functions *funcs_in)
105{
106 struct charset_functions *funcs;
107
108 DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
109 /* Check whether we already have this charset... */
110 if (find_charset_functions(funcs_in->name)) {
111 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
112 return false;
113 }
114
115 funcs = talloc(NULL, struct charset_functions);
116 if (!funcs) {
117 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
118 return false;
119 }
120 *funcs = *funcs_in;
121
122 funcs->next = funcs->prev = NULL;
123 DEBUG(5, ("Registered charset %s\n", funcs->name));
124 DLIST_ADD(charsets, funcs);
125 return true;
126}
127
/*
 * Run the build-time charset initialisation hook (static_init_charset)
 * once, on first use.  A no-op when the hook is not defined.
 */
static void lazy_initialize_iconv(void)
{
	static bool initialized;

#ifdef static_init_charset
	if (initialized) {
		return;
	}

	static_init_charset;
	initialized = true;
#endif
}
139
#if defined(__OS2__) && defined(__INNOTEK_LIBC__)
#include <uconv.h>

/*
 * View of the handle returned by the libc iconv_open(): only the
 * leading UconvObject is accessed here.
 * NOTE(review): presumably mirrors the start of the libc's private
 * iconv descriptor - confirm against the libc in use.
 */
typedef struct os2_iconv_t
{
	UconvObject from;
} os2_iconv_t;

/*
 * iconv_open() replacement for OS/2: opens the conversion as usual and
 * then marks the "from" conversion object as handling path names.
 * Returns whatever iconv_open() returned, including (iconv_t)-1.
 */
iconv_t os2_iconv_open (const char *tocode, const char *fromcode)
{
	os2_iconv_t *os2_cd = (os2_iconv_t *)iconv_open(tocode, fromcode);
	uconv_attribute_t attr;

	if (os2_cd == (iconv_t)(-1)) {
		return (iconv_t)os2_cd;
	}

	/* Assume strings contain pathnames */
	UniQueryUconvObject(os2_cd->from, &attr,
			    sizeof(uconv_attribute_t),
			    NULL, NULL, NULL );
	attr.converttype |= CVTTYPE_PATH;
	UniSetUconvObject(os2_cd->from, &attr);

	return (iconv_t)os2_cd;
}

/* From here on, every iconv_open() in this file goes through the wrapper. */
#define iconv_open os2_iconv_open
#endif
169
#ifdef HAVE_NATIVE_ICONV
/*
 * Adapter that lets the system iconv() be used as a pull/push/direct
 * converter.
 *
 * If there was an error then reset the internal state; this ensures
 * that we don't have a shift state remaining for character sets like
 * SJIS.
 *
 * The errno set by the failed conversion is preserved across the reset
 * call, because callers (see smb_iconv) look at errno to tell E2BIG
 * apart from hard errors.
 *
 * Returns whatever iconv() returned; (size_t)-1 on failure with errno set.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);

	if (ret == (size_t)-1) {
		int saved_errno = errno;

		/* reset to the initial shift state */
		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
185
186/**
187 * This is a simple portable iconv() implementaion.
188 *
189 * It only knows about a very small number of character sets - just
190 * enough that Samba works on systems that don't have iconv.
191 **/
192_PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
193 const char **inbuf, size_t *inbytesleft,
194 char **outbuf, size_t *outbytesleft)
195{
196 char cvtbuf[2048];
197 size_t bufsize;
198
199 /* in many cases we can go direct */
200 if (cd->direct) {
201 return cd->direct(cd->cd_direct,
202 inbuf, inbytesleft, outbuf, outbytesleft);
203 }
204
205
206 /* otherwise we have to do it chunks at a time */
207 while (*inbytesleft > 0) {
208 char *bufp1 = cvtbuf;
209 const char *bufp2 = cvtbuf;
210
211 bufsize = sizeof(cvtbuf);
212
213 if (cd->pull(cd->cd_pull,
214 inbuf, inbytesleft, &bufp1, &bufsize) == -1
215 && errno != E2BIG) return -1;
216
217 bufsize = sizeof(cvtbuf) - bufsize;
218
219 if (cd->push(cd->cd_push,
220 &bufp2, &bufsize,
221 outbuf, outbytesleft) == -1) return -1;
222 }
223
224 return 0;
225}
226
/*
 * True if "name" is one of the little-endian 16-bit names
 * ("UCS-2LE" or "UTF-16LE"), compared without regard to case.
 */
static bool is_utf16(const char *name)
{
	if (strcasecmp(name, "UCS-2LE") == 0) {
		return true;
	}
	return strcasecmp(name, "UTF-16LE") == 0;
}
232
233static int smb_iconv_t_destructor(smb_iconv_t hwd)
234{
235#ifdef HAVE_NATIVE_ICONV
236 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
237 iconv_close(hwd->cd_pull);
238 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
239 iconv_close(hwd->cd_push);
240 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
241 iconv_close(hwd->cd_direct);
242#endif
243
244 return 0;
245}
246
247_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
248 const char *fromcode, bool native_iconv)
249{
250 smb_iconv_t ret;
251 const struct charset_functions *from=NULL, *to=NULL;
252 int i;
253
254 lazy_initialize_iconv();
255
256 ret = (smb_iconv_t)talloc_named(mem_ctx,
257 sizeof(*ret),
258 "iconv(%s,%s)", tocode, fromcode);
259 if (!ret) {
260 errno = ENOMEM;
261 return (smb_iconv_t)-1;
262 }
263 memset(ret, 0, sizeof(*ret));
264 talloc_set_destructor(ret, smb_iconv_t_destructor);
265
266 /* check for the simplest null conversion */
267 if (strcmp(fromcode, tocode) == 0) {
268 ret->direct = iconv_copy;
269 return ret;
270 }
271
272 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
273 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
274 from = &builtin_functions[i];
275 }
276 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
277 to = &builtin_functions[i];
278 }
279 }
280
281 if (from == NULL) {
282 for (from=charsets; from; from=from->next) {
283 if (strcasecmp(from->name, fromcode) == 0) break;
284 }
285 }
286
287 if (to == NULL) {
288 for (to=charsets; to; to=to->next) {
289 if (strcasecmp(to->name, tocode) == 0) break;
290 }
291 }
292
293#ifdef HAVE_NATIVE_ICONV
294 if ((!from || !to) && !native_iconv) {
295 goto failed;
296 }
297 if (!from) {
298 ret->pull = sys_iconv;
299 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
300 if (ret->cd_pull == (iconv_t)-1)
301 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
302 if (ret->cd_pull == (iconv_t)-1) goto failed;
303 }
304
305 if (!to) {
306 ret->push = sys_iconv;
307 ret->cd_push = iconv_open(tocode, "UTF-16LE");
308 if (ret->cd_push == (iconv_t)-1)
309 ret->cd_push = iconv_open(tocode, "UCS-2LE");
310 if (ret->cd_push == (iconv_t)-1) goto failed;
311 }
312#else
313 if (!from || !to) {
314 goto failed;
315 }
316#endif
317
318 /* check for conversion to/from ucs2 */
319 if (is_utf16(fromcode) && to) {
320 ret->direct = to->push;
321 return ret;
322 }
323 if (is_utf16(tocode) && from) {
324 ret->direct = from->pull;
325 return ret;
326 }
327
328#ifdef HAVE_NATIVE_ICONV
329 if (is_utf16(fromcode)) {
330 ret->direct = sys_iconv;
331 ret->cd_direct = ret->cd_push;
332 ret->cd_push = NULL;
333 return ret;
334 }
335 if (is_utf16(tocode)) {
336 ret->direct = sys_iconv;
337 ret->cd_direct = ret->cd_pull;
338 ret->cd_pull = NULL;
339 return ret;
340 }
341#endif
342
343 /* the general case has to go via a buffer */
344 if (!ret->pull) ret->pull = from->pull;
345 if (!ret->push) ret->push = to->push;
346 return ret;
347
348failed:
349 talloc_free(ret);
350 errno = EINVAL;
351 return (smb_iconv_t)-1;
352}
353
354/*
355 simple iconv_open() wrapper
356 */
357_PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
358{
359 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
360}
361
362/*
363 simple iconv_close() wrapper
364*/
365_PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
366{
367 talloc_free(cd);
368 return 0;
369}
370
371
372/**********************************************************************
373 the following functions implement the builtin character sets in Samba
374 and also the "test" character sets that are designed to test
375 multi-byte character set support for english users
376***********************************************************************/
/*
 * Widen single bytes to 16-bit little-endian units: every input byte
 * becomes that byte followed by a zero byte.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = E2BIG (output full).  Pointers and counts are advanced.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;

	(void)cd;

	for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
		*dst++ = *src++;
		*dst++ = 0;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
396
/*
 * Narrow 16-bit little-endian units to 7-bit bytes: the low byte is
 * masked with 0x7F and the high byte is dropped.
 *
 * Returns the number of units whose high byte was non-zero, or
 * (size_t)-1 with errno = EINVAL (one stray input byte left) or
 * E2BIG (output full).  Pointers and counts are advanced.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	int lossy = 0;

	(void)cd;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;

		**outbuf = unit[0] & 0x7F;
		if (unit[1] != 0) {
			lossy++;
		}

		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 1;
		*outbytesleft -= 1;
	}

	if (*inbytesleft >= 1) {
		errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
		return -1;
	}

	return lossy;
}
423
/*
 * Narrow 16-bit little-endian units to single bytes by keeping the low
 * byte unchanged and dropping the high byte.
 *
 * Returns the number of units whose high byte was non-zero, or
 * (size_t)-1 with errno = EINVAL (one stray input byte left) or
 * E2BIG (output full).  Pointers and counts are advanced.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	int lossy = 0;

	(void)cd;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;

		**outbuf = unit[0];
		if (unit[1] != 0) {
			lossy++;
		}

		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 1;
		*outbytesleft -= 1;
	}

	if (*inbytesleft >= 1) {
		errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
		return -1;
	}

	return lossy;
}
450
/* Value of a single hexadecimal digit, or -1 if ch is not one. */
static int ucs2hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
 * Decode the "UCS2-HEX" form into 16-bit little-endian units.
 *
 * Any byte other than '@' is widened as is; '@' must be followed by
 * exactly four hexadecimal digits giving the 16-bit value.
 *
 * The digits are parsed by hand rather than with sscanf(): the input
 * is a counted buffer that need not be NUL-terminated, and "%04x"
 * would also accept short runs, leading white space, a sign or "0x".
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = EINVAL (fewer than 5 bytes left after '@'), EILSEQ (a
 * non-hex digit after '@') or E2BIG (output full).  Pointers and
 * counts are advanced past everything converted so far.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	(void)cd;

	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* plain byte: widen to one 16-bit unit */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		/* store little-endian */
		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
493
/*
 * Encode 16-bit little-endian units in the "UCS2-HEX" form.
 *
 * A unit with a zero high byte whose low byte is 7-bit and not '@' is
 * written as that single byte; every other unit is written as '@'
 * followed by four lower-case hex digits.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = EINVAL (one stray input byte left) or E2BIG (output full).
 * Pointers and counts are advanced past everything converted so far.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	(void)cd;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		char hex[6];
		unsigned int value;
		bool plain;

		plain = unit[1] == 0 &&
			(unit[0] & 0x80) == 0 &&
			unit[0] != '@';

		if (plain) {
			**outbuf = unit[0];
			*inbuf += 2;
			*inbytesleft -= 2;
			*outbuf += 1;
			*outbytesleft -= 1;
			continue;
		}

		/* escaped form needs '@' plus four digits */
		if (*outbytesleft < 5) {
			errno = E2BIG;
			return -1;
		}

		/* 16-bit little-endian read of the unit */
		value = (unsigned int)(unsigned char)unit[0] |
			((unsigned int)(unsigned char)unit[1] << 8);

		snprintf(hex, sizeof(hex), "@%04x", value);
		memcpy(*outbuf, hex, 5);

		*inbuf += 2;
		*inbytesleft -= 2;
		*outbuf += 5;
		*outbytesleft -= 5;
	}

	if (*inbytesleft >= 1) {
		errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
		return -1;
	}

	return 0;
}
534
/*
 * Byte-swap 16-bit units (UTF-16BE/UCS-2BE <-> little-endian).
 *
 * As many bytes as fit in both buffers are processed.  If that count
 * is odd, the pairs before it are swapped and the last output byte is
 * set to zero.
 *
 * The count is kept in a size_t so that large buffers are not
 * truncated the way an int would truncate them.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = E2BIG.  Pointers and counts are advanced.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n;

	(void)cd;

	n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	/* swab() only handles whole pairs */
	swab(*inbuf, *outbuf, n & ~(size_t)1);
	if (n & 1) {
		(*outbuf)[n-1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
559
560
/*
 * Identity conversion: copy as many bytes as fit in both buffers.
 * memmove() is used, so overlapping buffers are handled.
 *
 * The count is kept in a size_t so that large buffers are not
 * truncated the way an int would truncate them.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno = E2BIG.  Pointers and counts are advanced.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n;

	(void)cd;

	n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
582
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  The output is little-endian.  1-, 2- and 3-byte sequences give one
  16-bit unit; 4-byte sequences give one unit when the value is below
  0x10000 (non-minimal encodings are accepted, as before) and a
  surrogate pair otherwise.  4-byte sequences that encode a value above
  U+10FFFF are rejected with EILSEQ: they cannot be expressed as a
  surrogate pair, and the pair arithmetic would otherwise emit invalid
  UTF-16.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno = EILSEQ (bad or truncated multi-byte sequence, or value out
  of range), EINVAL (unsupported lead byte) or E2BIG (output full).
  In every case the pointers and counts are advanced past what was
  converted successfully.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	(void)cd;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence: 21 payload bits */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10ffff) {
				/* beyond the range a surrogate pair
				   can represent */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate, then low surrogate */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	/* report how far we got */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return -1;
}
702
703
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Input is read as little-endian 16-bit units.  Units below 0x80 give
  one byte, below 0x800 two bytes, other non-surrogate units three
  bytes, and a high/low surrogate pair gives four bytes.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno = EINVAL (stray input byte, or a surrogate with fewer than 4
  input bytes left), EILSEQ (unpaired surrogate) or E2BIG (output
  full).  In every case the pointers and counts are advanced past what
  was converted successfully.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = 0;

	(void)cd;

	while (src_left >= 2 && dst_left >= 1) {
		unsigned int unit = src[0] | (src[1] << 8);
		unsigned int codepoint;
		unsigned int low;

		if (unit < 0x80) {
			/* simplest case */
			dst[0] = unit;
			src += 2;
			src_left -= 2;
			dst += 1;
			dst_left -= 1;
			continue;
		}

		if (unit < 0x800) {
			/* next simplest case */
			if (dst_left < 2) {
				errno = E2BIG;
				goto fail;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 2;
			dst_left -= 2;
			continue;
		}

		if ((unit & 0xfc00) == 0xdc00) {
			/* its the second part of a 4 byte sequence. Illegal */
			errno = (src_left < 4) ? EINVAL : EILSEQ;
			goto fail;
		}

		if ((unit & 0xfc00) != 0xd800) {
			/* ordinary unit: three byte sequence */
			codepoint = unit;
			if (dst_left < 3) {
				errno = E2BIG;
				goto fail;
			}
			dst[0] = 0xe0 | (codepoint >> 12);
			dst[1] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[2] = 0x80 | (codepoint & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 3;
			dst_left -= 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (src_left < 4) {
			errno = EINVAL;
			goto fail;
		}
		low = src[2] | (src[3] << 8);
		if ((low & 0xfc00) != 0xdc00) {
			errno = EILSEQ;
			goto fail;
		}
		codepoint = 0x10000 + (((unit & 0x3ff) << 10) | (low & 0x3ff));

		if (dst_left < 4) {
			errno = E2BIG;
			goto fail;
		}
		dst[0] = 0xf0 | (codepoint >> 18);
		dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
		dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
		dst[3] = 0x80 | (codepoint & 0x3f);
		src += 4;
		src_left -= 4;
		dst += 4;
		dst_left -= 4;
	}

	if (src_left == 1) {
		errno = EINVAL;
		goto fail;
	}
	if (src_left > 1) {
		errno = E2BIG;
		goto fail;
	}
	goto done;

fail:
	result = (size_t)-1;
done:
	/* report how far we got, on success and on failure alike */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
820
821
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  Input and output are little-endian 16-bit units.  Returns 0 when all
  input was consumed, otherwise (size_t)-1 with errno = EINVAL (one
  stray input byte left) or E2BIG (output full).  In every case the
  pointers and counts are advanced past what was converted.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			       char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = 0;

	(void)cd;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);
		bool is_high = (unit & 0xfc00) == 0xd800;
		bool is_low = (unit & 0xfc00) == 0xdc00;

		/* a high surrogate directly followed by a low one is kept */
		if (is_high && src_left >= 4 &&
		    ((src[2] | (src[3] << 8)) & 0xfc00) == 0xdc00) {
			if (dst_left < 4) {
				errno = E2BIG;
				goto fail;
			}
			memcpy(dst, src, 4);
			src += 4;
			src_left -= 4;
			dst += 4;
			dst_left -= 4;
			continue;
		}

		if (unit == 0) {
			unit = 1;
		} else if (is_high || is_low) {
			/* unpaired surrogate: convert to 0xfffd */
			unit = 0xfffd;
		}

		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;
		src += 2;
		src_left -= 2;
		dst += 2;
		dst_left -= 2;
	}

	if (src_left == 1) {
		errno = EINVAL;
		goto fail;
	}
	if (src_left > 1) {
		errno = E2BIG;
		goto fail;
	}
	goto done;

fail:
	result = (size_t)-1;
done:
	/* report how far we got, on success and on failure alike */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
918
919
920
Note: See TracBrowser for help on using the repository browser.