Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

iconv.c

Visit:

Last change on this file was 752, checked in by Silvan Scherrer, 13 years ago
Samba Server: updated trunk to 3.6.9 2nd part
File size: 20.9 KB

Line
1	/*
2	Unix SMB/CIFS implementation.
3	minimal iconv implementation
4	Copyright (C) Andrew Tridgell 2001
5	Copyright (C) Jelmer Vernooij 2002
6
7	This program is free software; you can redistribute it and/or modify
8	it under the terms of the GNU General Public License as published by
9	the Free Software Foundation; either version 3 of the License, or
10	(at your option) any later version.
11
12	This program is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	GNU General Public License for more details.
16
17	You should have received a copy of the GNU General Public License
18	along with this program. If not, see <http://www.gnu.org/licenses/>.
19	*/
20
21	#include "includes.h"
22	#include "../lib/util/dlinklist.h"
23	#include "system/iconv.h"
24	#include "system/filesys.h"
25
26	#ifdef strcasecmp
27	#undef strcasecmp
28	#endif
29
30	#ifdef static_decl_charset
31	static_decl_charset;
32	#endif
33
34	/**
35	* @file
36	*
37	* @brief Samba wrapper/stub for iconv character set conversion.
38	*
39	* iconv is the XPG2 interface for converting between character
40	* encodings. This file provides a Samba wrapper around it, and also
41	* a simple reimplementation that is used if the system does not
42	* implement iconv.
43	*
44	* Samba only works with encodings that are supersets of ASCII: ascii
45	* characters like whitespace can be tested for directly, multibyte
46	* sequences start with a byte with the high bit set, and strings are
47	* terminated by a nul byte.
48	*
49	* Note that the only function provided by iconv is conversion between
50	* characters. It doesn't directly support operations like
51	* uppercasing or comparison. We have to convert to UTF-16LE and
52	* compare there.
53	*
54	* @sa Samba Developers Guide
55	**/
56
/*
 * Forward declarations of the built-in converters.
 *
 * Every converter shares the same iconv()-style calling convention: the
 * input/output cursors and their remaining byte counts are passed by
 * address and advanced in place; the return value is (size_t)-1 with
 * errno set on failure.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
67
68	static const struct charset_functions builtin_functions[] = {
69	/* windows is closest to UTF-16 */
70	{"UCS-2LE", iconv_copy, iconv_copy},
71	{"UTF-16LE", iconv_copy, iconv_copy},
72	{"UCS-2BE", iconv_swab, iconv_swab},
73	{"UTF-16BE", iconv_swab, iconv_swab},
74
75	/* we include the UTF-8 alias to cope with differing locale settings */
76	{"UTF8", utf8_pull, utf8_push},
77	{"UTF-8", utf8_pull, utf8_push},
78
79	/* this handles the munging needed for String2Key */
80	{"UTF16_MUNGED", utf16_munged_pull, iconv_copy},
81
82	{"ASCII", ascii_pull, ascii_push},
83	{"646", ascii_pull, ascii_push},
84	{"ISO-8859-1", ascii_pull, latin1_push},
85	{"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
86	};
87
88	static struct charset_functions *charsets = NULL;
89
90	static struct charset_functions find_charset_functions(const char name)
91	{
92	struct charset_functions *c;
93
94	/* Check whether we already have this charset... */
95	for (c = charsets; c != NULL; c = c->next) {
96	if(strcasecmp(c->name, name) == 0) {
97	return c;
98	}
99	}
100
101	return NULL;
102	}
103
104	bool smb_register_charset(const struct charset_functions *funcs_in)
105	{
106	struct charset_functions *funcs;
107
108	DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
109	/* Check whether we already have this charset... */
110	if (find_charset_functions(funcs_in->name)) {
111	DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
112	return false;
113	}
114
115	funcs = talloc(NULL, struct charset_functions);
116	if (!funcs) {
117	DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
118	return false;
119	}
120	funcs = funcs_in;
121
122	funcs->next = funcs->prev = NULL;
123	DEBUG(5, ("Registered charset %s\n", funcs->name));
124	DLIST_ADD(charsets, funcs);
125	return true;
126	}
127
/**
 * Run the build-time charset initialisation hook once.
 *
 * When the build defines static_init_charset, that macro is expanded on
 * the first call only; otherwise this function does nothing.
 */
static void lazy_initialize_iconv(void)
{
	static bool initialized;

#ifdef static_init_charset
	if (!initialized) {
		static_init_charset;
		initialized = true;
	}
#endif
}
139
#if defined(__OS2__) && defined(__INNOTEK_LIBC__)
#include <uconv.h>

/*
 * View of the handle returned by iconv_open() on this platform; only the
 * leading UconvObject member is accessed here.
 */
typedef struct os2_iconv_t
{
	UconvObject from;
} os2_iconv_t;

/**
 * iconv_open() wrapper for OS/2 that additionally sets CVTTYPE_PATH on the
 * handle's "from" conversion object.
 *
 * @return the handle from iconv_open(), or (iconv_t)-1 on failure
 */
iconv_t os2_iconv_open(const char *tocode, const char *fromcode)
{
	os2_iconv_t *os2_cd = (os2_iconv_t *)iconv_open(tocode, fromcode);

	if (os2_cd != (iconv_t)(-1))
	{
		/* Assume strings contain pathnames */
		uconv_attribute_t attr;

		UniQueryUconvObject(os2_cd->from, &attr,
				    sizeof(uconv_attribute_t),
				    NULL, NULL, NULL);
		attr.converttype |= CVTTYPE_PATH;
		UniSetUconvObject(os2_cd->from, &attr);
	}

	return (iconv_t)os2_cd;
}

/* every later iconv_open() call in this file goes through the wrapper */
#define iconv_open os2_iconv_open
#endif
169
#ifdef HAVE_NATIVE_ICONV
/**
 * Adapter that exposes the system iconv() through the converter
 * signature used in this file.
 *
 * If there was an error then reset the internal state; this ensures that
 * we don't have a shift state remaining for character sets like SJIS.
 *
 * @param cd the native iconv_t handle, passed as an opaque pointer
 * @return whatever iconv() returned ((size_t)-1 on error, errno set by
 *         iconv())
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);
	if (ret == (size_t)-1) {
		iconv(cd, NULL, NULL, NULL, NULL);
	}
	return ret;
}
#endif
185
186	/**
187	* This is a simple portable iconv() implementaion.
188	*
189	* It only knows about a very small number of character sets - just
190	* enough that Samba works on systems that don't have iconv.
191	**/
192	_PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
193	const char *inbuf, size_t inbytesleft,
194	char *outbuf, size_t outbytesleft)
195	{
196	char cvtbuf[2048];
197	size_t bufsize;
198
199	/* in many cases we can go direct */
200	if (cd->direct) {
201	return cd->direct(cd->cd_direct,
202	inbuf, inbytesleft, outbuf, outbytesleft);
203	}
204
205
206	/* otherwise we have to do it chunks at a time */
207	while (*inbytesleft > 0) {
208	char *bufp1 = cvtbuf;
209	const char *bufp2 = cvtbuf;
210
211	bufsize = sizeof(cvtbuf);
212
213	if (cd->pull(cd->cd_pull,
214	inbuf, inbytesleft, &bufp1, &bufsize) == -1
215	&& errno != E2BIG) return -1;
216
217	bufsize = sizeof(cvtbuf) - bufsize;
218
219	if (cd->push(cd->cd_push,
220	&bufp2, &bufsize,
221	outbuf, outbytesleft) == -1) return -1;
222	}
223
224	return 0;
225	}
226
/**
 * Tell whether a charset name denotes the little-endian 16-bit encoding
 * used as the internal pivot ("UCS-2LE" or "UTF-16LE", case-insensitive).
 */
static bool is_utf16(const char *name)
{
	return strcasecmp(name, "UCS-2LE") == 0 ||
		strcasecmp(name, "UTF-16LE") == 0;
}
232
233	static int smb_iconv_t_destructor(smb_iconv_t hwd)
234	{
235	#ifdef HAVE_NATIVE_ICONV
236	if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
237	iconv_close(hwd->cd_pull);
238	if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
239	iconv_close(hwd->cd_push);
240	if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
241	iconv_close(hwd->cd_direct);
242	#endif
243
244	return 0;
245	}
246
247	_PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX mem_ctx, const char tocode,
248	const char *fromcode, bool native_iconv)
249	{
250	smb_iconv_t ret;
251	const struct charset_functions from=NULL, to=NULL;
252	int i;
253
254	lazy_initialize_iconv();
255
256	ret = (smb_iconv_t)talloc_named(mem_ctx,
257	sizeof(*ret),
258	"iconv(%s,%s)", tocode, fromcode);
259	if (!ret) {
260	errno = ENOMEM;
261	return (smb_iconv_t)-1;
262	}
263	memset(ret, 0, sizeof(*ret));
264	talloc_set_destructor(ret, smb_iconv_t_destructor);
265
266	/* check for the simplest null conversion */
267	if (strcmp(fromcode, tocode) == 0) {
268	ret->direct = iconv_copy;
269	return ret;
270	}
271
272	for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
273	if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
274	from = &builtin_functions[i];
275	}
276	if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
277	to = &builtin_functions[i];
278	}
279	}
280
281	if (from == NULL) {
282	for (from=charsets; from; from=from->next) {
283	if (strcasecmp(from->name, fromcode) == 0) break;
284	}
285	}
286
287	if (to == NULL) {
288	for (to=charsets; to; to=to->next) {
289	if (strcasecmp(to->name, tocode) == 0) break;
290	}
291	}
292
293	#ifdef HAVE_NATIVE_ICONV
294	if ((!from \|\| !to) && !native_iconv) {
295	goto failed;
296	}
297	if (!from) {
298	ret->pull = sys_iconv;
299	ret->cd_pull = iconv_open("UTF-16LE", fromcode);
300	if (ret->cd_pull == (iconv_t)-1)
301	ret->cd_pull = iconv_open("UCS-2LE", fromcode);
302	if (ret->cd_pull == (iconv_t)-1) goto failed;
303	}
304
305	if (!to) {
306	ret->push = sys_iconv;
307	ret->cd_push = iconv_open(tocode, "UTF-16LE");
308	if (ret->cd_push == (iconv_t)-1)
309	ret->cd_push = iconv_open(tocode, "UCS-2LE");
310	if (ret->cd_push == (iconv_t)-1) goto failed;
311	}
312	#else
313	if (!from \|\| !to) {
314	goto failed;
315	}
316	#endif
317
318	/* check for conversion to/from ucs2 */
319	if (is_utf16(fromcode) && to) {
320	ret->direct = to->push;
321	return ret;
322	}
323	if (is_utf16(tocode) && from) {
324	ret->direct = from->pull;
325	return ret;
326	}
327
328	#ifdef HAVE_NATIVE_ICONV
329	if (is_utf16(fromcode)) {
330	ret->direct = sys_iconv;
331	ret->cd_direct = ret->cd_push;
332	ret->cd_push = NULL;
333	return ret;
334	}
335	if (is_utf16(tocode)) {
336	ret->direct = sys_iconv;
337	ret->cd_direct = ret->cd_pull;
338	ret->cd_pull = NULL;
339	return ret;
340	}
341	#endif
342
343	/* the general case has to go via a buffer */
344	if (!ret->pull) ret->pull = from->pull;
345	if (!ret->push) ret->push = to->push;
346	return ret;
347
348	failed:
349	talloc_free(ret);
350	errno = EINVAL;
351	return (smb_iconv_t)-1;
352	}
353
354	/*
355	simple iconv_open() wrapper
356	*/
357	_PUBLIC_ smb_iconv_t smb_iconv_open(const char tocode, const char fromcode)
358	{
359	return smb_iconv_open_ex(NULL, tocode, fromcode, true);
360	}
361
362	/*
363	simple iconv_close() wrapper
364	*/
365	_PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
366	{
367	talloc_free(cd);
368	return 0;
369	}
370
371
372	/**********************************************************************
373	the following functions implement the builtin character sets in Samba
374	and also the "test" character sets that are designed to test
375	multi-byte character set support for english users
376	***********************************************************************/
/**
 * Widen single-byte input to 16-bit little-endian units.
 *
 * Each input byte becomes the low byte of an output unit whose high byte
 * is zero.  All cursors and counters are advanced in place.
 *
 * @param cd unused
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when the output filled up first
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		(*outbuf)[0] = (*inbuf)[0];
		(*outbuf)[1] = 0;
		(*inbytesleft)  -= 1;
		(*outbytesleft) -= 2;
		(*inbuf)  += 1;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
396
/**
 * Narrow 16-bit little-endian units to 7-bit ASCII.
 *
 * The low byte of each unit is masked with 0x7F; units with a non-zero
 * high byte are still emitted but counted as irreversible conversions.
 *
 * @param cd unused
 * @return the number of units whose high byte was non-zero, or
 *         (size_t)-1 with errno = EINVAL (a single trailing input byte)
 *         or E2BIG (output filled up first)
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	int ir_count = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		(*outbuf)[0] = (*inbuf)[0] & 0x7F;
		if ((*inbuf)[1]) ir_count++;
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return ir_count;
}
423
/**
 * Narrow 16-bit little-endian units to single bytes (ISO-8859-1).
 *
 * The low byte of each unit is copied unchanged; units with a non-zero
 * high byte are still emitted but counted as irreversible conversions.
 *
 * @param cd unused
 * @return the number of units whose high byte was non-zero, or
 *         (size_t)-1 with errno = EINVAL (a single trailing input byte)
 *         or E2BIG (output filled up first)
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	int ir_count = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		(*outbuf)[0] = (*inbuf)[0];
		if ((*inbuf)[1]) ir_count++;
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 1;
		(*inbuf)  += 2;
		(*outbuf) += 1;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return ir_count;
}
450
/**
 * Decode the "UCS2-HEX" test charset into 16-bit little-endian units.
 *
 * Any byte other than '@' is widened like ASCII.  An '@' introduces a
 * five-byte escape: the four bytes after it are parsed as a hexadecimal
 * value, which is stored low byte first.
 *
 * @param cd unused
 * @return 0 when all input was consumed; (size_t)-1 with errno = EINVAL
 *         (escape cut short), EILSEQ (escape is not hex) or E2BIG
 *         (output filled up first)
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}
		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
			errno = EILSEQ;
			return -1;
		}

		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
493
/**
 * Encode 16-bit little-endian units into the "UCS2-HEX" test charset.
 *
 * Units with a zero high byte, a low byte below 0x80 and not equal to
 * '@' are emitted as that single byte.  Everything else is written as
 * '@' followed by four lowercase hex digits (five output bytes).
 *
 * @param cd unused
 * @return 0 when all input was consumed; (size_t)-1 with errno = EINVAL
 *         (a single trailing input byte) or E2BIG (output too small)
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		char buf[6];

		if ((*inbuf)[1] == 0 &&
		    ((*inbuf)[0] & 0x80) == 0 &&
		    (*inbuf)[0] != '@') {
			(*outbuf)[0] = (*inbuf)[0];
			(*inbytesleft)  -= 2;
			(*outbytesleft) -= 1;
			(*inbuf)  += 2;
			(*outbuf) += 1;
			continue;
		}
		if (*outbytesleft < 5) {
			errno = E2BIG;
			return -1;
		}
		/* format into a scratch buffer so the NUL is not copied out */
		snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
		memcpy(*outbuf, buf, 5);
		(*inbytesleft)  -= 2;
		(*outbytesleft) -= 5;
		(*inbuf)  += 2;
		(*outbuf) += 5;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
534
/**
 * Byte-swap 16-bit units (used for the big-endian UCS-2/UTF-16 names).
 *
 * Processes as many bytes as both buffers allow.  Complete byte pairs
 * are swapped with swab(); if that count is odd, the last output byte is
 * set to zero.  Cursors and counters are advanced in place.
 *
 * @param cd unused
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when the output was smaller than the input
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/* size_t rather than int so large buffers are not truncated */
	size_t n;

	n = MIN(*inbytesleft, *outbytesleft);

	swab(*inbuf, *outbuf, (n & ~(size_t)1));
	if (n & 1) {
		(*outbuf)[n-1] = 0;
	}

	(*inbytesleft)  -= n;
	(*outbytesleft) -= n;
	(*inbuf)  += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
559
560
/**
 * Identity conversion: copy bytes from input to output.
 *
 * Copies as many bytes as both buffers allow (memmove, so the buffers
 * may overlap) and advances cursors and counters in place.
 *
 * @param cd unused
 * @return 0 when all input was consumed; (size_t)-1 with errno = E2BIG
 *         when the output was smaller than the input
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/* size_t rather than int so large buffers are not truncated */
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft
						  : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft)  -= n;
	(*outbytesleft) -= n;
	(*inbuf)  += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
582
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  Output is little-endian.  One- to three-byte sequences produce one
  16-bit unit; four-byte sequences produce one unit if the value is below
  0x10000 and a surrogate pair otherwise.  Cursors and counters are
  written back on both success and failure, positioned at the first
  unconverted sequence.

  Returns 0 on success, or (size_t)-1 with errno set to:
    EILSEQ - bad or missing continuation byte, or a value above U+10FFFF
    EINVAL - lead byte of a sequence longer than four bytes (or a stray
             continuation byte)
    E2BIG  - output buffer too small
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* plain ASCII */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence: 11 payload bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence: 16 payload bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10ffff) {
				/* not representable as a surrogate pair;
				   encoding it anyway would corrupt the
				   high surrogate */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate 0xD800|top 10 bits, then low
			   surrogate 0xDC00|bottom 10 bits, both LE */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return -1;
}
702
703
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Input is little-endian.  Cursors and counters are written back on both
  success and failure, positioned at the first unconverted unit.

  Returns 0 on success, or (size_t)-1 with errno set to:
    EINVAL - input ends with a single byte or with half of a surrogate
             pair
    EILSEQ - a low surrogate comes first, or a high surrogate is not
             followed by a low surrogate
    E2BIG  - output buffer too small
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	uint8_t *c = (uint8_t *)*outbuf;
	const uint8_t *uc = (const uint8_t *)*inbuf;

	while (in_left >= 2 && out_left >= 1) {
		unsigned int codepoint;

		if (uc[1] == 0 && !(uc[0] & 0x80)) {
			/* simplest case */
			c[0] = uc[0];
			in_left  -= 2;
			out_left -= 1;
			uc += 2;
			c  += 1;
			continue;
		}

		if ((uc[1]&0xf8) == 0) {
			/* next simplest case: value fits in 11 bits */
			if (out_left < 2) {
				errno = E2BIG;
				goto error;
			}
			c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
			c[1] = 0x80 | (uc[0] & 0x3f);
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			c  += 2;
			continue;
		}

		if ((uc[1] & 0xfc) == 0xdc) {
			/* its the second part of a 4 byte sequence. Illegal */
			if (in_left < 4) {
				errno = EINVAL;
			} else {
				errno = EILSEQ;
			}
			goto error;
		}

		if ((uc[1] & 0xfc) != 0xd8) {
			/* ordinary 16-bit value: three byte sequence */
			codepoint = uc[0] | (uc[1]<<8);
			if (out_left < 3) {
				errno = E2BIG;
				goto error;
			}
			c[0] = 0xe0 | (codepoint >> 12);
			c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
			c[2] = 0x80 | (codepoint & 0x3f);

			in_left  -= 2;
			out_left -= 3;
			uc  += 2;
			c   += 3;
			continue;
		}

		/* its the first part of a 4 byte sequence */
		if (in_left < 4) {
			errno = EINVAL;
			goto error;
		}
		if ((uc[3] & 0xfc) != 0xdc) {
			errno = EILSEQ;
			goto error;
		}
		/* combine the 10 payload bits of each surrogate */
		codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
				       (uc[0]<<10) | ((uc[1] & 0x3)<<18));

		if (out_left < 4) {
			errno = E2BIG;
			goto error;
		}
		c[0] = 0xf0 | (codepoint >> 18);
		c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
		c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
		c[3] = 0x80 | (codepoint & 0x3f);

		in_left  -= 4;
		out_left -= 4;
		uc       += 4;
		c        += 4;
	}

	if (in_left == 1) {
		errno = EINVAL;
		goto error;
	}

	if (in_left > 1) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)uc;
	*outbuf = (char *)c;

	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)uc;
	*outbuf = (char *)c;
	return -1;
}
820
821
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

  The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  Input and output are little-endian.  Returns 0 on success, or
  (size_t)-1 with errno = EINVAL (a single trailing input byte) or E2BIG
  (output buffer too small).  Cursors and counters are written back in
  both cases.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{
	size_t in_left = *inbytesleft, out_left = *outbytesleft;
	uint8_t *c = (uint8_t *)*outbuf;
	const uint8_t *uc = (const uint8_t *)*inbuf;

	while (in_left >= 2 && out_left >= 2) {
		unsigned int codepoint = uc[0] | (uc[1]<<8);

		if (codepoint == 0) {
			codepoint = 1;
		}

		if ((codepoint & 0xfc00) == 0xd800) {
			/* a high surrogate: valid only when a low surrogate
			   follows immediately */
			if (in_left >= 4 &&
			    ((uc[2] | (uc[3]<<8)) & 0xfc00) == 0xdc00) {
				if (out_left < 4) {
					errno = E2BIG;
					goto error;
				}
				/* well-formed pair: copy both units as is */
				memcpy(c, uc, 4);
				in_left  -= 4;
				out_left -= 4;
				uc       += 4;
				c        += 4;
				continue;
			}
			/* high surrogate not followed by low
			   surrogate: convert to 0xfffd */
			codepoint = 0xfffd;
		} else if ((codepoint & 0xfc00) == 0xdc00) {
			/* low surrogate not preceded by high
			   surrogate: convert to 0xfffd */
			codepoint = 0xfffd;
		}

		c[0] = codepoint & 0xFF;
		c[1] = (codepoint>>8) & 0xFF;

		in_left  -= 2;
		out_left -= 2;
		uc  += 2;
		c   += 2;
	}

	if (in_left == 1) {
		errno = EINVAL;
		goto error;
	}

	if (in_left > 1) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)uc;
	*outbuf = (char *)c;

	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)uc;
	*outbuf = (char *)c;
	return -1;
}
918
919
920

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/server/lib/util/charset/iconv.c

Download in other formats: