Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

codepoints.c

Visit:

Last change on this file was 1052, checked in by Silvan Scherrer, 8 years ago
fix umlaut issues in pathnames ticket #319
File size: 14.2 KB

Line
1	/*
2	Unix SMB/CIFS implementation.
3	Character set conversion Extensions
4	Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5	Copyright (C) Andrew Tridgell 2001
6	Copyright (C) Simo Sorce 2001
7	Copyright (C) Jelmer Vernooij 2007
8
9	This program is free software; you can redistribute it and/or modify
10	it under the terms of the GNU General Public License as published by
11	the Free Software Foundation; either version 3 of the License, or
12	(at your option) any later version.
13
14	This program is distributed in the hope that it will be useful,
15	but WITHOUT ANY WARRANTY; without even the implied warranty of
16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	GNU General Public License for more details.
18
19	You should have received a copy of the GNU General Public License
20	along with this program. If not, see <http://www.gnu.org/licenses/>.
21
22	*/
23	#include "includes.h"
24	#include "lib/util/charset/charset.h"
25	#include "system/locale.h"
26	#include "dynconfig.h"
27
28	#ifdef strcasecmp
29	#undef strcasecmp
30	#endif
31
32	/**
33	* @file
34	* @brief Unicode string manipulation
35	*/
36
/*
 * The two tables that define unicode case handling. They are loaded
 * either via mmap() or read() from the lib directory.
 *
 * NULL means "not loaded yet"; (void *)-1 means "loading failed, use
 * ASCII-only case rules".
 */
static void *upcase_table = NULL;
static void *lowcase_table = NULL;
41
42
43	/*******************************************************************
44	load the case handling tables
45
46	This is the function that should be called from library code.
47	********************************************************************/
48	void load_case_tables_library(void)
49	{
50	TALLOC_CTX *mem_ctx;
51
52	mem_ctx = talloc_init("load_case_tables");
53	if (!mem_ctx) {
54	smb_panic("No memory for case_tables");
55	}
56	upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
57	lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
58	talloc_free(mem_ctx);
59	if (upcase_table == NULL) {
60	DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
61	upcase_table = (void *)-1;
62	}
63	if (lowcase_table == NULL) {
64	DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
65	lowcase_table = (void *)-1;
66	}
67	}
68
/*******************************************************************
 load the case handling tables (application entry point)

 Only main() of an application may call this, a library never must:
 it calls setlocale(), and a library cannot know whether the calling
 program has already selected a different locale.
********************************************************************/
void load_case_tables(void)
{
#ifdef HAVE_SETLOCALE
	/*
	 * Global hook that takes the locale from the environment, so
	 * that LOCALE can be used as a codepage.
	 */
	setlocale(LC_ALL, "");
#endif
	load_case_tables_library();
}
86
87	/**
88	Convert a codepoint_t to upper case.
89	**/
90	_PUBLIC_ codepoint_t toupper_m(codepoint_t val)
91	{
92	if (val < 128) {
93	return toupper(val);
94	}
95	if (upcase_table == NULL) {
96	load_case_tables_library();
97	}
98	if (upcase_table == (void *)-1) {
99	return val;
100	}
101	if (val & 0xFFFF0000) {
102	return val;
103	}
104	return SVAL(upcase_table, val*2);
105	}
106
107	/**
108	Convert a codepoint_t to lower case.
109	**/
110	_PUBLIC_ codepoint_t tolower_m(codepoint_t val)
111	{
112	if (val < 128) {
113	return tolower(val);
114	}
115	if (lowcase_table == NULL) {
116	load_case_tables_library();
117	}
118	if (lowcase_table == (void *)-1) {
119	return val;
120	}
121	if (val & 0xFFFF0000) {
122	return val;
123	}
124	return SVAL(lowcase_table, val*2);
125	}
126
127	/**
128	If we upper cased this character, would we get the same character?
129	**/
130	_PUBLIC_ bool islower_m(codepoint_t val)
131	{
132	return (toupper_m(val) != val);
133	}
134
135	/**
136	If we lower cased this character, would we get the same character?
137	**/
138	_PUBLIC_ bool isupper_m(codepoint_t val)
139	{
140	return (tolower_m(val) != val);
141	}
142
143	/**
144	compare two codepoints case insensitively
145	*/
146	_PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
147	{
148	if (c1 == c2 \|\|
149	toupper_m(c1) == toupper_m(c2)) {
150	return 0;
151	}
152	return c1 - c2;
153	}
154
155
156	struct smb_iconv_convenience {
157	TALLOC_CTX *child_ctx;
158	const char *unix_charset;
159	const char *dos_charset;
160	const char *display_charset;
161	bool native_iconv;
162	smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
163	};
164
/* conversion context shared by the convenience-less wrappers below */
struct smb_iconv_convenience *global_iconv_convenience = NULL;

/**
 * Return the global conversion context, creating it on first use.
 *
 * On OS/2 the charsets come from the lp_*_charset() settings,
 * elsewhere the defaults "ASCII" (dos), "UTF-8" (unix) and
 * "ASCII" (display) are used.
 **/
struct smb_iconv_convenience *get_iconv_convenience(void)
{
	if (global_iconv_convenience != NULL) {
		return global_iconv_convenience;
	}

#ifdef __OS2__
	global_iconv_convenience = smb_iconv_convenience_reinit(
		talloc_autofree_context(),
		lp_dos_charset(), lp_unix_charset(), lp_display_charset(),
		true, NULL);
#else
	global_iconv_convenience = smb_iconv_convenience_reinit(
		talloc_autofree_context(),
		"ASCII", "UTF-8", "ASCII", true, NULL);
#endif

	return global_iconv_convenience;
}
178
179	/**
180	* Return the name of a charset to give to iconv().
181	**/
182	const char charset_name(struct smb_iconv_convenience ic, charset_t ch)
183	{
184	switch (ch) {
185	case CH_UTF16: return "UTF-16LE";
186	case CH_UNIX: return ic->unix_charset;
187	case CH_DOS: return ic->dos_charset;
188	case CH_DISPLAY: return ic->display_charset;
189	case CH_UTF8: return "UTF8";
190	case CH_UTF16BE: return "UTF-16BE";
191	case CH_UTF16MUNGED: return "UTF16_MUNGED";
192	default:
193	return "ASCII";
194	}
195	}
196
197	/**
198	re-initialize iconv conversion descriptors
199	**/
200	static int close_iconv_convenience(struct smb_iconv_convenience *data)
201	{
202	unsigned c1, c2;
203	for (c1=0;c1<NUM_CHARSETS;c1++) {
204	for (c2=0;c2<NUM_CHARSETS;c2++) {
205	if (data->conv_handles[c1][c2] != NULL) {
206	if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
207	smb_iconv_close(data->conv_handles[c1][c2]);
208	}
209	data->conv_handles[c1][c2] = NULL;
210	}
211	}
212	}
213
214	return 0;
215	}
216
/*
  Resolve the pseudo charset name "LOCALE".

  Any other name is returned unchanged. "LOCALE" is replaced by the
  codeset reported by nl_langinfo(CODESET), provided that iconv can
  open it; in every other case "ASCII" is returned.
*/
static const char *map_locale(const char *charset)
{
	if (strcmp(charset, "LOCALE") != 0) {
		return charset;
	}
#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
	{
		const char *ln;
		smb_iconv_t handle;

		ln = nl_langinfo(CODESET);
		if (ln == NULL) {
			DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
			return "ASCII";
		}
		/* Check whether the charset name is supported
		   by iconv */
		handle = smb_iconv_open(ln, "UCS-2LE");
		if (handle == (smb_iconv_t) -1) {
			DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
			return "ASCII";
		} else {
			DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
			smb_iconv_close(handle);
		}
		return ln;
	}
#endif
	return "ASCII";
}
247
248	/*
249	the old_ic is passed in here as the smb_iconv_convenience structure
250	is used as a global pointer in some places (eg. python modules). We
251	don't want to invalidate those global pointers, but we do want to
252	update them with the right charset information when loadparm
253	runs. To do that we need to re-use the structure pointer, but
254	re-fill the elements in the structure with the updated values
255	*/
256	_PUBLIC_ struct smb_iconv_convenience smb_iconv_convenience_reinit(TALLOC_CTX mem_ctx,
257	const char *dos_charset,
258	const char *unix_charset,
259	const char *display_charset,
260	bool native_iconv,
261	struct smb_iconv_convenience *old_ic)
262	{
263	struct smb_iconv_convenience *ret;
264
265	display_charset = map_locale(display_charset);
266
267	if (old_ic != NULL) {
268	ret = old_ic;
269	close_iconv_convenience(ret);
270	talloc_free(ret->child_ctx);
271	ZERO_STRUCTP(ret);
272	} else {
273	ret = talloc_zero(mem_ctx, struct smb_iconv_convenience);
274	}
275	if (ret == NULL) {
276	return NULL;
277	}
278
279	/* we use a child context to allow us to free all ptrs without
280	freeing the structure itself */
281	ret->child_ctx = talloc_new(ret);
282	if (ret->child_ctx == NULL) {
283	return NULL;
284	}
285
286	talloc_set_destructor(ret, close_iconv_convenience);
287
288	ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
289	ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
290	ret->display_charset = talloc_strdup(ret->child_ctx, display_charset);
291	ret->native_iconv = native_iconv;
292
293	return ret;
294	}
295
296	/*
297	on-demand initialisation of conversion handles
298	*/
299	smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
300	charset_t from, charset_t to)
301	{
302	const char n1, n2;
303	static bool initialised;
304
305	if (initialised == false) {
306	initialised = true;
307	}
308
309	if (ic->conv_handles[from][to]) {
310	return ic->conv_handles[from][to];
311	}
312
313	n1 = charset_name(ic, from);
314	n2 = charset_name(ic, to);
315
316	ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
317	ic->native_iconv);
318
319	if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
320	if ((from == CH_DOS \|\| to == CH_DOS) &&
321	strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
322	DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
323	charset_name(ic, CH_DOS)));
324	ic->dos_charset = "ASCII";
325
326	n1 = charset_name(ic, from);
327	n2 = charset_name(ic, to);
328
329	ic->conv_handles[from][to] =
330	smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
331	}
332	}
333
334	return ic->conv_handles[from][to];
335	}
336
337	/**
338	* Return the unicode codepoint for the next character in the input
339	* string in the given src_charset.
340	* The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
341	*
342	* Also return the number of bytes consumed (which tells the caller
343	* how many bytes to skip to get to the next src_charset-character).
344	*
345	* This is implemented (in the non-ascii-case) by first converting the
346	* next character in the input string to UTF16_LE and then calculating
347	* the unicode codepoint from that.
348	*
349	* Return INVALID_CODEPOINT if the next character cannot be converted.
350	*/
351	_PUBLIC_ codepoint_t next_codepoint_convenience_ext(
352	struct smb_iconv_convenience *ic,
353	const char *str, charset_t src_charset,
354	size_t *bytes_consumed)
355	{
356	/* it cannot occupy more than 4 bytes in UTF16 format */
357	uint8_t buf[4];
358	smb_iconv_t descriptor;
359	size_t ilen_orig;
360	#ifdef __OS2__
361	size_t ilen_max;
362	size_t olen_orig;
363	const char *inbuf;
364	#endif
365	size_t ilen;
366	size_t olen;
367	char *outbuf;
368
369	if ((str[0] & 0x80) == 0) {
370	*bytes_consumed = 1;
371	return (codepoint_t)str[0];
372	}
373
374	/*
375	* we assume that no multi-byte character can take more than 5 bytes.
376	* This is OK as we only support codepoints up to 1M (U+100000)
377	*/
378	ilen_orig = strnlen(str, 5);
379	#ifdef __OS2__
380	ilen_max = strnlen(str, 5);
381	*bytes_consumed = 1;
382	#endif
383	ilen = ilen_orig;
384
385	descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
386	if (descriptor == (smb_iconv_t)-1) {
387	*bytes_consumed = 1;
388	return INVALID_CODEPOINT;
389	}
390
391	#ifdef __OS2__
392	ilen_orig = 1;
393	olen_orig = 2;
394	while( 1 )
395	{
396	ilen = ilen_orig;
397	olen = olen_orig;
398	inbuf = str;
399	outbuf = ( char * )buf;
400	if( smb_iconv( descriptor, &inbuf, &ilen, &outbuf, &olen ) != ( size_t )-1 )
401	break;
402
403	switch( errno )
404	{
405	case E2BIG :
406	if( olen_orig == 2 )
407	olen_orig = 4;
408	else
409	return INVALID_CODEPOINT;
410	break;
411
412
413	case EINVAL :
414	if( ilen_orig < ilen_max )
415	ilen_orig++;
416	else
417	return INVALID_CODEPOINT;
418	break;
419
420	case EILSEQ :
421	default :
422	return INVALID_CODEPOINT;
423	}
424	}
425	olen = olen_orig - olen;
426	#else
427	/*
428	* this looks a little strange, but it is needed to cope with
429	* codepoints above 64k (U+1000) which are encoded as per RFC2781.
430	*/
431	olen = 2;
432	outbuf = (char *)buf;
433	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
434	if (olen == 2) {
435	olen = 4;
436	outbuf = (char *)buf;
437	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
438	if (olen == 4) {
439	/* we didn't convert any bytes */
440	*bytes_consumed = 1;
441	return INVALID_CODEPOINT;
442	}
443	olen = 4 - olen;
444	} else {
445	olen = 2 - olen;
446	}
447	#endif
448
449	*bytes_consumed = ilen_orig - ilen;
450
451	if (olen == 2) {
452	return (codepoint_t)SVAL(buf, 0);
453	}
454	if (olen == 4) {
455	/* decode a 4 byte UTF16 character manually */
456	return (codepoint_t)0x10000 +
457	(buf[2] \| ((buf[3] & 0x3)<<8) \|
458	(buf[0]<<10) \| ((buf[1] & 0x3)<<18));
459	}
460
461	/* no other length is valid */
462	return INVALID_CODEPOINT;
463	}
464
465	/*
466	return the unicode codepoint for the next multi-byte CH_UNIX character
467	in the string
468
469	also return the number of bytes consumed (which tells the caller
470	how many bytes to skip to get to the next CH_UNIX character)
471
472	return INVALID_CODEPOINT if the next character cannot be converted
473	*/
474	_PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
475	const char str, size_t size)
476	{
477	return next_codepoint_convenience_ext(ic, str, CH_UNIX, size);
478	}
479
480	/*
481	push a single codepoint into a CH_UNIX string the target string must
482	be able to hold the full character, which is guaranteed if it is at
483	least 5 bytes in size. The caller may pass less than 5 bytes if they
484	are sure the character will fit (for example, you can assume that
485	uppercase/lowercase of a character will not add more than 1 byte)
486
487	return the number of bytes occupied by the CH_UNIX character, or
488	-1 on failure
489	*/
490	_PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,
491	char *str, codepoint_t c)
492	{
493	smb_iconv_t descriptor;
494	uint8_t buf[4];
495	size_t ilen, olen;
496	const char *inbuf;
497
498	if (c < 128) {
499	*str = c;
500	return 1;
501	}
502
503	descriptor = get_conv_handle(ic,
504	CH_UTF16, CH_UNIX);
505	if (descriptor == (smb_iconv_t)-1) {
506	return -1;
507	}
508
509	if (c < 0x10000) {
510	ilen = 2;
511	olen = 5;
512	inbuf = (char *)buf;
513	SSVAL(buf, 0, c);
514	smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
515	if (ilen != 0) {
516	return -1;
517	}
518	return 5 - olen;
519	}
520
521	c -= 0x10000;
522
523	buf[0] = (c>>10) & 0xFF;
524	buf[1] = (c>>18) \| 0xd8;
525	buf[2] = c & 0xFF;
526	buf[3] = ((c>>8) & 0x3) \| 0xdc;
527
528	ilen = 4;
529	olen = 5;
530	inbuf = (char *)buf;
531
532	smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
533	if (ilen != 0) {
534	return -1;
535	}
536	return 5 - olen;
537	}
538
539	_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
540	size_t *size)
541	{
542	return next_codepoint_convenience_ext(get_iconv_convenience(), str,
543	src_charset, size);
544	}
545
546	_PUBLIC_ codepoint_t next_codepoint(const char str, size_t size)
547	{
548	return next_codepoint_convenience(get_iconv_convenience(), str, size);
549	}
550
551	_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
552	{
553	return push_codepoint_convenience(get_iconv_convenience(), str, c);
554	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/server/lib/util/charset/codepoints.c

Download in other formats: