source: trunk/server/lib/util/charset/codepoints.c

Last change on this file was 1052, checked in by Silvan Scherrer, 8 years ago

fix umlaut issues in pathnames ticket #319

File size: 14.2 KB
Line 
1/*
2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
21
22*/
23#include "includes.h"
24#include "lib/util/charset/charset.h"
25#include "system/locale.h"
26#include "dynconfig.h"
27
28#ifdef strcasecmp
29#undef strcasecmp
30#endif
31
32/**
33 * @file
34 * @brief Unicode string manipulation
35 */
36
37/* these 2 tables define the unicode case handling. They are loaded
38 at startup either via mmap() or read() from the lib directory */
39static void *upcase_table;
40static void *lowcase_table;
41
42
43/*******************************************************************
44load the case handling tables
45
46This is the function that should be called from library code.
47********************************************************************/
48void load_case_tables_library(void)
49{
50 TALLOC_CTX *mem_ctx;
51
52 mem_ctx = talloc_init("load_case_tables");
53 if (!mem_ctx) {
54 smb_panic("No memory for case_tables");
55 }
56 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
57 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
58 talloc_free(mem_ctx);
59 if (upcase_table == NULL) {
60 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
61 upcase_table = (void *)-1;
62 }
63 if (lowcase_table == NULL) {
64 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
65 lowcase_table = (void *)-1;
66 }
67}
68
/*******************************************************************
Load the unicode case handling tables (application entry point).

Call this only from main() of an application, never from a library:
it changes the process locale, and a library cannot know whether the
host program has already chosen a different one via setlocale().
********************************************************************/
void load_case_tables(void)
{
#ifdef HAVE_SETLOCALE
	/*
	 * Global hook: take the locale from the environment so that
	 * "LOCALE" can later be used as a codepage name.
	 */
	setlocale(LC_ALL, "");
#endif

	load_case_tables_library();
}
86
87/**
88 Convert a codepoint_t to upper case.
89**/
90_PUBLIC_ codepoint_t toupper_m(codepoint_t val)
91{
92 if (val < 128) {
93 return toupper(val);
94 }
95 if (upcase_table == NULL) {
96 load_case_tables_library();
97 }
98 if (upcase_table == (void *)-1) {
99 return val;
100 }
101 if (val & 0xFFFF0000) {
102 return val;
103 }
104 return SVAL(upcase_table, val*2);
105}
106
107/**
108 Convert a codepoint_t to lower case.
109**/
110_PUBLIC_ codepoint_t tolower_m(codepoint_t val)
111{
112 if (val < 128) {
113 return tolower(val);
114 }
115 if (lowcase_table == NULL) {
116 load_case_tables_library();
117 }
118 if (lowcase_table == (void *)-1) {
119 return val;
120 }
121 if (val & 0xFFFF0000) {
122 return val;
123 }
124 return SVAL(lowcase_table, val*2);
125}
126
127/**
128 If we upper cased this character, would we get the same character?
129**/
130_PUBLIC_ bool islower_m(codepoint_t val)
131{
132 return (toupper_m(val) != val);
133}
134
135/**
136 If we lower cased this character, would we get the same character?
137**/
138_PUBLIC_ bool isupper_m(codepoint_t val)
139{
140 return (tolower_m(val) != val);
141}
142
143/**
144 compare two codepoints case insensitively
145*/
146_PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
147{
148 if (c1 == c2 ||
149 toupper_m(c1) == toupper_m(c2)) {
150 return 0;
151 }
152 return c1 - c2;
153}
154
155
156struct smb_iconv_convenience {
157 TALLOC_CTX *child_ctx;
158 const char *unix_charset;
159 const char *dos_charset;
160 const char *display_charset;
161 bool native_iconv;
162 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
163};
164
165struct smb_iconv_convenience *global_iconv_convenience = NULL;
166
167struct smb_iconv_convenience *get_iconv_convenience(void)
168{
169 if (global_iconv_convenience == NULL)
170 global_iconv_convenience = smb_iconv_convenience_reinit(talloc_autofree_context(),
171#ifdef __OS2__
172 lp_dos_charset(), lp_unix_charset(), lp_display_charset(), true, NULL);
173#else
174 "ASCII", "UTF-8", "ASCII", true, NULL);
175#endif
176 return global_iconv_convenience;
177}
178
179/**
180 * Return the name of a charset to give to iconv().
181 **/
182const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
183{
184 switch (ch) {
185 case CH_UTF16: return "UTF-16LE";
186 case CH_UNIX: return ic->unix_charset;
187 case CH_DOS: return ic->dos_charset;
188 case CH_DISPLAY: return ic->display_charset;
189 case CH_UTF8: return "UTF8";
190 case CH_UTF16BE: return "UTF-16BE";
191 case CH_UTF16MUNGED: return "UTF16_MUNGED";
192 default:
193 return "ASCII";
194 }
195}
196
197/**
198 re-initialize iconv conversion descriptors
199**/
200static int close_iconv_convenience(struct smb_iconv_convenience *data)
201{
202 unsigned c1, c2;
203 for (c1=0;c1<NUM_CHARSETS;c1++) {
204 for (c2=0;c2<NUM_CHARSETS;c2++) {
205 if (data->conv_handles[c1][c2] != NULL) {
206 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
207 smb_iconv_close(data->conv_handles[c1][c2]);
208 }
209 data->conv_handles[c1][c2] = NULL;
210 }
211 }
212 }
213
214 return 0;
215}
216
/**
 * Resolve the pseudo charset name "LOCALE".
 *
 * Any other name is returned untouched. For "LOCALE" the codeset of
 * the current locale is queried with nl_langinfo(CODESET) (when the
 * platform provides it) and returned if smb_iconv_open() accepts it
 * together with "UCS-2LE"; in every other case the result is "ASCII".
 *
 * The returned pointer is either the argument, a string literal, or
 * the buffer returned by nl_langinfo(); it is not allocated here.
 **/
static const char *map_locale(const char *charset)
{
	if (strcmp(charset, "LOCALE") != 0) {
		return charset;
	}

#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
	{
		const char *codeset = nl_langinfo(CODESET);
		smb_iconv_t probe;

		if (codeset == NULL) {
			DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
			return "ASCII";
		}

		/* make sure iconv actually knows this charset name */
		probe = smb_iconv_open(codeset, "UCS-2LE");
		if (probe == (smb_iconv_t) -1) {
			DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", codeset));
			return "ASCII";
		}

		DEBUG(5,("Substituting charset '%s' for LOCALE\n", codeset));
		smb_iconv_close(probe);
		return codeset;
	}
#else
	return "ASCII";
#endif
}
247
248/*
249 the old_ic is passed in here as the smb_iconv_convenience structure
250 is used as a global pointer in some places (eg. python modules). We
251 don't want to invalidate those global pointers, but we do want to
252 update them with the right charset information when loadparm
253 runs. To do that we need to re-use the structure pointer, but
254 re-fill the elements in the structure with the updated values
255 */
256_PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx,
257 const char *dos_charset,
258 const char *unix_charset,
259 const char *display_charset,
260 bool native_iconv,
261 struct smb_iconv_convenience *old_ic)
262{
263 struct smb_iconv_convenience *ret;
264
265 display_charset = map_locale(display_charset);
266
267 if (old_ic != NULL) {
268 ret = old_ic;
269 close_iconv_convenience(ret);
270 talloc_free(ret->child_ctx);
271 ZERO_STRUCTP(ret);
272 } else {
273 ret = talloc_zero(mem_ctx, struct smb_iconv_convenience);
274 }
275 if (ret == NULL) {
276 return NULL;
277 }
278
279 /* we use a child context to allow us to free all ptrs without
280 freeing the structure itself */
281 ret->child_ctx = talloc_new(ret);
282 if (ret->child_ctx == NULL) {
283 return NULL;
284 }
285
286 talloc_set_destructor(ret, close_iconv_convenience);
287
288 ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
289 ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
290 ret->display_charset = talloc_strdup(ret->child_ctx, display_charset);
291 ret->native_iconv = native_iconv;
292
293 return ret;
294}
295
296/*
297 on-demand initialisation of conversion handles
298*/
299smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
300 charset_t from, charset_t to)
301{
302 const char *n1, *n2;
303 static bool initialised;
304
305 if (initialised == false) {
306 initialised = true;
307 }
308
309 if (ic->conv_handles[from][to]) {
310 return ic->conv_handles[from][to];
311 }
312
313 n1 = charset_name(ic, from);
314 n2 = charset_name(ic, to);
315
316 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
317 ic->native_iconv);
318
319 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
320 if ((from == CH_DOS || to == CH_DOS) &&
321 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
322 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
323 charset_name(ic, CH_DOS)));
324 ic->dos_charset = "ASCII";
325
326 n1 = charset_name(ic, from);
327 n2 = charset_name(ic, to);
328
329 ic->conv_handles[from][to] =
330 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
331 }
332 }
333
334 return ic->conv_handles[from][to];
335}
336
337/**
338 * Return the unicode codepoint for the next character in the input
339 * string in the given src_charset.
340 * The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
341 *
342 * Also return the number of bytes consumed (which tells the caller
343 * how many bytes to skip to get to the next src_charset-character).
344 *
345 * This is implemented (in the non-ascii-case) by first converting the
346 * next character in the input string to UTF16_LE and then calculating
347 * the unicode codepoint from that.
348 *
349 * Return INVALID_CODEPOINT if the next character cannot be converted.
350 */
351_PUBLIC_ codepoint_t next_codepoint_convenience_ext(
352 struct smb_iconv_convenience *ic,
353 const char *str, charset_t src_charset,
354 size_t *bytes_consumed)
355{
356 /* it cannot occupy more than 4 bytes in UTF16 format */
357 uint8_t buf[4];
358 smb_iconv_t descriptor;
359 size_t ilen_orig;
360#ifdef __OS2__
361 size_t ilen_max;
362 size_t olen_orig;
363 const char *inbuf;
364#endif
365 size_t ilen;
366 size_t olen;
367 char *outbuf;
368
369 if ((str[0] & 0x80) == 0) {
370 *bytes_consumed = 1;
371 return (codepoint_t)str[0];
372 }
373
374 /*
375 * we assume that no multi-byte character can take more than 5 bytes.
376 * This is OK as we only support codepoints up to 1M (U+100000)
377 */
378 ilen_orig = strnlen(str, 5);
379#ifdef __OS2__
380 ilen_max = strnlen(str, 5);
381 *bytes_consumed = 1;
382#endif
383 ilen = ilen_orig;
384
385 descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
386 if (descriptor == (smb_iconv_t)-1) {
387 *bytes_consumed = 1;
388 return INVALID_CODEPOINT;
389 }
390
391#ifdef __OS2__
392 ilen_orig = 1;
393 olen_orig = 2;
394 while( 1 )
395 {
396 ilen = ilen_orig;
397 olen = olen_orig;
398 inbuf = str;
399 outbuf = ( char * )buf;
400 if( smb_iconv( descriptor, &inbuf, &ilen, &outbuf, &olen ) != ( size_t )-1 )
401 break;
402
403 switch( errno )
404 {
405 case E2BIG :
406 if( olen_orig == 2 )
407 olen_orig = 4;
408 else
409 return INVALID_CODEPOINT;
410 break;
411
412
413 case EINVAL :
414 if( ilen_orig < ilen_max )
415 ilen_orig++;
416 else
417 return INVALID_CODEPOINT;
418 break;
419
420 case EILSEQ :
421 default :
422 return INVALID_CODEPOINT;
423 }
424 }
425 olen = olen_orig - olen;
426#else
427 /*
428 * this looks a little strange, but it is needed to cope with
429 * codepoints above 64k (U+1000) which are encoded as per RFC2781.
430 */
431 olen = 2;
432 outbuf = (char *)buf;
433 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
434 if (olen == 2) {
435 olen = 4;
436 outbuf = (char *)buf;
437 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
438 if (olen == 4) {
439 /* we didn't convert any bytes */
440 *bytes_consumed = 1;
441 return INVALID_CODEPOINT;
442 }
443 olen = 4 - olen;
444 } else {
445 olen = 2 - olen;
446 }
447#endif
448
449 *bytes_consumed = ilen_orig - ilen;
450
451 if (olen == 2) {
452 return (codepoint_t)SVAL(buf, 0);
453 }
454 if (olen == 4) {
455 /* decode a 4 byte UTF16 character manually */
456 return (codepoint_t)0x10000 +
457 (buf[2] | ((buf[3] & 0x3)<<8) |
458 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
459 }
460
461 /* no other length is valid */
462 return INVALID_CODEPOINT;
463}
464
465/*
466 return the unicode codepoint for the next multi-byte CH_UNIX character
467 in the string
468
469 also return the number of bytes consumed (which tells the caller
470 how many bytes to skip to get to the next CH_UNIX character)
471
472 return INVALID_CODEPOINT if the next character cannot be converted
473*/
474_PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
475 const char *str, size_t *size)
476{
477 return next_codepoint_convenience_ext(ic, str, CH_UNIX, size);
478}
479
480/*
481 push a single codepoint into a CH_UNIX string the target string must
482 be able to hold the full character, which is guaranteed if it is at
483 least 5 bytes in size. The caller may pass less than 5 bytes if they
484 are sure the character will fit (for example, you can assume that
485 uppercase/lowercase of a character will not add more than 1 byte)
486
487 return the number of bytes occupied by the CH_UNIX character, or
488 -1 on failure
489*/
490_PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,
491 char *str, codepoint_t c)
492{
493 smb_iconv_t descriptor;
494 uint8_t buf[4];
495 size_t ilen, olen;
496 const char *inbuf;
497
498 if (c < 128) {
499 *str = c;
500 return 1;
501 }
502
503 descriptor = get_conv_handle(ic,
504 CH_UTF16, CH_UNIX);
505 if (descriptor == (smb_iconv_t)-1) {
506 return -1;
507 }
508
509 if (c < 0x10000) {
510 ilen = 2;
511 olen = 5;
512 inbuf = (char *)buf;
513 SSVAL(buf, 0, c);
514 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
515 if (ilen != 0) {
516 return -1;
517 }
518 return 5 - olen;
519 }
520
521 c -= 0x10000;
522
523 buf[0] = (c>>10) & 0xFF;
524 buf[1] = (c>>18) | 0xd8;
525 buf[2] = c & 0xFF;
526 buf[3] = ((c>>8) & 0x3) | 0xdc;
527
528 ilen = 4;
529 olen = 5;
530 inbuf = (char *)buf;
531
532 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
533 if (ilen != 0) {
534 return -1;
535 }
536 return 5 - olen;
537}
538
539_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
540 size_t *size)
541{
542 return next_codepoint_convenience_ext(get_iconv_convenience(), str,
543 src_charset, size);
544}
545
546_PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
547{
548 return next_codepoint_convenience(get_iconv_convenience(), str, size);
549}
550
551_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
552{
553 return push_codepoint_convenience(get_iconv_convenience(), str, c);
554}
Note: See TracBrowser for help on using the repository browser.