Changeset 745 for trunk/server/lib/util/charset/codepoints.c
- Timestamp:
- Nov 27, 2012, 4:43:17 PM (13 years ago)
- Location:
- trunk/server
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/server
- Property svn:mergeinfo changed
/vendor/current merged: 581,587,591,594,597,600,615,618,740
- Property svn:mergeinfo changed
-
trunk/server/lib/util/charset/codepoints.c
r414 r745 1 1 /* 2 2 Unix SMB/CIFS implementation. 3 Samba utility functions 4 Copyright (C) Andrew Tridgell 1992-2001 3 Character set conversion Extensions 4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001 5 Copyright (C) Andrew Tridgell 2001 5 6 Copyright (C) Simo Sorce 2001 7 Copyright (C) Jelmer Vernooij 2007 6 8 7 9 This program is free software; you can redistribute it and/or modify … … 17 19 You should have received a copy of the GNU General Public License 18 20 along with this program. If not, see <http://www.gnu.org/licenses/>. 21 19 22 */ 20 21 23 #include "includes.h" 24 #include "lib/util/charset/charset.h" 22 25 #include "system/locale.h" 23 #include "dynconfig/dynconfig.h" 26 #include "dynconfig.h" 27 28 #ifdef strcasecmp 29 #undef strcasecmp 30 #endif 24 31 25 32 /** … … 36 43 /******************************************************************* 37 44 load the case handling tables 45 46 This is the function that should be called from library code. 38 47 ********************************************************************/ 39 void load_case_tables (void)48 void load_case_tables_library(void) 40 49 { 41 50 TALLOC_CTX *mem_ctx; … … 45 54 smb_panic("No memory for case_tables"); 46 55 } 47 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);48 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);56 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000); 57 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000); 49 58 talloc_free(mem_ctx); 50 59 if (upcase_table == NULL) { 51 /* try also under codepages for testing purposes */ 52 upcase_table = map_file("codepages/upcase.dat", 0x20000); 53 if (upcase_table == NULL) { 54 upcase_table = (void *)-1; 55 } 60 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n")); 61 upcase_table = (void *)-1; 56 62 } 57 63 if 
(lowcase_table == NULL) { 58 /* try also under codepages for testing purposes */ 59 lowcase_table = map_file("codepages/lowcase.dat", 0x20000); 60 if (lowcase_table == NULL) { 61 lowcase_table = (void *)-1; 62 } 63 } 64 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n")); 65 lowcase_table = (void *)-1; 66 } 67 } 68 69 /******************************************************************* 70 load the case handling tables 71 72 This MUST only be called from main() in application code, never from a 73 library. We don't know if the calling program has already done 74 setlocale() to another value, and can't tell if they have. 75 ********************************************************************/ 76 void load_case_tables(void) 77 { 78 /* This is a useful global hook where we can ensure that the 79 * locale is set from the environment. This is needed so that 80 * we can use LOCALE as a codepage */ 81 #ifdef HAVE_SETLOCALE 82 setlocale(LC_ALL, ""); 83 #endif 84 load_case_tables_library(); 64 85 } 65 86 … … 73 94 } 74 95 if (upcase_table == NULL) { 75 load_case_tables ();96 load_case_tables_library(); 76 97 } 77 98 if (upcase_table == (void *)-1) { … … 93 114 } 94 115 if (lowcase_table == NULL) { 95 load_case_tables ();116 load_case_tables_library(); 96 117 } 97 118 if (lowcase_table == (void *)-1) { … … 102 123 } 103 124 return SVAL(lowcase_table, val*2); 125 } 126 127 /** 128 If we upper cased this character, would we get the same character? 129 **/ 130 _PUBLIC_ bool islower_m(codepoint_t val) 131 { 132 return (toupper_m(val) != val); 133 } 134 135 /** 136 If we lower cased this character, would we get the same character? 
137 **/ 138 _PUBLIC_ bool isupper_m(codepoint_t val) 139 { 140 return (tolower_m(val) != val); 104 141 } 105 142 … … 117 154 118 155 156 struct smb_iconv_convenience { 157 TALLOC_CTX *child_ctx; 158 const char *unix_charset; 159 const char *dos_charset; 160 const char *display_charset; 161 bool native_iconv; 162 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS]; 163 }; 164 165 struct smb_iconv_convenience *global_iconv_convenience = NULL; 166 167 struct smb_iconv_convenience *get_iconv_convenience(void) 168 { 169 if (global_iconv_convenience == NULL) 170 global_iconv_convenience = smb_iconv_convenience_reinit(talloc_autofree_context(), 171 "ASCII", "UTF-8", "ASCII", true, NULL); 172 return global_iconv_convenience; 173 } 174 175 /** 176 * Return the name of a charset to give to iconv(). 177 **/ 178 const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch) 179 { 180 switch (ch) { 181 case CH_UTF16: return "UTF-16LE"; 182 case CH_UNIX: return ic->unix_charset; 183 case CH_DOS: return ic->dos_charset; 184 case CH_DISPLAY: return ic->display_charset; 185 case CH_UTF8: return "UTF8"; 186 case CH_UTF16BE: return "UTF-16BE"; 187 case CH_UTF16MUNGED: return "UTF16_MUNGED"; 188 default: 189 return "ASCII"; 190 } 191 } 192 193 /** 194 re-initialize iconv conversion descriptors 195 **/ 196 static int close_iconv_convenience(struct smb_iconv_convenience *data) 197 { 198 unsigned c1, c2; 199 for (c1=0;c1<NUM_CHARSETS;c1++) { 200 for (c2=0;c2<NUM_CHARSETS;c2++) { 201 if (data->conv_handles[c1][c2] != NULL) { 202 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) { 203 smb_iconv_close(data->conv_handles[c1][c2]); 204 } 205 data->conv_handles[c1][c2] = NULL; 206 } 207 } 208 } 209 210 return 0; 211 } 212 213 static const char *map_locale(const char *charset) 214 { 215 if (strcmp(charset, "LOCALE") != 0) { 216 return charset; 217 } 218 #if defined(HAVE_NL_LANGINFO) && defined(CODESET) 219 { 220 const char *ln; 221 smb_iconv_t handle; 222 223 ln = 
nl_langinfo(CODESET); 224 if (ln == NULL) { 225 DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n")); 226 return "ASCII"; 227 } 228 /* Check whether the charset name is supported 229 by iconv */ 230 handle = smb_iconv_open(ln, "UCS-2LE"); 231 if (handle == (smb_iconv_t) -1) { 232 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln)); 233 return "ASCII"; 234 } else { 235 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln)); 236 smb_iconv_close(handle); 237 } 238 return ln; 239 } 240 #endif 241 return "ASCII"; 242 } 243 244 /* 245 the old_ic is passed in here as the smb_iconv_convenience structure 246 is used as a global pointer in some places (eg. python modules). We 247 don't want to invalidate those global pointers, but we do want to 248 update them with the right charset information when loadparm 249 runs. To do that we need to re-use the structure pointer, but 250 re-fill the elements in the structure with the updated values 251 */ 252 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx, 253 const char *dos_charset, 254 const char *unix_charset, 255 const char *display_charset, 256 bool native_iconv, 257 struct smb_iconv_convenience *old_ic) 258 { 259 struct smb_iconv_convenience *ret; 260 261 display_charset = map_locale(display_charset); 262 263 if (old_ic != NULL) { 264 ret = old_ic; 265 close_iconv_convenience(ret); 266 talloc_free(ret->child_ctx); 267 ZERO_STRUCTP(ret); 268 } else { 269 ret = talloc_zero(mem_ctx, struct smb_iconv_convenience); 270 } 271 if (ret == NULL) { 272 return NULL; 273 } 274 275 /* we use a child context to allow us to free all ptrs without 276 freeing the structure itself */ 277 ret->child_ctx = talloc_new(ret); 278 if (ret->child_ctx == NULL) { 279 return NULL; 280 } 281 282 talloc_set_destructor(ret, close_iconv_convenience); 283 284 ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset); 285 ret->unix_charset = talloc_strdup(ret->child_ctx, 
unix_charset); 286 ret->display_charset = talloc_strdup(ret->child_ctx, display_charset); 287 ret->native_iconv = native_iconv; 288 289 return ret; 290 } 291 292 /* 293 on-demand initialisation of conversion handles 294 */ 295 smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, 296 charset_t from, charset_t to) 297 { 298 const char *n1, *n2; 299 static bool initialised; 300 301 if (initialised == false) { 302 initialised = true; 303 } 304 305 if (ic->conv_handles[from][to]) { 306 return ic->conv_handles[from][to]; 307 } 308 309 n1 = charset_name(ic, from); 310 n2 = charset_name(ic, to); 311 312 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1, 313 ic->native_iconv); 314 315 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) { 316 if ((from == CH_DOS || to == CH_DOS) && 317 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) { 318 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n", 319 charset_name(ic, CH_DOS))); 320 ic->dos_charset = "ASCII"; 321 322 n1 = charset_name(ic, from); 323 n2 = charset_name(ic, to); 324 325 ic->conv_handles[from][to] = 326 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv); 327 } 328 } 329 330 return ic->conv_handles[from][to]; 331 } 332 333 /** 334 * Return the unicode codepoint for the next character in the input 335 * string in the given src_charset. 336 * The unicode codepoint (codepoint_t) is an unsigned 32 bit value. 337 * 338 * Also return the number of bytes consumed (which tells the caller 339 * how many bytes to skip to get to the next src_charset-character). 340 * 341 * This is implemented (in the non-ascii-case) by first converting the 342 * next character in the input string to UTF16_LE and then calculating 343 * the unicode codepoint from that. 344 * 345 * Return INVALID_CODEPOINT if the next character cannot be converted. 
346 */ 347 _PUBLIC_ codepoint_t next_codepoint_convenience_ext( 348 struct smb_iconv_convenience *ic, 349 const char *str, charset_t src_charset, 350 size_t *bytes_consumed) 351 { 352 /* it cannot occupy more than 4 bytes in UTF16 format */ 353 uint8_t buf[4]; 354 smb_iconv_t descriptor; 355 size_t ilen_orig; 356 size_t ilen; 357 size_t olen; 358 char *outbuf; 359 360 if ((str[0] & 0x80) == 0) { 361 *bytes_consumed = 1; 362 return (codepoint_t)str[0]; 363 } 364 365 /* 366 * we assume that no multi-byte character can take more than 5 bytes. 367 * This is OK as we only support codepoints up to 1M (U+100000) 368 */ 369 ilen_orig = strnlen(str, 5); 370 ilen = ilen_orig; 371 372 descriptor = get_conv_handle(ic, src_charset, CH_UTF16); 373 if (descriptor == (smb_iconv_t)-1) { 374 *bytes_consumed = 1; 375 return INVALID_CODEPOINT; 376 } 377 378 /* 379 * this looks a little strange, but it is needed to cope with 380 * codepoints above 64k (U+10000) which are encoded as per RFC2781. 381 */ 382 olen = 2; 383 outbuf = (char *)buf; 384 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); 385 if (olen == 2) { 386 olen = 4; 387 outbuf = (char *)buf; 388 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); 389 if (olen == 4) { 390 /* we didn't convert any bytes */ 391 *bytes_consumed = 1; 392 return INVALID_CODEPOINT; 393 } 394 olen = 4 - olen; 395 } else { 396 olen = 2 - olen; 397 } 398 399 *bytes_consumed = ilen_orig - ilen; 400 401 if (olen == 2) { 402 return (codepoint_t)SVAL(buf, 0); 403 } 404 if (olen == 4) { 405 /* decode a 4 byte UTF16 character manually */ 406 return (codepoint_t)0x10000 + 407 (buf[2] | ((buf[3] & 0x3)<<8) | 408 (buf[0]<<10) | ((buf[1] & 0x3)<<18)); 409 } 410 411 /* no other length is valid */ 412 return INVALID_CODEPOINT; 413 } 414 415 /* 416 return the unicode codepoint for the next multi-byte CH_UNIX character 417 in the string 418 419 also return the number of bytes consumed (which tells the caller 420 how many bytes to skip to get to the next CH_UNIX 
character) 421 422 return INVALID_CODEPOINT if the next character cannot be converted 423 */ 424 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic, 425 const char *str, size_t *size) 426 { 427 return next_codepoint_convenience_ext(ic, str, CH_UNIX, size); 428 } 429 430 /* 431 push a single codepoint into a CH_UNIX string the target string must 432 be able to hold the full character, which is guaranteed if it is at 433 least 5 bytes in size. The caller may pass less than 5 bytes if they 434 are sure the character will fit (for example, you can assume that 435 uppercase/lowercase of a character will not add more than 1 byte) 436 437 return the number of bytes occupied by the CH_UNIX character, or 438 -1 on failure 439 */ 440 _PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic, 441 char *str, codepoint_t c) 442 { 443 smb_iconv_t descriptor; 444 uint8_t buf[4]; 445 size_t ilen, olen; 446 const char *inbuf; 447 448 if (c < 128) { 449 *str = c; 450 return 1; 451 } 452 453 descriptor = get_conv_handle(ic, 454 CH_UTF16, CH_UNIX); 455 if (descriptor == (smb_iconv_t)-1) { 456 return -1; 457 } 458 459 if (c < 0x10000) { 460 ilen = 2; 461 olen = 5; 462 inbuf = (char *)buf; 463 SSVAL(buf, 0, c); 464 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); 465 if (ilen != 0) { 466 return -1; 467 } 468 return 5 - olen; 469 } 470 471 c -= 0x10000; 472 473 buf[0] = (c>>10) & 0xFF; 474 buf[1] = (c>>18) | 0xd8; 475 buf[2] = c & 0xFF; 476 buf[3] = ((c>>8) & 0x3) | 0xdc; 477 478 ilen = 4; 479 olen = 5; 480 inbuf = (char *)buf; 481 482 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); 483 if (ilen != 0) { 484 return -1; 485 } 486 return 5 - olen; 487 } 488 489 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, 490 size_t *size) 491 { 492 return next_codepoint_convenience_ext(get_iconv_convenience(), str, 493 src_charset, size); 494 } 495 496 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size) 
497 { 498 return next_codepoint_convenience(get_iconv_convenience(), str, size); 499 } 500 501 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c) 502 { 503 return push_codepoint_convenience(get_iconv_convenience(), str, c); 504 }
Note:
See TracChangeset
for help on using the changeset viewer.