Changeset 745 for trunk/server/lib/util/charset
- Timestamp:
- Nov 27, 2012, 4:43:17 PM (13 years ago)
- Location:
- trunk/server
- Files:
-
- 2 deleted
- 8 edited
- 2 copied
Legend:
- Unmodified
- Added
- Removed
-
trunk/server
- Property svn:mergeinfo changed
/vendor/current merged: 581,587,591,594,597,600,615,618,740
- Property svn:mergeinfo changed
-
trunk/server/lib/util/charset/charcnv.c
r414 r745 39 39 */ 40 40 41 struct smb_iconv_convenience {42 const char *unix_charset;43 const char *dos_charset;44 bool native_iconv;45 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];46 };47 48 49 /**50 * Return the name of a charset to give to iconv().51 **/52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)53 {54 switch (ch) {55 case CH_UTF16: return "UTF-16LE";56 case CH_UNIX: return ic->unix_charset;57 case CH_DOS: return ic->dos_charset;58 case CH_UTF8: return "UTF8";59 case CH_UTF16BE: return "UTF-16BE";60 case CH_UTF16MUNGED: return "UTF16_MUNGED";61 default:62 return "ASCII";63 }64 }65 66 /**67 re-initialize iconv conversion descriptors68 **/69 static int close_iconv_convenience(struct smb_iconv_convenience *data)70 {71 unsigned c1, c2;72 for (c1=0;c1<NUM_CHARSETS;c1++) {73 for (c2=0;c2<NUM_CHARSETS;c2++) {74 if (data->conv_handles[c1][c2] != NULL) {75 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {76 smb_iconv_close(data->conv_handles[c1][c2]);77 }78 data->conv_handles[c1][c2] = NULL;79 }80 }81 }82 83 return 0;84 }85 86 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,87 const char *dos_charset,88 const char *unix_charset,89 bool native_iconv)90 {91 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,92 struct smb_iconv_convenience);93 94 if (ret == NULL) {95 return NULL;96 }97 98 talloc_set_destructor(ret, close_iconv_convenience);99 100 ret->dos_charset = talloc_strdup(ret, dos_charset);101 ret->unix_charset = talloc_strdup(ret, unix_charset);102 ret->native_iconv = native_iconv;103 104 return ret;105 }106 107 /*108 on-demand initialisation of conversion handles109 */110 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,111 charset_t from, charset_t to)112 {113 const char *n1, *n2;114 static bool initialised;115 116 if (initialised == false) {117 initialised = true;118 119 #ifdef LC_ALL120 /* we set back the locale to C to get ASCII-compatible121 
toupper/lower functions. For now we do not need122 any other POSIX localisations anyway. When we123 should really need localized string functions one124 day we need to write our own ascii_tolower etc.125 */126 setlocale(LC_ALL, "C");127 #endif128 }129 130 if (ic->conv_handles[from][to]) {131 return ic->conv_handles[from][to];132 }133 134 n1 = charset_name(ic, from);135 n2 = charset_name(ic, to);136 137 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,138 ic->native_iconv);139 140 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {141 if ((from == CH_DOS || to == CH_DOS) &&142 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {143 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",144 charset_name(ic, CH_DOS)));145 ic->dos_charset = "ASCII";146 147 n1 = charset_name(ic, from);148 n2 = charset_name(ic, to);149 150 ic->conv_handles[from][to] =151 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);152 }153 }154 155 return ic->conv_handles[from][to];156 }157 158 41 /** 159 42 * Convert string from one encoding to another, making error checking etc … … 214 97 break; 215 98 } 216 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf)); 99 DEBUG(0,("Conversion error: %s - ",reason)); 100 dump_data(0, (const uint8_t *) inbuf, i_len); 217 101 talloc_free(ob); 218 102 return (size_t)-1; … … 349 233 } 350 234 351 /*352 return the unicode codepoint for the next multi-byte CH_UNIX character353 in the string354 355 also return the number of bytes consumed (which tells the caller356 how many bytes to skip to get to the next CH_UNIX character)357 358 return INVALID_CODEPOINT if the next character cannot be converted359 */360 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,361 const char *str, size_t *size)362 {363 /* it cannot occupy more than 4 bytes in UTF16 format */364 uint8_t buf[4];365 smb_iconv_t descriptor;366 size_t ilen_orig;367 size_t ilen;368 size_t olen;369 char *outbuf;370 371 if ((str[0] & 0x80) == 0) {372 *size = 1;373 
return (codepoint_t)str[0];374 }375 376 /* we assume that no multi-byte character can take377 more than 5 bytes. This is OK as we only378 support codepoints up to 1M */379 ilen_orig = strnlen(str, 5);380 ilen = ilen_orig;381 382 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);383 if (descriptor == (smb_iconv_t)-1) {384 *size = 1;385 return INVALID_CODEPOINT;386 }387 388 /* this looks a little strange, but it is needed to cope389 with codepoints above 64k */390 olen = 2;391 outbuf = (char *)buf;392 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);393 if (olen == 2) {394 olen = 4;395 outbuf = (char *)buf;396 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);397 if (olen == 4) {398 /* we didn't convert any bytes */399 *size = 1;400 return INVALID_CODEPOINT;401 }402 olen = 4 - olen;403 } else {404 olen = 2 - olen;405 }406 407 *size = ilen_orig - ilen;408 409 if (olen == 2) {410 return (codepoint_t)SVAL(buf, 0);411 }412 if (olen == 4) {413 /* decode a 4 byte UTF16 character manually */414 return (codepoint_t)0x10000 +415 (buf[2] | ((buf[3] & 0x3)<<8) |416 (buf[0]<<10) | ((buf[1] & 0x3)<<18));417 }418 419 /* no other length is valid */420 return INVALID_CODEPOINT;421 }422 423 /*424 push a single codepoint into a CH_UNIX string the target string must425 be able to hold the full character, which is guaranteed if it is at426 least 5 bytes in size. 
The caller may pass less than 5 bytes if they427 are sure the character will fit (for example, you can assume that428 uppercase/lowercase of a character will not add more than 1 byte)429 430 return the number of bytes occupied by the CH_UNIX character, or431 -1 on failure432 */433 _PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,434 char *str, codepoint_t c)435 {436 smb_iconv_t descriptor;437 uint8_t buf[4];438 size_t ilen, olen;439 const char *inbuf;440 441 if (c < 128) {442 *str = c;443 return 1;444 }445 446 descriptor = get_conv_handle(ic,447 CH_UTF16, CH_UNIX);448 if (descriptor == (smb_iconv_t)-1) {449 return -1;450 }451 452 if (c < 0x10000) {453 ilen = 2;454 olen = 5;455 inbuf = (char *)buf;456 SSVAL(buf, 0, c);457 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);458 if (ilen != 0) {459 return -1;460 }461 return 5 - olen;462 }463 464 c -= 0x10000;465 466 buf[0] = (c>>10) & 0xFF;467 buf[1] = (c>>18) | 0xd8;468 buf[2] = c & 0xFF;469 buf[3] = ((c>>8) & 0x3) | 0xdc;470 471 ilen = 4;472 olen = 5;473 inbuf = (char *)buf;474 475 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);476 if (ilen != 0) {477 return -1;478 }479 return 5 - olen;480 }481 482 -
trunk/server/lib/util/charset/charset.h
r590 r745 40 40 typedef uint16_t smb_ucs2_t; 41 41 42 #ifdef WORDS_BIGENDIAN 43 #define UCS2_SHIFT 8 44 #else 45 #define UCS2_SHIFT 0 46 #endif 47 48 /* turn a 7 bit character into a ucs2 character */ 49 #define UCS2_CHAR(c) ((c) << UCS2_SHIFT) 50 51 /* return an ascii version of a ucs2 character */ 52 #define UCS2_TO_CHAR(c) (((c) >> UCS2_SHIFT) & 0xff) 53 54 /* Copy into a smb_ucs2_t from a possibly unaligned buffer. Return the copied smb_ucs2_t */ 55 #define COPY_UCS2_CHAR(dest,src) (((unsigned char *)(dest))[0] = ((unsigned char *)(src))[0],\ 56 ((unsigned char *)(dest))[1] = ((unsigned char *)(src))[1], (dest)) 57 58 59 42 60 /* 43 61 * for each charset we have a function that pulls from that charset to … … 103 121 104 122 char *strchr_m(const char *s, char c); 123 size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset); 124 size_t strlen_m_ext_term(const char *s, charset_t src_charset, 125 charset_t dst_charset); 105 126 size_t strlen_m_term(const char *s); 106 127 size_t strlen_m_term_null(const char *s); … … 150 171 151 172 extern struct smb_iconv_convenience *global_iconv_convenience; 152 173 struct smb_iconv_convenience *get_iconv_convenience(void); 174 smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, 175 charset_t from, charset_t to); 176 const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch); 177 178 codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, 179 size_t *size); 153 180 codepoint_t next_codepoint(const char *str, size_t *size); 154 181 ssize_t push_codepoint(char *str, codepoint_t c); 155 182 156 183 /* codepoints */ 184 codepoint_t next_codepoint_convenience_ext(struct smb_iconv_convenience *ic, 185 const char *str, charset_t src_charset, 186 size_t *size); 157 187 codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic, 158 188 const char *str, size_t *size); 159 189 ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic, 160 190 char *str, 
codepoint_t c); 191 161 192 codepoint_t toupper_m(codepoint_t val); 162 193 codepoint_t tolower_m(codepoint_t val); 194 bool islower_m(codepoint_t val); 195 bool isupper_m(codepoint_t val); 163 196 int codepoint_cmpi(codepoint_t c1, codepoint_t c2); 164 197 165 198 /* Iconv convenience functions */ 166 struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx, 167 const char *dos_charset, 168 const char *unix_charset, 169 bool native_iconv); 199 struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx, 200 const char *dos_charset, 201 const char *unix_charset, 202 const char *display_charset, 203 bool native_iconv, 204 struct smb_iconv_convenience *old_ic); 170 205 171 206 bool convert_string_convenience(struct smb_iconv_convenience *ic, … … 189 224 190 225 void load_case_tables(void); 191 bool charset_register_backend(const void *_funcs); 226 void load_case_tables_library(void); 227 bool smb_register_charset(const struct charset_functions *funcs_in); 192 228 193 229 /* … … 264 300 NTSTATUS charset_ ## CHARSETNAME ## _init(void) \ 265 301 { \ 266 return smb_register_charset(& CHARSETNAME ## _functions); \ 267 } \ 302 if (!smb_register_charset(& CHARSETNAME ## _functions)) { \ 303 return NT_STATUS_INTERNAL_ERROR; \ 304 } \ 305 return NT_STATUS_OK; \ 306 } \ 268 307 269 308 -
trunk/server/lib/util/charset/codepoints.c
r414 r745 1 1 /* 2 2 Unix SMB/CIFS implementation. 3 Samba utility functions 4 Copyright (C) Andrew Tridgell 1992-2001 3 Character set conversion Extensions 4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001 5 Copyright (C) Andrew Tridgell 2001 5 6 Copyright (C) Simo Sorce 2001 7 Copyright (C) Jelmer Vernooij 2007 6 8 7 9 This program is free software; you can redistribute it and/or modify … … 17 19 You should have received a copy of the GNU General Public License 18 20 along with this program. If not, see <http://www.gnu.org/licenses/>. 21 19 22 */ 20 21 23 #include "includes.h" 24 #include "lib/util/charset/charset.h" 22 25 #include "system/locale.h" 23 #include "dynconfig/dynconfig.h" 26 #include "dynconfig.h" 27 28 #ifdef strcasecmp 29 #undef strcasecmp 30 #endif 24 31 25 32 /** … … 36 43 /******************************************************************* 37 44 load the case handling tables 45 46 This is the function that should be called from library code. 38 47 ********************************************************************/ 39 void load_case_tables (void)48 void load_case_tables_library(void) 40 49 { 41 50 TALLOC_CTX *mem_ctx; … … 45 54 smb_panic("No memory for case_tables"); 46 55 } 47 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);48 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);56 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000); 57 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000); 49 58 talloc_free(mem_ctx); 50 59 if (upcase_table == NULL) { 51 /* try also under codepages for testing purposes */ 52 upcase_table = map_file("codepages/upcase.dat", 0x20000); 53 if (upcase_table == NULL) { 54 upcase_table = (void *)-1; 55 } 60 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n")); 61 upcase_table = (void *)-1; 56 62 } 57 63 if 
(lowcase_table == NULL) { 58 /* try also under codepages for testing purposes */ 59 lowcase_table = map_file("codepages/lowcase.dat", 0x20000); 60 if (lowcase_table == NULL) { 61 lowcase_table = (void *)-1; 62 } 63 } 64 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n")); 65 lowcase_table = (void *)-1; 66 } 67 } 68 69 /******************************************************************* 70 load the case handling tables 71 72 This MUST only be called from main() in application code, never from a 73 library. We don't know if the calling program has already done 74 setlocale() to another value, and can't tell if they have. 75 ********************************************************************/ 76 void load_case_tables(void) 77 { 78 /* This is a useful global hook where we can ensure that the 79 * locale is set from the environment. This is needed so that 80 * we can use LOCALE as a codepage */ 81 #ifdef HAVE_SETLOCALE 82 setlocale(LC_ALL, ""); 83 #endif 84 load_case_tables_library(); 64 85 } 65 86 … … 73 94 } 74 95 if (upcase_table == NULL) { 75 load_case_tables ();96 load_case_tables_library(); 76 97 } 77 98 if (upcase_table == (void *)-1) { … … 93 114 } 94 115 if (lowcase_table == NULL) { 95 load_case_tables ();116 load_case_tables_library(); 96 117 } 97 118 if (lowcase_table == (void *)-1) { … … 102 123 } 103 124 return SVAL(lowcase_table, val*2); 125 } 126 127 /** 128 If we upper cased this character, would we get the same character? 129 **/ 130 _PUBLIC_ bool islower_m(codepoint_t val) 131 { 132 return (toupper_m(val) != val); 133 } 134 135 /** 136 If we lower cased this character, would we get the same character? 
137 **/ 138 _PUBLIC_ bool isupper_m(codepoint_t val) 139 { 140 return (tolower_m(val) != val); 104 141 } 105 142 … … 117 154 118 155 156 struct smb_iconv_convenience { 157 TALLOC_CTX *child_ctx; 158 const char *unix_charset; 159 const char *dos_charset; 160 const char *display_charset; 161 bool native_iconv; 162 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS]; 163 }; 164 165 struct smb_iconv_convenience *global_iconv_convenience = NULL; 166 167 struct smb_iconv_convenience *get_iconv_convenience(void) 168 { 169 if (global_iconv_convenience == NULL) 170 global_iconv_convenience = smb_iconv_convenience_reinit(talloc_autofree_context(), 171 "ASCII", "UTF-8", "ASCII", true, NULL); 172 return global_iconv_convenience; 173 } 174 175 /** 176 * Return the name of a charset to give to iconv(). 177 **/ 178 const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch) 179 { 180 switch (ch) { 181 case CH_UTF16: return "UTF-16LE"; 182 case CH_UNIX: return ic->unix_charset; 183 case CH_DOS: return ic->dos_charset; 184 case CH_DISPLAY: return ic->display_charset; 185 case CH_UTF8: return "UTF8"; 186 case CH_UTF16BE: return "UTF-16BE"; 187 case CH_UTF16MUNGED: return "UTF16_MUNGED"; 188 default: 189 return "ASCII"; 190 } 191 } 192 193 /** 194 re-initialize iconv conversion descriptors 195 **/ 196 static int close_iconv_convenience(struct smb_iconv_convenience *data) 197 { 198 unsigned c1, c2; 199 for (c1=0;c1<NUM_CHARSETS;c1++) { 200 for (c2=0;c2<NUM_CHARSETS;c2++) { 201 if (data->conv_handles[c1][c2] != NULL) { 202 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) { 203 smb_iconv_close(data->conv_handles[c1][c2]); 204 } 205 data->conv_handles[c1][c2] = NULL; 206 } 207 } 208 } 209 210 return 0; 211 } 212 213 static const char *map_locale(const char *charset) 214 { 215 if (strcmp(charset, "LOCALE") != 0) { 216 return charset; 217 } 218 #if defined(HAVE_NL_LANGINFO) && defined(CODESET) 219 { 220 const char *ln; 221 smb_iconv_t handle; 222 223 ln = 
nl_langinfo(CODESET); 224 if (ln == NULL) { 225 DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n")); 226 return "ASCII"; 227 } 228 /* Check whether the charset name is supported 229 by iconv */ 230 handle = smb_iconv_open(ln, "UCS-2LE"); 231 if (handle == (smb_iconv_t) -1) { 232 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln)); 233 return "ASCII"; 234 } else { 235 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln)); 236 smb_iconv_close(handle); 237 } 238 return ln; 239 } 240 #endif 241 return "ASCII"; 242 } 243 244 /* 245 the old_ic is passed in here as the smb_iconv_convenience structure 246 is used as a global pointer in some places (eg. python modules). We 247 don't want to invalidate those global pointers, but we do want to 248 update them with the right charset information when loadparm 249 runs. To do that we need to re-use the structure pointer, but 250 re-fill the elements in the structure with the updated values 251 */ 252 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_reinit(TALLOC_CTX *mem_ctx, 253 const char *dos_charset, 254 const char *unix_charset, 255 const char *display_charset, 256 bool native_iconv, 257 struct smb_iconv_convenience *old_ic) 258 { 259 struct smb_iconv_convenience *ret; 260 261 display_charset = map_locale(display_charset); 262 263 if (old_ic != NULL) { 264 ret = old_ic; 265 close_iconv_convenience(ret); 266 talloc_free(ret->child_ctx); 267 ZERO_STRUCTP(ret); 268 } else { 269 ret = talloc_zero(mem_ctx, struct smb_iconv_convenience); 270 } 271 if (ret == NULL) { 272 return NULL; 273 } 274 275 /* we use a child context to allow us to free all ptrs without 276 freeing the structure itself */ 277 ret->child_ctx = talloc_new(ret); 278 if (ret->child_ctx == NULL) { 279 return NULL; 280 } 281 282 talloc_set_destructor(ret, close_iconv_convenience); 283 284 ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset); 285 ret->unix_charset = talloc_strdup(ret->child_ctx, 
unix_charset); 286 ret->display_charset = talloc_strdup(ret->child_ctx, display_charset); 287 ret->native_iconv = native_iconv; 288 289 return ret; 290 } 291 292 /* 293 on-demand initialisation of conversion handles 294 */ 295 smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic, 296 charset_t from, charset_t to) 297 { 298 const char *n1, *n2; 299 static bool initialised; 300 301 if (initialised == false) { 302 initialised = true; 303 } 304 305 if (ic->conv_handles[from][to]) { 306 return ic->conv_handles[from][to]; 307 } 308 309 n1 = charset_name(ic, from); 310 n2 = charset_name(ic, to); 311 312 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1, 313 ic->native_iconv); 314 315 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) { 316 if ((from == CH_DOS || to == CH_DOS) && 317 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) { 318 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n", 319 charset_name(ic, CH_DOS))); 320 ic->dos_charset = "ASCII"; 321 322 n1 = charset_name(ic, from); 323 n2 = charset_name(ic, to); 324 325 ic->conv_handles[from][to] = 326 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv); 327 } 328 } 329 330 return ic->conv_handles[from][to]; 331 } 332 333 /** 334 * Return the unicode codepoint for the next character in the input 335 * string in the given src_charset. 336 * The unicode codepoint (codepoint_t) is an unsigned 32 bit value. 337 * 338 * Also return the number of bytes consumed (which tells the caller 339 * how many bytes to skip to get to the next src_charset-character). 340 * 341 * This is implemented (in the non-ascii-case) by first converting the 342 * next character in the input string to UTF16_LE and then calculating 343 * the unicode codepoint from that. 344 * 345 * Return INVALID_CODEPOINT if the next character cannot be converted. 
346 */ 347 _PUBLIC_ codepoint_t next_codepoint_convenience_ext( 348 struct smb_iconv_convenience *ic, 349 const char *str, charset_t src_charset, 350 size_t *bytes_consumed) 351 { 352 /* it cannot occupy more than 4 bytes in UTF16 format */ 353 uint8_t buf[4]; 354 smb_iconv_t descriptor; 355 size_t ilen_orig; 356 size_t ilen; 357 size_t olen; 358 char *outbuf; 359 360 if ((str[0] & 0x80) == 0) { 361 *bytes_consumed = 1; 362 return (codepoint_t)str[0]; 363 } 364 365 /* 366 * we assume that no multi-byte character can take more than 5 bytes. 367 * This is OK as we only support codepoints up to 1M (U+100000) 368 */ 369 ilen_orig = strnlen(str, 5); 370 ilen = ilen_orig; 371 372 descriptor = get_conv_handle(ic, src_charset, CH_UTF16); 373 if (descriptor == (smb_iconv_t)-1) { 374 *bytes_consumed = 1; 375 return INVALID_CODEPOINT; 376 } 377 378 /* 379 * this looks a little strange, but it is needed to cope with 380 * codepoints above 64k (U+10000) which are encoded as per RFC2781. 381 */ 382 olen = 2; 383 outbuf = (char *)buf; 384 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); 385 if (olen == 2) { 386 olen = 4; 387 outbuf = (char *)buf; 388 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); 389 if (olen == 4) { 390 /* we didn't convert any bytes */ 391 *bytes_consumed = 1; 392 return INVALID_CODEPOINT; 393 } 394 olen = 4 - olen; 395 } else { 396 olen = 2 - olen; 397 } 398 399 *bytes_consumed = ilen_orig - ilen; 400 401 if (olen == 2) { 402 return (codepoint_t)SVAL(buf, 0); 403 } 404 if (olen == 4) { 405 /* decode a 4 byte UTF16 character manually */ 406 return (codepoint_t)0x10000 + 407 (buf[2] | ((buf[3] & 0x3)<<8) | 408 (buf[0]<<10) | ((buf[1] & 0x3)<<18)); 409 } 410 411 /* no other length is valid */ 412 return INVALID_CODEPOINT; 413 } 414 415 /* 416 return the unicode codepoint for the next multi-byte CH_UNIX character 417 in the string 418 419 also return the number of bytes consumed (which tells the caller 420 how many bytes to skip to get to the next CH_UNIX 
character) 421 422 return INVALID_CODEPOINT if the next character cannot be converted 423 */ 424 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic, 425 const char *str, size_t *size) 426 { 427 return next_codepoint_convenience_ext(ic, str, CH_UNIX, size); 428 } 429 430 /* 431 push a single codepoint into a CH_UNIX string the target string must 432 be able to hold the full character, which is guaranteed if it is at 433 least 5 bytes in size. The caller may pass less than 5 bytes if they 434 are sure the character will fit (for example, you can assume that 435 uppercase/lowercase of a character will not add more than 1 byte) 436 437 return the number of bytes occupied by the CH_UNIX character, or 438 -1 on failure 439 */ 440 _PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic, 441 char *str, codepoint_t c) 442 { 443 smb_iconv_t descriptor; 444 uint8_t buf[4]; 445 size_t ilen, olen; 446 const char *inbuf; 447 448 if (c < 128) { 449 *str = c; 450 return 1; 451 } 452 453 descriptor = get_conv_handle(ic, 454 CH_UTF16, CH_UNIX); 455 if (descriptor == (smb_iconv_t)-1) { 456 return -1; 457 } 458 459 if (c < 0x10000) { 460 ilen = 2; 461 olen = 5; 462 inbuf = (char *)buf; 463 SSVAL(buf, 0, c); 464 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); 465 if (ilen != 0) { 466 return -1; 467 } 468 return 5 - olen; 469 } 470 471 c -= 0x10000; 472 473 buf[0] = (c>>10) & 0xFF; 474 buf[1] = (c>>18) | 0xd8; 475 buf[2] = c & 0xFF; 476 buf[3] = ((c>>8) & 0x3) | 0xdc; 477 478 ilen = 4; 479 olen = 5; 480 inbuf = (char *)buf; 481 482 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen); 483 if (ilen != 0) { 484 return -1; 485 } 486 return 5 - olen; 487 } 488 489 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, 490 size_t *size) 491 { 492 return next_codepoint_convenience_ext(get_iconv_convenience(), str, 493 src_charset, size); 494 } 495 496 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size) 
497 { 498 return next_codepoint_convenience(get_iconv_convenience(), str, size); 499 } 500 501 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c) 502 { 503 return push_codepoint_convenience(get_iconv_convenience(), str, c); 504 } -
trunk/server/lib/util/charset/iconv.c
r414 r745 24 24 #include "system/filesys.h" 25 25 26 #ifdef strcasecmp 27 #undef strcasecmp 28 #endif 29 30 #ifdef static_decl_charset 31 static_decl_charset; 32 #endif 26 33 27 34 /** … … 50 57 static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *); 51 58 static size_t ascii_push (void *,const char **, size_t *, char **, size_t *); 59 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *); 52 60 static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *); 53 61 static size_t utf8_push (void *,const char **, size_t *, char **, size_t *); … … 73 81 74 82 {"ASCII", ascii_pull, ascii_push}, 83 {"646", ascii_pull, ascii_push}, 84 {"ISO-8859-1", ascii_pull, latin1_push}, 75 85 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push} 76 86 }; … … 78 88 static struct charset_functions *charsets = NULL; 79 89 80 bool charset_register_backend(const void *_funcs) 81 { 82 struct charset_functions *funcs = (struct charset_functions *)memdup(_funcs,sizeof(struct charset_functions)); 90 static struct charset_functions *find_charset_functions(const char *name) 91 { 83 92 struct charset_functions *c; 84 93 85 94 /* Check whether we already have this charset... */ 86 95 for (c = charsets; c != NULL; c = c->next) { 87 if(!strcasecmp(c->name, funcs->name)) { 88 DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name)); 89 return false; 90 } 91 } 96 if(strcasecmp(c->name, name) == 0) { 97 return c; 98 } 99 // c = c->next; 100 } 101 102 return NULL; 103 } 104 105 bool smb_register_charset(const struct charset_functions *funcs_in) 106 { 107 struct charset_functions *funcs; 108 109 DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name)); 110 /* Check whether we already have this charset... 
*/ 111 if (find_charset_functions(funcs_in->name)) { 112 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name)); 113 return false; 114 } 115 116 funcs = talloc(NULL, struct charset_functions); 117 if (!funcs) { 118 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name)); 119 return false; 120 } 121 *funcs = *funcs_in; 92 122 93 123 funcs->next = funcs->prev = NULL; 124 DEBUG(5, ("Registered charset %s\n", funcs->name)); 94 125 DLIST_ADD(charsets, funcs); 95 126 return true; 96 127 } 128 129 static void lazy_initialize_iconv(void) 130 { 131 static bool initialized; 132 133 #ifdef static_init_charset 134 if (!initialized) { 135 static_init_charset; 136 initialized = true; 137 } 138 #endif 139 } 140 141 #if defined(__OS2__) && defined(__INNOTEK_LIBC__) 142 #include <uconv.h> 143 144 typedef struct os2_iconv_t 145 { 146 UconvObject from; 147 } os2_iconv_t; 148 149 iconv_t os2_iconv_open (const char *tocode, const char *fromcode) 150 { 151 os2_iconv_t *os2_cd = (os2_iconv_t *)iconv_open(tocode, fromcode); 152 153 if (os2_cd != (iconv_t)(-1)) 154 { 155 /* Assume strings contain pathnames */ 156 uconv_attribute_t attr; 157 158 UniQueryUconvObject(os2_cd->from, &attr, 159 sizeof(uconv_attribute_t), 160 NULL, NULL, NULL ); 161 attr.converttype |= CVTTYPE_PATH; 162 UniSetUconvObject(os2_cd->from, &attr); 163 } 164 165 return (iconv_t)os2_cd; 166 } 167 168 #define iconv_open os2_iconv_open 169 #endif 97 170 98 171 #ifdef HAVE_NATIVE_ICONV … … 159 232 } 160 233 161 int smb_iconv_t_destructor(smb_iconv_t hwd)162 { 234 static int smb_iconv_t_destructor(smb_iconv_t hwd) 235 { 163 236 #ifdef HAVE_NATIVE_ICONV 164 237 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1) … … 180 253 int i; 181 254 255 lazy_initialize_iconv(); 256 182 257 ret = (smb_iconv_t)talloc_named(mem_ctx, 183 258 sizeof(*ret), … … 261 336 if (is_utf16(tocode)) { 262 337 ret->direct = sys_iconv; 263 /* could be set just above - so we need to close iconv */264 if (ret->cd_direct 
!= NULL && ret->cd_direct != (iconv_t)-1)265 iconv_close(ret->cd_direct);266 338 ret->cd_direct = ret->cd_pull; 267 339 ret->cd_pull = NULL; … … 286 358 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode) 287 359 { 288 return smb_iconv_open_ex( talloc_autofree_context(), tocode, fromcode, true);360 return smb_iconv_open_ex(NULL, tocode, fromcode, true); 289 361 } 290 362 … … 351 423 } 352 424 425 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft, 426 char **outbuf, size_t *outbytesleft) 427 { 428 int ir_count=0; 429 430 while (*inbytesleft >= 2 && *outbytesleft >= 1) { 431 (*outbuf)[0] = (*inbuf)[0]; 432 if ((*inbuf)[1]) ir_count++; 433 (*inbytesleft) -= 2; 434 (*outbytesleft) -= 1; 435 (*inbuf) += 2; 436 (*outbuf) += 1; 437 } 438 439 if (*inbytesleft == 1) { 440 errno = EINVAL; 441 return -1; 442 } 443 444 if (*inbytesleft > 1) { 445 errno = E2BIG; 446 return -1; 447 } 448 449 return ir_count; 450 } 353 451 354 452 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft, … … 356 454 { 357 455 while (*inbytesleft >= 1 && *outbytesleft >= 2) { 358 u int_t v;456 unsigned int v; 359 457 360 458 if ((*inbuf)[0] != '@') { -
trunk/server/lib/util/charset/tests/charset.c
r414 r745 247 247 struct torture_suite *torture_local_charset(TALLOC_CTX *mem_ctx) 248 248 { 249 struct torture_suite *suite = torture_suite_create(mem_ctx, " CHARSET");249 struct torture_suite *suite = torture_suite_create(mem_ctx, "charset"); 250 250 251 251 torture_suite_add_simple_test(suite, "toupper_m", test_toupper_m); -
trunk/server/lib/util/charset/tests/iconv.c
r414 r745 36 36 iconv_t cd; 37 37 38 if (!lp _parm_bool(tctx->lp_ctx, NULL, "iconv", "native", true))38 if (!lpcfg_parm_bool(tctx->lp_ctx, NULL, "iconv", "native", true)) 39 39 torture_skip(tctx, "system iconv disabled - skipping test"); 40 40 … … 135 135 uint8_t buf1[1000], buf2[1000], buf3[1000]; 136 136 size_t outsize1, outsize2, outsize3; 137 c onst char *ptr_in;137 char *ptr_in; 138 138 char *ptr_out; 139 139 size_t size_in1, size_in2, size_in3; … … 159 159 charset)); 160 160 } 161 cd2 = smb_iconv_open_ex(test, charset, "UTF-16LE", lp _parm_bool(test->lp_ctx, NULL, "iconv", "native", true));162 cd3 = smb_iconv_open_ex(test, "UTF-16LE", charset, lp _parm_bool(test->lp_ctx, NULL, "iconv", "native", true));161 cd2 = smb_iconv_open_ex(test, charset, "UTF-16LE", lpcfg_parm_bool(test->lp_ctx, NULL, "iconv", "native", true)); 162 cd3 = smb_iconv_open_ex(test, "UTF-16LE", charset, lpcfg_parm_bool(test->lp_ctx, NULL, "iconv", "native", true)); 163 163 last_charset = charset; 164 164 } 165 165 166 166 /* internal convert to charset - placing result in buf1 */ 167 ptr_in = (c onst char *)inbuf;167 ptr_in = (char *)inbuf; 168 168 ptr_out = (char *)buf1; 169 169 size_in1 = size; … … 172 172 memset(ptr_out, 0, outsize1); 173 173 errno = 0; 174 ret1 = smb_iconv(cd2, &ptr_in, &size_in1, &ptr_out, &outsize1);174 ret1 = smb_iconv(cd2, (const char **) &ptr_in, &size_in1, &ptr_out, &outsize1); 175 175 errno1 = errno; 176 176 177 177 /* system convert to charset - placing result in buf2 */ 178 ptr_in = (c onst char *)inbuf;178 ptr_in = (char *)inbuf; 179 179 ptr_out = (char *)buf2; 180 180 size_in2 = size; … … 183 183 memset(ptr_out, 0, outsize2); 184 184 errno = 0; 185 ret2 = iconv(cd, discard_const_p(char *, &ptr_in), &size_in2, &ptr_out, &outsize2);185 ret2 = iconv(cd, &ptr_in, &size_in2, &ptr_out, &outsize2); 186 186 errno2 = errno; 187 187 … … 237 237 /* convert back to UTF-16, putting result in buf3 */ 238 238 size = size - size_in1; 239 ptr_in = (c onst char *)buf1;239 
ptr_in = (char *)buf1; 240 240 ptr_out = (char *)buf3; 241 241 size_in3 = len1; … … 243 243 244 244 memset(ptr_out, 0, outsize3); 245 ret3 = smb_iconv(cd3, &ptr_in, &size_in3, &ptr_out, &outsize3);245 ret3 = smb_iconv(cd3, (const char **) &ptr_in, &size_in3, &ptr_out, &outsize3); 246 246 247 247 /* we only internally support the first 1M codepoints */ … … 290 290 codepoint_t c; 291 291 292 size = push_codepoint_convenience(lp _iconv_convenience(tctx->lp_ctx), (char *)buf, codepoint);292 size = push_codepoint_convenience(lpcfg_iconv_convenience(tctx->lp_ctx), (char *)buf, codepoint); 293 293 torture_assert(tctx, size != -1 || (codepoint >= 0xd800 && codepoint <= 0x10000), 294 294 "Invalid Codepoint range"); … … 301 301 buf[size+3] = random(); 302 302 303 c = next_codepoint_convenience(lp _iconv_convenience(tctx->lp_ctx), (char *)buf, &size2);303 c = next_codepoint_convenience(lpcfg_iconv_convenience(tctx->lp_ctx), (char *)buf, &size2); 304 304 305 305 torture_assert(tctx, c == codepoint, … … 452 452 struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx) 453 453 { 454 struct torture_suite *suite = torture_suite_create(mem_ctx, " ICONV");454 struct torture_suite *suite = torture_suite_create(mem_ctx, "iconv"); 455 455 456 456 torture_suite_add_simple_test(suite, "string2key", -
trunk/server/lib/util/charset/util_unistr.c
r414 r745 22 22 #include "system/locale.h" 23 23 24 struct smb_iconv_convenience *global_iconv_convenience = NULL;25 26 static inline struct smb_iconv_convenience *get_iconv_convenience(void)27 {28 if (global_iconv_convenience == NULL)29 global_iconv_convenience = smb_iconv_convenience_init(talloc_autofree_context(), "ASCII", "UTF-8", true);30 return global_iconv_convenience;31 }32 33 24 /** 34 25 Case insensitive string compararison … … 68 59 69 60 return *s1 - *s2; 70 }71 72 /**73 * Get the next token from a string, return False if none found.74 * Handles double-quotes.75 *76 * Based on a routine by GJC@VILLAGE.COM.77 * Extensively modified by Andrew.Tridgell@anu.edu.au78 **/79 _PUBLIC_ bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)80 {81 const char *s;82 bool quoted;83 size_t len=1;84 85 if (!ptr)86 return false;87 88 s = *ptr;89 90 /* default to simple separators */91 if (!sep)92 sep = " \t\n\r";93 94 /* find the first non sep char */95 while (*s && strchr_m(sep,*s))96 s++;97 98 /* nothing left? */99 if (!*s)100 return false;101 102 /* copy over the token */103 for (quoted = false; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {104 if (*s == '\"') {105 quoted = !quoted;106 } else {107 len++;108 *buff++ = *s;109 }110 }111 112 *ptr = (*s) ? s+1 : s;113 *buff = 0;114 115 return true;116 61 } 117 62 … … 249 194 250 195 /** 251 Count the number of UCS2 characters in a string. Normally this will 252 be the same as the number of bytes in a string for single byte strings, 253 but will be different for multibyte. 254 **/ 255 _PUBLIC_ size_t strlen_m(const char *s) 196 * Calculate the number of units (8 or 16-bit, depending on the 197 * destination charset), that would be needed to convert the input 198 * string which is expected to be in in src_charset encoding to the 199 * destination charset (which should be a unicode charset). 
200 */ 201 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset) 256 202 { 257 203 size_t count = 0; … … 273 219 while (*s) { 274 220 size_t c_size; 275 codepoint_t c = next_codepoint_convenience(ic, s, &c_size); 276 if (c < 0x10000) { 221 codepoint_t c = next_codepoint_convenience_ext(ic, s, src_charset, &c_size); 222 s += c_size; 223 224 switch (dst_charset) { 225 case CH_UTF16LE: 226 case CH_UTF16BE: 227 case CH_UTF16MUNGED: 228 if (c < 0x10000) { 229 count += 1; 230 } else { 231 count += 2; 232 } 233 break; 234 case CH_UTF8: 235 /* 236 * this only checks ranges, and does not 237 * check for invalid codepoints 238 */ 239 if (c < 0x80) { 240 count += 1; 241 } else if (c < 0x800) { 242 count += 2; 243 } else if (c < 0x1000) { 244 count += 3; 245 } else { 246 count += 4; 247 } 248 break; 249 default: 250 /* 251 * non-unicode encoding: 252 * assume that each codepoint fits into 253 * one unit in the destination encoding. 254 */ 277 255 count += 1; 278 } else { 279 count += 2; 280 } 281 s += c_size; 256 } 282 257 } 283 258 284 259 return count; 260 } 261 262 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset, 263 const charset_t dst_charset) 264 { 265 if (!s) { 266 return 0; 267 } 268 return strlen_m_ext(s, src_charset, dst_charset) + 1; 269 } 270 271 /** 272 * Calculate the number of 16-bit units that would be needed to convert 273 * the input string which is expected to be in CH_UNIX encoding to UTF16. 274 * 275 * This will be the same as the number of bytes in a string for single 276 * byte strings, but will be different for multibyte. 
277 */ 278 _PUBLIC_ size_t strlen_m(const char *s) 279 { 280 return strlen_m_ext(s, CH_UNIX, CH_UTF16LE); 285 281 } 286 282 … … 430 426 char *dest; 431 427 struct smb_iconv_convenience *iconv_convenience = get_iconv_convenience(); 428 429 if(src == NULL) { 430 return NULL; 431 } 432 432 433 433 /* this takes advantage of the fact that upper/lower can't … … 988 988 } 989 989 990 991 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)992 {993 return next_codepoint_convenience(get_iconv_convenience(), str, size);994 }995 996 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)997 {998 return push_codepoint_convenience(get_iconv_convenience(), str, c);999 }
Note:
See TracChangeset
for help on using the changeset viewer.