Context Navigation

source: vendor/3.6.23/lib/util/charset/codepoints.c

Visit:

Last change on this file was 740, checked in by Silvan Scherrer, 13 years ago
Samba Server: update vendor to 3.6.0
File size: 13.3 KB

Line
1	/*
2	Unix SMB/CIFS implementation.
3	Character set conversion Extensions
4	Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5	Copyright (C) Andrew Tridgell 2001
6	Copyright (C) Simo Sorce 2001
7	Copyright (C) Jelmer Vernooij 2007
8
9	This program is free software; you can redistribute it and/or modify
10	it under the terms of the GNU General Public License as published by
11	the Free Software Foundation; either version 3 of the License, or
12	(at your option) any later version.
13
14	This program is distributed in the hope that it will be useful,
15	but WITHOUT ANY WARRANTY; without even the implied warranty of
16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	GNU General Public License for more details.
18
19	You should have received a copy of the GNU General Public License
20	along with this program. If not, see <http://www.gnu.org/licenses/>.
21
22	*/
23	#include "includes.h"
24	#include "lib/util/charset/charset.h"
25	#include "system/locale.h"
26	#include "dynconfig.h"
27
28	#ifdef strcasecmp
29	#undef strcasecmp
30	#endif
31
32	/**
33	* @file
34	* @brief Unicode string manipulation
35	*/
36
/*
 * Unicode case-mapping tables (one for upper case, one for lower case).
 * They are filled in by load_case_tables_library(), which maps them
 * (via mmap() or read()) from files in the lib directory.
 *
 * Each pointer has three states:
 *   NULL        - no load attempt has been made yet
 *   (void *)-1  - the load failed; ASCII-only rules are used instead
 *   other       - a mapped table
 */
static void *upcase_table;
static void *lowcase_table;
41
42
43	/*******************************************************************
44	load the case handling tables
45
46	This is the function that should be called from library code.
47	********************************************************************/
48	void load_case_tables_library(void)
49	{
50	TALLOC_CTX *mem_ctx;
51
52	mem_ctx = talloc_init("load_case_tables");
53	if (!mem_ctx) {
54	smb_panic("No memory for case_tables");
55	}
56	upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
57	lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
58	talloc_free(mem_ctx);
59	if (upcase_table == NULL) {
60	DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
61	upcase_table = (void *)-1;
62	}
63	if (lowcase_table == NULL) {
64	DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
65	lowcase_table = (void *)-1;
66	}
67	}
68
/*******************************************************************
 Application entry point for loading the case handling tables.

 Call this from main() in application code ONLY, never from a library:
 it sets the locale from the environment, and a library has no way of
 knowing whether the calling program already selected another locale.
********************************************************************/
void load_case_tables(void)
{
#ifdef HAVE_SETLOCALE
	/* Global hook: take the locale from the environment so that
	 * LOCALE can be used as a codepage. */
	setlocale(LC_ALL, "");
#endif
	load_case_tables_library();
}
86
87	/**
88	Convert a codepoint_t to upper case.
89	**/
90	_PUBLIC_ codepoint_t toupper_m(codepoint_t val)
91	{
92	if (val < 128) {
93	return toupper(val);
94	}
95	if (upcase_table == NULL) {
96	load_case_tables_library();
97	}
98	if (upcase_table == (void *)-1) {
99	return val;
100	}
101	if (val & 0xFFFF0000) {
102	return val;
103	}
104	return SVAL(upcase_table, val*2);
105	}
106
107	/**
108	Convert a codepoint_t to lower case.
109	**/
110	_PUBLIC_ codepoint_t tolower_m(codepoint_t val)
111	{
112	if (val < 128) {
113	return tolower(val);
114	}
115	if (lowcase_table == NULL) {
116	load_case_tables_library();
117	}
118	if (lowcase_table == (void *)-1) {
119	return val;
120	}
121	if (val & 0xFFFF0000) {
122	return val;
123	}
124	return SVAL(lowcase_table, val*2);
125	}
126
127	/**
128	If we upper cased this character, would we get the same character?
129	**/
130	_PUBLIC_ bool islower_m(codepoint_t val)
131	{
132	return (toupper_m(val) != val);
133	}
134
135	/**
136	If we lower cased this character, would we get the same character?
137	**/
138	_PUBLIC_ bool isupper_m(codepoint_t val)
139	{
140	return (tolower_m(val) != val);
141	}
142
143	/**
144	compare two codepoints case insensitively
145	*/
146	_PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
147	{
148	if (c1 == c2 \|\|
149	toupper_m(c1) == toupper_m(c2)) {
150	return 0;
151	}
152	return c1 - c2;
153	}
154
155
156	struct smb_iconv_convenience {
157	TALLOC_CTX *child_ctx;
158	const char *unix_charset;
159	const char *dos_charset;
160	const char *display_charset;
161	bool native_iconv;
162	smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
163	};
164
/* Process-wide default conversion context, created on first use. */
struct smb_iconv_convenience *global_iconv_convenience = NULL;

/*
 * Return the process-wide conversion context.
 *
 * On the first call (or while creation keeps failing) the context is
 * built with dos charset "ASCII", unix charset "UTF-8", display charset
 * "ASCII" and native iconv enabled, hanging off the talloc autofree
 * context. The result is NULL if that creation fails.
 */
struct smb_iconv_convenience *get_iconv_convenience(void)
{
	if (global_iconv_convenience != NULL) {
		return global_iconv_convenience;
	}

	global_iconv_convenience = smb_iconv_convenience_reinit(
		talloc_autofree_context(),
		"ASCII", "UTF-8", "ASCII", true, NULL);

	return global_iconv_convenience;
}
174
175	/**
176	* Return the name of a charset to give to iconv().
177	**/
178	const char charset_name(struct smb_iconv_convenience ic, charset_t ch)
179	{
180	switch (ch) {
181	case CH_UTF16: return "UTF-16LE";
182	case CH_UNIX: return ic->unix_charset;
183	case CH_DOS: return ic->dos_charset;
184	case CH_DISPLAY: return ic->display_charset;
185	case CH_UTF8: return "UTF8";
186	case CH_UTF16BE: return "UTF-16BE";
187	case CH_UTF16MUNGED: return "UTF16_MUNGED";
188	default:
189	return "ASCII";
190	}
191	}
192
193	/**
194	re-initialize iconv conversion descriptors
195	**/
196	static int close_iconv_convenience(struct smb_iconv_convenience *data)
197	{
198	unsigned c1, c2;
199	for (c1=0;c1<NUM_CHARSETS;c1++) {
200	for (c2=0;c2<NUM_CHARSETS;c2++) {
201	if (data->conv_handles[c1][c2] != NULL) {
202	if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
203	smb_iconv_close(data->conv_handles[c1][c2]);
204	}
205	data->conv_handles[c1][c2] = NULL;
206	}
207	}
208	}
209
210	return 0;
211	}
212
/*
 * Resolve the pseudo charset name "LOCALE".
 *
 * Any other name is returned unchanged. For "LOCALE", the codeset
 * reported by nl_langinfo(CODESET) is returned if it is available and
 * iconv can open it; in every other case (including builds without
 * nl_langinfo/CODESET) the result is "ASCII".
 *
 * The result is either the input pointer, a string constant, or the
 * string returned by nl_langinfo(); the caller must not free it.
 */
static const char *map_locale(const char *charset)
{
	if (strcmp(charset, "LOCALE") != 0) {
		return charset;
	}
#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
	{
		const char *ln;
		smb_iconv_t handle;

		ln = nl_langinfo(CODESET);
		if (ln == NULL) {
			DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
			return "ASCII";
		}
		/* Check whether the charset name is supported
		   by iconv */
		handle = smb_iconv_open(ln, "UCS-2LE");
		if (handle == (smb_iconv_t) -1) {
			DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
			return "ASCII";
		} else {
			DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
			smb_iconv_close(handle);
		}
		return ln;
	}
#endif
	return "ASCII";
}
243
244	/*
245	the old_ic is passed in here as the smb_iconv_convenience structure
246	is used as a global pointer in some places (eg. python modules). We
247	don't want to invalidate those global pointers, but we do want to
248	update them with the right charset information when loadparm
249	runs. To do that we need to re-use the structure pointer, but
250	re-fill the elements in the structure with the updated values
251	*/
252	_PUBLIC_ struct smb_iconv_convenience smb_iconv_convenience_reinit(TALLOC_CTX mem_ctx,
253	const char *dos_charset,
254	const char *unix_charset,
255	const char *display_charset,
256	bool native_iconv,
257	struct smb_iconv_convenience *old_ic)
258	{
259	struct smb_iconv_convenience *ret;
260
261	display_charset = map_locale(display_charset);
262
263	if (old_ic != NULL) {
264	ret = old_ic;
265	close_iconv_convenience(ret);
266	talloc_free(ret->child_ctx);
267	ZERO_STRUCTP(ret);
268	} else {
269	ret = talloc_zero(mem_ctx, struct smb_iconv_convenience);
270	}
271	if (ret == NULL) {
272	return NULL;
273	}
274
275	/* we use a child context to allow us to free all ptrs without
276	freeing the structure itself */
277	ret->child_ctx = talloc_new(ret);
278	if (ret->child_ctx == NULL) {
279	return NULL;
280	}
281
282	talloc_set_destructor(ret, close_iconv_convenience);
283
284	ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
285	ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
286	ret->display_charset = talloc_strdup(ret->child_ctx, display_charset);
287	ret->native_iconv = native_iconv;
288
289	return ret;
290	}
291
292	/*
293	on-demand initialisation of conversion handles
294	*/
295	smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
296	charset_t from, charset_t to)
297	{
298	const char n1, n2;
299	static bool initialised;
300
301	if (initialised == false) {
302	initialised = true;
303	}
304
305	if (ic->conv_handles[from][to]) {
306	return ic->conv_handles[from][to];
307	}
308
309	n1 = charset_name(ic, from);
310	n2 = charset_name(ic, to);
311
312	ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
313	ic->native_iconv);
314
315	if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
316	if ((from == CH_DOS \|\| to == CH_DOS) &&
317	strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
318	DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
319	charset_name(ic, CH_DOS)));
320	ic->dos_charset = "ASCII";
321
322	n1 = charset_name(ic, from);
323	n2 = charset_name(ic, to);
324
325	ic->conv_handles[from][to] =
326	smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
327	}
328	}
329
330	return ic->conv_handles[from][to];
331	}
332
333	/**
334	* Return the unicode codepoint for the next character in the input
335	* string in the given src_charset.
336	* The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
337	*
338	* Also return the number of bytes consumed (which tells the caller
339	* how many bytes to skip to get to the next src_charset-character).
340	*
341	* This is implemented (in the non-ascii-case) by first converting the
342	* next character in the input string to UTF16_LE and then calculating
343	* the unicode codepoint from that.
344	*
345	* Return INVALID_CODEPOINT if the next character cannot be converted.
346	*/
347	_PUBLIC_ codepoint_t next_codepoint_convenience_ext(
348	struct smb_iconv_convenience *ic,
349	const char *str, charset_t src_charset,
350	size_t *bytes_consumed)
351	{
352	/* it cannot occupy more than 4 bytes in UTF16 format */
353	uint8_t buf[4];
354	smb_iconv_t descriptor;
355	size_t ilen_orig;
356	size_t ilen;
357	size_t olen;
358	char *outbuf;
359
360	if ((str[0] & 0x80) == 0) {
361	*bytes_consumed = 1;
362	return (codepoint_t)str[0];
363	}
364
365	/*
366	* we assume that no multi-byte character can take more than 5 bytes.
367	* This is OK as we only support codepoints up to 1M (U+100000)
368	*/
369	ilen_orig = strnlen(str, 5);
370	ilen = ilen_orig;
371
372	descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
373	if (descriptor == (smb_iconv_t)-1) {
374	*bytes_consumed = 1;
375	return INVALID_CODEPOINT;
376	}
377
378	/*
379	* this looks a little strange, but it is needed to cope with
380	* codepoints above 64k (U+1000) which are encoded as per RFC2781.
381	*/
382	olen = 2;
383	outbuf = (char *)buf;
384	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
385	if (olen == 2) {
386	olen = 4;
387	outbuf = (char *)buf;
388	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
389	if (olen == 4) {
390	/* we didn't convert any bytes */
391	*bytes_consumed = 1;
392	return INVALID_CODEPOINT;
393	}
394	olen = 4 - olen;
395	} else {
396	olen = 2 - olen;
397	}
398
399	*bytes_consumed = ilen_orig - ilen;
400
401	if (olen == 2) {
402	return (codepoint_t)SVAL(buf, 0);
403	}
404	if (olen == 4) {
405	/* decode a 4 byte UTF16 character manually */
406	return (codepoint_t)0x10000 +
407	(buf[2] \| ((buf[3] & 0x3)<<8) \|
408	(buf[0]<<10) \| ((buf[1] & 0x3)<<18));
409	}
410
411	/* no other length is valid */
412	return INVALID_CODEPOINT;
413	}
414
415	/*
416	return the unicode codepoint for the next multi-byte CH_UNIX character
417	in the string
418
419	also return the number of bytes consumed (which tells the caller
420	how many bytes to skip to get to the next CH_UNIX character)
421
422	return INVALID_CODEPOINT if the next character cannot be converted
423	*/
424	_PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
425	const char str, size_t size)
426	{
427	return next_codepoint_convenience_ext(ic, str, CH_UNIX, size);
428	}
429
430	/*
431	push a single codepoint into a CH_UNIX string the target string must
432	be able to hold the full character, which is guaranteed if it is at
433	least 5 bytes in size. The caller may pass less than 5 bytes if they
434	are sure the character will fit (for example, you can assume that
435	uppercase/lowercase of a character will not add more than 1 byte)
436
437	return the number of bytes occupied by the CH_UNIX character, or
438	-1 on failure
439	*/
440	_PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,
441	char *str, codepoint_t c)
442	{
443	smb_iconv_t descriptor;
444	uint8_t buf[4];
445	size_t ilen, olen;
446	const char *inbuf;
447
448	if (c < 128) {
449	*str = c;
450	return 1;
451	}
452
453	descriptor = get_conv_handle(ic,
454	CH_UTF16, CH_UNIX);
455	if (descriptor == (smb_iconv_t)-1) {
456	return -1;
457	}
458
459	if (c < 0x10000) {
460	ilen = 2;
461	olen = 5;
462	inbuf = (char *)buf;
463	SSVAL(buf, 0, c);
464	smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
465	if (ilen != 0) {
466	return -1;
467	}
468	return 5 - olen;
469	}
470
471	c -= 0x10000;
472
473	buf[0] = (c>>10) & 0xFF;
474	buf[1] = (c>>18) \| 0xd8;
475	buf[2] = c & 0xFF;
476	buf[3] = ((c>>8) & 0x3) \| 0xdc;
477
478	ilen = 4;
479	olen = 5;
480	inbuf = (char *)buf;
481
482	smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
483	if (ilen != 0) {
484	return -1;
485	}
486	return 5 - olen;
487	}
488
489	_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
490	size_t *size)
491	{
492	return next_codepoint_convenience_ext(get_iconv_convenience(), str,
493	src_charset, size);
494	}
495
496	_PUBLIC_ codepoint_t next_codepoint(const char str, size_t size)
497	{
498	return next_codepoint_convenience(get_iconv_convenience(), str, size);
499	}
500
501	_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
502	{
503	return push_codepoint_convenience(get_iconv_convenience(), str, c);
504	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: