source: trunk/lib/unicode/utf8.c

Last change on this file was 21927, checked in by dmik, 14 years ago

Fix build breaks with the newest GCC 4.4.6 from GIT.

In particular, GCC is now strict about matching the calling convention
of the prototype (argument) and the real function used.

File size: 5.5 KB
Rev | Line
[5450]1/*
2 * UTF-8 support routines
3 *
4 * Copyright 2000 Alexandre Julliard
5 */
6
7#include <string.h>
8
9#include "winnls.h"
10#include "wine/unicode.h"
11
/*
 * Number of bytes expected to follow a given first byte of a sequence.
 * Only byte values above 0x7f are covered, so the table is indexed with
 * (byte - 0x80).  An entry of 0 means nothing is expected to follow.
 */
static const char utf8_length[128] =
{
    /* 0x80-0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xa0-0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xb0-0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xcf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0-0xdf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0-0xef */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xf0-0xff */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0
};
24
/*
 * Mask selecting the payload bits of the first byte of a sequence,
 * indexed by the number of bytes that follow it.
 */
static const unsigned char utf8_mask[6] =
{
    0x7f,  /* 0 following bytes */
    0x1f,  /* 1 following byte */
    0x0f,  /* 2 following bytes */
    0x07,  /* 3 following bytes */
    0x03,  /* 4 following bytes */
    0x01   /* 5 following bytes */
};
27
/*
 * Smallest Unicode value accepted for a sequence, indexed by the number
 * of bytes following the first one.  A decoded value below this limit
 * is rejected by the decoder.
 */
static const unsigned int utf8_minval[6] =
{
    0x0,        /* 0 following bytes */
    0x80,       /* 1 following byte */
    0x800,      /* 2 following bytes */
    0x10000,    /* 3 following bytes */
    0x200000,   /* 4 following bytes */
    0x4000000   /* 5 following bytes */
};
30
31
32/* query necessary dst length for src string */
33inline static int get_length_wcs_utf8( const WCHAR *src, unsigned int srclen )
34{
35 int len;
36 for (len = 0; srclen; srclen--, src++, len++)
37 {
38 if (*src >= 0x80)
39 {
40 len++;
41 if (*src >= 0x800) len++;
42 }
43 }
44 return len;
45}
46
47/* wide char to UTF-8 string conversion */
48/* return -1 on dst buffer overflow */
[21927]49_K32CONV int utf8_wcstombs( const WCHAR *src, int srclen, char *dst, int dstlen )
[5450]50{
51 int ret = srclen;
52
53 if (!dstlen) return get_length_wcs_utf8( src, srclen );
54
55 for (ret = srclen; srclen; srclen--, src++)
56 {
57 WCHAR ch = *src;
58
59 if (ch < 0x80) /* 0x00-0x7f: 1 byte */
60 {
61 if (!dstlen--) return -1; /* overflow */
62 *dst++ = ch;
63 continue;
64 }
65
66 if (ch < 0x800) /* 0x80-0x7ff: 2 bytes */
67 {
68 if ((dstlen -= 2) < 0) return -1; /* overflow */
69 dst[1] = 0x80 | (ch & 0x3f);
70 ch >>= 6;
71 dst[0] = 0xc0 | ch;
72 dst += 2;
73 continue;
74 }
75
76 /* 0x800-0xffff: 3 bytes */
77
78 if ((dstlen -= 3) < 0) return -1; /* overflow */
79 dst[2] = 0x80 | (ch & 0x3f);
80 ch >>= 6;
81 dst[1] = 0x80 | (ch & 0x3f);
82 ch >>= 6;
83 dst[0] = 0xe0 | ch;
84 dst += 3;
85 }
86 return ret;
87}
88
89/* query necessary dst length for src string */
90inline static int get_length_mbs_utf8( const unsigned char *src, int srclen )
91{
92 int ret;
93 const unsigned char *srcend = src + srclen;
94
95 for (ret = 0; src < srcend; ret++)
96 {
97 unsigned char ch = *src++;
98 if (ch < 0xc0) continue;
99
100 switch(utf8_length[ch-0x80])
101 {
102 case 5:
103 if (src >= srcend) return ret; /* ignore partial char */
104 if ((ch = *src ^ 0x80) >= 0x40) continue;
105 src++;
106 case 4:
107 if (src >= srcend) return ret; /* ignore partial char */
108 if ((ch = *src ^ 0x80) >= 0x40) continue;
109 src++;
110 case 3:
111 if (src >= srcend) return ret; /* ignore partial char */
112 if ((ch = *src ^ 0x80) >= 0x40) continue;
113 src++;
114 case 2:
115 if (src >= srcend) return ret; /* ignore partial char */
116 if ((ch = *src ^ 0x80) >= 0x40) continue;
117 src++;
118 case 1:
119 if (src >= srcend) return ret; /* ignore partial char */
120 if ((ch = *src ^ 0x80) >= 0x40) continue;
121 src++;
122 }
123 }
124 return ret;
125}
126
127/* UTF-8 to wide char string conversion */
128/* return -1 on dst buffer overflow, -2 on invalid input char */
[21927]129_K32CONV int utf8_mbstowcs( int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
[5450]130{
131 int len, count;
132 unsigned int res;
133 const char *srcend = src + srclen;
134
135 if (!dstlen) return get_length_mbs_utf8( (const unsigned char *)src, srclen );
136
137 for (count = dstlen; count && (src < srcend); count--, dst++)
138 {
139 unsigned char ch = *src++;
140 if (ch < 0x80) /* special fast case for 7-bit ASCII */
141 {
142 *dst = ch;
143 continue;
144 }
145 len = utf8_length[ch-0x80];
146 res = ch & utf8_mask[len];
147
148 switch(len)
149 {
150 case 5:
151 if (src >= srcend) goto done; /* ignore partial char */
152 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
153 res = (res << 6) | ch;
154 src++;
155 case 4:
156 if (src >= srcend) goto done; /* ignore partial char */
157 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
158 res = (res << 6) | ch;
159 src++;
160 case 3:
161 if (src >= srcend) goto done; /* ignore partial char */
162 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
163 res = (res << 6) | ch;
164 src++;
165 case 2:
166 if (src >= srcend) goto done; /* ignore partial char */
167 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
168 res = (res << 6) | ch;
169 src++;
170 case 1:
171 if (src >= srcend) goto done; /* ignore partial char */
172 if ((ch = *src ^ 0x80) >= 0x40) goto bad;
173 res = (res << 6) | ch;
174 src++;
175 if (res < utf8_minval[len]) goto bad;
176 if (res >= 0x10000) goto bad; /* FIXME: maybe we should do surrogates here */
177 *dst = res;
178 continue;
179 }
180 bad:
181 if (flags & MB_ERR_INVALID_CHARS) return -2; /* bad char */
182 *dst = (WCHAR)'?';
183 }
184 if (src < srcend) return -1; /* overflow */
185done:
186 return dstlen - count;
187}
Note: See TracBrowser for help on using the repository browser.